Python - HTML Parsing
체크개발자
2017. 12. 16. 17:17
# Word-cloud demo 1: count nouns with KoNLPy, then draw the cloud with pytagcloud.
from collections import Counter
from konlpy.tag import Twitter
import pytagcloud

f = open('blog_data1.txt', encoding='utf-8')
data = f.read()
f.close()

nlp = Twitter()                    # KoNLPy's Twitter (now Okt) morpheme analyzer
nouns = nlp.nouns(data)            # extract only the nouns
count = Counter(nouns)
tags2 = count.most_common(100)     # top 100 (word, frequency) pairs

taglist = pytagcloud.make_tags(tags2, maxsize=200)
pytagcloud.create_tag_image(taglist, 'wordcloud.png', size=(900, 600),
                            fontname='Korean1', rectangular=False)
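pytagcloud only draws with fonts registered in its fonts.json, so the 'Korean1' name above fails until a Korean font is added there. A minimal one-time setup sketch, assuming you have already copied a Korean TTF (here malgun.ttf, my choice) into pytagcloud's fonts folder by hand:

import os, json
import pytagcloud

# Locate pytagcloud's bundled fonts folder and its fonts.json registry.
fonts_dir = os.path.join(os.path.dirname(pytagcloud.__file__), 'fonts')
fonts_json = os.path.join(fonts_dir, 'fonts.json')

with open(fonts_json, encoding='utf-8') as fp:
    fonts = json.load(fp)

# Register the name used by fontname='Korean1' above.
# Assumes malgun.ttf was already copied into fonts_dir.
fonts.append({'name': 'Korean1', 'ttf': 'malgun.ttf', 'web': 'none'})

with open(fonts_json, 'w', encoding='utf-8') as fp:
    json.dump(fonts, fp, ensure_ascii=False, indent=2)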
# HTML parsing demo: fetch a Daum news article and pull the text from its body <div>.
from urllib.request import urlopen
from bs4 import BeautifulSoup

#html = urlopen("https://www.naver.com")
#html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
#url = "http://www.pythonscraping.com"
#path = "/pages/warandpeace.html"
url = "http://v.media.daum.net/v/20171216090219353"
path = ""

html = urlopen(url + path)
s = html.read()
#print(s)
bsObj = BeautifulSoup(s, 'html.parser')
#print(bsObj.prettify())

data = ""
tag = bsObj.find_all("div", {"class": "hcg_media_pc_mArticle"})
for item in tag:
    data = data + " " + item.get_text()
    #print("div data : ", item.get_text())
print(data)
# Word-cloud demo 2: the wordcloud package, shaped by an image mask.
from wordcloud import WordCloud, STOPWORDS
import webbrowser
import numpy as np
from PIL import Image

stopwords = set(STOPWORDS)
stopwords.add("힘차게")    # exclude this Korean word ("vigorously") from the cloud

# White pixels in the mask stay empty; the rest becomes drawable area.
pic_mask = np.array(Image.open("stormtrooper_mask.png"))

wc = WordCloud(
    font_path="c:/windows/fonts/malgun.ttf",   # a Korean font (Malgun Gothic)
    background_color='white',
    max_words=2000,
    mask=pic_mask,
    stopwords=stopwords,   # the original built this set but never passed it in
)
wc.generate(data)          # 'data' is the Daum article text from above
wc.to_file("cloudchart1.png")
webbrowser.open("cloudchart1.png")
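As an optional extra (not in the original post), the wordcloud package can recolor the finished cloud from the mask image's own pixels with ImageColorGenerator; note it needs an RGB(A) mask, not a grayscale one:

from wordcloud import ImageColorGenerator

image_colors = ImageColorGenerator(pic_mask)
wc.recolor(color_func=image_colors).to_file("cloudchart1_colored.png")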
#print("title : " , bsObj.title)
#print("h1 : " , bsObj.h1)
#print("div : " , bsObj.div)
#print("title : " , bsObj.title.get_text())
#print("h1 : " , bsObj.h1.get_text())
#print("div : " , bsObj.div.get_text())
# -*- coding: utf-8 -*-
# filename: naver_crawling.py
from bs4 import BeautifulSoup
from urllib.request import urlopen
# A Korean keyword such as "살충제달걀" must be percent-encoded before it can go into a URL.
from urllib.parse import quote

url1 = "https://search.naver.com/search.naver?ie=utf8&where=news"
url2 = "&query="
url3 = "&sm=tab_pge&sort=0&photo=0&field=0&reporter_article=&pd=0&ds=&de=&docid=&nso=so:r,p:all,a:all&mynews=0&cluster_rank=32&start=11&refresh_start=0"
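For example, quote() turns the Korean keyword into its percent-encoded UTF-8 form:

>>> from urllib.parse import quote
>>> quote("살충제달걀")
'%EC%82%B4%EC%B6%A9%EC%A0%9C%EB%8B%AC%EA%B1%80'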
def getUrl(target_url, page_num):
    # Rewrite the "start=" query parameter so page i requests results 10*i+1 onward.
    pos = target_url.index('start')
    #print(pos)
    url = target_url[0:pos+6]    # everything up to and including "start="
    #print(url)
    # '&' appears many times; find the first one after "start="
    pos2 = target_url.index("&", pos+6)
    #print(target_url[pos+6:pos2])
    temp = target_url[pos2:]     # the rest of the query string
    target_url = url + str(page_num*10+1)
    target_url = target_url + temp
    print(target_url)
    return target_url
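The slicing above works because "start" occurs exactly once in the URL, but it is fragile if the parameter order ever changes. A sturdier sketch of the same idea using only urllib.parse (my variant, not from the post):

from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def get_url_parsed(target_url, page_num):
    parts = urlsplit(target_url)
    query = parse_qs(parts.query)
    query["start"] = [str(page_num * 10 + 1)]   # Naver result pages step by 10
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))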
def get_naver_url(page_num, keyword, filename):
    target_url = url1 + url2 + str(quote(keyword)) + url3
    # Open the output file once up front; opening it with 'w' inside the page
    # loop would overwrite everything saved from earlier pages.
    output_file = open(filename, 'w', encoding="utf-8")
    # Rewrite the URL for each result page and scrape it.
    for i in range(int(page_num)):
        target_url = getUrl(target_url, i)
        doc = urlopen(target_url)
        soup = BeautifulSoup(doc, "html.parser")
        result = soup.find_all('ul', {"class": "type01"})
        if len(result) == 0:
            print("No articles found.")
            break
        li = result[0].select('li')
        for item in li:
            title_link = item.find_all('a', {"class": "_sp_each_url"})
            #print(title_link)
            if len(title_link) >= 1:
                article_url = title_link[0]['href']
                # Only follow links that lead to news.naver.com article pages.
                if article_url.count("http://news.naver") >= 1:
                    text = get_title_contents(article_url)
                    # save to file
                    print(text, file=output_file)
    output_file.close()
def get_title_contents(article_url):
    result = ""
    doc = urlopen(article_url)
    soup = BeautifulSoup(doc, "html.parser")
    #print(soup.prettify())
    # read only the headline
    titleList = soup.find_all('h3', {"id": "articleTitle"})
    if len(titleList) > 0:
        print("Title:", titleList[0].get_text())
        result = titleList[0].get_text()
    contentsList = soup.find_all('div', {"id": "articleBodyContents"})
    if len(contentsList) > 0:
        temp = contentsList[0].get_text()
        # Naver article bodies embed this Flash-workaround script text; the
        # Korean string below must stay as-is to match the page content.
        temp = temp.replace("// flash 오류를 우회하기 위한 함수 추가", "")
        temp = temp.replace("function _flash_removeCallback() {}", "")
        temp = temp.replace("\n", "")
        temp = temp.replace("\t", "")
        print("Content:", temp)
        result = result + temp
    return result
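Real crawls hit dead links and timeouts, so it can help to wrap the fetch in a small error guard. A hedged convenience wrapper (my addition, built only on urllib.error):

from urllib.error import HTTPError, URLError

def safe_get_title_contents(article_url):
    # Skip articles that fail to download instead of crashing the whole crawl.
    try:
        return get_title_contents(article_url)
    except (HTTPError, URLError) as e:
        print("skip:", article_url, e)
        return ""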
# main
keyword = input("Search keyword: ")
page_num = input("Number of pages: ")
filename = input("Output filename: ")
get_naver_url(page_num, keyword, filename)

# Read the saved file back to confirm what was crawled.
read_file = open(filename, 'r', encoding="utf-8")
text = read_file.read()
read_file.close()
print("------- file contents --------")
print(text)