Python - HTML Parsing
체크개발자
2017. 12. 16. 17:17
# Word-cloud demo 1: count nouns with KoNLPy, then draw the cloud with pytagcloud.
from collections import Counter
from konlpy.tag import Twitter
import pytagcloud

f = open('blog_data1.txt', encoding='utf-8')
data = f.read()
f.close()

nlp = Twitter()                    # KoNLPy's Twitter (now Okt) morpheme analyzer
nouns = nlp.nouns(data)            # extract only the nouns
count = Counter(nouns)
tags2 = count.most_common(100)     # top 100 (word, frequency) pairs

taglist = pytagcloud.make_tags(tags2, maxsize=200)
pytagcloud.create_tag_image(taglist, 'wordcloud.png', size=(900, 600),
                            fontname='Korean1', rectangular=False)
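pytagcloud only draws with fonts registered in its fonts.json, so the 'Korean1' name above fails until a Korean font is added there. A minimal one-time setup sketch, assuming you have already copied a Korean TTF (here malgun.ttf, my choice) into pytagcloud's fonts folder by hand:

import os, json
import pytagcloud

# Locate pytagcloud's bundled fonts folder and its fonts.json registry.
fonts_dir = os.path.join(os.path.dirname(pytagcloud.__file__), 'fonts')
fonts_json = os.path.join(fonts_dir, 'fonts.json')

with open(fonts_json, encoding='utf-8') as fp:
    fonts = json.load(fp)

# Register the name used by fontname='Korean1' above.
# Assumes malgun.ttf was already copied into fonts_dir.
fonts.append({'name': 'Korean1', 'ttf': 'malgun.ttf', 'web': 'none'})

with open(fonts_json, 'w', encoding='utf-8') as fp:
    json.dump(fonts, fp, ensure_ascii=False, indent=2)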
# HTML parsing demo: fetch a Daum news article and pull the text from its body <div>.
from urllib.request import urlopen
from bs4 import BeautifulSoup

#html = urlopen("https://www.naver.com")
#html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
#url = "http://www.pythonscraping.com"
#path = "/pages/warandpeace.html"
url = "http://v.media.daum.net/v/20171216090219353"
path = ""

html = urlopen(url + path)
s = html.read()
#print(s)
bsObj = BeautifulSoup(s, 'html.parser')
#print(bsObj.prettify())

data = ""
tag = bsObj.find_all("div", {"class": "hcg_media_pc_mArticle"})
for item in tag:
    data = data + " " + item.get_text()
    #print("div data : ", item.get_text())
print(data)
# Word-cloud demo 2: the wordcloud package, shaped by an image mask.
from wordcloud import WordCloud, STOPWORDS
import webbrowser
import numpy as np
from PIL import Image

stopwords = set(STOPWORDS)
stopwords.add("힘차게")    # exclude this Korean word ("vigorously") from the cloud

# White pixels in the mask stay empty; the rest becomes drawable area.
pic_mask = np.array(Image.open("stormtrooper_mask.png"))

wc = WordCloud(
    font_path="c:/windows/fonts/malgun.ttf",   # a Korean font (Malgun Gothic)
    background_color='white',
    max_words=2000,
    mask=pic_mask,
    stopwords=stopwords,   # the original built this set but never passed it in
)
wc.generate(data)          # 'data' is the Daum article text from above
wc.to_file("cloudchart1.png")
webbrowser.open("cloudchart1.png")
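As an optional extra (not in the original post), the wordcloud package can recolor the finished cloud from the mask image's own pixels with ImageColorGenerator; note it needs an RGB(A) mask, not a grayscale one:

from wordcloud import ImageColorGenerator

image_colors = ImageColorGenerator(pic_mask)
wc.recolor(color_func=image_colors).to_file("cloudchart1_colored.png")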
#print("title : " , bsObj.title)
#print("h1 : " , bsObj.h1)
#print("div : " , bsObj.div)
#print("title : " , bsObj.title.get_text())
#print("h1 : " , bsObj.h1.get_text())
#print("div : " , bsObj.div.get_text())
# -*- coding: utf-8 -*-
# filename: naver_crawling.py
from bs4 import BeautifulSoup
from urllib.request import urlopen
# A Korean keyword such as "살충제달걀" must be percent-encoded before it can go into a URL.
from urllib.parse import quote

url1 = "https://search.naver.com/search.naver?ie=utf8&where=news"
url2 = "&query="
url3 = "&sm=tab_pge&sort=0&photo=0&field=0&reporter_article=&pd=0&ds=&de=&docid=&nso=so:r,p:all,a:all&mynews=0&cluster_rank=32&start=11&refresh_start=0"
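For example, quote() turns the Korean keyword into its percent-encoded UTF-8 form:

>>> from urllib.parse import quote
>>> quote("살충제달걀")
'%EC%82%B4%EC%B6%A9%EC%A0%9C%EB%8B%AC%EA%B1%80'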
def getUrl(target_url, page_num):
    # Rewrite the "start=" query parameter so page i requests results 10*i+1 onward.
    pos = target_url.index('start')
    #print(pos)
    url = target_url[0:pos+6]    # everything up to and including "start="
    #print(url)
    # '&' appears many times; find the first one after "start="
    pos2 = target_url.index("&", pos+6)
    #print(target_url[pos+6:pos2])
    temp = target_url[pos2:]     # the rest of the query string
    target_url = url + str(page_num*10+1)
    target_url = target_url + temp
    print(target_url)
    return target_url
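The slicing above works because "start" occurs exactly once in the URL, but it is fragile if the parameter order ever changes. A sturdier sketch of the same idea using only urllib.parse (my variant, not from the post):

from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def get_url_parsed(target_url, page_num):
    parts = urlsplit(target_url)
    query = parse_qs(parts.query)
    query["start"] = [str(page_num * 10 + 1)]   # Naver result pages step by 10
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))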
def get_naver_url(page_num, keyword, filename):
    target_url = url1 + url2 + str(quote(keyword)) + url3
    # Open the output file once up front; opening it with 'w' inside the page
    # loop would overwrite everything saved from earlier pages.
    output_file = open(filename, 'w', encoding="utf-8")
    # Rewrite the URL for each result page and scrape it.
    for i in range(int(page_num)):
        target_url = getUrl(target_url, i)
        doc = urlopen(target_url)
        soup = BeautifulSoup(doc, "html.parser")
        result = soup.find_all('ul', {"class": "type01"})
        if len(result) == 0:
            print("No articles found.")
            break
        li = result[0].select('li')
        for item in li:
            title_link = item.find_all('a', {"class": "_sp_each_url"})
            #print(title_link)
            if len(title_link) >= 1:
                article_url = title_link[0]['href']
                # Only follow links that lead to news.naver.com article pages.
                if article_url.count("http://news.naver") >= 1:
                    text = get_title_contents(article_url)
                    # save to file
                    print(text, file=output_file)
    output_file.close()
def get_title_contents(article_url):
    result = ""
    doc = urlopen(article_url)
    soup = BeautifulSoup(doc, "html.parser")
    #print(soup.prettify())
    # read only the headline
    titleList = soup.find_all('h3', {"id": "articleTitle"})
    if len(titleList) > 0:
        print("Title:", titleList[0].get_text())
        result = titleList[0].get_text()
    contentsList = soup.find_all('div', {"id": "articleBodyContents"})
    if len(contentsList) > 0:
        temp = contentsList[0].get_text()
        # Naver article bodies embed this Flash-workaround script text; the
        # Korean string below must stay as-is to match the page content.
        temp = temp.replace("// flash 오류를 우회하기 위한 함수 추가", "")
        temp = temp.replace("function _flash_removeCallback() {}", "")
        temp = temp.replace("\n", "")
        temp = temp.replace("\t", "")
        print("Content:", temp)
        result = result + temp
    return result
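Real crawls hit dead links and timeouts, so it can help to wrap the fetch in a small error guard. A hedged convenience wrapper (my addition, built only on urllib.error):

from urllib.error import HTTPError, URLError

def safe_get_title_contents(article_url):
    # Skip articles that fail to download instead of crashing the whole crawl.
    try:
        return get_title_contents(article_url)
    except (HTTPError, URLError) as e:
        print("skip:", article_url, e)
        return ""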
# main
keyword = input("Search keyword: ")
page_num = input("Number of pages: ")
filename = input("Output filename: ")
get_naver_url(page_num, keyword, filename)

# Read the saved file back to confirm what was crawled.
read_file = open(filename, 'r', encoding="utf-8")
text = read_file.read()
read_file.close()
print("------- file contents --------")
print(text)