# Visit the national petition "best" listing and collect petition URL paths.
# Compiled once: the same pattern is tested against every link on every page.
# Raw string so that `\?` reaches the regex engine as an escaped question mark
# instead of being an invalid string-literal escape.
_PETITION_LINK = re.compile(r'/[a-z].*/[0-9].*\?navigation=best')

url = []
for page in range(1, 21):
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen("https://www1.president.go.kr/petitions/best?page={}".format(page)) as response:
        soup = BeautifulSoup(response, 'html.parser')
    for body in soup.find_all('div', {'class': 'bl_body'}):
        for subject in body.find_all('div', {'class': 'bl_subject'}):
            anchor = subject.find('a')
            # Skip subject cells without a link rather than crashing on None.
            href = anchor.get('href') if anchor is not None else None
            if href and _PETITION_LINK.match(href):
                url.append(href)
# Collect the title of every petition whose path was gathered in `url`.
title = []
for path in url:
    # The collected paths start with '/', and the base already ends with one;
    # strip the leading slash so the request does not go to '//petitions/...'.
    with urlopen('https://www1.president.go.kr/' + path.lstrip('/')) as response:
        soup = BeautifulSoup(response, "html.parser")
    title.extend(
        heading.text
        for heading in soup.find_all('h3', {'class': 'petitionsView_title'})
    )
# Text cleanup: replace punctuation with spaces and merge all titles into one
# string. Each title is prefixed with a single space, so the result starts
# with a space whenever `title` is non-empty (and is '' when it is empty).
# The character class matches: < > ( ) ! , / . and a literal '|'.
txt = ''.join(' ' + re.sub(r'[<>()!,/.|]', ' ', heading) for heading in title)
# Count per-word frequencies / remove stopwords
from konlpy.tag import Hannanum

hannanum = Hannanum()
# Run the Hannanum analyzer's noun extraction over the merged title text.
text_list = hannanum.nouns(txt)
# Tokens dropped before counting.
stopword = ['및', '등']
# Keep only non-stopword tokens and wrap them in an nltk.Text.
ko = nltk.Text(list(filter(lambda noun: noun not in stopword, text_list)))
# Vocabulary of the text; converted to a dict of frequencies downstream.
data = ko.vocab()
# Build the word cloud from the word frequencies and display it.
wordcloud = WordCloud(
    # Raw string: '\w' is not a valid string escape. The path value itself is
    # unchanged. NOTE(review): Windows-only font path; presumably chosen so
    # Hangul glyphs render - adjust on other platforms.
    font_path=r'C:\windows/fonts/malgun.ttf',
    background_color='white',
    width=1000,
    height=800,
).generate_from_frequencies(dict(data))

plt.figure(figsize=(20, 20))
plt.imshow(wordcloud)
plt.axis("off")  # hide the axes; only the image is of interest
plt.show()