
Python - Crawling Practice ① Collecting "Big Data" Job Conditions from Saramin

해피밀세트 2020. 4. 1. 19:49

 

 

1. Building a word cloud of job conditions

# Imports needed for this section
import time
import re
from selenium import webdriver
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

browser = webdriver.Chrome('C:/chromedriver.exe')
browser.get("http://www.saramin.co.kr/zf_user/search/recruit?search_area=main&search_done=y&search_optional_item=n&searchType=search&searchword=%EB%B9%85%EB%8D%B0%EC%9D%B4%ED%84%B0&recruitPage=1")
time.sleep(3)

soup = BeautifulSoup(browser.page_source, 'html.parser')

job = []
for i in soup.findAll('div', class_='item_recruit'):
    job.append(i.find('div', class_='job_condition').text.strip())

# The index ranges below follow how the pager links were laid out at crawl
# time; the page source must be re-parsed after every click, otherwise the
# first page would be collected over and over.
for i in range(2, 11):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('div', class_='job_condition').text.strip())

for i in range(2, 12):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('div', class_='job_condition').text.strip())

for i in range(3, 6):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('div', class_='job_condition').text.strip())
        
# Convert the list into one string, replacing '·' and '↑' with spaces
txt = ''
for i in job:
    txt = re.sub('[·↑]', ' ', i) + ' ' + txt
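The loop above builds the string back to front; an equivalent and more idiomatic version (same cleaning, same separator) joins the cleaned items in one pass:

# Same result as the loop above, built front to back
txt = ' '.join(re.sub('[·↑]', ' ', i) for i in job)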


# Build the word cloud
wordcloud = WordCloud(font_path='C:/Windows/Fonts/malgun.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=1500, height=1200).generate(txt)
plt.figure(figsize=(20, 20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

browser.quit()   
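Since the search URL already carries a recruitPage query parameter, the click-based paging above can also be replaced by loading each page URL directly. A minimal sketch, assuming recruitPage selects pages the way the URL suggests; base_url is the search URL without its trailing &recruitPage=1, and the page count is something you would adjust to the actual result set:

def crawl_conditions(browser, base_url, pages):
    # Visit recruitPage=1..pages directly and collect each job_condition text
    results = []
    for page in range(1, pages + 1):
        browser.get(base_url + '&recruitPage={}'.format(page))
        time.sleep(3)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        for item in soup.findAll('div', class_='item_recruit'):
            results.append(item.find('div', class_='job_condition').text.strip())
    return results

This sidesteps the pager-layout bookkeeping entirely, since no link indices need to be clicked.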

 

 

2. Building a DataFrame

# Additional import for this section
from pandas import DataFrame

browser = webdriver.Chrome('C:/chromedriver.exe')
browser.get("http://www.saramin.co.kr/zf_user/search/recruit?search_area=main&search_done=y&search_optional_item=n&searchType=search&searchword=%EB%B9%85%EB%8D%B0%EC%9D%B4%ED%84%B0&recruitPage=1")
time.sleep(3)

soup = BeautifulSoup(browser.page_source, 'html.parser')

# Columns: 공고명 = posting title, 회사명 = company name, 근무지역 = work location, 채용조건 = job conditions
df = DataFrame(columns=['공고명','회사명','근무지역','채용조건'])

for j in soup.findAll('div', class_='item_recruit'):
    job_name = j.find('h2', class_='job_tit').text.strip()
    company = j.find('strong', class_='corp_name').text
    working_area = j.find('div', class_='job_condition').find('a').text.strip()
    job_info = j.find('div', class_='job_condition')
    [s.extract() for s in job_info('a')]   # drop the <a> tags (location links) so only the remaining conditions stay
    job_info = job_info.text.strip()
    df = df.append({'공고명': job_name, '회사명': company, '근무지역': working_area, '채용조건': job_info}, ignore_index=True)


for i in range(2, 11):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')   # re-parse after each page load
    for j in soup.findAll('div', class_='item_recruit'):
        job_name = j.find('h2', class_='job_tit').text.strip()
        company = j.find('strong', class_='corp_name').text
        working_area = j.find('div', class_='job_condition').find('a').text.strip()
        job_info = j.find('div', class_='job_condition')
        [s.extract() for s in job_info('a')]
        job_info = job_info.text.strip()
        df = df.append({'공고명': job_name, '회사명': company, '근무지역': working_area, '채용조건': job_info}, ignore_index=True)

for i in range(2, 12):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job_name = j.find('h2', class_='job_tit').text.strip()
        company = j.find('strong', class_='corp_name').text
        working_area = j.find('div', class_='job_condition').find('a').text.strip()
        job_info = j.find('div', class_='job_condition')
        [s.extract() for s in job_info('a')]
        job_info = job_info.text.strip()
        df = df.append({'공고명': job_name, '회사명': company, '근무지역': working_area, '채용조건': job_info}, ignore_index=True)

for i in range(3, 6):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job_name = j.find('h2', class_='job_tit').text.strip()
        company = j.find('strong', class_='corp_name').text
        working_area = j.find('div', class_='job_condition').find('a').text.strip()
        job_info = j.find('div', class_='job_condition')
        [s.extract() for s in job_info('a')]
        job_info = job_info.text.strip()
        df = df.append({'공고명': job_name, '회사명': company, '근무지역': working_area, '채용조건': job_info}, ignore_index=True)

browser.quit()        
        
df
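One caveat: DataFrame.append copies the whole frame on every call and was removed in pandas 2.0. A forward-compatible sketch of the same first-page collection, building the frame once from a list of row dicts:

rows = []
for j in soup.findAll('div', class_='item_recruit'):
    cond = j.find('div', class_='job_condition')
    area = cond.find('a').text.strip()
    [s.extract() for s in cond('a')]           # strip the location links
    rows.append({'공고명': j.find('h2', class_='job_tit').text.strip(),
                 '회사명': j.find('strong', class_='corp_name').text,
                 '근무지역': area,
                 '채용조건': cond.text.strip()})
df = DataFrame(rows)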

 

 

3. Writing an Excel file (.xlsx)

# Additional import for this section
import xlsxwriter

# Create the workbook object (nothing is written to disk yet)
workbook = xlsxwriter.Workbook("C:/data/job_bigdata.xlsx")
# Create a worksheet
worksheet = workbook.add_worksheet()


browser = webdriver.Chrome('C:/chromedriver.exe')
browser.get("http://www.saramin.co.kr/zf_user/search/recruit?search_area=main&search_done=y&search_optional_item=n&searchType=search&searchword=%EB%B9%85%EB%8D%B0%EC%9D%B4%ED%84%B0&recruitPage=1")
time.sleep(3)

soup = BeautifulSoup(browser.page_source,'html.parser')

df = DataFrame(columns=['공고명','회사명','근무지역','채용조건'])

row_cnt = 1 # current Excel row (A1-style row numbers start at 1)

for j in soup.findAll('div',class_='item_recruit'):
    job_name = j.find('h2',class_='job_tit').text.strip()
    company = j.find('strong',class_='corp_name').text
    working_area = j.find('div',class_='job_condition').find('a').text.strip()
    job_info = j.find('div',class_='job_condition')
    [s.extract() for s in job_info('a')]
    job_info = job_info.text.strip()
    df = df.append({'공고명':job_name,'회사명':company,'근무지역':working_area,'채용조건':job_info}, ignore_index = True)
    worksheet.write('A{}'.format(row_cnt),job_name)
    worksheet.write('B{}'.format(row_cnt),company)
    worksheet.write('C{}'.format(row_cnt),working_area)
    worksheet.write('D{}'.format(row_cnt),job_info)
    row_cnt += 1


for i in range(2, 11):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')   # re-parse after each page load
    for j in soup.findAll('div', class_='item_recruit'):
        job_name = j.find('h2', class_='job_tit').text.strip()
        company = j.find('strong', class_='corp_name').text
        working_area = j.find('div', class_='job_condition').find('a').text.strip()
        job_info = j.find('div', class_='job_condition')
        [s.extract() for s in job_info('a')]
        job_info = job_info.text.strip()
        df = df.append({'공고명': job_name, '회사명': company, '근무지역': working_area, '채용조건': job_info}, ignore_index=True)
        worksheet.write('A{}'.format(row_cnt), job_name)
        worksheet.write('B{}'.format(row_cnt), company)
        worksheet.write('C{}'.format(row_cnt), working_area)
        worksheet.write('D{}'.format(row_cnt), job_info)
        row_cnt += 1

for i in range(2, 12):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job_name = j.find('h2', class_='job_tit').text.strip()
        company = j.find('strong', class_='corp_name').text
        working_area = j.find('div', class_='job_condition').find('a').text.strip()
        job_info = j.find('div', class_='job_condition')
        [s.extract() for s in job_info('a')]
        job_info = job_info.text.strip()
        df = df.append({'공고명': job_name, '회사명': company, '근무지역': working_area, '채용조건': job_info}, ignore_index=True)
        worksheet.write('A{}'.format(row_cnt), job_name)
        worksheet.write('B{}'.format(row_cnt), company)
        worksheet.write('C{}'.format(row_cnt), working_area)
        worksheet.write('D{}'.format(row_cnt), job_info)
        row_cnt += 1

for i in range(3, 6):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job_name = j.find('h2', class_='job_tit').text.strip()
        company = j.find('strong', class_='corp_name').text
        working_area = j.find('div', class_='job_condition').find('a').text.strip()
        job_info = j.find('div', class_='job_condition')
        [s.extract() for s in job_info('a')]
        job_info = job_info.text.strip()
        df = df.append({'공고명': job_name, '회사명': company, '근무지역': working_area, '채용조건': job_info}, ignore_index=True)
        worksheet.write('A{}'.format(row_cnt), job_name)
        worksheet.write('B{}'.format(row_cnt), company)
        worksheet.write('C{}'.format(row_cnt), working_area)
        worksheet.write('D{}'.format(row_cnt), job_info)
        row_cnt += 1

browser.quit()           

# Close the workbook / the actual file is written to disk at this point
workbook.close()
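Since the same rows already end up in df, pandas can also write the workbook in a single call, skipping the manual worksheet.write() bookkeeping (this needs an Excel engine such as xlsxwriter or openpyxl installed):

# Equivalent output from the DataFrame built above
df.to_excel('C:/data/job_bigdata.xlsx', index=False)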

 

 

 

4. Building a word cloud of posting titles (+ stopword handling)

# Open the URL and load the data

browser = webdriver.Chrome('C:/chromedriver.exe')
browser.get("http://www.saramin.co.kr/zf_user/search/recruit?search_area=main&search_done=y&search_optional_item=n&searchType=search&searchword=%EB%B9%85%EB%8D%B0%EC%9D%B4%ED%84%B0&recruitPage=1")
time.sleep(3)

soup = BeautifulSoup(browser.page_source,'html.parser')

job = []
for i in soup.findAll('div', class_='item_recruit'):
    job.append(i.find('h2', class_='job_tit').text.strip().split(' '))

for i in range(2, 11):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')   # re-parse after each page load
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('h2', class_='job_tit').text.strip().split(' '))

for i in range(2, 12):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('h2', class_='job_tit').text.strip().split(' '))

for i in range(3, 6):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('h2', class_='job_tit').text.strip().split(' '))
        
browser.quit()

 

# Text cleaning

txt = ''
for i in job:
    for j in i:
        txt = re.sub('[·↑,.\[\]()&/-]', ' ', j) + ' ' + txt
         

 

# Per-word frequency counts / stopword filtering

from konlpy.tag import Okt
import nltk

okt = Okt()
text_list = okt.nouns(txt)   # extract nouns only
stopword = ['및', '외']       # '및' (and) and '외' (etc.) carry no meaning here
ko = [i for i in text_list if (i not in stopword) and (len(i) >= 2)]  # also drop one-character tokens
ko = nltk.Text(ko)
data = ko.vocab()            # frequency distribution of the remaining nouns
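Before plotting, it is worth peeking at the distribution to confirm the stopword filtering did what was intended; vocab() returns an nltk FreqDist, which supports Counter-style queries:

# Ten most frequent nouns after filtering
print(data.most_common(10))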

 

 

# Build the word cloud

wordcloud = WordCloud(font_path='C:/Windows/Fonts/malgun.ttf',
                      background_color='white',
                      width=1000, height=800).generate_from_frequencies(dict(data))
plt.figure(figsize=(20,20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

5. Building a word cloud of job information (+ stopword handling)

# Open the URL and load the data

browser = webdriver.Chrome('C:/chromedriver.exe')
browser.get("http://www.saramin.co.kr/zf_user/search/recruit?search_area=main&search_done=y&search_optional_item=n&searchType=search&searchword=%EB%B9%85%EB%8D%B0%EC%9D%B4%ED%84%B0&recruitPage=1")
time.sleep(3)

soup = BeautifulSoup(browser.page_source,'html.parser')

job = []
for i in soup.findAll('div', class_='item_recruit'):
    job.append(i.find('div', class_='job_condition').text.strip())

for i in range(2, 11):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')   # re-parse after each page load
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('div', class_='job_condition').text.strip())

for i in range(2, 12):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('div', class_='job_condition').text.strip())

for i in range(3, 6):
    browser.find_element_by_xpath('//*[@id="recruit_info_list"]/div[2]/div/a[{}]'.format(i)).click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for j in soup.findAll('div', class_='item_recruit'):
        job.append(j.find('div', class_='job_condition').text.strip())
        
browser.quit()

 

 

# Text cleaning

txt = ''
for i in job:
    txt = re.sub('[·↑]', '', i) + ' ' + txt

 

# Per-word frequency counts / stopword filtering

from konlpy.tag import Hannanum
import nltk

hannanum = Hannanum()
text_list = hannanum.nouns(txt)
stopword = ['만원']   # '만원' (the 10,000-won salary unit) dominates the counts without adding meaning
ko = [i for i in text_list if (i not in stopword) and (len(i) >= 2)]
ko = nltk.Text(ko)
data = ko.vocab()
data['벨기에'] = data.pop('벨기')   # Hannanum truncated '벨기에' (Belgium) to '벨기'; restore the full word
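That manual patch raises a KeyError whenever '벨기' happens not to appear in a later crawl; a guarded variant of the same idea would be:

# Merge the truncated token into the full word only if it actually occurred
if '벨기' in data:
    data['벨기에'] = data.get('벨기에', 0) + data.pop('벨기')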

 

# Build the word cloud

wordcloud = WordCloud(font_path='C:/Windows/Fonts/malgun.ttf',
                      background_color='white',
                      width=1000, height=800).generate_from_frequencies(dict(data))
plt.figure(figsize=(20,20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

 

 
