Selenium 기초 및 활용 하기 6 - 국회의원 스크래핑

빅데이터/Selenium 2022. 1. 13. 21:46

유투버 '이수안컴퓨터연구소' 강의 참조

역시나 강의와 다르게 웹사이트가 변경 되었다

* 기본 세팅

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import pandas as pd
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException  #(클릭시 없을때, 엘리멘트 자체가 없을떄, 엘리멘트가 상호작용을 못할때 )
import os
import shutil 
from urllib.request import urlretrieve



# Chrome settings so the scrape can run without showing a browser window.
chrome_options = webdriver.ChromeOptions()
for _flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(_flag)

01 폴더 세팅

dir_name = './politicial'  # output directory handed to the scraper

# Start from an empty directory: drop whatever a previous run left behind.
if os.path.isdir(dir_name):
    shutil.rmtree(dir_name)

os.makedirs(dir_name)
print(f"{dir_name} 디렉토리 생성")

politician_df = scraping(dir_name)

02 스크래핑 함수

def scraping(dir_name):
    """First sketch of the list scraper: start Chrome and load each list page.

    This excerpt only sets up the driver and requests the paged list URL in a
    loop. As written here the loop has no exit condition and the function
    returns nothing.

    Args:
        dir_name: Output directory; not used anywhere in this excerpt.
    """
    page_no = 0
    politician_count = 0
    politician_df = pd.DataFrame()

    # NOTE(review): newer Selenium 4 releases no longer accept the driver path
    # as a positional argument -- confirm against the installed version.
    wd = webdriver.Chrome('chromedriver', options=chrome_options)
    # Open a second, blank tab next to the initial one; both handles are kept
    # in `tabs` (per the walkthrough, the extra tab is for detail pages).
    wd.execute_script('window.open("about:blank", "_blank");')
    tabs = wd.window_handles

    while True:
        page_no += 1
        # The list page is always loaded through the first window handle.
        wd.switch_to.window(tabs[0])

        # Paging is driven by the `page` query parameter of the list URL.
        url = f"https://www.bigkinds.or.kr/v2/depthAnalysis/assembly.do?page={page_no}"
        wd.get(url)

국회의원 목록 페이지에서 국회의원을 클릭하면 페이지가 바뀌는 형식이기 때문에, 상세 페이지는 새 탭에서 열도록 처리
(wd.execute_script() / wd.switch_to.window())
국회의원은 대략 300명이고, 한 페이지에 다 보이는 게 아니라 페이징이 달려 있는 사이트다. 즉 URL의 page 파라미터가 바뀌면 표시되는 국회의원도 바뀐다

02 - 2 기본 데이터 뽑아보기

1 - 국회의원 총 수 뽑기

# Text of the node addressed by this layout-specific XPath; per the walkthrough
# it holds the total number of assembly members. The value is kept as a string.
politician_total = wd.find_element(By.XPATH, '//*[@id="contents"]/section[1]/div/div/div[1]/div[2]').text

2 - 기본 정보 뽑기

'<li>' 태그로 각각의 국회의원 정보가 들어있으니 저 리스트를 반복문으로 가져오면 된다

1차 테스트

def scraping(dir_name, politician_max=5):
    """Print assembly-member names from the paged list until a cap is reached.

    Walks the list pages one by one (``page=1, 2, ...``), printing a running
    counter and the name of every ``<li>`` entry. Scraping stops when a page
    has no entries, when ``politician_max`` members have been printed, or when
    an expected element is missing.

    Args:
        dir_name: Output directory. Not used in this version of the function.
        politician_max: Stop once this many members have been processed. The
            check runs after a member is printed, so a non-empty first page
            always yields at least one member.

    Returns:
        The ``pandas.DataFrame`` created at the start. Nothing is added to it
        in this version, so it is returned empty.
    """
    page_no = 0
    politician_count = 0
    politician_df = pd.DataFrame()

    # The driver path is not passed positionally: older Selenium releases
    # already default to "chromedriver" on PATH, and newer ones reject the
    # positional argument.
    wd = webdriver.Chrome(options=chrome_options)
    try:
        # Second, blank tab alongside the list tab.
        wd.execute_script('window.open("about:blank", "_blank");')
        tabs = wd.window_handles

        while True:
            try:
                page_no += 1
                wd.switch_to.window(tabs[0])

                url = f"https://www.bigkinds.or.kr/v2/depthAnalysis/assembly.do?page={page_no}"
                wd.get(url)

                politician_total = wd.find_element(By.XPATH, '//*[@id="contents"]/section[1]/div/div/div[1]/div[2]').text
                politician_items = wd.find_elements(By.CSS_SELECTOR, '#contents > section.spacial-person.spacial-page > div > ul > li')

                # A page without entries means we ran past the last page.
                if not politician_items:
                    break

                for item in politician_items:
                    # Make sure the list tab is the active one before reading from it.
                    wd.switch_to.window(tabs[0])

                    politician_count += 1

                    print('--------------------------------------')
                    print(f"[국회의원 {politician_count}/{politician_total}명]" )
                    name = item.find_element(By.CLASS_NAME, 'sp-kname').text
                    # name = name[:3]  # handy if the name ever needs truncating
                    print(name)

                    if politician_count >= politician_max:
                        break

                # The inner break only leaves the page loop; stop paging as well.
                if politician_count >= politician_max:
                    break
            except (AttributeError, NoSuchElementException) as e:
                # Best effort: a missing element ends the scrape instead of crashing.
                print(e)
                break
    finally:
        # quit() ends the whole session (both tabs and the driver process);
        # close() would only close the currently focused tab.
        wd.quit()

    return politician_df

03 국회의원 정보 뽑기

이제 각각의 국회의원을 클릭을 하여 들어가서 정보를 뽑을 예정.

# Read the detail-page URL from the entry's <a> element.
detail_link = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
# Load the detail page through the second window handle; per the walkthrough
# this keeps the list page in tabs[0] untouched.
wd.switch_to.window(tabs[1])
wd.get(detail_link)

링크를 가져온 뒤 새로운 탭을 열어서 그 탭에서 다시 정보를 뽑자

* 셀레니엄 자주 쓰는 함수 정리 하고 가자

execute_script() - JavaScript 코드를 실행할 수 있게 해 준다. '( )' 안의 자바스크립트 코드를 실행.
→ 'window.open("about:blank", "_blank");' 은 웹브라우저에서 새 창(탭)을 열기 위한 JS 코드(window.open()).
새 창을 열 때 여러 속성을 지정할 수 있고, 그중 '_blank'는 새 창으로 여는 가장 기본 값
tabs = wd.window_handles - 현재 열려 있는 창(탭)들의 핸들 목록을 가져오는 코드
wd.switch_to.window(tabs[0]) - 지정한 탭으로 이동. 즉 'tabs[0]'에 해당하는 탭으로 이동

* 참고 링크 (https://gorokke.tistory.com/8)

- 이미지 뽑기

print("[프로필 이미지 다운로드]")
# Source URL of the <img> inside the element with class 'thumb'.
image_src = (
    wd.find_element(By.CLASS_NAME, 'thumb')
    .find_element(By.TAG_NAME, 'img')
    .get_attribute('src')
)
# The image is saved as <dir_name>/<name>.jpg.
file_name = f"{dir_name}/{name}.jpg"
print(file_name)
urlretrieve(image_src, file_name)

urlretrieve() - 원하는 대상을 다운로드 하는데 'urlretrieve()' 함수를 쓸 경우 정확한 경로가 필요하다. urlretrieve(다운을 원하는 대상, 다운시에 저장 이름)

- 기본 정보 뽑기

print("[기본 정보 스크래핑]")
politician_dic = get_politician_info(dir_name, wd)
print("    ",politician_dic)
# DataFrame.append was removed in pandas 2.0; wrap the dict in a one-row
# frame and concatenate to add it as a new row.
politician_df = pd.concat(
    [politician_df, pd.DataFrame([politician_dic])],
    ignore_index=True,
)


def get_politician_info(dir_name, wd):
    """Return the profile section's <dt>/<dd> texts as a label -> value dict.

    Args:
        dir_name: Accepted for a uniform call signature; not used here.
        wd: WebDriver whose current page contains the profile section.

    Returns:
        dict pairing each <dt> text with the <dd> text at the same position.
        Pairing stops at the shorter of the two lists, and a repeated label
        keeps its last value.
    """
    section = wd.find_element(
        By.CSS_SELECTOR,
        '#contents > div.sp-person.contents > section.person-info.spacial-page.cl',
    )

    labels = [dt.text for dt in section.find_elements(By.TAG_NAME, 'dt')]
    values = [dd.text for dd in section.find_elements(By.TAG_NAME, 'dd')]

    return dict(zip(labels, values))

기본 정보 스크래핑은 함수로 처리하도록 세팅
dir_name / wd 를 매개변수로 넘김
기본 정보가 들어 있는 태그를 가져오고, 그 안에서 <dt>/<dd> 태그로 나뉘기 때문에 각각 찾아서 변수에 담고 텍스트만 뽑음
딕셔너리 컴프리헨션은 다음 링크 참조(https://wikidocs.net/92540)

04 인용문 들고 오기

print("[뉴스 인용문 스크래핑]")
# Click the link inside the matching tab <li>. click() returns None, so `tab`
# ends up as None rather than holding the element.
tab = wd.find_element(By.CSS_SELECTOR, 'li.analysisTab-01.ui-state-active').find_element(By.TAG_NAME, 'a').click()
# Fixed pause before reading the quote list (presumably to let the tab content load).
time.sleep(1)
get_news_quote(dir_name, name, wd)

* 페이징

def get_news_quote(dir_name, name, wd, news_max=None):
    """Save the quoted-news titles of one member to a text file, page by page.

    Titles are written one per line to ``<dir_name>/<name>뉴스 인용문.txt``
    (UTF-8). Each page is scraped first; only then is the "next" link checked,
    so the last page is still collected before the loop ends.

    Args:
        dir_name: Directory the text file is written into.
        name: Member name, used as the prefix of the file name.
        wd: WebDriver whose current page shows the member's quote list.
        news_max: Optional cap on the number of titles to collect. ``None``
            (the default) collects every page.
    """
    page_no = 0
    news_count = 0
    # The counter text is trimmed by one leading and two trailing characters
    # (presumably decoration around the number -- verify against the page).
    news_total = wd.find_element(By.ID, 'newsInQoutTotalCount').text[1:-2]

    # Explicit UTF-8 so the Korean text does not depend on the locale default;
    # the context manager closes the file on every exit path.
    with open(dir_name + '/' + name + '뉴스 인용문.txt', 'w', encoding='utf-8') as news_file:
        while True:
            try:
                page_no += 1
                print(f"---------(뉴스 인용문 {page_no} 페이지)----------")
                time.sleep(1)

                box_list = wd.find_element(By.ID, 'newsInQuotList')
                quotes = box_list.find_elements(By.CLASS_NAME, 'title')

                time.sleep(1)

                for quote in quotes:
                    # Read the text once; every .text access is a driver round trip.
                    text = quote.text
                    news_file.write(text + '\n')
                    news_count += 1
                    print(f"{news_count}/{news_total}", [text])
                    print()
                    if news_max is not None and news_count >= news_max:
                        return

                paging = wd.find_element(By.ID, 'newsInQuotListPaging')
                next_button = paging.find_element(By.CSS_SELECTOR, '#newsInQuotListPaging > a.page-next.page-link')
                # On the last page the "next" link carries no onclick handler.
                if next_button.get_attribute('onclick') is None:
                    break

                next_button.click()
                time.sleep(1)

            except Exception as e:
                # Deliberate best effort: any failure (e.g. a missing pager)
                # ends the pagination but keeps what was written so far.
                print(e)
                break

여기서 많이 헤맸다. 웹사이트가 바뀌었지만 나름 알아서 잘 들고 왔고, 마지막 for문 페이징에서 막혔다
일단 한 번 긁고, 그다음 마지막 페이지인지 먼저 확인해야 한다. 마지막 페이지가 아니면 다음 페이지를 클릭하고, 다시 긁는 것을 반복한다. 그러다가 마지막 페이지에 오면 한 번 긁고 마지막인지 확인한 뒤 BREAK가 걸림

저작자표시

'빅데이터 > Selenium' 카테고리의 다른 글

Selenium 기초 및 활용 하기 7 - 인스타 그램 이미지 크롤링 (0)	2022.01.14
Selenium 기초 및 활용 하기 6 - 2 - 국회의원 스크래핑 (0)	2022.01.14
Selenium 기초 및 활용 하기 5-2 - 네이버 뉴스 댓글 스크래핑 (0)	2022.01.12
Selenium 기초 및 활용 하기 5 - 네이버 뉴스 댓글 스크래핑 (0)	2022.01.12
Selenium 기초 및 활용 하기 04-2 - 구글 이미지 스크래핑 (0)	2022.01.12

ABOUT ME

Treasure Treasure

유투버 '이수안컴퓨터연구소' 강의 참조

'빅데이터 > Selenium' 카테고리의 다른 글

티스토리툴바

ABOUT ME

유투버 '이수안컴퓨터연구소' 강의 참조

'빅데이터 > Selenium' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바