Selenium 기초 및 활용 하기 04-2 - 구글 이미지 스크래핑

빅데이터/Selenium

Selenium 기초 및 활용 하기 04-2 - 구글 이미지 스크래핑

H-V 2022. 1. 12. 12:56

유투버 '이수안컴퓨터연구소' 강의 참조

현재까지 스크롤 내리는 함수, 썸네일 이미지 선택후 이미지 원본 저장 함수, 구글 이미지를 검색 및 예외 처리 함수 까지 세팅을 끝냈다.

01 이미지 퀄리티 세팅

어떤 이미지 원본은 원본을 받더라도 해상도가 떨어지는 경우가 있다. 그래서 모든 해상도 퀄리티를 맞춰줄 필요가 있다.

# Delete every image in dir_name whose width and height are both under 400 px
# (files that cannot be opened as images are deleted as well).
filter_and_remove(dir_name, query, 400)

def filter_and_remove(dir_name, query, filter_size):
    """Delete low-resolution or unreadable image files from ``dir_name``.

    A file is deleted when both its width and its height are smaller than
    ``filter_size``, or when it cannot be opened as an image at all.

    Args:
        dir_name: Directory that holds the downloaded images.
        query: Search keyword. Not used by this function; kept so existing
            callers keep working.
        filter_size: Minimum size in pixels. An image survives when at least
            one of its dimensions reaches this value.

    Reads the module-level ``scraped_count`` for the summary line printed at
    the end, so that name must be defined before this function is called.
    """
    filtered_count = 0
    for index, file_name in enumerate(os.listdir(dir_name)):
        file_path = os.path.join(dir_name, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as images and os.remove()
            # cannot delete them, so leave them untouched.
            continue

        try:
            # The context manager releases the file handle for every image,
            # including the ones that are kept. It is also closed before
            # os.remove() runs, which matters on platforms that refuse to
            # delete an open file.
            with Image.open(file_path) as img:
                too_small = img.width < filter_size and img.height < filter_size
        except OSError as e:
            # Unreadable or corrupted file: report it and delete it.
            print(e)
            os.remove(file_path)
            filtered_count += 1
            continue

        if too_small:
            os.remove(file_path)
            print(f"{index} 이미지 제거")
            filtered_count += 1
    print(f"[이미지 제거 개수: {filtered_count}/{scraped_count}]")

02 1차 테스트

* 1차 테스트 전체 코드

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import pandas as pd
#셀레니엄 예외처리용
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException #(클릭시 없을때, 엘리멘트 자체가 없을떄, 엘리멘트가 상호작용을 못할때 )
import os #이미지 파일 다운로드 제어 용
import socket #소켓 에러 방지용
from urllib.request import urlretrieve #이미지 다운로드 라이브러리
from urllib.error import HTTPError, URLError #각종 에러 방지 라이브러리 (HTTP/URL 에러 방지)
from PIL import Image # 이미지를 사용가능하도록 처리 하는 라이브러리

# Chrome options for running the scraper without opening a visible browser window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

def scroll_down():
    """Scroll the current page to the bottom until no more results load.

    Uses the module-level driver ``wd``. The page is scrolled to its full
    height repeatedly. When the height stops growing, the "more results"
    button is clicked once if it is visible; the next time the height stops
    growing the function returns. It also returns when the button element
    does not exist, or when the button stays hidden for several rounds in a
    row.
    """
    # How many consecutive rounds without growth (and without a visible
    # button) are tolerated. This keeps a short retry window for slow page
    # loads while guaranteeing the loop ends; previously a hidden button made
    # the loop spin forever.
    max_idle_rounds = 3
    idle_rounds = 0

    scroll_count = 0
    print('[스크롤 다운 함수 시작!]')

    # execute_script() runs JavaScript in the page; here it reads the current
    # document height.
    last_height = wd.execute_script("return document.body.scrollHeight")
    after_click = False  # True once the "more results" button has been clicked

    while True:
        print(f"[스크롤 다운 중: {scroll_count}]")
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scroll_count += 1
        time.sleep(1)  # give the page time to react to the scroll

        # Newly loaded results make the document taller, so measure again.
        new_height = wd.execute_script("return document.body.scrollHeight")

        if last_height == new_height:
            if after_click:
                # Already clicked the button and nothing more loaded: done.
                break
            try:
                more_button = wd.find_element(By.XPATH, '//*[@id="islmp"]/div/div/div/div[1]/div[2]/div[2]/input')
            except NoSuchElementException as e:
                print(e)
                break

            if more_button.is_displayed():
                more_button.click()
                time.sleep(1)
                after_click = True
            else:
                idle_rounds += 1
                if idle_rounds >= max_idle_rounds:
                    break
        else:
            idle_rounds = 0

        last_height = new_height

def click_and_save(dir_name, index, img, img_list_length):
    """Click one thumbnail and download the image shown in the preview panel.

    Args:
        dir_name: Directory the image file is written to.
        index: Zero-based position of the thumbnail; used only in the
            progress message.
        img: The thumbnail element to click.
        img_list_length: Total number of thumbnails; used only in the
            progress message.

    On success the file is saved as ``<scraped_count + 1>.png`` or ``.jpg``
    and the module-level ``scraped_count`` is incremented. ``HTTPError`` is
    printed and swallowed; every other exception propagates to the caller.
    """
    global scraped_count
    # Imported locally so this function does not depend on an extra
    # module-level import.
    from urllib.parse import urlparse

    try:
        img.click()
        # implicitly_wait() does not pause; it sets how long find_element()
        # keeps polling for the preview image before giving up.
        wd.implicitly_wait(3)
        src = wd.find_element(By.XPATH, '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img').get_attribute('src')
        if not src:
            # No src attribute: nothing to download, skip this thumbnail
            # instead of failing on None.
            print(f' {index+1} / {img_list_length} 이미지 주소(src) 없음 - 건너뜀')
            return

        # Look only at the path part of the URL so a query string
        # ("a.png?w=100") or an upper-case extension is still seen as PNG.
        extension = os.path.splitext(urlparse(src).path)[1].lower()
        if extension == '.png':
            suffix, label = '.png', 'PNG'
        else:
            # Everything that is not PNG is stored with a .jpg name.
            suffix, label = '.jpg', 'JPG'

        urlretrieve(src, os.path.join(dir_name, str(scraped_count + 1) + suffix))
        print(f' {index+1} / {img_list_length} {label} 이미지 저장 완료!')

        scraped_count += 1

    except HTTPError as e:
        # The server refused the download; report it and move on.
        print(e)

def scraping(dir_name, query):
    """Search Google Images for ``query`` and download every result found.

    Args:
        dir_name: Directory the images are saved into.
        query: Search keyword placed into the Google Images URL.

    Uses the module-level driver ``wd`` and quits it before returning, also
    when an exception escapes. Prints the success rate computed from the
    module-level ``scraped_count``.
    """
    global scraped_count

    url = f'https://www.google.com/search?q={query}&tbm=isch&hl=en&tbs=isz:l&rlz=1C1CHZN_koKR971KR971&sa=X&ved=0CAIQpwVqFwoTCIC5mMe9qfUCFQAAAAAdAAAAABAC&biw=942&bih=941'

    try:
        wd.get(url)
        wd.maximize_window()

        scroll_down()

        div = wd.find_element(By.XPATH, '//*[@id="islrg"]/div[1]')
        # NOTE: this is the selector of the article's first test run. The
        # space makes it a descendant selector, so it matches nothing and
        # img_list comes back empty; the article replaces it further down.
        img_list = div.find_elements(By.CSS_SELECTOR, '.rg_i Q4LuWd')
        total = len(img_list)

        for index, img in enumerate(img_list):
            try:
                click_and_save(dir_name, index, img, total)
            except (ElementClickInterceptedException, NoSuchElementException) as e:
                # The click was blocked or the preview was missing: nudge the
                # page down a little and try the same thumbnail once more.
                # An exception raised by this retry is not caught here.
                print(e)
                wd.execute_script("window.scrollTo(0, window.scrollY + 100)")
                time.sleep(1)
                click_and_save(dir_name, index, img, total)
            except (ConnectionResetError, URLError, socket.timeout, socket.gaierror) as e:
                # Network-level download problems: skip this image.
                print(e)
            except ElementNotInteractableException as e:
                # The thumbnail cannot be interacted with: stop scraping.
                print(e)
                break

        try:
            print("[스크래핑 종료 (성공률: %.2f%%)]" % (scraped_count / total * 100.0))
        except ZeroDivisionError as e:
            # No thumbnails were found, so there is no rate to report.
            print(e)
    finally:
        # Always close the browser so no headless Chrome process is left over.
        wd.quit()

# Default timeout in seconds for new sockets, so a stalled download cannot
# hang forever.
socket.setdefaulttimeout(30)

# The driver path is no longer passed positionally: that form is deprecated in
# Selenium 4 and collides with the `options` keyword on releases where the
# first parameter is `options`. Without it Selenium looks up chromedriver on
# its own (older releases default to the same 'chromedriver' name).
wd = webdriver.Chrome(options=chrome_options)

scraped_count = 0  # number of images saved so far

path = './'  # the per-query folder is created under the current directory
query = input('검색어 입력: ')

dir_name = path + query
# Deliberately strict: raises FileExistsError if the folder already exists,
# which also keeps filter_and_remove() away from files that were there before.
os.makedirs(dir_name)

print(f"[{dir_name} 디렉토리 생성]")

scraping(dir_name, query)

filter_and_remove(dir_name, query, 400)

1차 테스트 코드를 돌려보면 아래와 같은 오류가 뜨고 멈춘다. 해결 해 보자. 강의에서는 'scraping()' 함수의 문제라고 한다. 수정해 보자.

'ZeroDivisionError' 가 떴으니 이 에러 처리가 어디에 있는지 보고, 거기서부터 print 를 하나씩 차근차근 찍어 보면서 어디까지 작동이 되는지 봐야 한다.

scroll_down() 함수 시작 print 가 찍히니 그 함수 전/후로 print 를 찍어 보면 될 듯 하다.

URL을 클릭해보면 정상이다.
scroll_down() 까지 정상 작동 되고 div 로 엘리멘트까지는 잘 찾는다
그러나 그다음 img_list가 문제로 보인다.

▶ 1차 오류 해결 방법

# Selector from the first test run: the space turns it into a descendant
# selector, so it matches nothing and img_list is empty.
img_list = div.find_elements(By.CSS_SELECTOR, '.rg_i Q4LuWd')

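→ 전 블로깅에서도 언급했지만 클래스를 찾을 때 스페이스가 들어간 것은 찾아지지 않는 경우가 많다. CSS 셀렉터에서 스페이스는 '하위 요소'를 뜻하므로 '.rg_i Q4LuWd' 는 .rg_i 안에 있는 Q4LuWd 라는 태그를 찾게 되고, 그런 태그는 없으니 결과가 비어 버린다.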
→ 즉 CSS 셀렉터 값을 정확하게 넣고 다시 돌리면 img_list는 해결이 된다.
# Corrected selector: <img> elements that are direct children of div.bRMDJf.islir.
img_list = div.find_elements(By.CSS_SELECTOR, 'div.bRMDJf.islir > img')

- 2차 오류

'ElementClickInterceptedException' 에러 처리를 넣어 놨는데도 불구하고 계속해서 뜬다.

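ElementClickInterceptedException 은 'scraping()' 의 except 블록을 타기는 타지만, 그 except 블록 안에서 다시 호출한 'click_and_save()' 에서 같은 예외가 또 발생하면 더 이상 잡아 줄 곳이 없어서 그대로 에러가 나며 멈춘다.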

'click_and_save()' 함수에도 똑같이 'ElementClickInterceptedException' 예외 처리를 넣어 주자.

def click_and_save(dir_name, index, img, img_list_length):
    """Click one thumbnail and download the image shown in the preview panel.

    Args:
        dir_name: Directory the image file is written to.
        index: Zero-based position of the thumbnail; used only in the
            progress message.
        img: The thumbnail element to click.
        img_list_length: Total number of thumbnails; used only in the
            progress message.

    On success the file is saved as ``<scraped_count + 1>.png`` or ``.jpg``
    and the module-level ``scraped_count`` is incremented. ``HTTPError`` and
    ``ElementClickInterceptedException`` are printed and swallowed, so a
    blocked click skips the thumbnail; other exceptions propagate.
    """
    global scraped_count
    # Imported locally so this function does not depend on an extra
    # module-level import.
    from urllib.parse import urlparse

    try:
        img.click()
        # implicitly_wait() does not pause; it sets how long find_element()
        # keeps polling for the preview image before giving up.
        wd.implicitly_wait(5)
        src = wd.find_element(By.XPATH, '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img').get_attribute('src')
        if not src:
            # No src attribute: nothing to download, skip this thumbnail
            # instead of failing on None.
            print(f' {index+1} / {img_list_length} 이미지 주소(src) 없음 - 건너뜀')
            return

        # Look only at the path part of the URL so a query string
        # ("a.png?w=100") or an upper-case extension is still seen as PNG.
        extension = os.path.splitext(urlparse(src).path)[1].lower()
        if extension == '.png':
            suffix, label = '.png', 'PNG'
        else:
            # Everything that is not PNG is stored with a .jpg name.
            suffix, label = '.jpg', 'JPG'

        urlretrieve(src, os.path.join(dir_name, str(scraped_count + 1) + suffix))
        print(f' {index+1} / {img_list_length} {label} 이미지 저장 완료!')

        scraped_count += 1

    except HTTPError as e:
        # The server refused the download; report it and move on.
        print(e)
    except ElementClickInterceptedException as e:
        # Another element covered the thumbnail; report it and skip it.
        print(e)

저작자표시 (새창열림)