본문 바로가기
- 배움이 있는 삶/- AI | Big data

Data crawling python program

by story of interesting 2021. 3. 17.
반응형


# reference site
# https://ecsimsw.tistory.com/entry/Google-image-crawler-Crawling-Scraping-python

import errno
import os
import random
import sys
import time
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### initial setup
# NOTE(original author): moving chromedriver to C:\ resolved a driver-path issue

folder = ".image/"            # base output directory; one sub-folder per search term
url = "https://www.google.com/search"
# Raw string so the Windows backslashes are never interpreted as escapes
webDriver = r"C:\cromedriver\chromedriver.exe"
searchItem = "yogurt product"   # Google Images query term
size = 300                      # rough target number of images to collect

# Query parameters for Google Image search ("tbm=isch" selects image results)
params = {
    "q": searchItem,
    "tbm": "isch",
    "sa": "1",
    "source": "lnms&tbm=isch",
}

# Launch the browser and load the search-results page
url = url+"?"+urllib.parse.urlencode(params)
browser = webdriver.Chrome(webDriver)
time.sleep(0.5)
browser.get(url)
html = browser.page_source
time.sleep(0.5)

print(url)

### count the <img> tags on one (unscrolled) results page;
### used below to estimate how many images each scroll reveals

soup_temp = BeautifulSoup(html,'html.parser')
img4page = len(soup_temp.findAll("img"))

### page down
# Scroll the results page so Google lazy-loads more thumbnails.
# imgCnt is a rough running estimate (img-per-page added per scroll);
# the size*10 bound deliberately over-scrolls so enough thumbnails
# exist before scraping.  The random sub-second sleep gives the page
# time to load and makes the scrolling look less robotic.
elem = browser.find_element_by_tag_name("body")
imgCnt = 0
while imgCnt < size * 10:
    elem.send_keys(Keys.PAGE_DOWN)
    rnd = random.random()
    print(imgCnt)
    time.sleep(rnd)
    imgCnt += img4page

# Re-read the page source after scrolling, then extract thumbnail URLs.
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
img = soup.findAll("img")

fileNum = 0
srcURL = []

# Keep only <img> tags carrying a lazy-load 'data-src' attribute whose
# http(s) URL appears near the start of the serialized tag (heuristic
# that filters out inline base64 placeholders).
for line in img:
    if str(line).find('data-src') != -1 and str(line).find('http') < 100:
        print(fileNum, " : ", line['data-src'])
        srcURL.append(line['data-src'])
        fileNum += 1


print("here")

# 폴더 생성 및 파일 저장장
saveDir = folder+searchItem

try:
if not(os.path.isdir(saveDir)):
os.makedirs(os.path.join(saveDir))
except OSError as e:
if e.errno != errno.EEXIST:
print("Failed to create directory!!!!!")
raise

# Download every collected URL into saveDir as <index>.jpg.
# enumerate(srcURL) is equivalent to the original zip(range(fileNum), srcURL)
# because fileNum was incremented once per appended URL.
for i, src in enumerate(srcURL):
    urllib.request.urlretrieve(src, saveDir + "/" + str(i) + ".jpg")
    print(i, "saved")

print("here2")

반응형