# reference site
# https://ecsimsw.tistory.com/entry/Google-image-crawler-Crawling-Scraping-python
import sys, os
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib, urllib.request
import requests
import random
import time
from selenium.webdriver.common.keys import Keys
### Initial settings
# NOTE: chromedriver was moved to C:\cromedriver to work around a PATH problem.
folder = ".image/"  # output root; NOTE(review): possibly meant "./image/" -- confirm
url = "https://www.google.com/search"
# Raw string so the backslashes are not parsed as (invalid) escape sequences;
# the value is byte-identical to the original literal.
# NOTE(review): "cromedriver" looks like a typo, but it must match the actual
# directory name on disk -- confirm before renaming.
webDriver = r"C:\cromedriver\chromedriver.exe"
searchItem = "yogurt product"  # Google Images search query
size = 300  # target number of images to collect
# Query-string parameters for Google image search.
# NOTE(review): urlencode percent-encodes the whole "lnms&tbm=isch" value, so
# the embedded "&tbm=isch" never reaches Google as a separate parameter --
# probably intended to be just "lnms"; confirm before changing.
params = {
    "q": searchItem,
    "tbm": "isch",  # image-search tab
    "sa": "1",
    "source": "lnms&tbm=isch",
}
# --- Launch the browser and load the search-results page ---
query_string = urllib.parse.urlencode(params)
url = f"{url}?{query_string}"
browser = webdriver.Chrome(webDriver)
time.sleep(0.5)
browser.get(url)
# Snapshot of the initial page source; used below to estimate images per page.
html = browser.page_source
time.sleep(0.5)
print(url)
### Count the <img> tags one page load yields; used as the scroll step size.
soup_temp = BeautifulSoup(html, "html.parser")
img4page = len(soup_temp.find_all("img"))
### Keep scrolling until roughly size*10 images have been loaded.
elem = browser.find_element_by_tag_name("body")
imgCnt = 0
target = size * 10
while imgCnt < target:
    elem.send_keys(Keys.PAGE_DOWN)
    delay = random.random()  # randomized pause so scrolling looks less bot-like
    print(imgCnt)
    time.sleep(delay)
    imgCnt += img4page  # assume each scroll loads about one page's worth of images
# --- Parse the final page source and collect the image source URLs ---
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
img = soup.findAll("img")
# (Removed a dead `browser.find_elements_by_tag_name('img')` call whose result
# was discarded.)
fileNum = 0
srcURL = []
for line in img:
    # Keep tags that carry a lazy-load 'data-src' attribute.
    # NOTE(review): str(line).find('http') returns -1 when 'http' is absent,
    # and -1 < 100 is True, so the second clause does NOT require 'http' to be
    # present -- it only rejects tags where 'http' first appears at offset
    # >= 100. Probably meant "found AND near the start"; confirm intent before
    # tightening, as tightening would change which images are collected.
    if str(line).find('data-src') != -1 and str(line).find('http') < 100:
        print(fileNum, " : ", line['data-src'])
        srcURL.append(line['data-src'])
        fileNum += 1
print("here")
# --- Create the destination folder for downloaded files ---
saveDir = folder + searchItem
try:
    # exist_ok=True makes the call idempotent and replaces the original
    # `e.errno != errno.EEXIST` check, which raised NameError because `errno`
    # was never imported -- the intended message below was unreachable.
    os.makedirs(saveDir, exist_ok=True)
except OSError:
    print("Failed to create directory!!!!!")
    raise
# Download every collected URL into the save directory, numbered sequentially.
for i, src in enumerate(srcURL):
    urllib.request.urlretrieve(src, saveDir + "/" + str(i) + ".jpg")
    print(i, "saved")
print("here2")
# ---------------------------------------------------------------------------
# Blog footer residue (not code) -- other posts in the
# '- 배움이 있는 삶 > - AI | Big data' category:
#   [3/18] Tensorflow 설치 (Window 10 + Anaconda 3 + pycham) (0) | 2021.03.19
#   Image classification python program (0) | 2021.03.17
#   AI -> Machine learning -> Deep Learning 차이 (0) | 2020.07.10
#   Trump announces unprecedented action against China (1) | 2020.05.30
#   [python] list의 데이타 합의 while 문과 list의 pop을 이용하여 구하라 (0) | 2020.05.28
# ---------------------------------------------------------------------------