본문 바로가기
- 배움이 있는 삶/- AI | Big data

Data crawling python program

by story of interesting 2021. 3. 17.
반응형


# reference site
# https://ecsimsw.tistory.com/entry/Google-image-crawler-Crawling-Scraping-python

import errno
import os
import random
import sys
import time
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### initial setup
# NOTE(original author): moving chromedriver to C:\ resolved a driver-path issue

folder = ".image/"            # base output directory; one sub-folder per search term
url = "https://www.google.com/search"
# Raw string so the Windows backslashes are never interpreted as escapes
webDriver = r"C:\cromedriver\chromedriver.exe"
searchItem = "yogurt product"   # Google Images query term
size = 300                      # rough target number of images to collect

# Query parameters for Google Image search ("tbm=isch" selects image results)
params = {
    "q": searchItem,
    "tbm": "isch",
    "sa": "1",
    "source": "lnms&tbm=isch",
}

# Launch the browser and load the search-results page
url = url+"?"+urllib.parse.urlencode(params)
browser = webdriver.Chrome(webDriver)
time.sleep(0.5)
browser.get(url)
html = browser.page_source
time.sleep(0.5)

print(url)

### count the <img> tags on one (unscrolled) results page;
### used below to estimate how many images each scroll reveals

soup_temp = BeautifulSoup(html,'html.parser')
img4page = len(soup_temp.findAll("img"))

### page down
# Scroll the results page so Google lazy-loads more thumbnails.
# imgCnt is a rough running estimate (img-per-page added per scroll);
# the size*10 bound deliberately over-scrolls so enough thumbnails
# exist before scraping.  The random sub-second sleep gives the page
# time to load and makes the scrolling look less robotic.
elem = browser.find_element_by_tag_name("body")
imgCnt = 0
while imgCnt < size * 10:
    elem.send_keys(Keys.PAGE_DOWN)
    rnd = random.random()
    print(imgCnt)
    time.sleep(rnd)
    imgCnt += img4page

# Re-read the page source after scrolling, then extract thumbnail URLs.
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
img = soup.findAll("img")

fileNum = 0
srcURL = []

# Keep only <img> tags carrying a lazy-load 'data-src' attribute whose
# http(s) URL appears near the start of the serialized tag (heuristic
# that filters out inline base64 placeholders).
for line in img:
    if str(line).find('data-src') != -1 and str(line).find('http') < 100:
        print(fileNum, " : ", line['data-src'])
        srcURL.append(line['data-src'])
        fileNum += 1


print("here")

# 폴더 생성 및 파일 저장장
saveDir = folder+searchItem

try:
if not(os.path.isdir(saveDir)):
os.makedirs(os.path.join(saveDir))
except OSError as e:
if e.errno != errno.EEXIST:
print("Failed to create directory!!!!!")
raise

# Download every collected URL into saveDir as <index>.jpg.
# enumerate(srcURL) is equivalent to the original zip(range(fileNum), srcURL)
# because fileNum was incremented once per appended URL.
for i, src in enumerate(srcURL):
    urllib.request.urlretrieve(src, saveDir + "/" + str(i) + ".jpg")
    print(i, "saved")

print("here2")

반응형