bs4_scraper.py
from bs4 import BeautifulSoup
import MySQLdb
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
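
# The SQL below assumes two MySQL tables roughly shaped as follows (a
# sketch inferred from the INSERT/SELECT statements in this file; the
# column names come from the queries, the types are assumptions):
#
#   CREATE TABLE products (
#       id          INT AUTO_INCREMENT PRIMARY KEY,
#       product_url VARCHAR(512),
#       status0     TINYINT DEFAULT 0
#   );
#   CREATE TABLE products_detail (
#       pname    VARCHAR(255),
#       price    VARCHAR(64),
#       category VARCHAR(255),
#       images   VARCHAR(512),
#       brand    VARCHAR(64)
#   );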
def data_import(sql):
    # Run a write statement (INSERT/UPDATE) and commit; True on success.
    print 'Import into mysql: ' + sql
    flag = False
    db = MySQLdb.connect(host='localhost', port=3307, user='root', passwd='',
                         db='py_aliexpress', charset='utf8')
    cursor = db.cursor()
    cursor.execute("SET NAMES utf8mb4;")
    try:
        cursor.execute(sql)
        db.commit()
        flag = True
    except MySQLdb.Error:
        db.rollback()
        flag = False
    db.close()
    return flag
def data_select(sql):
    # Run a read statement and return all rows; False on error.
    print sql
    db = MySQLdb.connect(host='localhost', port=3307, user='root', passwd='',
                         db='py_aliexpress', charset='utf8')
    cursor = db.cursor()
    print 'Begin select ...'
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    except MySQLdb.Error:
        print "Error: unable to fetch data"
        return False
    finally:
        db.close()
def url_handle(strs):
    # Turn a protocol-relative link ("//www...") into an absolute https URL.
    if strs.startswith("//"):
        strs = "https:" + strs
    return strs
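
# Example:
#   url_handle("//www.aliexpress.com/item/123.html")
#   -> "https://www.aliexpress.com/item/123.html"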
def get_urls(url_category, request):
    # Scrape the first category page, then every page linked from the
    # pagination bar.
    get_products(url_category, request)
    html_doc = request.get(url_category).text
    soup = BeautifulSoup(html_doc, "html.parser")
    pagination = soup.find("div", "ui-pagination-navi util-left")
    if pagination is None:
        print "No pagination links found on " + str(url_category)
        return
    for link in pagination.find_all("a"):
        category = url_handle(str(link.attrs['href']))
        print "Start get products of " + str(category)
        get_products(category, request)
    print 'Fetch urls and products successfully !!'
def get_products(url_category, request):
    # Collect product links from one listing page and insert the new ones
    # into the `products` table.
    print "Getting Category of " + str(url_category)
    num = 0
    num_duplicate = 0
    html_doc = request.get(url_category).text
    soup = BeautifulSoup(html_doc, "html.parser")
    products = soup.find_all("div", "detail")
    for product in products:
        product_url = url_handle(str(product.find("a").attrs['href']))
        sql = "insert into products (product_url,status0) values ('%s',0)" % (product_url)
        sql_select = "select * from products where product_url='%s'" % (product_url)
        if data_select(sql_select):  # already stored, skip it
            num_duplicate += 1
            continue
        try:
            if data_import(sql):
                num += 1
            else:
                num_duplicate += 1
        except Exception as e:
            print str(e)
    print 'Get products ' + str(num) + ' (new) ' + str(num_duplicate) + ' (duplicate) successfully !!'
def data_handle(datas):
    # Escape a value before interpolating it into a SQL string.
    return MySQLdb.escape_string(datas)
def get_detail(limit, request):
    # Fetch detail pages for up to `limit` products that have not been
    # processed yet (status0 != 1).
    flag = False
    count = 1
    if limit == 0:
        limit = 6000
    sql = "select product_url from products where status0 != 1 order by id limit " + str(limit)
    results = data_select(sql)
    if not results:
        return flag
    for row in results:
        url = row[0]
        print 'Begin Get P_detail of ' + str(url) + ' ' + str(count) + ' of ' + str(len(results))
        flag = get_products_detail(url, request)
        time.sleep(3)  # throttle between product pages
        count += 1
    return flag
def get_products_detail(url_product, request):
    # Parse one product page and store name/price/category/image in
    # `products_detail`.
    flag = False
    print "Getting Details of " + str(url_product)
    html_doc = request.get(url_product).text
    if not html_doc:
        return flag
    soup = BeautifulSoup(html_doc, "html.parser")
    name = data_handle(soup.find("h1", "product-name").string)
    price = soup.find("span", "p-price", id="j-sku-discount-price").string
    image = data_handle(soup.find("a", "ui-image-viewer-thumb-frame").img.attrs['src'])
    category_soup = soup.find("div", "module m-sop m-sop-crumb")
    category = ""
    for abtext in category_soup.find_all(["a", "b"]):
        category = category + "|" + str(abtext.string)
    category = data_handle(category)
    brand = "baellerry"
    sql = "insert into products_detail (pname,price,category,images,brand) values ('%s','%s','%s','%s','%s')" % (name, price, category, image, brand)
    try:
        flag = data_import(sql)
    except Exception as e:
        print str(e)
    if flag:
        # Mark the product as processed only after the detail row is stored.
        sql2 = "update products set status0=1 where product_url='%s'" % url_product
        data_import(sql2)
        print 'Get product detail of ' + str(url_product) + ' successfully !!'
    return flag
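
# A safer alternative to the string-formatted SQL above is a parameterized
# query, so MySQLdb does the escaping itself. A minimal sketch (this would
# require extending data_import to accept a parameter tuple):
#
#   cursor.execute(
#       "insert into products_detail (pname,price,category,images,brand) "
#       "values (%s,%s,%s,%s,%s)",
#       (name, price, category, image, brand))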
def setup():
    # Start Firefox with an implicit wait so element lookups retry for 10s.
    browser = webdriver.Firefox()
    browser.implicitly_wait(10)
    return browser
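
# If no visible browser window is needed, Firefox can run headless. A
# sketch, assuming a Selenium release that supports FirefoxOptions:
#
#   from selenium.webdriver.firefox.options import Options
#   opts = Options()
#   opts.headless = True
#   browser = webdriver.Firefox(options=opts)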
def login(username, password, browser):
    browser.get("https://login.aliexpress.com/buyer.htm")
    browser.switch_to.frame('alibaba-login-box')  # the form lives in an iframe
    pwd_btn = browser.find_element_by_name("password")
    act_btn = browser.find_element_by_name("loginId")
    submit_btn = browser.find_element_by_name("submit-btn")
    act_btn.send_keys(username)
    time.sleep(2)
    pwd_btn.send_keys(password)
    time.sleep(5)
    submit_btn.send_keys(Keys.ENTER)
    return browser
def set_sessions(browser):
    # Copy the authenticated Selenium cookies into a requests session so
    # later page fetches reuse the logged-in state.
    request = requests.session()
    #headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
    #           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"}
    #request.headers.update(headers)
    for cookie in browser.get_cookies():
        request.cookies.set(cookie['name'], cookie['value'])
    print "request return"
    return request
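
# Note: only name/value pairs are copied above; cookie domain, path, and
# expiry are dropped. That is usually enough for same-site requests, but
# it is an assumption about how aliexpress.com validates sessions.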
def prepare():
    # Log in through Selenium, then hand the session cookies to requests.
    print "Preparing sessions"
    browser = login("[email protected]", "yourpassword", setup())
    rq = set_sessions(browser)
    print "Sessions ok"
    return rq
if __name__ == '__main__':
    str_input1 = int(raw_input("0->Quit;\n 1->Search P_url through Category;\n 2->Get P_detail;\n 3->How many reviews;\n 4->Products Left;\n"))
    if str_input1 == 1:
        rq = prepare()
        while 1:
            print "Start getting new url"
            str_input2 = raw_input("Enter the category you want to scrape p_url,\n Url after:\nhttps://www.aliexpress.com/\n")
            if len(str_input2) == 1:  # a single-character entry exits the loop
                break
            str_input2 = 'https://www.aliexpress.com/' + str(str_input2)
            get_urls(str(str_input2), rq)
    elif str_input1 == 2:
        rq = prepare()
        while 1:
            print "Start getting product details"
            str_input2 = raw_input("How many products:\n")
            if int(str_input2) == 0:
                break
            get_detail(int(str_input2), rq)
    elif str_input1 == 3:
        print "3"
        #total_reviews()
    elif str_input1 == 4:
        print "4"
        #left_products()
    else:
        print 'Quit...'
#https://www.aliexpress.com/store/520