forked from cmbgift/text-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjd_skus_spiders.py
76 lines (65 loc) · 3.21 KB
/
jd_skus_spiders.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# https://www.jd.com/allSort.aspx
import requests
from pyquery import PyQuery as pq
from prettyprinter import cpprint
import json
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import datetime
import sys
import sql
import pandas as pd
def get_page(url):
    """Load *url* in the shared browser and return the rendered page source.

    Relies on the module-level ``browser`` (Selenium driver) and ``wait``
    (``WebDriverWait``) objects that the ``__main__`` section creates.

    Steps: open the page, wait for the fifth ``<li>`` of the ``tab-main``
    tab bar to be present, scroll down in small increments, click that tab,
    then hand back ``browser.page_source``.
    """
    tab_xpath = '//div[contains(@class,"tab-main")]/ul/li[5]'
    scroll_steps = 30

    browser.get(url)
    tab_item = wait.until(EC.presence_of_element_located((By.XPATH, tab_xpath)))
    time.sleep(2)

    # Scroll 30 x 50px in short hops; presumably this brings the tab bar into
    # view and triggers lazy-loaded content -- TODO confirm.
    step = 0
    while step < scroll_steps:
        browser.execute_script("window.scrollBy(0,50)")
        time.sleep(0.1)
        step += 1

    tab_item.click()
    # Fixed pause so the content behind the clicked tab has time to render.
    time.sleep(3)
    return browser.page_source
if __name__ == '__main__':
    # --- Setup ---------------------------------------------------------
    # `browser` and `wait` are module-level on purpose: get_page() reads them.
    # NOTE(review): the `executable_path` keyword is deprecated in Selenium 4
    # in favour of a Service object -- confirm against the installed version.
    browser = webdriver.Chrome(executable_path='D:/chromedriver.exe')  # Selenium-driven browser
    wait = WebDriverWait(browser, 20)
    # Maximum number of comment pages to request. Per the original author: to
    # keep the comment count around 500 this should be about 35 (35 slightly
    # exceeds 500; the page's comments cannot be scrolled indefinitely).
    # Not referenced anywhere in the visible script.
    MAXINDEX = 7

    # --- User configuration / scraping run -----------------------------
    start = time.time()
    try:
        # SKU ids to scrape come from column "SKU_ID" of sheet "Sheet3".
        ALL_PAGE_URL = list(pd.ExcelFile("Gift2.xlsx").parse("Sheet3")["SKU_ID"])

        for SKU_ID in ALL_PAGE_URL:
            try:
                page_url = f"https://item.jd.com/{SKU_ID}.html"
                html = get_page(page_url)  # fetch the page, rendered dynamically by Selenium
                doc = pq(html, parser='html')

                SKU_INTRODUCE = doc('#detail > div.tab-con > div:nth-child(1)').text()
                SKU_SIZE = doc('#detail > div.tab-con > div:nth-child(2)').text()
                SKU_PRICE = doc('div.itemInfo-wrap span.p-price span.price').text()
                # Only the breadcrumb entry at nth-child(5) is persisted.
                SKU_LABEL5 = doc('#crumb-wrap > div > div.crumb.fl.clearfix > div:nth-child(5) > a').text()
                # Original author's note: book pages use a different HTML
                # layout than other products.
                SKU_TITLE = doc('div.sku-name').text()
                # The count is rendered wrapped in parentheses; strip them.
                SKU_COMMENT_NUMS = (
                    doc('#detail > div.tab-main.large > ul > li.current > s')
                    .text()
                    .replace('(', '')
                    .replace(')', '')
                )
                SKU_GOOD_RATE = doc(
                    '#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div'
                ).text()

                sql.insert_skus(SKU_ID, SKU_INTRODUCE, SKU_SIZE, SKU_PRICE, SKU_LABEL5,
                                SKU_TITLE, SKU_COMMENT_NUMS, SKU_GOOD_RATE)
            except Exception as error:
                # Best-effort crawl: report the failing SKU and move on to the
                # next one instead of aborting the whole run.
                print(SKU_ID, error)
        end = time.time()
    finally:
        # Always shut the browser down so no driver/browser process is left
        # behind, even when the Excel load or the crawl raises.
        browser.quit()

    print('总共用时{}秒'.format(end - start))