-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathtest_crawler.py
156 lines (131 loc) · 5.06 KB
/
test_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
# encoding: utf-8
"""
test_crawler.py
Created by 徐 光硕 on 2011-11-24.
Copyright (c) 2011 __MyCompanyName__. All rights reserved.
"""
from pyTOP.crawler import Crawler
from pyTOP.user import User
from pyTOP.insight import TopLevelCats, WordsBase, WordsAnalysis
from pyTOP.item import Items
from pprint import pprint
import requests
import urllib, re
from BeautifulSoup import BeautifulSoup
# Hard-coded TOP (Taobao Open Platform) session key consumed by the test
# helpers below.  NOTE(review): almost certainly expired — replace with a
# fresh session key before running these tests.
top_session = '4020831d426896a5328c50f8117b920fb78579dYCThsHyd6517981601'
def extract_form_fields(soup):
    """Collect the submittable name/value pairs from a form's soup.

    Mimics what a browser would serialize on submit: text/hidden/password/
    submit/image inputs, checked checkboxes and radios, textareas, and
    select fields (first option wins when nothing is marked selected).

    soup -- a BeautifulSoup Tag for the <form> element (or any container).
    Returns a dict mapping field name -> value; for a multi-select the
    value is a list of the selected option values.
    Raises AssertionError on an unsupported <input type> or a
    single-select with more than one selected option.
    """
    fields = {}
    for input in soup.findAll('input'):
        # ignore submit/image with no name attribute
        if input['type'] in ('submit', 'image') and not input.has_key('name'):
            continue
        # single-value name/value fields
        if input['type'] in ('text', 'hidden', 'password', 'submit', 'image'):
            value = ''
            if input.has_key('value'):
                value = input['value']
            fields[input['name']] = value
            continue
        # checkboxes and radios: only a checked control contributes a value;
        # a checked control without an explicit value submits 'on'.
        if input['type'] in ('checkbox', 'radio'):
            value = ''
            if input.has_key('checked'):
                if input.has_key('value'):
                    value = input['value']
                else:
                    value = 'on'
            # BUGFIX: the original tested "'name' in input", but on a
            # BeautifulSoup 3 Tag the `in` operator inspects the tag's
            # *contents*, not its attributes.  Use has_key, consistent
            # with every other attribute check in this function.
            if input.has_key('name') and fields.has_key(input['name']) and value:
                fields[input['name']] = value
            if input.has_key('name') and not fields.has_key(input['name']):
                fields[input['name']] = value
            continue
        assert False, 'input type %s not supported' % input['type']
    # textareas
    for textarea in soup.findAll('textarea'):
        fields[textarea['name']] = textarea.string or ''
    # select fields
    for select in soup.findAll('select'):
        value = ''
        options = select.findAll('option')
        is_multiple = select.has_key('multiple')
        selected_options = [
            option for option in options
            if option.has_key('selected')
        ]
        # If no option is marked selected, browsers submit the first one.
        if not selected_options and options:
            selected_options = [options[0]]
        if not is_multiple:
            assert(len(selected_options) < 2)
            if len(selected_options) == 1:
                value = selected_options[0]['value']
        else:
            value = [option['value'] for option in selected_options]
        fields[select['name']] = value
    return fields
def get_cats():
    """Crawl the full category tree and persist it."""
    spider = Crawler()
    spider.save_cats(spider.get_cats())
def get_top_keywords():
    """Crawl the current list of top search keywords."""
    Crawler().get_top_keywords()
def get_sug():
    """Ask the crawler for keyword suggestions on a sample phrase."""
    spider = Crawler()
    spider.sug('儿童卫衣 男')
def get_user():
user = User()
print user.get('北京喜宝')
items = Items()
print items.onsale_get(top_session)
return
tlc = TopLevelCats()
print tlc.get(top_session)
def adwords_login():
    """Log into taobao.com with a fixed account, follow the post-login
    redirect, then hop into the Simba (adwords) subsystem via SSO.

    Side effects only: prints status codes, headers, cookies and the final
    page body.  No return value.
    """
    # SECURITY NOTE(review): username and password are hard-coded below —
    # move them to config/environment before committing or sharing.
    s = requests.session()
    login_url = 'https://login.taobao.com/member/login.jhtml'
    # GET first so the session collects cookies and the static login form.
    r = s.get(login_url)
    soup = BeautifulSoup(r.content)
    #pprint(r.cookies)
    # Scrape every pre-filled/hidden field so the POST mimics a browser
    # submitting the J_StaticForm login form.
    forms = extract_form_fields(soup.find('form', id='J_StaticForm'))
    # Username encoded to GBK — presumably what the legacy form expects;
    # verify against the page's charset if logins start failing.
    forms['TPL_username'] = u'喜宝_03'.encode('gbk')
    forms['TPL_password'] = 'uwUe3tlToXtZgO6Y'
    #forms['_tb_token_'] = 'L3T8QONzL1/PqT8QON0M1/mga8QON4M1/Niw9QONCQ1/YM3AQONJQ1/9fZWUON8f3/QFbWUONAf3/qyfWUONEf3'
    #forms['umto'] = 'T7f5c7eb9e08c689752f741e81dcbe2c5,'
    #forms['gvfdcre'] = '68747470733A2F2F6C6F67696E2E74616F62616F2E636F6D2F6D656D6265722F6C6F67696E2E6A68746D6C'
    # Browser-like headers; Origin/Referer matter for the login endpoint.
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Content-Type':'application/x-www-form-urlencoded',
               'Origin':'https://login.taobao.com',
               'Referer':'https://login.taobao.com/member/login.jhtml',
               'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.53.11 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10'}
    r = s.post(login_url, data=forms, headers=headers)
    print r.status_code
    # NOTE(review): r.request.data is the old requests API — breaks on
    # modern requests versions.
    pprint(r.request.data)
    pprint(r.headers)
    pprint(r.cookies)
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Encoding':'gzip, deflate',
               'Accept-Language':'zh-cn',
               'Origin':'https://login.taobao.com',
               'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.53.11 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10'}
    # The login result is either an HTTP redirect (Location header) or an
    # HTML page that bounces via a window.location JS assignment.
    if 'location' in r.headers:
        r = s.get(r.headers['location'], headers=headers)
        pprint(r.headers)
    else:
        m = re.findall(r'window\.location = "([^^]*?)";', r.text)
        if m:
            r = s.get(m[0], headers=headers)
            print r.url
    # With the taobao session established, SSO into the Simba console.
    r = s.get('http://subway.simba.taobao.com/login.htm?outSideKey=taobao', headers=headers)
    print r.content
    #print r.status_code
    #print r.cookies
def main():
    """Entry point: run the adwords login experiment."""
    # Earlier one-off experiments, kept disabled for reference:
    #get_cats()
    #get_top_keywords()
    #get_sug()
    #get_user()
    adwords_login()

if __name__ == '__main__':
    main()