-
Notifications
You must be signed in to change notification settings - Fork 59
/
Copy pathRoboBrowser_baidu_spider.py
67 lines (47 loc) · 1.17 KB
/
RoboBrowser_baidu_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
# encoding: utf-8
"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: [email protected]
@site: http://www.xingag.top
@software: PyCharm
@file: baidu_spider.py
@time: 2021/3/6 下午11:34
@description: Demo script that uses RoboBrowser to submit a Baidu search and follow the first result link
"""
from time import sleep
from robobrowser import RoboBrowser
# Script flow: open the Baidu home page, submit a search through the page's
# form, collect the result titles, then follow the first result link.
home_url = 'https://baidu.com'

# `parser` is the HTML parser handed to BeautifulSoup; lxml is the one the
# original author notes as officially recommended.
# `history=True` is passed so the browser keeps its navigation history.
rb = RoboBrowser(history=True, parser='lxml')

# Open the target site.
rb.open(home_url)
sleep(1)

# Locate the search form on the page.
bd_form = rb.get_form()
if bd_form is None:
    # Without this guard the failure would surface as an opaque TypeError on
    # the subscript below.
    raise RuntimeError('No form found on ' + home_url)
print(bd_form)

bd_form['wd'].value = "AirPython"

# Submit the form to simulate one search.
rb.submit_form(bd_form)
sleep(1)

# Select the result entries from the response page.
result_elements = rb.select(".result")

# Titles of every result that has a usable link.
search_result = []

# Anchor tag of the first usable result (None when there is none). Despite
# the name this is the tag itself, because that is what follow_link() is
# given below.
first_href = None

for element in result_elements:
    # Look the anchor up once instead of once per use.
    anchor = element.find("a")
    if anchor is None:
        # Result entry without a link: nothing to record or follow.
        continue
    href = anchor.get('href')
    if href is None:
        continue

    search_result.append(anchor.text)
    if first_href is None:
        first_href = anchor
        print('第一项地址为:', href)

print(search_result)

if first_href is not None:
    # Jump to the first result link.
    rb.follow_link(first_href)
    # Show the URL the browser ended up on.
    print(rb.url)
else:
    # An empty result list previously crashed inside follow_link('').
    print('No search results to follow')