-
Notifications
You must be signed in to change notification settings - Fork 68
/
test46.py
84 lines (65 loc) · 2.58 KB
/
test46.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'练习内建模块之HTMLParser'
__author__ = 'sergiojune'
from html.parser import HTMLParser
import requests
class MyHTMLParser(HTMLParser):
    """Echo every parse event of an HTML document to stdout.

    Each handler prints a textual rendering of the event it receives:
    start tags (with their attribute list), end tags, text data, comments,
    named entity references and numeric character references.

    Args:
        convert_charrefs: Forwarded to ``HTMLParser``. It defaults to
            ``False`` here because, when it is ``True`` (the ``HTMLParser``
            default), references are decoded straight into ``handle_data``
            and ``handle_entityref`` / ``handle_charref`` are never called.
    """

    def __init__(self, *, convert_charrefs=False):
        super().__init__(convert_charrefs=convert_charrefs)

    def handle_starttag(self, tag, attrs):
        """Print a start tag followed by its list of (name, value) pairs."""
        # attrs is already a list, so no copy is needed before printing.
        print('<%s>' % tag, attrs)

    def handle_endtag(self, tag):
        """Print an end tag."""
        print('</%s>' % tag)

    def handle_data(self, data):
        """Print the text found between tags."""
        print(data)

    def handle_comment(self, data):
        """Print the body of an HTML comment wrapped in comment markers."""
        print('<!--', data, '-->')

    def handle_entityref(self, name):
        """Print a named entity reference such as ``&nbsp;``."""
        print('&%s;' % name)

    def handle_charref(self, name):
        """Print a numeric character reference such as ``&#1234;``."""
        print('&#%s;' % name)
# Demo: push a small inline document through the echoing parser so that
# every handler output can be seen on stdout.
parser = MyHTMLParser()
parser.feed(
    '''<html>
<head></head>
<body>
<!-- test html parser -->
<p>Some <a href=\"#\">html</a> HTML tutorial...<br>END</p>
</body></html>'''
)
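# Exercise: pick a web page, e.g. https://www.python.org/events/python-events/, view its source in a browser and copy it, then try parsing the HTML to print the time, name and location of the conferences published on the official Python site.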
class DealHTML(HTMLParser):
    """Print event links, times and locations found in an HTML document.

    Only start tags carrying exactly one attribute are inspected. A match
    prints an opening marker and raises a flag; while a flag is raised,
    text data is echoed, and the next end tag prints a closing marker and
    lowers the flag again.
    """

    def __init__(self):
        super().__init__()
        # Each flag is 1 while the text of a matching element is being echoed.
        self.thing = 0    # attribute value contained 'python-events'
        self.time = 0     # attribute name contained 'datetime'
        self.address = 0  # attribute value contained 'location'

    def handle_starttag(self, tag, attrs):
        """Raise the matching flag(s) for a single-attribute start tag.

        Args:
            tag: Name of the tag (unused).
            attrs: List of ``(name, value)`` pairs from the parser.
        """
        if len(attrs) != 1:
            return
        name, value = attrs[0]
        # A valueless attribute (e.g. <input disabled>) is reported with
        # value None; treat it as empty so the substring tests cannot raise.
        value = value or ''
        if 'python-events' in value:  # event link
            print('<href=%s>' % value, end='')
            self.thing = 1
        if 'datetime' in name:  # event time
            print('<%s>' % name, end='')
            self.time = 1
        if 'location' in value:  # event location
            print('<%s>' % value, end='')
            self.address = 1

    def handle_data(self, data):
        """Echo text once for every flag that is currently raised."""
        for flag in (self.thing, self.time, self.address):
            if flag:
                print(data, end='')

    def handle_endtag(self, tag):
        """Print a closing marker for each raised flag and lower it."""
        if self.thing:
            print('</%s>' % tag)
            self.thing = 0
        if self.time:
            print('</%s>' % tag)
            self.time = 0
        if self.address:
            print('</%s>' % tag)
            # A blank line separates one event record from the next.
            print('')
            self.address = 0
# Download the python.org events page and run it through DealHTML.
# The timeout keeps the script from hanging forever on an unresponsive
# server; requests applies no timeout unless one is given.
response = requests.get('https://www.python.org/events/python-events/', timeout=10).text
dh = DealHTML()
dh.feed(response)
# Flush any text still buffered by the parser after the final tag.
dh.close()