forked from aigc-in-all/HousingWeb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhouse.py
76 lines (61 loc) · 2.11 KB
/
house.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#-*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urlparse import urljoin
import requests
import re
import sys
import MySQLdb
# Python 2 only: re-import ``sys`` and switch the interpreter-wide default
# string encoding to UTF-8, so implicit str <-> unicode conversions elsewhere
# in this script do not fail on non-ASCII (Chinese) listing text.
reload(sys)
sys.setdefaultencoding('utf8')
# Single module-level MySQL connection shared by main(); positional arguments
# are host, user, password and database name.
# NOTE(review): the credentials are hard-coded in source -- consider loading
# them from configuration or the environment instead.
db = MySQLdb.connect("localhost", "root", "heqingbao", "housing", charset='utf8')
# Listing-page URL template; main() fills in ``{page}`` with the page number.
url = "http://sz.58.com/pinpaigongyu/pn/{page}"
# Parameterized INSERT: every value is bound by the database driver instead of
# being interpolated into the SQL text, so quotes or SQL fragments in scraped
# listing text cannot break or alter the statement.
_INSERT_HOUSE_SQL = (
    "insert into house(build_name, build_region, house_type, house_area, "
    "house_floor, rent_type, rent_room, rent_price, img, url, c_id) "
    "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)


def _extract_house_row(house):
    """Return the tuple of column values for one listing ``<li>`` element.

    The tuple order matches the column list of ``_INSERT_HOUSE_SQL``.

    Raises:
        IndexError, KeyError, AttributeError: if the element lacks one of the
            expected sub-elements or attributes (``h2``, ``a[href]``,
            ``img[lazy_src]``, ``div.money b``, or three room fields).
    """
    # The title is handled as a UTF-8 byte string split on single spaces:
    # part 0 holds rent type + region, part 1 the building name, part 2 the
    # rented room.
    title_parts = house.select("h2")[0].string.encode("utf-8").split(' ')
    house_url = urljoin(url, house.select("a")[0]["href"])
    house_img = house.select("img")[0]["lazy_src"]

    room = parseRoomTag(house.encode('utf-8'))
    house_type, house_area, house_floor = room[0], room[1], room[2]

    rent_price = house.find_all('div', class_='money')[0].select('b')[0].string

    # Byte offsets into the UTF-8 title: the first 12 bytes are the rent type
    # (presumably four 3-byte characters -- verify against live page titles),
    # the remainder of that part is the region.
    rent_type = title_parts[0][0:12]
    build_region = title_parts[0][12:]
    build_name = title_parts[1]
    rent_room = title_parts[2]

    return (
        build_name.strip(),
        build_region.strip(),
        house_type.strip(),
        house_area.strip(),
        house_floor,
        rent_type.strip(),
        rent_room.strip(),
        rent_price,
        house_img.strip(),
        house_url.strip(),
        240,  # c_id: fixed value used for every row inserted by this script
    )


def main():
    """Scrape listing pages 1-20 and insert every listing into ``house``.

    Each page is fetched from ``url`` with the page number filled in, parsed
    with BeautifulSoup, and its ``.list > li`` elements are inserted in one
    transaction per page: the page is committed when all rows succeed, and
    rolled back (after printing the error) when any row fails.  Scraping stops
    at the first page that contains no listings.  The shared ``db`` connection
    is always closed before returning.
    """
    try:
        for page in range(1, 21):
            page_url = url.format(page=page)
            # Two spaces keep the output identical to the former
            # ``print 'fetch: ', ...`` statement.
            print('fetch:  %s' % page_url)
            response = requests.get(page_url)
            html = BeautifulSoup(response.text)
            house_list = html.select(".list > li")
            if not house_list:
                # No listings on this page: nothing further to scrape.
                return

            cursor = db.cursor()
            try:
                for house in house_list:
                    cursor.execute(_INSERT_HOUSE_SQL, _extract_house_row(house))
                db.commit()
            except Exception as e:
                # Best-effort scraping: report the failure, discard this
                # page's partial inserts and carry on with the next page.
                print(e)
                db.rollback()
            finally:
                cursor.close()
    finally:
        # Close the connection exactly once, including on the early return.
        db.close()
# Compiled once at import time; DOTALL lets the paragraph body span lines.
_ROOM_TAG_RE = re.compile(r'<p class="room">(.*?)</p>', re.DOTALL)


def parseRoomTag(tag):
    """Extract the whitespace-separated fields of a listing's room paragraph.

    Args:
        tag: HTML markup of one listing as a string (the caller passes the
            UTF-8 encoded element).

    Returns:
        A list with the tokens found inside the first
        ``<p class="room">...</p>`` element, in document order and in the same
        string type as ``tag``.  Tokens that start with ``<b>`` and tokens
        consisting of a single UTF-8 encoded non-breaking space are dropped.
        An empty list is returned when the markup has no such paragraph.
    """
    paragraphs = _ROOM_TAG_RE.findall(tag)
    if not paragraphs:
        return []
    return [
        token.strip()
        for token in paragraphs[0].split()
        # '\xc2\xa0' is the UTF-8 byte sequence of a lone non-breaking space,
        # which byte-string split() does not treat as whitespace.
        if not token.startswith('<b>') and token != '\xc2\xa0'
    ]
if __name__ == '__main__':
main()