-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsource_crawl.py
executable file
·47 lines (37 loc) · 1.26 KB
/
source_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
import re

import requests
from bs4 import BeautifulSoup
from threading import Thread
def get_all_url():
    """Fetch the tianqi.2345.com front page and return the province link URLs.

    Returns:
        list: the ``href`` values of the anchor tags inside the second
        ``div.bmeta`` block's first ``div.clearfix`` container.
    """
    host = 'http://tianqi.2345.com'
    sheng_list = requests.get(host).content
    # Name the parser explicitly: omitting it is a warning in modern
    # BeautifulSoup and makes the result depend on which parser is installed.
    sheng_soup = BeautifulSoup(sheng_list, 'html.parser')
    div_sheng = sheng_soup.findAll('div', {'class': 'bmeta'})[1].findAll('div', {'class': 'clearfix'})[0]
    # Bug fix: iterating the div directly also yields NavigableString
    # whitespace nodes between the tags, and a['href'] then raises TypeError.
    # Iterate only the <a> tags (presumably the div's links — the original
    # intent; TODO confirm against the live page structure).
    return [a['href'] for a in div_sheng.findAll('a')]
def get_city(url):
    """Scrape one province page and append its city entries to ``city.txt``.

    Each ``<dd>`` (except the trailing one, which is dropped) contains
    ``<a>`` links whose href encodes a pinyin name and a numeric code; one
    Python-dict-literal line per city is appended to ``city.txt``.

    Args:
        url: absolute URL of a province page on tianqi.2345.com.
    """
    city_list = requests.get(url).content
    # Explicit parser — see get_all_url for why.
    city_soup = BeautifulSoup(city_list, 'html.parser')
    dd_city = city_soup.findAll('dd')
    dd_city.pop()  # last <dd> is not a city list — discard it
    # Fix: the original reopened and closed city.txt once per city and leaked
    # the handle on any exception.  Open once, UTF-8 text mode, and let the
    # context manager close it.  NOTE(review): concurrent threads appending
    # to the same file may interleave lines — present in the original too.
    with io.open('city.txt', 'a', encoding='utf-8') as f:
        for dd in dd_city:
            for a in dd.findAll('a'):
                href = a['href']
                city_name = a.string
                city_pinyin = re.findall(r'\w+', href)[0]
                city_code = re.findall(r'\d+', href)[0]
                f.write(u"'%s': {'code':%s, 'name':u'%s'},\n" % (city_pinyin, city_code, city_name))
if __name__ == "__main__":
    # Crawl every province page concurrently; each worker thread appends
    # its cities to city.txt.
    for province_url in get_all_url():
        Thread(target=get_city, args=[province_url]).start()
    # Post-process the output with:
    #   cat city.txt | sort -n > city_code.txt