Commit f6e9c79

Merge pull request #11 from xwjdsh/master
Fetch data by scraping the Douban group web pages (通过豆瓣小组网页获取数据)

2 parents 9bdef89 + ba142bd

File tree: 4 files changed, +55 −18 lines

crawler_main.py (+39 −12)
@@ -5,12 +5,14 @@
 import re
 import time
 from datetime import datetime
+from datetime import date
 from itertools import cycle
+from bs4 import BeautifulSoup

 from django.core.exceptions import ObjectDoesNotExist
 from django.utils.timezone import make_aware

-from douban_group_spy.const import USER_AGENT, DATETIME_FORMAT
+from douban_group_spy.const import USER_AGENT, DATETIME_FORMAT, DATE_FORMAT

 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'douban_group_spy.settings')
 import django
@@ -29,7 +31,7 @@


 def process_posts(posts, group, keywords, exclude):
-    for t in posts['topics']:
+    for t in posts:
         # ignore title or content including exclude keywords
         exclude_flag = False
         for e in exclude:
@@ -64,7 +66,7 @@ def process_posts(posts, group, keywords, exclude):
             post_id=t['id'], group=group,
             author_info=t['author'], alt=t['alt'],
             title=t['title'], content=t['content'],
-            photo_list=[i['alt'] for i in t['photos']],
+            photo_list=t['photos'],
             # rent=0.0, subway='', contact='',
             is_matched=is_matched, keyword_list=keyword_list,
             created=make_aware(datetime.strptime(t['created'], DATETIME_FORMAT)),
@@ -79,14 +81,17 @@ def crawl(group_id, pages, keywords, exclude):
     try:
         group = Group.objects.get(id=group_id)
     except ObjectDoesNotExist:
-        g_info = requests.get(GROUP_INFO_BASE_URL.format(DOUBAN_BASE_HOST, group_id), headers={'User-Agent': USER_AGENT}).json()
+        html = requests.get(GROUP_INFO_BASE_URL.format(DOUBAN_BASE_HOST, group_id), headers={'User-Agent': USER_AGENT}).text
+        g_info = BeautifulSoup(html, 'lxml')
         lg.info(f'Getting group: {group_id} successful')
+        member_count_text = g_info.select_one(f"a[href='https://www.douban.com/group/{group_id}/members']").get_text()
+        created_text = g_info.select_one('div[class="group-board"] p').get_text()
         group = Group(
-            id=g_info['uid'],
-            name=g_info['name'],
-            alt=g_info['alt'],
-            member_count=g_info['member_count'],
-            created=make_aware(datetime.strptime(g_info['created'], DATETIME_FORMAT))
+            id=group_id,
+            name=g_info.select_one('h1').get_text().strip(),
+            alt=g_info.select_one("div[class='group-intro']").get_text(),
+            member_count=int(re.findall(r'[(](.*?)[)]', member_count_text)[0]),
+            created=make_aware(datetime.strptime(re.findall(r"创建于(.+?) ", created_text)[0], DATE_FORMAT))
         )
         group.save(force_insert=True)

@@ -95,7 +100,7 @@ def crawl(group_id, pages, keywords, exclude):
         # host = next(douban_base_host)
         kwargs = {
             'url': GROUP_TOPICS_BASE_URL.format(DOUBAN_BASE_HOST, group_id),
-            'params': {'start': p},
+            'params': {'start': p * 25},
             'headers': {'User-Agent': USER_AGENT}
         }
         req = getattr(requests, 'get')(**kwargs)
@@ -111,9 +116,31 @@
             lg.warning(f'Fail to getting: {req.url}, status: {req.status_code}')
             continue

-        posts = req.json()
+        soup = BeautifulSoup(req.text, 'lxml')
+        posts = []
+        for row in soup.select('table[class="olt"] tr[class=""]'):
+            link = row.select_one('td[class="title"] a')
+            link_href = link["href"]
+            post_detail_html = requests.get(link_href, headers={'User-Agent': USER_AGENT}).text
+            post_detail = BeautifulSoup(post_detail_html, 'lxml')
+            post_content = post_detail.select_one('div[class="topic-content"]')
+            post_photos = []
+            for photo_row in post_content.select('img'):
+                post_photos.append(photo_row['src'])
+
+            result = {}
+            result['id'] = int(re.findall(r"https://www.douban.com/group/topic/(.+?)/", link_href)[0])
+            result['title'] = link["title"]
+            result['content'] = post_content.get_text().strip()
+            result['alt'] = link_href
+            author_link = row.select("td")[1].select_one('a')
+            result['author'] = {'name': author_link.get_text(), 'alt': author_link["href"]}
+            result['photos'] = post_photos
+            result['created'] = post_detail.select_one('.create-time').get_text()
+            result['updated'] = f'{date.today().year}-{row.select("td")[3].get_text()}:00'
+            posts.append(result)
         process_posts(posts, group, keywords, exclude)
-
+

 @click.command(help='example: python crawler_main.py -g 10086 -g 12345 -k xx花园 -k xx地铁 -e 求租')
 @click.option('--groups', '-g', help='group id', required=True, multiple=True, type=str)
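The heart of this change is the switch from the retired v2 JSON API to scraping the group's HTML pages. A minimal standalone sketch of that flow (discussion list page → per-topic detail page → plain dict), assuming the same markup the diff targets (table.olt for the topic list, div.topic-content and .create-time on the detail page); the fetch_posts helper and the placeholder USER_AGENT are illustrative, not part of the committed code:

# Sketch only: mirrors the scraping flow added in crawler_main.py above,
# under the assumption that Douban's markup matches the diff's selectors.
import re

import requests
from bs4 import BeautifulSoup

USER_AGENT = 'Mozilla/5.0'  # placeholder; the project defines its own USER_AGENT


def fetch_posts(group_id: str, page: int = 0) -> list:
    """Scrape one page (25 topics) of a group's discussion list."""
    url = f'https://www.douban.com/group/{group_id}/discussion'
    html = requests.get(url, params={'start': page * 25},
                        headers={'User-Agent': USER_AGENT}).text
    soup = BeautifulSoup(html, 'lxml')

    posts = []
    for row in soup.select('table[class="olt"] tr[class=""]'):
        link = row.select_one('td[class="title"] a')
        # Each list row links to a detail page that holds the full content.
        detail_html = requests.get(link['href'],
                                   headers={'User-Agent': USER_AGENT}).text
        detail = BeautifulSoup(detail_html, 'lxml')
        content = detail.select_one('div[class="topic-content"]')
        created = detail.select_one('.create-time')
        posts.append({
            'id': int(re.findall(r'/group/topic/(.+?)/', link['href'])[0]),
            'title': link['title'],
            'content': content.get_text().strip() if content else '',
            'photos': [img['src'] for img in content.select('img')] if content else [],
            'created': created.get_text() if created else '',
        })
    return posts

One caveat visible in the diff: the discussion list shows the last-reply time only as MM-DD HH:MM, so the committed code prepends the current year when building 'updated', which will mislabel replies that actually happened in an earlier year.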

douban_group_spy/const.py (+1)

@@ -1,4 +1,5 @@
 DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
+DATE_FORMAT = '%Y-%m-%d'

 HREF_FORMAT = "<a href='{url}'>{url}</a>"
 IMG_FORMAT = '<img src="{url}" height="400" width="400" referrerpolicy="never"/><br/>'
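DATE_FORMAT exists because the group page only exposes a creation date, not a full timestamp. A quick sketch of how the diff's "创建于" regex and the new format string fit together; the sample sidebar text is illustrative:

import re
from datetime import datetime

DATE_FORMAT = '%Y-%m-%d'

# Illustrative sidebar text as it might appear on a group page.
created_text = '创建于2019-01-01 组长：someone'
created = datetime.strptime(re.findall(r'创建于(.+?) ', created_text)[0], DATE_FORMAT)
print(created)  # 2019-01-01 00:00:00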

douban_group_spy/settings.py (+3 −3)

@@ -120,7 +120,7 @@

 STATIC_URL = '/static/'

-DOUBAN_BASE_HOST = 'https://api.douban.com'
+DOUBAN_BASE_HOST = 'https://www.douban.com'

-GROUP_TOPICS_BASE_URL = '{}/v2/group/{}/topics'
-GROUP_INFO_BASE_URL = '{}/v2/group/{}/'
+GROUP_TOPICS_BASE_URL = '{}/group/{}/discussion'
+GROUP_INFO_BASE_URL = '{}/group/{}/'
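These templates are filled with str.format in crawler_main.py, and the web discussion list paginates by absolute topic offset, 25 topics per page, which is why the crawler now sends 'start': p * 25. A small sketch of the resulting URLs (the group id 10086 is borrowed from the CLI help text above):

DOUBAN_BASE_HOST = 'https://www.douban.com'
GROUP_TOPICS_BASE_URL = '{}/group/{}/discussion'

# Page p of a group's discussion list is addressed by offset p * 25.
for p in range(3):
    url = GROUP_TOPICS_BASE_URL.format(DOUBAN_BASE_HOST, '10086')
    print(f'{url}?start={p * 25}')
# https://www.douban.com/group/10086/discussion?start=0
# https://www.douban.com/group/10086/discussion?start=25
# https://www.douban.com/group/10086/discussion?start=50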

requirements.txt (+12 −3)

@@ -1,4 +1,13 @@
-django==2.2.20
+beautifulsoup4==4.9.3
+certifi==2021.5.30
+charset-normalizer==2.0.2
+Click==7.0
+Django==2.2.20
+idna==3.2
 jsonfield==2.0.2
-click==7.0.0
-requests
+lxml==4.6.3
+pytz==2021.1
+requests==2.26.0
+soupsieve==2.2.1
+sqlparse==0.4.1
+urllib3==1.26.6
