import re
import time
from datetime import datetime
+ from datetime import date
from itertools import cycle
+ from bs4 import BeautifulSoup

from django.core.exceptions import ObjectDoesNotExist
from django.utils.timezone import make_aware

- from douban_group_spy.const import USER_AGENT, DATETIME_FORMAT
+ from douban_group_spy.const import USER_AGENT, DATETIME_FORMAT, DATE_FORMAT

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'douban_group_spy.settings')
import django

def process_posts(posts, group, keywords, exclude):
-     for t in posts['topics']:
+     for t in posts:
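+         # posts is now a flat list of dicts scraped from the group's HTML pages,
+         # not the JSON API payload that wrapped topics in a 'topics' key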
        # ignore title or content including exclude keywords
        exclude_flag = False
        for e in exclude:
@@ -64,7 +66,7 @@ def process_posts(posts, group, keywords, exclude):
            post_id=t['id'], group=group,
            author_info=t['author'], alt=t['alt'],
            title=t['title'], content=t['content'],
-             photo_list=[i['alt'] for i in t['photos']],
+             photo_list=t['photos'],
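+             # t['photos'] already holds plain image URLs collected from the post's
+             # <img> tags below, so the old per-item 'alt' lookup is gone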
            # rent=0.0, subway='', contact='',
            is_matched=is_matched, keyword_list=keyword_list,
            created=make_aware(datetime.strptime(t['created'], DATETIME_FORMAT)),
@@ -79,14 +81,17 @@ def crawl(group_id, pages, keywords, exclude):
    try:
        group = Group.objects.get(id=group_id)
    except ObjectDoesNotExist:
-         g_info = requests.get(GROUP_INFO_BASE_URL.format(DOUBAN_BASE_HOST, group_id), headers={'User-Agent': USER_AGENT}).json()
+         html = requests.get(GROUP_INFO_BASE_URL.format(DOUBAN_BASE_HOST, group_id), headers={'User-Agent': USER_AGENT}).text
+         g_info = BeautifulSoup(html, 'lxml')
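+         # group metadata is now parsed out of the group's public HTML page,
+         # presumably because the JSON API response is no longer usable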
        lg.info(f'Getting group: {group_id} successful')
+         member_count_text = g_info.select_one(f"a[href='https://www.douban.com/group/{group_id}/members']").get_text()
+         created_text = g_info.select_one('div[class="group-board"] p').get_text()
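+         # assumes the members link text carries the count in parentheses and the
+         # group-board paragraph contains a "创建于YYYY-MM-DD" (created on) date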
        group = Group(
-             id=g_info['uid'],
-             name=g_info['name'],
-             alt=g_info['alt'],
-             member_count=g_info['member_count'],
-             created=make_aware(datetime.strptime(g_info['created'], DATETIME_FORMAT))
+             id=group_id,
+             name=g_info.select_one('h1').get_text().strip(),
+             alt=g_info.select_one("div[class='group-intro']").get_text(),
+             member_count=int(re.findall(r'[(](.*?)[)]', member_count_text)[0]),
+             created=make_aware(datetime.strptime(re.findall(r"创建于(.+?) ", created_text)[0], DATE_FORMAT))
        )
        group.save(force_insert=True)

@@ -95,7 +100,7 @@ def crawl(group_id, pages, keywords, exclude):
        # host = next(douban_base_host)
        kwargs = {
            'url': GROUP_TOPICS_BASE_URL.format(DOUBAN_BASE_HOST, group_id),
-             'params': {'start': p},
+             'params': {'start': p * 25},
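+             # the HTML topic list paginates by item offset (25 topics per page),
+             # so page index p maps to start=p * 25 instead of a page number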
            'headers': {'User-Agent': USER_AGENT}
        }
        req = getattr(requests, 'get')(**kwargs)
@@ -111,9 +116,31 @@ def crawl(group_id, pages, keywords, exclude):
            lg.warning(f'Fail to getting: {req.url}, status: {req.status_code}')
            continue

-         posts = req.json()
+         soup = BeautifulSoup(req.text, 'lxml')
+         posts = []
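+         # each row of the topic table is scraped into a dict shaped like the old
+         # API's topic objects, so process_posts keeps working on the same keys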
+         for row in soup.select('table[class="olt"] tr[class=""]'):
+             link = row.select_one('td[class="title"] a')
+             link_href = link["href"]
+             post_detail_html = requests.get(link_href, headers={'User-Agent': USER_AGENT}).text
+             post_detail = BeautifulSoup(post_detail_html, 'lxml')
+             post_content = post_detail.select_one('div[class="topic-content"]')
+             post_photos = []
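+             # every inline <img> in the post body is collected as a photo URL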
+             for photo_row in post_content.select('img'):
+                 post_photos.append(photo_row['src'])
+
+             result = {}
+             result['id'] = int(re.findall(r"https://www.douban.com/group/topic/(.+?)/", link_href)[0])
+             result['title'] = link["title"]
+             result['content'] = post_content.get_text().strip()
+             result['alt'] = link_href
+             author_link = row.select("td")[1].select_one('a')
+             result['author'] = {'name': author_link.get_text(), 'alt': author_link["href"]}
+             result['photos'] = post_photos
+             result['created'] = post_detail.select_one('.create-time').get_text()
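+             # the fourth table cell (presumably the last-reply time) shows only
+             # "MM-DD HH:MM", so the current year and ":00" seconds are added to
+             # reconstruct a full DATETIME_FORMAT timestamp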
+             result['updated'] = f'{date.today().year}-{row.select("td")[3].get_text()}:00'
+             posts.append(result)
        process_posts(posts, group, keywords, exclude)
-
+

@click.command(help='example: python crawler_main.py -g 10086 -g 12345 -k xx花园 -k xx地铁 -e 求租')
@click.option('--groups', '-g', help='group id', required=True, multiple=True, type=str)