@@ -5,12 +5,14 @@
 import re
 import time
 from datetime import datetime
+from datetime import date
 from itertools import cycle
+from bs4 import BeautifulSoup
 
 from django.core.exceptions import ObjectDoesNotExist
 from django.utils.timezone import make_aware
 
-from douban_group_spy.const import USER_AGENT, DATETIME_FORMAT
+from douban_group_spy.const import USER_AGENT, DATETIME_FORMAT, DATE_FORMAT
 
 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'douban_group_spy.settings')
 import django
@@ -29,7 +31,7 @@
 
 
 def process_posts(posts, group, keywords, exclude):
-    for t in posts['topics']:
+    for t in posts:
         # ignore title or content including exclude keywords
         exclude_flag = False
         for e in exclude:
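
For reference, the old Douban JSON API wrapped topics in a {'topics': [...]} envelope, which is why the loop no longer indexes into posts['topics']: the scraping code later in this diff builds a flat list of dicts carrying the same keys the JSON topics had. A minimal sketch of that shape, with illustrative values only (not taken from the patch):

    # Sketch (assumption, not part of the patch): the dict shape each scraped
    # post carries, mirroring the keys the old JSON API returned per topic.
    example_post = {
        'id': 123456789,
        'title': 'an example topic title',
        'content': '',    # listing pages carry no body text
        'alt': '',
        'author': {'name': 'example-user', 'alt': 'https://www.douban.com/people/example-user/'},
        'photos': [],
        'created': '1970-01-01 00:00:00',   # placeholder; the listing omits creation time
        'updated': '2020-01-02 13:45:00',
    }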
@@ -79,14 +81,17 @@ def crawl(group_id, pages, keywords, exclude):
     try:
         group = Group.objects.get(id=group_id)
     except ObjectDoesNotExist:
-        g_info = requests.get(GROUP_INFO_BASE_URL.format(DOUBAN_BASE_HOST, group_id), headers={'User-Agent': USER_AGENT}).json()
+        html = requests.get(GROUP_INFO_BASE_URL.format(DOUBAN_BASE_HOST, group_id), headers={'User-Agent': USER_AGENT}).text
+        g_info = BeautifulSoup(html, 'lxml')
         lg.info(f'Getting group: {group_id} successful')
+        member_count_text = g_info.select_one(f"a[href='https://www.douban.com/group/{group_id}/members']").get_text()
+        created_text = g_info.select_one('div[class="group-board"] p').get_text()
         group = Group(
-            id=g_info['uid'],
-            name=g_info['name'],
-            alt=g_info['alt'],
-            member_count=g_info['member_count'],
-            created=make_aware(datetime.strptime(g_info['created'], DATETIME_FORMAT))
+            id=group_id,
+            name=g_info.select_one('h1').get_text().strip(),
+            alt=g_info.select_one("div[class='group-intro']").get_text(),
+            member_count=int(re.findall(r'[(](.*?)[)]', member_count_text)[0]),
+            created=make_aware(datetime.strptime(re.findall(r"创建于(.+?) ", created_text)[0], DATE_FORMAT))
         )
         group.save(force_insert=True)
 
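
The two regex extractions above are the fragile part of this hunk. A standalone sketch of what they pull out of the scraped board text, assuming page text shaped the way Douban renders it (the sample strings and the DATE_FORMAT value are assumptions, not from the patch):

    import re
    from datetime import datetime

    # Assumed sample text mimicking the group page.
    member_count_text = '浏览所有小组成员 (123456)'        # "browse all group members (123456)"
    created_text = '创建于2010-05-01 小组组长：someone'    # "created on 2010-05-01, owner: someone"

    # The ASCII-parenthesis regex grabs whatever sits between '(' and ')'.
    member_count = int(re.findall(r'[(](.*?)[)]', member_count_text)[0])   # -> 123456
    # The non-greedy match stops at the first space after the date;
    # DATE_FORMAT is assumed to be '%Y-%m-%d' here.
    created = datetime.strptime(re.findall(r'创建于(.+?) ', created_text)[0], '%Y-%m-%d')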
@@ -95,7 +100,7 @@ def crawl(group_id, pages, keywords, exclude):
         # host = next(douban_base_host)
         kwargs = {
             'url': GROUP_TOPICS_BASE_URL.format(DOUBAN_BASE_HOST, group_id),
-            'params': {'start': p},
+            'params': {'start': p * 25},
             'headers': {'User-Agent': USER_AGENT}
         }
         req = getattr(requests, 'get')(**kwargs)
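
The start parameter change implies the HTML listing paginates by topic offset rather than by page index; the 25-rows-per-page figure is inferred from the multiplier, not stated in the patch. A trivial sketch of the query params produced per page index p:

    # Assumed illustration of the offsets sent as the 'start' query parameter.
    for p in range(3):
        params = {'start': p * 25}
        print(params)   # {'start': 0}, {'start': 25}, {'start': 50} -> rows 1-25, 26-50, 51-75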
@@ -111,9 +116,23 @@ def crawl(group_id, pages, keywords, exclude):
             lg.warning(f'Fail to getting: {req.url}, status: {req.status_code}')
             continue
 
-        posts = req.json()
+        soup = BeautifulSoup(req.text, 'lxml')
+        posts = []
+        for row in soup.select('table[class="olt"] tr[class=""]'):
+            result = {}
+            link = row.select_one('td[class="title"] a')
+            result['id'] = int(re.findall(r"https://www.douban.com/group/topic/(.+?)/", link["href"])[0])
+            result['title'] = link["title"]
+            result['content'] = ''
+            result['alt'] = ''
+            author_link = row.select("td")[1].select_one('a')
+            result['author'] = {'name': author_link.get_text(), 'alt': author_link["href"]}
+            result['photos'] = []
+            result['created'] = '1970-01-01 00:00:00'
+            result['updated'] = f'{date.today().year}-{row.select("td")[3].get_text()}:00'
+            posts.append(result)
         process_posts(posts, group, keywords, exclude)
-
+
 
 @click.command(help='example: python crawler_main.py -g 10086 -g 12345 -k xx花园 -k xx地铁 -e 求租')
 @click.option('--groups', '-g', help='group id', required=True, multiple=True, type=str)
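
A note on the timestamps built in the listing loop above: the topic table only shows a month-day "last reply" time, so the code prepends the current year and appends ':00' for seconds, and 'created' is stubbed with the Unix epoch because the listing omits it. A standalone sketch, with an assumed cell value and an assumed DATETIME_FORMAT of '%Y-%m-%d %H:%M:%S':

    from datetime import date, datetime

    cell_text = '01-02 13:45'   # assumed "last reply" cell text from the listing table
    updated_raw = f'{date.today().year}-{cell_text}:00'   # e.g. '2024-01-02 13:45:00'
    updated = datetime.strptime(updated_raw, '%Y-%m-%d %H:%M:%S')

Worth flagging: around a year boundary this stamps December replies with the new year, since only the current year is available on the listing page.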