from ara.settings import PORTAL_ID, PORTAL_PASSWORD

LOGIN_INFO_SSO2 = {
-    'userid': PORTAL_ID,
-    'password': PORTAL_PASSWORD,
-    'saveid': 'on',
-    'phase': 'pass1',
+    "userid": PORTAL_ID,
+    "password": PORTAL_PASSWORD,
+    "saveid": "on",
+    "phase": "pass1",
}


LOGIN_INFO_SSO = {
-    'userid': PORTAL_ID,
-    'password': PORTAL_PASSWORD,
-    'saveid': 'on',
-    'phase': 'pass2',
+    "user_id": PORTAL_ID,
+    "pw": PORTAL_PASSWORD,
+    "login_page": "L_P_COMMON",
}


-BASE_URL = 'https://portal.kaist.ac.kr'
+BASE_URL = "https://portal.kaist.ac.kr"

def _login_kaist_portal():
    session = requests.Session()
-    login_req1 = session.post('https://portalsso.kaist.ac.kr/ssoProcess2.ps', data=LOGIN_INFO_SSO2,)
-    login_req2 = session.post('https://portalsso.kaist.ac.kr/ssoProcess.ps', data=LOGIN_INFO_SSO,)
-
-    print(f'sso2: {login_req1.status_code} & sso: {login_req2.status_code}')
+    init_response = session.get(
+        "https://portal.kaist.ac.kr/portal/", allow_redirects=True
+    )
+    login_param_id = init_response.url.split("=")[-1]
+
+    login_response = session.post(
+        "https://iam2.kaist.ac.kr/api/sso/login",
+        data={**LOGIN_INFO_SSO, "param_id": login_param_id},
+    )
+
+    k_uid = login_response.json()["dataMap"]["USER_INFO"]["kaist_uid"]
+    state = login_response.json()["dataMap"]["state"]
+
+    session.post(
+        "https://portal.kaist.ac.kr/statics/redirectUri.jsp",
+        data={
+            "k_uid": k_uid,
+            "state": state,
+            "success": "true",
+            "result": login_response.text,
+            "user_id": PORTAL_ID,
+        },
+    )

    return session
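
The rewritten login drops the old two-phase `ssoProcess` POSTs to `portalsso.kaist.ac.kr` and instead follows the portal's redirect to the IAM2 login page, lifts the trailing `param_id` from the redirect URL, posts the credentials to the iam2 API, and finally posts the returned `kaist_uid` and `state` back to `redirectUri.jsp` to bind the portal session. The new code assumes the JSON always carries `dataMap.USER_INFO.kaist_uid`. A minimal sketch of the same flow with explicit failure checks added; the function name and the error handling are illustrative, not part of this change:

import requests

def _login_kaist_portal_checked():
    session = requests.Session()
    # Step 1: follow the portal redirect to the IAM2 login page; its URL carries param_id.
    init_response = session.get("https://portal.kaist.ac.kr/portal/", allow_redirects=True)
    login_param_id = init_response.url.split("=")[-1]

    # Step 2: authenticate against the iam2 SSO API.
    login_response = session.post(
        "https://iam2.kaist.ac.kr/api/sso/login",
        data={**LOGIN_INFO_SSO, "param_id": login_param_id},
    )
    login_response.raise_for_status()  # fail loudly on HTTP errors

    data_map = login_response.json().get("dataMap", {})
    user_info = data_map.get("USER_INFO", {})
    if "kaist_uid" not in user_info or "state" not in data_map:
        raise RuntimeError("SSO login failed: unexpected response payload")

    # Step 3: hand the SSO result back to the portal to establish the session.
    session.post(
        "https://portal.kaist.ac.kr/statics/redirectUri.jsp",
        data={
            "k_uid": user_info["kaist_uid"],
            "state": data_map["state"],
            "success": "true",
            "result": login_response.text,
            "user_id": PORTAL_ID,
        },
    )
    return session
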

def _get_article(url, session):
-
    def _already_hyperlinked(html):
-        soup = bs(html, 'lxml')
+        soup = bs(html, "lxml")
        tagged_links = []
        for child in soup.descendants:
-            name = getattr(child, 'name', None)
+            name = getattr(child, "name", None)
            if name:
-                linked = child.attrs.get('src') or child.attrs.get('href')
+                linked = child.attrs.get("src") or child.attrs.get("href")
                if linked:
                    tagged_links.append(linked)
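
`_already_hyperlinked` walks every descendant tag and records each `src`/`href` attribute value, so that `_enable_hyperlink` can skip URLs already wrapped in a tag instead of nesting an anchor inside an anchor. The hunk cuts off before the function's return, but `_enable_hyperlink` consumes the collected list. An illustrative call, assuming `bs` is BeautifulSoup as imported at the top of the module:

html = '<p><a href="https://a.example">one</a> and plain https://b.example</p>'
# _already_hyperlinked(html) would collect ['https://a.example'];
# https://b.example is bare text, so _enable_hyperlink is free to wrap it.
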
@@ -64,7 +81,7 @@ def _enable_hyperlink(s):
        links = [x[0] for x in url]

        start_index = 0
-        new_string = ''
+        new_string = ""
        already_hyperlinked = _already_hyperlinked(s)
        for link in links:
            start = start_index + s[start_index:].find(link)
@@ -83,38 +100,51 @@ def _enable_hyperlink(s):
        return new_string

    article_req = session.get(url)
-    soup = bs(article_req.text, 'lxml')
-
-    writer = soup.find('th', text='작성자(소속)').findNext('td').select('label')[0].contents[0].strip()
-    created_at_str = soup.find('th', text='작성일(조회수)').findNext('td').contents[0].strip().split('(')[0]
-    created_at = timezone.get_current_timezone().localize(datetime.strptime(created_at_str, '%Y.%m.%d %H:%M:%S'))
-    title = soup.select('table > tbody > tr > td.req_first')[0].contents[0]
-
-    trs = soup.select('table > tbody > tr')
+    soup = bs(article_req.text, "lxml")
+
+    writer = (
+        soup.find("th", text="작성자(소속)")
+        .findNext("td")
+        .select("label")[0]
+        .contents[0]
+        .strip()
+    )
+    created_at_str = (
+        soup.find("th", text="작성일(조회수)")
+        .findNext("td")
+        .contents[0]
+        .strip()
+        .split("(")[0]
+    )
+    created_at = timezone.get_current_timezone().localize(
+        datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
+    )
+    title = soup.select("table > tbody > tr > td.req_first")[0].contents[0]
+
+    trs = soup.select("table > tbody > tr")
    html = None

    for tr in trs:
        if len(list(tr.children)) == 3:
-            html = tr.find('td').prettify()
+            html = tr.find("td").prettify()
            break

    html = _enable_hyperlink(html)

    if html is None:
-        raise RuntimeError(gettext('No content for portal article'))
+        raise RuntimeError(gettext("No content for portal article"))

-    content_text = ' '.join(bs(html, features='html5lib').find_all(text=True))
+    content_text = " ".join(bs(html, features="html5lib").find_all(text=True))

    return {
-        'title': title,
-        'content_text': content_text,
-        'content': html,
-        'writer': writer,
-        'created_at': created_at,
+        "title": title,
+        "content_text": content_text,
+        "content": html,
+        "writer": writer,
+        "created_at": created_at,
    }


-
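
One thing to watch in `_get_article`, untouched by this diff: `_enable_hyperlink(html)` is called before the `html is None` guard, so a page with no three-child `<tr>` would fail inside `_enable_hyperlink` rather than raise the intended `RuntimeError`. The timestamp handling itself localizes a naive `datetime` into the server timezone, exactly as the module does; a worked example with a made-up date string:

from datetime import datetime
from django.utils import timezone

created_at_str = "2020.03.01 09:30:00"  # hypothetical scraped value
created_at = timezone.get_current_timezone().localize(
    datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
)
# created_at is now timezone-aware, e.g. 2020-03-01 09:30:00+09:00 under Asia/Seoul
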
def crawl_hour(day=None):
    # If today() were used directly as the parameter default, it would be evaluated once and cached, so the date would never update
    if day is None:
@@ -125,24 +155,25 @@ def crawl_hour(day=None):
    def _get_board_today(page_num):
        today = True
        board_req = session.get(
-            f'{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC')
-        soup = bs(board_req.text, 'lxml')
+            f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC"
+        )
+        soup = bs(board_req.text, "lxml")
        linklist = []
-        links = soup.select('table > tbody > tr > td > a')
-        dates = soup.select('table > tbody > tr > td:nth-child(5)')
+        links = soup.select("table > tbody > tr > td > a")
+        dates = soup.select("table > tbody > tr > td:nth-child(5)")

        if links:
-            print('------- portal login success!')
+            print("------- portal login success!")
        else:
-            print('------- portal login failed!')
+            print("------- portal login failed!")

-        today_date = str(day).replace('-', '.')
+        today_date = str(day).replace("-", ".")
        for link, date in zip(links, dates):
            article_date = date.get_text()
            if article_date > today_date:
                continue
            elif article_date == today_date:
-                linklist.append({'link': link.attrs['href'], 'date': article_date})
+                linklist.append({"link": link.attrs["href"], "date": article_date})
            else:
                today = False
        return linklist, today
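
The `article_date > today_date` comparison is plain string comparison; it is safe because both sides are zero-padded "YYYY.MM.DD" strings, for which lexicographic order coincides with chronological order:

# Zero-padded "YYYY.MM.DD" strings sort chronologically as strings:
assert "2020.03.01" > "2020.02.28"
assert "2020.02.28" == "2020.02.28"
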
@@ -163,99 +194,110 @@ def _get_board_today(page_num):
        page_num += 1

    for link in links:
-        link = link['link']
-        board_id = link.split('/')[-2]
-        num = link.split('/')[-1]
-        full_link = f'{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko'
+        link = link["link"]
+        board_id = link.split("/")[-2]
+        num = link.split("/")[-1]
+        full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"

        info = _get_article(full_link, session)

        # Since the list is time-ordered, subsequent articles were posted more than an hour ago.

-        exist = UserProfile.objects.filter(nickname=info['writer'], is_newara=False)
+        exist = UserProfile.objects.filter(nickname=info["writer"], is_newara=False)
        if exist:
            user = exist.first().user
        else:
-            user = get_user_model().objects.create(username=str(uuid.uuid1()), is_active=False)
+            user = get_user_model().objects.create(
+                username=str(uuid.uuid1()), is_active=False
+            )
        user_profile = UserProfile.objects.create(
            is_newara=False,
            user=user,
-            nickname=info['writer'],
-            picture='user_profiles/default_pictures/KAIST-logo.png',
+            nickname=info["writer"],
+            picture="user_profiles/default_pictures/KAIST-logo.png",
        )

        a, created = Article.objects.get_or_create(
            url=full_link,
            defaults={
-                'parent_board_id': 1,  # portal notice board
-                'title': info['title'],
-                'content': info['content'],
-                'content_text': info['content_text'],
-                'created_by': user,
-            }
+                "parent_board_id": 1,  # portal notice board
+                "title": info["title"],
+                "content": info["content"],
+                "content_text": info["content_text"],
+                "created_by": user,
+            },
        )

        if created:
-            a.created_at = info['created_at']
+            a.created_at = info["created_at"]
            a.save()
-            print(f'crawled id: {a.id} - {a.title}')
+            print(f"crawled id: {a.id} - {a.title}")
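
Because `get_or_create` keys on `url` and applies everything else only through `defaults`, re-running `crawl_hour` over the same window is idempotent: an article already stored under that portal URL is returned rather than duplicated, and `created_at` is backfilled only for genuinely new rows. Illustratively, assuming the surrounding Django models and a configured environment:

defaults = {"parent_board_id": 1, "title": info["title"], "created_by": user}
a1, c1 = Article.objects.get_or_create(url=full_link, defaults=defaults)
a2, c2 = Article.objects.get_or_create(url=full_link, defaults=defaults)
# c1 is True the first time the URL is seen, c2 is False, and a1.pk == a2.pk
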
def crawl_all():
    session = _login_kaist_portal()

    def _get_board(page_num):
        board_req = session.get(
-            f'{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&sortColumn=REG_DATIM&sortMethod=DESC')
-        soup = bs(board_req.text, 'lxml')
+            f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&sortColumn=REG_DATIM&sortMethod=DESC"
+        )
+        soup = bs(board_req.text, "lxml")
        link = []
-        titles = soup.select('table > tbody > tr > td > a')
+        titles = soup.select("table > tbody > tr > td > a")
        for title in titles:
-            link.append(title.attrs['href'])
+            link.append(title.attrs["href"])

        return link
    page_num = 1

    while True:
-        print('page_num:', page_num)
+        print("page_num:", page_num)
        links = []
        link = _get_board(page_num)
        if link:
            links.extend(link)

            with transaction.atomic():
                for link in tqdm(links):
-                    board_id = link.split('/')[-2]
-                    num = link.split('/')[-1]
-                    full_link = f'{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko'
+                    board_id = link.split("/")[-2]
+                    num = link.split("/")[-1]
+                    full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
                    info = _get_article(full_link, session)

-                    exist = UserProfile.objects.filter(nickname=info['writer'], is_newara=False)
+                    exist = UserProfile.objects.filter(
+                        nickname=info["writer"], is_newara=False
+                    )
                    if exist:
                        user = exist.first().user
                    else:
-                        user = get_user_model().objects.create(username=str(uuid.uuid1()), is_active=False)
+                        user = get_user_model().objects.create(
+                            username=str(uuid.uuid1()), is_active=False
+                        )
                    user_profile = UserProfile.objects.create(
                        is_newara=False,
                        user=user,
-                        nickname=info['writer'],
-                        picture='user_profiles/default_pictures/KAIST-logo.png',
+                        nickname=info["writer"],
+                        picture="user_profiles/default_pictures/KAIST-logo.png",
                    )

                    a = Article.objects.create(
                        parent_board_id=1,  # portal notice board
-                        title=info['title'],
-                        content=info['content'],
-                        content_text=info['content_text'],
+                        title=info["title"],
+                        content=info["content"],
+                        content_text=info["content_text"],
                        created_by=user,
                        url=full_link,
                    )

-                    a.created_at = info['created_at']
+                    a.created_at = info["created_at"]
                    a.save()

            page_num += 1

        else:
            break
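
`crawl_all` pages through the board until `_get_board` returns an empty list, committing each page's inserts in one `transaction.atomic()` block so a mid-page failure rolls back cleanly. The same traversal can be written as a small generator; a sketch only, reusing `_get_board` from the enclosing scope, not part of this change:

def _iter_board_pages():
    # Hypothetical helper: yield (page_num, links) until an empty page ends the walk.
    page_num = 1
    while True:
        page_links = _get_board(page_num)
        if not page_links:
            return
        yield page_num, page_links
        page_num += 1
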
+
+
+if __name__ == "__main__":
+    _login_kaist_portal()