Skip to content

Commit 80a7b77

Browse files
Merge pull request #207 from sparcs-kaist/fix/portal-login
Fix KAIST portal login
2 parents 139dd9e + 8920fdb commit 80a7b77

File tree

1 file changed

+117
-75
lines changed

1 file changed

+117
-75
lines changed

apps/core/management/scripts/portal_crawler.py

+117-75
Original file line numberDiff line numberDiff line change
@@ -16,43 +16,60 @@
1616
from ara.settings import PORTAL_ID, PORTAL_PASSWORD
1717

1818
# Legacy credentials payload for the pre-IAM2 SSO2 endpoint. NOTE(review):
# the rewritten _login_kaist_portal below no longer posts this form — kept
# only for backward compatibility; confirm before removing.
LOGIN_INFO_SSO2 = dict(
    userid=PORTAL_ID,
    password=PORTAL_PASSWORD,
    saveid="on",
    phase="pass1",
)


# Credentials payload posted to the IAM2 single sign-on login API.
LOGIN_INFO_SSO = dict(
    user_id=PORTAL_ID,
    pw=PORTAL_PASSWORD,
    login_page="L_P_COMMON",
)


# Root URL of the KAIST portal; board/list and board/read paths hang off it.
BASE_URL = "https://portal.kaist.ac.kr"
3534

3635

3736
def _login_kaist_portal():
    """Authenticate against KAIST IAM2 SSO and return a logged-in session.

    Returns:
        requests.Session: a session carrying the portal's auth cookies,
        usable for subsequent board/article requests.

    Raises:
        requests.HTTPError: if the IAM2 login API responds with an error
            status.
        KeyError: if the login response JSON lacks the expected dataMap
            structure (e.g. wrong credentials).
    """
    session = requests.Session()

    # Hitting the portal root redirects to the IAM2 login page; the final
    # redirected URL carries a one-time parameter id as its last "=" value.
    init_response = session.get(
        "https://portal.kaist.ac.kr/portal/", allow_redirects=True
    )
    login_param_id = init_response.url.split("=")[-1]

    login_response = session.post(
        "https://iam2.kaist.ac.kr/api/sso/login",
        data={**LOGIN_INFO_SSO, "param_id": login_param_id},
    )
    # Fail loudly on an HTTP-level error instead of dying later with an
    # opaque KeyError while indexing the JSON body.
    login_response.raise_for_status()

    # Parse the JSON body once (the original parsed it twice).
    data_map = login_response.json()["dataMap"]
    k_uid = data_map["USER_INFO"]["kaist_uid"]
    state = data_map["state"]

    # Complete the SSO handshake so the portal sets its session cookie.
    session.post(
        "https://portal.kaist.ac.kr/statics/redirectUri.jsp",
        data={
            "k_uid": k_uid,
            "state": state,
            "success": "true",
            "result": login_response.text,
            "user_id": PORTAL_ID,
        },
    )

    return session
4563

4664

4765
def _get_article(url, session):
48-
4966
def _already_hyperlinked(html):
50-
soup = bs(html, 'lxml')
67+
soup = bs(html, "lxml")
5168
tagged_links = []
5269
for child in soup.descendants:
53-
name = getattr(child, 'name', None)
70+
name = getattr(child, "name", None)
5471
if name:
55-
linked = child.attrs.get('src') or child.attrs.get('href')
72+
linked = child.attrs.get("src") or child.attrs.get("href")
5673
if linked:
5774
tagged_links.append(linked)
5875

@@ -64,7 +81,7 @@ def _enable_hyperlink(s):
6481
links = [x[0] for x in url]
6582

6683
start_index = 0
67-
new_string = ''
84+
new_string = ""
6885
already_hyperlinked = _already_hyperlinked(s)
6986
for link in links:
7087
start = start_index + s[start_index:].find(link)
@@ -83,38 +100,51 @@ def _enable_hyperlink(s):
83100
return new_string
84101

85102
article_req = session.get(url)
86-
soup = bs(article_req.text, 'lxml')
87-
88-
writer = soup.find('th', text='작성자(소속)').findNext('td').select('label')[0].contents[0].strip()
89-
created_at_str = soup.find('th', text='작성일(조회수)').findNext('td').contents[0].strip().split('(')[0]
90-
created_at = timezone.get_current_timezone().localize(datetime.strptime(created_at_str, '%Y.%m.%d %H:%M:%S'))
91-
title = soup.select('table > tbody > tr > td.req_first')[0].contents[0]
92-
93-
trs = soup.select('table > tbody > tr')
103+
soup = bs(article_req.text, "lxml")
104+
105+
writer = (
106+
soup.find("th", text="작성자(소속)")
107+
.findNext("td")
108+
.select("label")[0]
109+
.contents[0]
110+
.strip()
111+
)
112+
created_at_str = (
113+
soup.find("th", text="작성일(조회수)")
114+
.findNext("td")
115+
.contents[0]
116+
.strip()
117+
.split("(")[0]
118+
)
119+
created_at = timezone.get_current_timezone().localize(
120+
datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
121+
)
122+
title = soup.select("table > tbody > tr > td.req_first")[0].contents[0]
123+
124+
trs = soup.select("table > tbody > tr")
94125
html = None
95126

96127
for tr in trs:
97128
if len(list(tr.children)) == 3:
98-
html = tr.find('td').prettify()
129+
html = tr.find("td").prettify()
99130
break
100131

101132
html = _enable_hyperlink(html)
102133

103134
if html is None:
104-
raise RuntimeError(gettext('No content for portal article'))
135+
raise RuntimeError(gettext("No content for portal article"))
105136

106-
content_text = ' '.join(bs(html, features='html5lib').find_all(text=True))
137+
content_text = " ".join(bs(html, features="html5lib").find_all(text=True))
107138

108139
return {
109-
'title': title,
110-
'content_text': content_text,
111-
'content': html,
112-
'writer': writer,
113-
'created_at': created_at,
140+
"title": title,
141+
"content_text": content_text,
142+
"content": html,
143+
"writer": writer,
144+
"created_at": created_at,
114145
}
115146

116147

117-
118148
def crawl_hour(day=None):
119149
# parameter에서 default로 바로 today()하면, 캐싱되어서 업데이트가 안됨
120150
if day is None:
@@ -125,24 +155,25 @@ def crawl_hour(day=None):
125155
def _get_board_today(page_num):
126156
today = True
127157
board_req = session.get(
128-
f'{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC')
129-
soup = bs(board_req.text, 'lxml')
158+
f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC"
159+
)
160+
soup = bs(board_req.text, "lxml")
130161
linklist = []
131-
links = soup.select('table > tbody > tr > td > a')
132-
dates = soup.select('table > tbody > tr > td:nth-child(5)')
162+
links = soup.select("table > tbody > tr > td > a")
163+
dates = soup.select("table > tbody > tr > td:nth-child(5)")
133164

134165
if links:
135-
print('------- portal login success!')
166+
print("------- portal login success!")
136167
else:
137-
print('------- portal login failed!')
168+
print("------- portal login failed!")
138169

139-
today_date = str(day).replace('-', '.')
170+
today_date = str(day).replace("-", ".")
140171
for link, date in zip(links, dates):
141172
article_date = date.get_text()
142173
if article_date > today_date:
143174
continue
144175
elif article_date == today_date:
145-
linklist.append({'link': link.attrs['href'], 'date': article_date})
176+
linklist.append({"link": link.attrs["href"], "date": article_date})
146177
else:
147178
today = False
148179
return linklist, today
@@ -163,99 +194,110 @@ def _get_board_today(page_num):
163194
page_num += 1
164195

165196
for link in links:
166-
link = link['link']
167-
board_id = link.split('/')[-2]
168-
num = link.split('/')[-1]
169-
full_link = f'{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko'
197+
link = link["link"]
198+
board_id = link.split("/")[-2]
199+
num = link.split("/")[-1]
200+
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
170201

171202
info = _get_article(full_link, session)
172203

173204
# Since it is time ordered, consequent ones have been posted more than 1 hour ago.
174205

175-
exist = UserProfile.objects.filter(nickname=info['writer'], is_newara=False)
206+
exist = UserProfile.objects.filter(nickname=info["writer"], is_newara=False)
176207
if exist:
177208
user = exist.first().user
178209
else:
179-
user = get_user_model().objects.create(username=str(uuid.uuid1()), is_active=False)
210+
user = get_user_model().objects.create(
211+
username=str(uuid.uuid1()), is_active=False
212+
)
180213
user_profile = UserProfile.objects.create(
181214
is_newara=False,
182215
user=user,
183-
nickname=info['writer'],
184-
picture='user_profiles/default_pictures/KAIST-logo.png',
216+
nickname=info["writer"],
217+
picture="user_profiles/default_pictures/KAIST-logo.png",
185218
)
186219

187220
a, created = Article.objects.get_or_create(
188221
url=full_link,
189222
defaults={
190-
'parent_board_id': 1, # 포탈공지 게시판
191-
'title': info['title'],
192-
'content': info['content'],
193-
'content_text': info['content_text'],
194-
'created_by': user,
195-
}
223+
"parent_board_id": 1, # 포탈공지 게시판
224+
"title": info["title"],
225+
"content": info["content"],
226+
"content_text": info["content_text"],
227+
"created_by": user,
228+
},
196229
)
197230

198231
if created:
199-
a.created_at = info['created_at']
232+
a.created_at = info["created_at"]
200233
a.save()
201-
print(f'crawled id: {a.id} - {a.title}')
234+
print(f"crawled id: {a.id} - {a.title}")
202235

203236

204237
def crawl_all():
    """Crawl every notice on the portal's today_notice board into Articles.

    Walks the board listing page by page until an empty page is returned.
    Each page's articles are inserted inside one DB transaction. A writer
    not yet known as a non-newara UserProfile gets a placeholder inactive
    user account created for it.
    """
    session = _login_kaist_portal()

    def _get_board(page_num):
        # Return the article hrefs found on listing page `page_num`.
        board_req = session.get(
            f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&sortColumn=REG_DATIM&sortMethod=DESC"
        )
        soup = bs(board_req.text, "lxml")
        return [
            anchor.attrs["href"]
            for anchor in soup.select("table > tbody > tr > td > a")
        ]

    page_num = 1
    while True:
        print("page_num:", page_num)
        links = _get_board(page_num)
        if not links:
            # An empty listing page means we've walked past the last notice.
            break

        with transaction.atomic():
            for link in tqdm(links):
                # Listing hrefs end in .../<board_id>/<bulletin_no>.
                board_id = link.split("/")[-2]
                num = link.split("/")[-1]
                full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
                info = _get_article(full_link, session)

                exist = UserProfile.objects.filter(
                    nickname=info["writer"], is_newara=False
                )
                if exist:
                    user = exist.first().user
                else:
                    # Unknown portal writer: create a placeholder account
                    # (inactive, random username) plus its profile.
                    user = get_user_model().objects.create(
                        username=str(uuid.uuid1()), is_active=False
                    )
                    UserProfile.objects.create(
                        is_newara=False,
                        user=user,
                        nickname=info["writer"],
                        picture="user_profiles/default_pictures/KAIST-logo.png",
                    )

                article = Article.objects.create(
                    parent_board_id=1,  # portal notice board (포탈공지 게시판)
                    title=info["title"],
                    content=info["content"],
                    content_text=info["content_text"],
                    created_by=user,
                    url=full_link,
                )

                # Overwrite the auto-set creation timestamp with the
                # portal's own posting date.
                article.created_at = info["created_at"]
                article.save()

        page_num += 1
300+
301+
302+
# Manual smoke test: running this module directly attempts a portal login.
if __name__ == "__main__":
    _login_kaist_portal()

0 commit comments

Comments
 (0)