from ara.settings import PORTAL_ID, PORTAL_PASSWORD

LOGIN_INFO_SSO2 = {
-    'userid': PORTAL_ID,
-    'password': PORTAL_PASSWORD,
-    'saveid': 'on',
-    'phase': 'pass1',
+    "userid": PORTAL_ID,
+    "password": PORTAL_PASSWORD,
+    "saveid": "on",
+    "phase": "pass1",
}


LOGIN_INFO_SSO = {
-    'userid': PORTAL_ID,
-    'password': PORTAL_PASSWORD,
-    'saveid': 'on',
-    'phase': 'pass2',
+    "user_id": PORTAL_ID,
+    "pw": PORTAL_PASSWORD,
+    "login_page": "L_P_COMMON",
}


-BASE_URL = 'https://portal.kaist.ac.kr'
+BASE_URL = "https://portal.kaist.ac.kr"

def _login_kaist_portal():
    session = requests.Session()
-    login_req1 = session.post('https://portalsso.kaist.ac.kr/ssoProcess2.ps', data=LOGIN_INFO_SSO2,)
-    login_req2 = session.post('https://portalsso.kaist.ac.kr/ssoProcess.ps', data=LOGIN_INFO_SSO,)
-
-    print(f'sso2: {login_req1.status_code} & sso: {login_req2.status_code}')
+    init_response = session.get(
+        "https://portal.kaist.ac.kr/portal/", allow_redirects=True
+    )
+    login_param_id = init_response.url.split("=")[-1]
+
+    login_response = session.post(
+        "https://iam2.kaist.ac.kr/api/sso/login",
+        data={**LOGIN_INFO_SSO, "param_id": login_param_id},
+    )
+
+    k_uid = login_response.json()["dataMap"]["USER_INFO"]["kaist_uid"]
+    state = login_response.json()["dataMap"]["state"]
+
+    session.post(
+        "https://portal.kaist.ac.kr/statics/redirectUri.jsp",
+        data={
+            "k_uid": k_uid,
+            "state": state,
+            "success": "true",
+            "result": login_response.text,
+            "user_id": PORTAL_ID,
+        },
+    )

    return session
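
The rewritten login drops the old two-phase `ssoProcess` POSTs to `portalsso.kaist.ac.kr` and instead follows the portal's redirect to the IAM2 login page, lifts the trailing `param_id` from the redirect URL, posts the credentials to the iam2 API, and finally posts the returned `kaist_uid` and `state` back to `redirectUri.jsp` to bind the portal session. The new code assumes the JSON always carries `dataMap.USER_INFO.kaist_uid`. A minimal sketch of the same flow with explicit failure checks added; the function name and the error handling are illustrative, not part of this change:

import requests

def _login_kaist_portal_checked():
    session = requests.Session()
    # Step 1: follow the portal redirect to the IAM2 login page; its URL carries param_id.
    init_response = session.get("https://portal.kaist.ac.kr/portal/", allow_redirects=True)
    login_param_id = init_response.url.split("=")[-1]

    # Step 2: authenticate against the iam2 SSO API.
    login_response = session.post(
        "https://iam2.kaist.ac.kr/api/sso/login",
        data={**LOGIN_INFO_SSO, "param_id": login_param_id},
    )
    login_response.raise_for_status()  # fail loudly on HTTP errors

    data_map = login_response.json().get("dataMap", {})
    user_info = data_map.get("USER_INFO", {})
    if "kaist_uid" not in user_info or "state" not in data_map:
        raise RuntimeError("SSO login failed: unexpected response payload")

    # Step 3: hand the SSO result back to the portal to establish the session.
    session.post(
        "https://portal.kaist.ac.kr/statics/redirectUri.jsp",
        data={
            "k_uid": user_info["kaist_uid"],
            "state": data_map["state"],
            "success": "true",
            "result": login_response.text,
            "user_id": PORTAL_ID,
        },
    )
    return session
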

def _get_article(url, session):
-
    def _already_hyperlinked(html):
-        soup = bs(html, 'lxml')
+        soup = bs(html, "lxml")
        tagged_links = []
        for child in soup.descendants:
-            name = getattr(child, 'name', None)
+            name = getattr(child, "name", None)
            if name:
-                linked = child.attrs.get('src') or child.attrs.get('href')
+                linked = child.attrs.get("src") or child.attrs.get("href")
                if linked:
                    tagged_links.append(linked)
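
`_already_hyperlinked` walks every descendant tag and records each `src`/`href` attribute value, so that `_enable_hyperlink` can skip URLs already wrapped in a tag instead of nesting an anchor inside an anchor. The hunk cuts off before the function's return, but `_enable_hyperlink` consumes the collected list. An illustrative call, assuming `bs` is BeautifulSoup as imported at the top of the module:

html = '<p><a href="https://a.example">one</a> and plain https://b.example</p>'
# _already_hyperlinked(html) would collect ['https://a.example'];
# https://b.example is bare text, so _enable_hyperlink is free to wrap it.
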
@@ -64,7 +81,7 @@ def _enable_hyperlink(s):
        links = [x[0] for x in url]

        start_index = 0
-        new_string = ''
+        new_string = ""
        already_hyperlinked = _already_hyperlinked(s)
        for link in links:
            start = start_index + s[start_index:].find(link)
@@ -83,38 +100,51 @@ def _enable_hyperlink(s):
        return new_string

    article_req = session.get(url)
-    soup = bs(article_req.text, 'lxml')
-
-    writer = soup.find('th', text='작성자(소속)').findNext('td').select('label')[0].contents[0].strip()
-    created_at_str = soup.find('th', text='작성일(조회수)').findNext('td').contents[0].strip().split('(')[0]
-    created_at = timezone.get_current_timezone().localize(datetime.strptime(created_at_str, '%Y.%m.%d %H:%M:%S'))
-    title = soup.select('table > tbody > tr > td.req_first')[0].contents[0]
-
-    trs = soup.select('table > tbody > tr')
+    soup = bs(article_req.text, "lxml")
+
+    writer = (
+        soup.find("th", text="작성자(소속)")
+        .findNext("td")
+        .select("label")[0]
+        .contents[0]
+        .strip()
+    )
+    created_at_str = (
+        soup.find("th", text="작성일(조회수)")
+        .findNext("td")
+        .contents[0]
+        .strip()
+        .split("(")[0]
+    )
+    created_at = timezone.get_current_timezone().localize(
+        datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
+    )
+    title = soup.select("table > tbody > tr > td.req_first")[0].contents[0]
+
+    trs = soup.select("table > tbody > tr")
    html = None

    for tr in trs:
        if len(list(tr.children)) == 3:
-            html = tr.find('td').prettify()
+            html = tr.find("td").prettify()
            break

    html = _enable_hyperlink(html)

    if html is None:
-        raise RuntimeError(gettext('No content for portal article'))
+        raise RuntimeError(gettext("No content for portal article"))

-    content_text = ' '.join(bs(html, features='html5lib').find_all(text=True))
+    content_text = " ".join(bs(html, features="html5lib").find_all(text=True))

    return {
-        'title': title,
-        'content_text': content_text,
-        'content': html,
-        'writer': writer,
-        'created_at': created_at,
+        "title": title,
+        "content_text": content_text,
+        "content": html,
+        "writer": writer,
+        "created_at": created_at,
    }


-
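
One thing to watch in `_get_article`, untouched by this diff: `_enable_hyperlink(html)` is called before the `html is None` guard, so a page with no three-child `<tr>` would fail inside `_enable_hyperlink` rather than raise the intended `RuntimeError`. The timestamp handling itself localizes a naive `datetime` into the server timezone, exactly as the module does; a worked example with a made-up date string:

from datetime import datetime
from django.utils import timezone

created_at_str = "2020.03.01 09:30:00"  # hypothetical scraped value
created_at = timezone.get_current_timezone().localize(
    datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
)
# created_at is now timezone-aware, e.g. 2020-03-01 09:30:00+09:00 under Asia/Seoul
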
def crawl_hour(day=None):
    # If today() were used directly as the parameter default, it would be evaluated once and cached, so the date would never update
    if day is None:
@@ -125,24 +155,25 @@ def crawl_hour(day=None):
    def _get_board_today(page_num):
        today = True
        board_req = session.get(
-            f'{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC')
-        soup = bs(board_req.text, 'lxml')
+            f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC"
+        )
+        soup = bs(board_req.text, "lxml")
        linklist = []
-        links = soup.select('table > tbody > tr > td > a')
-        dates = soup.select('table > tbody > tr > td:nth-child(5)')
+        links = soup.select("table > tbody > tr > td > a")
+        dates = soup.select("table > tbody > tr > td:nth-child(5)")

        if links:
-            print('------- portal login success!')
+            print("------- portal login success!")
        else:
-            print('------- portal login failed!')
+            print("------- portal login failed!")

-        today_date = str(day).replace('-', '.')
+        today_date = str(day).replace("-", ".")
        for link, date in zip(links, dates):
            article_date = date.get_text()
            if article_date > today_date:
                continue
            elif article_date == today_date:
-                linklist.append({'link': link.attrs['href'], 'date': article_date})
+                linklist.append({"link": link.attrs["href"], "date": article_date})
            else:
                today = False
        return linklist, today
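
The `article_date > today_date` comparison is plain string comparison; it is safe because both sides are zero-padded "YYYY.MM.DD" strings, for which lexicographic order coincides with chronological order:

# Zero-padded "YYYY.MM.DD" strings sort chronologically as strings:
assert "2020.03.01" > "2020.02.28"
assert "2020.02.28" == "2020.02.28"
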
@@ -163,99 +194,110 @@ def _get_board_today(page_num):
        page_num += 1

    for link in links:
-        link = link['link']
-        board_id = link.split('/')[-2]
-        num = link.split('/')[-1]
-        full_link = f'{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko'
+        link = link["link"]
+        board_id = link.split("/")[-2]
+        num = link.split("/")[-1]
+        full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"

        info = _get_article(full_link, session)

        # Since the list is time-ordered, subsequent articles were posted more than an hour ago.

-        exist = UserProfile.objects.filter(nickname=info['writer'], is_newara=False)
+        exist = UserProfile.objects.filter(nickname=info["writer"], is_newara=False)
        if exist:
            user = exist.first().user
        else:
-            user = get_user_model().objects.create(username=str(uuid.uuid1()), is_active=False)
+            user = get_user_model().objects.create(
+                username=str(uuid.uuid1()), is_active=False
+            )
        user_profile = UserProfile.objects.create(
            is_newara=False,
            user=user,
-            nickname=info['writer'],
-            picture='user_profiles/default_pictures/KAIST-logo.png',
+            nickname=info["writer"],
+            picture="user_profiles/default_pictures/KAIST-logo.png",
        )

        a, created = Article.objects.get_or_create(
            url=full_link,
            defaults={
-                'parent_board_id': 1,  # portal notice board
-                'title': info['title'],
-                'content': info['content'],
-                'content_text': info['content_text'],
-                'created_by': user,
-            }
+                "parent_board_id": 1,  # portal notice board
+                "title": info["title"],
+                "content": info["content"],
+                "content_text": info["content_text"],
+                "created_by": user,
+            },
        )

        if created:
-            a.created_at = info['created_at']
+            a.created_at = info["created_at"]
            a.save()
-            print(f'crawled id: {a.id} - {a.title}')
+            print(f"crawled id: {a.id} - {a.title}")
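
Because `get_or_create` keys on `url` and applies everything else only through `defaults`, re-running `crawl_hour` over the same window is idempotent: an article already stored under that portal URL is returned rather than duplicated, and `created_at` is backfilled only for genuinely new rows. Illustratively, assuming the surrounding Django models and a configured environment:

defaults = {"parent_board_id": 1, "title": info["title"], "created_by": user}
a1, c1 = Article.objects.get_or_create(url=full_link, defaults=defaults)
a2, c2 = Article.objects.get_or_create(url=full_link, defaults=defaults)
# c1 is True the first time the URL is seen, c2 is False, and a1.pk == a2.pk
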
def crawl_all():
    session = _login_kaist_portal()

    def _get_board(page_num):
        board_req = session.get(
-            f'{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&sortColumn=REG_DATIM&sortMethod=DESC')
-        soup = bs(board_req.text, 'lxml')
+            f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&sortColumn=REG_DATIM&sortMethod=DESC"
+        )
+        soup = bs(board_req.text, "lxml")
        link = []
-        titles = soup.select('table > tbody > tr > td > a')
+        titles = soup.select("table > tbody > tr > td > a")
        for title in titles:
-            link.append(title.attrs['href'])
+            link.append(title.attrs["href"])

        return link
    page_num = 1

    while True:
-        print('page_num:', page_num)
+        print("page_num:", page_num)
        links = []
        link = _get_board(page_num)
        if link:
            links.extend(link)

            with transaction.atomic():
                for link in tqdm(links):
-                    board_id = link.split('/')[-2]
-                    num = link.split('/')[-1]
-                    full_link = f'{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko'
+                    board_id = link.split("/")[-2]
+                    num = link.split("/")[-1]
+                    full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
                    info = _get_article(full_link, session)

-                    exist = UserProfile.objects.filter(nickname=info['writer'], is_newara=False)
+                    exist = UserProfile.objects.filter(
+                        nickname=info["writer"], is_newara=False
+                    )
                    if exist:
                        user = exist.first().user
                    else:
-                        user = get_user_model().objects.create(username=str(uuid.uuid1()), is_active=False)
+                        user = get_user_model().objects.create(
+                            username=str(uuid.uuid1()), is_active=False
+                        )
                    user_profile = UserProfile.objects.create(
                        is_newara=False,
                        user=user,
-                        nickname=info['writer'],
-                        picture='user_profiles/default_pictures/KAIST-logo.png',
+                        nickname=info["writer"],
+                        picture="user_profiles/default_pictures/KAIST-logo.png",
                    )

                    a = Article.objects.create(
                        parent_board_id=1,  # portal notice board
-                        title=info['title'],
-                        content=info['content'],
-                        content_text=info['content_text'],
+                        title=info["title"],
+                        content=info["content"],
+                        content_text=info["content_text"],
                        created_by=user,
                        url=full_link,
                    )

-                    a.created_at = info['created_at']
+                    a.created_at = info["created_at"]
                    a.save()

            page_num += 1

        else:
            break
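
`crawl_all` pages through the board until `_get_board` returns an empty list, committing each page's inserts in one `transaction.atomic()` block so a mid-page failure rolls back cleanly. The same traversal can be written as a small generator; a sketch only, reusing `_get_board` from the enclosing scope, not part of this change:

def _iter_board_pages():
    # Hypothetical helper: yield (page_num, links) until an empty page ends the walk.
    page_num = 1
    while True:
        page_links = _get_board(page_num)
        if not page_links:
            return
        yield page_num, page_links
        page_num += 1
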
+
+
+if __name__ == "__main__":
+    _login_kaist_portal()