Commit

Merge pull request #208 from sparcs-kaist/develop
2021/07/07 v1.1.12 update
victory-jooyon authored Jul 7, 2021
2 parents 2927638 + 80a7b77 commit e44f975
Showing 3 changed files with 132 additions and 75 deletions.
192 changes: 117 additions & 75 deletions apps/core/management/scripts/portal_crawler.py
@@ -16,43 +16,60 @@
 from ara.settings import PORTAL_ID, PORTAL_PASSWORD

 LOGIN_INFO_SSO2 = {
-    'userid': PORTAL_ID,
-    'password': PORTAL_PASSWORD,
-    'saveid': 'on',
-    'phase': 'pass1',
+    "userid": PORTAL_ID,
+    "password": PORTAL_PASSWORD,
+    "saveid": "on",
+    "phase": "pass1",
 }


 LOGIN_INFO_SSO = {
-    'userid': PORTAL_ID,
-    'password': PORTAL_PASSWORD,
-    'saveid': 'on',
-    'phase': 'pass2',
+    "user_id": PORTAL_ID,
+    "pw": PORTAL_PASSWORD,
+    "login_page": "L_P_COMMON",
 }


-BASE_URL = 'https://portal.kaist.ac.kr'
+BASE_URL = "https://portal.kaist.ac.kr"


 def _login_kaist_portal():
     session = requests.Session()
-    login_req1 = session.post('https://portalsso.kaist.ac.kr/ssoProcess2.ps', data=LOGIN_INFO_SSO2,)
-    login_req2 = session.post('https://portalsso.kaist.ac.kr/ssoProcess.ps', data=LOGIN_INFO_SSO,)
-
-    print(f'sso2: {login_req1.status_code} & sso: {login_req2.status_code}')
+    init_response = session.get(
+        "https://portal.kaist.ac.kr/portal/", allow_redirects=True
+    )
+    login_param_id = init_response.url.split("=")[-1]
+
+    login_response = session.post(
+        "https://iam2.kaist.ac.kr/api/sso/login",
+        data={**LOGIN_INFO_SSO, "param_id": login_param_id,},
+    )
+
+    k_uid = login_response.json()["dataMap"]["USER_INFO"]["kaist_uid"]
+    state = login_response.json()["dataMap"]["state"]
+
+    session.post(
+        "https://portal.kaist.ac.kr/statics/redirectUri.jsp",
+        data={
+            "k_uid": k_uid,
+            "state": state,
+            "success": "true",
+            "result": login_response.text,
+            "user_id": PORTAL_ID,
+        },
+    )

     return session


 def _get_article(url, session):
-
     def _already_hyperlinked(html):
-        soup = bs(html, 'lxml')
+        soup = bs(html, "lxml")
         tagged_links = []
         for child in soup.descendants:
-            name = getattr(child, 'name', None)
+            name = getattr(child, "name", None)
             if name:
-                linked = child.attrs.get('src') or child.attrs.get('href')
+                linked = child.attrs.get("src") or child.attrs.get("href")
                 if linked:
                     tagged_links.append(linked)

@@ -64,7 +81,7 @@ def _enable_hyperlink(s):
         links = [x[0] for x in url]

         start_index = 0
-        new_string = ''
+        new_string = ""
         already_hyperlinked = _already_hyperlinked(s)
         for link in links:
             start = start_index + s[start_index:].find(link)
@@ -83,38 +100,51 @@ def _enable_hyperlink(s):
         return new_string

     article_req = session.get(url)
-    soup = bs(article_req.text, 'lxml')
-
-    writer = soup.find('th', text='작성자(소속)').findNext('td').select('label')[0].contents[0].strip()
-    created_at_str = soup.find('th', text='작성일(조회수)').findNext('td').contents[0].strip().split('(')[0]
-    created_at = timezone.get_current_timezone().localize(datetime.strptime(created_at_str, '%Y.%m.%d %H:%M:%S'))
-    title = soup.select('table > tbody > tr > td.req_first')[0].contents[0]
-
-    trs = soup.select('table > tbody > tr')
+    soup = bs(article_req.text, "lxml")
+
+    writer = (
+        soup.find("th", text="작성자(소속)")
+        .findNext("td")
+        .select("label")[0]
+        .contents[0]
+        .strip()
+    )
+    created_at_str = (
+        soup.find("th", text="작성일(조회수)")
+        .findNext("td")
+        .contents[0]
+        .strip()
+        .split("(")[0]
+    )
+    created_at = timezone.get_current_timezone().localize(
+        datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
+    )
+    title = soup.select("table > tbody > tr > td.req_first")[0].contents[0]
+
+    trs = soup.select("table > tbody > tr")
     html = None

     for tr in trs:
         if len(list(tr.children)) == 3:
-            html = tr.find('td').prettify()
+            html = tr.find("td").prettify()
             break

     html = _enable_hyperlink(html)

     if html is None:
-        raise RuntimeError(gettext('No content for portal article'))
+        raise RuntimeError(gettext("No content for portal article"))

-    content_text = ' '.join(bs(html, features='html5lib').find_all(text=True))
+    content_text = " ".join(bs(html, features="html5lib").find_all(text=True))

     return {
-        'title': title,
-        'content_text': content_text,
-        'content': html,
-        'writer': writer,
-        'created_at': created_at,
+        "title": title,
+        "content_text": content_text,
+        "content": html,
+        "writer": writer,
+        "created_at": created_at,
     }


-
 def crawl_hour(day=None):
     # If today() were used directly as the parameter default, it would be evaluated once, cached, and never updated
     if day is None:
@@ -125,24 +155,25 @@ def _get_board_today(page_num):
     def _get_board_today(page_num):
         today = True
         board_req = session.get(
-            f'{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC')
-        soup = bs(board_req.text, 'lxml')
+            f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC"
+        )
+        soup = bs(board_req.text, "lxml")
         linklist = []
-        links = soup.select('table > tbody > tr > td > a')
-        dates = soup.select('table > tbody > tr > td:nth-child(5)')
+        links = soup.select("table > tbody > tr > td > a")
+        dates = soup.select("table > tbody > tr > td:nth-child(5)")

         if links:
-            print('------- portal login success!')
+            print("------- portal login success!")
         else:
-            print('------- portal login failed!')
+            print("------- portal login failed!")

-        today_date = str(day).replace('-', '.')
+        today_date = str(day).replace("-", ".")
         for link, date in zip(links, dates):
             article_date = date.get_text()
             if article_date > today_date:
                 continue
             elif article_date == today_date:
-                linklist.append({'link': link.attrs['href'], 'date': article_date})
+                linklist.append({"link": link.attrs["href"], "date": article_date})
             else:
                 today = False
         return linklist, today
@@ -163,99 +194,110 @@ def _get_board_today(page_num):
         page_num += 1

     for link in links:
-        link = link['link']
-        board_id = link.split('/')[-2]
-        num = link.split('/')[-1]
-        full_link = f'{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko'
+        link = link["link"]
+        board_id = link.split("/")[-2]
+        num = link.split("/")[-1]
+        full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"

         info = _get_article(full_link, session)

         # Since it is time ordered, consequent ones have been posted more than 1 hour ago.

-        exist = UserProfile.objects.filter(nickname=info['writer'], is_newara=False)
+        exist = UserProfile.objects.filter(nickname=info["writer"], is_newara=False)
         if exist:
             user = exist.first().user
         else:
-            user = get_user_model().objects.create(username=str(uuid.uuid1()), is_active=False)
+            user = get_user_model().objects.create(
+                username=str(uuid.uuid1()), is_active=False
+            )
             user_profile = UserProfile.objects.create(
                 is_newara=False,
                 user=user,
-                nickname=info['writer'],
-                picture='user_profiles/default_pictures/KAIST-logo.png',
+                nickname=info["writer"],
+                picture="user_profiles/default_pictures/KAIST-logo.png",
             )

         a, created = Article.objects.get_or_create(
             url=full_link,
             defaults={
-                'parent_board_id': 1, # portal notice board
-                'title': info['title'],
-                'content': info['content'],
-                'content_text': info['content_text'],
-                'created_by': user,
-            }
+                "parent_board_id": 1, # portal notice board
+                "title": info["title"],
+                "content": info["content"],
+                "content_text": info["content_text"],
+                "created_by": user,
+            },
         )

         if created:
-            a.created_at = info['created_at']
+            a.created_at = info["created_at"]
             a.save()
-            print(f'crawled id: {a.id} - {a.title}')
+            print(f"crawled id: {a.id} - {a.title}")


 def crawl_all():
     session = _login_kaist_portal()

     def _get_board(page_num):
         board_req = session.get(
-            f'{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&sortColumn=REG_DATIM&sortMethod=DESC')
-        soup = bs(board_req.text, 'lxml')
+            f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&sortColumn=REG_DATIM&sortMethod=DESC"
+        )
+        soup = bs(board_req.text, "lxml")
         link = []
-        titles = soup.select('table > tbody > tr > td > a')
+        titles = soup.select("table > tbody > tr > td > a")
         for title in titles:
-            link.append(title.attrs['href'])
+            link.append(title.attrs["href"])

         return link

     page_num = 1

     while True:
-        print('page_num:', page_num)
+        print("page_num:", page_num)
         links = []
         link = _get_board(page_num)
         if link:
             links.extend(link)

             with transaction.atomic():
                 for link in tqdm(links):
-                    board_id = link.split('/')[-2]
-                    num = link.split('/')[-1]
-                    full_link = f'{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko'
+                    board_id = link.split("/")[-2]
+                    num = link.split("/")[-1]
+                    full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
                     info = _get_article(full_link, session)

-                    exist = UserProfile.objects.filter(nickname=info['writer'], is_newara=False)
+                    exist = UserProfile.objects.filter(
+                        nickname=info["writer"], is_newara=False
+                    )
                     if exist:
                         user = exist.first().user
                     else:
-                        user = get_user_model().objects.create(username=str(uuid.uuid1()), is_active=False)
+                        user = get_user_model().objects.create(
+                            username=str(uuid.uuid1()), is_active=False
+                        )
                         user_profile = UserProfile.objects.create(
                             is_newara=False,
                             user=user,
-                            nickname=info['writer'],
-                            picture='user_profiles/default_pictures/KAIST-logo.png',
+                            nickname=info["writer"],
+                            picture="user_profiles/default_pictures/KAIST-logo.png",
                         )

                     a = Article.objects.create(
                         parent_board_id=1, # portal notice board
-                        title=info['title'],
-                        content=info['content'],
-                        content_text=info['content_text'],
+                        title=info["title"],
+                        content=info["content"],
+                        content_text=info["content_text"],
                         created_by=user,
                         url=full_link,
                     )

-                    a.created_at = info['created_at']
+                    a.created_at = info["created_at"]
                     a.save()

             page_num += 1

         else:
             break
+
+
+if __name__ == "__main__":
+    _login_kaist_portal()
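
The rewritten _login_kaist_portal() above swaps the old portalsso.kaist.ac.kr endpoints for a three-step IAM2 flow: fetch the login param_id from the portal redirect, authenticate against https://iam2.kaist.ac.kr/api/sso/login, then post the returned k_uid/state back to redirectUri.jsp. Below is a minimal sketch of driving the crawler by hand, assuming a configured Django environment for this project; the settings-module name follows the "from ara.settings import ..." line above and the import path follows the file location, both of which are assumptions to adjust for the actual deployment.

import os

import django

# Assumed settings module, based on the ara.settings import in the crawler.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ara.settings")
django.setup()

from apps.core.management.scripts.portal_crawler import crawl_hour

# Crawl portal notices dated today; pass an explicit datetime.date
# to re-crawl a specific day instead.
crawl_hour()
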
6 changes: 6 additions & 0 deletions apps/core/serializers/article.py
@@ -39,6 +39,9 @@ def get_my_scrap(obj) -> typing.Optional[dict]:

         return BaseScrapSerializer(my_scrap).data

+    def get_is_mine(self, obj) -> bool:
+        return self.context['request'].user == obj.created_by
+
     def get_is_hidden(self, obj) -> bool:
         if self.validate_hidden(obj):
             return True
@@ -304,6 +307,9 @@ def get_side_articles_of_recent_article(self, obj, request):
         source='comment_set',
     )

+    is_mine = serializers.SerializerMethodField(
+        read_only=True,
+    )
     is_hidden = serializers.SerializerMethodField(
         read_only=True,
     )
9 changes: 9 additions & 0 deletions apps/core/serializers/comment.py
@@ -20,6 +20,9 @@ def get_my_vote(obj) -> typing.Optional[bool]:

         return my_vote.is_positive

+    def get_is_mine(self, obj) -> bool:
+        return self.context['request'].user == obj.created_by
+
     def get_is_hidden(self, obj) -> bool:
         if self.validate_hidden(obj):
             return True
@@ -76,6 +79,9 @@ class CommentSerializer(BaseCommentSerializer):
     my_vote = serializers.SerializerMethodField(
         read_only=True,
     )
+    is_mine = serializers.SerializerMethodField(
+        read_only=True,
+    )
     is_hidden = serializers.SerializerMethodField(
         read_only=True,
     )
@@ -101,6 +107,9 @@ class CommentListActionSerializer(BaseCommentSerializer):
     my_vote = serializers.SerializerMethodField(
         read_only=True,
     )
+    is_mine = serializers.SerializerMethodField(
+        read_only=True,
+    )
     is_hidden = serializers.SerializerMethodField(
         read_only=True,
     )
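
Both serializer files gain the same is_mine flag: a SerializerMethodField whose getter simply compares the requesting user with the object's created_by. Below is a small illustrative sketch of the expected behaviour, assuming a DRF request in the serializer context and an already-loaded Comment instance named comment; the request-factory scaffolding and import path are inferred from the file paths above and are not part of this commit.

from rest_framework.test import APIRequestFactory

from apps.core.serializers.comment import CommentSerializer

# comment is assumed to be an existing Comment instance fetched elsewhere.
request = APIRequestFactory().get("/api/comments/")
request.user = comment.created_by  # pretend the author is making the request

data = CommentSerializer(comment, context={"request": request}).data
assert data["is_mine"]  # authors see their own comments flagged as mine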