crawl.py
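"""Search dblp for recent papers matching a keyword at CCF-A venues.

Reads dblp venue URLs from CCFAURL.txt (one per line; assumed to be venue
index URLs such as https://dblp.uni-trier.de/db/conf/sigcomm/index.html),
queries dblp's search page for the keyword restricted to each venue, and
writes the year and title of every hit newer than `year` to CCFsearch.txt.
"""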
import requests
from bs4 import BeautifulSoup
key = 'cloud'               # search keyword
year = 2018                 # keep only papers from this year onward
filename = 'CCFsearch.txt'  # output file for the crawled year + title lines

# CCFAURL.txt lists the dblp URLs of CCF-A conferences and journals,
# one per line; skip blank lines.
with open("CCFAURL.txt", encoding="utf-8") as f:
    newlinks = [line.strip() for line in f if line.strip()]
writeli=""
with open(filename, 'w') as file_object:
writeli += key + "\n"
for link in newlinks:
link_s = link.split('/')
if len(link_s) < 4 or link_s[-4] != 'db':
continue
jouc = link_s[-3]
aname = link_s[-2]
searchurl = 'https://dblp.uni-trier.de/search?q={que}%20streamid%3A{jorc}%2F{name}%3A'.format(que=key,jorc=jouc,name=aname)
strhtml = requests.get(searchurl)
soup=BeautifulSoup(strhtml.text,'html.parser')
titles = soup.select('span.title')
time = soup.select('span[itemprop="datePublished"]')
print(jouc+" "+aname)
writeli += jouc+" "+aname+"\n"
for i in range(len(titles)):
if int(time[i].text) < year:
break
writeli += time[i].text+" "+titles[i].text+"\n"
file_object.write(writeli)
file_object.close()
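
# Example (hypothetical input): if CCFAURL.txt contains the line
#   https://dblp.uni-trier.de/db/conf/sigcomm/index.html
# the script queries
#   https://dblp.uni-trier.de/search?q=cloud%20streamid%3Aconf%2Fsigcomm%3A
# and CCFsearch.txt ends up with blocks like
#   conf sigcomm
#   2019 <paper title>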