-
Notifications
You must be signed in to change notification settings - Fork 0
/
jstorDaily_noContent.py
179 lines (130 loc) · 5.42 KB
/
jstorDaily_noContent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from bs4 import BeautifulSoup
import requests
import json
import time
import re
# List of dictionaries - each dictionary holds one article's metadata.
jdaily_articles = []
# Start with the oldest article and follow rel="next" links forward:
# http://daily.jstor.org/maya-angelou-has-died/
url = 'http://daily.jstor.org/maya-angelou-has-died'
a_article = requests.get(url)
if a_article.status_code != 200:
    print("There was an error with", url)
# Parse the page's complete HTML.
soup = BeautifulSoup(a_article.text, "html.parser")
# Author byline: strip the leading "By" and surrounding whitespace.
# BUG FIX: the original only assigned art_author inside `if 'By' in ...`,
# leaving it undefined (NameError below) when the byline lacked "By";
# replace() is a no-op when "By" is absent, so assign unconditionally.
art_author = soup.find('span', attrs={"class": "entry-author"}).text.replace('By', '').strip()
# Other article metadata.
art_date = soup.find("span", attrs={"class": "entry-date"}).text
# Title: drop embedded newlines and trim. Assigned unconditionally for the
# same reason as the author (original left art_title undefined when the
# title contained no '\n').
art_title = soup.find("h1", attrs={"class": "entry-title single-title"}).text.replace('\n', '').strip()
# DOI suffixes of the JSTOR articles cited as sources.
jstor_dois = []
for link in soup.find_all("h3", attrs={"class": "citation-title"}):
    a_link = link.find("a")
    if a_link is None:
        # Citation heading without an anchor - nothing to extract.
        continue
    href = a_link.get('href')
    jstor_dois.append(href.replace('http://www.jstor.org/stable/', ''))
# Journal name / volume / issue text of each citation.
journals_sourced = [src.text for src in soup.find_all('p', attrs={"class": "citation-source"})]
# The article's topic tags.
jdaily_tags = [tag.string for tag in soup.find_all("a", attrs={"rel": "tag"})]
# Link to the next (newer) article; drives the crawl loop that follows.
n_link = soup.find("a", attrs={"rel": "next"})
# Assemble this article's record and store it.
this_article = {
    'date': art_date,
    'jdaily_author': art_author,
    'jdaily_title': art_title,
    'sources': jstor_dois,
    'tags': jdaily_tags,
    'journals': journals_sourced,
    'publishers': "None listed",
}
jdaily_articles.append(this_article)
print(this_article)
# Follow rel="next" links until the newest article (which has no next link).
while n_link is not None:
    url = n_link['href']
    a_article = requests.get(url)
    # Be polite to the server between requests.
    time.sleep(1)
    if a_article.status_code != 200:
        # Stop crawling on a bad response rather than parsing an error page.
        print("There was an error with", url)
        break
    soup = BeautifulSoup(a_article.text, "html.parser")
    # Author byline: strip the leading "By". Assigned unconditionally -
    # the original only assigned inside `if 'By' in ...`, which silently
    # carried the PREVIOUS article's author forward when the guard failed.
    art_author = soup.find('span', attrs={"class": "entry-author"}).text.replace('By', '').strip()
    # Other article metadata.
    art_date = soup.find("span", attrs={"class": "entry-date"}).text
    # Title: drop embedded newlines and trim; unconditional for the same
    # stale-value reason as the author.
    art_title = soup.find("h1", attrs={"class": "entry-title single-title"}).text.replace('\n', '').strip()
    # DOI suffixes of the JSTOR articles cited as sources.
    # BUG FIX: the original re-appended the last DOI a second time after
    # this loop, duplicating one source per article.
    jstor_dois = []
    for link in soup.find_all("h3", attrs={"class": "citation-title"}):
        a_link = link.find("a")
        if a_link is None:
            # Citation heading without an anchor - nothing to extract.
            continue
        href = a_link.get('href')
        jstor_dois.append(href.replace('http://www.jstor.org/stable/', ''))
    # Journal name / volume / issue text of each citation.
    journals_sourced = [src.text for src in soup.find_all('p', attrs={"class": "citation-source"})]
    # NOTE(review): publishers are scraped from the SAME "citation-source"
    # elements as the journals, so the two lists are identical - confirm
    # the intended publisher selector (kept as-is to preserve output).
    publishers_sourced = [pub.text for pub in soup.find_all('p', attrs={"class": "citation-source"})]
    # The article's topic tags.
    jdaily_tags = [tag.string for tag in soup.find_all("a", attrs={"rel": "tag"})]
    # Next page link; None on the final article ends the loop.
    n_link = soup.find("a", attrs={"rel": "next"})
    # Assemble this article's record and store it.
    this_article = {
        'date': art_date,
        'jdaily_author': art_author,
        'jdaily_title': art_title,
        'sources': jstor_dois,
        'tags': jdaily_tags,
        'journals': journals_sourced,
        'publishers': publishers_sourced,
    }
    jdaily_articles.append(this_article)
    print(this_article)
    print("\n")
# Persist every scraped article record as pretty-printed JSON.
# Idiom: json.dump serializes straight to the file handle instead of
# building the whole document in memory via dumps() + write().
with open('scraped_jdaily.json', 'w') as f:
    json.dump(jdaily_articles, f, indent=4)