-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcsmedia.py
243 lines (216 loc) · 8.57 KB
/
csmedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# pylama:ignore=W0401,W0612
import requests
import re
import time
from bs4 import BeautifulSoup
from datetime import datetime
from general_functions import *
def cl_search_txt(soup, class_str, index = 0):
    """Return the text of one element matched by a CSS selector.

    Args:
        soup: Object exposing ``select(selector)`` (a parsed document).
        class_str: CSS selector handed to ``soup.select``.
        index: Position of the wanted element among the matches.

    Returns:
        The ``text`` attribute of the selected element.

    Raises:
        IndexError: If fewer than ``index + 1`` elements match.
    """
    matches = soup.select(class_str)
    element = matches[index]
    return element.text
def restrip(str, find, replace=""):
    """Replace every ``find`` in ``str`` with ``replace`` and trim the result.

    Returns:
        The substituted string with leading and trailing whitespace removed.
    """
    return str.replace(find, replace).strip()
def text_add(str, add, pretext="\n"):
    """Append ``add`` to ``str``, separated by ``pretext`` (a newline by default)."""
    with_separator = str + pretext
    return with_separator + add
def build_url(movie_title, lib_type, dict, library_types = ['movie', 'show']):
    """Build the search URL for a title.

    Args:
        movie_title: Title to search for. It is printed, then every character
            other than ASCII letters, digits, spaces and apostrophes is
            dropped and spaces become ``%20``.
        lib_type: Library kind; compared against ``library_types``.
        dict: Mapping holding the URL prefixes under ``'movie'`` and ``'tv'``
            (``'base'`` is read as well but never ends up in the result).
        library_types: Two-item sequence; item 0 selects the ``'movie'``
            prefix and item 1 selects the ``'tv'`` prefix.

    Returns:
        The prefix followed by the cleaned title, or ``""`` when ``lib_type``
        matches neither entry of ``library_types``.
    """
    prefix = dict.get('base')  # always replaced below for a supported type
    if lib_type == library_types[0]:
        prefix = dict.get('movie')
    elif lib_type == library_types[1]:
        prefix = dict.get('tv')
    else:
        return ""
    print(movie_title)
    cleaned = re.sub("[^a-zA-Z0-9 ']", '', movie_title)
    return prefix + cleaned.replace(' ', '%20')
def get_age(date):
    """Return how long ago ``date`` was, in years of 365 days.

    Args:
        date: A ``datetime`` comparable with ``datetime.now()``.

    Returns:
        Elapsed time as a float number of 365-day years.
    """
    elapsed = datetime.now() - date
    hours = elapsed.total_seconds() / 3600
    days = hours / 24
    return days / 365
def page_search(urls, search, lib, movie_dict, backup_search = ""):
    """Download candidate pages in order and return the first that matches.

    Every URL is fetched with ``requests.get`` and its text is tested against
    ``search``. The first page that matches is returned immediately. When no
    page matches ``search`` and ``backup_search`` is non-empty, the first page
    whose text matched ``backup_search`` is returned instead.

    Args:
        urls: Iterable of page URLs to try, in priority order.
        search: Primary pattern; converted with ``str()`` and used as a
            regular expression.
        lib: Unused; kept so existing callers keep working.
        movie_dict: Unused; kept so existing callers keep working.
        backup_search: Fallback pattern (also a regular expression); ``""``
            disables the fallback.

    Returns:
        A two-item list ``[page, url]``. Both items are ``None`` when nothing
        matched, including when ``urls`` is empty.
    """
    primary = str(search)
    use_fallback = backup_search != ""
    fallback = str(backup_search)
    # Remembered across the whole loop so that a fallback hit on an early
    # result is not forgotten while later results are being checked.
    backup = None
    for candidate in urls:
        page = requests.get(candidate)
        if re.search(primary, page.text) is not None:
            print(candidate)
            return [page, candidate]
        if use_fallback and backup is None:
            if re.search(fallback, page.text) is not None:
                # Keep the response itself so it need not be fetched again.
                backup = [page, candidate]
        # Throttle requests to the remote site during large library scans;
        # this should only slow down the first full run.
        time.sleep(0.4)
    if backup is not None:
        return backup
    return [None, None]
def get_search_results(URL, url_match, url_dict, skip_urls=[]):
    """Collect result links from a search page.

    The page at ``URL`` is downloaded and parsed, and every ``<a>`` whose
    ``href`` starts with ``url_match`` is turned into an absolute URL by
    prefixing ``url_dict['base']``.

    Args:
        URL: Address of the search page to download.
        url_match: Prefix (used as a regular expression anchored at the start)
            that a link's ``href`` must have.
        url_dict: Mapping providing the ``'base'`` prefix.
        skip_urls: URLs to leave out of the result.

    Returns:
        List of unique absolute URLs in page order, excluding ``skip_urls``.
    """
    response = requests.get(URL)
    parsed = BeautifulSoup(response.content, 'html.parser')
    href_pattern = re.compile('^' + url_match)
    found = []
    for anchor in parsed.find_all('a', href=href_pattern):
        candidate = url_dict.get('base') + anchor.get('href')
        if candidate in found or candidate in skip_urls:
            continue
        found.append(candidate)
    return found
#url = "https://www.commonsensemedia.org/movie-reviews/dc-league-of-super-pets"
def scrape_CSM_page(movie_dict, page, parents_review = False):
    """Extract the age rating and review text from a downloaded review page.

    On a 200 response the page is parsed and two keys are written into
    ``movie_dict``: ``'cs_age'`` (the rating as an ``int``) and
    ``'cs_summary'`` (a text block that starts with ``[Common Sense Media]``
    and ends with a ``[Date:YYYY-MM-DD]`` stamp). On any other status an error
    is printed and ``movie_dict`` is left untouched.

    Args:
        movie_dict: Mapping updated in place with the scraped values.
        page: Response object providing ``status_code`` and ``content``.
        parents_review: When true, the second ``rating__age`` element on the
            page is used instead of the first (presumably the rating given by
            parents - confirm against the site markup).

    Returns:
        ``movie_dict`` (the same object that was passed in).

    Raises:
        IndexError: If an expected element is missing from the page.
        ValueError: If the rating text does not reduce to an integer.
        KeyError: If a content grid item has no ``data-text`` attribute.
    """
    if page.status_code != 200:
        print("ERROR BAD STATUS CODE")
        # Report the address of the failing page; the name ``URL`` that was
        # printed here before is not defined in this function.
        print(getattr(page, "url", ""))
        return movie_dict

    soup = BeautifulSoup(page.content, "html.parser")

    to_know = cl_search_txt(soup, "div[class^=review-view-parents-need-know]")
    to_know = restrip(to_know, "Show more")
    # Two passes so that runs of up to four newlines collapse to one.
    to_know = restrip(to_know, "\n\n", "\n")
    to_know = restrip(to_know, "\n\n", "\n")

    # Explicit comparison kept on purpose: only True (or 1) selects the
    # second rating element, exactly as before.
    if parents_review == True:  # noqa: E712
        age = cl_search_txt(soup, "span[class^=rating__age]", 1)
    else:
        age = cl_search_txt(soup, "span[class^=rating__age]")
    age = age.strip()
    # Keep only digits and dots from the rating text before converting it.
    cs_age = re.sub(r'[^\d.]+', '', age)
    movie_dict.update({"cs_age": int(cs_age)})

    text = "[Common Sense Media]"
    text = text_add(text, age, " ")
    text = text_add(text, to_know, "\n")
    for item in soup.select("div[class^=content-grid-item]"):
        label = restrip(item.text, "Not present", "")
        detail = item["data-text"]
        detail = re.sub('<[^>]*>', '', detail)  # drop embedded HTML tags
        # Drop the site's promotional sentences.
        detail = re.sub('Did you know [^?.!]*[?.!]', '', detail)
        detail = re.sub('Adjust limits [^?.!]*[?.!]', '', detail)
        detail = restrip(detail, "Join now")
        text = text_add(text, label, "\n|")
        text = text_add(text, detail, "|: ")

    text = restrip(text, "\n\n\n", "\n")
    text = restrip(text, "\n\n", "\n")
    text = restrip(text, "\n\n", "\n")
    # The date stamp lets later runs tell how old the summary is.
    text = text_add(text, "[Date:" + datetime.now().strftime('%Y-%m-%d') + "]")
    movie_dict.update({'cs_summary': text})
    return movie_dict
def CSM_get(movie, movie_dict, movies, lib_type='movie', url_dict = {}, update_age_factor = 1, library_types = ['movie', 'show'], parents_review = False): # Download information from Common Sense media
    """Find a title's review page and merge the scraped data into ``movie_dict``.

    Steps:
      1. If ``movie_dict['updated']`` is recent enough relative to the title's
         release date, return without doing anything.
      2. If the stored URL was already verified, download it directly.
         Otherwise search by title, look for a result page that contains the
         title's IMDb id (falling back to a page containing the quoted title),
         and retry with a shortened title when nothing is found.
      3. Record ``'url'`` and ``'verified'`` and scrape the page.

    Args:
        movie: Item exposing ``title``, ``guids`` (objects with an ``id``
            string) and ``originallyAvailableAt``.
        movie_dict: Per-title record; read for ``'updated'``, ``'verified'``
            and ``'url'`` and updated in place.
        movies: Passed through to ``page_search``.
        lib_type: Library kind, matched against ``library_types``.
        url_dict: URL configuration; ``'movie_reviews'`` / ``'tv_reviews'``
            are read here and the whole mapping is passed to ``build_url``
            and ``get_search_results``.
        update_age_factor: Threshold for the release-age / record-age ratio
            above which a record younger than five years is left alone.
        library_types: Two-item sequence naming the movie and show kinds.
        parents_review: Passed through to ``scrape_CSM_page``.

    Returns:
        ``movie_dict``, updated with whatever could be determined.
    """
    updated = movie_dict.get('updated')
    if updated is not None: # Skip the movie if it's been updated recent enough. Whether or not found/verified
        updated = datetime.strptime(updated, '%Y-%m-%d')
        oaa = movie.originallyAvailableAt
        if oaa is None:
            # No release date known: assume an old title.
            oaa = datetime.strptime('2010-01-01', '%Y-%m-%d')
        m_age = get_age(oaa)
        s_age = get_age(updated)
        if m_age/s_age > update_age_factor and s_age < 5:
            return movie_dict
    updated = datetime.now()

    imdb = [guid.id for guid in movie.guids if 'imdb' in guid.id]
    if len(imdb) == 0:
        update_log(movie.title + " Doesn't have an IMDbId in Plex")
        imdb = 'NOMATCH'
    else:
        imdb = re.sub('imdb://', '', imdb[0])

    # Check if the URL was verified against IMDB
    if movie_dict.get('verified') is True:
        url = movie_dict.get('url')
        page = requests.get(url)
    else:
        rep = 2
        url = None
        page = None
        URL = build_url(movie.title, lib_type, url_dict, library_types)
        backup_url = None
        print("searching " + movie.title + " in common sense media")
        # NOTE(review): find_u stays unset when lib_type matches neither
        # entry of library_types - confirm callers never pass such a value.
        if lib_type == library_types[0]:
            find_u = url_dict.get('movie_reviews')
        elif lib_type == library_types[1]:
            find_u = url_dict.get('tv_reviews')
        urls = get_search_results(URL, find_u, url_dict)
        # The title is matched literally as a fallback. Escaping it keeps
        # characters such as "(", "?" or "*" from being read as regex syntax.
        title_pattern = re.escape('"' + movie.title + '"')
        while rep > 0: # I want to re-search using a shorter search with this
            if len(urls) > 0:
                psearch = page_search(urls, imdb, movies, movie_dict, title_pattern)
                page = psearch[0]
                url = psearch[1]
            # A lone result of the first, full-title search is kept as a
            # last resort even if it cannot be verified.
            if backup_url is None and len(urls) == 1 and rep == 2:
                backup_url = urls[0]
            if (len(urls) == 0 or url is None) and rep != 0:
                print(movie.title + ' search had no verified results, trying a shorter search')
                # Shorten the title AFTER dropping "The ", so the shortened
                # query is built from the stripped title.
                m_title = re.sub("The ", "", movie.title)[0:7]
                URL = build_url(m_title, lib_type, url_dict, library_types)
                urls = get_search_results(URL, find_u, url_dict, urls)
                rep = rep - 1
            else:
                rep = 0
        if url is None and backup_url is not None:
            url = backup_url
            page = requests.get(url)
            print(movie.title + " Search only showed 1 unverified result. I'm going to use that")
        if page is None: # if no page
            movie_dict.update({"updated": updated.strftime('%Y-%m-%d')})
            movie_dict.update({"verified": False})
            update_log(movie.title + ": Missing from Common Sense Media")
            return movie_dict

    movie_dict.update({"url": url})
    # The match counts as verified only when the page mentions the IMDb id.
    search_m = re.search(str(imdb), page.text)
    if search_m is not None:
        movie_dict.update({"verified": True})
    else:
        movie_dict.update({"verified": False})
        update_log(movie.title + ": Unverified IMDb match")
        print(movie.title + " was pulled in but the match could not be verified")
    movie_dict = scrape_CSM_page(movie_dict, page, parents_review)
    return movie_dict
def CSM_age(summary): # Get date that Common Sense Media was updated
    """Return the age, in years, of the ``[Date:YYYY-MM-DD]`` stamp in ``summary``.

    Args:
        summary: Summary text that may contain a ``[Date:...]`` tag.

    Returns:
        The age of the first date tag as computed by ``get_age``. When the
        summary has no date tag, ``float('inf')`` is returned so that an
        undated summary compares as older than any dated one.

    Raises:
        ValueError: If the tag is present but is not a ``%Y-%m-%d`` date.
    """
    tag = re.search(r'\[Date:([0-9-]*)\]', summary)
    if tag is None:
        return float('inf')
    stamped = datetime.strptime(tag.group(1), '%Y-%m-%d')
    return get_age(stamped)
def check_csm(movie):
    """Tell whether ``movie.summary`` contains the ``[Common Sense Media]`` marker."""
    marker = "[Common Sense Media]"
    return marker in movie.summary
def should_i_get_csm(movie, update_old_summaries = True, update_age_factor = 1):
    """Decide whether a title's summary should be (re)fetched.

    Args:
        movie: Item exposing ``summary`` and ``originallyAvailableAt``.
        update_old_summaries: Only when this is exactly ``True`` are existing
            summaries considered for a refresh.
        update_age_factor: A summary is refreshed when the ratio of the
            title's age to the summary's age falls below this value.

    Returns:
        ``True`` when the summary has no Common Sense Media section, when its
        date stamp is missing or unreadable, when it is stale relative to the
        release date, or when it is more than ten years old; ``False``
        otherwise.
    """
    if check_csm(movie) is False:
        return True
    if update_old_summaries is not True:
        return False

    released = movie.originallyAvailableAt
    if released is None:
        # Same fallback date CSM_get uses for titles without a release date.
        released = datetime.strptime('2010-01-01', '%Y-%m-%d')

    try:
        s_age = CSM_age(movie.summary)
    except (AttributeError, ValueError):
        # No date stamp, or one that cannot be parsed: the section's age is
        # unknown, so refresh it.
        return True

    m_age = get_age(released)
    return m_age/s_age < update_age_factor or s_age > 10
def remove_csm(movie):
    """Return ``movie.summary`` with its Common Sense Media section removed.

    The section starts at the ``[Common Sense Media]`` marker and ends with
    the first ``[Date:...]`` tag that follows it; if no such tag follows, the
    section runs to the end of the summary.

    Args:
        movie: Item exposing a ``summary`` string.

    Returns:
        The summary without the section, stripped of surrounding whitespace.
        A summary without the marker is returned stripped but otherwise
        unchanged.
    """
    summary = movie.summary
    marker = "[Common Sense Media]"
    start = summary.find(marker)
    if start == -1:
        return summary.strip()
    # Look for the closing date tag only AFTER the marker, so a date tag
    # appearing earlier in the text cannot be mistaken for the section end.
    closing = re.compile(r'\[Date:[0-9-]*\]').search(summary, start + len(marker))
    end = len(summary) if closing is None else closing.end()
    return (summary[:start] + summary[end:]).strip()