"""
beautiful_soup_helper.py
Module used for implementing some wrapper functions for BeautifulSoup
"""
from bs4 import BeautifulSoup, Comment
import requests
from datetime import date
class Http404Exception(Exception):
def __init__(self, invalid_url):
super(Http404Exception, self).__init__("Attempt to access invalid URL %s." % invalid_url)


def str_to_date(date_string):
    """ Convert a PitchFx date string to a date object
    :param date_string: a PitchFx date string in year/month/day form
    :return: the date object representing the string
    """
    date_members = date_string.split("/")
    date_object = date(int(date_members[0]), int(date_members[1]), int(date_members[2]))
    return date_object
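
# Illustrative example (an assumption inferred from the year/month/day split order above):
# str_to_date("2016/04/05") returns datetime.date(2016, 4, 5)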


def url_to_comment_soup(url):
    """ In order to mine content that the page loads via JavaScript, mine the HTML comments
    :param url: the absolute URL string
    :return: the BeautifulSoup object built from the page's HTML comments
    :raises Http404Exception: if the URL returns a 404 status
    """
    response = requests.get(url)
    if response.status_code == 404:
        print("Attempt to access invalid URL: " + response.url)
        raise Http404Exception(url)
    soup_initial = BeautifulSoup(response.text, "lxml")
    # Collect every HTML comment node in the document
    soup_comments = soup_initial.find_all(string=lambda text: isinstance(text, Comment))
    # Join the comment contents and parse them as a document of their own
    comment_html = "".join(soup_comments)
    return BeautifulSoup(comment_html, "lxml")


def url_to_soup(url):
    """ Take a URL and get the BeautifulSoup object for the page
    :param url: the absolute URL string
    :return: the BeautifulSoup object for the page
    :raises Http404Exception: if the URL returns a 404 status
    """
    response = requests.get(url)
    if response.status_code == 404:
        print("Attempt to access invalid URL: " + response.url)
        raise Http404Exception(url)
    return BeautifulSoup(response.text, "lxml")


def get_soup_from_url(url):
    """ Fetch a URL and build its soup, retrying up to five times on socket errors
    :param url: the absolute URL string
    :return: the BeautifulSoup object, or None if the URL is invalid (404)
    """
    for _ in range(5):
        try:
            soup = url_to_soup(url)
        except IOError:
            print("Socket error. Trying to obtain soup again.")
            continue
        except Http404Exception:
            return None
        return soup

    print("Exhausted all attempts to get the soup. Check your internet connection.")
    assert 0


def get_comment_soup_from_url(url):
    """ Fetch a URL and build the soup of its HTML comments, retrying up to five times on socket errors
    :param url: the absolute URL string
    :return: the BeautifulSoup object of the comments, or None if the URL is invalid (404)
    """
    for _ in range(5):
        try:
            soup = url_to_comment_soup(url)
        except IOError:
            print("Socket error. Trying to obtain soup again.")
            continue
        except Http404Exception:
            return None
        return soup

    print("Exhausted all attempts to get the soup. Check your internet connection.")
    assert 0
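

# Minimal usage sketch. This block is illustrative only and not part of the original
# module; the URL below is a hypothetical placeholder, assumed to point at a page whose
# data tables are embedded in HTML comments.
if __name__ == "__main__":
    example_url = "https://www.example.com/some/page"  # hypothetical placeholder
    soup = get_soup_from_url(example_url)
    if soup is not None:
        print(soup.title)
    comment_soup = get_comment_soup_from_url(example_url)
    if comment_soup is not None:
        print(len(comment_soup.find_all("table")))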