-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsite_to_pdf.py
65 lines (58 loc) · 2.35 KB
/
site_to_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import urllib.request as urllib2
from bs4 import BeautifulSoup
from fpdf import FPDF
from fpdf import HTMLMixin
class create_pdf(object):
    """Scrape Paul Graham's essays and write them into ``paul_graham.pdf``.

    The three helpers below are plain functions stored in the class
    namespace (none of them takes ``self``), so they can be called as
    ``create_pdf.get_urls()`` and so on.  The statements at the bottom of
    the class body run while the ``class`` statement itself is executed:
    they download every essay and write the PDF.

    Because a function defined in a class body cannot see other names from
    that class body, each helper is deliberately self-contained.
    """

    # Getting the urls of articles in archive page and save them in an array
    def get_urls():
        """Return the absolute URLs of the essays linked from the archive page.

        Raises:
            IndexError: if the archive page has fewer than two tables with
                ``width="435"``.
        """
        # The context manager closes the HTTP response once it is parsed.
        with urllib2.urlopen("http://www.paulgraham.com/articles.html") as page:
            soup = BeautifulSoup(page, 'html.parser')
        # The essay links are taken from the second 435-wide table.
        anchors = soup.find_all('table', {'width': '435'})[1].find_all('a', href=True)
        # Links containing "sep.yimg" are skipped.
        return [
            "http://www.paulgraham.com/" + anchor["href"]
            for anchor in anchors
            if "sep.yimg" not in anchor["href"]
        ]

    # Getting the Article of given link
    def get_article(link):
        """Return the essay markup found at *link* as a string.

        The first ``<font>`` element of the first 435-wide table is used when
        it is at least 100 characters long and contains none of the known
        banner phrases; otherwise the table's ``<p>`` elements are
        concatenated instead.

        Returns:
            The markup string, or ``None`` (after printing the error) when
            the expected table or font element is missing.
        """
        banner_phrases = (
            'Get funded by',
            'Watch how this essay was',
            'Like to build things?',
        )
        try:
            with urllib2.urlopen(link) as page:
                soup = BeautifulSoup(page, 'html.parser')
            body_table = soup.find_all('table', {'width': '435'})[0]
            font = str(body_table.find_all('font')[0])
        except IndexError as error:
            print(error)
            return None

        if len(font) >= 100 and not any(phrase in font for phrase in banner_phrases):
            return font
        return ''.join(str(par) for par in body_table.find_all('p'))

    # Eliminate the tags from given String and write it to the file
    def add_to_pdf(text_content, pdf):
        """Strip a fixed set of tags from *text_content* and add it to *pdf*.

        Args:
            text_content: Essay markup as returned by ``get_article``; when it
                is ``None`` nothing is added.
            pdf: The FPDF document that receives a new page with the text.

        ``UnicodeEncodeError`` and ``AttributeError`` are printed rather than
        raised, so one bad essay does not abort the caller's loop.
        """
        if text_content is None:
            # get_article already reported why this essay has no content.
            return

        # Applied in order.  Line breaks become newlines; the listed opening
        # tags and "</font>" become a single space.  Separate replacements
        # for doubled "<br/>" tags would never match after the single-tag
        # ones have run, so they are not listed.
        replacements = (
            ("<br />", "\n"),
            ("<br/>", "\n"),
            ("<font face=\"verdana\" size=\"2\">", " "),
            ("</font>", " "),
            ("<b>", " "),
            ("<i>", " "),
            ("<ol>", " "),
        )
        try:
            new_text = text_content
            for tag, substitute in replacements:
                new_text = new_text.replace(tag, substitute)

            pdf.add_page()
            pdf.set_xy(0, 0)
            # Registered on every call so this function works on any FPDF
            # instance.  NOTE(review): presumably a no-op in fpdf once the
            # font is already registered - confirm.
            pdf.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
            pdf.set_font('DejaVu', '', 14.0)
            pdf.write(10, new_text)
        except (UnicodeEncodeError, AttributeError) as error:
            print(error)

    # Pipeline: runs while the class body is executed.  The helpers are
    # reachable by bare name here because this code is in the class scope.
    pdf = FPDF()
    article_links = get_urls()
    for link in article_links:
        content_html = get_article(link)
        add_to_pdf(content_html, pdf)
        print(link)
    pdf.output('paul_graham.pdf', 'F')
    print("done")