-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbook-importer.py
108 lines (97 loc) · 3.97 KB
/
book-importer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import re
import shlex
import sys
from urllib.request import urlopen

import html5lib
import lxml
import lxml.etree
# Base URL of the site books are downloaded from; the book id and the
# chapter paths scraped from the index page are appended to it.
SOURCE_URL = "http://www.69shu.com/"
def _parse_chapter_index(index_html, book_id):
    """Extract the chapter list from the serialized index page.

    Args:
        index_html: Text of the book's index page, as built in
            ``import_book`` (``str()`` of the serialized document).
        book_id: Identifier of the book; only links of the form
            ``/txt/<book_id>/<digits>`` are considered.

    Returns:
        A list of ``(title, url_path)`` tuples in page order.
    """
    # One compiled pattern with two groups replaces the per-chapter
    # recompilation of separate URL and title patterns. The title character
    # class is unchanged, so titles outside it are still not matched.
    entry_re = re.compile(
        r'<li><a href="(/txt/%s/\d+)">([0-9\.$&#;a-zA-Z \(\)~]+)</a></li>'
        % re.escape(str(book_id)))
    return [(title, url_path)
            for url_path, title in entry_re.findall(index_html)]


def _clean_chapter_html(serialized):
    """Strip wrapper markup from one serialized chapter element.

    Args:
        serialized: ``str()`` of the bytes returned by
            ``lxml.etree.tostring`` for the chapter element, i.e. the
            ``b'...'`` repr of those bytes.

    Returns:
        The cleaned chapter markup without the repr's leading ``b'`` and
        trailing quote.
    """
    # The order of these substitutions matters ('.' does not cross the
    # newlines inserted by the first step), so the sequence is kept as is.
    text = re.sub(r'<br />', '<br />\n', serialized)
    text = re.sub(r'<script.+/script>', '', text)
    text = re.sub(r'<div.*>', '', text)
    text = re.sub(r'</div>', '', text)
    text = re.sub(r'<!--.+-->', '', text)
    # Line breaks appear in the repr as a literal backslash followed by "n".
    text = re.sub(r'\\n', '\n', text)
    # NOTE(review): without re.MULTILINE these three only anchor at the very
    # start/end of the string, which begins with "b", so they look like
    # no-ops. Kept to preserve behavior; confirm whether MULTILINE was meant.
    text = re.sub(r'^\n', '', text)
    text = re.sub(r'^\W+\n', '', text)
    text = re.sub(r'^\W+$', '', text)
    return text[2:-1]


def _assemble_book(name, output_path, tmp_file):
    """Concatenate all chapter files into one HTML document at ``tmp_file``.

    Done with plain file I/O rather than ``echo``/``cat`` shell commands so
    that quotes, ``$`` or backticks in ``name`` cannot break or inject into a
    shell command line.
    """
    header = "<html><head><title>%s</title></head><body>" % (name)
    # "wb" truncates any tmp file left over from a previous run.
    with open(tmp_file, "wb") as book:
        # surrogateescape keeps undecodable command-line bytes from raising.
        book.write((header + "\n").encode("utf-8", "surrogateescape"))
        # Sorted, non-hidden regular files: zero-padded chapter names sort
        # into chapter order, as the shell glob "output_path/*" did.
        for entry in sorted(os.listdir(output_path)):
            part = os.path.join(output_path, entry)
            if entry.startswith(".") or not os.path.isfile(part):
                continue
            with open(part, "rb") as chapter_file:
                book.write(chapter_file.read())
        book.write(b"</body></html>\n")


def import_book(name, book_id):
    """Download a book from SOURCE_URL and convert it with kindlegen.

    The index page at ``SOURCE_URL/<book_id>/`` is scraped for chapter
    links. Each chapter is downloaded, cleaned and stored as
    ``output/<book_id>/<8-digit index>.html``; chapters whose file already
    exists are skipped. All chapter files are then joined into
    ``output/tmp.html``, which is passed to the external ``kindlegen``
    command.

    Args:
        name: Book title, used for the ``<title>`` of the generated HTML.
        book_id: Id used at the source site to locate the book.

    Returns:
        None. Progress and warnings are printed to stdout. A warning is
        printed for a chapter whose page has no element with class
        ``yd_text2``.

    Raises:
        Errors raised by ``urlopen`` or by file I/O are not caught here.
    """
    book_url = SOURCE_URL + str(book_id) + "/"
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("lxml"),
                                 namespaceHTMLElements=False)
    print("Downloading %s." % (book_url))
    with urlopen(book_url) as response:
        doc = parser.parse(response)
    index_html = str(lxml.etree.tostring(doc.getroot()))
    chapters = _parse_chapter_index(index_html, book_id)

    output_path = "output/%s" % (book_id)
    os.makedirs(output_path, exist_ok=True)

    # Gets chapters and cleans.
    total = len(chapters)
    for index, (title, url_path) in enumerate(chapters, start=1):
        chapter_path = "%s/%08d.html" % (output_path, index)
        if os.path.isfile(chapter_path):
            # Already downloaded on an earlier run; skip.
            print(" Wrote chapter %d of %d" % (index, total), end='\r')
            continue
        # url_path starts with "/", SOURCE_URL already ends with one.
        url = "%s%s" % (SOURCE_URL, url_path[1:])
        with urlopen(url) as response:
            doc = parser.parse(response)
        written = False
        # root[1] is the second child of the document root (presumably
        # <body>); the chapter text is in the element with class "yd_text2".
        for child in doc.getroot()[1].iter():
            if child.get('class') == 'yd_text2':
                body = _clean_chapter_html(str(lxml.etree.tostring(child)))
                with open(chapter_path, "w") as out:
                    out.write("<h2>%s</h2>\n" % (title))
                    out.write("<br />\n")
                    out.write(body)
                written = True
        if written:
            print(" Wrote chapter %d of %d" % (index, total), end='\r')
        else:
            print(" WARNING: Chapter %d missing" % (index))
    print("Wrote %d chapters." % (len(chapters)))

    tmp_file = "output/tmp.html"
    _assemble_book(name, output_path, tmp_file)
    # tmp_file is a fixed path, so this command line has no user input in it.
    os.system("kindlegen %s" % (tmp_file))
    print("Finished.")
def send_book(email):
    """
    Sends imported book to email.

    Pipes an empty message body to the external ``mutt`` command with
    ``output/tmp.mobi`` attached and the subject "Book". Does nothing when
    ``output/tmp.html`` does not exist.

    Args:
        email: Recipient address. It is shell-quoted and passed to mutt as a
            single argument.
    """
    tmp_file = "output/tmp.html"
    if not os.path.exists(tmp_file):
        return
    # NOTE(review): the guard checks the HTML file while the attachment is
    # the .mobi file -- confirm whether the .mobi should be checked instead.
    # Quote the address so shell metacharacters in it cannot alter the
    # command line.
    recipient = shlex.quote(str(email))
    os.system("echo \"\" | mutt -a \"output/tmp.mobi\" -s \"Book\" -- %s" % (recipient))
def main():
    """
    Entry point.

    Expects exactly two command-line arguments: the book name and the book
    id used at 69shu.com. With any other argument count a usage message is
    printed and nothing is imported.
    """
    if len(sys.argv) != 3:
        # The usage text lists only the arguments that are accepted; it used
        # to advertise a <kindle_email> argument that was always rejected.
        print("Usage: book-importer.py <name> <book_id>")
        print(" name between quotes.")
        print(" book_id id used at 69shu.com to locate the book.")
        return
    name, book_id = sys.argv[1:]
    import_book(name, book_id)
    # Sending the result by e-mail (send_book) is currently disabled, so no
    # kindle e-mail argument is read here.
# Run the importer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()