#!/usr/bin/env python
"""
Method to convert html2text.
It uses the same procedure discussed in:
https://github.com/turian/common-scripts/blob/master/html2text/README.txt
See that file, to understand the requirements.
Essentially, we pass the HTML through tidy then Bayer's html2text utility
(not aaronsw's html2text.py, mind you).
We can also use NCleaner = http://webascorpus.sourceforge.net/, converts
HTML to text and removes boilerplate. This cleans a lot more than html2text,
and is perhaps more suitable for text mining.
ISSUES: I may have the character encodings wrong :(
See https://github.com/turian/common-scripts/blob/master/html2text/README.txt
for more information.
TODO: Trap stderr output.
"""
from common.misc import runcmd
from common.tidy import tidy
import os.path
import sys
import tempfile, shutil, os
from common.stats import stats
import common.json
import re
import urllib, urllib2

def html2text(html, html2textrc=os.path.expanduser("~/dev/common-scripts/html2text/html2textrc"), forceoutput=True, veryquiet=True):
    """
    If veryquiet, all errors and warnings from tidy are written to /dev/null.
    """
    assert os.path.exists(html2textrc)
    tidyhtml = tidy(html, xml=False, forceoutput=forceoutput, veryquiet=veryquiet)
    text = runcmd("html2text -nobs -style pretty -rcfile %s" % html2textrc, input=tidyhtml)
    return text
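
# A minimal usage sketch for html2text(), assuming tidy and Bayer's html2text
# binary are installed and the html2textrc path above exists ("page.html" is a
# hypothetical filename):
#
#   raw_html = open("page.html").read()
#   print html2text(raw_html)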

def batch_nclean(htmls, strip_html_output=True, ncleaner=os.path.join(os.environ["UTILS"], "bin/ncleaner")):
    """
    For a list of HTML byte strings, run NCleaner and return a list of texts.
    NCleaner (http://webascorpus.sourceforge.net/) converts HTML to text and removes boilerplate.
    Return None if there was some error.
    If strip_html_output is True, remove the "<p>" or "<l>" tag that NCleaner
    inserts at the beginning of each segmented line.
    """
    indir = tempfile.mkdtemp()
    outdir = tempfile.mkdtemp()
    txts = None
    assert os.path.exists(ncleaner)
    htmlre = re.compile(r"^\s*<[^<>]*>\s*", re.MULTILINE)
    try:
        htmlfiles = ["%d.html" % i for i in range(len(htmls))]
        txtfiles = ["%d.txt" % i for i in range(len(htmls))]
        for f, html in zip(htmlfiles, htmls):
            # print os.path.join(indir, f)
            open(os.path.join(indir, f), "wb").write(html)
        cmd = "%s %s %s" % (ncleaner, indir, outdir)
        print >> sys.stderr, "About to run NCleaner on %d files: %s..." % (len(htmlfiles), cmd)
        print >> sys.stderr, stats()
        os.system(cmd)
        print >> sys.stderr, "...done running NCleaner on %d files" % (len(htmlfiles))
        print >> sys.stderr, stats()
        txts = [open(os.path.join(outdir, txtfil)).read() for txtfil in txtfiles]
        if strip_html_output:
            txts = [htmlre.sub("\n", txt) for txt in txts]
        assert len(txts) == len(htmls)
    except:
        print >> sys.stderr, "Problem in batch_nclean: %s" % sys.exc_info()[0]
    shutil.rmtree(indir, ignore_errors=False, onerror=lambda function, path, excinfo: sys.stderr.write("Could not shutil.rmtree, function=%s, path=%s, excinfo=%s\n" % (function, path, excinfo)))
    shutil.rmtree(outdir, ignore_errors=False, onerror=lambda function, path, excinfo: sys.stderr.write("Could not shutil.rmtree, function=%s, path=%s, excinfo=%s\n" % (function, path, excinfo)))
    return txts
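
# A hedged usage sketch for batch_nclean(); it assumes $UTILS/bin/ncleaner is
# installed (the filenames below are hypothetical):
#
#   htmls = [open("a.html", "rb").read(), open("b.html", "rb").read()]
#   txts = batch_nclean(htmls)
#   if txts is not None:
#       for txt in txts:
#           print txt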

def boilerpipe_html2text(html):
    # POST the HTML to a locally running boilerpipe web service; return the extracted text.
    values = {"text": html.encode("utf-8"), "extractor": "DefaultExtractor", "output": "text"}
    data = urllib.urlencode(values)
    boilerpipe_response = urllib2.urlopen("http://localhost:8080/boilerpipe-api/extract", data)
    return boilerpipe_response.read().decode("utf-8")
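
# Sketch of boilerpipe_html2text() usage; it assumes a boilerpipe service is
# listening at http://localhost:8080/boilerpipe-api/extract ("page.html" is a
# hypothetical filename):
#
#   html = open("page.html").read().decode("utf-8")
#   print boilerpipe_html2text(html).encode("utf-8")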

def boilerpipe_url2text(url):
    """
    Use Kohlschuetter Search Intelligence's boilerpipe boilerplate stripper,
    via the public web service at boilerpipe-web.appspot.com.
    """
    newurl = "http://boilerpipe-web.appspot.com/extract?url=%s+&extractor=ArticleExtractor&output=text" % urllib.quote_plus(url)
    # print newurl
    f = urllib2.urlopen(newurl)
    data = f.read()
    f.close()
    return data
#http://boilerpipe-web.appspot.com/extract?url=http%3A%2F%2Fnarrativebranding.wordpress.com%2F2010%2F08%2F18%2Fgone-fishin%2F+&extractor=ArticleExtractor&output=text
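
# For example (hypothetical article URL):
#   print boilerpipe_url2text("http://www.example.com/some-article.html")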

def diffbot_url2text(url, token):
    """
    Use the diffbot article API: http://www.diffbot.com/docs/api/article
    """
    newurl = "http://www.diffbot.com/api/article?token=%s&url=%s" % (token, urllib.quote_plus(url))
    # print newurl
    f = urllib2.urlopen(newurl)
    data = f.read()
    f.close()
    return common.json.loads(data)
if __name__ == "__main__":
# import sys
# print html2text(sys.stdin.read())
# print boilerpipe_url2text("http://www.bianet.org/english/freedom-of-expression/122506-dairy-company-yorsan-advocates-for-internet-censorship")
print common.json.dumps(diffbot_url2text("http://www.bianet.org/english/freedom-of-expression/122506-dairy-company-yorsan-advocates-for-internet-censorship", "XXXXX"), indent=4)