Skip to content

Commit 65adc8b

Browse files
committed
Merge pull request buriy#50.
1 parent 0c2f29e commit 65adc8b

File tree

4 files changed

+21
-80
lines changed

readability/encoding.py

Lines changed: 0 additions & 48 deletions
This file was deleted.

readability/htmls.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
from cleaners import normalize_spaces, clean_attributes
2-
from encoding import get_encoding
1+
from .cleaners import normalize_spaces, clean_attributes
32
from lxml.html import tostring
43
import logging
54
import lxml.html
@@ -8,14 +7,8 @@
87
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
98

109
def build_doc(page):
11-
if isinstance(page, unicode):
12-
enc = None
13-
page_unicode = page
14-
else:
15-
enc = get_encoding(page) or 'utf-8'
16-
page_unicode = page.decode(enc, 'replace')
17-
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
18-
return doc, enc
10+
doc = lxml.html.document_fromstring(page, parser=utf8_parser)
11+
return doc
1912

2013
def js_re(src, pattern, flags, repl):
2114
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
@@ -104,7 +97,7 @@ def shorten_title(doc):
10497

10598
def get_body(doc):
10699
[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
107-
raw_html = unicode(tostring(doc.body or doc))
100+
raw_html = tostring(doc.body or doc)
108101
cleaned = clean_attributes(raw_html)
109102
try:
110103
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?

readability/readability.py

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,16 @@
44
import sys
55

66
from collections import defaultdict
7-
from lxml.etree import tostring
8-
from lxml.etree import tounicode
7+
from lxml.etree import tostring, tounicode
98
from lxml.html import document_fromstring
109
from lxml.html import fragment_fromstring
1110

12-
from cleaners import clean_attributes
13-
from cleaners import html_cleaner
14-
from htmls import build_doc
15-
from htmls import get_body
16-
from htmls import get_title
17-
from htmls import shorten_title
11+
from .cleaners import clean_attributes
12+
from .cleaners import html_cleaner
13+
from .htmls import build_doc
14+
from .htmls import get_body
15+
from .htmls import get_title
16+
from .htmls import shorten_title
1817

1918

2019
logging.basicConfig(level=logging.INFO)
@@ -110,7 +109,6 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, **opti
110109
self.input = input
111110
self.options = options
112111
self.html = None
113-
self.encoding = None
114112
self.positive_keywords = compile_pattern(positive_keywords)
115113
self.negative_keywords = compile_pattern(negative_keywords)
116114

@@ -120,7 +118,7 @@ def _html(self, force=False):
120118
return self.html
121119

122120
def _parse(self, input):
123-
doc, self.encoding = build_doc(input)
121+
doc = build_doc(input)
124122
doc = html_cleaner.clean_html(doc)
125123
base_href = self.options.get('url', None)
126124
if base_href:
@@ -194,9 +192,9 @@ def summary(self, html_partial=False):
194192
continue
195193
else:
196194
return cleaned_article
197-
except StandardError, e:
195+
except Exception as e:
198196
log.exception('error getting summary: ')
199-
raise Unparseable(str(e)), None, sys.exc_info()[2]
197+
raise Unparseable(str(e))
200198

201199
def get_article(self, candidates, best_candidate, html_partial=False):
202200
# Now that we have the top candidate, look through its siblings for
@@ -387,7 +385,7 @@ def transform_misused_divs_into_paragraphs(self):
387385
# This results in incorrect results in case there is an <img>
388386
# buried within an <a> for example
389387
if not REGEXES['divToPElementsRe'].search(
390-
unicode(''.join(map(tostring, list(elem))))):
388+
''.join(map(tounicode, list(elem)))):
391389
#self.debug("Altering %s to p" % (describe(elem)))
392390
elem.tag = "p"
393391
#print "Fixed element "+describe(elem)
@@ -599,20 +597,18 @@ def main():
599597
parser.print_help()
600598
sys.exit(1)
601599

602-
file = None
603600
if options.url:
604-
import urllib
605-
file = urllib.urlopen(options.url)
601+
import requests
602+
data = requests.get(options.url).raw_text
606603
else:
607-
file = open(args[0], 'rt')
608-
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
604+
data = open(args[0], 'rt').read()
609605
try:
610-
print Document(file.read(),
606+
print(Document(data,
611607
debug=options.verbose,
612608
url=options.url,
613609
positive_keywords = options.positive_keywords,
614610
negative_keywords = options.negative_keywords,
615-
).summary().encode(enc, 'replace')
611+
).summary())
616612
finally:
617613
file.close()
618614

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import platform
88
mac_ver = platform.mac_ver()[0]
99
if mac_ver < '10.9':
10-
print "Using lxml<2.4"
10+
print("Using lxml<2.4")
1111
lxml_requirement = "lxml<2.4"
1212

1313
setup(

0 commit comments

Comments (0)