Merge pull request buriy#50.

sypa · sypa · commit 65adc8bef9e1 · 2014-10-11T16:55:29.000+02:00
buriy#50
diff --git a/readability/encoding.py b/readability/encoding.py
diff --git a/readability/htmls.py b/readability/htmls.py
@@ -1,5 +1,4 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
+from .cleaners import normalize_spaces, clean_attributes
 from lxml.html import tostring
 import logging
 import lxml.html
@@ -8,14 +7,8 @@
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 def build_doc(page):
-    if isinstance(page, unicode):
-        enc = None
-        page_unicode = page
-    else:
-        enc = get_encoding(page) or 'utf-8'
-        page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc, enc
+    doc = lxml.html.document_fromstring(page, parser=utf8_parser)
+    return doc
 
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
@@ -104,7 +97,7 @@ def shorten_title(doc):
 
 def get_body(doc):
     [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = tostring(doc.body or doc)
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
diff --git a/readability/readability.py b/readability/readability.py
@@ -4,17 +4,16 @@
 import sys
 
 from collections import defaultdict
-from lxml.etree import tostring
-from lxml.etree import tounicode
+from lxml.etree import tostring, tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
 
-from cleaners import clean_attributes
-from cleaners import html_cleaner
-from htmls import build_doc
-from htmls import get_body
-from htmls import get_title
-from htmls import shorten_title
+from .cleaners import clean_attributes
+from .cleaners import html_cleaner
+from .htmls import build_doc
+from .htmls import get_body
+from .htmls import get_title
+from .htmls import shorten_title
 
 
 logging.basicConfig(level=logging.INFO)
@@ -110,7 +109,6 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, **opti
         self.input = input
         self.options = options
         self.html = None
-        self.encoding = None
         self.positive_keywords = compile_pattern(positive_keywords)
         self.negative_keywords = compile_pattern(negative_keywords)
 
@@ -120,7 +118,7 @@ def _html(self, force=False):
         return self.html
 
     def _parse(self, input):
-        doc, self.encoding = build_doc(input)
+        doc = build_doc(input)
         doc = html_cleaner.clean_html(doc)
         base_href = self.options.get('url', None)
         if base_href:
@@ -194,9 +192,9 @@ def summary(self, html_partial=False):
                     continue
                 else:
                     return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            raise Unparseable(str(e))
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
@@ -387,7 +385,7 @@ def transform_misused_divs_into_paragraphs(self):
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    ''.join(map(tounicode, list(elem)))):
                 #self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 #print "Fixed element "+describe(elem)
@@ -599,20 +597,18 @@ def main():
         parser.print_help()
         sys.exit(1)
 
-    file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import requests
+        data = requests.get(options.url).raw_text
     else:
-        file = open(args[0], 'rt')
-    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
+        data = open(args[0], 'rt').read()
     try:
-        print Document(file.read(),
+        print(Document(data,
             debug=options.verbose,
             url=options.url,
             positive_keywords = options.positive_keywords,
             negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary())
     finally:
         file.close()
 
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
     import platform
     mac_ver = platform.mac_ver()[0]
     if mac_ver < '10.9':
-        print "Using lxml<2.4"
+        print("Using lxml<2.4")
         lxml_requirement = "lxml<2.4"
 
 setup(