readability/encoding.py (0 additions, 48 deletions)

This file was deleted.
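
The deleted module presumably supplied the `get_encoding()` helper that `build_doc()` imported (see htmls.py below); with it gone, charset detection becomes the caller's job. A minimal stand-in built on chardet, offered as a sketch of equivalent functionality rather than a reconstruction of the deleted file:

```python
# Sketch only: approximates what encoding.py's get_encoding() likely did.
# Assumes the chardet package; the deleted module may have differed in detail.
import re

import chardet

def get_encoding(page: bytes):
    # Prefer an explicit charset declaration near the top of the markup...
    m = re.search(br'charset=["\']?([\w-]+)', page[:2048], re.I)
    if m:
        return m.group(1).decode('ascii', 'replace')
    # ...otherwise fall back to statistical detection.
    return chardet.detect(page)['encoding']
```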

readability/htmls.py (4 additions, 11 deletions)

@@ -1,5 +1,4 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
+from .cleaners import normalize_spaces, clean_attributes
 from lxml.html import tostring
 import logging
 import lxml.html
@@ -8,14 +7,8 @@
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 def build_doc(page):
-    if isinstance(page, unicode):
-        enc = None
-        page_unicode = page
-    else:
-        enc = get_encoding(page) or 'utf-8'
-        page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc, enc
+    doc = lxml.html.document_fromstring(page, parser=utf8_parser)
+    return doc
 
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
@@ -104,7 +97,7 @@ def shorten_title(doc):
 
 def get_body(doc):
     [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = tostring(doc.body or doc)
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
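
Two contract changes to note here. `build_doc()` no longer sniffs the charset, so it must now be handed already-decoded text, or bytes that genuinely are UTF-8, since `utf8_parser` is hard-wired to that encoding. And in `get_body()`, dropping the `unicode()` wrapper means `raw_html` is whatever `tostring()` returns, which on Python 3 is bytes by default; if `clean_attributes()` runs str-based regexes over it, the unicode serialization is probably what is wanted. A quick illustration of the difference:

```python
# Illustration of the bytes/str split described above.
from lxml.html import fragment_fromstring, tostring

el = fragment_fromstring('<div><p>hi</p></div>')
tostring(el)                      # b'<div><p>hi</p></div>'  (bytes on Python 3)
tostring(el, encoding='unicode')  # '<div><p>hi</p></div>'   (str, regex-friendly)
```
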
readability/readability.py (16 additions, 20 deletions)

@@ -4,17 +4,16 @@
 import sys
 
 from collections import defaultdict
-from lxml.etree import tostring
-from lxml.etree import tounicode
+from lxml.etree import tostring, tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
 
-from cleaners import clean_attributes
-from cleaners import html_cleaner
-from htmls import build_doc
-from htmls import get_body
-from htmls import get_title
-from htmls import shorten_title
+from .cleaners import clean_attributes
+from .cleaners import html_cleaner
+from .htmls import build_doc
+from .htmls import get_body
+from .htmls import get_title
+from .htmls import shorten_title
 
 
 logging.basicConfig(level=logging.INFO)
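
The dotted prefixes are the substance of this hunk: Python 3 dropped implicit relative imports, so the bare `from cleaners import ...` form only resolves on Python 2. Inside the `readability` package:

```python
# Python 2 resolved this against the package directory; Python 3 will not:
#   from cleaners import clean_attributes
# The explicit relative form below works on both (on 2.x, ideally with
# 'from __future__ import absolute_import'):
from .cleaners import clean_attributes
```
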
@@ -110,7 +109,6 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
         self.input = input
         self.options = options
         self.html = None
-        self.encoding = None
         self.positive_keywords = compile_pattern(positive_keywords)
         self.negative_keywords = compile_pattern(negative_keywords)
 
@@ -120,7 +118,7 @@ def _html(self, force=False):
         return self.html
 
     def _parse(self, input):
-        doc, self.encoding = build_doc(input)
+        doc = build_doc(input)
         doc = html_cleaner.clean_html(doc)
         base_href = self.options.get('url', None)
         if base_href:
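
With `self.encoding` gone, `Document` no longer detects or reports the input charset; per the `build_doc()` change, callers should decode first. A defensive wrapper along those lines, reusing the hypothetical `get_encoding()` sketch from earlier:

```python
# Assumption: pre-decode bytes before handing them to Document.
from readability.readability import Document

def document_from_bytes(raw: bytes) -> Document:
    enc = get_encoding(raw) or 'utf-8'  # get_encoding() is the sketch above
    return Document(raw.decode(enc, 'replace'))
```
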
@@ -194,9 +192,9 @@ def summary(self, html_partial=False):
                     continue
                 else:
                     return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            raise Unparseable(str(e))
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
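
Everything removed here is Python 2-only: `except StandardError, e` and the three-expression `raise` are syntax errors on Python 3, and `StandardError` itself no longer exists. The replacement compiles, but it discards the original traceback that the old `sys.exc_info()[2]` form carried forward; if that context matters, Python 3 offers explicit chaining:

```python
# Suggestion, not what the PR does: chaining keeps the original traceback
# visible in the report. summarize() is a hypothetical stand-in for the
# body of the try block; log and Unparseable are as defined in the module.
try:
    article = summarize()
except Exception as e:
    log.exception('error getting summary: ')
    raise Unparseable(str(e)) from e   # 'from e' preserves the chain
```
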
@@ -387,7 +385,7 @@ def transform_misused_divs_into_paragraphs(self):
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    ''.join(map(tounicode, list(elem)))):
                 #self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 #print "Fixed element "+describe(elem)
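
Same bytes/str pitfall as in `get_body()`: `lxml.etree.tostring()` yields bytes, and `''.join()` over bytes fragments fails on Python 3, so mapping `tounicode` instead is the right fix. For example:

```python
from lxml.etree import tounicode
from lxml.html import fragment_fromstring

elem = fragment_fromstring('<div><a href="#">x</a><p>y</p></div>')
''.join(map(tounicode, list(elem)))   # '<a href="#">x</a><p>y</p>'
# with tostring, ''.join() would raise TypeError:
# sequence item 0: expected str instance, bytes found
```
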
@@ -598,20 +596,18 @@ def main():
         parser.print_help()
         sys.exit(1)
 
-    file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import requests
+        data = requests.get(options.url).raw_text
     else:
-        file = open(args[0], 'rt')
-    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
+        data = open(args[0], 'rt').read()
     try:
-        print Document(file.read(),
+        print(Document(data,
             debug=options.verbose,
             url=options.url,
             positive_keywords = options.positive_keywords,
             negative_keywords = options.negative_keywords,
-            ).summary().encode(enc, 'replace')
+            ).summary())
     finally:
         file.close()
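
Two slips in this hunk are worth flagging. requests exposes the decoded body as `Response.text`; there is no `raw_text` attribute, so the URL branch would raise `AttributeError`. And `file` is no longer assigned anywhere, so the surviving `finally: file.close()` would raise `NameError` once reached. A corrected sketch of the same flow, assuming `options`, `args`, and `Document` as in `main()` above:

```python
# Sketch, not the PR's code: 'options' and 'args' come from the option
# parser in main(); Document is readability's extractor class.
if options.url:
    import requests
    data = requests.get(options.url).text   # .text, not .raw_text
else:
    with open(args[0], 'rt') as f:          # context manager replaces the
        data = f.read()                     # now-dangling file.close()

print(Document(data,
               debug=options.verbose,
               url=options.url,
               positive_keywords=options.positive_keywords,
               negative_keywords=options.negative_keywords,
               ).summary())
```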

setup.py (1 addition, 1 deletion)

@@ -7,7 +7,7 @@
 import platform
 mac_ver = platform.mac_ver()[0]
 if mac_ver < '10.9':
-    print "Using lxml<2.4"
+    print("Using lxml<2.4")
 lxml_requirement = "lxml<2.4"
 
 setup(