44import sys
55
66from collections import defaultdict
7- from lxml .etree import tostring
8- from lxml .etree import tounicode
7+ from lxml .etree import tostring , tounicode
98from lxml .html import document_fromstring
109from lxml .html import fragment_fromstring
1110
12- from cleaners import clean_attributes
13- from cleaners import html_cleaner
14- from htmls import build_doc
15- from htmls import get_body
16- from htmls import get_title
17- from htmls import shorten_title
11+ from . cleaners import clean_attributes
12+ from . cleaners import html_cleaner
13+ from . htmls import build_doc
14+ from . htmls import get_body
15+ from . htmls import get_title
16+ from . htmls import shorten_title
1817
1918
2019logging .basicConfig (level = logging .INFO )
@@ -110,7 +109,6 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, **opti
110109 self .input = input
111110 self .options = options
112111 self .html = None
113- self .encoding = None
114112 self .positive_keywords = compile_pattern (positive_keywords )
115113 self .negative_keywords = compile_pattern (negative_keywords )
116114
@@ -120,7 +118,7 @@ def _html(self, force=False):
120118 return self .html
121119
122120 def _parse (self , input ):
123- doc , self . encoding = build_doc (input )
121+ doc = build_doc (input )
124122 doc = html_cleaner .clean_html (doc )
125123 base_href = self .options .get ('url' , None )
126124 if base_href :
@@ -194,9 +192,9 @@ def summary(self, html_partial=False):
194192 continue
195193 else :
196194 return cleaned_article
197- except StandardError , e :
195+ except Exception as e :
198196 log .exception ('error getting summary: ' )
199- raise Unparseable (str (e )), None , sys . exc_info ()[ 2 ]
197+ raise Unparseable (str (e ))
200198
201199 def get_article (self , candidates , best_candidate , html_partial = False ):
202200 # Now that we have the top candidate, look through its siblings for
@@ -387,7 +385,7 @@ def transform_misused_divs_into_paragraphs(self):
387385 # This results in incorrect results in case there is an <img>
388386 # buried within an <a> for example
389387 if not REGEXES ['divToPElementsRe' ].search (
390- unicode ( '' .join (map (tostring , list (elem ) )))):
388+ '' .join (map (tounicode , list (elem )))):
391389 #self.debug("Altering %s to p" % (describe(elem)))
392390 elem .tag = "p"
393391 #print "Fixed element "+describe(elem)
@@ -599,20 +597,18 @@ def main():
599597 parser .print_help ()
600598 sys .exit (1 )
601599
602- file = None
603600 if options .url :
604- import urllib
605- file = urllib . urlopen (options .url )
601+ import requests
602+ data = requests . get (options .url ). text
606603 else :
607- file = open (args [0 ], 'rt' )
608- enc = sys .__stdout__ .encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
604+ data = open (args [0 ], 'rt' ).read ()
609605 try :
610- print Document (file . read () ,
606+ print ( Document (data ,
611607 debug = options .verbose ,
612608 url = options .url ,
613609 positive_keywords = options .positive_keywords ,
614610 negative_keywords = options .negative_keywords ,
615- ).summary (). encode ( enc , 'replace' )
611+ ).summary ())
616612 finally :
617613 file .close ()
618614
0 commit comments