diff --git a/setup.py b/setup.py index 088a41d..cf8e7e6 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def finalize_options(self): setup(name='talon', - version='1.5.0', + version='1.6.0', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/quotations.py b/talon/quotations.py index c86809d..b244c6c 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -6,18 +6,17 @@ """ from __future__ import absolute_import -import regex as re + import logging from copy import deepcopy -from lxml import html, etree - -from talon.utils import (get_delimiter, html_tree_to_text, - html_document_fromstring) -from talon import html_quotations +import regex as re +from lxml import etree, html from six.moves import range -import six +from talon import html_quotations +from talon.utils import (get_delimiter, html_document_fromstring, + html_tree_to_text) log = logging.getLogger(__name__) @@ -94,7 +93,7 @@ ) RE_QUOTATION = re.compile( - r''' + r""" ( # quotation border: splitter line or a number of quotation marker lines (?: @@ -112,10 +111,10 @@ # after quotations should be text only or nothing at all [te]*$ - ''', re.VERBOSE) + """, re.VERBOSE) RE_EMPTY_QUOTATION = re.compile( - r''' + r""" ( # quotation border: splitter line or a number of quotation marker lines (?: @@ -125,7 +124,7 @@ ) ) e* - ''', re.VERBOSE) + """, re.VERBOSE) # ------Original Message------ or ---- Reply Message ---- # With variations in other languages. @@ -343,9 +342,6 @@ def _replace_link_brackets(msg_body): Converts msg_body into a unicode """ - if isinstance(msg_body, bytes): - msg_body = msg_body.decode('utf8') - def link_wrapper(link): newline_index = msg_body[:link.start()].rfind("\n") if msg_body[newline_index + 1] == ">": @@ -385,8 +381,6 @@ def postprocess(msg_body): def extract_from_plain(msg_body): """Extracts a non quoted message from provided plain text.""" - stripped_text = msg_body - delimiter = get_delimiter(msg_body) msg_body = preprocess(msg_body, delimiter) # don't process too long messages @@ -418,17 +412,13 @@ def extract_from_html(msg_body): Returns a unicode string. """ - msg_body_bytes = msg_body - if isinstance(msg_body, six.text_type): - msg_body_bytes = msg_body.encode('utf8') - - if msg_body_bytes.strip() == b'': + if msg_body.strip() == "": return msg_body - msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n') + msg_body = msg_body.replace("\r\n", "\n") # Cut out xml and doctype tags to avoid conflict with unicode decoding. - msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes) - html_tree = html_document_fromstring(msg_body_bytes) + msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) + html_tree = html_document_fromstring(msg_body) if html_tree is None: return msg_body @@ -531,11 +521,11 @@ def extract_from_html_tree(html_tree): # of replacing data outside the which might be essential to # the customer. remove_namespaces(html_tree_copy) - s = html.tostring(html_tree_copy) + s = html.tostring(html_tree_copy, encoding="ascii") if not s: return None - return s.decode('utf-8') + return s.decode("ascii") def remove_namespaces(root): @@ -654,10 +644,10 @@ def _readable_text_empty(html_tree): def is_splitter(line): - ''' + """ Returns Matcher object if provided string is a splitter and None otherwise. - ''' + """ for pattern in SPLITTER_PATTERNS: matcher = re.match(pattern, line) if matcher: @@ -665,12 +655,12 @@ def is_splitter(line): def text_content(context): - '''XPath Extension function to return a node text content.''' + """XPath Extension function to return a node text content.""" return context.context_node.xpath("string()").strip() def tail(context): - '''XPath Extension function to return a node tail text.''' + """XPath Extension function to return a node tail text.""" return context.context_node.tail or '' diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py index 6814f81..2bdb2a2 100644 --- a/talon/signature/learning/helpers.py +++ b/talon/signature/learning/helpers.py @@ -5,21 +5,17 @@ * regexp's constants used when evaluating signature's features """ - -from __future__ import absolute_import import unicodedata -import regex as re -from talon.utils import to_unicode +import regex as re from talon.signature.constants import SIGNATURE_MAX_LINES - rc = re.compile RE_EMAIL = rc('\S@\S') RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') -RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') +RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""") # Taken from: # http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf @@ -55,7 +51,7 @@ def binary_regex_search(prog): - '''Returns a function that returns 1 or 0 depending on regex search result. + """Returns a function that returns 1 or 0 depending on regex search result. If regular expression compiled into prog is present in a string the result of calling the returned function with the string will be 1 @@ -66,12 +62,12 @@ def binary_regex_search(prog): 1 >>> binary_regex_search(re.compile("12"))("34") 0 - ''' + """ return lambda s: 1 if prog.search(s) else 0 def binary_regex_match(prog): - '''Returns a function that returns 1 or 0 depending on regex match result. + """Returns a function that returns 1 or 0 depending on regex match result. If a string matches regular expression compiled into prog the result of calling the returned function with the string will be 1 @@ -82,7 +78,7 @@ def binary_regex_match(prog): 1 >>> binary_regex_match(re.compile("12"))("3 12") 0 - ''' + """ return lambda s: 1 if prog.match(s) else 0 @@ -135,7 +131,6 @@ def extract_names(sender): >>> extract_names('') [] """ - sender = to_unicode(sender, precise=True) # Remove non-alphabetical characters sender = "".join([char if char.isalpha() else ' ' for char in sender]) # Remove too short words and words from "black" list i.e. @@ -154,7 +149,7 @@ def extract_names(sender): def categories_percent(s, categories): - '''Returns category characters percent. + """Returns category characters percent. >>> categories_percent("qqq ggg hhh", ["Po"]) 0.0 @@ -166,9 +161,8 @@ def categories_percent(s, categories): 50.0 >>> categories_percent("s.s,5s", ["Po", "Nd"]) 50.0 - ''' + """ count = 0 - s = to_unicode(s, precise=True) for c in s: if unicodedata.category(c) in categories: count += 1 @@ -176,19 +170,18 @@ def categories_percent(s, categories): def punctuation_percent(s): - '''Returns punctuation percent. + """Returns punctuation percent. >>> punctuation_percent("qqq ggg hhh") 0.0 >>> punctuation_percent("q,w.") 50.0 - ''' + """ return categories_percent(s, ['Po']) def capitalized_words_percent(s): - '''Returns capitalized words percent.''' - s = to_unicode(s, precise=True) + """Returns capitalized words percent.""" words = re.split('\s', s) words = [w for w in words if w.strip()] words = [w for w in words if len(w) > 2] diff --git a/talon/utils.py b/talon/utils.py index 14f4509..b6b5559 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -1,110 +1,17 @@ # coding:utf-8 +from __future__ import annotations -from __future__ import absolute_import - -from random import shuffle - -import cchardet -import chardet import html5lib import regex as re -import six +from html5lib import HTMLParser from lxml.cssselect import CSSSelector +from lxml.etree import _Element from lxml.html import html5parser from talon.constants import RE_DELIMITER -def safe_format(format_string, *args, **kwargs): - """ - Helper: formats string with any combination of bytestrings/unicode - strings without raising exceptions - """ - try: - if not args and not kwargs: - return format_string - else: - return format_string.format(*args, **kwargs) - - # catch encoding errors and transform everything into utf-8 string - # before logging: - except (UnicodeEncodeError, UnicodeDecodeError): - format_string = to_utf8(format_string) - args = [to_utf8(p) for p in args] - kwargs = {k: to_utf8(v) for k, v in six.iteritems(kwargs)} - return format_string.format(*args, **kwargs) - - # ignore other errors - except: - return u'' - - -def to_unicode(str_or_unicode, precise=False): - """ - Safely returns a unicode version of a given string - >>> utils.to_unicode('привет') - u'привет' - >>> utils.to_unicode(u'привет') - u'привет' - If `precise` flag is True, tries to guess the correct encoding first. - """ - if not isinstance(str_or_unicode, six.text_type): - encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' - return six.text_type(str_or_unicode, encoding, 'replace') - return str_or_unicode - - -def detect_encoding(string): - """ - Tries to detect the encoding of the passed string. - - Defaults to UTF-8. - """ - assert isinstance(string, bytes) - try: - detected = chardet.detect(string) - if detected: - return detected.get('encoding') or 'utf-8' - except Exception as e: - pass - return 'utf-8' - - -def quick_detect_encoding(string): - """ - Tries to detect the encoding of the passed string. - - Uses cchardet. Fallbacks to detect_encoding. - """ - assert isinstance(string, bytes) - try: - detected = cchardet.detect(string) - if detected: - return detected.get('encoding') or detect_encoding(string) - except Exception as e: - pass - return detect_encoding(string) - - -def to_utf8(str_or_unicode): - """ - Safely returns a UTF-8 version of a given string - >>> utils.to_utf8(u'hi') - 'hi' - """ - if not isinstance(str_or_unicode, six.text_type): - return str_or_unicode.encode("utf-8", "ignore") - return str(str_or_unicode) - - -def random_token(length=7): - vals = ("a b c d e f g h i j k l m n o p q r s t u v w x y z " - "0 1 2 3 4 5 6 7 8 9").split(' ') - shuffle(vals) - return ''.join(vals[:length]) - - -def get_delimiter(msg_body): +def get_delimiter(msg_body: str) -> str: delimiter = RE_DELIMITER.search(msg_body) if delimiter: delimiter = delimiter.group() @@ -114,7 +21,7 @@ def get_delimiter(msg_body): return delimiter -def html_tree_to_text(tree): +def html_tree_to_text(tree: _Element) -> str: for style in CSSSelector('style')(tree): style.getparent().remove(style) @@ -146,26 +53,22 @@ def html_tree_to_text(tree): not text.endswith("\n") and not el_text): text += "\n" - retval = _rm_excessive_newlines(text) - return _encode_utf8(retval) + text = _rm_excessive_newlines(text) + return text -def html_to_text(string): +def html_to_text(s: str) -> str | None: """ Dead-simple HTML-to-text converter: >>> html_to_text("one
two
three") - >>> "one\ntwo\nthree" + <<< "one\ntwo\nthree" NOTES: 1. the string is expected to contain UTF-8 encoded HTML! - 2. returns utf-8 encoded str (not unicode) 3. if html can't be parsed returns None """ - if isinstance(string, six.text_type): - string = string.encode('utf8') - - s = _prepend_utf8_declaration(string) - s = s.replace(b"\n", b"") + s = _prepend_utf8_declaration(s) + s = s.replace("\n", "") tree = html_fromstring(s) if tree is None: @@ -174,62 +77,46 @@ def html_to_text(string): return html_tree_to_text(tree) -def html_fromstring(s): +def html_fromstring(s: str) -> _Element: """Parse html tree from string. Return None if the string can't be parsed. """ - if isinstance(s, six.text_type): - s = s.encode('utf8') - try: - return html5parser.fromstring(s, parser=_html5lib_parser()) - except Exception: - pass + return html5parser.fromstring(s, parser=_html5lib_parser()) -def html_document_fromstring(s): +def html_document_fromstring(s: str) -> _Element: """Parse html tree from string. Return None if the string can't be parsed. """ - if isinstance(s, six.text_type): - s = s.encode('utf8') - try: - return html5parser.document_fromstring(s, parser=_html5lib_parser()) - except Exception: - pass + return html5parser.document_fromstring(s, parser=_html5lib_parser()) -def cssselect(expr, tree): +def cssselect(expr: str, tree: str) -> list[_Element]: return CSSSelector(expr)(tree) -def _contains_charset_spec(s): +def _contains_charset_spec(s: str) -> str: """Return True if the first 4KB contain charset spec """ - return s.lower().find(b'html; charset=', 0, 4096) != -1 + return s.lower().find('html; charset=', 0, 4096) != -1 -def _prepend_utf8_declaration(s): +def _prepend_utf8_declaration(s: str) -> str: """Prepend 'utf-8' encoding declaration if the first 4KB don't have any """ return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s -def _rm_excessive_newlines(s): +def _rm_excessive_newlines(s: str) -> str: """Remove excessive newlines that often happen due to tons of divs """ return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() -def _encode_utf8(s): - """Encode in 'utf-8' if unicode - """ - return s.encode('utf-8') if isinstance(s, six.text_type) else s - - -def _html5lib_parser(): +def _html5lib_parser() -> HTMLParser: """ html5lib is a pure-python library that conforms to the WHATWG HTML spec and is not vulnarable to certain attacks common for XML libraries """ - return html5lib.HTMLParser( + return HTMLParser( # build lxml tree html5lib.treebuilders.getTreeBuilder("lxml"), # remove namespace value from inside lxml.html.html5paser element tag @@ -239,8 +126,8 @@ def _html5lib_parser(): ) -_UTF8_DECLARATION = (b'') +_UTF8_DECLARATION = ('') _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 2e5812a..85871e7 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -4,14 +4,17 @@ # noinspection PyUnresolvedReferences import re +from unittest.mock import Mock, patch +from nose.tools import assert_false, assert_true, eq_, ok_ + +from tests.fixtures import (OLK_SRC_BODY_SECTION, + REPLY_QUOTATIONS_SHARE_BLOCK, + REPLY_SEPARATED_BY_HR) from talon import quotations, utils as u -from . import * -from .fixtures import * -from lxml import html -RE_WHITESPACE = re.compile("\s") -RE_DOUBLE_WHITESPACE = re.compile("\s") +RE_WHITESPACE = re.compile(r"\s") +RE_DOUBLE_WHITESPACE = re.compile(r"\s") def test_quotation_splitter_inside_blockquote(): @@ -166,7 +169,7 @@ def test_unicode_in_reply():
Quote -
""".encode("utf-8") +""" eq_("Reply  Text

" "", @@ -314,7 +317,6 @@ def extract_reply_and_check(filename): msg_body = f.read() reply = quotations.extract_from_html(msg_body) plain_reply = u.html_to_text(reply) - plain_reply = plain_reply.decode('utf8') eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), RE_WHITESPACE.sub('', plain_reply)) diff --git a/tests/utils_test.py b/tests/utils_test.py index e7d529d..0027752 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -2,9 +2,6 @@ from __future__ import absolute_import -import cchardet -import six - from talon import utils as u from . import * @@ -15,58 +12,6 @@ def test_get_delimiter(): eq_('\n', u.get_delimiter('abc')) -def test_unicode(): - eq_(u'hi', u.to_unicode('hi')) - eq_(type(u.to_unicode('hi')), six.text_type) - eq_(type(u.to_unicode(u'hi')), six.text_type) - eq_(type(u.to_unicode('привет')), six.text_type) - eq_(type(u.to_unicode(u'привет')), six.text_type) - eq_(u"привет", u.to_unicode('привет')) - eq_(u"привет", u.to_unicode(u'привет')) - # some latin1 stuff - eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) - - -def test_detect_encoding(): - eq_('ascii', u.detect_encoding(b'qwe').lower()) - ok_(u.detect_encoding( - u'Versi\xf3n'.encode('iso-8859-2')).lower() in [ - 'iso-8859-1', 'iso-8859-2']) - eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) - # fallback to utf-8 - with patch.object(u.chardet, 'detect') as detect: - detect.side_effect = Exception - eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) - - -def test_quick_detect_encoding(): - eq_('ascii', u.quick_detect_encoding(b'qwe').lower()) - ok_(u.quick_detect_encoding( - u'Versi\xf3n'.encode('windows-1252')).lower() in [ - 'windows-1252', 'windows-1250']) - eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) - - -@patch.object(cchardet, 'detect') -@patch.object(u, 'detect_encoding') -def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): - cchardet_detect.return_value = {'encoding': 'ascii'} - eq_('ascii', u.quick_detect_encoding(b"qwe")) - cchardet_detect.assert_called_once_with(b"qwe") - - # fallback to detect_encoding - cchardet_detect.return_value = {} - detect_encoding.return_value = 'utf-8' - eq_('utf-8', u.quick_detect_encoding(b"qwe")) - - # exception - detect_encoding.reset_mock() - cchardet_detect.side_effect = Exception() - detect_encoding.return_value = 'utf-8' - eq_('utf-8', u.quick_detect_encoding(b"qwe")) - ok_(detect_encoding.called) - - def test_html_to_text(): html = """

Hello world!

@@ -80,11 +25,11 @@ def test_html_to_text():

""" text = u.html_to_text(html) - eq_(b"Hello world! \n\n * One! \n * Two \nHaha", text) - eq_(u"привет!", u.html_to_text("привет!").decode('utf8')) + eq_("Hello world! \n\n * One! \n * Two \nHaha", text) + eq_(u"привет!", u.html_to_text("привет!")) html = '

Hi' - eq_(b'Hi', u.html_to_text(html)) + eq_('Hi', u.html_to_text(html)) html = """Hi """ - eq_(b'Hi', u.html_to_text(html)) + eq_('Hi', u.html_to_text(html)) html = """
TEXT 1

TEXT 2

""" - eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html)) + eq_('TEXT 1 \nTEXT 2', u.html_to_text(html)) def test_comment_no_parent(): - s = b' no comment' + s = ' no comment' d = u.html_document_fromstring(s) - eq_(b"no comment", u.html_tree_to_text(d)) - - -@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) -def test_html_fromstring_exception(): - eq_(None, u.html_fromstring("")) - - -@patch.object(u.html5parser, 'document_fromstring') -def test_html_document_fromstring_exception(document_fromstring): - document_fromstring.side_effect = Exception() - eq_(None, u.html_document_fromstring("")) + eq_("no comment", u.html_tree_to_text(d)) @patch.object(u, 'html_fromstring', Mock(return_value=None))