html5lib · Mic92 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
diff --git a/.appveyor.yml b/.appveyor.yml
diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml
@@ -12,9 +12,6 @@ jobs:
         os: [ubuntu-latest, windows-latest]
         deps: [base, optional]
         include:
-          - python: "pypy-2.7"
-            os: ubuntu-latest
-            deps: base
           - python: "pypy-3.10"
             os: ubuntu-latest
             deps: base

diff --git a/README.rst b/README.rst
@@ -29,7 +29,7 @@ or:
 
 By default, the ``document`` will be an ``xml.etree`` element instance.
 Whenever possible, html5lib chooses the accelerated ``ElementTree``
-implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
+implementation.
 
 Two other tree types are supported: ``xml.dom.minidom`` and
 ``lxml.etree``. To use an alternative format, specify the name of
@@ -41,18 +41,6 @@ a treebuilder:
   with open("mydocument.html", "rb") as f:
       lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
 
-When using with ``urllib2`` (Python 2), the charset from HTTP should be
-pass into html5lib as follows:
-
-.. code-block:: python
-
-  from contextlib import closing
-  from urllib2 import urlopen
-  import html5lib
-
-  with closing(urlopen("http://example.com/")) as f:
-      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
-
 When using with ``urllib.request`` (Python 3), the charset from HTTP
-should be pass into html5lib as follows:
+should be passed into html5lib as follows:
 
@@ -90,7 +78,7 @@ More documentation is available at https://html5lib.readthedocs.io/.
 Installation
 ------------
 
-html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
+html5lib works on CPython 3.8+ and PyPy. To install:
 
 .. code-block:: bash
 

diff --git a/debug-info.py b/debug-info.py
@@ -1,4 +1,3 @@
-from __future__ import print_function, unicode_literals
 
 import platform
 import sys

diff --git a/doc/conf.py b/doc/conf.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 #
 # html5lib documentation build configuration file, created by
 # sphinx-quickstart on Wed May  8 00:04:49 2013.
@@ -100,7 +99,7 @@
 }
 
 
-class CExtMock(object):
+class CExtMock:
     """Required for autodoc on readthedocs.org where you cannot build C extensions."""
     def __init__(self, *args, **kwargs):
         pass

diff --git a/html5lib/__init__.py b/html5lib/__init__.py
@@ -20,7 +20,6 @@
 * :func:`~.serializer.serialize`
 """
 
-from __future__ import absolute_import, division, unicode_literals
 
 from .html5parser import HTMLParser, parse, parseFragment
 from .treebuilders import getTreeBuilder

diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import re
 import warnings
@@ -181,7 +180,7 @@ def escapeRegexp(string):
 nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
 
 
-class InfosetFilter(object):
+class InfosetFilter:
     replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
 
     def __init__(self,

diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from six import text_type
 from six.moves import http_client, urllib
@@ -48,7 +47,7 @@
 charsUntilRegEx = {}
 
 
-class BufferedStream(object):
+class BufferedStream:
     """Buffering for streams that do not have buffering of their own
 
     The buffer is implemented as a list of chunks on the assumption that
@@ -145,7 +144,7 @@ def HTMLInputStream(source, **kwargs):
         return HTMLBinaryInputStream(source, **kwargs)
 
 
-class HTMLUnicodeInputStream(object):
+class HTMLUnicodeInputStream:
     """Provides a unicode stream of characters to the HTMLTokenizer.
 
     This class takes care of character encoding and removing or replacing
@@ -673,7 +672,7 @@ def jumpTo(self, bytes):
         return True
 
 
-class EncodingParser(object):
+class EncodingParser:
     """Mini parser for detecting character encoding from meta elements"""
 
     def __init__(self, data):
@@ -861,7 +860,7 @@ def getAttribute(self):
                 attrValue.append(c)
 
 
-class ContentAttrParser(object):
+class ContentAttrParser:
     def __init__(self, data):
         assert isinstance(data, bytes)
         self.data = data

diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from six import unichr as chr
 
@@ -24,7 +23,7 @@
     attributeMap = OrderedDict
 
 
-class HTMLTokenizer(object):
+class HTMLTokenizer:
     """ This class takes care of tokenizing HTML.
 
     * self.currentToken

diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from .py import Trie
 

diff --git a/html5lib/_trie/_base.py b/html5lib/_trie/_base.py
@@ -1,9 +1,5 @@
-from __future__ import absolute_import, division, unicode_literals
 
-try:
-    from collections.abc import Mapping
-except ImportError:  # Python 2.7
-    from collections import Mapping
+from collections.abc import Mapping
 
 
 class Trie(Mapping):

diff --git a/html5lib/_trie/py.py b/html5lib/_trie/py.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 
 from bisect import bisect_left

diff --git a/html5lib/_utils.py b/html5lib/_utils.py
@@ -1,19 +1,15 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from types import ModuleType
 
-try:
-    from collections.abc import Mapping
-except ImportError:
-    from collections import Mapping
+from collections.abc import Mapping
 
 from six import text_type, PY3
 
 if PY3:
     import xml.etree.ElementTree as default_etree
 else:
     try:
-        import xml.etree.cElementTree as default_etree
+        import xml.etree.ElementTree as default_etree
     except ImportError:
         import xml.etree.ElementTree as default_etree
 
@@ -122,7 +118,7 @@ def moduleFactoryFactory(factory):
     moduleCache = {}
 
     def moduleFactory(baseModule, *args, **kwargs):
-        if isinstance(ModuleType.__name__, type("")):
+        if isinstance(ModuleType.__name__, str):
             name = "_%s_factory" % baseModule.__name__
         else:
             name = b"_%s_factory" % baseModule.__name__

diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import string
 

diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from . import base
 

diff --git a/html5lib/filters/base.py b/html5lib/filters/base.py
@@ -1,7 +1,6 @@
-from __future__ import absolute_import, division, unicode_literals
 
 
-class Filter(object):
+class Filter:
     def __init__(self, source):
         self.source = source
 

diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from . import base
 

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from six import text_type
 

diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from . import base
 

diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py
@@ -6,7 +6,6 @@
 if Bleach is unsuitable for your needs.
 
 """
-from __future__ import absolute_import, division, unicode_literals
 
 import re
 import warnings

diff --git a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import re
 

diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 from six import viewkeys
 
 from . import _inputstream
@@ -69,7 +68,7 @@ def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElemen
     return p.parseFragment(doc, container=container, **kwargs)
 
 
-class HTMLParser(object):
+class HTMLParser:
     """HTML parser
 
     Generates a tree structure from a stream of (possibly malformed) HTML.
@@ -397,7 +396,7 @@ def parseRCDataRawtext(self, token, contentType):
         self.phase = self.phases["text"]
 
 
-class Phase(object):
+class Phase:
     """Base class for helper object that implements each phase of processing
     """
     __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
@@ -428,7 +427,7 @@ def processSpaceCharacters(self, token):
     def processStartTag(self, token):
         # Note the caching is done here rather than BoundMethodDispatcher as doing it there
         # requires a circular reference to the Phase, and this ends up with a significant
-        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
+        # (CPython 3.8) GC cost when parsing many short inputs
         name = token["name"]
         # In Py2, using `in` is quicker in general than try/except KeyError
         # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
@@ -455,7 +454,7 @@ def startTagHtml(self, token):
     def processEndTag(self, token):
         # Note the caching is done here rather than BoundMethodDispatcher as doing it there
         # requires a circular reference to the Phase, and this ends up with a significant
-        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
+        # (CPython 3.8) GC cost when parsing many short inputs
         name = token["name"]
         # In Py2, using `in` is quicker in general than try/except KeyError
         # In Py3, `in` is quicker when there are few cache hits (typically short inputs)

diff --git a/html5lib/serializer.py b/html5lib/serializer.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 
 import re
@@ -101,7 +100,7 @@ def serialize(input, tree="etree", encoding=None, **serializer_opts):
     return s.render(walker(input), encoding)
 
 
-class HTMLSerializer(object):
+class HTMLSerializer:
 
     # attribute quoting options
     quote_attr_values = "legacy"  # be secure by default

diff --git a/html5lib/tests/__init__.py b/html5lib/tests/__init__.py
@@ -1 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals

diff --git a/html5lib/tests/conftest.py b/html5lib/tests/conftest.py
@@ -1,4 +1,3 @@
-from __future__ import print_function
 import os.path
 import sys
 
@@ -54,7 +53,7 @@ def pytest_configure(config):
         # Check for optional requirements
         req_file = os.path.join(_root, "requirements-optional.txt")
         if os.path.exists(req_file):
-            with open(req_file, "r") as fp:
+            with open(req_file) as fp:
                 for line in fp:
                     if (line.strip() and
                         not (line.startswith("-r") or
@@ -79,7 +78,7 @@ def pytest_configure(config):
         import xml.etree.ElementTree as ElementTree
 
         try:
-            import xml.etree.cElementTree as cElementTree
+            import xml.etree.ElementTree as cElementTree
         except ImportError:
             msgs.append("cElementTree unable to be imported")
         else:

diff --git a/html5lib/tests/sanitizer.py b/html5lib/tests/sanitizer.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import codecs
 import json

diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 # pylint:disable=wrong-import-position
 
@@ -86,7 +85,7 @@ def __getitem__(self, key):
         return dict.get(self, key, self.default)
 
 
-class TestData(object):
+class TestData:
     def __init__(self, filename, newTestHeading="data", encoding="utf8"):
         if encoding is None:
             self.f = open(filename, mode="rb")

diff --git a/html5lib/tests/test_alphabeticalattributes.py b/html5lib/tests/test_alphabeticalattributes.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from collections import OrderedDict
 

diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import os
 
@@ -9,7 +8,7 @@
 
 
 def test_basic_prescan_length():
-    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
+    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode()
     pad = 1024 - len(data) + 1
     data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
     assert len(data) == 1024  # Sanity
@@ -18,7 +17,7 @@ def test_basic_prescan_length():
 
 
 def test_parser_reparse():
-    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
+    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode()
     pad = 10240 - len(data) + 1
     data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
     assert len(data) == 10240  # Sanity