Merge pull request #43 from weblyzard/feature/custom-table-separation-character

AlbertWeichselbraun · web-flow · commit 6fa9516acb2b · 2021-10-22T12:38:19.000+02:00
Feature/custom table separation character
diff --git a/README.rst b/README.rst
@@ -130,38 +130,42 @@ the corresponding text representation.
 
 Command line parameters
 -----------------------
+
 The inscript.py command line client supports the following parameters::
 
-  usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR]
-                     [--indentation INDENTATION] [-v]
-                     [input]
-  
-  Convert the given HTML document to text.
-  
-  positional arguments:
-    input                 Html input either from a file or a URL (default:stdin).
-  
-  optional arguments:
-    -h, --help            show this help message and exit
-    -o OUTPUT, --output OUTPUT
-                          Output file (default:stdout).
-    -e ENCODING, --encoding ENCODING
-                          Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
-    -i, --display-image-captions
-                          Display image captions (default:false).
-    -d, --deduplicate-image-captions
-                          Deduplicate image captions (default:false).
-    -l, --display-link-targets
-                          Display link targets (default:false).
-    -a, --display-anchor-urls
-                          Display anchor URLs (default:false).
-    -r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
-                          Path to an optional JSON file containing rules for annotating the retrieved text.
-    -p POSTPROCESSOR, --postprocessor POSTPROCESSOR
-                          Optional component for postprocessing the result (html, surface, xml).
-    --indentation INDENTATION
-                          How to handle indentation (extended or strict; default: extended).
-    -v, --version         display version information
+    usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION]
+                       [--table-cell-separator TABLE_CELL_SEPARATOR] [-v]
+                       [input]
+
+    Convert the given HTML document to text.
+
+    positional arguments:
+      input                 Html input either from a file or a URL (default:stdin).
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -o OUTPUT, --output OUTPUT
+                            Output file (default:stdout).
+      -e ENCODING, --encoding ENCODING
+                            Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
+      -i, --display-image-captions
+                            Display image captions (default:false).
+      -d, --deduplicate-image-captions
+                            Deduplicate image captions (default:false).
+      -l, --display-link-targets
+                            Display link targets (default:false).
+      -a, --display-anchor-urls
+                            Display anchor URLs (default:false).
+      -r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
+                            Path to an optional JSON file containing rules for annotating the retrieved text.
+      -p POSTPROCESSOR, --postprocessor POSTPROCESSOR
+                            Optional component for postprocessing the result (html, surface, xml).
+      --indentation INDENTATION
+                            How to handle indentation (extended or strict; default: extended).
+      --table-cell-separator TABLE_CELL_SEPARATOR
+                            Separator to use between table cells (default: two spaces).
+      -v, --version         display version information
+
    
 
 HTML to text conversion
@@ -508,7 +512,36 @@ The following options are available for fine tuning inscriptis' HTML rendering:
       html_tree = fromstring(html)
       # create a parser using a custom css
       config = ParserConfig(css=css)
-      parser = Inscriptis(html_tree, config)
+      parser = Inscriptis(html_tree, config)  usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR]
+                     [--indentation INDENTATION] [-v]
+                     [input]
+
+  Convert the given HTML document to text.
+
+  positional arguments:
+    input                 Html input either from a file or a URL (default:stdin).
+
+  optional arguments:
+    -h, --help            show this help message and exit
+    -o OUTPUT, --output OUTPUT
+                          Output file (default:stdout).
+    -e ENCODING, --encoding ENCODING
+                          Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
+    -i, --display-image-captions
+                          Display image captions (default:false).
+    -d, --deduplicate-image-captions
+                          Deduplicate image captions (default:false).
+    -l, --display-link-targets
+                          Display link targets (default:false).
+    -a, --display-anchor-urls
+                          Display anchor URLs (default:false).
+    -r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
+                          Path to an optional JSON file containing rules for annotating the retrieved text.
+    -p POSTPROCESSOR, --postprocessor POSTPROCESSOR
+                          Optional component for postprocessing the result (html, surface, xml).
+    --indentation INDENTATION
+                          How to handle indentation (extended or strict; default: extended).
+    -v, --version         display version information
       text = parser.get_text()
 
 
diff --git a/scripts/inscript.py b/scripts/inscript.py
@@ -66,6 +66,9 @@ def get_parser():
     parser.add_argument('--indentation', default='extended',
                         help='How to handle indentation (extended or strict;'
                              ' default: extended).')
+    parser.add_argument('--table-cell-separator', default='  ',
+                        help='Separator to use between table cells (default: '
+                             'two spaces).')
     parser.add_argument('-v', '--version',
                         action='store_true', default=False,
                         help='display version information')
@@ -118,7 +121,8 @@ def get_parser():
                           deduplicate_captions=args.deduplicate_image_captions,
                           display_links=args.display_link_targets,
                           display_anchors=args.display_anchor_urls,
-                          annotation_rules=annotation_rules)
+                          annotation_rules=annotation_rules,
+                          table_cell_separator=args.table_cell_separator)
     if not annotation_rules:
         output = get_text(html_content, config)
     else:
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
@@ -199,7 +199,8 @@ def _start_li(self, _):
     def _start_table(self, _):
         self.tags[-1].set_canvas(Canvas())
         self.current_table.append(Table(
-            left_margin_len=self.tags[-1].canvas.left_margin))
+            left_margin_len=self.tags[-1].canvas.left_margin,
+            cell_separator=self.config.table_cell_separator))
 
     def _start_tr(self, _):
         if self.current_table:
diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py
@@ -4,4 +4,4 @@
 __author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch'
 __copyright__ = '2016-2021 Albert Weichselbraun, Fabian Odoni'
 __license__ = 'Apache 2.0'
-__version__ = '2.1.1'
+__version__ = '2.2.0'
diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py
@@ -2,20 +2,26 @@
 """Provide configuration objects for the Inscriptis HTML to text converter."""
 
 from copy import deepcopy
+from typing import Dict
 
 from inscriptis.css_profiles import CSS_PROFILES
 from inscriptis.annotation.parser import AnnotationModel
 from inscriptis.model.attribute import Attribute
+from inscriptis.model.html_element import HtmlElement
 
 DEFAULT_CSS_PROFILE_NAME = 'relaxed'
 
 
 class ParserConfig:
     """Encapsulate configuration options and CSS definitions."""
 
-    def __init__(self, css=None, display_images=False,
-                 deduplicate_captions=False, display_links=False,
-                 display_anchors=False, annotation_rules=None):
+    def __init__(self, css: Dict[str, HtmlElement] = None,
+                 display_images: bool = False,
+                 deduplicate_captions: bool = False,
+                 display_links: bool = False,
+                 display_anchors: bool = False,
+                 annotation_rules: Attribute = None,
+                 table_cell_separator: str = '  '):
         """Create a ParserConfig configuration.
 
         Args:
@@ -29,13 +35,15 @@ def __init__(self, css=None, display_images=False,
             display_anchors: whether to display anchors (e.g. `[here](#here)`).
             annotation_rules: an optional dictionary of annotation rules which
                               specify tags and attributes to annotation.
+            table_cell_separator: separator to use between table cells.
         """
         self.display_images = display_images
         self.deduplicate_captions = deduplicate_captions
         self.display_links = display_links
         self.display_anchors = display_anchors
         self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]
         self.attribute_handler = Attribute()
+        self.table_cell_separator = table_cell_separator
         if annotation_rules:
             # ensure that we do not modify the original model or its
             # members.
diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py
@@ -145,11 +145,16 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]:
 
 
 class TableRow:
-    """A single row within a table."""
+    """A single row within a table.
+
+    Attributes:
+        columns: the table row's columns.
+        cell_separator: string used for separating columns from each other.
+    """
 
     __slots__ = ('columns', 'cell_separator')
 
-    def __init__(self, cell_separator: str = '  '):
+    def __init__(self, cell_separator):
         self.columns: List[TableCell] = []
         self.cell_separator = cell_separator
 
@@ -179,17 +184,19 @@ class Table:
     Attributes:
         rows: the table's rows.
         left_margin_len: length of the left margin before the table.
+        cell_separator: string used for separating cells from each other.
     """
 
-    __slots__ = ('rows', 'left_margin_len')
+    __slots__ = ('rows', 'left_margin_len', 'cell_separator')
 
-    def __init__(self, left_margin_len: int):
+    def __init__(self, left_margin_len: int, cell_separator):
         self.rows = []
         self.left_margin_len = left_margin_len
+        self.cell_separator = cell_separator
 
     def add_row(self):
         """Add an empty :class:`TableRow` to the table."""
-        self.rows.append(TableRow())
+        self.rows.append(TableRow(self.cell_separator))
 
     def add_cell(self, table_cell: TableCell):
         """Add  a new :class:`TableCell` to the table's last row.
@@ -256,12 +263,14 @@ def get_annotations(self, idx: int,
         for row in self.rows:
             if not row.columns:
                 continue
+
             row_width = row.width + left_margin_len
+            row_height = row.columns[0].height
             cell_idx = idx
             for cell in row.columns:
                 annotations += cell.get_annotations(cell_idx, row_width)
                 cell_idx += cell.width + len(row.cell_separator)
 
-            idx += (row_width + 1) * cell.height   # linebreak
+            idx += (row_width + 1) * row_height   # linebreak
 
         return annotations
diff --git a/tests/test_table_row.py b/tests/test_table_row.py
@@ -5,10 +5,23 @@
 Test borderline cases for table rows
 """
 
+from inscriptis import get_text
+from inscriptis.model.config import ParserConfig
 from inscriptis.model.table import TableRow
 
+
 def test_empty_row():
-    tr = TableRow()
+    tr = TableRow(cell_separator='   ')
 
     assert tr.width == 0
     assert tr.get_text() == ''
+
+
+def test_table_cell_separator():
+    html = '<html><body><table><tr><td>Hallo<br>Eins</td><td>Echo<br>Zwei</td></tr></table></html>'
+
+    config = ParserConfig()
+    assert get_text(html, config) == 'Hallo  Echo\nEins   Zwei\n'
+
+    config = ParserConfig(table_cell_separator='\t')
+    assert get_text(html, config) == 'Hallo\tEcho\nEins \tZwei\n'
diff --git a/tox.ini b/tox.ini
@@ -1,4 +1,3 @@
-
 # standard unit tests
 [testenv:pytest]
 deps = pytest
@@ -10,29 +9,34 @@ commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests
 deps = pyroma
 commands = pyroma .
 
-# coding style
-[testenv:pep8]
-deps = flake8
-       flake8-blind-except
-       flake8-bandit
-       flake8-bugbear
-       flake8-builtins
-       flake8-cognitive-complexity
-       flake8-colors
-       flake8-comprehensions
-       flake8-docstrings
-       flake8-eradicate
-       flake8-expression-complexity
-       flake8-mutable
-       flake8-pathlib
-       flake8-pytest
-       flake8-quotes
-       flake8-raise
-       flake8-simplify
-       flake8-string-format
-       flake8-tuple
-       flake8-logging-format
-       pep8-naming
+# checks compatible with flake8 4.x
+[testenv:flake8-4]
+deps = flake8 ~= 4.0.1
+       flake8-blind-except ~= 0.2.0
+       flake8-bandit ~= 2.1.2
+       flake8-bugbear ~= 21.9.2
+       flake8-builtins ~= 1.5.3
+       flake8-cognitive-complexity ~= 0.1.0
+       flake8-colors ~= 0.1.9
+       flake8-comprehensions ~= 3.7.0
+       flake8-docstrings ~= 1.6.0
+       flake8-eradicate ~= 1.2.0
+       flake8-expression-complexity ~= 0.0.9
+       flake8-string-format ~= 0.3.0
+       flake8-tuple ~= 0.4.1
+       flake8-logging-format ~= 0.6.0
+       flake8-pytest ~= 1.3
+       flake8-quotes ~= 3.3.1
+       flake8-raise ~= 0.0.5
+       flake8-simplify ~= 0.14.2
+       pep8-naming ~= 0.12.1
+       flake8-mutable ~= 1.2.0
+commands = flake8
+
+# checks compatible with flake8 < 4.0.0
+[testenv:flake8-3]
+deps = flake8 < 4.0.0
+       flake8-use-pathlib ~= 0.2.0
 commands = flake8
 
 [flake8]