Skip to content

Commit 6fa9516

Browse files
Merge pull request #43 from weblyzard/feature/custom-table-separation-character
Feature/custom table separation character
2 parents 9c9aca1 + 39ece5d commit 6fa9516

File tree

8 files changed

+140
-68
lines changed

8 files changed

+140
-68
lines changed

README.rst

+64-31
Original file line numberDiff line numberDiff line change
@@ -130,38 +130,42 @@ the corresponding text representation.
130130

131131
Command line parameters
132132
-----------------------
133+
133134
The inscript.py command line client supports the following parameters::
134135

135-
usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR]
136-
[--indentation INDENTATION] [-v]
137-
[input]
138-
139-
Convert the given HTML document to text.
140-
141-
positional arguments:
142-
input Html input either from a file or a URL (default:stdin).
143-
144-
optional arguments:
145-
-h, --help show this help message and exit
146-
-o OUTPUT, --output OUTPUT
147-
Output file (default:stdout).
148-
-e ENCODING, --encoding ENCODING
149-
Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
150-
-i, --display-image-captions
151-
Display image captions (default:false).
152-
-d, --deduplicate-image-captions
153-
Deduplicate image captions (default:false).
154-
-l, --display-link-targets
155-
Display link targets (default:false).
156-
-a, --display-anchor-urls
157-
Display anchor URLs (default:false).
158-
-r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
159-
Path to an optional JSON file containing rules for annotating the retrieved text.
160-
-p POSTPROCESSOR, --postprocessor POSTPROCESSOR
161-
Optional component for postprocessing the result (html, surface, xml).
162-
--indentation INDENTATION
163-
How to handle indentation (extended or strict; default: extended).
164-
-v, --version display version information
136+
usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION]
137+
[--table-cell-separator TABLE_CELL_SEPARATOR] [-v]
138+
[input]
139+
140+
Convert the given HTML document to text.
141+
142+
positional arguments:
143+
input Html input either from a file or a URL (default:stdin).
144+
145+
optional arguments:
146+
-h, --help show this help message and exit
147+
-o OUTPUT, --output OUTPUT
148+
Output file (default:stdout).
149+
-e ENCODING, --encoding ENCODING
150+
Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
151+
-i, --display-image-captions
152+
Display image captions (default:false).
153+
-d, --deduplicate-image-captions
154+
Deduplicate image captions (default:false).
155+
-l, --display-link-targets
156+
Display link targets (default:false).
157+
-a, --display-anchor-urls
158+
Display anchor URLs (default:false).
159+
-r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
160+
Path to an optional JSON file containing rules for annotating the retrieved text.
161+
-p POSTPROCESSOR, --postprocessor POSTPROCESSOR
162+
Optional component for postprocessing the result (html, surface, xml).
163+
--indentation INDENTATION
164+
How to handle indentation (extended or strict; default: extended).
165+
--table-cell-separator TABLE_CELL_SEPARATOR
166+
Separator to use between table cells (default: three spaces).
167+
-v, --version display version information
168+
165169
166170

167171
HTML to text conversion
@@ -508,7 +512,36 @@ The following options are available for fine tuning inscriptis' HTML rendering:
508512
html_tree = fromstring(html)
509513
# create a parser using a custom css
510514
config = ParserConfig(css=css)
511-
parser = Inscriptis(html_tree, config)
515+
parser = Inscriptis(html_tree, config)
512545
text = parser.get_text()
513546
514547

scripts/inscript.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ def get_parser():
6666
parser.add_argument('--indentation', default='extended',
6767
help='How to handle indentation (extended or strict;'
6868
' default: extended).')
69+
parser.add_argument('--table-cell-separator', default='   ',
70+
help='Separator to use between table cells (default: '
71+
'three spaces).')
6972
parser.add_argument('-v', '--version',
7073
action='store_true', default=False,
7174
help='display version information')
@@ -118,7 +121,8 @@ def get_parser():
118121
deduplicate_captions=args.deduplicate_image_captions,
119122
display_links=args.display_link_targets,
120123
display_anchors=args.display_anchor_urls,
121-
annotation_rules=annotation_rules)
124+
annotation_rules=annotation_rules,
125+
table_cell_separator=args.table_cell_separator)
122126
if not annotation_rules:
123127
output = get_text(html_content, config)
124128
else:

src/inscriptis/html_engine.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,8 @@ def _start_li(self, _):
199199
def _start_table(self, _):
200200
self.tags[-1].set_canvas(Canvas())
201201
self.current_table.append(Table(
202-
left_margin_len=self.tags[-1].canvas.left_margin))
202+
left_margin_len=self.tags[-1].canvas.left_margin,
203+
cell_separator=self.config.table_cell_separator))
203204

204205
def _start_tr(self, _):
205206
if self.current_table:

src/inscriptis/metadata.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
55
__copyright__ = '2016-2021 Albert Weichselbraun, Fabian Odoni'
66
__license__ = 'Apache 2.0'
7-
__version__ = '2.1.1'
7+
__version__ = '2.2.0'

src/inscriptis/model/config.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,26 @@
22
"""Provide configuration objects for the Inscriptis HTML to text converter."""
33

44
from copy import deepcopy
5+
from typing import Dict
56

67
from inscriptis.css_profiles import CSS_PROFILES
78
from inscriptis.annotation.parser import AnnotationModel
89
from inscriptis.model.attribute import Attribute
10+
from inscriptis.model.html_element import HtmlElement
911

1012
DEFAULT_CSS_PROFILE_NAME = 'relaxed'
1113

1214

1315
class ParserConfig:
1416
"""Encapsulate configuration options and CSS definitions."""
1517

16-
def __init__(self, css=None, display_images=False,
17-
deduplicate_captions=False, display_links=False,
18-
display_anchors=False, annotation_rules=None):
18+
def __init__(self, css: Dict[str, HtmlElement] = None,
19+
display_images: bool = False,
20+
deduplicate_captions: bool = False,
21+
display_links: bool = False,
22+
display_anchors: bool = False,
23+
annotation_rules: Attribute = None,
24+
table_cell_separator: str = '   '):
1925
"""Create a ParserConfig configuration.
2026
2127
Args:
@@ -29,13 +35,15 @@ def __init__(self, css=None, display_images=False,
2935
display_anchors: whether to display anchors (e.g. `[here](#here)`).
3036
annotation_rules: an optional dictionary of annotation rules which
3137
specify tags and attributes to annotate.
38+
table_cell_separator: separator to use between table cells.
3239
"""
3340
self.display_images = display_images
3441
self.deduplicate_captions = deduplicate_captions
3542
self.display_links = display_links
3643
self.display_anchors = display_anchors
3744
self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]
3845
self.attribute_handler = Attribute()
46+
self.table_cell_separator = table_cell_separator
3947
if annotation_rules:
4048
# ensure that we do not modify the original model or its
4149
# members.

src/inscriptis/model/table.py

+15-6
Original file line numberDiff line numberDiff line change
@@ -145,11 +145,16 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]:
145145

146146

147147
class TableRow:
148-
"""A single row within a table."""
148+
"""A single row within a table.
149+
150+
Attributes:
151+
columns: the table row's columns.
152+
cell_separator: string used for separating columns from each other.
153+
"""
149154

150155
__slots__ = ('columns', 'cell_separator')
151156

152-
def __init__(self, cell_separator: str = ' '):
157+
def __init__(self, cell_separator):
153158
self.columns: List[TableCell] = []
154159
self.cell_separator = cell_separator
155160

@@ -179,17 +184,19 @@ class Table:
179184
Attributes:
180185
rows: the table's rows.
181186
left_margin_len: length of the left margin before the table.
187+
cell_separator: string used for separating cells from each other.
182188
"""
183189

184-
__slots__ = ('rows', 'left_margin_len')
190+
__slots__ = ('rows', 'left_margin_len', 'cell_separator')
185191

186-
def __init__(self, left_margin_len: int):
192+
def __init__(self, left_margin_len: int, cell_separator):
187193
self.rows = []
188194
self.left_margin_len = left_margin_len
195+
self.cell_separator = cell_separator
189196

190197
def add_row(self):
191198
"""Add an empty :class:`TableRow` to the table."""
192-
self.rows.append(TableRow())
199+
self.rows.append(TableRow(self.cell_separator))
193200

194201
def add_cell(self, table_cell: TableCell):
195202
"""Add a new :class:`TableCell` to the table's last row.
@@ -256,12 +263,14 @@ def get_annotations(self, idx: int,
256263
for row in self.rows:
257264
if not row.columns:
258265
continue
266+
259267
row_width = row.width + left_margin_len
268+
row_height = row.columns[0].height
260269
cell_idx = idx
261270
for cell in row.columns:
262271
annotations += cell.get_annotations(cell_idx, row_width)
263272
cell_idx += cell.width + len(row.cell_separator)
264273

265-
idx += (row_width + 1) * cell.height # linebreak
274+
idx += (row_width + 1) * row_height # linebreak
266275

267276
return annotations

tests/test_table_row.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,23 @@
55
Test borderline cases for table rows
66
"""
77

8+
from inscriptis import get_text
9+
from inscriptis.model.config import ParserConfig
810
from inscriptis.model.table import TableRow
911

12+
1013
def test_empty_row():
11-
tr = TableRow()
14+
tr = TableRow(cell_separator=' ')
1215

1316
assert tr.width == 0
1417
assert tr.get_text() == ''
18+
19+
20+
def test_table_cell_separator():
21+
html = '<html><body><table><tr><td>Hallo<br>Eins</td><td>Echo<br>Zwei</td></tr></table></html>'
22+
23+
config = ParserConfig()
24+
assert get_text(html, config) == 'Hallo   Echo\nEins    Zwei\n'
25+
26+
config = ParserConfig(table_cell_separator='\t')
27+
assert get_text(html, config) == 'Hallo\tEcho\nEins \tZwei\n'

tox.ini

+28-24
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
# standard unit tests
32
[testenv:pytest]
43
deps = pytest
@@ -10,29 +9,34 @@ commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests
109
deps = pyroma
1110
commands = pyroma .
1211

13-
# coding style
14-
[testenv:pep8]
15-
deps = flake8
16-
flake8-blind-except
17-
flake8-bandit
18-
flake8-bugbear
19-
flake8-builtins
20-
flake8-cognitive-complexity
21-
flake8-colors
22-
flake8-comprehensions
23-
flake8-docstrings
24-
flake8-eradicate
25-
flake8-expression-complexity
26-
flake8-mutable
27-
flake8-pathlib
28-
flake8-pytest
29-
flake8-quotes
30-
flake8-raise
31-
flake8-simplify
32-
flake8-string-format
33-
flake8-tuple
34-
flake8-logging-format
35-
pep8-naming
12+
# checks compatible with flake8 4
13+
[testenv:flake8-4]
14+
deps = flake8 ~= 4.0.1
15+
flake8-blind-except ~= 0.2.0
16+
flake8-bandit ~= 2.1.2
17+
flake8-bugbear ~= 21.9.2
18+
flake8-builtins ~= 1.5.3
19+
flake8-cognitive-complexity ~= 0.1.0
20+
flake8-colors ~= 0.1.9
21+
flake8-comprehensions ~= 3.7.0
22+
flake8-docstrings ~= 1.6.0
23+
flake8-eradicate ~= 1.2.0
24+
flake8-expression-complexity ~= 0.0.9
25+
flake8-string-format ~= 0.3.0
26+
flake8-tuple ~= 0.4.1
27+
flake8-logging-format ~= 0.6.0
28+
flake8-pytest ~= 1.3
29+
flake8-quotes ~= 3.3.1
30+
flake8-raise ~= 0.0.5
31+
flake8-simplify ~= 0.14.2
32+
pep8-naming ~= 0.12.1
33+
flake8-mutable ~= 1.2.0
34+
commands = flake8
35+
36+
# checks compatible with flake8 < 4.0.0
37+
[testenv:flake8-3]
38+
deps = flake8 < 4.0.0
39+
flake8-use-pathlib ~= 0.2.0
3640
commands = flake8
3741

3842
[flake8]

0 commit comments

Comments
 (0)