Skip to content

Commit c47709c

Browse files
committed
Merge branch 'develop'
2 parents 2656689 + fbc1353 commit c47709c

File tree

11 files changed

+310
-28
lines changed

11 files changed

+310
-28
lines changed

.github/workflows/python-app.yml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ubuntu-latest
1616

1717
steps:
18-
- uses: actions/checkout@v2
18+
- uses: actions/checkout@v4
1919
- name: Set up Python 3.8
2020
uses: actions/setup-python@v2
2121
with:
@@ -30,3 +30,22 @@ jobs:
3030
- name: Build
3131
run: |
3232
python -m build -nwsx .
33+
34+
types:
35+
36+
runs-on: ubuntu-latest
37+
38+
steps:
39+
- uses: actions/checkout@v2
40+
- name: Set up Python 3.8
41+
uses: actions/setup-python@v2
42+
with:
43+
python-version: 3.8
44+
- name: Install dependencies
45+
run: |
46+
python -m pip install --upgrade pip
47+
pip install --upgrade setuptools setuptools_scm wheel build tox mypy types-beautifulsoup4
48+
- name: Check types
49+
run: |
50+
mypy .
51+
mypy --strict tests/types.py

.github/workflows/python-publish.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
runs-on: ubuntu-latest
1414

1515
steps:
16-
- uses: actions/checkout@v2
16+
- uses: actions/checkout@v4
1717
- name: Set up Python
1818
uses: actions/setup-python@v2
1919
with:

README.rst

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ code_language_callback
110110
When the HTML code contains ``pre`` tags that in some way provide the code
111111
language, for example as class, this callback can be used to extract the
112112
language from the tag and prefix it to the converted ``pre`` tag.
113-
The callback gets one single argument, an BeautifylSoup object, and returns
113+
The callback gets one single argument, a BeautifulSoup object, and returns
114114
a string containing the code language, or ``None``.
115115
An example to use the class name as code language could be::
116116

@@ -157,6 +157,23 @@ strip_document
157157
within the document are unaffected.
158158
Defaults to ``STRIP``.
159159

160+
strip_pre
161+
Controls whether leading/trailing blank lines are removed from ``<pre>``
162+
tags. Supported values are ``STRIP`` (all leading/trailing blank lines),
163+
``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
164+
Defaults to ``STRIP``.
165+
166+
bs4_options
167+
Specify additional configuration options for the ``BeautifulSoup`` object
168+
used to interpret the HTML markup. String and list values (such as ``lxml``
169+
or ``html5lib``) are treated as ``features`` arguments to control parser
170+
selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
171+
are treated as full kwargs to be used for the BeautifulSoup constructor,
172+
allowing specification of any parameter. For parameter details, see the
173+
`BeautifulSoup`_ documentation.
174+
175+
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
176+
160177
Options may be specified as kwargs to the ``markdownify`` function, or as a
161178
nested ``Options`` class in ``MarkdownConverter`` subclasses.
162179

markdownify/__init__.py

Lines changed: 76 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
re_all_whitespace = re.compile(r'[\t \r\n]+')
1212
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
1313
re_html_heading = re.compile(r'h(\d+)')
14+
re_pre_lstrip1 = re.compile(r'^ *\n')
15+
re_pre_rstrip1 = re.compile(r'\n *$')
16+
re_pre_lstrip = re.compile(r'^[ \n]*\n')
17+
re_pre_rstrip = re.compile(r'[ \n]*$')
1418

1519
# Pattern for creating convert_<tag> function names from tag names
1620
re_make_convert_fn_name = re.compile(r'[\[\]:-]')
@@ -37,6 +41,9 @@
3741
# confused with a list item
3842
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
3943

44+
# Find consecutive backtick sequences in a string
45+
re_backtick_runs = re.compile(r'`+')
46+
4047
# Heading styles
4148
ATX = 'atx'
4249
ATX_CLOSED = 'atx_closed'
@@ -51,10 +58,25 @@
5158
ASTERISK = '*'
5259
UNDERSCORE = '_'
5360

54-
# Document strip styles
61+
# Document/pre strip styles
5562
LSTRIP = 'lstrip'
5663
RSTRIP = 'rstrip'
5764
STRIP = 'strip'
65+
STRIP_ONE = 'strip_one'
66+
67+
68+
def strip1_pre(text):
69+
"""Strip one leading and trailing newline from a <pre> string."""
70+
text = re_pre_lstrip1.sub('', text)
71+
text = re_pre_rstrip1.sub('', text)
72+
return text
73+
74+
75+
def strip_pre(text):
76+
"""Strip all leading and trailing newlines from a <pre> string."""
77+
text = re_pre_lstrip.sub('', text)
78+
text = re_pre_rstrip.sub('', text)
79+
return text
5880

5981

6082
def chomp(text):
@@ -154,6 +176,7 @@ def _next_block_content_sibling(el):
154176
class MarkdownConverter(object):
155177
class DefaultOptions:
156178
autolinks = True
179+
bs4_options = 'html.parser'
157180
bullets = '*+-' # An iterable of bullet types.
158181
code_language = ''
159182
code_language_callback = None
@@ -167,6 +190,7 @@ class DefaultOptions:
167190
newline_style = SPACES
168191
strip = None
169192
strip_document = STRIP
193+
strip_pre = STRIP
170194
strong_em_symbol = ASTERISK
171195
sub_symbol = ''
172196
sup_symbol = ''
@@ -187,11 +211,15 @@ def __init__(self, **options):
187211
raise ValueError('You may specify either tags to strip or tags to'
188212
' convert, but not both.')
189213

214+
# If a string or list is passed to bs4_options, assume it is a 'features' specification
215+
if not isinstance(self.options['bs4_options'], dict):
216+
self.options['bs4_options'] = {'features': self.options['bs4_options']}
217+
190218
# Initialize the conversion function cache
191219
self.convert_fn_cache = {}
192220

193221
def convert(self, html):
194-
soup = BeautifulSoup(html, 'html.parser')
222+
soup = BeautifulSoup(html, **self.options['bs4_options'])
195223
return self.convert_soup(soup)
196224

197225
def convert_soup(self, soup):
@@ -362,16 +390,20 @@ def get_conv_fn(self, tag_name):
362390
if not self.should_convert_tag(tag_name):
363391
return None
364392

365-
# Handle headings with _convert_hn() function
393+
# Look for an explicitly defined conversion function by tag name first
394+
convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
395+
convert_fn = getattr(self, convert_fn_name, None)
396+
if convert_fn:
397+
return convert_fn
398+
399+
# If tag is any heading, handle with convert_hN() function
366400
match = re_html_heading.match(tag_name)
367401
if match:
368-
n = int(match.group(1))
369-
return lambda el, text, parent_tags: self._convert_hn(n, el, text, parent_tags)
402+
n = int(match.group(1)) # get value of N from <hN>
403+
return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)
370404

371-
# For other tags, look up their conversion function by tag name
372-
convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub('_', tag_name)
373-
convert_fn = getattr(self, convert_fn_name, None)
374-
return convert_fn
405+
# No conversion function was found
406+
return None
375407

376408
def should_convert_tag(self, tag):
377409
"""Given a tag name, return whether to convert based on strip/convert options."""
@@ -451,10 +483,24 @@ def convert_br(self, el, text, parent_tags):
451483
return ' \n'
452484

453485
def convert_code(self, el, text, parent_tags):
454-
if 'pre' in parent_tags:
486+
if '_noformat' in parent_tags:
455487
return text
456-
converter = abstract_inline_conversion(lambda self: '`')
457-
return converter(self, el, text, parent_tags)
488+
489+
prefix, suffix, text = chomp(text)
490+
if not text:
491+
return ''
492+
493+
# Find the maximum number of consecutive backticks in the text, then
494+
# delimit the code span with one more backtick than that
495+
max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
496+
markup_delimiter = '`' * (max_backticks + 1)
497+
498+
# If the maximum number of backticks is greater than zero, add a space
499+
# to avoid interpretation of inside backticks as literals
500+
if max_backticks > 0:
501+
text = " " + text + " "
502+
503+
return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)
458504

459505
convert_del = abstract_inline_conversion(lambda self: '~~')
460506

@@ -509,12 +555,12 @@ def convert_dt(self, el, text, parent_tags):
509555

510556
return '\n\n%s\n' % text
511557

512-
def _convert_hn(self, n, el, text, parent_tags):
513-
""" Method name prefixed with _ to prevent <hn> to call this """
558+
def convert_hN(self, n, el, text, parent_tags):
559+
# convert_hN() converts <hN> tags, where N is any integer
514560
if '_inline' in parent_tags:
515561
return text
516562

517-
# prevent MemoryErrors in case of very large n
563+
# Markdown does not support heading depths of n > 6
518564
n = max(1, min(6, n))
519565

520566
style = self.options['heading_style'].lower()
@@ -647,8 +693,20 @@ def convert_pre(self, el, text, parent_tags):
647693
if self.options['code_language_callback']:
648694
code_language = self.options['code_language_callback'](el) or code_language
649695

696+
if self.options['strip_pre'] == STRIP:
697+
text = strip_pre(text) # remove all leading/trailing newlines
698+
elif self.options['strip_pre'] == STRIP_ONE:
699+
text = strip1_pre(text) # remove one leading/trailing newline
700+
elif self.options['strip_pre'] is None:
701+
pass # leave leading and trailing newlines as-is
702+
else:
703+
raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])
704+
650705
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
651706

707+
def convert_q(self, el, text, parent_tags):
708+
return '"' + text + '"'
709+
652710
def convert_script(self, el, text, parent_tags):
653711
return ''
654712

@@ -677,13 +735,13 @@ def convert_figcaption(self, el, text, parent_tags):
677735
def convert_td(self, el, text, parent_tags):
678736
colspan = 1
679737
if 'colspan' in el.attrs and el['colspan'].isdigit():
680-
colspan = int(el['colspan'])
738+
colspan = max(1, min(1000, int(el['colspan'])))
681739
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
682740

683741
def convert_th(self, el, text, parent_tags):
684742
colspan = 1
685743
if 'colspan' in el.attrs and el['colspan'].isdigit():
686-
colspan = int(el['colspan'])
744+
colspan = max(1, min(1000, int(el['colspan'])))
687745
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
688746

689747
def convert_tr(self, el, text, parent_tags):
@@ -704,7 +762,7 @@ def convert_tr(self, el, text, parent_tags):
704762
full_colspan = 0
705763
for cell in cells:
706764
if 'colspan' in cell.attrs and cell['colspan'].isdigit():
707-
full_colspan += int(cell["colspan"])
765+
full_colspan += max(1, min(1000, int(cell['colspan'])))
708766
else:
709767
full_colspan += 1
710768
if ((is_headrow

markdownify/__init__.pyi

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from _typeshed import Incomplete
2+
from typing import Callable, Union
3+
4+
ATX: str
5+
ATX_CLOSED: str
6+
UNDERLINED: str
7+
SETEXT = UNDERLINED
8+
SPACES: str
9+
BACKSLASH: str
10+
ASTERISK: str
11+
UNDERSCORE: str
12+
LSTRIP: str
13+
RSTRIP: str
14+
STRIP: str
15+
STRIP_ONE: str
16+
17+
18+
def markdownify(
19+
html: str,
20+
autolinks: bool = ...,
21+
bs4_options: str = ...,
22+
bullets: str = ...,
23+
code_language: str = ...,
24+
code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
25+
convert: Union[list[str], None] = ...,
26+
default_title: bool = ...,
27+
escape_asterisks: bool = ...,
28+
escape_underscores: bool = ...,
29+
escape_misc: bool = ...,
30+
heading_style: str = ...,
31+
keep_inline_images_in: list[str] = ...,
32+
newline_style: str = ...,
33+
strip: Union[list[str], None] = ...,
34+
strip_document: Union[str, None] = ...,
35+
strip_pre: str = ...,
36+
strong_em_symbol: str = ...,
37+
sub_symbol: str = ...,
38+
sup_symbol: str = ...,
39+
table_infer_header: bool = ...,
40+
wrap: bool = ...,
41+
wrap_width: int = ...,
42+
) -> str: ...
43+
44+
45+
class MarkdownConverter:
46+
def __init__(
47+
self,
48+
autolinks: bool = ...,
49+
bs4_options: str = ...,
50+
bullets: str = ...,
51+
code_language: str = ...,
52+
code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
53+
convert: Union[list[str], None] = ...,
54+
default_title: bool = ...,
55+
escape_asterisks: bool = ...,
56+
escape_underscores: bool = ...,
57+
escape_misc: bool = ...,
58+
heading_style: str = ...,
59+
keep_inline_images_in: list[str] = ...,
60+
newline_style: str = ...,
61+
strip: Union[list[str], None] = ...,
62+
strip_document: Union[str, None] = ...,
63+
strip_pre: str = ...,
64+
strong_em_symbol: str = ...,
65+
sub_symbol: str = ...,
66+
sup_symbol: str = ...,
67+
table_infer_header: bool = ...,
68+
wrap: bool = ...,
69+
wrap_width: int = ...,
70+
) -> None:
71+
...
72+
73+
def convert(self, html: str) -> str:
74+
...
75+
76+
def convert_soup(self, soup: Incomplete) -> str:
77+
...

markdownify/main.py

100644100755
Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]):
5555
parser.add_argument('--no-escape-underscores', dest='escape_underscores',
5656
action='store_false',
5757
help="Do not escape '_' to '\\_' in text.")
58-
parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
58+
parser.add_argument('-i', '--keep-inline-images-in',
59+
default=[],
60+
nargs='*',
5961
help="Images are converted to their alt-text when the images are "
6062
"located inside headlines or table cells. If some inline images "
6163
"should be converted to markdown images instead, this option can "
@@ -68,6 +70,11 @@ def main(argv=sys.argv[1:]):
6870
parser.add_argument('-w', '--wrap', action='store_true',
6971
help="Wrap all text paragraphs at --wrap-width characters.")
7072
parser.add_argument('--wrap-width', type=int, default=80)
73+
parser.add_argument('--bs4-options',
74+
default='html.parser',
75+
help="Specifies the parser that BeautifulSoup should use to parse "
76+
"the HTML markup. Examples include 'html.parser', 'lxml', and "
77+
"'html5lib'.")
7178

7279
args = parser.parse_args(argv)
7380
print(markdownify(**vars(args)))

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "markdownify"
7-
version = "1.1.0"
7+
version = "1.2.0"
88
authors = [{name = "Matthew Tretter", email = "[email protected]"}]
99
description = "Convert HTML to markdown."
1010
readme = "README.rst"

0 commit comments

Comments
 (0)