11
11
# Runs of tabs, spaces, carriage returns and line feeds
re_all_whitespace = re.compile(r'[\t \r\n]+')
# A whitespace run that contains at least one CR or LF
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
# Heading tag names: "h" followed by the heading level digits
re_html_heading = re.compile(r'h(\d+)')
# Patterns for trimming <pre> content. The "1" variants remove a single
# leading/trailing line break (plus the spaces next to it); the other two
# remove every leading blank line and all trailing spaces/newlines.
re_pre_lstrip1 = re.compile(r'^ *\n')
# Anchored with \Z rather than $: $ also matches just before a final newline,
# which let sub() match twice on 'code\n\n' and remove both newlines.
re_pre_rstrip1 = re.compile(r'\n *\Z')
re_pre_lstrip = re.compile(r'^[ \n]*\n')
re_pre_rstrip = re.compile(r'[ \n]*$')

# Pattern for creating convert_<tag> function names from tag names
re_make_convert_fn_name = re.compile(r'[\[\]:-]')
37
41
# confused with a list item
38
42
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')

# Matches each maximal run of one or more consecutive backticks
re_backtick_runs = re.compile(r'`+')
46
+
40
47
# Heading styles
41
48
ATX = 'atx'
42
49
ATX_CLOSED = 'atx_closed'
51
58
ASTERISK = '*'
52
59
UNDERSCORE = '_'
53
60
54
# Document/pre strip styles
# STRIP is the default for both the strip_document and strip_pre options;
# STRIP_ONE is the strip_pre setting that removes a single line break per end.
LSTRIP = 'lstrip'
RSTRIP = 'rstrip'
STRIP = 'strip'
STRIP_ONE = 'strip_one'
66
+
67
+
68
def strip1_pre(text):
    """Strip one leading and one trailing newline from a <pre> string.

    Leading spaces before the first newline, and spaces after the removed
    trailing newline, are dropped together with that newline. At most one
    line break is removed from each end.

    :param text: the text content of a <pre> element
    :return: the text with at most one line break trimmed from each end
    """
    text = re_pre_lstrip1.sub('', text)
    # A `$` anchor also matches just before a final newline, so an unbounded
    # sub() can match twice on text ending in '\n\n' and remove both
    # newlines. Capping at one substitution keeps the "strip one" contract.
    text = re_pre_rstrip1.sub('', text, count=1)
    return text
73
+
74
+
75
def strip_pre(text):
    """Strip all leading and trailing newlines from a <pre> string.

    Leading blank lines (runs of spaces/newlines ending in a newline) are
    removed first, then every trailing space and newline.

    :param text: the text content of a <pre> element
    :return: the trimmed text
    """
    without_leading = re_pre_lstrip.sub('', text)
    return re_pre_rstrip.sub('', without_leading)
58
80
59
81
60
82
def chomp (text ):
@@ -154,6 +176,7 @@ def _next_block_content_sibling(el):
154
176
class MarkdownConverter (object ):
155
177
class DefaultOptions :
156
178
autolinks = True
179
+ bs4_options = 'html.parser'
157
180
bullets = '*+-' # An iterable of bullet types.
158
181
code_language = ''
159
182
code_language_callback = None
@@ -167,6 +190,7 @@ class DefaultOptions:
167
190
newline_style = SPACES
168
191
strip = None
169
192
strip_document = STRIP
193
+ strip_pre = STRIP
170
194
strong_em_symbol = ASTERISK
171
195
sub_symbol = ''
172
196
sup_symbol = ''
@@ -187,11 +211,15 @@ def __init__(self, **options):
187
211
raise ValueError ('You may specify either tags to strip or tags to'
188
212
' convert, but not both.' )
189
213
214
+ # If a string or list is passed to bs4_options, assume it is a 'features' specification
215
+ if not isinstance (self .options ['bs4_options' ], dict ):
216
+ self .options ['bs4_options' ] = {'features' : self .options ['bs4_options' ]}
217
+
190
218
# Initialize the conversion function cache
191
219
self .convert_fn_cache = {}
192
220
193
221
def convert(self, html):
    """Parse an HTML string and convert the resulting tree.

    The 'bs4_options' mapping is passed to BeautifulSoup as keyword
    arguments, and the parsed soup is handed to convert_soup().

    :param html: HTML markup to convert
    :return: whatever convert_soup() returns for the parsed document
    """
    parser_kwargs = self.options['bs4_options']
    parsed = BeautifulSoup(html, **parser_kwargs)
    return self.convert_soup(parsed)
196
224
197
225
def convert_soup (self , soup ):
@@ -362,16 +390,20 @@ def get_conv_fn(self, tag_name):
362
390
if not self .should_convert_tag (tag_name ):
363
391
return None
364
392
365
- # Handle headings with _convert_hn() function
393
+ # Look for an explicitly defined conversion function by tag name first
394
+ convert_fn_name = "convert_%s" % re_make_convert_fn_name .sub ("_" , tag_name )
395
+ convert_fn = getattr (self , convert_fn_name , None )
396
+ if convert_fn :
397
+ return convert_fn
398
+
399
+ # If tag is any heading, handle with convert_hN() function
366
400
match = re_html_heading .match (tag_name )
367
401
if match :
368
- n = int (match .group (1 ))
369
- return lambda el , text , parent_tags : self ._convert_hn (n , el , text , parent_tags )
402
+ n = int (match .group (1 )) # get value of N from <hN>
403
+ return lambda el , text , parent_tags : self .convert_hN (n , el , text , parent_tags )
370
404
371
- # For other tags, look up their conversion function by tag name
372
- convert_fn_name = "convert_%s" % re_make_convert_fn_name .sub ('_' , tag_name )
373
- convert_fn = getattr (self , convert_fn_name , None )
374
- return convert_fn
405
+ # No conversion function was found
406
+ return None
375
407
376
408
def should_convert_tag (self , tag ):
377
409
"""Given a tag name, return whether to convert based on strip/convert options."""
@@ -451,10 +483,24 @@ def convert_br(self, el, text, parent_tags):
451
483
return ' \n '
452
484
453
485
def convert_code(self, el, text, parent_tags):
    """Convert a <code> element into a Markdown code span.

    :param el: the element being converted (not used here)
    :param text: the already-converted text content of the element
    :param parent_tags: collection of ancestor tag names / pseudo-tags
    :return: ``text`` unchanged when '_noformat' is among the parents, ''
        when nothing is left after chomp(), otherwise the backtick-delimited
        span wrapped in the prefix and suffix returned by chomp()
    """
    # No inline markup is added when '_noformat' is in effect
    if '_noformat' in parent_tags:
        return text

    # chomp() is defined elsewhere in this module; it returns a
    # (prefix, suffix, text) triple
    prefix, suffix, text = chomp(text)
    if not text:
        return ''

    # The delimiter must be one backtick longer than the longest backtick
    # run inside the text
    longest_run = 0
    for run in re_backtick_runs.findall(text):
        longest_run = max(longest_run, len(run))
    markup_delimiter = '`' * (longest_run + 1)

    # When the text itself contains backticks, pad it with spaces so the
    # inner backticks are not read as part of the delimiters
    if longest_run > 0:
        text = " " + text + " "

    pieces = (prefix, markup_delimiter, text, markup_delimiter, suffix)
    return '%s%s%s%s%s' % pieces
458
504
459
505
convert_del = abstract_inline_conversion (lambda self : '~~' )
460
506
@@ -509,12 +555,12 @@ def convert_dt(self, el, text, parent_tags):
509
555
510
556
return '\n \n %s\n ' % text
511
557
512
- def _convert_hn (self , n , el , text , parent_tags ):
513
- """ Method name prefixed with _ to prevent <hn> to call this """
558
+ def convert_hN (self , n , el , text , parent_tags ):
559
+ # convert_hN() converts <hN> tags, where N is any integer
514
560
if '_inline' in parent_tags :
515
561
return text
516
562
517
- # prevent MemoryErrors in case of very large n
563
+ # Markdown does not support heading depths of n > 6
518
564
n = max (1 , min (6 , n ))
519
565
520
566
style = self .options ['heading_style' ].lower ()
@@ -647,8 +693,20 @@ def convert_pre(self, el, text, parent_tags):
647
693
if self .options ['code_language_callback' ]:
648
694
code_language = self .options ['code_language_callback' ](el ) or code_language
649
695
696
+ if self .options ['strip_pre' ] == STRIP :
697
+ text = strip_pre (text ) # remove all leading/trailing newlines
698
+ elif self .options ['strip_pre' ] == STRIP_ONE :
699
+ text = strip1_pre (text ) # remove one leading/trailing newline
700
+ elif self .options ['strip_pre' ] is None :
701
+ pass # leave leading and trailing newlines as-is
702
+ else :
703
+ raise ValueError ('Invalid value for strip_pre: %s' % self .options ['strip_pre' ])
704
+
650
705
return '\n \n ```%s\n %s\n ```\n \n ' % (code_language , text )
651
706
707
def convert_q(self, el, text, parent_tags):
    """Convert a <q> element by wrapping its text in double quotes.

    :param el: the element being converted (not used here)
    :param text: the already-converted text content of the element
    :param parent_tags: ancestor tag names (not used here)
    :return: ``text`` surrounded by '"' characters
    """
    quote = '"'
    return ''.join((quote, text, quote))
709
+
652
710
def convert_script(self, el, text, parent_tags):
    """Convert a <script> element: always returns an empty string.

    All three arguments are ignored.
    """
    nothing = ''
    return nothing
654
712
@@ -677,13 +735,13 @@ def convert_figcaption(self, el, text, parent_tags):
677
735
def convert_td(self, el, text, parent_tags):
    """Convert a <td> element into a Markdown table cell.

    The cell text is stripped, its newlines are replaced by spaces, and it
    is followed by one ' |' separator per spanned column.

    :param el: the cell element; its 'colspan' attribute is read if present
    :param text: the already-converted text content of the cell
    :param parent_tags: ancestor tag names (not used here)
    :return: the cell as a fragment of a Markdown table row
    """
    colspan = 1
    if 'colspan' in el.attrs and el['colspan'].isdigit():
        try:
            # Clamp to 1..1000 so a huge colspan cannot blow up the output
            colspan = max(1, min(1000, int(el['colspan'])))
        except ValueError:
            # isdigit() accepts some characters that int() rejects (e.g. a
            # superscript two); treat such values as "no colspan"
            colspan = 1
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
682
740
683
741
def convert_th(self, el, text, parent_tags):
    """Convert a <th> element into a Markdown table cell.

    The cell text is stripped, its newlines are replaced by spaces, and it
    is followed by one ' |' separator per spanned column.

    :param el: the cell element; its 'colspan' attribute is read if present
    :param text: the already-converted text content of the cell
    :param parent_tags: ancestor tag names (not used here)
    :return: the cell as a fragment of a Markdown table row
    """
    colspan = 1
    if 'colspan' in el.attrs and el['colspan'].isdigit():
        try:
            # Clamp to 1..1000 so a huge colspan cannot blow up the output
            colspan = max(1, min(1000, int(el['colspan'])))
        except ValueError:
            # isdigit() accepts some characters that int() rejects (e.g. a
            # superscript two); treat such values as "no colspan"
            colspan = 1
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
688
746
689
747
def convert_tr (self , el , text , parent_tags ):
@@ -704,7 +762,7 @@ def convert_tr(self, el, text, parent_tags):
704
762
full_colspan = 0
705
763
for cell in cells :
706
764
if 'colspan' in cell .attrs and cell ['colspan' ].isdigit ():
707
- full_colspan += int (cell [" colspan" ] )
765
+ full_colspan += max ( 1 , min ( 1000 , int (cell [' colspan' ])) )
708
766
else :
709
767
full_colspan += 1
710
768
if ((is_headrow
0 commit comments