|
4 | 4 | import six
|
5 | 5 |
|
6 | 6 |
|
7 |
| -re_convert_heading = re.compile(r'convert_h(\d+)') |
8 | 7 | re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
|
9 | 8 | re_whitespace = re.compile(r'[\t ]+')
|
10 | 9 | re_all_whitespace = re.compile(r'[\t \r\n]+')
|
11 | 10 | re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
|
12 |
| -re_html_heading = re.compile(r'h[1-6]') |
| 11 | +re_html_heading = re.compile(r'h(\d+)') |
13 | 12 |
|
14 | 13 | # extract (leading_nl, content, trailing_nl) from a string
|
15 | 14 | # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
|
@@ -165,6 +164,9 @@ def __init__(self, **options):
|
165 | 164 | raise ValueError('You may specify either tags to strip or tags to'
|
166 | 165 | ' convert, but not both.')
|
167 | 166 |
|
| 167 | + # Initialize the conversion function cache |
| 168 | + self.convert_fn_cache = {} |
| 169 | + |
168 | 170 | def convert(self, html):
|
169 | 171 | soup = BeautifulSoup(html, 'html.parser')
|
170 | 172 | return self.convert_soup(soup)
|
@@ -266,9 +268,8 @@ def _can_ignore(el):
|
266 | 268 | text = ''.join(child_strings)
|
267 | 269 |
|
268 | 270 | # apply this tag's final conversion function
|
269 |
| - convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name) |
270 |
| - convert_fn = getattr(self, convert_fn_name, None) |
271 |
| - if convert_fn and self.should_convert_tag(node.name): |
| 271 | + convert_fn = self.get_conv_fn_cached(node.name) |
| 272 | + if convert_fn is not None: |
272 | 273 | text = convert_fn(node, text, parent_tags=parent_tags)
|
273 | 274 |
|
274 | 275 | return text
|
@@ -321,23 +322,36 @@ def process_text(self, el, parent_tags=None):
|
321 | 322 |
|
322 | 323 | return text
|
323 | 324 |
|
324 |
| - def __getattr__(self, attr): |
325 |
| - # Handle headings |
326 |
| - m = re_convert_heading.match(attr) |
327 |
| - if m: |
328 |
| - n = int(m.group(1)) |
def get_conv_fn_cached(self, tag_name):
    """Return the conversion function for ``tag_name``, memoized per instance.

    The result of ``self.get_conv_fn(tag_name)`` is stored in
    ``self.convert_fn_cache`` under the tag name exactly as passed in, so the
    lookup runs at most once per distinct name. A ``None`` result (no
    conversion for this tag) is cached as well.
    """
    memo = self.convert_fn_cache
    try:
        # Fast path: this tag name has been resolved before
        return memo[tag_name]
    except KeyError:
        # First sighting of this tag name: resolve it and remember the answer
        resolved = memo[tag_name] = self.get_conv_fn(tag_name)
        return resolved
329 | 333 |
|
330 |
| - def convert_tag(el, text, parent_tags): |
331 |
| - return self._convert_hn(n, el, text, parent_tags) |
def get_conv_fn(self, tag_name):
    """Given a tag name, find and return the conversion function.

    The tag name is lowercased before any lookup. Resolution order:

    1. If ``self.should_convert_tag()`` rejects the tag, return ``None``.
    2. If the instance has a ``convert_<tag>`` attribute (with any of the
       characters ``[``, ``]``, ``:`` and ``-`` in the tag name replaced by
       ``_``), return it.
    3. If the tag name matches the heading pattern ``re_html_heading``,
       return a function that forwards to ``self._convert_hn()`` with the
       heading level.
    4. Otherwise return ``None``.

    The returned callable is invoked as ``fn(el, text, parent_tags=...)``.
    """
    tag_name = tag_name.lower()

    # Handle strip/convert exclusion options
    if not self.should_convert_tag(tag_name):
        return None

    # Look for an explicitly defined conversion function by tag name first,
    # so that a convert_h1()-style override defined on a subclass takes
    # precedence over the generic heading handler below.
    convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", tag_name)
    convert_fn = getattr(self, convert_fn_name, None)
    if convert_fn is not None:
        return convert_fn

    # Fall back to the generic _convert_hn() handler for heading tags
    match = re_html_heading.match(tag_name)
    if match:
        n = int(match.group(1))  # heading level N from <hN>
        return lambda el, text, parent_tags: self._convert_hn(n, el, text, parent_tags)

    # No conversion function was found
    return None
338 | 352 |
|
339 | 353 | def should_convert_tag(self, tag):
|
340 |
| - tag = tag.lower() |
| 354 | + """Given a tag name, return whether to convert based on strip/convert options.""" |
341 | 355 | strip = self.options['strip']
|
342 | 356 | convert = self.options['convert']
|
343 | 357 | if strip is not None:
|
|
0 commit comments