Skip to content

Commit

Permalink
fix: handling of long sequence of unescaped underscore chars in markd…
Browse files Browse the repository at this point in the history
…own (#173)

* Fix for md hanging when encountering long sequence of unescaped underscore chars

Signed-off-by: Maksym Lysak <[email protected]>

* Added comment explaining reason for fix

Signed-off-by: Maksym Lysak <[email protected]>

* Fixed trailing inline text handling (at the end of a file), and corrected underscore sequence shortening

Signed-off-by: Maksym Lysak <[email protected]>

* making fix more rare

Signed-off-by: Maksym Lysak <[email protected]>

---------

Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
  • Loading branch information
maxmnemonic and Maksym Lysak authored Oct 28, 2024
1 parent 2cece27 commit 94d0729
Showing 1 changed file with 36 additions and 2 deletions.
38 changes: 36 additions & 2 deletions docling/backend/md_backend.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
import re
import warnings
from io import BytesIO
from pathlib import Path
from typing import Set, Union
Expand All @@ -25,6 +27,30 @@


class MarkdownDocumentBackend(DeclarativeDocumentBackend):

def shorten_underscore_sequences(self, markdown_text, max_length=10):
# This regex will match any sequence of underscores
pattern = r"_+"

def replace_match(match):
underscore_sequence = match.group(
0
) # Get the full match (sequence of underscores)

# Shorten the sequence if it exceeds max_length
if len(underscore_sequence) > max_length:
return "_" * max_length
else:
return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length

# Use re.sub to replace long underscore sequences
shortened_text = re.sub(pattern, replace_match, markdown_text)

if len(shortened_text) != len(markdown_text):
warnings.warn("Detected potentially incorrect Markdown, correcting...")

return shortened_text

def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

Expand All @@ -42,11 +68,19 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.markdown = text_stream
# remove invalid sequences
# very long sequences of underscores will lead to unnecessary long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
self.markdown = self.shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read()
self.markdown = md_content
# remove invalid sequences
# very long sequences of underscores will lead to unnecessary long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
self.markdown = self.shorten_underscore_sequences(md_content)
self.valid = True

_log.debug(self.markdown)
Expand Down

0 comments on commit 94d0729

Please sign in to comment.