fix: handling of long sequence of unescaped underscore chars in markd…

…own (#173) * Fix for md hanging when encountering long sequence of unescaped underscore chars Signed-off-by: Maksym Lysak <[email protected]> * Added comment explaining reason for fix Signed-off-by: Maksym Lysak <[email protected]> * Fixed trailing inline text handling (at the end of a file), and corrected underscore sequence shortening Signed-off-by: Maksym Lysak <[email protected]> * making fix more rare Signed-off-by: Maksym Lysak <[email protected]> --------- Signed-off-by: Maksym Lysak <[email protected]> Co-authored-by: Maksym Lysak <[email protected]>
DS4SD · Oct 28, 2024 · 94d0729 · 94d0729
1 parent 2cece27
commit 94d0729
Showing 1 changed file with 36 additions and 2 deletions.
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
@@ -1,4 +1,6 @@
 import logging
+import re
+import warnings
 from io import BytesIO
 from pathlib import Path
 from typing import Set, Union
@@ -25,6 +27,30 @@
 
 
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
+
+    def shorten_underscore_sequences(self, markdown_text, max_length=10):
+        # This regex will match any sequence of underscores
+        pattern = r"_+"
+
+        def replace_match(match):
+            underscore_sequence = match.group(
+                0
+            )  # Get the full match (sequence of underscores)
+
+            # Shorten the sequence if it exceeds max_length
+            if len(underscore_sequence) > max_length:
+                return "_" * max_length
+            else:
+                return underscore_sequence  # Leave it unchanged if it is shorter or equal to max_length
+
+        # Use re.sub to replace long underscore sequences
+        shortened_text = re.sub(pattern, replace_match, markdown_text)
+
+        if len(shortened_text) != len(markdown_text):
+            warnings.warn("Detected potentially incorrect Markdown, correcting...")
+
+        return shortened_text
+
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
@@ -42,11 +68,19 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
         try:
             if isinstance(self.path_or_stream, BytesIO):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
-                self.markdown = text_stream
+                # remove invalid sequences
+                # very long sequences of underscores will lead to unnecessary long processing times.
+                # In any proper Markdown files, underscores have to be escaped,
+                # otherwise they represent emphasis (bold or italic)
+                self.markdown = self.shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     md_content = f.read()
-                    self.markdown = md_content
+                    # remove invalid sequences
+                    # very long sequences of underscores will lead to unnecessary long processing times.
+                    # In any proper Markdown files, underscores have to be escaped,
+                    # otherwise they represent emphasis (bold or italic)
+                    self.markdown = self.shorten_underscore_sequences(md_content)
             self.valid = True
 
             _log.debug(self.markdown)