improve CODE_BLOCK_PATTERN for a more robust code match (#571)

* improve CODE_BLOCK_PATTERN for more robust match
* improve and add tests
* Add support for \r\n
* Updated the regex to support indented code blocks (per the Markdown spec). Added test cases for both.
* Update formatting

---------

Co-authored-by: Adam Fourney <[email protected]>
Co-authored-by: Chi Wang <[email protected]>
microsoft · Nov 21, 2023 · d22664f · d22664f
1 parent 19c7da2
commit d22664f
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 7 deletions.
diff --git a/autogen/code_utils.py b/autogen/code_utils.py
@@ -19,7 +19,15 @@
 DEFAULT_MODEL = "gpt-4"
 FAST_MODEL = "gpt-3.5-turbo"
 # Regular expression for finding a code block
-CODE_BLOCK_PATTERN = r"```(\w*)\n(.*?)\n```"
+# ```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*``` Matches multi-line code blocks.
+#   The [ \t]* matches the potential spaces before language name.
+#   The (\w+)? matches the language, where the ? indicates it is optional.
+#   The [ \t]* matches the potential spaces (not newlines) after language name.
+#   The \r?\n makes sure there is a linebreak after ```.
+#   The (.*?) matches the code itself (non-greedy).
+#   The \r?\n makes sure there is a linebreak before ```.
+#   The [ \t]* matches the potential spaces before closing ``` (the spec allows indentation).
+CODE_BLOCK_PATTERN = r"```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```"
 WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extensions")
 UNKNOWN = "unknown"
 TIMEOUT_MSG = "Timeout"
@@ -59,6 +67,8 @@ def infer_lang(code):
         return UNKNOWN
 
 
+# TODO: In the future, move to better support https://spec.commonmark.org/0.30/#fenced-code-blocks
+#       perhaps by using a full Markdown parser.
 def extract_code(
     text: Union[str, List], pattern: str = CODE_BLOCK_PATTERN, detect_single_line_code: bool = False
 ) -> List[Tuple[str, str]]:
@@ -83,10 +93,8 @@ def extract_code(
         return match if match else [(UNKNOWN, text)]
 
     # Extract both multi-line and single-line code block, separated by the | operator
-    # `{3}(\w+)?\s*([\s\S]*?)`{3}: Matches multi-line code blocks.
-    #    The (\w+)? matches the language, where the ? indicates it is optional.
     # `([^`]+)`: Matches inline code.
-    code_pattern = re.compile(r"`{3}(\w+)?\s*([\s\S]*?)`{3}|`([^`]+)`")
+    code_pattern = re.compile(CODE_BLOCK_PATTERN + r"|`([^`]+)`")
     code_blocks = code_pattern.findall(text)
 
     # Extract the individual code blocks and languages from the matched groups

diff --git a/test/test_code.py b/test/test_code.py
@@ -185,6 +185,8 @@ def test_extract_code():
 """,
         detect_single_line_code=True,
     )
+    print(codeblocks2)
+
     assert codeblocks2 == codeblocks
     # import pdb; pdb.set_trace()
 
@@ -207,9 +209,77 @@ def scrape(url):
 title, text = scrape(url)
 print(f"Title: {title}")
 print(f"Text: {text}")
+```
 """
     )
     print(codeblocks)
+    assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"
+
+    codeblocks = extract_code(
+        """
+Example:
+``` python
+def scrape(url):
+    import requests
+    from bs4 import BeautifulSoup
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    title = soup.find("title").text
+    text = soup.find("div", {"id": "bodyContent"}).text
+    return title, text
+```
+Test:
+``` python
+url = "https://en.wikipedia.org/wiki/Web_scraping"
+title, text = scrape(url)
+print(f"Title: {title}")
+print(f"Text: {text}")
+```
+"""
+    )
+    print(codeblocks)
+    assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"
+
+    # Check for indented code blocks
+    codeblocks = extract_code(
+        """
+Example:
+   ```python
+   def scrape(url):
+       import requests
+       from bs4 import BeautifulSoup
+       response = requests.get(url)
+       soup = BeautifulSoup(response.text, "html.parser")
+       title = soup.find("title").text
+       text = soup.find("div", {"id": "bodyContent"}).text
+       return title, text
+   ```
+"""
+    )
+    print(codeblocks)
+    assert len(codeblocks) == 1 and codeblocks[0][0] == "python"
+
+    # Check for codeblocks with \r\n
+    codeblocks = extract_code(
+        """
+Example:
+``` python
+def scrape(url):
+   import requests
+   from bs4 import BeautifulSoup
+   response = requests.get(url)
+   soup = BeautifulSoup(response.text, "html.parser")
+   title = soup.find("title").text
+   text = soup.find("div", {"id": "bodyContent"}).text
+   return title, text
+```
+""".replace(
+            "\n", "\r\n"
+        )
+    )
+    print(codeblocks)
+    assert len(codeblocks) == 1 and codeblocks[0][0] == "python"
+
     codeblocks = extract_code("no code block")
     assert len(codeblocks) == 1 and codeblocks[0] == (UNKNOWN, "no code block")
 
@@ -348,7 +418,7 @@ def test_non_dict_in_list(self):
 
 if __name__ == "__main__":
     # test_infer_lang()
-    # test_extract_code()
-    test_execute_code()
+    test_extract_code()
+    # test_execute_code()
     # test_find_code()
-    unittest.main()
+    # unittest.main()