From d22664f7e79f0eb3dfdd0cbcc7c177b8f6f2dc77 Mon Sep 17 00:00:00 2001 From: "Dear.Va" <55902546+DearVa@users.noreply.github.com> Date: Tue, 21 Nov 2023 13:06:56 +0800 Subject: [PATCH] improve CODE_BLOCK_PATTERN for a more robust code match (#571) * improve CODE_BLOCK_PATTERN for more robust match * improve and add tests * Add support for \r\n * Updated the regex to support indented code blocks (per the Markdown spec). Added test cases for both. * Update formatting --------- Co-authored-by: Adam Fourney Co-authored-by: Chi Wang --- autogen/code_utils.py | 16 ++++++--- test/test_code.py | 76 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 7 deletions(-) diff --git a/autogen/code_utils.py b/autogen/code_utils.py index 616bcf6da576..f03094c8b918 100644 --- a/autogen/code_utils.py +++ b/autogen/code_utils.py @@ -19,7 +19,15 @@ DEFAULT_MODEL = "gpt-4" FAST_MODEL = "gpt-3.5-turbo" # Regular expression for finding a code block -CODE_BLOCK_PATTERN = r"```(\w*)\n(.*?)\n```" +# ```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*``` Matches multi-line code blocks. +# The [ \t]* matches the potential spaces before language name. +# The (\w+)? matches the language, where the ? indicates it is optional. +# The [ \t]* matches the potential spaces (not newlines) after language name. +# The \r?\n makes sure there is a linebreak after ```. +# The (.*?) matches the code itself (non-greedy). +# The \r?\n makes sure there is a linebreak before ```. +# The [ \t]* matches the potential spaces before closing ``` (the spec allows indentation). +CODE_BLOCK_PATTERN = r"```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```" WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extensions") UNKNOWN = "unknown" TIMEOUT_MSG = "Timeout" @@ -59,6 +67,8 @@ def infer_lang(code): return UNKNOWN +# TODO: In the future, move to better support https://spec.commonmark.org/0.30/#fenced-code-blocks +# perhaps by using a full Markdown parser. 
def extract_code( text: Union[str, List], pattern: str = CODE_BLOCK_PATTERN, detect_single_line_code: bool = False ) -> List[Tuple[str, str]]: @@ -83,10 +93,8 @@ def extract_code( return match if match else [(UNKNOWN, text)] # Extract both multi-line and single-line code block, separated by the | operator - # `{3}(\w+)?\s*([\s\S]*?)`{3}: Matches multi-line code blocks. - # The (\w+)? matches the language, where the ? indicates it is optional. # `([^`]+)`: Matches inline code. - code_pattern = re.compile(r"`{3}(\w+)?\s*([\s\S]*?)`{3}|`([^`]+)`") + code_pattern = re.compile(CODE_BLOCK_PATTERN + r"|`([^`]+)`") code_blocks = code_pattern.findall(text) # Extract the individual code blocks and languages from the matched groups diff --git a/test/test_code.py b/test/test_code.py index 18d4b640ec3d..a00ef072fa96 100644 --- a/test/test_code.py +++ b/test/test_code.py @@ -185,6 +185,8 @@ def test_extract_code(): """, detect_single_line_code=True, ) + print(codeblocks2) + assert codeblocks2 == codeblocks # import pdb; pdb.set_trace() @@ -207,9 +209,77 @@ def scrape(url): title, text = scrape(url) print(f"Title: {title}") print(f"Text: {text}") +``` """ ) print(codeblocks) + assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python" + + codeblocks = extract_code( + """ +Example: +``` python +def scrape(url): + import requests + from bs4 import BeautifulSoup + response = requests.get(url) + soup = BeautifulSoup(response.text, "html.parser") + title = soup.find("title").text + text = soup.find("div", {"id": "bodyContent"}).text + return title, text +``` +Test: +``` python +url = "https://en.wikipedia.org/wiki/Web_scraping" +title, text = scrape(url) +print(f"Title: {title}") +print(f"Text: {text}") +``` +""" + ) + print(codeblocks) + assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python" + + # Check for indented code blocks + codeblocks = extract_code( + """ +Example: + ```python + def scrape(url): + import 
requests + from bs4 import BeautifulSoup + response = requests.get(url) + soup = BeautifulSoup(response.text, "html.parser") + title = soup.find("title").text + text = soup.find("div", {"id": "bodyContent"}).text + return title, text + ``` +""" + ) + print(codeblocks) + assert len(codeblocks) == 1 and codeblocks[0][0] == "python" + + # Check for codeblocks with \r\n + codeblocks = extract_code( + """ +Example: +``` python +def scrape(url): + import requests + from bs4 import BeautifulSoup + response = requests.get(url) + soup = BeautifulSoup(response.text, "html.parser") + title = soup.find("title").text + text = soup.find("div", {"id": "bodyContent"}).text + return title, text +``` +""".replace( + "\n", "\r\n" + ) + ) + print(codeblocks) + assert len(codeblocks) == 1 and codeblocks[0][0] == "python" + codeblocks = extract_code("no code block") assert len(codeblocks) == 1 and codeblocks[0] == (UNKNOWN, "no code block") @@ -348,7 +418,7 @@ def test_non_dict_in_list(self): if __name__ == "__main__": # test_infer_lang() - # test_extract_code() - test_execute_code() + test_extract_code() + # test_execute_code() # test_find_code() - unittest.main() + # unittest.main()