@@ -34,24 +34,43 @@ def infer_lang(code):
34
34
return "python"
35
35
36
36
37
def extract_code(
    text: str, pattern: str = CODE_BLOCK_PATTERN, detect_single_line_code: bool = False
) -> List[Tuple[str, str]]:
    """Extract code from a text.

    Args:
        text (str): The text to extract code from.
        pattern (str, optional): The regular expression pattern for finding the
            code block. Defaults to CODE_BLOCK_PATTERN. Only used when
            detect_single_line_code is False; the single-line mode uses its
            own built-in pattern.
        detect_single_line_code (bool, optional): When True, extract inline
            code wrapped in single backticks in addition to triple-backtick
            fenced blocks. Defaults to False.

    Returns:
        list: A list of tuples, each containing the language and the code.
          If there is no code block in the input text, a single tuple
          (UNKNOWN, text) is returned, in both modes.
          In single-line mode, the language is "" for inline code and for
          fenced blocks that do not specify a language.
    """
    if not detect_single_line_code:
        # re.findall returns one tuple of groups per match when the pattern
        # has several groups; fall back to the whole text when nothing matches.
        match = re.findall(pattern, text, flags=re.DOTALL)
        return match if match else [(UNKNOWN, text)]

    # Extract both multi-line and single-line code blocks, separated by the | operator
    # `{3}(\w+)?\s*([\s\S]*?)`{3}: Matches multi-line code blocks.
    #    The (\w+)? matches the language, where the ? indicates it is optional.
    # `([^`]+)`: Matches inline code.
    code_pattern = re.compile(r"`{3}(\w+)?\s*([\s\S]*?)`{3}|`([^`]+)`")

    extracted = []
    # Groups that did not take part in a match come back from findall as "".
    for lang, block_code, inline_code in code_pattern.findall(text):
        if block_code:
            extracted.append((lang.strip(), block_code.strip()))
            continue
        snippet = inline_code.strip()
        # Skip whitespace-only inline spans instead of emitting empty code.
        if snippet:
            extracted.append(("", snippet))

    # Honor the documented contract: no code found -> (UNKNOWN, text),
    # exactly like the default (multi-line only) mode.
    return extracted if extracted else [(UNKNOWN, text)]
55
74
56
75
57
76
# _FIND_CODE_SYS_MSG = [
0 commit comments