Skip to content

Commit

Permalink
bugfix: HTML tags inside code blocks won't be removed now.
Browse files Browse the repository at this point in the history
Beta
  • Loading branch information
GaiZhenbiao committed Sep 19, 2024
1 parent 22007a7 commit ffea5e9
Showing 1 changed file with 21 additions and 5 deletions.
26 changes: 21 additions & 5 deletions modules/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,11 +245,27 @@ def convert_mdtext(md_text): # deprecated

def remove_html_tags(data):
def clean_text(text):
    """Strip HTML tags and entities from *text*, leaving fenced code intact.

    The text is split on triple-backtick fenced code blocks. Each fenced
    block is copied to the output unchanged; every other segment has its
    ``<...>`` tags and ``&...;`` entities deleted. Segments are joined back
    without stripping, so leading/trailing whitespace and newlines of the
    input are preserved.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The capturing group makes re.split keep the code blocks in the
    # result list; [\s\S] lets a block span newlines, and the lazy *?
    # stops at the nearest closing fence.
    code_block_pattern = r'(```[\s\S]*?```)'

    # Split the text into code blocks and non-code blocks
    parts = re.split(code_block_pattern, text)

    cleaned_parts = []
    for part in parts:
        if part.startswith('```') and part.endswith('```'):
            # This is a code block, keep it exactly as is
            cleaned_parts.append(part)
        else:
            # Not a code block: remove all HTML tags...
            cleaned = re.sub(r'<[^>]+>', '', part)
            # ...and delete (not decode) any remaining HTML entities
            cleaned = re.sub(r'&[#\w]+;', '', cleaned)
            # Don't strip here, so newlines around code blocks survive
            cleaned_parts.append(cleaned)

    # Join the cleaned parts back together
    return ''.join(cleaned_parts)

return [
[clean_text(item) for item in sublist]
Expand Down

0 comments on commit ffea5e9

Please sign in to comment.