diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index 1b34d23fc6..a062b93eed 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -156,7 +156,7 @@ def filename_type(filename): return FileType.PDF.value if re.match( - r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): + r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename): return FileType.DOC.value if re.match( diff --git a/rag/app/naive.py b/rag/app/naive.py index c557a62670..01bb4de1d3 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, excel_parser = ExcelParser() sections = [(excel_parser.html(binary), "")] - elif re.search(r"\.(txt|md)$", filename, re.IGNORECASE): + elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = "" if binary: