-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: nltk text splitter support (#3403)
* feat: nltk text splitter support * feat: add doc link to nltk text splitter * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
3ca5469
commit 0bd9de1
Showing 2 changed files with 64 additions and 0 deletions.
There are no files selected for viewing
62 changes: 62 additions & 0 deletions
62
src/backend/base/langflow/components/textsplitters/NaturalLanguageTextSplitter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
from typing import Any | ||
|
||
from langchain_text_splitters import NLTKTextSplitter, TextSplitter | ||
|
||
from langflow.base.textsplitters.model import LCTextSplitterComponent | ||
from langflow.inputs import DataInput, IntInput, MessageTextInput | ||
from langflow.utils.util import unescape_string | ||
|
||
|
||
class NaturalLanguageTextSplitterComponent(LCTextSplitterComponent):
    """Langflow component that configures a LangChain ``NLTKTextSplitter``.

    The component exposes the chunk size, chunk overlap, separator and
    language as user-editable inputs and turns them into a splitter
    instance in :meth:`build_text_splitter`.
    """

    display_name = "Natural Language Text Splitter"
    description = "Split text based on natural language boundaries, optimized for a specified language."
    documentation = (
        "https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/split_by_token/#nltk"
    )
    name = "NaturalLanguageTextSplitter"

    inputs = [
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk after splitting.",
            value=1000,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="The number of characters that overlap between consecutive chunks.",
            value=200,
        ),
        DataInput(
            name="data_input",
            display_name="Input",
            info="The text data to be split.",
            input_types=["Document", "Data"],
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info='The character(s) to use as a delimiter when splitting text.\nDefaults to "\\n\\n" if left empty.',
        ),
        MessageTextInput(
            name="language",
            display_name="Language",
            info='The language of the text. Default is "English". Supports multiple languages for better text boundary recognition.',
        ),
    ]

    def get_data_input(self) -> Any:
        """Return the value of the ``data_input`` field."""
        return self.data_input

    def build_text_splitter(self) -> TextSplitter:
        """Create the ``NLTKTextSplitter`` described by the component inputs.

        Returns:
            An ``NLTKTextSplitter`` configured with the chosen language,
            separator, chunk size and chunk overlap.
        """
        # The separator is passed through ``unescape_string`` when provided
        # (presumably so that a typed "\\n" becomes a real newline -- confirm
        # against the helper); an empty separator falls back to a blank line.
        separator = unescape_string(self.separator) if self.separator else "\n\n"

        # Normalise the language name: surrounding whitespace is stripped so
        # that "English " or a whitespace-only value does not reach the
        # splitter verbatim. An empty result falls back to "english".
        language = (self.language or "").strip().lower() or "english"

        return NLTKTextSplitter(
            language=language,
            separator=separator,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
2 changes: 2 additions & 0 deletions
2
src/backend/base/langflow/components/textsplitters/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,11 @@ | ||
from .CharacterTextSplitter import CharacterTextSplitterComponent | ||
from .LanguageRecursiveTextSplitter import LanguageRecursiveTextSplitterComponent | ||
from .RecursiveCharacterTextSplitter import RecursiveCharacterTextSplitterComponent | ||
from .NaturalLanguageTextSplitter import NaturalLanguageTextSplitterComponent | ||
|
||
# Names exported by a star-import of this package; one entry per splitter component imported above. | ||
__all__ = [ | ||
"CharacterTextSplitterComponent", | ||
"LanguageRecursiveTextSplitterComponent", | ||
"RecursiveCharacterTextSplitterComponent", | ||
"NaturalLanguageTextSplitterComponent", | ||
] |