Skip to content

Commit

Permalink
feat: nltk text splitter support (#3403)
Browse files (browse the repository at this point in the history)
* feat: nltk text splitter support

* feat: add doc link to nltk text splitter

* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
  • Loading branch information
2 people authored and ogabrielluiz committed Aug 27, 2024
1 parent 3ca5469 commit 0bd9de1
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from typing import Any

from langchain_text_splitters import NLTKTextSplitter, TextSplitter

from langflow.base.textsplitters.model import LCTextSplitterComponent
from langflow.inputs import DataInput, IntInput, MessageTextInput
from langflow.utils.util import unescape_string


class NaturalLanguageTextSplitterComponent(LCTextSplitterComponent):
    """Component that builds an ``NLTKTextSplitter`` from user-supplied inputs.

    The inputs cover chunk size, chunk overlap, the data to split, an optional
    separator and an optional language name. ``build_text_splitter`` turns
    them into a configured ``NLTKTextSplitter`` instance.
    """

    display_name = "Natural Language Text Splitter"
    description = "Split text based on natural language boundaries, optimized for a specified language."
    documentation = (
        "https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/split_by_token/#nltk"
    )
    name = "NaturalLanguageTextSplitter"

    inputs = [
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk after splitting.",
            value=1000,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="The number of characters that overlap between consecutive chunks.",
            value=200,
        ),
        DataInput(
            name="data_input",
            display_name="Input",
            info="The text data to be split.",
            input_types=["Document", "Data"],
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info='The character(s) to use as a delimiter when splitting text.\nDefaults to "\\n\\n" if left empty.',
        ),
        MessageTextInput(
            name="language",
            display_name="Language",
            info='The language of the text. Default is "English". Supports multiple languages for better text boundary recognition.',
        ),
    ]

    def get_data_input(self) -> Any:
        """Return the value of the ``data_input`` field."""
        return self.data_input

    def build_text_splitter(self) -> TextSplitter:
        """Create the ``NLTKTextSplitter`` described by this component's inputs.

        Returns:
            An ``NLTKTextSplitter`` configured with the chosen language,
            separator, chunk size and chunk overlap.
        """
        # An empty separator falls back to a blank line; otherwise the user
        # text is passed through ``unescape_string`` (presumably so typed
        # escapes such as "\n" become real characters -- confirm in helper).
        separator = unescape_string(self.separator) if self.separator else "\n\n"

        # Normalise the language name: surrounding whitespace is dropped and
        # the name is lower-cased. A missing, empty or whitespace-only value
        # falls back to "english" rather than being passed on as an invalid
        # language name.
        language = (self.language or "").strip().lower() or "english"

        return NLTKTextSplitter(
            language=language,
            separator=separator,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from .CharacterTextSplitter import CharacterTextSplitterComponent
from .LanguageRecursiveTextSplitter import LanguageRecursiveTextSplitterComponent
from .RecursiveCharacterTextSplitter import RecursiveCharacterTextSplitterComponent
from .NaturalLanguageTextSplitter import NaturalLanguageTextSplitterComponent

# Names exported by this package on ``from <package> import *``; each entry
# matches one of the component classes imported above.
__all__ = [
"CharacterTextSplitterComponent",
"LanguageRecursiveTextSplitterComponent",
"RecursiveCharacterTextSplitterComponent",
"NaturalLanguageTextSplitterComponent",
]

0 comments on commit 0bd9de1

Please sign in to comment.