@@ -1,12 +1,11 @@
 """
 ParseNode Module
 """
 from typing import List, Optional
-from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
-from tokenizer import num_tokens_calculus
+from ..utils.split_text_into_chunks import split_text_into_chunks
 
 class ParseNode(BaseNode):
     """
@@ -69,26 +68,20 @@ def execute(self, state: dict) -> dict:
             docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
-            chunks = chunk(text=docs_transformed.page_content,
-                           chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                           token_counter=lambda text: len(text.split()),
-                           memoize=False)
+            chunks = split_text_into_chunks(text=docs_transformed.page_content,
+                                            chunk_size=self.node_config.get("chunk_size", 4096)-250)
         else:
             docs_transformed = docs_transformed[0]
 
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
             if isinstance(docs_transformed, Document):
-                chunks = chunk(text=docs_transformed.page_content,
-                               chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
-                               memoize=False)
+                chunks = split_text_into_chunks(text=docs_transformed.page_content,
+                                                chunk_size=chunk_size)
             else:
-                chunks = chunk(text=docs_transformed,
-                               chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
-                               memoize=False)
+                chunks = split_text_into_chunks(text=docs_transformed,
+                                                chunk_size=chunk_size)
 
         state.update({self.output[0]: chunks})
 
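The diff swaps the external `semchunk.chunk` dependency for an in-repo helper, but the helper's body is not part of this change. Below is a minimal sketch of what `split_text_into_chunks` might look like, assuming it counts tokens the same way as the `token_counter=lambda text: len(text.split())` argument it replaces, i.e. by whitespace-delimited words; the actual implementation in the module imported as `..utils.split_text_into_chunks` may differ (for example, by using a model-aware tokenizer).

```python
from typing import List


def split_text_into_chunks(text: str, chunk_size: int) -> List[str]:
    """Greedily pack whitespace-delimited words into chunks of at most
    chunk_size words (hypothetical stand-in for the helper imported above)."""
    words = text.split()
    chunks: List[str] = []
    current: List[str] = []
    for word in words:
        # Flush the current chunk once it reaches the word budget.
        if len(current) == chunk_size:
            chunks.append(" ".join(current))
            current = []
        current.append(word)
    if current:
        chunks.append(" ".join(current))
    return chunks
```

Note that both call sites reserve headroom below the configured context size: the HTML path passes `chunk_size - 250`, while the other path caps the budget at `min(chunk_size - 500, int(chunk_size * 0.9))`, leaving room for the prompt template and the model's response.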