Skip to content

Commit a628189

Browse files
committed
Support text.content split to 2000 characters
1 parent 12642f6 commit a628189

File tree

2 files changed

+75
-11
lines changed

2 files changed

+75
-11
lines changed

html2notion/translate/html2json_base.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ def parse_one_style(tag_soup: Tag, text_params: dict):
117117
text_params["url"] = href
118118
return
119119

120+
# Notion limits text.content to 2000 characters per rich text object; see https://developers.notion.com/reference/request-limits
120121
# Process one tag and return a list of objects
121122
# <b><u>underline and bold</u></b>
122123
# <div><font color="#ff2600">Red color4</font></div>
@@ -126,18 +127,21 @@ def generate_inline_obj(tag: PageElement):
126127
res_obj = []
127128
text_with_parents = Html2JsonBase.extract_text_and_parents(tag)
128129
for (text, parent_tags) in text_with_parents:
129-
text_params = {"plain_text": text}
130-
for parent in parent_tags:
131-
Html2JsonBase.parse_one_style(parent, text_params)
132-
133-
if text_params.get("url", ""):
134-
text_obj = Html2JsonBase.generate_link(**text_params)
135-
else:
136-
text_obj = Html2JsonBase.generate_text(**text_params)
137-
if text_obj:
138-
res_obj.append(text_obj)
130+
# Split the text into chunks of 2000 characters
131+
text_chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
132+
for chunk in text_chunks:
133+
text_params = {"plain_text": chunk}
134+
for parent in parent_tags:
135+
Html2JsonBase.parse_one_style(parent, text_params)
136+
137+
if text_params.get("url", ""):
138+
text_obj = Html2JsonBase.generate_link(**text_params)
139+
else:
140+
text_obj = Html2JsonBase.generate_text(**text_params)
141+
if text_obj:
142+
res_obj.append(text_obj)
139143
return res_obj
140-
144+
141145
@staticmethod
142146
def generate_link(**kwargs):
143147
if not kwargs.get("plain_text", ""):

tests/test_reqlimit.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import json
2+
import os
3+
from html2notion.translate.html2json_yinxiang import Html2JsonYinXiang
4+
5+
# Fixtures for the text-length limit test: one paragraph whose text is longer
# than a single 2000-character chunk, plus the blocks expected after conversion.

# Input HTML: 4000 characters of repeated text ("Some words" is 10 characters,
# repeated 400 times) followed by a short tail.
paragram_rich_content = f'<div>{"Some words" * 400} more words</div>'

# Exactly 2000 characters (10 characters * 200): one full chunk of text.
block_max_conent = "Some words" * 200

# Expected result: a single paragraph block whose rich_text holds two full
# 2000-character chunks followed by the remaining " more words".
paragram_rich_block = [
    {
        "object": "block",
        "type": "paragraph",
        "paragraph": {
            "rich_text": [
                {
                    "plain_text": block_max_conent,
                    "text": {
                        "content": block_max_conent
                    },
                    "type": "text"
                },
                {
                    "plain_text": block_max_conent,
                    "text": {
                        "content": block_max_conent
                    },
                    "type": "text"
                },
                {
                    "plain_text": " more words",
                    "text": {
                        "content": " more words"
                    },
                    "type": "text"
                }
            ]
        }
    }
]
38+
39+
40+
def test_reqlimit():
    """Check that over-long paragraph text converts into the expected blocks.

    Each HTML fixture is wrapped in ``<body>`` tags, run through
    ``Html2JsonYinXiang``, and the resulting ``children`` are compared with
    the expected block list for that fixture.
    """
    # When not running under GitHub Actions, call the project's
    # test_prepare_conf() first (presumably local config setup -- confirm
    # against html2notion.utils).
    if 'GITHUB_ACTIONS' not in os.environ:
        from html2notion.utils import test_prepare_conf, logger
        test_prepare_conf()
        logger.info("prepare_conf_fixture")

    # Maps input HTML to the block list it is expected to produce.
    html_jsons = {
        paragram_rich_content: paragram_rich_block,
    }

    # Walk (input, expected) pairs directly rather than re-indexing by key.
    for html_content, expected_blocks in html_jsons.items():
        body_content = '<body>' + html_content + '</body>'
        yinxiang = Html2JsonYinXiang(body_content)
        yinxiang.process()
        assert yinxiang.children == expected_blocks
57+
58+
59+
# Allow running this test module directly as a script.
if __name__ == '__main__':
    test_reqlimit()

0 commit comments

Comments
 (0)