Split text.content into chunks of at most 2000 characters

selfboot · selfboot · commit a6281894a4ac · 2023-04-29T17:15:33.000+08:00
diff --git a/html2notion/translate/html2json_base.py b/html2notion/translate/html2json_base.py
@@ -117,6 +117,7 @@ def parse_one_style(tag_soup: Tag, text_params: dict):
             text_params["url"] = href
         return
 
+    # Notion caps text.content at 2000 characters, see https://developers.notion.com/reference/request-limits
     # Process one tag and return a list of objects
     # <b><u>unlineline and bold</u></b>
     # <div><font color="#ff2600">Red color4</font></div>
@@ -126,18 +127,21 @@ def generate_inline_obj(tag: PageElement):
         res_obj = []
         text_with_parents = Html2JsonBase.extract_text_and_parents(tag)
         for (text, parent_tags) in text_with_parents:
-            text_params = {"plain_text": text}
-            for parent in parent_tags:
-                Html2JsonBase.parse_one_style(parent, text_params)
-
-            if text_params.get("url", ""):
-                text_obj = Html2JsonBase.generate_link(**text_params)
-            else:
-                text_obj = Html2JsonBase.generate_text(**text_params)
-            if text_obj:
-                res_obj.append(text_obj)
+            # Split the text into chunks of 2000 characters
+            text_chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
+            for chunk in text_chunks:
+                text_params = {"plain_text": chunk}
+                for parent in parent_tags:
+                    Html2JsonBase.parse_one_style(parent, text_params)
+
+                if text_params.get("url", ""):
+                    text_obj = Html2JsonBase.generate_link(**text_params)
+                else:
+                    text_obj = Html2JsonBase.generate_text(**text_params)
+                if text_obj:
+                    res_obj.append(text_obj)
         return res_obj
-    
+
     @staticmethod
     def generate_link(**kwargs):
         if not kwargs.get("plain_text", ""):
diff --git a/tests/test_reqlimit.py b/tests/test_reqlimit.py
@@ -0,0 +1,60 @@
+import json
+import os
+from html2notion.translate.html2json_yinxiang import Html2JsonYinXiang
+
+paragram_rich_content = f'<div>{"Some words" * 400} more words</div>'  # 4000 chars + " more words"
+block_max_conent = "Some words" * 200  # exactly 2000 characters: one full chunk
+paragram_rich_block = [  # expected blocks for paragram_rich_content: 2000 + 2000 + remainder
+    {
+        "object": "block",
+        "type": "paragraph",
+        "paragraph": {
+            "rich_text": [
+                {
+                    "plain_text": block_max_conent,
+                    "text": {
+                        "content": block_max_conent
+                    },
+                    "type": "text"
+                },
+                {
+                    "plain_text": block_max_conent,
+                    "text": {
+                        "content": block_max_conent
+                    },
+                    "type": "text"
+                },
+                {
+                    "plain_text": " more words",
+                    "text": {
+                        "content": " more words"
+                    },
+                    "type": "text"
+                }
+            ]
+        }
+    }
+]
+
+
+def test_reqlimit():
+    """Convert each HTML case and compare the resulting children with the expected blocks."""
+    if 'GITHUB_ACTIONS' not in os.environ:
+        # GITHUB_ACTIONS is absent from the environment: call test_prepare_conf() first.
+        from html2notion.utils import test_prepare_conf, logger
+        test_prepare_conf()
+        logger.info("prepare_conf_fixture")
+
+    cases = {
+        paragram_rich_content: paragram_rich_block,
+    }
+
+    for html_content, expected_blocks in cases.items():
+        converter = Html2JsonYinXiang(f'<body>{html_content}</body>')
+        converter.process()
+        # Debug aid: print(json.dumps(converter.children, indent=4))
+        assert converter.children == expected_blocks
+
+
+if __name__ == '__main__':  # run the test when this file is executed as a script
+    test_reqlimit()