Merge pull request #1947 from darwintree/twitter-url

feat: improve twitter paragraph splitting if containing url
elizaOS · Jan 7, 2025 · 25fba34 · 25fba34
2 parents 62de238 + 5319131
commit 25fba34
Showing 1 changed file with 59 additions and 6 deletions.
diff --git a/packages/client-twitter/src/utils.ts b/packages/client-twitter/src/utils.ts
@@ -321,11 +321,31 @@ function splitTweetContent(content: string, maxLength: number): string[] {
     return tweets;
 }
 
-function splitParagraph(paragraph: string, maxLength: number): string[] {
-    // eslint-disable-next-line
-    const sentences = paragraph.match(/[^\.!\?]+[\.!\?]+|[^\.!\?]+$/g) || [
-        paragraph,
-    ];
+function extractUrls(paragraph: string): {
+    textWithPlaceholders: string;
+    placeholderMap: Map<string, string>;
+} {
+    // replace https urls with placeholder
+    const urlRegex = /https?:\/\/[^\s]+/g;
+    const placeholderMap = new Map<string, string>();
+
+    let urlIndex = 0;
+    const textWithPlaceholders = paragraph.replace(urlRegex, (match) => {
+        // twitter url would be considered as 23 characters
+        // <<URL_CONSIDERER_23_1>> is also 23 characters
+        const placeholder = `<<URL_CONSIDERER_23_${urlIndex}>>`; // Placeholder without . ? ! etc
+        placeholderMap.set(placeholder, match);
+        urlIndex++;
+        return placeholder;
+    });
+
+    return { textWithPlaceholders, placeholderMap };
+}
+
+function splitSentencesAndWords(text: string, maxLength: number): string[] {
+    // Split by periods, question marks and exclamation marks
+    // Note that URLs in text have been replaced with `<<URL_xxx>>` and won't be split by dots
+    const sentences = text.match(/[^\.!\?]+[\.!\?]+|[^\.!\?]+$/g) || [text];
     const chunks: string[] = [];
     let currentChunk = "";
 
@@ -337,13 +357,16 @@ function splitParagraph(paragraph: string, maxLength: number): string[] {
                 currentChunk = sentence;
             }
         } else {
+            // Can't fit more, push currentChunk to results
             if (currentChunk) {
                 chunks.push(currentChunk.trim());
             }
+
+            // If current sentence itself is less than or equal to maxLength
             if (sentence.length <= maxLength) {
                 currentChunk = sentence;
             } else {
-                // Split long sentence into smaller pieces
+                // Need to split sentence by spaces
                 const words = sentence.split(" ");
                 currentChunk = "";
                 for (const word of words) {
@@ -366,9 +389,39 @@ function splitParagraph(paragraph: string, maxLength: number): string[] {
         }
     }
 
+    // Handle remaining content
     if (currentChunk) {
         chunks.push(currentChunk.trim());
     }
 
     return chunks;
 }
+
+function restoreUrls(
+    chunks: string[],
+    placeholderMap: Map<string, string>
+): string[] {
+    return chunks.map((chunk) => {
+        // Replace all <<URL_CONSIDERER_23_>> in chunk back to original URLs using regex
+        return chunk.replace(/<<URL_CONSIDERER_23_(\d+)>>/g, (match) => {
+            const original = placeholderMap.get(match);
+            return original || match; // Return placeholder if not found (theoretically won't happen)
+        });
+    });
+}
+
+function splitParagraph(paragraph: string, maxLength: number): string[] {
+    // 1) Extract URLs and replace with placeholders
+    const { textWithPlaceholders, placeholderMap } = extractUrls(paragraph);
+
+    // 2) Use first section's logic to split by sentences first, then do secondary split
+    const splittedChunks = splitSentencesAndWords(
+        textWithPlaceholders,
+        maxLength
+    );
+
+    // 3) Replace placeholders back to original URLs
+    const restoredChunks = restoreUrls(splittedChunks, placeholderMap);
+
+    return restoredChunks;
+}