Skip to content

Commit

Permalink
Merge pull request #1947 from darwintree/twitter-url
Browse files Browse the repository at this point in the history
feat: improve twitter paragraph splitting if containing url
  • Loading branch information
shakkernerd authored Jan 7, 2025
2 parents 62de238 + 5319131 commit 25fba34
Showing 1 changed file with 59 additions and 6 deletions.
65 changes: 59 additions & 6 deletions packages/client-twitter/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -321,11 +321,31 @@ function splitTweetContent(content: string, maxLength: number): string[] {
return tweets;
}

function splitParagraph(paragraph: string, maxLength: number): string[] {
// eslint-disable-next-line
const sentences = paragraph.match(/[^\.!\?]+[\.!\?]+|[^\.!\?]+$/g) || [
paragraph,
];
function extractUrls(paragraph: string): {
textWithPlaceholders: string;
placeholderMap: Map<string, string>;
} {
// replace https urls with placeholder
const urlRegex = /https?:\/\/[^\s]+/g;
const placeholderMap = new Map<string, string>();

let urlIndex = 0;
const textWithPlaceholders = paragraph.replace(urlRegex, (match) => {
// twitter url would be considered as 23 characters
// <<URL_CONSIDERER_23_1>> is also 23 characters
const placeholder = `<<URL_CONSIDERER_23_${urlIndex}>>`; // Placeholder without . ? ! etc
placeholderMap.set(placeholder, match);
urlIndex++;
return placeholder;
});

return { textWithPlaceholders, placeholderMap };
}

function splitSentencesAndWords(text: string, maxLength: number): string[] {
// Split by periods, question marks and exclamation marks
// Note that URLs in text have been replaced with `<<URL_xxx>>` and won't be split by dots
const sentences = text.match(/[^\.!\?]+[\.!\?]+|[^\.!\?]+$/g) || [text];
const chunks: string[] = [];
let currentChunk = "";

Expand All @@ -337,13 +357,16 @@ function splitParagraph(paragraph: string, maxLength: number): string[] {
currentChunk = sentence;
}
} else {
// Can't fit more, push currentChunk to results
if (currentChunk) {
chunks.push(currentChunk.trim());
}

// If current sentence itself is less than or equal to maxLength
if (sentence.length <= maxLength) {
currentChunk = sentence;
} else {
// Split long sentence into smaller pieces
// Need to split sentence by spaces
const words = sentence.split(" ");
currentChunk = "";
for (const word of words) {
Expand All @@ -366,9 +389,39 @@ function splitParagraph(paragraph: string, maxLength: number): string[] {
}
}

// Handle remaining content
if (currentChunk) {
chunks.push(currentChunk.trim());
}

return chunks;
}

function restoreUrls(
chunks: string[],
placeholderMap: Map<string, string>
): string[] {
return chunks.map((chunk) => {
// Replace all <<URL_CONSIDERER_23_>> in chunk back to original URLs using regex
return chunk.replace(/<<URL_CONSIDERER_23_(\d+)>>/g, (match) => {
const original = placeholderMap.get(match);
return original || match; // Return placeholder if not found (theoretically won't happen)
});
});
}

function splitParagraph(paragraph: string, maxLength: number): string[] {
// 1) Extract URLs and replace with placeholders
const { textWithPlaceholders, placeholderMap } = extractUrls(paragraph);

// 2) Use first section's logic to split by sentences first, then do secondary split
const splittedChunks = splitSentencesAndWords(
textWithPlaceholders,
maxLength
);

// 3) Replace placeholders back to original URLs
const restoredChunks = restoreUrls(splittedChunks, placeholderMap);

return restoredChunks;
}

0 comments on commit 25fba34

Please sign in to comment.