Skip to content

Commit

Permalink
Merge pull request #1605 from omnivore-app/fix/save-twitter-thread
Browse files: browse the repository at this point in the history
Fix twitter thread not saving correctly
  • Loading branch information
sywhb authored Jan 3, 2023
2 parents 5c21509 + ab16447 commit c5dd8a7
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 25 deletions.
54 changes: 31 additions & 23 deletions packages/content-handler/src/websites/twitter-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -215,23 +215,19 @@ const getTweetIds = async (
const page = await context.newPage()

await page.goto(pageURL, {
waitUntil: 'networkidle2',
timeout: 60000,
waitUntil: 'networkidle0',
})

await waitFor(4000)

return (await page.evaluate(async (author) => {
const MAX_THREAD_DEPTH = 100
const ids: string[] = []

/**
* Wait for `ms` amount of milliseconds
* @param {number} ms
*/
const waitFor = (ms: number) =>
new Promise((resolve) => setTimeout(resolve, ms))

const ids: Set<string> = new Set()

// Find the first Show thread button and click it
const showRepliesButton = Array.from(
document.querySelectorAll('div[dir="auto"]')
Expand All @@ -247,30 +243,42 @@ const getTweetIds = async (
await waitFor(2000)
}

const timeNodes = Array.from(document.querySelectorAll('time'))
const distance = 1080
const scrollHeight = document.body.scrollHeight
let currentHeight = 0
// keep scrolling until there are no more elements
while (currentHeight < scrollHeight) {
const timeNodes = Array.from(document.querySelectorAll('time'))

for (let i = 0; i < timeNodes.length && i < MAX_THREAD_DEPTH; i++) {
const timeContainerAnchor: HTMLAnchorElement | HTMLSpanElement | null =
timeNodes[i].parentElement
if (!timeContainerAnchor) continue
for (let i = 0; i < timeNodes.length; i++) {
const timeContainerAnchor:
| HTMLAnchorElement
| HTMLSpanElement
| null = timeNodes[i].parentElement
if (!timeContainerAnchor) continue

if (timeContainerAnchor.tagName === 'SPAN') continue
if (timeContainerAnchor.tagName === 'SPAN') continue

const href = timeContainerAnchor.getAttribute('href')
if (!href) continue
const href = timeContainerAnchor.getAttribute('href')
if (!href) continue

// Get the tweet id and username from the href: https://twitter.com/username/status/1234567890
const match = href.match(/\/([^/]+)\/status\/(\d+)/)
if (!match) continue
// Get the tweet id and username from the href: https://twitter.com/username/status/1234567890
const match = href.match(/\/([^/]+)\/status\/(\d+)/)
if (!match) continue

const id = match[2]
const username = match[1]
const id = match[2]
const username = match[1]

// skip non-author replies
username === author && ids.add(id)
}

// skip non-author replies
username === author && ids.push(id)
window.scrollBy(0, distance)
await waitFor(100)
currentHeight += distance
}

return ids
return Array.from(ids)
}, author)) as string[]
} catch (error) {
console.log(error)
Expand Down
4 changes: 2 additions & 2 deletions packages/puppeteer-parse/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ async function retrieveHtml(page, logRecord) {
logRecord.title = title;

const pageScrollingStart = Date.now();
/* scroll with a 5 second timeout */
/* scroll with a 5 seconds timeout */
await Promise.race([
new Promise(resolve => {
(async function () {
Expand Down Expand Up @@ -562,7 +562,7 @@ async function retrieveHtml(page, logRecord) {
}
})();
}),
await page.waitForTimeout(1000),
await page.waitForTimeout(5000),
]);
logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };

Expand Down

1 comment on commit c5dd8a7

@vercel
Copy link

@vercel vercel bot commented on c5dd8a7 Jan 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.