
Commit 106e404

don't rely on page.goto to complete
1 parent be7ee9e commit 106e404
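
The commit stops trusting `page.goto` to settle on its own: navigation now runs with `timeout: 0` and is raced against a fixed 15-second timer, so collection proceeds even when a page never reaches `networkidle2`. Below is a minimal sketch of that pattern, rewritten as a standalone helper for illustration; only the Puppeteer calls are real API, while the `gotoWithGuard` name is hypothetical and the 15-second default and sentinel message are taken from this diff.

```ts
import { Page, HTTPResponse } from 'puppeteer';

// Race page.goto against a fixed timer instead of relying on it to complete.
// Returns the navigation response, or null when the timer wins the race.
const gotoWithGuard = async (page: Page, targetUrl: string, waitMs = 15000): Promise<HTTPResponse | null> => {
    try {
        return await Promise.race([
            page.goto(targetUrl, { timeout: 0, waitUntil: ['networkidle2', 'domcontentloaded'] }),
            new Promise<never>((_, reject) =>
                setTimeout(() => reject(new Error('Done waiting for navigation')), waitMs)
            )
        ]);
    } catch (error) {
        // The sentinel error only means the timer fired first; anything else is a genuine load failure.
        if (error instanceof Error && error.message === 'Done waiting for navigation') {
            return null;
        }
        throw error;
    }
};
```

As in the diff, the losing branch of the race is not cancelled; a production version might clear the timer once navigation settles.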

File tree

2 files changed, +29 -22 lines changed

src/collector.ts

+26 -21
@@ -2,7 +2,7 @@ import { writeFileSync } from 'fs';
 import sampleSize from 'lodash.samplesize';
 import os from 'os';
 import { join } from 'path';
-import puppeteer, { Browser, Page, PuppeteerLifeCycleEvent, KnownDevices, PuppeteerLaunchOptions } from 'puppeteer';
+import puppeteer, { Browser, Page, KnownDevices, PuppeteerLaunchOptions } from 'puppeteer';
 import PuppeteerHar from 'puppeteer-har';
 import { getDomain, getSubdomain, parse } from 'tldts';
 import url from 'url';
@@ -30,9 +30,7 @@ const DEFAULT_OPTIONS = {
     clearCache: true,
     quiet: true,
     headless: true,
-    defaultTimeout: 35000,
     numPages: 3,
-    defaultWaitUntil: 'networkidle2' as PuppeteerLifeCycleEvent,
     saveBrowserProfile: false,
     saveScreenshots: true,
     blTests: [
@@ -100,6 +98,8 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
     let page: Page;
     let pageIndex = 1;
     let har = {} as any;
+    let page_url = null;
+    let page_request = null;
     let page_response = null;
     let loadError = false;
     const userDataDir = args.saveBrowserProfile ? join(args.outDir, 'browser-profile') : undefined;
@@ -139,7 +139,13 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
     page.emulate(args.emulateDevice);
 
     // record all requested hosts
-    await page.on('request', request => {
+    page.on('request', request => {
+        const redirects = request.redirectChain();
+        if (redirects.length == 0 && request.url() == page_url ||
+            redirects.length > 0 && redirects[0].url() == page_url) {
+            console.log(`Assigning page_request: ${request.url()}`);
+            page_request = request;
+        }
         const l = parse(request.url());
         // note that hosts may appear as first and third party depending on the path
         if (FIRST_PARTY.domain === l.domain) {
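
With the navigation response no longer guaranteed, the `request` listener now captures the main document's request itself: a request belongs to the page navigation when its URL matches the normalized `page_url`, or when it heads a redirect chain that started there, and a later hunk reads `uri_redirects` from that `page_request` instead of `page_response.request()`. A sketch of just the matching predicate, assuming the diff's logic (the `isMainNavigationRequest` helper name is illustrative, not part of the commit):

```ts
import type { HTTPRequest } from 'puppeteer';

// True when the request is the page's own navigation: either it was issued directly
// for pageUrl, or it belongs to a redirect chain whose first hop was pageUrl.
const isMainNavigationRequest = (request: HTTPRequest, pageUrl: string): boolean => {
    const redirects = request.redirectChain();
    return redirects.length === 0
        ? request.url() === pageUrl
        : redirects[0].url() === pageUrl;
};
```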
@@ -176,33 +182,33 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
     }
 
     // Function to navigate to a page with a timeout guard
-    const navigateWithTimeout = async (page: Page, url: string, timeout: number, waitUntil: PuppeteerLifeCycleEvent) => {
+    const navigateWithTimeout = async (page: Page, navigateUrl: string) => {
         try {
-            console.log(`Going to ${url}`);
+            page_url = url.parse(navigateUrl).href; // normalize URL
+            console.log(`Going to ${page_url}`);
             page_response = await Promise.race([
-                page.goto(url, {
-                    timeout: timeout,
-                    waitUntil: waitUntil
+                page.goto(page_url, {
+                    timeout: 0,
+                    waitUntil: ['networkidle2', 'domcontentloaded']
                 }),
                 new Promise((_, reject) =>
                     setTimeout(() => {
-                        console.log(`Failed loading with ${waitUntil}`);
-                        reject(new Error(`Failed loading with ${waitUntil}`));
-                    }, 10000)
+                        reject(new Error('Done waiting for navigation'));
+                    }, 15000)
                 )
             ]);
         } catch (error) {
-            console.log('Trying with domcontentloaded');
-            page_response = await page.goto(url, {
-                timeout: timeout,
-                waitUntil: 'domcontentloaded'
-            });
+            if (error.message != 'Done waiting for navigation') {
+                loadError = true;
+            }
+            console.log(error.message);
         }
     };
 
     // Go to the first url
-    await navigateWithTimeout(page, inUrl, args.defaultTimeout, args.defaultWaitUntil as PuppeteerLifeCycleEvent);
+    await navigateWithTimeout(page, inUrl);
     await savePageContent(pageIndex, args.outDir, page, args.saveScreenshots);
+    console.log('Done saving page');
 
     let duplicatedLinks = [];
     const outputLinks = {
@@ -221,8 +227,7 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
         }
         return { status: 'failed', page_response };
     }
-    output.uri_redirects = page_response
-        .request()
+    output.uri_redirects = page_request
         .redirectChain()
         .map(req => {
            return req.url();
@@ -278,7 +283,7 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
        };
    }
 
-    await navigateWithTimeout(page, link, args.defaultTimeout, args.defaultWaitUntil as PuppeteerLifeCycleEvent);
+    await navigateWithTimeout(page, link);
     await savePageContent(pageIndex, args.outDir, page, args.saveScreenshots);
 
     console.log(`Interacting with page ${pageIndex}`);

src/pptr-utils/default.ts

+3 -1
@@ -13,10 +13,12 @@ export const savePageContent = async (index, outDir, page: Page, screenshot = tr
         if (screenshot) {
             console.log(`Saving ${index}.jpeg`);
             const outPathImg = path.join(outDir, `${index}.jpeg`);
+            console.log('About to screenshot');
             await page.screenshot({ path: outPathImg, type: 'jpeg', quality: 50 });
+            console.log('done with screenshot');
         }
     } catch (error) {
-        console.log(`couldnt save page content: ${JSON.stringify(error)}`);
+        console.log(`couldnt save page content: ${error.message}`);
     }
 };
 /**
