@@ -2,7 +2,7 @@ import { writeFileSync } from 'fs';
2
2
import sampleSize from 'lodash.samplesize' ;
3
3
import os from 'os' ;
4
4
import { join } from 'path' ;
5
- import puppeteer , { Browser , Page , PuppeteerLifeCycleEvent , KnownDevices , PuppeteerLaunchOptions } from 'puppeteer' ;
5
+ import puppeteer , { Browser , Page , KnownDevices , PuppeteerLaunchOptions } from 'puppeteer' ;
6
6
import PuppeteerHar from 'puppeteer-har' ;
7
7
import { getDomain , getSubdomain , parse } from 'tldts' ;
8
8
import url from 'url' ;
@@ -30,9 +30,7 @@ const DEFAULT_OPTIONS = {
30
30
clearCache : true ,
31
31
quiet : true ,
32
32
headless : true ,
33
- defaultTimeout : 35000 ,
34
33
numPages : 3 ,
35
- defaultWaitUntil : 'networkidle2' as PuppeteerLifeCycleEvent ,
36
34
saveBrowserProfile : false ,
37
35
saveScreenshots : true ,
38
36
blTests : [
@@ -100,6 +98,8 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
100
98
let page : Page ;
101
99
let pageIndex = 1 ;
102
100
let har = { } as any ;
101
+ let page_url = null ;
102
+ let page_request = null ;
103
103
let page_response = null ;
104
104
let loadError = false ;
105
105
const userDataDir = args . saveBrowserProfile ? join ( args . outDir , 'browser-profile' ) : undefined ;
@@ -139,7 +139,13 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
139
139
page . emulate ( args . emulateDevice ) ;
140
140
141
141
// record all requested hosts
142
- await page . on ( 'request' , request => {
142
+ page . on ( 'request' , request => {
143
+ const redirects = request . redirectChain ( ) ;
144
+ if ( redirects . length == 0 && request . url ( ) == page_url ||
145
+ redirects . length > 0 && redirects [ 0 ] . url ( ) == page_url ) {
146
+ console . log ( `Assigning page_request: ${ request . url ( ) } ` ) ;
147
+ page_request = request ;
148
+ }
143
149
const l = parse ( request . url ( ) ) ;
144
150
// note that hosts may appear as first and third party depending on the path
145
151
if ( FIRST_PARTY . domain === l . domain ) {
@@ -176,33 +182,33 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
176
182
}
177
183
178
184
/**
 * Navigates `page` to `navigateUrl`, but stops waiting after a fixed guard
 * period so a page that never becomes network-idle cannot stall the run.
 *
 * Side effects on the enclosing scope:
 *  - `page_url` is set to the normalized form of `navigateUrl` before the
 *    navigation starts (the 'request' listener compares request URLs
 *    against this value).
 *  - `page_response` receives the value resolved by `page.goto` when the
 *    navigation finishes before the guard fires; it is left untouched when
 *    the guard fires first or when navigation throws.
 *  - `loadError` is set to true for any failure other than the guard
 *    expiring.
 *
 * @param page        Puppeteer page to navigate.
 * @param navigateUrl Target URL; normalized with `url.parse(...).href`.
 */
const navigateWithTimeout = async (page: Page, navigateUrl: string) => {
  // The guard's rejection is recognized by this exact message, so the text
  // must stay in sync between the timer and the catch clause.
  const GUARD_MESSAGE = 'Done waiting for navigation';
  const GUARD_DELAY_MS = 15000;

  let guardTimer: ReturnType<typeof setTimeout> | undefined;
  try {
    page_url = url.parse(navigateUrl).href; // normalize URL
    console.log(`Going to ${page_url}`);

    const guard = new Promise<never>((_, reject) => {
      guardTimer = setTimeout(() => {
        reject(new Error(GUARD_MESSAGE));
      }, GUARD_DELAY_MS);
    });

    page_response = await Promise.race([
      // Puppeteer's own timeout is disabled (0); the guard above is the only
      // limit on how long we wait.
      page.goto(page_url, {
        timeout: 0,
        waitUntil: ['networkidle2', 'domcontentloaded']
      }),
      guard
    ]);
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    // Guard expiry just means "stop waiting"; anything else is a real load failure.
    if (message !== GUARD_MESSAGE) {
      loadError = true;
    }
    console.log(message);
  } finally {
    // Always cancel the guard so a finished navigation does not leave a
    // pending timer keeping the event loop alive.
    if (guardTimer !== undefined) {
      clearTimeout(guardTimer);
    }
  }
};
202
207
203
208
// Go to the first url
204
- await navigateWithTimeout ( page , inUrl , args . defaultTimeout , args . defaultWaitUntil as PuppeteerLifeCycleEvent ) ;
209
+ await navigateWithTimeout ( page , inUrl ) ;
205
210
await savePageContent ( pageIndex , args . outDir , page , args . saveScreenshots ) ;
211
+ console . log ( 'Done saving page' ) ;
206
212
207
213
let duplicatedLinks = [ ] ;
208
214
const outputLinks = {
@@ -221,8 +227,7 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
221
227
}
222
228
return { status : 'failed' , page_response } ;
223
229
}
224
- output . uri_redirects = page_response
225
- . request ( )
230
+ output . uri_redirects = page_request
226
231
. redirectChain ( )
227
232
. map ( req => {
228
233
return req . url ( ) ;
@@ -278,7 +283,7 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
278
283
} ;
279
284
}
280
285
281
- await navigateWithTimeout ( page , link , args . defaultTimeout , args . defaultWaitUntil as PuppeteerLifeCycleEvent ) ;
286
+ await navigateWithTimeout ( page , link ) ;
282
287
await savePageContent ( pageIndex , args . outDir , page , args . saveScreenshots ) ;
283
288
284
289
console . log ( `Interacting with page ${ pageIndex } ` ) ;
0 commit comments