@@ -262,83 +262,203 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
 * @returns {Promise<Array.<Object>>} Resolves to a flat array of scraped records, one object per item/row, keyed by field label
263263 */
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
  // Collects one record per repeated item found inside every element matching
  // `listSelector`.
  //
  //   listSelector - CSS selector for the list/table containers.
  //   fields       - { label: { selector, attribute } }. `attribute` is
  //                  'innerText', 'innerHTML', or the name of an HTML attribute.
  //   limit        - maximum number of records returned (default 10).
  //
  // Resolves to a flat array of { label: value } records. For each container a
  // field is treated as a "table field" when its first match sits inside a
  // TD/TR (within a few ancestor levels); such fields are read row by row.
  // All other fields are read positionally: the i-th match of every selector
  // forms the i-th record.

  // How many levels (starting at the matched element itself) are inspected
  // when looking for an enclosing table cell or row.
  const MAX_TABLE_ANCESTOR_DEPTH = 5;

  // Reads `attribute` from `element`; returns null when there is no element.
  function extractValue(element, attribute) {
    if (!element) return null;

    if (attribute === 'innerText') {
      // Non-HTML elements (e.g. SVG) expose no innerText; use textContent there
      // instead of throwing on `undefined.trim()`.
      const text = typeof element.innerText === 'string' ? element.innerText : element.textContent;
      return typeof text === 'string' ? text.trim() : null;
    }

    if (attribute === 'innerHTML') {
      return typeof element.innerHTML === 'string' ? element.innerHTML.trim() : null;
    }

    if (attribute === 'src' || attribute === 'href') {
      const rawValue = element.getAttribute(attribute);
      if (!rawValue) return null;
      try {
        // Resolve against the document base URL, the way the browser resolves
        // links. Resolving against the bare origin breaks path-relative URLs.
        return new URL(rawValue, document.baseURI).href;
      } catch (error) {
        // Malformed URL: return the raw attribute rather than abort the scrape.
        return rawValue;
      }
    }

    return element.getAttribute(attribute);
  }

  // Returns { type: 'TD' | 'TR', element } for the nearest cell/row at or above
  // `element` within MAX_TABLE_ANCESTOR_DEPTH levels, or null if there is none.
  function findTableAncestor(element) {
    let current = element;
    for (let depth = 0; current && depth < MAX_TABLE_ANCESTOR_DEPTH; depth++) {
      if (current.tagName === 'TD' || current.tagName === 'TR') {
        return { type: current.tagName, element: current };
      }
      current = current.parentElement;
    }
    return null;
  }

  // Zero-based position of a cell among its element siblings.
  function getCellIndex(td) {
    let index = 0;
    for (let previous = td.previousElementSibling; previous; previous = previous.previousElementSibling) {
      index += 1;
    }
    return index;
  }

  // True when any field's match inside `row` is a TH or is nested in one.
  function hasThElement(row, tableFields) {
    return Object.values(tableFields).some(({ selector }) => {
      let current = row.querySelector(selector);
      while (current && current !== row) {
        if (current.tagName === 'TH') return true;
        current = current.parentElement;
      }
      return false;
    });
  }

  // Keeps header rows only when a field actually targets TH content;
  // otherwise drops every row that contains a TH.
  function filterRowsBasedOnTag(rows, tableFields) {
    if (rows.some((row) => hasThElement(row, tableFields))) {
      return rows;
    }
    return rows.filter((row) => row.getElementsByTagName('TH').length === 0);
  }

  // querySelector that treats a blank or syntactically invalid selector as
  // "no match". Only used for heuristic fallbacks whose selector is derived by
  // string surgery and may therefore not be valid CSS.
  function queryOrNull(root, selector) {
    if (!selector.trim()) return null;
    try {
      return root.querySelector(selector);
    } catch (error) {
      return null;
    }
  }

  // Splits `fields` into table and non-table fields for one container, based
  // on where the first match of each selector lives.
  function classifyFields(container) {
    const tableFields = {};
    const nonTableFields = {};

    for (const [label, field] of Object.entries(fields)) {
      const sampleElement = container.querySelector(field.selector);
      const ancestor = sampleElement ? findTableAncestor(sampleElement) : null;

      if (ancestor) {
        tableFields[label] = {
          ...field,
          tableContext: ancestor.type,
          cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1,
        };
      } else {
        nonTableFields[label] = field;
      }
    }

    return { tableFields, nonTableFields };
  }

  // Picks the element to read inside a known cell, trying progressively
  // looser strategies. Always returns an element (the cell itself at worst).
  function findElementInCell(td, selector, attribute) {
    // 1. The field selector as given.
    let element = td.querySelector(selector);

    // 2. The selector's last step addresses the cell itself.
    if (!element && selector.split('>').pop().includes('td:nth-child')) {
      element = td;
    }

    // 3. The selector cut at its first class, e.g. "span.price" -> "span".
    //    The cut can leave an empty or dangling selector (".price" -> "",
    //    "div > .price" -> "div > "), which querySelector rejects.
    if (!element) {
      element = queryOrNull(td, selector.split('.')[0]);
    }

    // 4. Walk down through the first child at each level that yields a value.
    if (!element) {
      let current = td;
      while (current.children.length > 0) {
        const next = Array.from(current.children).find((child) => extractValue(child, attribute));
        if (!next) break;
        current = next;
      }
      element = current;
    }

    return element;
  }

  // Builds one record per table row for the container's table fields.
  function collectTableRecords(container, tableFields) {
    const records = [];

    // Climb from the first field's match to the enclosing TABLE, stopping at
    // the container if no TABLE is met first.
    const firstField = Object.values(tableFields)[0];
    let tableContext = container.querySelector(firstField.selector);
    while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
      tableContext = tableContext.parentElement;
    }
    if (!tableContext) return records;

    const rows = filterRowsBasedOnTag(Array.from(tableContext.getElementsByTagName('TR')), tableFields);
    const rowCount = Math.min(rows.length, limit);

    for (let rowIndex = 0; rowIndex < rowCount; rowIndex++) {
      const currentRow = rows[rowIndex];
      const record = {};

      for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
        let element = null;

        if (cellIndex >= 0) {
          // Field was located inside a TD: read the same column in this row.
          const td = currentRow.children[cellIndex];
          if (td) element = findElementInCell(td, selector, attribute);
        } else {
          element = currentRow.querySelector(selector);
        }

        if (element) {
          record[label] = extractValue(element, attribute);
        }
      }

      if (Object.keys(record).length > 0) {
        records.push(record);
      }
    }

    return records;
  }

  // Builds records for non-table fields by pairing the i-th match of every
  // selector. The first field's match count decides how many items exist.
  function collectListRecords(container, nonTableFields) {
    // Query each selector once per container rather than once per item.
    const columns = Object.entries(nonTableFields).map(([label, { selector, attribute }]) => ({
      label,
      attribute,
      elements: Array.from(container.querySelectorAll(selector)),
    }));
    const itemCount = Math.min(columns[0].elements.length, limit);
    const records = [];

    for (let i = 0; i < itemCount; i++) {
      const record = {};
      for (const { label, attribute, elements } of columns) {
        // Same index across fields keeps values of one item together.
        if (elements[i]) {
          record[label] = extractValue(elements[i], attribute);
        }
      }
      if (Object.keys(record).length > 0) {
        records.push(record);
      }
    }

    return records;
  }

  const containers = Array.from(document.querySelectorAll(listSelector));
  const scrapedData = [];

  for (const container of containers) {
    const { tableFields, nonTableFields } = classifyFields(container);

    if (Object.keys(tableFields).length > 0) {
      scrapedData.push(...collectTableRecords(container, tableFields));
    }
    if (Object.keys(nonTableFields).length > 0) {
      scrapedData.push(...collectListRecords(container, nonTableFields));
    }

    // `limit` bounds the whole result, not each container separately.
    if (scrapedData.length >= limit) break;
  }

  return scrapedData.slice(0, Math.max(0, limit));
};
0 commit comments