@@ -262,73 +262,83 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
262262 * @returns {Array.<Array.<Object>> } Array of arrays of scraped items, one sub-array per list
263263 */
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
  // Scrapes every element matching `listSelector` and returns at most `limit`
  // records. Fields whose `tag` is a table cell/row tag yield one record per
  // matched cell (row index = position in the match list); all other fields
  // are read once per parent and copied onto each of that parent's records.
  const TABLE_TAGS = ['TD', 'TH', 'TR'];

  // Resolve against the document base URL (not just the origin) so that
  // document-relative paths such as "page2.html" keep their directory.
  const toAbsoluteUrl = (rawUrl) => {
    if (!rawUrl) {
      return null;
    }
    try {
      return new URL(rawUrl, document.baseURI).href;
    } catch (err) {
      // A malformed URL must not abort the whole scrape; report it verbatim.
      return rawUrl;
    }
  };

  // Single place that knows how to read a field value from an element.
  const readValue = (element, attribute) => {
    switch (attribute) {
      case 'innerText':
        return element.innerText.trim();
      case 'innerHTML':
        return element.innerHTML.trim();
      case 'src':
      case 'href':
        return toAbsoluteUrl(element.getAttribute(attribute));
      default:
        return element.getAttribute(attribute);
    }
  };

  // Separate fields into table and non-table categories (insertion order kept).
  const tableFields = [];
  const nonTableFields = [];
  for (const [label, field] of Object.entries(fields)) {
    const bucket = TABLE_TAGS.includes(field.tag) ? tableFields : nonTableFields;
    bucket.push([label, field]);
  }

  const parentElements = Array.from(document.querySelectorAll(listSelector));
  const scrapedData = [];

  for (const parent of parentElements) {
    // Never produce more records than the caller asked for.
    const remaining = limit - scrapedData.length;
    if (!(remaining > 0)) {
      break;
    }

    // Query every table column once per parent instead of once per row.
    const tableColumns = tableFields.map(([label, { selector, attribute }]) => ({
      label,
      attribute,
      cells: Array.from(parent.querySelectorAll(selector)),
    }));

    // The first table field decides how many rows this parent contributes;
    // without table fields each parent contributes a single record.
    const rowCount = tableColumns.length > 0
      ? Math.min(tableColumns[0].cells.length, remaining)
      : 1;

    // Non-table values are identical for every row of this parent: read once.
    const parentValues = {};
    if (rowCount > 0) {
      for (const [label, { selector, attribute }] of nonTableFields) {
        const element = parent.querySelector(selector);
        if (element) {
          parentValues[label] = readValue(element, attribute);
        }
      }
    }

    for (let rowIndex = 0; rowIndex < rowCount; rowIndex += 1) {
      const record = {};

      for (const { label, attribute, cells } of tableColumns) {
        const cell = cells[rowIndex];
        if (cell) {
          record[label] = readValue(cell, attribute);
        }
      }
      Object.assign(record, parentValues);

      // Skip parents/rows where none of the requested fields were found.
      if (Object.keys(record).length > 0) {
        scrapedData.push(record);
      }
    }
  }

  return scrapedData;
};
334344
0 commit comments