Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Enhanced value extraction with context awareness
function extractValue(element, attribute) {
if (!element) return null;

// Get context-aware base URL
const baseURL = element.ownerDocument?.location?.href || window.location.origin;

Expand All @@ -535,14 +535,34 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return shadowContent.trim();
}
}

if (attribute === 'innerText') {
return element.innerText.trim();
} else if (attribute === 'innerHTML') {
return element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
const attrValue = element.getAttribute(attribute);
return attrValue ? new URL(attrValue, baseURL).href : null;

const dataAttr = attrValue || element.getAttribute('data-' + attribute);

if (!dataAttr || dataAttr.trim() === '') {
if (attribute === 'src') {
const style = window.getComputedStyle(element);
const bgImage = style.backgroundImage;
if (bgImage && bgImage !== 'none') {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null;
}
}
return null;
}

try {
return new URL(dataAttr, baseURL).href;
} catch (e) {
console.warn('Error creating URL from', dataAttr, e);
return dataAttr; // Return the original value if URL construction fails
}
}
return element.getAttribute(attribute);
}
Expand Down
116 changes: 74 additions & 42 deletions maxun-core/src/interpret.ts
Original file line number Diff line number Diff line change
Expand Up @@ -704,13 +704,13 @@ export default class Interpreter extends EventEmitter {

await scrapeCurrentPage();
if (checkLimit()) return allResults;

const { button, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);

availableSelectors = updatedSelectors;

if (!button || !workingSelector) {
// Final retry for navigation when no selectors work
// Final retry for navigation when no selectors work
const success = await retryOperation(async () => {
try {
await page.evaluate(() => window.history.forward());
Expand All @@ -724,70 +724,102 @@ export default class Interpreter extends EventEmitter {
if (!success) return allResults;
break;
}

let retryCount = 0;
let navigationSuccess = false;

while (retryCount < MAX_RETRIES && !navigationSuccess) {
let paginationSuccess = false;

// Capture basic content signature before click
const captureContentSignature = async () => {
return await page.evaluate((selector) => {
const items = document.querySelectorAll(selector);
return {
url: window.location.href,
itemCount: items.length,
firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
};
}, config.listSelector);
};

const beforeSignature = await captureContentSignature();
debugLog(`Before click: ${beforeSignature.itemCount} items`);

while (retryCount < MAX_RETRIES && !paginationSuccess) {
try {
try {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}).catch(e => {
throw e;
}),
button.click()
]);
navigationSuccess = true;
} catch (error) {
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);

// If regular click fails, try dispatchEvent
if (page.url() === currentUrl) {
debugLog("Navigation successful after regular click");
paginationSuccess = true;
} catch (navError) {
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
try {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}).catch(e => {
throw e;
}),
button.dispatchEvent('click')
]);
debugLog("Navigation successful after dispatch event");
paginationSuccess = true;
} catch (dispatchNavError) {
try {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}),
button.dispatchEvent('click')
]);
navigationSuccess = true;
} catch (dispatchError) {
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
await button.click();
await page.waitForTimeout(2000);
} catch (clickError) {
await button.dispatchEvent('click');
await page.waitForTimeout(2000);
}
} else {
navigationSuccess = true;
}
}

const newUrl = page.url();
if (visitedUrls.has(newUrl)) {
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
navigationSuccess = false;
}

if (navigationSuccess) {
await page.waitForTimeout(1000);

await page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => {});

if (!paginationSuccess) {
const newUrl = page.url();
const afterSignature = await captureContentSignature();

if (newUrl !== currentUrl) {
debugLog(`URL changed to ${newUrl}`);
visitedUrls.add(newUrl);
paginationSuccess = true;
}
else if (afterSignature.firstItems !== beforeSignature.firstItems) {
debugLog("Content changed without URL change");
paginationSuccess = true;
}
else if (afterSignature.itemCount !== beforeSignature.itemCount) {
debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
paginationSuccess = true;
}
}
} catch (error) {
debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
navigationSuccess = false;
debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
}

if (!navigationSuccess) {
if (!paginationSuccess) {
retryCount++;
if (retryCount < MAX_RETRIES) {
debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
await page.waitForTimeout(RETRY_DELAY);
}
}
}

if (!navigationSuccess) {
debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
if (!paginationSuccess) {
debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
return allResults;
}

break;
}

Expand Down