Skip to content

Commit 801ae5a

Browse files
committed
fix: scrapeList pagination persistence and action data separation
1 parent 334fbbc commit 801ae5a

File tree

2 files changed

+80
-28
lines changed

2 files changed

+80
-28
lines changed

maxun-core/src/interpret.ts

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ export default class Interpreter extends EventEmitter {
8282
scrapeSchema: {}
8383
};
8484

85+
private scrapeListCounter: number = 0;
86+
8587
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
8688
super();
8789
this.workflow = workflow.workflow;
@@ -575,12 +577,13 @@ export default class Interpreter extends EventEmitter {
575577

576578
try {
577579
await this.ensureScriptsLoaded(page);
578-
580+
579581
if (this.options.debugChannel?.incrementScrapeListIndex) {
580582
this.options.debugChannel.incrementScrapeListIndex();
581583
}
582584

583585
let scrapeResults = [];
586+
let paginationUsed = false;
584587

585588
if (!config.pagination) {
586589
scrapeResults = await page.evaluate((cfg) => {
@@ -592,32 +595,47 @@ export default class Interpreter extends EventEmitter {
592595
}
593596
}, config);
594597
} else {
598+
paginationUsed = true;
595599
scrapeResults = await this.handlePagination(page, config);
596600
}
597601

598602
if (!Array.isArray(scrapeResults)) {
599603
scrapeResults = [];
600604
}
601605

602-
const actionType = "scrapeList";
603-
const actionName = (config as any).__name || "List";
606+
console.log(`ScrapeList completed with ${scrapeResults.length} results`);
604607

605-
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
606-
if (!this.serializableDataByType[actionType][actionName]) {
607-
this.serializableDataByType[actionType][actionName] = [];
608-
}
608+
if (!paginationUsed) {
609+
const actionType = "scrapeList";
610+
let actionName = (config as any).__name || "";
609611

610-
this.serializableDataByType[actionType][actionName].push(...scrapeResults);
612+
if (!actionName || actionName.trim() === "") {
613+
this.scrapeListCounter++;
614+
actionName = `List ${this.scrapeListCounter}`;
615+
}
611616

612-
await this.options.serializableCallback({
613-
scrapeList: this.serializableDataByType.scrapeList,
614-
scrapeSchema: this.serializableDataByType.scrapeSchema
615-
});
617+
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
618+
if (!this.serializableDataByType[actionType][actionName]) {
619+
this.serializableDataByType[actionType][actionName] = [];
620+
}
621+
622+
this.serializableDataByType[actionType][actionName].push(...scrapeResults);
623+
624+
await this.options.serializableCallback({
625+
scrapeList: this.serializableDataByType.scrapeList,
626+
scrapeSchema: this.serializableDataByType.scrapeSchema
627+
});
628+
}
616629
} catch (error) {
617630
console.error('ScrapeList action failed completely:', error.message);
618-
631+
619632
const actionType = "scrapeList";
620-
const actionName = (config as any).__name || "List";
633+
let actionName = (config as any).__name || "";
634+
635+
if (!actionName || actionName.trim() === "") {
636+
this.scrapeListCounter++;
637+
actionName = `List ${this.scrapeListCounter}`;
638+
}
621639

622640
if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
623641
this.namedResults[actionType][actionName] = [];
@@ -818,20 +836,33 @@ export default class Interpreter extends EventEmitter {
818836
return [];
819837
}
820838

839+
const actionType = "scrapeList";
840+
let actionName = (config as any).__name || "";
841+
if (!actionName || actionName.trim() === "") {
842+
this.scrapeListCounter++;
843+
actionName = `List ${this.scrapeListCounter}`;
844+
}
845+
846+
if (!this.serializableDataByType[actionType]) {
847+
this.serializableDataByType[actionType] = {};
848+
}
849+
if (!this.serializableDataByType[actionType][actionName]) {
850+
this.serializableDataByType[actionType][actionName] = [];
851+
}
852+
821853
let allResults: Record<string, any>[] = [];
822854
let previousHeight = 0;
823855
let scrapedItems: Set<string> = new Set<string>();
824856
let visitedUrls: Set<string> = new Set<string>();
825857
const MAX_RETRIES = 3;
826-
const RETRY_DELAY = 1000; // 1 second delay between retries
858+
const RETRY_DELAY = 1000;
827859
const MAX_UNCHANGED_RESULTS = 5;
828860

829861
const debugLog = (message: string, ...args: any[]) => {
830862
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
831863
};
832864

833865
const scrapeCurrentPage = async () => {
834-
// Check abort flag before scraping current page
835866
if (this.isAborted) {
836867
debugLog("Workflow aborted, stopping scrapeCurrentPage");
837868
return;
@@ -849,7 +880,6 @@ export default class Interpreter extends EventEmitter {
849880
debugLog(`Page evaluation failed: ${error.message}`);
850881
return;
851882
}
852-
853883
const newResults = results.filter(item => {
854884
const uniqueKey = JSON.stringify(item);
855885
if (scrapedItems.has(uniqueKey)) return false;
@@ -859,7 +889,11 @@ export default class Interpreter extends EventEmitter {
859889
allResults = allResults.concat(newResults);
860890
debugLog("Results collected:", allResults.length);
861891

862-
await this.options.serializableCallback(allResults);
892+
this.serializableDataByType[actionType][actionName] = [...allResults];
893+
await this.options.serializableCallback({
894+
scrapeList: this.serializableDataByType.scrapeList,
895+
scrapeSchema: this.serializableDataByType.scrapeSchema
896+
});
863897
};
864898

865899
const checkLimit = () => {

server/src/workflow-management/classes/Interpreter.ts

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -567,37 +567,55 @@ export class WorkflowInterpreter {
567567
typeKey = "scrapeSchema";
568568
}
569569

570-
if (this.currentActionType === "scrapeList" && data.scrapeList) {
570+
if (typeKey === "scrapeList" && data.scrapeList) {
571571
data = data.scrapeList;
572-
} else if (this.currentActionType === "scrapeSchema" && data.scrapeSchema) {
572+
} else if (typeKey === "scrapeSchema" && data.scrapeSchema) {
573573
data = data.scrapeSchema;
574574
}
575575

576-
let actionName = this.currentActionName || "";
577-
if (typeKey === "scrapeList") {
578-
actionName = this.getUniqueActionName(typeKey, this.currentActionName);
576+
let actionName = "";
577+
if (typeKey === "scrapeList" && data && typeof data === "object" && !Array.isArray(data)) {
578+
const keys = Object.keys(data);
579+
if (keys.length === 1) {
580+
actionName = keys[0];
581+
data = data[actionName];
582+
} else if (keys.length > 1) {
583+
actionName = keys[keys.length - 1];
584+
data = data[actionName];
585+
}
586+
}
587+
588+
if (!actionName) {
589+
actionName = this.currentActionName || "";
590+
if (typeKey === "scrapeList" && !actionName) {
591+
actionName = this.getUniqueActionName(typeKey, "");
592+
}
579593
}
580594

581595
const flattened = Array.isArray(data)
582596
? data
583-
: (data?.List ?? (data && typeof data === 'object' ? Object.values(data).flat?.() ?? data : []));
597+
: (
598+
data?.List ??
599+
(data && typeof data === "object"
600+
? Object.values(data).flat?.() ?? data
601+
: [])
602+
);
584603

585604
if (!this.serializableDataByType[typeKey]) {
586605
this.serializableDataByType[typeKey] = {};
587606
}
588607

589608
this.serializableDataByType[typeKey][actionName] = flattened;
590609

591-
await this.persistDataToDatabase(typeKey, { [actionName]: flattened });
610+
await this.persistDataToDatabase(typeKey, {
611+
[actionName]: flattened,
612+
});
592613

593614
this.socket.emit("serializableCallback", {
594615
type: typeKey,
595616
name: actionName,
596617
data: flattened,
597618
});
598-
599-
this.currentActionType = null;
600-
this.currentActionName = null;
601619
} catch (err: any) {
602620
logger.log('error', `serializableCallback handler failed: ${err.message}`);
603621
}

0 commit comments

Comments
 (0)