@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
1818import { googleSheetUpdateTasks , processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet" ;
1919import { airtableUpdateTasks , processAirtableUpdates } from "../workflow-management/integrations/airtable" ;
2020import { sendWebhook } from "../routes/webhook" ;
21+ import { convertPageToHTML , convertPageToMarkdown } from '../markdownify/scrape' ;
2122
2223chromium . use ( stealthPlugin ( ) ) ;
2324
@@ -344,7 +345,9 @@ function formatRunResponse(run: any) {
344345 runByAPI : run . runByAPI ,
345346 data : {
346347 textData : { } ,
347- listData : { }
348+ listData : { } ,
349+ markdown : '' ,
350+ html : ''
348351 } ,
349352 screenshots : [ ] as any [ ] ,
350353 } ;
@@ -359,6 +362,14 @@ function formatRunResponse(run: any) {
359362 formattedRun . data . listData = output . scrapeList ;
360363 }
361364
365+ if ( output . markdown && Array . isArray ( output . markdown ) ) {
366+ formattedRun . data . markdown = output . markdown [ 0 ] ?. content || '' ;
367+ }
368+
369+ if ( output . html && Array . isArray ( output . html ) ) {
370+ formattedRun . data . html = output . html [ 0 ] ?. content || '' ;
371+ }
372+
362373 if ( run . binaryOutput ) {
363374 Object . keys ( run . binaryOutput ) . forEach ( key => {
364375 if ( run . binaryOutput [ key ] ) {
@@ -569,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
569580 }
570581}
571582
572- async function readyForRunHandler ( browserId : string , id : string , userId : string ) {
583+ async function readyForRunHandler ( browserId : string , id : string , userId : string , requestedFormats ?: string [ ] ) {
573584 try {
574- const result = await executeRun ( id , userId ) ;
585+ const result = await executeRun ( id , userId , requestedFormats ) ;
575586
576587 if ( result && result . success ) {
577588 logger . log ( 'info' , `Interpretation of ${ id } succeeded` ) ;
@@ -608,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
608619 return copy ;
609620} ;
610621
611- async function executeRun ( id : string , userId : string ) {
622+ async function executeRun ( id : string , userId : string , requestedFormats ?: string [ ] ) {
612623 let browser : any = null ;
613624
614625 try {
@@ -651,6 +662,166 @@ async function executeRun(id: string, userId: string) {
651662 } ;
652663 }
653664
665+ if ( recording . recording_meta . type === 'scrape' ) {
666+ logger . log ( 'info' , `Executing scrape robot for API run ${ id } ` ) ;
667+
668+ let formats = recording . recording_meta . formats || [ 'markdown' ] ;
669+
670+ // Override if API request defines formats
671+ if ( requestedFormats && Array . isArray ( requestedFormats ) && requestedFormats . length > 0 ) {
672+ formats = requestedFormats . filter ( ( f ) : f is 'markdown' | 'html' => [ 'markdown' , 'html' ] . includes ( f ) ) ;
673+ }
674+
675+ await run . update ( {
676+ status : 'running' ,
677+ log : `Converting page to: ${ formats . join ( ', ' ) } `
678+ } ) ;
679+
680+ try {
681+ const url = recording . recording_meta . url ;
682+
683+ if ( ! url ) {
684+ throw new Error ( 'No URL specified for markdown robot' ) ;
685+ }
686+
687+ let markdown = '' ;
688+ let html = '' ;
689+ const serializableOutput : any = { } ;
690+
691+ // Markdown conversion
692+ if ( formats . includes ( 'markdown' ) ) {
693+ markdown = await convertPageToMarkdown ( url ) ;
694+ serializableOutput . markdown = [ { content : markdown } ] ;
695+ }
696+
697+ // HTML conversion
698+ if ( formats . includes ( 'html' ) ) {
699+ html = await convertPageToHTML ( url ) ;
700+ serializableOutput . html = [ { content : html } ] ;
701+ }
702+
703+ await run . update ( {
704+ status : 'success' ,
705+ finishedAt : new Date ( ) . toLocaleString ( ) ,
706+ log : `${ formats . join ( ', ' ) } conversion completed successfully` ,
707+ serializableOutput,
708+ binaryOutput : { } ,
709+ } ) ;
710+
711+ logger . log ( 'info' , `Markdown robot execution completed for API run ${ id } ` ) ;
712+
713+ // Push success socket event
714+ try {
715+ const completionData = {
716+ runId : plainRun . runId ,
717+ robotMetaId : plainRun . robotMetaId ,
718+ robotName : recording . recording_meta . name ,
719+ status : 'success' ,
720+ finishedAt : new Date ( ) . toLocaleString ( )
721+ } ;
722+
723+ serverIo
724+ . of ( '/queued-run' )
725+ . to ( `user-${ userId } ` )
726+ . emit ( 'run-completed' , completionData ) ;
727+ } catch ( socketError : any ) {
728+ logger . log (
729+ 'warn' ,
730+ `Failed to send run-completed notification for markdown robot run ${ id } : ${ socketError . message } `
731+ ) ;
732+ }
733+
734+ // Build webhook payload
735+ const webhookPayload : any = {
736+ robot_id : plainRun . robotMetaId ,
737+ run_id : plainRun . runId ,
738+ robot_name : recording . recording_meta . name ,
739+ status : 'success' ,
740+ started_at : plainRun . startedAt ,
741+ finished_at : new Date ( ) . toLocaleString ( ) ,
742+ metadata : {
743+ browser_id : plainRun . browserId ,
744+ user_id : userId ,
745+ } ,
746+ } ;
747+
748+ if ( formats . includes ( 'markdown' ) ) webhookPayload . markdown = markdown ;
749+ if ( formats . includes ( 'html' ) ) webhookPayload . html = html ;
750+
751+ try {
752+ await sendWebhook ( plainRun . robotMetaId , 'run_completed' , webhookPayload ) ;
753+ logger . log (
754+ 'info' ,
755+ `Webhooks sent successfully for markdown robot API run ${ plainRun . runId } `
756+ ) ;
757+ } catch ( webhookError : any ) {
758+ logger . log (
759+ 'warn' ,
760+ `Failed to send webhooks for markdown robot run ${ plainRun . runId } : ${ webhookError . message } `
761+ ) ;
762+ }
763+
764+ capture ( "maxun-oss-run-created-api" , {
765+ runId : plainRun . runId ,
766+ user_id : userId ,
767+ status : "success" ,
768+ robot_type : "scrape" ,
769+ formats
770+ } ) ;
771+
772+ await destroyRemoteBrowser ( plainRun . browserId , userId ) ;
773+
774+ return {
775+ success : true ,
776+ interpretationInfo : run . toJSON ( )
777+ } ;
778+ } catch ( error : any ) {
779+ logger . log (
780+ 'error' ,
781+ `${ formats . join ( ', ' ) } conversion failed for API run ${ id } : ${ error . message } `
782+ ) ;
783+
784+ await run . update ( {
785+ status : 'failed' ,
786+ finishedAt : new Date ( ) . toLocaleString ( ) ,
787+ log : `${ formats . join ( ', ' ) } conversion failed: ${ error . message } ` ,
788+ } ) ;
789+
790+ // Send failure socket event
791+ try {
792+ const failureData = {
793+ runId : plainRun . runId ,
794+ robotMetaId : plainRun . robotMetaId ,
795+ robotName : recording . recording_meta . name ,
796+ status : 'failed' ,
797+ finishedAt : new Date ( ) . toLocaleString ( )
798+ } ;
799+
800+ serverIo
801+ . of ( '/queued-run' )
802+ . to ( `user-${ userId } ` )
803+ . emit ( 'run-completed' , failureData ) ;
804+ } catch ( socketError : any ) {
805+ logger . log (
806+ 'warn' ,
807+ `Failed to send run-failed notification for markdown robot run ${ id } : ${ socketError . message } `
808+ ) ;
809+ }
810+
811+ capture ( "maxun-oss-run-created-api" , {
812+ runId : plainRun . runId ,
813+ user_id : userId ,
814+ status : "failed" ,
815+ robot_type : "scrape" ,
816+ formats
817+ } ) ;
818+
819+ await destroyRemoteBrowser ( plainRun . browserId , userId ) ;
820+
821+ throw error ;
822+ }
823+ }
824+
654825 plainRun . status = 'running' ;
655826
656827 browser = browserPool . getRemoteBrowser ( plainRun . browserId ) ;
@@ -848,7 +1019,7 @@ async function executeRun(id: string, userId: string) {
8481019 }
8491020}
8501021
851- export async function handleRunRecording ( id : string , userId : string ) {
1022+ export async function handleRunRecording ( id : string , userId : string , requestedFormats ?: string [ ] ) {
8521023 try {
8531024 const result = await createWorkflowAndStoreMetadata ( id , userId ) ;
8541025 const { browserId, runId : newRunId } = result ;
@@ -862,7 +1033,7 @@ export async function handleRunRecording(id: string, userId: string) {
8621033 rejectUnauthorized : false
8631034 } ) ;
8641035
865- socket . on ( 'ready-for-run' , ( ) => readyForRunHandler ( browserId , newRunId , userId ) ) ;
1036+ socket . on ( 'ready-for-run' , ( ) => readyForRunHandler ( browserId , newRunId , userId , requestedFormats ) ) ;
8661037
8671038 logger . log ( 'info' , `Running Robot: ${ id } ` ) ;
8681039
@@ -889,12 +1060,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
8891060 if ( ! run ) throw new Error ( 'Run not found' ) ;
8901061
8911062 if ( run . status === 'success' ) {
892- return run . toJSON ( ) ;
1063+ return run ;
8931064 } else if ( run . status === 'failed' ) {
8941065 throw new Error ( 'Run failed' ) ;
8951066 }
8961067
897- // Wait for the next polling interval
8981068 await new Promise ( resolve => setTimeout ( resolve , interval ) ) ;
8991069 }
9001070}
@@ -914,6 +1084,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
9141084 * type: string
9151085 * required: true
9161086 * description: The ID of the robot to run.
1087+ * requestBody:
1088+ * required: false
1089+ * content:
1090+ * application/json:
1091+ * schema:
1092+ * type: object
1093+ * properties:
1094+ * formats:
1095+ * type: array
1096+ * items:
1097+ * type: string
1098+ * enum: [markdown, html]
1099+ * description: Optional override formats for this run.
1100+ * example:
1101+ * formats: ["html"]
9171102 * responses:
9181103 * 200:
9191104 * description: Robot run started successfully.
@@ -972,7 +1157,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
9721157 if ( ! req . user ) {
9731158 return res . status ( 401 ) . json ( { ok : false , error : 'Unauthorized' } ) ;
9741159 }
975- const runId = await handleRunRecording ( req . params . id , req . user . id ) ;
1160+
1161+ const requestedFormats = req . body . formats ;
1162+
1163+ const runId = await handleRunRecording ( req . params . id , req . user . id , requestedFormats ) ;
9761164
9771165 if ( ! runId ) {
9781166 throw new Error ( 'Run ID is undefined' ) ;
0 commit comments