Skip to content

Commit a1515c2

Browse files
authored
Merge pull request #889 from getmaxun/markdownify
feat: scrape [html + markdown]
2 parents 1bb1cc8 + a1b2117 commit a1515c2

File tree

18 files changed

+1420
-208
lines changed

18 files changed

+1420
-208
lines changed

package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
"idcac-playwright": "^0.1.3",
4747
"ioredis": "^5.4.1",
4848
"joi": "^17.6.0",
49+
"joplin-turndown-plugin-gfm": "^1.0.12",
4950
"jsonwebtoken": "^9.0.2",
5051
"jwt-decode": "^4.0.0",
5152
"lodash": "^4.17.21",
@@ -80,6 +81,7 @@
8081
"styled-components": "^5.3.3",
8182
"swagger-jsdoc": "^6.2.8",
8283
"swagger-ui-express": "^5.0.1",
84+
"turndown": "^7.2.2",
8385
"typedoc": "^0.23.8",
8486
"typescript": "^5.0.0",
8587
"uuid": "^8.3.2",
@@ -126,6 +128,7 @@
126128
"@types/styled-components": "^5.1.23",
127129
"@types/swagger-jsdoc": "^6.0.4",
128130
"@types/swagger-ui-express": "^4.1.6",
131+
"@types/turndown": "^5.0.6",
129132
"@vitejs/plugin-react": "^4.3.3",
130133
"ajv": "^8.8.2",
131134
"concurrently": "^7.0.0",

server/src/api/record.ts

Lines changed: 197 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
1818
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
1919
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
2020
import { sendWebhook } from "../routes/webhook";
21+
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
2122

2223
chromium.use(stealthPlugin());
2324

@@ -344,7 +345,9 @@ function formatRunResponse(run: any) {
344345
runByAPI: run.runByAPI,
345346
data: {
346347
textData: {},
347-
listData: {}
348+
listData: {},
349+
markdown: '',
350+
html: ''
348351
},
349352
screenshots: [] as any[],
350353
};
@@ -359,6 +362,14 @@ function formatRunResponse(run: any) {
359362
formattedRun.data.listData = output.scrapeList;
360363
}
361364

365+
if (output.markdown && Array.isArray(output.markdown)) {
366+
formattedRun.data.markdown = output.markdown[0]?.content || '';
367+
}
368+
369+
if (output.html && Array.isArray(output.html)) {
370+
formattedRun.data.html = output.html[0]?.content || '';
371+
}
372+
362373
if (run.binaryOutput) {
363374
Object.keys(run.binaryOutput).forEach(key => {
364375
if (run.binaryOutput[key]) {
@@ -569,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
569580
}
570581
}
571582

572-
async function readyForRunHandler(browserId: string, id: string, userId: string){
583+
async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
573584
try {
574-
const result = await executeRun(id, userId);
585+
const result = await executeRun(id, userId, requestedFormats);
575586

576587
if (result && result.success) {
577588
logger.log('info', `Interpretation of ${id} succeeded`);
@@ -608,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
608619
return copy;
609620
};
610621

611-
async function executeRun(id: string, userId: string) {
622+
async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
612623
let browser: any = null;
613624

614625
try {
@@ -651,6 +662,166 @@ async function executeRun(id: string, userId: string) {
651662
};
652663
}
653664

665+
if (recording.recording_meta.type === 'scrape') {
666+
logger.log('info', `Executing scrape robot for API run ${id}`);
667+
668+
let formats = recording.recording_meta.formats || ['markdown'];
669+
670+
// Override if API request defines formats
671+
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
672+
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
673+
}
674+
675+
await run.update({
676+
status: 'running',
677+
log: `Converting page to: ${formats.join(', ')}`
678+
});
679+
680+
try {
681+
const url = recording.recording_meta.url;
682+
683+
if (!url) {
684+
throw new Error('No URL specified for markdown robot');
685+
}
686+
687+
let markdown = '';
688+
let html = '';
689+
const serializableOutput: any = {};
690+
691+
// Markdown conversion
692+
if (formats.includes('markdown')) {
693+
markdown = await convertPageToMarkdown(url);
694+
serializableOutput.markdown = [{ content: markdown }];
695+
}
696+
697+
// HTML conversion
698+
if (formats.includes('html')) {
699+
html = await convertPageToHTML(url);
700+
serializableOutput.html = [{ content: html }];
701+
}
702+
703+
await run.update({
704+
status: 'success',
705+
finishedAt: new Date().toLocaleString(),
706+
log: `${formats.join(', ')} conversion completed successfully`,
707+
serializableOutput,
708+
binaryOutput: {},
709+
});
710+
711+
logger.log('info', `Markdown robot execution completed for API run ${id}`);
712+
713+
// Push success socket event
714+
try {
715+
const completionData = {
716+
runId: plainRun.runId,
717+
robotMetaId: plainRun.robotMetaId,
718+
robotName: recording.recording_meta.name,
719+
status: 'success',
720+
finishedAt: new Date().toLocaleString()
721+
};
722+
723+
serverIo
724+
.of('/queued-run')
725+
.to(`user-${userId}`)
726+
.emit('run-completed', completionData);
727+
} catch (socketError: any) {
728+
logger.log(
729+
'warn',
730+
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
731+
);
732+
}
733+
734+
// Build webhook payload
735+
const webhookPayload: any = {
736+
robot_id: plainRun.robotMetaId,
737+
run_id: plainRun.runId,
738+
robot_name: recording.recording_meta.name,
739+
status: 'success',
740+
started_at: plainRun.startedAt,
741+
finished_at: new Date().toLocaleString(),
742+
metadata: {
743+
browser_id: plainRun.browserId,
744+
user_id: userId,
745+
},
746+
};
747+
748+
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
749+
if (formats.includes('html')) webhookPayload.html = html;
750+
751+
try {
752+
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
753+
logger.log(
754+
'info',
755+
`Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
756+
);
757+
} catch (webhookError: any) {
758+
logger.log(
759+
'warn',
760+
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
761+
);
762+
}
763+
764+
capture("maxun-oss-run-created-api", {
765+
runId: plainRun.runId,
766+
user_id: userId,
767+
status: "success",
768+
robot_type: "scrape",
769+
formats
770+
});
771+
772+
await destroyRemoteBrowser(plainRun.browserId, userId);
773+
774+
return {
775+
success: true,
776+
interpretationInfo: run.toJSON()
777+
};
778+
} catch (error: any) {
779+
logger.log(
780+
'error',
781+
`${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
782+
);
783+
784+
await run.update({
785+
status: 'failed',
786+
finishedAt: new Date().toLocaleString(),
787+
log: `${formats.join(', ')} conversion failed: ${error.message}`,
788+
});
789+
790+
// Send failure socket event
791+
try {
792+
const failureData = {
793+
runId: plainRun.runId,
794+
robotMetaId: plainRun.robotMetaId,
795+
robotName: recording.recording_meta.name,
796+
status: 'failed',
797+
finishedAt: new Date().toLocaleString()
798+
};
799+
800+
serverIo
801+
.of('/queued-run')
802+
.to(`user-${userId}`)
803+
.emit('run-completed', failureData);
804+
} catch (socketError: any) {
805+
logger.log(
806+
'warn',
807+
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
808+
);
809+
}
810+
811+
capture("maxun-oss-run-created-api", {
812+
runId: plainRun.runId,
813+
user_id: userId,
814+
status: "failed",
815+
robot_type: "scrape",
816+
formats
817+
});
818+
819+
await destroyRemoteBrowser(plainRun.browserId, userId);
820+
821+
throw error;
822+
}
823+
}
824+
654825
plainRun.status = 'running';
655826

656827
browser = browserPool.getRemoteBrowser(plainRun.browserId);
@@ -848,7 +1019,7 @@ async function executeRun(id: string, userId: string) {
8481019
}
8491020
}
8501021

851-
export async function handleRunRecording(id: string, userId: string) {
1022+
export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
8521023
try {
8531024
const result = await createWorkflowAndStoreMetadata(id, userId);
8541025
const { browserId, runId: newRunId } = result;
@@ -862,7 +1033,7 @@ export async function handleRunRecording(id: string, userId: string) {
8621033
rejectUnauthorized: false
8631034
});
8641035

865-
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId));
1036+
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));
8661037

8671038
logger.log('info', `Running Robot: ${id}`);
8681039

@@ -889,12 +1060,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
8891060
if (!run) throw new Error('Run not found');
8901061

8911062
if (run.status === 'success') {
892-
return run.toJSON();
1063+
return run;
8931064
} else if (run.status === 'failed') {
8941065
throw new Error('Run failed');
8951066
}
8961067

897-
// Wait for the next polling interval
8981068
await new Promise(resolve => setTimeout(resolve, interval));
8991069
}
9001070
}
@@ -914,6 +1084,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
9141084
* type: string
9151085
* required: true
9161086
* description: The ID of the robot to run.
1087+
* requestBody:
1088+
* required: false
1089+
* content:
1090+
* application/json:
1091+
* schema:
1092+
* type: object
1093+
* properties:
1094+
* formats:
1095+
* type: array
1096+
* items:
1097+
* type: string
1098+
* enum: [markdown, html]
1099+
* description: Optional override formats for this run.
1100+
* example:
1101+
* formats: ["html"]
9171102
* responses:
9181103
* 200:
9191104
* description: Robot run started successfully.
@@ -972,7 +1157,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
9721157
if (!req.user) {
9731158
return res.status(401).json({ ok: false, error: 'Unauthorized' });
9741159
}
975-
const runId = await handleRunRecording(req.params.id, req.user.id);
1160+
1161+
const requestedFormats = req.body.formats;
1162+
1163+
const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
9761164

9771165
if (!runId) {
9781166
throw new Error('Run ID is undefined');

0 commit comments

Comments
 (0)