Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
83 commits
Select commit Hold shift + click to select a range
01a2678
wip: get html
amhsirak Nov 17, 2025
994142a
fix: define browser context
amhsirak Nov 17, 2025
0c9dc89
feat: get input text for llm
amhsirak Nov 17, 2025
560f5a3
feat: get llm ready text
amhsirak Nov 17, 2025
9b71cfc
fix: return empty empty str on error
amhsirak Nov 17, 2025
191ac52
fix: return empty empty str on error
amhsirak Nov 17, 2025
af95706
fix: get important content
amhsirak Nov 17, 2025
a3891f6
wip: markdown + plain text
amhsirak Nov 17, 2025
dae4e83
wip: markdown + plain text
amhsirak Nov 17, 2025
28f1bf8
fix: better markdown output
amhsirak Nov 17, 2025
1651763
fix: better markdown output
amhsirak Nov 17, 2025
6e6d6c6
chore(deps): install cheerio, turndown
amhsirak Nov 18, 2025
f22f6ef
debug(temporary): turndown x amzn
amhsirak Nov 18, 2025
0fa5397
debug(temporary): test url -> llm text
amhsirak Nov 18, 2025
4158896
chore: link replace
amhsirak Nov 19, 2025
6c8850a
chore: link replace
amhsirak Nov 19, 2025
7da4647
wip: to markdown
amhsirak Nov 19, 2025
dd1a9a8
chore(deps): install koffi
amhsirak Nov 19, 2025
ec49565
chore: ignore build files
amhsirak Nov 19, 2025
f0d6712
chore: build
amhsirak Nov 19, 2025
da48d46
chore: build
amhsirak Nov 19, 2025
713d374
feat: to markdown
amhsirak Nov 19, 2025
6c93cbc
feat: html -> markdown
amhsirak Nov 19, 2025
0837ac5
fix: go parser path
amhsirak Nov 19, 2025
66d8291
fix: export convert fxn
amhsirak Nov 19, 2025
1d65f90
feat: use parser to scrape
amhsirak Nov 19, 2025
3fd9bb5
chore(debug): test
amhsirak Nov 19, 2025
1a291c2
chore: cleanup
amhsirak Nov 19, 2025
ecaa23f
chore: install scrape plugins
amhsirak Nov 19, 2025
767fa5f
chore: del go
amhsirak Nov 19, 2025
b4644ba
feat: use turndown
amhsirak Nov 19, 2025
b14d84d
fix: -rm debug turndown
amhsirak Nov 19, 2025
839f9fa
fix: plugin imports
amhsirak Nov 19, 2025
0a7a1eb
fix: make baseUrl optional param
amhsirak Nov 19, 2025
9257b15
feat: pass url param
amhsirak Nov 19, 2025
d1f13cf
feat: add robot markdown creation section ui
RohitR311 Nov 20, 2025
c1373d8
feat: display separate field md content
RohitR311 Nov 20, 2025
0d45d1d
feat: markdownify manual, scheduled, api runs
RohitR311 Nov 20, 2025
b19e02f
feat: add markdown route
RohitR311 Nov 20, 2025
05d2d1b
feat: add optional type and url fields
RohitR311 Nov 20, 2025
d444756
chore: add static markdown import
RohitR311 Nov 20, 2025
ddcb3df
feat: extend turndown + clean
amhsirak Nov 20, 2025
28d2288
Merge branch 'markdownify' of https://github.com/getmaxun/maxun into …
amhsirak Nov 20, 2025
8346c96
chore: cleanup
amhsirak Nov 20, 2025
924d687
feat: add create markdown api
RohitR311 Nov 20, 2025
e711326
feat: extract
amhsirak Nov 20, 2025
51a0c3a
chore: remove icon
amhsirak Nov 20, 2025
672a182
feat: extract
amhsirak Nov 20, 2025
d0b8d0c
chore: remove icon
amhsirak Nov 20, 2025
53bf9eb
feat: scrape
amhsirak Nov 20, 2025
8428314
feat: scrape
amhsirak Nov 20, 2025
6de6c3b
feat: remove header
amhsirak Nov 20, 2025
ef43116
feat: markdown
amhsirak Nov 20, 2025
f745089
feat: markdown
amhsirak Nov 20, 2025
81d69a4
chore: lint
amhsirak Nov 20, 2025
eb86b6e
feat: markdown
amhsirak Nov 20, 2025
febc6c1
feat: markdown
amhsirak Nov 20, 2025
dbb6c87
feat: change mui default tabs
amhsirak Nov 20, 2025
606790e
chore: lint
amhsirak Nov 20, 2025
3dac1a0
feat: change mui default tabs
amhsirak Nov 20, 2025
9601905
feat: turn to markdown
amhsirak Nov 20, 2025
930c7b6
fix: lesser restrictions
amhsirak Nov 20, 2025
691dedc
fix: lesser restrictions
amhsirak Nov 20, 2025
7f48e27
chore: lint
amhsirak Nov 20, 2025
fef038b
chore: cleanup wanted deps
amhsirak Nov 20, 2025
5aafe6e
feat: add html
amhsirak Nov 20, 2025
418100c
feat: scrape robot
amhsirak Nov 20, 2025
f3c79bd
feat: scrape robot
amhsirak Nov 20, 2025
e90cd99
feat: add html scrape support
RohitR311 Nov 20, 2025
a9a8e20
fix: resolve merge conflicts
RohitR311 Nov 20, 2025
c89b2af
feat: modify scrape api to support html
RohitR311 Nov 20, 2025
0987183
chore: increase goto timeout scrape 100s
RohitR311 Nov 20, 2025
ac0c70e
feat: disable sheets and airtable scrape robot
RohitR311 Nov 20, 2025
f646713
fix: format
amhsirak Nov 20, 2025
174a09f
fix: use p instead of formlabel
amhsirak Nov 20, 2025
565c858
fix: remove margin
amhsirak Nov 20, 2025
7f22a77
Merge branch 'markdownify' of https://github.com/getmaxun/maxun into …
amhsirak Nov 20, 2025
6477fee
fix: dont show selected output format
amhsirak Nov 20, 2025
b2b5a91
chore: add telemetry for scrape robots and runs
RohitR311 Nov 20, 2025
467ffe3
feat: rm display integrations scrape robot
RohitR311 Nov 20, 2025
25fd74e
feat: center tabs
amhsirak Nov 20, 2025
5cfbd1e
Merge branch 'markdownify' of https://github.com/getmaxun/maxun into …
amhsirak Nov 20, 2025
a1b2117
fix: less gap
amhsirak Nov 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
"idcac-playwright": "^0.1.3",
"ioredis": "^5.4.1",
"joi": "^17.6.0",
"joplin-turndown-plugin-gfm": "^1.0.12",
"jsonwebtoken": "^9.0.2",
"jwt-decode": "^4.0.0",
"lodash": "^4.17.21",
Expand Down Expand Up @@ -80,6 +81,7 @@
"styled-components": "^5.3.3",
"swagger-jsdoc": "^6.2.8",
"swagger-ui-express": "^5.0.1",
"turndown": "^7.2.2",
"typedoc": "^0.23.8",
"typescript": "^5.0.0",
"uuid": "^8.3.2",
Expand Down Expand Up @@ -126,6 +128,7 @@
"@types/styled-components": "^5.1.23",
"@types/swagger-jsdoc": "^6.0.4",
"@types/swagger-ui-express": "^4.1.6",
"@types/turndown": "^5.0.6",
"@vitejs/plugin-react": "^4.3.3",
"ajv": "^8.8.2",
"concurrently": "^7.0.0",
Expand Down
206 changes: 197 additions & 9 deletions server/src/api/record.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
import { sendWebhook } from "../routes/webhook";
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';

chromium.use(stealthPlugin());

Expand Down Expand Up @@ -344,7 +345,9 @@ function formatRunResponse(run: any) {
runByAPI: run.runByAPI,
data: {
textData: {},
listData: {}
listData: {},
markdown: '',
html: ''
},
screenshots: [] as any[],
};
Expand All @@ -359,6 +362,14 @@ function formatRunResponse(run: any) {
formattedRun.data.listData = output.scrapeList;
}

if (output.markdown && Array.isArray(output.markdown)) {
formattedRun.data.markdown = output.markdown[0]?.content || '';
}

if (output.html && Array.isArray(output.html)) {
formattedRun.data.html = output.html[0]?.content || '';
}

if (run.binaryOutput) {
Object.keys(run.binaryOutput).forEach(key => {
if (run.binaryOutput[key]) {
Expand Down Expand Up @@ -569,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
}
}

async function readyForRunHandler(browserId: string, id: string, userId: string){
async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
try {
const result = await executeRun(id, userId);
const result = await executeRun(id, userId, requestedFormats);

if (result && result.success) {
logger.log('info', `Interpretation of ${id} succeeded`);
Expand Down Expand Up @@ -608,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
return copy;
};

async function executeRun(id: string, userId: string) {
async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
let browser: any = null;

try {
Expand Down Expand Up @@ -651,6 +662,166 @@ async function executeRun(id: string, userId: string) {
};
}

if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for API run ${id}`);

let formats = recording.recording_meta.formats || ['markdown'];

// Override if API request defines formats
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
}
Comment on lines +670 to +673
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Missing validation: the filtered requestedFormats result is never checked for emptiness.

When requestedFormats is provided, line 672 filters it down to the supported formats ('markdown', 'html'), but nothing checks that the resulting array is non-empty. If a caller passes formats: ['invalid'], the filtered array is empty, neither conversion runs, and the run is still marked as successful with no output.

Apply this diff to add validation:

             // Override if API request defines formats
             if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
                 formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
+                if (formats.length === 0) {
+                    throw new Error('No valid formats specified. Supported formats: markdown, html');
+                }
             }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Override if API request defines formats
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
}
// Override if API request defines formats
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
if (formats.length === 0) {
throw new Error('No valid formats specified. Supported formats: markdown, html');
}
}
🤖 Prompt for AI Agents
In server/src/api/record.ts around lines 670 to 673, the code filters
requestedFormats but does not validate the result; after filtering, check
whether the resulting formats array is non-empty and if it is empty return a 400
Bad Request (or throw a validation error) with a clear message like
"requestedFormats contains no supported formats; supported: markdown, html";
ensure you stop further processing when invalid so the run does not succeed with
no outputs.


await run.update({
status: 'running',
log: `Converting page to: ${formats.join(', ')}`
});

try {
const url = recording.recording_meta.url;

if (!url) {
throw new Error('No URL specified for markdown robot');
}

let markdown = '';
let html = '';
const serializableOutput: any = {};

// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}

// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}

await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});

logger.log('info', `Markdown robot execution completed for API run ${id}`);

// Push success socket event
try {
const completionData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString()
};

serverIo
.of('/queued-run')
.to(`user-${userId}`)
.emit('run-completed', completionData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
);
}

// Build webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
robot_name: recording.recording_meta.name,
status: 'success',
started_at: plainRun.startedAt,
finished_at: new Date().toLocaleString(),
metadata: {
browser_id: plainRun.browserId,
user_id: userId,
},
};

if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;

try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log(
'info',
`Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
);
} catch (webhookError: any) {
logger.log(
'warn',
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
);
}

capture("maxun-oss-run-created-api", {
runId: plainRun.runId,
user_id: userId,
status: "success",
robot_type: "scrape",
formats
});

await destroyRemoteBrowser(plainRun.browserId, userId);

return {
success: true,
interpretationInfo: run.toJSON()
};
} catch (error: any) {
logger.log(
'error',
`${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
);

await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});

// Send failure socket event
try {
const failureData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'failed',
finishedAt: new Date().toLocaleString()
};

serverIo
.of('/queued-run')
.to(`user-${userId}`)
.emit('run-completed', failureData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
);
}

capture("maxun-oss-run-created-api", {
runId: plainRun.runId,
user_id: userId,
status: "failed",
robot_type: "scrape",
formats
});

await destroyRemoteBrowser(plainRun.browserId, userId);

throw error;
}
}

plainRun.status = 'running';

browser = browserPool.getRemoteBrowser(plainRun.browserId);
Expand Down Expand Up @@ -848,7 +1019,7 @@ async function executeRun(id: string, userId: string) {
}
}

export async function handleRunRecording(id: string, userId: string) {
export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
try {
const result = await createWorkflowAndStoreMetadata(id, userId);
const { browserId, runId: newRunId } = result;
Expand All @@ -862,7 +1033,7 @@ export async function handleRunRecording(id: string, userId: string) {
rejectUnauthorized: false
});

socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId));
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));

logger.log('info', `Running Robot: ${id}`);

Expand All @@ -889,12 +1060,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
if (!run) throw new Error('Run not found');

if (run.status === 'success') {
return run.toJSON();
return run;
} else if (run.status === 'failed') {
throw new Error('Run failed');
}

// Wait for the next polling interval
await new Promise(resolve => setTimeout(resolve, interval));
}
}
Expand All @@ -914,6 +1084,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
* type: string
* required: true
* description: The ID of the robot to run.
* requestBody:
* required: false
* content:
* application/json:
* schema:
* type: object
* properties:
* formats:
* type: array
* items:
* type: string
* enum: [markdown, html]
* description: Optional override formats for this run.
* example:
* formats: ["html"]
* responses:
* 200:
* description: Robot run started successfully.
Expand Down Expand Up @@ -972,7 +1157,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
if (!req.user) {
return res.status(401).json({ ok: false, error: 'Unauthorized' });
}
const runId = await handleRunRecording(req.params.id, req.user.id);

const requestedFormats = req.body.formats;

const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);

if (!runId) {
throw new Error('Run ID is undefined');
Expand Down
Loading