Skip to content

Commit

Permalink
FIX FlowiseAI#2617 Cheerio Web Crawler doesn't work with large sites (F…
Browse files Browse the repository at this point in the history
…lowiseAI#2678)

* FIX FlowiseAI#2617 Big sites scan error

* FIX FlowiseAI#2617 Big sites scan error - review fix

---------

Co-authored-by: Ahmed Osman <[email protected]>
  • Loading branch information
ahmosman and Ahmed Osman authored Jul 5, 2024
1 parent b1e3878 commit 90558ca
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,11 @@ class Cheerio_DocumentLoaders implements INode {

async function cheerioLoader(url: string): Promise<any> {
try {
let docs = []
let docs: IDocument[] = []
if (url.endsWith('.pdf')) {
if (process.env.DEBUG === 'true') options.logger.info(`CheerioWebBaseLoader does not support PDF files: ${url}`)
return docs
}
const loader = new CheerioWebBaseLoader(url, params)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
Expand All @@ -141,6 +145,7 @@ class Cheerio_DocumentLoaders implements INode {
return docs
} catch (err) {
if (process.env.DEBUG === 'true') options.logger.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
return []
}
}

Expand Down

0 comments on commit 90558ca

Please sign in to comment.