Skip to content

Commit

Permalink
Merge branch 'main' into feature/Couchbase-vector-store
Browse files Browse the repository at this point in the history
  • Loading branch information
prajwal-pai77 authored May 3, 2024
2 parents 95d09ef + 2254d16 commit d35e7fc
Show file tree
Hide file tree
Showing 7 changed files with 385 additions and 13 deletions.
176 changes: 176 additions & 0 deletions packages/components/nodes/documentloaders/Unstructured/Unstructured.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import {
HiResModelName,
SkipInferTableTypes,
UnstructuredLoaderOptions,
UnstructuredLoaderStrategy
} from 'langchain/document_loaders/fs/unstructured'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { StringWithAutocomplete } from 'langchain/dist/util/types'
import { Document } from '@langchain/core/documents'

/**
 * Allowed values for the `chunking_strategy` request field, which controls
 * whether the Unstructured API chunks text into larger or smaller elements.
 * The API is documented to default to None, with `by_title` as the optional
 * alternative; the loader in this file only sends the field when a value is set.
 */
type ChunkingStrategy = 'None' | 'by_title'

/**
 * Represents an element returned by the Unstructured API. It has
 * properties for the element type, text content, and metadata.
 */
type Element = {
// Element type reported by the API; copied into the document metadata as `category`.
type: string
// Text content of the element; becomes the `pageContent` of the resulting document.
text: string
// This is purposefully loosely typed: the metadata is spread into the document
// metadata as-is, without inspecting individual keys.
metadata: {
[key: string]: unknown
}
}

/**
 * Document loader that posts an in-memory file to an Unstructured partition
 * endpoint and converts the returned elements into `Document` instances.
 *
 * Unlike a path-based loader, this class works on buffers: call
 * `loadAndSplitBuffer()` with the file contents and a file name. `load()` is
 * intentionally unsupported and always rejects.
 */
export class UnstructuredLoader extends BaseDocumentLoader {
    /**
     * Not assigned anywhere in this class; retained only so the public shape of
     * the loader stays unchanged. Do not rely on it holding a value.
     */
    public filePath: string

    private apiUrl = 'https://api.unstructured.io/general/v0/general'

    private apiKey?: string

    private strategy: StringWithAutocomplete<UnstructuredLoaderStrategy> = 'hi_res'

    private encoding?: string

    private ocrLanguages: Array<string> = []

    private coordinates?: boolean

    private pdfInferTableStructure?: boolean

    private xmlKeepTags?: boolean

    private skipInferTableTypes?: Array<StringWithAutocomplete<SkipInferTableTypes>>

    private hiResModelName?: StringWithAutocomplete<HiResModelName>

    private includePageBreaks?: boolean

    private chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>

    private multiPageSections?: boolean

    private combineUnderNChars?: number

    private newAfterNChars?: number

    private maxCharacters?: number

    /**
     * @param optionsOrLegacyFilePath Loader options. Despite the name, only the
     * options object form is accepted here. `apiUrl`, `strategy` and
     * `ocrLanguages` fall back to the class defaults when omitted; every other
     * option is stored as given (possibly undefined).
     */
    constructor(optionsOrLegacyFilePath: UnstructuredLoaderOptions) {
        super()

        const options = optionsOrLegacyFilePath
        this.apiKey = options.apiKey
        this.apiUrl = options.apiUrl ?? this.apiUrl
        this.strategy = options.strategy ?? this.strategy
        this.encoding = options.encoding
        this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages
        this.coordinates = options.coordinates
        this.pdfInferTableStructure = options.pdfInferTableStructure
        this.xmlKeepTags = options.xmlKeepTags
        this.skipInferTableTypes = options.skipInferTableTypes
        this.hiResModelName = options.hiResModelName
        this.includePageBreaks = options.includePageBreaks
        this.chunkingStrategy = options.chunkingStrategy
        this.multiPageSections = options.multiPageSections
        this.combineUnderNChars = options.combineUnderNChars
        this.newAfterNChars = options.newAfterNChars
        this.maxCharacters = options.maxCharacters
    }

    /**
     * Sends the file to the configured endpoint as multipart form data and
     * returns the elements from the response.
     *
     * @param buffer Raw file contents.
     * @param fileName Name attached to the uploaded file part; also used in error messages.
     * @returns The response elements whose `text` property is a string.
     * @throws Error when the HTTP response is not OK or the JSON body is not an array.
     */
    async _partition(buffer: Buffer, fileName: string): Promise<Element[]> {
        const formData = new FormData()
        formData.append('files', new Blob([buffer]), fileName)
        formData.append('strategy', this.strategy)
        // One form entry per language, so the field repeats for multiple languages.
        this.ocrLanguages.forEach((language) => {
            formData.append('ocr_languages', language)
        })
        if (this.encoding) {
            formData.append('encoding', this.encoding)
        }
        // Boolean flags below are only sent when enabled.
        if (this.coordinates === true) {
            formData.append('coordinates', 'true')
        }
        if (this.pdfInferTableStructure === true) {
            formData.append('pdf_infer_table_structure', 'true')
        }
        if (this.xmlKeepTags === true) {
            formData.append('xml_keep_tags', 'true')
        }
        if (this.skipInferTableTypes) {
            formData.append('skip_infer_table_types', JSON.stringify(this.skipInferTableTypes))
        }
        if (this.hiResModelName) {
            formData.append('hi_res_model_name', this.hiResModelName)
        }
        if (this.includePageBreaks) {
            formData.append('include_page_breaks', 'true')
        }
        if (this.chunkingStrategy) {
            formData.append('chunking_strategy', this.chunkingStrategy)
        }
        // The remaining options are sent whenever they are defined, so an
        // explicit `false` or `0` still reaches the API.
        if (this.multiPageSections !== undefined) {
            formData.append('multipage_sections', this.multiPageSections ? 'true' : 'false')
        }
        if (this.combineUnderNChars !== undefined) {
            formData.append('combine_under_n_chars', String(this.combineUnderNChars))
        }
        if (this.newAfterNChars !== undefined) {
            formData.append('new_after_n_chars', String(this.newAfterNChars))
        }
        if (this.maxCharacters !== undefined) {
            formData.append('max_characters', String(this.maxCharacters))
        }

        const headers = {
            'UNSTRUCTURED-API-KEY': this.apiKey ?? ''
        }

        const response = await fetch(this.apiUrl, {
            method: 'POST',
            body: formData,
            headers
        })

        if (!response.ok) {
            // Report the uploaded file name: `this.filePath` is never set on this
            // loader, so using it here would always print "undefined".
            throw new Error(`Failed to partition file ${fileName} with error ${response.status} and message ${await response.text()}`)
        }

        const elements = await response.json()
        if (!Array.isArray(elements)) {
            // Serialize the payload so objects do not render as "[object Object]".
            throw new Error(`Expected partitioning request to return an array, but got ${JSON.stringify(elements)}`)
        }
        return elements.filter((el) => typeof el.text === 'string') as Element[]
    }

    /**
     * Partitions the given buffer and maps each element to a `Document`.
     *
     * @param buffer Raw file contents.
     * @param fileName Name of the file being processed.
     * @returns One document per element, with the element text as `pageContent`
     * and the element metadata plus a `category` key holding the element type.
     */
    async loadAndSplitBuffer(buffer: Buffer, fileName: string): Promise<Document[]> {
        const elements = await this._partition(buffer, fileName)

        const documents: Document[] = []
        for (const element of elements) {
            const { metadata, text } = element
            // Defensive re-check in case `_partition` is overridden.
            if (typeof text === 'string') {
                documents.push(
                    new Document({
                        pageContent: text,
                        metadata: {
                            ...metadata,
                            category: element.type
                        }
                    })
                )
            }
        }

        return documents
    }

    /**
     * Unsupported for this loader.
     *
     * @throws Error always (as a rejected promise); use `loadAndSplitBuffer()` instead.
     */
    async load(): Promise<Document[]> {
        throw new Error('load() is not supported for UnstructuredLoader. Use loadAndSplitBuffer() instead.')
    }
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import {
UnstructuredLoader,
UnstructuredLoaderOptions,
UnstructuredLoaderStrategy,
SkipInferTableTypes,
HiResModelName
HiResModelName,
UnstructuredLoader as LCUnstructuredLoader
} from 'langchain/document_loaders/fs/unstructured'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
import { getFileFromStorage } from '../../../src'
import { UnstructuredLoader } from './Unstructured'

class UnstructuredFile_DocumentLoaders implements INode {
label: string
Expand All @@ -23,7 +25,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
constructor() {
this.label = 'Unstructured File Loader'
this.name = 'unstructuredFileLoader'
this.version = 2.0
this.version = 3.0
this.type = 'Document'
this.icon = 'unstructured-file.svg'
this.category = 'Document Loaders'
Expand All @@ -41,7 +43,18 @@ class UnstructuredFile_DocumentLoaders implements INode {
label: 'File Path',
name: 'filePath',
type: 'string',
placeholder: ''
placeholder: '',
optional: true,
warning:
'Use the File Upload instead of File path. If file is uploaded, this path is ignored. Path will be deprecated in future releases.'
},
{
label: 'Files Upload',
name: 'fileObject',
type: 'file',
description: 'Files to be processed. Multiple files can be uploaded.',
fileType:
'.txt, .text, .pdf, .docx, .doc, .jpg, .jpeg, .eml, .html, .htm, .md, .pptx, .ppt, .msg, .rtf, .xlsx, .xls, .odt, .epub'
},
{
label: 'Unstructured API URL',
Expand Down Expand Up @@ -416,6 +429,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number
const newAfterNChars = nodeData.inputs?.newAfterNChars as number
const maxCharacters = nodeData.inputs?.maxCharacters as number
const fileBase64 = nodeData.inputs?.fileObject as string

const obj: UnstructuredLoaderOptions = {
apiUrl: unstructuredAPIUrl,
Expand All @@ -438,8 +452,48 @@ class UnstructuredFile_DocumentLoaders implements INode {
const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey

const loader = new UnstructuredLoader(filePath, obj)
let docs = await loader.load()
let docs: any[] = []
let files: string[] = []

if (fileBase64) {
const loader = new UnstructuredLoader(obj)
//FILE-STORAGE::["CONTRIBUTING.md","LICENSE.md","README.md"]
if (fileBase64.startsWith('FILE-STORAGE::')) {
const fileName = fileBase64.replace('FILE-STORAGE::', '')
if (fileName.startsWith('[') && fileName.endsWith(']')) {
files = JSON.parse(fileName)
} else {
files = [fileName]
}
const chatflowid = options.chatflowid

for (const file of files) {
const fileData = await getFileFromStorage(file, chatflowid)
const loaderDocs = await loader.loadAndSplitBuffer(fileData, file)
docs.push(...loaderDocs)
}
} else {
if (fileBase64.startsWith('[') && fileBase64.endsWith(']')) {
files = JSON.parse(fileBase64)
} else {
files = [fileBase64]
}

for (const file of files) {
const splitDataURI = file.split(',')
const filename = splitDataURI.pop()?.split(':')[1] ?? ''
const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
const loaderDocs = await loader.loadAndSplitBuffer(bf, filename)
docs.push(...loaderDocs)
}
}
} else if (filePath) {
const loader = new LCUnstructuredLoader(filePath, obj)
const loaderDocs = await loader.load()
docs.push(...loaderDocs)
} else {
throw new Error('File path or File upload is required')
}

if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
Expand Down
1 change: 1 addition & 0 deletions packages/components/src/Interface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ export interface INodeProperties {
description?: string
filePath?: string
badge?: string
deprecateMessage?: string
}

export interface INode extends INodeProperties {
Expand Down
2 changes: 1 addition & 1 deletion packages/components/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
*/
function normalizeURL(urlString: string): string {
const urlObj = new URL(urlString)
const hostPath = urlObj.hostname + urlObj.pathname
const hostPath = urlObj.hostname + urlObj.pathname + urlObj.search
if (hostPath.length > 0 && hostPath.slice(-1) == '/') {
// handling trailing slash
return hostPath.slice(0, -1)
Expand Down
66 changes: 66 additions & 0 deletions packages/ui/src/utils/genericHelper.js
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,72 @@ export const initNode = (nodeData, newNodeId) => {
return nodeData
}

/**
 * Builds fresh node data from the latest component definition while keeping
 * the user's existing credential, input values and output values.
 *
 * Only keys that still exist on the freshly initialised node are carried over.
 *
 * @param newComponentNodeData Latest component definition for the node.
 * @param existingComponentNodeData Node data currently on the canvas.
 * @returns The freshly initialised node data with the existing values applied.
 */
export const updateOutdatedNodeData = (newComponentNodeData, existingComponentNodeData) => {
    const refreshedNode = initNode(newComponentNodeData, existingComponentNodeData.id)

    // Copy values from `previous` into `next` for keys present in both.
    const carryOver = (previous, next) => {
        if (!previous) return
        for (const key in previous) {
            if (key in next) next[key] = previous[key]
        }
    }

    // Keep the credential already selected on the node
    if (existingComponentNodeData.credential) {
        refreshedNode.credential = existingComponentNodeData.credential
    }

    // Keep the values the user already entered or selected
    carryOver(existingComponentNodeData.inputs, refreshedNode.inputs)
    carryOver(existingComponentNodeData.outputs, refreshedNode.outputs)

    return refreshedNode
}

export const updateOutdatedNodeEdge = (newComponentNodeData, edges) => {
const removedEdges = []
for (const edge of edges) {
const targetNodeId = edge.targetHandle.split('-')[0]
const sourceNodeId = edge.sourceHandle.split('-')[0]

if (targetNodeId === newComponentNodeData.id) {
// Check if targetHandle is in inputParams or inputAnchors
const inputParam = newComponentNodeData.inputParams.find((param) => param.id === edge.targetHandle)
const inputAnchor = newComponentNodeData.inputAnchors.find((param) => param.id === edge.targetHandle)

if (!inputParam && !inputAnchor) {
removedEdges.push(edge)
}
}

if (sourceNodeId === newComponentNodeData.id) {
if (newComponentNodeData.outputAnchors?.length) {
for (const outputAnchor of newComponentNodeData.outputAnchors) {
const outputAnchorType = outputAnchor.type
if (outputAnchorType === 'options') {
if (!outputAnchor.options.find((outputOption) => outputOption.id === edge.sourceHandle)) {
removedEdges.push(edge)
}
} else {
if (outputAnchor.id !== edge.sourceHandle) {
removedEdges.push(edge)
}
}
}
}
}
}

return removedEdges
}

export const isValidConnection = (connection, reactFlowInstance) => {
const sourceHandle = connection.sourceHandle
const targetHandle = connection.targetHandle
Expand Down
11 changes: 8 additions & 3 deletions packages/ui/src/views/canvas/CanvasNode.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,12 @@ const CanvasNode = ({ data }) => {
} else if (data.version && componentNode.version > data.version) {
setWarningMessage(nodeOutdatedMessage(data.version, componentNode.version))
} else if (componentNode.badge === 'DEPRECATING') {
setWarningMessage('This node will be deprecated in the next release. Change to a new node tagged with NEW')
setWarningMessage(
componentNode?.deprecateMessage ??
'This node will be deprecated in the next release. Change to a new node tagged with NEW'
)
} else {
setWarningMessage('')
}
}
}, [canvas.componentNodes, data.name, data.version])
Expand Down Expand Up @@ -238,8 +243,8 @@ const CanvasNode = ({ data }) => {
</Typography>
</Box>
<Divider />
{data.outputAnchors.map((outputAnchor, index) => (
<NodeOutputHandler key={index} outputAnchor={outputAnchor} data={data} />
{data.outputAnchors.map((outputAnchor) => (
<NodeOutputHandler key={JSON.stringify(data)} outputAnchor={outputAnchor} data={data} />
))}
</Box>
</NodeTooltip>
Expand Down
Loading

0 comments on commit d35e7fc

Please sign in to comment.