Skip to content

Commit 79031bf

Browse files
HenryHengZJ authored and 0xi4o committed
Feature/add ability to specify dynamic metadata to jsonlines (#3238)
* add ability to specify dynamic metadata to jsonlines
* fix additional metadata
1 parent 2cb7352 commit 79031bf

File tree

1 file changed

+155
-7
lines changed
  • packages/components/nodes/documentloaders/Jsonlines

1 file changed

+155
-7
lines changed

Diff for: packages/components/nodes/documentloaders/Jsonlines/Jsonlines.ts

+155-7
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,31 @@
11
import { omit } from 'lodash'
22
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
33
import { TextSplitter } from 'langchain/text_splitter'
4-
import { JSONLinesLoader } from 'langchain/document_loaders/fs/json'
4+
import jsonpointer from 'jsonpointer'
55
import { getFileFromStorage } from '../../../src'
6+
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
7+
import { Document } from '@langchain/core/documents'
8+
import type { readFile as ReadFileT } from 'node:fs/promises'
9+
10+
const howToUseCode = `
11+
You can add metadata dynamically from the document:
12+
13+
For example, if the document is:
14+
\`\`\`jsonl
15+
{
16+
"source": "www.example.com", "content": "Hello World!"
17+
}
18+
{
19+
"source": "www.example2.com", "content": "Hi World!"
20+
}
21+
\`\`\`
22+
23+
You can have the "source" value as metadata by returning the following:
24+
\`\`\`json
25+
{
26+
"source": "/source"
27+
}
28+
\`\`\``
629

730
class Jsonlines_DocumentLoaders implements INode {
831
label: string
@@ -18,7 +41,7 @@ class Jsonlines_DocumentLoaders implements INode {
1841
constructor() {
1942
this.label = 'Json Lines File'
2043
this.name = 'jsonlinesFile'
21-
this.version = 1.0
44+
this.version = 2.0
2245
this.type = 'Document'
2346
this.icon = 'jsonlines.svg'
2447
this.category = 'Document Loaders'
@@ -41,14 +64,20 @@ class Jsonlines_DocumentLoaders implements INode {
4164
label: 'Pointer Extraction',
4265
name: 'pointerName',
4366
type: 'string',
44-
placeholder: 'Enter pointer name',
67+
placeholder: 'key',
68+
description: 'Ex: { "key": "value" }, Pointer Extraction = "key", "value" will be extracted as pageContent of the chunk',
4569
optional: false
4670
},
4771
{
4872
label: 'Additional Metadata',
4973
name: 'metadata',
5074
type: 'json',
51-
description: 'Additional metadata to be added to the extracted documents',
75+
description:
76+
'Additional metadata to be added to the extracted documents. You can add metadata dynamically from the document. Ex: { "key": "value", "source": "www.example.com" }. Metadata: { "page": "/source" } will extract the value of the key "source" from the document and add it to the metadata with the key "page"',
77+
hint: {
78+
label: 'How to use',
79+
value: howToUseCode
80+
},
5281
optional: true,
5382
additionalParams: true
5483
},
@@ -96,7 +125,7 @@ class Jsonlines_DocumentLoaders implements INode {
96125
if (!file) continue
97126
const fileData = await getFileFromStorage(file, chatflowid)
98127
const blob = new Blob([fileData])
99-
const loader = new JSONLinesLoader(blob, pointer)
128+
const loader = new JSONLinesLoader(blob, pointer, metadata)
100129

101130
if (textSplitter) {
102131
let splittedDocs = await loader.load()
@@ -119,7 +148,7 @@ class Jsonlines_DocumentLoaders implements INode {
119148
splitDataURI.pop()
120149
const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
121150
const blob = new Blob([bf])
122-
const loader = new JSONLinesLoader(blob, pointer)
151+
const loader = new JSONLinesLoader(blob, pointer, metadata)
123152

124153
if (textSplitter) {
125154
let splittedDocs = await loader.load()
@@ -132,7 +161,8 @@ class Jsonlines_DocumentLoaders implements INode {
132161
}
133162

134163
if (metadata) {
135-
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
164+
let parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
165+
parsedMetadata = removeValuesStartingWithSlash(parsedMetadata)
136166
docs = docs.map((doc) => ({
137167
...doc,
138168
metadata:
@@ -167,4 +197,122 @@ class Jsonlines_DocumentLoaders implements INode {
167197
}
168198
}
169199

200+
/**
 * Returns a shallow copy of `obj` without the entries whose value is a string
 * beginning with "/".
 *
 * Only the object's own enumerable properties are considered, so inherited
 * properties never leak into the result. A nullish input yields an empty object.
 *
 * @param obj - Key/value map to filter; it is not mutated.
 * @returns A new object holding every own entry whose value is not a
 *          "/"-prefixed string (non-string values are always kept).
 */
const removeValuesStartingWithSlash = (obj: Record<string, any>): Record<string, any> => {
    const startsWithSlash = (value: unknown): boolean => typeof value === 'string' && value.startsWith('/')

    // `?? {}` keeps the tolerant behaviour for null/undefined input instead of throwing.
    const keptEntries = Object.entries(obj ?? {}).filter(([, value]) => !startsWithSlash(value))

    return Object.fromEntries(keptEntries)
}
213+
214+
/**
 * Document loader that reads text either from a file path or from a Blob and
 * turns it into one or more `Document` instances.
 *
 * Subclasses customise how the raw text is broken up by overriding `parse`.
 */
class TextLoader extends BaseDocumentLoader {
    /**
     * @param filePathOrBlob - A path read as UTF-8 through `fs/promises`, or a Blob
     *                         whose text content is used directly.
     */
    constructor(public filePathOrBlob: string | Blob) {
        super()
    }

    /**
     * Converts the raw text into content/metadata pairs.
     * This base implementation returns the entire text as a single entry with empty metadata.
     */
    protected async parse(raw: string): Promise<{ pageContent: string; metadata: ICommonObject }[]> {
        const wholeText = { pageContent: raw, metadata: {} }
        return [wholeText]
    }

    /**
     * Reads the text, parses it, and wraps each parsed entry in a `Document`.
     *
     * Each document's metadata is the source metadata, then (only when more than
     * one entry was parsed) a 1-based `line` number, then the entry's own metadata.
     *
     * @throws Error when any parsed entry has a `pageContent` that is not a string.
     */
    public async load(): Promise<Document[]> {
        let rawText: string
        let sourceMetadata: Record<string, string>

        if (typeof this.filePathOrBlob !== 'string') {
            rawText = await this.filePathOrBlob.text()
            sourceMetadata = { source: 'blob', blobType: this.filePathOrBlob.type }
        } else {
            const { readFile } = await TextLoader.imports()
            rawText = await readFile(this.filePathOrBlob, 'utf8')
            sourceMetadata = { source: this.filePathOrBlob }
        }

        const entries = await this.parse(rawText)

        // Validate every entry up front so nothing is built from a partially valid result.
        for (let position = 0; position < entries.length; position++) {
            const { pageContent } = entries[position]
            if (typeof pageContent !== 'string') {
                throw new Error(`Expected string, at position ${position} got ${typeof pageContent}`)
            }
        }

        const hasSingleEntry = entries.length === 1

        return entries.map((entry, index) => {
            // The line number is only meaningful when the text produced several entries.
            const lineMetadata = hasSingleEntry ? {} : { line: index + 1 }
            return new Document({
                pageContent: entry.pageContent,
                metadata: { ...sourceMetadata, ...lineMetadata, ...entry.metadata }
            })
        })
    }

    /**
     * Dynamically imports `node:fs/promises` and exposes its `readFile`.
     *
     * @throws Error when the module cannot be imported; the underlying error is logged first.
     */
    static async imports(): Promise<{
        readFile: typeof ReadFileT
    }> {
        try {
            const fsPromises = await import('node:fs/promises')
            return { readFile: fsPromises.readFile }
        } catch (e) {
            console.error(e)
            throw new Error(`Failed to load fs/promises. Make sure you are running in Node.js environment.`)
        }
    }
}
269+
270+
/**
 * Loader for JSON Lines text: every non-blank line is parsed as JSON and the
 * value found at `pointer` becomes the page content of one entry.
 *
 * Metadata entries whose value is a string starting with "/" are treated as
 * JSON pointers and resolved against each line, so per-line values can be
 * copied into that line's metadata under the configured key.
 */
class JSONLinesLoader extends TextLoader {
    /** Metadata configuration, already parsed into an object when supplied. */
    metadata?: ICommonObject
    /** Exposed through `getAdditionalMetadata`; nothing in this class adds to it. */
    additionalMetadata: ICommonObject[] = []

    /**
     * @param filePathOrBlob - File path or Blob containing the JSON Lines text.
     * @param pointer - JSON pointer selecting the page content within each line.
     * @param metadata - Optional metadata configuration, either an object or a JSON string.
     * @throws SyntaxError when `metadata` is a string that is not valid JSON.
     */
    constructor(filePathOrBlob: string | Blob, public pointer: string, metadata?: any) {
        super(filePathOrBlob)
        if (metadata) {
            this.metadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
        }
    }

    /** Returns the `additionalMetadata` array held by this loader. */
    async getAdditionalMetadata(): Promise<ICommonObject[]> {
        return this.additionalMetadata
    }

    /**
     * Parses the raw text line by line.
     *
     * @param raw - Full JSON Lines text; lines are trimmed and blank lines are skipped.
     * @returns One entry per JSON line, holding the value at `pointer` as page content
     *          and the resolved pointer-based metadata (empty when none is configured).
     * @throws SyntaxError when a non-blank line is not valid JSON.
     */
    protected async parse(raw: string): Promise<{ pageContent: string; metadata: ICommonObject }[]> {
        const records = raw
            .split('\n')
            .map((line) => line.trim())
            .filter(Boolean)
            .map((line) => JSON.parse(line))
        const contentPointer = jsonpointer.compile(this.pointer)

        // Walk the entries (not just the values) so that several keys may share the
        // same pointer, and compile each pointer once instead of once per line.
        const metadataPointers = Object.entries(this.metadata ?? {})
            .filter((entry): entry is [string, string] => typeof entry[1] === 'string' && entry[1].startsWith('/'))
            .map(([key, path]) => ({ key, pointer: jsonpointer.compile(path) }))

        return records.map((record) => ({
            pageContent: contentPointer.get(record),
            metadata: Object.fromEntries(metadataPointers.map(({ key, pointer }) => [key, pointer.get(record)]))
        }))
    }
}
317+
170318
module.exports = { nodeClass: Jsonlines_DocumentLoaders }

0 commit comments

Comments
 (0)