Trying to Create BM25 Embeddings/Semantic Search - Error: prepareInput(text, "search").filter is not a function #116
-
Please forgive me, I've been hammering at this code for a few hours and I'm stuck. My goal is to take a passage of text, split it into chunks of ~3 sentences each, add the chunks to the BM25 vectorizer, and then find the index of the best-matching chunk using semantic search. The error I am seeing is:
My code is below. It's a series of functions. One breaks a passage into chunks. The next feeds the chunks to the BM25 engine. The last one performs the search. The line of code throwing the error is:
And here is how I use these functions:
Hopefully it is something simple I am overlooking. Thank you for your time. 🙏 |
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 1 reply
-
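Hello @Chaddeus It seems the code has not followed the required workflow – the text preparation tasks must be defined with `definePrepTasks()` so that the engine receives an array of tokens rather than a raw string. Here is the revised code in JS for your reference: // Load wink-bm25-text-search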
var bm25 = require( 'wink-bm25-text-search' );
// Create the search engine instance; createEmbeddings() and semanticSearch() below both operate on it
var engine = bm25();
// Load wink-nlp and its language model
const winkNLP = require( 'wink-nlp' );
// Use the lite English web model
const model = require( 'wink-eng-lite-web-model' );
// Instantiate the NLP pipeline with the loaded model
const nlp = winkNLP( model );
// Shortcut to the `its` helpers that are passed to out() (e.g. its.type, its.stem)
const its = nlp.its;
/**
 * Splits a passage into chunks of consecutive sentences.
 *
 * Sentences are taken from `nlp.readDoc(text).sentences()`. Each sentence is
 * rebuilt from its tokens, every token being prefixed with the whitespace
 * that preceded it in the passage, so the original spacing is retained.
 *
 * @param {string} text - Passage to split.
 * @param {number} [sentencesPerChunk=3] - Maximum number of sentences per
 *   chunk; the final chunk may contain fewer.
 * @returns {string[]} Chunks in document order; the sentences inside a chunk
 *   are joined with a single space.
 * @throws {RangeError} If `sentencesPerChunk` is not a positive integer.
 */
function createChunks(text, sentencesPerChunk = 3) {
  // A step of 0 or NaN would keep the chunking loop below from ever advancing.
  if (!Number.isInteger(sentencesPerChunk) || sentencesPerChunk < 1) {
    throw new RangeError('sentencesPerChunk must be a positive integer');
  }

  const sentences = [];
  nlp.readDoc(text).sentences().each((s) => {
    const parts = [];
    // Emit "preceding whitespace + token text" pairs so that join('') reproduces the sentence.
    s.tokens().each((t) => parts.push(t.out(its.precedingSpaces), t.out()));
    sentences.push(parts.join(''));
  });

  const chunks = [];
  for (let i = 0; i < sentences.length; i += sentencesPerChunk) {
    chunks.push(sentences.slice(i, i + sentencesPerChunk).join(' '));
  }
  return chunks;
}
/**
 * Text preparation task: converts raw text into an array of stems.
 *
 * Only tokens whose type is 'word' and that are not flagged as stop words
 * are kept. A kept token flagged as negated has '!' prepended to its stem.
 *
 * @param {string} text - Raw text to prepare.
 * @returns {string[]} Stems of the retained tokens, in token order.
 */
const prepTask = function ( text ) {
  const stems = [];
  // Keep words only (no punctuation etc.) and drop stop words.
  const isContentWord = (tok) => tok.out(its.type) === 'word' && !tok.out(its.stopWordFlag);

  nlp.readDoc(text)
    .tokens()
    .filter(isContentWord)
    .each((tok) => {
      const stem = tok.out(its.stem);
      // Mark negated words so that they are indexed as distinct terms.
      stems.push(tok.out(its.negationFlag) ? '!' + stem : stem);
    });

  return stems;
};
/**
 * Configures the shared `engine`, adds every chunk to it and consolidates.
 *
 * Each chunk is added as a document with a single `text` field, using the
 * chunk's position in `chunks` as the document id.
 *
 * @param {string[]} chunks - Text chunks to add to the engine.
 * @returns {void}
 */
function createEmbeddings(chunks) {
  // Single field named `text` with weight 1.
  engine.defineConfig({ fldWeights: { text: 1 } });
  engine.definePrepTasks([prepTask]);

  const indexChunk = (text, id) => {
    engine.addDoc({ text }, id);
  };
  chunks.forEach(indexChunk);

  // Consolidate the learnings once all chunks have been added.
  engine.consolidate();
}
/**
 * Runs `query` against the shared `engine` and returns its first result.
 *
 * @param {string} query - Search text.
 * @returns {*} The first entry of `engine.search(query)`, or `null` when the
 *   search returns no entries. NOTE(review): presumably an `[id, score]`
 *   pair with the best match first; confirm against the engine's docs.
 */
function semanticSearch(query) {
  const hits = engine.search(query);
  return hits.length > 0 ? hits[0] : null;
}
// Sample passage containing abbreviations, quotations and parentheses.
const text = `Sen. Edward Kennedy (D., Mass.) said, "It's a bottom-line issue".
The Nasdaq 100 rose 7.08 to 445.23. (Are parenthesis part of a sentence?)
"This is a quoted... sentence." "(This is a quoted sentence within parenthesis.)"
('Like the previous one!') AI Inc. is focussing on AI. I work for AI Inc.
My mail is [email protected]! U.S.A is my birth place.`;

// Pass the chunks through a Set so that duplicate chunks are removed.
const theChunks = Array.from(new Set(createChunks(text)));
createEmbeddings(theChunks);

const result = semanticSearch('quoted sentence');
// result[0] is used as the index of the matching chunk in theChunks.
if (result !== null) {
  console.log(theChunks[result[0]]);
}
// --> " AI Inc. is focussing on AI. I work for AI Inc. \nMy mail is [email protected]!"
Beta Was this translation helpful? Give feedback.
Hello @Chaddeus
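It seems the code has not followed the required workflow – the text preparation tasks must be defined with `definePrepTasks()` so that the engine receives an array of tokens rather than a raw string. Here is the revised code in JS for your reference: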