oramasearch · micheleriva · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/packages/orama/src/components/index.ts b/packages/orama/src/components/index.ts
@@ -233,7 +233,7 @@ function insertScalarBuilder(
         break
       }
       case 'Radix': {
-        const tokens = tokenizer.tokenize(value as string, language, prop)
+        const tokens = tokenizer.tokenize(value as string, language, prop, false)
         implementation.insertDocumentScoreParameters(index, prop, internalId, tokens, docsCount)
 
         for (const token of tokens) {

diff --git a/packages/orama/src/components/tokenizer/index.ts b/packages/orama/src/components/tokenizer/index.ts
@@ -13,19 +13,21 @@ export interface DefaultTokenizer extends Tokenizer {
   stopWords?: string[]
   allowDuplicates: boolean
   normalizationCache: Map<string, string>
-  normalizeToken(this: DefaultTokenizer, token: string, prop: Optional<string>): string
+  normalizeToken(this: DefaultTokenizer, prop: Optional<string>, token: string, withCache:Optional<boolean>): string
 }
 
-export function normalizeToken(this: DefaultTokenizer, prop: string, token: string): string {
+export function normalizeToken(this: DefaultTokenizer, prop: string, token: string, withCache: boolean = true): string {
   const key = `${this.language}:${prop}:${token}`
 
-  if (this.normalizationCache.has(key)) {
+  if (withCache && this.normalizationCache.has(key)) {
     return this.normalizationCache.get(key)!
   }
 
   // Remove stopwords if enabled
   if (this.stopWords?.includes(token)) {
-    this.normalizationCache.set(key, '')
+    if (withCache) {
+      this.normalizationCache.set(key, '')
+    }
     return ''
   }
 
@@ -35,7 +37,9 @@ export function normalizeToken(this: DefaultTokenizer, prop: string, token: stri
   }
 
   token = replaceDiacritics(token)
-  this.normalizationCache.set(key, token)
+  if (withCache) {
+    this.normalizationCache.set(key, token)
+  }
   return token
 }
 
@@ -50,7 +54,7 @@ function trim(text: string[]): string[] {
   return text
 }
 
-function tokenize(this: DefaultTokenizer, input: string, language?: string, prop?: string): string[] {
+function tokenize(this: DefaultTokenizer, input: string, language?: string, prop?: string, withCache: boolean = true): string[] {
   if (language && language !== this.language) {
     throw createError('LANGUAGE_NOT_SUPPORTED', language)
   }
@@ -60,15 +64,16 @@ function tokenize(this: DefaultTokenizer, input: string, language?: string, prop
     return [input]
   }
 
+  const normalizeToken = this.normalizeToken.bind(this, prop ?? '')
   let tokens: string[]
   if (prop && this.tokenizeSkipProperties.has(prop)) {
-    tokens = [this.normalizeToken.bind(this, prop ?? '')(input)]
+    tokens = [normalizeToken(input, withCache)]
   } else {
     const splitRule = SPLITTERS[this.language]
     tokens = input
       .toLowerCase()
       .split(splitRule)
-      .map(this.normalizeToken.bind(this, prop ?? ''))
+      .map(t => normalizeToken(t, withCache))
       .filter(Boolean)
   }
 

diff --git a/packages/orama/src/types.ts b/packages/orama/src/types.ts
@@ -1063,7 +1063,7 @@ export type DefaultTokenizerConfig = {
 export interface Tokenizer {
   language: string
   normalizationCache: Map<string, string>
-  tokenize: (raw: string, language?: string, prop?: string) => string[]
+  tokenize: (raw: string, language?: string, prop?: string, withCache?: boolean) => string[]
 }
 
 export interface ObjectComponents<I, D, So> {

diff --git a/packages/orama/tests/insert.test.ts b/packages/orama/tests/insert.test.ts
@@ -549,6 +549,21 @@ t.test('insertMultiple method', async (t) => {
   })
 })
 
+t.test('insert shouldn\'t use tokenizer cache', async (t) => {
+  const db = await create({
+    schema: {
+      name: 'string'
+    } as const
+  })
+
+  await insert(db, {
+    name: 'The quick brown fox jumps over the lazy dog'
+  })
+
+  // Insert tokenizes with withCache = false, so the normalization cache must stay empty
+  t.strictSame(db.tokenizer.normalizationCache, new Map())
+});
+
 interface BaseDataEvent extends AnyDocument {
   description: string
   lang: string