Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
138b810
add piscina worker
neptunian Jul 8, 2025
ab0061f
remove log
neptunian Jul 8, 2025
be5e0c4
add tests
neptunian Jul 8, 2025
880feaa
setup_node_env path
neptunian Jul 9, 2025
a6d8a1a
fix jest tests
neptunian Jul 9, 2025
f4a0f99
Merge branch 'main' into anonymization-regex-worker
neptunian Jul 9, 2025
330d36a
fix setup_node_env path
neptunian Jul 9, 2025
eb58584
Merge branch 'anonymization-regex-worker' of https://github.com/neptu…
neptunian Jul 9, 2025
97adb50
remove tsconfig path
neptunian Jul 9, 2025
bcbc382
remove unused tracing related code
neptunian Jul 9, 2025
f2e5572
remove setup_node_env
neptunian Jul 9, 2025
a67c8e0
update comment
neptunian Jul 9, 2025
2513288
change to anonymizationRegexWorker
neptunian Jul 9, 2025
1da8481
add back setup_node_env with diff paths
neptunian Jul 9, 2025
58e28fa
change config naming
neptunian Jul 10, 2025
3bc8a97
Merge branch 'main' into anonymization-regex-worker
neptunian Jul 10, 2025
5cab80a
update test
neptunian Jul 10, 2025
1fc7035
Merge branch 'anonymization-regex-worker' of https://github.com/neptu…
neptunian Jul 10, 2025
e340e12
Merge branch 'main' into anonymization-regex-worker
neptunian Jul 10, 2025
f39bc7b
add skipCloud to anonymization tests due to llm proxy
neptunian Jul 11, 2025
5959304
Merge branch 'anonymization-regex-worker' of https://github.com/neptu…
neptunian Jul 11, 2025
fe3fabb
try same path setup_node_env
neptunian Jul 11, 2025
f851324
fix path
neptunian Jul 11, 2025
dcf01c8
try direct path
neptunian Jul 11, 2025
b9ba0e0
try REPO_ROOT
neptunian Jul 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -1299,6 +1299,7 @@
"papaparse": "^5.5.3",
"pbf": "3.2.1",
"pdfmake": "^0.2.15",
"piscina": "^3.2.0",
"polished": "^4.3.1",
"pretty-ms": "6.0.0",
"prop-types": "^15.8.1",
Expand Down Expand Up @@ -1922,7 +1923,6 @@
"peggy": "^4.2.0",
"picomatch": "^4.0.2",
"pirates": "^4.0.7",
"piscina": "^3.2.0",
"pixelmatch": "^5.3.0",
"playwright": "1.53.1",
"pngjs": "^7.0.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ export {
type DeanonymizationOutput,
type DeanonymizedMessage,
type AnonymizationSettings,
type AnonymizationRegexWorkerTaskPayload,
} from './src/chat_complete';

export type { BoundInferenceClient, InferenceClient } from './src/inference_client';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ export type {
RegexAnonymizationRule,
NamedEntityRecognitionRule,
AnonymizationSettings,
AnonymizationRegexWorkerTaskPayload,
} from './types';
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,7 @@ export interface DeanonymizationOutput {
}

export type DeanonymizedMessage = Message & { deanonymizations: Deanonymization[] };
/**
 * Input for a single regex anonymization task: one rule together with the
 * records it should be applied to.
 *
 * NOTE(review): this looks like the argument shape of the regex worker's
 * `run({ rule, records })` call — confirm against the worker implementation.
 */
export interface AnonymizationRegexWorkerTaskPayload {
/** The regex anonymization rule to execute. */
rule: RegexAnonymizationRule;
/** The records to process; each record maps string keys to string values. */
records: Array<Record<string, string>>;
}
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,5 @@ export type {
RegexAnonymizationRule,
NamedEntityRecognitionRule,
AnonymizationSettings,
AnonymizationRegexWorkerTaskPayload,
} from './anonymization';
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/

import { MlInferenceResponseResult } from '@elastic/elasticsearch/lib/api/types';
import { loggerMock, type MockedLogger } from '@kbn/logging-mocks';
import { anonymizeMessages } from './anonymize_messages';
import {
AnonymizationRule,
Expand All @@ -16,16 +17,23 @@ import {
} from '@kbn/inference-common';
import { messageToAnonymizationRecords } from './message_to_anonymization_records';
import { getEntityMask } from './get_entity_mask';

import { RegexWorkerService } from './regex_worker_service';
import { AnonymizationWorkerConfig } from '../../config';
const mockEsClient = {
ml: {
inferTrainedModel: jest.fn(),
},
} as any;

const testConfig = {
enabled: false,
} as AnonymizationWorkerConfig;
describe('anonymizeMessages', () => {
let logger: MockedLogger;
let regexWorker: RegexWorkerService;
beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
regexWorker = new RegexWorkerService(testConfig, logger);
});

const setupMockResponse = (entities: MlInferenceResponseResult[]) => {
Expand Down Expand Up @@ -88,6 +96,7 @@ describe('anonymizeMessages', () => {
const result = await anonymizeMessages({
messages,
anonymizationRules: [nerRule],
regexWorker,
esClient: mockEsClient,
});

Expand Down Expand Up @@ -126,6 +135,7 @@ describe('anonymizeMessages', () => {
anonymizeMessages({
messages,
anonymizationRules: [nerRule],
regexWorker,
esClient: mockEsClient,
})
).resolves.toBeDefined();
Expand All @@ -137,6 +147,7 @@ describe('anonymizeMessages', () => {
const result = await anonymizeMessages({
messages,
anonymizationRules: [disabledRule],
regexWorker,
esClient: mockEsClient,
});

Expand All @@ -154,6 +165,7 @@ describe('anonymizeMessages', () => {
const result = await anonymizeMessages({
messages,
anonymizationRules: [disabledRule],
regexWorker,
esClient: mockEsClient,
});

Expand Down Expand Up @@ -181,6 +193,7 @@ describe('anonymizeMessages', () => {
const result = await anonymizeMessages({
messages,
anonymizationRules: [regexRule],
regexWorker,
esClient: mockEsClient,
});

Expand Down Expand Up @@ -251,6 +264,7 @@ describe('anonymizeMessages', () => {

const result = await anonymizeMessages({
messages,
regexWorker,
anonymizationRules: [nerRule],
esClient: mockEsClient,
});
Expand Down Expand Up @@ -295,6 +309,7 @@ describe('anonymizeMessages', () => {
system: systemPrompt,
messages: [],
anonymizationRules: [nerRule],
regexWorker,
esClient: mockEsClient,
});
expect(result.system).toBe(
Expand Down Expand Up @@ -349,6 +364,7 @@ describe('anonymizeMessages', () => {
},
],
anonymizationRules: [nerRule], // nerRule allows only PER
regexWorker,
esClient: mockEsClient,
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,19 @@ import { merge } from 'lodash';
import { anonymizeRecords } from './anonymize_records';
import { messageFromAnonymizationRecords } from './message_from_anonymization_records';
import { messageToAnonymizationRecords } from './message_to_anonymization_records';
import { RegexWorkerService } from './regex_worker_service';

export async function anonymizeMessages({
system,
messages,
anonymizationRules,
regexWorker,
esClient,
}: {
system?: string | undefined;
messages: Message[];
anonymizationRules: AnonymizationRule[];
regexWorker: RegexWorkerService;
esClient: ElasticsearchClient;
}): Promise<AnonymizationOutput> {
const rules = anonymizationRules.filter((rule) => rule.enabled);
Expand All @@ -41,6 +44,7 @@ export async function anonymizeMessages({
const { records, anonymizations } = await anonymizeRecords({
input: toAnonymize,
anonymizationRules: rules,
regexWorker,
esClient,
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
import { anonymizeRecords } from './anonymize_records';
import { AnonymizationRule } from '@kbn/inference-common';
import { MlInferenceResponseResult } from '@elastic/elasticsearch/lib/api/types';

import { loggerMock, type MockedLogger } from '@kbn/logging-mocks';
import { RegexWorkerService } from './regex_worker_service';
import { AnonymizationWorkerConfig } from '../../config';
const mockEsClient = {
ml: {
inferTrainedModel: jest.fn(),
Expand All @@ -20,27 +22,36 @@ const setupMockResponse = (entitiesPerDoc: MlInferenceResponseResult[]) => {
inference_results: entitiesPerDoc,
});
};
const nerRule: AnonymizationRule = {
type: 'NER',
enabled: true,
modelId: 'model-1',
};
const nerRule2: AnonymizationRule = {
type: 'NER',
enabled: true,
modelId: 'model-2',
};
const regexRule: AnonymizationRule = {
type: 'RegExp',
enabled: true,
entityClass: 'EMAIL',
pattern: '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}',
};

const testConfig = {
enabled: false,
} as AnonymizationWorkerConfig;

describe('anonymizeRecords', () => {
const nerRule: AnonymizationRule = {
type: 'NER',
enabled: true,
modelId: 'model-1',
};
const nerRule2: AnonymizationRule = {
type: 'NER',
enabled: true,
modelId: 'model-2',
};
const regexRule: AnonymizationRule = {
type: 'RegExp',
enabled: true,
entityClass: 'EMAIL',
pattern: '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}',
};
let logger: MockedLogger;

let regexWorker: RegexWorkerService;

beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
regexWorker = new RegexWorkerService(testConfig, logger);
});

it('masks values using regex rule', async () => {
Expand All @@ -49,6 +60,7 @@ describe('anonymizeRecords', () => {
const { records, anonymizations } = await anonymizeRecords({
input,
anonymizationRules: [regexRule],
regexWorker,
esClient: mockEsClient,
});

Expand All @@ -63,6 +75,7 @@ describe('anonymizeRecords', () => {
await anonymizeRecords({
input: [{ content: shortText }],
anonymizationRules: [nerRule],
regexWorker,
esClient: mockEsClient,
});

Expand All @@ -80,6 +93,7 @@ describe('anonymizeRecords', () => {
const { records } = await anonymizeRecords({
input: [{ content: longText }],
anonymizationRules: [nerRule],
regexWorker,
esClient: mockEsClient,
});

Expand Down Expand Up @@ -134,6 +148,7 @@ describe('anonymizeRecords', () => {
const { records, anonymizations } = await anonymizeRecords({
input,
anonymizationRules: [nerRule, nerRule2],
regexWorker,
esClient: mockEsClient,
});

Expand All @@ -156,6 +171,7 @@ describe('anonymizeRecords', () => {
const result = await anonymizeRecords({
input,
anonymizationRules: [regexRule],
regexWorker,
esClient: mockEsClient,
});

Expand Down Expand Up @@ -190,6 +206,7 @@ describe('anonymizeRecords', () => {
const result = await anonymizeRecords({
input,
anonymizationRules: [nerRule],
regexWorker,
esClient: mockEsClient,
});

Expand Down Expand Up @@ -242,6 +259,7 @@ describe('anonymizeRecords', () => {
const result = await anonymizeRecords({
input,
anonymizationRules: [nerRule, nerRule2],
regexWorker,
esClient: mockEsClient,
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,29 @@ import { partition } from 'lodash';
import { AnonymizationState } from './types';
import { executeRegexRule } from './execute_regex_rule';
import { executeNerRule } from './execute_ner_rule';
import { RegexWorkerService } from './regex_worker_service';

export async function anonymizeRecords<T extends Record<string, string | undefined>>({
input,
anonymizationRules,
regexWorker,
esClient,
}: {
input: T[];
anonymizationRules: AnonymizationRule[];
regexWorker: RegexWorkerService;
esClient: ElasticsearchClient;
}): Promise<AnonymizationState>;

export async function anonymizeRecords({
input,
anonymizationRules,
regexWorker,
esClient,
}: {
input: Array<Record<string, string>>;
anonymizationRules: AnonymizationRule[];
regexWorker: RegexWorkerService;
esClient: ElasticsearchClient;
}): Promise<AnonymizationState> {
let state: AnonymizationState = {
Expand All @@ -42,9 +47,10 @@ export async function anonymizeRecords({
);

for (const rule of regexRules) {
state = executeRegexRule({
state = await executeRegexRule({
rule,
state,
regexWorker,
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,41 +5,26 @@
* 2.0.
*/

import { Anonymization, RegexAnonymizationRule } from '@kbn/inference-common';
import { RegexAnonymizationRule } from '@kbn/inference-common';
import { AnonymizationState } from './types';
import { getEntityMask } from './get_entity_mask';
import { RegexWorkerService } from './regex_worker_service';

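/**
* Executes a regex anonymization rule by passing the rule and the current
* records to the regex worker, then returning the worker's records together
* with its anonymizations appended to those already present in the state.
*/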
export function executeRegexRule({
export async function executeRegexRule({
state,
rule,
regexWorker,
}: {
state: AnonymizationState;
rule: RegexAnonymizationRule;
}): AnonymizationState {
const regex = new RegExp(rule.pattern, 'g');

const anonymizations: Anonymization[] = state.anonymizations.concat();

const nextRecords = state.records.map((record) => {
const newRecord: Record<string, string> = {};
for (const [key, value] of Object.entries(record)) {
newRecord[key] = value.replace(regex, (match) => {
const mask = getEntityMask({ value: match, class_name: rule.entityClass });

anonymizations.push({
entity: { value: match, class_name: rule.entityClass, mask },
rule: { type: rule.type },
});

return mask;
});
}
return newRecord;
regexWorker: RegexWorkerService;
}): Promise<AnonymizationState> {
const { records, anonymizations } = await regexWorker.run({
rule,
records: state.records,
});

return { records: nextRecords, anonymizations };
return { records, anonymizations: state.anonymizations.concat(anonymizations) };
}
Loading