-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathblacklist.ts
400 lines (347 loc) · 12.3 KB
/
blacklist.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
// @ts-ignore
import * as logg from './logtest';
/**
 * Fuzzy blacklist matcher.
 *
 * Text and blacklist words are first normalised (lower-cased, look-alike
 * characters folded to plain letters, everything that is not a letter or a
 * number blanked out, long character runs collapsed). Every window of the
 * normalised text whose length is between 50% and 150% of a word is then
 * compared against that word with the Levenshtein distance, and windows that
 * are close enough are reported.
 */
namespace findWord {
  /** One (possibly fuzzy) occurrence of a blacklist word in the normalised text. */
  interface wordPosition {
    /** The normalised blacklist word that was matched. */
    matchedWord: string;
    /** The substring of `checkedString` that matched. */
    rawMatch: string;
    /** Start offset of `rawMatch` inside `checkedString` (NOT the raw input). */
    index: number;
    /**
     * Levenshtein distance between `rawMatch` and `matchedWord`.
     * Despite the name, LOWER is better: 0 is an exact match.
     */
    probability: number;
    checkedString: string; // the filtered string
  }

  // obfuscating the word
  // const probabilityDecrease: { [key: string]: number } = {
  //   //notStart: 20, // bc -> abc
  //   //notEnd: 20, // ab -> abc
  //   charsMissing: 20, // abc -> ac
  //   charsInbetween: 10 // ac -> abc
  //   //charReplace: 5 // abc -> acc
  //   //swappedChars: 30, // ab -> ba
  // };

  // letter aliases
  const characters: { char: string; aliases: string[] }[] = [
    { char: 'a', aliases: 'âäàáåãªæ@4'.split('') },
    { char: 'b', aliases: '8'.split('') },
    { char: 'c', aliases: 'ç¢'.split('') },
    { char: 'e', aliases: 'éêëè€3'.split('') },
    { char: 'f', aliases: 'ƒ'.split('') },
    { char: 'i', aliases: 'ïîìíı!1'.split('') },
    { char: 'l', aliases: '£'.split('') },
    { char: 'n', aliases: 'ñ'.split('') },
    { char: 'o', aliases: 'ôöòóõø¤ðº0'.split('') },
    { char: 'p', aliases: '¶'.split('') },
    { char: 's', aliases: '$ߧ5'.split('') },
    { char: 't', aliases: '7'.split('') },
    { char: 'u', aliases: 'ûüùúµ'.split('') },
    { char: 'x', aliases: '×'.split('') },
    { char: 'y', aliases: 'ÿý¥'.split('') }
  ];

  // alias -> canonical letter, built once instead of scanning `characters`
  // for every single input character
  const aliasLookup = new Map<string, string>();
  for (const { char, aliases } of characters)
    for (const alias of aliases) aliasLookup.set(alias, char);

  /** A single character that survives normalisation: any letter or number. */
  const KEPT_CHARACTER = /^[\p{L}\p{N}]$/u;

  /** A window matches when its distance is at most this share of the word length. */
  // TODO, is 30% a good choice?
  const MAX_DISTANCE_RATIO = 0.3;
  /** Shortest / longest inspected window, relative to the word length. */
  const MIN_WINDOW_RATIO = 0.5;
  const MAX_WINDOW_RATIO = 1.5;

  /**
   * Searches `text` for fuzzy occurrences of the words in `wordlist`.
   *
   * @param text - Raw text to check; it is normalised with {@link normaliseText}.
   * @param wordlist - Blacklisted words; each is normalised and stripped of
   *   spaces. Words that are empty after normalisation are ignored.
   * @returns The surviving matches, grouped by word and, inside a group,
   *   ordered by ascending distance. `index` and `rawMatch` refer to the
   *   normalised text, which is attached to every match as `checkedString`.
   */
  export function checkForWord(
    text: string,
    wordlist: string[]
  ): wordPosition[] {
    const normalisedText = normaliseText(text);

    // Prototype-less dictionary: a blacklist word such as "constructor" must
    // not resolve to something inherited from Object.prototype.
    const foundWordsByName: { [word: string]: wordPosition[] } =
      Object.create(null);

    for (const rawWord of wordlist) {
      const word = normaliseText(rawWord).replace(/ /g, '');
      // an empty word would "match" the empty window at every index
      if (word.length === 0) continue;

      const minWindow = Math.floor(word.length * MIN_WINDOW_RATIO);
      const maxWindow = Math.ceil(word.length * MAX_WINDOW_RATIO);

      // go through each char of the text and try every window length
      // from 50% up to 150% of the word length starting there
      for (let i = 0; i < normalisedText.length; ++i) {
        for (
          let y = minWindow;
          y <= maxWindow && i + y <= normalisedText.length;
          ++y
        ) {
          const match = findWord(
            normalisedText.slice(i, i + y),
            word,
            i,
            normalisedText
          );
          if (match !== null) (foundWordsByName[word] ||= []).push(match);
        }
      }
    }

    // best (smallest distance) candidates first inside every group;
    // Array.prototype.sort is stable, so ties keep their discovery order
    for (const group of Object.values(foundWordsByName))
      group.sort((a, b) => a.probability - b.probability);

    // remove double findings of a word: among overlapping candidates of the
    // same word only the ones with the smallest distance survive
    return removeDuplicates(foundWordsByName);
  }

  /**
   * Compares one window of the text against one word.
   *
   * @param text - The window (substring of the normalised text).
   * @param word - The normalised blacklist word.
   * @param index - Offset of the window inside the normalised text.
   * @param checkedString - The full normalised text the window was cut from;
   *   defaults to the window itself.
   * @returns The match, or `null` when the window is too far from the word.
   */
  function findWord(
    text: string,
    word: string,
    index: number,
    checkedString: string = text
  ): wordPosition | null {
    const distance = levenshtein(text, word);
    if (distance > word.length * MAX_DISTANCE_RATIO) return null;
    return {
      matchedWord: word,
      rawMatch: text,
      index,
      probability: distance,
      checkedString
    };
  }

  /**
   * Levenshtein (edit) distance between two strings: the minimal number of
   * single-character insertions, deletions and substitutions that turn one
   * into the other. Uses a single rolling row sized after the shorter string.
   */
  function levenshtein(first: string, second: string): number {
    if (first === second) return 0; // is the same

    const [shorter, longer] =
      first.length <= second.length ? [first, second] : [second, first];

    // row[i] = distance between the first i chars of `shorter` and the
    // prefix of `longer` processed so far (initially the empty prefix)
    const row: number[] = Array.from(
      { length: shorter.length + 1 },
      (_, i) => i
    );

    for (let j = 1; j <= longer.length; ++j) {
      const longerCode = longer.charCodeAt(j - 1);
      let diagonal = row[0]; // value of the previous row at i - 1
      row[0] = j;
      for (let i = 1; i <= shorter.length; ++i) {
        const above = row[i]; // value of the previous row at i
        const substitution =
          diagonal + (shorter.charCodeAt(i - 1) === longerCode ? 0 : 1);
        row[i] = Math.min(above + 1, row[i - 1] + 1, substitution);
        diagonal = above;
      }
    }
    return row[shorter.length];
  }

  /**
   * Brings text into the canonical form used for matching.
   *
   * Steps: lower-case, replace look-alike characters (see `characters`) by
   * their plain letter, replace every character that is neither a letter nor
   * a number by a space, and shorten runs of three or more identical
   * characters to two.
   *
   * @param text - Arbitrary input text.
   * @returns The normalised text.
   */
  export function normaliseText(text: string): string {
    // Array.from iterates by code point, so letters outside the BMP are
    // tested as a whole instead of as two lone surrogates
    return Array.from(text.toLowerCase(), (char) => {
      const canonical = aliasLookup.get(char) ?? char; // replace aliases
      return KEPT_CHARACTER.test(canonical) ? canonical : ' '; // only letters and numbers
    })
      .join('')
      .replace(/(.)\1\1+/g, '$1$1'); // "aaa+" => "aa", up to 2 repeating characters
  }

  /**
   * Two matches overlap when their index ranges intersect. The end offset is
   * treated as inclusive, so ranges that merely touch count as overlapping.
   */
  function overlaps(a: wordPosition, b: wordPosition): boolean {
    return (
      a.index <= b.index + b.rawMatch.length &&
      b.index <= a.index + a.rawMatch.length
    );
  }

  /**
   * Drops every candidate that overlaps another candidate of the same word
   * which has a strictly smaller distance. Candidates that tie are all kept
   * (case: "tesl" -> "tes" is one off, but "tesl" as well).
   *
   * @param foundWordsByName - Candidates grouped by matched word.
   * @returns The surviving candidates of all groups, in group order.
   */
  function removeDuplicates(foundWordsByName: {
    [word: string]: wordPosition[];
  }): wordPosition[] {
    const results: wordPosition[] = [];
    for (const candidates of Object.values(foundWordsByName)) {
      for (let i = 0; i < candidates.length; ++i) {
        const current = candidates[i];
        const beaten = candidates.some(
          (other, j) =>
            j !== i && // never compare a candidate with itself
            overlaps(current, other) && // same region of the text
            current.probability > other.probability // and the other one is closer
        );
        if (!beaten) results.push(current);
      }
    }
    return results;
  }
}
// TODO: fix the handling of "acbd", "abbcd", "zbcd", "abcz", "abcbd", "aabbccdd".
// TODO: test it with the Rust message type style!
//' testtest this lest is a tesl sentence with b@ddww0rd t€st',
// ['badword', 'test'];
// 'abcd zabcd abcdz acd abzcd abzd acbd abcdabcd bacd zbcd abcz aabcd abcdd abbcd aabbccdd aaabbbcccddd aaabcccd abcbd | accd'
//console.log(
// Demo run: checks a sample sentence for the word "abcd" and logs every
// match, ordered by its position in the text, through the project logger.
{
  const sampleText =
    'abcd zabcd abcdz acd abzcd abzd acbd abcdabcd bacd zbcd abcz aabcd abcdd abbcd aabbccdd aaabbbcccddd aaabcccd abcbd | accd';

  const matchesByPosition = findWord.checkForWord(sampleText, ['abcd']);
  matchesByPosition.sort((left, right) => left.index - right.index);

  for (const res of matchesByPosition) {
    // 4 is the length of the searched word "abcd"
    const likelihood = (4 - res.probability) / 4;
    const marker = {
      index: res.index,
      length: res.rawMatch.length,
      markColor: 1,
      message: `Matched the word "abcd" with the substring "${res.rawMatch}". The probability of a correct match is ${likelihood}`,
      infoCode: 'none',
      infoType: 'warning',
      infoDescription: ''
    };
    logg.logger.logInfo(
      { author: '', fileName: '' },
      findWord.normaliseText(sampleText),
      [marker],
      true
    );
  }
}
//);
//for (const foundWord of foundWords)
// if (!Object.keys(foundWordsByName).includes(foundWord.matchedWord))
// foundWordsByName[foundWord.matchedWord] = [foundWord]; // add this word
// else foundWordsByName[foundWord.matchedWord].push(foundWord); // push this word as it exists already
// sort the groups by probability, NOT NEEDED
// for (const word of Object.keys(foundWordsByName))
// foundWordsByName[word] = foundWordsByName[word].sort(
// (w1, w2) => w2.probability - w1.probability
// );
// TODO DEBUG
// for (const key in foundWordsByName)
// for (let i = 0; i < foundWordsByName[key].length; ++i)
// if (foundWordsByName[key][i].probability === 70) {
// foundWordsByName[key][i].index = -1;
// foundWordsByName[key][i].length = -1;
// }
/*
function findWord(
text: string,
index: number,
word: string
): wordPosition | null {
const originalText: string = text;
text = text.slice(index);
let found: boolean = false;
let probabilty: number = 100;
let foundIndex: number = -1;
let wordIndex: number = 0;
for (let i = 0; i < text.length; ++i) {
const char: string = text[i];
// TODO both at the same time!
// charsBetween: ac -> abc
// charsMissing: 20; // abc -> ac
if (char === word[wordIndex]) {
// increase to next char in the word
if (++wordIndex >= word.length) {
//console.log(index, i, i + index + 1, char);
found = true;
foundIndex = i + index + 1; // index is -1 cause length...
break;
}
} else if (text[i + 1] === word[wordIndex]) {
// probably a char inbetween
probabilty -= probabilityDecrease.charsInbetween;
} else if (
word.length !== wordIndex + 1 &&
char === word[wordIndex + 1]
) {
// probably a char missing
probabilty -= probabilityDecrease.charsMissing;
// // go to the next char
// if (++wordIndex >= word.length) {
// found = true;
// foundIndex = i + index + 1; // index is -1 cause length...
// break;
// }
} else {
}
}
if (found === true)
return {
matchedWord: word,
index: index,
length: foundIndex - index,
probability: probabilty,
rawMatch: originalText.slice(index, foundIndex)
};
else return null;
}*/