Skip to content

Commit

Permalink
fix(transliterate): distinguish lj and ĺj
Browse files Browse the repository at this point in the history
  • Loading branch information
noomorph committed Dec 14, 2023
1 parent 7ef7922 commit 803a94b
Show file tree
Hide file tree
Showing 14 changed files with 219 additions and 1,675 deletions.
54 changes: 0 additions & 54 deletions scripts/generate-nj-suite.mjs

This file was deleted.

113 changes: 113 additions & 0 deletions scripts/generate-rule-exceptions.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env node

import fs from "node:fs";
import utils from '../dist/index.js';

/**
 * Lazily yields, in order of appearance, every maximal run of Unicode
 * letters and combining marks found in `str`.
 *
 * @param {string} str text to scan
 * @returns {Generator<string>} the extracted words
 */
function* extractWords(str) {
  // \p{L} matches letters, \p{M} matches combining marks
  for (const [, word] of str.matchAll(/([\p{L}\p{M}]+)/gu)) {
    yield word;
  }
}

/**
 * Reads the file at `filePath` as UTF-8 and yields every word that
 * `extractWords` finds in its contents.
 *
 * The file is read synchronously, in full, when the generator is first advanced.
 *
 * @param {string} filePath path of the file to scan
 */
function* extractWordsFromFile(filePath) {
  const contents = fs.readFileSync(filePath, 'utf8');
  for (const word of extractWords(contents)) {
    yield word;
  }
}

/**
 * Yields the words of every corpus file, one file after another, in the
 * order listed below. Paths are relative to the current working directory.
 */
function* allWords() {
  const corpusFiles = [
    'src/adjective/testCases.json',
    'src/noun/__snapshots__/declensionNoun.test.ts.snap',
    'src/numeral/testCases.json',
    'src/pronoun/testCases.json',
    'src/verb/testCases.json',
  ];

  for (const corpusFile of corpusFiles) {
    yield* extractWordsFromFile(corpusFile);
  }
}

/**
 * Collects the corpus words accepted by `predicate` and returns them
 * lowercased, transliterated to 'art-Latn-x-interslv', de-duplicated and
 * sorted with the default `Array.prototype.sort` ordering.
 *
 * The predicate sees each word exactly as it appears in the corpus, i.e.
 * before lowercasing and transliteration.
 *
 * @param {(word: string) => boolean} predicate selects the words to keep
 * @returns {string[]} the sorted list of unique transliterated words
 */
function buildExceptionList(predicate) {
  const unique = new Set();

  for (const word of allWords()) {
    if (!predicate(word)) {
      continue;
    }

    const lowered = word.toLowerCase();
    unique.add(utils.transliterate(lowered, 'art-Latn-x-interslv'));
  }

  return Array.from(unique).sort();
}

/**
 * Wraps a word in '%' boundary markers (one before, one after) so that the
 * trie can tell where the word starts and ends.
 *
 * @param {string} word
 * @returns {string}
 */
function toTrieToken(word) {
  return `%${word}%`;
}

/**
 * Builds a suffix trie from `tokens` and serializes it as a single line of
 * JSON followed by a newline.
 *
 * Every token is inserted back to front, one UTF-16 code unit per level
 * (the same unit that `String.prototype.split("")` produces). The first
 * character of a token is stored as a leaf with value `0`. When one token
 * is a proper suffix of another, the node where the shorter token ends is
 * an object and the end of the shorter token is recorded on it under the
 * `$` key, independently of the order in which the two tokens are inserted.
 *
 * @param {string[]} tokens
 * @returns {string} JSON text of the trie, terminated by '\n'
 */
function buildSuffixTrie(tokens) {
  const trie = {};

  for (const token of tokens) {
    const letters = token.split("").reverse();
    const lastIndex = letters.length - 1;
    let current = trie;

    letters.forEach((letter, index) => {
      const isLast = index === lastIndex;
      const position = current[letter];

      if (position == null) {
        // Unseen path: a leaf (0) ends the token, an empty node continues it.
        current = current[letter] = isLast ? 0 : {};
      } else if (position === 0) {
        // A shorter token already ends here. Only expand the leaf when this
        // token continues past it; a duplicate token leaves the leaf intact.
        if (!isLast) {
          current = current[letter] = { $: 0 };
        }
      } else {
        // A longer token already passes through this node. If the current
        // token ends here, remember that, otherwise it would be lost.
        if (isLast) {
          position.$ = 0;
        }
        current = position;
      }
    });
  }

  return JSON.stringify(trie) + '\n';
}

/**
 * Produces the serialized suffix trie of all corpus words accepted by
 * `predicate`: the words are collected with `buildExceptionList`, wrapped
 * with `toTrieToken` and fed to `buildSuffixTrie`.
 *
 * @param {(word: string) => boolean} predicate selects the exception words
 * @returns {string} the JSON text returned by `buildSuffixTrie`
 */
function generateRuleExceptions(predicate) {
  const exceptionWords = buildExceptionList(predicate);
  const tokens = exceptionWords.map(toTrieToken);
  return buildSuffixTrie(tokens);
}

/**
 * Tells whether `word` contains the letter pair 'ľj' anywhere.
 *
 * @param {string} word
 * @returns {boolean}
 */
function containsLjj(word) {
  return word.indexOf('ľj') !== -1;
}

/**
 * Tells whether `word` ends with one of the 'nj' + vowel endings listed
 * below.
 *
 * @param {string} word
 * @returns {boolean}
 */
function endsWithNj(word) {
  const endings = ['nja', 'njah', 'njam', 'njami', 'nje', 'njem', 'nju'];
  return endings.some((ending) => word.endsWith(ending));
}

// Generated trie files, written in the order listed. The content of each
// file is computed right before that file is written.
const generatedFiles = [
  [
    'src/transliterate/lj-nj/exceptions-lj.json',
    () => generateRuleExceptions(containsLjj),
  ],
  [
    'src/transliterate/lj-nj/exceptions-nj.json',
    () => generateRuleExceptions(endsWithNj),
  ],
  [
    'src/transliterate/lj-nj/endings-nje.json',
    // Fixed list of endings; only a trailing '%' boundary marker is used here.
    () =>
      buildSuffixTrie([
        'nja%',
        'njah%',
        'njam%',
        'njami%',
        'nje%',
        'njem%',
        'nju%',
      ]),
  ],
];

for (const [outputPath, render] of generatedFiles) {
  fs.writeFileSync(outputPath, render());
}
19 changes: 6 additions & 13 deletions src/transliterate/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,13 @@ describe('transliterate to', () => {
},
);

test.failing(
'double transliteration should work equally from Latin and Cyrillic scripts',
() => {
const latn2cyrl = transliterate(latin, 'art-Cyrl-x-interslv');
const cyrl2latn = transliterate(cyrillic, 'art-Latn-x-interslv');
test('double transliteration should work equally from Latin and Cyrillic scripts', () => {
const latn2cyrl = transliterate(latin, 'art-Cyrl-x-interslv');
const cyrl2latn = transliterate(cyrillic, 'art-Latn-x-interslv');

expect(transliterate(latn2cyrl, 'art-Latn-x-interslv')).toEqual(
cyrl2latn,
);
expect(transliterate(cyrl2latn, 'art-Cyrl-x-interslv')).toEqual(
latn2cyrl,
);
},
);
expect(transliterate(latn2cyrl, 'art-Latn-x-interslv')).toEqual(cyrl2latn);
expect(transliterate(cyrl2latn, 'art-Cyrl-x-interslv')).toEqual(latn2cyrl);
});

test('unknown code', () => {
expect(() => transliterate('', 'en' as any)).toThrowErrorMatchingSnapshot();
Expand Down
1 change: 1 addition & 0 deletions src/transliterate/lj-nj/endings-nje.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"%":{"a":{"j":{"n":0}},"h":{"a":{"j":{"n":0}}},"m":{"a":{"j":{"n":0}},"e":{"j":{"n":0}}},"i":{"m":{"a":{"j":{"n":0}}}},"e":{"j":{"n":0}},"u":{"j":{"n":0}}}}
1 change: 1 addition & 0 deletions src/transliterate/lj-nj/exceptions-lj.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"%":{"a":{"j":{"l":{"i":{"s":{"u":{"z":{"e":{"b":{"%":0}}},"%":0},"a":{"n":{"%":0}}},"b":{"o":{"%":0}}},"e":{"g":{"n":{"a":{"v":{"e":{"%":0}}}}},"m":{"h":{"o":{"p":{"%":0}}}},"č":{"p":{"%":0}},"s":{"e":{"v":{"%":0}}}},"r":{"o":{"%":0}}}}},"e":{"j":{"l":{"i":{"s":{"u":{"z":{"e":{"b":{"%":0}}},"%":0},"a":{"n":{"%":0}}},"b":{"o":{"%":0}}},"e":{"g":{"n":{"a":{"v":{"e":{"%":0}}}}},"m":{"h":{"o":{"p":{"%":0}}}},"č":{"p":{"%":0}},"s":{"e":{"v":{"%":0}}}},"r":{"o":{"%":0}}},"e":{"j":{"l":{"r":{"o":{"j":{"a":{"n":{"%":0}}},"%":0}},"e":{"č":{"p":{"j":{"a":{"n":{"%":0}}},"%":0}}}}}}}},"m":{"e":{"j":{"l":{"i":{"s":{"u":{"z":{"e":{"b":{"%":0}}},"%":0},"a":{"n":{"%":0}}},"b":{"o":{"%":0}}},"e":{"g":{"n":{"a":{"v":{"e":{"%":0}}}}},"m":{"h":{"o":{"p":{"%":0}}}},"č":{"p":{"%":0}},"s":{"e":{"v":{"%":0}}}},"r":{"o":{"%":0}}}}},"a":{"j":{"l":{"e":{"g":{"n":{"a":{"v":{"e":{"%":0}}}}},"m":{"h":{"o":{"p":{"%":0}}}}},"i":{"b":{"o":{"%":0}},"s":{"u":{"%":0}}}}}},"i":{"j":{"l":{"r":{"o":{"%":0}},"e":{"č":{"p":{"%":0}}}}}}},"u":{"j":{"l":{"i":{"s":{"u":{"z":{"e":{"b":{"%":0}}},"%":0},"a":{"n":{"%":0}}},"b":{"o":{"%":0}}},"e":{"g":{"n":{"a":{"v":{"e":{"%":0}}}}},"m":{"h":{"o":{"p":{"%":0}}}},"č":{"p":{"%":0}},"s":{"e":{"v":{"%":0}}}},"r":{"o":{"%":0}}},"e":{"j":{"l":{"r":{"o":{"%":0}},"e":{"č":{"p":{"%":0}}}}}}},"m":{"e":{"j":{"l":{"r":{"o":{"%":0}},"e":{"č":{"p":{"%":0}}}}}}}},"h":{"a":{"j":{"l":{"e":{"g":{"n":{"a":{"v":{"e":{"%":0}}}}},"m":{"h":{"o":{"p":{"%":0}}}}},"i":{"b":{"o":{"%":0}},"s":{"u":{"%":0}}}}}},"i":{"j":{"l":{"r":{"o":{"%":0}},"e":{"č":{"p":{"%":0}}}}}}},"i":{"m":{"a":{"j":{"l":{"e":{"g":{"n":{"a":{"v":{"e":{"%":0}}}}},"m":{"h":{"o":{"p":{"%":0}}}}},"i":{"b":{"o":{"%":0}},"s":{"u":{"%":0}}}}}},"i":{"j":{"l":{"r":{"o":{"%":0}},"e":{"č":{"p":{"%":0}}}}}}},"š":{"j":{"e":{"j":{"l":{"r":{"o":{"j":{"a":{"n":{"%":0}}},"%":0}},"e":{"č":{"p":{"j":{"a":{"n":{"%":0}}},"%":0}}}}}}}},"j":{"l":{"r":{"o":{"%":0}},"e":{"č":{"p":{"%":0}}}}}},"o":{"g":{"e":{"j":{"l":{"r":{"o":{"%":0}},"e":{"č
":{"p":{"%":0}}}}}}}},"j":{"e":{"j":{"l":{"r":{"o":{"%":0}},"e":{"č":{"p":{"%":0}}}}}}}}}
1 change: 1 addition & 0 deletions src/transliterate/lj-nj/exceptions-nj.json

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions src/transliterate/lj-nj/findTrieWord.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/**
 * A node of a suffix trie: every key leads either to a nested node or to a
 * numeric leaf. The lookups below treat a leaf equal to `0` as the end of a
 * stored entry.
 */
export interface Dict {
  [key: string]: Dict | number;
}

/** `findTrieWord` result: a stored entry matches the tail of the word, with characters left over before it. */
export const ENDS = 1;
/** `findTrieWord` result: a stored entry matches the word from its first to its last character. */
export const WHOLE = 0;
/** `findTrieWord` result: no stored entry matches the tail of the word. */
export const MISMATCH = -1;

/** The three possible outcomes of `findTrieWord`. */
export type TrieMatch = typeof ENDS | typeof WHOLE | typeof MISMATCH;

/**
 * Classifies how `word` relates to the entries stored in `dict`.
 *
 * @param word - the string to look up; it is matched from its last character backwards
 * @param dict - root node of the suffix trie
 * @returns `WHOLE` when an entry covers the entire word, `ENDS` when an entry
 *   covers only its tail, `MISMATCH` otherwise
 */
export function findTrieWord(word: string, dict: Dict): TrieMatch {
  // Both lookups share one traversal; the start index of the match tells
  // the three cases apart.
  const position = findTriePosition(dict, word);

  if (position < 0) {
    return MISMATCH;
  }

  return position === 0 ? WHOLE : ENDS;
}

/**
 * Finds where the entry of `endings` that matches the tail of `word` begins.
 *
 * The trie is walked with the characters of `word` taken from the end
 * towards the start (one UTF-16 code unit per step) until a leaf or a
 * missing key is reached, or the word runs out.
 *
 * @param endings - root node of the suffix trie
 * @param word - the string to look up
 * @returns the index in `word` at which the matched entry starts, or `-1`
 *   when the walk does not stop on a `0` leaf
 */
export function findTriePosition(endings: Dict, word: string): number {
  // `undefined` is what a lookup of a missing key yields; it ends the walk.
  let node: Dict | number | undefined = endings;
  let i = word.length - 1;

  for (; typeof node === 'object' && i >= 0; i--) {
    node = node[word.charAt(i)];
  }

  // `i` was decremented once more after the last consumed character.
  return node === 0 ? i + 1 : -1;
}
38 changes: 38 additions & 0 deletions src/transliterate/lj-nj/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import ljeExceptions from './exceptions-lj.json';
import njeExceptions from './exceptions-nj.json';
import njeEndings from './endings-nje.json';
import {
ENDS,
MISMATCH,
findTrieWord,
findTriePosition,
WHOLE,
} from './findTrieWord';

/**
 * Check whether we should soften lj to ĺj.
 *
 * True only when `findTrieWord` reports a `WHOLE` match of `word` against
 * the trie loaded from exceptions-lj.json.
 */
export function ljeCheck(word: string) {
  const match = findTrieWord(word, ljeExceptions);
  return match === WHOLE;
}

/**
 * Find where the last `lj` pair of the word starts.
 *
 * @param word - the word to search
 * @returns the index of the last occurrence of 'lj' in `word`, or `-1` when
 *   the word does not contain it
 */
export function ljePosition(word: string): number {
  return word.lastIndexOf('lj');
}

/**
 * Check whether we should soften nj to ńj.
 *
 * That is the case when the tail of `word` matches an entry of
 * endings-nje.json (an `ENDS` result) and `word` gets a `MISMATCH` against
 * the trie loaded from exceptions-nj.json.
 */
export function njeCheck(word: string) {
  if (findTrieWord(word, njeEndings) !== ENDS) {
    return false;
  }

  return findTrieWord(word, njeExceptions) === MISMATCH;
}

/**
 * Look `word` up in the trie loaded from endings-nje.json and return the
 * result of `findTriePosition` unchanged.
 */
export function njePosition(word: string): number {
  const position = findTriePosition(njeEndings, word);
  return position;
}
File renamed without changes.
3 changes: 0 additions & 3 deletions src/transliterate/nje/Dict.ts

This file was deleted.

Loading

0 comments on commit 803a94b

Please sign in to comment.