Skip to content

Commit 753af73

Browse files
include chinese-specific transcription in zh-ipa (#222)
* baseline * fix * fix types * type
1 parent 7352b00 commit 753af73

File tree

9 files changed

+494
-17
lines changed

9 files changed

+494
-17
lines changed

Diff for: 3-tidy-up.js

+63-17
Original file line numberDiff line numberDiff line change
@@ -135,23 +135,7 @@ function handleLine(parsedLine) {
135135
const {senses, head_templates, tags} = parsedLine;
136136
if (!senses) return;
137137

138-
/** @type {IpaInfo[]} */
139-
const ipa = /** @type {IpaInfo[]} */ (sounds
140-
? sounds
141-
.filter(sound => sound && sound.ipa)
142-
.map(({ipa, tags, note}) => {
143-
if(!tags) {
144-
if (note) {
145-
tags = [note];
146-
} else {
147-
tags = [];
148-
}
149-
}
150-
return ({ipa, tags})
151-
})
152-
.flatMap(ipaObj => typeof ipaObj.ipa === 'string' ? [ipaObj] : ipaObj?.ipa?.map(ipa => ({ ipa, tags: ipaObj.tags })) )
153-
.filter(ipaObj => ipaObj?.ipa)
154-
: []);
138+
const ipa = getPhoneticTranscriptions(sounds);
155139

156140
/** @type {TidySense[]} */
157141
const sensesWithGlosses = /** @type {TidySense[]} */ (senses
@@ -259,6 +243,68 @@ function handleLine(parsedLine) {
259243
}
260244

261245
}
246+
/**
 * Builds the list of phonetic transcriptions for an entry from its `sounds`.
 *
 * Every sound contributes one `{ipa, tags}` record per non-empty transcription
 * string. `ipa` may be a single string or an array of strings. When the source
 * language (`sourceIso`, defined outside this function) is `'zh'`, a sound
 * without a usable `ipa` falls back to its `zh-pron` value instead.
 *
 * Tags come from `sound.tags`; if that is missing, `sound.note` (when present)
 * becomes the only tag, otherwise the tag list is empty.
 *
 * @param {Sound[]} sounds
 * @returns {IpaInfo[]}
 */
function getPhoneticTranscriptions(sounds) {
    if (!sounds) return [];

    // Only Chinese entries carry transcriptions in the `zh-pron` field.
    const allowZhPron = sourceIso === 'zh';
    const isNonEmptyString = (value) => typeof value === 'string' && value !== '';

    /** @type {IpaInfo[]} */
    const transcriptions = [];

    for (const sound of sounds) {
        if (!sound) continue;

        const { ipa, note, 'zh-pron': zhPron } = sound;
        const tags = sound.tags || (note ? [note] : []);

        // Normalise `ipa` to a list and drop empty or non-string entries so
        // that no blank transcription reaches the output.
        let readings = (Array.isArray(ipa) ? ipa : [ipa]).filter(isNonEmptyString);

        // `ipa` takes precedence; `zh-pron` is used only when there is no usable `ipa`.
        if (readings.length === 0 && allowZhPron && isNonEmptyString(zhPron)) {
            readings = [zhPron];
        }

        for (const reading of readings) {
            // Emit exactly `{ipa, tags}` so no helper field leaks into the result.
            transcriptions.push({ ipa: reading, tags });
        }
    }

    return transcriptions;
}
262308

263309
/**
264310
* @param {string} text

Diff for: data/test/dict/zh/en/tag_bank_1.json

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[
2+
[
3+
"char",
4+
"",
5+
0,
6+
"character",
7+
0
8+
]
9+
]

Diff for: data/test/dict/zh/en/term_bank_1.json

+118
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
[
2+
[
3+
"",
4+
"",
5+
"char",
6+
"character",
7+
0,
8+
[
9+
{
10+
"type": "structured-content",
11+
"content": [
12+
{
13+
"tag": "div",
14+
"content": [
15+
{
16+
"tag": "div",
17+
"data": {
18+
"content": "preamble"
19+
},
20+
"content": [
21+
{
22+
"tag": "details",
23+
"data": {
24+
"content": "details-entry-Grammar"
25+
},
26+
"content": [
27+
{
28+
"tag": "summary",
29+
"data": {
30+
"content": "summary-entry"
31+
},
32+
"content": "Grammar"
33+
},
34+
{
35+
"tag": "div",
36+
"data": {
37+
"content": "Grammar-content"
38+
},
39+
"content": "(Min)"
40+
}
41+
]
42+
},
43+
{
44+
"tag": "details",
45+
"data": {
46+
"content": "details-entry-Etymology"
47+
},
48+
"content": [
49+
{
50+
"tag": "summary",
51+
"data": {
52+
"content": "summary-entry"
53+
},
54+
"content": "Etymology"
55+
},
56+
{
57+
"tag": "div",
58+
"data": {
59+
"content": "Etymology-content"
60+
},
61+
"content": "The Min native word for “paddy field; field”.\nEtymology unknown. Chinese scholars identify 塍 (OC *ɦljɯŋ, “raised path between fields”) as the etymological character (本字), although Norman proposes that this is related to 層 (OC *zɯːŋ, “layer”), reflecting the terraced fields commonly found in Fujian (Schuessler, 2007). Compare also 㽪 (“wet field”)."
62+
}
63+
]
64+
}
65+
]
66+
}
67+
]
68+
},
69+
{
70+
"tag": "ol",
71+
"data": {
72+
"content": "glosses"
73+
},
74+
"content": [
75+
{
76+
"tag": "li",
77+
"content": [
78+
{
79+
"tag": "div",
80+
"content": [
81+
"paddy field; wet field (Classifier: 坵 mn)"
82+
]
83+
}
84+
]
85+
},
86+
{
87+
"tag": "li",
88+
"content": [
89+
{
90+
"tag": "div",
91+
"content": [
92+
"farmland in general; field (Classifier: 坵 mn)"
93+
]
94+
}
95+
]
96+
}
97+
]
98+
},
99+
{
100+
"tag": "div",
101+
"data": {
102+
"content": "backlink"
103+
},
104+
"content": [
105+
{
106+
"tag": "a",
107+
"href": "https://en.wiktionary.org/wiki/田#Chinese",
108+
"content": "Wiktionary"
109+
}
110+
]
111+
}
112+
]
113+
}
114+
],
115+
0,
116+
""
117+
]
118+
]

Diff for: data/test/ipa/zh/en/tag_bank_1.json

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[]

Diff for: data/test/ipa/zh/en/term_meta_bank_1.json

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
[
2+
[
3+
"",
4+
"ipa",
5+
{
6+
"reading": "",
7+
"transcriptions": [
8+
{
9+
"ipa": "châing",
10+
"tags": [
11+
"Jian'ou",
12+
"Kienning-Colloquial-Romanized"
13+
]
14+
},
15+
{
16+
"ipa": "chèng",
17+
"tags": [
18+
"Foochow-Romanized",
19+
"Fuzhou"
20+
]
21+
},
22+
{
23+
"ipa": "ceng²",
24+
"tags": [
25+
"Puxian-Min"
26+
]
27+
},
28+
{
29+
"ipa": "chhân",
30+
"tags": [
31+
"Hokkien",
32+
"POJ"
33+
]
34+
},
35+
{
36+
"ipa": "cang⁵",
37+
"tags": [
38+
"Peng'im",
39+
"Teochew"
40+
]
41+
},
42+
{
43+
"ipa": "/t͡sʰaiŋ³³/",
44+
"tags": [
45+
"Jian'ou",
46+
"Sinological-IPA"
47+
]
48+
},
49+
{
50+
"ipa": "/t͡sʰɛiŋ⁵³/",
51+
"tags": [
52+
"Fuzhou",
53+
"Sinological-IPA"
54+
]
55+
},
56+
{
57+
"ipa": "chéng",
58+
"tags": [
59+
"Puxian-Min"
60+
]
61+
},
62+
{
63+
"ipa": "/t͡sʰɛŋ¹³/",
64+
"tags": [
65+
"Puxian-Min",
66+
"Sinological-IPA"
67+
]
68+
},
69+
{
70+
"ipa": "tshân",
71+
"tags": [
72+
"Hokkien",
73+
"Tai-lo"
74+
]
75+
},
76+
{
77+
"ipa": "zhaan",
78+
"tags": [
79+
"Hokkien",
80+
"Phofsit-Daibuun"
81+
]
82+
},
83+
{
84+
"ipa": "/t͡sʰan²⁴/",
85+
"tags": [
86+
"Changtai",
87+
"Hokkien",
88+
"Jinjiang",
89+
"Quanzhou",
90+
"Singapore",
91+
"Taipei",
92+
"Xiamen"
93+
]
94+
},
95+
{
96+
"ipa": "/t͡sʰan¹³/",
97+
"tags": [
98+
"Hokkien",
99+
"Zhangzhou"
100+
]
101+
},
102+
{
103+
"ipa": "/t͡sʰan²³/",
104+
"tags": [
105+
"Hokkien",
106+
"Kaohsiung"
107+
]
108+
},
109+
{
110+
"ipa": "tshâng",
111+
"tags": [
112+
"POJ",
113+
"Teochew"
114+
]
115+
},
116+
{
117+
"ipa": "/t͡sʰaŋ⁵⁵/",
118+
"tags": [
119+
"Sinological-IPA",
120+
"Teochew"
121+
]
122+
}
123+
]
124+
}
125+
]
126+
]

Diff for: data/test/kaikki/zh-en.json

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"pos": "character", "head_templates": [{"name": "head", "args": {"1": "zh", "2": "hanzi"}, "expansion": "田"}, {"name": "tlb", "args": {"1": "zh", "2": "Min"}, "expansion": "(Min)"}], "sounds": [{"zh-pron": "châing"}, {"zh-pron": "chèng"}, {"zh-pron": "ceng²"}, {"tags": ["Hokkien", "POJ"], "zh-pron": "chhân"}, {"tags": ["Peng'im", "Teochew"], "zh-pron": "cang⁵"}, {"tags": ["Jian'ou", "Kienning-Colloquial-Romanized"], "zh-pron": "châing"}, {"tags": ["Jian'ou", "Sinological-IPA"], "ipa": "/t͡sʰaiŋ³³/"}, {"tags": ["Foochow-Romanized", "Fuzhou"], "zh-pron": "chèng"}, {"tags": ["Fuzhou", "Sinological-IPA"], "ipa": "/t͡sʰɛiŋ⁵³/"}, {"tags": ["Puxian-Min"], "zh-pron": "ceng²"}, {"tags": ["Puxian-Min"], "zh-pron": "chéng"}, {"tags": ["Puxian-Min", "Sinological-IPA"], "ipa": "/t͡sʰɛŋ¹³/"}, {"tags": ["Hokkien", "Tai-lo"], "zh-pron": "tshân"}, {"tags": ["Hokkien", "Phofsit-Daibuun"], "zh-pron": "zhaan"}, {"tags": ["Changtai", "Hokkien", "Jinjiang", "Quanzhou", "Singapore", "Taipei", "Xiamen"], "ipa": "/t͡sʰan²⁴/"}, {"tags": ["Hokkien", "Zhangzhou"], "ipa": "/t͡sʰan¹³/"}, {"tags": ["Hokkien", "Kaohsiung"], "ipa": "/t͡sʰan²³/"}, {"tags": ["POJ", "Teochew"], "zh-pron": "tshâng"}, {"tags": ["Sinological-IPA", "Teochew"], "ipa": "/t͡sʰaŋ⁵⁵/"}, {"ipa": "/t͡sʰaiŋ³³/"}, {"ipa": "/t͡sʰɛiŋ⁵³/"}, {"ipa": "/t͡sʰɛŋ¹³/"}, {"ipa": "/t͡sʰan²⁴/"}, {"ipa": "/t͡sʰan¹³/"}, {"ipa": "/t͡sʰan²³/"}, {"ipa": "/t͡sʰaŋ⁵⁵/"}], "etymology_number": 2, "wikipedia": ["Jerry Norman (sinologist)"], "etymology_text": "The Min native word for “paddy field; field”.\nEtymology unknown. Chinese scholars identify 塍 (OC *ɦljɯŋ, “raised path between fields”) as the etymological character (本字), although Norman proposes that this is related to 層 (OC *zɯːŋ, “layer”), reflecting the terraced fields commonly found in Fujian (Schuessler, 2007). 
Compare also 㽪 (“wet field”).", "etymology_templates": [{"name": "och-l", "args": {"1": "塍", "2": "raised path between fields"}, "expansion": "塍 (OC *ɦljɯŋ, “raised path between fields”)"}, {"name": "zh-l", "args": {"1": "本字", "tr": "-"}, "expansion": "本字"}, {"name": "och-l", "args": {"1": "層", "2": "layer"}, "expansion": "層 (OC *zɯːŋ, “layer”)"}, {"name": "zh-ref", "args": {"1": "Schuessler, 2007"}, "expansion": "Schuessler, 2007"}, {"name": "och-l", "args": {"1": "㽪", "2": "wet field"}, "expansion": "㽪 (“wet field”)"}], "word": "田", "lang": "Chinese", "lang_code": "zh", "forms": [{"form": "塍"}, {"form": "堘"}], "derived": [{"roman": "tītián", "word": "梯田", "_dis1": "0 0 0 0 0 0 0 0"}, {"roman": "shuǐtián", "word": "水田", "_dis1": "0 0 0 0 0 0 0 0"}, {"roman": "lítián", "word": "犁田", "_dis1": "0 0 0 0 0 0 0 0"}, {"roman": "tiánzhǔ", "word": "田主", "_dis1": "0 0 0 0 0 0 0 0"}, {"roman": "tiánchéng", "word": "田塍", "_dis1": "0 0 0 0 0 0 0 0"}, {"roman": "tiánqì", "word": "田契", "_dis1": "0 0 0 0 0 0 0 0"}, {"word": "田客", "_dis1": "0 0 0 0 0 0 0 0"}, {"word": "田畦", "_dis1": "0 0 0 0 0 0 0 0"}, {"word": "田租", "_dis1": "0 0 0 0 0 0 0 0"}, {"roman": "tiánjī", "word": "田雞", "_dis1": "0 0 0 0 0 0 0 0"}, {"roman": "tiánjī", "word": "田鸡", "_dis1": "0 0 0 0 0 0 0 0"}, {"roman": "tiánshǔ", "word": "田鼠", "_dis1": "0 0 0 0 0 0 0 0"}, {"word": "佈田", "_dis1": "0 0"}, {"word": "布田", "_dis1": "0 0"}, {"word": "作田", "_dis1": "0 0"}, {"word": "使田", "_dis1": "0 0"}, {"word": "做田", "_dis1": "0 0"}, {"word": "去田", "_dis1": "0 0"}, {"word": "壅田", "_dis1": "0 0"}, {"word": "山壠田", "_dis1": "0 0"}, {"word": "山垅田", "_dis1": "0 0"}, {"roman": "shāntián", "word": "山田", "_dis1": "0 0"}, {"word": "播田", "_dis1": "0 0"}, {"word": "洲田", "_dis1": "0 0"}, {"english": "muddy field", "word": "湳田", "_dis1": "0 0"}, {"word": "田佃", "_dis1": "0 0"}, {"roman": "tiányuán", "word": "田園", "_dis1": "0 0"}, {"roman": "tiányuán", "word": "田园", "_dis1": "0 0"}, {"roman": "tiándì", "word": "田地", "_dis1": "0 0"}, 
{"word": "田堘", "_dis1": "0 0"}, {"word": "田塗", "_dis1": "0 0"}, {"word": "田涂", "_dis1": "0 0"}, {"word": "田嬰", "_dis1": "0 0"}, {"word": "田婴", "_dis1": "0 0"}, {"roman": "Tiánwěi", "word": "田尾", "_dis1": "0 0"}, {"word": "田缺", "_dis1": "0 0"}, {"roman": "tiánluó", "word": "田螺", "_dis1": "0 0"}, {"word": "田面", "_dis1": "0 0"}, {"word": "瘦田", "_dis1": "0 0"}, {"word": "破田", "_dis1": "0 0"}, {"word": "耘田", "_dis1": "0 0"}, {"word": "荒田", "_dis1": "0 0"}, {"roman": "Xíngtián", "word": "行田", "_dis1": "0 0"}, {"word": "巡田", "_dis1": "0 0"}, {"word": "鈀田", "_dis1": "0 0"}, {"word": "钯田", "_dis1": "0 0"}], "senses": [{"links": [["paddy field", "paddy field"], ["wet", "wet"], ["field", "field"], ["坵", "坵#Chinese"]], "glosses": ["paddy field; wet field (Classifier: 坵 mn)"], "tags": ["Min"], "id": "en-田-zh-character-Ob9waJpb", "categories": [{"name": "Chinese nouns classified by 坵", "kind": "other", "parents": [], "source": "w"}, {"name": "Chinese entries with incorrect language header", "kind": "other", "parents": ["Entries with incorrect language header", "Entry maintenance"], "source": "w+disamb", "_dis": "13 7 2 3 23 5 43 5"}, {"name": "Chinese hanzi", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "7 4 1 1 29 4 51 3"}, {"name": "Eastern Min hanzi", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "5 3 1 1 30 10 46 2"}, {"name": "Elementary Mandarin", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "20 7 4 5 6 17 25 16"}, {"name": "Northern Min hanzi", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "5 4 1 1 30 10 46 2"}, {"name": "Agriculture", "kind": "topical", "parents": ["Applied sciences", "Sciences", "All topics", "Fundamental"], "source": "w+disamb", "orig": "zh:Agriculture", "langcode": "zh", "_dis": "6 3 1 2 24 6 55 3"}]}, {"links": [["farmland", "farmland"], ["field", "field"], ["坵", "坵#Chinese"]], "glosses": ["farmland in general; field (Classifier: 坵 mn)"], "tags": ["Min"], "id": 
"en-田-zh-character-Jka-zqy4", "categories": [{"name": "Chinese nouns classified by 坵", "kind": "other", "parents": [], "source": "w"}, {"name": "Elementary Mandarin", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "20 7 4 5 6 17 25 16"}]}]}

Diff for: data/test/tidy/zh-en-forms-0.json

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"_type": "map",
3+
"map": []
4+
}

0 commit comments

Comments
 (0)