Skip to content

Commit

Permalink
CLDR-18043 Fix some transform issues (unicode-org#4152)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian authored Nov 4, 2024
1 parent a1961cc commit 48afc24
Show file tree
Hide file tree
Showing 26 changed files with 58 additions and 58 deletions.
2 changes: 1 addition & 1 deletion common/transforms/Arabic-Latin-BGN.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ For terms of use, see http://www.unicode.org/copyright.html
# MINIMAL FILTER: Arabic-Latin
#

:: [[:arabic:][:block=ARABIC:][ءآابةتثجحخدذرزسشصضطظعغفقكلمنهوىيًٌٍَُِّْ٠١٢٣٤٥٦٧٨٩ٱ]] ;
:: [[:Arabic:][:Block=Arabic:][ءآابةتثجحخدذرزسشصضطظعغفقكلمنهوىيًٌٍَُِّْ٠١٢٣٤٥٦٧٨٩ٱ]] ;
:: NFKD (NFC) ;
#
#
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Arabic-Latin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ For terms of use, see http://www.unicode.org/copyright.html
# Does *not* do assimilation of "al", nor hyphenation.
# While it could be done, we need to determine whether a prefix "al" could
# occur other than as the definite article (since no space is used).
:: [[:Arabic:][:block=ARABIC:][‎ⁿ،؛؟ـً-ٕ٠-٬۰-۹﷼ښ][\u0611\u0670]] ;
:: [[:Arabic:][:Block=Arabic:][‎ⁿ،؛؟ـً-ٕ٠-٬۰-۹﷼ښ][\u0611\u0670]] ;
:: NFKD (NFC);
$disambig = ̱ ;
$disambig2 = ̰ ;
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Bengali-Latin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ For terms of use, see http://www.unicode.org/copyright.html
<transforms>
<transform source="Beng" target="Latn" direction="forward" alias="Bengali-Latin und-Latn-t-und-beng">
<tRule>
::[[:script=bengali:][।-॥ঁ-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ়-ৄে-ৈো-্ৗড়-ঢ়য়-ৣ০-৺ৎ]];
::[[:Script=Bengali:][।-॥ঁ-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ়-ৄে-ৈো-্ৗড়-ঢ়য়-ৣ০-৺ৎ]];
::NFD;
::Bengali-InterIndic;
::InterIndic-Latin;
Expand Down
8 changes: 4 additions & 4 deletions common/transforms/Cyrillic-Latin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ For terms of use, see http://www.unicode.org/copyright.html
# Should add variants for Russian-English, Russian-German
# Those can use this as a base, and then remap cases
# like a $hat to ya or ja.
# :: [\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:nonspacing mark:]] ;
# :: [\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:Nonspacing_Mark:]] ;
### WARNING, ̈ must be added to the generated filters, in both directions ###
# MINIMAL FILTER
# Cyrillic-Latin
Expand Down Expand Up @@ -282,13 +282,13 @@ $ignore = [[:Mark:]''] * ;
| K ← Q ;
| u ← w ;
| U ← W ;
| KS ← X } $ignore [:UppercaseLetter:] ;
| KS ← [:UppercaseLetter:] $ignore { X ;
| KS ← X } $ignore [:Uppercase_Letter:] ;
| KS ← [:Uppercase_Letter:] $ignore { X ;
| Ks ← X ;
| ks ← x ;
:: NFC (NFD) ;
# note: a global filter is more efficient, but MUST include all source chars!!
# :: ([\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:nonspacing mark:] ‧]);
# :: ([\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:Nonspacing_Mark:] ‧]);
# MINIMAL FILTER: Latin-Cyrillic
:: ( [ḫḪhH‧ˌ̈A-Za-zÀ-ÏÑ-ÖÙ-Ýà-ïñ-öù-ýÿ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƏƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳəʹ-ʺ̀-̂̆-̦̱̇̌̀-́̈́ʹ΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЀЃЌ-ЎЙйѐѓќ-ўӁ-ӂӐ-ӑӖ-ӗḀ-ẙẛẠ-ỹἂ-ἅἊ-Ἅἒ-ἕἚ-Ἕἢ-ἥἪ-Ἥἲ-ἵἺ-Ἵὂ-ὅὊ-Ὅὒ-ὕὛὝὢ-ὥὪ-Ὥὰ-ώᾂ-ᾅᾊ-ᾍᾒ-ᾕᾚ-ᾝᾢ-ᾥᾪ-ᾭᾰᾲᾴᾸᾺ-ΆῂῄῈ-Ή῍-῎ῐῒ-ΐῘῚ-Ί῝-῞ῠῢ-ΰῨῪ-Ύ῭-΅ῲῴῸ-ΏK-Å] ) ;
]]></tRule>
Expand Down
16 changes: 8 additions & 8 deletions common/transforms/Greek-Latin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ For terms of use, see http://www.unicode.org/copyright.html
<transform source="Grek" target="Latn" direction="both" alias="Greek-Latin und-Latn-t-und-grek" backwardAlias="Latin-Greek und-Grek-t-und-latn">
<tRule><![CDATA[
# Rules are predicated on running NFD first, and NFC afterwards
# :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:nonspacing mark:]] ;
# :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:Nonspacing_Mark:]] ;
# MINIMAL FILTER GENERATED FOR: Greek-Latin
:: [΄´;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ̄̈̓-̔͂-ͅͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ;
:: NFD (NFC) ;
Expand All @@ -24,9 +24,9 @@ For terms of use, see http://www.unicode.org/copyright.html
# ὨΣ ὩΣ ὪΣ ὫΣ
# Ạ, ạ, Ẹ, ẹ, Ọ, ọ
# Useful variables
$lower = [[:latin:][:greek:] & [:Ll:]];
$glower = [[:greek:] & [:Ll:]];
$upper = [[:latin:][:greek:] & [:Lu:]] ;
$lower = [[:Latin:][:Greek:] & [:Ll:]];
$glower = [[:Greek:] & [:Ll:]];
$upper = [[:Latin:][:Greek:] & [:Lu:]] ;
$accent = [:M:] ;
# NOTE: restrict to just the Greek & Latin accents that we care about
# TODO: broaden out once iteration is fixed
Expand Down Expand Up @@ -233,8 +233,8 @@ $ignore = [[:Mark:]''] * ;
| B ← W } $vowel ;
| U ← V ;
| U ← W ;
$rough } $ignore [:UppercaseLetter:] → H ;
$ignore [:UppercaseLetter:] { $rough → H ;
$rough } $ignore [:Uppercase_Letter:] → H ;
$ignore [:Uppercase_Letter:] { $rough → H ;
$rough ← H ;
$rough ↔ h ;
# Completeness for Greek
Expand All @@ -257,8 +257,8 @@ $rough ↔ h ;
← [Νν] { \' } $egammaLike ;
::NFC (NFD) ;
# ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ;
# ([\u0000-\u007F · [:Latin:] [:nonspacing mark:]]) ;
# ([\u0000-\u007F [:Latin:] [:Greek:] [:Nonspacing_Mark:]]) ;
# ([\u0000-\u007F · [:Latin:] [:Nonspacing_Mark:]]) ;
# MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD
:: ( [':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ̀-̷̹-ͅ΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ;
]]></tRule>
Expand Down
4 changes: 2 additions & 2 deletions common/transforms/Greek_Latin_UNGEGN.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ For terms of use, see http://www.unicode.org/copyright.html
:: [[[:Greek:][:Mn:][:Me:]] [\:-;?·;·]] ;
::NFD (NFC) ;
# Useful variables
$lower = [[:latin:][:greek:] & [:Ll:]] ;
$upper = [[:latin:][:greek:] & [:Lu:]] ;
$lower = [[:Latin:][:Greek:] & [:Ll:]] ;
$upper = [[:Latin:][:Greek:] & [:Lu:]] ;
$accent = [[:Mn:][:Me:]] ;
$macron = ̄ ;
$ddot = ̈ ;
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Han-Latin-Names.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ For terms of use, see http://www.unicode.org/copyright.html
# Do this before ::Han-Spacedhan() to catch Han after space in original text,
# and to apply before all other rules.
$startOfHanMarker = \uFDD1;
[:^script=Han:] { ([:script=Han:]) → $startOfHanMarker $1;
[:^Script=Han:] { ([:Script=Han:]) → $startOfHanMarker $1;
# Need Spacedhan so the name transliterations get spaced properly
::Han-Spacedhan();
# Convert special name readings that depend on next character
Expand Down
8 changes: 4 additions & 4 deletions common/transforms/Han-Spacedhan.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ For terms of use, see http://www.unicode.org/copyright.html
<tRule>
# Only intended for internal use
# Make sure Han are normalized, including characters that contain them.
# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:]
# Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release!
:: [[、。々《-』〜・㆒-㆟㈠-㉇㊀-㊰㋀-㋋ ㍘-㍰㍻-㍿㏠-㏾🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc;
# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:Ideographic:]-[:sc=Han:]
# Where XXX is the resolved [:Ideographic:][:sc=Han:]. It needs updating with each Unicode release!
:: [[、。々《-』〜・㆒-㆟㈠-㉇㊀-㊰㋀-㋋ ㍘-㍰㍻-㍿㏠-㏾🈐-🈒🈔-🈺🉀-🉈🉐🉑][:Ideographic:][:sc=Han:]] nfkc;
:: fullwidth-halfwidth;
。 → '.';
。→ '.';
Expand All @@ -37,7 +37,7 @@ For terms of use, see http://www.unicode.org/copyright.html
〜→ '~';

$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];
$initialPunct = [:Ps:][:Pi:];
$initialPunct = [[:Ps:][:Pi:]];
# add space between any Han or terminal punctuation and letters, and
# between letters and Han or initial punct
[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;
Expand Down
4 changes: 2 additions & 2 deletions common/transforms/Hiragana-Katakana.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ For terms of use, see http://www.unicode.org/copyright.html
<transform source="Hira" target="Kana" direction="both" alias="Hiragana-Katakana und-Kana-t-und-hira" backwardAlias="Katakana-Hiragana und-Hira-t-und-kana">
<tRule>
# note: a global filter is more efficient, but MUST include all source chars
:: [[\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]-[\u309B \u309C]];
:: [[\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:Nonspacing_Mark:]]-[\u309B \u309C]];
:: NFKC (NFC);
# Hiragana-Katakana
# This is largely a one-to-one mapping, but it has a
Expand Down Expand Up @@ -185,7 +185,7 @@ $xo = [
お ← $xo {ー};
:: NFC (NFKC) ;
# note: a global filter is more efficient, but MUST include all source chars!!
:: ([[\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]-[\u309B \u309C]]);
:: ([[\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:Nonspacing_Mark:]]-[\u309B \u309C]]);
# eof
</tRule>
</transform>
Expand Down
4 changes: 2 additions & 2 deletions common/transforms/Latin-Jamo.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ For terms of use, see http://www.unicode.org/copyright.html
<transforms>
<transform source="Latn" target="Jamo" direction="forward" alias="Latin-Jamo und-Jamo-t-und-latn">
<tRule>
::[[:script=Latin:][:M:]-];
::[[:Script=Latin:][:M:]-];
::NFD;
::Lower;
::Latin-ConjoiningJamo;
::[[:script=Latin:][:M:]] NFC;
::[[:Script=Latin:][:M:]] NFC;
</tRule>
</transform>
</transforms>
Expand Down
6 changes: 3 additions & 3 deletions common/transforms/Latin-Katakana.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ For terms of use, see http://www.unicode.org/copyright.html
<transform source="Latn" target="Kana" direction="both" alias="Latin-Katakana und-Kana-t-und-latn" backwardAlias="Katakana-Latin und-Latn-t-und-kana">
<tRule>
# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
#:: [\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:Nonspacing_Mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
### WARNING -- must add width filter, both here and below!!! ###
:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」゙-゚ァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ̄Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
Expand Down Expand Up @@ -382,11 +382,11 @@ x → | ks ;
# Final cleanup
'~' → ; # delete stray tildes between letters
[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
# [ʾ[:Nonspacing Mark:]-[゙-゜]] → ; # delete any non-spacing marks that we didn't use
# [ʾ[:Nonspacing_Mark:]-[゙-゜]] → ; # delete any non-spacing marks that we didn't use
:: NFC (NFD) ;
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
#:: ([\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:Nonspacing_Mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [[\ -~¢-£¥-¦¬̄₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ゙-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
# eof
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Latin-NumericPinyin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ $digit = [1-5];
$1 &NumericPinyin-Pinyin($3) $2 ← ([aAeE]) ($vowel* $consonant*) ($digit);
$1 &NumericPinyin-Pinyin($3) $2 ← ([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit);
$1 &NumericPinyin-Pinyin($3) $2 ← ($vowel) ($consonant*) ($digit);
&NumericPinyin-Pinyin($1) ← [:letter:] {($digit)};
&NumericPinyin-Pinyin($1) ← [:Letter:] {($digit)};
::NFC (NFD);
]]></tRule>
</transform>
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Maldivian-Latin-BGN.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ For terms of use, see http://www.unicode.org/copyright.html
# These appear to be used in Maldivian text, for example in the Universal
# Declaration of Human Rights.
::[[:block=thaana:][،؛؟٪٫٬]\uFDF2] ;
::[[:Block=Thaana:][،؛؟٪٫٬]\uFDF2] ;
::NFD;
$wordBoundary = [^[:L:][:M:][:N:]] ;
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Persian-Latin-BGN.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ For terms of use, see http://www.unicode.org/copyright.html
# MINIMAL FILTER: Persian-Latin
#
:: [[:arabic:][:block=ARABIC:][ءآابةتثجحخدذرزسشصضطظعغفقكلمنهویيَُِّْ٠١٢٣٤٥٦٧٨٩پچژگی]] ;
:: [[:Arabic:][:Block=Arabic:][ءآابةتثجحخدذرزسشصضطظعغفقكلمنهویيَُِّْ٠١٢٣٤٥٦٧٨٩پچژگی]] ;
:: NFKD (NFC) ;
#
#
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Thai-Latin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ For terms of use, see http://www.unicode.org/copyright.html
<transforms>
<transform source="Thai" target="Latn" direction="forward" alias="Thai-Latin und-Latn-t-und-thai">
<tRule>
::[[:thai:] ก-ฺเ-๛];
::[[:Thai:] ก-ฺเ-๛];
::NFD;
::Thai-ThaiSemi;
::Any-BreakInternal;
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Thai-ThaiLogical.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ For terms of use, see http://www.unicode.org/copyright.html
# This reverses the Thai LogicalOrderException vowels, and does (part of) spaces
# The rules that convert space into semicolon are in another file;
# since they have to come BEFORE the break iterator
$thai = [[:thai:] ก-ฺเ-๛] ;
$thai = [[:Thai:] ก-ฺเ-๛] ;
# First convert the semicolon back
' ' ← $thai { '; ' } $thai;
# Remove any other spaces between thai letters
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/Thai-ThaiSemi.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ For terms of use, see http://www.unicode.org/copyright.html
<tRule>
# The rules that convert space into semicolon are in this file;
# since they have to come BEFORE the break iterator.
$thai = [[:thai:] ก-ฺเ-๛] ;
$thai = [[:Thai:] ก-ฺเ-๛] ;
$thai { ' ' } $thai → '; ' ;
</tRule>
</transform>
Expand Down
4 changes: 2 additions & 2 deletions common/transforms/ThaiLogical-Latin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ For terms of use, see http://www.unicode.org/copyright.html
#{ ( $consonant ) } [^$vowel \uE000] → | $1 \uE000 ;
#\uE000 → ọ ;
# ← ọ ;
$notAbove = [^\p{ccc=0}\p{ccc=above}] ;
$notBelow = [^\p{ccc=0}\p{ccc=below}] ;
$notAbove = [^\p{ccc=0}\p{ccc=Above}] ;
$notBelow = [^\p{ccc=0}\p{ccc=Below}] ;
# Consonants
# Warning: the 'h's need to be handled carefully!
# What we really want to say is the following, but we can't
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/am-Ethi-t-d0-morse.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#
# MINIMAL FILTER: Ethiopic-Morse Code
#
:: [[:Zs:]0-9!\?\+/@()\[\]_:;,\.'"$=\-[:Ethiopic:]] ;
:: [[:Zs:]0-9!\?\+/@()\[\]_:;,\.'"\$=\-[:Ethiopic:]] ;
([:Lo:])([:Zs:]+)([:Lo:]) → | $1⁄⁂⁄$2$3 ; # ⁄⁂⁄ is assumed to be a sufficiently weird sequence that won't naturally appear in any normal content
#
Expand Down
6 changes: 3 additions & 3 deletions common/transforms/az-Title.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ For terms of use, see http://www.unicode.org/copyright.html
<tRule><![CDATA[
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# Make any string of letters after a cased letter be lower, with rules for i
[:cased:] [:case-ignorable:]* { İ → i;
[:cased:] [:case-ignorable:]* { I → ı;
[:cased:] [:case-ignorable:]* { (.) → &Any-Lower($1) ;
[:Cased:] [:Case_Ignorable:]* { İ → i;
[:Cased:] [:Case_Ignorable:]* { I → ı;
[:Cased:] [:Case_Ignorable:]* { (.) → &Any-Lower($1) ;
# Otherwise all lowercase go to upper (titlecase stay as is)
i→İ ;
([:Lowercase:]) → &Any-Upper($1) ;
Expand Down
4 changes: 2 additions & 2 deletions common/transforms/el-Lower.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ For terms of use, see http://www.unicode.org/copyright.html
# and C is not followed by a sequence consisting of zero or more case-ignorable characters and then a cased letter.
# 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
# With translit rules, easiest is to handle the negative condition first, mapping in that case to the regular sigma.
Σ } [:case-ignorable:]* [:cased:] → σ;
[:cased:] [:case-ignorable:]* { Σ → ς;
Σ } [:Case_Ignorable:]* [:Cased:] → σ;
[:Cased:] [:Case_Ignorable:]* { Σ → ς;
::Any-Lower;
::NFC();
</tRule>
Expand Down
6 changes: 3 additions & 3 deletions common/transforms/el-Title.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ For terms of use, see http://www.unicode.org/copyright.html
# Remove \0301 following Greek, with possible intervening 0308 marks.
# [[:Greek:] & [:Ll:]] [\u0308]? { \u0301 → ;
# Make any string of letters after a cased letter be lower, with rules for sigma
[:cased:] [:case-ignorable:]* { Σ } [:case-ignorable:]* [:cased:] → σ;
[:cased:] [:case-ignorable:]* { Σ → ς;
[:cased:] [:case-ignorable:]* { (.) → &Any-Lower($1) ;
[:Cased:] [:Case_Ignorable:]* { Σ } [:Case_Ignorable:]* [:Cased:] → σ;
[:Cased:] [:Case_Ignorable:]* { Σ → ς;
[:Cased:] [:Case_Ignorable:]* { (.) → &Any-Lower($1) ;
# Otherwise all lowercase go to upper (titlecase stay as is)
([:Lowercase:]) → &Any-Title($1) ;
::NFC();
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/it-am.xml
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ z → ዝ;
#
#

[:nonspacing mark:] → ;
[:Nonspacing_Mark:] → ;
::NFC(NFD);
</tRule>
</transform>
Expand Down
2 changes: 1 addition & 1 deletion common/transforms/it-ja.xml
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ z → ツ;
#
#

[:nonspacing mark:] → ;
[:Nonspacing_Mark:] → ;
::NFC(NFD);
</tRule>
</transform>
Expand Down
14 changes: 7 additions & 7 deletions common/transforms/lt-Title.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ For terms of use, see http://www.unicode.org/copyright.html
<tRule><![CDATA[
# Make any string of letters after a cased letter be lower
::NFD();
[:cased:] [:case-ignorable:]* {I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
[:cased:] [:case-ignorable:]* {J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
[:cased:] [:case-ignorable:]* {I \u0328 } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
[:cased:] [:case-ignorable:]* {I \u0300 → i \u0307 \u0300;
[:cased:] [:case-ignorable:]* {I \u0301 → i \u0307 \u0301;
[:cased:] [:case-ignorable:]* {I \u0303 → i \u0307 \u0303;
[:cased:] [:case-ignorable:]* { (.) → &Any-Lower($1) ;
[:Cased:] [:Case_Ignorable:]* {I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
[:Cased:] [:Case_Ignorable:]* {J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
[:Cased:] [:Case_Ignorable:]* {I \u0328 } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
[:Cased:] [:Case_Ignorable:]* {I \u0300 → i \u0307 \u0300;
[:Cased:] [:Case_Ignorable:]* {I \u0301 → i \u0307 \u0301;
[:Cased:] [:Case_Ignorable:]* {I \u0303 → i \u0307 \u0303;
[:Cased:] [:Case_Ignorable:]* { (.) → &Any-Lower($1) ;
# Otherwise all lowercase go to upper (titlecase stay as is)
[:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
([:Lowercase:]) → &Any-Upper($1) ;
Expand Down
6 changes: 3 additions & 3 deletions common/transforms/tr-Title.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ For terms of use, see http://www.unicode.org/copyright.html
<tRule><![CDATA[
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# Make any string of letters after a cased letter be lower, with rules for i
[:cased:] [:case-ignorable:]* { İ → i;
[:cased:] [:case-ignorable:]* { I → ı;
[:cased:] [:case-ignorable:]* { (.) → &Any-Lower($1) ;
[:Cased:] [:Case_Ignorable:]* { İ → i;
[:Cased:] [:Case_Ignorable:]* { I → ı;
[:Cased:] [:Case_Ignorable:]* { (.) → &Any-Lower($1) ;
# Otherwise all lowercase go to upper (titlecase stay as is)
i→İ ;
([:Lowercase:]) → &Any-Upper($1) ;
Expand Down

0 comments on commit 48afc24

Please sign in to comment.