1
1
package mediathek .tool ;
2
2
3
+ import com .ibm .icu .text .Transliterator ;
3
4
import org .apache .commons .lang3 .StringUtils ;
4
5
import org .apache .commons .lang3 .SystemUtils ;
5
6
import org .apache .logging .log4j .LogManager ;
@@ -140,19 +141,14 @@ private static String convertToNativeEncoding(String fileName, boolean isPath) {
140
141
* @param fileName The UTF-16 filename string.
141
142
* @return US-ASCII encoded string for the OS.
142
143
*/
143
- private static String convertToASCIIEncoding (String fileName , boolean isPath ) {
144
+ protected static String convertToASCIIEncoding (String fileName , boolean isPath ) {
144
145
String ret = fileName ;
146
+ //remove NUL character from conversion...
147
+ ret = ret .replaceAll ("\\ u0000" , "" );
145
148
146
- ret = ret .replace ("ä" , "ae" );
147
- ret = ret .replace ("ö" , "oe" );
148
- ret = ret .replace ("ü" , "ue" );
149
- ret = ret .replace ("Ä" , "Ae" );
150
- ret = ret .replace ("Ö" , "Oe" );
151
- ret = ret .replace ("Ü" , "Ue" );
152
- ret = ret .replace ("ß" , "ss" );
153
-
154
- // ein Versuch zu vereinfachen
155
- ret = cleanUnicode (ret );
149
+ //convert to ASCII with icu4j
150
+ var transliterator = Transliterator .getInstance ("de-ASCII" );
151
+ ret = transliterator .transliterate (ret );
156
152
157
153
ret = removeIllegalCharacters (ret , isPath );
158
154
@@ -167,104 +163,18 @@ private static String convertToASCIIEncoding(String fileName, boolean isPath) {
167
163
if (buf .hasArray ()) {
168
164
ret = new String (buf .array ());
169
165
}
170
-
171
- //remove NUL character from conversion...
172
- ret = ret .replaceAll ("\\ u0000" , "" );
173
166
} catch (CharacterCodingException e ) {
174
167
logger .error ("convertToASCIIEncoding" , e );
175
168
}
176
169
177
170
return ret ;
178
171
}
179
172
180
- private static String cleanUnicode (String ret ) { // Builds a new string from ret one char at a time: Basic Latin chars are kept, the listed accented letters and punctuation are replaced or dropped, anything else becomes "_".
181
- String r = "" ;
182
- char c ;
183
- for (int i = 0 ; i < ret .length (); ++i ) {
184
- c = ret .charAt (i );
185
- //char hex = ret.charAt(i);
186
- if (Character .UnicodeBlock .of (c ) == Character .UnicodeBlock .BASIC_LATIN ) { // U+0000..U+007F is copied through unchanged
187
- r += c ; // NOTE(review): String += inside the loop re-copies r on every append; a StringBuilder would avoid that
188
- } else if (c == 'ß' ) {
189
- r += "ß" ; // sharp s is appended as-is, so the result can still contain this non-ASCII char
190
- } else // letters
191
- {
192
- if (c == 'Â' || c == 'À' || c == 'Å' || c == 'Á' ) {
193
- r += "A" ;
194
- } else if (c == 'å' || c == 'á' || c == 'à' || c == 'â' ) {
195
- r += "a" ;
196
- } else if (c == 'Č' ) {
197
- r += "C" ;
198
- } else if (c == 'ć' || c == 'č' || c == 'ç' ) {
199
- r += "c" ;
200
- } else if (c == 'Đ' ) {
201
- r += "D" ;
202
- } else if (c == 'É' || c == 'È' ) {
203
- r += "E" ;
204
- } else if (c == 'é' || c == 'è' || c == 'ê' || c == 'ě' || c == 'ë' ) {
205
- r += "e" ;
206
- } else if (c == 'í' ) {
207
- r += "i" ;
208
- } else if (c == 'ñ' ) {
209
- r += "n" ;
210
- } else if (c == 'ó' || c == 'ô' || c == 'ø' ) {
211
- r += "o" ;
212
- } else if (c == 'Š' ) {
213
- r += "S" ;
214
- } else if (c == 'ś' || c == 'š' || c == 'ş' ) {
215
- r += "s" ;
216
- } else if (c == 'ł' || c == 'Ł' ) { // NOTE(review): both cases map to lowercase "t"; looks like "l"/"L" was intended - confirm
217
- r += "t" ;
218
- } else if (c == 'û' || c == 'ù' ) {
219
- r += "u" ;
220
- } else if (c == 'ý' ) {
221
- r += "y" ;
222
- } else if (c == 'Ž' || c == 'Ź' ) {
223
- r += "Z" ;
224
- } else if (c == 'ž' || c == 'ź' ) {
225
- r += "z" ;
226
- } else if (c == 'æ' ) {
227
- r += "ae" ;
228
- } else if (c == '–' ) { // en dash becomes a plain hyphen
229
- r += "-" ;
230
- } else if (c == '„' ) {
231
- r += "\" " ;
232
- } else if (c == '”' || c == '“' || c == '«' || c == '»' ) {
233
- r += "\" " ;
234
- } else if (c == '?' ) { // unreachable: the question mark is Basic Latin and is already handled by the first branch of the loop
235
- r += "?" ;
236
- } else if (c == '°' || c == '™' ) { // dropped: nothing is appended
237
- } else if (c == '…' ) {
238
- r += "..." ;
239
- } else if (c == '€' ) {
240
- r += "€" ; // euro sign is appended as-is, so the result can still contain this non-ASCII char
241
- } else if (c == '´' || c == '’' || c == '‘' || c == '¿' ) {
242
- r += "'" ;
243
- } else if (c == '\u003F' ) { // unreachable: this escape is the question mark again (U+003F, Basic Latin)
244
- r += "?" ;
245
- } else if (c == '\u0096' ) {
246
- r += "-" ;
247
- } else if (c == '\u0085' ) { // this and the following empty branches drop the matched char
248
- } else if (c == '\u0080' ) {
249
- } else if (c == '\u0084' ) {
250
- } else if (c == '\u0092' ) {
251
- } else if (c == '\u0093' ) {
252
- } else if (c == '\u0091' ) {
253
- r += "-" ;
254
- } else if (c == '\n' ) { // unreachable: line feed (U+000A) is Basic Latin, so newlines are kept by the first branch rather than dropped here
255
- } else { // every other non-Basic-Latin char is replaced by an underscore
256
- r += "_" ;
257
- }
258
- }
259
- }
260
- return r ;
261
- }
262
-
263
173
/**
264
174
* Remove illegal characters from String based on current OS.
265
175
*
266
176
* @param input The input string
267
- * @param isPath
177
+ * @param isPath true if this is a path.
268
178
* @return Cleaned-up string.
269
179
*/
270
180
public static String removeIllegalCharacters (final String input , boolean isPath ) {
@@ -281,11 +191,7 @@ public static String removeIllegalCharacters(final String input, boolean isPath)
281
191
ret = StringUtils .stripStart (ret , "." );
282
192
ret = ret .replaceAll (isPath ? REGEXP_ILLEGAL_CHARACTERS_OTHERS_PATH : REGEXP_ILLEGAL_CHARACTERS_OTHERS , "_" );
283
193
} else {
284
- logger .warn ("This code path should NOT have been taken" );
285
- //we need to be more careful on Linux when using e.g. FAT32
286
- //Therefore be more conservative by default and replace more characters.
287
- ret = StringUtils .stripStart (ret , "." );
288
- ret = ret .replaceAll (isPath ? REGEXP_ILLEGAL_CHARACTERS_WINDOWS_PATH : REGEXP_ILLEGAL_CHARACTERS_WINDOWS , "_" );
194
+ throw new IllegalStateException ("Unsupported OS: " + SystemUtils .OS_NAME );
289
195
}
290
196
291
197
return ret ;
0 commit comments