Skip to content

Commit a8e2dac

Browse files
committed
- fix incorrect Unicode to ASCII conversion
- throw exception for unsupported OS
1 parent 25b7cea commit a8e2dac

File tree

2 files changed

+37
-103
lines changed

2 files changed

+37
-103
lines changed

src/main/java/mediathek/tool/FilenameUtils.java

+9 -103
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
package mediathek.tool;
22

3+
import com.ibm.icu.text.Transliterator;
34
import org.apache.commons.lang3.StringUtils;
45
import org.apache.commons.lang3.SystemUtils;
56
import org.apache.logging.log4j.LogManager;
@@ -140,19 +141,14 @@ private static String convertToNativeEncoding(String fileName, boolean isPath) {
140141
* @param fileName The UTF-16 filename string.
141142
* @return US-ASCII encoded string for the OS.
142143
*/
143-
private static String convertToASCIIEncoding(String fileName, boolean isPath) {
144+
protected static String convertToASCIIEncoding(String fileName, boolean isPath) {
144145
String ret = fileName;
146+
//remove NUL character from conversion...
147+
ret = ret.replaceAll("\\u0000", "");
145148

146-
ret = ret.replace("ä", "ae");
147-
ret = ret.replace("ö", "oe");
148-
ret = ret.replace("ü", "ue");
149-
ret = ret.replace("Ä", "Ae");
150-
ret = ret.replace("Ö", "Oe");
151-
ret = ret.replace("Ü", "Ue");
152-
ret = ret.replace("ß", "ss");
153-
154-
// ein Versuch zu vereinfachen
155-
ret = cleanUnicode(ret);
149+
//convert to ASCII with icu4j
150+
var transliterator = Transliterator.getInstance("de-ASCII");
151+
ret = transliterator.transliterate(ret);
156152

157153
ret = removeIllegalCharacters(ret, isPath);
158154

@@ -167,104 +163,18 @@ private static String convertToASCIIEncoding(String fileName, boolean isPath) {
167163
if (buf.hasArray()) {
168164
ret = new String(buf.array());
169165
}
170-
171-
//remove NUL character from conversion...
172-
ret = ret.replaceAll("\\u0000", "");
173166
} catch (CharacterCodingException e) {
174167
logger.error("convertToASCIIEncoding", e);
175168
}
176169

177170
return ret;
178171
}
179172

180-
private static String cleanUnicode(String ret) {
181-
String r = "";
182-
char c;
183-
for (int i = 0; i < ret.length(); ++i) {
184-
c = ret.charAt(i);
185-
//char hex = ret.charAt(i);
186-
if (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.BASIC_LATIN) {
187-
r += c;
188-
} else if (c == 'ß') {
189-
r += "ß";
190-
} else // Buchstaben
191-
{
192-
if (c == 'Â' || c == 'À' || c == 'Å' || c == 'Á') {
193-
r += "A";
194-
} else if (c == 'å' || c == 'á' || c == 'à' || c == 'â') {
195-
r += "a";
196-
} else if (c == 'Č') {
197-
r += "C";
198-
} else if (c == 'ć' || c == 'č' || c == 'ç') {
199-
r += "c";
200-
} else if (c == 'Đ') {
201-
r += "D";
202-
} else if (c == 'É' || c == 'È') {
203-
r += "E";
204-
} else if (c == 'é' || c == 'è' || c == 'ê' || c == 'ě' || c == 'ë') {
205-
r += "e";
206-
} else if (c == 'í') {
207-
r += "i";
208-
} else if (c == 'ñ') {
209-
r += "n";
210-
} else if (c == 'ó' || c == 'ô' || c == 'ø') {
211-
r += "o";
212-
} else if (c == 'Š') {
213-
r += "S";
214-
} else if (c == 'ś' || c == 'š' || c == 'ş') {
215-
r += "s";
216-
} else if (c == 'ł' || c == 'Ł') {
217-
r += "t";
218-
} else if (c == 'û' || c == 'ù') {
219-
r += "u";
220-
} else if (c == 'ý') {
221-
r += "y";
222-
} else if (c == 'Ž' || c == 'Ź') {
223-
r += "Z";
224-
} else if (c == 'ž' || c == 'ź') {
225-
r += "z";
226-
} else if (c == 'æ') {
227-
r += "ae";
228-
} else if (c == '–') {
229-
r += "-";
230-
} else if (c == '„') {
231-
r += "\"";
232-
} else if (c == '”' || c == '“' || c == '«' || c == '»') {
233-
r += "\"";
234-
} else if (c == '?') {
235-
r += "?";
236-
} else if (c == '°' || c == '™') {
237-
} else if (c == '…') {
238-
r += "...";
239-
} else if (c == '€') {
240-
r += "€";
241-
} else if (c == '´' || c == '’' || c == '‘' || c == '¿') {
242-
r += "'";
243-
} else if (c == '\u003F') {
244-
r += "?";
245-
} else if (c == '\u0096') {
246-
r += "-";
247-
} else if (c == '\u0085') {
248-
} else if (c == '\u0080') {
249-
} else if (c == '\u0084') {
250-
} else if (c == '\u0092') {
251-
} else if (c == '\u0093') {
252-
} else if (c == '\u0091') {
253-
r += "-";
254-
} else if (c == '\n') {
255-
} else {
256-
r += "_";
257-
}
258-
}
259-
}
260-
return r;
261-
}
262-
263173
/**
264174
* Remove illegal characters from String based on current OS.
265175
*
266176
* @param input The input string
267-
* @param isPath
177+
* @param isPath true if this is a path.
268178
* @return Cleaned-up string.
269179
*/
270180
public static String removeIllegalCharacters(final String input, boolean isPath) {
@@ -281,11 +191,7 @@ public static String removeIllegalCharacters(final String input, boolean isPath)
281191
ret = StringUtils.stripStart(ret, ".");
282192
ret = ret.replaceAll(isPath ? REGEXP_ILLEGAL_CHARACTERS_OTHERS_PATH : REGEXP_ILLEGAL_CHARACTERS_OTHERS, "_");
283193
} else {
284-
logger.warn("This code path should NOT have been taken");
285-
//we need to be more careful on Linux when using e.g. FAT32
286-
//Therefore be more conservative by default and replace more characters.
287-
ret = StringUtils.stripStart(ret, ".");
288-
ret = ret.replaceAll(isPath ? REGEXP_ILLEGAL_CHARACTERS_WINDOWS_PATH : REGEXP_ILLEGAL_CHARACTERS_WINDOWS, "_");
194+
throw new IllegalStateException("Unsupported OS: " + SystemUtils.OS_NAME);
289195
}
290196

291197
return ret;

src/test/java/mediathek/tool/FilenameUtilsTest.java

+28
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
package mediathek.tool;
22

3+
import com.ibm.icu.text.Transliterator;
34
import org.apache.commons.lang3.StringUtils;
45
import org.junit.jupiter.api.Assertions;
56
import org.junit.jupiter.api.Test;
67

8+
import static org.junit.jupiter.api.Assertions.assertEquals;
9+
710
class FilenameUtilsTest {
811
@Test
912
void trailing_test_with_leading_whitespace() {
@@ -34,4 +37,29 @@ void test_remove_starting_dots_with_leading_whitespace() {
3437
var result = StringUtils.stripStart(testStr, ".");
3538
Assertions.assertEquals(testStr, result);
3639
}
40+
41+
@Test
42+
void test_utf_to_ascii_encoding() {
43+
var src = "Häuser Bäume Höfe Gärten daß Ü ü ö ä Ä Ö ß Â À Å Á Č Đ É ł Ł \u003F";
44+
var expected = "Haeuser Baeume Hoefe Gaerten dass UE ue oe ae AE OE ss A A A A C D E l L ?";
45+
46+
var transliterator = Transliterator.getInstance("de-ASCII");
47+
var res = transliterator.transliterate(src);
48+
49+
assertEquals(expected, res);
50+
}
51+
52+
@Test
53+
void removeWindowsTrailingDots() {
54+
var testStr = "betrifft: ... ";
55+
var result = FilenameUtils.removeWindowsTrailingDots(testStr);
56+
assertEquals("betrifft:", result);
57+
}
58+
59+
@Test
60+
void convertToASCIIEncoding() {
61+
var testStr = "hellöworld.txt";
62+
var result = FilenameUtils.convertToASCIIEncoding(testStr, false);
63+
assertEquals("helloeworld.txt", result);
64+
}
3765
}

0 commit comments

Comments
 (0)