|
| 1 | +package sanitize |
| 2 | + |
| 3 | +import ( |
| 4 | + "strings" |
| 5 | + "unicode" |
| 6 | + |
| 7 | + "golang.org/x/text/unicode/rangetable" |
| 8 | +) |
| 9 | + |
// invisibleRunes enumerates code points that are treated as invisible by this
// package. unicode.IsPrint on its own does not reject every invisible
// character, so the candidates are listed explicitly here.
//
// The list is taken from https://invisible-characters.com/. Entries are not in
// strict code-point order (U+3000 precedes U+2800); the slice order is kept
// stable on purpose.
var invisibleRunes = []rune{
	// Controls, Latin-1 and combining/format marks.
	'\u0000', // NULL
	'\u0009', // CHARACTER TABULATION
	'\u00A0', // NO-BREAK SPACE
	'\u00AD', // SOFT HYPHEN
	'\u034F', // COMBINING GRAPHEME JOINER
	'\u061C', // ARABIC LETTER MARK

	// Script-specific fillers and inherent vowels.
	'\u115F', // HANGUL CHOSEONG FILLER
	'\u1160', // HANGUL JUNGSEONG FILLER
	'\u17B4', // KHMER VOWEL INHERENT AQ
	'\u17B5', // KHMER VOWEL INHERENT AA
	'\u180E', // MONGOLIAN VOWEL SEPARATOR

	// General Punctuation: fixed-width spaces.
	'\u2000', // EN QUAD
	'\u2001', // EM QUAD
	'\u2002', // EN SPACE
	'\u2003', // EM SPACE
	'\u2004', // THREE-PER-EM SPACE
	'\u2005', // FOUR-PER-EM SPACE
	'\u2006', // SIX-PER-EM SPACE
	'\u2007', // FIGURE SPACE
	'\u2008', // PUNCTUATION SPACE
	'\u2009', // THIN SPACE
	'\u200A', // HAIR SPACE

	// General Punctuation: zero-width and directional marks.
	'\u200B', // ZERO WIDTH SPACE
	'\u200C', // ZERO WIDTH NON-JOINER
	'\u200D', // ZERO WIDTH JOINER
	'\u200E', // LEFT-TO-RIGHT MARK
	'\u200F', // RIGHT-TO-LEFT MARK
	'\u202F', // NARROW NO-BREAK SPACE
	'\u205F', // MEDIUM MATHEMATICAL SPACE

	// General Punctuation: invisible operators.
	'\u2060', // WORD JOINER
	'\u2061', // FUNCTION APPLICATION
	'\u2062', // INVISIBLE TIMES
	'\u2063', // INVISIBLE SEPARATOR
	'\u2064', // INVISIBLE PLUS

	// General Punctuation: deprecated format controls.
	'\u206A', // INHIBIT SYMMETRIC SWAPPING
	'\u206B', // ACTIVATE SYMMETRIC SWAPPING
	'\u206C', // INHIBIT ARABIC FORM SHAPING
	'\u206D', // ACTIVATE ARABIC FORM SHAPING
	'\u206E', // NATIONAL DIGIT SHAPES
	'\u206F', // NOMINAL DIGIT SHAPES

	// Blank glyphs from other blocks.
	'\u3000', // IDEOGRAPHIC SPACE
	'\u2800', // BRAILLE PATTERN BLANK
	'\u3164', // HANGUL FILLER
	'\uFEFF', // ZERO WIDTH NO-BREAK SPACE
	'\uFFA0', // HALFWIDTH HANGUL FILLER
}
| 59 | + |
| 60 | +var invisibleRangeTable *unicode.RangeTable |
| 61 | + |
| 62 | +func init() { |
| 63 | + invisibleRangeTable = rangetable.New(invisibleRunes...) |
| 64 | +} |
| 65 | + |
| 66 | +// Unicode removes irregularly invisible characters from a string. |
| 67 | +// |
| 68 | +// Irregularly invisible characters are defined as: |
| 69 | +// - Non-printable characters according to Go's unicode package (unicode.IsPrint). |
| 70 | +// - Characters in the invisibleRunes list (https://invisible-characters.com/). |
| 71 | +// |
| 72 | +// Note: Regular ASCII space (0x20) is not removed. |
| 73 | +func Unicode(str string) string { |
| 74 | + return strings.Map(func(r rune) rune { |
| 75 | + if unicode.Is(invisibleRangeTable, r) || !unicode.IsPrint(r) { |
| 76 | + return -1 |
| 77 | + } |
| 78 | + return r |
| 79 | + }, str) |
| 80 | +} |
0 commit comments