From d7a4a36f8506813670159ee35b2e1238cbc082cc Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sat, 16 Mar 2024 00:01:26 +0100 Subject: [PATCH] Clean xgettext strings and ignore empties Reference: https://github.com/nexB/source-inspector/issues/11 Reported-by: Armijn Hemel @armijnhemel Signed-off-by: Philippe Ombredanne --- src/source_inspector/strings_xgettext.py | 66 ++++++++++++++++--- .../strings_xgettext/lineedit.c-expected.json | 22 +------ tests/test_symbols_xgettext.py | 4 +- 3 files changed, 59 insertions(+), 33 deletions(-) diff --git a/src/source_inspector/strings_xgettext.py b/src/source_inspector/strings_xgettext.py index 2cf98b5..41ed1b1 100644 --- a/src/source_inspector/strings_xgettext.py +++ b/src/source_inspector/strings_xgettext.py @@ -9,6 +9,7 @@ # import logging +import string import attr from commoncode import command @@ -55,13 +56,13 @@ def get_source_strings(location, **kwargs): """ Return a mapping of strings for a source file at ``location``. """ - return dict(source_strings=list(collect_strings(location=location, strip=True))) + return dict(source_strings=list(collect_strings(location=location, clean=True))) -def collect_strings(location, strip=False): +def collect_strings(location, clean=True): """ Yield mappings of strings collected from file at location. - Strip strings if ``strip`` is True. + Clean strings if ``clean`` is True. """ if not is_xgettext_installed(): return @@ -82,13 +83,13 @@ def collect_strings(location, strip=False): if rc != 0: raise Exception(open(err).read()) - yield from parse_po_text(po_text=result, strip=strip) + yield from parse_po_text(po_text=result, clean=clean) -def parse_po_text(po_text, strip=False): +def parse_po_text(po_text, clean=True): """ Yield mappings of strings collected from the ``po_text`` string. - Strip strings if ``strip`` is True. + Clean strings if ``clean`` is True. The po text lines looks like this: - Blocks sperated by 2 lines. @@ -143,12 +144,57 @@ def parse_po_text(po_text, strip=False): elif line.startswith('"'): strings.append(line) - strings = [l.strip('"').replace("\\n", "\n") for l in strings] + strings = [l.strip('"') for l in strings] string = "".join(strings) - if strip: - string = string.strip() + if clean: + string = clean_string(string) + if string: + yield dict(line_numbers=line_numbers, string=string) - yield dict(line_numbers=line_numbers, string=string) + +def clean_string(s): + """ + Return a cleaned and normalized string or None. + """ + s = s.strip('"') + s = s.replace("\\n", "\n") + s = s.strip() + non_printables = { + "\\a": "\a", + "\\b": "\b", + "\\v": "\v", + "\\f": "\f", + "\\x01": "\x01", + "\\x02": "\x02", + "\\x03": "\x03", + "\\x04": "\x04", + "\\x05": "\x05", + "\\x06": "\x06", + "\\x0e": "\x0e", + "\\x0f": "\x0f", + "\\x10": "\x10", + "\\x11": "\x11", + "\\x12": "\x12", + "\\x13": "\x13", + "\\x14": "\x14", + "\\x15": "\x15", + "\\x16": "\x16", + "\\x17": "\x17", + "\\x18": "\x18", + "\\x19": "\x19", + "\\x1a": "\x1a", + "\\x1b": "\x1b", + "\\x1c": "\x1c", + "\\x1d": "\x1d", + "\\x1e": "\x1e", + "\\x1f": "\x1f", + "\\x7f": "\x7f", + } + + for plain, encoded in non_printables.items(): + s = s.replace(plain, "") + s = s.replace(encoded, "") + return s _IS_XGETTEXT_INSTALLED = None diff --git a/tests/data/strings_xgettext/lineedit.c-expected.json b/tests/data/strings_xgettext/lineedit.c-expected.json index 0e28c53..790f512 100644 --- a/tests/data/strings_xgettext/lineedit.c-expected.json +++ b/tests/data/strings_xgettext/lineedit.c-expected.json @@ -4,12 +4,6 @@ "path": "lineedit.c", "type": "file", "source_strings": [ - { - "line_numbers": [ - 126 - ], - "string": "\u001b" - }, { "line_numbers": [ 128, @@ -24,14 +18,6 @@ ], "string": "HOME" }, - { - "line_numbers": [ - 275, - 1166, - 2858 - ], - "string": "" - }, { "line_numbers": [ 454, @@ -79,12 +65,6 @@ ], "string": "." }, - { - "line_numbers": [ - 905 - ], - "string": "" - }, { "line_numbers": [ 1000 @@ -200,7 +180,7 @@ "line_numbers": [ 3052 ], - "string": "\\\\[\\\\033[32;1m\\\\]\\\\u@\\\\[\\\\x1b[33;1m\\\\]\\\\h:\\\\[\\\\033[34;1m\\\\]\\\\w\\\\[\\\\033[35;1m\\\\] \\\\!\\\\[\\\\e[36;1m\\\\]\\\\$ \\\\[\\\\E[m\\\\]" + "string": "\\\\[\\\\033[32;1m\\\\]\\\\u@\\\\[\\[33;1m\\\\]\\\\h:\\\\[\\\\033[34;1m\\\\]\\\\w\\\\[\\\\033[35;1m\\\\] \\\\!\\\\[\\\\e[36;1m\\\\]\\\\$ \\\\[\\\\E[m\\\\]" }, { "line_numbers": [ diff --git a/tests/test_symbols_xgettext.py b/tests/test_symbols_xgettext.py index 01fad31..1e952ac 100644 --- a/tests/test_symbols_xgettext.py +++ b/tests/test_symbols_xgettext.py @@ -50,7 +50,7 @@ def test_parse_po_text(self): msgid "Collect source symbols using Universal ctags." msgstr "" """ - results = list(parse_po_text(test)) + results = list(parse_po_text(test, clean=False)) expected = [ { "line_numbers": [ @@ -80,7 +80,7 @@ def test_parse_po_text(self): assert results == expected - results = list(parse_po_text(test, strip=True)) + results = list(parse_po_text(test, clean=True)) expected = [ { "line_numbers": [