Merge pull request #16 from nexB/clean-strings

Improve xgettext handling
aboutcode-org · Mar 19, 2024 · 002ead1 · 002ead1
2 parents 364314a + 9b74543
commit 002ead1
Show file tree

Hide file tree

Showing 11 changed files with 3,565 additions and 63 deletions.
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -27,48 +27,3 @@ jobs:
                  sudo apt-get install universal-ctags gettext
                  venv/bin/pytest -n 2 -vvs
 
-    - template: etc/ci/azure-posix.yml
-      parameters:
-          job_name: macos11_cpython
-          image_name: macOS-11
-          python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
-          test_suites:
-              all: |
-                 brew install universal-ctags gettext
-                 venv/bin/pytest -n 2 -vvs
-
-    - template: etc/ci/azure-posix.yml
-      parameters:
-          job_name: macos12_cpython
-          image_name: macOS-12
-          python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
-          test_suites:
-              all: |
-                 brew install universal-ctags gettext
-                 venv/bin/pytest -n 2 -vvs
-
-    - template: etc/ci/azure-posix.yml
-      parameters:
-          job_name: macos13_cpython
-          image_name: macOS-13
-          python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
-          test_suites:
-              all: |
-                 brew install universal-ctags gettext
-                 venv/bin/pytest -n 2 -vvs
-
-# - template: etc/ci/azure-win.yml
-#   parameters:
-#       job_name: win2019_cpython
-#       image_name: windows-2019
-#       python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
-#       test_suites:
-#           all: venv\Scripts\pytest -n 2 -vvs
-#
-# - template: etc/ci/azure-win.yml
-#   parameters:
-#       job_name: win2022_cpython
-#       image_name: windows-2022
-#       python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
-#       test_suites:
-#           all: venv\Scripts\pytest -n 2 -vvs
diff --git a/setup.cfg b/setup.cfg
@@ -44,6 +44,7 @@ install_requires =
     scancode-toolkit
     plugincode
     commoncode
+    typecode
 
 [options.packages.find]
 where = src

diff --git a/src/source_inspector/strings_xgettext.py b/src/source_inspector/strings_xgettext.py
@@ -9,13 +9,15 @@
 #
 
 import logging
+import string
 
 import attr
 from commoncode import command
 from commoncode.cliutils import SCAN_GROUP
 from commoncode.cliutils import PluggableCommandLineOption
 from plugincode.scan import ScanPlugin
 from plugincode.scan import scan_impl
+from typecode.contenttype import Type
 
 """
 Extract strings from source code files with xgettext.
@@ -55,36 +57,50 @@ def get_source_strings(location, **kwargs):
     """
     Return a mapping of strings for a source file at ``location``.
     """
-    return dict(source_strings=list(collect_strings(location=location, strip=True)))
+    return dict(source_strings=list(collect_strings(location=location, clean=True)))
 
 
-def collect_strings(location, strip=False):
+def collect_strings(location, clean=True):
     """
     Yield mappings of strings collected from file at location.
-    Strip strings if ``strip`` is True.
+    Clean strings if ``clean`` is True.
     """
     if not is_xgettext_installed():
         return
 
+    if not Type(location).is_source:
+        return
+
     rc, result, err = command.execute(
         cmd_loc="xgettext",
-        args=["--omit-header", "--no-wrap", "--extract-all", "--output=-", location],
+        args=[
+            # this is a trick to force getting UTF back
+            # see https://github.com/nexB/source-inspector/issues/14#issuecomment-2001893496
+            '--copyright-holder="ø"',
+            "--no-wrap",
+            "--extract-all",
+            "--from-code=UTF-8",
+            "--output=-",
+            location,
+        ],
         to_files=False,
     )
 
     if rc != 0:
         raise Exception(open(err).read())
 
-    yield from parse_po_text(po_text=result, strip=strip)
+    yield from parse_po_text(po_text=result, drop_header=True, clean=clean)
 
 
-def parse_po_text(po_text, strip=False):
+def parse_po_text(po_text, drop_header=False, clean=True):
     """
     Yield mappings of strings collected from the ``po_text`` string.
-    Strip strings if ``strip`` is True.
+    Clean strings if ``clean`` is True.
+    Drop the first "header" block if ``drop_header`` is True.
 
     The po text lines looks like this:
-    - Blocks sperated by 2 lines.
+    - Blocks separated by 2 lines.
+    - Optional first header block
     - The first lines starting with #: are comments with the line numbers.
     - The lines starting with #, are flags, not interesting
     - We care about the lines in the middle starting with the first msgid
@@ -104,14 +120,25 @@ def parse_po_text(po_text, strip=False):
     msgstr ""
     """
 
-    for chunk in po_text.split("\n\n"):
-        lines = chunk.splitlines(False)
+    blocks = po_text.split("\n\n")
+    if drop_header:
+        # drop the first block which is the header
+        blocks = blocks[1:]
+
+    for block in blocks:
+        lines = block.splitlines(False)
         line_numbers = []
         strings = []
         for line in lines:
             if line.startswith("#: "):
-                _, _, start_line = line.rpartition(":")
-                line_numbers.append(int(start_line.strip()))
+                # we can have either of these two forms:
+                # #: lineedit.c:1571 lineedit.c:1587 lineedit.c:163
+                # #: lineedit.c:1571
+                _, _, line = line.partition("#: ")
+                filename, _, _ = line.partition(":")
+                numbers = line.replace(filename + ":", "")
+                numbers = [int(l) for ln in numbers.split() if (l := ln.strip())]
+                line_numbers.extend(numbers)
 
             elif line.startswith(
                 (
@@ -130,12 +157,57 @@ def parse_po_text(po_text, strip=False):
             elif line.startswith('"'):
                 strings.append(line)
 
-        strings = [l.strip('"').replace("\\n", "\n") for l in strings]
+        strings = [l.strip('"') for l in strings]
         string = "".join(strings)
-        if strip:
-            string = string.strip()
+        if clean:
+            string = clean_string(string)
+        if string:
+            yield dict(line_numbers=line_numbers, string=string)
 
-        yield dict(line_numbers=line_numbers, string=string)
+
+def clean_string(s):
+    """
+    Return a cleaned and normalized string (possibly empty).
+    """
+    s = s.strip('"')
+    s = s.replace("\\n", "\n")
+    s = s.strip()
+    non_printables = {
+        "\\a": "\a",
+        "\\b": "\b",
+        "\\v": "\v",
+        "\\f": "\f",
+        "\\x01": "\x01",
+        "\\x02": "\x02",
+        "\\x03": "\x03",
+        "\\x04": "\x04",
+        "\\x05": "\x05",
+        "\\x06": "\x06",
+        "\\x0e": "\x0e",
+        "\\x0f": "\x0f",
+        "\\x10": "\x10",
+        "\\x11": "\x11",
+        "\\x12": "\x12",
+        "\\x13": "\x13",
+        "\\x14": "\x14",
+        "\\x15": "\x15",
+        "\\x16": "\x16",
+        "\\x17": "\x17",
+        "\\x18": "\x18",
+        "\\x19": "\x19",
+        "\\x1a": "\x1a",
+        "\\x1b": "\x1b",
+        "\\x1c": "\x1c",
+        "\\x1d": "\x1d",
+        "\\x1e": "\x1e",
+        "\\x1f": "\x1f",
+        "\\x7f": "\x7f",
+    }
+
+    for plain, encoded in non_printables.items():
+        s = s.replace(plain, "")
+        s = s.replace(encoded, "")
+    return s
 
 
 _IS_XGETTEXT_INSTALLED = None

diff --git a/src/source_inspector/symbols_ctags.py b/src/source_inspector/symbols_ctags.py
@@ -17,6 +17,7 @@
 from commoncode.cliutils import PluggableCommandLineOption
 from plugincode.scan import ScanPlugin
 from plugincode.scan import scan_impl
+from typecode.contenttype import Type
 
 """
 Extract symbols information from source code files with ctags.
@@ -67,6 +68,9 @@ def collect_symbols(location):
     if not is_ctags_installed():
         return
 
+    if not Type(location).is_source:
+        return
+
     rc, result, err = command.execute(
         cmd_loc="ctags",
         args=["--output-format=json", "-f", "-", location],

diff --git a/tests/data/strings_xgettext/fdisk.c b/tests/data/strings_xgettext/fdisk.c
@@ -0,0 +1,32 @@
+/* vi: set sw=4 ts=4: */
+/*
+ * fdisk.c -- Partition table manipulator for Linux.
+ *
+ * Copyright (C) 1992  A. V. Le Blanc ([email protected])
+ * Copyright (C) 2001,2002 Vladimir Oleynik <[email protected]> (initial bb port)
+ *
+ * Licensed under GPLv2 or later, see file LICENSE in this source tree.
+ */
+//applet:IF_FDISK(APPLET(fdisk, BB_DIR_SBIN, BB_SUID_DROP))
+
+//kbuild:lib-$(CONFIG_FDISK) += fdisk.o
+
+/* Looks like someone forgot to add this to config system */
+//usage:#ifndef ENABLE_FEATURE_FDISK_BLKSIZE
+#include "libbb.h"
+#include "unicode.h"
+
+static void list_types(const char *const *sys);
+	"\x0a" "OS/2 Boot Manager",/* OS/2 Boot Manager */
+	"\x42" "SFS",
+	"\x63" "GNU HURD or SysV", /* GNU HURD or Mach or Sys V/386 (such as ISC UNIX) */
+	"\x80" "Old Minix",        /* Minix 1.4a and earlier */
+	"\x81" "Minix / old Linux",/* Minix 1.4b and later */
+	"\x82" "Linux swap",       /* also Solaris */
+	"\x83" "Linux",
+	"\x84" "OS/2 hidden C: drive",
+	"\x85" "Linux extended",
+	"\x86" "NTFS volume set",
+	"\x87" "NTFS volume set",
+	"\x8e" "Linux LVM",
+	"\x9f" "BSD/OS",           /* BSDI */
diff --git a/tests/data/strings_xgettext/fdisk.c-expected.json b/tests/data/strings_xgettext/fdisk.c-expected.json
@@ -0,0 +1,89 @@
+{
+  "files": [
+    {
+      "path": "fdisk.c",
+      "type": "file",
+      "source_strings": [
+        {
+          "line_numbers": [
+            20
+          ],
+          "string": "OS/2 Boot Manager"
+        },
+        {
+          "line_numbers": [
+            21
+          ],
+          "string": "BSFS"
+        },
+        {
+          "line_numbers": [
+            22
+          ],
+          "string": "cGNU HURD or SysV"
+        },
+        {
+          "line_numbers": [
+            23
+          ],
+          "string": "Old Minix"
+        },
+        {
+          "line_numbers": [
+            24
+          ],
+          "string": "Minix / old Linux"
+        },
+        {
+          "line_numbers": [
+            25
+          ],
+          "string": "Linux swap"
+        },
+        {
+          "line_numbers": [
+            26
+          ],
+          "string": "Linux"
+        },
+        {
+          "line_numbers": [
+            27
+          ],
+          "string": "OS/2 hidden C: drive"
+        },
+        {
+          "line_numbers": [
+            28
+          ],
+          "string": "Linux extended"
+        },
+        {
+          "line_numbers": [
+            29
+          ],
+          "string": "NTFS volume set"
+        },
+        {
+          "line_numbers": [
+            30
+          ],
+          "string": "NTFS volume set"
+        },
+        {
+          "line_numbers": [
+            31
+          ],
+          "string": "Linux LVM"
+        },
+        {
+          "line_numbers": [
+            32
+          ],
+          "string": "BSD/OS"
+        }
+      ],
+      "scan_errors": []
+    }
+  ]
+}
diff --git a/tests/data/strings_xgettext/fdisk.c.ABOUT b/tests/data/strings_xgettext/fdisk.c.ABOUT
@@ -0,0 +1,3 @@
+date: 2024-03-16
+download_url: https://git.busybox.net/busybox/plain/util-linux/fdisk.c?h=1_35_stable
+notes: stripped down to the few lines we care about