Skip to content

Commit

Permalink
Merge pull request #16 from nexB/clean-strings
Browse files Browse the repository at this point in the history
Improve xgettext handlings
  • Loading branch information
keshav-space authored Mar 19, 2024
2 parents 364314a + 9b74543 commit 002ead1
Show file tree
Hide file tree
Showing 11 changed files with 3,565 additions and 63 deletions.
45 changes: 0 additions & 45 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,48 +27,3 @@ jobs:
sudo apt-get install universal-ctags gettext
venv/bin/pytest -n 2 -vvs
- template: etc/ci/azure-posix.yml
parameters:
job_name: macos11_cpython
image_name: macOS-11
python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
test_suites:
all: |
brew install universal-ctags gettext
venv/bin/pytest -n 2 -vvs
- template: etc/ci/azure-posix.yml
parameters:
job_name: macos12_cpython
image_name: macOS-12
python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
test_suites:
all: |
brew install universal-ctags gettext
venv/bin/pytest -n 2 -vvs
- template: etc/ci/azure-posix.yml
parameters:
job_name: macos13_cpython
image_name: macOS-13
python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
test_suites:
all: |
brew install universal-ctags gettext
venv/bin/pytest -n 2 -vvs
# - template: etc/ci/azure-win.yml
# parameters:
# job_name: win2019_cpython
# image_name: windows-2019
# python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
# test_suites:
# all: venv\Scripts\pytest -n 2 -vvs
#
# - template: etc/ci/azure-win.yml
# parameters:
# job_name: win2022_cpython
# image_name: windows-2022
# python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
# test_suites:
# all: venv\Scripts\pytest -n 2 -vvs
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ install_requires =
scancode-toolkit
plugincode
commoncode
typecode

[options.packages.find]
where = src
Expand Down
104 changes: 88 additions & 16 deletions src/source_inspector/strings_xgettext.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
#

import logging
import string

import attr
from commoncode import command
from commoncode.cliutils import SCAN_GROUP
from commoncode.cliutils import PluggableCommandLineOption
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl
from typecode.contenttype import Type

"""
Extract strings from source code files with xgettext.
Expand Down Expand Up @@ -55,36 +57,50 @@ def get_source_strings(location, **kwargs):
"""
Return a mapping of strings for a source file at ``location``.
"""
return dict(source_strings=list(collect_strings(location=location, strip=True)))
return dict(source_strings=list(collect_strings(location=location, clean=True)))


def collect_strings(location, strip=False):
def collect_strings(location, clean=True):
"""
Yield mappings of strings collected from file at location.
Strip strings if ``strip`` is True.
Clean strings if ``clean`` is True.
"""
if not is_xgettext_installed():
return

if not Type(location).is_source:
return

rc, result, err = command.execute(
cmd_loc="xgettext",
args=["--omit-header", "--no-wrap", "--extract-all", "--output=-", location],
args=[
# this is a trick to force getting UTF back
# see https://github.com/nexB/source-inspector/issues/14#issuecomment-2001893496
'--copyright-holder="ø"',
"--no-wrap",
"--extract-all",
"--from-code=UTF-8",
"--output=-",
location,
],
to_files=False,
)

if rc != 0:
raise Exception(open(err).read())

yield from parse_po_text(po_text=result, strip=strip)
yield from parse_po_text(po_text=result, drop_header=True, clean=clean)


def parse_po_text(po_text, strip=False):
def parse_po_text(po_text, drop_header=False, clean=True):
"""
Yield mappings of strings collected from the ``po_text`` string.
Strip strings if ``strip`` is True.
Clean strings if ``clean`` is True.
Drop the "header" first block if ``drop_header`` is True
The po text lines looks like this:
- Blocks sperated by 2 lines.
- Blocks separated by 2 lines.
- Optional first header block
- The first lines starting with #: are comments with the line numbers.
- The lines starting with #, are flags, not interesting
- We care about the lines in the middle starting with the first msgid
Expand All @@ -104,14 +120,25 @@ def parse_po_text(po_text, strip=False):
msgstr ""
"""

for chunk in po_text.split("\n\n"):
lines = chunk.splitlines(False)
blocks = po_text.split("\n\n")
if drop_header:
# drop the first block which is the header
blocks = blocks[1:]

for block in blocks:
lines = block.splitlines(False)
line_numbers = []
strings = []
for line in lines:
if line.startswith("#: "):
_, _, start_line = line.rpartition(":")
line_numbers.append(int(start_line.strip()))
# we can have either of these two forms:
# #: lineedit.c:1571 lineedit.c:1587 lineedit.c:163
# #: lineedit.c:1571
_, _, line = line.partition("#: ")
filename, _, _ = line.partition(":")
numbers = line.replace(filename + ":", "")
numbers = [int(l) for ln in numbers.split() if (l := ln.strip())]
line_numbers.extend(numbers)

elif line.startswith(
(
Expand All @@ -130,12 +157,57 @@ def parse_po_text(po_text, strip=False):
elif line.startswith('"'):
strings.append(line)

strings = [l.strip('"').replace("\\n", "\n") for l in strings]
strings = [l.strip('"') for l in strings]
string = "".join(strings)
if strip:
string = string.strip()
if clean:
string = clean_string(string)
if string:
yield dict(line_numbers=line_numbers, string=string)

yield dict(line_numbers=line_numbers, string=string)

# Pairs of (escaped text form, raw control character) removed by clean_string().
# Built once at import time. The order matches the historical replacement
# order: \a, \b, \v, \f, then \x01-\x06, \x0e-\x1f and \x7f.
_NON_PRINTABLES = (
    ("\\a", "\a"),
    ("\\b", "\b"),
    ("\\v", "\v"),
    ("\\f", "\f"),
) + tuple(
    (f"\\x{code:02x}", chr(code))
    for code in (*range(0x01, 0x07), *range(0x0E, 0x20), 0x7F)
)


def clean_string(s):
    """
    Return a cleaned and normalized string built from the ``s`` string.

    The cleaning steps are:
    - strip surrounding double quotes,
    - convert the two-character escaped newline (backslash + n) to a real newline,
    - remove a fixed set of non-printable control characters, both as raw
      characters and as their escaped text form (such as backslash + x01),
    - strip leading and trailing whitespace.

    The returned string may be empty if nothing is left after cleaning.
    """
    s = s.strip('"')
    s = s.replace("\\n", "\n")
    s = s.strip()

    for escaped, raw in _NON_PRINTABLES:
        s = s.replace(escaped, "")
        s = s.replace(raw, "")

    # Strip again: removing a control character can expose whitespace that
    # was not at the edges before, e.g. "\x01 text" would become " text" and
    # "\x01 \x02" would become a whitespace-only string.
    return s.strip()


_IS_XGETTEXT_INSTALLED = None
Expand Down
4 changes: 4 additions & 0 deletions src/source_inspector/symbols_ctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from commoncode.cliutils import PluggableCommandLineOption
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl
from typecode.contenttype import Type

"""
Extract symbols information from source code files with ctags.
Expand Down Expand Up @@ -67,6 +68,9 @@ def collect_symbols(location):
if not is_ctags_installed():
return

if not Type(location).is_source:
return

rc, result, err = command.execute(
cmd_loc="ctags",
args=["--output-format=json", "-f", "-", location],
Expand Down
32 changes: 32 additions & 0 deletions tests/data/strings_xgettext/fdisk.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/* vi: set sw=4 ts=4: */
/*
* fdisk.c -- Partition table manipulator for Linux.
*
* Copyright (C) 1992 A. V. Le Blanc ([email protected])
* Copyright (C) 2001,2002 Vladimir Oleynik <[email protected]> (initial bb port)
*
* Licensed under GPLv2 or later, see file LICENSE in this source tree.
*/
//applet:IF_FDISK(APPLET(fdisk, BB_DIR_SBIN, BB_SUID_DROP))

//kbuild:lib-$(CONFIG_FDISK) += fdisk.o

/* Looks like someone forgot to add this to config system */
//usage:#ifndef ENABLE_FEATURE_FDISK_BLKSIZE
#include "libbb.h"
#include "unicode.h"

static void list_types(const char *const *sys);
"\x0a" "OS/2 Boot Manager",/* OS/2 Boot Manager */
"\x42" "SFS",
"\x63" "GNU HURD or SysV", /* GNU HURD or Mach or Sys V/386 (such as ISC UNIX) */
"\x80" "Old Minix", /* Minix 1.4a and earlier */
"\x81" "Minix / old Linux",/* Minix 1.4b and later */
"\x82" "Linux swap", /* also Solaris */
"\x83" "Linux",
"\x84" "OS/2 hidden C: drive",
"\x85" "Linux extended",
"\x86" "NTFS volume set",
"\x87" "NTFS volume set",
"\x8e" "Linux LVM",
"\x9f" "BSD/OS", /* BSDI */
89 changes: 89 additions & 0 deletions tests/data/strings_xgettext/fdisk.c-expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{
"files": [
{
"path": "fdisk.c",
"type": "file",
"source_strings": [
{
"line_numbers": [
20
],
"string": "OS/2 Boot Manager"
},
{
"line_numbers": [
21
],
"string": "BSFS"
},
{
"line_numbers": [
22
],
"string": "cGNU HURD or SysV"
},
{
"line_numbers": [
23
],
"string": "Old Minix"
},
{
"line_numbers": [
24
],
"string": "Minix / old Linux"
},
{
"line_numbers": [
25
],
"string": "Linux swap"
},
{
"line_numbers": [
26
],
"string": "Linux"
},
{
"line_numbers": [
27
],
"string": "OS/2 hidden C: drive"
},
{
"line_numbers": [
28
],
"string": "Linux extended"
},
{
"line_numbers": [
29
],
"string": "NTFS volume set"
},
{
"line_numbers": [
30
],
"string": "NTFS volume set"
},
{
"line_numbers": [
31
],
"string": "Linux LVM"
},
{
"line_numbers": [
32
],
"string": "BSD/OS"
}
],
"scan_errors": []
}
]
}
3 changes: 3 additions & 0 deletions tests/data/strings_xgettext/fdisk.c.ABOUT
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
date: 2024-03-16
download_url: https://git.busybox.net/busybox/plain/util-linux/fdisk.c?h=1_35_stable
notes: stripped down to the few lines we care for
Loading

0 comments on commit 002ead1

Please sign in to comment.