#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
"""
Utilities to analyze text. Files are the input.
Once a file is read, its output is unicode text lines.
All internal processing assumes unicode in and out.
"""

import io
import json
import os
import re
import unicodedata

import chardet
import typecode

from textcode import pdf
from textcode import markup
from textcode import sfdb
from textcode import strings

# Tracing flags
TRACE = False or os.environ.get('SCANCODE_DEBUG_TEXT_ANALYSIS', False)


def logger_debug(*args):
    pass


if TRACE:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))

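# Usage sketch: tracing is switched on from the environment before a run,
# e.g. (the scan target path below is hypothetical):
#
#   SCANCODE_DEBUG_TEXT_ANALYSIS=1 scancode --license some/dir
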
def numbered_text_lines(
    location,
    demarkup=False,
    plain_text=False,
    start_line=1,
):
    """
    Yield tuples of (line number, text line) from the file at ``location``.
    Return an empty iterator if no text content is extractable. Text
    extraction is based on the detected file type. Long lines are broken down
    into chunks, therefore two items can have the same line number.

    Line numbers start at ``start_line``, which is 1 by default.

    If ``demarkup`` is True, attempt to detect if a file contains HTML/XML-like
    markup and clean up this markup.

    If ``plain_text`` is True, treat the file as a plain text file and do not
    attempt to detect its type and extract its content with special
    procedures. This is used mostly when loading license texts and rules.

    Note: for testing or building from strings, ``location`` can be a list of
    unicode line strings.
    """
    if not location:
        return iter([])

    if not isinstance(location, str):
        # not a path: wrap an iterator on location which should be a sequence
        # of lines
        if TRACE:
            logger_debug('numbered_text_lines:', 'location is not a file')
        return enumerate(iter(location), start_line)

    if plain_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'plain_text')
        return enumerate(unicode_text_lines(location), start_line)

    T = typecode.get_type(location)

    if TRACE:
        logger_debug('numbered_text_lines: T.filetype_file:', T.filetype_file)
        logger_debug('numbered_text_lines: T.is_text_with_long_lines:', T.is_text_with_long_lines)
        logger_debug('numbered_text_lines: T.is_binary:', T.is_binary)

    # TODO: we should have a command line option to force digging inside binaries
    if not T.contains_text:
        return iter([])

    # Should we read this as some markup, PDF, office doc, text or binary?
    if T.is_pdf and T.is_pdf_with_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_pdf')
        return enumerate(unicode_text_lines_from_pdf(location), start_line)

    if T.filetype_file.startswith('Spline Font Database'):
        if TRACE:
            logger_debug('numbered_text_lines:', 'Spline Font Database')
        return enumerate(
            (as_unicode(l) for l in sfdb.get_text_lines(location)),
            start_line,
        )

    # lightweight markup stripping support
    if demarkup and markup.is_markup(location):
        try:
            numbered_lines = list(enumerate(markup.demarkup(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'demarkup')
            numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
            if TRACE:
                logger_debug('numbered_text_lines demarkup:', 'break_numbered_unicode_text_lines')
            return numbered_lines
        except Exception:
            # on failure, fall through and treat the file as plain text
            pass

    if T.is_js_map:
        try:
            numbered_lines = list(enumerate(js_map_sources_lines(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'js_map')
            return numbered_lines
        except Exception:
            # on failure, fall through and treat the file as plain text
            pass

    if T.is_text:
        lines = unicode_text_lines(location=location, decrlf=is_source(location))
        numbered_lines = enumerate(lines, start_line)
        # text with very long lines such as minified JS, JS map files or large JSON
        if (
            not location.endswith('package.json')
            and (
                T.is_text_with_long_lines or T.is_compact_js
                or T.filetype_file == 'data' or 'locale' in location
            )
        ):
            numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
            if TRACE:
                logger_debug('numbered_text_lines:', 'break_numbered_unicode_text_lines')
        return numbered_lines

    # TODO: handle Office-like documents, RTF, etc
    # if T.is_doc:
    #     return unicode_text_lines_from_doc(location)

    # TODO: add support for "wide" UTF-16-like strings where each char is
    # followed by a zero as is often found in some Windows binaries. Do this
    # for binaries only. This may conflict with "strings" extraction as
    # currently implemented.
    if T.is_binary:
        # fall back to binary strings extraction
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_binary')
        return enumerate(unicode_text_lines_from_binary(location), start_line)

    return iter([])

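# A minimal usage sketch (the file path is hypothetical):
#
#   for line_number, text in numbered_text_lines('src/foo.c'):
#       ...
#
# For tests, ``location`` may also be a list of unicode lines:
#
#   list(numbered_text_lines(['line one\n', 'line two\n']))
#   # -> [(1, 'line one\n'), (2, 'line two\n')]
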
def unicode_text_lines_from_binary(location):
    """
    Return an iterable over unicode text lines extracted from a binary file
    at location.
    """
    T = typecode.get_type(location)
    if T.contains_text:
        for line in strings.strings_from_file(location):
            yield remove_verbatim_cr_lf_tab_chars(line)

def unicode_text_lines_from_pdf(location):
    """
    Return an iterable over unicode text lines extracted from a PDF file at
    location.
    """
    for line in pdf.get_text_lines(location):
        yield as_unicode(line)

def break_numbered_unicode_text_lines(
    numbered_lines,
    split=u'([",\'])',
    max_len=200,
    chunk_len=30,
):
    """
    Yield text lines breaking long lines on `split` where numbered_lines is
    an iterator of (line number, line text).
    """
    splitter = re.compile(split).split
    for line_number, line in numbered_lines:
        if len(line) > max_len:
            # split then reassemble in more reasonable chunks
            splitted = splitter(line)
            chunks = (splitted[i:i + chunk_len] for i in range(0, len(splitted), chunk_len))
            for chunk in chunks:
                full_chunk = u''.join(chunk)
                if full_chunk:
                    yield line_number, full_chunk
        else:
            yield line_number, line

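# For instance, a single 600-character minified-JS-like line splits on quotes
# and commas and is re-yielded as several chunks that share the same line
# number (illustrative sketch):
#
#   numbered = iter([(1, '"a",' * 150)])
#   chunks = list(break_numbered_unicode_text_lines(numbered))
#   # every resulting item is (1, <chunk of the original line>)
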
def js_map_sources_lines(location):
    """
    Yield unicode text lines from the js.map or css.map file at `location`.
    Spec is at:
    https://docs.google.com/document/d/1U1RGAehQwRypUTovF1KRlpiOFze0b-_2gc6fAH0KY0k/edit
    The format is:
        {
            "version": 3,
            "file": "out.js",
            "sourceRoot": "",
            "sources": ["foo.js", "bar.js"],
            "sourcesContent": [null, null],
            "names": ["src", "maps", "are", "fun"],
            "mappings": "A,AAAB;;ABCDE;"
        }
    We care only about the presence of these tags for detection: version,
    sources, sourcesContent.
    """
    with io.open(location, encoding='utf-8') as jsm:
        content = json.load(jsm)
    sources = content.get('sourcesContent', [])
    for entry in sources:
        if not entry:
            # entries can be null, as in the spec example above
            continue
        entry = replace_verbatim_cr_lf_chars(entry)
        for line in entry.splitlines():
            yield remove_verbatim_cr_lf_tab_chars(line)

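# Usage sketch (the map file path is hypothetical): given a map file whose
# "sourcesContent" entries hold the original sources as JSON strings,
#
#   for line in js_map_sources_lines('bundle.js.map'):
#       ...
#
# each source line is yielded with escaped CR/LF sequences normalized away.
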
def as_unicode(line):
    """
    Return a unicode text line from a text line.
    Try to decode line as Unicode. Try first some default encodings,
    then attempt Unicode trans-literation and finally fall back to ASCII
    strings extraction.

    TODO: Add file/magic detection, UnicodeDammit/BS3/4
    """
    if isinstance(line, str):
        return remove_null_bytes(line)
    s = u''
    try:
        s = line.decode('UTF-8')
    except UnicodeDecodeError:
        try:
            # FIXME: latin-1 may never fail
            s = line.decode('LATIN-1')
        except UnicodeDecodeError:
            try:
                # Convert some byte string to ASCII characters as Unicode
                # including replacing accented characters with their
                # non-accented NFKD equivalent. Non ISO-Latin and non ASCII
                # characters are stripped from the output. Does not preserve
                # the original length offsets.
                # For Unicode NFKD equivalence, see:
                # http://en.wikipedia.org/wiki/Unicode_equivalence
                s = unicodedata.normalize('NFKD', line).encode('ASCII')
            except UnicodeDecodeError:
                try:
                    enc = chardet.detect(line)['encoding']
                    s = str(line, enc)
                except UnicodeDecodeError:
                    # fall back to strings extraction if all else fails:
                    # extract from the original bytes, not from the
                    # (still undecoded) ``s``
                    s = strings.string_from_string(line)
    return remove_null_bytes(s)

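# Decoding sketch: UTF-8 is tried first, so valid UTF-8 bytes round-trip
# cleanly, and str input passes through (with null bytes replaced):
#
#   as_unicode(b'caf\xc3\xa9')   # -> 'café'
#   as_unicode('already text')   # -> 'already text'
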
def remove_null_bytes(s):
    """
    Return a string with all null bytes replaced by a space.
    There are some rare cases where we can have binary strings that are not
    caught early when detecting a file type, but only late at the line level.
    This helps catch most of these cases.
    """
    return s.replace('\x00', ' ')

def remove_verbatim_cr_lf_tab_chars(s):
    """
    Return a string with any verbatim but escaped line endings and tabs
    (such as a literal \\r, \\n or \\t) replaced by a space.
    """
    return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')

def replace_verbatim_cr_lf_chars(s):
    """
    Return a string with any verbatim but escaped line endings
    (such as a literal \\r or \\n) replaced by an LF.
    """
    return (s
        .replace('\\\\r\\\\n', '\n')
        .replace('\\r\\n', '\n')
        .replace('\\\\r', '\n')
        .replace('\\\\n', '\n')
        .replace('\\r', '\n')
        .replace('\\n', '\n')
    )

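# For example, an escaped CRLF sequence becomes a single real LF:
#
#   replace_verbatim_cr_lf_chars('a\\r\\nb')   # -> 'a\nb'
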
def unicode_text_lines(location, decrlf=False):
    """
    Yield unicode text lines from a file at ``location`` if it contains text.
    Open the file as binary then try to decode each line as Unicode.
    Remove verbatim, escaped CR, LF and tabs if ``decrlf`` is True.
    """
    lines = _unicode_text_lines(location)
    if decrlf:
        return map(remove_verbatim_cr_lf_tab_chars, lines)
    else:
        return lines

def _unicode_text_lines(location):
    with open(location, 'rb') as f:
        for line in f.read().splitlines(True):
            yield as_unicode(line)

def unicode_text(location, decrlf=False):
    """
    Return a string guaranteed to be unicode from the content of the file at
    location. The whole file content is returned at once, which may be a
    problem for very large files.
    """
    return u' '.join(unicode_text_lines(location, decrlf=decrlf))

def is_source(location):
    """
    Return True if the file at location is source code, based on its file
    extension.
    """
    return location.endswith((
        '.ada',
        '.adb',
        '.asm',
        '.asp',
        '.aj',
        '.bas',
        '.bat',
        '.c',
        '.c++',
        '.cc',
        '.clj',
        '.cob',
        '.cpp',
        '.cs',
        '.csh',
        '.csx',
        '.cxx',
        '.d',
        '.e',
        '.el',
        '.f',
        '.fs',
        '.f77',
        '.f90',
        '.for',
        '.fth',
        '.ftn',
        '.go',
        '.h',
        '.hh',
        '.hpp',
        '.hs',
        '.html',
        '.htm',
        '.hxx',
        '.java',
        '.js',
        '.jsx',
        '.jsp',
        '.ksh',
        '.kt',
        '.lisp',
        '.lua',
        '.m',
        '.m4',
        '.nim',
        '.pas',
        '.php',
        '.pl',
        '.pp',
        '.ps1',
        '.py',
        '.r',
        '.rb',
        '.ruby',
        '.rs',
        '.s',
        '.scala',
        '.sh',
        '.swift',
        '.ts',
        '.vhdl',
        '.verilog',
        '.vb',
        '.groovy',
        '.po',
    ))

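# For example:
#
#   is_source('src/main.c')    # -> True
#   is_source('README.txt')    # -> False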