Skip to content

Commit

Permalink
Add tree-sitter plugin for symbol and string collection
Browse files Browse the repository at this point in the history
Signed-off-by: Keshav Priyadarshi <[email protected]>
  • Loading branch information
keshav-space committed Apr 24, 2024
1 parent 71d6ea2 commit fa30882
Showing 1 changed file with 158 additions and 0 deletions.
158 changes: 158 additions & 0 deletions src/source_inspector/symbols_strings_tree_shitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/source-inspector for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import importlib
import logging

import attr
from commoncode.cliutils import SCAN_GROUP
from commoncode.cliutils import PluggableCommandLineOption
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl
from tree_sitter import Language
from tree_sitter import Parser
from typecode.contenttype import Type

# Tracing flags: set either one to True to send debug messages to stdout.
TRACE = False
TRACE_LIGHT = False


def logger_debug(*args):
    """Discard ``args``: this no-op is the implementation used when tracing is off."""


if TRACE or TRACE_LIGHT:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """Log ``args`` at DEBUG level as a single space-separated message."""
        # Strings are logged as-is and anything else through repr(). A
        # conditional expression is used rather than ``and ... or ...`` so
        # that an empty string stays empty instead of being logged as ''.
        return logger.debug(" ".join(a if isinstance(a, str) else repr(a) for a in args))


"""
Extract symbols and strings information from source code files with tree-sitter.
See https://tree-sitter.github.io/
"""


@scan_impl
class TSSymbolAndStringScannerPlugin(ScanPlugin):
    """
    Scan plugin that collects the symbols and strings of a source file using
    tree-sitter.
    """

    # Attributes this plugin declares for a resource; each defaults to a new
    # empty list and is left out of the repr.
    resource_attributes = {
        "source_symbols": attr.ib(default=attr.Factory(list), repr=False),
        "source_strings": attr.ib(default=attr.Factory(list), repr=False),
    }

    options = [
        PluggableCommandLineOption(
            ("--source-symbol-string",),
            is_flag=True,
            default=False,
            help="Collect source symbols and strings using tree-sitter.",
            help_group=SCAN_GROUP,
            sort_order=100,
        ),
    ]

    def is_enabled(self, source_symbol_string, **kwargs):
        """Return ``source_symbol_string`` unchanged; other keyword arguments are ignored."""
        enabled = source_symbol_string
        return enabled

    def get_scanner(self, **kwargs):
        """Return the ``get_symbols_and_strings`` function used to scan one location."""
        scanner = get_symbols_and_strings
        return scanner


def get_symbols_and_strings(location, **kwargs):
    """
    Return a mapping with the ``source_symbols`` and ``source_strings`` lists
    collected from the source file at ``location``.

    Extra keyword arguments are accepted and ignored.
    """
    found_symbols, found_strings = collect_symbols_and_strings(location=location)
    return {
        "source_symbols": found_symbols,
        "source_strings": found_strings,
    }


def collect_symbols_and_strings(location):
    """
    Return a ``(symbols, strings)`` tuple of lists collected from the file at
    ``location``.

    Both lists are empty when ``get_parser`` returns nothing for this file.
    """
    symbols = []
    strings = []

    parser_result = get_parser(location)
    if not parser_result:
        return symbols, strings

    parser, string_id = parser_result

    # The file is read as bytes and handed to the parser as-is.
    with open(location, "rb") as source_file:
        source_bytes = source_file.read()

    parsed_tree = parser.parse(source_bytes)
    traverse(parsed_tree.root_node, symbols, strings, string_id)

    return symbols, strings


def get_parser(location):
    """
    Return a ``(parser, string_id)`` tuple for the file at ``location``.

    ``parser`` is a tree-sitter ``Parser`` set to the grammar of the detected
    programming language, and ``string_id`` is the node type that
    ``TS_LANGUAGE_WHEELS`` lists for strings in that language.

    Return None when no programming language is detected for the file or when
    the language has no entry in ``TS_LANGUAGE_WHEELS``.

    Raise TreeSitterWheelNotInstalled when the grammar package for the
    language cannot be imported.
    """
    language = Type(location).programming_language
    # A single lookup replaces the membership test followed by two indexings.
    language_info = TS_LANGUAGE_WHEELS.get(language) if language else None
    if not language_info:
        return None

    wheel = language_info["wheel"]
    string_id = language_info["string_id"]

    try:
        grammar = importlib.import_module(wheel)
    except ModuleNotFoundError as err:
        # Chain explicitly so the original import failure is reported as the
        # direct cause of this error.
        raise TreeSitterWheelNotInstalled(f"{wheel} package is not installed") from err

    ts_language = Language(grammar.language(), language)
    parser = Parser()
    parser.set_language(ts_language)

    return parser, string_id


def traverse(node, symbols, strings, string_id, depth=0):
    """
    Walk the parse tree rooted at ``node`` and collect symbols and strings.

    The text of every node whose type is "identifier" is appended to
    ``symbols``. The text of every node whose type equals ``string_id`` is
    appended to ``strings`` with all double quote characters removed. Nodes
    are visited in pre-order (a node before its children, children from first
    to last) and the children of every node are visited, including those of
    matched nodes.

    ``symbols`` and ``strings`` are lists modified in place; nothing is
    returned. ``depth`` is unused and kept only so existing callers that pass
    it keep working.
    """
    # An explicit stack is used instead of recursion so that a deeply nested
    # parse tree cannot exceed the interpreter recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()

        # Bytes that are not valid UTF-8 are replaced rather than raising, so
        # one badly encoded token does not abort the whole collection.
        if current.type == "identifier":
            symbols.append(current.text.decode("utf8", errors="replace"))
        elif current.type == string_id:
            text = current.text.decode("utf8", errors="replace")
            strings.append(text.replace('"', ""))

        # Children are pushed in reverse so they are popped first to last.
        pending.extend(reversed(current.children))


# Supported languages, keyed by the programming language name that
# get_parser() reads from ``Type(location).programming_language``.
# Each entry holds:
# - "wheel": name of the Python module imported to obtain the tree-sitter
#   grammar for the language.
# - "string_id": parse tree node type whose text traverse() collects as a
#   string.
# NOTE(review): only one string node type is matched per language, so the
# "raw_string"/"raw_string_literal" entries presumably miss ordinary string
# literals in those languages; confirm against each grammar.
TS_LANGUAGE_WHEELS = {
    "Bash": {"wheel": "tree_sitter_bash", "string_id": "raw_string"},
    "C": {"wheel": "tree_sitter_c", "string_id": "string_literal"},
    "C++": {"wheel": "tree_sitter_cpp", "string_id": "raw_string_literal"},
    "Go": {"wheel": "tree_sitter_go", "string_id": "raw_string_literal"},
    "Java": {"wheel": "tree_sitter_java", "string_id": "string_literal"},
    "JavaScript": {"wheel": "tree_sitter_javascript", "string_id": "string"},
    "Python": {"wheel": "tree_sitter_python", "string_id": "string"},
    "Rust": {"wheel": "tree_sitter_rust", "string_id": "raw_string_literal"},
}


class TreeSitterWheelNotInstalled(Exception):
    """Raised when the tree-sitter grammar package for a language cannot be imported."""

0 comments on commit fa30882

Please sign in to comment.