Skip to content

Commit

Permalink
Merge pull request #20 from nexB/pygments
Browse files Browse the repository at this point in the history
Add support for Pygments lexing
  • Loading branch information
keshav-space authored Apr 24, 2024
2 parents 71d6ea2 + 3f09a80 commit 0c36cb7
Show file tree
Hide file tree
Showing 11 changed files with 40,396 additions and 0 deletions.
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ install_requires =
plugincode
commoncode
typecode
pygments

[options.packages.find]
where = src
Expand All @@ -54,6 +55,7 @@ where = src
scancode_scan =
source_symbol = source_inspector.symbols_ctags:CtagsSymbolScannerPlugin
source_string = source_inspector.strings_xgettext:XgettextStringScannerPlugin
pygments_symbol_and_string = source_inspector.symbols_pygments:PygmentsSymbolsAndStringScannerPlugin


[options.extras_require]
Expand Down
1 change: 1 addition & 0 deletions src/source_inspector/strings_xgettext.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class XgettextStringScannerPlugin(ScanPlugin):
help="Collect source strings using xgettext.",
help_group=SCAN_GROUP,
sort_order=100,
conflicting_options=["treesitter_symbol_and_string", "pygments_symbol_and_string"],
),
]

Expand Down
1 change: 1 addition & 0 deletions src/source_inspector/symbols_ctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class CtagsSymbolScannerPlugin(ScanPlugin):
help="Collect source symbols using Universal ctags.",
help_group=SCAN_GROUP,
sort_order=100,
conflicting_options=["treesitter_symbol_and_string", "pygments_symbol_and_string"],
),
]

Expand Down
133 changes: 133 additions & 0 deletions src/source_inspector/symbols_pygments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/source-inspector for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging

import attr
from commoncode.cliutils import SCAN_GROUP
from commoncode.cliutils import PluggableCommandLineOption
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl
from pygments.lexers import get_lexer_for_filename
from pygments.token import Comment
from pygments.token import Literal
from pygments.token import Name
from pygments.token import Punctuation
from pygments.token import Whitespace
from pygments.util import ClassNotFound
from textcode import analysis
from typecode.contenttype import Type

"""
Extract strings and symbols from source code files with pygments.
"""
LOG = logging.getLogger(__name__)


@scan_impl
class PygmentsSymbolsAndStringScannerPlugin(ScanPlugin):
    """
    Scan a source file for symbols, strings and comments using Pygments.
    """

    # Per-resource attributes populated by this scanner: each defaults to an
    # empty list so unscanned resources stay well-formed.
    resource_attributes = {
        "source_symbols": attr.ib(default=attr.Factory(list), repr=False),
        "source_strings": attr.ib(default=attr.Factory(list), repr=False),
        "source_comments": attr.ib(default=attr.Factory(list), repr=False),
    }

    options = [
        PluggableCommandLineOption(
            ("--pygments-symbol-and-string",),
            is_flag=True,
            default=False,
            help="Collect source symbols, strings and comments using pygments.",
            help_group=SCAN_GROUP,
            sort_order=100,
            # Mutually exclusive with the other symbol/string scanners.
            conflicting_options=["source_symbol", "source_string", "treesitter_symbol_and_string"],
        ),
    ]

    def is_enabled(self, pygments_symbol_and_string, **kwargs):
        """Return the value of the --pygments-symbol-and-string flag."""
        return pygments_symbol_and_string

    def get_scanner(self, **kwargs):
        """Return the scanner callable to run on each resource."""
        return get_pygments_symbols


def get_pygments_symbols(location, **kwargs):
    """
    Return a mapping with ``source_symbols``, ``source_strings`` and
    ``source_comments`` lists collected from the source file at ``location``.
    """
    source_strings = []
    source_comments = []
    source_symbols = []

    # Pass with_comments=True explicitly: get_tokens() defaults to
    # with_comments=False, which would leave source_comments always empty
    # even though this plugin declares and advertises comment collection.
    for token in get_tokens(location=location, with_comments=True):
        token_type = token["token_type"]
        token_value = token["token_value"]

        if token_type == "string":
            source_strings.append(token_value)
        elif token_type == "comment":
            source_comments.append(token_value)
        elif token_type == "symbol":
            source_symbols.append(token_value)

    return dict(
        source_symbols=source_symbols,
        source_strings=source_strings,
        source_comments=source_comments,
    )


def get_tokens(location, with_literals=True, with_comments=False):
    """
    Yield mappings of position, token_type and token_value for the file at
    ``location``, tagging symbols and, optionally, literals (aka. strings)
    and comments. Yield nothing for files that are not parsable.
    """
    if not Type(location).is_source:
        return

    try:
        lexer = get_lexer_for_filename(location)
    except ClassNotFound:
        # No lexer is registered for this filename: nothing to yield.
        return

    text = analysis.unicode_text(location)

    # Exact token types treated as symbols (matched by equality, not subtype).
    symbol_types = (
        Name.Function,
        Name.Entity,
        Name.Constant,
        Name.Class,
        Name.Namespace,
        Name.Property,
    )
    # Structural noise skipped outright.
    skipped_types = (Punctuation, Whitespace)

    for position, token_type, raw_value in lexer.get_tokens_unprocessed(text):
        value = raw_value.strip()
        if not value or token_type in skipped_types:
            continue

        # Note: ``in Literal`` / ``in Comment`` use Pygments subtype
        # containment, so e.g. String and Number tokens match Literal.
        if with_literals and token_type in Literal:
            tag = "string"
        elif with_comments and token_type in Comment:
            tag = "comment"
        elif token_type in symbol_types:
            tag = "symbol"
        else:
            continue

        yield dict(position=position, token_type=tag, token_value=value)
Loading

0 comments on commit 0c36cb7

Please sign in to comment.