def _collect_language_keys(clone_directory):
    """Return the grammar names declared by every ``parser.c`` under a clone.

    A line of the form ``extern const TSLanguage *tree_sitter_<name>(`` yields
    ``<name>``. Names are returned in discovery order, duplicates included.
    """
    keys = []
    pattern = os.path.join(clone_directory, "**/parser.c")
    for parser_path in glob.glob(pattern, recursive=True):
        # Only the ASCII declaration line matters, so undecodable bytes
        # elsewhere in the file must not abort the build.
        with open(parser_path, 'r', encoding='utf-8', errors='replace') as parser:
            for line in parser:
                if not line.startswith("extern const TSLanguage *tree_sitter_"):
                    continue
                match = re.search(r"tree_sitter_(.+?)\(", line)
                # A declaration without "(" has no extractable name; skip it
                # instead of failing on ``None.group``.
                if match:
                    keys.append(match.group(1))
    return keys


def _load_tree_sitter_entries(clone_directory):
    """Return the ``tree-sitter`` entries of a clone's ``package.json``.

    Returns an empty list when the file or the ``tree-sitter`` key is missing.
    """
    package_json_path = os.path.join(clone_directory, 'package.json')
    if not os.path.isfile(package_json_path):
        return []
    with open(package_json_path, 'r', encoding='utf-8') as file:
        package_json = json.load(file)
    return package_json.get('tree-sitter', [])


def _match_entries(keys, entries):
    """Pair grammar keys with ``package.json`` entries.

    Returns a dict mapping each key that found an entry to that entry.
    With a single key, the last entry is used. With several keys, an entry
    goes to the first key that its ``scope`` ends with or that equals its
    ``path``; a scope match always beats a path-only match, and the first
    match of each kind is kept.
    """
    if not entries:
        return {}
    if len(keys) == 1:
        return {keys[0]: entries[-1]}

    by_scope = {}
    by_path = {}
    for entry in entries:
        # Entries are not guaranteed to carry a scope.
        scope = entry.get('scope', '')
        for key in keys:
            if scope.endswith(key):
                by_scope.setdefault(key, entry)
                break
            if entry.get('path') == key:
                by_path.setdefault(key, entry)
                break

    # A path-only match merely fills a gap: several entries may share one
    # grammar directory, and the one whose scope names the grammar is the
    # better description of it.
    for key, entry in by_path.items():
        by_scope.setdefault(key, entry)
    return by_scope


# Map every compiled grammar name to its package.json metadata ({} if none).
langs = {}
for _, _, clone_directory in repos:
    keys = _collect_language_keys(clone_directory)
    matched = _match_entries(keys, _load_tree_sitter_entries(clone_directory))
    for key in keys:
        if key in matched:
            langs[key] = matched[key]
        else:
            # Every discovered grammar gets a key, but an empty placeholder
            # never replaces metadata recorded earlier for the same name.
            langs.setdefault(key, {})

with open('tree_sitter_languages/generated.pyx', 'w', encoding='utf-8') as file:
    file.write('compiled_languages = ')
    pprint.pprint(langs, stream=file)
def _matching_language_keys(name):
    """Return the keys of ``compiled_languages`` whose file types match *name*.

    A file type matches when the final path component equals it exactly
    (e.g. ``Makefile``) or ends with ``"." + file_type`` (e.g. ``x.py``).
    Each language is listed at most once, in table order.
    """
    # Directory parts must not influence the match; accept both separators.
    basename = re.split(r'[\\/]', name)[-1]
    keys = []
    for key, entry in compiled_languages.items():
        for file_type in entry.get('file-types', ()):
            # Requiring the dot keeps "config.toml" from matching "ml" and
            # "schema.sql" from matching "ql".
            if basename == file_type or basename.endswith('.' + file_type):
                keys.append(key)
                break
    return keys


def get_language_by_filename(name, contents=None):
    """Return the language for a file, chosen by its name and optional contents.

    Args:
        name: File name or path. Matching is case-sensitive.
        contents: Optional file contents (``str`` or ``bytes``). When several
            languages match the name, the one whose ``content-regex`` yields
            the longest match in *contents* is chosen.

    Returns:
        The result of ``get_language`` for the selected key, or ``None`` when
        no language's file types match *name*. Without *contents*, or when no
        ``content-regex`` matches, the first matching language is used.
    """
    matching_keys = _matching_language_keys(name)
    if not matching_keys:
        return None

    best_key = matching_keys[0]
    if contents is not None:
        if isinstance(contents, bytes):
            # The patterns are str, so bytes contents must be decoded first;
            # undecodable bytes are replaced rather than raising.
            contents = contents.decode('utf-8', errors='replace')
        best_score = -1
        for key in matching_keys:
            pattern = compiled_languages[key].get('content-regex')
            if pattern is None:
                continue
            match = re.search(pattern, contents)
            if match is None:
                continue
            score = match.end() - match.start()
            if score > best_score:
                best_score = score
                best_key = key

    return get_language(best_key)
'hack': {'file-types': ['hack'], + 'first-line-regex': '^((<\\?hh.*)|(#!.+ hhvm))', + 'scope': 'source.hack'}, + 'haskell': {'file-types': ['hs'], + 'highlights': ['queries/highlights.scm'], + 'injection-regex': '^(hs|haskell)$', + 'scope': 'source.haskell'}, + 'hcl': {'file-types': ['hcl'], 'scope': 'source.hcl'}, + 'html': {'file-types': ['html'], + 'injection-regex': 'html', + 'scope': 'text.html.basic'}, + 'java': {'file-types': ['java'], 'scope': 'source.java'}, + 'javascript': {'file-types': ['js'], + 'highlights': ['queries/highlights-jsx.scm', + 'queries/highlights-params.scm', + 'queries/highlights.scm'], + 'injection-regex': '^(js|javascript)$', + 'scope': 'source.js'}, + 'jsdoc': {'injection-regex': 'jsdoc', 'scope': 'text.jsdoc'}, + 'json': {'file-types': ['json'], 'scope': 'source.json'}, + 'julia': {'file-types': ['jl'], 'scope': 'source.julia'}, + 'kotlin': {}, + 'lua': {'file-types': ['lua'], 'scope': 'source.lua'}, + 'make': {'file-types': ['makefile', + 'Makefile', + 'MAKEFILE', + 'GNUmakefile', + 'mk', + 'mak', + 'dsp'], + 'scope': 'source.mk'}, + 'markdown': {}, + 'objc': {'file-types': ['h', 'm'], + 'highlights': ['queries/highlights.scm', + 'node_modules/tree-sitter-c/queries/highlights.scm'], + 'scope': 'source.objc'}, + 'ocaml': {'file-types': ['ml'], + 'first-line-regex': '', + 'injection-regex': '^(ocaml|ml)$', + 'path': 'ocaml', + 'scope': 'source.ocaml'}, + 'perl': {'file-types': ['pl'], 'scope': 'source.perl'}, + 'php': {'file-types': ['php'], + 'highlights': 'queries/highlights.scm', + 'scope': 'source.php'}, + 'python': {'file-types': ['py'], 'scope': 'source.python'}, + 'ql': {'file-types': ['ql', 'qll'], 'scope': 'source.ql'}, + 'r': {'file-types': ['R', 'r'], + 'first-line-regex': '#!.*\\bRscript$', + 'scope': 'source.R'}, + 'regex': {'injection-regex': '^regex$', 'scope': 'source.regex'}, + 'rst': {'file-types': ['rst'], 'injection-regex': 'rst', 'scope': 'text.rst'}, + 'ruby': {'file-types': ['rb'], + 'injection-regex': 'ruby', + 
'scope': 'source.ruby'}, + 'rust': {'file-types': ['rs'], + 'injection-regex': 'rust', + 'scope': 'source.rust'}, + 'scala': {'file-types': ['scala'], 'scope': 'source.scala'}, + 'sql': {'file-types': ['sql'], 'scope': 'source.sql'}, + 'sqlite': {'file-types': ['sql'], + 'highlights': 'queries/highlights.scm', + 'injection-regex': '^(sql)$', + 'scope': 'source.sql'}, + 'toml': {'file-types': ['toml'], + 'highlights': ['queries/highlights.scm'], + 'injection-regex': '^toml$', + 'scope': 'source.toml'}, + 'tsq': {'file-types': ['tsq', 'scm'], 'scope': 'scope.tsq'}, + 'tsx': {'content-regex': '@flow', + 'file-types': ['js'], + 'highlights': ['queries/highlights.scm', + 'node_modules/tree-sitter-javascript/queries/highlights-jsx.scm', + 'node_modules/tree-sitter-javascript/queries/highlights.scm'], + 'injections': 'node_modules/tree-sitter-javascript/queries/injections.scm', + 'locals': 'node_modules/tree-sitter-javascript/queries/locals.scm', + 'path': 'tsx', + 'scope': 'source.js.flow', + 'tags': ['queries/tags.scm', + 'node_modules/tree-sitter-javascript/queries/tags.scm']}, + 'typescript': {'file-types': ['ts'], + 'highlights': ['queries/highlights.scm', + 'node_modules/tree-sitter-javascript/queries/highlights.scm'], + 'injection-regex': '^(ts|typescript)$', + 'injections': 'node_modules/tree-sitter-javascript/queries/injections.scm', + 'locals': ['queries/locals.scm', + 'node_modules/tree-sitter-javascript/queries/locals.scm'], + 'path': 'typescript', + 'scope': 'source.ts', + 'tags': ['queries/tags.scm', + 'node_modules/tree-sitter-javascript/queries/tags.scm']}, + 'yaml': {}}