Skip to content

Commit aa7d922

Browse files
Shebang and Modeline Detection (#47)
* Shebang & Modeline Detection
1 parent d0c7979 commit aa7d922

File tree

5 files changed

+345
-22
lines changed

5 files changed

+345
-22
lines changed

Package.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
// swift-tools-version: 5.6
1+
// swift-tools-version: 5.7
22
// The swift-tools-version declares the minimum version of Swift required to build this package.
33

44
import PackageDescription
55

66
let package = Package(
77
name: "CodeEditLanguages",
8-
platforms: [.macOS(.v12)],
8+
platforms: [.macOS(.v13)],
99
products: [
1010
.library(
1111
name: "CodeEditLanguages",

Sources/CodeEditLanguages/CodeLanguage+Definitions.swift

+4-2
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ public extension CodeLanguage {
149149
id: .javascript,
150150
tsName: "javascript",
151151
extensions: ["js"],
152-
highlights: ["injections"]
152+
highlights: ["injections"],
153+
additionalIdentifiers: ["node", "deno"]
153154
)
154155

155156
/// A language structure for `JSDoc`
@@ -238,7 +239,8 @@ public extension CodeLanguage {
238239
static let python: CodeLanguage = .init(
239240
id: .python,
240241
tsName: "python",
241-
extensions: ["py"]
242+
extensions: ["py"],
243+
additionalIdentifiers: ["python2", "python3"]
242244
)
243245

244246
/// A language structure for `Regex`
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
//
2+
// CodeLanguage+DetectLanguage.swift
3+
// CodeEditLanguages
4+
//
5+
// Created by Khan Winter on 6/17/23.
6+
//
7+
8+
import Foundation
9+
import RegexBuilder
10+
11+
public extension CodeLanguage {
12+
13+
/// Detects the language of a document from its file URL and, optionally, its contents.
///
/// Detection strategies are tried in order and the first match wins:
/// 1. The file extension, or the file name for files without an extension.
/// 2. A shebang on the first line of `prefixBuffer`.
/// 3. An editor modeline in `prefixBuffer` or `suffixBuffer`.
///
/// Both buffers are lowercased before they are inspected.
/// - Parameters:
///   - url: The URL to get the language for.
///   - prefixBuffer: The first few lines of the document.
///   - suffixBuffer: The last few lines of the document.
/// - Returns: The detected language, or `.default` when no strategy produced a match.
static func detectLanguageFrom(url: URL, prefixBuffer: String? = nil, suffixBuffer: String? = nil) -> CodeLanguage {
    if let urlLanguage = detectLanguageUsingURL(url: url) {
        return urlLanguage
    }

    let loweredPrefix = prefixBuffer?.lowercased()

    if let loweredPrefix, let shebangLanguage = detectLanguageUsingShebang(contents: loweredPrefix) {
        return shebangLanguage
    }

    // Modelines are searched in both buffers, so run this step even when only `suffixBuffer` was supplied.
    // An empty prefix is ignored by the modeline detector.
    if let modelineLanguage = detecLanguageUsingModeline(
        prefixBuffer: loweredPrefix ?? "",
        suffixBuffer: suffixBuffer?.lowercased()
    ) {
        return modelineLanguage
    }

    return .default
}
37+
38+
/// Looks up a language by the URL's path extension, falling back to the file name.
///
/// The extension is compared in lowercase. When the URL has no extension, the last path component is used
/// as-is so that case-sensitive names such as `Dockerfile` or `Makefile` can match.
/// - Parameter url: The URL of the file.
/// - Returns: The first language whose `extensions` contain the lookup key, if any.
private static func detectLanguageUsingURL(url: URL) -> CodeLanguage? {
    let lowercasedExtension = url.pathExtension.lowercased()

    let lookupKey: String
    if lowercasedExtension.isEmpty {
        // Special file types without an extension are matched by their exact (case-preserved) name.
        lookupKey = url.pathComponents.last ?? ""
    } else {
        lookupKey = lowercasedExtension
    }

    return allLanguages.first { $0.extensions.contains(lookupKey) }
}
52+
53+
/// Detects code languages from the shebang of a file.
///
/// Eg: `#!/usr/bin/env/python2.6` will detect the `python` code language.
/// Or, `#! /usr/bin/env perl` will detect the `perl` code language.
///
/// Only the first non-empty line of `contents` is inspected. When the interpreter is `env`, any leading
/// options (`-x`, `--long`) and variable assignments (`name=value`) are skipped and the next word is used.
/// - Parameter contents: The contents of the first few lines of the file.
/// - Returns: The detected code language, if any.
private static func detectLanguageUsingShebang(contents: String) -> CodeLanguage? {
    // Split on any newline character rather than "\n" alone: "\r\n" is a single `Character` in Swift, so
    // a CRLF document would otherwise never be split and the whole buffer would be treated as one line.
    var contents = String(contents.split(whereSeparator: \.isNewline).first ?? "")
    // Make sure:
    // - First line is a shebang
    // - There are contents after the shebang
    // - There is a valid script component (eg: "swift" in "/usr/env/swift")
    guard
        contents.starts(with: "#!"),
        contents.trimmingCharacters(in: .whitespacesAndNewlines) != "#!",
        let result = contents
            .split(separator: "/", omittingEmptySubsequences: true)
            .last?
            .firstMatch(of: Regex { OneOrMore(.word) })
    else {
        return nil
    }

    var script = result.output.trimmingCharacters(in: .whitespacesAndNewlines)

    // If script is "env" walk the string until we find a valid-looking script component
    if script == "env" {
        // If env is the end of the string, return
        guard result.endIndex != contents.endIndex else { return nil }

        let argumentRegex = Regex {
            ZeroOrMore(.whitespace)
            ChoiceOf {
                // Alternatives are tried in order, so the longer prefix must come first to ever match.
                One("--")
                One("-")
            }
            ZeroOrMore(.word)
            ZeroOrMore(.whitespace)
        }
        let parameterRegex = Regex {
            OneOrMore(.word)
            One("=")
            OneOrMore(.word)
        }

        // Skip over any optional arguments or parameters (eg: -x or x=y) and make script the next valid string
        // https://www.gnu.org/software/coreutils/manual/html_node/env-invocation.html
        // Skip first shebang-path string
        contents.trimPrefix(Regex {
            OneOrMore("#!")
            ZeroOrMore(.whitespace)
            OneOrMore(.any, .reluctant)
            OneOrMore(.whitespace)
        })
        while !contents.isEmpty {
            if contents.prefixMatch(of: argumentRegex) != nil {
                contents.trimPrefix(argumentRegex)
            } else if contents.prefixMatch(of: parameterRegex) != nil {
                contents.trimPrefix(parameterRegex)
            } else {
                break
            }
        }
        guard let newScript = contents.firstMatch(of: Regex { OneOrMore(.word) })?.output else {
            return nil
        }
        script = String(newScript)
    }

    return languageFromIdentifier(script)
}
123+
124+
/// Detects modelines in either the beginning or end of a file.
///
/// Examples of valid modelines:
/// ```
/// # vim: set ft=js ts=4 sw=4 et:
/// # vim: ts=4:sw=4:et:ft=js
/// // vim: ft=js
/// -*- mode: js; indent-tabs-mode: nil; tab-width: 4 -*-
/// code: language=javascript insertSpaces=true tabSize=4
/// ```
/// All of the above would resolve to `javascript`
///
/// The prefix buffer is searched first; the suffix buffer is only searched when the prefix yields nothing.
/// Within a buffer, emacs modelines take precedence over vim modelines, which take precedence over
/// VS Code style (`code:`) modelines.
///
/// - Parameters:
///   - prefixBuffer: The first few lines of a document.
///   - suffixBuffer: The last few lines of a document.
/// - Returns: The detected code language, if any.
private static func detecLanguageUsingModeline(prefixBuffer: String, suffixBuffer: String?) -> CodeLanguage? {
    func detectModeline(in string: String) -> CodeLanguage? {
        guard !string.isEmpty else { return nil }

        // Regex for detecting emacs modelines.
        let emacsLineRegex = Regex {
            "-*-"
            Capture {
                #/.*/#
            }
            "-*-"
        }

        // Regex to find language parameters in a emacs modeline.
        let emacsLanguageRegex = Regex {
            "mode:"
            ZeroOrMore(.whitespace)
            Capture {
                OneOrMore(.word)
            }
        }

        // Regex for detecting vim modelines. The modeline must follow a comment marker; `#` is accepted
        // alongside `//` and `/*` so that the documented `# vim: ...` form is recognized.
        let vimLineRegex = Regex {
            ChoiceOf {
                One("//")
                One("/*")
                One("#")
            }
            OneOrMore(.whitespace)
            #/vim:.*/#
            Optionally(.newlineSequence)
        }

        // Regex to find language parameters in a vim modeline.
        let vimLanguageRegex = Regex {
            "ft="
            Capture {
                OneOrMore(.word)
            }
        }

        // Regex for detecting VS Code style modelines (eg: `code: language=javascript tabSize=4`).
        let codeLineRegex = #/code:.*/#

        // Regex to find the language parameter in a VS Code style modeline.
        let codeLanguageRegex = Regex {
            "language="
            Capture {
                OneOrMore(.word)
            }
        }

        if let emacsLine = string.firstMatch(of: emacsLineRegex)?.1,
           let emacsLanguage = emacsLine.firstMatch(of: emacsLanguageRegex)?.1 {
            return languageFromIdentifier(String(emacsLanguage))
        } else if let vimLine = string.firstMatch(of: vimLineRegex)?.0,
                  let vimLanguage = vimLine.firstMatch(of: vimLanguageRegex)?.1 {
            return languageFromIdentifier(String(vimLanguage))
        } else if let codeLine = string.firstMatch(of: codeLineRegex)?.0,
                  let codeLanguage = codeLine.firstMatch(of: codeLanguageRegex)?.1 {
            return languageFromIdentifier(String(codeLanguage))
        } else {
            return nil
        }
    }

    return detectModeline(in: prefixBuffer) ?? detectModeline(in: suffixBuffer ?? "")
}
193+
194+
/// Resolves a parsed identifier (eg: from a shebang or modeline) to a known language.
///
/// A language matches when the identifier equals its `tsName`, or is contained in either its
/// `extensions` or its `additionalIdentifiers`. The first matching entry of `allLanguages` is returned.
/// - Parameter identifier: The identifier to use.
/// - Returns: The found code language, if any.
private static func languageFromIdentifier(_ identifier: String) -> CodeLanguage? {
    for candidate in allLanguages {
        if candidate.tsName == identifier {
            return candidate
        }
        if candidate.extensions.contains(identifier) {
            return candidate
        }
        if candidate.additionalIdentifiers.contains(identifier) {
            return candidate
        }
    }
    return nil
}
204+
}

Sources/CodeEditLanguages/CodeLanguage.swift

+10-18
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import Foundation
99
import tree_sitter
1010
import SwiftTreeSitter
1111
import CodeLanguages_Container
12+
import RegexBuilder
1213

1314
/// A structure holding metadata for code languages
1415
public struct CodeLanguage {
@@ -17,13 +18,15 @@ public struct CodeLanguage {
1718
tsName: String,
1819
extensions: Set<String>,
1920
parentURL: URL? = nil,
20-
highlights: Set<String>? = nil
21+
highlights: Set<String>? = nil,
22+
additionalIdentifiers: Set<String> = []
2123
) {
2224
self.id = id
2325
self.tsName = tsName
2426
self.extensions = extensions
2527
self.parentQueryURL = parentURL
2628
self.additionalHighlights = highlights
29+
self.additionalIdentifiers = additionalIdentifiers
2730
}
2831

2932
/// The ID of the language
@@ -52,6 +55,9 @@ public struct CodeLanguage {
5255
/// The bundle's resource URL
5356
internal var resourceURL: URL? = Bundle.module.resourceURL
5457

58+
/// A set of additional identifiers to use for things like shebang matching.
59+
public let additionalIdentifiers: Set<String>
60+
5561
/// The tree-sitter language for the language if available
5662
public var language: Language? {
5763
guard let tsLanguage = tsLanguage else { return nil }
@@ -144,22 +150,8 @@ public struct CodeLanguage {
144150
}
145151
}
146152

147-
public extension CodeLanguage {
148-
149-
/// Gets the corresponding language for the given file URL
150-
///
151-
/// Uses the `pathExtension` URL component to detect the language
152-
/// - Parameter url: The URL to get the language for.
153-
/// - Returns: A language structure
154-
static func detectLanguageFrom(url: URL) -> CodeLanguage {
155-
let fileExtension = url.pathExtension.lowercased()
156-
let fileName = url.pathComponents.last // should not be lowercase since it has to match e.g. `Dockerfile`
157-
// This is to handle special file types without an extension (e.g., Makefile, Dockerfile)
158-
let fileNameOrExtension = fileExtension.isEmpty ? (fileName != nil ? fileName! : "") : fileExtension
159-
if let lang = allLanguages.first(where: { lang in lang.extensions.contains(fileNameOrExtension)}) {
160-
return lang
161-
} else {
162-
return .default
163-
}
153+
// MARK: - Hashable

extension CodeLanguage: Hashable {
    /// Hashes the language using only its `id`; no other property is fed into the hasher.
    /// - Parameter hasher: The hasher to feed the `id` into.
    public func hash(into hasher: inout Hasher) {
        id.hash(into: &hasher)
    }
}

0 commit comments

Comments
 (0)