|
| 1 | +// |
| 2 | +// CodeLanguage+DetectLanguage.swift |
| 3 | +// CodeEditLanguages |
| 4 | +// |
| 5 | +// Created by Khan Winter on 6/17/23. |
| 6 | +// |
| 7 | + |
| 8 | +import Foundation |
| 9 | +import RegexBuilder |
| 10 | + |
public extension CodeLanguage {

    /// Gets the corresponding language for the given file URL.
    ///
    /// Detection is attempted in the following order, and the first match wins:
    /// 1. The URL's path extension, or the file name when the URL has no extension.
    /// 2. A shebang on the first line of `prefixBuffer`.
    /// 3. An emacs or vim modeline in `prefixBuffer`, then in `suffixBuffer`.
    ///
    /// - Parameters:
    ///   - url: The URL to get the language for.
    ///   - prefixBuffer: The first few lines of the document.
    ///   - suffixBuffer: The last few lines of the document.
    /// - Returns: The detected language, or `.default` when nothing matches.
    static func detectLanguageFrom(url: URL, prefixBuffer: String? = nil, suffixBuffer: String? = nil) -> CodeLanguage {
        if let urlLanguage = detectLanguageUsingURL(url: url) {
            return urlLanguage
        }

        // The shebang and modeline parsers match lowercase keywords (`env`, `mode:`, `vim:`, `ft=`),
        // so both buffers are lowercased once up front.
        let prefix = prefixBuffer?.lowercased()
        let suffix = suffixBuffer?.lowercased()

        if let prefix, let shebangLanguage = detectLanguageUsingShebang(contents: prefix) {
            return shebangLanguage
        }
        // A modeline may live only at the end of a file, so the suffix is checked even without a prefix.
        if let modelineLanguage = detectLanguageUsingModeline(prefixBuffer: prefix, suffixBuffer: suffix) {
            return modelineLanguage
        }
        return .default
    }

    /// Detects a file's language using the file url.
    /// - Parameter url: The URL of the file.
    /// - Returns: The detected code language, if any.
    private static func detectLanguageUsingURL(url: URL) -> CodeLanguage? {
        let fileExtension = url.pathExtension.lowercased()
        // Files without an extension (e.g. `Makefile`, `Dockerfile`) are looked up by their full name.
        // The name is intentionally not lowercased so it can match such entries exactly.
        let fileNameOrExtension = fileExtension.isEmpty ? (url.pathComponents.last ?? "") : fileExtension
        return allLanguages.first { $0.extensions.contains(fileNameOrExtension) }
    }

    /// Detects code languages from the shebang of a file.
    /// Eg: `#!/usr/bin/env/python2.6` will detect the `python` code language.
    /// Or, `#! /usr/bin/env perl` will detect the `perl` code language.
    /// - Parameter contents: The contents of the first few lines of the file.
    /// - Returns: The detected code language, if any.
    private static func detectLanguageUsingShebang(contents: String) -> CodeLanguage? {
        // Split on `Character.isNewline` rather than "\n": Swift treats "\r\n" as a single character, so a
        // plain "\n" separator would leave a CRLF buffer unsplit.
        var shebangLine = String(
            contents.split(maxSplits: 1, whereSeparator: \.isNewline).first ?? ""
        )

        // Make sure:
        // - First line is a shebang
        // - There are contents after the shebang
        // - There is a valid script component (eg: "swift" in "/usr/env/swift")
        guard
            shebangLine.starts(with: "#!"),
            shebangLine.trimmingCharacters(in: .whitespacesAndNewlines) != "#!",
            let result = shebangLine
                .split(separator: "/", omittingEmptySubsequences: true)
                .last?
                .firstMatch(of: Regex { OneOrMore(.word) })
        else {
            return nil
        }

        var script = result.output.trimmingCharacters(in: .whitespacesAndNewlines)

        // If script is "env" walk the string until we find a valid-looking script component
        if script == "env" {
            // If env is the end of the string, there is no script to find
            guard result.range.upperBound != shebangLine.endIndex else { return nil }

            // An option such as `-x` or `--long`. The longer prefix is listed first so it is tried first.
            let argumentRegex = Regex {
                ZeroOrMore(.whitespace)
                ChoiceOf {
                    One("--")
                    One("-")
                }
                ZeroOrMore(.word)
                ZeroOrMore(.whitespace)
            }
            // A variable assignment such as `x=y`. Leading whitespace is allowed so that several
            // assignments in a row (`a=b c=d`) are all skipped.
            let parameterRegex = Regex {
                ZeroOrMore(.whitespace)
                OneOrMore(.word)
                One("=")
                OneOrMore(.word)
            }

            // Skip over any optional arguments or parameters (eg: -x or x=y) and make script the next valid string
            // https://www.gnu.org/software/coreutils/manual/html_node/env-invocation.html
            // Skip first shebang-path string
            shebangLine.trimPrefix(Regex {
                OneOrMore("#!")
                ZeroOrMore(.whitespace)
                OneOrMore(.any, .reluctant)
                OneOrMore(.whitespace)
            })
            // Every successful match consumes at least one character, so this loop always terminates.
            while !shebangLine.isEmpty {
                if shebangLine.prefixMatch(of: argumentRegex) != nil {
                    shebangLine.trimPrefix(argumentRegex)
                } else if shebangLine.prefixMatch(of: parameterRegex) != nil {
                    shebangLine.trimPrefix(parameterRegex)
                } else {
                    break
                }
            }
            guard let newScript = shebangLine.firstMatch(of: Regex { OneOrMore(.word) })?.output else {
                return nil
            }
            script = String(newScript)
        }

        return languageFromIdentifier(script)
    }

    /// Detects modelines in either the beginning or end of a file.
    ///
    /// The keywords are matched in lowercase, so callers should pass lowercased buffers.
    /// Examples of recognized modelines:
    /// ```
    /// -*- mode: js; indent-tabs-mode: nil; tab-width: 4 -*-
    /// // vim: set ft=js ts=4 sw=4 et:
    /// /* vim: ts=4:sw=4:et:ft=js */
    /// # vim: set ft=js ts=4 sw=4 et:
    /// ```
    /// Each of the above yields the identifier `js`, which is then resolved by `languageFromIdentifier(_:)`.
    /// Lines of the form `code: language=javascript` are not recognized.
    ///
    /// Within a buffer an emacs modeline takes precedence over a vim modeline, and the prefix buffer is
    /// searched before the suffix buffer.
    ///
    /// - Parameters:
    ///   - prefixBuffer: The first few lines of a document.
    ///   - suffixBuffer: The last few lines of a document.
    /// - Returns: The detected code language, if any.
    private static func detectLanguageUsingModeline(prefixBuffer: String?, suffixBuffer: String?) -> CodeLanguage? {
        // The regexes are built once here and shared by both buffer searches.

        // Regex for detecting emacs modelines.
        let emacsLineRegex = Regex {
            "-*-"
            Capture {
                #/.*/#
            }
            "-*-"
        }

        // Regex to find language parameters in a emacs modeline.
        let emacsLanguageRegex = Regex {
            "mode:"
            ZeroOrMore(.whitespace)
            Capture {
                OneOrMore(.word)
            }
        }

        // Regex for detecting vim modelines that follow a `//`, `/*` or `#` comment marker.
        let vimLineRegex = Regex {
            ChoiceOf {
                One("//")
                One("/*")
                One("#")
            }
            OneOrMore(.whitespace)
            #/vim:.*/#
            Optionally(.newlineSequence)
        }

        // Regex to find language parameters in a vim modeline.
        let vimLanguageRegex = Regex {
            "ft="
            Capture {
                OneOrMore(.word)
            }
        }

        /// Searches a single buffer for an emacs modeline first, then a vim modeline.
        func detectModeline(in string: String?) -> CodeLanguage? {
            guard let string, !string.isEmpty else { return nil }

            if let emacsLine = string.firstMatch(of: emacsLineRegex)?.output.1,
               let emacsLanguage = emacsLine.firstMatch(of: emacsLanguageRegex)?.output.1 {
                return languageFromIdentifier(String(emacsLanguage))
            }
            if let vimLine = string.firstMatch(of: vimLineRegex)?.output,
               let vimLanguage = vimLine.firstMatch(of: vimLanguageRegex)?.output.1 {
                return languageFromIdentifier(String(vimLanguage))
            }
            return nil
        }

        return detectModeline(in: prefixBuffer) ?? detectModeline(in: suffixBuffer)
    }

    /// Finds a language to match a parsed identifier.
    ///
    /// A language matches when the identifier equals its `tsName`, or is contained in its `extensions` or
    /// `additionalIdentifiers`.
    /// - Parameter identifier: The identifier to use.
    /// - Returns: The found code language, if any.
    private static func languageFromIdentifier(_ identifier: String) -> CodeLanguage? {
        return allLanguages.first {
            $0.tsName == identifier
            || $0.extensions.contains(identifier)
            || $0.additionalIdentifiers.contains(identifier)
        }
    }
}
0 commit comments