From 17daf6ac0a1a7ef4a44078ef11cc150a8fa41ff0 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Fri, 21 Oct 2022 19:34:15 -0500 Subject: [PATCH] Change syntax for suffix file-types configurations (#4414) The change in d801a6693c3d475b3942f705d3ef48d7966bdf65 to search for suffixes in `file-types` is too permissive: files like the tutor or `*.txt` files are now mistakenly interpreted as R or perl, respectively. This change changes the syntax for specifying a file-types entry that matches by suffix: ```toml file-types = [{ suffix = ".git/config" }] ``` And changes the file-type detection to first search for any non-suffix patterns and then search for suffixes only with the file-types entries marked explicitly as suffixes. --- book/src/languages.md | 28 ++++++++++- helix-core/src/syntax.rs | 102 ++++++++++++++++++++++++++++++++++----- languages.toml | 5 +- 3 files changed, 119 insertions(+), 16 deletions(-) diff --git a/book/src/languages.md b/book/src/languages.md index 9b90a2112b52..133e6447941c 100644 --- a/book/src/languages.md +++ b/book/src/languages.md @@ -50,7 +50,7 @@ These configuration keys are available: | `name` | The name of the language | | `scope` | A string like `source.js` that identifies the language. Currently, we strive to match the scope names used by popular TextMate grammars and by the Linguist library. Usually `source.` or `text.` in case of markup languages | | `injection-regex` | regex pattern that will be tested against a language name in order to determine whether this language should be used for a potential [language injection][treesitter-language-injection] site. | -| `file-types` | The filetypes of the language, for example `["yml", "yaml"]`. This attempts to match by exact file name (`.zshrc`), then by file extension (`toml`), then by path suffix (`.git/config`). | +| `file-types` | The filetypes of the language, for example `["yml", "yaml"]`. See the file-type detection section below. | | `shebangs` | The interpreters from the shebang line, for example `["sh", "bash"]` | | `roots` | A set of marker files to look for when trying to find the workspace root. For example `Cargo.lock`, `yarn.lock` | | `auto-format` | Whether to autoformat this language when saving | @@ -63,6 +63,32 @@ These configuration keys are available: | `formatter` | The formatter for the language, it will take precedence over the lsp when defined. The formatter must be able to take the original file as input from stdin and write the formatted file to stdout | | `max-line-length` | Maximum line length. Used for the `:reflow` command | +### File-type detection and the `file-types` key + +Helix determines which language configuration to use with the `file-types` key +from the above section. `file-types` is a list of strings or tables, for +example: + +```toml +file-types = ["Makefile", "toml", { suffix = ".git/config" }] +``` + +When determining a language configuration to use, Helix searches the file-types +with the following priorities: + +1. Exact match: if the filename of a file is an exact match of a string in a + `file-types` list, that language wins. In the example above, `"Makefile"` + will match against `Makefile` files. +2. Extension: if there are no exact matches, any `file-types` string that + matches the file extension of a given file wins. In the example above, the + `"toml"` matches files like `Cargo.toml` or `languages.toml`. +3. Suffix: if there are still no matches, any values in `suffix` tables + are checked against the full path of the given file. In the example above, + the `{ suffix = ".git/config" }` would match against any `config` files + in `.git` directories. Note: `/` is used as the directory separator but is + replaced at runtime with the appropriate path separator for the operating + system, so this rule would match against `.git\config` files on Windows. + ### Language Server configuration The `language-server` field takes the following keys: diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index 21d19ce78635..c17655a90bbd 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -73,11 +73,11 @@ impl Default for Configuration { pub struct LanguageConfiguration { #[serde(rename = "name")] pub language_id: String, // c-sharp, rust - pub scope: String, // source.rust - pub file_types: Vec, // filename ends_with? + pub scope: String, // source.rust + pub file_types: Vec, // filename extension or ends_with? #[serde(default)] pub shebangs: Vec, // interpreter(s) associated with language - pub roots: Vec, // these indicate project roots <.git, Cargo.toml> + pub roots: Vec, // these indicate project roots <.git, Cargo.toml> pub comment_token: Option, pub max_line_length: Option, @@ -125,6 +125,78 @@ pub struct LanguageConfiguration { pub rulers: Option>, // if set, override editor's rulers } +#[derive(Debug, PartialEq, Eq, Hash)] +pub enum FileType { + /// The extension of the file, either the `Path::extension` or the full + /// filename if the file does not have an extension. + Extension(String), + /// The suffix of a file. This is compared to a given file's absolute + /// path, so it can be used to detect files based on their directories. + Suffix(String), +} + +impl Serialize for FileType { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeMap; + + match self { + FileType::Extension(extension) => serializer.serialize_str(extension), + FileType::Suffix(suffix) => { + let mut map = serializer.serialize_map(Some(1))?; + map.serialize_entry("suffix", &suffix.replace(std::path::MAIN_SEPARATOR, "/"))?; + map.end() + } + } + } +} + +impl<'de> Deserialize<'de> for FileType { + fn deserialize(deserializer: D) -> Result + where + D: serde::de::Deserializer<'de>, + { + struct FileTypeVisitor; + + impl<'de> serde::de::Visitor<'de> for FileTypeVisitor { + type Value = FileType; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("string or table") + } + + fn visit_str(self, value: &str) -> Result + where + E: serde::de::Error, + { + Ok(FileType::Extension(value.to_string())) + } + + fn visit_map(self, mut map: M) -> Result + where + M: serde::de::MapAccess<'de>, + { + match map.next_entry::()? { + Some((key, suffix)) if key == "suffix" => Ok(FileType::Suffix( + suffix.replace('/', &std::path::MAIN_SEPARATOR.to_string()), + )), + Some((key, _value)) => Err(serde::de::Error::custom(format!( + "unknown key in `file-types` list: {}", + key + ))), + None => Err(serde::de::Error::custom( + "expected a `suffix` key in the `file-types` entry", + )), + } + } + } + + deserializer.deserialize_any(FileTypeVisitor) + } +} + #[derive(Debug, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] pub struct LanguageServerConfiguration { @@ -454,7 +526,8 @@ impl LanguageConfiguration { pub struct Loader { // highlight_names ? language_configs: Vec>, - language_config_ids_by_file_type: HashMap, // Vec + language_config_ids_by_extension: HashMap, // Vec + language_config_ids_by_suffix: HashMap, language_config_ids_by_shebang: HashMap, scopes: ArcSwap>, @@ -464,7 +537,8 @@ impl Loader { pub fn new(config: Configuration) -> Self { let mut loader = Self { language_configs: Vec::new(), - language_config_ids_by_file_type: HashMap::new(), + language_config_ids_by_extension: HashMap::new(), + language_config_ids_by_suffix: HashMap::new(), language_config_ids_by_shebang: HashMap::new(), scopes: ArcSwap::from_pointee(Vec::new()), }; @@ -475,10 +549,14 @@ impl Loader { for file_type in &config.file_types { // entry().or_insert(Vec::new).push(language_id); - let file_type = file_type.replace('/', &std::path::MAIN_SEPARATOR.to_string()); - loader - .language_config_ids_by_file_type - .insert(file_type, language_id); + match file_type { + FileType::Extension(extension) => loader + .language_config_ids_by_extension + .insert(extension.clone(), language_id), + FileType::Suffix(suffix) => loader + .language_config_ids_by_suffix + .insert(suffix.clone(), language_id), + }; } for shebang in &config.shebangs { loader @@ -498,14 +576,14 @@ impl Loader { let configuration_id = path .file_name() .and_then(|n| n.to_str()) - .and_then(|file_name| self.language_config_ids_by_file_type.get(file_name)) + .and_then(|file_name| self.language_config_ids_by_extension.get(file_name)) .or_else(|| { path.extension() .and_then(|extension| extension.to_str()) - .and_then(|extension| self.language_config_ids_by_file_type.get(extension)) + .and_then(|extension| self.language_config_ids_by_extension.get(extension)) }) .or_else(|| { - self.language_config_ids_by_file_type + self.language_config_ids_by_suffix .iter() .find_map(|(file_type, id)| { if path.to_str()?.ends_with(file_type) { diff --git a/languages.toml b/languages.toml index 5ad5c6e65849..a639ccadf281 100644 --- a/languages.toml +++ b/languages.toml @@ -1053,8 +1053,7 @@ source = { git = "https://github.com/tree-sitter/tree-sitter-regex", rev = "e1cf name = "git-config" scope = "source.gitconfig" roots = [] -# TODO: allow specifying file-types as a regex so we can read directory names (e.g. `.git/config`) -file-types = [".gitmodules", ".gitconfig", ".git/config", ".config/git/config"] +file-types = [".gitmodules", ".gitconfig", { suffix = ".git/config" }, { suffix = ".config/git/config" }] injection-regex = "git-config" comment-token = "#" indent = { tab-width = 4, unit = "\t" } @@ -1491,7 +1490,7 @@ source = { git = "https://github.com/bearcove/tree-sitter-meson", rev = "feea83b [[language]] name = "sshclientconfig" scope = "source.sshclientconfig" -file-types = [".ssh/config", "/etc/ssh/ssh_config"] +file-types = [{ suffix = ".ssh/config" }, { suffix = "/etc/ssh/ssh_config" }] roots = [] [[grammar]]