From 9d376417a0dfef81b0118146f1254acb784ae7fd Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 18 Mar 2023 01:20:01 -0500 Subject: [PATCH 1/2] test: Baseline for generic ignore --- .../tests/cmd/extend-ignore-re.in/_typos.toml | 5 +++++ .../tests/cmd/extend-ignore-re.in/file.ignore | 1 + .../typos-cli/tests/cmd/extend-ignore-re.toml | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+) create mode 100644 crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml create mode 100644 crates/typos-cli/tests/cmd/extend-ignore-re.in/file.ignore create mode 100644 crates/typos-cli/tests/cmd/extend-ignore-re.toml diff --git a/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml b/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml new file mode 100644 index 000000000..de98037fa --- /dev/null +++ b/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml @@ -0,0 +1,5 @@ +[files] +extend-exclude = ["_typos.toml"] + +[default.extend-identifiers] +hello = "goodbye" diff --git a/crates/typos-cli/tests/cmd/extend-ignore-re.in/file.ignore b/crates/typos-cli/tests/cmd/extend-ignore-re.in/file.ignore new file mode 100644 index 000000000..b4cfe094c --- /dev/null +++ b/crates/typos-cli/tests/cmd/extend-ignore-re.in/file.ignore @@ -0,0 +1 @@ +hello `hello` diff --git a/crates/typos-cli/tests/cmd/extend-ignore-re.toml b/crates/typos-cli/tests/cmd/extend-ignore-re.toml new file mode 100644 index 000000000..f87f0830e --- /dev/null +++ b/crates/typos-cli/tests/cmd/extend-ignore-re.toml @@ -0,0 +1,18 @@ +bin.name = "typos" +stdin = "" +stdout = """ +error: `hello` should be `goodbye` + --> ./file.ignore:1:1 + | +1 | hello `hello` + | ^^^^^ + | +error: `hello` should be `goodbye` + --> ./file.ignore:1:8 + | +1 | hello `hello` + | ^^^^^ + | +""" +stderr = "" +status.code = 2 From ac46a6ba54f9e348e521afe2fb423da60e8474d3 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 18 Mar 2023 01:25:39 -0500 Subject: [PATCH 2/2] feat(config): Custom ignores Typos primarily works off 
of identifiers and words. We have built-in support to detect constructs that span identifiers that should not be spell checked, like UUIDs, emails, domains, etc. This opens it up for user-defined identifier-spanning constructs using regexes via `extend-ignore-re`. This works differently than any of the previous ways of ignoring things because the regexes require extra parse passes. Under the assumption that (1) actual typos are rare and (2) files relying on `extend-ignore-re` are rare, we only do these extra parse passes when a typo is found, causing almost no performance hit in the expected case. While this could be used for more generic types of ignores, it isn't the most maintainable because it is separate from the source files in question. Ideally, we'd implement document settings / directives for these cases (#316). --- crates/typos-cli/src/bin/typos-cli/args.rs | 1 + crates/typos-cli/src/config.rs | 28 ++++++++- crates/typos-cli/src/file.rs | 62 +++++++++++++++++++ crates/typos-cli/src/policy.rs | 27 ++++++-- .../tests/cmd/extend-ignore-re.in/_typos.toml | 3 + .../typos-cli/tests/cmd/extend-ignore-re.toml | 6 -- crates/typos/src/check.rs | 6 ++ crates/typos/src/tokens.rs | 14 +++++ docs/reference.md | 1 + 9 files changed, 136 insertions(+), 12 deletions(-) diff --git a/crates/typos-cli/src/bin/typos-cli/args.rs b/crates/typos-cli/src/bin/typos-cli/args.rs index 826148629..124bf91c1 100644 --- a/crates/typos-cli/src/bin/typos-cli/args.rs +++ b/crates/typos-cli/src/bin/typos-cli/args.rs @@ -156,6 +156,7 @@ impl FileArgs { locale: self.locale, ..Default::default() }), + extend_ignore_re: Default::default(), } } diff --git a/crates/typos-cli/src/config.rs b/crates/typos-cli/src/config.rs index 4e366b0c7..a2a7d1628 100644 --- a/crates/typos-cli/src/config.rs +++ b/crates/typos-cli/src/config.rs @@ -268,7 +268,7 @@ impl GlobEngineConfig { } } -#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[derive(Debug, Clone,
Default, serde::Serialize, serde::Deserialize)] //#[serde(deny_unknown_fields)] // Doesn't work with `flatten` #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -283,6 +283,8 @@ pub struct EngineConfig { pub tokenizer: Option, #[serde(flatten)] pub dict: Option, + #[serde(with = "serde_regex")] + pub extend_ignore_re: Vec, } impl EngineConfig { @@ -298,6 +300,7 @@ impl EngineConfig { .unwrap_or_else(TokenizerConfig::from_defaults), ), dict: Some(empty.dict.unwrap_or_else(DictConfig::from_defaults)), + extend_ignore_re: Default::default(), } } @@ -327,6 +330,8 @@ impl EngineConfig { let mut dict = Some(dict); std::mem::swap(&mut dict, &mut self.dict); } + self.extend_ignore_re + .extend(source.extend_ignore_re.iter().cloned()); } pub fn binary(&self) -> bool { @@ -340,8 +345,29 @@ impl EngineConfig { pub fn check_file(&self) -> bool { self.check_file.unwrap_or(true) } + + pub fn extend_ignore_re(&self) -> Box + '_> { + Box::new(self.extend_ignore_re.iter()) + } } +impl PartialEq for EngineConfig { + fn eq(&self, rhs: &Self) -> bool { + self.binary == rhs.binary + && self.check_filename == rhs.check_filename + && self.check_file == rhs.check_file + && self.tokenizer == rhs.tokenizer + && self.dict == rhs.dict + && self + .extend_ignore_re + .iter() + .map(|r| r.as_str()) + .eq(rhs.extend_ignore_re.iter().map(|r| r.as_str())) + } +} + +impl Eq for EngineConfig {} + #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(deny_unknown_fields)] #[serde(default)] diff --git a/crates/typos-cli/src/file.rs b/crates/typos-cli/src/file.rs index f727f792b..cd6924ecb 100644 --- a/crates/typos-cli/src/file.rs +++ b/crates/typos-cli/src/file.rs @@ -48,7 +48,14 @@ impl FileChecker for Typos { reporter.report(msg.into())?; } else { let mut accum_line_num = AccumulateLineNum::new(); + let mut ignores: Option = None; for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) { + if ignores + .get_or_insert_with(|| 
Ignores::new(&buffer, policy.ignore)) + .is_ignored(typo.span()) + { + continue; + } let line_num = accum_line_num.line_num(&buffer, typo.byte_offset); let (line, line_offset) = extract_line(&buffer, typo.byte_offset); let msg = report::Typo { @@ -86,7 +93,14 @@ impl FileChecker for FixTypos { } else { let mut fixes = Vec::new(); let mut accum_line_num = AccumulateLineNum::new(); + let mut ignores: Option = None; for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(typo.span()) + { + continue; + } if is_fixable(&typo) { fixes.push(typo.into_owned()); } else { @@ -163,7 +177,14 @@ impl FileChecker for DiffTypos { } else { let mut fixes = Vec::new(); let mut accum_line_num = AccumulateLineNum::new(); + let mut ignores: Option = None; for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(typo.span()) + { + continue; + } if is_fixable(&typo) { fixes.push(typo.into_owned()); } else { @@ -276,7 +297,14 @@ impl FileChecker for Identifiers { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { + let mut ignores: Option = None; for word in policy.tokenizer.parse_bytes(&buffer) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(word.span()) + { + continue; + } // HACK: Don't look up the line_num per entry to better match the performance // of Typos for comparison purposes. We don't really get much out of it // anyway. 
@@ -329,11 +357,18 @@ impl FileChecker for Words { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { + let mut ignores: Option = None; for word in policy .tokenizer .parse_bytes(&buffer) .flat_map(|i| i.split()) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(word.span()) + { + continue; + } // HACK: Don't look up the line_num per entry to better match the performance // of Typos for comparison purposes. We don't really get much out of it // anyway. @@ -644,6 +679,33 @@ fn walk_entry( Ok(()) } +#[derive(Clone, Debug)] +struct Ignores { + blocks: Vec>, +} + +impl Ignores { + fn new(content: &[u8], ignores: &[regex::Regex]) -> Self { + let mut blocks = Vec::new(); + if let Ok(content) = std::str::from_utf8(content) { + for ignore in ignores { + for mat in ignore.find_iter(content) { + blocks.push(mat.range()); + } + } + } + Self { blocks } + } + + fn is_ignored(&self, span: std::ops::Range) -> bool { + let start = span.start; + let end = span.end.saturating_sub(1); + self.blocks + .iter() + .any(|block| block.contains(&start) || block.contains(&end)) + } +} + #[cfg(test)] mod test { use super::*; diff --git a/crates/typos-cli/src/policy.rs b/crates/typos-cli/src/policy.rs index 054729d7b..88703413f 100644 --- a/crates/typos-cli/src/policy.rs +++ b/crates/typos-cli/src/policy.rs @@ -42,6 +42,7 @@ pub struct ConfigEngine<'s> { walk: Intern, tokenizer: Intern, dict: Intern>, + ignore: Intern>, } impl<'s> ConfigEngine<'s> { @@ -54,6 +55,7 @@ impl<'s> ConfigEngine<'s> { walk: Default::default(), tokenizer: Default::default(), dict: Default::default(), + ignore: Default::default(), } } @@ -88,7 +90,7 @@ impl<'s> ConfigEngine<'s> { dir.type_matcher.definitions() } - pub fn policy(&self, path: &std::path::Path) -> Policy<'_, '_> { + pub fn policy(&self, path: &std::path::Path) -> Policy<'_, '_, '_> { debug_assert!(path.is_absolute(), "{} is not absolute", path.display()); let dir = 
self.get_dir(path).expect("`walk()` should be called first"); let (file_type, file_config) = dir.get_file_config(path); @@ -99,6 +101,7 @@ impl<'s> ConfigEngine<'s> { binary: file_config.binary, tokenizer: self.get_tokenizer(&file_config), dict: self.get_dict(&file_config), + ignore: self.get_ignore(&file_config), } } @@ -114,6 +117,10 @@ impl<'s> ConfigEngine<'s> { self.dict.get(file.dict) } + fn get_ignore(&self, file: &FileConfig) -> &[regex::Regex] { + self.ignore.get(file.ignore) + } + fn get_dir(&self, path: &std::path::Path) -> Option<&DirConfig> { for path in path.ancestors() { if let Some(dir) = self.configs.get(path) { @@ -220,7 +227,10 @@ impl<'s> ConfigEngine<'s> { let check_filename = engine.check_filename(); let check_file = engine.check_file(); let crate::config::EngineConfig { - tokenizer, dict, .. + tokenizer, + dict, + extend_ignore_re, + .. } = engine; let tokenizer_config = tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults); @@ -254,12 +264,15 @@ impl<'s> ConfigEngine<'s> { let dict = self.dict.intern(dict); let tokenizer = self.tokenizer.intern(tokenizer); + let ignore = self.ignore.intern(extend_ignore_re); + FileConfig { check_filenames: check_filename, check_files: check_file, binary, tokenizer, dict, + ignore, } } } @@ -328,20 +341,22 @@ struct FileConfig { check_filenames: bool, check_files: bool, binary: bool, + ignore: usize, } #[non_exhaustive] #[derive(derive_setters::Setters)] -pub struct Policy<'t, 'd> { +pub struct Policy<'t, 'd, 'i> { pub check_filenames: bool, pub check_files: bool, pub file_type: Option<&'d str>, pub binary: bool, pub tokenizer: &'t typos::tokens::Tokenizer, pub dict: &'d dyn typos::Dictionary, + pub ignore: &'i [regex::Regex], } -impl<'t, 'd> Policy<'t, 'd> { +impl<'t, 'd, 'i> Policy<'t, 'd, 'i> { pub fn new() -> Self { Default::default() } @@ -350,8 +365,9 @@ impl<'t, 'd> Policy<'t, 'd> { static DEFAULT_TOKENIZER: once_cell::sync::Lazy = 
once_cell::sync::Lazy::new(typos::tokens::Tokenizer::new); static DEFAULT_DICT: crate::dict::BuiltIn = crate::dict::BuiltIn::new(crate::config::Locale::En); +static DEFAULT_IGNORE: &[regex::Regex] = &[]; -impl<'t, 'd> Default for Policy<'t, 'd> { +impl<'t, 'd, 'i> Default for Policy<'t, 'd, 'i> { fn default() -> Self { Self { check_filenames: true, @@ -360,6 +376,7 @@ impl<'t, 'd> Default for Policy<'t, 'd> { binary: false, tokenizer: &DEFAULT_TOKENIZER, dict: &DEFAULT_DICT, + ignore: DEFAULT_IGNORE, } } } diff --git a/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml b/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml index de98037fa..f248fcb30 100644 --- a/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml +++ b/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml @@ -1,5 +1,8 @@ [files] extend-exclude = ["_typos.toml"] +[default] +extend-ignore-re = ["`.*`"] + [default.extend-identifiers] hello = "goodbye" diff --git a/crates/typos-cli/tests/cmd/extend-ignore-re.toml b/crates/typos-cli/tests/cmd/extend-ignore-re.toml index f87f0830e..af1b948b2 100644 --- a/crates/typos-cli/tests/cmd/extend-ignore-re.toml +++ b/crates/typos-cli/tests/cmd/extend-ignore-re.toml @@ -7,12 +7,6 @@ error: `hello` should be `goodbye` 1 | hello `hello` | ^^^^^ | -error: `hello` should be `goodbye` - --> ./file.ignore:1:8 - | -1 | hello `hello` - | ^^^^^ - | """ stderr = "" status.code = 2 diff --git a/crates/typos/src/check.rs b/crates/typos/src/check.rs index cfe8372e1..c20eddea3 100644 --- a/crates/typos/src/check.rs +++ b/crates/typos/src/check.rs @@ -86,6 +86,12 @@ impl<'m> Typo<'m> { corrections: self.corrections.borrow(), } } + + pub fn span(&self) -> std::ops::Range { + let start = self.byte_offset; + let end = start + self.typo.len(); + start..end + } } impl<'m> Default for Typo<'m> { diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index c5ae1f8d7..db0891491 100644 --- a/crates/typos/src/tokens.rs +++ 
b/crates/typos/src/tokens.rs @@ -634,6 +634,13 @@ impl<'t> Identifier<'t> { self.offset } + #[inline] + pub fn span(&self) -> std::ops::Range { + let start = self.offset; + let end = start + self.token.len(); + start..end + } + /// Split into individual Words. #[inline] pub fn split(&self) -> impl Iterator> { @@ -702,6 +709,13 @@ impl<'t> Word<'t> { pub fn offset(&self) -> usize { self.offset } + + #[inline] + pub fn span(&self) -> std::ops::Range { + let start = self.offset; + let end = start + self.token.len(); + start..end + } } struct SplitIdent<'s> { diff --git a/docs/reference.md b/docs/reference.md index d5d69c6c6..32ce1dda5 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -27,6 +27,7 @@ Configuration is read from the following (in precedence order) | default.check-file | \- | bool | Verifying spelling in files. | | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | +| default.extend-ignore-re | \- | list of [regexes](https://docs.rs/regex/latest/regex/index.html#syntax) | Custom uncorrectable sections (e.g. markdown code fences, PGP signatures, etc) | | default.extend-identifiers | \- | table of strings | Corrections for [identifiers](./design.md#identifiers-and-words). When the correction is blank, the identifier is never valid. When the correction is the key, the identifier is always valid. | | default.extend-ignore-identifiers-re | \- | list of [regexes](https://docs.rs/regex/latest/regex/index.html#syntax) | Pattern-match always-valid identifiers | | default.extend-words | \- | table of strings | Corrections for [words](./design.md#identifiers-and-words). When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |