Skip to content

Commit

Permalink
[ruff] Implement unnecessary-regular-expression (RUF055) (#14659)
Browse files Browse the repository at this point in the history
Co-authored-by: Micha Reiser <micha@reiser.io>
Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
Co-authored-by: Simon Brugman <sbrugman@users.noreply.github.com>
  • Loading branch information
4 people authored Nov 28, 2024
1 parent dc29f52 commit 224fe75
Show file tree
Hide file tree
Showing 8 changed files with 464 additions and 0 deletions.
76 changes: 76 additions & 0 deletions crates/ruff_linter/resources/test/fixtures/ruff/RUF055.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import re

s = "str"

# this should be replaced with s.replace("abc", "")
re.sub("abc", "", s)


# this example, adapted from https://docs.python.org/3/library/re.html#re.sub,
# should *not* be replaced because repl is a function, not a string
def dashrepl(matchobj):
if matchobj.group(0) == "-":
return " "
else:
return "-"


re.sub("-", dashrepl, "pro----gram-files")

# this one should be replaced with s.startswith("abc") because the Match is
# used in an if context for its truth value
if re.match("abc", s):
pass
if m := re.match("abc", s): # this should *not* be replaced
pass
re.match("abc", s) # this should not be replaced because match returns a Match

# this should be replaced with "abc" in s
if re.search("abc", s):
pass
re.search("abc", s) # this should not be replaced

# this should be replaced with "abc" == s
if re.fullmatch("abc", s):
pass
re.fullmatch("abc", s) # this should not be replaced

# this should be replaced with s.split("abc")
re.split("abc", s)

# these currently should not be modified because the patterns contain regex
# metacharacters
re.sub("ab[c]", "", s)
re.match("ab[c]", s)
re.search("ab[c]", s)
re.fullmatch("ab[c]", s)
re.split("ab[c]", s)

# test that all of the metacharacters prevent the rule from triggering, also
# use raw strings in line with RUF039
re.sub(r"abc.", "", s)
re.sub(r"^abc", "", s)
re.sub(r"abc$", "", s)
re.sub(r"abc*", "", s)
re.sub(r"abc+", "", s)
re.sub(r"abc?", "", s)
re.sub(r"abc{2,3}", "", s)
re.sub(r"abc\n", "", s) # this one could be fixed but is not currently
re.sub(r"abc|def", "", s)
re.sub(r"(a)bc", "", s)

# and these should not be modified because they have extra arguments
re.sub("abc", "", s, flags=re.A)
re.match("abc", s, flags=re.I)
re.search("abc", s, flags=re.L)
re.fullmatch("abc", s, flags=re.M)
re.split("abc", s, maxsplit=2)

# this should trigger an unsafe fix because of the presence of comments
re.sub(
# pattern
"abc",
# repl
"",
s, # string
)
3 changes: 3 additions & 0 deletions crates/ruff_linter/src/checkers/ast/analyze/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,9 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) {
if checker.enabled(Rule::AirflowDagNoScheduleArgument) {
airflow::rules::dag_no_schedule_argument(checker, expr);
}
if checker.enabled(Rule::UnnecessaryRegularExpression) {
ruff::rules::unnecessary_regular_expression(checker, call);
}
}
Expr::Dict(dict) => {
if checker.any_enabled(&[
Expand Down
1 change: 1 addition & 0 deletions crates/ruff_linter/src/codes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> {
(Ruff, "040") => (RuleGroup::Preview, rules::ruff::rules::InvalidAssertMessageLiteralArgument),
(Ruff, "041") => (RuleGroup::Preview, rules::ruff::rules::UnnecessaryNestedLiteral),
(Ruff, "048") => (RuleGroup::Preview, rules::ruff::rules::MapIntVersionParsing),
(Ruff, "055") => (RuleGroup::Preview, rules::ruff::rules::UnnecessaryRegularExpression),
(Ruff, "100") => (RuleGroup::Stable, rules::ruff::rules::UnusedNOQA),
(Ruff, "101") => (RuleGroup::Stable, rules::ruff::rules::RedirectedNOQA),

Expand Down
1 change: 1 addition & 0 deletions crates/ruff_linter/src/rules/ruff/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,7 @@ mod tests {
#[test_case(Rule::MapIntVersionParsing, Path::new("RUF048_1.py"))]
#[test_case(Rule::UnrawRePattern, Path::new("RUF039.py"))]
#[test_case(Rule::UnrawRePattern, Path::new("RUF039_concat.py"))]
#[test_case(Rule::UnnecessaryRegularExpression, Path::new("RUF055.py"))]
fn preview_rules(rule_code: Rule, path: &Path) -> Result<()> {
let snapshot = format!(
"preview__{}_{}",
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_linter/src/rules/ruff/rules/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pub(crate) use test_rules::*;
pub(crate) use unnecessary_iterable_allocation_for_first_element::*;
pub(crate) use unnecessary_key_check::*;
pub(crate) use unnecessary_nested_literal::*;
pub(crate) use unnecessary_regular_expression::*;
pub(crate) use unraw_re_pattern::*;
pub(crate) use unsafe_markup_use::*;
pub(crate) use unused_async::*;
Expand Down Expand Up @@ -79,6 +80,7 @@ pub(crate) mod test_rules;
mod unnecessary_iterable_allocation_for_first_element;
mod unnecessary_key_check;
mod unnecessary_nested_literal;
mod unnecessary_regular_expression;
mod unraw_re_pattern;
mod unsafe_markup_use;
mod unused_async;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
use ruff_diagnostics::{AlwaysFixableViolation, Applicability, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, ViolationMetadata};
use ruff_python_ast::{
Arguments, CmpOp, Expr, ExprAttribute, ExprCall, ExprCompare, ExprContext, Identifier,
};
use ruff_python_semantic::{Modules, SemanticModel};
use ruff_text_size::TextRange;

use crate::checkers::ast::Checker;

/// ## What it does
///
/// Checks for uses of the `re` module that can be replaced with builtin `str` methods.
///
/// ## Why is this bad?
///
/// Performing checks on strings directly can make the code simpler, may require
/// less escaping, and will often be faster.
///
/// ## Example
///
/// ```python
/// re.sub("abc", "", s)
/// ```
///
/// Use instead:
///
/// ```python
/// s.replace("abc", "")
/// ```
///
/// ## Details
///
/// The rule reports the following calls when the first argument to the call is
/// a plain string literal, and no additional flags are passed:
///
/// - `re.sub`
/// - `re.match`
/// - `re.search`
/// - `re.fullmatch`
/// - `re.split`
///
/// For `re.sub`, the `repl` (replacement) argument must also be a string literal,
/// not a function. For `re.match`, `re.search`, and `re.fullmatch`, the return
/// value must also be used only for its truth value.
///
/// ## Fix safety
///
/// This rule's fix is marked as unsafe if the affected expression contains comments. Otherwise,
/// the fix can be applied safely.
///
/// ## References
/// - [Python Regular Expression HOWTO: Common Problems - Use String Methods](https://docs.python.org/3/howto/regex.html#use-string-methods)
#[derive(ViolationMetadata)]
pub(crate) struct UnnecessaryRegularExpression {
replacement: String,
}

impl AlwaysFixableViolation for UnnecessaryRegularExpression {
#[derive_message_formats]
fn message(&self) -> String {
"Plain string pattern passed to `re` function".to_string()
}

fn fix_title(&self) -> String {
format!("Replace with `{}`", self.replacement)
}
}

/// RUF055
pub(crate) fn unnecessary_regular_expression(checker: &mut Checker, call: &ExprCall) {
// adapted from unraw_re_pattern
let semantic = checker.semantic();

if !semantic.seen_module(Modules::RE) {
return;
}

let Some(qualified_name) = semantic.resolve_qualified_name(&call.func) else {
return;
};

let ["re", func] = qualified_name.segments() else {
return;
};

// skip calls with more than `pattern` and `string` arguments (and `repl`
// for `sub`)
let Some(re_func) = ReFunc::from_call_expr(semantic, call, func) else {
return;
};

// For now, restrict this rule to string literals
let Some(string_lit) = re_func.pattern.as_string_literal_expr() else {
return;
};

// For now, reject any regex metacharacters. Compare to the complete list
// from https://docs.python.org/3/howto/regex.html#matching-characters
let has_metacharacters = string_lit
.value
.to_str()
.contains(['.', '^', '$', '*', '+', '?', '{', '[', '\\', '|', '(']);

if has_metacharacters {
return;
}

// Here we know the pattern is a string literal with no metacharacters, so
// we can proceed with the str method replacement
let new_expr = re_func.replacement();

let repl = checker.generator().expr(&new_expr);
let diagnostic = Diagnostic::new(
UnnecessaryRegularExpression {
replacement: repl.clone(),
},
call.range,
);

let fix = Fix::applicable_edit(
Edit::range_replacement(repl, call.range),
if checker
.comment_ranges()
.has_comments(call, checker.source())
{
Applicability::Unsafe
} else {
Applicability::Safe
},
);

checker.diagnostics.push(diagnostic.with_fix(fix));
}

/// The `re` functions supported by this rule.
#[derive(Debug)]
enum ReFuncKind<'a> {
Sub { repl: &'a Expr },
Match,
Search,
Fullmatch,
Split,
}

#[derive(Debug)]
struct ReFunc<'a> {
kind: ReFuncKind<'a>,
pattern: &'a Expr,
string: &'a Expr,
}

impl<'a> ReFunc<'a> {
fn from_call_expr(
semantic: &SemanticModel,
call: &'a ExprCall,
func_name: &str,
) -> Option<Self> {
// the proposed fixes for match, search, and fullmatch rely on the
// return value only being used for its truth value
let in_if_context = semantic.in_boolean_test();

match (func_name, call.arguments.len()) {
// `split` is the safest of these to fix, as long as metacharacters
// have already been filtered out from the `pattern`
("split", 2) => Some(ReFunc {
kind: ReFuncKind::Split,
pattern: call.arguments.find_argument("pattern", 0)?,
string: call.arguments.find_argument("string", 1)?,
}),
// `sub` is only safe to fix if `repl` is a string. `re.sub` also
// allows it to be a function, which will *not* work in the str
// version
("sub", 3) => {
let repl = call.arguments.find_argument("repl", 1)?;
if !repl.is_string_literal_expr() {
return None;
}
Some(ReFunc {
kind: ReFuncKind::Sub { repl },
pattern: call.arguments.find_argument("pattern", 0)?,
string: call.arguments.find_argument("string", 2)?,
})
}
("match", 2) if in_if_context => Some(ReFunc {
kind: ReFuncKind::Match,
pattern: call.arguments.find_argument("pattern", 0)?,
string: call.arguments.find_argument("string", 1)?,
}),
("search", 2) if in_if_context => Some(ReFunc {
kind: ReFuncKind::Search,
pattern: call.arguments.find_argument("pattern", 0)?,
string: call.arguments.find_argument("string", 1)?,
}),
("fullmatch", 2) if in_if_context => Some(ReFunc {
kind: ReFuncKind::Fullmatch,
pattern: call.arguments.find_argument("pattern", 0)?,
string: call.arguments.find_argument("string", 1)?,
}),
_ => None,
}
}

fn replacement(&self) -> Expr {
match self.kind {
// string.replace(pattern, repl)
ReFuncKind::Sub { repl } => {
self.method_expr("replace", vec![self.pattern.clone(), repl.clone()])
}
// string.startswith(pattern)
ReFuncKind::Match => self.method_expr("startswith", vec![self.pattern.clone()]),
// pattern in string
ReFuncKind::Search => self.compare_expr(CmpOp::In),
// string == pattern
ReFuncKind::Fullmatch => self.compare_expr(CmpOp::Eq),
// string.split(pattern)
ReFuncKind::Split => self.method_expr("split", vec![self.pattern.clone()]),
}
}

/// Return a new compare expr of the form `self.pattern op self.string`
fn compare_expr(&self, op: CmpOp) -> Expr {
Expr::Compare(ExprCompare {
left: Box::new(self.pattern.clone()),
ops: Box::new([op]),
comparators: Box::new([self.string.clone()]),
range: TextRange::default(),
})
}

/// Return a new method call expression on `self.string` with `args` like
/// `self.string.method(args...)`
fn method_expr(&self, method: &str, args: Vec<Expr>) -> Expr {
let method = Expr::Attribute(ExprAttribute {
value: Box::new(self.string.clone()),
attr: Identifier::new(method, TextRange::default()),
ctx: ExprContext::Load,
range: TextRange::default(),
});
Expr::Call(ExprCall {
func: Box::new(method),
arguments: Arguments {
args: args.into_boxed_slice(),
keywords: Box::new([]),
range: TextRange::default(),
},
range: TextRange::default(),
})
}
}
Loading

0 comments on commit 224fe75

Please sign in to comment.