Migrate to ruff.

scrapy · Feb 22, 2025 · 8450791 · 8450791
1 parent 6484393
commit 8450791
Show file tree

Hide file tree

Showing 14 changed files with 266 additions and 235 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
diff --git a/.coveragerc b/.coveragerc
diff --git a/.flake8 b/.flake8
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,20 +1,7 @@
 repos:
-  - hooks:
-      - id: black
-    repo: https://github.com/psf/black
-    rev: 24.10.0
-  - hooks:
-      - id: isort
-        language_version: python3
-    repo: https://github.com/timothycrosley/isort
-    rev: 5.13.2
-  - hooks:
-      - id: flake8
-        language_version: python3
-        additional_dependencies:
-          - flake8-bugbear
-          - flake8-comprehensions
-          - flake8-debugger
-          - flake8-string-format
-    repo: https://github.com/pycqa/flake8
-    rev: 7.1.1
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.9.7
+  hooks:
+    - id: ruff
+      args: [ --fix ]
+    - id: ruff-format
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,3 +1,142 @@
-[tool.isort]
-profile = "black"
-multi_line_output = 3
+[tool.bumpversion]
+current_version = "0.4.0"
+commit = true
+tag = true
+tag_name = "{new_version}"
+
+[[tool.bumpversion.files]]
+filename = "setup.py"
+
+[tool.coverage.run]
+branch = true
+
+[tool.coverage.report]
+exclude_also = [
+    "if TYPE_CHECKING:",
+]
+
+[tool.ruff.lint]
+extend-select = [
+    # flake8-bugbear
+    "B",
+    # flake8-comprehensions
+    "C4",
+    # pydocstyle
+    "D",
+    # flake8-future-annotations
+    "FA",
+    # flynt
+    "FLY",
+    # refurb
+    "FURB",
+    # isort
+    "I",
+    # flake8-implicit-str-concat
+    "ISC",
+    # flake8-logging
+    "LOG",
+    # Perflint
+    "PERF",
+    # pygrep-hooks
+    "PGH",
+    # flake8-pie
+    "PIE",
+    # pylint
+    "PL",
+    # flake8-use-pathlib
+    "PTH",
+    # flake8-pyi
+    "PYI",
+    # flake8-quotes
+    "Q",
+    # flake8-return
+    "RET",
+    # flake8-raise
+    "RSE",
+    # Ruff-specific rules
+    "RUF",
+    # flake8-bandit
+    "S",
+    # flake8-simplify
+    "SIM",
+    # flake8-slots
+    "SLOT",
+    # flake8-debugger
+    "T10",
+    # flake8-type-checking
+    "TC",
+    # pyupgrade
+    "UP",
+    # pycodestyle warnings
+    "W",
+    # flake8-2020
+    "YTT",
+]
+ignore = [
+    # Missing docstring in public module
+    "D100",
+    # Missing docstring in public class
+    "D101",
+    # Missing docstring in public method
+    "D102",
+    # Missing docstring in public function
+    "D103",
+    # Missing docstring in public package
+    "D104",
+    # Missing docstring in magic method
+    "D105",
+    # Missing docstring in public nested class
+    "D106",
+    # Missing docstring in __init__
+    "D107",
+    # One-line docstring should fit on one line with quotes
+    "D200",
+    # No blank lines allowed after function docstring
+    "D202",
+    # 1 blank line required between summary line and description
+    "D205",
+    # Multi-line docstring closing quotes should be on a separate line
+    "D209",
+    # First line should end with a period
+    "D400",
+    # First line should be in imperative mood; try rephrasing
+    "D401",
+    # First line should not be the function's "signature"
+    "D402",
+    # First word of the first line should be properly capitalized
+    "D403",
+    # `try`-`except` within a loop incurs performance overhead
+    "PERF203",
+    # Too many return statements
+    "PLR0911",
+    # Too many branches
+    "PLR0912",
+    # Too many arguments in function definition
+    "PLR0913",
+    # Too many statements
+    "PLR0915",
+    # Magic value used in comparison
+    "PLR2004",
+    # `for` loop variable `line` overwritten by assignment target
+    "PLW2901",
+    # String contains ambiguous Unicode character
+    "RUF001",
+    # Docstring contains ambiguous Unicode character
+    "RUF002",
+    # Comment contains ambiguous Unicode character
+    "RUF003",
+    # Mutable class attributes should be annotated with `typing.ClassVar`
+    "RUF012",
+    # Use of `assert` detected
+    "S101",
+
+    # to be done when adding type hints
+    # Use `typing.NamedTuple` instead of `collections.namedtuple`
+    "PYI024",
+]
+
+[tool.ruff.lint.pydocstyle]
+convention = "pep257"
+
+[tool.ruff.lint.per-file-ignores]
+"tests/*" = ["S"]
diff --git a/setup.cfg b/setup.cfg
diff --git a/setup.py b/setup.py
@@ -1,11 +1,12 @@
-#!/usr/bin/env python
+from pathlib import Path
+
 from setuptools import find_packages, setup
 
 setup(
     name="Protego",
     version="0.4.0",
     description="Pure-Python robots.txt parser with support for modern conventions",
-    long_description=open("README.rst").read(),
+    long_description=Path("README.rst").read_text(encoding="utf-8"),
     long_description_content_type="text/x-rst",
     url="https://github.com/scrapy/protego",
     author="Anubhav Patel",

diff --git a/src/protego.py b/src/protego.py
@@ -32,7 +32,7 @@
 
 _HEX_DIGITS = set("0123456789ABCDEFabcdef")
 
-__all__ = ["RequestRate", "Protego"]
+__all__ = ["Protego", "RequestRate"]
 
 
 def _is_valid_directive_field(field):
@@ -49,7 +49,7 @@ def _is_valid_directive_field(field):
     )
 
 
-class _URLPattern(object):
+class _URLPattern:
     """Internal class which represents a URL pattern."""
 
     def __init__(self, pattern):
@@ -96,11 +96,10 @@ def _prepare_pattern_for_regex(self, pattern):
                 s[index] = re.escape(substr)
             elif s[index] == "*":
                 s[index] = ".*?"
-        pattern = "".join(s)
-        return pattern
+        return "".join(s)
 
 
-class _RuleSet(object):
+class _RuleSet:
     """Internal class which stores rules for a user agent."""
 
     def __init__(self, parser_instance):
@@ -131,23 +130,21 @@ def hex_to_byte(h):
 
         # ignore contains %xy escapes for characters that are not
         # meant to be converted back.
-        ignore = {"{ord_c:02X}".format(ord_c=ord(c)) for c in ignore}
+        ignore = {f"{ord(c):02X}" for c in ignore}
 
         parts = url.split("%")
         parts[0] = parts[0].encode("utf-8")
 
         for i in range(1, len(parts)):
-            if len(parts[i]) >= 2:
-                # %xy is a valid escape only if x and y are hexadecimal digits.
-                if set(parts[i][:2]).issubset(_HEX_DIGITS):
-                    # make sure that all %xy escapes are in uppercase.
-                    hexcode = parts[i][:2].upper()
-                    leftover = parts[i][2:]
-                    if hexcode not in ignore:
-                        parts[i] = hex_to_byte(hexcode) + leftover.encode("utf-8")
-                        continue
-                    else:
-                        parts[i] = hexcode + leftover
+            # %xy is a valid escape only if x and y are hexadecimal digits.
+            if len(parts[i]) >= 2 and set(parts[i][:2]).issubset(_HEX_DIGITS):
+                # make sure that all %xy escapes are in uppercase.
+                hexcode = parts[i][:2].upper()
+                leftover = parts[i][2:]
+                if hexcode not in ignore:
+                    parts[i] = hex_to_byte(hexcode) + leftover.encode("utf-8")
+                    continue
+                parts[i] = hexcode + leftover
 
             # add back the '%' we removed during splitting.
             parts[i] = b"%" + parts[i].encode("utf-8")
@@ -158,8 +155,8 @@ def hexescape(self, char):
         """Escape char as RFC 2396 specifies"""
         hex_repr = hex(ord(char))[2:].upper()
         if len(hex_repr) == 1:
-            hex_repr = "0%s" % hex_repr
-        return "%" + hex_repr
+            hex_repr = f"0{hex_repr}"
+        return f"%{hex_repr}"
 
     def _quote_path(self, path):
         """Return percent encoded path."""
@@ -172,7 +169,7 @@ def _quote_path(self, path):
         return path or "/"
 
     def _quote_pattern(self, pattern):
-        if pattern.startswith("https://") or pattern.startswith("http://"):
+        if pattern.startswith(("https://", "http://")):
             pattern = "/" + pattern
         if pattern.startswith("//"):
             pattern = "//" + pattern
@@ -191,8 +188,7 @@ def _quote_pattern(self, pattern):
         parts = ParseResult(
             "", "", pattern + last_char, parts.params, parts.query, parts.fragment
         )
-        pattern = urlunparse(parts)
-        return pattern
+        return urlunparse(parts)
 
     def allow(self, pattern):
         if "$" in pattern:
@@ -244,10 +240,8 @@ def crawl_delay(self, delay):
         except ValueError:
             # Value is malformed, do nothing.
             logger.debug(
-                "Malformed rule at line {line_seen} : cannot set crawl delay to '{delay}'. "
-                "Ignoring this rule.".format(
-                    line_seen=self._parser_instance._total_line_seen, delay=delay
-                )
+                f"Malformed rule at line {self._parser_instance._total_line_seen} : "
+                f"cannot set crawl delay to '{delay}'. Ignoring this rule."
             )
             return
 
@@ -285,10 +279,8 @@ def request_rate(self, value):
         except Exception:
             # Value is malformed, do nothing.
             logger.debug(
-                "Malformed rule at line {line_seen} : cannot set request rate using '{value}'. "
-                "Ignoring this rule.".format(
-                    line_seen=self._parser_instance._total_line_seen, value=value
-                )
+                f"Malformed rule at line {self._parser_instance._total_line_seen} : "
+                f"cannot set request rate using '{value}'. Ignoring this rule."
             )
             return
 
@@ -312,16 +304,14 @@ def visit_time(self, value):
             start_time, end_time = self._parse_time_period(value, separator=" ")
         except Exception:
             logger.debug(
-                "Malformed rule at line {line_seen} : cannot set visit time using '{value}'. "
-                "Ignoring this rule.".format(
-                    line_seen=self._parser_instance._total_line_seen, value=value
-                )
+                f"Malformed rule at line {self._parser_instance._total_line_seen} : "
+                f"cannot set visit time using '{value}'. Ignoring this rule."
             )
             return
         self._visit_time = VisitTime(start_time, end_time)
 
 
-class Protego(object):
+class Protego:
     def __init__(self):
         # A dict mapping user agents (specified in robots.txt) to rule sets.
         self._user_agents = {}
@@ -403,9 +393,7 @@ def _parse_robotstxt(self, content):
                 and field not in _SITEMAP_DIRECTIVE
             ):
                 logger.debug(
-                    "Rule at line {line_seen} without any user agent to enforce it on.".format(
-                        line_seen=self._total_line_seen
-                    )
+                    f"Rule at line {self._total_line_seen} without any user agent to enforce it on."
                 )
                 continue