Skip to content

Commit

Permalink
Migrate to ruff.
Browse files Browse the repository at this point in the history
  • Loading branch information
wRAR committed Feb 22, 2025
1 parent 6484393 commit 8450791
Show file tree
Hide file tree
Showing 14 changed files with 266 additions and 235 deletions.
7 changes: 0 additions & 7 deletions .bumpversion.cfg

This file was deleted.

4 changes: 0 additions & 4 deletions .coveragerc

This file was deleted.

15 changes: 0 additions & 15 deletions .flake8

This file was deleted.

25 changes: 6 additions & 19 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,7 @@
repos:
- hooks:
- id: black
repo: https://github.com/psf/black
rev: 24.10.0
- hooks:
- id: isort
language_version: python3
repo: https://github.com/timothycrosley/isort
rev: 5.13.2
- hooks:
- id: flake8
language_version: python3
additional_dependencies:
- flake8-bugbear
- flake8-comprehensions
- flake8-debugger
- flake8-string-format
repo: https://github.com/pycqa/flake8
rev: 7.1.1
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.7
hooks:
- id: ruff
args: [ --fix ]
- id: ruff-format
145 changes: 142 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,142 @@
[tool.isort]
profile = "black"
multi_line_output = 3
[tool.bumpversion]
current_version = "0.4.0"
commit = true
tag = true
tag_name = "{new_version}"

[[tool.bumpversion.files]]
filename = "setup.py"

[tool.coverage.run]
branch = true

[tool.coverage.report]
exclude_also = [
"if TYPE_CHECKING:",
]

[tool.ruff.lint]
extend-select = [
# flake8-bugbear
"B",
# flake8-comprehensions
"C4",
# pydocstyle
"D",
# flake8-future-annotations
"FA",
# flynt
"FLY",
# refurb
"FURB",
# isort
"I",
# flake8-implicit-str-concat
"ISC",
# flake8-logging
"LOG",
# Perflint
"PERF",
# pygrep-hooks
"PGH",
# flake8-pie
"PIE",
# pylint
"PL",
# flake8-use-pathlib
"PTH",
# flake8-pyi
"PYI",
# flake8-quotes
"Q",
# flake8-return
"RET",
# flake8-raise
"RSE",
# Ruff-specific rules
"RUF",
# flake8-bandit
"S",
# flake8-simplify
"SIM",
# flake8-slots
"SLOT",
# flake8-debugger
"T10",
# flake8-type-checking
"TC",
# pyupgrade
"UP",
# pycodestyle warnings
"W",
# flake8-2020
"YTT",
]
ignore = [
# Missing docstring in public module
"D100",
# Missing docstring in public class
"D101",
# Missing docstring in public method
"D102",
# Missing docstring in public function
"D103",
# Missing docstring in public package
"D104",
# Missing docstring in magic method
"D105",
# Missing docstring in public nested class
"D106",
# Missing docstring in __init__
"D107",
# One-line docstring should fit on one line with quotes
"D200",
# No blank lines allowed after function docstring
"D202",
# 1 blank line required between summary line and description
"D205",
# Multi-line docstring closing quotes should be on a separate line
"D209",
# First line should end with a period
"D400",
# First line should be in imperative mood; try rephrasing
"D401",
# First line should not be the function's "signature"
"D402",
# First word of the first line should be properly capitalized
"D403",
# `try`-`except` within a loop incurs performance overhead
"PERF203",
# Too many return statements
"PLR0911",
# Too many branches
"PLR0912",
# Too many arguments in function definition
"PLR0913",
# Too many statements
"PLR0915",
# Magic value used in comparison
"PLR2004",
# `for` loop variable `line` overwritten by assignment target
"PLW2901",
# String contains ambiguous Unicode character
"RUF001",
# Docstring contains ambiguous Unicode character
"RUF002",
# Comment contains ambiguous Unicode character
"RUF003",
# Mutable class attributes should be annotated with `typing.ClassVar`
"RUF012",
# Use of `assert` detected
"S101",

# to be done when adding type hints
# Use `typing.NamedTuple` instead of `collections.namedtuple`
"PYI024",
]

[tool.ruff.lint.pydocstyle]
convention = "pep257"

[tool.ruff.lint.per-file-ignores]
"tests/*" = ["S"]
8 changes: 0 additions & 8 deletions setup.cfg

This file was deleted.

5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#!/usr/bin/env python
from pathlib import Path

from setuptools import find_packages, setup

setup(
name="Protego",
version="0.4.0",
description="Pure-Python robots.txt parser with support for modern conventions",
long_description=open("README.rst").read(),
long_description=Path("README.rst").read_text(encoding="utf-8"),
long_description_content_type="text/x-rst",
url="https://github.com/scrapy/protego",
author="Anubhav Patel",
Expand Down
64 changes: 26 additions & 38 deletions src/protego.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

_HEX_DIGITS = set("0123456789ABCDEFabcdef")

__all__ = ["RequestRate", "Protego"]
__all__ = ["Protego", "RequestRate"]


def _is_valid_directive_field(field):
Expand All @@ -49,7 +49,7 @@ def _is_valid_directive_field(field):
)


class _URLPattern(object):
class _URLPattern:
"""Internal class which represents a URL pattern."""

def __init__(self, pattern):
Expand Down Expand Up @@ -96,11 +96,10 @@ def _prepare_pattern_for_regex(self, pattern):
s[index] = re.escape(substr)
elif s[index] == "*":
s[index] = ".*?"
pattern = "".join(s)
return pattern
return "".join(s)


class _RuleSet(object):
class _RuleSet:
"""Internal class which stores rules for a user agent."""

def __init__(self, parser_instance):
Expand Down Expand Up @@ -131,23 +130,21 @@ def hex_to_byte(h):

# ignore contains %xy escapes for characters that are not
# meant to be converted back.
ignore = {"{ord_c:02X}".format(ord_c=ord(c)) for c in ignore}
ignore = {f"{ord(c):02X}" for c in ignore}

parts = url.split("%")
parts[0] = parts[0].encode("utf-8")

for i in range(1, len(parts)):
if len(parts[i]) >= 2:
# %xy is a valid escape only if x and y are hexadecimal digits.
if set(parts[i][:2]).issubset(_HEX_DIGITS):
# make sure that all %xy escapes are in uppercase.
hexcode = parts[i][:2].upper()
leftover = parts[i][2:]
if hexcode not in ignore:
parts[i] = hex_to_byte(hexcode) + leftover.encode("utf-8")
continue
else:
parts[i] = hexcode + leftover
# %xy is a valid escape only if x and y are hexadecimal digits.
if len(parts[i]) >= 2 and set(parts[i][:2]).issubset(_HEX_DIGITS):
# make sure that all %xy escapes are in uppercase.
hexcode = parts[i][:2].upper()
leftover = parts[i][2:]
if hexcode not in ignore:
parts[i] = hex_to_byte(hexcode) + leftover.encode("utf-8")
continue
parts[i] = hexcode + leftover

# add back the '%' we removed during splitting.
parts[i] = b"%" + parts[i].encode("utf-8")
Expand All @@ -158,8 +155,8 @@ def hexescape(self, char):
"""Escape char as RFC 2396 specifies"""
hex_repr = hex(ord(char))[2:].upper()
if len(hex_repr) == 1:
hex_repr = "0%s" % hex_repr
return "%" + hex_repr
hex_repr = f"0{hex_repr}"
return f"%{hex_repr}"

def _quote_path(self, path):
"""Return percent encoded path."""
Expand All @@ -172,7 +169,7 @@ def _quote_path(self, path):
return path or "/"

def _quote_pattern(self, pattern):
if pattern.startswith("https://") or pattern.startswith("http://"):
if pattern.startswith(("https://", "http://")):
pattern = "/" + pattern
if pattern.startswith("//"):
pattern = "//" + pattern
Expand All @@ -191,8 +188,7 @@ def _quote_pattern(self, pattern):
parts = ParseResult(
"", "", pattern + last_char, parts.params, parts.query, parts.fragment
)
pattern = urlunparse(parts)
return pattern
return urlunparse(parts)

def allow(self, pattern):
if "$" in pattern:
Expand Down Expand Up @@ -244,10 +240,8 @@ def crawl_delay(self, delay):
except ValueError:
# Value is malformed, do nothing.
logger.debug(
"Malformed rule at line {line_seen} : cannot set crawl delay to '{delay}'. "
"Ignoring this rule.".format(
line_seen=self._parser_instance._total_line_seen, delay=delay
)
f"Malformed rule at line {self._parser_instance._total_line_seen} : "
f"cannot set crawl delay to '{delay}'. Ignoring this rule."
)
return

Expand Down Expand Up @@ -285,10 +279,8 @@ def request_rate(self, value):
except Exception:
# Value is malformed, do nothing.
logger.debug(
"Malformed rule at line {line_seen} : cannot set request rate using '{value}'. "
"Ignoring this rule.".format(
line_seen=self._parser_instance._total_line_seen, value=value
)
f"Malformed rule at line {self._parser_instance._total_line_seen} : "
f"cannot set request rate using '{value}'. Ignoring this rule."
)
return

Expand All @@ -312,16 +304,14 @@ def visit_time(self, value):
start_time, end_time = self._parse_time_period(value, separator=" ")
except Exception:
logger.debug(
"Malformed rule at line {line_seen} : cannot set visit time using '{value}'. "
"Ignoring this rule.".format(
line_seen=self._parser_instance._total_line_seen, value=value
)
f"Malformed rule at line {self._parser_instance._total_line_seen} : "
f"cannot set visit time using '{value}'. Ignoring this rule."
)
return
self._visit_time = VisitTime(start_time, end_time)


class Protego(object):
class Protego:
def __init__(self):
# A dict mapping user agents (specified in robots.txt) to rule sets.
self._user_agents = {}
Expand Down Expand Up @@ -403,9 +393,7 @@ def _parse_robotstxt(self, content):
and field not in _SITEMAP_DIRECTIVE
):
logger.debug(
"Rule at line {line_seen} without any user agent to enforce it on.".format(
line_seen=self._total_line_seen
)
f"Rule at line {self._total_line_seen} without any user agent to enforce it on."
)
continue

Expand Down
Loading

0 comments on commit 8450791

Please sign in to comment.