Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add script to shrink all formatter errors #5943

Merged
merged 3 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 27 additions & 9 deletions crates/ruff_python_formatter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,15 +267,6 @@ git clone --branch 3.10 https://github.com/python/cpython.git crates/ruff/resour
cargo run --bin ruff_dev -- format-dev --stability-check crates/ruff/resources/test/cpython
```

It is also possible large number of repositories using ruff. This dataset is large (~60GB), so we
only do this occasionally:

```shell
curl https://raw.githubusercontent.com/akx/ruff-usage-aggregate/master/data/known-github-tomls-clean.jsonl> github_search.jsonl
python scripts/check_ecosystem.py --checkouts target/checkouts --projects github_search.jsonl -v $(which true) $(which true)
cargo run --bin ruff_dev -- format-dev --stability-check --multi-project target/checkouts
```

Compared to `ruff check`, `cargo run --bin ruff_dev -- format-dev` has 4 additional options:

- `--write`: Format the files and write them back to disk
Expand All @@ -284,6 +275,33 @@ Compared to `ruff check`, `cargo run --bin ruff_dev -- format-dev` has 4 additio
- `--error-file`: Use together with `--multi-project`, this writes all errors (but not status
messages) to a file.

It is also possible to check a large number of repositories. This dataset is large (~60GB), so we
only do this occasionally:

```shell
# Get the list of projects
curl https://raw.githubusercontent.com/akx/ruff-usage-aggregate/master/data/known-github-tomls-clean.jsonl > github_search.jsonl
# Repurpose this script to download the repositories for us
python scripts/check_ecosystem.py --checkouts target/checkouts --projects github_search.jsonl -v $(which true) $(which true)
# Check each project for formatter stability
cargo run --bin ruff_dev -- format-dev --stability-check --error-file target/formatter-ecosystem-errors.txt --multi-project target/checkouts
```

To shrink a formatter error from an entire file to a minimal reproducible example, you can use
`ruff_shrinking`:

```shell
cargo run --bin ruff_shrinking -- <your_file> target/shrinking.py "Unstable formatting" "target/release/ruff_dev format-dev --stability-check target/shrinking.py"
```

The first argument is the input file, the second is the output file where the candidates
and the eventual minimized version will be written to. The third argument is a regex matching the
error message, e.g. "Unstable formatting" or "Formatter error". The last argument is the command
with the error, e.g. running the stability check on the candidate file. The script will try various
strategies to remove parts of the code. If the output of the command still matches, it will use that
slightly smaller code as starting point for the next iteration, otherwise it will revert and try
a different strategy until all strategies are exhausted.

## The orphan rules and trait structure

For the formatter, we would like to implement `Format` from the rust_formatter crate for all AST
Expand Down
75 changes: 75 additions & 0 deletions crates/ruff_python_formatter/shrink_formatter_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Take `format-dev --stability-check` output and shrink all stability errors into a
single Python file. Used to update https://github.com/astral-sh/ruff/issues/5828 ."""

from __future__ import annotations

import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from subprocess import check_output
from tempfile import NamedTemporaryFile

from tqdm import tqdm

root = Path(
check_output(["git", "rev-parse", "--show-toplevel"], text=True).strip(),
)
target = root.joinpath("target")

error_report = target.joinpath("formatter-ecosystem-errors.txt")
error_lines_prefix = "Unstable formatting "


def get_filenames() -> list[str]:
files = []
for line in error_report.read_text().splitlines():
if not line.startswith(error_lines_prefix):
continue
files.append(line.removeprefix(error_lines_prefix))
return files


def shrink_file(file: str) -> tuple[str, str]:
"""Returns filename and minimization"""
with NamedTemporaryFile(suffix=".py") as temp_file:
print(f"Starting {file}")
ruff_dev = target.joinpath("release").joinpath("ruff_dev")
check_output(
[
target.joinpath("release").joinpath("ruff_shrinking"),
file,
temp_file.name,
"Unstable formatting",
f"{ruff_dev} format-dev --stability-check {temp_file.name}",
],
)
print(f"Finished {file}")
return file, Path(temp_file.name).read_text()


def main():
storage = target.joinpath("minimizations.json")
output_file = target.joinpath("minimizations.py")
if storage.is_file():
outputs = json.loads(storage.read_text())
else:
outputs = {}
files = sorted(set(get_filenames()) - set(outputs))
# Each process will saturate one core
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
tasks = [executor.submit(shrink_file, file) for file in files]
for future in tqdm(as_completed(tasks), total=len(files)):
file, output = future.result()
outputs[file] = output
storage.write_text(json.dumps(outputs, indent=4))

# Write to one shareable python file
with output_file.open("w") as formatted:
for file, code in sorted(json.loads(storage.read_text()).items()):
file = file.split("/target/checkouts/")[1]
formatted.write(f"# {file}\n{code}\n")


if __name__ == "__main__":
main()
18 changes: 14 additions & 4 deletions crates/ruff_shrinking/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -422,9 +422,18 @@ fn run() -> Result<()> {
let loop_start = Instant::now();
let mut stats = HashMap::new();

let mut num_iterations = 0;
// normalize line endings for the remove newline dependent rules
// Normalize line endings for the remove newline dependent rules
let mut input = fs::read_to_string(args.input_file)?.replace('\r', "");

// This can happen e.g. when main changed between collecting the errors list and running this
// script
if !is_failing(&input, &args.output_file, &command_args, &pattern)? {
println!("Input doesn't match");
fs::write(&args.output_file, "")?;
return Ok(());
}

let mut num_iterations = 0;
let mut last_strategy_and_idx = None;
loop {
let start = Instant::now();
Expand Down Expand Up @@ -461,9 +470,10 @@ fn run() -> Result<()> {

println!("Strategies taken: {stats:?}");
println!(
"Done with {num_iterations} iterations in {:.2}s. Find your minimized example in {}",
"Done with {num_iterations} iterations in {:.2}s. Find your minimized example in {}:\n---\n{}\n---\n",
loop_start.elapsed().as_secs_f32(),
args.output_file.display()
args.output_file.display(),
input
);

Ok(())
Expand Down