Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring suggestins #1

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions diffFiles/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
venv
dist
*.egg-info
.pytest_cache
__pycache__
42 changes: 36 additions & 6 deletions diffFiles/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,51 @@

## Installation

```
### During development
In `diffFiles/`:
```shell
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
pip install -e .
```

## Run
This makes an editable installation. You can test it by running
```shell
diff-file --help
```

To build an installable wheel and package, make sure you have
`build` installed and run
```shell
python -m build
```
source venv/bin/activate
bash run.sh
This will create `dist/` with `diff_files-0.1.tar.gz` and
`diff_files-0.1-py3-none-any.whl` in it.

To pin your requirements, make sure you have `pip-tools` installed and call
```shell
pip-compile -o requirements.txt pyproject.toml
```
This will update `requirements.txt` with the latest pinned versions.

### In "production"
The wheel and package can be installed on any computer, e.g. with
```shell
python -m pip install diff_files-0.1.tar.gz
```

## Run
An installation provides run-able executables in `venv/bin` (on Windows: `venv/Scripts`).

The main entry-point is `diff-file`, so you can use it like this:
```shell
diff-file FILE1 FILE2
```

## Test

In diffFiles-folder, run

```
bash test.sh
pytest
```
25 changes: 25 additions & 0 deletions diffFiles/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[project]
name = "diff-files"
version = "0.1"
description = "Return the difference between FILE1 and FILE2"
readme = "Readme.md"
authors = [{name = "Lucas Payr", email = "orasund@gmail.com"}]
dependencies = [
"click",
]

[project.scripts]
diff-files = "diff_files:diff_files"

[project.optional-dependencies]
test = [
"pytest",
]
dev = [
"build",
"pip-tools",
]

[build-system]
requires = ["setuptools >= 61.0"]
build-backend = "setuptools.build_meta"
12 changes: 10 additions & 2 deletions diffFiles/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
pytest
click
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile --output-file='.\requirements.txt' pyproject.toml
#
click==8.1.7
# via diff-files (pyproject.toml)
colorama==0.4.6
# via click
148 changes: 86 additions & 62 deletions diffFiles/src/diff_files.py
Original file line number Diff line number Diff line change
@@ -1,106 +1,130 @@
from typing import cast
from dataclasses import dataclass
from typing import cast, IO
import click


@dataclass
class Position:
x: int
y: int


@dataclass
class Node:
def __init__(self, next_pos: tuple[int, int] | None, length: int) -> None:
self.next_pos = next_pos
self.length = length
next_pos: Position
length: int


def build_subsequence_matrix(l1: list[str], l2: list[str]) -> list[list[Node]]:
# we use the matrix as a graph
# where each node points to one of their neighbors
nodes: list[list[Node]] = []

def get_node(pos: tuple[int, int]) -> Node | None:
return nodes[pos[0]][pos[1]] if pos[0] >= 0 and pos[1] >= 0 else None

def compute_node(i1: int, i2: int) -> Node:
next_pos: tuple[int, int] | None
length: int

if l1[i1] == l2[i2]:
next_pos = (i1 - 1, i2 - 1)
length = nodes[next_pos[0]][next_pos[1]].length + 1
else:
candidate1 = (i1, i2 - 1)
candidate2 = (i1 - 1, i2)
node1 = get_node(candidate1)
node2 = get_node(candidate2)

# find the candidate with the bigger length
if node1 is None and node2 is None:
next_pos = None
length = 0
elif node2 is not None and (node1 is None or node1.length < node2.length):
next_pos = candidate2
length = node2.length
else:
next_pos = candidate1
length = cast(Node, node1).length
@dataclass
class Matrix:
nodes: list[list[Node]]

return Node(next_pos, length)
def get_node(self, pos: Position) -> Node | None:
if pos.x < 0 or pos.y < 0:
return None
return self.nodes[pos.x][pos.y]

for i1 in range(len(l1)):
nodes.append([])
for i2 in range(len(l2)):
nodes[i1].append(compute_node(i1, i2))
@classmethod
def make(cls, l1: list[str], l2: list[str]) -> "Matrix":
def compute_node(i1: int, i2: int, line1, line2) -> Node:
next_pos: Position | None
length: int

if line1 == line2:
next_pos = Position(i1 - 1, i2 - 1)
length = result.nodes[next_pos.x][next_pos.y].length + 1
else:
candidate1 = Position(i1, i2 - 1)
candidate2 = Position(i1 - 1, i2)
node1 = result.get_node(candidate1)
node2 = result.get_node(candidate2)

# find the candidate with the bigger length
if node1 is None and node2 is None:
next_pos = None
length = 0
elif node2 is not None and (node1 is None or node1.length < node2.length):
next_pos = candidate2
length = node2.length
else:
next_pos = candidate1
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In order for next_pos.y = -1 we would need to enter this branch with node1 is None (as there is a guard inside get_node). However, if node1 is None, then either branch 1 or 2 will be entered depending on node1 is None.

So im not seeing what should be wrong here.

length = cast(Node, node1).length

return Node(next_pos, length)

result = Matrix([])
for i1, line1 in enumerate(l1):
result.nodes.append([])
for i2, line2 in enumerate(l2):
result.nodes[i1].append(compute_node(i1, i2, line1, line2))
return result


def build_subsequence_matrix(l1: list[str], l2: list[str]) -> Matrix:
# we use the matrix as a graph
# where each node points to one of their neighbors
nodes = Matrix.make(l1, l2)
return nodes


def compute_common_indices(nodes: list[list[Node]]) -> list[tuple[int, int]]:
pos: tuple[int, int] = (len(nodes) - 1, len(nodes[0]) - 1)
def compute_common_indices(nodes: Matrix) -> list[Position]:
common_indices: list[Position] = []
if not nodes.nodes:
return common_indices

pos = Position(len(nodes.nodes) - 1, len(nodes.nodes[0]) - 1)

def get_node(pos: tuple[int, int]) -> Node:
return nodes[pos[0]][pos[1]]
def get_node(p: Position) -> Node:
result = nodes.get_node(p)
if result is None:
return Node(Position(0, 0), 0)
Copy link
Author

@frip frip Apr 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This here is my "tricky" fix for p.y=-1. It occurs in the "standard-test-case"!

Copy link
Owner

@Orasund Orasund Apr 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit puzzled. This is only possible if len(nodes.nodes[0] - 1) = -1 or next_pos.y = -1.

The first case occurs if the second file is empty. Second case occurs if I forgot a guard inside compute_node. In particular, this would mean that the cast is not current.

return result

node: Node = get_node(pos)
commonIndices: list[tuple[int, int]] = []

while node.next_pos is not None:
nextNode = get_node(node.next_pos)
next_node = get_node(node.next_pos)

if nextNode.length < node.length:
commonIndices.append(pos)
if next_node.length < node.length:
common_indices.append(pos)

pos = node.next_pos
node = nextNode
node = next_node

commonIndices.reverse()
return commonIndices
common_indices.reverse()
return common_indices


def compute_differences(
lines1: list[str], lines2: list[str], commonsIndices: list[tuple[int, int]]
lines1: list[str], lines2: list[str], commons_indices: list[Position]
):
indices: list[tuple[int, int]] = (
[(-1, -1)] + commonsIndices + [(len(lines1), len(lines2))]
indices: list[Position] = (
[Position(-1, -1)] + commons_indices + [Position(len(lines1), len(lines2))]
)

return [
(lines1[iMin[0] + 1 : iMax[0]], lines2[iMin[1] + 1 : iMax[1]])
(lines1[iMin.x + 1: iMax.x], lines2[iMin.y + 1: iMax.y])
for (iMin, iMax) in zip(indices, indices[1:])
]


@click.command()
@click.argument("file1")
@click.argument("file2")
def diff_files(file1: str, file2: str):
@click.argument("file1", type=click.File(mode="r"))
@click.argument("file2", type=click.File(mode="r"))
def diff_files(file1: IO, file2: IO):
"""Return the difference between FILE1 and FILE2"""
lines1: list[str] = open(file1, "r").readlines()
lines2: list[str] = open(file2, "r").readlines()
lines1: list[str] = file1.readlines()
lines2: list[str] = file2.readlines()

# Algorithm taken from
# https://en.wikipedia.org/wiki/Longest_common_subsequence
#
# 1. Build a graph of subsequences (stores as a matrix)
matrix = build_subsequence_matrix(lines1, lines2)
# 2. Walk the graph and collect all the indecies where the strings match
commonIndices = compute_common_indices(matrix)
common_indices = compute_common_indices(matrix)
# 3. Compute the differences based on the collected indecies
differences = compute_differences(lines1, lines2, commonIndices)
differences = compute_differences(lines1, lines2, common_indices)

click.echo(differences)

Expand Down
2 changes: 1 addition & 1 deletion diffFiles/test.sh
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pytest tests/
pytest tests/
Empty file added diffFiles/tests/data/empty
Empty file.
40 changes: 36 additions & 4 deletions diffFiles/tests/test_diff_files.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,41 @@
import os

from click.testing import CliRunner
from src.diff_files import diff_files
from diff_files import diff_files, build_subsequence_matrix, compute_common_indices, compute_differences


def test_empty():
os.chdir(os.path.dirname(__file__))
runner = CliRunner()
result = runner.invoke(diff_files, ["data/empty", "data/empty"])
assert result.exit_code == 0
assert result.output == "[([], [])]\n"


def test():
def test_basic():
os.chdir(os.path.dirname(__file__))
runner = CliRunner()
result = runner.invoke(diff_files, ["tests/data/file1", "tests/data/file2"])
expected = "[(['Hello \\n'], []), (['beautiful\\n'], ['little\\n', 'boy\\n', 'lives\\n', 'in\\n', 'his\\n', 'own\\n']), (['\\n'], ['!'])]\n"
result = runner.invoke(diff_files, ["data/file1", "data/file2"])
expected = ("["
"(['Hello \\n'], []), "
"(['beautiful\\n'], "
"['little\\n', 'boy\\n', 'lives\\n', 'in\\n', 'his\\n', 'own\\n']), "
"(['\\n'], ['!'])"
"]\n")
assert result.output == expected


def test_diff():
os.chdir(os.path.dirname(__file__))
with open("data/file1") as f1, open("data/file2") as f2:
lines1 = f1.readlines()
lines2 = f2.readlines()
matrix = build_subsequence_matrix(lines1, lines2)
# 2. Walk the graph and collect all the indecies where the strings match
common_indices = compute_common_indices(matrix)
# 3. Compute the differences based on the collected indecies
differences = compute_differences(lines1, lines2, common_indices)
assert differences == [
(['Hello \n'], []),
(['beautiful\n'], ['little\n', 'boy\n', 'lives\n', 'in\n', 'his\n', 'own\n']),
(['\n'], ['!'])]