Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CsvMergeExclusive class delete rows when all column values match. #471

Merged
merged 1 commit into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 33 additions & 12 deletions cliboa/scenario/transform/csv.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all_columnの分岐によりややコードの見通しが悪化しているので、リファクタリングを検討してください。
このブランチについてはマージさせていただきます。

Original file line number Diff line number Diff line change
Expand Up @@ -360,9 +360,11 @@ class CsvMergeExclusive(FileBaseTransform):

def __init__(self):
    """Create the step with every comparison option left unset."""
    super().__init__()
    # Options supplied through the setter methods of this class.
    self._src_column = None
    self._target_column = None
    self._target_compare_path = None
    self._all_column = False
    # Values loaded from the compare file; populated by execute().
    self.df_target_list = None

def src_column(self, src_column):
    """Store the name of the source-file column whose values are compared.

    Args:
        src_column: Column looked up in each source file when rows are
            filtered against the values read from the compare file.
    """
    self._src_column = src_column
Expand All @@ -373,15 +375,16 @@ def target_compare_path(self, target_compare_path):
def target_column(self, target_column):
    """Store the name of the compare-file column to match against.

    Args:
        target_column: Column read from the compare file; its values
            form the list that source rows are excluded by.
    """
    self._target_column = target_column

def all_column(self, all_column):
    """Store the flag that switches the step to whole-row comparison.

    Args:
        all_column: When truthy, rows are excluded only if every column
            value matches a row of the compare file. execute() raises
            KeyError if this is combined with src_column or target_column.
    """
    self._all_column = all_column

def execute(self, *args):
valid = EssentialParameters(
self.__class__.__name__,
[
self._src_dir,
self._src_pattern,
self._src_column,
self._target_compare_path,
self._target_column,
],
)
valid()
Expand All @@ -395,32 +398,47 @@ def execute(self, *args):
)
self.check_file_existence(target)

if self._all_column and (self._src_column or self._target_column):
raise KeyError("all_column cannot coexist with src_column or target_column.")

header = pandas.read_csv(self._target_compare_path, nrows=0)
if self._target_column not in header:
if self._all_column is False and self._target_column not in header:
raise KeyError(
"Target Compare file does not exist target column [%s]." % self._target_column
)

df_target = pandas.read_csv(self._target_compare_path, usecols=[self._target_column])
self.df_target_list = df_target[self._target_column].values.tolist()
if self._all_column:
df_target = pandas.read_csv(self._target_compare_path, dtype=str)
self.df_target_list = df_target.values.tolist()
else:
df_target = pandas.read_csv(
self._target_compare_path, usecols=[self._target_column], dtype=str
)
self.df_target_list = df_target[self._target_column].values.tolist()

super().io_files(files, func=self.convert)

def convert(self, fi, fo):
    """Validate the source header, then filter ``fi`` into ``fo`` in chunks.

    Args:
        fi: Source CSV handed to ``pandas.read_csv``.
        fo: Output destination handed on to the chunked writer.

    Raises:
        KeyError: If single-column comparison is active and the source
            file has no column named ``src_column``.
    """
    # Whole-row comparison does not look at one particular column, so the
    # header check only applies to single-column mode. Truthiness is used
    # (rather than ``is False``) so an unset/None flag is validated the same
    # way the filtering code later interprets it.
    if not self._all_column:
        header = pandas.read_csv(fi, dtype=str, encoding=self._encoding, nrows=0)
        try:
            header[self._src_column]
        except KeyError:
            # Report the source column that was actually looked up; the
            # message previously interpolated the target column by mistake.
            raise KeyError("Src file does not exist target column [%s]." % self._src_column)

    chunk_size_handling(self._read_csv_func, fi, fo)

def _read_csv_func(self, chunksize, fi, fo):
# Used in chunk_size_handling
first_write = True
tfr = pandas.read_csv(fi, chunksize=chunksize, na_filter=False)
tfr = pandas.read_csv(fi, dtype=str, chunksize=chunksize, na_filter=False)
if self._all_column:
df_target_set = {hash(tuple(row)) for row in self.df_target_list}
for df in tfr:
df = df[~df[self._src_column].isin(self.df_target_list)]
if self._all_column:
df = df.drop(self._all_elements_match(df.values.tolist(), df_target_set))
else:
df = df[~df[self._src_column].isin(self.df_target_list)]
df.to_csv(
fo,
encoding=self._encoding,
Expand All @@ -430,6 +448,9 @@ def _read_csv_func(self, chunksize, fi, fo):
)
first_write = False

def _all_elements_match(self, df_src_list, df_target_set):
return [i for i, row in enumerate(df_src_list) if hash(tuple(row)) in df_target_set]


class ColumnLengthAdjust(FileBaseTransform):
"""
Expand Down
99 changes: 99 additions & 0 deletions cliboa/test/scenario/transform/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,38 @@ def test_execute_ok_with_non_target(self):
assert r["data"] == test_src_csv_data[2][1]
assert rows == 2

def test_execute_ok_with_all_column(self):
    """Only the source row matching a compare row in every column is removed."""
    # The first data row is identical in both files; the other two differ
    # from the compare file in exactly one column and must survive.
    src_rows = [["key", "data"], ["1", "spam1"], ["2", "spam2"], ["3", "spam3"]]
    compare_rows = [["key", "data"], ["1", "spam1"], ["2", "second"], ["c", "spam3"]]
    self._create_csv(src_rows, fname="test.csv")
    self._create_csv(compare_rows, fname="alter.csv")

    # Configure the step for whole-row comparison.
    step = CsvMergeExclusive()
    properties = (
        ("logger", LisboaLog.get_logger(__name__)),
        ("src_dir", self._data_dir),
        ("src_pattern", "test.csv"),
        ("all_column", True),
        ("target_compare_path", os.path.join(self._data_dir, "alter.csv")),
    )
    for name, value in properties:
        Helper.set_property(step, name, value)

    step.execute()

    with open(os.path.join(self._data_dir, "test.csv"), "r") as result:
        remaining = list(csv.DictReader(result))

    assert len(remaining) == 2
    for actual, expected in zip(remaining, src_rows[2:]):
        assert actual["key"] == expected[0]
        assert actual["data"] == expected[1]

def test_execute_ng_with_src_column_not_exist(self):
# create test csv
test_src_csv_data = [["key", "data"], ["1", "spam1"], ["2", "spam2"]]
Expand Down Expand Up @@ -1127,6 +1159,73 @@ def test_execute_ng_with_target_column_not_exist(self):
instance.execute()
assert "'Target Compare file does not exist target column [dummy].'" == str(e.value)

def test_execute_ng_with_all_column_and_src_column(self):
    """Setting all_column together with src_column is rejected by execute()."""
    src_rows = [["key", "data"], ["1", "spam1"], ["2", "spam2"]]
    compare_rows = [["id", "name"], ["3", "third"], ["4", "fourth"]]
    self._create_csv(src_rows, fname="test.csv")
    self._create_csv(compare_rows, fname="alter.csv")

    # all_column plus src_column is the conflicting combination under test.
    step = CsvMergeExclusive()
    properties = (
        ("logger", LisboaLog.get_logger(__name__)),
        ("src_dir", self._data_dir),
        ("src_pattern", "test.csv"),
        ("all_column", True),
        ("src_column", "key"),
        ("target_compare_path", os.path.join(self._data_dir, "alter.csv")),
    )
    for name, value in properties:
        Helper.set_property(step, name, value)

    with pytest.raises(KeyError) as e:
        step.execute()
    assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value)

def test_execute_ng_with_all_column_and_target_column(self):
    """Setting all_column together with target_column is rejected by execute()."""
    src_rows = [["key", "data"], ["1", "spam1"], ["2", "spam2"]]
    compare_rows = [["id", "name"], ["3", "third"], ["4", "fourth"]]
    self._create_csv(src_rows, fname="test.csv")
    self._create_csv(compare_rows, fname="alter.csv")

    # all_column plus target_column is the conflicting combination under test.
    step = CsvMergeExclusive()
    properties = (
        ("logger", LisboaLog.get_logger(__name__)),
        ("src_dir", self._data_dir),
        ("src_pattern", "test.csv"),
        ("all_column", True),
        ("target_column", "key"),
        ("target_compare_path", os.path.join(self._data_dir, "alter.csv")),
    )
    for name, value in properties:
        Helper.set_property(step, name, value)

    with pytest.raises(KeyError) as e:
        step.execute()
    assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value)

def test_execute_ng_with_all_column_and_src_column_and_target_column(self):
    """Setting all_column with both column options is rejected by execute()."""
    src_rows = [["key", "data"], ["1", "spam1"], ["2", "spam2"]]
    compare_rows = [["id", "name"], ["3", "third"], ["4", "fourth"]]
    self._create_csv(src_rows, fname="test.csv")
    self._create_csv(compare_rows, fname="alter.csv")

    # all_column plus both column options is the conflicting combination.
    step = CsvMergeExclusive()
    properties = (
        ("logger", LisboaLog.get_logger(__name__)),
        ("src_dir", self._data_dir),
        ("src_pattern", "test.csv"),
        ("all_column", True),
        ("src_column", "key"),
        ("target_column", "key"),
        ("target_compare_path", os.path.join(self._data_dir, "alter.csv")),
    )
    for name, value in properties:
        Helper.set_property(step, name, value)

    with pytest.raises(KeyError) as e:
        step.execute()
    assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value)


class TestColumnLengthAdjust(TestCsvTransform):
def test_ok(self):
Expand Down
53 changes: 42 additions & 11 deletions docs/modules/csv_merge_exclusive.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ Compare specific columns each file.
If matched, exclude rows.

# Parameters
|Parameters|Explanation|Required|Default|Remarks|
|----------|-----------|--------|-------|-------|
|src_dir|Path of the directory which target files are placed.|Yes|None||
|src_pattern|Regex which is to find target files.|Yes|None||
|dest_dir|Path of the directory which is for output files.|No|None|If this parameter is not set, the file is created in the same directory as the processing file. If a non-existent directory path is specified, the directory is automatically created.|
|src_column|compare target column for "src_dir" and "src_path".|Yes|None|Specify only one column.|
|target_compare_path|Path of the file which target for comparison.|Yes|None||
|target_column|compare target column for "target_compare_path".|Yes|None|Specify only one column.|
|encoding|Character encoding when read and write|No|utf-8||

# Example
| Parameters | Explanation | Required | Default | Remarks |
|---------------------|------------------------------------------------------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| src_dir | Path of the directory where the target files are placed. | Yes | None | |
| src_pattern | Regex used to find the target files. | Yes | None | |
| dest_dir | Path of the directory for output files. | No | None | If this parameter is not set, the file is created in the same directory as the processing file. If a non-existent directory path is specified, the directory is automatically created. |
| src_column | Column to compare in the files matched by "src_dir" and "src_pattern". | Yes, unless all_column is True | None | Specify only one column. Must not be set when all_column is True. |
| target_compare_path | Path of the file to compare against. | Yes | None | |
| target_column | Column to compare in the "target_compare_path" file. | Yes, unless all_column is True | None | Specify only one column. Must not be set when all_column is True. |
| all_column | Exclude a row only when the values in all of its columns match a row of the compare file. | No | False | Cannot be combined with src_column or target_column; setting either one together with all_column raises an error. |
| encoding | Character encoding used when reading and writing. | No | utf-8 | |

# Example 1
```
scenario:
- step:
Expand Down Expand Up @@ -42,3 +43,33 @@ id, name, data
2, two, second
3, three, third
```

# Example 2
```
scenario:
- step:
class: CsvMergeExclusive
arguments:
src_dir: /in
src_pattern: test\.csv
target_compare_path: /in/compare.csv
all_column: True
dest_dir: /out

Input: /in/test.csv
id, name, data
1, one, first
2, two, second
3, three, third

Input Compare Target: /in/compare.csv
id, name, data
1, one, first
2, two, secondary
3, three, third
4, four, fourth

Output: /out/test.csv
id, name, data
2, two, second
```
Loading