From 39cee990517840737ffe46a36e571e9cab2d3ae7 Mon Sep 17 00:00:00 2001 From: Nissy0409 Date: Tue, 22 Oct 2024 21:10:12 +0900 Subject: [PATCH] CsvMergeExclusive class delete rows when all column values match. --- cliboa/scenario/transform/csv.py | 45 +++++++--- cliboa/test/scenario/transform/test_csv.py | 99 ++++++++++++++++++++++ docs/modules/csv_merge_exclusive.md | 53 +++++++++--- 3 files changed, 174 insertions(+), 23 deletions(-) diff --git a/cliboa/scenario/transform/csv.py b/cliboa/scenario/transform/csv.py index 26bd4559..901f4731 100644 --- a/cliboa/scenario/transform/csv.py +++ b/cliboa/scenario/transform/csv.py @@ -360,9 +360,11 @@ class CsvMergeExclusive(FileBaseTransform): def __init__(self): super().__init__() + self.df_target_list = None self._src_column = None self._target_compare_path = None self._target_column = None + self._all_column = False def src_column(self, src_column): self._src_column = src_column @@ -373,15 +375,16 @@ def target_compare_path(self, target_compare_path): def target_column(self, target_column): self._target_column = target_column + def all_column(self, all_column): + self._all_column = all_column + def execute(self, *args): valid = EssentialParameters( self.__class__.__name__, [ self._src_dir, self._src_pattern, - self._src_column, self._target_compare_path, - self._target_column, ], ) valid() @@ -395,32 +398,47 @@ def execute(self, *args): ) self.check_file_existence(target) + if self._all_column and (self._src_column or self._target_column): + raise KeyError("all_column cannot coexist with src_column or target_column.") + header = pandas.read_csv(self._target_compare_path, nrows=0) - if self._target_column not in header: + if self._all_column is False and self._target_column not in header: raise KeyError( "Target Compare file does not exist target column [%s]." 
% self._target_column ) - df_target = pandas.read_csv(self._target_compare_path, usecols=[self._target_column]) - self.df_target_list = df_target[self._target_column].values.tolist() + if self._all_column: + df_target = pandas.read_csv(self._target_compare_path, dtype=str) + self.df_target_list = df_target.values.tolist() + else: + df_target = pandas.read_csv( + self._target_compare_path, usecols=[self._target_column], dtype=str + ) + self.df_target_list = df_target[self._target_column].values.tolist() super().io_files(files, func=self.convert) def convert(self, fi, fo): - header = pandas.read_csv(fi, dtype=str, encoding=self._encoding, nrows=0) - try: - header[self._src_column].values.tolist() - except KeyError: - raise KeyError("Src file does not exist target column [%s]." % self._target_column) + if self._all_column is False: + header = pandas.read_csv(fi, dtype=str, encoding=self._encoding, nrows=0) + try: + header[self._src_column].values.tolist() + except KeyError: + raise KeyError("Src file does not exist target column [%s]." 
% self._target_column) chunk_size_handling(self._read_csv_func, fi, fo) def _read_csv_func(self, chunksize, fi, fo): # Used in chunk_size_handling first_write = True - tfr = pandas.read_csv(fi, chunksize=chunksize, na_filter=False) + tfr = pandas.read_csv(fi, dtype=str, chunksize=chunksize, na_filter=False) + if self._all_column: + df_target_set = {hash(tuple(row)) for row in self.df_target_list} for df in tfr: - df = df[~df[self._src_column].isin(self.df_target_list)] + if self._all_column: + df = df.drop(self._all_elements_match(df.values.tolist(), df_target_set)) + else: + df = df[~df[self._src_column].isin(self.df_target_list)] df.to_csv( fo, encoding=self._encoding, @@ -430,6 +448,9 @@ def _read_csv_func(self, chunksize, fi, fo): ) first_write = False + def _all_elements_match(self, df_src_list, df_target_set): + return [i for i, row in enumerate(df_src_list) if hash(tuple(row)) in df_target_set] + class ColumnLengthAdjust(FileBaseTransform): """ diff --git a/cliboa/test/scenario/transform/test_csv.py b/cliboa/test/scenario/transform/test_csv.py index dec6b03d..17990c2d 100644 --- a/cliboa/test/scenario/transform/test_csv.py +++ b/cliboa/test/scenario/transform/test_csv.py @@ -1083,6 +1083,38 @@ def test_execute_ok_with_non_target(self): assert r["data"] == test_src_csv_data[2][1] assert rows == 2 + def test_execute_ok_with_all_column(self): + # create test csv + test_src_csv_data = [["key", "data"], ["1", "spam1"], ["2", "spam2"], ["3", "spam3"]] + self._create_csv(test_src_csv_data, fname="test.csv") + test_target_csv_data = [["key", "data"], ["1", "spam1"], ["2", "second"], ["c", "spam3"]] + self._create_csv(test_target_csv_data, fname="alter.csv") + + # set the essential attributes + instance = CsvMergeExclusive() + Helper.set_property(instance, "logger", LisboaLog.get_logger(__name__)) + Helper.set_property(instance, "src_dir", self._data_dir) + Helper.set_property(instance, "src_pattern", "test.csv") + Helper.set_property(instance, "all_column", True) + 
Helper.set_property( + instance, "target_compare_path", os.path.join(self._data_dir, "alter.csv") + ) + + instance.execute() + output_file = os.path.join(self._data_dir, "test.csv") + rows = 0 + with open(output_file, "r") as o: + reader = csv.DictReader(o) + for r in reader: + rows += 1 + if rows == 1: + assert r["key"] == test_src_csv_data[2][0] + assert r["data"] == test_src_csv_data[2][1] + if rows == 2: + assert r["key"] == test_src_csv_data[3][0] + assert r["data"] == test_src_csv_data[3][1] + assert rows == 2 + def test_execute_ng_with_src_column_not_exist(self): # create test csv test_src_csv_data = [["key", "data"], ["1", "spam1"], ["2", "spam2"]] @@ -1127,6 +1159,73 @@ def test_execute_ng_with_target_column_not_exist(self): instance.execute() assert "'Target Compare file does not exist target column [dummy].'" == str(e.value) + def test_execute_ng_with_all_column_and_src_column(self): + # create test csv + test_src_csv_data = [["key", "data"], ["1", "spam1"], ["2", "spam2"]] + self._create_csv(test_src_csv_data, fname="test.csv") + test_target_csv_data = [["id", "name"], ["3", "third"], ["4", "fourth"]] + self._create_csv(test_target_csv_data, fname="alter.csv") + + # set the essential attributes + instance = CsvMergeExclusive() + Helper.set_property(instance, "logger", LisboaLog.get_logger(__name__)) + Helper.set_property(instance, "src_dir", self._data_dir) + Helper.set_property(instance, "src_pattern", "test.csv") + Helper.set_property(instance, "all_column", True) + Helper.set_property(instance, "src_column", "key") + Helper.set_property( + instance, "target_compare_path", os.path.join(self._data_dir, "alter.csv") + ) + + with pytest.raises(KeyError) as e: + instance.execute() + assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value) + + def test_execute_ng_with_all_column_and_target_column(self): + # create test csv + test_src_csv_data = [["key", "data"], ["1", "spam1"], ["2", "spam2"]] + 
self._create_csv(test_src_csv_data, fname="test.csv") + test_target_csv_data = [["id", "name"], ["3", "third"], ["4", "fourth"]] + self._create_csv(test_target_csv_data, fname="alter.csv") + + # set the essential attributes + instance = CsvMergeExclusive() + Helper.set_property(instance, "logger", LisboaLog.get_logger(__name__)) + Helper.set_property(instance, "src_dir", self._data_dir) + Helper.set_property(instance, "src_pattern", "test.csv") + Helper.set_property(instance, "all_column", True) + Helper.set_property(instance, "target_column", "key") + Helper.set_property( + instance, "target_compare_path", os.path.join(self._data_dir, "alter.csv") + ) + + with pytest.raises(KeyError) as e: + instance.execute() + assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value) + + def test_execute_ng_with_all_column_and_src_column_and_target_column(self): + # create test csv + test_src_csv_data = [["key", "data"], ["1", "spam1"], ["2", "spam2"]] + self._create_csv(test_src_csv_data, fname="test.csv") + test_target_csv_data = [["id", "name"], ["3", "third"], ["4", "fourth"]] + self._create_csv(test_target_csv_data, fname="alter.csv") + + # set the essential attributes + instance = CsvMergeExclusive() + Helper.set_property(instance, "logger", LisboaLog.get_logger(__name__)) + Helper.set_property(instance, "src_dir", self._data_dir) + Helper.set_property(instance, "src_pattern", "test.csv") + Helper.set_property(instance, "all_column", True) + Helper.set_property(instance, "src_column", "key") + Helper.set_property(instance, "target_column", "key") + Helper.set_property( + instance, "target_compare_path", os.path.join(self._data_dir, "alter.csv") + ) + + with pytest.raises(KeyError) as e: + instance.execute() + assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value) + class TestColumnLengthAdjust(TestCsvTransform): def test_ok(self): diff --git a/docs/modules/csv_merge_exclusive.md 
b/docs/modules/csv_merge_exclusive.md index 900b21ac..99fa3ec9 100644 --- a/docs/modules/csv_merge_exclusive.md +++ b/docs/modules/csv_merge_exclusive.md @@ -3,17 +3,18 @@ Compare specific columns each file. If matched, exclude rows. # Parameters -|Parameters|Explanation|Required|Default|Remarks| -|----------|-----------|--------|-------|-------| -|src_dir|Path of the directory which target files are placed.|Yes|None|| -|src_pattern|Regex which is to find target files.|Yes|None|| -|dest_dir|Path of the directory which is for output files.|No|None|If this parameter is not set, the file is created in the same directory as the processing file. If a non-existent directory path is specified, the directory is automatically created.| -|src_column|compare target column for "src_dir" and "src_path".|Yes|None|Specify only one column.| -|target_compare_path|Path of the file which target for comparison.|Yes|None|| -|target_column|compare target column for "target_compare_path".|Yes|None|Specify only one column.| -|encoding|Character encoding when read and write|No|utf-8|| - -# Example +| Parameters | Explanation | Required | Default | Remarks | +|---------------------|------------------------------------------------------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| src_dir | Path of the directory which target files are placed. | Yes | None | | +| src_pattern | Regex which is to find target files. | Yes | None | | +| dest_dir | Path of the directory which is for output files. | No | None | If this parameter is not set, the file is created in the same directory as the processing file. If a non-existent directory path is specified, the directory is automatically created. | +| src_column | Column to compare in the files matched by "src_dir" and "src_pattern". | Yes (unless all_column is True) | None | Specify only one column. 
| +| target_compare_path | Path of the file to compare against. | Yes | None | | +| target_column | Column to compare in the "target_compare_path" file. | Yes (unless all_column is True) | None | Specify only one column. | +| all_column | Exclude rows whose values match a row of the compare file in every column. | No | False | When all_column is True, neither src_column nor target_column may be specified. | +| encoding | Character encoding used when reading and writing. | No | utf-8 | | + +# Example 1 ``` scenario: - step: @@ -42,3 +43,33 @@ id, name, data 2, two, second 3, three, third ``` + +# Example 2 +``` +scenario: +- step: + class: CsvMergeExclusive + arguments: + src_dir: /in + src_pattern: test\.csv + target_compare_path: /in/compare.csv + all_column: True + dest_dir: /out + +Input: /in/test.csv +id, name, data +1, one, first +2, two, second +3, three, third + +Input Compare Target: /in/compare.csv +id, name, data +1, one, first +2, two, secondary +3, three, third +4, four, fourth + +Output: /out/test.csv +id, name, data +2, two, second +``` \ No newline at end of file