Skip to content

Commit

Permalink
CsvMergeExclusive class delete rows when all column values match.
Browse files Browse the repository at this point in the history
  • Loading branch information
nissy0409240 committed Oct 22, 2024
1 parent 586d61b commit 91b2cb7
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 22 deletions.
46 changes: 35 additions & 11 deletions cliboa/scenario/transform/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,9 +360,11 @@ class CsvMergeExclusive(FileBaseTransform):

def __init__(self):
    """Create the step with every comparison option unset."""
    super().__init__()
    # Single-column comparison settings (consulted while all_column is off).
    self._src_column = None
    self._target_column = None
    # File to compare against, and the values later loaded from it by execute().
    self._target_compare_path = None
    self.df_target_list = None
    # Whole-row comparison switch; off unless explicitly enabled.
    self._all_column = False

def src_column(self, src_column):
    """Store the name of the source-file column used for the comparison."""
    self._src_column = src_column
Expand All @@ -373,15 +375,16 @@ def target_compare_path(self, target_compare_path):
def target_column(self, target_column):
    """Store the name of the compare-file column used for the comparison."""
    self._target_column = target_column

def all_column(self, all_column):
    """Store the flag that switches the step to whole-row comparison."""
    self._all_column = all_column

def execute(self, *args):
valid = EssentialParameters(
self.__class__.__name__,
[
self._src_dir,
self._src_pattern,
self._src_column,
self._target_compare_path,
self._target_column,
],
)
valid()
Expand All @@ -395,23 +398,31 @@ def execute(self, *args):
)
self.check_file_existence(target)

if self._all_column and (self._src_column or self._target_column):
raise KeyError("all_column cannot coexist with src_column or target_column.")

header = pandas.read_csv(self._target_compare_path, nrows=0)
if self._target_column not in header:
if self._all_column is False and self._target_column not in header:
raise KeyError(
"Target Compare file does not exist target column [%s]." % self._target_column
)

df_target = pandas.read_csv(self._target_compare_path, usecols=[self._target_column])
self.df_target_list = df_target[self._target_column].values.tolist()
if self._all_column:
df_target = pandas.read_csv(self._target_compare_path)
self.df_target_list = df_target.values.tolist()
else:
df_target = pandas.read_csv(self._target_compare_path, usecols=[self._target_column])
self.df_target_list = df_target[self._target_column].values.tolist()

super().io_files(files, func=self.convert)

def convert(self, fi, fo):
header = pandas.read_csv(fi, dtype=str, encoding=self._encoding, nrows=0)
try:
header[self._src_column].values.tolist()
except KeyError:
raise KeyError("Src file does not exist target column [%s]." % self._target_column)
if self._all_column is False:
header = pandas.read_csv(fi, dtype=str, encoding=self._encoding, nrows=0)
try:
header[self._src_column].values.tolist()
except KeyError:
raise KeyError("Src file does not exist target column [%s]." % self._target_column)

chunk_size_handling(self._read_csv_func, fi, fo)

Expand All @@ -420,7 +431,10 @@ def _read_csv_func(self, chunksize, fi, fo):
first_write = True
tfr = pandas.read_csv(fi, chunksize=chunksize, na_filter=False)
for df in tfr:
df = df[~df[self._src_column].isin(self.df_target_list)]
if self._all_column:
df = df.drop(self._all_elements_match(df.values.tolist()))
else:
df = df[~df[self._src_column].isin(self.df_target_list)]
df.to_csv(
fo,
encoding=self._encoding,
Expand All @@ -430,6 +444,16 @@ def _read_csv_func(self, chunksize, fi, fo):
)
first_write = False

def _all_elements_match(self, df_src_list):
    """
    Find the source rows that are identical to some row of the compare file.

    Values are compared as strings, pairwise and in column order, so 1 and "1"
    count as equal. The pairing stops at the shorter of the two rows.

    Args:
        df_src_list (list): Source rows, each one a list of column values.

    Returns:
        list: Zero-based positions within df_src_list of the matching rows, in
            ascending order, each position listed at most once.
    """
    # NOTE(review): the result holds positions in df_src_list, not DataFrame
    # index labels - confirm callers that pass it to DataFrame.drop() map the
    # positions to labels when the frame does not start at label 0.

    # Stringify the compare rows once instead of once per source row.
    target_rows = [[str(value) for value in row] for row in self.df_target_list]

    matched = []
    for position, src_row in enumerate(df_src_list):
        src_values = [str(value) for value in src_row]
        # any() stops at the first identical compare row, so a source row that
        # equals several compare rows is still reported only once.
        if any(
            all(src == tgt for src, tgt in zip(src_values, target_row))
            for target_row in target_rows
        ):
            matched.append(position)
    return matched


class ColumnLengthAdjust(FileBaseTransform):
"""
Expand Down
99 changes: 99 additions & 0 deletions cliboa/test/scenario/transform/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,38 @@ def test_execute_ok_with_non_target(self):
assert r["data"] == test_src_csv_data[2][1]
assert rows == 2

def test_execute_ok_with_all_column(self):
    """Only rows whose every column value equals a compare-file row are removed."""
    # Just the first data row is identical, column for column, in both files.
    src_rows = [["key", "data"], ["1", "spam1"], ["2", "spam2"], ["3", "spam3"]]
    compare_rows = [["id", "name"], ["1", "spam1"], ["2", "second"], ["c", "spam3"]]
    self._create_csv(src_rows, fname="test.csv")
    self._create_csv(compare_rows, fname="alter.csv")
    compare_path = os.path.join(self._data_dir, "alter.csv")

    # set the essential attributes
    step = CsvMergeExclusive()
    Helper.set_property(step, "logger", LisboaLog.get_logger(__name__))
    Helper.set_property(step, "src_dir", self._data_dir)
    Helper.set_property(step, "src_pattern", "test.csv")
    Helper.set_property(step, "all_column", True)
    Helper.set_property(step, "target_compare_path", compare_path)

    step.execute()

    # The result is read back from the source file's own path.
    with open(os.path.join(self._data_dir, "test.csv"), "r") as result:
        remaining = list(csv.DictReader(result))

    # The second and third source rows must survive, in their original order.
    assert len(remaining) == 2
    for actual, expected in zip(remaining, src_rows[2:]):
        assert actual["key"] == expected[0]
        assert actual["data"] == expected[1]

def test_execute_ng_with_src_column_not_exist(self):
# create test csv
test_src_csv_data = [["key", "data"], ["1", "spam1"], ["2", "spam2"]]
Expand Down Expand Up @@ -1127,6 +1159,73 @@ def test_execute_ng_with_target_column_not_exist(self):
instance.execute()
assert "'Target Compare file does not exist target column [dummy].'" == str(e.value)

def test_execute_ng_with_all_column_and_src_column(self):
    """Enabling all_column together with src_column raises a KeyError."""
    # create test csv
    self._create_csv([["key", "data"], ["1", "spam1"], ["2", "spam2"]], fname="test.csv")
    self._create_csv([["id", "name"], ["3", "third"], ["4", "fourth"]], fname="alter.csv")
    compare_path = os.path.join(self._data_dir, "alter.csv")

    # set the essential attributes
    step = CsvMergeExclusive()
    properties = (
        ("logger", LisboaLog.get_logger(__name__)),
        ("src_dir", self._data_dir),
        ("src_pattern", "test.csv"),
        ("all_column", True),
        ("src_column", "key"),
        ("target_compare_path", compare_path),
    )
    for name, value in properties:
        Helper.set_property(step, name, value)

    with pytest.raises(KeyError) as e:
        step.execute()
    assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value)

def test_execute_ng_with_all_column_and_target_column(self):
    """Enabling all_column together with target_column raises a KeyError."""
    # create test csv
    self._create_csv([["key", "data"], ["1", "spam1"], ["2", "spam2"]], fname="test.csv")
    self._create_csv([["id", "name"], ["3", "third"], ["4", "fourth"]], fname="alter.csv")
    compare_path = os.path.join(self._data_dir, "alter.csv")

    # set the essential attributes
    step = CsvMergeExclusive()
    properties = (
        ("logger", LisboaLog.get_logger(__name__)),
        ("src_dir", self._data_dir),
        ("src_pattern", "test.csv"),
        ("all_column", True),
        ("target_column", "key"),
        ("target_compare_path", compare_path),
    )
    for name, value in properties:
        Helper.set_property(step, name, value)

    with pytest.raises(KeyError) as e:
        step.execute()
    assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value)

def test_execute_ng_with_all_column_and_src_column_and_target_column(self):
    """Enabling all_column with both src_column and target_column raises a KeyError."""
    # create test csv
    self._create_csv([["key", "data"], ["1", "spam1"], ["2", "spam2"]], fname="test.csv")
    self._create_csv([["id", "name"], ["3", "third"], ["4", "fourth"]], fname="alter.csv")
    compare_path = os.path.join(self._data_dir, "alter.csv")

    # set the essential attributes
    step = CsvMergeExclusive()
    properties = (
        ("logger", LisboaLog.get_logger(__name__)),
        ("src_dir", self._data_dir),
        ("src_pattern", "test.csv"),
        ("all_column", True),
        ("src_column", "key"),
        ("target_column", "key"),
        ("target_compare_path", compare_path),
    )
    for name, value in properties:
        Helper.set_property(step, name, value)

    with pytest.raises(KeyError) as e:
        step.execute()
    assert "'all_column cannot coexist with src_column or target_column.'" == str(e.value)


class TestColumnLengthAdjust(TestCsvTransform):
def test_ok(self):
Expand Down
53 changes: 42 additions & 11 deletions docs/modules/csv_merge_exclusive.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ Compare specific columns each file.
If matched, exclude rows.

# Parameters
|Parameters|Explanation|Required|Default|Remarks|
|----------|-----------|--------|-------|-------|
|src_dir|Path of the directory which target files are placed.|Yes|None||
|src_pattern|Regex which is to find target files.|Yes|None||
|dest_dir|Path of the directory which is for output files.|No|None|If this parameter is not set, the file is created in the same directory as the processing file. If a non-existent directory path is specified, the directory is automatically created.|
|src_column|compare target column for "src_dir" and "src_path".|Yes|None|Specify only one column.|
|target_compare_path|Path of the file which target for comparison.|Yes|None||
|target_column|compare target column for "target_compare_path".|Yes|None|Specify only one column.|
|encoding|Character encoding when read and write|No|utf-8||

# Example
| Parameters | Explanation | Required | Default | Remarks |
|---------------------|------------------------------------------------------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| src_dir | Path of the directory in which target files are placed. | Yes | None | |
| src_pattern | Regex which is to find target files. | Yes | None | |
| dest_dir | Path of the directory which is for output files. | No | None | If this parameter is not set, the file is created in the same directory as the processing file. If a non-existent directory path is specified, the directory is automatically created. |
| src_column | Column to compare in the files matched by "src_dir" and "src_pattern". | Yes, unless all_column is True | None | Specify only one column. |
| target_compare_path | Path of the file to compare against. | Yes | None | |
| target_column | Column to compare in the "target_compare_path" file. | Yes, unless all_column is True | None | Specify only one column. |
| all_column | Delete rows when all column values match. | No | False | When all_column is True, neither src_column nor target_column may be specified. |
| encoding | Character encoding used when reading and writing. | No | utf-8 | |

# Example 1
```
scenario:
- step:
Expand Down Expand Up @@ -42,3 +43,33 @@ id, name, data
2, two, second
3, three, third
```

# Example 2
```
scenario:
- step:
class: CsvMergeExclusive
arguments:
src_dir: /in
src_pattern: test\.csv
target_compare_path: /in/compare.csv
all_column: True
dest_dir: /out
Input: /in/test.csv
id, name, data
1, one, first
2, two, second
3, three, third
Input Compare Target: /in/compare.csv
id, name, data
1, one, first
2, two, secondary
3, three, third
4, four, fourth
Output: /out/test.csv
id, name, data
2, two, second
```

0 comments on commit 91b2cb7

Please sign in to comment.