diff --git a/cliboa/scenario/__init__.py b/cliboa/scenario/__init__.py
index 12a5c360..eed11335 100644
--- a/cliboa/scenario/__init__.py
+++ b/cliboa/scenario/__init__.py
@@ -51,6 +51,7 @@
     CsvRowDelete,
     CsvSort,
     CsvToJsonl,
+    CsvTypeConvert,
     CsvValueExtract,
 )
 from .transform.file import (
diff --git a/cliboa/scenario/transform/csv.py b/cliboa/scenario/transform/csv.py
index 34d7c4a2..26bd4559 100644
--- a/cliboa/scenario/transform/csv.py
+++ b/cliboa/scenario/transform/csv.py
@@ -285,6 +285,91 @@ def _read_csv_func(self, chunksize, fi, fo):
             first_write = False
 
 
+class CsvTypeConvert(FileBaseTransform):
+    """
+    Convert the type of specific columns in a csv file.
+
+    Each target column is cast with pandas.Series.astype, so the type must
+    be a value which astype accepts as dtype.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._column = None
+        self._type = None
+
+    def column(self, column):
+        """Set the conversion target: a column name or a list of column names."""
+        self._column = column
+
+    def type(self, type):
+        """Set the dtype which the target columns are converted to."""
+        self._type = type
+
+    def execute(self, *args):
+        """Check the essential parameters and convert all the target files."""
+        valid = EssentialParameters(
+            self.__class__.__name__,
+            [self._src_dir, self._src_pattern, self._dest_dir, self._column, self._type],
+        )
+        valid()
+
+        os.makedirs(self._dest_dir, exist_ok=True)
+
+        files = super().get_target_files(self._src_dir, self._src_pattern)
+        self.check_file_existence(files)
+        self._logger.info("Files found %s" % files)
+        super().io_files(files, func=self.convert)
+
+    def convert(self, fi, fo):
+        """Convert the csv given as fi and write the result to fo."""
+        chunk_size_handling(self._read_csv_func, fi, fo)
+
+    def _cast(self, series):
+        """Return the given pandas.Series converted to the configured type."""
+        if self._type != "int":
+            return series.astype(self._type)
+        try:
+            # Try the direct conversion first. Going through float would
+            # silently corrupt integers which float64 can not hold exactly.
+            return series.astype("int")
+        except ValueError:
+            # Values like "1.0" raise
+            # ValueError: invalid literal for int() with base 10
+            # To avoid this, convert to float and then convert to int
+            return series.astype("float").astype("int")
+
+    def _read_csv_func(self, chunksize, fi, fo):
+        # A string would be iterated character by character below,
+        # so a single column name is wrapped in a list.
+        columns = [self._column] if isinstance(self._column, str) else self._column
+        first_write = True
+        tfr = pandas.read_csv(
+            fi,
+            dtype=object,
+            encoding=self._encoding,
+            chunksize=chunksize,
+            na_filter=False,
+        )
+        for df in tfr:
+            for column in columns:
+                try:
+                    df[column] = self._cast(df[column])
+                except Exception as e:
+                    # Chain the original error so that the cause stays in the traceback
+                    raise InvalidParameter(
+                        "Conversion to this type is not possible. %s" % self._type
+                    ) from e
+            df.to_csv(
+                fo,
+                encoding=self._encoding,
+                header=first_write,
+                index=False,
+                mode="w" if first_write else "a",
+            )
+            first_write = False
+
+
 class CsvMergeExclusive(FileBaseTransform):
     """
     Compare specific columns each file.
diff --git a/cliboa/template/Pipfile.above310 b/cliboa/template/Pipfile.above310
index 7def6efa..837038bd 100644
--- a/cliboa/template/Pipfile.above310
+++ b/cliboa/template/Pipfile.above310
@@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
 openpyxl = "==3.0.9"
 pyminizip = "==0.2.5"
 psycopg2 = "==2.9.1"
+numpy = "<2"
 
 [requires]
 python_version = "3.10"
diff --git a/cliboa/template/Pipfile.above37 b/cliboa/template/Pipfile.above37
index 7ee17d15..07e0b2ea 100644
--- a/cliboa/template/Pipfile.above37
+++ b/cliboa/template/Pipfile.above37
@@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
 openpyxl = "==3.0.9"
 pyminizip = "==0.2.5"
 psycopg2 = "==2.9.1"
+numpy = "<2"
 
 [requires]
 python_version = "3.7"
diff --git a/cliboa/template/Pipfile.above38 b/cliboa/template/Pipfile.above38
index a2708db1..5a1500da 100644
--- a/cliboa/template/Pipfile.above38
+++ b/cliboa/template/Pipfile.above38
@@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
 openpyxl = "==3.0.9"
 pyminizip = "==0.2.5"
 psycopg2 = "==2.9.1"
+numpy = "<2"
 
 [requires]
 python_version = "3.8"
diff --git a/cliboa/template/Pipfile.above39 b/cliboa/template/Pipfile.above39
index 31eae87a..c0b450d2 100644
--- a/cliboa/template/Pipfile.above39
+++ b/cliboa/template/Pipfile.above39
@@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
 openpyxl = "==3.0.9"
 pyminizip = "==0.2.5"
 psycopg2 = "==2.9.1"
+numpy = "<2"
 
 [requires]
 python_version = "3.9"
diff --git a/cliboa/template/pyproject.above310.toml b/cliboa/template/pyproject.above310.toml
index 333d20b2..26574cc9 100644
--- a/cliboa/template/pyproject.above310.toml
+++ b/cliboa/template/pyproject.above310.toml
@@ -31,6 +31,7 @@ python-gnupg = "==0.4.8"
 openpyxl = "==3.0.9"
 pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
+numpy = "<2"
 
 [tool.poetry.dev-dependencies]
 autoflake = "==1.3.1"
diff --git a/cliboa/template/pyproject.above37.toml b/cliboa/template/pyproject.above37.toml
index beed4d57..662805b6 100644
--- a/cliboa/template/pyproject.above37.toml
+++ b/cliboa/template/pyproject.above37.toml
@@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
 openpyxl = "==3.0.9"
 pyminizip = "==0.2.5"
 psycopg2 = "==2.9.1"
+numpy = "<2"
 
 [tool.poetry.dev-dependencies]
 autoflake = "==1.3.1"
diff --git a/cliboa/template/pyproject.above38.toml b/cliboa/template/pyproject.above38.toml
index 8beac635..41df8d40 100644
--- a/cliboa/template/pyproject.above38.toml
+++ b/cliboa/template/pyproject.above38.toml
@@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
 openpyxl = "==3.0.9"
 pyminizip = "==0.2.5"
 psycopg2 = "==2.9.1"
+numpy = "<2"
 
 [tool.poetry.dev-dependencies]
 autoflake = "==1.3.1"
diff --git a/cliboa/template/pyproject.above39.toml b/cliboa/template/pyproject.above39.toml
index 72c6ee4d..4ec0285b 100644
--- a/cliboa/template/pyproject.above39.toml
+++ b/cliboa/template/pyproject.above39.toml
@@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
 openpyxl = "==3.0.9"
 pyminizip = "==0.2.5"
 psycopg2 = "==2.9.1"
+numpy = "<2"
 
 [tool.poetry.dev-dependencies]
 autoflake = "==1.3.1"
diff --git a/cliboa/test/scenario/transform/test_csv.py b/cliboa/test/scenario/transform/test_csv.py
index 251b36f1..dec6b03d 100644
--- a/cliboa/test/scenario/transform/test_csv.py
+++ b/cliboa/test/scenario/transform/test_csv.py
@@ -37,6 +37,7 @@
     CsvRowDelete,
     CsvSort,
     CsvToJsonl,
+    CsvTypeConvert,
     CsvValueExtract,
 )
 from cliboa.test import BaseCliboaTest
@@ -797,6 +798,113 @@ def test_execute_ng_with_specify_not_exist_column(self):
         assert "'test'" == str(e.value)
 
 
+class TestCsvTypeConvert(TestCsvTransform):
+    """
+    Tests for CsvTypeConvert.
+    """
+
+    def _run_type_convert(self, column, dtype, src_pattern="test.csv"):
+        """Execute CsvTypeConvert with the data directory as src and dest."""
+        instance = CsvTypeConvert()
+        Helper.set_property(instance, "logger", LisboaLog.get_logger(__name__))
+        Helper.set_property(instance, "src_dir", self._data_dir)
+        Helper.set_property(instance, "src_pattern", src_pattern)
+        Helper.set_property(instance, "dest_dir", self._data_dir)
+        Helper.set_property(instance, "column", column)
+        Helper.set_property(instance, "type", dtype)
+        instance.execute()
+
+    def _read_csv_rows(self, fname="test.csv"):
+        """Return every row of a csv file placed in the data directory."""
+        with open(os.path.join(self._data_dir, fname), "r") as o:
+            return list(csv.reader(o))
+
+    def test_execute_ok_1(self):
+        # convert a single column to float
+        self._create_csv([["key", "data"], ["1", "SPAM1"], ["2", "SPAM2"]])
+
+        self._run_type_convert(["key"], "float")
+
+        # compare all rows at once so that missing rows are detected as well
+        self.assertEqual(
+            [["key", "data"], ["1.0", "SPAM1"], ["2.0", "SPAM2"]],
+            self._read_csv_rows(),
+        )
+
+    def test_execute_ok_2(self):
+        # convert multiple columns to float
+        self._create_csv([["key", "data", "number"], ["1", "1", "1"], ["2", "2", "2"]])
+
+        self._run_type_convert(["key", "number"], "float")
+
+        self.assertEqual(
+            [["key", "data", "number"], ["1.0", "1", "1.0"], ["2.0", "2", "2.0"]],
+            self._read_csv_rows(),
+        )
+
+    def test_execute_ok_3(self):
+        # convert columns whose values are given as int and str to str
+        self._create_csv([["key", "data", "number"], [1, "SPAM1", "1"], ["2", "SPAM2", 2]])
+
+        self._run_type_convert(["key", "number"], "str")
+
+        self.assertEqual(
+            [["key", "data", "number"], ["1", "SPAM1", "1"], ["2", "SPAM2", "2"]],
+            self._read_csv_rows(),
+        )
+
+    def test_execute_ok_4(self):
+        # convert values in decimal notation to int
+        self._create_csv(
+            [["key", "data", "number"], ["1.0", "SPAM1", "1.0"], ["2.0", "SPAM2", "2.0"]]
+        )
+
+        self._run_type_convert(["key", "number"], "int")
+
+        self.assertEqual(
+            [["key", "data", "number"], ["1", "SPAM1", "1"], ["2", "SPAM2", "2"]],
+            self._read_csv_rows(),
+        )
+
+    def test_execute_ok_5(self):
+        # convert a column whose values are given as float to str
+        self._create_csv([["key", "data", "number"], [1.0, "SPAM1", "1.0"], [2.0, "SPAM2", "2.0"]])
+
+        self._run_type_convert(["key"], "str")
+
+        self.assertEqual(
+            [["key", "data", "number"], ["1.0", "SPAM1", "1.0"], ["2.0", "SPAM2", "2.0"]],
+            self._read_csv_rows(),
+        )
+
+    def test_execute_ok_6(self):
+        # convert every file which matches src_pattern
+        self._create_csv([["key", "data", "number"], [1.0, "SPAM1", "1.0"], [2.0, "SPAM2", "2.0"]])
+        self._create_csv(
+            [["key", "data", "number"], [3.0, "SPAM3", "3.0"], [4.0, "SPAM4", "4.0"]],
+            fname="test_2.csv",
+        )
+
+        self._run_type_convert(["key"], "str", src_pattern="test.*.csv")
+
+        self.assertEqual(
+            [["key", "data", "number"], ["1.0", "SPAM1", "1.0"], ["2.0", "SPAM2", "2.0"]],
+            self._read_csv_rows(),
+        )
+        self.assertEqual(
+            [["key", "data", "number"], ["3.0", "SPAM3", "3.0"], ["4.0", "SPAM4", "4.0"]],
+            self._read_csv_rows("test_2.csv"),
+        )
+
+    def test_execute_ng(self):
+        # a type which can not be used for the conversion
+        self._create_csv([["key", "data"], ["1", "SPAM1"], ["2", "SPAM2"]])
+
+        with pytest.raises(Exception) as e:
+            self._run_type_convert(["key"], "list")
+        assert "Conversion to this type is not possible. list" == str(e.value)
+
+
 class TestCsvMergeExclusive(TestCsvTransform):
     def test_execute_ok(self):
         # create test csv
diff --git a/docs/modules/csv_column_type_convert.md b/docs/modules/csv_column_type_convert.md
new file mode 100644
index 00000000..d74e454d
--- /dev/null
+++ b/docs/modules/csv_column_type_convert.md
@@ -0,0 +1,33 @@
+# CsvTypeConvert
+Convert the type of specific columns in a csv file.
+
+# Parameters
+| Parameters | Explanation | Required | Default | Remarks |
+|-------------|------------------------------------------------------|----------|---------|----------------------------------------------------------------------------------------|
+| src_dir | Path of the directory which target files are placed. | Yes | None | |
+| src_pattern | Regex which is to find target files. | Yes | None | |
+| dest_dir | Path of the directory which is for output files. | Yes | None | If a non-existent directory path is specified, the directory is automatically created. |
+| column | Type conversion target column. | Yes | None | Specify the column names as a list. |
+| type | Type of the converted data. | Yes | None | Specify a valid value for the dtype of 'pandas.DataFrame.astype'. |
+
+# Examples
+```
+scenario:
+- step: Column Type Convert
+  class: CsvTypeConvert
+  arguments:
+    src_dir: /in
+    src_pattern: test\.csv
+    column:
+      - number
+    type: float
+    dest_dir: /out
+
+Input: /in/test.csv
+id, name, number
+1, test, 1
+
+Output: /out/test.csv
+id, name, number
+1, test, 1.0
+```