Skip to content

Commit

Permalink
Merge pull request #455 from BrainPad/454
Browse files Browse the repository at this point in the history
Convert the type of a specific column in a csv file.
  • Loading branch information
yasuhiro-ohba authored Jul 22, 2024
2 parents 060426f + 40a5ccf commit 0d94f86
Show file tree
Hide file tree
Showing 12 changed files with 303 additions and 0 deletions.
1 change: 1 addition & 0 deletions cliboa/scenario/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
CsvRowDelete,
CsvSort,
CsvToJsonl,
CsvTypeConvert,
CsvValueExtract,
)
from .transform.file import (
Expand Down
67 changes: 67 additions & 0 deletions cliboa/scenario/transform/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,73 @@ def _read_csv_func(self, chunksize, fi, fo):
first_write = False


class CsvTypeConvert(FileBaseTransform):
    """
    Convert the type of specific columns in a csv file.

    Every target file is read in chunks with all values loaded as strings,
    the configured columns are cast to the configured type, and the result
    is written back out as csv.
    """

    def __init__(self):
        super().__init__()
        self._column = None
        self._type = None

    def column(self, column):
        """Set the column name(s) to convert (a list of names, or one name)."""
        self._column = column

    def type(self, type):
        """Set the target type; passed to pandas ``astype`` (e.g. "float")."""
        self._type = type

    def execute(self, *args):
        """
        Validate the parameters, find the target files and convert each one.

        Raises:
            InvalidParameter: a configured column is missing from a file, or
                the values cannot be converted to the configured type.
        """
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._dest_dir, self._column, self._type],
        )
        valid()

        os.makedirs(self._dest_dir, exist_ok=True)

        files = super().get_target_files(self._src_dir, self._src_pattern)
        self.check_file_existence(files)
        self._logger.info("Files found %s" % files)
        super().io_files(files, func=self.convert)

    def convert(self, fi, fo):
        """Convert one input file ``fi`` into the output ``fo``."""
        chunk_size_handling(self._read_csv_func, fi, fo)

    def _cast(self, values):
        """Return ``values`` cast to the configured type."""
        if self._type != "int":
            return values.astype(self._type)
        try:
            # Cast integer literals directly: a detour through float would
            # silently corrupt values above 2**53.
            return values.astype("int")
        except ValueError:
            # Literals such as "1.0" fail with
            # "ValueError: invalid literal for int() with base 10",
            # so convert to float first and then to int.
            return values.astype("float").astype("int")

    def _read_csv_func(self, chunksize, fi, fo):
        """Read ``fi`` chunk by chunk, cast the target columns, write to ``fo``."""
        # A bare string would otherwise be iterated character by character.
        columns = [self._column] if isinstance(self._column, str) else self._column
        first_write = True
        tfr = pandas.read_csv(
            fi,
            dtype=object,
            encoding=self._encoding,
            chunksize=chunksize,
            na_filter=False,
        )
        for df in tfr:
            for column in columns:
                try:
                    values = df[column]
                except KeyError as e:
                    # Report a missing column as such, not as a type error.
                    raise InvalidParameter(
                        "The specified column does not exist. %s" % column
                    ) from e
                try:
                    df[column] = self._cast(values)
                except Exception as e:
                    # Keep the original error attached as the cause.
                    raise InvalidParameter(
                        "Conversion to this type is not possible. %s" % self._type
                    ) from e
            # Only the first chunk creates the file and writes the header;
            # later chunks are appended without one.
            df.to_csv(
                fo,
                encoding=self._encoding,
                header=first_write,
                index=False,
                mode="w" if first_write else "a",
            )
            first_write = False


class CsvMergeExclusive(FileBaseTransform):
"""
Compare specific columns each file.
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/Pipfile.above310
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[requires]
python_version = "3.10"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/Pipfile.above37
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[requires]
python_version = "3.7"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/Pipfile.above38
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[requires]
python_version = "3.8"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/Pipfile.above39
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[requires]
python_version = "3.9"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/pyproject.above310.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[tool.poetry.dev-dependencies]
autoflake = "==1.3.1"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/pyproject.above37.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[tool.poetry.dev-dependencies]
autoflake = "==1.3.1"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/pyproject.above38.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[tool.poetry.dev-dependencies]
autoflake = "==1.3.1"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/pyproject.above39.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[tool.poetry.dev-dependencies]
autoflake = "==1.3.1"
Expand Down
194 changes: 194 additions & 0 deletions cliboa/test/scenario/transform/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
CsvRowDelete,
CsvSort,
CsvToJsonl,
CsvTypeConvert,
CsvValueExtract,
)
from cliboa.test import BaseCliboaTest
Expand Down Expand Up @@ -797,6 +798,199 @@ def test_execute_ng_with_specify_not_exist_column(self):
assert "'test'" == str(e.value)


class TestCsvTypeConvert(TestCsvTransform):
    """
    Tests for CsvTypeConvert.

    Each test compares the complete list of output rows, so a truncated,
    empty or over-long output file fails instead of passing unnoticed.
    """

    def _new_type_convert(self, column, type, src_pattern="test.csv"):
        """Return a CsvTypeConvert that reads from and writes to the data dir."""
        instance = CsvTypeConvert()
        Helper.set_property(instance, "logger", LisboaLog.get_logger(__name__))
        Helper.set_property(instance, "src_dir", self._data_dir)
        Helper.set_property(instance, "src_pattern", src_pattern)
        Helper.set_property(instance, "dest_dir", self._data_dir)
        Helper.set_property(instance, "column", column)
        Helper.set_property(instance, "type", type)
        return instance

    def _read_output_rows(self, fname="test.csv"):
        """Return every row of the named csv file in the data dir."""
        output_file = os.path.join(self._data_dir, fname)
        with open(output_file, "r", newline="") as o:
            return list(csv.reader(o))

    def test_execute_ok_1(self):
        # a single column converted to float
        test_csv_data = [["key", "data"], ["1", "SPAM1"], ["2", "SPAM2"]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key"], "float")
        instance.execute()

        self.assertEqual(
            [["key", "data"], ["1.0", "SPAM1"], ["2.0", "SPAM2"]],
            self._read_output_rows(),
        )

    def test_execute_ok_2(self):
        # several columns converted to float, the others left untouched
        test_csv_data = [["key", "data", "number"], ["1", "1", "1"], ["2", "2", "2"]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key", "number"], "float")
        instance.execute()

        self.assertEqual(
            [["key", "data", "number"], ["1.0", "1", "1.0"], ["2.0", "2", "2.0"]],
            self._read_output_rows(),
        )

    def test_execute_ok_3(self):
        # integer values converted to str
        test_csv_data = [["key", "data", "number"], [1, "SPAM1", "1"], ["2", "SPAM2", 2]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key", "number"], "str")
        instance.execute()

        self.assertEqual(
            [["key", "data", "number"], ["1", "SPAM1", "1"], ["2", "SPAM2", "2"]],
            self._read_output_rows(),
        )

    def test_execute_ok_4(self):
        # float literals converted to int
        test_csv_data = [
            ["key", "data", "number"],
            ["1.0", "SPAM1", "1.0"],
            ["2.0", "SPAM2", "2.0"],
        ]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key", "number"], "int")
        instance.execute()

        self.assertEqual(
            [["key", "data", "number"], ["1", "SPAM1", "1"], ["2", "SPAM2", "2"]],
            self._read_output_rows(),
        )

    def test_execute_ok_5(self):
        # float values converted to str keep their literal form
        test_csv_data = [["key", "data", "number"], [1.0, "SPAM1", "1.0"], [2.0, "SPAM2", "2.0"]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key"], "str")
        instance.execute()

        self.assertEqual(
            [["key", "data", "number"], ["1.0", "SPAM1", "1.0"], ["2.0", "SPAM2", "2.0"]],
            self._read_output_rows(),
        )

    def test_execute_ok_6(self):
        # several files matched by one pattern are each converted
        test_csv_data = [["key", "data", "number"], [1.0, "SPAM1", "1.0"], [2.0, "SPAM2", "2.0"]]
        test_csv_data_2 = [["key", "data", "number"], [3.0, "SPAM3", "3.0"], [4.0, "SPAM4", "4.0"]]
        self._create_csv(test_csv_data)
        self._create_csv(test_csv_data_2, fname="test_2.csv")

        instance = self._new_type_convert(["key"], "str", src_pattern="test.*.csv")
        instance.execute()

        self.assertEqual(
            [["key", "data", "number"], ["1.0", "SPAM1", "1.0"], ["2.0", "SPAM2", "2.0"]],
            self._read_output_rows(),
        )
        self.assertEqual(
            [["key", "data", "number"], ["3.0", "SPAM3", "3.0"], ["4.0", "SPAM4", "4.0"]],
            self._read_output_rows("test_2.csv"),
        )

    def test_execute_ng(self):
        # a type that pandas cannot convert to is rejected
        test_csv_data = [["key", "data"], ["1", "SPAM1"], ["2", "SPAM2"]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key"], "list")

        with pytest.raises(Exception) as e:
            instance.execute()
        assert "Conversion to this type is not possible. list" == str(e.value)


class TestCsvMergeExclusive(TestCsvTransform):
def test_execute_ok(self):
# create test csv
Expand Down
33 changes: 33 additions & 0 deletions docs/modules/csv_column_type_convert.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# CsvTypeConvert
Convert the type of specific columns in a csv file.

# Parameters
| Parameters | Explanation | Required | Default | Remarks |
|-------------|------------------------------------------------------|----------|---------|----------------------------------------------------------------------------------------|
| src_dir | Path of the directory which target files are placed. | Yes | None | |
| src_pattern | Regex which is to find target files. | Yes | None | |
| dest_dir | Path of the directory which is for output files. | Yes | None | If a non-existent directory path is specified, the directory is automatically created. |
| column | List of the names of the columns to convert. | Yes | None | |
| type | Type of the converted data. | Yes | None | Specify a valid value for the dtype of 'pandas.DataFrame.astype'. |

# Examples
```
scenario:
- step: Column Type Convert
  class: CsvTypeConvert
  arguments:
    src_dir: /in
    src_pattern: test\.csv
    column:
      - number
    type: float
    dest_dir: /out
Input: /in/test.csv
id, name, number
1, test, 1
Output: /out/test.csv
id, name, number
1, test, 1.0
```

0 comments on commit 0d94f86

Please sign in to comment.