Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert the type of a specific column in a csv file. #455

Merged
merged 1 commit into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cliboa/scenario/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
CsvRowDelete,
CsvSort,
CsvToJsonl,
CsvTypeConvert,
CsvValueExtract,
)
from .transform.file import (
Expand Down
67 changes: 67 additions & 0 deletions cliboa/scenario/transform/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,73 @@ def _read_csv_func(self, chunksize, fi, fo):
first_write = False


class CsvTypeConvert(FileBaseTransform):
    """
    Convert the type of specific columns in a csv file.

    Each file found by ``src_dir`` / ``src_pattern`` is read in chunks as
    plain strings, the configured columns are cast with
    ``pandas.Series.astype`` and the chunks are written out as csv.
    """

    def __init__(self):
        super().__init__()
        # Name(s) of the column(s) to convert (a list, or a single name).
        self._column = None
        # Target dtype, passed to ``astype`` (e.g. "int", "float", "str").
        self._type = None

    def column(self, column):
        """Set the column name(s) whose values are converted."""
        self._column = column

    def type(self, type):
        """Set the dtype the target columns are converted to."""
        self._type = type

    def execute(self, *args):
        """
        Validate the parameters and convert every matching file.

        Raises:
            InvalidParameter: when a target column does not exist or its
                values cannot be converted to the requested type.
        """
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._dest_dir, self._column, self._type],
        )
        valid()

        os.makedirs(self._dest_dir, exist_ok=True)

        files = super().get_target_files(self._src_dir, self._src_pattern)
        self.check_file_existence(files)
        self._logger.info("Files found %s" % files)
        super().io_files(files, func=self.convert)

    def convert(self, fi, fo):
        """Convert one input ``fi`` into the output ``fo``, chunk by chunk."""
        chunk_size_handling(self._read_csv_func, fi, fo)

    def _cast(self, series):
        """Return ``series`` converted to the configured type."""
        if self._type != "int":
            return series.astype(self._type)
        try:
            # A direct cast keeps full precision for large integers, which a
            # round trip through float would silently lose above 2**53.
            return series.astype("int")
        except (ValueError, TypeError):
            # Strings such as "1.0" are rejected by int()
            # (ValueError: invalid literal for int() with base 10),
            # so convert to float first and then to int.
            return series.astype("float").astype("int")

    def _read_csv_func(self, chunksize, fi, fo):
        """
        Read ``fi`` in chunks of ``chunksize`` rows, cast the target columns
        and write each chunk to ``fo``.

        Raises:
            InvalidParameter: when a target column does not exist or its
                values cannot be converted to the requested type.
        """
        # Iterating a bare string would yield its characters, so treat a
        # single column name as a one-element list.
        columns = [self._column] if isinstance(self._column, str) else self._column

        first_write = True
        tfr = pandas.read_csv(
            fi,
            # Read everything as-is; only the requested columns are cast.
            dtype=object,
            encoding=self._encoding,
            chunksize=chunksize,
            na_filter=False,
        )
        for df in tfr:
            for column in columns:
                if column not in df.columns:
                    # Report a missing column as such instead of as a
                    # type conversion failure.
                    raise InvalidParameter(
                        "The specified column does not exist. %s" % column
                    )
                try:
                    df[column] = self._cast(df[column])
                except Exception as e:
                    # Keep the original error attached as the cause.
                    raise InvalidParameter(
                        "Conversion to this type is not possible. %s" % self._type
                    ) from e
            # Only the first chunk creates the output and carries the header;
            # later chunks are appended.
            df.to_csv(
                fo,
                encoding=self._encoding,
                header=first_write,
                index=False,
                mode="w" if first_write else "a",
            )
            first_write = False


class CsvMergeExclusive(FileBaseTransform):
"""
Compare specific columns each file.
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/Pipfile.above310
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pinned the numpy version (`<2`) because several tests fail when it is not pinned.


[requires]
python_version = "3.10"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/Pipfile.above37
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[requires]
python_version = "3.7"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/Pipfile.above38
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[requires]
python_version = "3.8"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/Pipfile.above39
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[requires]
python_version = "3.9"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/pyproject.above310.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[tool.poetry.dev-dependencies]
autoflake = "==1.3.1"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/pyproject.above37.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[tool.poetry.dev-dependencies]
autoflake = "==1.3.1"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/pyproject.above38.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[tool.poetry.dev-dependencies]
autoflake = "==1.3.1"
Expand Down
1 change: 1 addition & 0 deletions cliboa/template/pyproject.above39.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ python-gnupg = "==0.4.8"
openpyxl = "==3.0.9"
pyminizip = "==0.2.5"
psycopg2 = "==2.9.1"
numpy = "<2"

[tool.poetry.dev-dependencies]
autoflake = "==1.3.1"
Expand Down
194 changes: 194 additions & 0 deletions cliboa/test/scenario/transform/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
CsvRowDelete,
CsvSort,
CsvToJsonl,
CsvTypeConvert,
CsvValueExtract,
)
from cliboa.test import BaseCliboaTest
Expand Down Expand Up @@ -797,6 +798,199 @@ def test_execute_ng_with_specify_not_exist_column(self):
assert "'test'" == str(e.value)


class TestCsvTypeConvert(TestCsvTransform):
    """Tests for CsvTypeConvert."""

    def _new_type_convert(self, column, type_name, src_pattern="test.csv"):
        """Return a CsvTypeConvert that reads from and writes to the data dir."""
        instance = CsvTypeConvert()
        Helper.set_property(instance, "logger", LisboaLog.get_logger(__name__))
        Helper.set_property(instance, "src_dir", self._data_dir)
        Helper.set_property(instance, "src_pattern", src_pattern)
        Helper.set_property(instance, "dest_dir", self._data_dir)
        Helper.set_property(instance, "column", column)
        Helper.set_property(instance, "type", type_name)
        return instance

    def _assert_output_rows(self, expected, fname="test.csv"):
        """
        Assert that the output file holds exactly the expected rows.

        The whole content is compared at once so that missing or extra rows
        fail the test instead of being skipped silently.
        """
        output_file = os.path.join(self._data_dir, fname)
        with open(output_file, "r") as o:
            rows = list(csv.reader(o))
        self.assertEqual(expected, rows)

    def test_execute_ok_1(self):
        # a single column is converted to float
        test_csv_data = [["key", "data"], ["1", "SPAM1"], ["2", "SPAM2"]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key"], "float")
        instance.execute()

        self._assert_output_rows([["key", "data"], ["1.0", "SPAM1"], ["2.0", "SPAM2"]])

    def test_execute_ok_2(self):
        # only the listed columns are converted, the others are untouched
        test_csv_data = [["key", "data", "number"], ["1", "1", "1"], ["2", "2", "2"]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key", "number"], "float")
        instance.execute()

        self._assert_output_rows(
            [["key", "data", "number"], ["1.0", "1", "1.0"], ["2.0", "2", "2.0"]]
        )

    def test_execute_ok_3(self):
        # integers written as numbers or as strings are kept when cast to str
        test_csv_data = [["key", "data", "number"], [1, "SPAM1", "1"], ["2", "SPAM2", 2]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key", "number"], "str")
        instance.execute()

        self._assert_output_rows(
            [["key", "data", "number"], ["1", "SPAM1", "1"], ["2", "SPAM2", "2"]]
        )

    def test_execute_ok_4(self):
        # float-formatted strings can be converted to int
        test_csv_data = [
            ["key", "data", "number"],
            ["1.0", "SPAM1", "1.0"],
            ["2.0", "SPAM2", "2.0"],
        ]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key", "number"], "int")
        instance.execute()

        self._assert_output_rows(
            [["key", "data", "number"], ["1", "SPAM1", "1"], ["2", "SPAM2", "2"]]
        )

    def test_execute_ok_5(self):
        # float values are kept when cast to str
        test_csv_data = [["key", "data", "number"], [1.0, "SPAM1", "1.0"], [2.0, "SPAM2", "2.0"]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key"], "str")
        instance.execute()

        self._assert_output_rows(
            [["key", "data", "number"], ["1.0", "SPAM1", "1.0"], ["2.0", "SPAM2", "2.0"]]
        )

    def test_execute_ok_6(self):
        # every file matching the pattern is converted
        test_csv_data = [["key", "data", "number"], [1.0, "SPAM1", "1.0"], [2.0, "SPAM2", "2.0"]]
        test_csv_data_2 = [["key", "data", "number"], [3.0, "SPAM3", "3.0"], [4.0, "SPAM4", "4.0"]]
        self._create_csv(test_csv_data)
        self._create_csv(test_csv_data_2, fname="test_2.csv")

        instance = self._new_type_convert(["key"], "str", src_pattern="test.*.csv")
        instance.execute()

        self._assert_output_rows(
            [["key", "data", "number"], ["1.0", "SPAM1", "1.0"], ["2.0", "SPAM2", "2.0"]]
        )
        self._assert_output_rows(
            [["key", "data", "number"], ["3.0", "SPAM3", "3.0"], ["4.0", "SPAM4", "4.0"]],
            fname="test_2.csv",
        )

    def test_execute_ng(self):
        # an unsupported target type is rejected
        test_csv_data = [["key", "data"], ["1", "SPAM1"], ["2", "SPAM2"]]
        self._create_csv(test_csv_data)

        instance = self._new_type_convert(["key"], "list")

        with pytest.raises(Exception) as e:
            instance.execute()
        assert "Conversion to this type is not possible. list" == str(e.value)


class TestCsvMergeExclusive(TestCsvTransform):
def test_execute_ok(self):
# create test csv
Expand Down
33 changes: 33 additions & 0 deletions docs/modules/csv_column_type_convert.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# CsvTypeConvert
Convert the type of specific columns in a csv file.

# Parameters
| Parameters | Explanation | Required | Default | Remarks |
|-------------|------------------------------------------------------|----------|---------|----------------------------------------------------------------------------------------|
| src_dir | Path of the directory which target files are placed. | Yes | None | |
| src_pattern | Regex which is to find target files. | Yes | None | |
| dest_dir | Path of the directory which is for output files. | Yes | None | If a non-existent directory path is specified, the directory is automatically created. |
| column | List of the names of the columns to convert. | Yes | None | |
| type | Type of the converted data. | Yes | None | Specify a valid value for the dtype of 'pandas.DataFrame.astype'. |

# Examples
```
scenario:
- step: Column Type Convert
  class: CsvTypeConvert
  arguments:
    src_dir: /in
    src_pattern: test\.csv
    column:
      - number
    type: float
    dest_dir: /out

Input: /in/test.csv
id, name, number
1, test, 1

Output: /out/test.csv
id, name, number
1, test, 1.0
```
Loading