Merge pull request #448 from BrainPad/447

Prevent values from becoming empty in pandas read_csv by disabling NA filtering (na_filter=False)
BrainPad · Mar 7, 2024 · 6182b27
2 parents 0830d75 + 3538a9d
commit 6182b27
Show file tree

Hide file tree

Showing 2 changed files with 281 additions and 5 deletions.
diff --git a/cliboa/scenario/transform/csv.py b/cliboa/scenario/transform/csv.py
@@ -68,6 +68,7 @@ def _read_csv_func(self, chunksize, fi, fo):
             dtype=str,
             encoding=self._encoding,
             chunksize=chunksize,
+            na_filter=False,
         )
         for df in tfr:
             for c in self._columns:
@@ -159,6 +160,7 @@ def _read_csv_func(self, chunksize, fi, fo):
             dtype=str,
             encoding=self._encoding,
             chunksize=chunksize,
+            na_filter=False,
         )
         pattern = re.compile(self._regex_pattern)
         for df in tfr:
@@ -262,6 +264,7 @@ def _read_csv_func(self, chunksize, fi, fo):
             dtype=str,
             encoding=self._encoding,
             chunksize=chunksize,
+            na_filter=False,
         )
         dest_str = None
         for df in tfr:
@@ -348,7 +351,7 @@ def convert(self, fi, fo):
     def _read_csv_func(self, chunksize, fi, fo):
         # Used in chunk_size_handling
         first_write = True
-        tfr = pandas.read_csv(fi, chunksize=chunksize)
+        tfr = pandas.read_csv(fi, chunksize=chunksize, na_filter=False)
         for df in tfr:
             df = df[~df[self._src_column].isin(self.df_target_list)]
             df.to_csv(
@@ -460,6 +463,7 @@ def _read_csv_func(self, chunksize, target1_files, target2_files):
             dtype=str,
             encoding=self._encoding,
             chunksize=chunksize,
+            na_filter=False,
         )
         for df in tfr1:
             df.to_csv(
@@ -524,6 +528,7 @@ def _read_csv_func(self, chunksize, fi, fo):
             dtype=str,
             encoding=self._encoding,
             chunksize=chunksize,
+            na_filter=False,
         )
         for df in tfr:
             if set(self._column_order) - set(df.columns.values):
@@ -584,12 +589,16 @@ def execute(self, *args):
 
         # Create output headers to conform to the concat specification.
         file_1 = files[0]
-        output_header = pandas.read_csv(file_1, dtype=str, encoding=self._encoding, nrows=0)
+        output_header = pandas.read_csv(
+            file_1, dtype=str, encoding=self._encoding, nrows=0, na_filter=False
+        )
         for file in files[1:]:
             output_header = pandas.concat(
                 [
                     output_header,
-                    pandas.read_csv(file, dtype=str, encoding=self._encoding, nrows=0),
+                    pandas.read_csv(
+                        file, dtype=str, encoding=self._encoding, nrows=0, na_filter=False
+                    ),
                 ]
             )
 
@@ -604,6 +613,7 @@ def _read_csv_func(self, chunksize, files, output_header):
                 dtype=str,
                 encoding=self._encoding,
                 chunksize=chunksize,
+                na_filter=False,
             )
             for df in tfr:
                 # Change the header order to the one you plan to output.
@@ -858,7 +868,13 @@ def execute(self, *args):
         super().io_files(files, func=self.convert)
 
     def convert(self, fi, fo):
-        header = pandas.read_csv(fi, dtype=str, encoding=self._encoding, nrows=0)
+        header = pandas.read_csv(
+            fi,
+            dtype=str,
+            encoding=self._encoding,
+            nrows=0,
+            na_filter=False,
+        )
         if self._src_column not in header:
             raise KeyError("Copy source column does not exist in file. [%s]" % self._src_column)
 
@@ -872,6 +888,7 @@ def _read_csv_func(self, chunksize, fi, fo):
             dtype=str,
             encoding=self._encoding,
             chunksize=chunksize,
+            na_filter=False,
         )
 
         for df in tfr:
@@ -931,7 +948,7 @@ def execute(self, *args):
         super().io_files(files, func=self.convert)
 
     def convert(self, fi, fo):
-        header = pandas.read_csv(fi, dtype=str, encoding=self._encoding, nrows=0)
+        header = pandas.read_csv(fi, dtype=str, encoding=self._encoding, nrows=0, na_filter=False)
         if self._column not in header:
             raise KeyError("Replace source column does not exist in file. [%s]" % self._column)
 
@@ -945,6 +962,7 @@ def _read_csv_func(self, chunksize, fi, fo):
             dtype=str,
             encoding=self._encoding,
             chunksize=chunksize,
+            na_filter=False,
         )
 
         for df in tfr: