format

airbytehq · girarda · Aug 3, 2023 · Jul 19, 2023 · Jul 19, 2023 · Jul 19, 2023
commit 6324257d8c7e0577c4d326268d3ee41982200edf
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py
@@ -4,7 +4,7 @@
 
 import codecs
 from enum import Enum
-from typing import Optional, List
+from typing import List, Optional
 
 from pydantic import BaseModel, Field, validator
 from typing_extensions import Literal
@@ -49,17 +49,15 @@ class CsvFormat(BaseModel):
     null_values: List[str] = Field(
         title="Null Values",
         default=[],
-        description="A set of strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field."
+        description="A set of strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.",
     )
     skip_rows_before_header: int = Field(
         title="Skip Rows Before Header",
         default=0,
         description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
     )
     skip_rows_after_header: int = Field(
-        title="Skip Rows After Header",
-        default=0,
-        description="The number of rows to skip after the header row."
+        title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
     )
 
     # Noting that the existing S3 connector had a config option newlines_in_values. This was only supported by pyarrow and not

diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py
@@ -7,11 +7,11 @@
 import logging
 from distutils.util import strtobool
 from io import IOBase
-from typing import Any, Dict, Iterable, Mapping, Optional, Set, List, IO
+from typing import Any, Dict, Iterable, List, Mapping, Optional
 
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
+from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -27,7 +27,6 @@
 }
 
 
-
 class CsvParser(FileTypeParser):
     async def infer_schema(
         self,
@@ -132,6 +131,7 @@ def _skip_rows_before_header(fp: IOBase, rows_to_skip: int) -> None:
         for _ in range(rows_to_skip):
             fp.readline()
 
+
 def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logging.Logger) -> Dict[str, Any]:
     """
     Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.

diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py
@@ -4,10 +4,7 @@
 
 import asyncio
 import itertools
-import logging
 import traceback
-from configparser import ParsingError
-import json
 from functools import cache
 from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
 
@@ -19,7 +16,7 @@
     InvalidSchemaError,
     MissingSchemaError,
     SchemaInferenceError,
-    StopSyncPerValidationPolicy, RecordParseError,
+    StopSyncPerValidationPolicy,
 )
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas, schemaless_schema
@@ -106,7 +103,7 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping
                                     level=Level.ERROR,
                                     message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
                                     stack_trace=traceback.format_exc(),
-                                )
+                                ),
                             )
                             return
                     if self.config.schemaless:
@@ -129,7 +126,7 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping
                 )
                 break
 
-            except Exception as e:
+            except Exception:
                 yield AirbyteMessage(
                     type=MessageType.LOG,
                     log=AirbyteLogMessage(

diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py
@@ -33,8 +33,33 @@
     success_multi_stream_scenario,
     success_user_provided_schema_scenario,
 )
-from unit_tests.sources.file_based.scenarios.csv_scenarios import *
-
+from unit_tests.sources.file_based.scenarios.csv_scenarios import (
+    csv_custom_delimiter_in_double_quotes_scenario,
+    csv_custom_delimiter_with_escape_char_scenario,
+    csv_custom_format_scenario,
+    csv_double_quote_is_set_scenario,
+    csv_escape_char_is_set_scenario,
+    csv_legacy_format_scenario,
+    csv_multi_stream_scenario,
+    csv_newline_in_values_quoted_value_scenario,
+    csv_simple_scenario,
+    csv_single_stream_scenario,
+    csv_skip_after_header_scenario,
+    csv_skip_before_header_scenario,
+    csv_string_can_be_null_with_input_schemas_scenario,
+    csv_string_not_null_if_no_null_values_scenario,
+    csv_strings_can_be_null_not_quoted_scenario,
+    empty_schema_inference_scenario,
+    invalid_csv_scenario,
+    multi_csv_scenario,
+    multi_csv_stream_n_file_exceeds_limit_for_inference,
+    multi_stream_custom_format,
+    schemaless_csv_multi_stream_scenario,
+    schemaless_csv_scenario,
+    schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario,
+    schemaless_with_user_input_schema_fails_connection_check_scenario,
+    single_csv_scenario,
+)
 from unit_tests.sources.file_based.scenarios.incremental_scenarios import (
     multi_csv_different_timestamps_scenario,
     multi_csv_include_missing_files_within_history_range,
@@ -245,7 +270,6 @@ def _verify_read_output(output: Dict[str, Any], scenario: TestScenario) -> None:
         elif "state" in actual:
             assert actual["state"]["data"] == expected
 
-
     if scenario.expected_logs:
         read_logs = scenario.expected_logs.get("read")
         assert len(logs) == (len(read_logs) if read_logs else 0)