skip before header

airbytehq · girarda · Aug 3, 2023 · Jul 19, 2023 · Jul 19, 2023 · Jul 19, 2023
commit 79f77488f2788a90aa8c48a862f71dba7c22c147
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py
@@ -6,7 +6,8 @@
 import json
 import logging
 from distutils.util import strtobool
-from typing import Any, Dict, Iterable, Mapping, Optional, Set, List
+from io import IOBase
+from typing import Any, Dict, Iterable, Mapping, Optional, Set, List, IO
 
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
@@ -52,6 +53,7 @@ async def infer_schema(
                 # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
                 #  sources will likely require one. Rather than modify the interface now we can wait until the real use case
                 reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
+                self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
                 schema = {field.strip(): {"type": "string"} for field in next(reader)}
                 csv.unregister_dialect(dialect_name)
                 return schema
@@ -87,16 +89,18 @@ def parse_records(
             with stream_reader.open_file(file) as fp:
                 # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
                 #  sources will likely require one. Rather than modify the interface now we can wait until the real use case
+                self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
                 reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
-                yield from self._read_and_cast_types(reader, schema, null_values, logger)
+
+                yield from self._read_and_cast_types(reader, schema, null_values, config_format.skip_rows_before_header, logger)
         else:
             with stream_reader.open_file(file) as fp:
                 reader = csv.DictReader(fp)  # type: ignore
-                yield from self._read_and_cast_types(reader, schema, [], logger)
+                yield from self._read_and_cast_types(reader, schema, [], 0, logger)
 
     @staticmethod
     def _read_and_cast_types(
-        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], null_values: List[str], logger: logging.Logger  # type: ignore
+        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], null_values: List[str], skip_rows_before_header: int, logger: logging.Logger  # type: ignore
     ) -> Iterable[Optional[Dict[str, Any]]]:
         """
         If the user provided a schema, attempt to cast the record values to the associated type.
@@ -121,6 +125,11 @@ def _to_nullable(row: Mapping[str, str], null_values: List[str]) -> Optional[Dic
         nullable = row | {k: None if v in null_values else v for k, v in row.items()}
         return nullable
 
+    @staticmethod
+    def _skip_rows_before_header(fp: IOBase, rows_to_skip: int) -> None:
+        for _ in range(rows_to_skip):
+            fp.readline()
+
 def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logging.Logger) -> Dict[str, Any]:
     """
     Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.

diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -2185,7 +2185,7 @@
     )
 ).build()
 
-csv_simple = (
+csv_simple_scenario = (
     TestScenarioBuilder()
     .set_name("csv_simple")
     .set_config(
@@ -2213,9 +2213,9 @@
         {
             "a.csv": {
                 "contents": [
-                    '''col1,col2''',
-                    '''val11,val12''',
-                    '''val21,val22''',
+                    ("col1", "col2"),
+                    ("val11", "val12"),
+                    ("val21", "val22"),
                 ],
                 "last_modified": "2023-06-05T03:54:07.000Z",
             }
@@ -2260,3 +2260,76 @@
         ]
     )
 ).build()
+
+csv_skip_before_header_scenario = (
+    TestScenarioBuilder()
+    .set_name("csv_skip_before_header")
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "file_type": "csv",
+                    "globs": ["*"],
+                    "validation_policy": "emit_record",
+                    "format": {
+                        "csv": {
+                            "filetype": "csv",
+                            "skip_rows_before_header": 2
+                        }
+                    }
+                }
+            ],
+            "start_date": "2023-06-04T03:54:07Z"
+        }
+    )
+    .set_files(
+        {
+            "a.csv": {
+                "contents": [
+                    ("skip_this", "skip_this"),
+                    ("skip_this_too", "skip_this_too"),
+                    ("col1", "col2"),
+                    ("val11", "val12"),
+                ],
+                "last_modified": "2023-06-05T03:54:07.000Z",
+            }
+        }
+    )
+    .set_file_type("csv")
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col1": {
+                                "type": "string"
+                            },
+                            "col2": {
+                                "type": "string"
+                            },
+                            "_ab_source_file_last_modified": {
+                                "type": "string"
+                            },
+                            "_ab_source_file_url": {
+                                "type": "string"
+                            },
+                        },
+                    },
+                    "name": "stream1",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                }
+            ]
+        }
+    )
+    .set_expected_records(
+        [
+            {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z",
+                      "_ab_source_file_url": "a.csv"}, "stream": "stream1"},
+        ]
+    )
+).build()
diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py
@@ -156,12 +156,13 @@
     csv_double_quote_is_set_scenario,
     csv_custom_delimiter_with_escape_char_scenario,
     csv_custom_delimiter_in_double_quotes_scenario,
-    csv_simple,
+    csv_simple_scenario,
     single_avro_scenario,
     avro_all_types_scenario,
     multiple_avro_combine_schema_scenario,
     multiple_streams_avro_scenario,
     avro_file_with_decimal_as_float_scenario,
+    csv_skip_before_header_scenario,
 ]