diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index fd06701..8670887 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -514,7 +514,18 @@ def sync(client, config, catalog, state): from_row=from_row, columns=columns, sheet_data_rows=sheet_data_rows) - if row_num < to_row: + + # Here row_num is the sum of from_row and the total number of records received in the response (per batch). + # The condition row_num < to_row checked whether the records on the current page were fewer than expected (to_row). + # If the condition returned true, it broke the loop. + # The API does not return trailing empty rows in the response. + # For example, rows 199 and 200 are empty, and there are a total of 400 rows in the sheet. So, in the 1st iteration, + # to_row = 200, from_row = 2, row_num = 2 (from_row) + 197 = 199 (the 1st row contains the header value). + # So, the above condition became true and broke the loop without syncing records 201 to 400. + # sheet_data_rows is the number of records returned in the current page. If it is a whole blank page, then stop looping. + # With this change, in the above case, records 201 to 400 are synced even though rows 199 and 200 are blank. + # Then, when the next batch (401 to 600) is empty, it breaks the loop. + if not sheet_data_rows: # If a whole blank page is found, then stop looping. is_last_row = True # Process records, send batch of records to target diff --git a/tests/test_google_sheets_pagination.py b/tests/test_google_sheets_pagination.py index fcd4d3e..82dd481 100644 --- a/tests/test_google_sheets_pagination.py +++ b/tests/test_google_sheets_pagination.py @@ -6,11 +6,6 @@ from base import GoogleSheetsBaseTest -# BUG_TDL-14376 | https://jira.talendforge.org/browse/TDL-14376 -# Expectation: Tap will pick up next page (200 rows) iff there is a non-null value on that page -# We observed a BUG where the tap does not paginate properly on sheets where the last two rows in a batch -# are empty values. 
The tap does not capture anything on the subsequent pages when this happens. -# class PaginationTest(GoogleSheetsBaseTest): @@ -25,6 +20,7 @@ def test_run(self): Verify that for each stream you can get multiple pages of data and that when all fields are selected more than the automatic fields are replicated. + Verify by primary keys that data is unique per page PREREQUISITE This test relies on the existence of a specific sheet with the name Pagination that has a column called 'id' with values 1 -> 238. @@ -43,9 +39,8 @@ def test_run(self): record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() - for stream in testable_streams.difference({ - 'sadsheet-pagination', # BUG TDL-14376 - }): + # Added `sadsheet-pagination` back to testable_streams, as BUG TDL-14376 is resolved. + for stream in testable_streams: with self.subTest(stream=stream): our_fake_pk = 'id' @@ -61,6 +56,22 @@ def test_run(self): # verify that we can paginate with all fields selected self.assertGreater(record_count_by_stream.get(stream, 0), self.API_LIMIT) + + if stream == "sadsheet-pagination": + # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking + # our fake pk value ('id') + expected_pk_list = list(range(1, 238)) + expected_pk_list = [x for x in expected_pk_list if x not in [198, 199]] + self.assertEqual(expected_pk_list, fake_pk_list) + + # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking + # the actual primary key values (__sdc_row) + expected_pk_list = list(range(2, 239)) + expected_pk_list = [x for x in expected_pk_list if x not in [199, 200]] + self.assertEqual(expected_pk_list, actual_pk_list) + + continue + # verify the data for the "Pagination" stream is free of any duplicates or breaks by checking # our fake pk value ('id') # THIS ASSERTION CAN BE MADE BECAUSE WE SETUP DATA IN A SPECIFIC WAY. DONT COPY THIS