From 5db95bd182a093d926809315dfd718394dc8a5db Mon Sep 17 00:00:00 2001 From: prijendev Date: Tue, 26 Oct 2021 18:17:10 +0530 Subject: [PATCH 1/7] Initial commit for pagination failer --- tap_google_sheets/sync.py | 2 +- tests/test_google_sheets_pagination.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index fd06701..ad8a997 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -514,7 +514,7 @@ def sync(client, config, catalog, state): from_row=from_row, columns=columns, sheet_data_rows=sheet_data_rows) - if row_num < to_row: + if not sheet_data_rows: # If a whole blank page found, then stop looping. is_last_row = True # Process records, send batch of records to target diff --git a/tests/test_google_sheets_pagination.py b/tests/test_google_sheets_pagination.py index fcd4d3e..53fe919 100644 --- a/tests/test_google_sheets_pagination.py +++ b/tests/test_google_sheets_pagination.py @@ -43,9 +43,7 @@ def test_run(self): record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() - for stream in testable_streams.difference({ - 'sadsheet-pagination', # BUG TDL-14376 - }): + for stream in testable_streams: with self.subTest(stream=stream): our_fake_pk = 'id' @@ -61,10 +59,23 @@ def test_run(self): # verify that we can paginate with all fields selected self.assertGreater(record_count_by_stream.get(stream, 0), self.API_LIMIT) + record_count_sync = record_count_by_stream.get(stream, 0) + + if record_count_sync > self.API_LIMIT: + primary_keys_list_1 = actual_pk_list[:self.API_LIMIT] + primary_keys_list_2 = actual_pk_list[self.API_LIMIT:2*self.API_LIMIT] + + primary_keys_page_1 = set(primary_keys_list_1) + primary_keys_page_2 = set(primary_keys_list_2) + + # Verify by primary keys that data is unique for page + self.assertTrue( + primary_keys_page_1.isdisjoint(primary_keys_page_2)) + # verify the data for the "Pagination" stream is free of any duplicates or breaks by checking # our fake pk value ('id') # THIS ASSERTION CAN BE MADE BECAUSE WE SETUP DATA IN A SPECIFIC WAY. DONT COPY THIS - self.assertEqual(list(range(1, 239)), fake_pk_list) + self.assertEqual(list(range(1, 239)), fake_pk_list) # verify the data for the "Pagination" stream is free of any duplicates or breaks by checking # the actual primary key values (__sdc_row) From 136c84b27845b36e01e8eacb77e04a43be5d8e67 Mon Sep 17 00:00:00 2001 From: prijendev Date: Thu, 28 Oct 2021 13:59:15 +0530 Subject: [PATCH 2/7] Fixed pagination test cases --- tests/test_google_sheets_pagination.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/test_google_sheets_pagination.py b/tests/test_google_sheets_pagination.py index 53fe919..a4b61c5 100644 --- a/tests/test_google_sheets_pagination.py +++ b/tests/test_google_sheets_pagination.py @@ -25,6 +25,7 @@ def test_run(self): Verify that for each stream you can get multiple pages of data and that when all fields are selected more than the automatic fields are replicated. + Verify by primary keys that data is unique for page PREREQUISITE This test relies on the existence of a specific sheet with the name Pagination that has a column called 'id' with values 1 -> 238. @@ -43,6 +44,7 @@ def test_run(self): record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() + # Added back `sadsheet-pagination` to testable_streams as # BUG TDL-14376 resolved. for stream in testable_streams: with self.subTest(stream=stream): @@ -72,10 +74,25 @@ def test_run(self): self.assertTrue( primary_keys_page_1.isdisjoint(primary_keys_page_2)) + if stream == "sadsheet-pagination": + # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking + # our fake pk value ('id') + expected_pk_list = list(range(1, 238)) + expected_pk_list = [x for x in expected_pk_list if x not in [198, 199]] + self.assertEqual(expected_pk_list, fake_pk_list) + + # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking + # the actual primary key values (__sdc_row) + expected_pk_list = list(range(2, 239)) + expected_pk_list = [x for x in expected_pk_list if x not in [199, 200]] + self.assertEqual(expected_pk_list, actual_pk_list) + + continue + # verify the data for the "Pagination" stream is free of any duplicates or breaks by checking # our fake pk value ('id') # THIS ASSERTION CAN BE MADE BECAUSE WE SETUP DATA IN A SPECIFIC WAY. DONT COPY THIS - self.assertEqual(list(range(1, 239)), fake_pk_list) + self.assertEqual(list(range(1, 239)), fake_pk_list) # verify the data for the "Pagination" stream is free of any duplicates or breaks by checking # the actual primary key values (__sdc_row) From 93185b6860e828f00b7117a1979d32ee5d4a2375 Mon Sep 17 00:00:00 2001 From: prijendev Date: Thu, 2 Dec 2021 17:13:58 +0530 Subject: [PATCH 3/7] Added comments --- tests/test_google_sheets_pagination.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/test_google_sheets_pagination.py b/tests/test_google_sheets_pagination.py index a4b61c5..a1da3f3 100644 --- a/tests/test_google_sheets_pagination.py +++ b/tests/test_google_sheets_pagination.py @@ -73,18 +73,27 @@ def test_run(self): # Verify by primary keys that data is unique for page self.assertTrue( primary_keys_page_1.isdisjoint(primary_keys_page_2)) - + if stream == "sadsheet-pagination": # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking - # our fake pk value ('id') - expected_pk_list = list(range(1, 238)) + # our fake pk value ('id'). 'id' is column in sheet which contains unique index 1 to 237 + expected_pk_list = list(range(1, 238)) # Return 1 to 237 + + # Remove 198 and 199 from expected_pk_list because id by this number does not exist. + # That means those two rows are blank. expected_pk_list = [x for x in expected_pk_list if x not in [198, 199]] + + # This assertion can be made because the data is in a specific way. self.assertEqual(expected_pk_list, fake_pk_list) # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking # the actual primary key values (__sdc_row) - expected_pk_list = list(range(2, 239)) + expected_pk_list = list(range(2, 239)) # Return 2 to 238. Because 1st row contains header values itself. + + # Remove 199 and 200 from expected_pk_list because __sdc_row by this number does not exist. + # That means those two rows are blank. expected_pk_list = [x for x in expected_pk_list if x not in [199, 200]] + self.assertEqual(expected_pk_list, actual_pk_list) continue From ab60957b3ac01fef9e166acc5b00315a557d2729 Mon Sep 17 00:00:00 2001 From: prijendev Date: Thu, 9 Dec 2021 09:29:55 +0530 Subject: [PATCH 4/7] Added detail comment into the code --- tap_google_sheets/sync.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index ad8a997..8670887 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -514,6 +514,17 @@ def sync(client, config, catalog, state): from_row=from_row, columns=columns, sheet_data_rows=sheet_data_rows) + + # Here row_num is the addition of from_row and total records get in response(per batch). + # Condition row_num < to_row was checking that if records on the current page are less than expected(to_row) or not. + # If the condition returns true then it was breaking the loop. + # API does not return the last empty rows in response. + # For example, rows 199 and 200 are empty, and a total of 400 rows are there in the sheet. So, in 1st iteration, + # to_row = 200, from_row = 2, row_num = 2(from_row) + 197 = 199(1st row contain header value) + # So, the above condition become true and breaks the loop without syncing records from 201 to 400. + # sheet_data_rows is no of records return in the current page. If it's a whole blank page then stop looping. + # So, in the above case, it syncs records 201 to 400 also even if rows 199 and 200 are blank. + # Then when the next batch 401 to 600 is empty, it breaks the loop. if not sheet_data_rows: # If a whole blank page found, then stop looping. is_last_row = True From 6ee91a56243e8a4b963eee776989ba500b69b37a Mon Sep 17 00:00:00 2001 From: prijendev Date: Thu, 9 Dec 2021 18:41:16 +0530 Subject: [PATCH 5/7] Removed unnecessary comment --- tests/test_google_sheets_pagination.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_google_sheets_pagination.py b/tests/test_google_sheets_pagination.py index a1da3f3..3f06c3a 100644 --- a/tests/test_google_sheets_pagination.py +++ b/tests/test_google_sheets_pagination.py @@ -44,7 +44,6 @@ def test_run(self): record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() - # Added back `sadsheet-pagination` to testable_streams as # BUG TDL-14376 resolved. for stream in testable_streams: with self.subTest(stream=stream): From 25136fce609ca297256d63c2da2112fe2512ebae Mon Sep 17 00:00:00 2001 From: prijendev Date: Thu, 9 Dec 2021 19:11:25 +0530 Subject: [PATCH 6/7] Removed unnecessary assertion --- tests/test_google_sheets_pagination.py | 28 ++++---------------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/tests/test_google_sheets_pagination.py b/tests/test_google_sheets_pagination.py index 3f06c3a..f8b7c12 100644 --- a/tests/test_google_sheets_pagination.py +++ b/tests/test_google_sheets_pagination.py @@ -44,6 +44,7 @@ def test_run(self): record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() + # Added back `sadsheet-pagination` to testable_streams as # BUG TDL-14376 resolved. for stream in testable_streams: with self.subTest(stream=stream): @@ -60,39 +61,18 @@ def test_run(self): # verify that we can paginate with all fields selected self.assertGreater(record_count_by_stream.get(stream, 0), self.API_LIMIT) - record_count_sync = record_count_by_stream.get(stream, 0) - if record_count_sync > self.API_LIMIT: - primary_keys_list_1 = actual_pk_list[:self.API_LIMIT] - primary_keys_list_2 = actual_pk_list[self.API_LIMIT:2*self.API_LIMIT] - - primary_keys_page_1 = set(primary_keys_list_1) - primary_keys_page_2 = set(primary_keys_list_2) - - # Verify by primary keys that data is unique for page - self.assertTrue( - primary_keys_page_1.isdisjoint(primary_keys_page_2)) - if stream == "sadsheet-pagination": # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking - # our fake pk value ('id'). 'id' is column in sheet which contains unique index 1 to 237 - expected_pk_list = list(range(1, 238)) # Return 1 to 237 - - # Remove 198 and 199 from expected_pk_list because id by this number does not exist. - # That means those two rows are blank. + # our fake pk value ('id') + expected_pk_list = list(range(1, 238)) expected_pk_list = [x for x in expected_pk_list if x not in [198, 199]] - - # This assertion can be made because the data is in a specific way. self.assertEqual(expected_pk_list, fake_pk_list) # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking # the actual primary key values (__sdc_row) - expected_pk_list = list(range(2, 239)) # Return 2 to 238. Because 1st row contains header values itself. - - # Remove 199 and 200 from expected_pk_list because __sdc_row by this number does not exist. - # That means those two rows are blank. + expected_pk_list = list(range(2, 239)) expected_pk_list = [x for x in expected_pk_list if x not in [199, 200]] - self.assertEqual(expected_pk_list, actual_pk_list) continue From 2cf83bc9fc7b319f0d15509a4139eb9216b113e9 Mon Sep 17 00:00:00 2001 From: prijendev Date: Thu, 9 Dec 2021 20:27:29 +0530 Subject: [PATCH 7/7] Removed extra comment --- tests/test_google_sheets_pagination.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test_google_sheets_pagination.py b/tests/test_google_sheets_pagination.py index f8b7c12..82dd481 100644 --- a/tests/test_google_sheets_pagination.py +++ b/tests/test_google_sheets_pagination.py @@ -6,11 +6,6 @@ from base import GoogleSheetsBaseTest -# BUG_TDL-14376 | https://jira.talendforge.org/browse/TDL-14376 -# Expectation: Tap will pick up next page (200 rows) iff there is a non-null value on that page -# We observed a BUG where the tap does not paginate properly on sheets where the last two rows in a batch -# are empty values. The tap does not capture anything on the subsequent pages when this happens. -# class PaginationTest(GoogleSheetsBaseTest):