From 67ea18232e5a69da879669e4f020b5a0b36a5f94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=90=E1=BA=B7ng=20Minh=20D=C5=A9ng?=
Date: Wed, 16 Dec 2020 05:47:48 +0700
Subject: [PATCH] chore: using f-string instead of string.format (#428)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* cleanup: string.format -> f-string

Signed-off-by: Đặng Minh Dũng

* add isort

Signed-off-by: Đặng Minh Dũng

* isort them all

Signed-off-by: Đặng Minh Dũng

* split long line in setup.cfg file

Signed-off-by: Đặng Minh Dũng
---
 databuilder/.editorconfig | 18 ++
 databuilder/Makefile | 11 +-
 databuilder/databuilder/__init__.py | 2 +-
 databuilder/databuilder/callback/call_back.py | 1 -
 .../extractor/athena_metadata_extractor.py | 12 +-
 .../extractor/base_bigquery_extractor.py | 7 +-
 .../databuilder/extractor/base_extractor.py | 2 +-
 .../base_postgres_metadata_extractor.py | 13 +-
 .../extractor/bigquery_metadata_extractor.py | 9 +-
 .../extractor/bigquery_usage_extractor.py | 22 +-
 .../extractor/bigquery_watermark_extractor.py | 17 +-
 .../extractor/cassandra_extractor.py | 12 +-
 .../databuilder/extractor/csv_extractor.py | 4 +-
 .../mode_dashboard_charts_batch_extractor.py | 11 +-
 .../mode_dashboard_charts_extractor.py | 11 +-
 .../mode_dashboard_executions_extractor.py | 8 +-
 .../mode_dashboard_extractor.py | 13 +-
 ...board_last_modified_timestamp_extractor.py | 15 +-
 ...rd_last_successful_executions_extractor.py | 7 +-
 .../mode_dashboard_owner_extractor.py | 4 +-
 .../mode_dashboard_queries_extractor.py | 16 +-
 .../mode_dashboard_usage_extractor.py | 2 +-
 .../mode_dashboard_user_extractor.py | 13 +-
 .../mode_analytics/mode_dashboard_utils.py | 17 +-
 .../redash/redash_dashboard_extractor.py | 32 +--
 .../redash/redash_dashboard_utils.py | 10 +-
 .../tableau/tableau_dashboard_extractor.py | 41 ++-
 ...bleau_dashboard_last_modified_extractor.py | 17 +-
 .../tableau_dashboard_query_extractor.py | 11 +-
 .../tableau_dashboard_table_extractor.py | 14 +-
 .../tableau/tableau_dashboard_utils.py | 11 +-
 .../tableau_external_table_extractor.py | 11 +-
 .../extractor/db2_metadata_extractor.py | 14 +-
 .../databuilder/extractor/db_api_extractor.py | 5 +-
 .../delta_lake_metadata_extractor.py | 51 ++--
 .../extractor/dremio_metadata_extractor.py | 11 +-
 .../extractor/druid_metadata_extractor.py | 13 +-
 .../databuilder/extractor/feast_extractor.py | 4 +-
 .../extractor/generic_extractor.py | 2 +-
 .../databuilder/extractor/glue_extractor.py | 10 +-
 .../hive_table_last_updated_extractor.py | 55 ++--
 .../hive_table_metadata_extractor.py | 15 +-
 .../extractor/kafka_source_extractor.py | 9 +-
 .../extractor/mssql_metadata_extractor.py | 30 +-
 .../extractor/mysql_metadata_extractor.py | 17 +-
 .../databuilder/extractor/neo4j_extractor.py | 10 +-
 .../extractor/neo4j_search_data_extractor.py | 3 +-
 .../extractor/postgres_metadata_extractor.py | 13 +-
 .../presto_view_metadata_extractor.py | 11 +-
 .../extractor/redshift_metadata_extractor.py | 13 +-
 .../extractor/restapi/rest_api_extractor.py | 9 +-
 .../extractor/snowflake_metadata_extractor.py | 27 +-
 .../snowflake_table_last_updated_extractor.py | 7 +-
 .../extractor/sql_alchemy_extractor.py | 4 +-
 .../user/bamboohr/bamboohr_user_extractor.py | 13 +-
 .../databuilder/filesystem/filesystem.py | 2 +-
 .../databuilder/filesystem/metadata.py | 3 +-
 databuilder/databuilder/job/job.py | 7 +-
 databuilder/databuilder/loader/base_loader.py | 2 +-
 .../loader/file_system_csv_loader.py | 7 +-
 .../loader/file_system_neo4j_csv_loader.py | 25 +-
 .../databuilder/loader/generic_loader.py | 4 +-
 databuilder/databuilder/models/application.py | 5 +-
 databuilder/databuilder/models/badge.py | 10 +-
 .../databuilder/models/column_usage_model.py | 22 +-
 .../models/dashboard/dashboard_chart.py | 23 +-
 .../models/dashboard/dashboard_execution.py | 20 +-
 .../dashboard/dashboard_last_modified.py | 20 +-
 .../models/dashboard/dashboard_metadata.py | 29 +-
 .../models/dashboard/dashboard_owner.py | 22 +-
 .../models/dashboard/dashboard_query.py | 21 +-
 .../models/dashboard/dashboard_table.py | 21 +-
 .../models/dashboard/dashboard_usage.py | 27 +-
 .../dashboard_elasticsearch_document.py | 4 +-
 .../databuilder/models/graph_serializable.py | 13 +-
 .../models/neo4j_es_last_updated.py | 4 +-
 .../databuilder/models/schema/schema.py | 12 +-
 .../databuilder/models/table_column_usage.py | 26 +-
 .../models/table_elasticsearch_document.py | 2 +-
 .../databuilder/models/table_last_updated.py | 10 +-
 .../databuilder/models/table_lineage.py | 15 +-
 .../databuilder/models/table_metadata.py | 32 +--
 databuilder/databuilder/models/table_owner.py | 21 +-
 .../databuilder/models/table_source.py | 18 +-
 databuilder/databuilder/models/table_stats.py | 4 +-
 databuilder/databuilder/models/user.py | 42 ++-
 databuilder/databuilder/models/watermark.py | 11 +-
 .../databuilder/publisher/base_publisher.py | 2 +-
 .../publisher/elasticsearch_publisher.py | 4 +-
 .../publisher/neo4j_csv_publisher.py | 64 ++---
 .../publisher/neo4j_preprocessor.py | 7 +-
 .../rest_api/base_rest_api_query.py | 5 +-
 .../mode_paginated_rest_api_query.py | 8 +-
 .../rest_api/rest_api_failure_handlers.py | 1 -
 .../databuilder/rest_api/rest_api_query.py | 20 +-
 .../serializers/neo4_serializer.py | 26 +-
 .../task/neo4j_staleness_removal_task.py | 40 +--
 databuilder/databuilder/task/task.py | 9 +-
 .../transformer/base_transformer.py | 4 +-
 .../transformer/bigquery_usage_transformer.py | 7 +-
 .../databuilder/transformer/dict_to_model.py | 2 +-
 .../transformer/generic_transformer.py | 2 +-
 .../regex_str_replace_transformer.py | 4 +-
 .../transformer/table_tag_transformer.py | 5 +-
 ...plate_variable_substitution_transformer.py | 2 +-
 .../transformer/timestamp_string_to_epoch.py | 5 +-
 databuilder/databuilder/utils/closer.py | 4 +-
 databuilder/example/dags/athena_sample_dag.py | 80 +++---
 databuilder/example/dags/hive_sample_dag.py | 81 +++---
 .../example/dags/postgres_sample_dag.py | 81 +++---
 .../example/dags/snowflake_sample_dag.py | 92 +++---
 .../scripts/sample_bigquery_metadata.py | 41 ++-
 .../example/scripts/sample_bq_usage_loader.py | 41 ++-
 .../example/scripts/sample_data_loader.py | 74 ++---
 .../example/scripts/sample_db2_data_loader.py | 85 +++---
 .../scripts/sample_deltalake_metadata.py | 45 ++-
 .../scripts/sample_dremio_data_loader.py | 82 +++---
 .../example/scripts/sample_feast_loader.py | 117 +++----
 .../example/scripts/sample_glue_loader.py | 60 ++--
 .../example/scripts/sample_mssql_metadata.py | 98 +++---
 .../example/scripts/sample_mysql_loader.py | 74 +++--
 .../example/scripts/sample_postgres_loader.py | 77 +++--
 .../scripts/sample_snowflake_data_loader.py | 94 +++---
 .../scripts/sample_tableau_data_loader.py | 65 ++---
 databuilder/requirements.txt | 5 +
 databuilder/setup.cfg | 30 +-
 databuilder/setup.py | 3 +-
 .../tests/unit/callback/test_call_back.py | 2 +-
 ...t_mode_dashboard_charts_batch_extractor.py | 6 +-
 .../redash/test_redash_dashboard_extractor.py | 14 +-
 .../redash/test_redash_dashboard_utils.py | 11 +-
 .../test_tableau_dashboard_extractor.py | 6 +-
 ...bleau_dashboard_last_modified_extractor.py | 11 +-
 .../test_tableau_dashboard_query_extractor.py | 6 +-
 .../test_tableau_dashboard_table_extractor.py | 6 +-
 .../restapi/test_rest_api_extractor.py | 5 +-
 .../test_athena_metadata_extractor.py | 125 ++++-----
 .../test_bigquery_metadata_extractor.py | 186 ++++++-----
 .../test_bigquery_usage_extractor.py | 262 +++++++++---------
 .../test_bigquery_watermark_extractor.py | 131 ++++++---
 .../extractor/test_cassandra_extractor.py | 6 +-
 .../unit/extractor/test_csv_extractor.py | 4 +-
 .../extractor/test_deltalake_extractor.py | 48 ++--
 .../test_dremio_metadata_extractor.py | 6 +-
 .../unit/extractor/test_feast_extractor.py | 24 +-
 .../unit/extractor/test_generic_extractor.py | 3 +-
 .../unit/extractor/test_glue_extractor.py | 2 +-
 .../test_hive_table_last_updated_extractor.py | 59 ++--
 .../test_hive_table_metadata_extractor.py | 23 +-
 .../extractor/test_kafka_source_extractor.py | 13 +-
 .../test_mssql_metadata_extractor.py | 49 ++--
 .../test_neo4j_es_last_updated_extractor.py | 4 +-
 .../unit/extractor/test_neo4j_extractor.py | 20 +-
 .../test_neo4j_search_data_extractor.py | 30 +-
 .../test_postgres_metadata_extractor.py | 29 +-
 .../test_presto_view_metadata_extractor.py | 7 +-
 .../test_redshift_metadata_extractor.py | 13 +-
 .../test_snowflake_metadata_extractor.py | 51 ++--
 ..._snowflake_table_last_updated_extractor.py | 43 ++-
 .../extractor/test_sql_alchemy_extractor.py | 2 +-
 .../test_sql_server_metadata_extractor.py | 46 ++-
 .../bamboohr/test_bamboohr_user_extractor.py | 5 +-
 .../loader/test_file_system_csv_loader.py | 4 +-
 ...t_file_system_elasticsearch_json_loader.py | 4 +-
 .../unit/loader/test_fs_neo4j_csv_loader.py | 36 +--
 .../tests/unit/loader/test_generic_loader.py | 2 +-
 .../models/dashboard/test_dashboard_chart.py | 7 +-
 .../dashboard/test_dashboard_last_modified.py | 7 +-
 .../models/dashboard/test_dashboard_owner.py | 6 +-
 .../models/dashboard/test_dashboard_query.py | 7 +-
 .../models/dashboard/test_dashboard_table.py | 6 +-
 .../models/dashboard/test_dashboard_usage.py | 7 +-
 .../tests/unit/models/test_application.py | 10 +-
 databuilder/tests/unit/models/test_badge.py | 7 +-
 .../unit/models/test_graph_serializable.py | 7 +-
 .../unit/models/test_neo4j_es_last_updated.py | 5 +-
 .../unit/models/test_table_column_usage.py | 2 +-
 .../unit/models/test_table_last_updated.py | 7 +-
 .../tests/unit/models/test_table_lineage.py | 17 +-
 .../tests/unit/models/test_table_owner.py | 13 +-
 .../tests/unit/models/test_table_source.py | 23 +-
 .../tests/unit/models/test_table_stats.py | 9 +-
 databuilder/tests/unit/models/test_user.py | 12 +-
 .../tests/unit/models/test_watermark.py | 45 +--
 .../publisher/test_elasticsearch_publisher.py | 4 +-
 .../publisher/test_neo4j_csv_publisher.py | 19 +-
 .../unit/publisher/test_neo4j_preprocessor.py | 2 +-
 .../tests/unit/publisher/test_publisher.py | 2 +-
 .../test_mode_paginated_rest_api_query.py | 2 +-
 .../test_rest_api_failure_handlers.py | 3 +-
 .../unit/rest_api/test_rest_api_query.py | 2 +-
 .../task/test_neo4j_staleness_removal_task.py | 222 ++++++---------
 databuilder/tests/unit/test_base_job.py | 29 +-
 .../test_bigquery_usage_transformer.py | 2 +-
 .../test_dict_to_model_transformer.py | 2 +-
 .../test_regex_str_replace_transformer.py | 7 +-
 .../test_remove_field_transformer.py | 2 +-
 .../transformer/test_table_tag_transformer.py | 2 +-
 ...plate_variable_substitution_transformer.py | 5 +-
 ...t_timestamp_string_to_epoch_transformer.py | 4 +-
 200 files changed, 2131 insertions(+), 2449 deletions(-)
 create mode
100644 databuilder/.editorconfig diff --git a/databuilder/.editorconfig b/databuilder/.editorconfig new file mode 100644 index 0000000000..ae13a73fdb --- /dev/null +++ b/databuilder/.editorconfig @@ -0,0 +1,18 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +indent_size = 2 +indent_style = space +insert_final_newline = true +trim_trailing_whitespace = true + +[*.py] +indent_size = 4 + +[{*.mk,*.make,Makefile}] +indent_style = tab + +[*.md] +trim_trailing_whitespace = false diff --git a/databuilder/Makefile b/databuilder/Makefile index 1eab3cd1f7..15ae86c89d 100644 --- a/databuilder/Makefile +++ b/databuilder/Makefile @@ -14,6 +14,13 @@ lint: mypy: mypy . -.PHONY: test -test: test_unit lint mypy +.PHONY: isort +isort: + isort . + +.PHONY: isort_check +isort_check: + isort ./ --check --diff +.PHONY: test +test: test_unit lint mypy isort_check diff --git a/databuilder/databuilder/__init__.py b/databuilder/databuilder/__init__.py index 0a76ea6ece..6f1d8f2fb4 100644 --- a/databuilder/databuilder/__init__.py +++ b/databuilder/databuilder/__init__.py @@ -3,7 +3,7 @@ import abc -from pyhocon import ConfigTree, ConfigFactory +from pyhocon import ConfigFactory, ConfigTree class Scoped(object, metaclass=abc.ABCMeta): diff --git a/databuilder/databuilder/callback/call_back.py b/databuilder/databuilder/callback/call_back.py index 4631280ce7..905c70e668 100644 --- a/databuilder/databuilder/callback/call_back.py +++ b/databuilder/databuilder/callback/call_back.py @@ -3,7 +3,6 @@ import abc import logging - from typing import List, Optional LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/athena_metadata_extractor.py b/databuilder/databuilder/extractor/athena_metadata_extractor.py index e9d9406366..9ef75e31a7 100644 --- a/databuilder/databuilder/extractor/athena_metadata_extractor.py +++ b/databuilder/databuilder/extractor/athena_metadata_extractor.py @@ -3,15 +3,17 @@ import logging from collections import namedtuple +from itertools import groupby +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata -from itertools import groupby +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -45,14 +47,14 @@ class AthenaMetadataExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(AthenaMetadataExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(AthenaMetadataExtractor.CATALOG_KEY)) + self._cluster = conf.get_string(AthenaMetadataExtractor.CATALOG_KEY) self.sql_stmt = AthenaMetadataExtractor.SQL_STATEMENT.format( where_clause_suffix=conf.get_string(AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY), catalog_source=self._cluster ) - LOGGER.info('SQL for Athena metadata: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for Athena metadata: %s', self.sql_stmt) self._alchemy_extractor = SQLAlchemyExtractor() sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\ diff --git a/databuilder/databuilder/extractor/base_bigquery_extractor.py b/databuilder/databuilder/extractor/base_bigquery_extractor.py index b0abc6dc2a..511f2563d3 100644 --- 
a/databuilder/databuilder/extractor/base_bigquery_extractor.py +++ b/databuilder/databuilder/extractor/base_bigquery_extractor.py @@ -4,17 +4,18 @@ import json import logging from collections import namedtuple +from typing import ( + Any, Dict, Iterator, List, +) import google.oauth2.service_account import google_auth_httplib2 -from googleapiclient.discovery import build import httplib2 +from googleapiclient.discovery import build from pyhocon import ConfigTree -from typing import Any, Dict, Iterator, List from databuilder.extractor.base_extractor import Extractor - DatasetRef = namedtuple('DatasetRef', ['datasetId', 'projectId']) TableKey = namedtuple('TableKey', ['schema', 'table_name']) diff --git a/databuilder/databuilder/extractor/base_extractor.py b/databuilder/databuilder/extractor/base_extractor.py index 4de7a84372..7d9d52da9d 100644 --- a/databuilder/databuilder/extractor/base_extractor.py +++ b/databuilder/databuilder/extractor/base_extractor.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import abc +from typing import Any from pyhocon import ConfigTree -from typing import Any from databuilder import Scoped diff --git a/databuilder/databuilder/extractor/base_postgres_metadata_extractor.py b/databuilder/databuilder/extractor/base_postgres_metadata_extractor.py index 2c1d77da62..74d669dbc2 100644 --- a/databuilder/databuilder/extractor/base_postgres_metadata_extractor.py +++ b/databuilder/databuilder/extractor/base_postgres_metadata_extractor.py @@ -4,16 +4,17 @@ import abc import logging from collections import namedtuple +from itertools import groupby +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata -from itertools import groupby - +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -47,7 +48,7 @@ def get_sql_statement(self, use_catalog_as_cluster_name: bool, where_clause_suff def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(BasePostgresMetadataExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(BasePostgresMetadataExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(BasePostgresMetadataExtractor.CLUSTER_KEY) self._database = conf.get_string(BasePostgresMetadataExtractor.DATABASE_KEY, default='postgres') @@ -62,7 +63,7 @@ def init(self, conf: ConfigTree) -> None: self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL) - LOGGER.info('SQL for postgres metadata: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for postgres metadata: %s', self.sql_stmt) self._alchemy_extractor.init(sql_alch_conf) self._extract_iter: Union[None, Iterator] = None diff --git a/databuilder/databuilder/extractor/bigquery_metadata_extractor.py b/databuilder/databuilder/extractor/bigquery_metadata_extractor.py index 715fb69bd8..258370111a 100644 --- a/databuilder/databuilder/extractor/bigquery_metadata_extractor.py +++ b/databuilder/databuilder/extractor/bigquery_metadata_extractor.py @@ -2,13 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import ( + Any, Dict, List, Set, cast, +) from pyhocon import ConfigTree -from typing import cast, Any, Dict, List, Set from 
databuilder.extractor.base_bigquery_extractor import BaseBigQueryExtractor, DatasetRef -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata - +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata LOGGER = logging.getLogger(__name__) @@ -91,7 +92,7 @@ def _iterate_over_cols(self, cols: List[ColumnMetadata], total_cols: int) -> int: if len(parent) > 0: - col_name = '{parent}.{field}'.format(parent=parent, field=column['name']) + col_name = f'{parent}.{column["name"]}' else: col_name = column['name'] diff --git a/databuilder/databuilder/extractor/bigquery_usage_extractor.py b/databuilder/databuilder/extractor/bigquery_usage_extractor.py index d9e116fe93..193365d3bc 100644 --- a/databuilder/databuilder/extractor/bigquery_usage_extractor.py +++ b/databuilder/databuilder/extractor/bigquery_usage_extractor.py @@ -1,14 +1,16 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from collections import namedtuple -from datetime import date, timedelta import logging import re +from collections import namedtuple +from datetime import date, timedelta from time import sleep +from typing import ( + Any, Dict, Iterator, List, Optional, Tuple, +) from pyhocon import ConfigTree -from typing import Any, Iterator, Dict, Optional, Tuple, List from databuilder.extractor.base_bigquery_extractor import BaseBigQueryExtractor @@ -47,7 +49,7 @@ def _count_usage(self) -> None: # noqa: C901 for entry in self._retrieve_records(): count += 1 if count % self.pagesize == 0: - LOGGER.info('Aggregated {} records'.format(count)) + LOGGER.info(f'Aggregated %i records', count) if entry is None: continue @@ -93,9 +95,7 @@ def _create_records(self, refResources: List[dict], resourcesProcessed: int, ema return if len(refResources) != resourcesProcessed: - LOGGER.warn( - 'The number of tables listed in job {job_id} is not consistent' - .format(job_id=jobId)) + LOGGER.warning(f'The number of tables listed in job {jobId} is not consistent') return for refResource in refResources: @@ -117,17 +117,15 @@ def _retrieve_records(self) -> Iterator[Optional[Dict]]: :return: Provides a record or None if no more to extract """ body = { - 'resourceNames': [ - 'projects/{project_id}'.format(project_id=self.project_id) - ], + 'resourceNames': [f'projects/{self.project_id}'], 'pageSize': self.pagesize, 'filter': 'resource.type="bigquery_resource" AND ' 'protoPayload.methodName="jobservice.jobcompleted" AND ' - 'timestamp >= "{timestamp}"'.format(timestamp=self.timestamp) + f'timestamp >= "{self.timestamp}"' } for page in self._page_over_results(body): for entry in page['entries']: - yield(entry) + yield entry def extract(self) -> Optional[Tuple[Any, int]]: try: diff --git a/databuilder/databuilder/extractor/bigquery_watermark_extractor.py b/databuilder/databuilder/extractor/bigquery_watermark_extractor.py index be0402197c..412501c3dd 100644 --- a/databuilder/databuilder/extractor/bigquery_watermark_extractor.py +++ b/databuilder/databuilder/extractor/bigquery_watermark_extractor.py @@ -1,14 +1,15 @@ # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -from collections import namedtuple - -import logging import datetime +import logging import textwrap +from collections import namedtuple +from typing import ( + Any, Dict, Iterator, List, Tuple, Union, +) from pyhocon import ConfigTree -from typing import Any, Dict, Iterator, List, Tuple, Union from databuilder.extractor.base_bigquery_extractor import BaseBigQueryExtractor, DatasetRef from databuilder.models.watermark import Watermark @@ -70,7 +71,7 @@ def _retrieve_tables(self, 'bigquery', tableRef['datasetId'], prefix, - '__table__={partition_id}'.format(partition_id=td['low']), + f'__table__={td["low"]}', part_type="low_watermark", cluster=tableRef['projectId'] ) @@ -80,7 +81,7 @@ def _retrieve_tables(self, 'bigquery', tableRef['datasetId'], prefix, - '__table__={partition_id}'.format(partition_id=td['high']), + f'__table__={td["high"]}', part_type="high_watermark", cluster=tableRef['projectId'] ) @@ -129,7 +130,7 @@ def _get_partition_watermarks(self, 'bigquery', tableRef['datasetId'], tableRef['tableId'], - '{field}={partition_id}'.format(field=field, partition_id=low.partition_id), + f'{field}={low.partition_id}', part_type="low_watermark", cluster=tableRef['projectId'] ) @@ -140,7 +141,7 @@ def _get_partition_watermarks(self, 'bigquery', tableRef['datasetId'], tableRef['tableId'], - '{field}={partition_id}'.format(field=field, partition_id=high.partition_id), + f'{field}={high.partition_id}', part_type="high_watermark", cluster=tableRef['projectId'] ) diff --git a/databuilder/databuilder/extractor/cassandra_extractor.py b/databuilder/databuilder/extractor/cassandra_extractor.py index b776428096..81b6ddc88d 100644 --- a/databuilder/databuilder/extractor/cassandra_extractor.py +++ b/databuilder/databuilder/extractor/cassandra_extractor.py @@ -1,14 +1,16 @@ # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -from cassandra.cluster import Cluster -import cassandra.metadata +from typing import ( + Dict, Iterator, Union, +) +import cassandra.metadata +from cassandra.cluster import Cluster from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict from databuilder.extractor.base_extractor import Extractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class CassandraExtractor(Extractor): @@ -38,7 +40,7 @@ class CassandraExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(CassandraExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(CassandraExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(CassandraExtractor.CLUSTER_KEY) self._filter = conf.get(CassandraExtractor.FILTER_FUNCTION_KEY) ips = conf.get_list(CassandraExtractor.IPS_KEY) kwargs = conf.get(CassandraExtractor.KWARGS_KEY) diff --git a/databuilder/databuilder/extractor/csv_extractor.py b/databuilder/databuilder/extractor/csv_extractor.py index 297bfe2ff8..68e3816178 100644 --- a/databuilder/databuilder/extractor/csv_extractor.py +++ b/databuilder/databuilder/extractor/csv_extractor.py @@ -4,13 +4,13 @@ import csv import importlib from collections import defaultdict +from typing import Any from pyhocon import ConfigTree -from typing import Any from databuilder.extractor.base_extractor import Extractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata from databuilder.models.badge import Badge, BadgeMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class CsvExtractor(Extractor): diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/batch/mode_dashboard_charts_batch_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/batch/mode_dashboard_charts_batch_extractor.py index 379cd69304..d8cb3c1b7a 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/batch/mode_dashboard_charts_batch_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/batch/mode_dashboard_charts_batch_extractor.py @@ -2,19 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from pyhocon import ConfigTree, ConfigFactory from typing import Any +from pyhocon import ConfigFactory, ConfigTree + from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor +from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_constants import ORGANIZATION from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils +from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery from databuilder.rest_api.rest_api_query import RestApiQuery -from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed -from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_constants import ORGANIZATION -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS - +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_charts_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_charts_extractor.py index 6d9d3cac44..d276fdef4c 100644 
--- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_charts_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_charts_extractor.py @@ -2,19 +2,20 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from pyhocon import ConfigTree, ConfigFactory from typing import Any, List +from pyhocon import ConfigFactory, ConfigTree + from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery from databuilder.rest_api.rest_api_query import RestApiQuery from databuilder.transformer.base_transformer import ChainedTransformer, Transformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.template_variable_substitution_transformer import \ - TemplateVariableSubstitutionTransformer, FIELD_NAME, TEMPLATE +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.template_variable_substitution_transformer import ( + FIELD_NAME, TEMPLATE, TemplateVariableSubstitutionTransformer, +) LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_executions_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_executions_extractor.py index e3c4d9d385..12be4bda1e 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_executions_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_executions_extractor.py @@ -2,18 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from pyhocon import ConfigTree, ConfigFactory from typing import Any, List +from pyhocon import ConfigFactory, ConfigTree + from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery from databuilder.rest_api.rest_api_query import RestApiQuery from databuilder.transformer.base_transformer import ChainedTransformer, Transformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.timestamp_string_to_epoch import FIELD_NAME, TimestampStringToEpoch LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_extractor.py index f29b1337e3..1a86e971bd 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_extractor.py @@ -2,20 +2,21 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from pyhocon import ConfigTree, ConfigFactory from typing import Any, List +from pyhocon import ConfigFactory, ConfigTree + from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from 
databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery from databuilder.rest_api.rest_api_query import RestApiQuery from databuilder.transformer.base_transformer import ChainedTransformer, Transformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.template_variable_substitution_transformer import \ - TemplateVariableSubstitutionTransformer, TEMPLATE, FIELD_NAME as VAR_FIELD_NAME -from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.template_variable_substitution_transformer import ( + FIELD_NAME as VAR_FIELD_NAME, TEMPLATE, TemplateVariableSubstitutionTransformer, +) +from databuilder.transformer.timestamp_string_to_epoch import FIELD_NAME, TimestampStringToEpoch LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_modified_timestamp_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_modified_timestamp_extractor.py index 57b4ef5fd0..0ef0d20308 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_modified_timestamp_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_modified_timestamp_extractor.py @@ -3,16 +3,17 @@ import logging -from pyhocon import ConfigTree, ConfigFactory +from pyhocon import ConfigFactory, ConfigTree -from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_executions_extractor import \ - ModeDashboardExecutionsExtractor +from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_executions_extractor import ( + ModeDashboardExecutionsExtractor, +) from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery from databuilder.rest_api.rest_api_query import RestApiQuery -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.timestamp_string_to_epoch import FIELD_NAME, TimestampStringToEpoch LOGGER = logging.getLogger(__name__) @@ -30,9 +31,9 @@ def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback( ConfigFactory.from_dict({ STATIC_RECORD_DICT: {'product': 'mode'}, - '{}.{}'.format(DictToModel().get_scope(), MODEL_CLASS): + f'{DictToModel().get_scope()}.{MODEL_CLASS}': 'databuilder.models.dashboard.dashboard_last_modified.DashboardLastModifiedTimestamp', - '{}.{}'.format(TimestampStringToEpoch().get_scope(), FIELD_NAME): + f'{TimestampStringToEpoch().get_scope()}.{FIELD_NAME}': 'last_modified_timestamp' }) ) diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_successful_executions_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_successful_executions_extractor.py index 6fa947527d..ac4ed5fd1a 100644 --- 
a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_successful_executions_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_successful_executions_extractor.py @@ -3,10 +3,11 @@ import logging -from pyhocon import ConfigTree, ConfigFactory +from pyhocon import ConfigFactory, ConfigTree -from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_executions_extractor import \ - ModeDashboardExecutionsExtractor +from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_executions_extractor import ( + ModeDashboardExecutionsExtractor, +) from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT from databuilder.models.dashboard.dashboard_execution import DashboardExecution diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_owner_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_owner_extractor.py index 32ec6b204c..a616ea0dad 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_owner_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_owner_extractor.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from pyhocon import ConfigTree, ConfigFactory from typing import Any +from pyhocon import ConfigFactory, ConfigTree + from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.extractor.restapi.rest_api_extractor import MODEL_CLASS diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_queries_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_queries_extractor.py index 3d6f9f9fee..c3debcff6b 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_queries_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_queries_extractor.py @@ -2,21 +2,23 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from pyhocon import ConfigTree, ConfigFactory from typing import Any, List +from pyhocon import ConfigFactory, ConfigTree + from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery from databuilder.rest_api.rest_api_query import RestApiQuery from databuilder.transformer.base_transformer import ChainedTransformer, Transformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.regex_str_replace_transformer import RegexStrReplaceTransformer, \ - REGEX_REPLACE_TUPLE_LIST, ATTRIBUTE_NAME -from databuilder.transformer.template_variable_substitution_transformer import \ - TemplateVariableSubstitutionTransformer, TEMPLATE, FIELD_NAME +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.regex_str_replace_transformer import ( + ATTRIBUTE_NAME, REGEX_REPLACE_TUPLE_LIST, RegexStrReplaceTransformer, +) +from databuilder.transformer.template_variable_substitution_transformer import ( + FIELD_NAME, TEMPLATE, 
TemplateVariableSubstitutionTransformer, +) LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_usage_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_usage_extractor.py index 6aa7e9178e..a84b7343db 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_usage_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_usage_extractor.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Any from pyhocon import ConfigTree -from typing import Any from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_user_extractor.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_user_extractor.py index 4a2a5227d0..45fd4f52d6 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_user_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_user_extractor.py @@ -2,22 +2,23 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Any, List -from pyhocon import ConfigTree, ConfigFactory +from pyhocon import ConfigFactory, ConfigTree from requests.auth import HTTPBasicAuth -from typing import Any, List from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor -from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_constants import ORGANIZATION, MODE_ACCESS_TOKEN, \ - MODE_PASSWORD_TOKEN +from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_constants import ( + MODE_ACCESS_TOKEN, MODE_PASSWORD_TOKEN, ORGANIZATION, +) from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed from databuilder.rest_api.rest_api_failure_handlers import HttpFailureSkipOnStatus from databuilder.rest_api.rest_api_query import RestApiQuery from databuilder.transformer.base_transformer import ChainedTransformer, Transformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.remove_field_transformer import RemoveFieldTransformer, FIELD_NAMES +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.remove_field_transformer import FIELD_NAMES, RemoveFieldTransformer LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_utils.py b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_utils.py index 68b1a3b0d9..ecd963395a 100644 --- a/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_utils.py +++ b/databuilder/databuilder/extractor/dashboard/mode_analytics/mode_dashboard_utils.py @@ -1,16 +1,19 @@ # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -from pyhocon import ConfigTree, ConfigFactory -from requests.auth import HTTPBasicAuth from typing import Any, Dict +from pyhocon import ConfigFactory, ConfigTree +from requests.auth import HTTPBasicAuth + from databuilder import Scoped -from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_constants import ORGANIZATION, MODE_ACCESS_TOKEN, \ - MODE_PASSWORD_TOKEN, MODE_BEARER_TOKEN -from databuilder.extractor.restapi.rest_api_extractor import RestAPIExtractor, REST_API_QUERY, STATIC_RECORD_DICT -from databuilder.rest_api.base_rest_api_query import BaseRestApiQuery -from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed +from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_constants import ( + MODE_ACCESS_TOKEN, MODE_BEARER_TOKEN, MODE_PASSWORD_TOKEN, ORGANIZATION, +) +from databuilder.extractor.restapi.rest_api_extractor import ( + REST_API_QUERY, STATIC_RECORD_DICT, RestAPIExtractor, +) +from databuilder.rest_api.base_rest_api_query import BaseRestApiQuery, RestApiQuerySeed from databuilder.rest_api.rest_api_query import RestApiQuery diff --git a/databuilder/databuilder/extractor/dashboard/redash/redash_dashboard_extractor.py b/databuilder/databuilder/extractor/dashboard/redash/redash_dashboard_extractor.py index 4a3e4d9a93..630c8c0b99 100644 --- a/databuilder/databuilder/extractor/dashboard/redash/redash_dashboard_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/redash/redash_dashboard_extractor.py @@ -2,26 +2,29 @@ # SPDX-License-Identifier: Apache-2.0 import importlib +from typing import ( + Any, Dict, Iterator, Optional, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Any, Dict, Iterator, Optional -from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata +from databuilder.extractor.base_extractor import Extractor +from databuilder.extractor.dashboard.redash.redash_dashboard_utils import ( + RedashPaginatedRestApiQuery, generate_dashboard_description, get_auth_headers, get_text_widgets, + get_visualization_widgets, sort_widgets, +) +from databuilder.extractor.restapi.rest_api_extractor import REST_API_QUERY, RestAPIExtractor +from databuilder.models.dashboard.dashboard_chart import DashboardChart from databuilder.models.dashboard.dashboard_last_modified import DashboardLastModifiedTimestamp +from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata from databuilder.models.dashboard.dashboard_owner import DashboardOwner from databuilder.models.dashboard.dashboard_query import DashboardQuery from databuilder.models.dashboard.dashboard_table import DashboardTable -from databuilder.models.dashboard.dashboard_chart import DashboardChart from databuilder.models.table_metadata import TableMetadata -from databuilder.extractor.base_extractor import Extractor -from databuilder.rest_api.rest_api_query import RestApiQuery from databuilder.rest_api.base_rest_api_query import EmptyRestApiQuerySeed -from databuilder.extractor.restapi.rest_api_extractor import RestAPIExtractor, REST_API_QUERY -from databuilder.extractor.dashboard.redash.redash_dashboard_utils import \ - get_auth_headers, get_text_widgets, get_visualization_widgets, sort_widgets, \ - generate_dashboard_description, RedashPaginatedRestApiQuery +from databuilder.rest_api.rest_api_query import RestApiQuery from databuilder.transformer.base_transformer import ChainedTransformer -from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME as 
TS_FIELD_NAME +from databuilder.transformer.timestamp_string_to_epoch import FIELD_NAME as TS_FIELD_NAME, TimestampStringToEpoch class TableRelationData: @@ -35,12 +38,10 @@ def __init__(self, cluster: str, schema: str, name: str) -> None: - self._data = {'db': database, 'cluster': cluster, 'schema': schema, 'tbl': name} @property def key(self) -> str: - return TableMetadata.TABLE_KEY_FORMAT.format(**self._data) @@ -126,8 +127,7 @@ def _get_extract_iter(self) -> Iterator[Any]: 'dashboard_name': record['dashboard_name'], 'dashboard_url': - '{redash}/dashboards/{id}' - .format(redash=self._redash_base_url, id=record['dashboard_id']), + f'{self._redash_base_url}/dashboards/{record["dashboard_id"]}', 'created_timestamp': record['created_timestamp'] } @@ -195,7 +195,7 @@ def _build_restapi_query(self) -> RestApiQuery: dashes_query = RedashPaginatedRestApiQuery( query_to_join=EmptyRestApiQuerySeed(), - url='{redash_api}/dashboards'.format(redash_api=self._api_base_url), + url=f'{self._api_base_url}/dashboards', params=self._get_default_api_query_params(), json_path='results[*].[id,name,slug,created_at,updated_at,is_archived,is_draft,user]', field_names=[ @@ -207,7 +207,7 @@ def _build_restapi_query(self) -> RestApiQuery: return RestApiQuery( query_to_join=dashes_query, - url='{redash_api}/dashboards/{{dashboard_id}}'.format(redash_api=self._api_base_url), + url=f'{self._api_base_url}/dashboards/{{dashboard_id}}', params=self._get_default_api_query_params(), json_path='widgets', field_names=['widgets'], diff --git a/databuilder/databuilder/extractor/dashboard/redash/redash_dashboard_utils.py b/databuilder/databuilder/extractor/dashboard/redash/redash_dashboard_utils.py index 81c8402a6b..11a5f480bf 100644 --- a/databuilder/databuilder/extractor/dashboard/redash/redash_dashboard_utils.py +++ b/databuilder/databuilder/extractor/dashboard/redash/redash_dashboard_utils.py @@ -1,7 +1,9 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, Iterable, List, Tuple +from typing import ( + Any, Dict, Iterable, List, Tuple, +) from databuilder.rest_api.rest_api_query import RestApiQuery @@ -31,7 +33,7 @@ def query_id(self) -> int: @property def query_relative_url(self) -> str: - return '/queries/{id}'.format(id=self.query_id) + return f'/queries/{self.query_id}' @property def query_name(self) -> str: @@ -131,7 +133,7 @@ def get_visualization_widgets(widgets: Iterable[Dict[str, Any]]) -> List[RedashV def get_auth_headers(api_key: str) -> Dict[str, str]: - return {'Authorization': 'Key {}'.format(api_key)} + return {'Authorization': f'Key {api_key}'} def generate_dashboard_description(text_widgets: List[RedashTextWidget], @@ -147,7 +149,7 @@ def generate_dashboard_description(text_widgets: List[RedashTextWidget], if len(text_widgets) > 0: return '\n\n'.join([w.text for w in text_widgets]) elif len(viz_widgets) > 0: - query_list = '\n'.join(set(['- {}'.format(v.query_name) for v in set(viz_widgets)])) + query_list = '\n'.join(set([f'- {v.query_name}' for v in set(viz_widgets)])) return 'A dashboard containing the following queries:\n\n' + query_list return 'This dashboard appears to be empty!' 
diff --git a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_extractor.py b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_extractor.py index 165ac6772a..4bbac12c59 100644 --- a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_extractor.py @@ -2,21 +2,22 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Dict, Iterator, List +from typing import ( + Any, Dict, Iterator, List, +) from pyhocon import ConfigFactory, ConfigTree import databuilder.extractor.dashboard.tableau.tableau_dashboard_constants as const from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import TableauGraphQLApiExtractor,\ - TableauDashboardUtils +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardUtils, TableauGraphQLApiExtractor, +) from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT -from databuilder.transformer.base_transformer import ChainedTransformer -from databuilder.transformer.base_transformer import Transformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME - +from databuilder.transformer.base_transformer import ChainedTransformer, Transformer +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.timestamp_string_to_epoch import FIELD_NAME, TimestampStringToEpoch LOGGER = logging.getLogger(__name__) @@ -37,21 +38,15 @@ def execute(self) -> Iterator[Dict[str, Any]]: workbooks_data = [workbook for workbook in response['workbooks'] if workbook['projectName'] not in self._conf.get_list(TableauGraphQLApiMetadataExtractor.EXCLUDED_PROJECTS)] - + base_url = self._conf.get(TableauGraphQLApiMetadataExtractor.TABLEAU_BASE_URL) for workbook in workbooks_data: data = { 'dashboard_group': workbook['projectName'], 'dashboard_name': TableauDashboardUtils.sanitize_workbook_name(workbook['name']), 'description': workbook.get('description', ''), 'created_timestamp': workbook['createdAt'], - 'dashboard_group_url': '{}/#/projects/{}'.format( - self._conf.get(TableauGraphQLApiMetadataExtractor.TABLEAU_BASE_URL), - workbook['projectVizportalUrlId'] - ), - 'dashboard_url': '{}/#/workbooks/{}/views'.format( - self._conf.get(TableauGraphQLApiMetadataExtractor.TABLEAU_BASE_URL), - workbook['vizportalUrlId'] - ), + 'dashboard_group_url': f'{base_url}/#/projects/{workbook["projectVizportalUrlId"]}', + 'dashboard_url': f'{base_url}/#/workbooks/{workbook["vizportalUrlId"]}/views', 'cluster': self._conf.get_string(TableauGraphQLApiMetadataExtractor.CLUSTER) } yield data @@ -126,13 +121,9 @@ def _build_extractor(self) -> TableauGraphQLApiMetadataExtractor: :return: A TableauGraphQLApiMetadataExtractor that provides core dashboard metadata. 
""" extractor = TableauGraphQLApiMetadataExtractor() - tableau_extractor_conf = \ - Scoped.get_scoped_conf(self._conf, extractor.get_scope())\ - .with_fallback(self._conf)\ - .with_fallback(ConfigFactory.from_dict({TableauGraphQLApiExtractor.QUERY: self.query, - STATIC_RECORD_DICT: {'product': 'tableau'} - } - ) - ) + tableau_extractor_conf = Scoped.get_scoped_conf(self._conf, extractor.get_scope()) \ + .with_fallback(self._conf) \ + .with_fallback(ConfigFactory.from_dict({TableauGraphQLApiExtractor.QUERY: self.query, + STATIC_RECORD_DICT: {'product': 'tableau'}})) extractor.init(conf=tableau_extractor_conf) return extractor diff --git a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_last_modified_extractor.py b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_last_modified_extractor.py index 861b64914b..ce165f686d 100644 --- a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_last_modified_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_last_modified_extractor.py @@ -2,21 +2,22 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Dict, Iterator, List +from typing import ( + Any, Dict, Iterator, List, +) from pyhocon import ConfigFactory, ConfigTree import databuilder.extractor.dashboard.tableau.tableau_dashboard_constants as const from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import TableauGraphQLApiExtractor,\ - TableauDashboardUtils +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardUtils, TableauGraphQLApiExtractor, +) from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT -from databuilder.transformer.base_transformer import ChainedTransformer -from databuilder.transformer.base_transformer import Transformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME - +from databuilder.transformer.base_transformer import ChainedTransformer, Transformer +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.timestamp_string_to_epoch import FIELD_NAME, TimestampStringToEpoch LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_query_extractor.py b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_query_extractor.py index 7396e4763d..2d6632d9c6 100644 --- a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_query_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_query_extractor.py @@ -2,18 +2,21 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Dict, Iterator +from typing import ( + Any, Dict, Iterator, +) from pyhocon import ConfigFactory, ConfigTree import databuilder.extractor.dashboard.tableau.tableau_dashboard_constants as const from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import TableauGraphQLApiExtractor,\ - TableauDashboardUtils +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardUtils, TableauGraphQLApiExtractor, +) from databuilder.extractor.restapi.rest_api_extractor 
import STATIC_RECORD_DICT from databuilder.transformer.base_transformer import ChainedTransformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_table_extractor.py b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_table_extractor.py index ce4714a81d..cf9bcf559c 100644 --- a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_table_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_table_extractor.py @@ -2,20 +2,22 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Dict, Iterator +from typing import ( + Any, Dict, Iterator, +) from pyhocon import ConfigFactory, ConfigTree import databuilder.extractor.dashboard.tableau.tableau_dashboard_constants as const from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import TableauGraphQLApiExtractor,\ - TableauDashboardUtils +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardUtils, TableauGraphQLApiExtractor, +) from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT -from databuilder.transformer.base_transformer import ChainedTransformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS - from databuilder.models.table_metadata import TableMetadata +from databuilder.transformer.base_transformer import ChainedTransformer +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_utils.py b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_utils.py index 861dc45f79..72b971b2ea 100644 --- a/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_utils.py +++ b/databuilder/databuilder/extractor/dashboard/tableau/tableau_dashboard_utils.py @@ -2,10 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 import json -import requests import re -from typing import Any, Dict, Iterator, Optional +from typing import ( + Any, Dict, Iterator, Optional, +) +import requests from pyhocon import ConfigTree import databuilder.extractor.dashboard.tableau.tableau_dashboard_constants as const @@ -169,10 +171,7 @@ def _authenticate(self) -> str: See https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_concepts_versions.htm for details or ask your Tableau server administrator. 
""" - self._auth_url = "{api_base_url}/api/{api_version}/auth/signin".format( - api_base_url=self._api_base_url, - api_version=self._api_version - ) + self._auth_url = f"{self._api_base_url}/api/{self._api_version}/auth/signin" payload = json.dumps({ 'credentials': { diff --git a/databuilder/databuilder/extractor/dashboard/tableau/tableau_external_table_extractor.py b/databuilder/databuilder/extractor/dashboard/tableau/tableau_external_table_extractor.py index d02cd12269..744c7eb97e 100644 --- a/databuilder/databuilder/extractor/dashboard/tableau/tableau_external_table_extractor.py +++ b/databuilder/databuilder/extractor/dashboard/tableau/tableau_external_table_extractor.py @@ -2,17 +2,20 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Dict, Iterator +from typing import ( + Any, Dict, Iterator, +) from pyhocon import ConfigFactory, ConfigTree import databuilder.extractor.dashboard.tableau.tableau_dashboard_constants as const from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import TableauGraphQLApiExtractor,\ - TableauDashboardUtils +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardUtils, TableauGraphQLApiExtractor, +) from databuilder.transformer.base_transformer import ChainedTransformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/db2_metadata_extractor.py b/databuilder/databuilder/extractor/db2_metadata_extractor.py index e7760a1c66..6ad8149fbc 100644 --- a/databuilder/databuilder/extractor/db2_metadata_extractor.py +++ b/databuilder/databuilder/extractor/db2_metadata_extractor.py @@ -3,15 +3,17 @@ import logging from collections import namedtuple +from itertools import groupby +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata -from itertools import groupby +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -54,9 +56,9 @@ class Db2MetadataExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(Db2MetadataExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(Db2MetadataExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(Db2MetadataExtractor.CLUSTER_KEY) - cluster_source = "'{}'".format(self._cluster) + cluster_source = f"'{self._cluster}'" self._database = conf.get_string(Db2MetadataExtractor.DATABASE_KEY, default='db2') @@ -71,7 +73,7 @@ def init(self, conf: ConfigTree) -> None: self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL) - LOGGER.info('SQL for Db2 metadata: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for Db2 metadata: %s', self.sql_stmt) self._alchemy_extractor.init(sql_alch_conf) self._extract_iter: Union[None, Iterator] = None diff --git a/databuilder/databuilder/extractor/db_api_extractor.py b/databuilder/databuilder/extractor/db_api_extractor.py index 928cf9d5ac..33447277b1 100644 --- 
a/databuilder/databuilder/extractor/db_api_extractor.py +++ b/databuilder/databuilder/extractor/db_api_extractor.py @@ -3,13 +3,12 @@ import importlib import logging -from typing import Iterable, Any +from typing import Any, Iterable from pyhocon import ConfigTree from databuilder.extractor.base_extractor import Extractor - LOGGER = logging.getLogger(__name__) @@ -47,7 +46,7 @@ def _execute_query(self) -> Iterable[Any]: Use cursor to execute the {sql} :return: """ - LOGGER.info('Executing query: \n{}'.format(self.sql)) + LOGGER.info('Executing query: \n%s', self.sql) self.cursor.execute(self.sql) return self.cursor.fetchall() diff --git a/databuilder/databuilder/extractor/delta_lake_metadata_extractor.py b/databuilder/databuilder/extractor/delta_lake_metadata_extractor.py index 0501d7c8bc..7bb518f46b 100644 --- a/databuilder/databuilder/extractor/delta_lake_metadata_extractor.py +++ b/databuilder/databuilder/extractor/delta_lake_metadata_extractor.py @@ -1,18 +1,22 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from datetime import datetime + +import concurrent.futures import logging from collections import namedtuple +from datetime import datetime +from typing import ( # noqa: F401 + Dict, Iterator, List, Optional, Union, +) -from databuilder.extractor.base_extractor import Extractor -from databuilder.models.table_last_updated import TableLastUpdated -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata from pyhocon import ConfigFactory, ConfigTree # noqa: F401 from pyspark.sql import SparkSession from pyspark.sql.catalog import Table from pyspark.sql.utils import AnalysisException -from typing import Iterator, Union, List, Dict, Optional # noqa: F401 -import concurrent.futures + +from databuilder.extractor.base_extractor import Extractor +from databuilder.models.table_last_updated import TableLastUpdated +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -35,15 +39,15 @@ def set_is_partition(self, is_partition: bool) -> None: def __eq__(self, other: object) -> bool: if not isinstance(other, ScrapedColumnMetadata): return False - return self.name == other.name and \ - self.data_type == other.data_type and \ - self.description == other.description and \ - self.sort_order == other.sort_order and \ - self.is_partition == other.is_partition and \ - self.attributes == other.attributes + return (self.name == other.name and + self.data_type == other.data_type and + self.description == other.description and + self.sort_order == other.sort_order and + self.is_partition == other.is_partition and + self.attributes == other.attributes) def __repr__(self) -> str: - return "{0}:{1}".format(self.name, self.data_type) + return f'{self.name}:{self.data_type}' # TODO consider deprecating this for using TableMetadata directly @@ -108,7 +112,7 @@ def is_delta_table(self) -> bool: return False def __repr__(self) -> str: - return "{schema}.{table}".format(schema=self.schema, table=self.table) + return f'{self.schema}.{self.table}' class DeltaLakeMetadataExtractor(Extractor): @@ -164,13 +168,13 @@ def _get_extract_iter(self) -> Iterator[Union[TableMetadata, TableLastUpdated, N - last updated information """ if self.schema_list: - LOGGER.info("working on {}".format(self.schema_list)) + LOGGER.info("working on %s", self.schema_list) tables = self.get_all_tables(self.schema_list) else: LOGGER.info("fetching all schemas") - LOGGER.info("Excluding: 
{}".format(self.exclude_list)) + LOGGER.info("Excluding: %s", self.exclude_list) schemas = self.get_schemas(self.exclude_list) - LOGGER.info("working on {}".format(schemas)) + LOGGER.info("working on %s", schemas) tables = self.get_all_tables(schemas) # TODO add the programmatic information as well? # TODO add watermarks @@ -179,7 +183,7 @@ def _get_extract_iter(self) -> Iterator[Union[TableMetadata, TableLastUpdated, N if not scraped_table: continue if self.delta_tables_only and not scraped_table.is_delta_table(): - LOGGER.info("Skipping none delta table {}".format(scraped_table.table)) + LOGGER.info("Skipping none delta table %s", scraped_table.table) continue else: yield self.create_table_metadata(scraped_table) @@ -245,7 +249,7 @@ def scrape_table(self, table: Table) -> Optional[ScrapedTableMetadata]: def scrape_table_detail(self, table_name: str) -> Optional[Dict]: try: - table_details_df = self.spark.sql("describe detail {0}".format(table_name)) + table_details_df = self.spark.sql(f"describe detail {table_name}") table_detail = table_details_df.collect()[0] return table_detail.asDict() except Exception as e: @@ -256,8 +260,7 @@ def scrape_view_detail(self, view_name: str) -> Optional[Dict]: # TODO the blanket try catches need to be changed describeExtendedOutput = [] try: - describeExtendedOutput = self.spark.sql("describe extended {view_name}" - .format(view_name=view_name)).collect() + describeExtendedOutput = self.spark.sql(f"describe extended {view_name}").collect() except Exception as e: LOGGER.error(e) return None @@ -277,7 +280,7 @@ def fetch_columns(self, schema: str, table: str) -> List[ScrapedColumnMetadata]: in the general case cannot rely on spark.catalog.listColumns.''' raw_columns = [] try: - raw_columns = self.spark.sql("describe {0}.{1}".format(schema, table)).collect() + raw_columns = self.spark.sql(f"describe {schema}.{table}").collect() except AnalysisException as e: LOGGER.error(e) return raw_columns @@ -301,10 +304,10 @@ def fetch_columns(self, schema: str, table: str) -> List[ScrapedColumnMetadata]: sort_order += 1 else: if row['data_type'] in parsed_columns: - LOGGER.debug("Adding partition column table for {0}".format(row['data_type'])) + LOGGER.debug(f"Adding partition column table for {row['data_type']}") parsed_columns[row['data_type']].set_is_partition(True) elif row['col_name'] in parsed_columns: - LOGGER.debug("Adding partition column table for {0}".format(row['col_name'])) + LOGGER.debug(f"Adding partition column table for {row['col_name']}") parsed_columns[row['col_name']].set_is_partition(True) return list(parsed_columns.values()) diff --git a/databuilder/databuilder/extractor/dremio_metadata_extractor.py b/databuilder/databuilder/extractor/dremio_metadata_extractor.py index 3875de0171..71ac03651e 100644 --- a/databuilder/databuilder/extractor/dremio_metadata_extractor.py +++ b/databuilder/databuilder/extractor/dremio_metadata_extractor.py @@ -2,17 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 +import logging from collections import namedtuple from itertools import groupby -import logging -from typing import Iterator, Union, Dict, Any +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree from pyodbc import connect from databuilder.extractor.base_extractor import Extractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata - +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -108,7 +109,7 @@ 
def init(self, conf: ConfigTree) -> None: where_stmt=where_stmt ) - LOGGER.info('SQL for Dremio metadata: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for Dremio metadata: %s', self.sql_stmt) self._pyodbc_cursor = connect( conf.get_string(DremioMetadataExtractor.DREMIO_DRIVER_KEY), diff --git a/databuilder/databuilder/extractor/druid_metadata_extractor.py b/databuilder/databuilder/extractor/druid_metadata_extractor.py index f84eb53300..b2f7766ba1 100644 --- a/databuilder/databuilder/extractor/druid_metadata_extractor.py +++ b/databuilder/databuilder/extractor/druid_metadata_extractor.py @@ -2,18 +2,19 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from collections import namedtuple import textwrap +from collections import namedtuple +from itertools import groupby +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata -from itertools import groupby - +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -45,7 +46,7 @@ class DruidMetadataExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(DruidMetadataExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(DruidMetadataExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(DruidMetadataExtractor.CLUSTER_KEY) self.sql_stmt = DruidMetadataExtractor.SQL_STATEMENT.format( where_clause_suffix=conf.get_string(DruidMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY, diff --git a/databuilder/databuilder/extractor/feast_extractor.py b/databuilder/databuilder/extractor/feast_extractor.py index 7a17e60a17..624f135971 100644 --- a/databuilder/databuilder/extractor/feast_extractor.py +++ b/databuilder/databuilder/extractor/feast_extractor.py @@ -1,8 +1,8 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from typing import Iterator, Union from datetime import datetime +from typing import Iterator, Union import yaml from feast import Client @@ -10,7 +10,7 @@ from pyhocon import ConfigFactory, ConfigTree from databuilder.extractor.base_extractor import Extractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class FeastExtractor(Extractor): diff --git a/databuilder/databuilder/extractor/generic_extractor.py b/databuilder/databuilder/extractor/generic_extractor.py index e2de3b3e9e..979849816e 100644 --- a/databuilder/databuilder/extractor/generic_extractor.py +++ b/databuilder/databuilder/extractor/generic_extractor.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import importlib -from typing import Iterable, Any +from typing import Any, Iterable from pyhocon import ConfigTree diff --git a/databuilder/databuilder/extractor/glue_extractor.py b/databuilder/databuilder/extractor/glue_extractor.py index 7e6ccb5d1b..6b89831d98 100644 --- a/databuilder/databuilder/extractor/glue_extractor.py +++ b/databuilder/databuilder/extractor/glue_extractor.py @@ -1,13 +1,15 @@ # Copyright Contributors to the Amundsen project. 
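Note on the recurring change above, where self._cluster = '{}'.format(conf.get_string(...)) becomes self._cluster = conf.get_string(...): pyhocon's get_string() already returns a str, so wrapping it in another format call was an identity operation. A small standalone sketch (the config values are made up):

from pyhocon import ConfigFactory

conf = ConfigFactory.from_dict({'cluster': 'gold'})

# get_string() already yields a str, so the extra '{}'.format(...) was a no-op.
assert conf.get_string('cluster') == '{}'.format(conf.get_string('cluster')) == 'gold'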
# SPDX-License-Identifier: Apache-2.0 -import boto3 +from typing import ( + Any, Dict, Iterator, List, Union, +) +import boto3 from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any, List from databuilder.extractor.base_extractor import Extractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class GlueExtractor(Extractor): @@ -21,7 +23,7 @@ class GlueExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(GlueExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(GlueExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(GlueExtractor.CLUSTER_KEY) self._filters = conf.get(GlueExtractor.FILTER_KEY) self._glue = boto3.client('glue') self._extract_iter: Union[None, Iterator] = None diff --git a/databuilder/databuilder/extractor/hive_table_last_updated_extractor.py b/databuilder/databuilder/extractor/hive_table_last_updated_extractor.py index 36411a6009..248e414833 100644 --- a/databuilder/databuilder/extractor/hive_table_last_updated_extractor.py +++ b/databuilder/databuilder/extractor/hive_table_last_updated_extractor.py @@ -6,10 +6,12 @@ from datetime import datetime from functools import wraps from multiprocessing.pool import ThreadPool +from typing import ( + Any, Iterator, List, Union, +) from pyhocon import ConfigFactory, ConfigTree from pytz import UTC -from typing import Iterator, Union, Any, List from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor @@ -36,10 +38,10 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return f(*args, **kwargs) except Exception as e: if is_client_side_error(e): - LOGGER.info('Invalid metadata. Skipping. args: {}, kwargs: {}. error: {}'.format(args, kwargs, e)) + LOGGER.info('Invalid metadata. Skipping. args: %s, kwargs: %s. 
error: %s', args, kwargs, e) return None else: - LOGGER.exception('Unknown exception while processing args: {}, kwargs: {}'.format(args, kwargs)) + LOGGER.exception('Unknown exception while processing args: %s, kwargs: %s', args, kwargs) return None return wrapper @@ -109,12 +111,12 @@ def init(self, conf: ConfigTree) -> None: self._conf = conf.with_fallback(HiveTableLastUpdatedExtractor.DEFAULT_CONFIG) pool_size = self._conf.get_int(HiveTableLastUpdatedExtractor.FS_WORKER_POOL_SIZE) - LOGGER.info('Using thread pool size: {}'.format(pool_size)) + LOGGER.info('Using thread pool size: %s', pool_size) self._fs_worker_pool = ThreadPool(processes=pool_size) self._fs_worker_timeout = self._conf.get_int(HiveTableLastUpdatedExtractor.FS_WORKER_TIMEOUT_SEC) - LOGGER.info('Using thread timeout: {} seconds'.format(self._fs_worker_timeout)) + LOGGER.info('Using thread timeout: %s seconds', self._fs_worker_timeout) - self._cluster = '{}'.format(self._conf.get_string(HiveTableLastUpdatedExtractor.CLUSTER_KEY)) + self._cluster = self._conf.get_string(HiveTableLastUpdatedExtractor.CLUSTER_KEY) self._partitioned_table_extractor = self._get_partitioned_table_sql_alchemy_extractor() self._non_partitioned_table_extractor = self._get_non_partitioned_table_sql_alchemy_extractor() @@ -134,7 +136,7 @@ def _get_partitioned_table_sql_alchemy_extractor(self) -> Extractor: where_clause_suffix=self._conf.get_string( HiveTableLastUpdatedExtractor.PARTITIONED_TABLE_WHERE_CLAUSE_SUFFIX_KEY, ' ')) - LOGGER.info('SQL for partitioned table against Hive metastore: {}'.format(sql_stmt)) + LOGGER.info('SQL for partitioned table against Hive metastore: %s', sql_stmt) sql_alchemy_extractor = SQLAlchemyExtractor() sql_alchemy_conf = Scoped.get_scoped_conf(self._conf, sql_alchemy_extractor.get_scope()) \ @@ -162,7 +164,7 @@ def _get_non_partitioned_table_sql_alchemy_extractor(self) -> Extractor: sql_stmt = HiveTableLastUpdatedExtractor.NON_PARTITIONED_TABLE_SQL_STATEMENT.format( where_clause_suffix=where_clause_suffix) - LOGGER.info('SQL for non-partitioned table against Hive metastore: {}'.format(sql_stmt)) + LOGGER.info('SQL for non-partitioned table against Hive metastore: %s', sql_stmt) sql_alchemy_extractor = SQLAlchemyExtractor() sql_alchemy_conf = Scoped.get_scoped_conf(self._conf, sql_alchemy_extractor.get_scope()) \ @@ -211,10 +213,10 @@ def _get_extract_iter(self) -> Iterator[TableLastUpdated]: while non_partitioned_tbl_row: count += 1 if count % 10 == 0: - LOGGER.info('Processed {} non-partitioned tables'.format(count)) + LOGGER.info('Processed %i non-partitioned tables', count) if not non_partitioned_tbl_row['location']: - LOGGER.warning('Skipping as no storage location available. {}'.format(non_partitioned_tbl_row)) + LOGGER.warning('Skipping as no storage location available. 
%s', non_partitioned_tbl_row) non_partitioned_tbl_row = self._non_partitioned_table_extractor.extract() continue @@ -223,7 +225,7 @@ def _get_extract_iter(self) -> Iterator[TableLastUpdated]: table=non_partitioned_tbl_row['table_name'], schema=non_partitioned_tbl_row['schema'], storage_location=non_partitioned_tbl_row['location']) - LOGGER.info('Elapsed: {} seconds'.format(time.time() - start)) + LOGGER.info('Elapsed: %s seconds', time.time() - start) if table_last_updated: yield table_last_updated @@ -247,41 +249,35 @@ def _get_last_updated_datetime_from_filesystem(self, """ if LOGGER.isEnabledFor(logging.DEBUG): - LOGGER.debug('Getting last updated datetime for {}.{} in {}'.format(schema, table, storage_location)) + LOGGER.debug(f'Getting last updated datetime for {schema}.{table} in {storage_location}') last_updated = OLDEST_TIMESTAMP paths = self._ls(storage_location) if not paths: - LOGGER.info('{schema}.{table} does not have any file in path {path}. Skipping' - .format(schema=schema, table=table, path=storage_location)) + LOGGER.info(f'{schema}.{table} does not have any file in path {storage_location}. Skipping') return None - LOGGER.info('Fetching metadata for {schema}.{table} of {num_files} files' - .format(schema=schema, table=table, num_files=len(paths))) + LOGGER.info(f'Fetching metadata for {schema}.{table} of {len(paths)} files') - if self._last_updated_filecheck_threshold > 0 and len(paths) > self._last_updated_filecheck_threshold: - LOGGER.info('Skipping {schema}.{table} due to too many files. {len_files} files exist in {location}' - .format(schema=schema, table=table, len_files=len(paths), location=storage_location)) + if 0 < self._last_updated_filecheck_threshold < len(paths): + LOGGER.info(f'Skipping {schema}.{table} due to too many files. ' + f'{len(paths)} files exist in {storage_location}') return None time_stamp_futures = \ - [self._fs_worker_pool.apply_async(self._get_timestamp, (path, schema, table, storage_location)) for path in - paths] + [self._fs_worker_pool.apply_async(self._get_timestamp, (path, schema, table, storage_location)) + for path in paths] for time_stamp_future in time_stamp_futures: try: time_stamp = time_stamp_future.get(timeout=self._fs_worker_timeout) if time_stamp: last_updated = max(time_stamp, last_updated) - except Exception as e: - if e.__class__.__name__ == 'TimeoutError': - LOGGER.warning('Timed out on paths {} . Skipping'.format(paths)) - else: - raise e + except TimeoutError: + LOGGER.warning('Timed out on paths %s . Skipping', paths) if last_updated == OLDEST_TIMESTAMP: - LOGGER.info('No timestamp was derived on {schema}.{table} from location: {location} . Skipping'.format( - schema=schema, table=table, location=storage_location)) + LOGGER.info(f'No timestamp was derived on {schema}.{table} from location: {storage_location} . Skipping') return None result = TableLastUpdated(table_name=table, @@ -317,8 +313,7 @@ def _get_timestamp(self, :return: """ if not path: - LOGGER.info('Empty path {path} on {schema}.{table} in storage location {location} . Skipping' - .format(path=path, schema=schema, table=table, location=storage_location)) + LOGGER.info(f'Empty path {path} on {schema}.{table} in storage location {storage_location} . 
Skipping') + return None if not self._fs.is_file(path): diff --git a/databuilder/databuilder/extractor/hive_table_metadata_extractor.py b/databuilder/databuilder/extractor/hive_table_metadata_extractor.py index 47e7d99a64..8a4675bfa7 100644 --- a/databuilder/databuilder/extractor/hive_table_metadata_extractor.py +++ b/databuilder/databuilder/extractor/hive_table_metadata_extractor.py @@ -3,18 +3,19 @@ import logging from collections import namedtuple +from itertools import groupby +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any from sqlalchemy.engine.url import make_url from databuilder import Scoped -from databuilder.extractor.table_metadata_constants import PARTITION_BADGE from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata -from itertools import groupby - +from databuilder.extractor.table_metadata_constants import PARTITION_BADGE +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -94,7 +95,7 @@ class HiveTableMetadataExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(HiveTableMetadataExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(HiveTableMetadataExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(HiveTableMetadataExtractor.CLUSTER_KEY) self._alchemy_extractor = SQLAlchemyExtractor() @@ -104,7 +105,7 @@ def init(self, conf: ConfigTree) -> None: self.sql_stmt = conf.get_string(HiveTableMetadataExtractor.EXTRACT_SQL, default=default_sql) - LOGGER.info('SQL for hive metastore: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for hive metastore: %s', self.sql_stmt) sql_alch_conf = sql_alch_conf.with_fallback(ConfigFactory.from_dict( {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt})) diff --git a/databuilder/databuilder/extractor/kafka_source_extractor.py b/databuilder/databuilder/extractor/kafka_source_extractor.py index 68383200e8..99600cd8d2 100644 --- a/databuilder/databuilder/extractor/kafka_source_extractor.py +++ b/databuilder/databuilder/extractor/kafka_source_extractor.py @@ -1,20 +1,21 @@ # Copyright Contributors to the Amundsen project.
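Note on the logging conversions above: the LOGGER calls keep %-style placeholders and pass the arguments separately instead of using f-strings, so the message is only interpolated if the record is actually emitted; '%s' is used because it renders any argument, whereas a numeric placeholder such as '%i' only accepts numbers. A short, self-contained sketch (not part of the patch):

import logging

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

sql_stmt = 'SELECT 1'
count = 10

LOGGER.info('SQL for hive metastore: %s', sql_stmt)        # interpolated lazily, only when INFO is enabled
LOGGER.info('Processed %i non-partitioned tables', count)  # '%i' is fine for integer arguments
LOGGER.debug('never formatted: %s', sql_stmt)              # DEBUG is disabled here, so no formatting cost at all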
# SPDX-License-Identifier: Apache-2.0 -from datetime import datetime, timedelta import importlib import logging +from datetime import datetime, timedelta +from typing import Any -from confluent_kafka import Consumer, KafkaException, KafkaError +from confluent_kafka import ( + Consumer, KafkaError, KafkaException, +) from pyhocon import ConfigTree -from typing import Any from databuilder import Scoped from databuilder.callback.call_back import Callback from databuilder.extractor.base_extractor import Extractor from databuilder.transformer.base_transformer import Transformer - LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/extractor/mssql_metadata_extractor.py b/databuilder/databuilder/extractor/mssql_metadata_extractor.py index e0dee5a983..ce89752a09 100644 --- a/databuilder/databuilder/extractor/mssql_metadata_extractor.py +++ b/databuilder/databuilder/extractor/mssql_metadata_extractor.py @@ -3,15 +3,17 @@ import logging from collections import namedtuple +from itertools import groupby +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata -from itertools import groupby +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema_name', 'table_name']) @@ -76,13 +78,12 @@ class MSSQLMetadataExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(MSSQLMetadataExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format( - conf.get_string(MSSQLMetadataExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(MSSQLMetadataExtractor.CLUSTER_KEY) if conf.get_bool(MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): cluster_source = "DB_NAME()" else: - cluster_source = "'{}'".format(self._cluster) + cluster_source = f"'{self._cluster}'" self._database = conf.get_string( MSSQLMetadataExtractor.DATABASE_KEY, @@ -91,11 +92,11 @@ def init(self, conf: ConfigTree) -> None: config_where_clause = conf.get_string( MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY) - logging.info("Crawling for Schemas %s", config_where_clause) + LOGGER.info("Crawling for Schemas %s", config_where_clause) - if len(config_where_clause) > 0: - where_clause_suffix = MSSQLMetadataExtractor\ - .DEFAULT_WHERE_CLAUSE_VALUE\ + if config_where_clause: + where_clause_suffix = MSSQLMetadataExtractor \ + .DEFAULT_WHERE_CLAUSE_VALUE \ .format(schemas=config_where_clause) else: where_clause_suffix = '' @@ -105,15 +106,12 @@ def init(self, conf: ConfigTree) -> None: cluster_source=cluster_source ) - LOGGER.info('SQL for MS SQL Metadata: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for MS SQL Metadata: %s', self.sql_stmt) self._alchemy_extractor = SQLAlchemyExtractor() - sql_alch_conf = Scoped\ + sql_alch_conf = Scoped \ .get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \ - .with_fallback( - ConfigFactory.from_dict({ - SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}) - ) + .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt})) self._alchemy_extractor.init(sql_alch_conf) self._extract_iter: Union[None, Iterator] = None diff --git a/databuilder/databuilder/extractor/mysql_metadata_extractor.py b/databuilder/databuilder/extractor/mysql_metadata_extractor.py 
index dd3586d526..ab58a7f31d 100644 --- a/databuilder/databuilder/extractor/mysql_metadata_extractor.py +++ b/databuilder/databuilder/extractor/mysql_metadata_extractor.py @@ -3,16 +3,17 @@ import logging from collections import namedtuple +from itertools import groupby +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata -from itertools import groupby - +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -60,12 +61,12 @@ class MysqlMetadataExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(MysqlMetadataExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(MysqlMetadataExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(MysqlMetadataExtractor.CLUSTER_KEY) if conf.get_bool(MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): cluster_source = "c.table_catalog" else: - cluster_source = "'{}'".format(self._cluster) + cluster_source = f"'{self._cluster}'" self._database = conf.get_string(MysqlMetadataExtractor.DATABASE_KEY, default='mysql') @@ -75,12 +76,12 @@ def init(self, conf: ConfigTree) -> None: ) self._alchemy_extractor = SQLAlchemyExtractor() - sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\ + sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \ .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt})) self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL) - LOGGER.info('SQL for mysql metadata: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for mysql metadata: %s', self.sql_stmt) self._alchemy_extractor.init(sql_alch_conf) self._extract_iter: Union[None, Iterator] = None diff --git a/databuilder/databuilder/extractor/neo4j_extractor.py b/databuilder/databuilder/extractor/neo4j_extractor.py index c0ae424428..4e6565b29a 100644 --- a/databuilder/databuilder/extractor/neo4j_extractor.py +++ b/databuilder/databuilder/extractor/neo4j_extractor.py @@ -3,11 +3,13 @@ import importlib import logging -from typing import Any, Iterator, Union +from typing import ( + Any, Iterator, Union, +) -from pyhocon import ConfigTree, ConfigFactory -from neo4j import GraphDatabase import neo4j +from neo4j import GraphDatabase +from pyhocon import ConfigFactory, ConfigTree from databuilder.extractor.base_extractor import Extractor @@ -78,7 +80,7 @@ def _execute_query(self, tx: Any) -> Any: """ Create an iterator to execute sql. 
""" - LOGGER.info('Executing query {}'.format(self.cypher_query)) + LOGGER.info('Executing query %s', self.cypher_query) result = tx.run(self.cypher_query) return result diff --git a/databuilder/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/databuilder/extractor/neo4j_search_data_extractor.py index 959cc04f4f..de1457cf8a 100644 --- a/databuilder/databuilder/extractor/neo4j_search_data_extractor.py +++ b/databuilder/databuilder/extractor/neo4j_search_data_extractor.py @@ -171,6 +171,5 @@ def _add_publish_tag_filter(self, publish_tag: str, cypher_query: str) -> str: else: if not hasattr(self, 'entity'): self.entity = 'table' - publish_tag_filter = """WHERE {entity}.published_tag = '{tag}'""".format(entity=self.entity, - tag=publish_tag) + publish_tag_filter = f"WHERE {self.entity}.published_tag = '{publish_tag}'" return cypher_query.format(publish_tag_filter=publish_tag_filter) diff --git a/databuilder/databuilder/extractor/postgres_metadata_extractor.py b/databuilder/databuilder/extractor/postgres_metadata_extractor.py index 3ed9385513..db380ccfc5 100644 --- a/databuilder/databuilder/extractor/postgres_metadata_extractor.py +++ b/databuilder/databuilder/extractor/postgres_metadata_extractor.py @@ -1,8 +1,11 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 +from typing import ( # noqa: F401 + Any, Dict, Iterator, Union, +) + from pyhocon import ConfigFactory, ConfigTree # noqa: F401 -from typing import Iterator, Union, Dict, Any # noqa: F401 from databuilder.extractor.base_postgres_metadata_extractor import BasePostgresMetadataExtractor @@ -12,12 +15,11 @@ class PostgresMetadataExtractor(BasePostgresMetadataExtractor): Extracts Postgres table and column metadata from underlying meta store database using SQLAlchemyExtractor """ - def get_sql_statement(self, use_catalog_as_cluster_name, where_clause_suffix): - # type: (bool, str) -> str + def get_sql_statement(self, use_catalog_as_cluster_name: bool, where_clause_suffix: str) -> str: if use_catalog_as_cluster_name: cluster_source = "c.table_catalog" else: - cluster_source = "'{}'".format(self._cluster) + cluster_source = f"'{self._cluster}'" return """ SELECT @@ -38,6 +40,5 @@ def get_sql_statement(self, use_catalog_as_cluster_name, where_clause_suffix): where_clause_suffix=where_clause_suffix, ) - def get_scope(self): - # type: () -> str + def get_scope(self) -> str: return 'extractor.postgres_metadata' diff --git a/databuilder/databuilder/extractor/presto_view_metadata_extractor.py b/databuilder/databuilder/extractor/presto_view_metadata_extractor.py index 45cb2a40d9..0a34527740 100644 --- a/databuilder/databuilder/extractor/presto_view_metadata_extractor.py +++ b/databuilder/databuilder/extractor/presto_view_metadata_extractor.py @@ -4,15 +4,16 @@ import base64 import json import logging +from typing import ( + Iterator, List, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, List, Union from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata - +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata LOGGER = logging.getLogger(__name__) @@ -47,12 +48,12 @@ class PrestoViewMetadataExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(PrestoViewMetadataExtractor.DEFAULT_CONFIG) - self._cluster = 
'{}'.format(conf.get_string(PrestoViewMetadataExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(PrestoViewMetadataExtractor.CLUSTER_KEY) self.sql_stmt = PrestoViewMetadataExtractor.SQL_STATEMENT.format( where_clause_suffix=conf.get_string(PrestoViewMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY)) - LOGGER.info('SQL for hive metastore: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for hive metastore: %s', self.sql_stmt) self._alchemy_extractor = SQLAlchemyExtractor() sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\ diff --git a/databuilder/databuilder/extractor/redshift_metadata_extractor.py b/databuilder/databuilder/extractor/redshift_metadata_extractor.py index 9e2732f39d..c711ac8fd1 100644 --- a/databuilder/databuilder/extractor/redshift_metadata_extractor.py +++ b/databuilder/databuilder/extractor/redshift_metadata_extractor.py @@ -1,8 +1,11 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 +from typing import ( # noqa: F401 + Any, Dict, Iterator, Union, +) + from pyhocon import ConfigFactory, ConfigTree # noqa: F401 -from typing import Iterator, Union, Dict, Any # noqa: F401 from databuilder.extractor.base_postgres_metadata_extractor import BasePostgresMetadataExtractor @@ -15,12 +18,11 @@ class RedshiftMetadataExtractor(BasePostgresMetadataExtractor): we need to join the INFORMATION_SCHEMA data against the function PG_GET_LATE_BINDING_VIEW_COLS(). """ - def get_sql_statement(self, use_catalog_as_cluster_name, where_clause_suffix): - # type: (bool, str) -> str + def get_sql_statement(self, use_catalog_as_cluster_name: bool, where_clause_suffix: str) -> str: if use_catalog_as_cluster_name: cluster_source = "CURRENT_DATABASE()" else: - cluster_source = "'{}'".format(self._cluster) + cluster_source = f"'{self._cluster}'" return """ SELECT @@ -66,6 +68,5 @@ def get_sql_statement(self, use_catalog_as_cluster_name, where_clause_suffix): where_clause_suffix=where_clause_suffix, ) - def get_scope(self): - # type: () -> str + def get_scope(self) -> str: return 'extractor.redshift_metadata' diff --git a/databuilder/databuilder/extractor/restapi/rest_api_extractor.py b/databuilder/databuilder/extractor/restapi/rest_api_extractor.py index bf2b8856b8..65c22dc9f0 100644 --- a/databuilder/databuilder/extractor/restapi/rest_api_extractor.py +++ b/databuilder/databuilder/extractor/restapi/rest_api_extractor.py @@ -1,16 +1,17 @@ # Copyright Contributors to the Amundsen project. 
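Note on the Postgres and Redshift extractor hunks above: PEP 484 type comments are replaced by inline annotations; the checked types are identical, only the spelling moves into the signature. A hedged sketch with a simplified, hypothetical class:

class _ScopeExample:
    # Old spelling: the return type lives in a '# type:' comment.
    def get_scope_old(self):
        # type: () -> str
        return 'extractor.postgres_metadata'

    # New spelling used throughout the patch: an inline annotation.
    def get_scope(self) -> str:
        return 'extractor.postgres_metadata'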
# SPDX-License-Identifier: Apache-2.0 -import logging import importlib -from typing import Any, Iterator, Dict, Optional +import logging +from typing import ( + Any, Dict, Iterator, Optional, +) from pyhocon import ConfigTree from databuilder.extractor.base_extractor import Extractor from databuilder.rest_api.base_rest_api_query import BaseRestApiQuery - REST_API_QUERY = 'restapi_query' MODEL_CLASS = 'model_class' @@ -33,7 +34,7 @@ def init(self, conf: ConfigTree) -> None: self._restapi_query: BaseRestApiQuery = conf.get(REST_API_QUERY) self._iterator: Optional[Iterator[Dict[str, Any]]] = None self._static_dict = conf.get(STATIC_RECORD_DICT, dict()) - LOGGER.info('static record: {}'.format(self._static_dict)) + LOGGER.info('static record: %s', self._static_dict) model_class = conf.get(MODEL_CLASS, None) if model_class: diff --git a/databuilder/databuilder/extractor/snowflake_metadata_extractor.py b/databuilder/databuilder/extractor/snowflake_metadata_extractor.py index adfad2f996..33f0eda3b0 100644 --- a/databuilder/databuilder/extractor/snowflake_metadata_extractor.py +++ b/databuilder/databuilder/extractor/snowflake_metadata_extractor.py @@ -4,17 +4,18 @@ import logging from collections import namedtuple +from itertools import groupby +from typing import ( + Any, Dict, Iterator, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union, Dict, Any from unidecode import unidecode from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata -from itertools import groupby - +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata TableKey = namedtuple('TableKey', ['schema', 'table_name']) @@ -77,12 +78,12 @@ class SnowflakeMetadataExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(SnowflakeMetadataExtractor.DEFAULT_CONFIG) - self._cluster = '{}'.format(conf.get_string(SnowflakeMetadataExtractor.CLUSTER_KEY)) + self._cluster = conf.get_string(SnowflakeMetadataExtractor.CLUSTER_KEY) if conf.get_bool(SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): cluster_source = "c.table_catalog" else: - cluster_source = "'{}'".format(self._cluster) + cluster_source = f"'{self._cluster}'" self._database = conf.get_string(SnowflakeMetadataExtractor.DATABASE_KEY) self._schema = conf.get_string(SnowflakeMetadataExtractor.DATABASE_KEY) @@ -96,10 +97,10 @@ def init(self, conf: ConfigTree) -> None: schema=self._snowflake_schema ) - LOGGER.info('SQL for snowflake metadata: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for snowflake metadata: %s', self.sql_stmt) self._alchemy_extractor = SQLAlchemyExtractor() - sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\ + sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \ .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt})) self._alchemy_extractor.init(sql_alch_conf) @@ -127,11 +128,11 @@ def _get_extract_iter(self) -> Iterator[TableMetadata]: for row in group: last_row = row columns.append(ColumnMetadata( - row['col_name'], - unidecode(row['col_description']) if row['col_description'] else None, - row['col_type'], - row['col_sort_order']) - ) + row['col_name'], + unidecode(row['col_description']) if row['col_description'] else None, + row['col_type'], + row['col_sort_order']) + ) yield 
TableMetadata(self._database, last_row['cluster'], last_row['schema'], diff --git a/databuilder/databuilder/extractor/snowflake_table_last_updated_extractor.py b/databuilder/databuilder/extractor/snowflake_table_last_updated_extractor.py index 0fbdce7dc7..d477cece47 100644 --- a/databuilder/databuilder/extractor/snowflake_table_last_updated_extractor.py +++ b/databuilder/databuilder/extractor/snowflake_table_last_updated_extractor.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Iterator, Union from pyhocon import ConfigFactory, ConfigTree -from typing import Iterator, Union from databuilder import Scoped from databuilder.extractor.base_extractor import Extractor @@ -57,11 +57,12 @@ class SnowflakeTableLastUpdatedExtractor(Extractor): def init(self, conf: ConfigTree) -> None: conf = conf.with_fallback(SnowflakeTableLastUpdatedExtractor.DEFAULT_CONFIG) + self._cluster = conf.get_string(SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY) if conf.get_bool(SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME): cluster_source = "t.table_catalog" else: - cluster_source = "'{}'".format(conf.get_string(SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)) + cluster_source = f"'{self._cluster}'" self._database = conf.get_string(SnowflakeTableLastUpdatedExtractor.DATABASE_KEY) self._snowflake_database = conf.get_string(SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY) @@ -72,7 +73,7 @@ def init(self, conf: ConfigTree) -> None: database=self._snowflake_database ) - LOGGER.info('SQL for snowflake table last updated timestamp: {}'.format(self.sql_stmt)) + LOGGER.info('SQL for snowflake table last updated timestamp: %s', self.sql_stmt) # use an sql_alchemy_extractor to execute sql self._alchemy_extractor = SQLAlchemyExtractor() diff --git a/databuilder/databuilder/extractor/sql_alchemy_extractor.py b/databuilder/databuilder/extractor/sql_alchemy_extractor.py index b231c23fb7..2c90a8fd30 100644 --- a/databuilder/databuilder/extractor/sql_alchemy_extractor.py +++ b/databuilder/databuilder/extractor/sql_alchemy_extractor.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import importlib -from sqlalchemy import create_engine +from typing import Any from pyhocon import ConfigTree -from typing import Any +from sqlalchemy import create_engine from databuilder.extractor.base_extractor import Extractor diff --git a/databuilder/databuilder/extractor/user/bamboohr/bamboohr_user_extractor.py b/databuilder/databuilder/extractor/user/bamboohr/bamboohr_user_extractor.py index 47f42fd83a..8477144b04 100644 --- a/databuilder/databuilder/extractor/user/bamboohr/bamboohr_user_extractor.py +++ b/databuilder/databuilder/extractor/user/bamboohr/bamboohr_user_extractor.py @@ -2,12 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 -from pyhocon import ConfigTree -import requests -from requests.auth import HTTPBasicAuth from typing import Iterator, Optional from xml.etree import ElementTree +import requests +from pyhocon import ConfigTree +from requests.auth import HTTPBasicAuth + from databuilder.extractor.base_extractor import Extractor from databuilder.models.user import User @@ -32,9 +33,7 @@ def extract(self) -> Optional[User]: return None def _employee_directory_uri(self) -> str: - return 'https://api.bamboohr.com/api/gateway.php/{subdomain}/v1/employees/directory'.format( - subdomain=self._subdomain - ) + return f'https://api.bamboohr.com/api/gateway.php/{self._subdomain}/v1/employees/directory' def _get_extract_iter(self) -> Iterator[User]: response = 
requests.get( @@ -46,7 +45,7 @@ def _get_extract_iter(self) -> Iterator[User]: for user in root.findall('./employees/employee'): def get_field(name: str) -> str: - field = user.find('./field[@id=\'{name}\']'.format(name=name)) + field = user.find(f"./field[@id='{name}']") if field is not None and field.text is not None: return field.text else: diff --git a/databuilder/databuilder/filesystem/filesystem.py b/databuilder/databuilder/filesystem/filesystem.py index 0f87ce0236..3c6e147df9 100644 --- a/databuilder/databuilder/filesystem/filesystem.py +++ b/databuilder/databuilder/filesystem/filesystem.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import List from pyhocon import ConfigFactory, ConfigTree from retrying import retry -from typing import List from databuilder import Scoped from databuilder.filesystem.metadata import FileMetadata diff --git a/databuilder/databuilder/filesystem/metadata.py b/databuilder/databuilder/filesystem/metadata.py index 4d5e7786c2..3ebed1e210 100644 --- a/databuilder/databuilder/filesystem/metadata.py +++ b/databuilder/databuilder/filesystem/metadata.py @@ -16,5 +16,4 @@ def __init__(self, self.size = size def __repr__(self) -> str: - return """FileMetadata(path={!r}, last_updated={!r}, size={!r})""" \ - .format(self.path, self.last_updated, self.size) + return f'FileMetadata(path={self.path!r}, last_updated={self.last_updated!r}, size={self.size!r})' diff --git a/databuilder/databuilder/job/job.py b/databuilder/databuilder/job/job.py index af8f44c955..b3791a395e 100644 --- a/databuilder/databuilder/job/job.py +++ b/databuilder/databuilder/job/job.py @@ -8,8 +8,7 @@ from databuilder import Scoped from databuilder.job.base_job import Job -from databuilder.publisher.base_publisher import NoopPublisher -from databuilder.publisher.base_publisher import Publisher +from databuilder.publisher.base_publisher import NoopPublisher, Publisher from databuilder.task.base_task import Task LOGGER = logging.getLogger(__name__) @@ -39,8 +38,8 @@ def __init__(self, self.scoped_conf = Scoped.get_scoped_conf(self.conf, self.get_scope()) if self.scoped_conf.get_bool(DefaultJob.IS_STATSD_ENABLED, False): - prefix = 'amundsen.databuilder.job.{}'.format(self.scoped_conf.get_string(DefaultJob.JOB_IDENTIFIER)) - LOGGER.info('Setting statsd for job metrics with prefix: {}'.format(prefix)) + prefix = f'amundsen.databuilder.job.{self.scoped_conf.get_string(DefaultJob.JOB_IDENTIFIER)}' + LOGGER.info('Setting statsd for job metrics with prefix: %s', prefix) self.statsd = StatsClient(prefix=prefix) else: self.statsd = None diff --git a/databuilder/databuilder/loader/base_loader.py b/databuilder/databuilder/loader/base_loader.py index 2b5e0faf74..38562e7006 100644 --- a/databuilder/databuilder/loader/base_loader.py +++ b/databuilder/databuilder/loader/base_loader.py @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import abc +from typing import Any from pyhocon import ConfigTree from databuilder import Scoped -from typing import Any class Loader(Scoped): diff --git a/databuilder/databuilder/loader/file_system_csv_loader.py b/databuilder/databuilder/loader/file_system_csv_loader.py index f13c06972f..ac6d8794e7 100644 --- a/databuilder/databuilder/loader/file_system_csv_loader.py +++ b/databuilder/databuilder/loader/file_system_csv_loader.py @@ -3,12 +3,14 @@ import csv import logging +from typing import Any from pyhocon import ConfigTree -from typing import Any from databuilder.loader.base_loader import Loader +LOGGER = logging.getLogger(__name__) + 
class FileSystemCSVLoader(Loader): """ @@ -52,8 +54,7 @@ def close(self) -> None: if self.file_handler: self.file_handler.close() except Exception as e: - logging.warning("Failed trying to close a file handler! %s", - str(e)) + LOGGER.warning("Failed trying to close a file handler! %s", e) def get_scope(self) -> str: return "loader.filesystem.csv" diff --git a/databuilder/databuilder/loader/file_system_neo4j_csv_loader.py b/databuilder/databuilder/loader/file_system_neo4j_csv_loader.py index fab05d2eed..ab500cef10 100644 --- a/databuilder/databuilder/loader/file_system_neo4j_csv_loader.py +++ b/databuilder/databuilder/loader/file_system_neo4j_csv_loader.py @@ -6,16 +6,17 @@ import os import shutil from csv import DictWriter +from typing import ( + Any, Dict, FrozenSet, +) -from pyhocon import ConfigTree, ConfigFactory -from typing import Dict, Any, FrozenSet +from pyhocon import ConfigFactory, ConfigTree from databuilder.job.base_job import Job from databuilder.loader.base_loader import Loader from databuilder.models.graph_serializable import GraphSerializable -from databuilder.utils.closer import Closer from databuilder.serializers import neo4_serializer - +from databuilder.utils.closer import Closer LOGGER = logging.getLogger(__name__) @@ -72,19 +73,19 @@ def _create_directory(self, path: str) -> None: """ if os.path.exists(path): if self._force_create_dir: - LOGGER.info('Directory exist. Deleting directory {}'.format(path)) + LOGGER.info('Directory exist. Deleting directory %s', path) shutil.rmtree(path) else: - raise RuntimeError('Directory should not exist: {}'.format(path)) + raise RuntimeError(f'Directory should not exist: {path}') os.makedirs(path) def _delete_dir() -> None: if not self._delete_created_dir: - LOGGER.warn('Skip Deleting directory {}'.format(path)) + LOGGER.warning('Skip Deleting directory %s', path) return - LOGGER.info('Deleting directory {}'.format(path)) + LOGGER.info('Deleting directory %s', path) shutil.rmtree(path) # Directory should be deleted after publish is finished @@ -128,7 +129,7 @@ def load(self, csv_serializable: GraphSerializable) -> None: relation.type, self._make_key(relation_dict)) - file_suffix = '{}_{}_{}'.format(key2[0], key2[1], key2[2]) + file_suffix = f'{key2[0]}_{key2[1]}_{key2[2]}' relation_writer = self._get_writer(relation_dict, self._relation_file_mapping, key2, @@ -159,14 +160,14 @@ def _get_writer(self, if writer: return writer - LOGGER.info('Creating file for {}'.format(key)) + LOGGER.info('Creating file for %s', key) - file_out = open('{}/{}.csv'.format(dir_path, file_suffix), 'w', encoding='utf8') + file_out = open(f'{dir_path}/{file_suffix}.csv', 'w', encoding='utf8') writer = csv.DictWriter(file_out, fieldnames=csv_record_dict.keys(), quoting=csv.QUOTE_NONNUMERIC) def file_out_close() -> None: - LOGGER.info('Closing file IO {}'.format(file_out)) + LOGGER.info('Closing file IO %s', file_out) file_out.close() self._closer.register(file_out_close) diff --git a/databuilder/databuilder/loader/generic_loader.py b/databuilder/databuilder/loader/generic_loader.py index 5e7b769fdd..38db553295 100644 --- a/databuilder/databuilder/loader/generic_loader.py +++ b/databuilder/databuilder/loader/generic_loader.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Any, Optional from pyhocon import ConfigTree -from typing import Optional, Any from databuilder.loader.base_loader import Loader @@ -19,7 +19,7 @@ def log_call_back(record: Optional[Any]) -> None: :param record: :return: """ - LOGGER.info('record: 
{}'.format(record)) + LOGGER.info('record: %s', record) class GenericLoader(Loader): diff --git a/databuilder/databuilder/models/application.py b/databuilder/databuilder/models/application.py index d86b0e2ce0..d7327a6cac 100644 --- a/databuilder/databuilder/models/application.py +++ b/databuilder/databuilder/models/application.py @@ -3,11 +3,10 @@ from typing import List, Union -from databuilder.models.graph_serializable import GraphSerializable - -from databuilder.models.table_metadata import TableMetadata from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.table_metadata import TableMetadata class Application(GraphSerializable): diff --git a/databuilder/databuilder/models/badge.py b/databuilder/databuilder/models/badge.py index 4176123d49..51ebd86887 100644 --- a/databuilder/databuilder/models/badge.py +++ b/databuilder/databuilder/models/badge.py @@ -1,12 +1,12 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional import re +from typing import List, Optional -from databuilder.models.graph_serializable import GraphSerializable from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable class Badge: @@ -15,8 +15,7 @@ def __init__(self, name: str, category: str): self.category = category def __repr__(self) -> str: - return 'Badge({!r}, {!r})'.format(self.name, - self.category) + return f'Badge({self.name!r}, {self.category!r})' def __eq__(self, other: object) -> bool: if not isinstance(other, Badge): @@ -65,8 +64,7 @@ def __init__(self, self._relation_iter = iter(self.create_relation()) def __repr__(self) -> str: - return 'BadgeMetadata({!r}, {!r})'.format(self.start_label, - self.start_key) + return f'BadgeMetadata({self.start_label!r}, {self.start_key!r})' def create_next_node(self) -> Optional[GraphNode]: # return the string representation of the data diff --git a/databuilder/databuilder/models/column_usage_model.py b/databuilder/databuilder/models/column_usage_model.py index c9197f3986..efd941e423 100644 --- a/databuilder/databuilder/models/column_usage_model.py +++ b/databuilder/databuilder/models/column_usage_model.py @@ -1,20 +1,21 @@ # Copyright Contributors to the Amundsen project. 
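Note on the __repr__ rewrites above: the !r conversion makes an f-string reproduce the old '{!r}'.format(...) output exactly, i.e. each value is rendered with repr() and strings keep their quotes. A standalone sketch (values are made up):

name, category = 'beta', 'table_status'

old = 'Badge({!r}, {!r})'.format(name, category)
new = f'Badge({name!r}, {category!r})'

# Both render the arguments via repr(), so the two spellings are interchangeable.
assert old == new == "Badge('beta', 'table_status')"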
# SPDX-License-Identifier: Apache-2.0 -from typing import Union, Iterable, List +from typing import ( + Iterable, List, Union, +) +from databuilder.models.graph_node import GraphNode +from databuilder.models.graph_relationship import GraphRelationship from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.table_metadata import TableMetadata from databuilder.models.usage.usage_constants import ( - READ_RELATION_TYPE, READ_REVERSE_RELATION_TYPE, READ_RELATION_COUNT_PROPERTY + READ_RELATION_COUNT_PROPERTY, READ_RELATION_TYPE, READ_REVERSE_RELATION_TYPE, ) -from databuilder.models.table_metadata import TableMetadata from databuilder.models.user import User -from databuilder.models.graph_node import GraphNode -from databuilder.models.graph_relationship import GraphRelationship class ColumnUsageModel(GraphSerializable): - """ A model represents user <--> column graph model Currently it only support to serialize to table level @@ -93,10 +94,5 @@ def _get_user_key(self, email: str) -> str: return User.get_user_model_key(email=email) def __repr__(self) -> str: - return 'TableColumnUsage({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})'.format(self.database, - self.cluster, - self.schema, - self.table_name, - self.column_name, - self.user_email, - self.read_count) + return f'TableColumnUsage({self.database!r}, {self.cluster!r}, {self.schema!r}, ' \ + f'{self.table_name!r}, {self.column_name!r}, {self.user_email!r}, {self.read_count!r})' diff --git a/databuilder/databuilder/models/dashboard/dashboard_chart.py b/databuilder/databuilder/models/dashboard/dashboard_chart.py index 2185ce551c..c00b6444b3 100644 --- a/databuilder/databuilder/models/dashboard/dashboard_chart.py +++ b/databuilder/databuilder/models/dashboard/dashboard_chart.py @@ -2,15 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Optional, Any, Union, Iterator - +from typing import ( + Any, Iterator, Optional, Union, +) from databuilder.models.dashboard.dashboard_query import DashboardQuery -from databuilder.models.graph_serializable import ( - GraphSerializable) - from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable LOGGER = logging.getLogger(__name__) @@ -111,14 +110,6 @@ def _get_chart_node_key(self) -> str: ) def __repr__(self) -> str: - return 'DashboardChart({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})'.format( - self._dashboard_group_id, - self._dashboard_id, - self._query_id, - self._chart_id, - self._chart_name, - self._chart_type, - self._chart_url, - self._product, - self._cluster - ) + return f'DashboardChart({self._dashboard_group_id!r}, {self._dashboard_id!r}, ' \ + f'{self._query_id!r}, {self._chart_id!r}, {self._chart_name!r}, {self._chart_type!r}, ' \ + f'{self._chart_url!r}, {self._product!r}, {self._cluster!r})' diff --git a/databuilder/databuilder/models/dashboard/dashboard_execution.py b/databuilder/databuilder/models/dashboard/dashboard_execution.py index 6aa5a04df2..453b2687e3 100644 --- a/databuilder/databuilder/models/dashboard/dashboard_execution.py +++ b/databuilder/databuilder/models/dashboard/dashboard_execution.py @@ -2,14 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from typing import Optional, Any, Union, Iterator +from typing import ( + Any, Iterator, Optional, Union, +) from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata -from 
databuilder.models.graph_serializable import (GraphSerializable) - from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable LOGGER = logging.getLogger(__name__) @@ -97,12 +97,6 @@ def _get_last_execution_node_key(self) -> str: ) def __repr__(self) -> str: - return 'DashboardExecution({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})'.format( - self._dashboard_group_id, - self._dashboard_id, - self._execution_timestamp, - self._execution_state, - self._execution_id, - self._product, - self._cluster - ) + return f'DashboardExecution({self._dashboard_group_id!r}, {self._dashboard_id!r}, ' \ + f'{self._execution_timestamp!r}, {self._execution_state!r}, ' \ + f'{self._execution_id!r}, {self._product!r}, {self._cluster!r})' diff --git a/databuilder/databuilder/models/dashboard/dashboard_last_modified.py b/databuilder/databuilder/models/dashboard/dashboard_last_modified.py index 916248d60a..8d70e65e36 100644 --- a/databuilder/databuilder/models/dashboard/dashboard_last_modified.py +++ b/databuilder/databuilder/models/dashboard/dashboard_last_modified.py @@ -2,16 +2,15 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from typing import Optional, Any, Union, Iterator +from typing import ( + Any, Iterator, Optional, Union, +) from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata -from databuilder.models.graph_serializable import ( - GraphSerializable) -from databuilder.models.timestamp import timestamp_constants - from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.timestamp import timestamp_constants LOGGER = logging.getLogger(__name__) @@ -90,10 +89,5 @@ def _get_last_modified_node_key(self) -> str: ) def __repr__(self) -> str: - return 'DashboardLastModifiedTimestamp({!r}, {!r}, {!r}, {!r}, {!r})'.format( - self._dashboard_group_id, - self._dashboard_id, - self._last_modified_timestamp, - self._product, - self._cluster - ) + return f'DashboardLastModifiedTimestamp({self._dashboard_group_id!r}, {self._dashboard_id!r}, ' \ + f'{self._last_modified_timestamp!r}, {self._product!r}, {self._cluster!r})' diff --git a/databuilder/databuilder/models/dashboard/dashboard_metadata.py b/databuilder/databuilder/models/dashboard/dashboard_metadata.py index 740b1e5688..cdc8351e80 100644 --- a/databuilder/databuilder/models/dashboard/dashboard_metadata.py +++ b/databuilder/databuilder/models/dashboard/dashboard_metadata.py @@ -1,17 +1,16 @@ # Copyright Contributors to the Amundsen project. 
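Note on the longer __repr__ strings above: the f-string is split over backslash-continued lines, and adjacent string literals are concatenated at compile time, so the pieces still form one message. A minimal sketch (values are made up):

group_id, dashboard_id, state = 'gid', 'did', 'succeeded'

# Adjacent f-string literals on continued lines are joined into a single string.
combined = f'DashboardExecution({group_id!r}, {dashboard_id!r}, ' \
           f'{state!r})'

assert combined == "DashboardExecution('gid', 'did', 'succeeded')"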
# SPDX-License-Identifier: Apache-2.0 -from typing import Any, Iterator, List, Optional, Set, Union, Dict - -from databuilder.models.cluster import cluster_constants -from databuilder.models.graph_serializable import ( - GraphSerializable +from typing import ( + Any, Dict, Iterator, List, Optional, Set, Union, ) -# TODO: We could separate TagMetadata from table_metadata to own module -from databuilder.models.table_metadata import TagMetadata +from databuilder.models.cluster import cluster_constants from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +# TODO: We could separate TagMetadata from table_metadata to own module +from databuilder.models.table_metadata import TagMetadata class DashboardMetadata(GraphSerializable): @@ -91,18 +90,10 @@ def __init__(self, self._relation_iterator = self._create_next_relation() def __repr__(self) -> str: - return 'DashboardMetadata({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})' \ - .format(self.dashboard_group, - self.dashboard_name, - self.description, - self.tags, - self.dashboard_group_id, - self.dashboard_id, - self.dashboard_group_description, - self.created_timestamp, - self.dashboard_group_url, - self.dashboard_url, - ) + return f'DashboardMetadata(' \ + f'{self.dashboard_group!r}, {self.dashboard_name!r}, {self.description!r}, {self.tags!r}, ' \ + f'{self.dashboard_group_id!r}, {self.dashboard_id!r}, {self.dashboard_group_description!r}, ' \ + f'{self.created_timestamp!r}, {self.dashboard_group_url!r}, {self.dashboard_url!r})' def _get_cluster_key(self) -> str: return DashboardMetadata.CLUSTER_KEY_FORMAT.format(cluster=self.cluster, diff --git a/databuilder/databuilder/models/dashboard/dashboard_owner.py b/databuilder/databuilder/models/dashboard/dashboard_owner.py index eda0ee0ac0..f038469bee 100644 --- a/databuilder/databuilder/models/dashboard/dashboard_owner.py +++ b/databuilder/databuilder/models/dashboard/dashboard_owner.py @@ -2,17 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from typing import Optional, Any, Union, Iterator +from typing import ( + Any, Iterator, Optional, Union, +) from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata -from databuilder.models.graph_serializable import ( - GraphSerializable) -from databuilder.models.owner_constants import OWNER_OF_OBJECT_RELATION_TYPE, OWNER_RELATION_TYPE -from databuilder.models.user import User - from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.owner_constants import OWNER_OF_OBJECT_RELATION_TYPE, OWNER_RELATION_TYPE +from databuilder.models.user import User LOGGER = logging.getLogger(__name__) @@ -70,10 +69,5 @@ def _create_relation_iterator(self) -> Iterator[GraphRelationship]: yield relationship def __repr__(self) -> str: - return 'DashboardOwner({!r}, {!r}, {!r}, {!r}, {!r})'.format( - self._dashboard_group_id, - self._dashboard_id, - self._email, - self._product, - self._cluster - ) + return f'DashboardOwner({self._dashboard_group_id!r}, {self._dashboard_id!r}, ' \ + f'{self._email!r}, {self._product!r}, {self._cluster!r})' diff --git a/databuilder/databuilder/models/dashboard/dashboard_query.py b/databuilder/databuilder/models/dashboard/dashboard_query.py index 61cd396207..721bb49a61 100644 --- 
a/databuilder/databuilder/models/dashboard/dashboard_query.py +++ b/databuilder/databuilder/models/dashboard/dashboard_query.py @@ -2,15 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from typing import Optional, Any, Union, Iterator +from typing import ( + Any, Iterator, Optional, Union, +) from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata -from databuilder.models.graph_serializable import ( - GraphSerializable) - from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable LOGGER = logging.getLogger(__name__) @@ -106,13 +105,5 @@ def _get_query_node_key(self) -> str: ) def __repr__(self) -> str: - return 'DashboardQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})'.format( - self._dashboard_group_id, - self._dashboard_id, - self._query_name, - self._query_id, - self._url, - self._query_text, - self._product, - self._cluster - ) + return f'DashboardQuery({self._dashboard_group_id!r}, {self._dashboard_id!r}, {self._query_name!r}, ' \ + f'{self._query_id!r}, {self._url!r}, {self._query_text!r}, {self._product!r}, {self._cluster!r})' diff --git a/databuilder/databuilder/models/dashboard/dashboard_table.py b/databuilder/databuilder/models/dashboard/dashboard_table.py index cd8acb142e..d035f071bf 100644 --- a/databuilder/databuilder/models/dashboard/dashboard_table.py +++ b/databuilder/databuilder/models/dashboard/dashboard_table.py @@ -3,15 +3,15 @@ import logging import re - -from typing import Optional, Any, List, Union, Iterator +from typing import ( + Any, Iterator, List, Optional, Union, +) from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata -from databuilder.models.graph_serializable import ( - GraphSerializable) -from databuilder.models.table_metadata import TableMetadata from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.table_metadata import TableMetadata LOGGER = logging.getLogger(__name__) @@ -56,7 +56,7 @@ def create_next_relation(self) -> Union[GraphRelationship, None]: def _create_relation_iterator(self) -> Iterator[GraphRelationship]: for table_id in self._table_ids: - m = re.match('([^./]+)://([^./]+)\.([^./]+)\/([^./]+)', table_id) + m = re.match(r'([^./]+)://([^./]+)\.([^./]+)\/([^./]+)', table_id) if m: relationship = GraphRelationship( start_label=DashboardMetadata.DASHBOARD_NODE_LABEL, @@ -80,10 +80,5 @@ def _create_relation_iterator(self) -> Iterator[GraphRelationship]: yield relationship def __repr__(self) -> str: - return 'DashboardTable({!r}, {!r}, {!r}, {!r}, ({!r}))'.format( - self._dashboard_group_id, - self._dashboard_id, - self._product, - self._cluster, - ','.join(self._table_ids), - ) + return f'DashboardTable({self._dashboard_group_id!r}, {self._dashboard_id!r}, ' \ + f'{self._product!r}, {self._cluster!r}, ({",".join(self._table_ids)!r}))' diff --git a/databuilder/databuilder/models/dashboard/dashboard_usage.py b/databuilder/databuilder/models/dashboard/dashboard_usage.py index cbd9e89138..b98b3519d4 100644 --- a/databuilder/databuilder/models/dashboard/dashboard_usage.py +++ b/databuilder/databuilder/models/dashboard/dashboard_usage.py @@ -2,19 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 import logging - -from typing import Optional, Any, Union, Iterator +from typing import ( + Any, Iterator, 
Optional, Union, +) from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata -from databuilder.models.graph_serializable import ( - GraphSerializable -) +from databuilder.models.graph_node import GraphNode +from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable from databuilder.models.usage.usage_constants import ( - READ_RELATION_TYPE, READ_REVERSE_RELATION_TYPE, READ_RELATION_COUNT_PROPERTY + READ_RELATION_COUNT_PROPERTY, READ_RELATION_TYPE, READ_REVERSE_RELATION_TYPE, ) from databuilder.models.user import User -from databuilder.models.graph_node import GraphNode -from databuilder.models.graph_relationship import GraphRelationship LOGGER = logging.getLogger(__name__) @@ -90,12 +89,6 @@ def _create_relation_iterator(self) -> Iterator[GraphRelationship]: yield relationship def __repr__(self) -> str: - return 'DashboardUsage({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})'.format( - self._dashboard_group_id, - self._dashboard_id, - self._email, - self._view_count, - self._should_create_user_node, - self._product, - self._cluster - ) + return f'DashboardUsage({self._dashboard_group_id!r}, {self._dashboard_id!r}, ' \ + f'{self._email!r}, {self._view_count!r}, {self._should_create_user_node!r}, ' \ + f'{self._product!r}, {self._cluster!r})' diff --git a/databuilder/databuilder/models/dashboard_elasticsearch_document.py b/databuilder/databuilder/models/dashboard_elasticsearch_document.py index e1c79c7fff..7fe97a27de 100644 --- a/databuilder/databuilder/models/dashboard_elasticsearch_document.py +++ b/databuilder/databuilder/models/dashboard_elasticsearch_document.py @@ -1,7 +1,9 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Union +from typing import ( + List, Optional, Union, +) from databuilder.models.elasticsearch_document import ElasticsearchDocument diff --git a/databuilder/databuilder/models/graph_serializable.py b/databuilder/databuilder/models/graph_serializable.py index 7c50c57d14..8319e03bfe 100644 --- a/databuilder/databuilder/models/graph_serializable.py +++ b/databuilder/databuilder/models/graph_serializable.py @@ -2,8 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import abc - from typing import Union # noqa: F401 + from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship @@ -25,6 +25,7 @@ class GraphSerializable(object, metaclass=abc.ABCMeta): Any model class that needs to be pushed to a graph database should inherit this class. """ + def __init__(self) -> None: pass @@ -68,10 +69,10 @@ def _validate_node(self, node: GraphNode) -> None: node_id, node_label, _ = node if node_id is None: - RuntimeError('Required header missing. Required attributes id and label , Missing: id') + raise RuntimeError('Required header missing. Required attributes id and label , Missing: id') if node_label is None: - RuntimeError('Required header missing. Required attributes id and label , Missing: label') + raise RuntimeError('Required header missing. 
Required attributes id and label , Missing: label') self._validate_label_value(node_label) @@ -82,9 +83,9 @@ def _validate_relation(self, relation: GraphRelationship) -> None: self._validate_relation_type_value(relation.reverse_type) def _validate_relation_type_value(self, value: str) -> None: - if not value == value.upper(): - raise RuntimeError('TYPE needs to be upper case: {}'.format(value)) + if not value.isupper(): + raise RuntimeError(f'TYPE needs to be upper case: {value}') def _validate_label_value(self, value: str) -> None: if not value.istitle(): - raise RuntimeError('LABEL should only have upper case character on its first one: {}'.format(value)) + raise RuntimeError(f'LABEL should only have upper case character on its first one: {value}') diff --git a/databuilder/databuilder/models/neo4j_es_last_updated.py b/databuilder/databuilder/models/neo4j_es_last_updated.py index 76039facee..847b6ad97c 100644 --- a/databuilder/databuilder/models/neo4j_es_last_updated.py +++ b/databuilder/databuilder/models/neo4j_es_last_updated.py @@ -3,9 +3,9 @@ from typing import List, Union -from databuilder.models.graph_serializable import GraphSerializable -from databuilder.models.graph_relationship import GraphRelationship from databuilder.models.graph_node import GraphNode +from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable class Neo4jESLastUpdated(GraphSerializable): diff --git a/databuilder/databuilder/models/schema/schema.py b/databuilder/databuilder/models/schema/schema.py index 483abce363..a4f60e4502 100644 --- a/databuilder/databuilder/models/schema/schema.py +++ b/databuilder/databuilder/models/schema/schema.py @@ -1,13 +1,15 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Union, Iterator +from typing import ( + Any, Iterator, Union, +) -from databuilder.models.graph_serializable import (GraphSerializable) -from databuilder.models.schema.schema_constant import SCHEMA_NODE_LABEL, SCHEMA_NAME_ATTR -from databuilder.models.table_metadata import DescriptionMetadata from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.schema.schema_constant import SCHEMA_NAME_ATTR, SCHEMA_NODE_LABEL +from databuilder.models.table_metadata import DescriptionMetadata class SchemaModel(GraphSerializable): @@ -54,7 +56,7 @@ def create_next_relation(self) -> Union[GraphRelationship, None]: def _get_description_node_key(self) -> str: desc = self._description.get_description_id() if self._description is not None else '' - return '{}/{}'.format(self._schema_key, desc) + return f'{self._schema_key}/{desc}' def _create_relation_iterator(self) -> Iterator[GraphRelationship]: if self._description: diff --git a/databuilder/databuilder/models/table_column_usage.py b/databuilder/databuilder/models/table_column_usage.py index ef7c602e5c..1968323d2e 100644 --- a/databuilder/databuilder/models/table_column_usage.py +++ b/databuilder/databuilder/models/table_column_usage.py @@ -1,15 +1,15 @@ # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Union, Iterator - -from databuilder.models.graph_serializable import ( - GraphSerializable +from typing import ( + Iterable, Iterator, Union, ) -from databuilder.models.table_metadata import TableMetadata -from databuilder.models.user import User + from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.table_metadata import TableMetadata +from databuilder.models.user import User class ColumnReader(object): @@ -35,9 +35,9 @@ def __init__(self, self.read_count = int(read_count) def __repr__(self) -> str: - return """\ -ColumnReader(database={!r}, cluster={!r}, schema={!r}, table={!r}, column={!r}, user_email={!r}, read_count={!r})"""\ - .format(self.database, self.cluster, self.schema, self.table, self.column, self.user_email, self.read_count) + return f"ColumnReader(database={self.database!r}, cluster={self.cluster!r}, " \ + f"schema={self.schema!r}, table={self.table!r}, column={self.column!r}, " \ + f"user_email={self.user_email!r}, read_count={self.read_count!r})" class TableColumnUsage(GraphSerializable): @@ -54,12 +54,10 @@ class TableColumnUsage(GraphSerializable): # Property key for relationship read, readby relationship READ_RELATION_COUNT = 'read_count' - def __init__(self, - col_readers: Iterable[ColumnReader], - ) -> None: + def __init__(self, col_readers: Iterable[ColumnReader]) -> None: for col_reader in col_readers: if col_reader.column != '*': - raise NotImplementedError('Column is not supported yet {}'.format(col_readers)) + raise NotImplementedError(f'Column is not supported yet {col_readers}') self.col_readers = col_readers self._node_iterator = self._create_node_iterator() @@ -108,4 +106,4 @@ def _get_user_key(self, email: str) -> str: return User.get_user_model_key(email=email) def __repr__(self) -> str: - return 'TableColumnUsage(col_readers={!r})'.format(self.col_readers) + return f'TableColumnUsage(col_readers={self.col_readers!r})' diff --git a/databuilder/databuilder/models/table_elasticsearch_document.py b/databuilder/databuilder/models/table_elasticsearch_document.py index e431606b00..1f35335764 100644 --- a/databuilder/databuilder/models/table_elasticsearch_document.py +++ b/databuilder/databuilder/models/table_elasticsearch_document.py @@ -33,7 +33,7 @@ def __init__(self, self.cluster = cluster self.schema = schema self.name = name - self.display_name = display_name if display_name else '{schema}.{table}'.format(schema=schema, table=name) + self.display_name = display_name if display_name else f'{schema}.{name}' self.key = key self.description = description # todo: use last_updated_timestamp to match the record in metadata diff --git a/databuilder/databuilder/models/table_last_updated.py b/databuilder/databuilder/models/table_last_updated.py index b70385c8f9..d4552f8f32 100644 --- a/databuilder/databuilder/models/table_last_updated.py +++ b/databuilder/databuilder/models/table_last_updated.py @@ -3,12 +3,11 @@ from typing import List, Union +from databuilder.models.graph_node import GraphNode +from databuilder.models.graph_relationship import GraphRelationship from databuilder.models.graph_serializable import GraphSerializable - from databuilder.models.table_metadata import TableMetadata from databuilder.models.timestamp import timestamp_constants -from databuilder.models.graph_node import GraphNode -from 
databuilder.models.graph_relationship import GraphRelationship class TableLastUpdated(GraphSerializable): @@ -38,9 +37,8 @@ def __init__(self, self._relation_iter = iter(self.create_relation()) def __repr__(self) -> str: - return \ - """TableLastUpdated(table_name={!r}, last_updated_time={!r}, schema={!r}, db={!r}, cluster={!r})"""\ - .format(self.table_name, self.last_updated_time, self.schema, self.db, self.cluster) + return f"TableLastUpdated(table_name={self.table_name!r}, last_updated_time={self.last_updated_time!r}, " \ + f"schema={self.schema!r}, db={self.db!r}, cluster={self.cluster!r})" def create_next_node(self) -> Union[GraphNode, None]: # creates new node diff --git a/databuilder/databuilder/models/table_lineage.py b/databuilder/databuilder/models/table_lineage.py index fda8e11558..393c649be3 100644 --- a/databuilder/databuilder/models/table_lineage.py +++ b/databuilder/databuilder/models/table_lineage.py @@ -4,11 +4,10 @@ import re from typing import List, Union -from databuilder.models.graph_serializable import GraphSerializable - -from databuilder.models.table_metadata import TableMetadata from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.table_metadata import TableMetadata class TableLineage(GraphSerializable): @@ -56,10 +55,7 @@ def get_table_model_key(self, schema: str, table: str ) -> str: - return '{db}://{cluster}.{schema}/{table}'.format(db=db, - cluster=cluster, - schema=schema, - table=table) + return f'{db}://{cluster}.{schema}/{table}' def create_nodes(self) -> List[Union[GraphNode, None]]: """ @@ -103,7 +99,4 @@ def create_relation(self) -> List[GraphRelationship]: return results def __repr__(self) -> str: - return 'TableLineage({!r}, {!r}, {!r}, {!r})'.format(self.db, - self.cluster, - self.schema, - self.table) + return f'TableLineage({self.db!r}, {self.cluster!r}, {self.schema!r}, {self.table!r})' diff --git a/databuilder/databuilder/models/table_metadata.py b/databuilder/databuilder/models/table_metadata.py index 87ad74fbb3..1b76afa83d 100644 --- a/databuilder/databuilder/models/table_metadata.py +++ b/databuilder/databuilder/models/table_metadata.py @@ -2,16 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 import copy +from typing import ( + Any, Dict, Iterable, Iterator, List, Optional, Set, Union, +) -from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Union - +from databuilder.models.badge import Badge, BadgeMetadata from databuilder.models.cluster import cluster_constants -from databuilder.models.graph_serializable import GraphSerializable -from databuilder.models.schema import schema_constant -from databuilder.models.badge import BadgeMetadata, Badge - from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.schema import schema_constant DESCRIPTION_NODE_LABEL_VAL = 'Description' DESCRIPTION_NODE_LABEL = DESCRIPTION_NODE_LABEL_VAL @@ -127,7 +127,7 @@ def get_description_id(self) -> str: return "_" + self._source + "_description" def __repr__(self) -> str: - return 'DescriptionMetadata({!r}, {!r})'.format(self._source, self._text) + return f'DescriptionMetadata({self._source!r}, {self._text!r})' def get_node(self, node_key: str) -> GraphNode: node = GraphNode( @@ -186,11 +186,8 @@ def __init__(self, self.badges = 
[Badge(badge, 'column') for badge in formatted_badges] def __repr__(self) -> str: - return 'ColumnMetadata({!r}, {!r}, {!r}, {!r}, {!r})'.format(self.name, - self.description, - self.type, - self.sort_order, - self.badges) + return f'ColumnMetadata({self.name!r}, {self.description!r}, {self.type!r}, ' \ + f'{self.sort_order!r}, {self.badges!r})' class TableMetadata(GraphSerializable): @@ -278,15 +275,8 @@ def __init__(self, self._relation_iterator = self._create_next_relation() def __repr__(self) -> str: - return 'TableMetadata({!r}, {!r}, {!r}, {!r} ' \ - '{!r}, {!r}, {!r}, {!r})'.format(self.database, - self.cluster, - self.schema, - self.name, - self.description, - self.columns, - self.is_view, - self.tags) + return f'TableMetadata({self.database!r}, {self.cluster!r}, {self.schema!r}, {self.name!r} ' \ + f'{self.description!r}, {self.columns!r}, {self.is_view!r}, {self.tags!r})' def _get_table_key(self) -> str: return TableMetadata.TABLE_KEY_FORMAT.format(db=self.database, diff --git a/databuilder/databuilder/models/table_owner.py b/databuilder/databuilder/models/table_owner.py index c9b2503fba..499c2ae91f 100644 --- a/databuilder/databuilder/models/table_owner.py +++ b/databuilder/databuilder/models/table_owner.py @@ -1,13 +1,15 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Union +from typing import ( + List, Optional, Union, +) -from databuilder.models.graph_serializable import GraphSerializable -from databuilder.models.owner_constants import OWNER_RELATION_TYPE, OWNER_OF_OBJECT_RELATION_TYPE -from databuilder.models.user import User from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.owner_constants import OWNER_OF_OBJECT_RELATION_TYPE, OWNER_RELATION_TYPE +from databuilder.models.user import User class TableOwner(GraphSerializable): @@ -52,10 +54,7 @@ def get_owner_model_key(self, owner: str) -> str: return User.USER_NODE_KEY_FORMAT.format(email=owner) def get_metadata_model_key(self) -> str: - return '{db}://{cluster}.{schema}/{table}'.format(db=self.db, - cluster=self.cluster, - schema=self.schema, - table=self.table) + return f'{self.db}://{self.cluster}.{self.schema}/{self.table}' def create_nodes(self) -> List[GraphNode]: """ @@ -96,8 +95,4 @@ def create_relation(self) -> List[GraphRelationship]: return results def __repr__(self) -> str: - return 'TableOwner({!r}, {!r}, {!r}, {!r}, {!r})'.format(self.db, - self.cluster, - self.schema, - self.table, - self.owners) + return f'TableOwner({self.db!r}, {self.cluster!r}, {self.schema!r}, {self.table!r}, {self.owners!r})' diff --git a/databuilder/databuilder/models/table_source.py b/databuilder/databuilder/models/table_source.py index 0cfaa07116..dda323e1d7 100644 --- a/databuilder/databuilder/models/table_source.py +++ b/databuilder/databuilder/models/table_source.py @@ -3,11 +3,10 @@ from typing import List, Optional -from databuilder.models.graph_serializable import GraphSerializable - -from databuilder.models.table_metadata import TableMetadata from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.table_metadata import TableMetadata class TableSource(GraphSerializable): @@ -25,7 +24,7 @@ def __init__(self, table_name: str, cluster: 
str, source: str, - source_type: str='github', + source_type: str = 'github', ) -> None: self.db = db_name self.schema = schema @@ -58,10 +57,7 @@ def get_source_model_key(self) -> str: tbl=self.table) def get_metadata_model_key(self) -> str: - return '{db}://{cluster}.{schema}/{table}'.format(db=self.db, - cluster=self.cluster, - schema=self.schema, - table=self.table) + return f'{self.db}://{self.cluster}.{self.schema}/{self.table}' def create_nodes(self) -> List[GraphNode]: """ @@ -97,8 +93,4 @@ def create_relation(self) -> List[GraphRelationship]: return results def __repr__(self) -> str: - return 'TableSource({!r}, {!r}, {!r}, {!r}, {!r})'.format(self.db, - self.cluster, - self.schema, - self.table, - self.source) + return f'TableSource({self.db!r}, {self.cluster!r}, {self.schema!r}, {self.table!r}, {self.source!r})' diff --git a/databuilder/databuilder/models/table_stats.py b/databuilder/databuilder/models/table_stats.py index 7f8eb06094..ea3e73581d 100644 --- a/databuilder/databuilder/models/table_stats.py +++ b/databuilder/databuilder/models/table_stats.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from typing import List, Optional -from databuilder.models.graph_serializable import GraphSerializable -from databuilder.models.table_metadata import ColumnMetadata from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable +from databuilder.models.table_metadata import ColumnMetadata class TableColumnStats(GraphSerializable): diff --git a/databuilder/databuilder/models/user.py b/databuilder/databuilder/models/user.py index f483851869..0aef16c965 100644 --- a/databuilder/databuilder/models/user.py +++ b/databuilder/databuilder/models/user.py @@ -2,12 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 import copy +from typing import ( + Any, List, Optional, +) -from typing import Any, List, Optional - -from databuilder.models.graph_serializable import GraphSerializable from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable class User(GraphSerializable): @@ -109,7 +110,7 @@ def create_next_relation(self) -> Optional[GraphRelationship]: @classmethod def get_user_model_key(cls, - email: str=None + email: str = None ) -> str: if not email: return '' @@ -124,17 +125,16 @@ def create_nodes(self) -> List[GraphNode]: node_attributes = { User.USER_NODE_EMAIL: self.email, User.USER_NODE_IS_ACTIVE: self.is_active, + User.USER_NODE_FIRST_NAME: self.first_name or '', + User.USER_NODE_LAST_NAME: self.last_name or '', + User.USER_NODE_FULL_NAME: self.name or '', + User.USER_NODE_GITHUB_NAME: self.github_username or '', + User.USER_NODE_TEAM: self.team_name or '', + User.USER_NODE_EMPLOYEE_TYPE: self.employee_type or '', + User.USER_NODE_SLACK_ID: self.slack_id or '', + User.USER_NODE_ROLE_NAME: self.role_name or '' } - node_attributes[User.USER_NODE_FIRST_NAME] = self.first_name if self.first_name else '' - node_attributes[User.USER_NODE_LAST_NAME] = self.last_name if self.last_name else '' - node_attributes[User.USER_NODE_FULL_NAME] = self.name if self.name else '' - node_attributes[User.USER_NODE_GITHUB_NAME] = self.github_username if self.github_username else '' - node_attributes[User.USER_NODE_TEAM] = self.team_name if self.team_name else '' - node_attributes[User.USER_NODE_EMPLOYEE_TYPE] = self.employee_type if self.employee_type 
else '' - node_attributes[User.USER_NODE_SLACK_ID] = self.slack_id if self.slack_id else '' - node_attributes[User.USER_NODE_ROLE_NAME] = self.role_name if self.role_name else '' - if self.updated_at: node_attributes[User.USER_NODE_UPDATED_AT] = self.updated_at elif not self.do_not_update_empty_attribute: @@ -174,16 +174,6 @@ def create_relation(self) -> List[GraphRelationship]: return [] def __repr__(self) -> str: - return 'User({!r}, {!r}, {!r}, {!r}, {!r}, ' \ - '{!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})'.format(self.first_name, - self.last_name, - self.name, - self.email, - self.github_username, - self.team_name, - self.slack_id, - self.manager_email, - self.employee_type, - self.is_active, - self.updated_at, - self.role_name) + return f'User({self.first_name!r}, {self.last_name!r}, {self.name!r}, {self.email!r}, ' \ + f'{self.github_username!r}, {self.team_name!r}, {self.slack_id!r}, {self.manager_email!r}, ' \ + f'{self.employee_type!r}, {self.is_active!r}, {self.updated_at!r}, {self.role_name!r})' diff --git a/databuilder/databuilder/models/watermark.py b/databuilder/databuilder/models/watermark.py index 53472bc2f2..9c4b751051 100644 --- a/databuilder/databuilder/models/watermark.py +++ b/databuilder/databuilder/models/watermark.py @@ -1,11 +1,13 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from typing import List, Union, Tuple +from typing import ( + List, Tuple, Union, +) -from databuilder.models.graph_serializable import GraphSerializable from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable class Watermark(GraphSerializable): @@ -67,10 +69,7 @@ def get_watermark_model_key(self) -> str: part_type=self.part_type) def get_metadata_model_key(self) -> str: - return '{database}://{cluster}.{schema}/{table}'.format(database=self.database, - cluster=self.cluster, - schema=self.schema, - table=self.table) + return f'{self.database}://{self.cluster}.{self.schema}/{self.table}' def create_nodes(self) -> List[GraphNode]: """ diff --git a/databuilder/databuilder/publisher/base_publisher.py b/databuilder/databuilder/publisher/base_publisher.py index dd6b4ea9d9..13c84a080d 100644 --- a/databuilder/databuilder/publisher/base_publisher.py +++ b/databuilder/databuilder/publisher/base_publisher.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import abc +from typing import List from pyhocon import ConfigTree -from typing import List from databuilder import Scoped from databuilder.callback import call_back diff --git a/databuilder/databuilder/publisher/elasticsearch_publisher.py b/databuilder/databuilder/publisher/elasticsearch_publisher.py index a89650515e..f756759341 100644 --- a/databuilder/databuilder/publisher/elasticsearch_publisher.py +++ b/databuilder/databuilder/publisher/elasticsearch_publisher.py @@ -3,10 +3,10 @@ import json import logging +from typing import List from elasticsearch.exceptions import NotFoundError from pyhocon import ConfigTree -from typing import List from databuilder.publisher.base_publisher import Publisher from databuilder.publisher.elasticsearch_constants import TABLE_ELASTICSEARCH_INDEX_MAPPING @@ -99,7 +99,7 @@ def publish_impl(self) -> None: cnt += 1 if cnt == self.elasticsearch_batch_size: self.elasticsearch_client.bulk(bulk_actions) - LOGGER.info('Publish {} of records to ES'.format(str(cnt))) + LOGGER.info('Publish %i of records to ES', cnt) cnt = 0 bulk_actions = [] diff 
--git a/databuilder/databuilder/publisher/neo4j_csv_publisher.py b/databuilder/databuilder/publisher/neo4j_csv_publisher.py index 434c6b1e2d..494597441c 100644 --- a/databuilder/databuilder/publisher/neo4j_csv_publisher.py +++ b/databuilder/databuilder/publisher/neo4j_csv_publisher.py @@ -1,27 +1,25 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -import pandas import csv import ctypes -from io import open import logging import time +from io import open from os import listdir from os.path import isfile, join -from jinja2 import Template +from typing import List, Set -from neo4j import GraphDatabase, Transaction import neo4j +import pandas +from jinja2 import Template +from neo4j import GraphDatabase, Transaction from neo4j.exceptions import CypherError, TransientError -from pyhocon import ConfigFactory -from pyhocon import ConfigTree -from typing import Set, List +from pyhocon import ConfigFactory, ConfigTree from databuilder.publisher.base_publisher import Publisher from databuilder.publisher.neo4j_preprocessor import NoopRelationPreprocessor - # Setting field_size_limit to solve the error below # _csv.Error: field larger than field limit (131072) # https://stackoverflow.com/a/54517228/5972935 @@ -154,12 +152,11 @@ def init(self, conf: ConfigTree) -> None: self.labels: Set[str] = set() self.publish_tag: str = conf.get_string(JOB_PUBLISH_TAG) if not self.publish_tag: - raise Exception('{} should not be empty'.format(JOB_PUBLISH_TAG)) + raise Exception(f'{JOB_PUBLISH_TAG} should not be empty') self._relation_preprocessor = conf.get(RELATION_PREPROCESSOR) - LOGGER.info('Publishing Node csv files {}, and Relation CSV files {}' - .format(self._node_files, self._relation_files)) + LOGGER.info('Publishing Node csv files %s, and Relation CSV files %s', self._node_files, self._relation_files) def _list_files(self, conf: ConfigTree, path_key: str) -> List[str]: """ @@ -182,11 +179,11 @@ def publish_impl(self) -> None: # noqa: C901 start = time.time() - LOGGER.info('Creating indices using Node files: {}'.format(self._node_files)) + LOGGER.info('Creating indices using Node files: %s', self._node_files) for node_file in self._node_files: self._create_indices(node_file=node_file) - LOGGER.info('Publishing Node files: {}'.format(self._node_files)) + LOGGER.info('Publishing Node files: %s', self._node_files) try: tx = self._session.begin_transaction() while True: @@ -196,7 +193,7 @@ def publish_impl(self) -> None: # noqa: C901 except StopIteration: break - LOGGER.info('Publishing Relationship files: {}'.format(self._relation_files)) + LOGGER.info('Publishing Relationship files: %s', self._relation_files) while True: try: relation_file = next(self._relation_files_iter) @@ -205,10 +202,10 @@ def publish_impl(self) -> None: # noqa: C901 break tx.commit() - LOGGER.info('Committed total {} statements'.format(self._count)) + LOGGER.info('Committed total %i statements', self._count) # TODO: Add statsd support - LOGGER.info('Successfully published. Elapsed: {} seconds'.format(time.time() - start)) + LOGGER.info('Successfully published. Elapsed: %i seconds', time.time() - start) except Exception as e: LOGGER.exception('Failed to publish. 
Rolling back.') if not tx.closed(): @@ -305,7 +302,7 @@ def _publish_relation(self, relation_file: str, tx: Transaction) -> Transaction: """ if self._relation_preprocessor.is_perform_preprocess(): - LOGGER.info('Pre-processing relation with {}'.format(self._relation_preprocessor)) + LOGGER.info('Pre-processing relation with %s', self._relation_preprocessor) count = 0 with open(relation_file, 'r', encoding='utf8') as relation_csv: @@ -323,7 +320,7 @@ def _publish_relation(self, relation_file: str, tx: Transaction) -> Transaction: tx = self._execute_statement(stmt, tx=tx, params=params) count += 1 - LOGGER.info('Executed pre-processing Cypher statement {} times'.format(count)) + LOGGER.info('Executed pre-processing Cypher statement %i times', count) with open(relation_file, 'r', encoding='utf8') as relation_csv: for rel_record in pandas.read_csv(relation_csv, na_filter=False).to_dict(orient="records"): @@ -337,7 +334,7 @@ def _publish_relation(self, relation_file: str, tx: Transaction) -> Transaction: expect_result=self._confirm_rel_created) exception_exists = False except TransientError as e: - if rel_record[RELATION_START_LABEL] in self.deadlock_node_labels\ + if rel_record[RELATION_START_LABEL] in self.deadlock_node_labels \ or rel_record[RELATION_END_LABEL] in self.deadlock_node_labels: time.sleep(SLEEP_TIME) retries_for_exception -= 1 @@ -378,8 +375,6 @@ def _create_props_param(self, record_dict: dict) -> dict: for k, v in record_dict.items(): if k.endswith(UNQUOTED_SUFFIX): k = k[:-len(UNQUOTED_SUFFIX)] - else: - v = '{val}'.format(val=v) params[k] = v return params @@ -407,23 +402,18 @@ def _create_props_body(self, if k.endswith(UNQUOTED_SUFFIX): k = k[:-len(UNQUOTED_SUFFIX)] - props.append('{id}.{key} = {val}'.format(id=identifier, key=k, val=f'${k}')) - - props.append("""{id}.{key} = '{val}'""".format(id=identifier, - key=PUBLISHED_TAG_PROPERTY_NAME, - val=self.publish_tag)) + props.append(f'{identifier}.{k} = ${k}') - props.append("""{id}.{key} = {val}""".format(id=identifier, - key=LAST_UPDATED_EPOCH_MS, - val='timestamp()')) + props.append(f"{identifier}.{PUBLISHED_TAG_PROPERTY_NAME} = '{self.publish_tag}'") + props.append(f"{identifier}.{LAST_UPDATED_EPOCH_MS} = timestamp()") return ', '.join(props) def _execute_statement(self, stmt: str, tx: Transaction, - params: dict=None, - expect_result: bool=False) -> Transaction: + params: dict = None, + expect_result: bool = False) -> Transaction: """ Executes statement against Neo4j. If execution fails, it rollsback and raise exception. If 'expect_result' flag is True, it confirms if result object is not null. 
@@ -434,21 +424,20 @@ def _execute_statement(self, :return: """ try: - if LOGGER.isEnabledFor(logging.DEBUG): - LOGGER.debug('Executing statement: {} with params {}'.format(stmt, params)) + LOGGER.debug('Executing statement: %s with params %s', stmt, params) result = tx.run(str(stmt).encode('utf-8', 'ignore'), parameters=params) if expect_result and not result.single(): - raise RuntimeError('Failed to executed statement: {}'.format(stmt)) + raise RuntimeError(f'Failed to execute statement: {stmt}') self._count += 1 if self._count > 1 and self._count % self._transaction_size == 0: tx.commit() - LOGGER.info('Committed {} statements so far'.format(self._count)) + LOGGER.info('Committed %i statements so far', self._count) return self._session.begin_transaction() if self._count > 1 and self._count % self._progress_report_frequency == 0: - LOGGER.info('Processed {} statements so far'.format(self._count)) + LOGGER.info('Processed %i statements so far', self._count) return tx except Exception as e: @@ -468,8 +457,7 @@ def _try_create_index(self, label: str) -> None: CREATE CONSTRAINT ON (node:{{ LABEL }}) ASSERT node.key IS UNIQUE """).render(LABEL=label) - LOGGER.info('Trying to create index for label {label} if not exist: {stmt}'.format(label=label, - stmt=stmt)) + LOGGER.info('Trying to create index for label %s if not exist: %s', label, stmt) with self._driver.session() as session: try: session.run(stmt) diff --git a/databuilder/databuilder/publisher/neo4j_preprocessor.py b/databuilder/databuilder/publisher/neo4j_preprocessor.py index aee52a371c..543cf90621 100644 --- a/databuilder/databuilder/publisher/neo4j_preprocessor.py +++ b/databuilder/databuilder/publisher/neo4j_preprocessor.py @@ -2,10 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import abc - import logging -from typing import Dict, List, Tuple, Optional import textwrap +from typing import ( + Dict, List, Optional, Tuple, +) LOGGER = logging.getLogger(__name__) @@ -170,7 +171,7 @@ def preprocess_cypher_impl(self, """ if not (start_label or end_label or start_key or end_key): - raise Exception('all labels and keys are required: {}'.format(locals())) + raise Exception(f'all labels and keys are required: {locals()}') params = {'start_key': start_key, 'end_key': end_key} return DeleteRelationPreprocessor.RELATION_MERGE_TEMPLATE.format(start_label=start_label, diff --git a/databuilder/databuilder/rest_api/base_rest_api_query.py b/databuilder/databuilder/rest_api/base_rest_api_query.py index 8f6756b07f..18ce6b8534 100644 --- a/databuilder/databuilder/rest_api/base_rest_api_query.py +++ b/databuilder/databuilder/rest_api/base_rest_api_query.py @@ -3,8 +3,9 @@ import abc import logging - -from typing import Iterable, Any, Dict, Iterator +from typing import ( + Any, Dict, Iterable, Iterator, +) LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/rest_api/mode_analytics/mode_paginated_rest_api_query.py b/databuilder/databuilder/rest_api/mode_analytics/mode_paginated_rest_api_query.py index 707e7cacf8..5937b313ec 100644 --- a/databuilder/databuilder/rest_api/mode_analytics/mode_paginated_rest_api_query.py +++ b/databuilder/databuilder/rest_api/mode_analytics/mode_paginated_rest_api_query.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Any, Dict import requests from jsonpath_rw import parse -from typing import Any, Dict from databuilder.rest_api.rest_api_query import RestApiQuery @@ -49,12 +49,10 @@ def _preprocess_url(self, page_suffix = 
PAGE_SUFFIX_TEMPLATE.format(self._current_page) # example: ?page=2 # example: http://foo.bar/resources?page=2 - self._url = self._original_url + '{page_suffix}'.format(page_suffix=page_suffix) + self._url = f"{self._original_url}{page_suffix}" return self._url.format(**record) - def _post_process(self, - response: requests.Response, - ) -> None: + def _post_process(self, response: requests.Response, ) -> None: """ Updates trigger to pagination (self._more_pages) as well as current_page (self._current_page) Mode does not have explicit indicator that it just the number of records need to be certain number that diff --git a/databuilder/databuilder/rest_api/rest_api_failure_handlers.py b/databuilder/databuilder/rest_api/rest_api_failure_handlers.py index 3e01bc72d3..b551925adc 100644 --- a/databuilder/databuilder/rest_api/rest_api_failure_handlers.py +++ b/databuilder/databuilder/rest_api/rest_api_failure_handlers.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import abc - from typing import Iterable diff --git a/databuilder/databuilder/rest_api/rest_api_query.py b/databuilder/databuilder/rest_api/rest_api_query.py index 39f9ece7da..9282333945 100644 --- a/databuilder/databuilder/rest_api/rest_api_query.py +++ b/databuilder/databuilder/rest_api/rest_api_query.py @@ -3,11 +3,13 @@ import copy import logging +from typing import ( + Any, Callable, Dict, Iterator, List, Union, +) import requests from jsonpath_rw import parse from retrying import retry -from typing import List, Dict, Any, Union, Iterator, Callable from databuilder.rest_api.base_rest_api_query import BaseRestApiQuery @@ -56,10 +58,10 @@ def __init__(self, params: Dict[str, Any], json_path: str, field_names: List[str], - fail_no_result: bool=False, - skip_no_result: bool=False, - json_path_contains_or: bool=False, - can_skip_failure: Callable=None, + fail_no_result: bool = False, + skip_no_result: bool = False, + json_path_contains_or: bool = False, + can_skip_failure: Callable = None, **kwargs: Any ) -> None: """ @@ -153,8 +155,8 @@ def execute(self) -> Iterator[Dict[str, Any]]: # noqa: C901 result_list: List[Any] = [match.value for match in self._jsonpath_expr.find(response_json)] if not result_list: - log_msg = 'No result from URL: {url}, JSONPATH: {json_path} , response payload: {response}' \ - .format(url=self._url, json_path=self._json_path, response=response_json) + log_msg = f'No result from URL: {self._url}, JSONPATH: {self._json_path} , ' \ + f'response payload: {response_json}' LOGGER.info(log_msg) self._post_process(response) @@ -197,7 +199,7 @@ def _send_request(self, url: str) -> requests.Response: :param url: :return: """ - LOGGER.info('Calling URL {}'.format(url)) + LOGGER.info('Calling URL %s', url) response = requests.get(url, **self._params) response.raise_for_status() return response @@ -206,7 +208,7 @@ def _send_request(self, url: str) -> requests.Response: def _compute_sub_records(cls, result_list: List[Any], field_names: List[str], - json_path_contains_or: bool=False, + json_path_contains_or: bool = False, ) -> List[List[Any]]: """ The behavior of JSONPATH is different when it's extracting multiple fields using AND(,) vs OR(|) diff --git a/databuilder/databuilder/serializers/neo4_serializer.py b/databuilder/databuilder/serializers/neo4_serializer.py index baab36c0dc..7ee90dbd9f 100644 --- a/databuilder/databuilder/serializers/neo4_serializer.py +++ b/databuilder/databuilder/serializers/neo4_serializer.py @@ -1,19 +1,15 @@ # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Any, Optional +from typing import ( + Any, Dict, Optional, +) -from databuilder.models.graph_relationship import GraphRelationship from databuilder.models.graph_node import GraphNode +from databuilder.models.graph_relationship import GraphRelationship from databuilder.models.graph_serializable import ( - NODE_LABEL, - NODE_KEY, - RELATION_END_KEY, - RELATION_END_LABEL, - RELATION_REVERSE_TYPE, - RELATION_START_KEY, - RELATION_START_LABEL, - RELATION_TYPE + NODE_KEY, NODE_LABEL, RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, + RELATION_START_LABEL, RELATION_TYPE, ) from databuilder.publisher.neo4j_csv_publisher import UNQUOTED_SUFFIX @@ -28,10 +24,7 @@ def serialize_node(node: Optional[GraphNode]) -> Dict[str, Any]: } for key, value in node.attributes.items(): key_suffix = _get_neo4j_suffix_value(value) - formatted_key = "{key}{suffix}".format( - key=key, - suffix=key_suffix - ) + formatted_key = f'{key}{key_suffix}' node_dict[formatted_key] = value return node_dict @@ -50,10 +43,7 @@ def serialize_relationship(relationship: Optional[GraphRelationship]) -> Dict[st } for key, value in relationship.attributes.items(): key_suffix = _get_neo4j_suffix_value(value) - formatted_key = "{key}{suffix}".format( - key=key, - suffix=key_suffix - ) + formatted_key = f'{key}{key_suffix}' relationship_dict[formatted_key] = value return relationship_dict diff --git a/databuilder/databuilder/task/neo4j_staleness_removal_task.py b/databuilder/databuilder/task/neo4j_staleness_removal_task.py index dc15127163..738f9f2779 100644 --- a/databuilder/databuilder/task/neo4j_staleness_removal_task.py +++ b/databuilder/databuilder/task/neo4j_staleness_removal_task.py @@ -4,11 +4,13 @@ import logging import textwrap import time +from typing import ( + Any, Dict, Iterable, +) -from neo4j import GraphDatabase import neo4j +from neo4j import GraphDatabase from pyhocon import ConfigFactory, ConfigTree -from typing import Any, Dict, Iterable from databuilder import Scoped from databuilder.publisher.neo4j_csv_publisher import JOB_PUBLISH_TAG @@ -80,13 +82,13 @@ def init(self, conf: ConfigTree) -> None: self.staleness_pct_dict = conf.get(STALENESS_PCT_MAX_DICT) if JOB_PUBLISH_TAG in conf and MS_TO_EXPIRE in conf: - raise Exception('Cannot have both {} and {} in job config'.format(JOB_PUBLISH_TAG, MS_TO_EXPIRE)) + raise Exception(f'Cannot have both {JOB_PUBLISH_TAG} and {MS_TO_EXPIRE} in job config') self.ms_to_expire = None if MS_TO_EXPIRE in conf: self.ms_to_expire = conf.get_int(MS_TO_EXPIRE) if self.ms_to_expire < conf.get_int(MIN_MS_TO_EXPIRE): - raise Exception('{} is too small'.format(MS_TO_EXPIRE)) + raise Exception(f'{MS_TO_EXPIRE} is too small') self.marker = self.ms_to_expire else: self.marker = conf.get_string(JOB_PUBLISH_TAG) @@ -139,13 +141,13 @@ def _decorate_staleness(self, :return: """ if self.ms_to_expire: - return statement.format(textwrap.dedent(""" - n.publisher_last_updated_epoch_ms < (timestamp() - ${marker}) - OR NOT EXISTS(n.publisher_last_updated_epoch_ms)""".format(marker=MARKER_VAR_NAME))) + return statement.format(textwrap.dedent(f""" + n.publisher_last_updated_epoch_ms < (timestamp() - ${MARKER_VAR_NAME}) + OR NOT EXISTS(n.publisher_last_updated_epoch_ms)""")) - return statement.format(textwrap.dedent(""" - n.published_tag <> ${marker} - OR NOT EXISTS(n.published_tag)""".format(marker=MARKER_VAR_NAME))) + return statement.format(textwrap.dedent(f""" + n.published_tag <> ${MARKER_VAR_NAME} + OR NOT 
EXISTS(n.published_tag)""")) def _delete_stale_relations(self) -> None: statement = textwrap.dedent(""" @@ -168,7 +170,7 @@ def _batch_delete(self, :return: """ for t in targets: - LOGGER.info('Deleting stale data of {} with batch size {}'.format(t, self.batch_size)) + LOGGER.info('Deleting stale data of %s with batch size %i', t, self.batch_size) total_count = 0 while True: results = self._execute_cypher_query(statement=statement.format(type=t), @@ -180,7 +182,7 @@ def _batch_delete(self, total_count = total_count + count if count == 0: break - LOGGER.info('Deleted {} stale data of {}'.format(total_count, t)) + LOGGER.info('Deleted %i stale data of %s', total_count, t) def _validate_staleness_pct(self, total_records: Iterable[Dict[str, Any]], @@ -203,8 +205,8 @@ def _validate_staleness_pct(self, threshold = self.staleness_pct_dict.get(type_str, self.staleness_pct) if stale_pct >= threshold: - raise Exception('Staleness percentage of {} is {} %. Stopping due to over threshold {} %' - .format(type_str, stale_pct, threshold)) + raise Exception(f'Staleness percentage of {type_str} is {stale_pct} %. ' + f'Stopping due to over threshold {threshold} %') def _validate_node_staleness_pct(self) -> None: total_nodes_statement = textwrap.dedent(""" @@ -252,11 +254,10 @@ def _validate_relation_staleness_pct(self) -> None: def _execute_cypher_query(self, statement: str, - param_dict: Dict[str, Any]={}, - dry_run: bool=False + param_dict: Dict[str, Any] = {}, + dry_run: bool = False ) -> Iterable[Dict[str, Any]]: - LOGGER.info('Executing Cypher query: {statement} with params {params}: '.format(statement=statement, - params=param_dict)) + LOGGER.info('Executing Cypher query: %s with params %s: ', statement, param_dict) if dry_run: LOGGER.info('Skipping for it is a dryrun') @@ -268,5 +269,4 @@ def _execute_cypher_query(self, return session.run(statement, **param_dict) finally: - if LOGGER.isEnabledFor(logging.DEBUG): - LOGGER.debug('Cypher query execution elapsed for {} seconds'.format(time.time() - start)) + LOGGER.debug('Cypher query execution elapsed for %i seconds', time.time() - start) diff --git a/databuilder/databuilder/task/task.py b/databuilder/databuilder/task/task.py index f600fda514..44a6cfa7b7 100644 --- a/databuilder/databuilder/task/task.py +++ b/databuilder/databuilder/task/task.py @@ -9,12 +9,9 @@ from databuilder.extractor.base_extractor import Extractor from databuilder.loader.base_loader import Loader from databuilder.task.base_task import Task -from databuilder.transformer.base_transformer import Transformer -from databuilder.transformer.base_transformer \ - import NoopTransformer +from databuilder.transformer.base_transformer import NoopTransformer, Transformer from databuilder.utils.closer import Closer - LOGGER = logging.getLogger(__name__) @@ -42,7 +39,7 @@ def __init__(self, def init(self, conf: ConfigTree) -> None: self._progress_report_frequency = \ - conf.get_int('{}.{}'.format(self.get_scope(), DefaultTask.PROGRESS_REPORT_FREQUENCY), 500) + conf.get_int(f'{self.get_scope()}.{DefaultTask.PROGRESS_REPORT_FREQUENCY}', 500) self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope())) self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope())) @@ -66,7 +63,7 @@ def run(self) -> None: record = self.extractor.extract() count += 1 if count > 0 and count % self._progress_report_frequency == 0: - LOGGER.info('Extracted {} records so far'.format(count)) + LOGGER.info('Extracted %i records so far', count) finally: self._closer.close() diff --git 
a/databuilder/databuilder/transformer/base_transformer.py b/databuilder/databuilder/transformer/base_transformer.py index d1397fad13..769966e1ad 100644 --- a/databuilder/databuilder/transformer/base_transformer.py +++ b/databuilder/databuilder/transformer/base_transformer.py @@ -2,9 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import abc +from typing import ( + Any, Iterable, Optional, +) from pyhocon import ConfigTree -from typing import Any, Iterable, Optional from databuilder import Scoped diff --git a/databuilder/databuilder/transformer/bigquery_usage_transformer.py b/databuilder/databuilder/transformer/bigquery_usage_transformer.py index a401b6d372..b79ea5daca 100644 --- a/databuilder/databuilder/transformer/bigquery_usage_transformer.py +++ b/databuilder/databuilder/transformer/bigquery_usage_transformer.py @@ -1,12 +1,13 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from pyhocon import ConfigTree from typing import Optional, Tuple -from databuilder.transformer.base_transformer import Transformer -from databuilder.models.table_column_usage import ColumnReader, TableColumnUsage +from pyhocon import ConfigTree + from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple +from databuilder.models.table_column_usage import ColumnReader, TableColumnUsage +from databuilder.transformer.base_transformer import Transformer class BigqueryUsageTransformer(Transformer): diff --git a/databuilder/databuilder/transformer/dict_to_model.py b/databuilder/databuilder/transformer/dict_to_model.py index 91c76b339e..c9dfd6cdd8 100644 --- a/databuilder/databuilder/transformer/dict_to_model.py +++ b/databuilder/databuilder/transformer/dict_to_model.py @@ -3,9 +3,9 @@ import importlib import logging +from typing import Any, Dict from pyhocon import ConfigTree -from typing import Any, Dict from databuilder.transformer.base_transformer import Transformer diff --git a/databuilder/databuilder/transformer/generic_transformer.py b/databuilder/databuilder/transformer/generic_transformer.py index 16620c55db..665e984160 100644 --- a/databuilder/databuilder/transformer/generic_transformer.py +++ b/databuilder/databuilder/transformer/generic_transformer.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Any, Dict from pyhocon import ConfigTree -from typing import Any, Dict from databuilder.transformer.base_transformer import Transformer diff --git a/databuilder/databuilder/transformer/regex_str_replace_transformer.py b/databuilder/databuilder/transformer/regex_str_replace_transformer.py index 886d9c7b48..ce229034b6 100644 --- a/databuilder/databuilder/transformer/regex_str_replace_transformer.py +++ b/databuilder/databuilder/transformer/regex_str_replace_transformer.py @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from pyhocon import ConfigTree from typing import Any -from databuilder.transformer.base_transformer import Transformer +from pyhocon import ConfigTree +from databuilder.transformer.base_transformer import Transformer LOGGER = logging.getLogger(__name__) diff --git a/databuilder/databuilder/transformer/table_tag_transformer.py b/databuilder/databuilder/transformer/table_tag_transformer.py index 17869605ce..288ec31148 100644 --- a/databuilder/databuilder/transformer/table_tag_transformer.py +++ b/databuilder/databuilder/transformer/table_tag_transformer.py @@ -1,11 +1,12 @@ # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -from pyhocon import ConfigFactory, ConfigTree from typing import Any -from databuilder.transformer.base_transformer import Transformer +from pyhocon import ConfigFactory, ConfigTree + from databuilder.models.table_metadata import TableMetadata +from databuilder.transformer.base_transformer import Transformer class TableTagTransformer(Transformer): diff --git a/databuilder/databuilder/transformer/template_variable_substitution_transformer.py b/databuilder/databuilder/transformer/template_variable_substitution_transformer.py index 8a9c1ba11a..cc90599f3c 100644 --- a/databuilder/databuilder/transformer/template_variable_substitution_transformer.py +++ b/databuilder/databuilder/transformer/template_variable_substitution_transformer.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Any, Dict from pyhocon import ConfigTree -from typing import Any, Dict from databuilder.transformer.base_transformer import Transformer diff --git a/databuilder/databuilder/transformer/timestamp_string_to_epoch.py b/databuilder/databuilder/transformer/timestamp_string_to_epoch.py index e96b8e6d6f..46c297157a 100644 --- a/databuilder/databuilder/transformer/timestamp_string_to_epoch.py +++ b/databuilder/databuilder/transformer/timestamp_string_to_epoch.py @@ -3,11 +3,10 @@ import logging from datetime import datetime - -from pyhocon import ConfigFactory -from pyhocon import ConfigTree from typing import Any, Dict +from pyhocon import ConfigFactory, ConfigTree + from databuilder.transformer.base_transformer import Transformer TIMESTAMP_FORMAT = 'timestamp_format' diff --git a/databuilder/databuilder/utils/closer.py b/databuilder/databuilder/utils/closer.py index a735c34d62..e36262b4f9 100644 --- a/databuilder/databuilder/utils/closer.py +++ b/databuilder/databuilder/utils/closer.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import atexit - from typing import Callable, List @@ -27,8 +26,7 @@ def register(self, close_callable: Callable) -> None: :return: None """ if not callable(close_callable): - raise RuntimeError('Only callable can be registered: {}'.format( - close_callable)) + raise RuntimeError(f'Only callable can be registered: {close_callable}') self._stack.append(close_callable) diff --git a/databuilder/example/dags/athena_sample_dag.py b/databuilder/example/dags/athena_sample_dag.py index 2f64490c65..616846f1da 100644 --- a/databuilder/example/dags/athena_sample_dag.py +++ b/databuilder/example/dags/athena_sample_dag.py @@ -1,29 +1,28 @@ # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -import textwrap -from datetime import datetime, timedelta import uuid +from datetime import datetime, timedelta -from elasticsearch import Elasticsearch from airflow import DAG # noqa from airflow import macros # noqa from airflow.operators.python_operator import PythonOperator # noqa +from elasticsearch import Elasticsearch from pyhocon import ConfigFactory -from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor + from databuilder.extractor.athena_metadata_extractor import AthenaMetadataExtractor -from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.extractor.neo4j_extractor import Neo4jExtractor +from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor +from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader from databuilder.publisher import neo4j_csv_publisher +from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask from databuilder.transformer.base_transformer import NoopTransformer - dag_args = { 'concurrency': 10, # One dagrun at a time @@ -63,7 +62,6 @@ # String format - ('schema1', schema2', .... 'schemaN') SUPPORTED_SCHEMA_SQL_IN_CLAUSE = "('{schemas}')".format(schemas="', '".join(SUPPORTED_SCHEMAS)) - OPTIONAL_TABLE_NAMES = '' AWS_ACCESS = 'YOUR_ACCESS_KEY' AWS_SECRET = 'YOUR_SECRET_KEY' @@ -78,36 +76,24 @@ def connection_string(): def create_table_extract_job(): - where_clause_suffix = textwrap.dedent(""" - where table_schema in {schemas} - """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE) + where_clause_suffix = f"where table_schema in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}" tmp_folder = '/var/tmp/amundsen/table_metadata' - node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes/' + relationship_files_folder = f'{tmp_folder}/relationships/' job_config = ConfigFactory.from_dict({ - 'extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): - where_clause_suffix, - 'extractor.athena_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - connection_string(), - 'extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.CATALOG_KEY): "'AwsDataCatalog'", - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): - 'unique_tag', # should use unique tag here 
like {ds} + f'extractor.athena_metadata.{AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix, + f'extractor.athena_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(), + f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}': "'AwsDataCatalog'", + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag', # should use unique tag here like {ds} }) job = DefaultJob(conf=job_config, task=DefaultTask(extractor=AthenaMetadataExtractor(), loader=FsNeo4jCSVLoader(), @@ -134,24 +120,22 @@ def create_es_publisher_sample_job(): elasticsearch_index_alias = 'table_search_index' job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': 'databuilder.models.table_elasticsearch_document.TableESDocument', - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_new_index_key_type, - 
'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias }) diff --git a/databuilder/example/dags/hive_sample_dag.py b/databuilder/example/dags/hive_sample_dag.py index f6ec743ff9..063f2a4d18 100644 --- a/databuilder/example/dags/hive_sample_dag.py +++ b/databuilder/example/dags/hive_sample_dag.py @@ -13,14 +13,13 @@ from databuilder.extractor.hive_table_metadata_extractor import HiveTableMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.job.job import DefaultJob -from databuilder.models.table_metadata import DESCRIPTION_NODE_LABEL from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader +from databuilder.models.table_metadata import DESCRIPTION_NODE_LABEL from databuilder.publisher import neo4j_csv_publisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask from databuilder.transformer.base_transformer import NoopTransformer - dag_args = { 'concurrency': 10, # One dagrun at a time @@ -57,6 +56,8 @@ # String format - ('schema1', schema2', .... 'schemaN') SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE = "('{schemas}')".format(schemas="', '".join(SUPPORTED_HIVE_SCHEMAS)) +LOGGER = logging.getLogger(__name__) + # Todo: user needs to modify and provide a hivemetastore connection string def connection_string(): @@ -85,11 +86,10 @@ def create_table_wm_job(**kwargs): watermark=kwargs['templates_dict'].get('watermark_type'), schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE) - logging.info('SQL query: {}'.format(sql)) - tmp_folder = '/var/tmp/amundsen/table_{hwm}'.format(hwm=kwargs['templates_dict'] - .get('watermark_type').strip("\"")) - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + LOGGER.info('SQL query: %s', sql) + tmp_folder = '/var/tmp/amundsen/table_{hwm}'.format(hwm=kwargs['templates_dict'].get('watermark_type').strip("\"")) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' hwm_extractor = SQLAlchemyExtractor() csv_loader = FsNeo4jCSVLoader() @@ -99,23 +99,16 @@ def create_table_wm_job(**kwargs): transformer=NoopTransformer()) job_config = ConfigFactory.from_dict({ - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): connection_string(), - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL): sql, + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(), + f'extractor.sqlalchemy.{SQLAlchemyExtractor.EXTRACT_SQL}': sql, 'extractor.sqlalchemy.model_class': 'databuilder.models.watermark.Watermark', - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': 
node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, }) job = DefaultJob(conf=job_config, task=task, @@ -139,32 +132,21 @@ def create_table_metadata_databuilder_job(): """).format(schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE) tmp_folder = '/var/tmp/amundsen/table_metadata' - node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes/' + relationship_files_folder = f'{tmp_folder}/relationships/' job_config = ConfigFactory.from_dict({ - 'extractor.hive_table_metadata.{}'.format(HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): - where_clause_suffix, - 'extractor.hive_table_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - connection_string(), - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_CREATE_ONLY_NODES): - [DESCRIPTION_NODE_LABEL], - 'publisher.neo4j.job_publish_tag': - 'some_unique_tag' # TO-DO unique tag must be added + f'extractor.hive_table_metadata.{HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix, + f'extractor.hive_table_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(), + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_CREATE_ONLY_NODES}': [DESCRIPTION_NODE_LABEL], + 'publisher.neo4j.job_publish_tag': 'some_unique_tag' # TO-DO unique tag must be added }) job = DefaultJob(conf=job_config, @@ -174,7 +156,6 @@ def create_table_metadata_databuilder_job(): with DAG('amundsen_databuilder', default_args=default_args, **dag_args) as dag: - amundsen_databuilder_table_metadata_job = PythonOperator( task_id='amundsen_databuilder_table_metadata_job', python_callable=create_table_metadata_databuilder_job @@ -187,7 +168,7 @@ def create_table_metadata_databuilder_job(): provide_context=True, 
templates_dict={'agg_func': 'max', 'watermark_type': '"high_watermark"', - 'part_regex': '{}'.format('{{ ds }}')} + 'part_regex': '{{ ds }}'} ) # calculate hive low watermark @@ -197,7 +178,7 @@ def create_table_metadata_databuilder_job(): provide_context=True, templates_dict={'agg_func': 'min', 'watermark_type': '"low_watermark"', - 'part_regex': '{}'.format('{{ ds }}')} + 'part_regex': '{{ ds }}'} ) # Schedule high and low watermark task after metadata task diff --git a/databuilder/example/dags/postgres_sample_dag.py b/databuilder/example/dags/postgres_sample_dag.py index 403e97877b..93137ab72e 100644 --- a/databuilder/example/dags/postgres_sample_dag.py +++ b/databuilder/example/dags/postgres_sample_dag.py @@ -1,29 +1,28 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -import textwrap -from datetime import datetime, timedelta import uuid +from datetime import datetime, timedelta -from elasticsearch import Elasticsearch from airflow import DAG # noqa from airflow import macros # noqa from airflow.operators.python_operator import PythonOperator # noqa +from elasticsearch import Elasticsearch from pyhocon import ConfigFactory + +from databuilder.extractor.neo4j_extractor import Neo4jExtractor from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.extractor.postgres_metadata_extractor import PostgresMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher -from databuilder.extractor.neo4j_extractor import Neo4jExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader from databuilder.publisher import neo4j_csv_publisher +from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask from databuilder.transformer.base_transformer import NoopTransformer - dag_args = { 'concurrency': 10, # One dagrun at a time @@ -76,37 +75,24 @@ def connection_string(): def create_table_extract_job(): - where_clause_suffix = textwrap.dedent(""" - where table_schema in {schemas} - """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE) + where_clause_suffix = f'where table_schema in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}' tmp_folder = '/var/tmp/amundsen/table_metadata' - node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes/' + relationship_files_folder = f'{tmp_folder}/relationships/' job_config = ConfigFactory.from_dict({ - 'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): - where_clause_suffix, - 'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): - True, - 'extractor.postgres_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - connection_string(), - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 
'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): - 'unique_tag', # should use unique tag here like {ds} + f'extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix, + f'extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True, + f'extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(), + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag', # should use unique tag here like {ds} }) job = DefaultJob(conf=job_config, task=DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()), @@ -125,31 +111,29 @@ def create_es_publisher_sample_job(): # elastic search client instance elasticsearch_client = es # unique name of new index in Elasticsearch - elasticsearch_new_index_key = 'tables' + str(uuid.uuid4()) + elasticsearch_new_index_key = f'tables{uuid.uuid4()}' # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38 elasticsearch_new_index_key_type = 'table' # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index elasticsearch_index_alias = 'table_search_index' job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': 'databuilder.models.table_elasticsearch_document.TableESDocument', - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password, + 
f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_new_index_key_type, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias }) @@ -160,7 +144,6 @@ def create_es_publisher_sample_job(): with DAG('amundsen_databuilder', default_args=default_args, **dag_args) as dag: - postgres_table_extract_job = PythonOperator( task_id='postgres_table_extract_job', python_callable=create_table_extract_job diff --git a/databuilder/example/dags/snowflake_sample_dag.py b/databuilder/example/dags/snowflake_sample_dag.py index efbfca5046..7ad3ad4ac3 100644 --- a/databuilder/example/dags/snowflake_sample_dag.py +++ b/databuilder/example/dags/snowflake_sample_dag.py @@ -7,25 +7,25 @@ """ import textwrap -from datetime import datetime, timedelta import uuid +from datetime import datetime, timedelta from airflow import DAG # noqa from airflow import macros # noqa from airflow.operators.python_operator import PythonOperator # noqa +from elasticsearch import Elasticsearch from pyhocon import ConfigFactory -from elasticsearch import Elasticsearch -from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor +from databuilder.extractor.neo4j_extractor import Neo4jExtractor +from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.extractor.snowflake_metadata_extractor import SnowflakeMetadataExtractor +from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.job.job import DefaultJob +from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader from databuilder.publisher import neo4j_csv_publisher -from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher -from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor -from databuilder.extractor.neo4j_extractor import Neo4jExtractor -from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher +from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask from databuilder.transformer.base_transformer import NoopTransformer @@ -63,7 +63,6 @@ {'host': 'elasticsearch'}, ]) - # TODO: user provides a list of schema for indexing SUPPORTED_SCHEMAS = ['public'] SUPPORTED_SCHEMA_SQL_IN_CLAUSE = "('{schemas}')".format(schemas="', '".join(SUPPORTED_SCHEMAS)) @@ -81,13 
+80,7 @@ def connection_string(): account = 'YourSnowflakeAccountHere' # specify a warehouse to connect to. warehouse = 'yourwarehouse' - return 'snowflake://{user}:{password}@{account}/{database}?warehouse={warehouse}'.format( - user=user, - password=password, - account=account, - database=SNOWFLAKE_DATABASE_KEY, - warehouse=warehouse, - ) + return f'snowflake://{user}:{password}@{account}/{SNOWFLAKE_DATABASE_KEY}?warehouse={warehouse}' def create_snowflake_table_metadata_job(): @@ -102,32 +95,21 @@ def create_snowflake_table_metadata_job(): """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE) tmp_folder = '/var/tmp/amundsen/table_metadata' - node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes/' + relationship_files_folder = f'{tmp_folder}/relationships/' job_config = ConfigFactory.from_dict({ - 'extractor.snowflake.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - connection_string(), - 'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY): - SNOWFLAKE_DATABASE_KEY, - 'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): - where_clause_suffix, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): - 'some_unique_tag' # TO-DO unique tag must be added + f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(), + f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': SNOWFLAKE_DATABASE_KEY, + f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'some_unique_tag' # TO-DO unique tag must be added }) job = DefaultJob(conf=job_config, @@ -159,29 +141,22 @@ def create_snowflake_es_publisher_job(): elasticsearch_index_alias = 'table_search_index' job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): - neo4j_endpoint, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint, + 
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': 'databuilder.models.table_elasticsearch_document.TableESDocument', - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): - neo4j_user, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): - neo4j_password, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): - 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): - 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_new_index_key_type, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias }) @@ -192,7 +167,6 @@ def create_snowflake_es_publisher_job(): with DAG('amundsen_databuilder', default_args=default_args, **dag_args) as dag: - snowflake_table_metadata_job = PythonOperator( task_id='snowflake_table_metadata_extract_job', python_callable=create_snowflake_table_metadata_job diff --git a/databuilder/example/scripts/sample_bigquery_metadata.py b/databuilder/example/scripts/sample_bigquery_metadata.py index 98d4708dd0..bb3c60bc0c 100644 --- a/databuilder/example/scripts/sample_bigquery_metadata.py +++ b/databuilder/example/scripts/sample_bigquery_metadata.py @@ -7,9 +7,10 @@ import logging import os -from pyhocon import ConfigFactory import sqlite3 +from pyhocon import ConfigFactory + from databuilder.extractor.bigquery_metadata_extractor import BigQueryMetadataExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader @@ -21,7 +22,7 @@ logging.basicConfig(level=logging.INFO) # set env NEO4J_HOST to override localhost -NEO4J_ENDPOINT = 'bolt://{}:7687'.format(os.getenv('NEO4J_HOST', 'localhost')) +NEO4J_ENDPOINT = f'bolt://{os.getenv("NEO4J_HOST", "localhost")}:7687' neo4j_endpoint = NEO4J_ENDPOINT neo4j_user = 'neo4j' @@ -39,9 +40,9 @@ def create_connection(db_file): # todo: Add a second model def create_bq_job(metadata_type, gcloud_project): - 
tmp_folder = '/var/tmp/amundsen/{metadata_type}'.format(metadata_type=metadata_type) - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + tmp_folder = f'/var/tmp/amundsen/{metadata_type}' + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' bq_meta_extractor = BigQueryMetadataExtractor() csv_loader = FsNeo4jCSVLoader() @@ -51,26 +52,16 @@ def create_bq_job(metadata_type, gcloud_project): transformer=NoopTransformer()) job_config = ConfigFactory.from_dict({ - 'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY): - gcloud_project, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): - True, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): - 'unique_tag', # should use unique tag here like {ds} + f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': gcloud_project, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag', # should use unique tag here like {ds} }) job = DefaultJob(conf=job_config, task=task, diff --git a/databuilder/example/scripts/sample_bq_usage_loader.py b/databuilder/example/scripts/sample_bq_usage_loader.py index e8a148037b..ace31d2ca0 100644 --- a/databuilder/example/scripts/sample_bq_usage_loader.py +++ b/databuilder/example/scripts/sample_bq_usage_loader.py @@ -7,9 +7,10 @@ import logging import os -from pyhocon import ConfigFactory import sqlite3 +from pyhocon import ConfigFactory + from databuilder.extractor.bigquery_usage_extractor import BigQueryTableUsageExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader @@ -21,7 +22,7 @@ logging.basicConfig(level=logging.INFO) # set env NEO4J_HOST to override localhost -NEO4J_ENDPOINT = 'bolt://{}:7687'.format(os.getenv('NEO4J_HOST', 'localhost')) +NEO4J_ENDPOINT = f'bolt://{os.getenv("NEO4J_HOST", "localhost")}:7687' neo4j_endpoint = NEO4J_ENDPOINT neo4j_user = 'neo4j' @@ -39,9 +40,9 @@ def create_connection(db_file): # todo: Add a second model def create_bq_job(metadata_type, 
gcloud_project): - tmp_folder = '/var/tmp/amundsen/{metadata_type}'.format(metadata_type=metadata_type) - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + tmp_folder = f'/var/tmp/amundsen/{metadata_type}' + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' bq_usage_extractor = BigQueryTableUsageExtractor() csv_loader = FsNeo4jCSVLoader() @@ -51,26 +52,16 @@ def create_bq_job(metadata_type, gcloud_project): transformer=BigqueryUsageTransformer()) job_config = ConfigFactory.from_dict({ - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY): - gcloud_project, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): - True, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): - 'unique_tag', # should use unique tag here like {ds} + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': gcloud_project, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag', # should use unique tag here like {ds} }) job = DefaultJob(conf=job_config, task=task, diff --git a/databuilder/example/scripts/sample_data_loader.py b/databuilder/example/scripts/sample_data_loader.py index 5ab4456d37..87197dbaeb 100644 --- a/databuilder/example/scripts/sample_data_loader.py +++ b/databuilder/example/scripts/sample_data_loader.py @@ -29,21 +29,25 @@ from pyhocon import ConfigFactory from sqlalchemy.ext.declarative import declarative_base -from databuilder.extractor.csv_extractor import CsvTableBadgeExtractor, CsvTableColumnExtractor, CsvExtractor +from databuilder.extractor.csv_extractor import ( + CsvExtractor, CsvTableBadgeExtractor, CsvTableColumnExtractor, +) from databuilder.extractor.neo4j_es_last_updated_extractor import Neo4jEsLastUpdatedExtractor from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader -from 
databuilder.publisher.elasticsearch_constants import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING, \ - USER_ELASTICSEARCH_INDEX_MAPPING +from databuilder.publisher.elasticsearch_constants import ( + DASHBOARD_ELASTICSEARCH_INDEX_MAPPING, USER_ELASTICSEARCH_INDEX_MAPPING, +) from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask -from databuilder.transformer.base_transformer import ChainedTransformer -from databuilder.transformer.base_transformer import NoopTransformer -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS -from databuilder.transformer.generic_transformer import GenericTransformer, CALLBACK_FUNCTION, FIELD_NAME +from databuilder.transformer.base_transformer import ChainedTransformer, NoopTransformer +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel +from databuilder.transformer.generic_transformer import ( + CALLBACK_FUNCTION, FIELD_NAME, GenericTransformer, +) es_host = os.getenv('CREDENTIALS_ELASTICSEARCH_PROXY_HOST', 'localhost') neo_host = os.getenv('CREDENTIALS_NEO4J_PROXY_HOST', 'localhost') @@ -61,7 +65,7 @@ Base = declarative_base() -NEO4J_ENDPOINT = 'bolt://{}:{}'.format(neo_host, neo_port) +NEO4J_ENDPOINT = f'bolt://{neo_host}:{neo_port}' neo4j_endpoint = NEO4J_ENDPOINT @@ -72,9 +76,9 @@ def run_csv_job(file_loc, job_name, model): - tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name) - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + tmp_folder = f'/var/tmp/amundsen/{job_name}' + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' csv_extractor = CsvExtractor() csv_loader = FsNeo4jCSVLoader() @@ -105,8 +109,8 @@ def run_csv_job(file_loc, job_name, model): def run_table_badge_job(table_path, badge_path): tmp_folder = '/var/tmp/amundsen/table_badge' - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' extractor = CsvTableBadgeExtractor() csv_loader = FsNeo4jCSVLoader() task = DefaultTask(extractor=extractor, @@ -134,8 +138,8 @@ def run_table_badge_job(table_path, badge_path): def run_table_column_job(table_path, column_path): tmp_folder = '/var/tmp/amundsen/table_column' - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' extractor = CsvTableColumnExtractor() csv_loader = FsNeo4jCSVLoader() task = DefaultTask(extractor, @@ -164,8 +168,8 @@ def run_table_column_job(table_path, column_path): def create_last_updated_job(): # loader saves data to these folders and publisher reads it from here tmp_folder = '/var/tmp/amundsen/last_updated_data' - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' task = DefaultTask(extractor=Neo4jEsLastUpdatedExtractor(), loader=FsNeo4jCSVLoader()) @@ 
-197,8 +201,8 @@ def _str_to_list(str_val): def create_dashboard_tables_job(): # loader saves data to these folders and publisher reads it from here tmp_folder = '/var/tmp/amundsen/dashboard_table' - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' csv_extractor = CsvExtractor() csv_loader = FsNeo4jCSVLoader() @@ -214,21 +218,21 @@ def create_dashboard_tables_job(): publisher = Neo4jCsvPublisher() job_config = ConfigFactory.from_dict({ - '{}.file_location'.format(csv_extractor.get_scope()): 'example/sample_data/sample_dashboard_table.csv', - '{}.{}.{}'.format(transformer.get_scope(), generic_transformer.get_scope(), FIELD_NAME): 'table_ids', - '{}.{}.{}'.format(transformer.get_scope(), generic_transformer.get_scope(), CALLBACK_FUNCTION): _str_to_list, - '{}.{}.{}'.format(transformer.get_scope(), dict_to_model_transformer.get_scope(), MODEL_CLASS): + f'{csv_extractor.get_scope()}.file_location': 'example/sample_data/sample_dashboard_table.csv', + f'{transformer.get_scope()}.{generic_transformer.get_scope()}.{FIELD_NAME}': 'table_ids', + f'{transformer.get_scope()}.{generic_transformer.get_scope()}.{CALLBACK_FUNCTION}': _str_to_list, + f'{transformer.get_scope()}.{dict_to_model_transformer.get_scope()}.{MODEL_CLASS}': 'databuilder.models.dashboard.dashboard_table.DashboardTable', - '{}.node_dir_path'.format(csv_loader.get_scope()): node_files_folder, - '{}.relationship_dir_path'.format(csv_loader.get_scope()): relationship_files_folder, - '{}.delete_created_directories'.format(csv_loader.get_scope()): True, - '{}.node_files_directory'.format(publisher.get_scope()): node_files_folder, - '{}.relation_files_directory'.format(publisher.get_scope()): relationship_files_folder, - '{}.neo4j_endpoint'.format(publisher.get_scope()): neo4j_endpoint, - '{}.neo4j_user'.format(publisher.get_scope()): neo4j_user, - '{}.neo4j_password'.format(publisher.get_scope()): neo4j_password, - '{}.neo4j_encrypted'.format(publisher.get_scope()): False, - '{}.job_publish_tag'.format(publisher.get_scope()): 'unique_tag', # should use unique tag here like {ds} + f'{csv_loader.get_scope()}.node_dir_path': node_files_folder, + f'{csv_loader.get_scope()}.relationship_dir_path': relationship_files_folder, + f'{csv_loader.get_scope()}.delete_created_directories': True, + f'{publisher.get_scope()}.node_files_directory': node_files_folder, + f'{publisher.get_scope()}.relation_files_directory': relationship_files_folder, + f'{publisher.get_scope()}.neo4j_endpoint': neo4j_endpoint, + f'{publisher.get_scope()}.neo4j_user': neo4j_user, + f'{publisher.get_scope()}.neo4j_password': neo4j_password, + f'{publisher.get_scope()}.neo4j_encrypted': False, + f'{publisher.get_scope()}.job_publish_tag': 'unique_tag', # should use unique tag here like {ds} }) return DefaultJob(conf=job_config, @@ -262,7 +266,7 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index # elastic search client instance elasticsearch_client = es # unique name of new index in Elasticsearch - elasticsearch_new_index_key = '{}_'.format(elasticsearch_doc_type_key) + str(uuid.uuid4()) + elasticsearch_new_index_key = f'{elasticsearch_doc_type_key}_{uuid.uuid4()}' job_config = ConfigFactory.from_dict({ 'extractor.search_data.entity_type': entity_type, @@ -283,7 +287,7 @@ def 
create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index # only optionally add these keys, so need to dynamically `put` them if elasticsearch_mapping: - job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY), + job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}', elasticsearch_mapping) job = DefaultJob(conf=job_config, diff --git a/databuilder/example/scripts/sample_db2_data_loader.py b/databuilder/example/scripts/sample_db2_data_loader.py index 1d79955d33..217eaad283 100644 --- a/databuilder/example/scripts/sample_db2_data_loader.py +++ b/databuilder/example/scripts/sample_db2_data_loader.py @@ -7,25 +7,24 @@ import logging import os -from pyhocon import ConfigFactory - import sys import uuid -from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor +from elasticsearch import Elasticsearch +from pyhocon import ConfigFactory + from databuilder.extractor.db2_metadata_extractor import Db2MetadataExtractor +from databuilder.extractor.neo4j_extractor import Neo4jExtractor +from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor +from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.job.job import DefaultJob +from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader from databuilder.publisher import neo4j_csv_publisher +from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask - -from elasticsearch import Elasticsearch -from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader -from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.transformer.base_transformer import NoopTransformer -from databuilder.extractor.neo4j_extractor import Neo4jExtractor -from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.INFO) @@ -35,7 +34,7 @@ DB2_CONN_STRING = 'db2+ibm_db://username:password@database.host.name:50000/DB;' # set env NEO4J_HOST to override localhost -NEO4J_ENDPOINT = 'bolt://{}:7687'.format(os.getenv('NEO4J_HOST', 'localhost')) +NEO4J_ENDPOINT = f'bolt://{os.getenv("NEO4J_HOST", "localhost")}:7687' neo4j_endpoint = NEO4J_ENDPOINT neo4j_user = 'neo4j' @@ -56,12 +55,11 @@ def create_sample_db2_job(): + where_clause = f"WHERE c.TABSCHEMA not in ({','.join(IGNORED_SCHEMAS)}) ;" - where_clause = "WHERE c.TABSCHEMA not in ({0}) ;".format(','.join(IGNORED_SCHEMAS)) - - tmp_folder = '/var/tmp/amundsen/{}'.format('tables') - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + tmp_folder = '/var/tmp/amundsen/tables' + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' sql_extractor = Db2MetadataExtractor() csv_loader = FsNeo4jCSVLoader() @@ -70,19 +68,19 @@ def create_sample_db2_job(): loader=csv_loader) job_config = ConfigFactory.from_dict({ - 'extractor.db2_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): DB2_CONN_STRING, - 'extractor.db2_metadata.{}'.format(Db2MetadataExtractor.DATABASE_KEY): 
'DEMODB', - 'extractor.db2_metadata.{}'.format(Db2MetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag' + f'extractor.db2_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': DB2_CONN_STRING, + f'extractor.db2_metadata.{Db2MetadataExtractor.DATABASE_KEY}': 'DEMODB', + f'extractor.db2_metadata.{Db2MetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag' }) job = DefaultJob(conf=job_config, task=task, @@ -119,32 +117,29 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index elasticsearch_new_index_key = 'tables' + str(uuid.uuid4()) job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): model_name, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user, + 
f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias, }) # only optionally add these keys, so need to dynamically `put` them if cypher_query: - job_config.put('extractor.search_data.{}'.format(Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY), - cypher_query) + job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}', cypher_query) if elasticsearch_mapping: - job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY), + job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}', elasticsearch_mapping) job = DefaultJob(conf=job_config, diff --git a/databuilder/example/scripts/sample_deltalake_metadata.py b/databuilder/example/scripts/sample_deltalake_metadata.py index d67e5bd22c..e1f96d1d18 100644 --- a/databuilder/example/scripts/sample_deltalake_metadata.py +++ b/databuilder/example/scripts/sample_deltalake_metadata.py @@ -5,6 +5,9 @@ This is a example script for extracting Delta Lake Metadata Results """ +from pyhocon import ConfigFactory +from pyspark.sql import SparkSession + from databuilder.extractor.delta_lake_metadata_extractor import DeltaLakeMetadataExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader @@ -12,8 +15,6 @@ from databuilder.publisher import neo4j_csv_publisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask -from pyhocon import ConfigFactory -from pyspark.sql import SparkSession # NEO4J cluster endpoints NEO4J_ENDPOINT = 'bolt://localhost:7687/' @@ -32,33 +33,23 @@ def create_delta_lake_job_config(): tmp_folder = '/var/tmp/amundsen/table_metadata' - node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes/' + relationship_files_folder = f'{tmp_folder}/relationships/' job_config = ConfigFactory.from_dict({ - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.CLUSTER_KEY): cluster_key, - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.DATABASE_KEY): database, - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.SCHEMA_LIST_KEY): 
schema_list, - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.EXCLUDE_LIST_SCHEMAS_KEY): - exclude_list, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_CREATE_ONLY_NODES): - [DESCRIPTION_NODE_LABEL], - 'publisher.neo4j.job_publish_tag': - 'some_unique_tag' # TO-DO unique tag must be added + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.CLUSTER_KEY}': cluster_key, + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.DATABASE_KEY}': database, + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.SCHEMA_LIST_KEY}': schema_list, + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.EXCLUDE_LIST_SCHEMAS_KEY}': exclude_list, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_CREATE_ONLY_NODES}': [DESCRIPTION_NODE_LABEL], + f'publisher.neo4j.job_publish_tag': 'some_unique_tag' # TO-DO unique tag must be added }) return job_config diff --git a/databuilder/example/scripts/sample_dremio_data_loader.py b/databuilder/example/scripts/sample_dremio_data_loader.py index 5bf434123e..4a4d456bfb 100644 --- a/databuilder/example/scripts/sample_dremio_data_loader.py +++ b/databuilder/example/scripts/sample_dremio_data_loader.py @@ -7,21 +7,22 @@ import logging import os -from pyhocon import ConfigFactory -import uuid import sys +import uuid + +from elasticsearch.client import Elasticsearch +from pyhocon import ConfigFactory from databuilder.extractor.dremio_metadata_extractor import DremioMetadataExtractor +from databuilder.extractor.neo4j_extractor import Neo4jExtractor +from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.job.job import DefaultJob +from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader from databuilder.publisher import neo4j_csv_publisher +from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask -from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor -from databuilder.extractor.neo4j_extractor import Neo4jExtractor -from 
databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader -from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher -from elasticsearch.client import Elasticsearch from databuilder.transformer.base_transformer import NoopTransformer LOGGER = logging.getLogger(__name__) @@ -33,7 +34,7 @@ DREMIO_PASSWORD = 'test' # set env NEO4J_HOST to override localhost -NEO4J_ENDPOINT = 'bolt://{}:7687'.format(os.getenv('NEO4J_HOST', 'localhost')) +NEO4J_ENDPOINT = f'bolt://{os.getenv("NEO4J_HOST", "localhost")}:7687' NEO4J_USER = 'neo4j' NEO4J_PASSWORD = 'test' @@ -50,10 +51,9 @@ def create_sample_dremio_job(): - - tmp_folder = '/var/tmp/amundsen/{}'.format('tables') - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + tmp_folder = f'/var/tmp/amundsen/{"tables"}' + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' extractor = DremioMetadataExtractor() loader = FsNeo4jCSVLoader() @@ -62,20 +62,20 @@ def create_sample_dremio_job(): loader=loader) job_config = ConfigFactory.from_dict({ - 'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_USER_KEY): DREMIO_USER, - 'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_PASSWORD_KEY): DREMIO_PASSWORD, - 'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_HOST_KEY): DREMIO_HOST, - 'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY): True, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): NEO4J_ENDPOINT, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): NEO4J_USER, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): NEO4J_PASSWORD, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag' + f'extractor.dremio.{DremioMetadataExtractor.DREMIO_USER_KEY}': DREMIO_USER, + f'extractor.dremio.{DremioMetadataExtractor.DREMIO_PASSWORD_KEY}': DREMIO_PASSWORD, + f'extractor.dremio.{DremioMetadataExtractor.DREMIO_HOST_KEY}': DREMIO_HOST, + f'extractor.dremio.{DremioMetadataExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY}': True, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4J_USER, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4J_PASSWORD, + 
f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag' }) job = DefaultJob(conf=job_config, @@ -114,32 +114,30 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index elasticsearch_new_index_key = 'tables' + str(uuid.uuid4()) job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): NEO4J_ENDPOINT, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): model_name, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): NEO4J_USER, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): NEO4J_PASSWORD, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': NEO4J_ENDPOINT, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': NEO4J_USER, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': NEO4J_PASSWORD, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias, }) # only optionally add these keys, so need to dynamically `put` them if cypher_query: - job_config.put('extractor.search_data.{}'.format(Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY), + job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}', cypher_query) if elasticsearch_mapping: - job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY), + job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}', elasticsearch_mapping) job = DefaultJob(conf=job_config, diff --git a/databuilder/example/scripts/sample_feast_loader.py b/databuilder/example/scripts/sample_feast_loader.py index 66b7ad15fc..dd39b41b68 100644 --- 
a/databuilder/example/scripts/sample_feast_loader.py +++ b/databuilder/example/scripts/sample_feast_loader.py @@ -13,17 +13,19 @@ import sys import uuid + +from elasticsearch.client import Elasticsearch +from pyhocon import ConfigFactory + from databuilder.extractor.feast_extractor import FeastExtractor from databuilder.extractor.neo4j_extractor import Neo4jExtractor from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.job.job import DefaultJob -from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader +from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader from databuilder.publisher import neo4j_csv_publisher from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.task.task import DefaultTask -from pyhocon import ConfigFactory -from elasticsearch.client import Elasticsearch feast_endpoint = sys.argv[1] neo4j_endpoint = sys.argv[2] @@ -36,47 +38,31 @@ def create_feast_job_config(): tmp_folder = "/var/tmp/amundsen/table_metadata" - node_files_folder = "{tmp_folder}/nodes/".format(tmp_folder=tmp_folder) - relationship_files_folder = "{tmp_folder}/relationships/".format( - tmp_folder=tmp_folder - ) + node_files_folder = f"{tmp_folder}/nodes/" + relationship_files_folder = f"{tmp_folder}/relationships/" job_config = ConfigFactory.from_dict( { - "extractor.feast.{}".format( - FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY - ): feast_endpoint, - "loader.filesystem_csv_neo4j.{}".format( - FsNeo4jCSVLoader.NODE_DIR_PATH - ): node_files_folder, - "loader.filesystem_csv_neo4j.{}".format( - FsNeo4jCSVLoader.RELATION_DIR_PATH - ): relationship_files_folder, - "publisher.neo4j.{}".format( - neo4j_csv_publisher.NODE_FILES_DIR - ): node_files_folder, - "publisher.neo4j.{}".format( - neo4j_csv_publisher.RELATION_FILES_DIR - ): relationship_files_folder, - "publisher.neo4j.{}".format( - neo4j_csv_publisher.NEO4J_END_POINT_KEY - ): neo4j_endpoint, - "publisher.neo4j.{}".format(neo4j_csv_publisher.NEO4J_USER): neo4j_user, - "publisher.neo4j.{}".format( - neo4j_csv_publisher.NEO4J_PASSWORD - ): neo4j_password, - "publisher.neo4j.job_publish_tag": "some_unique_tag", # TO-DO unique tag must be added + f"extractor.feast.{FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY}": feast_endpoint, + f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}": node_files_folder, + f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}": relationship_files_folder, + f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}": node_files_folder, + f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}": relationship_files_folder, + f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}": neo4j_endpoint, + f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}": neo4j_user, + f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}": neo4j_password, + f"publisher.neo4j.job_publish_tag": "some_unique_tag", # TO-DO unique tag must be added } ) return job_config def create_es_publish_job_config( - elasticsearch_index_alias="table_search_index", - elasticsearch_doc_type_key="table", - model_name="databuilder.models.table_elasticsearch_document.TableESDocument", - cypher_query=None, - elasticsearch_mapping=None, + elasticsearch_index_alias="table_search_index", + elasticsearch_doc_type_key="table", + model_name="databuilder.models.table_elasticsearch_document.TableESDocument", + cypher_query=None, + 
elasticsearch_mapping=None, ): """ :param elasticsearch_index_alias: alias for Elasticsearch used in @@ -99,58 +85,35 @@ def create_es_publish_job_config( job_config = ConfigFactory.from_dict( { - "extractor.search_data.extractor.neo4j.{}".format( - Neo4jExtractor.GRAPH_URL_CONFIG_KEY - ): neo4j_endpoint, - "extractor.search_data.extractor.neo4j.{}".format( - Neo4jExtractor.MODEL_CLASS_CONFIG_KEY - ): model_name, - "extractor.search_data.extractor.neo4j.{}".format( - Neo4jExtractor.NEO4J_AUTH_USER - ): neo4j_user, - "extractor.search_data.extractor.neo4j.{}".format( - Neo4jExtractor.NEO4J_AUTH_PW - ): neo4j_password, - "loader.filesystem.elasticsearch.{}".format( - FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY - ): extracted_search_data_path, - "loader.filesystem.elasticsearch.{}".format( - FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY - ): "w", - "publisher.elasticsearch.{}".format( - ElasticsearchPublisher.FILE_PATH_CONFIG_KEY - ): extracted_search_data_path, - "publisher.elasticsearch.{}".format( - ElasticsearchPublisher.FILE_MODE_CONFIG_KEY - ): "r", - "publisher.elasticsearch.{}".format( - ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY - ): elasticsearch_client, - "publisher.elasticsearch.{}".format( - ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY - ): elasticsearch_new_index_key, - "publisher.elasticsearch.{}".format( - ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY - ): elasticsearch_doc_type_key, - "publisher.elasticsearch.{}".format( - ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY - ): elasticsearch_index_alias, + f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}": neo4j_endpoint, + f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}": model_name, + f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}": neo4j_user, + f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}": neo4j_password, + f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}": + extracted_search_data_path, + f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}": "w", + f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}": extracted_search_data_path, + f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}": "r", + f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}": + elasticsearch_client, + f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}": + elasticsearch_new_index_key, + f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}": + elasticsearch_doc_type_key, + f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}": + elasticsearch_index_alias, } ) # only optionally add these keys, so need to dynamically `put` them if cypher_query: job_config.put( - "extractor.search_data.{}".format( - Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY - ), + f"extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}", cypher_query, ) if elasticsearch_mapping: job_config.put( - "publisher.elasticsearch.{}".format( - ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY - ), + f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}", elasticsearch_mapping, ) diff --git a/databuilder/example/scripts/sample_glue_loader.py b/databuilder/example/scripts/sample_glue_loader.py index 4915f3b043..0de510a369 100644 --- 
a/databuilder/example/scripts/sample_glue_loader.py +++ b/databuilder/example/scripts/sample_glue_loader.py @@ -1,9 +1,9 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 +import uuid from datetime import datetime from pathlib import Path -import uuid from elasticsearch import Elasticsearch from pyhocon import ConfigFactory @@ -20,7 +20,6 @@ from databuilder.task.task import DefaultTask from databuilder.transformer.base_transformer import NoopTransformer - # change the following values to your liking NEO4J_ENDPOINT = 'bolt://127.0.0.1:7687' NEO4j_USERNAME = 'neo4j' @@ -31,32 +30,21 @@ def create_glue_extractor_job(): - tmp_folder = '/var/tmp/amundsen/table_metadata' node_files_folder = Path(tmp_folder, 'nodes') relationship_files_folder = Path(tmp_folder, 'relationships') job_config = ConfigFactory.from_dict({ - 'extractor.glue.{}'.format(GlueExtractor.CLUSTER_KEY): - GLUE_CLUSTER_KEY, - 'extractor.glue.{}'.format(GlueExtractor.FILTER_KEY): - [], - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - NEO4J_ENDPOINT, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - NEO4j_USERNAME, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - NEO4j_PASSWORD, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): - str(int(datetime.utcnow().timestamp())) + f'extractor.glue.{GlueExtractor.CLUSTER_KEY}': GLUE_CLUSTER_KEY, + f'extractor.glue.{GlueExtractor.FILTER_KEY}': [], + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4j_USERNAME, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4j_PASSWORD, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': str(int(datetime.utcnow().timestamp())) }) return DefaultJob(conf=job_config, @@ -68,7 +56,6 @@ def create_glue_extractor_job(): def create_es_publisher_job(): - # loader saves data to this location and publisher reads it from here extracted_search_data_path = '/var/tmp/amundsen/search_data.json' @@ -85,29 +72,29 @@ def create_es_publisher_job(): elasticsearch_index_alias = 'table_search_index' job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': NEO4J_ENDPOINT, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': 'databuilder.models.table_elasticsearch_document.TableESDocument', - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': 
NEO4j_USERNAME, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': NEO4j_PASSWORD, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_new_index_key_type, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias }) @@ -117,7 +104,6 @@ def create_es_publisher_job(): if __name__ == "__main__": - glue_job = create_glue_extractor_job() glue_job.launch() diff --git a/databuilder/example/scripts/sample_mssql_metadata.py b/databuilder/example/scripts/sample_mssql_metadata.py index 064c1b420a..42a4fdddd6 100644 --- a/databuilder/example/scripts/sample_mssql_metadata.py +++ b/databuilder/example/scripts/sample_mssql_metadata.py @@ -9,16 +9,16 @@ """ import sys -import textwrap import uuid + from elasticsearch import Elasticsearch from pyhocon import ConfigFactory from sqlalchemy.ext.declarative import declarative_base from databuilder.extractor.mssql_metadata_extractor import MSSQLMetadataExtractor -from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.extractor.neo4j_extractor import Neo4jExtractor from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor +from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader @@ -36,13 +36,13 @@ neo_host = sys.argv[2] es = Elasticsearch([ - {'host': es_host if es_host else 'localhost'}, + {'host': es_host or 'localhost'}, ]) DB_FILE = '/tmp/test.db' Base = declarative_base() -NEO4J_ENDPOINT = 'bolt://{}:7687'.format(neo_host if neo_host else 'localhost') +NEO4J_ENDPOINT = f'bolt://{neo_host or "localhost"}:7687' neo4j_endpoint = NEO4J_ENDPOINT @@ -64,24 +64,18 @@ def connection_string(windows_auth=False): """ if windows_auth: - base_string = ( - "mssql+pyodbc://@{host}/{db}" + - "?driver=ODBC+Driver+17+for+SQL+Server" + 
-            "?trusted_connection=yes" +
-            "&autocommit=true"  # comment to disable autocommit.
-        )
+        base_string = "mssql+pyodbc://@{host}/{db}" \
+                      "?driver=ODBC+Driver+17+for+SQL+Server" \
+                      "?trusted_connection=yes&autocommit=true"  # comment to disable autocommit.
         params = {
             "host": "localhost",
             "db": "master"
         }
     else:
-        base_string = (
-            "mssql+pyodbc://{user}:{pword}@{host}/{db}" +
-            "?driver=ODBC+Driver+17+for+SQL+Server" +
-            "&autocommit=true"  # comment to disable autocommit.
-        )
+        base_string = "mssql+pyodbc://{user}:{pword}@{host}/{db}" \
+                      "?driver=ODBC+Driver+17+for+SQL+Server" \
+                      "&autocommit=true"  # comment to disable autocommit.
         params = {
             "user": "username",
             "pword": "password",
@@ -93,40 +87,26 @@ def connection_string(windows_auth=False):
 
 
 def run_mssql_job():
-    where_clause_suffix = textwrap.dedent("""
-        ('dbo')
-    """)
+    where_clause_suffix = "('dbo')"
 
     tmp_folder = '/var/tmp/amundsen/table_metadata'
-    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
-    relationship_files_folder = '{tmp_folder}/relationships/'.format(
-        tmp_folder=tmp_folder)
+    node_files_folder = f'{tmp_folder}/nodes/'
+    relationship_files_folder = f'{tmp_folder}/relationships/'
 
     job_config = ConfigFactory.from_dict({
         # MSSQL Loader
-        'extractor.mssql_metadata.{}'.format(
-            MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause_suffix,
-        'extractor.mssql_metadata.{}'.format(
-            MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): True,
-        'extractor.mssql_metadata.extractor.sqlalchemy.{}'.format(
-            SQLAlchemyExtractor.CONN_STRING): connection_string(),
+        f'extractor.mssql_metadata.{MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
+        f'extractor.mssql_metadata.{MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
+        f'extractor.mssql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
         # NEO4J Loader
-        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
-            node_files_folder,
-        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
-            relationship_files_folder,
-        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
-            node_files_folder,
-        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
-            relationship_files_folder,
-        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
-            neo4j_endpoint,
-        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
-            neo4j_user,
-        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
-            neo4j_password,
-        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
-            'unique_tag',  # should use unique tag here like {ds}
+        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
+        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
+        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
+        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
+        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
+        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
+        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
+        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
     })
 
     job = DefaultJob(
@@ -167,32 +147,30 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index
     elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
 
     job_config = 
ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): model_name, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias, }) # only optionally add these keys, so need to dynamically `put` them if cypher_query: - job_config.put('extractor.search_data.{}'.format(Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY), + job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}', cypher_query) if elasticsearch_mapping: - job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY), + job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}', elasticsearch_mapping) job = DefaultJob(conf=job_config, diff --git a/databuilder/example/scripts/sample_mysql_loader.py b/databuilder/example/scripts/sample_mysql_loader.py index 48aac0c29a..e1e979d758 100644 --- a/databuilder/example/scripts/sample_mysql_loader.py +++ b/databuilder/example/scripts/sample_mysql_loader.py @@ -10,14 +10,15 @@ import sys import textwrap import uuid + from elasticsearch import Elasticsearch from pyhocon import ConfigFactory from sqlalchemy.ext.declarative import declarative_base from 
databuilder.extractor.mysql_metadata_extractor import MysqlMetadataExtractor -from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.extractor.neo4j_extractor import Neo4jExtractor from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor +from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader @@ -35,14 +36,14 @@ neo_host = sys.argv[2] es = Elasticsearch([ - {'host': es_host if es_host else 'localhost'}, + {'host': es_host or 'localhost'}, ]) DB_FILE = '/tmp/test.db' SQLITE_CONN_STRING = 'sqlite:////tmp/test.db' Base = declarative_base() -NEO4J_ENDPOINT = 'bolt://{}:7687'.format(neo_host if neo_host else 'localhost') +NEO4J_ENDPOINT = f'bolt://{neo_host or "localhost"}:7687' neo4j_endpoint = NEO4J_ENDPOINT @@ -65,32 +66,21 @@ def run_mysql_job(): """) tmp_folder = '/var/tmp/amundsen/table_metadata' - node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes/' + relationship_files_folder = f'{tmp_folder}/relationships/' job_config = ConfigFactory.from_dict({ - 'extractor.mysql_metadata.{}'.format(MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): - where_clause_suffix, - 'extractor.mysql_metadata.{}'.format(MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): - True, - 'extractor.mysql_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - connection_string(), - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): - 'unique_tag', # should use unique tag here like {ds} + f'extractor.mysql_metadata.{MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix, + f'extractor.mysql_metadata.{MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True, + f'extractor.mysql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(), + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag', # should use unique tag here like {ds} }) job = DefaultJob(conf=job_config, 
task=DefaultTask(extractor=MysqlMetadataExtractor(), loader=FsNeo4jCSVLoader()), @@ -127,32 +117,30 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index elasticsearch_new_index_key = 'tables' + str(uuid.uuid4()) job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): model_name, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias, }) # only optionally add these keys, so need to dynamically `put` them if cypher_query: - job_config.put('extractor.search_data.{}'.format(Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY), + job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}', cypher_query) if elasticsearch_mapping: - job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY), + job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}', elasticsearch_mapping) job = DefaultJob(conf=job_config, diff --git a/databuilder/example/scripts/sample_postgres_loader.py b/databuilder/example/scripts/sample_postgres_loader.py index 8ba54791c3..9f566b2991 100644 --- a/databuilder/example/scripts/sample_postgres_loader.py +++ 
b/databuilder/example/scripts/sample_postgres_loader.py @@ -10,14 +10,15 @@ import sys import textwrap import uuid + from elasticsearch import Elasticsearch from pyhocon import ConfigFactory from sqlalchemy.ext.declarative import declarative_base -from databuilder.extractor.postgres_metadata_extractor import PostgresMetadataExtractor -from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.extractor.neo4j_extractor import Neo4jExtractor from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor +from databuilder.extractor.postgres_metadata_extractor import PostgresMetadataExtractor +from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader @@ -42,7 +43,7 @@ SQLITE_CONN_STRING = 'sqlite:////tmp/test.db' Base = declarative_base() -NEO4J_ENDPOINT = 'bolt://{}:7687'.format(neo_host if neo_host else 'localhost') +NEO4J_ENDPOINT = f'bolt://{neo_host or "localhost"}:7687' neo4j_endpoint = NEO4J_ENDPOINT @@ -65,34 +66,22 @@ def run_postgres_job(): """) tmp_folder = '/var/tmp/amundsen/table_metadata' - node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes/' + relationship_files_folder = f'{tmp_folder}/relationships/' job_config = ConfigFactory.from_dict({ - 'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): - where_clause_suffix, - 'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): - True, - 'extractor.postgres_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - connection_string(), - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): - node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): - relationship_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): - True, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): - node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): - relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): - neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): - neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): - neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): - 'unique_tag', # should use unique tag here like {ds} + f'extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix, + f'extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True, + f'extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(), + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': 
relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag', # should use unique tag here like {ds} }) job = DefaultJob(conf=job_config, task=DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()), @@ -129,32 +118,30 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index elasticsearch_new_index_key = 'tables' + str(uuid.uuid4()) job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): model_name, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias, }) # only optionally add these keys, so need to dynamically `put` them if cypher_query: - job_config.put('extractor.search_data.{}'.format(Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY), + job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}', cypher_query) if elasticsearch_mapping: - 
job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY), + job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}', elasticsearch_mapping) job = DefaultJob(conf=job_config, diff --git a/databuilder/example/scripts/sample_snowflake_data_loader.py b/databuilder/example/scripts/sample_snowflake_data_loader.py index ff3eb4552a..6b2f9b6570 100644 --- a/databuilder/example/scripts/sample_snowflake_data_loader.py +++ b/databuilder/example/scripts/sample_snowflake_data_loader.py @@ -7,22 +7,23 @@ import logging import os -from pyhocon import ConfigFactory -import uuid import sys +import uuid -from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor +from elasticsearch.client import Elasticsearch +from pyhocon import ConfigFactory + +from databuilder.extractor.neo4j_extractor import Neo4jExtractor +from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.extractor.snowflake_metadata_extractor import SnowflakeMetadataExtractor +from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor from databuilder.job.job import DefaultJob +from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader from databuilder.publisher import neo4j_csv_publisher +from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask -from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor -from databuilder.extractor.neo4j_extractor import Neo4jExtractor -from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader -from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher -from elasticsearch.client import Elasticsearch from databuilder.transformer.base_transformer import NoopTransformer LOGGER = logging.getLogger(__name__) @@ -33,7 +34,7 @@ SNOWFLAKE_DATABASE_KEY = 'YourSnowflakeDbName' # set env NEO4J_HOST to override localhost -NEO4J_ENDPOINT = 'bolt://{}:7687'.format(os.getenv('NEO4J_HOST', 'localhost')) +NEO4J_ENDPOINT = f'bolt://{os.getenv("NEO4J_HOST", "localhost")}:7687' neo4j_endpoint = NEO4J_ENDPOINT neo4j_user = 'neo4j' @@ -62,26 +63,19 @@ def connection_string(): account = 'YourSnowflakeAccountHere' # specify a warehouse to connect to. 
warehouse = 'yourwarehouse' - return 'snowflake://{user}:{password}@{account}/{database}?warehouse={warehouse}'.format( - user=user, - password=password, - account=account, - database=SNOWFLAKE_DATABASE_KEY, - warehouse=warehouse, - ) + return f'snowflake://{user}:{password}@{account}/{SNOWFLAKE_DATABASE_KEY}?warehouse={warehouse}' def create_sample_snowflake_job(): - - where_clause = "WHERE c.TABLE_SCHEMA not in ({0}) \ + where_clause = f"WHERE c.TABLE_SCHEMA not in ({','.join(IGNORED_SCHEMAS)}) \ AND c.TABLE_SCHEMA not like 'STAGE_%' \ AND c.TABLE_SCHEMA not like 'HIST_%' \ AND c.TABLE_SCHEMA not like 'SNAP_%' \ - AND lower(c.COLUMN_NAME) not like 'dw_%';".format(','.join(IGNORED_SCHEMAS)) + AND lower(c.COLUMN_NAME) not like 'dw_%';" - tmp_folder = '/var/tmp/amundsen/{}'.format('tables') - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + tmp_folder = '/var/tmp/amundsen/tables' + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' sql_extractor = SnowflakeMetadataExtractor() csv_loader = FsNeo4jCSVLoader() @@ -90,19 +84,19 @@ def create_sample_snowflake_job(): loader=csv_loader) job_config = ConfigFactory.from_dict({ - 'extractor.snowflake.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): connection_string(), - 'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY): SNOWFLAKE_DATABASE_KEY, - 'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True, - 'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password, - 'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag' + f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(), + f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': SNOWFLAKE_DATABASE_KEY, + f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True, + f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True, + f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, + f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, + 
f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag' }) job = DefaultJob(conf=job_config, task=task, @@ -139,32 +133,30 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index elasticsearch_new_index_key = 'tables' + str(uuid.uuid4()) job_config = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): model_name, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user, - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): - extracted_search_data_path, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user, + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w', + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path, + f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r', + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key, - 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): + f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias, }) # only optionally add these keys, so need to dynamically `put` them if cypher_query: - job_config.put('extractor.search_data.{}'.format(Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY), + job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}', cypher_query) if elasticsearch_mapping: - job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY), + job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}', elasticsearch_mapping) job = DefaultJob(conf=job_config, diff --git a/databuilder/example/scripts/sample_tableau_data_loader.py b/databuilder/example/scripts/sample_tableau_data_loader.py index 07a093eab3..f4916fd318 100644 --- 
a/databuilder/example/scripts/sample_tableau_data_loader.py +++ b/databuilder/example/scripts/sample_tableau_data_loader.py @@ -24,6 +24,15 @@ from pyhocon import ConfigFactory from sqlalchemy.ext.declarative import declarative_base +from databuilder.extractor.dashboard.tableau.tableau_dashboard_extractor import TableauDashboardExtractor +from databuilder.extractor.dashboard.tableau.tableau_dashboard_last_modified_extractor import ( + TableauDashboardLastModifiedExtractor, +) +from databuilder.extractor.dashboard.tableau.tableau_dashboard_query_extractor import TableauDashboardQueryExtractor +from databuilder.extractor.dashboard.tableau.tableau_dashboard_table_extractor import TableauDashboardTableExtractor +from databuilder.extractor.dashboard.tableau.tableau_external_table_extractor import ( + TableauDashboardExternalTableExtractor, +) from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader @@ -34,15 +43,6 @@ from databuilder.task.task import DefaultTask from databuilder.transformer.base_transformer import NoopTransformer -from databuilder.extractor.dashboard.tableau.tableau_dashboard_extractor import TableauDashboardExtractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_last_modified_extractor import \ - TableauDashboardLastModifiedExtractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_query_extractor import TableauDashboardQueryExtractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_table_extractor import TableauDashboardTableExtractor -from databuilder.extractor.dashboard.tableau.tableau_external_table_extractor import \ - TableauDashboardExternalTableExtractor - - es_host = os.getenv('CREDENTIALS_ELASTICSEARCH_PROXY_HOST', 'localhost') neo_host = os.getenv('CREDENTIALS_NEO4J_PROXY_HOST', 'localhost') @@ -59,7 +59,7 @@ Base = declarative_base() -NEO4J_ENDPOINT = 'bolt://{}:{}'.format(neo_host, neo_port) +NEO4J_ENDPOINT = f'bolt://{neo_host}:{neo_port}' neo4j_endpoint = NEO4J_ENDPOINT @@ -117,7 +117,7 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index # elastic search client instance elasticsearch_client = es # unique name of new index in Elasticsearch - elasticsearch_new_index_key = '{}_'.format(elasticsearch_doc_type_key) + str(uuid.uuid4()) + elasticsearch_new_index_key = f'{elasticsearch_doc_type_key}_{uuid.uuid4()}' job_config = ConfigFactory.from_dict({ 'extractor.search_data.entity_type': entity_type, @@ -138,7 +138,7 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index # only optionally add these keys, so need to dynamically `put` them if elasticsearch_mapping: - job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY), + job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}', elasticsearch_mapping) job = DefaultJob(conf=job_config, @@ -152,8 +152,8 @@ def run_tableau_metadata_job(): tmp_folder = '/var/tmp/amundsen/tableau_dashboard_metadata' - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' dict_config = common_tableau_config dict_config.update({ @@ -192,8 +192,8 @@ def 
run_tableau_last_modified_job(): tmp_folder = '/var/tmp/amundsen/tableau_dashboard_user' - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' dict_config = common_tableau_config dict_config.update({ @@ -231,23 +231,20 @@ def run_tableau_query_job(): tmp_folder = '/var/tmp/amundsen/tableau_dashboard_query' - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' dict_config = common_tableau_config dict_config.update({ 'extractor.tableau_dashboard_query.api_base_url': tableau_api_base_url, 'extractor.tableau_dashboard_query.api_version': tableau_api_version, 'extractor.tableau_dashboard_query.site_name': tableau_site_name, - 'extractor.tableau_dashboard_query.tableau_personal_access_token_name': - tableau_personal_access_token_name, - 'extractor.tableau_dashboard_query.tableau_personal_access_token_secret': - tableau_personal_access_token_secret, + 'extractor.tableau_dashboard_query.tableau_personal_access_token_name': tableau_personal_access_token_name, + 'extractor.tableau_dashboard_query.tableau_personal_access_token_secret': tableau_personal_access_token_secret, 'extractor.tableau_dashboard_query.excluded_projects': tableau_excluded_projects, 'extractor.tableau_dashboard_query.cluster': tableau_dashboard_cluster, 'extractor.tableau_dashboard_query.database': tableau_dashboard_database, - 'extractor.tableau_dashboard_query.transformer.timestamp_str_to_epoch.timestamp_format': - "%Y-%m-%dT%H:%M:%SZ", + 'extractor.tableau_dashboard_query.transformer.timestamp_str_to_epoch.timestamp_format': "%Y-%m-%dT%H:%M:%SZ", 'extractor.tableau_dashboard_query.verify_request': tableau_verify_request, 'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder, 'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder, @@ -270,18 +267,16 @@ def run_tableau_table_job(): tmp_folder = '/var/tmp/amundsen/tableau_dashboard_table' - node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder) - relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder) + node_files_folder = f'{tmp_folder}/nodes' + relationship_files_folder = f'{tmp_folder}/relationships' dict_config = common_tableau_config dict_config.update({ 'extractor.tableau_dashboard_table.api_base_url': tableau_api_base_url, 'extractor.tableau_dashboard_table.api_version': tableau_api_version, 'extractor.tableau_dashboard_table.site_name': tableau_site_name, - 'extractor.tableau_dashboard_table.tableau_personal_access_token_name': - tableau_personal_access_token_name, - 'extractor.tableau_dashboard_table.tableau_personal_access_token_secret': - tableau_personal_access_token_secret, + 'extractor.tableau_dashboard_table.tableau_personal_access_token_name': tableau_personal_access_token_name, + 'extractor.tableau_dashboard_table.tableau_personal_access_token_secret': tableau_personal_access_token_secret, 'extractor.tableau_dashboard_table.excluded_projects': tableau_excluded_projects, 'extractor.tableau_dashboard_table.cluster': tableau_dashboard_cluster, 'extractor.tableau_dashboard_table.database': tableau_dashboard_database, @@ -310,18 +305,16 @@ def 
run_tableau_external_table_job():
 
     tmp_folder = '/var/tmp/amundsen/tableau_dashboard_external_table'
-    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
-    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)
+    node_files_folder = f'{tmp_folder}/nodes'
+    relationship_files_folder = f'{tmp_folder}/relationships'
 
     dict_config = common_tableau_config
     dict_config.update({
         'extractor.tableau_external_table.api_base_url': tableau_api_base_url,
         'extractor.tableau_external_table.api_version': tableau_api_version,
         'extractor.tableau_external_table.site_name': tableau_site_name,
-        'extractor.tableau_external_table.tableau_personal_access_token_name':
-            tableau_personal_access_token_name,
-        'extractor.tableau_external_table.tableau_personal_access_token_secret':
-            tableau_personal_access_token_secret,
+        'extractor.tableau_external_table.tableau_personal_access_token_name': tableau_personal_access_token_name,
+        'extractor.tableau_external_table.tableau_personal_access_token_secret': tableau_personal_access_token_secret,
         'extractor.tableau_external_table.excluded_projects': tableau_excluded_projects,
         'extractor.tableau_external_table.cluster': tableau_dashboard_cluster,
         'extractor.tableau_external_table.database': tableau_dashboard_database,
diff --git a/databuilder/requirements.txt b/databuilder/requirements.txt
index 05836a95c4..22a175f23b 100644
--- a/databuilder/requirements.txt
+++ b/databuilder/requirements.txt
@@ -8,6 +8,11 @@ flake8==3.5.0
 # Upstream url: https://pypi.python.org/pypi/flake8-tidy-imports
 flake8-tidy-imports>=1.1.0,<2.0
 
+# A Python utility / library to sort imports.
+# License: MIT
+# Upstream url: https://github.com/PyCQA/isort
+isort[colors]~=5.4
+
 # A mature full-featured Python testing tool.
 # License: MIT
 # Upstream url: http://pytest.org/
diff --git a/databuilder/setup.cfg b/databuilder/setup.cfg
index 9430b1eb28..57ab209dae 100644
--- a/databuilder/setup.cfg
+++ b/databuilder/setup.cfg
@@ -1,6 +1,15 @@
 [flake8]
 format = pylint
-exclude = .svc,CVS,.bzr,.hg,.git,__pycache__,venv,build,databuilder/sql_parser/usage/presto/antlr_generated
+exclude =
+    CVS,
+    .svc,
+    .bzr,
+    .hg,
+    .git,
+    __pycache__,
+    venv,
+    build,
+    databuilder/sql_parser/usage/presto/antlr_generated
 max-complexity = 10
 max-line-length = 120
 ignore = NONE
@@ -9,7 +18,14 @@ ignore = NONE
 max-line-length = 120
 
 [tool:pytest]
-addopts = -rs --cov=databuilder --cov-fail-under=70 --cov-report=term-missing:skip-covered --cov-report=xml --cov-report=html -vvv
+addopts =
+    -rs
+    --cov=databuilder
+    --cov-fail-under=70
+    --cov-report=term-missing:skip-covered
+    --cov-report=xml
+    --cov-report=html
+    -vvv
 
 [coverage:run]
 branch = True
@@ -25,3 +41,13 @@ directory = build/coverage_html
 python_version = 3.6
 disallow_untyped_defs = True
 ignore_missing_imports = True
+
+[isort]
+profile = django
+line_length = 120
+force_grid_wrap = 3
+combine_star = true
+combine_as_imports = true
+remove_redundant_aliases = true
+color_output = true
+skip_glob = []
diff --git a/databuilder/setup.py b/databuilder/setup.py
index 295b839213..42ac4e9a46 100644
--- a/databuilder/setup.py
+++ b/databuilder/setup.py
@@ -1,8 +1,7 @@
 # Copyright Contributors to the Amundsen project. 
# SPDX-License-Identifier: Apache-2.0 -from setuptools import setup, find_packages - +from setuptools import find_packages, setup __version__ = '4.0.3' diff --git a/databuilder/tests/unit/callback/test_call_back.py b/databuilder/tests/unit/callback/test_call_back.py index d35c99e298..ffa2619f64 100644 --- a/databuilder/tests/unit/callback/test_call_back.py +++ b/databuilder/tests/unit/callback/test_call_back.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import unittest +from typing import List from mock import MagicMock -from typing import List from databuilder.callback.call_back import Callback, notify_callbacks diff --git a/databuilder/tests/unit/extractor/dashboard/mode_analytics/batch/test_mode_dashboard_charts_batch_extractor.py b/databuilder/tests/unit/extractor/dashboard/mode_analytics/batch/test_mode_dashboard_charts_batch_extractor.py index ac5522996b..5e11c663ff 100644 --- a/databuilder/tests/unit/extractor/dashboard/mode_analytics/batch/test_mode_dashboard_charts_batch_extractor.py +++ b/databuilder/tests/unit/extractor/dashboard/mode_analytics/batch/test_mode_dashboard_charts_batch_extractor.py @@ -2,12 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import unittest + from mock import patch from pyhocon import ConfigFactory from databuilder import Scoped -from databuilder.extractor.dashboard.mode_analytics.batch.\ - mode_dashboard_charts_batch_extractor import ModeDashboardChartsBatchExtractor +from databuilder.extractor.dashboard.mode_analytics.batch.mode_dashboard_charts_batch_extractor import ( + ModeDashboardChartsBatchExtractor, +) class TestModeDashboardChartsBatchExtractor(unittest.TestCase): diff --git a/databuilder/tests/unit/extractor/dashboard/redash/test_redash_dashboard_extractor.py b/databuilder/tests/unit/extractor/dashboard/redash/test_redash_dashboard_extractor.py index 968a46d44a..f99cf65d69 100644 --- a/databuilder/tests/unit/extractor/dashboard/redash/test_redash_dashboard_extractor.py +++ b/databuilder/tests/unit/extractor/dashboard/redash/test_redash_dashboard_extractor.py @@ -3,20 +3,22 @@ import logging import unittest +from typing import ( + Any, Dict, List, +) from mock import patch from pyhocon import ConfigFactory -from typing import Any, Dict, List from databuilder import Scoped -from databuilder.extractor.dashboard.redash.redash_dashboard_extractor import \ - RedashDashboardExtractor, TableRelationData +from databuilder.extractor.dashboard.redash.redash_dashboard_extractor import ( + RedashDashboardExtractor, TableRelationData, +) +from databuilder.models.dashboard.dashboard_chart import DashboardChart from databuilder.models.dashboard.dashboard_last_modified import DashboardLastModifiedTimestamp from databuilder.models.dashboard.dashboard_owner import DashboardOwner from databuilder.models.dashboard.dashboard_query import DashboardQuery from databuilder.models.dashboard.dashboard_table import DashboardTable -from databuilder.models.dashboard.dashboard_chart import DashboardChart - logging.basicConfig(level=logging.INFO) @@ -134,7 +136,7 @@ def mock_api_get(url: str, *args: Any, **kwargs: Any) -> MockApiResponse: expected_query = DashboardQuery( query_id='1234', query_name='Test Query', - url=u'{base}/queries/1234'.format(base=redash_base_url), + url=f'{redash_base_url}/queries/1234', query_text='SELECT id FROM users', **identity ) diff --git a/databuilder/tests/unit/extractor/dashboard/redash/test_redash_dashboard_utils.py b/databuilder/tests/unit/extractor/dashboard/redash/test_redash_dashboard_utils.py index 1d6bb3dbb9..4df500e577 
100644 --- a/databuilder/tests/unit/extractor/dashboard/redash/test_redash_dashboard_utils.py +++ b/databuilder/tests/unit/extractor/dashboard/redash/test_redash_dashboard_utils.py @@ -4,14 +4,17 @@ import logging import random import unittest +from typing import ( + Any, Dict, List, +) from mock import patch -from typing import Any, Dict, List +from databuilder.extractor.dashboard.redash.redash_dashboard_utils import ( + RedashPaginatedRestApiQuery, generate_dashboard_description, get_auth_headers, get_text_widgets, + get_visualization_widgets, sort_widgets, +) from databuilder.rest_api.base_rest_api_query import EmptyRestApiQuerySeed -from databuilder.extractor.dashboard.redash.redash_dashboard_utils import \ - get_text_widgets, get_visualization_widgets, sort_widgets, \ - generate_dashboard_description, get_auth_headers, RedashPaginatedRestApiQuery logging.basicConfig(level=logging.INFO) diff --git a/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_extractor.py b/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_extractor.py index 4ba14fd15d..27eeb9d176 100644 --- a/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_extractor.py +++ b/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_extractor.py @@ -10,9 +10,9 @@ from databuilder import Scoped from databuilder.extractor.dashboard.tableau.tableau_dashboard_extractor import TableauDashboardExtractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ - import TableauDashboardAuth, TableauGraphQLApiExtractor - +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardAuth, TableauGraphQLApiExtractor, +) logging.basicConfig(level=logging.INFO) diff --git a/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_last_modified_extractor.py b/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_last_modified_extractor.py index 77e66a0016..d01dd4e3d7 100644 --- a/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_last_modified_extractor.py +++ b/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_last_modified_extractor.py @@ -9,11 +9,12 @@ from pyhocon import ConfigFactory from databuilder import Scoped -from databuilder.extractor.dashboard.tableau.tableau_dashboard_last_modified_extractor \ - import TableauDashboardLastModifiedExtractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ - import TableauDashboardAuth, TableauGraphQLApiExtractor - +from databuilder.extractor.dashboard.tableau.tableau_dashboard_last_modified_extractor import ( + TableauDashboardLastModifiedExtractor, +) +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardAuth, TableauGraphQLApiExtractor, +) logging.basicConfig(level=logging.INFO) diff --git a/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_query_extractor.py b/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_query_extractor.py index 18e282fa8c..34fb30ac83 100644 --- a/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_query_extractor.py +++ b/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_query_extractor.py @@ -10,9 +10,9 @@ from databuilder import Scoped from databuilder.extractor.dashboard.tableau.tableau_dashboard_query_extractor import TableauDashboardQueryExtractor -from 
databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ - import TableauDashboardAuth, TableauGraphQLApiExtractor - +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardAuth, TableauGraphQLApiExtractor, +) logging.basicConfig(level=logging.INFO) diff --git a/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_table_extractor.py b/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_table_extractor.py index 47a84b9e00..4a317f0a7e 100644 --- a/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_table_extractor.py +++ b/databuilder/tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_table_extractor.py @@ -10,9 +10,9 @@ from databuilder import Scoped from databuilder.extractor.dashboard.tableau.tableau_dashboard_table_extractor import TableauDashboardTableExtractor -from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ - import TableauDashboardAuth, TableauGraphQLApiExtractor - +from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils import ( + TableauDashboardAuth, TableauGraphQLApiExtractor, +) logging.basicConfig(level=logging.INFO) diff --git a/databuilder/tests/unit/extractor/restapi/test_rest_api_extractor.py b/databuilder/tests/unit/extractor/restapi/test_rest_api_extractor.py index a8376d949d..c73231e8d7 100644 --- a/databuilder/tests/unit/extractor/restapi/test_rest_api_extractor.py +++ b/databuilder/tests/unit/extractor/restapi/test_rest_api_extractor.py @@ -5,8 +5,9 @@ from pyhocon import ConfigFactory -from databuilder.extractor.restapi.rest_api_extractor import RestAPIExtractor, REST_API_QUERY, MODEL_CLASS, \ - STATIC_RECORD_DICT +from databuilder.extractor.restapi.rest_api_extractor import ( + MODEL_CLASS, REST_API_QUERY, STATIC_RECORD_DICT, RestAPIExtractor, +) from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed diff --git a/databuilder/tests/unit/extractor/test_athena_metadata_extractor.py b/databuilder/tests/unit/extractor/test_athena_metadata_extractor.py index d23d57f16b..653302c473 100644 --- a/databuilder/tests/unit/extractor/test_athena_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_athena_metadata_extractor.py @@ -3,14 +3,14 @@ import logging import unittest +from typing import Any, Dict -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory -from typing import Any, Dict from databuilder.extractor.athena_metadata_extractor import AthenaMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestAthenaMetadataExtractor(unittest.TestCase): @@ -18,11 +18,8 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', - 'extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.CATALOG_KEY): - 'MY_CATALOG' - + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', + f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}': 'MY_CATALOG' } self.conf = ConfigFactory.from_dict(config_dict) @@ -43,56 +40,64 @@ def test_extraction_with_single_result(self) -> None: mock_connection.return_value = connection sql_execute = 
MagicMock() connection.execute = sql_execute - table = {'schema': 'test_schema', - 'name': 'test_table', - 'description': '', - 'cluster': self.conf['extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.CATALOG_KEY)], - } + table = { + 'schema': 'test_schema', + 'name': 'test_table', + 'description': '', + 'cluster': self.conf[f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}'], + } sql_execute.return_value = [ - self._union( - {'col_name': 'col_id1', - 'col_type': 'bigint', - 'col_description': 'description of id1', - 'col_sort_order': 0, - 'extras': None}, table), - self._union( - {'col_name': 'col_id2', - 'col_type': 'bigint', - 'col_description': 'description of id2', - 'col_sort_order': 1, - 'extras': None}, table), - self._union( - {'col_name': 'is_active', - 'col_type': 'boolean', - 'col_description': None, - 'col_sort_order': 2, - 'extras': None}, table), - self._union( - {'col_name': 'source', - 'col_type': 'varchar', - 'col_description': 'description of source', - 'col_sort_order': 3, - 'extras': None}, table), - self._union( - {'col_name': 'etl_created_at', - 'col_type': 'timestamp', - 'col_description': None, - 'col_sort_order': 4, - 'extras': 'partition key'}, table), - self._union( - {'col_name': 'ds', - 'col_type': 'varchar', - 'col_description': None, - 'col_sort_order': 5, - 'extras': None}, table) + self._union({ + 'col_name': 'col_id1', + 'col_type': 'bigint', + 'col_description': 'description of id1', + 'col_sort_order': 0, + 'extras': None + }, table), + self._union({ + 'col_name': 'col_id2', + 'col_type': 'bigint', + 'col_description': 'description of id2', + 'col_sort_order': 1, + 'extras': None + }, table), + self._union({ + 'col_name': 'is_active', + 'col_type': 'boolean', + 'col_description': None, + 'col_sort_order': 2, + 'extras': None + }, table), + self._union({ + 'col_name': 'source', + 'col_type': 'varchar', + 'col_description': 'description of source', + 'col_sort_order': 3, + 'extras': None + }, table), + self._union({ + 'col_name': 'etl_created_at', + 'col_type': 'timestamp', + 'col_description': None, + 'col_sort_order': 4, + 'extras': 'partition key' + }, table), + self._union({ + 'col_name': 'ds', + 'col_type': 'varchar', + 'col_description': None, + 'col_sort_order': 5, + 'extras': None + }, table) ] extractor = AthenaMetadataExtractor() extractor.init(self.conf) actual = extractor.extract() - expected = TableMetadata('athena', self.conf['extractor.athena_metadata.{}'. 
- format(AthenaMetadataExtractor.CATALOG_KEY)], 'test_schema', + expected = TableMetadata('athena', + self.conf[f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}'], + 'test_schema', 'test_table', '', [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0), ColumnMetadata('col_id2', 'description of id2', 'bigint', 1), @@ -112,19 +117,19 @@ def test_extraction_with_multiple_result(self) -> None: table = {'schema': 'test_schema1', 'name': 'test_table1', 'description': '', - 'cluster': self.conf['extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.CATALOG_KEY)], + 'cluster': self.conf[f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}'], } table1 = {'schema': 'test_schema1', 'name': 'test_table2', 'description': '', - 'cluster': self.conf['extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.CATALOG_KEY)], + 'cluster': self.conf[f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}'], } table2 = {'schema': 'test_schema2', 'name': 'test_table3', 'description': '', - 'cluster': self.conf['extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.CATALOG_KEY)], + 'cluster': self.conf[f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}'], } sql_execute.return_value = [ @@ -194,8 +199,7 @@ def test_extraction_with_multiple_result(self) -> None: extractor.init(self.conf) expected = TableMetadata('athena', - self.conf['extractor.athena_metadata.{}'.format( - AthenaMetadataExtractor.CATALOG_KEY)], + self.conf[f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}'], 'test_schema1', 'test_table1', '', [ColumnMetadata('col_id1', 'description of col_id1', 'bigint', 0), ColumnMetadata('col_id2', 'description of col_id2', 'bigint', 1), @@ -206,16 +210,14 @@ def test_extraction_with_multiple_result(self) -> None: self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) expected = TableMetadata('athena', - self.conf['extractor.athena_metadata.{}'.format( - AthenaMetadataExtractor.CATALOG_KEY)], + self.conf[f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}'], 'test_schema1', 'test_table2', '', [ColumnMetadata('col_name', 'description of col_name', 'varchar', 0), ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)]) self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) expected = TableMetadata('athena', - self.conf['extractor.athena_metadata.{}'.format( - AthenaMetadataExtractor.CATALOG_KEY)], + self.conf[f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}'], 'test_schema2', 'test_table3', '', [ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0), ColumnMetadata('col_name3', 'description of col_name3', @@ -240,8 +242,7 @@ def setUp(self) -> None: """ config_dict = { AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_bigquery_metadata_extractor.py b/databuilder/tests/unit/extractor/test_bigquery_metadata_extractor.py index 810fe37b35..ae49039c24 100644 --- a/databuilder/tests/unit/extractor/test_bigquery_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_bigquery_metadata_extractor.py @@ -3,10 +3,10 @@ import logging import unittest +from typing import Any -from mock import patch, Mock +from mock import Mock, patch from pyhocon 
import ConfigFactory -from typing import Any from databuilder import Scoped from databuilder.extractor.bigquery_metadata_extractor import BigQueryMetadataExtractor @@ -14,52 +14,95 @@ logging.basicConfig(level=logging.INFO) - NO_DATASETS = {'kind': 'bigquery#datasetList', 'etag': '1B2M2Y8AsgTpgAmY7PhCfg=='} -ONE_DATASET = {'kind': 'bigquery#datasetList', 'etag': 'yScH5WIHeNUBF9b/VKybXA==', - 'datasets': [{'kind': 'bigquery#dataset', 'id': 'your-project-here:empty', 'datasetReference': - {'datasetId': 'empty', 'projectId': 'your-project-here'}, 'location': 'US'}]} # noqa +ONE_DATASET = { + 'kind': 'bigquery#datasetList', 'etag': 'yScH5WIHeNUBF9b/VKybXA==', + 'datasets': [{ + 'kind': 'bigquery#dataset', + 'id': 'your-project-here:empty', + 'datasetReference': { + 'datasetId': 'empty', + 'projectId': 'your-project-here' + }, + 'location': 'US' + }] +} # noqa NO_TABLES = {'kind': 'bigquery#tableList', 'etag': '1B2M2Y8AsgTpgAmY7PhCfg==', 'totalItems': 0} -ONE_TABLE = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', - 'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.nested_recs', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'nested_recs'}, - 'type': 'TABLE', 'creationTime': '1557578974009'}], - 'totalItems': 1} # noqa -ONE_VIEW = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', - 'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.abab', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'abab'}, - 'type': 'VIEW', 'view': {'useLegacySql': False}, 'creationTime': '1557577874991'}], - 'totalItems': 1} # noqa -TIME_PARTITIONED = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', - 'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'}, - 'type': 'TABLE', 'timePartitioning': {'type': 'DAY', 'requirePartitionFilter': False}, - 'creationTime': '1557577779306'}], 'totalItems': 1} # noqa -TABLE_DATE_RANGE = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', - 'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other_20190101', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190101'}, - 'type': 'TABLE', 'creationTime': '1557577779306'}, - {'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other_20190102', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190102'}, - 'type': 'TABLE', 'creationTime': '1557577779306'}], 'totalItems': 2} # noqa -TABLE_DATA = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.test', +ONE_TABLE = { + 'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', + 'tables': [{ + 'kind': 'bigquery#table', + 'id': 'your-project-here:fdgdfgh.nested_recs', + 'tableReference': { + 'projectId': 'your-project-here', + 'datasetId': 'fdgdfgh', + 'tableId': 'nested_recs' + }, + 'type': 'TABLE', + 'creationTime': '1557578974009' + }], + 'totalItems': 1 +} # noqa +ONE_VIEW = { + 'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', + 'tables': [{ + 'kind': 'bigquery#table', + 'id': 'your-project-here:fdgdfgh.abab', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'abab'}, + 'type': 'VIEW', + 'view': {'useLegacySql': False}, + 'creationTime': 
'1557577874991' + }], + 'totalItems': 1 +} # noqa +TIME_PARTITIONED = { + 'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', + 'tables': [{ + 'kind': 'bigquery#table', + 'id': 'your-project-here:fdgdfgh.other', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'}, + 'type': 'TABLE', + 'timePartitioning': {'type': 'DAY', 'requirePartitionFilter': False}, + 'creationTime': '1557577779306' + }], + 'totalItems': 1 +} # noqa +TABLE_DATE_RANGE = { + 'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', + 'tables': [{ + 'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other_20190101', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190101'}, + 'type': 'TABLE', + 'creationTime': '1557577779306' + }, { + 'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other_20190102', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190102'}, + 'type': 'TABLE', + 'creationTime': '1557577779306' + }], + 'totalItems': 2 +} # noqa +TABLE_DATA = { + 'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.test', 'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/test', 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'test'}, 'schema': { - 'fields': [ - {'name': 'test', 'type': 'STRING', 'description': 'some_description'}, - {'name': 'test2', 'type': 'INTEGER'}, - {'name': 'test3', 'type': 'FLOAT', 'description': 'another description'}, - {'name': 'test4', 'type': 'BOOLEAN'}, - {'name': 'test5', 'type': 'DATETIME'}]}, + 'fields': [{'name': 'test', 'type': 'STRING', 'description': 'some_description'}, + {'name': 'test2', 'type': 'INTEGER'}, + {'name': 'test3', 'type': 'FLOAT', 'description': 'another description'}, + {'name': 'test4', 'type': 'BOOLEAN'}, + {'name': 'test5', 'type': 'DATETIME'}] + }, 'numBytes': '0', 'numLongTermBytes': '0', 'numRows': '0', 'creationTime': '1557577756303', 'lastModifiedTime': '1557577756370', 'type': 'TABLE', - 'location': 'EU'} # noqa -NO_SCHEMA = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.no_schema', + 'location': 'EU' +} # noqa +NO_SCHEMA = { + 'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.no_schema', 'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/no_schema', 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'no_schema'}, 'numBytes': '0', @@ -68,8 +111,10 @@ 'creationTime': '1557577756303', 'lastModifiedTime': '1557577756370', 'type': 'TABLE', - 'location': 'EU'} # noqa -NO_COLS = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.no_columns', + 'location': 'EU' +} # noqa +NO_COLS = { + 'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.no_columns', 'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/no_columns', 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'no_columns'}, 'schema': {}, @@ -79,16 +124,20 @@ 'creationTime': '1557577756303', 'lastModifiedTime': '1557577756370', 'type': 'TABLE', - 'location': 'EU'} # noqa -VIEW_DATA = {'kind': 'bigquery#table', 'etag': 
'E6+jjbQ/HsegSNpTEgELUA==', 'id': 'gerard-cloud-2:fdgdfgh.abab', + 'location': 'EU' +} # noqa +VIEW_DATA = { + 'kind': 'bigquery#table', 'etag': 'E6+jjbQ/HsegSNpTEgELUA==', 'id': 'gerard-cloud-2:fdgdfgh.abab', 'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/gerard-cloud-2/datasets/fdgdfgh/tables/abab', 'tableReference': {'projectId': 'gerard-cloud-2', 'datasetId': 'fdgdfgh', 'tableId': 'abab'}, - 'schema': {'fields': [ - {'name': 'test', 'type': 'STRING'}, - {'name': 'test2', 'type': 'INTEGER'}, - {'name': 'test3', 'type': 'FLOAT'}, - {'name': 'test4', 'type': 'BOOLEAN'}, - {'name': 'test5', 'type': 'DATETIME'}]}, + 'schema': { + 'fields': [ + {'name': 'test', 'type': 'STRING'}, + {'name': 'test2', 'type': 'INTEGER'}, + {'name': 'test3', 'type': 'FLOAT'}, + {'name': 'test4', 'type': 'BOOLEAN'}, + {'name': 'test5', 'type': 'DATETIME'}] + }, 'numBytes': '0', 'numLongTermBytes': '0', 'numRows': '0', @@ -96,20 +145,24 @@ 'lastModifiedTime': '1557577874991', 'type': 'VIEW', 'view': {'query': 'SELECT * from `gerard-cloud-2.fdgdfgh.test`', 'useLegacySql': False}, - 'location': 'EU'} # noqa -NESTED_DATA = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.test', + 'location': 'EU' +} # noqa +NESTED_DATA = { + 'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.test', 'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/test', 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'test'}, 'schema': { - 'fields': [ - {'name': 'nested', 'type': 'RECORD', - 'fields': [ - {'name': 'nested2', 'type': 'RECORD', - 'fields': [ - {'name': 'ahah', 'type': 'STRING'}]}]}]}, + 'fields': [{ + 'name': 'nested', 'type': 'RECORD', + 'fields': [{ + 'name': 'nested2', 'type': 'RECORD', + 'fields': [{'name': 'ahah', 'type': 'STRING'}] + }] + }] + }, 'type': 'TABLE', - 'location': 'EU'} # noqa - + 'location': 'EU' +} # noqa try: FileNotFoundError @@ -147,8 +200,8 @@ def tables(self) -> Any: class TestBigQueryMetadataExtractor(unittest.TestCase): def setUp(self) -> None: config_dict = { - 'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY): - 'your-project-here'} + f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': 'your-project-here' + } self.conf = ConfigFactory.from_dict(config_dict) @patch('databuilder.extractor.base_bigquery_extractor.build') @@ -172,10 +225,8 @@ def test_empty_dataset(self, mock_build: Any) -> None: @patch('databuilder.extractor.base_bigquery_extractor.build') def test_accepts_dataset_filter_by_label(self, mock_build: Any) -> None: config_dict = { - 'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY): - 'your-project-here', - 'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.FILTER_KEY): - 'label.key:value' + f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': 'your-project-here', + f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.FILTER_KEY}': 'label.key:value' } conf = ConfigFactory.from_dict(config_dict) @@ -269,12 +320,9 @@ def test_table_with_nested_records(self, mock_build: Any) -> None: @patch('databuilder.extractor.base_bigquery_extractor.build') def test_keypath_and_pagesize_can_be_set(self, mock_build: Any) -> None: config_dict = { - 'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY): - 
'your-project-here', - 'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PAGE_SIZE_KEY): - 200, - 'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.KEY_PATH_KEY): - '/tmp/doesnotexist', + f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': 'your-project-here', + f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PAGE_SIZE_KEY}': 200, + f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.KEY_PATH_KEY}': '/tmp/doesnotexist', } conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_bigquery_usage_extractor.py b/databuilder/tests/unit/extractor/test_bigquery_usage_extractor.py index 249001514a..60e005e835 100644 --- a/databuilder/tests/unit/extractor/test_bigquery_usage_extractor.py +++ b/databuilder/tests/unit/extractor/test_bigquery_usage_extractor.py @@ -1,137 +1,136 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from mock import patch, Mock import base64 import tempfile import unittest +from typing import Any +from mock import Mock, patch from pyhocon import ConfigFactory -from typing import Any from databuilder import Scoped -from databuilder.extractor.bigquery_usage_extractor import BigQueryTableUsageExtractor -from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple - - -CORRECT_DATA = {"entries": [ -{ -"protoPayload": { -"@type": "type.googleapis.com/google.cloud.audit.AuditLog", -"status": {}, -"authenticationInfo": { - "principalEmail": "your-user-here@test.com" -}, -"serviceName": "bigquery.googleapis.com", -"methodName": "jobservice.jobcompleted", -"resourceName": "projects/your-project-here/jobs/bquxjob_758c08d1_16a96889839", -"serviceData": { - "@type": "type.googleapis.com/google.cloud.bigquery.logging.v1.AuditData", - "jobCompletedEvent": { - "eventName": "query_job_completed", - "job": { - "jobName": { - "projectId": "your-project-here", - "jobId": "bquxjob_758c08d1_16a96889839", - "location": "US" - }, - "jobConfiguration": { - "query": { - "query": "select descript from " - "`bigquery-public-data.austin_incidents.incidents_2008`\n", - "destinationTable": { - "projectId": "your-project-here", - "datasetId": "_07147a061ddfd6dcaf246cfc5e858a0ccefa7080", - "tableId": "anon1dd83635c62357091e55a5f76fb62d7deebcfa4c" - }, - "createDisposition": "CREATE_IF_NEEDED", - "writeDisposition": "WRITE_TRUNCATE", - "defaultDataset": {}, - "queryPriority": "QUERY_INTERACTIVE", - "statementType": "SELECT" - } - }, - "jobStatus": { - "state": "DONE", - "error": {} +from databuilder.extractor.bigquery_usage_extractor import BigQueryTableUsageExtractor, TableColumnUsageTuple + +CORRECT_DATA = { + "entries": [{ + "protoPayload": { + "@type": "type.googleapis.com/google.cloud.audit.AuditLog", + "status": {}, + "authenticationInfo": { + "principalEmail": "your-user-here@test.com" }, - "jobStatistics": { - "createTime": "2019-05-08T08:22:56.349Z", - "startTime": "2019-05-08T08:22:56.660Z", - "endTime": "2019-05-08T08:23:00.049Z", - "totalProcessedBytes": "3637807", - "totalBilledBytes": "10485760", - "billingTier": 1, - "totalSlotMs": "452", - "referencedTables": [ - { - "projectId": "bigquery-public-data", - "datasetId": "austin_incidents", - "tableId": "incidents_2008" + "serviceName": "bigquery.googleapis.com", + "methodName": "jobservice.jobcompleted", + "resourceName": "projects/your-project-here/jobs/bquxjob_758c08d1_16a96889839", + "serviceData": { + "@type": 
"type.googleapis.com/google.cloud.bigquery.logging.v1.AuditData", + "jobCompletedEvent": { + "eventName": "query_job_completed", + "job": { + "jobName": { + "projectId": "your-project-here", + "jobId": "bquxjob_758c08d1_16a96889839", + "location": "US" + }, + "jobConfiguration": { + "query": { + "query": "select descript from " + "`bigquery-public-data.austin_incidents.incidents_2008`\n", + "destinationTable": { + "projectId": "your-project-here", + "datasetId": "_07147a061ddfd6dcaf246cfc5e858a0ccefa7080", + "tableId": "anon1dd83635c62357091e55a5f76fb62d7deebcfa4c" + }, + "createDisposition": "CREATE_IF_NEEDED", + "writeDisposition": "WRITE_TRUNCATE", + "defaultDataset": {}, + "queryPriority": "QUERY_INTERACTIVE", + "statementType": "SELECT" + } + }, + "jobStatus": { + "state": "DONE", + "error": {} + }, + "jobStatistics": { + "createTime": "2019-05-08T08:22:56.349Z", + "startTime": "2019-05-08T08:22:56.660Z", + "endTime": "2019-05-08T08:23:00.049Z", + "totalProcessedBytes": "3637807", + "totalBilledBytes": "10485760", + "billingTier": 1, + "totalSlotMs": "452", + "referencedTables": [ + { + "projectId": "bigquery-public-data", + "datasetId": "austin_incidents", + "tableId": "incidents_2008" + } + ], + "totalTablesProcessed": 1, + "queryOutputRowCount": "179524" + } } - ], - "totalTablesProcessed": 1, - "queryOutputRowCount": "179524" + } } - } - } -} -}, -"insertId": "-jyqvjse6lwjz", -"resource": { -"type": "bigquery_resource", -"labels": { - "project_id": "your-project-here" -} -}, -"timestamp": "2019-05-08T08:23:00.061Z", -"severity": "INFO", -"logName": "projects/your-project-here/logs/cloudaudit.googleapis.com%2Fdata_access", -"receiveTimestamp": "2019-05-08T08:23:00.310709609Z" -} -]} # noqa - -FAILURE = {"entries": [ -{ - "protoPayload": { - "authenticationInfo": { - "principalEmail": "your-user-here@test.com" }, - "methodName": "jobservice.jobcompleted", - "serviceData": { - "jobCompletedEvent": { - "job": { - "jobStatus": { - "state": "DONE", - "error": { - "code": 11, - "message": "Some descriptive error message" - } - }, - "jobStatistics": { - "createTime": "2019-05-08T08:22:56.349Z", - "startTime": "2019-05-08T08:22:56.660Z", - "endTime": "2019-05-08T08:23:00.049Z", - "totalProcessedBytes": "3637807", - "totalBilledBytes": "10485760", - "referencedTables": [ - { - "projectId": "bigquery-public-data", - "datasetId": "austin_incidents", - "tableId": "incidents_2008" + "insertId": "-jyqvjse6lwjz", + "resource": { + "type": "bigquery_resource", + "labels": { + "project_id": "your-project-here" + } + }, + "timestamp": "2019-05-08T08:23:00.061Z", + "severity": "INFO", + "logName": "projects/your-project-here/logs/cloudaudit.googleapis.com%2Fdata_access", + "receiveTimestamp": "2019-05-08T08:23:00.310709609Z" + }] +} # noqa + +FAILURE = { + "entries": [{ + "protoPayload": { + "authenticationInfo": { + "principalEmail": "your-user-here@test.com" + }, + "methodName": "jobservice.jobcompleted", + "serviceData": { + "jobCompletedEvent": { + "job": { + "jobStatus": { + "state": "DONE", + "error": { + "code": 11, + "message": "Some descriptive error message" + } + }, + "jobStatistics": { + "createTime": "2019-05-08T08:22:56.349Z", + "startTime": "2019-05-08T08:22:56.660Z", + "endTime": "2019-05-08T08:23:00.049Z", + "totalProcessedBytes": "3637807", + "totalBilledBytes": "10485760", + "referencedTables": [ + { + "projectId": "bigquery-public-data", + "datasetId": "austin_incidents", + "tableId": "incidents_2008" + } + ] } - ] - } + } } - } + }, }, - }, -}]} # noqa + }] +} # noqa # An empty 
dict will be ignored, but putting in nextPageToken causes the test # to loop infinitely, so we need a bogus key/value to ensure that we will try # to read entries -NO_ENTRIES = {'key': 'value'} # noqa +NO_ENTRIES = {'key': 'value'} # noqa KEYFILE_DATA = """ ewogICJ0eXBlIjogInNlcnZpY2VfYWNjb3VudCIsCiAgInByb2plY3RfaWQiOiAieW91ci1wcm9q @@ -200,8 +199,7 @@ def test_basic_extraction(self, mock_build: Any) -> None: Test Extraction using mock class """ config_dict = { - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY): - 'your-project-here', + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': 'your-project-here', } conf = ConfigFactory.from_dict(config_dict) @@ -227,8 +225,7 @@ def test_basic_extraction(self, mock_build: Any) -> None: @patch('databuilder.extractor.base_bigquery_extractor.build') def test_no_entries(self, mock_build: Any) -> None: config_dict = { - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY): - 'your-project-here', + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': 'your-project-here', } conf = ConfigFactory.from_dict(config_dict) @@ -252,10 +249,8 @@ def test_key_path(self, mock_build: Any) -> None: keyfile.write(base64.b64decode(KEYFILE_DATA)) keyfile.flush() config_dict = { - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY): - 'your-project-here', - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.KEY_PATH_KEY): - keyfile.name, + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': 'your-project-here', + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.KEY_PATH_KEY}': keyfile.name, } conf = ConfigFactory.from_dict(config_dict) @@ -278,12 +273,9 @@ def test_timestamp_pagesize_settings(self, mock_build: Any) -> None: PAGESIZE = 215 config_dict = { - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY): - 'your-project-here', - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.TIMESTAMP_KEY): - TIMESTAMP, - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PAGE_SIZE_KEY): - PAGESIZE, + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': 'your-project-here', + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.TIMESTAMP_KEY}': TIMESTAMP, + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PAGE_SIZE_KEY}': PAGESIZE, } conf = ConfigFactory.from_dict(config_dict) @@ -301,10 +293,8 @@ def test_timestamp_pagesize_settings(self, mock_build: Any) -> None: @patch('databuilder.extractor.base_bigquery_extractor.build') def test_failed_jobs_should_not_be_counted(self, mock_build: Any) -> None: - config_dict = { - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY): - 'your-project-here', + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': 'your-project-here', } conf = ConfigFactory.from_dict(config_dict) @@ -320,10 +310,8 @@ def test_failed_jobs_should_not_be_counted(self, mock_build: Any) -> None: @patch('databuilder.extractor.base_bigquery_extractor.build') def test_email_filter_not_counted(self, mock_build: Any) -> None: config_dict = { - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY): - 'your-project-here', - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.EMAIL_PATTERN): - 'emailFilter', + 
f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': 'your-project-here', + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.EMAIL_PATTERN}': 'emailFilter', } conf = ConfigFactory.from_dict(config_dict) @@ -337,10 +325,8 @@ def test_email_filter_not_counted(self, mock_build: Any) -> None: @patch('databuilder.extractor.base_bigquery_extractor.build') def test_email_filter_counted(self, mock_build: Any) -> None: config_dict = { - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY): - 'your-project-here', - 'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.EMAIL_PATTERN): - '.*@test.com.*', + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': 'your-project-here', + f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.EMAIL_PATTERN}': '.*@test.com.*', } conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_bigquery_watermark_extractor.py b/databuilder/tests/unit/extractor/test_bigquery_watermark_extractor.py index b1310a575f..54fdc53528 100644 --- a/databuilder/tests/unit/extractor/test_bigquery_watermark_extractor.py +++ b/databuilder/tests/unit/extractor/test_bigquery_watermark_extractor.py @@ -4,54 +4,101 @@ import logging import unittest from datetime import datetime +from typing import Any -from mock import patch, Mock +from mock import Mock, patch from pyhocon import ConfigFactory -from typing import Any from databuilder import Scoped from databuilder.extractor.bigquery_watermark_extractor import BigQueryWatermarkExtractor logging.basicConfig(level=logging.INFO) - NO_DATASETS = {'kind': 'bigquery#datasetList', 'etag': '1B2M2Y8AsgTpgAmY7PhCfg=='} -ONE_DATASET = {'kind': 'bigquery#datasetList', 'etag': 'yScH5WIHeNUBF9b/VKybXA==', - 'datasets': [{'kind': 'bigquery#dataset', 'id': 'your-project-here:empty', 'datasetReference': - {'datasetId': 'empty', 'projectId': 'your-project-here'}, 'location': 'US'}]} # noqa +ONE_DATASET = { + 'kind': 'bigquery#datasetList', 'etag': 'yScH5WIHeNUBF9b/VKybXA==', + 'datasets': [{ + 'kind': 'bigquery#dataset', + 'id': 'your-project-here:empty', + 'datasetReference': {'datasetId': 'empty', 'projectId': 'your-project-here'}, + 'location': 'US' + }] +} # noqa NO_TABLES = {'kind': 'bigquery#tableList', 'etag': '1B2M2Y8AsgTpgAmY7PhCfg==', 'totalItems': 0} -ONE_TABLE = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', - 'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.nested_recs', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'nested_recs'}, - 'type': 'TABLE', 'creationTime': '1557578974009'}], - 'totalItems': 1} # noqa -TIME_PARTITIONED = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', - 'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'}, - 'type': 'TABLE', 'timePartitioning': {'type': 'DAY', 'requirePartitionFilter': False}, - 'creationTime': '1557577779306'}], 'totalItems': 1} # noqa -TIME_PARTITIONED_WITH_FIELD = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', - 'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'}, - 'type': 'TABLE', 'timePartitioning': {'type': 'DAY', 'field': 'processed_date', - 
'requirePartitionFilter': False}, 'creationTime': '1557577779306'}], 'totalItems': 1} # noqa -TABLE_DATE_RANGE = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', - 'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other_20190101', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190101'}, - 'type': 'TABLE', 'creationTime': '1557577779306'}, - {'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other_20190102', 'tableReference': - {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190102'}, - 'type': 'TABLE', 'creationTime': '1557577779306'}], 'totalItems': 2} # noqa -PARTITION_DATA = {'kind': 'bigquery#queryResponse', - 'schema': {'fields': [{'name': 'partition_id', 'type': 'STRING', 'mode': 'NULLABLE'}, - {'name': 'creation_time', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}]}, - 'jobReference': {'projectId': 'your-project-here', 'jobId': 'job_bfTRGj3Lv0tRjcrotXbZSgMCpNhY', 'location': 'EU'}, - 'totalRows': '3', - 'rows': [{'f': [{'v': '20180802'}, {'v': '1.547512241348E9'}]}, - {'f': [{'v': '20180803'}, {'v': '1.547512241348E9'}]}, - {'f': [{'v': '20180804'}, {'v': '1.547512241348E9'}]}], - 'totalBytesProcessed': '0', 'jobComplete': True, 'cacheHit': False} # noqa - +ONE_TABLE = { + 'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', + 'tables': [{ + 'kind': 'bigquery#table', + 'id': 'your-project-here:fdgdfgh.nested_recs', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'nested_recs'}, + 'type': 'TABLE', + 'creationTime': '1557578974009' + }], + 'totalItems': 1 +} # noqa +TIME_PARTITIONED = { + 'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', + 'tables': [{ + 'kind': 'bigquery#table', + 'id': 'your-project-here:fdgdfgh.other', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'}, + 'type': 'TABLE', + 'timePartitioning': {'type': 'DAY', 'requirePartitionFilter': False}, + 'creationTime': '1557577779306' + }], + 'totalItems': 1 +} # noqa +TIME_PARTITIONED_WITH_FIELD = { + 'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', + 'tables': [{ + 'kind': 'bigquery#table', + 'id': 'your-project-here:fdgdfgh.other', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'}, + 'type': 'TABLE', + 'timePartitioning': {'type': 'DAY', 'field': 'processed_date', 'requirePartitionFilter': False}, + 'creationTime': '1557577779306' + }], + 'totalItems': 1 +} # noqa +TABLE_DATE_RANGE = { + 'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==', + 'tables': [{ + 'kind': 'bigquery#table', + 'id': 'your-project-here:fdgdfgh.other_20190101', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190101'}, + 'type': 'TABLE', + 'creationTime': '1557577779306' + }, { + 'kind': 'bigquery#table', + 'id': 'your-project-here:fdgdfgh.other_20190102', + 'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190102'}, + 'type': 'TABLE', + 'creationTime': '1557577779306' + }], + 'totalItems': 2 +} # noqa +PARTITION_DATA = { + 'kind': 'bigquery#queryResponse', + 'schema': { + 'fields': [{ + 'name': 'partition_id', + 'type': 'STRING', + 'mode': 'NULLABLE' + }, { + 'name': 'creation_time', + 'type': 'TIMESTAMP', + 'mode': 'NULLABLE' + }] + }, + 'jobReference': {'projectId': 'your-project-here', 'jobId': 
'job_bfTRGj3Lv0tRjcrotXbZSgMCpNhY', 'location': 'EU'}, + 'totalRows': '3', + 'rows': [{'f': [{'v': '20180802'}, {'v': '1.547512241348E9'}]}, + {'f': [{'v': '20180803'}, {'v': '1.547512241348E9'}]}, + {'f': [{'v': '20180804'}, {'v': '1.547512241348E9'}]}], + 'totalBytesProcessed': '0', + 'jobComplete': True, + 'cacheHit': False +} # noqa try: FileNotFoundError @@ -93,7 +140,7 @@ def jobs(self) -> Any: class TestBigQueryWatermarkExtractor(unittest.TestCase): def setUp(self) -> None: config_dict = { - 'extractor.bigquery_watermarks.{}'.format(BigQueryWatermarkExtractor.PROJECT_ID_KEY): + f'extractor.bigquery_watermarks.{BigQueryWatermarkExtractor.PROJECT_ID_KEY}': 'your-project-here'} self.conf = ConfigFactory.from_dict(config_dict) @@ -177,10 +224,8 @@ def test_table_with_field_partitions(self, mock_build: Any) -> None: @patch('databuilder.extractor.base_bigquery_extractor.build') def test_keypath_can_be_set(self, mock_build: Any) -> None: config_dict = { - 'extractor.bigquery_watermarks.{}'.format(BigQueryWatermarkExtractor.PROJECT_ID_KEY): - 'your-project-here', - 'extractor.bigquery_watermarks.{}'.format(BigQueryWatermarkExtractor.KEY_PATH_KEY): - '/tmp/doesnotexist', + f'extractor.bigquery_watermarks.{BigQueryWatermarkExtractor.PROJECT_ID_KEY}': 'your-project-here', + f'extractor.bigquery_watermarks.{BigQueryWatermarkExtractor.KEY_PATH_KEY}': '/tmp/doesnotexist', } conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_cassandra_extractor.py b/databuilder/tests/unit/extractor/test_cassandra_extractor.py index 469a4bcd73..e938c375d7 100644 --- a/databuilder/tests/unit/extractor/test_cassandra_extractor.py +++ b/databuilder/tests/unit/extractor/test_cassandra_extractor.py @@ -4,14 +4,14 @@ import logging import unittest from collections import OrderedDict +from typing import Any +from cassandra.metadata import ColumnMetadata as CassandraColumnMetadata from mock import patch from pyhocon import ConfigFactory -from typing import Any -from cassandra.metadata import ColumnMetadata as CassandraColumnMetadata from databuilder.extractor.cassandra_extractor import CassandraExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata # patch whole class to avoid actually calling for boto3.client during tests diff --git a/databuilder/tests/unit/extractor/test_csv_extractor.py b/databuilder/tests/unit/extractor/test_csv_extractor.py index 2a6a382e03..8835765147 100644 --- a/databuilder/tests/unit/extractor/test_csv_extractor.py +++ b/databuilder/tests/unit/extractor/test_csv_extractor.py @@ -17,8 +17,8 @@ def test_extraction_with_model_class(self) -> None: Test Extraction using model class """ config_dict = { - 'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): 'example/sample_data/sample_table.csv', - 'extractor.csv.model_class': 'databuilder.models.table_metadata.TableMetadata', + f'extractor.csv.{CsvExtractor.FILE_LOCATION}': 'example/sample_data/sample_table.csv', + 'extractor.csv.model_class': 'databuilder.models.table_metadata.TableMetadata', } self.conf = ConfigFactory.from_dict(config_dict) extractor = CsvExtractor() diff --git a/databuilder/tests/unit/extractor/test_deltalake_extractor.py b/databuilder/tests/unit/extractor/test_deltalake_extractor.py index 6276e4d8d5..5e8e51dfc7 100644 --- a/databuilder/tests/unit/extractor/test_deltalake_extractor.py +++ b/databuilder/tests/unit/extractor/test_deltalake_extractor.py @@ -4,36 +4,38 @@ import
logging import tempfile import unittest +from typing import Dict -from databuilder import Scoped -from databuilder.extractor.delta_lake_metadata_extractor import DeltaLakeMetadataExtractor, \ - ScrapedTableMetadata, ScrapedColumnMetadata -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata from pyhocon import ConfigFactory # patch whole class to avoid actually calling for boto3.client during tests from pyspark.sql import SparkSession from pyspark.sql.catalog import Table -from typing import Dict + +from databuilder import Scoped +from databuilder.extractor.delta_lake_metadata_extractor import ( + DeltaLakeMetadataExtractor, ScrapedColumnMetadata, ScrapedTableMetadata, +) +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestDeltaLakeExtractor(unittest.TestCase): def setUp(self) -> None: logging.basicConfig(level=logging.INFO) - self.spark = SparkSession.builder\ - .appName("Amundsen Delta Lake Metadata Extraction")\ - .master("local")\ + self.spark = SparkSession.builder \ + .appName("Amundsen Delta Lake Metadata Extraction") \ + .master("local") \ .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0") \ .config("spark.sql.warehouse.dir", tempfile.TemporaryDirectory()) \ .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \ .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \ .getOrCreate() self.config_dict = { - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.CLUSTER_KEY): 'test_cluster', - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.SCHEMA_LIST_KEY): [], - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.EXCLUDE_LIST_SCHEMAS_KEY): [], - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.DATABASE_KEY): 'test_database', - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.DELTA_TABLES_ONLY): False + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.CLUSTER_KEY}': 'test_cluster', + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.SCHEMA_LIST_KEY}': [], + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.EXCLUDE_LIST_SCHEMAS_KEY}': [], + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.DATABASE_KEY}': 'test_database', + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.DELTA_TABLES_ONLY}': False } conf = ConfigFactory.from_dict(self.config_dict) self.dExtractor = DeltaLakeMetadataExtractor() @@ -172,11 +174,10 @@ def test_extract(self) -> None: def test_extract_with_only_specific_schemas(self) -> None: self.config_dict = { - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.CLUSTER_KEY): 'test_cluster', - 'extractor.delta_lake_table_metadata.{}' - .format(DeltaLakeMetadataExtractor.SCHEMA_LIST_KEY): ['test_schema2'], - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.EXCLUDE_LIST_SCHEMAS_KEY): [], - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.DATABASE_KEY): 'test_database' + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.CLUSTER_KEY}': 'test_cluster', + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.SCHEMA_LIST_KEY}': ['test_schema2'], + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.EXCLUDE_LIST_SCHEMAS_KEY}': [], + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.DATABASE_KEY}': 'test_database' } 
conf = ConfigFactory.from_dict(self.config_dict) self.dExtractor.init(Scoped.get_scoped_conf(conf=conf, @@ -190,12 +191,11 @@ def test_extract_with_only_specific_schemas(self) -> None: def test_extract_when_excluding(self) -> None: self.config_dict = { - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.CLUSTER_KEY): 'test_cluster', - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.SCHEMA_LIST_KEY): [], - 'extractor.delta_lake_table_metadata.{}' - .format(DeltaLakeMetadataExtractor.EXCLUDE_LIST_SCHEMAS_KEY): ['test_schema2'], - 'extractor.delta_lake_table_metadata.{}'.format(DeltaLakeMetadataExtractor.DATABASE_KEY): 'test_database' - + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.CLUSTER_KEY}': 'test_cluster', + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.SCHEMA_LIST_KEY}': [], + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.EXCLUDE_LIST_SCHEMAS_KEY}': + ['test_schema2'], + f'extractor.delta_lake_table_metadata.{DeltaLakeMetadataExtractor.DATABASE_KEY}': 'test_database' } conf = ConfigFactory.from_dict(self.config_dict) self.dExtractor.init(Scoped.get_scoped_conf(conf=conf, diff --git a/databuilder/tests/unit/extractor/test_dremio_metadata_extractor.py b/databuilder/tests/unit/extractor/test_dremio_metadata_extractor.py index 268cd9bf73..9b70db324c 100644 --- a/databuilder/tests/unit/extractor/test_dremio_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_dremio_metadata_extractor.py @@ -2,14 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Dict, List, Any import unittest +from typing import ( + Any, Dict, List, +) from unittest.mock import MagicMock, patch from pyhocon import ConfigFactory from databuilder.extractor.dremio_metadata_extractor import DremioMetadataExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestDremioMetadataExtractor(unittest.TestCase): diff --git a/databuilder/tests/unit/extractor/test_feast_extractor.py b/databuilder/tests/unit/extractor/test_feast_extractor.py index f6963130ee..9350109498 100644 --- a/databuilder/tests/unit/extractor/test_feast_extractor.py +++ b/databuilder/tests/unit/extractor/test_feast_extractor.py @@ -5,17 +5,15 @@ import re import unittest -from mock import call, MagicMock -from pyhocon import ConfigFactory -from feast.feature_table import FeatureTable from feast.entity import Entity +from feast.feature_table import FeatureTable +from mock import MagicMock, call +from pyhocon import ConfigFactory from databuilder import Scoped from databuilder.extractor.feast_extractor import FeastExtractor from databuilder.models.table_metadata import ( - TableMetadata, - ColumnMetadata, - DescriptionMetadata, + ColumnMetadata, DescriptionMetadata, TableMetadata, ) @@ -167,15 +165,9 @@ def test_feature_table_extraction_with_description_stream(self) -> None: def _init_extractor(self, programmatic_description_enabled: bool = True) -> None: conf = { - "extractor.feast.{}".format( - FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY - ): "feast-core.example.com:6565", - "extractor.feast.{}".format( - FeastExtractor.FEAST_SERVICE_CONFIG_KEY - ): "unittest-feast-instance", - "extractor.feast.{}".format( - FeastExtractor.DESCRIBE_FEATURE_TABLES - ): programmatic_description_enabled, + f'extractor.feast.{FeastExtractor.FEAST_ENDPOINT_CONFIG_KEY}': 'feast-core.example.com:6565', + 
f'extractor.feast.{FeastExtractor.FEAST_SERVICE_CONFIG_KEY}': 'unittest-feast-instance', + f'extractor.feast.{FeastExtractor.DESCRIBE_FEATURE_TABLES}': programmatic_description_enabled, } self.extractor = FeastExtractor() self.extractor.init( @@ -190,7 +182,7 @@ def _strip_margin(text: str) -> str: return re.sub("\n[ \t]*\\|", "\n", text) def _mock_feature_table( - self, labels: dict = {}, add_stream_source: bool = False + self, labels: dict = {}, add_stream_source: bool = False ) -> None: table_spec = { "name": "driver_trips", diff --git a/databuilder/tests/unit/extractor/test_generic_extractor.py b/databuilder/tests/unit/extractor/test_generic_extractor.py index e34bc9ed8e..5641269ccb 100644 --- a/databuilder/tests/unit/extractor/test_generic_extractor.py +++ b/databuilder/tests/unit/extractor/test_generic_extractor.py @@ -17,8 +17,7 @@ def test_extraction_with_model_class(self) -> None: """ config_dict = { 'extractor.generic.extraction_items': [{'timestamp': 10000000}], - 'extractor.generic.model_class': - 'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated', + 'extractor.generic.model_class': 'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated', } conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_glue_extractor.py b/databuilder/tests/unit/extractor/test_glue_extractor.py index 79c79c471d..815e7eb386 100644 --- a/databuilder/tests/unit/extractor/test_glue_extractor.py +++ b/databuilder/tests/unit/extractor/test_glue_extractor.py @@ -8,7 +8,7 @@ from pyhocon import ConfigFactory from databuilder.extractor.glue_extractor import GlueExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata # patch whole class to avoid actually calling for boto3.client during tests diff --git a/databuilder/tests/unit/extractor/test_hive_table_last_updated_extractor.py b/databuilder/tests/unit/extractor/test_hive_table_last_updated_extractor.py index 3f36cccb17..6ef31b755b 100644 --- a/databuilder/tests/unit/extractor/test_hive_table_last_updated_extractor.py +++ b/databuilder/tests/unit/extractor/test_hive_table_last_updated_extractor.py @@ -5,18 +5,19 @@ import logging import unittest from datetime import datetime -from pytz import UTC +from typing import ( + Iterable, Iterator, Optional, TypeVar, +) -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory -from typing import Iterable, Iterator, Optional, TypeVar +from pytz import UTC from databuilder.extractor.hive_table_last_updated_extractor import HiveTableLastUpdatedExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_last_updated import TableLastUpdated from databuilder.filesystem.filesystem import FileSystem from databuilder.filesystem.metadata import FileMetadata - +from databuilder.models.table_last_updated import TableLastUpdated T = TypeVar('T') @@ -36,11 +37,9 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) def test_extraction_with_empty_query_result(self) -> None: - config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', - 'filesystem.{}'.format(FileSystem.DASK_FILE_SYSTEM): MagicMock() + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', + f'filesystem.{FileSystem.DASK_FILE_SYSTEM}': MagicMock() } conf = ConfigFactory.from_dict(config_dict) with 
patch.object(SQLAlchemyExtractor, '_get_connection'): @@ -52,24 +51,25 @@ def test_extraction_with_empty_query_result(self) -> None: def test_extraction_with_partition_table_result(self) -> None: config_dict = { - 'filesystem.{}'.format(FileSystem.DASK_FILE_SYSTEM): MagicMock() + f'filesystem.{FileSystem.DASK_FILE_SYSTEM}': MagicMock() } conf = ConfigFactory.from_dict(config_dict) pt_alchemy_extractor_instance = MagicMock() non_pt_alchemy_extractor_instance = MagicMock() with patch.object(HiveTableLastUpdatedExtractor, '_get_partitioned_table_sql_alchemy_extractor', - return_value=pt_alchemy_extractor_instance),\ + return_value=pt_alchemy_extractor_instance), \ patch.object(HiveTableLastUpdatedExtractor, '_get_non_partitioned_table_sql_alchemy_extractor', return_value=non_pt_alchemy_extractor_instance): - pt_alchemy_extractor_instance.extract = MagicMock(side_effect=null_iterator([ - {'schema': 'foo_schema', - 'table_name': 'table_1', - 'last_updated_time': 1}, - {'schema': 'foo_schema', - 'table_name': 'table_2', - 'last_updated_time': 2} - ])) + pt_alchemy_extractor_instance.extract = MagicMock(side_effect=null_iterator([{ + 'schema': 'foo_schema', + 'table_name': 'table_1', + 'last_updated_time': 1 + }, { + 'schema': 'foo_schema', + 'table_name': 'table_2', + 'last_updated_time': 2 + }])) non_pt_alchemy_extractor_instance.extract = MagicMock(return_value=None) @@ -102,20 +102,19 @@ def test_extraction(self) -> None: pt_alchemy_extractor_instance = MagicMock() non_pt_alchemy_extractor_instance = MagicMock() - with patch.object(HiveTableLastUpdatedExtractor, - '_get_partitioned_table_sql_alchemy_extractor', return_value=pt_alchemy_extractor_instance), \ - patch.object(HiveTableLastUpdatedExtractor, - '_get_non_partitioned_table_sql_alchemy_extractor', + with patch.object(HiveTableLastUpdatedExtractor, '_get_partitioned_table_sql_alchemy_extractor', + return_value=pt_alchemy_extractor_instance), \ + patch.object(HiveTableLastUpdatedExtractor, '_get_non_partitioned_table_sql_alchemy_extractor', return_value=non_pt_alchemy_extractor_instance), \ - patch.object(HiveTableLastUpdatedExtractor, - '_get_filesystem', return_value=fs): + patch.object(HiveTableLastUpdatedExtractor, '_get_filesystem', + return_value=fs): pt_alchemy_extractor_instance.extract = MagicMock(return_value=None) - non_pt_alchemy_extractor_instance.extract = MagicMock(side_effect=null_iterator([ - {'schema': 'foo_schema', - 'table_name': 'table_1', - 'location': '/foo/bar'}, - ])) + non_pt_alchemy_extractor_instance.extract = MagicMock(side_effect=null_iterator([{ + 'schema': 'foo_schema', + 'table_name': 'table_1', + 'location': '/foo/bar' + }])) extractor = HiveTableLastUpdatedExtractor() extractor.init(ConfigFactory.from_dict({})) diff --git a/databuilder/tests/unit/extractor/test_hive_table_metadata_extractor.py b/databuilder/tests/unit/extractor/test_hive_table_metadata_extractor.py index a603f80f74..d2ffbe4227 100644 --- a/databuilder/tests/unit/extractor/test_hive_table_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_hive_table_metadata_extractor.py @@ -3,14 +3,14 @@ import logging import unittest +from typing import Any, Dict -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory -from typing import Any, Dict from databuilder.extractor.hive_table_metadata_extractor import HiveTableMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata 
+from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestHiveTableMetadataExtractor(unittest.TestCase): @@ -18,8 +18,7 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -38,8 +37,8 @@ def test_extraction_with_empty_query_result(self) -> None: def test_extraction_with_single_result(self) -> None: with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection, \ - patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm', - return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT): + patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm', + return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT): connection = MagicMock() mock_connection.return_value = connection sql_execute = MagicMock() @@ -106,8 +105,8 @@ def test_extraction_with_single_result(self) -> None: def test_extraction_with_multiple_result(self) -> None: with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection, \ - patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm', - return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT): + patch.object(HiveTableMetadataExtractor, '_choose_default_sql_stm', + return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT): connection = MagicMock() mock_connection.return_value = connection sql_execute = MagicMock() @@ -237,8 +236,7 @@ def setUp(self) -> None: config_dict = { HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -263,8 +261,7 @@ def test_hive_sql_statement_with_custom_sql(self) -> None: return_value=HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT): config_dict = { HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', HiveTableMetadataExtractor.EXTRACT_SQL: 'select sth for test {where_clause_suffix}' } diff --git a/databuilder/tests/unit/extractor/test_kafka_source_extractor.py b/databuilder/tests/unit/extractor/test_kafka_source_extractor.py index 7227143c87..73d2b9aa5f 100644 --- a/databuilder/tests/unit/extractor/test_kafka_source_extractor.py +++ b/databuilder/tests/unit/extractor/test_kafka_source_extractor.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from mock import patch, MagicMock import unittest +from mock import MagicMock, patch from pyhocon import ConfigFactory from databuilder import Scoped @@ -15,13 +15,11 @@ class TestKafkaSourceExtractor(unittest.TestCase): def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.kafka_source.consumer_config': {'"group.id"': 'consumer-group', - '"enable.auto.commit"': False}, - 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.RAW_VALUE_TRANSFORMER): + f'extractor.kafka_source.consumer_config': {'"group.id"': 'consumer-group', '"enable.auto.commit"': False}, + f'extractor.kafka_source.{KafkaSourceExtractor.RAW_VALUE_TRANSFORMER}': 
'databuilder.transformer.base_transformer.NoopTransformer', - 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.TOPIC_NAME_LIST): ['test-topic'], - 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.CONSUMER_TOTAL_TIMEOUT_SEC): 1, - + f'extractor.kafka_source.{KafkaSourceExtractor.TOPIC_NAME_LIST}': ['test-topic'], + f'extractor.kafka_source.{KafkaSourceExtractor.CONSUMER_TOTAL_TIMEOUT_SEC}': 1, } self.conf = ConfigFactory.from_dict(config_dict) @@ -31,7 +29,6 @@ def test_consume_success(self) -> None: scope=kafka_extractor.get_scope())) with patch.object(kafka_extractor, 'consumer') as mock_consumer: - mock_poll = MagicMock() mock_poll.error.return_value = False # only return once diff --git a/databuilder/tests/unit/extractor/test_mssql_metadata_extractor.py b/databuilder/tests/unit/extractor/test_mssql_metadata_extractor.py index d99ae6f734..2b18199f12 100644 --- a/databuilder/tests/unit/extractor/test_mssql_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_mssql_metadata_extractor.py @@ -3,14 +3,14 @@ import logging import unittest +from typing import Any, Dict -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory -from typing import Any, Dict from databuilder.extractor.mssql_metadata_extractor import MSSQLMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestMSSQLMetadataExtractor(unittest.TestCase): @@ -18,15 +18,11 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', - 'extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY): - 'MY_CLUSTER', - 'extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): - False, - 'extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.DATABASE_KEY): - 'mssql', - 'extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): '' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', + f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}': 'MY_CLUSTER', + f'extractor.mssql_metadata.{MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': False, + f'extractor.mssql_metadata.{MSSQLMetadataExtractor.DATABASE_KEY}': 'mssql', + f'extractor.mssql_metadata.{MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': '' } self.conf = ConfigFactory.from_dict(config_dict) @@ -51,7 +47,7 @@ def test_extraction_with_single_result(self) -> None: 'name': 'test_table', 'description': 'a table for testing', 'cluster': - self.conf['extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY)] + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'] } sql_execute.return_value = [ @@ -111,21 +107,21 @@ def test_extraction_with_multiple_result(self) -> None: 'name': 'test_table1', 'description': 'test table 1', 'cluster': - self.conf['extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY)] + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'] } table1 = {'schema_name': 'test_schema1', 'name': 'test_table2', 'description': 'test table 2', 'cluster': - self.conf['extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY)] + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'] } 
table2 = {'schema_name': 'test_schema2', 'name': 'test_table3', 'description': 'test table 3', 'cluster': - self.conf['extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY)] + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'] } sql_execute.return_value = [ @@ -185,8 +181,7 @@ def test_extraction_with_multiple_result(self) -> None: extractor.init(self.conf) expected = TableMetadata('mssql', - self.conf['extractor.mssql_metadata.{}'.format( - MSSQLMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'], 'test_schema1', 'test_table1', 'test table 1', [ColumnMetadata('col_id1', 'description of col_id1', 'bigint', 0), ColumnMetadata('col_id2', 'description of col_id2', 'bigint', 1), @@ -197,8 +192,7 @@ def test_extraction_with_multiple_result(self) -> None: self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) expected = TableMetadata('mssql', - self.conf['extractor.mssql_metadata.{}'.format( - MSSQLMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'], 'test_schema1', 'test_table2', 'test table 2', [ColumnMetadata('col_name', 'description of col_name', 'varchar', 0), ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)], @@ -206,8 +200,7 @@ def test_extraction_with_multiple_result(self) -> None: self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) expected = TableMetadata('mssql', - self.conf['extractor.mssql_metadata.{}'.format( - MSSQLMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'], 'test_schema2', 'test_table3', 'test table 3', [ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0), ColumnMetadata('col_name3', 'description of col_name3', @@ -233,8 +226,7 @@ def setUp(self) -> None: config_dict = { MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -256,8 +248,7 @@ def setUp(self) -> None: config_dict = { MSSQLMetadataExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } self.conf = ConfigFactory.from_dict(config_dict) @@ -278,8 +269,7 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } self.conf = ConfigFactory.from_dict(config_dict) @@ -302,8 +292,7 @@ def setUp(self) -> None: config_dict = { MSSQLMetadataExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: True } self.conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_neo4j_es_last_updated_extractor.py b/databuilder/tests/unit/extractor/test_neo4j_es_last_updated_extractor.py index 96af15902b..1313bfee01 
100644 --- a/databuilder/tests/unit/extractor/test_neo4j_es_last_updated_extractor.py +++ b/databuilder/tests/unit/extractor/test_neo4j_es_last_updated_extractor.py @@ -1,10 +1,10 @@ # Copyright Contributors to the Amundsen project. # SPDX-License-Identifier: Apache-2.0 -from mock import patch -from typing import Any import unittest +from typing import Any +from mock import patch from pyhocon import ConfigFactory from databuilder import Scoped diff --git a/databuilder/tests/unit/extractor/test_neo4j_extractor.py b/databuilder/tests/unit/extractor/test_neo4j_extractor.py index 7ae73f8426..a2aff38099 100644 --- a/databuilder/tests/unit/extractor/test_neo4j_extractor.py +++ b/databuilder/tests/unit/extractor/test_neo4j_extractor.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import unittest +from typing import Any from mock import patch from pyhocon import ConfigFactory -from typing import Any from databuilder import Scoped from databuilder.extractor.neo4j_extractor import Neo4jExtractor @@ -16,10 +16,10 @@ class TestNeo4jExtractor(unittest.TestCase): def setUp(self) -> None: config_dict = { - 'extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): 'TEST_GRAPH_URL', - 'extractor.neo4j.{}'.format(Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY): 'TEST_QUERY', - 'extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): 'TEST_USER', - 'extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): 'TEST_PW' + f'extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'TEST_GRAPH_URL', + f'extractor.neo4j.{Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY}': 'TEST_QUERY', + f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': 'TEST_USER', + f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': 'TEST_PW' } self.conf = ConfigFactory.from_dict(config_dict) @@ -83,11 +83,11 @@ def test_extraction_with_model_class(self: Any) -> None: Test Extraction using model class """ config_dict = { - 'extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): 'TEST_GRAPH_URL', - 'extractor.neo4j.{}'.format(Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY): 'TEST_QUERY', - 'extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): 'TEST_USER', - 'extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): 'TEST_PW', - 'extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): + f'extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'TEST_GRAPH_URL', + f'extractor.neo4j.{Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY}': 'TEST_QUERY', + f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': 'TEST_USER', + f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': 'TEST_PW', + f'extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': 'databuilder.models.table_elasticsearch_document.TableESDocument' } diff --git a/databuilder/tests/unit/extractor/test_neo4j_search_data_extractor.py b/databuilder/tests/unit/extractor/test_neo4j_search_data_extractor.py index 76bc153587..25df5a78a9 100644 --- a/databuilder/tests/unit/extractor/test_neo4j_search_data_extractor.py +++ b/databuilder/tests/unit/extractor/test_neo4j_search_data_extractor.py @@ -2,10 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import unittest -from mock import patch from typing import Any +from mock import patch from pyhocon import ConfigFactory + from databuilder import Scoped from databuilder.extractor.neo4j_extractor import Neo4jExtractor from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor @@ -30,14 +31,10 @@ def test_default_search_query(self: Any) -> None: with patch.object(Neo4jExtractor, '_get_driver'): extractor = 
Neo4jSearchDataExtractor() conf = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): - 'test-endpoint', - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): - 'test-user', - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): - 'test-passwd', - 'extractor.search_data.{}'.format(Neo4jSearchDataExtractor.ENTITY_TYPE): - 'dashboard', + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'test-endpoint', + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': 'test-user', + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': 'test-passwd', + f'extractor.search_data.{Neo4jSearchDataExtractor.ENTITY_TYPE}': 'dashboard', }) extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope())) @@ -48,16 +45,11 @@ def test_default_search_query_with_tag(self: Any) -> None: with patch.object(Neo4jExtractor, '_get_driver'): extractor = Neo4jSearchDataExtractor() conf = ConfigFactory.from_dict({ - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): - 'test-endpoint', - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): - 'test-user', - 'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): - 'test-passwd', - 'extractor.search_data.{}'.format(Neo4jSearchDataExtractor.ENTITY_TYPE): - 'dashboard', - 'extractor.search_data.{}'.format(JOB_PUBLISH_TAG): - 'test-date', + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'test-endpoint', + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': 'test-user', + f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': 'test-passwd', + f'extractor.search_data.{Neo4jSearchDataExtractor.ENTITY_TYPE}': 'dashboard', + f'extractor.search_data.{JOB_PUBLISH_TAG}': 'test-date', }) extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope())) diff --git a/databuilder/tests/unit/extractor/test_postgres_metadata_extractor.py b/databuilder/tests/unit/extractor/test_postgres_metadata_extractor.py index 2d70e499a9..90b74959db 100644 --- a/databuilder/tests/unit/extractor/test_postgres_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_postgres_metadata_extractor.py @@ -3,14 +3,14 @@ import logging import unittest +from typing import Any, Dict -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory -from typing import Any, Dict from databuilder.extractor.postgres_metadata_extractor import PostgresMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestPostgresMetadataExtractor(unittest.TestCase): @@ -18,8 +18,7 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', PostgresMetadataExtractor.CLUSTER_KEY: 'MY_CLUSTER', PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False, PostgresMetadataExtractor.DATABASE_KEY: 'postgres' @@ -47,7 +46,7 @@ def test_extraction_with_single_result(self) -> None: 'name': 'test_table', 'description': 'a table for 
testing', 'cluster': - self.conf[PostgresMetadataExtractor.CLUSTER_KEY] + self.conf[PostgresMetadataExtractor.CLUSTER_KEY] } sql_execute.return_value = [ @@ -107,21 +106,21 @@ def test_extraction_with_multiple_result(self) -> None: 'name': 'test_table1', 'description': 'test table 1', 'cluster': - self.conf[PostgresMetadataExtractor.CLUSTER_KEY] + self.conf[PostgresMetadataExtractor.CLUSTER_KEY] } table1 = {'schema': 'test_schema1', 'name': 'test_table2', 'description': 'test table 2', 'cluster': - self.conf[PostgresMetadataExtractor.CLUSTER_KEY] + self.conf[PostgresMetadataExtractor.CLUSTER_KEY] } table2 = {'schema': 'test_schema2', 'name': 'test_table3', 'description': 'test table 3', 'cluster': - self.conf[PostgresMetadataExtractor.CLUSTER_KEY] + self.conf[PostgresMetadataExtractor.CLUSTER_KEY] } sql_execute.return_value = [ @@ -225,8 +224,7 @@ def setUp(self) -> None: config_dict = { PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -248,8 +246,7 @@ def setUp(self) -> None: config_dict = { PostgresMetadataExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } self.conf = ConfigFactory.from_dict(config_dict) @@ -270,8 +267,7 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } self.conf = ConfigFactory.from_dict(config_dict) @@ -294,8 +290,7 @@ def setUp(self) -> None: config_dict = { PostgresMetadataExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: True } self.conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_presto_view_metadata_extractor.py b/databuilder/tests/unit/extractor/test_presto_view_metadata_extractor.py index b1c4ba117a..5646f56974 100644 --- a/databuilder/tests/unit/extractor/test_presto_view_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_presto_view_metadata_extractor.py @@ -6,12 +6,12 @@ import logging import unittest -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory from databuilder.extractor.presto_view_metadata_extractor import PrestoViewMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestPrestoViewMetadataExtractor(unittest.TestCase): @@ -19,8 +19,7 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = 
ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_redshift_metadata_extractor.py b/databuilder/tests/unit/extractor/test_redshift_metadata_extractor.py index 0983c09b55..636f1ec7d8 100644 --- a/databuilder/tests/unit/extractor/test_redshift_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_redshift_metadata_extractor.py @@ -3,14 +3,14 @@ import logging import unittest +from typing import Any, Dict -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory -from typing import Any, Dict from databuilder.extractor.redshift_metadata_extractor import RedshiftMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestRedshiftMetadataExtractor(unittest.TestCase): @@ -18,8 +18,7 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', RedshiftMetadataExtractor.CLUSTER_KEY: 'MY_CLUSTER', RedshiftMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False, RedshiftMetadataExtractor.DATABASE_KEY: 'redshift' @@ -47,7 +46,7 @@ def test_extraction_with_single_result(self) -> None: 'name': 'test_table', 'description': 'a table for testing', 'cluster': - self.conf[RedshiftMetadataExtractor.CLUSTER_KEY] + self.conf[RedshiftMetadataExtractor.CLUSTER_KEY] } sql_execute.return_value = [ @@ -113,7 +112,7 @@ def setUp(self) -> None: config_dict = { RedshiftMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_snowflake_metadata_extractor.py b/databuilder/tests/unit/extractor/test_snowflake_metadata_extractor.py index 12c12806d6..e25758cd21 100644 --- a/databuilder/tests/unit/extractor/test_snowflake_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_snowflake_metadata_extractor.py @@ -3,14 +3,14 @@ import logging import unittest +from typing import Any, Dict -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory -from typing import Any, Dict from databuilder.extractor.snowflake_metadata_extractor import SnowflakeMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestSnowflakeMetadataExtractor(unittest.TestCase): @@ -18,14 +18,10 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', - 'extractor.snowflake_metadata.{}'.format(SnowflakeMetadataExtractor.CLUSTER_KEY): - 'MY_CLUSTER', - 'extractor.snowflake_metadata.{}'.format(SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): - False, - 'extractor.snowflake_metadata.{}'.format(SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY): - 'prod' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', + 
f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}': 'MY_CLUSTER', + f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': False, + f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': 'prod' } self.conf = ConfigFactory.from_dict(config_dict) @@ -49,8 +45,7 @@ def test_extraction_with_single_result(self) -> None: table = {'schema': 'test_schema', 'name': 'test_table', 'description': 'a table for testing', - 'cluster': - self.conf['extractor.snowflake_metadata.{}'.format(SnowflakeMetadataExtractor.CLUSTER_KEY)], + 'cluster': self.conf[f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'], 'is_view': 'false' } @@ -112,7 +107,7 @@ def test_extraction_with_multiple_result(self) -> None: 'name': 'test_table1', 'description': 'test table 1', 'cluster': - self.conf['extractor.snowflake_metadata.{}'.format(SnowflakeMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'], 'is_view': 'nottrue' } @@ -120,7 +115,7 @@ def test_extraction_with_multiple_result(self) -> None: 'name': 'test_table2', 'description': 'test table 2', 'cluster': - self.conf['extractor.snowflake_metadata.{}'.format(SnowflakeMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'], 'is_view': 'false' } @@ -128,7 +123,7 @@ def test_extraction_with_multiple_result(self) -> None: 'name': 'test_table3', 'description': 'test table 3', 'cluster': - self.conf['extractor.snowflake_metadata.{}'.format(SnowflakeMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'], 'is_view': 'true' } @@ -189,8 +184,8 @@ def test_extraction_with_multiple_result(self) -> None: extractor.init(self.conf) expected = TableMetadata('snowflake', - self.conf['extractor.snowflake_metadata.{}'.format( - SnowflakeMetadataExtractor.CLUSTER_KEY)], + self.conf[ + f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'], 'test_schema1', 'test_table1', 'test table 1', [ColumnMetadata('col_id1', 'description of col_id1', 'number', 0), ColumnMetadata('col_id2', 'description of col_id2', 'number', 1), @@ -202,16 +197,16 @@ def test_extraction_with_multiple_result(self) -> None: self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) expected = TableMetadata('snowflake', - self.conf['extractor.snowflake_metadata.{}'.format( - SnowflakeMetadataExtractor.CLUSTER_KEY)], + self.conf[ + f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'], 'test_schema1', 'test_table2', 'test table 2', [ColumnMetadata('col_name', 'description of col_name', 'varchar', 0), ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)]) self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) expected = TableMetadata('snowflake', - self.conf['extractor.snowflake_metadata.{}'.format( - SnowflakeMetadataExtractor.CLUSTER_KEY)], + self.conf[ + f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'], 'test_schema2', 'test_table3', 'test table 3', [ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0), ColumnMetadata('col_name3', 'description of col_name3', @@ -238,7 +233,7 @@ def setUp(self) -> None: config_dict = { SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + 
f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -261,7 +256,7 @@ def setUp(self) -> None: config_dict = { SnowflakeMetadataExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } @@ -285,7 +280,7 @@ def setUp(self) -> None: config_dict = { SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY: self.snowflake_database_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -308,7 +303,7 @@ def setUp(self) -> None: config_dict = { SnowflakeMetadataExtractor.DATABASE_KEY: self.database_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -359,7 +354,7 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } @@ -383,7 +378,7 @@ def setUp(self) -> None: config_dict = { SnowflakeMetadataExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: True } diff --git a/databuilder/tests/unit/extractor/test_snowflake_table_last_updated_extractor.py b/databuilder/tests/unit/extractor/test_snowflake_table_last_updated_extractor.py index d9e597c1d5..e6308079d1 100644 --- a/databuilder/tests/unit/extractor/test_snowflake_table_last_updated_extractor.py +++ b/databuilder/tests/unit/extractor/test_snowflake_table_last_updated_extractor.py @@ -3,7 +3,7 @@ import unittest -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory from databuilder.extractor.snowflake_table_last_updated_extractor import SnowflakeTableLastUpdatedExtractor @@ -14,15 +14,13 @@ class TestSnowflakeTableLastUpdatedExtractor(unittest.TestCase): def setUp(self) -> None: config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', - 'extractor.snowflake_table_last_updated.{}'.format(SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY): + f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY}': 'MY_CLUSTER', - 'extractor.snowflake_table_last_updated.{}'.format( - SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME): + f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME}': False, - 'extractor.snowflake_table_last_updated.{}'.format( - SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY): + f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY}': 'prod' } self.conf = ConfigFactory.from_dict(config_dict) @@ -51,8 +49,8 @@ def test_extraction_with_single_result(self) -> None: {'schema': 'test_schema', 'table_name': 'test_table', 'last_updated_time': 1000, - 
'cluster': self.conf['extractor.snowflake_table_last_updated.{}'.format( - SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)], + 'cluster': self.conf[ + f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY}'], } ] @@ -76,8 +74,8 @@ def test_extraction_with_multiple_result(self) -> None: sql_execute = MagicMock() connection.execute = sql_execute - default_cluster = self.conf['extractor.snowflake_table_last_updated.{}'.format( - SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)] + default_cluster = self.conf[ + f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY}'] table = {'schema': 'test_schema1', 'table_name': 'test_table1', @@ -124,6 +122,7 @@ class TestSnowflakeTableLastUpdatedExtractorWithWhereClause(unittest.TestCase): """ Test 'where_clause' config key in extractor """ + def setUp(self) -> None: self.where_clause_suffix = """ where table_schema in ('public') and table_name = 'movies' @@ -131,8 +130,7 @@ def setUp(self) -> None: config_dict = { SnowflakeTableLastUpdatedExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -150,12 +148,13 @@ class TestSnowflakeTableLastUpdatedExtractorClusterKeyNoTableCatalog(unittest.Te """ Test with 'USE_CATALOG_AS_CLUSTER_NAME' is false and 'CLUSTER_KEY' is specified """ + def setUp(self) -> None: self.cluster_key = "not_master" config_dict = { SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } @@ -175,13 +174,13 @@ class TestSnowflakeTableLastUpdatedExtractorDefaultSnowflakeDatabaseKey(unittest """ Test with SNOWFLAKE_DATABASE_KEY config specified """ + def setUp(self) -> None: self.snowflake_database_key = "not_prod" config_dict = { SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY: self.snowflake_database_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -199,13 +198,13 @@ class TestSnowflakeTableLastUpdatedExtractorDefaultDatabaseKey(unittest.TestCase """ Test with DATABASE_KEY config specified """ + def setUp(self) -> None: self.database_key = 'not_snowflake' config_dict = { SnowflakeTableLastUpdatedExtractor.DATABASE_KEY: self.database_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -250,10 +249,10 @@ class TestSnowflakeTableLastUpdatedExtractorNoClusterKeyNoTableCatalog(unittest. 
""" Test when USE_CATALOG_AS_CLUSTER_NAME is false and CLUSTER_KEY is NOT specified """ + def setUp(self) -> None: config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } self.conf = ConfigFactory.from_dict(config_dict) @@ -272,13 +271,13 @@ class TestSnowflakeTableLastUpdatedExtractorTableCatalogEnabled(unittest.TestCas """ Test when USE_CATALOG_AS_CLUSTER_NAME is true (CLUSTER_KEY should be ignored) """ + def setUp(self) -> None: self.cluster_key = "not_master" config_dict = { SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME: True } self.conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/test_sql_alchemy_extractor.py b/databuilder/tests/unit/extractor/test_sql_alchemy_extractor.py index 78dc73ccff..b9bd967a61 100644 --- a/databuilder/tests/unit/extractor/test_sql_alchemy_extractor.py +++ b/databuilder/tests/unit/extractor/test_sql_alchemy_extractor.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import unittest +from typing import Any from mock import patch from pyhocon import ConfigFactory -from typing import Any from databuilder import Scoped from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor diff --git a/databuilder/tests/unit/extractor/test_sql_server_metadata_extractor.py b/databuilder/tests/unit/extractor/test_sql_server_metadata_extractor.py index b4ce82f37b..10868463a1 100644 --- a/databuilder/tests/unit/extractor/test_sql_server_metadata_extractor.py +++ b/databuilder/tests/unit/extractor/test_sql_server_metadata_extractor.py @@ -3,14 +3,14 @@ import logging import unittest +from typing import Any, Dict -from mock import patch, MagicMock +from mock import MagicMock, patch from pyhocon import ConfigFactory -from typing import Any, Dict from databuilder.extractor.mssql_metadata_extractor import MSSQLMetadataExtractor from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor -from databuilder.models.table_metadata import TableMetadata, ColumnMetadata +from databuilder.models.table_metadata import ColumnMetadata, TableMetadata class TestMSSQLMetadataExtractor(unittest.TestCase): @@ -18,14 +18,10 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', - 'extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY): - 'MY_CLUSTER', - 'extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): - False, - 'extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.DATABASE_KEY): - 'mssql' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', + f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}': 'MY_CLUSTER', + f'extractor.mssql_metadata.{MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': False, + f'extractor.mssql_metadata.{MSSQLMetadataExtractor.DATABASE_KEY}': 'mssql' } self.conf = ConfigFactory.from_dict(config_dict) @@ -50,7 +46,7 @@ def test_extraction_with_single_result(self) -> None: 'name': 'test_table', 'description': 'a table for testing', 
'cluster': - self.conf['extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY)] + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'] } sql_execute.return_value = [ @@ -112,21 +108,21 @@ def test_extraction_with_multiple_result(self) -> None: 'name': 'test_table1', 'description': 'test table 1', 'cluster': - self.conf['extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY)] + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'] } table1 = {'schema_name': 'test_schema1', 'name': 'test_table2', 'description': 'test table 2', 'cluster': - self.conf['extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY)] + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'] } table2 = {'schema_name': 'test_schema2', 'name': 'test_table3', 'description': 'test table 3', 'cluster': - self.conf['extractor.mssql_metadata.{}'.format(MSSQLMetadataExtractor.CLUSTER_KEY)] + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'] } sql_execute.return_value = [ @@ -187,8 +183,7 @@ def test_extraction_with_multiple_result(self) -> None: expected = TableMetadata( 'mssql', - self.conf['extractor.mssql_metadata.{}'.format( - MSSQLMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'], 'test_schema1', 'test_table1', 'test table 1', [ColumnMetadata('col_id1', 'description of col_id1', 'bigint', 0), ColumnMetadata('col_id2', 'description of col_id2', 'bigint', 1), @@ -206,8 +201,7 @@ def test_extraction_with_multiple_result(self) -> None: expected = TableMetadata( 'mssql', - self.conf['extractor.mssql_metadata.{}'.format( - MSSQLMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'], 'test_schema1', 'test_table2', 'test table 2', [ColumnMetadata('col_name', 'description of col_name', 'varchar', 0), ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)], @@ -218,8 +212,7 @@ def test_extraction_with_multiple_result(self) -> None: expected = TableMetadata( 'mssql', - self.conf['extractor.mssql_metadata.{}'.format( - MSSQLMetadataExtractor.CLUSTER_KEY)], + self.conf[f'extractor.mssql_metadata.{MSSQLMetadataExtractor.CLUSTER_KEY}'], 'test_schema2', 'test_table3', 'test table 3', [ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0), ColumnMetadata('col_name3', 'description of col_name3', @@ -247,8 +240,7 @@ def setUp(self) -> None: config_dict = { MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY: self.where_clause_suffix, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION' + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION' } self.conf = ConfigFactory.from_dict(config_dict) @@ -270,7 +262,7 @@ def setUp(self) -> None: config_dict = { MSSQLMetadataExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } @@ -292,8 +284,7 @@ def setUp(self) -> None: logging.basicConfig(level=logging.INFO) config_dict = { - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: False } self.conf = ConfigFactory.from_dict(config_dict) @@ -316,8 +307,7 @@ def 
setUp(self) -> None: config_dict = { MSSQLMetadataExtractor.CLUSTER_KEY: self.cluster_key, - 'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): - 'TEST_CONNECTION', + f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': 'TEST_CONNECTION', MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME: True } self.conf = ConfigFactory.from_dict(config_dict) diff --git a/databuilder/tests/unit/extractor/user/bamboohr/test_bamboohr_user_extractor.py b/databuilder/tests/unit/extractor/user/bamboohr/test_bamboohr_user_extractor.py index ebba50bbec..68ae02482f 100644 --- a/databuilder/tests/unit/extractor/user/bamboohr/test_bamboohr_user_extractor.py +++ b/databuilder/tests/unit/extractor/user/bamboohr/test_bamboohr_user_extractor.py @@ -2,15 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import io -import unittest - import os +import unittest import responses from pyhocon import ConfigFactory -from databuilder.models.user import User from databuilder.extractor.user.bamboohr.bamboohr_user_extractor import BamboohrUserExtractor +from databuilder.models.user import User class TestBamboohrUserExtractor(unittest.TestCase): diff --git a/databuilder/tests/unit/loader/test_file_system_csv_loader.py b/databuilder/tests/unit/loader/test_file_system_csv_loader.py index a567fadc2b..0814abcf39 100644 --- a/databuilder/tests/unit/loader/test_file_system_csv_loader.py +++ b/databuilder/tests/unit/loader/test_file_system_csv_loader.py @@ -4,9 +4,9 @@ import shutil import tempfile import unittest +from typing import List from pyhocon import ConfigFactory -from typing import List from databuilder import Scoped from databuilder.loader.file_system_csv_loader import FileSystemCSVLoader @@ -17,7 +17,7 @@ class TestFileSystemCSVLoader(unittest.TestCase): def setUp(self) -> None: self.temp_dir_path = tempfile.mkdtemp() - self.dest_file_name = '{}/test_file.csv'.format(self.temp_dir_path) + self.dest_file_name = f'{self.temp_dir_path}/test_file.csv' self.file_mode = 'w' config_dict = {'loader.filesystem.csv.file_path': self.dest_file_name, 'loader.filesystem.csv.mode': self.file_mode} diff --git a/databuilder/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/databuilder/tests/unit/loader/test_file_system_elasticsearch_json_loader.py index e9c661734b..ef3d886083 100644 --- a/databuilder/tests/unit/loader/test_file_system_elasticsearch_json_loader.py +++ b/databuilder/tests/unit/loader/test_file_system_elasticsearch_json_loader.py @@ -5,9 +5,9 @@ import shutil import tempfile import unittest +from typing import List from pyhocon import ConfigFactory -from typing import List from databuilder import Scoped from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader @@ -18,7 +18,7 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase): def setUp(self) -> None: self.temp_dir_path = tempfile.mkdtemp() - self.dest_file_name = '{}/test_file.json'.format(self.temp_dir_path) + self.dest_file_name = f'{self.temp_dir_path}/test_file.json' self.file_mode = 'w' config_dict = {'loader.filesystem.elasticsearch.file_path': self.dest_file_name, 'loader.filesystem.elasticsearch.mode': self.file_mode} diff --git a/databuilder/tests/unit/loader/test_fs_neo4j_csv_loader.py b/databuilder/tests/unit/loader/test_fs_neo4j_csv_loader.py index 59a346b3a2..c37f2cd720 100644 --- a/databuilder/tests/unit/loader/test_fs_neo4j_csv_loader.py +++ b/databuilder/tests/unit/loader/test_fs_neo4j_csv_loader.py @@ -6,17 +6,25 @@ import logging import os import unittest +from operator import 
itemgetter from os import listdir from os.path import isfile, join +from typing import ( + Any, Callable, Dict, Iterable, Optional, Union, +) from pyhocon import ConfigFactory, ConfigTree -from typing import Dict, Iterable, Any, Callable, Optional, Union -from databuilder.models.graph_serializable import GraphSerializable, GraphNode, GraphRelationship from databuilder.job.base_job import Job from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader -from tests.unit.models.test_graph_serializable import Movie, Actor, City -from operator import itemgetter +from databuilder.models.graph_serializable import ( + GraphNode, GraphRelationship, GraphSerializable, +) +from tests.unit.models.test_graph_serializable import ( + Actor, City, Movie, +) + +here = os.path.dirname(__file__) class TestFsNeo4jCSVLoader(unittest.TestCase): @@ -40,16 +48,13 @@ def test_load(self) -> None: loader.load(movie) loader.close() - expected_node_path = '{}/../resources/fs_neo4j_csv_loader/{}/nodes'\ - .format(os.path.join(os.path.dirname(__file__)), folder) + expected_node_path = os.path.join(here, f'../resources/fs_neo4j_csv_loader/{folder}/nodes') expected_nodes = self._get_csv_rows(expected_node_path, itemgetter('KEY')) actual_nodes = self._get_csv_rows(conf.get_string(FsNeo4jCSVLoader.NODE_DIR_PATH), itemgetter('KEY')) self.assertEqual(expected_nodes, actual_nodes) - expected_rel_path = \ - '{}/../resources/fs_neo4j_csv_loader/{}/relationships' \ - .format(os.path.join(os.path.dirname(__file__)), folder) + expected_rel_path = os.path.join(here, f'../resources/fs_neo4j_csv_loader/{folder}/relationships') expected_relations = self._get_csv_rows(expected_rel_path, itemgetter('START_KEY', 'END_KEY')) actual_relations = self._get_csv_rows(conf.get_string(FsNeo4jCSVLoader.RELATION_DIR_PATH), itemgetter('START_KEY', 'END_KEY')) @@ -71,8 +76,7 @@ def test_load_disjoint_properties(self) -> None: loader.load(people[1]) loader.close() - expected_node_path = '{}/../resources/fs_neo4j_csv_loader/{}/nodes'\ - .format(os.path.join(os.path.dirname(__file__)), folder) + expected_node_path = os.path.join(here, f'../resources/fs_neo4j_csv_loader/{folder}/nodes') expected_nodes = self._get_csv_rows(expected_node_path, itemgetter('KEY')) actual_nodes = self._get_csv_rows(conf.get_string(FsNeo4jCSVLoader.NODE_DIR_PATH), itemgetter('KEY')) @@ -81,11 +85,11 @@ def test_load_disjoint_properties(self) -> None: def _make_conf(self, test_name: str) -> ConfigTree: prefix = '/var/tmp/TestFsNeo4jCSVLoader' - return ConfigFactory.from_dict( - {FsNeo4jCSVLoader.NODE_DIR_PATH: '{}/{}/{}'.format(prefix, test_name, 'nodes'), - FsNeo4jCSVLoader.RELATION_DIR_PATH: '{}/{}/{}' - .format(prefix, test_name, 'relationships'), - FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR: True}) + return ConfigFactory.from_dict({ + FsNeo4jCSVLoader.NODE_DIR_PATH: f'{prefix}/{test_name}/{"nodes"}', + FsNeo4jCSVLoader.RELATION_DIR_PATH: f'{prefix}/{test_name}/{"relationships"}', + FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR: True + }) def _get_csv_rows(self, path: str, diff --git a/databuilder/tests/unit/loader/test_generic_loader.py b/databuilder/tests/unit/loader/test_generic_loader.py index aeb7f4ccf1..b44f4219f2 100644 --- a/databuilder/tests/unit/loader/test_generic_loader.py +++ b/databuilder/tests/unit/loader/test_generic_loader.py @@ -6,7 +6,7 @@ from mock import MagicMock from pyhocon import ConfigFactory -from databuilder.loader.generic_loader import GenericLoader, CALLBACK_FUNCTION +from databuilder.loader.generic_loader import CALLBACK_FUNCTION, 
GenericLoader class TestGenericLoader(unittest.TestCase): diff --git a/databuilder/tests/unit/models/dashboard/test_dashboard_chart.py b/databuilder/tests/unit/models/dashboard/test_dashboard_chart.py index 73b24d1c1d..bf10d139c2 100644 --- a/databuilder/tests/unit/models/dashboard/test_dashboard_chart.py +++ b/databuilder/tests/unit/models/dashboard/test_dashboard_chart.py @@ -2,12 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 import unittest - from typing import Any, Dict from databuilder.models.dashboard.dashboard_chart import DashboardChart -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, RELATION_START_LABEL, + RELATION_TYPE, +) from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/dashboard/test_dashboard_last_modified.py b/databuilder/tests/unit/models/dashboard/test_dashboard_last_modified.py index 58b7e96dd5..dd5dbd8c0f 100644 --- a/databuilder/tests/unit/models/dashboard/test_dashboard_last_modified.py +++ b/databuilder/tests/unit/models/dashboard/test_dashboard_last_modified.py @@ -2,12 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 import unittest - from typing import Any, Dict from databuilder.models.dashboard.dashboard_last_modified import DashboardLastModifiedTimestamp -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, RELATION_START_LABEL, + RELATION_TYPE, +) from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/dashboard/test_dashboard_owner.py b/databuilder/tests/unit/models/dashboard/test_dashboard_owner.py index 191ea00f24..ab6d4735ca 100644 --- a/databuilder/tests/unit/models/dashboard/test_dashboard_owner.py +++ b/databuilder/tests/unit/models/dashboard/test_dashboard_owner.py @@ -4,8 +4,10 @@ import unittest from databuilder.models.dashboard.dashboard_owner import DashboardOwner -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, RELATION_START_LABEL, + RELATION_TYPE, +) from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/dashboard/test_dashboard_query.py b/databuilder/tests/unit/models/dashboard/test_dashboard_query.py index 29bb0f26b2..fe791b2e16 100644 --- a/databuilder/tests/unit/models/dashboard/test_dashboard_query.py +++ b/databuilder/tests/unit/models/dashboard/test_dashboard_query.py @@ -4,9 +4,10 @@ import unittest from databuilder.models.dashboard.dashboard_query import DashboardQuery -from databuilder.models.graph_serializable import NODE_KEY, \ - NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + NODE_KEY, NODE_LABEL, RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, + RELATION_START_LABEL, RELATION_TYPE, +) 
from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/dashboard/test_dashboard_table.py b/databuilder/tests/unit/models/dashboard/test_dashboard_table.py index a65d8301a2..f4f00cebee 100644 --- a/databuilder/tests/unit/models/dashboard/test_dashboard_table.py +++ b/databuilder/tests/unit/models/dashboard/test_dashboard_table.py @@ -4,8 +4,10 @@ import unittest from databuilder.models.dashboard.dashboard_table import DashboardTable -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, RELATION_START_LABEL, + RELATION_TYPE, +) from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/dashboard/test_dashboard_usage.py b/databuilder/tests/unit/models/dashboard/test_dashboard_usage.py index 2c7c04bf52..f1ae455f09 100644 --- a/databuilder/tests/unit/models/dashboard/test_dashboard_usage.py +++ b/databuilder/tests/unit/models/dashboard/test_dashboard_usage.py @@ -2,12 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 import unittest - from typing import Any, Dict from databuilder.models.dashboard.dashboard_usage import DashboardUsage -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, RELATION_START_LABEL, + RELATION_TYPE, +) from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/test_application.py b/databuilder/tests/unit/models/test_application.py index c3f7b1a50c..ecf909cf59 100644 --- a/databuilder/tests/unit/models/test_application.py +++ b/databuilder/tests/unit/models/test_application.py @@ -2,12 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 import unittest -from databuilder.models.application import Application - -from databuilder.models.graph_serializable import NODE_KEY, \ - NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.application import Application +from databuilder.models.graph_serializable import ( + NODE_KEY, NODE_LABEL, RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, + RELATION_START_LABEL, RELATION_TYPE, +) from databuilder.models.table_metadata import TableMetadata from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/test_badge.py b/databuilder/tests/unit/models/test_badge.py index c2b3afd220..91016d973d 100644 --- a/databuilder/tests/unit/models/test_badge.py +++ b/databuilder/tests/unit/models/test_badge.py @@ -2,10 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 import unittest + from databuilder.models.badge import Badge, BadgeMetadata +from databuilder.models.graph_serializable import ( + NODE_KEY, NODE_LABEL, RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, + RELATION_START_LABEL, RELATION_TYPE, +) from databuilder.serializers import neo4_serializer -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE, NODE_KEY, NODE_LABEL db = 
'hive' SCHEMA = 'BASE' diff --git a/databuilder/tests/unit/models/test_graph_serializable.py b/databuilder/tests/unit/models/test_graph_serializable.py index f46e5c1693..414520c7a6 100644 --- a/databuilder/tests/unit/models/test_graph_serializable.py +++ b/databuilder/tests/unit/models/test_graph_serializable.py @@ -2,12 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import unittest +from typing import Iterable, Union -from typing import Union, Iterable - -from databuilder.models.graph_serializable import GraphSerializable -from databuilder.models.graph_relationship import GraphRelationship from databuilder.models.graph_node import GraphNode +from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import GraphSerializable from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/test_neo4j_es_last_updated.py b/databuilder/tests/unit/models/test_neo4j_es_last_updated.py index f40b9a1b16..99c0749d2c 100644 --- a/databuilder/tests/unit/models/test_neo4j_es_last_updated.py +++ b/databuilder/tests/unit/models/test_neo4j_es_last_updated.py @@ -2,10 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import unittest -from databuilder.models.neo4j_es_last_updated import Neo4jESLastUpdated -from databuilder.models.graph_serializable import NODE_KEY, \ - NODE_LABEL +from databuilder.models.graph_serializable import NODE_KEY, NODE_LABEL +from databuilder.models.neo4j_es_last_updated import Neo4jESLastUpdated from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/test_table_column_usage.py b/databuilder/tests/unit/models/test_table_column_usage.py index 78da735f4d..e730347444 100644 --- a/databuilder/tests/unit/models/test_table_column_usage.py +++ b/databuilder/tests/unit/models/test_table_column_usage.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import unittest +from typing import no_type_check from databuilder.models.table_column_usage import ColumnReader, TableColumnUsage -from typing import no_type_check from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/test_table_last_updated.py b/databuilder/tests/unit/models/test_table_last_updated.py index cc732339bc..632a8d6d0c 100644 --- a/databuilder/tests/unit/models/test_table_last_updated.py +++ b/databuilder/tests/unit/models/test_table_last_updated.py @@ -3,9 +3,10 @@ import unittest -from databuilder.models.graph_serializable import NODE_KEY, \ - NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + NODE_KEY, NODE_LABEL, RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, + RELATION_START_LABEL, RELATION_TYPE, +) from databuilder.models.table_last_updated import TableLastUpdated from databuilder.models.timestamp import timestamp_constants from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/test_table_lineage.py b/databuilder/tests/unit/models/test_table_lineage.py index 2119d518a7..710a27eb43 100644 --- a/databuilder/tests/unit/models/test_table_lineage.py +++ b/databuilder/tests/unit/models/test_table_lineage.py @@ -3,12 +3,13 @@ import unittest +from databuilder.models.graph_serializable import ( + RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, RELATION_START_LABEL, + RELATION_TYPE, +) from databuilder.models.table_lineage import 
TableLineage -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE from databuilder.serializers import neo4_serializer - DB = 'hive' SCHEMA = 'base' TABLE = 'test' @@ -41,14 +42,8 @@ def test_create_relation(self) -> None: relations = self.table_lineage.create_relation() self.assertEqual(len(relations), 2) - start_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB, - schema=SCHEMA, - tbl=TABLE, - cluster=CLUSTER) - end_key1 = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB, - schema='test_schema', - tbl='test_table1', - cluster=CLUSTER) + start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}' + end_key1 = f'{DB}://{CLUSTER}.test_schema/test_table1' relation = { RELATION_START_KEY: start_key, diff --git a/databuilder/tests/unit/models/test_table_owner.py b/databuilder/tests/unit/models/test_table_owner.py index 841a6da29c..a6c6f3110c 100644 --- a/databuilder/tests/unit/models/test_table_owner.py +++ b/databuilder/tests/unit/models/test_table_owner.py @@ -2,16 +2,15 @@ # SPDX-License-Identifier: Apache-2.0 import unittest -from databuilder.models.user import User -from databuilder.models.table_owner import TableOwner - -from databuilder.models.graph_serializable import NODE_KEY, NODE_LABEL, \ - RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + NODE_KEY, NODE_LABEL, RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, + RELATION_START_LABEL, RELATION_TYPE, +) +from databuilder.models.table_owner import TableOwner +from databuilder.models.user import User from databuilder.serializers import neo4_serializer - db = 'hive' SCHEMA = 'BASE' TABLE = 'TEST' diff --git a/databuilder/tests/unit/models/test_table_source.py b/databuilder/tests/unit/models/test_table_source.py index 754856adcd..b76714c12f 100644 --- a/databuilder/tests/unit/models/test_table_source.py +++ b/databuilder/tests/unit/models/test_table_source.py @@ -3,12 +3,13 @@ import unittest +from databuilder.models.graph_serializable import ( + RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, RELATION_START_LABEL, + RELATION_TYPE, +) from databuilder.models.table_source import TableSource -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE from databuilder.serializers import neo4_serializer - DB = 'hive' SCHEMA = 'base' TABLE = 'test' @@ -28,11 +29,7 @@ def setUp(self) -> None: def test_get_source_model_key(self) -> None: source = self.table_source.get_source_model_key() - self.assertEqual(source, '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB, - schema=SCHEMA, - tbl=TABLE, - cluster=CLUSTER, - )) + self.assertEqual(source, f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source') def test_get_metadata_model_key(self) -> None: metadata = self.table_source.get_metadata_model_key() @@ -47,14 +44,8 @@ def test_create_relation(self) -> None: self.assertEquals(len(relations), 1) serialized_relation = neo4_serializer.serialize_relationship(relations[0]) - start_key = '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB, - schema=SCHEMA, - tbl=TABLE, - cluster=CLUSTER) - end_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB, - schema=SCHEMA, - tbl=TABLE, - cluster=CLUSTER) + start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source' + 
end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}' expected_relation = { RELATION_START_KEY: start_key, diff --git a/databuilder/tests/unit/models/test_table_stats.py b/databuilder/tests/unit/models/test_table_stats.py index 494f353a90..ca66fa956e 100644 --- a/databuilder/tests/unit/models/test_table_stats.py +++ b/databuilder/tests/unit/models/test_table_stats.py @@ -2,11 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 import unittest -from databuilder.models.table_stats import TableColumnStats -from databuilder.models.graph_serializable import NODE_KEY, \ - NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + NODE_KEY, NODE_LABEL, RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, + RELATION_START_LABEL, RELATION_TYPE, +) +from databuilder.models.table_stats import TableColumnStats from databuilder.serializers import neo4_serializer diff --git a/databuilder/tests/unit/models/test_user.py b/databuilder/tests/unit/models/test_user.py index f974cc1d62..bb9d914a2a 100644 --- a/databuilder/tests/unit/models/test_user.py +++ b/databuilder/tests/unit/models/test_user.py @@ -3,8 +3,10 @@ import unittest -from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ - RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE +from databuilder.models.graph_serializable import ( + RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, RELATION_START_LABEL, + RELATION_TYPE, +) from databuilder.models.user import User from databuilder.serializers import neo4_serializer @@ -28,7 +30,7 @@ def setUp(self) -> None: def test_get_user_model_key(self) -> None: user_email = User.get_user_model_key(email=self.user.email) - self.assertEqual(user_email, '{email}'.format(email='test@email.com')) + self.assertEqual(user_email, 'test@email.com') def test_create_nodes(self) -> None: nodes = self.user.create_nodes() @@ -58,8 +60,8 @@ def test_create_relation(self) -> None: relations = self.user.create_relation() self.assertEqual(len(relations), 1) - start_key = '{email}'.format(email='test@email.com') - end_key = '{email}'.format(email='test_manager@email.com') + start_key = 'test@email.com' + end_key = 'test_manager@email.com' expected_relation = { RELATION_START_KEY: start_key, diff --git a/databuilder/tests/unit/models/test_watermark.py b/databuilder/tests/unit/models/test_watermark.py index 01e6ed2489..5279a398a8 100644 --- a/databuilder/tests/unit/models/test_watermark.py +++ b/databuilder/tests/unit/models/test_watermark.py @@ -2,20 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import unittest -from databuilder.models.watermark import Watermark -from databuilder.models.graph_serializable import ( - NODE_KEY, - NODE_LABEL, - RELATION_START_KEY, - RELATION_START_LABEL, - RELATION_END_KEY, - RELATION_END_LABEL, - RELATION_TYPE, - RELATION_REVERSE_TYPE -) from databuilder.models.graph_node import GraphNode from databuilder.models.graph_relationship import GraphRelationship +from databuilder.models.graph_serializable import ( + NODE_KEY, NODE_LABEL, RELATION_END_KEY, RELATION_END_LABEL, RELATION_REVERSE_TYPE, RELATION_START_KEY, + RELATION_START_LABEL, RELATION_TYPE, +) +from databuilder.models.watermark import Watermark from databuilder.serializers import neo4_serializer CREATE_TIME = '2017-09-18T00:00:00' @@ -40,19 +34,8 @@ def setUp(self) -> None: part_type=PART_TYPE, 
part_name=NESTED_PART ) - start_key = '{database}://{cluster}.{schema}/{table}/{part_type}/'.format( - database=DATABASE, - cluster=CLUSTER, - schema=SCHEMA, - table=TABLE, - part_type=PART_TYPE - ) - end_key = '{database}://{cluster}.{schema}/{table}'.format( - database=DATABASE, - cluster=CLUSTER, - schema=SCHEMA, - table=TABLE - ) + start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/' + end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}' self.expected_node_result = GraphNode( key=start_key, label='Watermark', @@ -92,21 +75,11 @@ def setUp(self) -> None: def test_get_watermark_model_key(self) -> None: watermark = self.watermark.get_watermark_model_key() - self.assertEqual( - watermark, '{database}://{cluster}.{schema}/{table}/{part_type}/' - .format(database=DATABASE, - cluster=CLUSTER, - schema=SCHEMA, - table=TABLE, - part_type=PART_TYPE)) + self.assertEqual(watermark, f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/') def test_get_metadata_model_key(self) -> None: metadata = self.watermark.get_metadata_model_key() - self.assertEqual(metadata, '{database}://{cluster}.{schema}/{table}' - .format(database=DATABASE, - cluster=CLUSTER, - schema=SCHEMA, - table=TABLE)) + self.assertEqual(metadata, f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}') def test_create_nodes(self) -> None: nodes = self.watermark.create_nodes() diff --git a/databuilder/tests/unit/publisher/test_elasticsearch_publisher.py b/databuilder/tests/unit/publisher/test_elasticsearch_publisher.py index 2f53fd15f9..22b00b3c7f 100644 --- a/databuilder/tests/unit/publisher/test_elasticsearch_publisher.py +++ b/databuilder/tests/unit/publisher/test_elasticsearch_publisher.py @@ -2,9 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import json -from mock import MagicMock, mock_open, patch import unittest +from mock import ( + MagicMock, mock_open, patch, +) from pyhocon import ConfigFactory from databuilder import Scoped diff --git a/databuilder/tests/unit/publisher/test_neo4j_csv_publisher.py b/databuilder/tests/unit/publisher/test_neo4j_csv_publisher.py index 49fdc6ceb4..7aa7f17af1 100644 --- a/databuilder/tests/unit/publisher/test_neo4j_csv_publisher.py +++ b/databuilder/tests/unit/publisher/test_neo4j_csv_publisher.py @@ -6,20 +6,21 @@ import unittest import uuid -from mock import patch, MagicMock +from mock import MagicMock, patch from neo4j import GraphDatabase from pyhocon import ConfigFactory from databuilder.publisher import neo4j_csv_publisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher +here = os.path.dirname(__file__) + class TestPublish(unittest.TestCase): def setUp(self) -> None: logging.basicConfig(level=logging.INFO) - self._resource_path = '{}/../resources/csv_publisher' \ - .format(os.path.join(os.path.dirname(__file__))) + self._resource_path = os.path.join(here, '../resources/csv_publisher') def test_publisher(self) -> None: with patch.object(GraphDatabase, 'driver') as mock_driver: @@ -38,11 +39,11 @@ def test_publisher(self) -> None: conf = ConfigFactory.from_dict( {neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/', - neo4j_csv_publisher.NODE_FILES_DIR: '{}/nodes'.format(self._resource_path), - neo4j_csv_publisher.RELATION_FILES_DIR: '{}/relations'.format(self._resource_path), + neo4j_csv_publisher.NODE_FILES_DIR: f'{self._resource_path}/nodes', + neo4j_csv_publisher.RELATION_FILES_DIR: f'{self._resource_path}/relations', neo4j_csv_publisher.NEO4J_USER: 'neo4j_user', neo4j_csv_publisher.NEO4J_PASSWORD: 'neo4j_password', -
neo4j_csv_publisher.JOB_PUBLISH_TAG: '{}'.format(uuid.uuid4())} + neo4j_csv_publisher.JOB_PUBLISH_TAG: str(uuid.uuid4())} ) publisher.init(conf) publisher.publish() @@ -73,12 +74,12 @@ def test_preprocessor(self) -> None: conf = ConfigFactory.from_dict( {neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/', - neo4j_csv_publisher.NODE_FILES_DIR: '{}/nodes'.format(self._resource_path), - neo4j_csv_publisher.RELATION_FILES_DIR: '{}/relations'.format(self._resource_path), + neo4j_csv_publisher.NODE_FILES_DIR: f'{self._resource_path}/nodes', + neo4j_csv_publisher.RELATION_FILES_DIR: f'{self._resource_path}/relations', neo4j_csv_publisher.RELATION_PREPROCESSOR: mock_preprocessor, neo4j_csv_publisher.NEO4J_USER: 'neo4j_user', neo4j_csv_publisher.NEO4J_PASSWORD: 'neo4j_password', - neo4j_csv_publisher.JOB_PUBLISH_TAG: '{}'.format(uuid.uuid4())} + neo4j_csv_publisher.JOB_PUBLISH_TAG: str(uuid.uuid4())} ) publisher.init(conf) publisher.publish() diff --git a/databuilder/tests/unit/publisher/test_neo4j_preprocessor.py b/databuilder/tests/unit/publisher/test_neo4j_preprocessor.py index 21dade0be3..41acbf7705 100644 --- a/databuilder/tests/unit/publisher/test_neo4j_preprocessor.py +++ b/databuilder/tests/unit/publisher/test_neo4j_preprocessor.py @@ -5,7 +5,7 @@ import unittest import uuid -from databuilder.publisher.neo4j_preprocessor import NoopRelationPreprocessor, DeleteRelationPreprocessor +from databuilder.publisher.neo4j_preprocessor import DeleteRelationPreprocessor, NoopRelationPreprocessor class TestNeo4jPreprocessor(unittest.TestCase): diff --git a/databuilder/tests/unit/publisher/test_publisher.py b/databuilder/tests/unit/publisher/test_publisher.py index 2535d8efbf..166914253d 100644 --- a/databuilder/tests/unit/publisher/test_publisher.py +++ b/databuilder/tests/unit/publisher/test_publisher.py @@ -6,7 +6,7 @@ from mock import MagicMock from pyhocon import ConfigTree -from databuilder.publisher.base_publisher import Publisher, NoopPublisher +from databuilder.publisher.base_publisher import NoopPublisher, Publisher class TestPublisher(unittest.TestCase): diff --git a/databuilder/tests/unit/rest_api/mode_analytics/test_mode_paginated_rest_api_query.py b/databuilder/tests/unit/rest_api/mode_analytics/test_mode_paginated_rest_api_query.py index 53adb6e077..c0f9d863a3 100644 --- a/databuilder/tests/unit/rest_api/mode_analytics/test_mode_paginated_rest_api_query.py +++ b/databuilder/tests/unit/rest_api/mode_analytics/test_mode_paginated_rest_api_query.py @@ -4,7 +4,7 @@ import logging import unittest -from mock import patch, call +from mock import call, patch from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery diff --git a/databuilder/tests/unit/rest_api/test_rest_api_failure_handlers.py b/databuilder/tests/unit/rest_api/test_rest_api_failure_handlers.py index 7e00da1a5d..e115ac4c76 100644 --- a/databuilder/tests/unit/rest_api/test_rest_api_failure_handlers.py +++ b/databuilder/tests/unit/rest_api/test_rest_api_failure_handlers.py @@ -3,9 +3,10 @@ import unittest -from databuilder.rest_api.rest_api_failure_handlers import HttpFailureSkipOnStatus from mock import MagicMock +from databuilder.rest_api.rest_api_failure_handlers import HttpFailureSkipOnStatus + class TestHttpFailureSkipOnStatus(unittest.TestCase): diff --git a/databuilder/tests/unit/rest_api/test_rest_api_query.py b/databuilder/tests/unit/rest_api/test_rest_api_query.py index 
48899a9d2e..b1e0271a93 100644 --- a/databuilder/tests/unit/rest_api/test_rest_api_query.py +++ b/databuilder/tests/unit/rest_api/test_rest_api_query.py @@ -5,7 +5,7 @@ from mock import patch -from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed, EmptyRestApiQuerySeed +from databuilder.rest_api.base_rest_api_query import EmptyRestApiQuerySeed, RestApiQuerySeed from databuilder.rest_api.rest_api_query import RestApiQuery diff --git a/databuilder/tests/unit/task/test_neo4j_staleness_removal_task.py b/databuilder/tests/unit/task/test_neo4j_staleness_removal_task.py index 6da9114f68..08bd352137 100644 --- a/databuilder/tests/unit/task/test_neo4j_staleness_removal_task.py +++ b/databuilder/tests/unit/task/test_neo4j_staleness_removal_task.py @@ -27,15 +27,11 @@ def test_validation_failure(self) -> None: with patch.object(GraphDatabase, 'driver'): task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 90, + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 90, neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo' }) @@ -50,15 +46,11 @@ def test_validation(self) -> None: with patch.object(GraphDatabase, 'driver'): task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo' }) @@ -73,17 +65,12 @@ def test_validation_threshold_override(self) -> None: with patch.object(GraphDatabase, 'driver'): task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_PCT_MAX_DICT): - {'foo': 51}, + 'job.identifier': 'remove_stale_data_job', + 
f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_PCT_MAX_DICT}': {'foo': 51}, neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo' }) @@ -99,15 +86,11 @@ def test_marker(self) -> None: with patch.object(GraphDatabase, 'driver'): task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo' }) @@ -117,17 +100,12 @@ def test_marker(self) -> None: task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.MS_TO_EXPIRE): - 86400000, + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, + f'{task.get_scope()}.{neo4j_staleness_removal_task.MS_TO_EXPIRE}': 86400000, }) task.init(job_config) @@ -139,17 +117,12 @@ def test_validation_statement_publish_tag(self) -> None: as mock_execute: task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_NODES): - ['Foo'], + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, 
+ f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_NODES}': ['Foo'], neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo', }) @@ -188,17 +161,12 @@ def test_validation_statement_ms_to_expire(self) -> None: as mock_execute: task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.MS_TO_EXPIRE): - 9876543210 + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, + f'{task.get_scope()}.{neo4j_staleness_removal_task.MS_TO_EXPIRE}': 9876543210 }) task.init(job_config) @@ -237,19 +205,13 @@ def test_delete_statement_publish_tag(self) -> None: mock_execute.return_value.single.return_value = {'count': 0} task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_NODES): - ['Foo'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_RELATIONS): - ['BAR'], + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_NODES}': ['Foo'], + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_RELATIONS}': ['BAR'], neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo', }) @@ -287,21 +249,14 @@ def test_delete_statement_ms_to_expire(self) -> None: mock_execute.return_value.single.return_value = {'count': 0} task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_NODES): - ['Foo'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_RELATIONS): - ['BAR'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.MS_TO_EXPIRE): - 9876543210 + 'job.identifier': 
'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_NODES}': ['Foo'], + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_RELATIONS}': ['BAR'], + f'{task.get_scope()}.{neo4j_staleness_removal_task.MS_TO_EXPIRE}': 9876543210 }) task.init(job_config) @@ -336,21 +291,14 @@ def test_ms_to_expire_too_small(self) -> None: with patch.object(GraphDatabase, 'driver'): task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_NODES): - ['Foo'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_RELATIONS): - ['BAR'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.MS_TO_EXPIRE): - 24 * 60 * 60 * 100 - 10 + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_NODES}': ['Foo'], + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_RELATIONS}': ['BAR'], + f'{task.get_scope()}.{neo4j_staleness_removal_task.MS_TO_EXPIRE}': 24 * 60 * 60 * 100 - 10 }) try: @@ -362,21 +310,14 @@ def test_ms_to_expire_too_small(self) -> None: with patch.object(GraphDatabase, 'driver'): task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_NODES): - ['Foo'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_RELATIONS): - ['BAR'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.MS_TO_EXPIRE): - 24 * 60 * 60 * 1000, + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_NODES}': ['Foo'], + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_RELATIONS}': ['BAR'], + 
f'{task.get_scope()}.{neo4j_staleness_removal_task.MS_TO_EXPIRE}': 24 * 60 * 60 * 1000, }) task.init(job_config) @@ -386,21 +327,14 @@ def test_delete_dry_run(self) -> None: task = Neo4jStalenessRemovalTask() job_config = ConfigFactory.from_dict({ - 'job.identifier': 'remove_stale_data_job', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_END_POINT_KEY): - 'foobar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_USER): - 'foo', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.NEO4J_PASSWORD): - 'bar', - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.STALENESS_MAX_PCT): - 5, - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_NODES): - ['Foo'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.TARGET_RELATIONS): - ['BAR'], - '{}.{}'.format(task.get_scope(), neo4j_staleness_removal_task.DRY_RUN): - True, + 'job.identifier': 'remove_stale_data_job', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_END_POINT_KEY}': 'foobar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_USER}': 'foo', + f'{task.get_scope()}.{neo4j_staleness_removal_task.NEO4J_PASSWORD}': 'bar', + f'{task.get_scope()}.{neo4j_staleness_removal_task.STALENESS_MAX_PCT}': 5, + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_NODES}': ['Foo'], + f'{task.get_scope()}.{neo4j_staleness_removal_task.TARGET_RELATIONS}': ['BAR'], + f'{task.get_scope()}.{neo4j_staleness_removal_task.DRY_RUN}': True, neo4j_csv_publisher.JOB_PUBLISH_TAG: 'foo', }) diff --git a/databuilder/tests/unit/test_base_job.py b/databuilder/tests/unit/test_base_job.py index 7b341f5a22..a6d872e138 100644 --- a/databuilder/tests/unit/test_base_job.py +++ b/databuilder/tests/unit/test_base_job.py @@ -2,34 +2,35 @@ # SPDX-License-Identifier: Apache-2.0 import json +import logging import shutil import tempfile import unittest -from mock import patch - -from pyhocon import ConfigTree, ConfigFactory from typing import Any +from mock import patch +from pyhocon import ConfigFactory, ConfigTree + from databuilder.extractor.base_extractor import Extractor from databuilder.job.job import DefaultJob from databuilder.loader.base_loader import Loader from databuilder.task.task import DefaultTask from databuilder.transformer.base_transformer import Transformer +LOGGER = logging.getLogger(__name__) + class TestJob(unittest.TestCase): def setUp(self) -> None: self.temp_dir_path = tempfile.mkdtemp() - self.dest_file_name = '{}/superhero.json'.format(self.temp_dir_path) - self.conf = ConfigFactory.from_dict( - {'loader.superhero.dest_file': self.dest_file_name}) + self.dest_file_name = f'{self.temp_dir_path}/superhero.json' + self.conf = ConfigFactory.from_dict({'loader.superhero.dest_file': self.dest_file_name}) def tearDown(self) -> None: shutil.rmtree(self.temp_dir_path) def test_job(self) -> None: - with patch("databuilder.job.job.StatsClient") as mock_statsd: task = DefaultTask(SuperHeroExtractor(), SuperHeroLoader(), @@ -53,7 +54,7 @@ class TestJobNoTransform(unittest.TestCase): def setUp(self) -> None: self.temp_dir_path = tempfile.mkdtemp() - self.dest_file_name = '{}/superhero.json'.format(self.temp_dir_path) + self.dest_file_name = f'{self.temp_dir_path}/superhero.json' self.conf = ConfigFactory.from_dict( {'loader.superhero.dest_file': self.dest_file_name}) @@ -79,7 +80,7 @@ class TestJobStatsd(unittest.TestCase): def setUp(self) -> None: self.temp_dir_path = tempfile.mkdtemp() - self.dest_file_name = 
'{}/superhero.json'.format(self.temp_dir_path) + self.dest_file_name = f'{self.temp_dir_path}/superhero.json' self.conf = ConfigFactory.from_dict( {'loader.superhero.dest_file': self.dest_file_name, 'job.is_statsd_enabled': True, @@ -133,7 +134,7 @@ def __init__(self, self.name = name def __repr__(self) -> str: - return "SuperHero(hero={0}, name={1})".format(self.hero, self.name) + return f'SuperHero(hero={self.hero}, name={self.name})' class SuperHeroReverseNameTransformer(Transformer): @@ -155,13 +156,13 @@ class SuperHeroLoader(Loader): def init(self, conf: ConfigTree) -> None: self.conf = conf dest_file_path = self.conf.get_string('dest_file') - print('Loading to {}'.format(dest_file_path)) + LOGGER.info('Loading to %s', dest_file_path) self.dest_file_obj = open(self.conf.get_string('dest_file'), 'w') def load(self, record: Any) -> None: - str = json.dumps(record.__dict__, sort_keys=True) - print('Writing record: {}'.format(str)) - self.dest_file_obj.write('{}\n'.format(str)) + rec = json.dumps(record.__dict__, sort_keys=True) + LOGGER.info('Writing record: %s', rec) + self.dest_file_obj.write(f'{rec}\n') self.dest_file_obj.flush() def get_scope(self) -> str: diff --git a/databuilder/tests/unit/transformer/test_bigquery_usage_transformer.py b/databuilder/tests/unit/transformer/test_bigquery_usage_transformer.py index 2b8aec14e3..a85df8ce71 100644 --- a/databuilder/tests/unit/transformer/test_bigquery_usage_transformer.py +++ b/databuilder/tests/unit/transformer/test_bigquery_usage_transformer.py @@ -5,9 +5,9 @@ from pyhocon import ConfigFactory -from databuilder.transformer.bigquery_usage_transformer import BigqueryUsageTransformer from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple from databuilder.models.table_column_usage import TableColumnUsage +from databuilder.transformer.bigquery_usage_transformer import BigqueryUsageTransformer class TestBigQueryUsageTransform(unittest.TestCase): diff --git a/databuilder/tests/unit/transformer/test_dict_to_model_transformer.py b/databuilder/tests/unit/transformer/test_dict_to_model_transformer.py index e422365822..53857c3436 100644 --- a/databuilder/tests/unit/transformer/test_dict_to_model_transformer.py +++ b/databuilder/tests/unit/transformer/test_dict_to_model_transformer.py @@ -5,8 +5,8 @@ from pyhocon import ConfigFactory -from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS from databuilder.models.dashboard.dashboard_execution import DashboardExecution +from databuilder.transformer.dict_to_model import MODEL_CLASS, DictToModel class TestDictToModel(unittest.TestCase): diff --git a/databuilder/tests/unit/transformer/test_regex_str_replace_transformer.py b/databuilder/tests/unit/transformer/test_regex_str_replace_transformer.py index 1769edde6b..320c20361a 100644 --- a/databuilder/tests/unit/transformer/test_regex_str_replace_transformer.py +++ b/databuilder/tests/unit/transformer/test_regex_str_replace_transformer.py @@ -2,12 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 import unittest +from typing import Any from pyhocon import ConfigFactory -from typing import Any -from databuilder.transformer.regex_str_replace_transformer import RegexStrReplaceTransformer, \ - REGEX_REPLACE_TUPLE_LIST, ATTRIBUTE_NAME +from databuilder.transformer.regex_str_replace_transformer import ( + ATTRIBUTE_NAME, REGEX_REPLACE_TUPLE_LIST, RegexStrReplaceTransformer, +) class TestRegexReplacement(unittest.TestCase): diff --git a/databuilder/tests/unit/transformer/test_remove_field_transformer.py 
b/databuilder/tests/unit/transformer/test_remove_field_transformer.py index 39e4587f94..0ce28e5ff0 100644 --- a/databuilder/tests/unit/transformer/test_remove_field_transformer.py +++ b/databuilder/tests/unit/transformer/test_remove_field_transformer.py @@ -5,7 +5,7 @@ from pyhocon import ConfigFactory -from databuilder.transformer.remove_field_transformer import RemoveFieldTransformer, FIELD_NAMES +from databuilder.transformer.remove_field_transformer import FIELD_NAMES, RemoveFieldTransformer class TestRemoveFieldTransformer(unittest.TestCase): diff --git a/databuilder/tests/unit/transformer/test_table_tag_transformer.py b/databuilder/tests/unit/transformer/test_table_tag_transformer.py index 8bebee3b20..4f8efda0e5 100644 --- a/databuilder/tests/unit/transformer/test_table_tag_transformer.py +++ b/databuilder/tests/unit/transformer/test_table_tag_transformer.py @@ -5,8 +5,8 @@ from pyhocon import ConfigFactory -from databuilder.transformer.table_tag_transformer import TableTagTransformer from databuilder.models.table_metadata import TableMetadata +from databuilder.transformer.table_tag_transformer import TableTagTransformer class TestTableTagTransformer(unittest.TestCase): diff --git a/databuilder/tests/unit/transformer/test_template_variable_substitution_transformer.py b/databuilder/tests/unit/transformer/test_template_variable_substitution_transformer.py index 6e10b03478..0e10428fdc 100644 --- a/databuilder/tests/unit/transformer/test_template_variable_substitution_transformer.py +++ b/databuilder/tests/unit/transformer/test_template_variable_substitution_transformer.py @@ -5,8 +5,9 @@ from pyhocon import ConfigFactory -from databuilder.transformer.template_variable_substitution_transformer import \ - TemplateVariableSubstitutionTransformer, FIELD_NAME, TEMPLATE +from databuilder.transformer.template_variable_substitution_transformer import ( + FIELD_NAME, TEMPLATE, TemplateVariableSubstitutionTransformer, +) class TestTemplateVariableSubstitutionTransformer(unittest.TestCase): diff --git a/databuilder/tests/unit/transformer/test_timestamp_string_to_epoch_transformer.py b/databuilder/tests/unit/transformer/test_timestamp_string_to_epoch_transformer.py index 9ca10adf4f..c87fe44e1f 100644 --- a/databuilder/tests/unit/transformer/test_timestamp_string_to_epoch_transformer.py +++ b/databuilder/tests/unit/transformer/test_timestamp_string_to_epoch_transformer.py @@ -5,7 +5,9 @@ from pyhocon import ConfigFactory -from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME, TIMESTAMP_FORMAT +from databuilder.transformer.timestamp_string_to_epoch import ( + FIELD_NAME, TIMESTAMP_FORMAT, TimestampStringToEpoch, +) class TestTimestampStrToEpoch(unittest.TestCase):