From 9ba91cff712d0c4824e1ac810f92a09103c8b5ff Mon Sep 17 00:00:00 2001
From: Hariharan Banukumar
Date: Mon, 23 Aug 2021 07:13:54 -0400
Subject: [PATCH] fixed get_columns_in_relation for open source delta table
 (#207)

* fixed get_columns_in_relation for open source delta table

* fixed E501 linting error and added change log
---
 CHANGELOG.md               |  2 ++
 dbt/adapters/spark/impl.py | 11 ++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60d85d50a..cbce884c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,12 @@
 ## dbt-spark 0.21.0 (Release TBD)
 
 ### Fixes
+- Enhance `get_columns_in_relation` to handle a bug in open source Delta Lake, which doesn't return schema details in the output of the `show table extended in databasename like '*'` query. This impacts dbt snapshots if the file format is open source Delta Lake ([#207](https://github.com/dbt-labs/dbt-spark/pull/207))
 - Add pyodbc import error message to dbt.exceptions.RuntimeException to get more detailed information when running `dbt debug` ([#192](https://github.com/dbt-labs/dbt-spark/pull/192))
 - Add support for ODBC Server Side Parameters, allowing options that need to be set with the `SET` statement to be used ([#201](https://github.com/dbt-labs/dbt-spark/pull/201))
 
 ### Contributors
+- [@harryharanb](https://github.com/harryharanb) ([#207](https://github.com/dbt-labs/dbt-spark/pull/207))
 - [@JCZuurmond](https://github.com/JCZuurmond) ([#192](https://github.com/fishtown-analytics/dbt-spark/pull/192))
 - [@jethron](https://github.com/jethron) ([#201](https://github.com/fishtown-analytics/dbt-spark/pull/201))
 
diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py
index f8e72449a..03fba9fac 100644
--- a/dbt/adapters/spark/impl.py
+++ b/dbt/adapters/spark/impl.py
@@ -212,11 +212,16 @@ def get_columns_in_relation(self, relation: Relation) -> List[SparkColumn]:
                                 for cached_relation in cached_relations
                                 if str(cached_relation) == str(relation)),
                                None)
-        if cached_relation is None or cached_relation.information is None:
+        columns = []
+        if cached_relation and cached_relation.information:
+            columns = self.parse_columns_from_information(cached_relation)
+        if not columns:
+            # open source Delta's 'show table extended' output doesn't
+            # return the relation's schema. If columns are empty from the
+            # cache, fall back to the get_columns_in_relation Spark macro,
+            # which executes a 'describe extended tablename' query
             rows: List[agate.Row] = super().get_columns_in_relation(relation)
             columns = self.parse_describe_extended(relation, rows)
-        else:
-            columns = self.parse_columns_from_information(cached_relation)
         return columns
 
     def parse_columns_from_information(
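
For readers skimming the hunk above, the control flow it introduces can be restated in isolation. The sketch below mirrors the cache-first-with-fallback logic using hypothetical stand-ins (`resolve_columns` and its callable parameters are illustrative only, not dbt-spark's API; the real method lives on `SparkAdapter` and works with `Relation` and `SparkColumn` objects):

```python
from typing import Callable, List, Optional

Column = str  # hypothetical stand-in for dbt-spark's SparkColumn


def resolve_columns(
    cached_information: Optional[str],
    parse_from_information: Callable[[str], List[Column]],
    describe_extended: Callable[[], List[Column]],
) -> List[Column]:
    """Cache-first column lookup with a 'describe extended' fallback."""
    columns: List[Column] = []
    if cached_information:
        # Prefer the cached 'show table extended' output when present.
        columns = parse_from_information(cached_information)
    if not columns:
        # Open source Delta omits the schema from 'show table extended',
        # so the cached information parses to an empty column list and
        # we fall back to an explicit 'describe extended' lookup.
        columns = describe_extended()
    return columns


# The open source Delta case: cached info exists but holds no schema,
# so the fallback fires instead of returning an empty column list.
cols = resolve_columns(
    cached_information="Provider: delta",            # no schema section
    parse_from_information=lambda info: [],          # parser finds nothing
    describe_extended=lambda: ["id", "updated_at"],  # authoritative lookup
)
assert cols == ["id", "updated_at"]
```

The key design choice in the patch is testing `if not columns` rather than `if cached_relation is None`: the fallback now fires even when a cache entry exists but its information parses to an empty schema, which is exactly the open source Delta case this PR fixes.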