Skip to content

Commit

Permalink
alias intermediate duplicate columns
Browse files Browse the repository at this point in the history
  • Loading branch information
Blizzara committed Jul 5, 2024
1 parent c08f2c1 commit 8c55e1b
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 2 deletions.
17 changes: 15 additions & 2 deletions datafusion/substrait/src/logical_plan/consumer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ use substrait::proto::{FunctionArgument, SortField};
use datafusion::arrow::array::GenericListArray;
use datafusion::common::scalar::ScalarStructBuilder;
use datafusion::logical_expr::expr::{InList, InSubquery, Sort};
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::sync::Arc;

Expand Down Expand Up @@ -403,6 +403,7 @@ pub async fn from_substrait_rel(
let mut input = LogicalPlanBuilder::from(
from_substrait_rel(ctx, input, extensions).await?,
);
let mut names: HashSet<String> = HashSet::new();
let mut exprs: Vec<Expr> = vec![];
for e in &p.expressions {
let x =
Expand All @@ -415,7 +416,19 @@ pub async fn from_substrait_rel(
}
_ => {}
}
exprs.push(x.as_ref().clone());
let name = x.display_name()?;
let mut new_name = name.clone();
let mut i = 0;
while names.contains(&new_name) {
new_name = format!("{}_{}", name, i);
i += 1;
}
names.insert(new_name.clone());
if new_name != name {
exprs.push(x.as_ref().clone().alias(new_name.clone()));
} else {
exprs.push(x.as_ref().clone());
}
}
input.project(exprs)?.build()
} else {
Expand Down
16 changes: 16 additions & 0 deletions datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,22 @@ async fn roundtrip_values_duplicate_column_join() -> Result<()> {
.await
}

#[tokio::test]
async fn duplicate_column() -> Result<()> {
// Substrait does not keep column names (aliases) in the plan, rather it operates on column indices
// only. DataFusion however, is strict about not having duplicate column names appear in the plan.
// This test confirms that we generate aliases for columns in the plan which would otherwise have
// colliding names.
assert_expected_plan(
"SELECT a + 1 as sum_a, a + 1 as sum_a_2 FROM data",
"Projection: data.a + Int64(1) AS sum_a, data.a + Int64(1) AS data.a + Int64(1)_0 AS sum_a_2\
\n Projection: data.a + Int64(1)\
\n TableScan: data projection=[a]",
true,
)
.await
}

/// Construct a plan that cast columns. Only those SQL types are supported for now.
#[tokio::test]
async fn new_test_grammar() -> Result<()> {
Expand Down

0 comments on commit 8c55e1b

Please sign in to comment.