From a3c239bd68650b09d587d234f6e510a3b70532df Mon Sep 17 00:00:00 2001 From: Dharan Aditya Date: Sun, 4 Aug 2024 01:50:12 +0530 Subject: [PATCH 1/3] make query work --- datafusion/expr/src/type_coercion/binary.rs | 16 +++++++++++++++- .../sqllogictest/test_files/string_view.slt | 9 +++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 17280289ed1b..4a4ca9069ec2 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -890,7 +890,7 @@ fn dictionary_coercion( /// 2. Data type of the other side should be able to cast to string type fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; - string_coercion(lhs_type, rhs_type).or(match (lhs_type, rhs_type) { + string_coercion_duplicate(lhs_type, rhs_type).or(match (lhs_type, rhs_type) { (Utf8, from_type) | (from_type, Utf8) => { string_concat_internal_coercion(from_type, &Utf8) } @@ -939,6 +939,20 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option } } +fn string_coercion_duplicate(lhs_type: &DataType, rhs_type: &DataType) -> Option { + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + // If Utf8View is in any side, we coerce to Utf8. + (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => { + Some(Utf8) + } + // Then, if LargeUtf8 is in any side, we coerce to LargeUtf8. + (LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8), + (Utf8, Utf8) => Some(Utf8), + _ => None, + } +} + fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 763b4e99c614..17cfd518477f 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -379,3 +379,12 @@ select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt; statement ok drop table dates; + +statement ok +create table temp as values ('value1', arrow_cast('one', 'Utf8View')), ('value1', arrow_cast('two', 'Utf8View')); + +query T +select column2||'ff' from temp; +---- +oneff +twoff \ No newline at end of file From 610fdecb7934a30161c0531bc7742798b12f28dd Mon Sep 17 00:00:00 2001 From: Dharan Aditya Date: Mon, 5 Aug 2024 12:56:00 +0530 Subject: [PATCH 2/3] hack string_concat_coercion --- datafusion/expr/src/type_coercion/binary.rs | 36 ++++++++------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 4a4ca9069ec2..0a45dc0876e1 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -890,15 +890,21 @@ fn dictionary_coercion( /// 2. Data type of the other side should be able to cast to string type fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; - string_coercion_duplicate(lhs_type, rhs_type).or(match (lhs_type, rhs_type) { - (Utf8, from_type) | (from_type, Utf8) => { - string_concat_internal_coercion(from_type, &Utf8) - } - (LargeUtf8, from_type) | (from_type, LargeUtf8) => { - string_concat_internal_coercion(from_type, &LargeUtf8) + match (lhs_type, rhs_type) { + // If Utf8View is in any side, we coerce to Utf8. + (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => { + Some(Utf8) } - _ => None, - }) + _ => string_coercion(lhs_type, rhs_type).or(match (lhs_type, rhs_type) { + (Utf8, from_type) | (from_type, Utf8) => { + string_concat_internal_coercion(from_type, &Utf8) + } + (LargeUtf8, from_type) | (from_type, LargeUtf8) => { + string_concat_internal_coercion(from_type, &LargeUtf8) + } + _ => None, + }), + } } fn array_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { @@ -939,20 +945,6 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option } } -fn string_coercion_duplicate(lhs_type: &DataType, rhs_type: &DataType) -> Option { - use arrow::datatypes::DataType::*; - match (lhs_type, rhs_type) { - // If Utf8View is in any side, we coerce to Utf8. - (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => { - Some(Utf8) - } - // Then, if LargeUtf8 is in any side, we coerce to LargeUtf8. - (LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8), - (Utf8, Utf8) => Some(Utf8), - _ => None, - } -} - fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { From b50887f00a1e6009d5ee01c9194ac5ea3269ed8b Mon Sep 17 00:00:00 2001 From: Dharan Aditya Date: Mon, 5 Aug 2024 15:35:43 +0530 Subject: [PATCH 3/3] more tests --- datafusion/expr/src/type_coercion/binary.rs | 1 + .../sqllogictest/test_files/string_view.slt | 52 +++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 0a45dc0876e1..8da33081d652 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -892,6 +892,7 @@ fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { Some(Utf8) } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 17cfd518477f..5ecaea02aa95 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -381,10 +381,54 @@ statement ok drop table dates; statement ok -create table temp as values ('value1', arrow_cast('one', 'Utf8View')), ('value1', arrow_cast('two', 'Utf8View')); +create table temp as values +('value1', arrow_cast('rust', 'Utf8View'), arrow_cast('fast', 'Utf8View')), +('value2', arrow_cast('datafusion', 'Utf8View'), arrow_cast('cool', 'Utf8View')); query T -select column2||'ff' from temp; +select column2||' is fast' from temp; ---- -oneff -twoff \ No newline at end of file +rust is fast +datafusion is fast + + +query T +select column2 || ' is ' || column3 from temp; +---- +rust is fast +datafusion is cool + +query TT +explain select column2 || 'is' || column3 from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || Utf8("is") || CAST(temp.column3 AS Utf8) +02)--TableScan: temp projection=[column2, column3] + + +query TT +explain select column2||' is fast' from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || Utf8(" is fast") +02)--TableScan: temp projection=[column2] + + +query T +select column2||column3 from temp; +---- +rustfast +datafusioncool + +query TT +explain select column2||column3 from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || CAST(temp.column3 AS Utf8) +02)--TableScan: temp projection=[column2, column3] + +query T +select column2|| ' ' ||column3 from temp; +---- +rust fast +datafusion cool