@@ -13,7 +13,6 @@ use futures::stream::{StreamExt, TryStreamExt};
13
13
use lance_arrow:: SchemaExt ;
14
14
use lance_core:: datatypes:: { Field , Schema } ;
15
15
use lance_datafusion:: utils:: StreamingWriteSource ;
16
- use lance_encoding:: version:: LanceFileVersion ;
17
16
use lance_table:: format:: Fragment ;
18
17
use snafu:: location;
19
18
@@ -23,6 +22,12 @@ use super::{
23
22
Dataset ,
24
23
} ;
25
24
25
+ mod optimize;
26
+
27
+ use optimize:: {
28
+ ChainedNewColumnTransformOptimizer , NewColumnTransformOptimizer , SqlToAllNullsOptimizer ,
29
+ } ;
30
+
26
31
#[ derive( Debug , Clone , PartialEq ) ]
27
32
pub struct BatchInfo {
28
33
pub fragment_id : u32 ,
@@ -149,6 +154,14 @@ pub(super) async fn add_columns_to_fragments(
149
154
Ok ( ( ) )
150
155
} ;
151
156
157
+ // Optimize the transforms
158
+ let mut optimizer = ChainedNewColumnTransformOptimizer :: new ( vec ! [ ] ) ;
159
+ // The all-nulls transform cannot be performed on legacy files
160
+ if !dataset. is_legacy_storage ( ) {
161
+ optimizer. add_optimizer ( Box :: new ( SqlToAllNullsOptimizer :: new ( ) ) ) ;
162
+ }
163
+ let transforms = optimizer. optimize ( dataset, transforms) ?;
164
+
152
165
let ( output_schema, fragments) = match transforms {
153
166
NewColumnTransform :: BatchUDF ( udf) => {
154
167
check_names ( udf. output_schema . as_ref ( ) ) ?;
@@ -262,17 +275,7 @@ pub(super) async fn add_columns_to_fragments(
262
275
// can't add all-null columns as a metadata-only operation. The reason is that we
263
276
// use the NullReader for fragments that have missing columns and we can't mix legacy
264
277
// and non-legacy readers when reading the fragment.
265
- if fragments. iter ( ) . any ( |fragment| {
266
- fragment. files . iter ( ) . any ( |file| {
267
- matches ! (
268
- LanceFileVersion :: try_from_major_minor(
269
- file. file_major_version,
270
- file. file_minor_version
271
- ) ,
272
- Ok ( LanceFileVersion :: Legacy )
273
- )
274
- } )
275
- } ) {
278
+ if dataset. is_legacy_storage ( ) {
276
279
return Err ( Error :: NotSupported {
277
280
source : "Cannot add all-null columns to legacy dataset version." . into ( ) ,
278
281
location : location ! ( ) ,
@@ -1744,4 +1747,115 @@ mod test {
1744
1747
1745
1748
Ok ( ( ) )
1746
1749
}
1750
+
1751
+ #[ tokio:: test]
1752
+ async fn test_new_column_sql_to_all_nulls_transform_optimizer ( ) {
1753
+ let schema = Arc :: new ( ArrowSchema :: new ( vec ! [ ArrowField :: new(
1754
+ "a" ,
1755
+ DataType :: Int32 ,
1756
+ false ,
1757
+ ) ] ) ) ;
1758
+
1759
+ let batch = RecordBatch :: try_new (
1760
+ schema. clone ( ) ,
1761
+ vec ! [ Arc :: new( Int32Array :: from_iter( 0 ..100 ) ) ] ,
1762
+ )
1763
+ . unwrap ( ) ;
1764
+ let reader = RecordBatchIterator :: new ( vec ! [ Ok ( batch) ] , schema. clone ( ) ) ;
1765
+ let test_dir = tempfile:: tempdir ( ) . unwrap ( ) ;
1766
+ let test_uri = test_dir. path ( ) . to_str ( ) . unwrap ( ) ;
1767
+ let mut dataset = Dataset :: write (
1768
+ reader,
1769
+ test_uri,
1770
+ Some ( WriteParams {
1771
+ max_rows_per_file : 50 ,
1772
+ max_rows_per_group : 25 ,
1773
+ data_storage_version : Some ( LanceFileVersion :: Stable ) ,
1774
+ ..Default :: default ( )
1775
+ } ) ,
1776
+ )
1777
+ . await
1778
+ . unwrap ( ) ;
1779
+ dataset. validate ( ) . await . unwrap ( ) ;
1780
+
1781
+ let manifest_before = dataset. manifest . clone ( ) ;
1782
+
1783
+ // Add an all-null column "b" via the SQL expression CAST(NULL AS int); the dataset above was written with LanceFileVersion::Stable, and the assertions below expect the add to be metadata-only
1784
+ dataset
1785
+ . add_columns (
1786
+ NewColumnTransform :: SqlExpressions ( vec ! [ (
1787
+ "b" . to_string( ) ,
1788
+ "CAST(NULL AS int)" . to_string( ) ,
1789
+ ) ] ) ,
1790
+ None ,
1791
+ None ,
1792
+ )
1793
+ . await
1794
+ . unwrap ( ) ;
1795
+ let manifest_after = dataset. manifest . clone ( ) ;
1796
+
1797
+ // Check that this is a metadata-only operation: the manifest's fragment list must compare equal before and after add_columns
1798
+ assert_eq ! ( & manifest_before. fragments, & manifest_after. fragments) ;
1799
+
1800
+ // Check that the new nullable Int32 field "b" now follows "a" in the dataset schema
1801
+ let expected_schema = ArrowSchema :: new ( vec ! [
1802
+ ArrowField :: new( "a" , DataType :: Int32 , false ) ,
1803
+ ArrowField :: new( "b" , DataType :: Int32 , true ) ,
1804
+ ] ) ;
1805
+ assert_eq ! ( ArrowSchema :: from( dataset. schema( ) ) , expected_schema) ;
1806
+ }
1807
+
1808
+ #[ tokio:: test]
1809
+ async fn test_new_column_sql_to_all_nulls_transform_optimizer_legacy ( ) {
1810
+ let schema = Arc :: new ( ArrowSchema :: new ( vec ! [ ArrowField :: new(
1811
+ "a" ,
1812
+ DataType :: Int32 ,
1813
+ false ,
1814
+ ) ] ) ) ;
1815
+
1816
+ let batch = RecordBatch :: try_new (
1817
+ schema. clone ( ) ,
1818
+ vec ! [ Arc :: new( Int32Array :: from_iter( 0 ..100 ) ) ] ,
1819
+ )
1820
+ . unwrap ( ) ;
1821
+ let reader = RecordBatchIterator :: new ( vec ! [ Ok ( batch) ] , schema. clone ( ) ) ;
1822
+ let test_dir = tempfile:: tempdir ( ) . unwrap ( ) ;
1823
+ let test_uri = test_dir. path ( ) . to_str ( ) . unwrap ( ) ;
1824
+ let mut dataset = Dataset :: write (
1825
+ reader,
1826
+ test_uri,
1827
+ Some ( WriteParams {
1828
+ max_rows_per_file : 50 ,
1829
+ max_rows_per_group : 25 ,
1830
+ data_storage_version : Some ( LanceFileVersion :: Legacy ) ,
1831
+ ..Default :: default ( )
1832
+ } ) ,
1833
+ )
1834
+ . await
1835
+ . unwrap ( ) ;
1836
+ dataset. validate ( ) . await . unwrap ( ) ;
1837
+
1838
+ // Add an all-null column "b" via CAST(NULL AS int) to the dataset written above with LanceFileVersion::Legacy.
1839
+ // This is basically a smoke test to ensure we don't try to use the all-nulls
1840
+ // transform optimizer where it's not supported, and then blow up when we try
1841
+ // to apply the transform; the unwrap below fails the test if add_columns errors
1842
+ dataset
1843
+ . add_columns (
1844
+ NewColumnTransform :: SqlExpressions ( vec ! [ (
1845
+ "b" . to_string( ) ,
1846
+ "CAST(NULL AS int)" . to_string( ) ,
1847
+ ) ] ) ,
1848
+ None ,
1849
+ None ,
1850
+ )
1851
+ . await
1852
+ . unwrap ( ) ;
1853
+
1854
+ // Check that the new nullable Int32 field "b" now follows "a" in the dataset schema
1855
+ let expected_schema = ArrowSchema :: new ( vec ! [
1856
+ ArrowField :: new( "a" , DataType :: Int32 , false ) ,
1857
+ ArrowField :: new( "b" , DataType :: Int32 , true ) ,
1858
+ ] ) ;
1859
+ assert_eq ! ( ArrowSchema :: from( dataset. schema( ) ) , expected_schema) ;
1860
+ }
1747
1861
}
0 commit comments