@@ -1507,10 +1507,11 @@ mod tests {
     use arrow_schema::{DataType, Field, Fields as ArrowFields, Schema as ArrowSchema};
     use arrow_select::take::take;
     use futures::stream::TryStreamExt;
+    use lance_core::format::WriterVersion;
     use lance_index::vector::DIST_COL;
     use lance_linalg::distance::MetricType;
     use lance_testing::datagen::generate_random_array;
-    use tempfile::tempdir;
+    use tempfile::{tempdir, TempDir};
 
     // Used to validate that futures returned are Send.
     fn require_send<T: Send>(t: T) -> T {
@@ -1561,6 +1562,10 @@ mod tests {
 
         let actual_ds = Dataset::open(test_uri).await.unwrap();
         assert_eq!(actual_ds.version().version, 1);
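+        // A newly written dataset should record the current writer version
+        // in its manifest.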
+        assert_eq!(
+            actual_ds.manifest.writer_version,
+            Some(WriterVersion::default())
+        );
         let actual_schema = ArrowSchema::from(actual_ds.schema());
         assert_eq!(&actual_schema, schema.as_ref());
 
@@ -3188,4 +3193,186 @@ mod tests {
             &[1],
         );
     }
+
+    fn copy_dir_all(
+        src: impl AsRef<std::path::Path>,
+        dst: impl AsRef<std::path::Path>,
+    ) -> std::io::Result<()> {
+        use std::fs;
+        fs::create_dir_all(&dst)?;
+        for entry in fs::read_dir(src)? {
+            let entry = entry?;
+            let ty = entry.file_type()?;
+            if ty.is_dir() {
+                copy_dir_all(entry.path(), dst.as_ref().join(entry.file_name()))?;
+            } else {
+                fs::copy(entry.path(), dst.as_ref().join(entry.file_name()))?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Copies a test dataset into a temporary directory, returning the tmpdir.
+    ///
+    /// The `table_path` should be relative to `test_data/` at the root of the
+    /// repo.
+    fn copy_test_data_to_tmp(table_path: &str) -> std::io::Result<TempDir> {
+        use std::path::PathBuf;
+
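+        // The checked-in datasets live in test_data/ at the repo root, two
+        // levels above this crate's manifest directory.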
+        let mut src = PathBuf::new();
+        src.push(env!("CARGO_MANIFEST_DIR"));
+        src.push("../../test_data");
+        src.push(table_path);
+
+        let test_dir = tempdir().unwrap();
+
+        copy_dir_all(src.as_path(), test_dir.path())?;
+
+        Ok(test_dir)
+    }
+
+    #[tokio::test]
+    async fn test_v0_7_5_migration() {
+        // Fragment.physical_rows and DeletionFile.num_deletions were added
+        // after this version; opening an older table should migrate it to
+        // include them.
+
+        // Copy over table
+        let test_dir = copy_test_data_to_tmp("v0.7.5/with_deletions").unwrap();
+        let test_uri = test_dir.path().to_str().unwrap();
+
+        // Assert num rows, deletions, and physical rows are all correct.
+        let dataset = Dataset::open(test_uri).await.unwrap();
+        assert_eq!(dataset.count_rows().await.unwrap(), 90);
+        assert_eq!(dataset.count_deleted_rows().await.unwrap(), 10);
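+        // Physical rows count rows as originally written, including the 10
+        // deleted ones, summed across all fragments.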
+        let total_physical_rows = futures::stream::iter(dataset.get_fragments())
+            .then(|f| async move { f.physical_rows().await })
+            .try_fold(0, |acc, x| async move { Ok(acc + x) })
+            .await
+            .unwrap();
+        assert_eq!(total_physical_rows, 100);
+
+        // Append 5 rows
+        let schema = Arc::new(ArrowSchema::from(dataset.schema()));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int64Array::from_iter_values(100..105))],
+        )
+        .unwrap();
+        let batches = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
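+        // Append so the existing (migrated) fragments are kept alongside the
+        // new data.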
+        let write_params = WriteParams {
+            mode: WriteMode::Append,
+            ..Default::default()
+        };
+        let dataset = Dataset::write(batches, test_uri, Some(write_params))
+            .await
+            .unwrap();
+
+        // Assert num rows, deletions, and physical rows are all correct.
+        assert_eq!(dataset.count_rows().await.unwrap(), 95);
+        assert_eq!(dataset.count_deleted_rows().await.unwrap(), 10);
+        let total_physical_rows = futures::stream::iter(dataset.get_fragments())
+            .then(|f| async move { f.physical_rows().await })
+            .try_fold(0, |acc, x| async move { Ok(acc + x) })
+            .await
+            .unwrap();
+        assert_eq!(total_physical_rows, 105);
+
+        dataset.validate().await.unwrap();
+
+        // Scan data and assert it is as expected.
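+        // Rows 10..20 were deleted from the original data, so the scan yields
+        // 0..10 and 20..100, plus the appended 100..105.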
+        let expected = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int64Array::from_iter_values(
+                (0..10).chain(20..105),
+            ))],
+        )
+        .unwrap();
+        let actual_batches = dataset
+            .scan()
+            .try_into_stream()
+            .await
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+        let actual = concat_batches(&actual_batches[0].schema(), &actual_batches).unwrap();
+        assert_eq!(actual, expected);
+    }
+
+    #[tokio::test]
+    async fn test_fix_v0_8_0_broken_migration() {
+        // The migration from v0.7.5 was broken in v0.8.0. This validates that
+        // we can automatically fix tables that have this problem.
+
+        // Copy over table
+        let test_dir = copy_test_data_to_tmp("v0.8.0/migrated_from_v0.7.5").unwrap();
+        let test_uri = test_dir.path().to_str().unwrap();
+
+        // Assert num rows, deletions, and physical rows are all correct, even
+        // though the stored stats are bad.
+        let dataset = Dataset::open(test_uri).await.unwrap();
+        assert_eq!(dataset.count_rows().await.unwrap(), 92);
+        assert_eq!(dataset.count_deleted_rows().await.unwrap(), 10);
+        let total_physical_rows = futures::stream::iter(dataset.get_fragments())
+            .then(|f| async move { f.physical_rows().await })
+            .try_fold(0, |acc, x| async move { Ok(acc + x) })
+            .await
+            .unwrap();
+        assert_eq!(total_physical_rows, 102);
+
+        // Append 5 rows to table.
+        let schema = Arc::new(ArrowSchema::from(dataset.schema()));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int64Array::from_iter_values(100..105))],
+        )
+        .unwrap();
+        let batches = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
+        let write_params = WriteParams {
+            mode: WriteMode::Append,
+            ..Default::default()
+        };
+        let dataset = Dataset::write(batches, test_uri, Some(write_params))
+            .await
+            .unwrap();
+
+        // Assert statistics are all now correct.
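+        // Committing the append rewrites fragment metadata, filling in the
+        // stats the broken migration left incorrect.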
+        let physical_rows: Vec<_> = dataset
+            .get_fragments()
+            .iter()
+            .map(|f| f.metadata.physical_rows)
+            .collect();
+        assert_eq!(physical_rows, vec![Some(100), Some(2), Some(5)]);
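+        // Only the original fragment has a deletion file; the later fragments
+        // have no deletions.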
+        let num_deletions: Vec<_> = dataset
+            .get_fragments()
+            .iter()
+            .map(|f| {
+                f.metadata
+                    .deletion_file
+                    .as_ref()
+                    .and_then(|df| df.num_deleted_rows)
+            })
+            .collect();
+        assert_eq!(num_deletions, vec![Some(10), None, None]);
+        assert_eq!(dataset.count_rows().await.unwrap(), 97);
+
+        // Scan data and assert it is as expected.
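+        // First fragment: 0..10 and 20..100 survive the deletions; the second
+        // fragment holds 0..2; the appended rows are 100..105.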
+        let expected = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int64Array::from_iter_values(
+                (0..10).chain(20..100).chain(0..2).chain(100..105),
+            ))],
+        )
+        .unwrap();
+        let actual_batches = dataset
+            .scan()
+            .try_into_stream()
+            .await
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+        let actual = concat_batches(&actual_batches[0].schema(), &actual_batches).unwrap();
+        assert_eq!(actual, expected);
+    }
 }