     _write_fragments,
 )
 from .progress import FragmentWriteProgress, NoopFragmentWriteProgress
+from .types import _coerce_reader
 from .udf import BatchUDF, normalize_transform
 
 if TYPE_CHECKING:
@@ -406,6 +407,7 @@ def scanner(
         limit: Optional[int] = None,
         offset: Optional[int] = None,
         with_row_id: bool = False,
+        with_row_address: bool = False,
         batch_readahead: int = 16,
     ) -> "LanceScanner":
         """See Dataset::scanner for details"""
@@ -424,6 +426,7 @@ def scanner(
             limit=limit,
             offset=offset,
             with_row_id=with_row_id,
+            with_row_address=with_row_address,
             batch_readahead=batch_readahead,
             **columns_arg,
         )
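The two hunks above thread a new `with_row_address` flag from `LanceFragment.scanner` into the underlying native scanner call. A minimal, hedged sketch of how it might be used follows; the dataset path, sample data, and the assumption that the flag surfaces the physical row address as an extra column (analogous to how `with_row_id` adds `"_rowid"`) are illustrative and not confirmed by this diff:

```python
import lance
import pyarrow as pa

# Create a small dataset so there is a fragment to scan (path is illustrative).
tbl = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
dataset = lance.write_dataset(tbl, "row_address_demo")

frag = dataset.get_fragments()[0]

# Ask the fragment scanner for row addresses alongside the data columns.
# Assumption: with_row_address=True adds an address column to the output.
table = frag.scanner(with_row_address=True).to_table()
print(table.column_names)
```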
@@ -475,6 +478,78 @@ def to_table(
             with_row_id=with_row_id,
         ).to_table()
 
+    def merge(
+        self,
+        data_obj: ReaderLike,
+        left_on: str,
+        right_on: Optional[str] = None,
+        schema=None,
+    ) -> Tuple[FragmentMetadata, LanceSchema]:
+        """
+        Merge another dataset into this fragment.
+
+        Performs a left join, where the fragment is the left side and data_obj
+        is the right side. Rows existing in the fragment but not in data_obj
+        will have the new columns filled with null values, unless Lance doesn't
+        support null values for some types, in which case an error will be raised.
+
+        Parameters
+        ----------
+        data_obj: Reader-like
+            The data to be merged. Acceptable types are:
+            - Pandas DataFrame, Pyarrow Table, Dataset, Scanner,
+              Iterator[RecordBatch], or RecordBatchReader
+        left_on: str
+            The name of the column in the dataset to join on.
+        right_on: str or None
+            The name of the column in data_obj to join on. If None, defaults to
+            left_on.
+
+        Examples
+        --------
+
+        >>> import lance
+        >>> import pyarrow as pa
+        >>> df = pa.table({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
+        >>> dataset = lance.write_dataset(df, "dataset")
+        >>> dataset.to_table().to_pandas()
+           x  y
+        0  1  a
+        1  2  b
+        2  3  c
+        >>> fragments = dataset.get_fragments()
+        >>> new_df = pa.table({'x': [1, 2, 3], 'z': ['d', 'e', 'f']})
+        >>> merged = []
+        >>> schema = None
+        >>> for f in fragments:
+        ...     f, schema = f.merge(new_df, 'x')
+        ...     merged.append(f)
+        >>> merge = lance.LanceOperation.Merge(merged, schema)
+        >>> dataset = lance.LanceDataset.commit("dataset", merge, read_version=1)
+        >>> dataset.to_table().to_pandas()
+           x  y  z
+        0  1  a  d
+        1  2  b  e
+        2  3  c  f
+
+        See Also
+        --------
+        LanceFragment.merge_columns :
+            Add new columns to this fragment.
+
+        Returns
+        -------
+        Tuple[FragmentMetadata, LanceSchema]
+            The metadata of the new fragment with the merged column(s), and the final schema.
+        """
+        if right_on is None:
+            right_on = left_on
+
+        reader = _coerce_reader(data_obj, schema)
+        max_field_id = self._ds.max_field_id
+        metadata, schema = self._fragment.merge(reader, left_on, right_on, max_field_id)
+        return metadata, schema
+
     def merge_columns(
         self,
         value_func: Dict[str, str]
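The new `LanceFragment.merge` mirrors `LanceDataset.merge` but works fragment-by-fragment, returning the `FragmentMetadata` and `LanceSchema` to commit later as a `Merge` operation. Beyond the doctest above, here is a hedged sketch that exercises the `right_on` argument and the reader coercion added via `_coerce_reader`; the `key`/`w` column names and the pre-existing "dataset" path are assumptions for illustration, not part of this diff:

```python
import lance
import pyarrow as pa

dataset = lance.dataset("dataset")  # assumes the dataset from the doctest exists

# New columns arrive keyed by a differently named join column.
new_data = pa.table({"key": [1, 2, 3], "w": [0.1, 0.2, 0.3]})

merged, schema = [], None
for frag in dataset.get_fragments():
    # A RecordBatchReader can only be consumed once, so build one per fragment;
    # per the docstring, tables, datasets, and batch iterators are also accepted.
    reader = pa.RecordBatchReader.from_batches(new_data.schema, new_data.to_batches())
    meta, schema = frag.merge(reader, left_on="x", right_on="key")
    merged.append(meta)

# Commit all fragment-level merges in a single Merge operation.
op = lance.LanceOperation.Merge(merged, schema)
dataset = lance.LanceDataset.commit("dataset", op, read_version=dataset.version)
```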