Skip to content

Commit

Permalink
ESQL: Compute infrastruture for LEFT JOIN
Browse files Browse the repository at this point in the history
This adds some infrastructure that we can use to run LOOKUP JOIN using
real LEFT JOIN semantics.

Right now if LOOKUP JOIN matches many rows in the `lookup` index we
merge all of the values into a multivalued field. So the number of rows
emitted from LOOKUP JOIN is the same as the number of rows that comes
into LOOKUP JOIN.

This change builds the infrastructure to emit one row per match, mostly
reusing the infrastructure from ENRICH.
  • Loading branch information
nik9000 committed Dec 17, 2024
1 parent 6c56c32 commit 6f1b9bb
Show file tree
Hide file tree
Showing 7 changed files with 674 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -212,10 +212,44 @@ default boolean mvSortedAscending() {
/**
* Expand multivalued fields into one row per value. Returns the same block if there aren't any multivalued
* fields to expand. The returned block needs to be closed by the caller to release the block's resources.
* TODO: pass BlockFactory
*/
Block expand();

/**
* Build a {@link Block} with a {@code null} inserted {@code before} each
* listed position.
* <p>
* Note: {@code before} must be non-decreasing.
* </p>
*/
default Block insertNulls(IntVector before) {
// TODO remove default and scatter to implementation where it can be a lot more efficient
try (Builder builder = elementType().newBlockBuilder(getPositionCount() + before.getPositionCount(), blockFactory())) {
int beforeP = 0;
int nextNull = before.getInt(beforeP);
for (int mainP = 0; mainP < getPositionCount(); mainP++) {
while (mainP == nextNull) {
builder.appendNull();
beforeP++;
if (beforeP >= before.getPositionCount()) {
builder.copyFrom(this, mainP, getPositionCount());
return builder.build();
}
nextNull = before.getInt(beforeP);
}
// This line right below this is the super inefficient one.
builder.copyFrom(this, mainP, mainP + 1);
}
assert nextNull == getPositionCount();
while (beforeP < before.getPositionCount()) {
nextNull = before.getInt(beforeP++);
assert nextNull == getPositionCount();
builder.appendNull();
}
return builder.build();
}
}

/**
* Builds {@link Block}s. Typically, you use one of it's direct supinterfaces like {@link IntBlock.Builder}.
* This is {@link Releasable} and should be released after building the block or if building the block fails.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,32 @@
import java.util.Objects;

/**
* Combines values at the given blocks with the same positions into a single position for the blocks at the given channels
* Combines values at the given blocks with the same positions into a single position
* for the blocks at the given channels.
* <p>
* Example, input pages consisting of three blocks:
* positions | field-1 | field-2 |
* -----------------------------------
* </p>
* <pre>{@code
* | positions | field-1 | field-2 |
* ------------------------------------
* Page 1:
* 1 | a,b | 2020 |
* 1 | c | 2021 |
* ---------------------------------
* | 1 | a,b | 2020 |
* | 1 | c | 2021 |
* Page 2:
* 2 | a,e | 2021 |
* ---------------------------------
* | 2 | a,e | 2021 |
* Page 3:
* 4 | d | null |
* ---------------------------------
* | 4 | d | null |
* }</pre>
* Output:
* <pre>{@code
* | field-1 | field-2 |
* ---------------------------
* | null | null |
* | a,b,c | 2020,2021 |
* | a,e | 2021 |
* | null | null |
* | d | 2023 |
* }</pre>
*/
public final class MergePositionsOperator implements Operator {
private boolean finished = false;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.compute.operator.lookup;

import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.IntBlock;
import org.elasticsearch.compute.data.IntVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;

import java.util.Optional;
import java.util.stream.IntStream;

/**
* Performs a {@code LEFT JOIN} where many "right hand" pages are joined
* against a "left hand" {@link Page}. Each row on the "left hand" page
* is output at least once whether it appears in the "right hand" or not.
* And more than once if it appears in the "right hand" pages more than once.
* <p>
* The "right hand" page contains a non-decreasing {@code positions}
* column that controls which position in the "left hand" page the row
* in the "right hand" page. This'll make more sense with a picture:
* </p>
* <pre>{@code
* "left hand" "right hand"
* | lhdata | | positions | r1 | r2 |
* ---------- -----------------------
* | l00 | | 0 | 1 | 2 |
* | l01 | | 1 | 2 | 3 |
* | l02 | | 1 | 3 | 3 |
* | ... | | 3 | 9 | 9 |
* | l99 |
* }</pre>
* <p>
* Joins to:
* </p>
* <pre>{@code
* | lhdata | r1 | r2 |
* -------------------------
* | l00 | 1 | 2 |
* | l01 | 2 | 3 |
* | l01 | 3 | 3 | <1>
* | l02 | null | null | <2>
* | l03 | 9 | 9 |
* }</pre>
* <ol>
* <li>{@code l01} is duplicated because it's positions appears twice in
* the right hand page.</li>
* <li>{@code l02}'s row is filled with {@code null}s because it's position
* does not appear in the right hand page.</li>
* </ol>
* <p>
* This supports joining many "right hand" pages against the same
* "left hand" so long as the first value of the next {@code positions}
* column is the same or greater than the last value of the previous
* {@code positions} column. Large gaps are fine. Starting with the
* same number as you ended on is fine. This looks like:
* </p>
* <pre>{@code
* "left hand" "right hand"
* | lhdata | | positions | r1 | r2 |
* ---------- -----------------------
* | l00 | page 1
* | l01 | | 0 | 1 | 2 |
* | l02 | | 1 | 3 | 3 |
* | l03 | page 2
* | l04 | | 1 | 9 | 9 |
* | l05 | | 2 | 9 | 9 |
* | l06 | page 3
* | ... | | 5 | 10 | 10 |
* | l99 | | 7 | 11 | 11 |
* }</pre>
* <p>
* Which makes:
* </p>
* <pre>{@code
* | lhdata | r1 | r2 |
* -------------------------
* page 1
* | l00 | 1 | 2 |
* | l01 | 3 | 3 |
* page 2
* | l01 | 9 | 9 |
* | l02 | 9 | 9 |
* page 3
* | l03 | null | null |
* | l04 | null | null |
* | l05 | 10 | 10 |
* | l06 | null | null |
* | l07 | 11 | 11 |
* }</pre>
* <p>
* Note that the output pages are sized by the "right hand" pages with
* {@code null}s inserted.
* </p>
* <p>
* Finally, after all "right hand" pages have been joined this will produce
* all remaining "left hand" rows joined against {@code null}.
* Another picture:
* </p>
* <pre>{@code
* "left hand" "right hand"
* | lhdata | | positions | r1 | r2 |
* ---------- -----------------------
* | l00 | last page
* | l01 | | 96 | 1 | 2 |
* | ... | | 97 | 1 | 2 |
* | l99 |
* }</pre>
* <p>
* Which makes:
* </p>
* <pre>{@code
* | lhdata | r1 | r2 |
* -------------------------
* last matching page
* | l96 | 1 | 2 |
* | l97 | 2 | 3 |
* trailing nulls page
* | l98 | null | null |
* | l99 | null | null |
* }</pre>
*/
class RightChunkedLeftJoin implements Releasable {
private final Page leftHand;
private final int mergedElementCount;
/**
* The next position that we'll emit <strong>or</strong> one more than the
* next position we'll emit. This is used to cover gaps between "right hand"
* pages and to detect if "right hand" pages "go backwards".
*/
private int next = 0;

RightChunkedLeftJoin(Page leftHand, int mergedElementCounts) {
this.leftHand = leftHand;
this.mergedElementCount = mergedElementCounts;
}

Page join(Page rightHand) {
IntVector positions = rightHand.<IntBlock>getBlock(0).asVector();
if (positions.getInt(0) < next - 1) {
throw new IllegalArgumentException("maximum overlap is one position");
}
Block[] blocks = new Block[leftHand.getBlockCount() + mergedElementCount];
if (rightHand.getBlockCount() != mergedElementCount + 1) {
throw new IllegalArgumentException(
"expected right hand side with [" + (mergedElementCount + 1) + "] but got [" + rightHand.getBlockCount() + "]"
);
}
IntVector.Builder leftFilterBuilder = null;
IntVector leftFilter = null;
IntVector.Builder insertNullsBuilder = null;
IntVector insertNulls = null;
try {
leftFilterBuilder = positions.blockFactory().newIntVectorBuilder(positions.getPositionCount());
for (int p = 0; p < positions.getPositionCount(); p++) {
int pos = positions.getInt(p);
if (pos > next) {
if (insertNullsBuilder == null) {
insertNullsBuilder = positions.blockFactory().newIntVectorBuilder(pos - next);
}
for (int missing = next; missing < pos; missing++) {
leftFilterBuilder.appendInt(missing);
insertNullsBuilder.appendInt(p);
}
}
leftFilterBuilder.appendInt(pos);
next = pos + 1;
}
leftFilter = leftFilterBuilder.build();
int[] leftFilterArray = toArray(leftFilter);
insertNulls = insertNullsBuilder == null ? null : insertNullsBuilder.build();

int b = 0;
while (b < leftHand.getBlockCount()) {
blocks[b] = leftHand.getBlock(b).filter(leftFilterArray);
b++;
}
int rb = 1; // Skip the positions column
while (b < blocks.length) {
Block block = rightHand.getBlock(rb);
if (insertNulls == null) {
block.mustIncRef();
} else {
block = block.insertNulls(insertNulls);
}
blocks[b] = block;
b++;
rb++;
}
Page result = new Page(blocks);
blocks = null;
return result;
} finally {
Releasables.close(
blocks == null ? null : Releasables.wrap(blocks),
leftFilter,
leftFilterBuilder,
insertNullsBuilder,
insertNulls
);
}
}

Optional<Page> noMoreRightHandPages() {
if (next == leftHand.getPositionCount()) {
return Optional.empty();
}
BlockFactory factory = leftHand.getBlock(0).blockFactory();
Block[] blocks = new Block[leftHand.getBlockCount() + mergedElementCount];
int[] filter = IntStream.range(next, leftHand.getPositionCount()).toArray();
try {
int b = 0;
while (b < leftHand.getBlockCount()) {
blocks[b] = leftHand.getBlock(b).filter(filter);
b++;
}
while (b < blocks.length) {
blocks[b] = factory.newConstantNullBlock(leftHand.getPositionCount() - next);
b++;
}
Page result = new Page(blocks);
blocks = null;
return Optional.of(result);
} finally {
if (blocks != null) {
Releasables.close(blocks);
}
}
}

@Override
public void close() {
Releasables.close(leftHand::releaseBlocks);
}

private int[] toArray(IntVector vector) {
// TODO replace parameter to filter with vector and remove this
int[] array = new int[vector.getPositionCount()];
for (int p = 0; p < vector.getPositionCount(); p++) {
array[p] = vector.getInt(p);
}
return array;
}
}
Loading

0 comments on commit 6f1b9bb

Please sign in to comment.