Skip to content

Commit

Permalink
vector index test
Browse files Browse the repository at this point in the history
  • Loading branch information
jepett0 committed Oct 18, 2024
1 parent a0b4306 commit 270c3df
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 8 deletions.
22 changes: 17 additions & 5 deletions ydb/core/tx/schemeshard/ut_helpers/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1699,11 +1699,23 @@ namespace NSchemeShardUT_Private {
} break;
case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: {
auto& settings = *index.mutable_global_vector_kmeans_tree_index();
settings = Ydb::Table::GlobalVectorKMeansTreeIndex();
// some random valid settings
settings.mutable_vector_settings()->mutable_settings()->set_vector_type(Ydb::Table::VectorIndexSettings::VECTOR_TYPE_FLOAT);
settings.mutable_vector_settings()->mutable_settings()->set_vector_dimension(42);
settings.mutable_vector_settings()->mutable_settings()->set_metric(Ydb::Table::VectorIndexSettings::DISTANCE_COSINE);

auto& vectorIndexSettings = *settings.mutable_vector_settings()->mutable_settings();
if (cfg.VectorIndexSettings) {
cfg.VectorIndexSettings->SerializeTo(vectorIndexSettings);
} else {
// some random valid settings
vectorIndexSettings.set_vector_type(Ydb::Table::VectorIndexSettings::VECTOR_TYPE_FLOAT);
vectorIndexSettings.set_vector_dimension(42);
vectorIndexSettings.set_metric(Ydb::Table::VectorIndexSettings::DISTANCE_COSINE);
}

if (cfg.GlobalIndexSettings) {
cfg.GlobalIndexSettings[0].SerializeTo(*settings.mutable_level_table_settings());
if (cfg.GlobalIndexSettings.size() > 1) {
cfg.GlobalIndexSettings[1].SerializeTo(*settings.mutable_posting_table_settings());
}
}
} break;
default:
UNIT_ASSERT_C(false, "Unknown index type: " << static_cast<ui32>(cfg.IndexType));
Expand Down
3 changes: 3 additions & 0 deletions ydb/core/tx/schemeshard/ut_helpers/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@

// Forward declarations of the NYdb::NTable index settings types named in this header.
// Only the names are introduced here; the full definitions are not provided by this block.
namespace NYdb::NTable {
struct TGlobalIndexSettings;
struct TVectorIndexSettings;
}

namespace NSchemeShardUT_Private {
Expand Down Expand Up @@ -371,6 +372,8 @@ namespace NSchemeShardUT_Private {
TVector<TString> IndexColumns;
TVector<TString> DataColumns;
TVector<NYdb::NTable::TGlobalIndexSettings> GlobalIndexSettings = {};
// implementation note: it was made a pointer, not optional, to enable forward declaration
std::unique_ptr<NYdb::NTable::TVectorIndexSettings> VectorIndexSettings = {};
};

std::unique_ptr<TEvIndexBuilder::TEvCreateRequest> CreateBuildColumnRequest(ui64 id, const TString& dbName, const TString& src, const TString& columnName, const Ydb::TypedValue& literal);
Expand Down
56 changes: 54 additions & 2 deletions ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,7 @@ TCheckFunc IndexDataColumns(const TVector<TString>& dataColumnNames) {
};
}

TCheckFunc VectorIndexDescription(Ydb::Table::VectorIndexSettings_Metric metric,
TCheckFunc VectorIndexDescription(Ydb::Table::VectorIndexSettings_Metric metric,
Ydb::Table::VectorIndexSettings_VectorType vectorType,
ui32 vectorDimension
) {
Expand Down Expand Up @@ -1309,11 +1309,63 @@ TCheckFunc PartitionKeys(TVector<TString> lastShardKeys) {
const auto& pathDescr = record.GetPathDescription();
UNIT_ASSERT_VALUES_EQUAL(lastShardKeys.size(), pathDescr.TablePartitionsSize());
for (size_t i = 0; i < lastShardKeys.size(); ++i) {
UNIT_ASSERT_STRING_CONTAINS(pathDescr.GetTablePartitions(i).GetEndOfRangeKeyPrefix(), lastShardKeys[i]);
const auto& partition = pathDescr.GetTablePartitions(i);
UNIT_ASSERT_STRING_CONTAINS_C(
partition.GetEndOfRangeKeyPrefix(), lastShardKeys[i],
"partition index: " << i << '\n'
<< "actual key prefix: " << partition.GetEndOfRangeKeyPrefix().Quote() << '\n'
<< "expected key prefix: " << lastShardKeys[i].Quote() << '\n'
);
}
};
}

namespace {

// Converts split boundary values of type T to and from the serialized
// cell vector string form (TSerializedCellVec buffer).
template <typename T>
struct TSplitBoundarySerializer {
    // Wraps a single value into a one-cell vector and returns its serialized buffer.
    static TString Serialize(T splitBoundary) {
        const TCell boundaryCell = TCell::Make(splitBoundary);
        const TArrayRef<const TCell> singleCell(&boundaryCell, 1);
        TSerializedCellVec serialized(singleCell);
        return serialized.ReleaseBuffer();
    }

    // Parses the serialized cells and returns the leading non-null cells as values of type T.
    // Reading stops at the first null cell; anything after it is not returned.
    static TVector<T> Deserialize(const TString& serializedCells) {
        TSerializedCellVec parsed(serializedCells);
        TVector<T> result;
        for (const TCell& current : parsed.GetCells()) {
            const bool reachedNullCell = current.IsNull();
            if (reachedNullCell) {
                // the last cell
                break;
            }
            result.emplace_back(current.AsValue<T>());
        }
        return result;
    }
};

}

// Builds a describe-result check for table split boundaries.
//
// The returned check asserts that:
//   * the described table has exactly expectedBoundaries.size() + 1 partitions;
//   * for every i < expectedBoundaries.size(), the first non-null cell of the i-th partition's
//     end-of-range key prefix, read as a value of type T, equals expectedBoundaries[i].
//
// expectedBoundaries is moved into the returned functor.
template <typename T>
TCheckFunc SplitBoundaries(TVector<T>&& expectedBoundaries) {
    return [expectedBoundaries = std::move(expectedBoundaries)] (const NKikimrScheme::TEvDescribeSchemeResult& record) {
        const auto& pathDescr = record.GetPathDescription();
        UNIT_ASSERT_VALUES_EQUAL(pathDescr.TablePartitionsSize(), expectedBoundaries.size() + 1);
        for (size_t i = 0; i < expectedBoundaries.size(); ++i) {
            const auto& partition = pathDescr.GetTablePartitions(i);
            const auto& keyPrefix = partition.GetEndOfRangeKeyPrefix();
            const auto actualBoundaries = TSplitBoundarySerializer<T>::Deserialize(keyPrefix);
            // Fail with partition context instead of letting an out-of-range access throw
            // when the key prefix has no leading non-null cell.
            UNIT_ASSERT_C(
                !actualBoundaries.empty(),
                "partition index: " << i << '\n'
                    << "key prefix has no leading non-null cell: " << keyPrefix.Quote() << '\n'
            );
            UNIT_ASSERT_VALUES_EQUAL_C(
                actualBoundaries.front(), expectedBoundaries[i],
                "partition index: " << i << '\n'
                    << "actual key prefix: " << keyPrefix.Quote() << '\n'
            );
        }
    };
}

// Explicit instantiation for ui32 boundaries: the template is defined in this translation unit only.
template TCheckFunc SplitBoundaries<ui32>(TVector<ui32>&&);

TCheckFunc ServerlessComputeResourcesMode(NKikimrSubDomains::EServerlessComputeResourcesMode serverlessComputeResourcesMode) {
return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) {
UNIT_ASSERT_C(IsGoodDomainStatus(record.GetStatus()), "Unexpected status: " << record.GetStatus());
Expand Down
6 changes: 5 additions & 1 deletion ydb/core/tx/schemeshard/ut_helpers/ls_checks.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ namespace NLs {
void CheckBoundaries(const NKikimrScheme::TEvDescribeSchemeResult& record);
TCheckFunc PartitionCount(ui32 count);
TCheckFunc PartitionKeys(TVector<TString> lastShardKeys);
// Checks that the table has expectedBoundaries.size() + 1 partitions and that the first key cell of
// each partition's end-of-range key prefix equals the corresponding expected boundary.
// Similar to PartitionKeys check, but does not require you to pass split boundaries in a serialized form.
template <typename T>
TCheckFunc SplitBoundaries(TVector<T>&& expectedBoundaries);
TCheckFunc FollowerCount(ui32 count);
TCheckFunc CrossDataCenterFollowerCount(ui32 count);
TCheckFunc AllowFollowerPromotion(bool val);
Expand Down Expand Up @@ -141,7 +145,7 @@ namespace NLs {
TCheckFunc IndexState(NKikimrSchemeOp::EIndexState state);
TCheckFunc IndexKeys(const TVector<TString>& keyNames);
TCheckFunc IndexDataColumns(const TVector<TString>& dataColumnNames);

TCheckFunc VectorIndexDescription(Ydb::Table::VectorIndexSettings_Metric metric,
Ydb::Table::VectorIndexSettings_VectorType vectorType,
ui32 vectorDimension
Expand Down
103 changes: 103 additions & 0 deletions ydb/core/tx/schemeshard/ut_index_build/ut_vector_index_build.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
#include <ydb/core/base/table_index.h>
#include <ydb/core/tx/schemeshard/ut_helpers/helpers.h>
#include <ydb/core/tx/schemeshard/schemeshard_billing_helpers.h>
#include <ydb/core/testlib/actors/block_events.h>
#include <ydb/core/testlib/tablet_helpers.h>

#include <ydb/core/tx/datashard/datashard.h>
#include <ydb/core/metering/metering.h>

#include <ydb/public/sdk/cpp/client/ydb_table/table.h>

using namespace NKikimr;
using namespace NSchemeShard;
using namespace NSchemeShardUT_Private;
Expand Down Expand Up @@ -225,4 +228,104 @@ Y_UNIT_TEST_SUITE (VectorIndexBuildTest) {

UNIT_ASSERT_VALUES_EQUAL(billRecords.size(), 0);
}

// Builds a vector k-means tree index with explicit vector settings and explicit per-table
// partitioning settings, reboots the SchemeShard tablet while the index-build creation
// transaction is being blocked, and then verifies that the finished index and its
// implementation tables are described with the settings that were requested.
Y_UNIT_TEST(VectorIndexDescriptionIsPersisted) {
TTestBasicRuntime runtime;
TTestEnv env(runtime);
ui64 txId = 100;

// Source table: "embedding" is the indexed column, "covered" is used as a data column of the index.
TestCreateTable(runtime, ++txId, "/MyRoot", R"(
Name: "vectors"
Columns { Name: "id" Type: "Uint64" }
Columns { Name: "embedding" Type: "String" }
Columns { Name: "covered" Type: "String" }
KeyColumnNames: [ "id" ]
)");
env.TestWaitNotification(runtime, txId);

// Table settings for the index implementation tables: two explicit split points
// (12345 and 54321) and a partition count pinned to exactly 3 (min == max).
NYdb::NTable::TGlobalIndexSettings globalIndexSettings;
{
Ydb::Table::GlobalIndexSettings proto;
UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(R"(
partition_at_keys {
split_points {
type { tuple_type { elements { optional_type { item { type_id: UINT32 } } } } }
value { items { uint32_value: 12345 } }
}
split_points {
type { tuple_type { elements { optional_type { item { type_id: UINT32 } } } } }
value { items { uint32_value: 54321 } }
}
}
partitioning_settings {
min_partitions_count: 3
max_partitions_count: 3
}
)", &proto));
globalIndexSettings = NYdb::NTable::TGlobalIndexSettings::FromProto(proto);
}

// Vector index settings: cosine distance, float vectors, dimension 1024.
// Held by unique_ptr because that is the type the build config constructed below is given.
std::unique_ptr<NYdb::NTable::TVectorIndexSettings> vectorIndexSettings;
{
Ydb::Table::VectorIndexSettings proto;
UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(R"(
metric: DISTANCE_COSINE,
vector_type: VECTOR_TYPE_FLOAT,
vector_dimension: 1024
)", &proto));
using T = NYdb::NTable::TVectorIndexSettings;
vectorIndexSettings = std::make_unique<T>(T::FromProto(proto));
}

// Intercept TEvModifySchemeTransaction events whose first transaction is the index-build
// creation operation, so that the reboot below happens before that operation goes through.
TBlockEvents<TEvSchemeShard::TEvModifySchemeTransaction> indexCreationBlocker(runtime, [](const auto& ev) {
const auto& modifyScheme = ev->Get()->Record.GetTransaction(0);
return modifyScheme.GetOperationType() == NKikimrSchemeOp::ESchemeOpCreateIndexBuild;
});

// Start the index build. The same table settings are passed twice; both the level table
// and the posting table are checked against them at the end of the test.
const ui64 buildIndexTx = ++txId;
TestBuildIndex(runtime, buildIndexTx, TTestTxConfig::SchemeShard, "/MyRoot", "/MyRoot/vectors", TBuildIndexConfig{
"by_embedding", NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree, { "embedding" }, { "covered" },
{ globalIndexSettings, globalIndexSettings }, std::move(vectorIndexSettings)
});

// Reboot the SchemeShard while the creation transaction is still blocked.
RebootTablet(runtime, TTestTxConfig::SchemeShard, runtime.AllocateEdgeActor());

// Stop intercepting, release what was blocked, and wait for the build to complete.
indexCreationBlocker.Stop().Unblock();
env.TestWaitNotification(runtime, buildIndexTx);

// The build operation must have reached the DONE state.
auto buildIndexOperation = TestGetBuildIndex(runtime, TTestTxConfig::SchemeShard, "/MyRoot", buildIndexTx);
UNIT_ASSERT_VALUES_EQUAL_C(
buildIndexOperation.GetIndexBuild().GetState(), Ydb::Table::IndexBuildState::STATE_DONE,
buildIndexOperation.DebugString()
);

// The index itself must be ready and carry the requested key/data columns and vector settings.
TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/by_embedding"), {
NLs::PathExist,
NLs::IndexState(NKikimrSchemeOp::EIndexStateReady),
NLs::IndexType(NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree),
NLs::IndexKeys({"embedding"}),
NLs::IndexDataColumns({"covered"}),
NLs::VectorIndexDescription(
Ydb::Table::VectorIndexSettings::DISTANCE_COSINE,
Ydb::Table::VectorIndexSettings::VECTOR_TYPE_FLOAT,
1024
)
});

// Both implementation tables (level and posting) must have 3 partitions split at the requested keys.
using namespace NKikimr::NTableIndex::NTableVectorKmeansTreeIndex;
TestDescribeResult(DescribePrivatePath(runtime, JoinFsPaths("/MyRoot/vectors/by_embedding", LevelTable), true, true), {
NLs::IsTable,
NLs::PartitionCount(3),
NLs::MinPartitionsCountEqual(3),
NLs::MaxPartitionsCountEqual(3),
NLs::SplitBoundaries<ui32>({12345, 54321})
});
TestDescribeResult(DescribePrivatePath(runtime, JoinFsPaths("/MyRoot/vectors/by_embedding", PostingTable), true, true), {
NLs::IsTable,
NLs::PartitionCount(3),
NLs::MinPartitionsCountEqual(3),
NLs::MaxPartitionsCountEqual(3),
NLs::SplitBoundaries<ui32>({12345, 54321})
});
}
}

0 comments on commit 270c3df

Please sign in to comment.