diff --git a/ydb/core/kqp/common/buffer/buffer.h b/ydb/core/kqp/common/buffer/buffer.h new file mode 100644 index 000000000000..75a31e2e0505 --- /dev/null +++ b/ydb/core/kqp/common/buffer/buffer.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +namespace NKikimr { +namespace NKqp { + +struct TKqpBufferWriterSettings { + TActorId SessionActorId; + IKqpTransactionManagerPtr TxManager; +}; + +NActors::IActor* CreateKqpBufferWriterActor(TKqpBufferWriterSettings&& settings); + +} +} diff --git a/ydb/core/kqp/common/buffer/events.cpp b/ydb/core/kqp/common/buffer/events.cpp new file mode 100644 index 000000000000..9b10573c4c94 --- /dev/null +++ b/ydb/core/kqp/common/buffer/events.cpp @@ -0,0 +1,16 @@ +#include "events.h" + +namespace NKikimr { +namespace NKqp { + +TEvKqpBuffer::TEvError::TEvError( + const TString& message, + NYql::NDqProto::StatusIds::StatusCode statusCode, + const NYql::TIssues& subIssues) + : Message(message) + , StatusCode(statusCode) + , SubIssues(subIssues) { +} + +} +} diff --git a/ydb/core/kqp/common/buffer/events.h b/ydb/core/kqp/common/buffer/events.h new file mode 100644 index 000000000000..3326d4f22d2d --- /dev/null +++ b/ydb/core/kqp/common/buffer/events.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include +#include + + +namespace NKikimr { +namespace NKqp { + +struct TEvKqpBuffer { + +struct TEvPrepare : public TEventLocal { + TActorId ExecuterActorId; +}; + +struct TEvCommit : public TEventLocal { + TActorId ExecuterActorId; + ui64 TxId; +}; + +struct TEvRollback : public TEventLocal { + TActorId ExecuterActorId; +}; + +struct TEvFlush : public TEventLocal { + TActorId ExecuterActorId; +}; + +struct TEvResult : public TEventLocal { +}; + +struct TEvError : public TEventLocal { + TString Message; + NYql::NDqProto::StatusIds::StatusCode StatusCode; + NYql::TIssues SubIssues; + + TEvError(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues); +}; + +struct TEvTerminate : public 
TEventLocal { +}; + +}; + +} +} diff --git a/ydb/core/kqp/common/buffer/ya.make b/ydb/core/kqp/common/buffer/ya.make new file mode 100644 index 000000000000..c4fb712d297f --- /dev/null +++ b/ydb/core/kqp/common/buffer/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +SRCS( + events.cpp +) + +PEERDIR( + ydb/core/kqp/common/simple + ydb/library/yql/public/issue +) + +YQL_LAST_ABI_VERSION() + +END() diff --git a/ydb/core/kqp/common/kqp_tx.h b/ydb/core/kqp/common/kqp_tx.h index 5c340f26e0b9..380b5db03ca2 100644 --- a/ydb/core/kqp/common/kqp_tx.h +++ b/ydb/core/kqp/common/kqp_tx.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -121,12 +122,6 @@ struct TDeferredEffects { friend class TKqpTransactionContext; }; -struct TTableInfo { - bool IsOlap = false; - THashSet Pathes; -}; - - class TShardIdToTableInfo { public: const TTableInfo& Get(ui64 shardId) const { @@ -204,6 +199,8 @@ class TKqpTransactionContext : public NYql::TKikimrTransactionContextBase { void Finish() final { YQL_ENSURE(DeferredEffects.Empty()); YQL_ENSURE(!Locks.HasLocks()); + YQL_ENSURE(!TxManager); + YQL_ENSURE(!BufferActorId); FinishTime = TInstant::Now(); @@ -351,6 +348,9 @@ class TKqpTransactionContext : public NYql::TKikimrTransactionContextBase { bool NeedUncommittedChangesFlush = false; THashSet ModifiedTablesSinceLastFlush; + TActorId BufferActorId; + IKqpTransactionManagerPtr TxManager = nullptr; + TShardIdToTableInfoPtr ShardIdToTableInfo = std::make_shared(); }; diff --git a/ydb/core/kqp/common/kqp_tx_manager.cpp b/ydb/core/kqp/common/kqp_tx_manager.cpp new file mode 100644 index 000000000000..82e280ec25dc --- /dev/null +++ b/ydb/core/kqp/common/kqp_tx_manager.cpp @@ -0,0 +1,431 @@ +#include "kqp_tx_manager.h" + +#include +#include + +namespace NKikimr { +namespace NKqp { + +namespace { + +struct TKqpLock { + using TKey = std::tuple; + TKey GetKey() const { return std::make_tuple(Proto.GetLockId(), Proto.GetDataShard(), Proto.GetSchemeShard(), Proto.GetPathId()); } + + bool 
Invalidated(const TKqpLock& newLock) const { + AFL_ENSURE(GetKey() == newLock.GetKey()); + return Proto.GetGeneration() != newLock.Proto.GetGeneration() || Proto.GetCounter() != newLock.Proto.GetCounter(); + } + + TKqpLock(const NKikimrDataEvents::TLock& proto) + : Proto(proto) {} + + NKikimrDataEvents::TLock Proto; +}; + +class TKqpTransactionManager : public IKqpTransactionManager { + enum ETransactionState { + COLLECTING, + PREPARING, + EXECUTING, + }; +public: + TKqpTransactionManager(bool collectOnly) + : CollectOnly(collectOnly) {} + + void AddShard(ui64 shardId, bool isOlap, const TString& path) override { + Y_ABORT_UNLESS(State == ETransactionState::COLLECTING); + ShardsIds.insert(shardId); + auto& shardInfo = ShardsInfo[shardId]; + shardInfo.IsOlap = isOlap; + HasOlapTableShard |= isOlap; + + const auto [stringsIter, _] = TablePathes.insert(path); + const TStringBuf pathBuf = *stringsIter; + shardInfo.Pathes.insert(pathBuf); + } + + void AddAction(ui64 shardId, ui8 action) override { + Y_ABORT_UNLESS(State == ETransactionState::COLLECTING); + ShardsInfo.at(shardId).Flags |= action; + if (action & EAction::WRITE) { + ReadOnly = false; + } + } + + bool AddLock(ui64 shardId, const NKikimrDataEvents::TLock& lockProto) override { + Y_ABORT_UNLESS(State == ETransactionState::COLLECTING); + TKqpLock lock(lockProto); + bool isError = (lock.Proto.GetCounter() >= NKikimr::TSysTables::TLocksTable::TLock::ErrorMin); + bool isInvalidated = (lock.Proto.GetCounter() == NKikimr::TSysTables::TLocksTable::TLock::ErrorAlreadyBroken) + || (lock.Proto.GetCounter() == NKikimr::TSysTables::TLocksTable::TLock::ErrorBroken); + bool isLocksAcquireFailure = isError && !isInvalidated; + bool broken = false; + + auto& shardInfo = ShardsInfo.at(shardId); + if (auto lockPtr = shardInfo.Locks.FindPtr(lock.GetKey()); lockPtr) { + if (lock.Proto.GetHasWrites()) { + lockPtr->Lock.Proto.SetHasWrites(true); + } + + lockPtr->LocksAcquireFailure |= isLocksAcquireFailure; + if 
(!lockPtr->LocksAcquireFailure) { + isInvalidated |= lockPtr->Lock.Invalidated(lock); + lockPtr->Invalidated |= isInvalidated; + } + broken = lockPtr->Invalidated || lockPtr->LocksAcquireFailure; + } else { + shardInfo.Locks.emplace( + lock.GetKey(), + TShardInfo::TLockInfo { + .Lock = std::move(lock), + .Invalidated = isInvalidated, + .LocksAcquireFailure = isLocksAcquireFailure, + }); + broken = isInvalidated || isLocksAcquireFailure; + } + + if (broken && !LocksIssue) { + if (isLocksAcquireFailure) { + LocksIssue = YqlIssue(NYql::TPosition(), NYql::TIssuesIds::KIKIMR_LOCKS_ACQUIRE_FAILURE); + return false; + } else if (isInvalidated) { + MakeLocksIssue(shardInfo); + return false; + } + AFL_ENSURE(false); + } + + return true; + } + + void BreakLock(ui64 shardId) override { + if (LocksIssue) { + return; + } + auto& shardInfo = ShardsInfo.at(shardId); + MakeLocksIssue(shardInfo); + } + + TTableInfo GetShardTableInfo(ui64 shardId) const override { + const auto& info = ShardsInfo.at(shardId); + return TTableInfo{ + .IsOlap = info.IsOlap, + .Pathes = info.Pathes, + }; + } + + EShardState GetState(ui64 shardId) const override { + return ShardsInfo.at(shardId).State; + } + + void SetState(ui64 shardId, EShardState state) override { + ShardsInfo.at(shardId).State = state; + } + + TVector GetLocks() const override { + TVector locks; + for (const auto& [_, shardInfo] : ShardsInfo) { + for (const auto& [_, lockInfo] : shardInfo.Locks) { + locks.push_back(lockInfo.Lock.Proto); + } + } + return locks; + } + + TVector GetLocks(ui64 shardId) const override { + TVector locks; + const auto& shardInfo = ShardsInfo.at(shardId); + for (const auto& [_, lockInfo] : shardInfo.Locks) { + locks.push_back(lockInfo.Lock.Proto); + } + return locks; + } + + bool IsTxPrepared() const override { + for (const auto& [_, shardInfo] : ShardsInfo) { + if (shardInfo.State != EShardState::PREPARED) { + return false; + } + } + return true; + } + + bool IsTxFinished() const override { + for (const 
auto& [_, shardInfo] : ShardsInfo) { + if (shardInfo.State != EShardState::FINISHED) { + return false; + } + } + return true; + } + + bool IsReadOnly() const override { + return ReadOnly; + } + + bool IsSingleShard() const override { + return GetShardsCount() == 1; + } + + bool HasOlapTable() const override { + return HasOlapTableShard; + } + + bool IsEmpty() const override { + return GetShardsCount() == 0; + } + + bool HasLocks() const override { + for (const auto& [_, shardInfo] : ShardsInfo) { + if (!shardInfo.Locks.empty()) { + return true; + } + } + return false; + } + + bool IsVolatile() const override { + return !HasOlapTable(); + } + + bool HasSnapshot() const override { + return ValidSnapshot; + } + + void SetHasSnapshot(bool hasSnapshot) override { + ValidSnapshot = hasSnapshot; + } + + bool BrokenLocks() const override { + return LocksIssue.has_value() && !(HasSnapshot() && IsReadOnly()); + } + + const std::optional& GetLockIssue() const override { + return LocksIssue; + } + + const THashSet& GetShards() const override { + return ShardsIds; + } + + ui64 GetShardsCount() const override { + return ShardsIds.size(); + } + + void StartPrepare() override { + AFL_ENSURE(!CollectOnly); + AFL_ENSURE(State == ETransactionState::COLLECTING); + AFL_ENSURE(!IsReadOnly()); + + THashSet sendingColumnShardsSet; + THashSet receivingColumnShardsSet; + + for (auto& [shardId, shardInfo] : ShardsInfo) { + if ((shardInfo.Flags & EAction::WRITE)) { + ReceivingShards.insert(shardId); + if (IsVolatile()) { + SendingShards.insert(shardId); + } + if (shardInfo.IsOlap) { + sendingColumnShardsSet.insert(shardId); + } + } + if (!shardInfo.Locks.empty()) { + SendingShards.insert(shardId); + if (shardInfo.IsOlap) { + receivingColumnShardsSet.insert(shardId); + } + } + + AFL_ENSURE(shardInfo.State == EShardState::PROCESSING); + shardInfo.State = EShardState::PREPARING; + } + + Y_ABORT_UNLESS(!ReceivingShards.empty()); + + constexpr size_t minArbiterMeshSize = 5; + if ((IsVolatile() && 
+ ReceivingShards.size() >= minArbiterMeshSize)) + { + std::vector candidates; + candidates.reserve(ReceivingShards.size()); + for (ui64 candidate : ReceivingShards) { + // Note: all receivers are also senders in volatile transactions + if (Y_LIKELY(SendingShards.contains(candidate))) { + candidates.push_back(candidate); + } + } + if (candidates.size() >= minArbiterMeshSize) { + // Select a random arbiter + const ui32 index = RandomNumber(candidates.size()); + Arbiter = candidates.at(index); + } + } + + if (!receivingColumnShardsSet.empty() || !sendingColumnShardsSet.empty()) { + AFL_ENSURE(!IsVolatile()); + const auto& shards = receivingColumnShardsSet.empty() + ? sendingColumnShardsSet + : receivingColumnShardsSet; + + const ui32 index = RandomNumber(shards.size()); + auto arbiterIterator = std::begin(shards); + std::advance(arbiterIterator, index); + ArbiterColumnShard = *arbiterIterator; + } + + ShardsToWaitPrepare = ShardsIds; + + MinStep = std::numeric_limits::min(); + MaxStep = std::numeric_limits::max(); + Coordinator = 0; + + State = ETransactionState::PREPARING; + } + + TPrepareInfo GetPrepareTransactionInfo() override { + AFL_ENSURE(State == ETransactionState::PREPARING); + AFL_ENSURE(!ReceivingShards.empty()); + + TPrepareInfo result { + .SendingShards = SendingShards, + .ReceivingShards = ReceivingShards, + .Arbiter = Arbiter, + .ArbiterColumnShard = ArbiterColumnShard, + }; + + return result; + } + + bool ConsumePrepareTransactionResult(TPrepareResult&& result) override { + AFL_ENSURE(State == ETransactionState::PREPARING); + auto& shardInfo = ShardsInfo.at(result.ShardId); + AFL_ENSURE(shardInfo.State == EShardState::PREPARING); + shardInfo.State = EShardState::PREPARED; + + ShardsToWaitPrepare.erase(result.ShardId); + + MinStep = std::max(MinStep, result.MinStep); + MaxStep = std::min(MaxStep, result.MaxStep); + + if (result.Coordinator && !Coordinator) { + Coordinator = result.Coordinator; + } + + AFL_ENSURE(Coordinator && Coordinator == 
result.Coordinator)("prev_coordinator", Coordinator)("new_coordinator", result.Coordinator); + + return ShardsToWaitPrepare.empty(); + } + + void StartExecute() override { + AFL_ENSURE(!CollectOnly); + AFL_ENSURE(State == ETransactionState::PREPARING + || (State == ETransactionState::COLLECTING + && IsSingleShard())); + AFL_ENSURE(!IsReadOnly()); + State = ETransactionState::EXECUTING; + + for (auto& [_, shardInfo] : ShardsInfo) { + AFL_ENSURE(shardInfo.State == EShardState::PREPARED + || (shardInfo.State == EShardState::PROCESSING + && IsSingleShard())); + shardInfo.State = EShardState::EXECUTING; + } + + AFL_ENSURE(ReceivingShards.empty() || !IsSingleShard() || HasOlapTable()); + } + + TCommitInfo GetCommitInfo() override { + AFL_ENSURE(State == ETransactionState::EXECUTING); + TCommitInfo result; + result.MinStep = MinStep; + result.MaxStep = MaxStep; + result.Coordinator = Coordinator; + + for (auto& [shardId, shardInfo] : ShardsInfo) { + result.ShardsInfo.push_back(TCommitShardInfo{ + .ShardId = shardId, + .AffectedFlags = shardInfo.Flags, + }); + + AFL_ENSURE(shardInfo.State == EShardState::EXECUTING); + } + return result; + } + + bool ConsumeCommitResult(ui64 shardId) override { + AFL_ENSURE(State == ETransactionState::EXECUTING); + auto& shardInfo = ShardsInfo.at(shardId); + AFL_ENSURE(shardInfo.State == EShardState::EXECUTING); + shardInfo.State = EShardState::FINISHED; + + // Either all shards committed or all shards failed, + // so we need to wait only for one answer from ReceivingShards. 
+ return ReceivingShards.contains(shardId) || IsSingleShard(); + } + +private: + bool CollectOnly = false; + ETransactionState State = ETransactionState::COLLECTING; + + struct TShardInfo { + EShardState State = EShardState::PROCESSING; + TActionFlags Flags = 0; + + struct TLockInfo { + TKqpLock Lock; + bool Invalidated = false; + bool LocksAcquireFailure = false; + }; + + THashMap Locks; + + bool IsOlap = false; + THashSet Pathes; + }; + + void MakeLocksIssue(const TShardInfo& shardInfo) { // Stores a KIKIMR_LOCKS_INVALIDATED issue in LocksIssue, listing the table paths of the given shard. + TStringBuilder message; + message << "Transaction locks invalidated. Tables: "; + bool first = true; + // TODO: add error by pathid + for (const auto& path : shardInfo.Pathes) { + if (!first) { + message << ", "; + } + first = false; // cleared after the first path so that every later path is preceded by the ", " separator + message << "`" << path << "`"; + } + LocksIssue = YqlIssue(NYql::TPosition(), NYql::TIssuesIds::KIKIMR_LOCKS_INVALIDATED, message); + } + + THashSet ShardsIds; + THashMap ShardsInfo; + std::unordered_set TablePathes; + + bool ReadOnly = true; + bool ValidSnapshot = false; + bool HasOlapTableShard = false; + std::optional LocksIssue; + + THashSet SendingShards; + THashSet ReceivingShards; + std::optional Arbiter; + std::optional ArbiterColumnShard; + + THashSet ShardsToWaitPrepare; + + ui64 MinStep = 0; + ui64 MaxStep = 0; + ui64 Coordinator = 0; +}; + +} + +IKqpTransactionManagerPtr CreateKqpTransactionManager(bool collectOnly) { + return std::make_shared(collectOnly); +} + +} +} diff --git a/ydb/core/kqp/common/kqp_tx_manager.h b/ydb/core/kqp/common/kqp_tx_manager.h new file mode 100644 index 000000000000..03dd73ad034b --- /dev/null +++ b/ydb/core/kqp/common/kqp_tx_manager.h @@ -0,0 +1,118 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + + +namespace NKikimr { +namespace NKqp { + +struct TTableInfo { + bool IsOlap = false; + THashSet Pathes; +}; + +class IKqpTransactionManager { +public: + virtual ~IKqpTransactionManager() = default; + + enum EShardState { + PROCESSING, + PREPARING, + PREPARED, + EXECUTING, + 
FINISHED + }; + + enum EAction { + READ = 1, + WRITE = 2, + }; + + using TActionFlags = ui8; + + virtual void AddShard(ui64 shardId, bool isOlap, const TString& path) = 0; + virtual void AddAction(ui64 shardId, ui8 action) = 0; + virtual bool AddLock(ui64 shardId, const NKikimrDataEvents::TLock& lock) = 0; + + virtual void BreakLock(ui64 shardId) = 0; + + virtual TTableInfo GetShardTableInfo(ui64 shardId) const = 0; + + virtual TVector GetLocks() const = 0; + virtual TVector GetLocks(ui64 shardId) const = 0; + + virtual EShardState GetState(ui64 shardId) const = 0; + virtual void SetState(ui64 shardId, EShardState state) = 0; + + virtual bool IsTxPrepared() const = 0; + virtual bool IsTxFinished() const = 0; + + virtual bool IsReadOnly() const = 0; + virtual bool IsSingleShard() const = 0; + virtual bool HasOlapTable() const = 0; + + virtual bool IsEmpty() const = 0; + virtual bool HasLocks() const = 0; + + virtual bool IsVolatile() const = 0; + + virtual bool HasSnapshot() const = 0; + virtual void SetHasSnapshot(bool hasSnapshot) = 0; + + virtual bool BrokenLocks() const = 0; + virtual const std::optional& GetLockIssue() const = 0; + + virtual const THashSet& GetShards() const = 0; + virtual ui64 GetShardsCount() const = 0; + + virtual void StartPrepare() = 0; + + struct TPrepareInfo { + const THashSet& SendingShards; + const THashSet& ReceivingShards; + std::optional Arbiter; + std::optional ArbiterColumnShard; + }; + + virtual TPrepareInfo GetPrepareTransactionInfo() = 0; + + struct TPrepareResult { + ui64 ShardId; + ui64 MinStep; + ui64 MaxStep; + ui64 Coordinator; + }; + + virtual bool ConsumePrepareTransactionResult(TPrepareResult&& result) = 0; + + virtual void StartExecute() = 0; + + struct TCommitShardInfo { + ui64 ShardId; + ui32 AffectedFlags; + }; + + struct TCommitInfo { + ui64 MinStep; + ui64 MaxStep; + ui64 Coordinator; + + TVector ShardsInfo; + }; + + virtual TCommitInfo GetCommitInfo() = 0; + + virtual bool ConsumeCommitResult(ui64 shardId) = 0; 
+}; + +using IKqpTransactionManagerPtr = std::shared_ptr; + +IKqpTransactionManagerPtr CreateKqpTransactionManager(bool collectOnly = false); + +} +} diff --git a/ydb/core/kqp/common/simple/kqp_event_ids.h b/ydb/core/kqp/common/simple/kqp_event_ids.h index f6989d5b69bd..d3c6f3f2fd28 100644 --- a/ydb/core/kqp/common/simple/kqp_event_ids.h +++ b/ydb/core/kqp/common/simple/kqp_event_ids.h @@ -45,7 +45,9 @@ struct TKqpEvents { EvListProxyNodesRequest, EvListProxyNodesResponse, EvUpdateDatabaseInfo, - EvDelayedRequestError + EvDelayedRequestError, + EvBufferWrite, + EvBufferWriteResult, }; static_assert (EvCompileInvalidateRequest + 1 == EvAbortExecution); @@ -181,5 +183,17 @@ struct TKqpWorkloadServiceEvents { }; }; +struct TKqpBufferWriterEvents { + enum EKqpBufferWriterEvents { + EvPrepare = EventSpaceBegin(TKikimrEvents::ES_KQP) + 800, + EvCommit, + EvRollback, + EvFlush, + EvResult, + EvError, + EvTerminate, + }; +}; + } // namespace NKqp } // namespace NKikimr diff --git a/ydb/core/kqp/common/ya.make b/ydb/core/kqp/common/ya.make index 0559e96e994c..0a8050d7f7e4 100644 --- a/ydb/core/kqp/common/ya.make +++ b/ydb/core/kqp/common/ya.make @@ -11,6 +11,7 @@ SRCS( kqp_script_executions.cpp kqp_timeouts.cpp kqp_timeouts.h + kqp_tx_manager.cpp kqp_tx.cpp kqp_types.cpp kqp_types.h diff --git a/ydb/core/kqp/executer_actor/kqp_data_executer.cpp b/ydb/core/kqp/executer_actor/kqp_data_executer.cpp index 6fae68411ba9..bfdfcb2bcabc 100644 --- a/ydb/core/kqp/executer_actor/kqp_data_executer.cpp +++ b/ydb/core/kqp/executer_actor/kqp_data_executer.cpp @@ -8,10 +8,12 @@ #include #include -#include #include +#include #include #include +#include +#include #include #include #include @@ -129,11 +131,14 @@ class TKqpDataExecuter : public TKqpExecuterBase& userRequestContext, ui32 statementResultIndex, const std::optional& federatedQuerySetup, - const TGUCSettings::TPtr& GUCSettings, const TShardIdToTableInfoPtr& shardIdToTableInfo) + const TGUCSettings::TPtr& GUCSettings, + const 
TShardIdToTableInfoPtr& shardIdToTableInfo, + const IKqpTransactionManagerPtr& txManager, + const TActorId bufferActorId) : TBase(std::move(request), database, userToken, counters, tableServiceConfig, - userRequestContext, statementResultIndex, TWilsonKqp::DataExecuter, "DataExecuter", streamResult) + userRequestContext, statementResultIndex, TWilsonKqp::DataExecuter, + "DataExecuter", streamResult, bufferActorId, txManager) , AsyncIoFactory(std::move(asyncIoFactory)) - , UseEvWriteForOltp(tableServiceConfig.GetEnableOltpSink()) , FederatedQuerySetup(federatedQuerySetup) , GUCSettings(GUCSettings) , ShardIdToTableInfo(shardIdToTableInfo) @@ -141,6 +146,7 @@ class TKqpDataExecuter : public TKqpExecuterBaseBrokenLockShardId); return ReplyErrorAndDie(Ydb::StatusIds::ABORTED, {}); @@ -221,21 +229,40 @@ class TKqpDataExecuter : public TKqpExecuterBaseAdd(lock.GetDataShard(), stageInfo.Meta.TableKind == ETableKind::Olap, stageInfo.Meta.TablePath); + + if (TxManager) { + TxManager->AddShard(lock.GetDataShard(), stageInfo.Meta.TableKind == ETableKind::Olap, stageInfo.Meta.TablePath); + TxManager->AddAction(lock.GetDataShard(), IKqpTransactionManager::EAction::READ); + TxManager->AddLock(lock.GetDataShard(), lock); + } } } else if (data.GetData().template Is()) { NKikimrKqp::TEvKqpOutputActorResultInfo info; YQL_ENSURE(data.GetData().UnpackTo(&info), "Failed to unpack settings"); for (auto& lock : info.GetLocks()) { - Locks.push_back(lock); + if (!TxManager) { + Locks.push_back(lock); + } const auto& task = TasksGraph.GetTask(taskId); const auto& stageInfo = TasksGraph.GetStageInfo(task.StageId); ShardIdToTableInfo->Add(lock.GetDataShard(), stageInfo.Meta.TableKind == ETableKind::Olap, stageInfo.Meta.TablePath); + if (TxManager) { + YQL_ENSURE(stageInfo.Meta.TableKind == ETableKind::Olap); + TxManager->AddShard(lock.GetDataShard(), stageInfo.Meta.TableKind == ETableKind::Olap, stageInfo.Meta.TablePath); + TxManager->AddAction(lock.GetDataShard(), 
IKqpTransactionManager::EAction::WRITE); + if (info.GetHasRead()) { + TxManager->AddAction(lock.GetDataShard(), IKqpTransactionManager::EAction::READ); + } + TxManager->AddLock(lock.GetDataShard(), lock); + } } } }; @@ -255,13 +282,70 @@ class TKqpDataExecuter : public TKqpExecuterBaseSetHasSnapshot(GetSnapshot().IsValid()); + } + + if (!BufferActorId || (ReadOnlyTx && Request.LocksOp != ELocksOp::Rollback)) { + Become(&TKqpDataExecuter::FinalizeState); + MakeResponseAndPassAway(); + return; + } else if (Request.LocksOp == ELocksOp::Commit && !ReadOnlyTx) { + Become(&TKqpDataExecuter::FinalizeState); + LOG_D("Send Commit to BufferActor=" << BufferActorId); + + auto event = std::make_unique(); + event->ExecuterActorId = SelfId(); + event->TxId = TxId; + Send(BufferActorId, event.release()); + return; + } else if (Request.LocksOp == ELocksOp::Rollback) { + Become(&TKqpDataExecuter::FinalizeState); + LOG_D("Send Rollback to BufferActor=" << BufferActorId); + + auto event = std::make_unique(); + event->ExecuterActorId = SelfId(); + Send(BufferActorId, event.release()); + MakeResponseAndPassAway(); + return; + } else if (Request.UseImmediateEffects) { + Become(&TKqpDataExecuter::FinalizeState); + LOG_D("Send Flush to BufferActor=" << BufferActorId); + + auto event = std::make_unique(); + event->ExecuterActorId = SelfId(); + Send(BufferActorId, event.release()); + return; + } else { + Become(&TKqpDataExecuter::FinalizeState); + MakeResponseAndPassAway(); + return; + } + } + + STATEFN(FinalizeState) { + switch(ev->GetTypeRewrite()) { + hFunc(TEvKqp::TEvAbortExecution, HandleAbortExecution); + hFunc(TEvKqpBuffer::TEvResult, HandleFinalize); + default: + LOG_W("Unexpected event: " << ev->GetTypeName() << ", at state: FinalizeState"); + } + } + + void HandleFinalize(TEvKqpBuffer::TEvResult::TPtr&) { + MakeResponseAndPassAway(); + } + + void MakeResponseAndPassAway() { ResponseEv->Snapshot = GetSnapshot(); - if (!Locks.empty()) { + if (!Locks.empty() || (TxManager && 
TxManager->HasLocks())) { if (LockHandle) { ResponseEv->LockHandle = std::move(LockHandle); } - BuildLocks(*ResponseEv->Record.MutableResponse()->MutableResult()->MutableLocks(), Locks); + if (!TxManager) { + BuildLocks(*ResponseEv->Record.MutableResponse()->MutableResult()->MutableLocks(), Locks); + } } auto resultSize = ResponseEv->GetByteSize(); @@ -329,6 +413,8 @@ class TKqpDataExecuter : public TKqpExecuterBaseRecord.GetTxLocks(0).GetPathId()); } ReplyErrorAndDie(Ydb::StatusIds::ABORTED, {}); + return; } default: { @@ -912,6 +999,7 @@ class TKqpDataExecuter : public TKqpExecuterBase(); @@ -999,7 +1088,7 @@ class TKqpDataExecuter : public TKqpExecuterBaseAsActorContext()); - LOG_T("Execute planned transaction, coordinator: " << TxCoordinator); + LOG_D("Execute planned transaction, coordinator: " << TxCoordinator << " for " << affectedSet.size() << " shards"); Send(MakePipePerNodeCacheID(false), new TEvPipeCache::TEvForward(ev.Release(), TxCoordinator, /* subscribe */ true)); } @@ -1185,6 +1274,7 @@ class TKqpDataExecuter : public TKqpExecuterBaseRecord.GetTxLocks(0).GetSchemeShard(), res->Record.GetTxLocks(0).GetPathId()); ReplyErrorAndDie(Ydb::StatusIds::ABORTED, {}); + return; } CheckExecutionComplete(); return; @@ -1197,6 +1287,7 @@ class TKqpDataExecuter : public TKqpExecuterBaseGet(); ResponseEv->Orbit.Join(res->Orbit); const ui64 shardId = res->GetOrigin(); @@ -1463,7 +1554,7 @@ class TKqpDataExecuter : public TKqpExecuterBase TTask& { - YQL_ENSURE(!UseEvWriteForOltp); + YQL_ENSURE(!TxManager); auto it = shardTasks.find(shardId); if (it != shardTasks.end()) { return TasksGraph.GetTask(it->second); @@ -1603,7 +1694,7 @@ class TKqpDataExecuter : public TKqpExecuterBase ev; if (isOlap) { @@ -1698,6 +1790,7 @@ class TKqpDataExecuter : public TKqpExecuterBaseGet(shardId).IsOlap) { + if (TxManager || ShardIdToTableInfo->Get(shardId).IsOlap) { if (auto it = evWriteTxs.find(shardId); it != evWriteTxs.end()) { locks = it->second->MutableLocks(); } else { @@ -2642,7 
+2741,9 @@ class TKqpDataExecuter : public TKqpExecuterBase writeId; if (Request.TopicOperations.HasWriteId()) { writeId = Request.TopicOperations.GetWriteId(); @@ -2789,7 +2889,7 @@ class TKqpDataExecuter : public TKqpExecuterBaseSender); if (ev->Sender == SelfId()) { PassAway(); @@ -2844,7 +2944,6 @@ class TKqpDataExecuter : public TKqpExecuterBase FederatedQuerySetup; const TGUCSettings::TPtr GUCSettings; TShardIdToTableInfoPtr ShardIdToTableInfo; @@ -2893,11 +2992,11 @@ IActor* CreateKqpDataExecuter(IKqpGateway::TExecPhysicalRequest&& request, const NYql::NDq::IDqAsyncIoFactory::TPtr asyncIoFactory, const TActorId& creator, const TIntrusivePtr& userRequestContext, ui32 statementResultIndex, const std::optional& federatedQuerySetup, const TGUCSettings::TPtr& GUCSettings, - const TShardIdToTableInfoPtr& shardIdToTableInfo) + const TShardIdToTableInfoPtr& shardIdToTableInfo, const IKqpTransactionManagerPtr& txManager, const TActorId bufferActorId) { return new TKqpDataExecuter(std::move(request), database, userToken, counters, streamResult, tableServiceConfig, - std::move(asyncIoFactory), creator, userRequestContext, - statementResultIndex, federatedQuerySetup, GUCSettings, shardIdToTableInfo); + std::move(asyncIoFactory), creator, userRequestContext, statementResultIndex, federatedQuerySetup, GUCSettings, + shardIdToTableInfo, txManager, bufferActorId); } } // namespace NKqp diff --git a/ydb/core/kqp/executer_actor/kqp_executer.h b/ydb/core/kqp/executer_actor/kqp_executer.h index 7a0fd546eb69..6bf6d794cf51 100644 --- a/ydb/core/kqp/executer_actor/kqp_executer.h +++ b/ydb/core/kqp/executer_actor/kqp_executer.h @@ -97,7 +97,7 @@ IActor* CreateKqpExecuter(IKqpGateway::TExecPhysicalRequest&& request, const TSt NYql::NDq::IDqAsyncIoFactory::TPtr asyncIoFactory, TPreparedQueryHolder::TConstPtr preparedQuery, const TActorId& creator, const TIntrusivePtr& userRequestContext, ui32 statementResultIndex, const std::optional& federatedQuerySetup, const TGUCSettings::TPtr& 
GUCSettings, - const TShardIdToTableInfoPtr& shardIdToTableInfo); + const TShardIdToTableInfoPtr& shardIdToTableInfo, const IKqpTransactionManagerPtr& txManager, const TActorId bufferActorId); IActor* CreateKqpSchemeExecuter( TKqpPhyTxHolder::TConstPtr phyTx, NKikimrKqp::EQueryType queryType, const TActorId& target, diff --git a/ydb/core/kqp/executer_actor/kqp_executer_impl.cpp b/ydb/core/kqp/executer_actor/kqp_executer_impl.cpp index c81fcf313461..71be8e351bec 100644 --- a/ydb/core/kqp/executer_actor/kqp_executer_impl.cpp +++ b/ydb/core/kqp/executer_actor/kqp_executer_impl.cpp @@ -82,7 +82,7 @@ IActor* CreateKqpExecuter(IKqpGateway::TExecPhysicalRequest&& request, const TSt TPreparedQueryHolder::TConstPtr preparedQuery, const TActorId& creator, const TIntrusivePtr& userRequestContext, ui32 statementResultIndex, const std::optional& federatedQuerySetup, const TGUCSettings::TPtr& GUCSettings, - const TShardIdToTableInfoPtr& shardIdToTableInfo) + const TShardIdToTableInfoPtr& shardIdToTableInfo, const IKqpTransactionManagerPtr& txManager, const TActorId bufferActorId) { if (request.Transactions.empty()) { // commit-only or rollback-only data transaction @@ -90,7 +90,8 @@ IActor* CreateKqpExecuter(IKqpGateway::TExecPhysicalRequest&& request, const TSt std::move(request), database, userToken, counters, false, tableServiceConfig, std::move(asyncIoFactory), creator, userRequestContext, statementResultIndex, - federatedQuerySetup, /*GUCSettings*/nullptr, shardIdToTableInfo + federatedQuerySetup, /*GUCSettings*/nullptr, + shardIdToTableInfo, txManager, bufferActorId ); } @@ -113,7 +114,8 @@ IActor* CreateKqpExecuter(IKqpGateway::TExecPhysicalRequest&& request, const TSt std::move(request), database, userToken, counters, false, tableServiceConfig, std::move(asyncIoFactory), creator, userRequestContext, statementResultIndex, - federatedQuerySetup, /*GUCSettings*/nullptr, shardIdToTableInfo + federatedQuerySetup, /*GUCSettings*/nullptr, + shardIdToTableInfo, txManager, 
bufferActorId ); case NKqpProto::TKqpPhyTx::TYPE_SCAN: @@ -128,7 +130,8 @@ IActor* CreateKqpExecuter(IKqpGateway::TExecPhysicalRequest&& request, const TSt std::move(request), database, userToken, counters, true, tableServiceConfig, std::move(asyncIoFactory), creator, userRequestContext, statementResultIndex, - federatedQuerySetup, GUCSettings, shardIdToTableInfo + federatedQuerySetup, GUCSettings, + shardIdToTableInfo, txManager, bufferActorId ); default: diff --git a/ydb/core/kqp/executer_actor/kqp_executer_impl.h b/ydb/core/kqp/executer_actor/kqp_executer_impl.h index 221bb1ddfb99..522f8bc053c9 100644 --- a/ydb/core/kqp/executer_actor/kqp_executer_impl.h +++ b/ydb/core/kqp/executer_actor/kqp_executer_impl.h @@ -130,8 +130,11 @@ class TKqpExecuterBase : public TActorBootstrapped { TKqpRequestCounters::TPtr counters, const NKikimrConfig::TTableServiceConfig& tableServiceConfig, const TIntrusivePtr& userRequestContext, - ui32 statementResultIndex, ui64 spanVerbosity = 0, TString spanName = "KqpExecuterBase", bool streamResult = false) + ui32 statementResultIndex, ui64 spanVerbosity = 0, TString spanName = "KqpExecuterBase", + bool streamResult = false, const TActorId bufferActorId = {}, const IKqpTransactionManagerPtr& txManager = nullptr) : Request(std::move(request)) + , BufferActorId(bufferActorId) + , TxManager(txManager) , Database(database) , UserToken(userToken) , Counters(counters) @@ -507,6 +510,12 @@ class TKqpExecuterBase : public TActorBootstrapped { } } + if (BufferActorId && Request.LocksOp == ELocksOp::Rollback) { + YQL_ENSURE(Request.Transactions.empty()); + static_cast(this)->Finalize(); + return; + } + ExecuterStateSpan = NWilson::TSpan(TWilsonKqp::ExecuterTableResolve, ExecuterSpan.GetTraceId(), "WaitForTableResolve", NWilson::EFlags::AUTO_END); auto kqpTableResolver = CreateKqpTableResolver(this->SelfId(), TxId, UserToken, Request.Transactions, @@ -940,6 +949,9 @@ class TKqpExecuterBase : public TActorBootstrapped { 
settings.SetLockTxId(*lockTxId); settings.SetLockNodeId(SelfId().NodeId()); } + if (!settings.GetInconsistentTx() && !settings.GetIsOlap()) { + ActorIdToProto(BufferActorId, settings.MutableBufferActorId()); + } output.SinkSettings.ConstructInPlace(); output.SinkSettings->PackFrom(settings); } else { @@ -1996,6 +2008,8 @@ class TKqpExecuterBase : public TActorBootstrapped { protected: IKqpGateway::TExecPhysicalRequest Request; + TActorId BufferActorId; + IKqpTransactionManagerPtr TxManager; const TString Database; const TIntrusiveConstPtr UserToken; TKqpRequestCounters::TPtr Counters; @@ -2065,7 +2079,7 @@ IActor* CreateKqpDataExecuter(IKqpGateway::TExecPhysicalRequest&& request, const NYql::NDq::IDqAsyncIoFactory::TPtr asyncIoFactory, const TActorId& creator, const TIntrusivePtr& userRequestContext, ui32 statementResultIndex, const std::optional& federatedQuerySetup, const TGUCSettings::TPtr& GUCSettings, - const TShardIdToTableInfoPtr& shardIdToTableInfo); + const TShardIdToTableInfoPtr& shardIdToTableInfo, const IKqpTransactionManagerPtr& txManager, const TActorId bufferActorId); IActor* CreateKqpScanExecuter(IKqpGateway::TExecPhysicalRequest&& request, const TString& database, const TIntrusiveConstPtr& userToken, TKqpRequestCounters::TPtr counters, diff --git a/ydb/core/kqp/executer_actor/kqp_planner.cpp b/ydb/core/kqp/executer_actor/kqp_planner.cpp index 94a6d6992fec..ba6cc4adacff 100644 --- a/ydb/core/kqp/executer_actor/kqp_planner.cpp +++ b/ydb/core/kqp/executer_actor/kqp_planner.cpp @@ -52,6 +52,23 @@ void BuildInitialTaskResources(const TKqpTasksGraph& graph, ui64 taskId, TTaskRe ret.HeavyProgram = opts.GetHasMapJoin(); } +bool NeedToRunLocally(const TTask& task) { + for (const auto& output : task.Outputs) { + if (output.Type == TTaskOutputType::Sink && output.SinkType == KqpTableSinkName) { + YQL_ENSURE(output.SinkSettings); + const google::protobuf::Any& settingsAny = *output.SinkSettings; + YQL_ENSURE(settingsAny.Is()); + 
NKikimrKqp::TKqpTableSinkSettings settings; + YQL_ENSURE(settingsAny.UnpackTo(&settings)); + if (ActorIdFromProto(settings.GetBufferActorId())) { + // We need to run compute actor locally if it uses buffer actor. + return true; + } + } + } + return false; +} + bool LimitCPU(TIntrusivePtr ctx) { return ctx->PoolId && ctx->PoolConfig.has_value() && ctx->PoolConfig->TotalCpuLimitPercentPerNode > 0; } @@ -420,7 +437,12 @@ std::unique_ptr TKqpPlanner::AssignTasksToNodes() { for(ui64 taskId: group.TaskIds) { auto [it, success] = alreadyAssigned.emplace(taskId, group.NodeId); if (success) { - TasksPerNode[group.NodeId].push_back(taskId); + if (NeedToRunLocally(TasksGraph.GetTask(taskId))) { + const ui64 selfNodeId = ExecuterId.NodeId(); + TasksPerNode[selfNodeId].push_back(taskId); + } else { + TasksPerNode[group.NodeId].push_back(taskId); + } } } } @@ -466,7 +488,7 @@ TString TKqpPlanner::ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize) .WithSpilling = WithSpilling, .StatsMode = GetDqStatsMode(StatsMode), .Deadline = Deadline, - .ShareMailbox = (computeTasksSize <= 1), + .ShareMailbox = (computeTasksSize <= 1) || NeedToRunLocally(task), .RlPath = Nothing(), .BlockTrackingMode = BlockTrackingMode }); diff --git a/ydb/core/kqp/expr_nodes/kqp_expr_nodes.json b/ydb/core/kqp/expr_nodes/kqp_expr_nodes.json index 63ec4a7dc929..b8f78803d853 100644 --- a/ydb/core/kqp/expr_nodes/kqp_expr_nodes.json +++ b/ydb/core/kqp/expr_nodes/kqp_expr_nodes.json @@ -548,7 +548,9 @@ {"Index": 0, "Name": "Table", "Type": "TKqpTable"}, {"Index": 1, "Name": "InconsistentWrite", "Type": "TCoAtom"}, {"Index": 2, "Name": "Mode", "Type": "TCoAtom"}, - {"Index": 3, "Name": "Settings", "Type": "TCoNameValueTupleList", "Optional": true} + {"Index": 3, "Name": "Priority", "Type": "TCoAtom"}, + {"Index": 4, "Name": "TableType", "Type": "TCoAtom"}, + {"Index": 5, "Name": "Settings", "Type": "TCoNameValueTupleList", "Optional": true} ] }, { diff --git a/ydb/core/kqp/host/kqp_type_ann.cpp 
b/ydb/core/kqp/host/kqp_type_ann.cpp index 36249024fea4..c1eac4a75c47 100644 --- a/ydb/core/kqp/host/kqp_type_ann.cpp +++ b/ydb/core/kqp/host/kqp_type_ann.cpp @@ -1859,7 +1859,7 @@ TStatus AnnotateKqpSinkEffect(const TExprNode::TPtr& node, TExprContext& ctx) { } TStatus AnnotateTableSinkSettings(const TExprNode::TPtr& input, TExprContext& ctx) { - if (!EnsureMinMaxArgsCount(*input, 4, 5, ctx)) { + if (!EnsureMinMaxArgsCount(*input, 5, 6, ctx)) { return TStatus::Error; } input->SetTypeAnn(ctx.MakeType()); diff --git a/ydb/core/kqp/opt/kqp_opt_build_txs.cpp b/ydb/core/kqp/opt/kqp_opt_build_txs.cpp index f15d85253640..fecacf7ac09d 100644 --- a/ydb/core/kqp/opt/kqp_opt_build_txs.cpp +++ b/ydb/core/kqp/opt/kqp_opt_build_txs.cpp @@ -560,7 +560,7 @@ class TKqpBuildTxsTransformer : public TSyncTransformerBase { } if (!query.Effects().Empty()) { - auto collectedEffects = CollectEffects(query.Effects(), ctx); + auto collectedEffects = CollectEffects(query.Effects(), ctx, *KqpCtx); for (auto& effects : collectedEffects) { auto tx = BuildTx(effects.Ptr(), ctx, /* isPrecompute */ false); @@ -585,11 +585,12 @@ class TKqpBuildTxsTransformer : public TSyncTransformerBase { } private: - TVector CollectEffects(const TExprList& list, TExprContext& ctx) { + TVector CollectEffects(const TExprList& list, TExprContext& ctx, TKqpOptimizeContext& kqpCtx) { struct TEffectsInfo { enum class EType { KQP_EFFECT, KQP_SINK, + KQP_BATCH_SINK, EXTERNAL_SINK, }; @@ -617,23 +618,38 @@ class TKqpBuildTxsTransformer : public TSyncTransformerBase { effectsInfos.back().Type = TEffectsInfo::EType::EXTERNAL_SINK; effectsInfos.back().Exprs.push_back(expr.Ptr()); } else { - // Two table sinks can't be executed in one physical transaction if they write into one table. 
- const TStringBuf tablePathId = sinkSettings.Cast().Table().PathId().Value(); - - auto it = std::find_if( - std::begin(effectsInfos), - std::end(effectsInfos), - [&tablePathId](const auto& effectsInfo) { - return effectsInfo.Type == TEffectsInfo::EType::KQP_SINK - && !effectsInfo.TablesPathIds.contains(tablePathId); - }); - if (it == std::end(effectsInfos)) { - effectsInfos.emplace_back(); - it = std::prev(std::end(effectsInfos)); - it->Type = TEffectsInfo::EType::KQP_SINK; + // Two table sinks can't be executed in one physical transaction if they write into same table and have same priority. + + const auto& tableDescription = kqpCtx.Tables->ExistingTable(kqpCtx.Cluster, sinkSettings.Cast().Table().Path()); + if (tableDescription.Metadata->Kind == EKikimrTableKind::Olap) { + const TStringBuf tablePathId = sinkSettings.Cast().Table().PathId().Value(); + + auto it = std::find_if( + std::begin(effectsInfos), + std::end(effectsInfos), + [&tablePathId](const auto& effectsInfo) { + return effectsInfo.Type == TEffectsInfo::EType::KQP_SINK + && !effectsInfo.TablesPathIds.contains(tablePathId); + }); + if (it == std::end(effectsInfos)) { + effectsInfos.emplace_back(); + it = std::prev(std::end(effectsInfos)); + it->Type = TEffectsInfo::EType::KQP_SINK; + } + it->TablesPathIds.insert(tablePathId); + it->Exprs.push_back(expr.Ptr()); + } else { + auto it = std::find_if( + std::begin(effectsInfos), + std::end(effectsInfos), + [](const auto& effectsInfo) { return effectsInfo.Type == TEffectsInfo::EType::KQP_BATCH_SINK; }); + if (it == std::end(effectsInfos)) { + effectsInfos.emplace_back(); + it = std::prev(std::end(effectsInfos)); + it->Type = TEffectsInfo::EType::KQP_BATCH_SINK; + } + it->Exprs.push_back(expr.Ptr()); } - it->TablesPathIds.insert(tablePathId); - it->Exprs.push_back(expr.Ptr()); } } else { // Table effects are executed all in one physical transaction. 
diff --git a/ydb/core/kqp/opt/kqp_opt_effects.cpp b/ydb/core/kqp/opt/kqp_opt_effects.cpp index 0c0e818d6853..2dd9dd91ee40 100644 --- a/ydb/core/kqp/opt/kqp_opt_effects.cpp +++ b/ydb/core/kqp/opt/kqp_opt_effects.cpp @@ -232,7 +232,7 @@ TCoAtomList BuildKeyColumnsList(const TKikimrTableDescription& table, TPositionH } TDqStage RebuildPureStageWithSink(TExprBase expr, const TKqpTable& table, - const bool allowInconsistentWrites, const TStringBuf mode, TExprContext& ctx) { + const bool allowInconsistentWrites, const TStringBuf mode, const i64 order, const bool isOlap, TExprContext& ctx) { Y_DEBUG_ABORT_UNLESS(IsDqPureExpr(expr)); return Build(ctx, expr.Pos()) @@ -257,6 +257,8 @@ TDqStage RebuildPureStageWithSink(TExprBase expr, const TKqpTable& table, ? ctx.NewAtom(expr.Pos(), "true") : ctx.NewAtom(expr.Pos(), "false")) .Mode(ctx.NewAtom(expr.Pos(), mode)) + .Priority(ctx.NewAtom(expr.Pos(), ToString(order))) + .TableType(ctx.NewAtom(expr.Pos(), isOlap ? "olap" : "oltp")) .Settings() .Build() .Build() @@ -296,7 +298,7 @@ TDqPhyPrecompute BuildPrecomputeStage(TExprBase expr, TExprContext& ctx) { } bool BuildUpsertRowsEffect(const TKqlUpsertRows& node, TExprContext& ctx, const TKqpOptimizeContext& kqpCtx, - const TCoArgument& inputArg, TMaybeNode& stageInput, TMaybeNode& effect, bool& sinkEffect) + const TCoArgument& inputArg, TMaybeNode& stageInput, TMaybeNode& effect, bool& sinkEffect, const i64 order) { const auto& table = kqpCtx.Tables->ExistingTable(kqpCtx.Cluster, node.Table().Path()); @@ -306,12 +308,14 @@ bool BuildUpsertRowsEffect(const TKqlUpsertRows& node, TExprContext& ctx, const } sinkEffect = NeedSinks(table, kqpCtx) || (kqpCtx.IsGenericQuery() && settings.AllowInconsistentWrites); + const bool isOlap = (table.Metadata->Kind == EKikimrTableKind::Olap); + const i64 priority = isOlap ? 
0 : order; if (IsDqPureExpr(node.Input())) { if (sinkEffect) { stageInput = RebuildPureStageWithSink( node.Input(), node.Table(), - settings.AllowInconsistentWrites, settings.Mode, ctx); + settings.AllowInconsistentWrites, settings.Mode, priority, isOlap, ctx); effect = Build(ctx, node.Pos()) .Stage(stageInput.Cast().Ptr()) .SinkIndex().Build("0") @@ -352,6 +356,8 @@ bool BuildUpsertRowsEffect(const TKqlUpsertRows& node, TExprContext& ctx, const ? ctx.NewAtom(node.Pos(), "true") : ctx.NewAtom(node.Pos(), "false")) .Mode(ctx.NewAtom(node.Pos(), settings.Mode)) + .Priority(ctx.NewAtom(node.Pos(), ToString(priority))) + .TableType(ctx.NewAtom(node.Pos(), isOlap ? "olap" : "oltp")) .Settings() .Build() .Build() @@ -448,16 +454,17 @@ bool BuildUpsertRowsEffect(const TKqlUpsertRows& node, TExprContext& ctx, const } bool BuildDeleteRowsEffect(const TKqlDeleteRows& node, TExprContext& ctx, const TKqpOptimizeContext& kqpCtx, - const TCoArgument& inputArg, TMaybeNode& stageInput, TMaybeNode& effect, bool& sinkEffect) + const TCoArgument& inputArg, TMaybeNode& stageInput, TMaybeNode& effect, bool& sinkEffect, const i64 order) { const auto& table = kqpCtx.Tables->ExistingTable(kqpCtx.Cluster, node.Table().Path()); sinkEffect = NeedSinks(table, kqpCtx); - + const bool isOlap = (table.Metadata->Kind == EKikimrTableKind::Olap); + const i64 priority = isOlap ? 
0 : order; if (IsDqPureExpr(node.Input())) { if (sinkEffect) { const auto keyColumns = BuildKeyColumnsList(table, node.Pos(), ctx); - stageInput = RebuildPureStageWithSink(node.Input(), node.Table(), false, "delete", ctx); + stageInput = RebuildPureStageWithSink(node.Input(), node.Table(), false, "delete", priority, isOlap, ctx); effect = Build(ctx, node.Pos()) .Stage(stageInput.Cast().Ptr()) .SinkIndex().Build("0") @@ -494,6 +501,8 @@ bool BuildDeleteRowsEffect(const TKqlDeleteRows& node, TExprContext& ctx, const .Table(node.Table()) .InconsistentWrite(ctx.NewAtom(node.Pos(), "false")) .Mode(ctx.NewAtom(node.Pos(), "delete")) + .Priority(ctx.NewAtom(node.Pos(), ToString(priority))) + .TableType(ctx.NewAtom(node.Pos(), isOlap ? "olap" : "oltp")) .Settings() .Build() .Build() @@ -584,6 +593,7 @@ bool BuildEffects(TPositionHandle pos, const TVector& effects, TVector newSinkEffects; newEffects.reserve(effects.size()); newSinkEffects.reserve(effects.size()); + i64 order = builtEffects.size(); for (const auto& effect : effects) { TMaybeNode newEffect; @@ -596,15 +606,17 @@ bool BuildEffects(TPositionHandle pos, const TVector& effects, .Done(); if (auto maybeUpsertRows = effect.Maybe()) { - if (!BuildUpsertRowsEffect(maybeUpsertRows.Cast(), ctx, kqpCtx, inputArg, input, newEffect, sinkEffect)) { + if (!BuildUpsertRowsEffect(maybeUpsertRows.Cast(), ctx, kqpCtx, inputArg, input, newEffect, sinkEffect, order)) { return false; } + ++order; } if (auto maybeDeleteRows = effect.Maybe()) { - if (!BuildDeleteRowsEffect(maybeDeleteRows.Cast(), ctx, kqpCtx, inputArg, input, newEffect, sinkEffect)) { + if (!BuildDeleteRowsEffect(maybeDeleteRows.Cast(), ctx, kqpCtx, inputArg, input, newEffect, sinkEffect, order)) { return false; } + ++order; } if (input) { @@ -696,7 +708,6 @@ TMaybeNode BuildEffects(const TKqlQuery& query, TExprContext& ctx, const TKqpOptimizeContext& kqpCtx) { TVector builtEffects; - if constexpr (GroupEffectsByTable) { TMap> tableEffectsMap; ExploreEffectLists( 
diff --git a/ydb/core/kqp/query_compiler/kqp_query_compiler.cpp b/ydb/core/kqp/query_compiler/kqp_query_compiler.cpp index 2e2143decdf5..140df1d43ad2 100644 --- a/ydb/core/kqp/query_compiler/kqp_query_compiler.cpp +++ b/ydb/core/kqp/query_compiler/kqp_query_compiler.cpp @@ -1140,6 +1140,8 @@ class TKqpQueryCompiler : public IKqpQueryCompiler { if (const auto inconsistentWrite = settings.InconsistentWrite().Cast(); inconsistentWrite.StringValue() == "true") { settingsProto.SetInconsistentTx(true); } + settingsProto.SetIsOlap(settings.TableType().Cast().StringValue() == "olap"); + settingsProto.SetPriority(FromString(settings.Priority().Cast().StringValue())); if (settings.Mode().Cast().StringValue() == "replace") { settingsProto.SetType(NKikimrKqp::TKqpTableSinkSettings::MODE_REPLACE); diff --git a/ydb/core/kqp/runtime/kqp_write_actor.cpp b/ydb/core/kqp/runtime/kqp_write_actor.cpp index 8085609e395d..8b052349e4ca 100644 --- a/ydb/core/kqp/runtime/kqp_write_actor.cpp +++ b/ydb/core/kqp/runtime/kqp_write_actor.cpp @@ -8,14 +8,16 @@ #include #include #include +#include +#include #include +#include #include #include #include #include #include #include -#include #include #include #include @@ -37,65 +39,103 @@ namespace { return delay; } - struct TLockInfo { - bool AddAndCheckLock(const NKikimrDataEvents::TLock& lock) { - if (!Lock) { - Lock = lock; - return true; - } else { - return lock.GetLockId() == Lock->GetLockId() - && lock.GetDataShard() == Lock->GetDataShard() - && lock.GetSchemeShard() == Lock->GetSchemeShard() - && lock.GetPathId() == Lock->GetPathId() - && lock.GetGeneration() == Lock->GetGeneration() - && lock.GetCounter() == Lock->GetCounter(); + NKikimrDataEvents::TEvWrite::TOperation::EOperationType GetOperation(NKikimrKqp::TKqpTableSinkSettings::EType type) { + switch (type) { + case NKikimrKqp::TKqpTableSinkSettings::MODE_REPLACE: + return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_REPLACE; + case NKikimrKqp::TKqpTableSinkSettings::MODE_UPSERT: + 
return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_UPSERT; + case NKikimrKqp::TKqpTableSinkSettings::MODE_INSERT: + return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_INSERT; + case NKikimrKqp::TKqpTableSinkSettings::MODE_DELETE: + return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_DELETE; + case NKikimrKqp::TKqpTableSinkSettings::MODE_UPDATE: + return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_UPDATE; + default: + return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_UNSPECIFIED; + } + } + + void FillEvWritePrepare(NKikimr::NEvents::TDataEvents::TEvWrite* evWrite, ui64 shardId, ui64 txId, const NKikimr::NKqp::IKqpTransactionManagerPtr& txManager) { + evWrite->Record.SetTxId(txId); + auto* protoLocks = evWrite->Record.MutableLocks(); + protoLocks->SetOp(NKikimrDataEvents::TKqpLocks::Commit); + + const auto prepareSettings = txManager->GetPrepareTransactionInfo(); + if (!prepareSettings.ArbiterColumnShard) { + for (const ui64 sendingShardId : prepareSettings.SendingShards) { + protoLocks->AddSendingShards(sendingShardId); + } + for (const ui64 receivingShardId : prepareSettings.ReceivingShards) { + protoLocks->AddReceivingShards(receivingShardId); + } + if (prepareSettings.Arbiter) { + protoLocks->SetArbiterShard(*prepareSettings.Arbiter); + } + } else if (prepareSettings.ArbiterColumnShard == shardId) { + protoLocks->SetArbiterColumnShard(*prepareSettings.ArbiterColumnShard); + for (const ui64 sendingShardId : prepareSettings.SendingShards) { + protoLocks->AddSendingShards(sendingShardId); + } + for (const ui64 receivingShardId : prepareSettings.ReceivingShards) { + protoLocks->AddReceivingShards(receivingShardId); + } + } else { + protoLocks->SetArbiterColumnShard(*prepareSettings.ArbiterColumnShard); + protoLocks->AddSendingShards(*prepareSettings.ArbiterColumnShard); + protoLocks->AddReceivingShards(*prepareSettings.ArbiterColumnShard); + if (prepareSettings.SendingShards.contains(shardId)) { + 
protoLocks->AddSendingShards(shardId); + } + if (prepareSettings.ReceivingShards.contains(shardId)) { + protoLocks->AddReceivingShards(shardId); } } - const std::optional& GetLock() const { - return Lock; + const auto locks = txManager->GetLocks(shardId); + for (const auto& lock : locks) { + *protoLocks->AddLocks() = lock; } + } - private: - std::optional Lock; - }; + void FillEvWriteRollback(NKikimr::NEvents::TDataEvents::TEvWrite* evWrite, ui64 shardId, const NKikimr::NKqp::IKqpTransactionManagerPtr& txManager) { + auto* protoLocks = evWrite->Record.MutableLocks(); + protoLocks->SetOp(NKikimrDataEvents::TKqpLocks::Rollback); + + const auto locks = txManager->GetLocks(shardId); + for (const auto& lock : locks) { + *protoLocks->AddLocks() = lock; + } + } } namespace NKikimr { namespace NKqp { -class TKqpDirectWriteActor : public TActorBootstrapped, public NYql::NDq::IDqComputeActorAsyncOutput { - using TBase = TActorBootstrapped; +struct IKqpTableWriterCallbacks { + virtual ~IKqpTableWriterCallbacks() = default; - class TResumeNotificationManager { - public: - TResumeNotificationManager(TKqpDirectWriteActor& writer) - : Writer(writer) { - CheckMemory(); - } + // Ready to accept writes + virtual void OnReady() = 0; - void CheckMemory() { - const auto freeSpace = Writer.GetFreeSpace(); - const auto targetMemory = Writer.MemoryLimit / 2; - if (freeSpace >= targetMemory && targetMemory > LastFreeMemory) { - YQL_ENSURE(freeSpace > 0); - Writer.ResumeExecution(); - } - LastFreeMemory = freeSpace; - } + // EvWrite statuses + virtual void OnPrepared(IKqpTransactionManager::TPrepareResult&& preparedInfo, ui64 dataSize) = 0; + virtual void OnCommitted(ui64 shardId, ui64 dataSize) = 0; + virtual void OnMessageAcknowledged(ui64 dataSize) = 0; - private: - TKqpDirectWriteActor& Writer; - i64 LastFreeMemory = std::numeric_limits::max(); - }; + virtual void OnError(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues) = 0; +}; - 
friend class TResumeNotificationManager; + +class TKqpTableWriteActor : public TActorBootstrapped { + using TBase = TActorBootstrapped; struct TEvPrivate { enum EEv { EvShardRequestTimeout = EventSpaceBegin(TKikimrEvents::ES_PRIVATE), EvResolveRequestPlanned, + EvTerminate, }; struct TEvShardRequestTimeout : public TEventLocal { @@ -108,114 +148,177 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu struct TEvResolveRequestPlanned : public TEventLocal { }; + + struct TEvTerminate : public TEventLocal { + }; + }; + + enum class EMode { + WRITE, + PREPARE, + COMMIT, + IMMEDIATE_COMMIT, }; public: - TKqpDirectWriteActor( - NKikimrKqp::TKqpTableSinkSettings&& settings, - NYql::NDq::TDqAsyncIoFactory::TSinkArguments&& args, - TIntrusivePtr counters) - : LogPrefix(TStringBuilder() << "TxId: " << args.TxId << ", task: " << args.TaskId << ". ") - , Settings(std::move(settings)) - , MessageSettings(GetWriteActorSettings()) - , OutputIndex(args.OutputIndex) - , Callbacks(args.Callback) - , Counters(counters) - , TypeEnv(args.TypeEnv) - , Alloc(args.Alloc) - , TxId(args.TxId) - , TableId( - Settings.GetTable().GetOwnerId(), - Settings.GetTable().GetTableId(), - Settings.GetTable().GetVersion()) - , FinalTx( - Settings.GetFinalTx()) - , ImmediateTx( - Settings.GetImmediateTx()) - , InconsistentTx( - Settings.GetInconsistentTx()) - , MemoryLimit(MessageSettings.InFlightMemoryLimitPerActorBytes) - , WriteActorSpan(TWilsonKqp::WriteActor, NWilson::TTraceId(args.TraceId), "WriteActor") + TKqpTableWriteActor( + IKqpTableWriterCallbacks* callbacks, + const TTableId& tableId, + const TStringBuf tablePath, + const ui64 lockTxId, + const ui64 lockNodeId, + const bool inconsistentTx, + const NMiniKQL::TTypeEnvironment& typeEnv, + std::shared_ptr alloc, + const IKqpTransactionManagerPtr& txManager, + const TActorId sessionActorId) + : TypeEnv(typeEnv) + , Alloc(alloc) + , TableId(tableId) + , TablePath(tablePath) + , LockTxId(lockTxId) + , LockNodeId(lockNodeId) + , 
InconsistentTx(inconsistentTx) + , Callbacks(callbacks) + , TxManager(txManager ? txManager : CreateKqpTransactionManager(/* collectOnly= */ true)) { - YQL_ENSURE(std::holds_alternative(TxId)); - YQL_ENSURE(!ImmediateTx); - EgressStats.Level = args.StatsLevel; - - Counters->WriteActorsCount->Inc(); + LogPrefix = TStringBuilder() << "SessionActorId: " << sessionActorId; + try { + ShardedWriteController = CreateShardedWriteController( + TShardedWriteControllerSettings { + .MemoryLimitTotal = MessageSettings.InFlightMemoryLimitPerActorBytes, + .MemoryLimitPerMessage = MessageSettings.MemoryLimitPerMessageBytes, + .MaxBatchesPerMessage = MessageSettings.MaxBatchesPerMessage, + }, + TypeEnv, + Alloc); + } catch (...) { + RuntimeError( + CurrentExceptionMessage(), + NYql::NDqProto::StatusIds::INTERNAL_ERROR); + } } void Bootstrap() { - LogPrefix = TStringBuilder() << "SelfId: " << this->SelfId() << ", " << LogPrefix; + LogPrefix = TStringBuilder() << "SelfId: " << this->SelfId() << ", Table: `" << TablePath << "` (" << TableId << "), "<< LogPrefix; ResolveTable(); - Become(&TKqpDirectWriteActor::StateFunc); + Become(&TKqpTableWriteActor::StateProcessing); } - static constexpr char ActorName[] = "KQP_WRITE_ACTOR"; + static constexpr char ActorName[] = "KQP_TABLE_WRITE_ACTOR"; -private: - virtual ~TKqpDirectWriteActor() { + i64 GetMemory() const { + return IsReady() + ? 
ShardedWriteController->GetMemory() + : 0; } - void CommitState(const NYql::NDqProto::TCheckpoint&) final {}; - void LoadState(const NYql::NDq::TSinkState&) final {}; + bool IsReady() const { + return ShardedWriteController->IsReady(); + } - ui64 GetOutputIndex() const final { - return OutputIndex; + bool IsEmpty() const { + return ShardedWriteController->IsEmpty(); } - const NYql::NDq::TDqAsyncStats& GetEgressStats() const final { - return EgressStats; + bool IsOlap() const { + YQL_ENSURE(SchemeEntry); + return SchemeEntry->Kind == NSchemeCache::TSchemeCacheNavigate::KindColumnTable; } - i64 GetFreeSpace() const final { - const i64 result = (ShardedWriteController && !IsResolving()) - ? MemoryLimit - ShardedWriteController->GetMemory() - : std::numeric_limits::min(); // Can't use zero here because compute can use overcommit! - return result; + TVector GetLocks() const { + return TxManager->GetLocks(); } - TMaybe ExtraData() override { - NKikimrKqp::TEvKqpOutputActorResultInfo resultInfo; - for (const auto& [_, lockInfo] : LocksInfo) { - if (const auto& lock = lockInfo.GetLock(); lock) { - resultInfo.AddLocks()->CopyFrom(*lock); - } - } - google::protobuf::Any result; - result.PackFrom(resultInfo); - return result; + TVector GetShardsIds() const { + return ShardedWriteController->GetShardsIds(); } - void SendData(NMiniKQL::TUnboxedValueBatch&& data, i64 size, const TMaybe&, bool finished) final { - YQL_ENSURE(!data.IsWide(), "Wide stream is not supported yet"); - YQL_ENSURE(!Finished); - Finished = finished; - EgressStats.Resume(); + std::optional GetShardsCount() const { + return InconsistentTx + ? 
std::nullopt + : std::optional(ShardedWriteController->GetShardsCount()); + } - CA_LOG_D("New data: size=" << size << ", finished=" << finished << ", used memory=" << ShardedWriteController->GetMemory() << "."); + using TWriteToken = IShardedWriteController::TWriteToken; + + TWriteToken Open( + NKikimrDataEvents::TEvWrite::TOperation::EOperationType operationType, + TVector&& columnsMetadata, + i64 priority) { + YQL_ENSURE(!Closed); + auto token = ShardedWriteController->Open( + TableId, + operationType, + std::move(columnsMetadata), + priority); + CA_LOG_D("Open: token=" << token); + return token; + } + void Write(TWriteToken token, const NMiniKQL::TUnboxedValueBatch& data) { + YQL_ENSURE(!data.IsWide(), "Wide stream is not supported yet"); + YQL_ENSURE(!Closed); YQL_ENSURE(ShardedWriteController); + CA_LOG_D("Write: token=" << token); try { - ShardedWriteController->AddData(std::move(data)); - if (Finished) { - ShardedWriteController->Close(); - } + ShardedWriteController->Write(token, data); + UpdateShards(); + } catch (...) { + RuntimeError( + CurrentExceptionMessage(), + NYql::NDqProto::StatusIds::INTERNAL_ERROR); + } + } + + void Close(TWriteToken token) { + YQL_ENSURE(!Closed); + YQL_ENSURE(ShardedWriteController); + CA_LOG_D("Close: token=" << token); + try { + ShardedWriteController->Close(token); + UpdateShards(); } catch (...) { RuntimeError( CurrentExceptionMessage(), NYql::NDqProto::StatusIds::INTERNAL_ERROR); } - ProcessBatches(); } - STFUNC(StateFunc) { + void Close() { + YQL_ENSURE(!Closed); + YQL_ENSURE(ShardedWriteController); + YQL_ENSURE(ShardedWriteController->IsAllWritesClosed()); + Closed = true; + ShardedWriteController->Close(); + } + + void UpdateShards() { + // TODO: Maybe there are better ways to initialize new shards... 
+ for (const auto& shardInfo : ShardedWriteController->GetPendingShards()) { + TxManager->AddShard(shardInfo.ShardId, IsOlap(), TablePath); + TxManager->AddAction(shardInfo.ShardId, IKqpTransactionManager::EAction::WRITE); + if (shardInfo.HasRead) { + TxManager->AddAction(shardInfo.ShardId, IKqpTransactionManager::EAction::READ); + } + } + } + + bool IsClosed() const { + return Closed; + } + + bool IsFinished() const { + return IsClosed() && ShardedWriteController->IsAllWritesFinished(); + } + + STFUNC(StateProcessing) { try { switch (ev->GetTypeRewrite()) { hFunc(NKikimr::NEvents::TDataEvents::TEvWriteResult, Handle); hFunc(TEvTxProxySchemeCache::TEvNavigateKeySetResult, Handle); hFunc(TEvTxProxySchemeCache::TEvResolveKeySetResult, Handle); hFunc(TEvPipeCache::TEvDeliveryProblem, Handle); - IgnoreFunc(TEvTxUserProxy::TEvAllocateTxIdResult); hFunc(TEvPrivate::TEvShardRequestTimeout, Handle); hFunc(TEvPrivate::TEvResolveRequestPlanned, Handle); IgnoreFunc(TEvInterconnect::TEvNodeConnected); @@ -226,6 +329,16 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu } } + STFUNC(StateTerminating) { + try { + switch (ev->GetTypeRewrite()) { + hFunc(TEvPrivate::TEvTerminate, Handle); + } + } catch (const yexception& e) { + CA_LOG_W(e.what()); + } + } + bool IsResolving() const { return ResolveAttempts > 0; } @@ -248,16 +361,15 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu } void ResolveTable() { - Counters->WriteActorsShardResolve->Inc(); SchemeEntry.reset(); SchemeRequest.reset(); if (ResolveAttempts++ >= MessageSettings.MaxResolveAttempts) { CA_LOG_E(TStringBuilder() - << "Too many table resolve attempts for table " << TableId << "."); + << "Too many table resolve attempts for table `" << TablePath << "` (" << TableId << ")."); RuntimeError( TStringBuilder() - << "Too many table resolve attempts for table `" << Settings.GetTable().GetPath() << "`.", + << "Too many table resolve attempts for table `" << TablePath << "`.", 
NYql::NDqProto::StatusIds::SCHEME_ERROR); return; } @@ -272,14 +384,12 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu entry.ShowPrivatePath = true; request->ResultSet.emplace_back(entry); - WriteActorStateSpan = NWilson::TSpan(TWilsonKqp::WriteActorTableNavigate, WriteActorSpan.GetTraceId(), - "WaitForShardsResolve", NWilson::EFlags::AUTO_END); - - Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvInvalidateTable(TableId, {}), 0, 0, WriteActorSpan.GetTraceId()); - Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvNavigateKeySet(request), 0, 0, WriteActorSpan.GetTraceId()); + Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvInvalidateTable(TableId, {}), 0, 0); + Send(MakeSchemeCacheID(), new TEvTxProxySchemeCache::TEvNavigateKeySet(request), 0, 0); } void Handle(TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr& ev) { + YQL_ENSURE(!SchemeRequest || InconsistentTx); auto& resultSet = ev->Get()->Request->ResultSet; YQL_ENSURE(resultSet.size() == 1); @@ -302,7 +412,6 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu } if (SchemeEntry->Kind == NSchemeCache::TSchemeCacheNavigate::KindColumnTable) { - YQL_ENSURE(!ImmediateTx); Prepare(); } else { ResolveShards(); @@ -335,7 +444,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu request->ResultSet.emplace_back(std::move(keyRange)); TAutoPtr resolveReq(new TEvTxProxySchemeCache::TEvResolveKeySet(request)); - Send(MakeSchemeCacheID(), resolveReq.Release(), 0, 0, WriteActorSpan.GetTraceId()); + Send(MakeSchemeCacheID(), resolveReq.Release()); } void Handle(TEvTxProxySchemeCache::TEvResolveKeySetResult::TPtr& ev) { @@ -387,14 +496,15 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu << getIssues().ToOneLineString()); RuntimeError( TStringBuilder() << "Unspecified error for table `" - << SchemeEntry->TableId.PathId.ToString() << "`. " + << TablePath << "`. 
" << getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::UNSPECIFIED, getIssues()); return; } case NKikimrDataEvents::TEvWriteResult::STATUS_PREPARED: { - YQL_ENSURE(false); + ProcessWritePreparedShard(ev); + return; } case NKikimrDataEvents::TEvWriteResult::STATUS_COMPLETED: { ProcessWriteCompletedShard(ev); @@ -408,7 +518,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu << getIssues().ToOneLineString()); RuntimeError( TStringBuilder() << "Aborted for table `" - << SchemeEntry->TableId.PathId.ToString() << "`. " + << TablePath << "`. " << getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::ABORTED, getIssues()); @@ -428,7 +538,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu } else { RuntimeError( TStringBuilder() << "Internal error for table `" - << SchemeEntry->TableId.PathId.ToString() << "`. " + << TablePath << "`. " << getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::INTERNAL_ERROR, getIssues()); @@ -444,7 +554,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu RuntimeError( TStringBuilder() << "Disk space exhausted for table `" - << SchemeEntry->TableId.PathId.ToString() << "`. " + << TablePath << "`. " << getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::PRECONDITION_FAILED, getIssues()); @@ -461,7 +571,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu if (!InconsistentTx) { RuntimeError( TStringBuilder() << "Tablet " << ev->Get()->Record.GetOrigin() << " is overloaded. Table `" - << SchemeEntry->TableId.PathId.ToString() << "`. " + << TablePath << "`. " << getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::OVERLOADED, getIssues()); @@ -476,7 +586,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu << getIssues().ToOneLineString()); RuntimeError( TStringBuilder() << "Cancelled request to table `" - << SchemeEntry->TableId.PathId.ToString() << "`." + << TablePath << "`." 
<< getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::CANCELLED, getIssues()); @@ -490,7 +600,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu << getIssues().ToOneLineString()); RuntimeError( TStringBuilder() << "Bad request. Table `" - << SchemeEntry->TableId.PathId.ToString() << "`. " + << TablePath << "`. " << getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::BAD_REQUEST, getIssues()); @@ -508,7 +618,7 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu } else { RuntimeError( TStringBuilder() << "Scheme changed. Table `" - << SchemeEntry->TableId.PathId.ToString() << "`. " + << TablePath << "`. " << getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::SCHEME_ERROR, getIssues()); @@ -521,9 +631,12 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu << " ShardID=" << ev->Get()->Record.GetOrigin() << "," << " Sink=" << this->SelfId() << "." << getIssues().ToOneLineString()); + + TxManager->BreakLock(ev->Get()->Record.GetOrigin()); + YQL_ENSURE(TxManager->BrokenLocks()); RuntimeError( TStringBuilder() << "Transaction locks invalidated. Table `" - << SchemeEntry->TableId.PathId.ToString() << "`. " + << TablePath << "`. 
" << getIssues().ToOneLineString(), NYql::NDqProto::StatusIds::ABORTED, getIssues()); @@ -532,10 +645,35 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu } } + void ProcessWritePreparedShard(NKikimr::NEvents::TDataEvents::TEvWriteResult::TPtr& ev) { + YQL_ENSURE(Mode == EMode::PREPARE); + const auto& record = ev->Get()->Record; + IKqpTransactionManager::TPrepareResult preparedInfo; + preparedInfo.ShardId = record.GetOrigin(); + preparedInfo.MinStep = record.GetMinStep(); + preparedInfo.MaxStep = record.GetMaxStep(); + + preparedInfo.Coordinator = 0; + if (record.DomainCoordinatorsSize()) { + auto domainCoordinators = TCoordinators(TVector(record.GetDomainCoordinators().begin(), + record.GetDomainCoordinators().end())); + preparedInfo.Coordinator = domainCoordinators.Select(*TxId); + } + + const auto result = ShardedWriteController->OnMessageAcknowledged( + ev->Get()->Record.GetOrigin(), ev->Cookie); + if (result) { + YQL_ENSURE(result->IsShardEmpty); + Callbacks->OnPrepared(std::move(preparedInfo), result->DataSize); + } + } + void ProcessWriteCompletedShard(NKikimr::NEvents::TDataEvents::TEvWriteResult::TPtr& ev) { + YQL_ENSURE(SchemeEntry); CA_LOG_D("Got completed result TxId=" << ev->Get()->Record.GetTxId() << ", TabletId=" << ev->Get()->Record.GetOrigin() << ", Cookie=" << ev->Cookie + << ", Mode=" << static_cast(Mode) << ", Locks=" << [&]() { TStringBuilder builder; for (const auto& lock : ev->Get()->Record.GetTxLocks()) { @@ -544,116 +682,124 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu return builder; }()); - OnMessageAcknowledged(ev->Get()->Record.GetOrigin(), ev->Cookie); - for (const auto& lock : ev->Get()->Record.GetTxLocks()) { - if (!LocksInfo[ev->Get()->Record.GetOrigin()].AddAndCheckLock(lock)) { + Y_ABORT_UNLESS(Mode == EMode::WRITE); + if (!TxManager->AddLock(ev->Get()->Record.GetOrigin(), lock)) { + YQL_ENSURE(TxManager->BrokenLocks()); + NYql::TIssues issues; + issues.AddIssue(*TxManager->GetLockIssue()); 
RuntimeError( TStringBuilder() << "Transaction locks invalidated. Table `" - << SchemeEntry->TableId.PathId.ToString() << "`.", + << TablePath << "`.", NYql::NDqProto::StatusIds::ABORTED, - NYql::TIssues{}); + issues); + return; } } - ProcessBatches(); + if (Mode == EMode::COMMIT) { + Callbacks->OnCommitted(ev->Get()->Record.GetOrigin(), 0); + return; + } + + const auto result = ShardedWriteController->OnMessageAcknowledged( + ev->Get()->Record.GetOrigin(), ev->Cookie); + if (result && result->IsShardEmpty && Mode == EMode::IMMEDIATE_COMMIT) { + Callbacks->OnCommitted(ev->Get()->Record.GetOrigin(), result->DataSize); + } else if (result) { + Callbacks->OnMessageAcknowledged(result->DataSize); + } } - void OnMessageAcknowledged(ui64 shardId, ui64 cookie) { - TResumeNotificationManager resumeNotificator(*this); - const auto removedDataSize = ShardedWriteController->OnMessageAcknowledged(shardId, cookie); - if (removedDataSize) { - EgressStats.Bytes += *removedDataSize; - EgressStats.Chunks++; - EgressStats.Splits++; - EgressStats.Resume(); + void SetPrepare(ui64 txId) { + CA_LOG_D("SetPrepare; txId=" << txId); + YQL_ENSURE(Mode == EMode::WRITE); + Mode = EMode::PREPARE; + TxId = txId; + ShardedWriteController->AddCoveringMessages(); + } - if (auto it = SendTime.find(shardId); it != std::end(SendTime)) { - Counters->WriteActorWritesLatencyHistogram->Collect((TInstant::Now() - it->second).MilliSeconds()); - SendTime.erase(it); - } - } - resumeNotificator.CheckMemory(); + void SetDistributedCommit() { + CA_LOG_D("SetDistributedCommit; txId=" << *TxId); + YQL_ENSURE(Mode == EMode::PREPARE); + Mode = EMode::COMMIT; } - void ProcessBatches() { - if (!ImmediateTx || Finished || GetFreeSpace() <= 0) { - SendBatchesToShards(); - } + void SetImmediateCommit() { + CA_LOG_D("SetImmediateCommit"); + YQL_ENSURE(Mode == EMode::WRITE); + Mode = EMode::IMMEDIATE_COMMIT; - if (Finished && ShardedWriteController->IsFinished()) { - CA_LOG_D("Write actor finished"); - 
Callbacks->OnAsyncOutputFinished(GetOutputIndex()); + if (ShardedWriteController->GetShardsCount() == 1) { + ShardedWriteController->AddCoveringMessages(); + } else { + YQL_ENSURE(ShardedWriteController->GetShardsCount() == 0); } } - void SendBatchesToShards() { - for (const size_t shardId : ShardedWriteController->GetPendingShards()) { - SendDataToShard(shardId); + void FlushBuffers() { + ShardedWriteController->FlushBuffers(); + UpdateShards(); + } + + void Flush() { + for (const auto& shardInfo : ShardedWriteController->GetPendingShards()) { + SendDataToShard(shardInfo.ShardId); } } void SendDataToShard(const ui64 shardId) { + YQL_ENSURE(Mode != EMode::COMMIT); + const auto metadata = ShardedWriteController->GetMessageMetadata(shardId); YQL_ENSURE(metadata); if (metadata->SendAttempts >= MessageSettings.MaxWriteAttempts) { CA_LOG_E("ShardId=" << shardId - << " for table '" << Settings.GetTable().GetPath() + << " for table '" << TablePath << "': retry limit exceeded." << " Sink=" << this->SelfId() << "."); RuntimeError( TStringBuilder() << "ShardId=" << shardId - << " for table '" << Settings.GetTable().GetPath() + << " for table '" << TablePath << "': retry limit exceeded.", NYql::NDqProto::StatusIds::UNAVAILABLE); return; } - auto evWrite = std::make_unique( - NKikimrDataEvents::TEvWrite::MODE_IMMEDIATE); + + const bool isPrepare = metadata->IsFinal && Mode == EMode::PREPARE; + const bool isImmediateCommit = metadata->IsFinal && Mode == EMode::IMMEDIATE_COMMIT; + + auto evWrite = std::make_unique(); + + evWrite->Record.SetTxMode(isPrepare + ? (TxManager->IsVolatile() + ? 
NKikimrDataEvents::TEvWrite::MODE_VOLATILE_PREPARE + : NKikimrDataEvents::TEvWrite::MODE_PREPARE) + : NKikimrDataEvents::TEvWrite::MODE_IMMEDIATE); - if (ImmediateTx && FinalTx && Finished && metadata->IsFinal) { - // Last immediate write (only for datashard) - if (LocksInfo[shardId].GetLock()) { - // multi immediate evwrite - auto* locks = evWrite->Record.MutableLocks(); - locks->SetOp(NKikimrDataEvents::TKqpLocks::Commit); - locks->AddSendingShards(shardId); - locks->AddReceivingShards(shardId); - *locks->AddLocks() = *LocksInfo.at(shardId).GetLock(); + if (isImmediateCommit) { + const auto locks = TxManager->GetLocks(shardId); + if (!locks.empty()) { + auto* protoLocks = evWrite->Record.MutableLocks(); + protoLocks->SetOp(NKikimrDataEvents::TKqpLocks::Commit); + protoLocks->AddSendingShards(shardId); + protoLocks->AddReceivingShards(shardId); + for (const auto& lock : locks) { + *protoLocks->AddLocks() = lock; + } } + } else if (isPrepare) { + YQL_ENSURE(TxId); + FillEvWritePrepare(evWrite.get(), shardId, *TxId, TxManager); } else if (!InconsistentTx) { - evWrite->SetLockId(Settings.GetLockTxId(), Settings.GetLockNodeId()); + evWrite->SetLockId(LockTxId, LockNodeId); } const auto serializationResult = ShardedWriteController->SerializeMessageToPayload(shardId, *evWrite); - YQL_ENSURE(serializationResult.TotalDataSize > 0); - - for (size_t payloadIndex : serializationResult.PayloadIndexes) { - evWrite->AddOperation( - GetOperation(), - { - Settings.GetTable().GetOwnerId(), - Settings.GetTable().GetTableId(), - Settings.GetTable().GetVersion(), - }, - ShardedWriteController->GetWriteColumnIds(), - payloadIndex, - ShardedWriteController->GetDataFormat()); - } - - if (metadata->SendAttempts == 0) { - Counters->WriteActorImmediateWrites->Inc(); - Counters->WriteActorWritesSizeHistogram->Collect(serializationResult.TotalDataSize); - Counters->WriteActorWritesOperationsHistogram->Collect(metadata->OperationsCount); - - SendTime[shardId] = TInstant::Now(); - } else { - 
Counters->WriteActorImmediateWritesRetries->Inc(); - } + YQL_ENSURE(isPrepare || isImmediateCommit || serializationResult.TotalDataSize > 0); - CA_LOG_D("Send EvWrite to ShardID=" << shardId << ", TxId=" << evWrite->Record.GetTxId() - << ", TxMode=" << evWrite->Record.GetTxMode() + CA_LOG_D("Send EvWrite to ShardID=" << shardId << ", isPrepare=" << isPrepare << ", isImmediateCommit=" << isImmediateCommit << ", TxId=" << evWrite->Record.GetTxId() << ", LockTxId=" << evWrite->Record.GetLockTxId() << ", LockNodeId=" << evWrite->Record.GetLockNodeId() << ", Locks= " << [&]() { TStringBuilder builder; @@ -663,12 +809,12 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu return builder; }() << ", Size=" << serializationResult.TotalDataSize << ", Cookie=" << metadata->Cookie - << ", OperationsCount=" << metadata->OperationsCount << ", IsFinal=" << metadata->IsFinal - << ", Attempts=" << metadata->SendAttempts); + << ", OperationsCount=" << evWrite->Record.OperationsSize() << ", IsFinal=" << metadata->IsFinal + << ", Attempts=" << metadata->SendAttempts << ", Mode=" << static_cast(Mode)); Send( PipeCacheId, new TEvPipeCache::TEvForward(evWrite.release(), shardId, true), - 0, + IEventHandle::FlagTrackDelivery, metadata->Cookie); ShardedWriteController->OnMessageSent(shardId, metadata->Cookie); @@ -685,25 +831,6 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu } } - NKikimrDataEvents::TEvWrite::TOperation::EOperationType GetOperation() { - switch (Settings.GetType()) { - case NKikimrKqp::TKqpTableSinkSettings::MODE_REPLACE: - return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_REPLACE; - case NKikimrKqp::TKqpTableSinkSettings::MODE_UPSERT: - return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_UPSERT; - case NKikimrKqp::TKqpTableSinkSettings::MODE_INSERT: - return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_INSERT; - case NKikimrKqp::TKqpTableSinkSettings::MODE_DELETE: - return 
NKikimrDataEvents::TEvWrite::TOperation::OPERATION_DELETE; - case NKikimrKqp::TKqpTableSinkSettings::MODE_UPDATE: - return NKikimrDataEvents::TEvWrite::TOperation::OPERATION_UPDATE; - default: - RuntimeError( - TStringBuilder() << "Unknown operation.", - NYql::NDqProto::StatusIds::INTERNAL_ERROR); - } - } - void RetryShard(const ui64 shardId, const std::optional ifCookieEqual) { const auto metadata = ShardedWriteController->GetMessageMetadata(shardId); if (!metadata || (ifCookieEqual && metadata->Cookie != ifCookieEqual)) { @@ -725,6 +852,10 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu RetryShard(ev->Get()->ShardId, ev->Cookie); } + void Handle(TEvPrivate::TEvTerminate::TPtr&) { + PassAway(); + } + void Handle(TEvPipeCache::TEvDeliveryProblem::TPtr& ev) { CA_LOG_W("TEvDeliveryProblem was received from tablet: " << ev->Get()->TabletId); if (InconsistentTx) { @@ -738,122 +869,1289 @@ class TKqpDirectWriteActor : public TActorBootstrapped, pu } } - void RuntimeError(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues = {}) { - NYql::TIssue issue(message); - for (const auto& i : subIssues) { - issue.AddSubIssue(MakeIntrusive(i)); - } - - NYql::TIssues issues; - issues.AddIssue(std::move(issue)); - - if (WriteActorStateSpan) { - WriteActorStateSpan.EndError(issues.ToOneLineString()); - } - if (WriteActorSpan) { - WriteActorSpan.EndError(issues.ToOneLineString()); - } - - Callbacks->OnAsyncOutputError(OutputIndex, std::move(issues), statusCode); - } - - void PassAway() override { - Send(PipeCacheId, new TEvPipeCache::TEvUnlink(0)); - TActorBootstrapped::PassAway(); - } - void Prepare() { - WriteActorStateSpan.EndOk(); - YQL_ENSURE(SchemeEntry); ResolveAttempts = 0; - if (!ShardedWriteController) { - TVector columnsMetadata; - columnsMetadata.reserve(Settings.GetColumns().size()); - for (const auto & column : Settings.GetColumns()) { - columnsMetadata.push_back(column); - } - - try { - 
ShardedWriteController = CreateShardedWriteController( - TShardedWriteControllerSettings { - .MemoryLimitTotal = MessageSettings.InFlightMemoryLimitPerActorBytes, - .MemoryLimitPerMessage = MessageSettings.MemoryLimitPerMessageBytes, - .MaxBatchesPerMessage = (SchemeEntry->Kind == NSchemeCache::TSchemeCacheNavigate::KindColumnTable - ? 1 - : MessageSettings.MaxBatchesPerMessage), - }, - std::move(columnsMetadata), - TypeEnv, - Alloc); - } catch (...) { - RuntimeError( - CurrentExceptionMessage(), - NYql::NDqProto::StatusIds::INTERNAL_ERROR); - } - } - try { if (SchemeEntry->Kind == NSchemeCache::TSchemeCacheNavigate::KindColumnTable) { ShardedWriteController->OnPartitioningChanged(*SchemeEntry); } else { ShardedWriteController->OnPartitioningChanged(*SchemeEntry, std::move(*SchemeRequest)); + SchemeRequest.reset(); } - ResumeExecution(); } catch (...) { RuntimeError( CurrentExceptionMessage(), NYql::NDqProto::StatusIds::INTERNAL_ERROR); } - ProcessBatches(); + + Callbacks->OnReady(); } - void ResumeExecution() { - CA_LOG_D("Resuming execution."); - Callbacks->ResumeExecution(); + void RuntimeError(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues = {}) { + Callbacks->OnError(message, statusCode, subIssues); + } + + void PassAway() override {; + CA_LOG_D("PassAway"); + Send(PipeCacheId, new TEvPipeCache::TEvUnlink(0)); + TActorBootstrapped::PassAway(); + } + + void Terminate() { + Become(&TKqpTableWriteActor::StateTerminating); + Send(this->SelfId(), new TEvPrivate::TEvTerminate{}); } NActors::TActorId PipeCacheId = NKikimr::MakePipePerNodeCacheID(false); TString LogPrefix; - const NKikimrKqp::TKqpTableSinkSettings Settings; TWriteActorSettings MessageSettings; - const ui64 OutputIndex; - NYql::NDq::TDqAsyncStats EgressStats; - NYql::NDq::IDqComputeActorAsyncOutput::ICallbacks * Callbacks = nullptr; - TIntrusivePtr Counters; const NMiniKQL::TTypeEnvironment& TypeEnv; std::shared_ptr Alloc; - const 
NYql::NDq::TTxId TxId; const TTableId TableId; - const bool FinalTx; - const bool ImmediateTx; + const TString TablePath; + + std::optional TxId; + const ui64 LockTxId; + const ui64 LockNodeId; const bool InconsistentTx; + IKqpTableWriterCallbacks* Callbacks; + std::optional SchemeEntry; std::optional SchemeRequest; ui64 ResolveAttempts = 0; - THashMap SendTime; - THashMap LocksInfo; - bool Finished = false; - - const i64 MemoryLimit; + IKqpTransactionManagerPtr TxManager; + bool Closed = false; + EMode Mode = EMode::WRITE; IShardedWriteControllerPtr ShardedWriteController = nullptr; - - NWilson::TSpan WriteActorSpan; - NWilson::TSpan WriteActorStateSpan; }; -void RegisterKqpWriteActor(NYql::NDq::TDqAsyncIoFactory& factory, TIntrusivePtr counters) { - factory.RegisterSink( - TString(NYql::KqpTableSinkName), - [counters] (NKikimrKqp::TKqpTableSinkSettings&& settings, NYql::NDq::TDqAsyncIoFactory::TSinkArguments&& args) { - auto* actor = new TKqpDirectWriteActor(std::move(settings), std::move(args), counters); - return std::make_pair(actor, actor); +class TKqpDirectWriteActor : public TActorBootstrapped, public NYql::NDq::IDqComputeActorAsyncOutput, public IKqpTableWriterCallbacks { + using TBase = TActorBootstrapped; + +public: + TKqpDirectWriteActor( + NKikimrKqp::TKqpTableSinkSettings&& settings, + NYql::NDq::TDqAsyncIoFactory::TSinkArguments&& args, + TIntrusivePtr counters) + : LogPrefix(TStringBuilder() << "TxId: " << args.TxId << ", task: " << args.TaskId << ". 
") + , Settings(std::move(settings)) + , MessageSettings(GetWriteActorSettings()) + , OutputIndex(args.OutputIndex) + , Callbacks(args.Callback) + , Counters(counters) + , TypeEnv(args.TypeEnv) + , Alloc(args.Alloc) + , TxId(std::get(args.TxId)) + , TableId( + Settings.GetTable().GetOwnerId(), + Settings.GetTable().GetTableId(), + Settings.GetTable().GetVersion()) + { + EgressStats.Level = args.StatsLevel; + } + + void Bootstrap() { + LogPrefix = TStringBuilder() << "SelfId: " << this->SelfId() << ", " << LogPrefix; + + WriteTableActor = new TKqpTableWriteActor( + this, + TableId, + Settings.GetTable().GetPath(), + Settings.GetLockTxId(), + Settings.GetLockNodeId(), + Settings.GetInconsistentTx(), + TypeEnv, + Alloc, + nullptr, + TActorId{}); + + WriteTableActorId = RegisterWithSameMailbox(WriteTableActor); + + TVector columnsMetadata; + columnsMetadata.reserve(Settings.GetColumns().size()); + for (const auto & column : Settings.GetColumns()) { + columnsMetadata.push_back(column); + } + YQL_ENSURE(Settings.GetPriority() == 0); + WriteToken = WriteTableActor->Open(GetOperation(Settings.GetType()), std::move(columnsMetadata), Settings.GetPriority()); + WaitingForTableActor = true; + } + + static constexpr char ActorName[] = "KQP_DIRECT_WRITE_ACTOR"; + +private: + virtual ~TKqpDirectWriteActor() { + } + + void CommitState(const NYql::NDqProto::TCheckpoint&) final {}; + void LoadState(const NYql::NDq::TSinkState&) final {}; + + ui64 GetOutputIndex() const final { + return OutputIndex; + } + + const NYql::NDq::TDqAsyncStats& GetEgressStats() const final { + return EgressStats; + } + + i64 GetFreeSpace() const final { + return (WriteTableActor && WriteTableActor->IsReady()) + ? MessageSettings.InFlightMemoryLimitPerActorBytes - GetMemory() + : std::numeric_limits::min(); // Can't use zero here because compute can use overcommit! + } + + i64 GetMemory() const { + return (WriteTableActor && WriteTableActor->IsReady()) + ? 
WriteTableActor->GetMemory() + : 0; + } + + TMaybe ExtraData() override { + if (!WriteTableActor) { + return {}; + } + NKikimrKqp::TEvKqpOutputActorResultInfo resultInfo; + for (const auto& lock : WriteTableActor->GetLocks()) { + resultInfo.AddLocks()->CopyFrom(lock); + } + resultInfo.SetHasRead( + GetOperation(Settings.GetType()) == NKikimrDataEvents::TEvWrite::TOperation::OPERATION_INSERT || + GetOperation(Settings.GetType()) == NKikimrDataEvents::TEvWrite::TOperation::OPERATION_UPDATE); + google::protobuf::Any result; + result.PackFrom(resultInfo); + return result; + } + + void SendData(NMiniKQL::TUnboxedValueBatch&& data, i64 size, const TMaybe&, bool finished) final { + YQL_ENSURE(!data.IsWide(), "Wide stream is not supported yet"); + YQL_ENSURE(!Closed); + Closed = finished; + EgressStats.Resume(); + Y_UNUSED(size); + + YQL_ENSURE(WriteTableActor); + WriteTableActor->Write(*WriteToken, data); + if (Closed) { + WriteTableActor->Close(*WriteToken); + WriteTableActor->Close(); + } + Process(); + } + + void Process() { + if (GetFreeSpace() <= 0) { + WaitingForTableActor = true; + } else if (WaitingForTableActor && GetFreeSpace() > MessageSettings.InFlightMemoryLimitPerActorBytes / 2) { + ResumeExecution(); + } + + if (Closed || GetFreeSpace() <= 0) { + WriteTableActor->Flush(); + } + + if (Closed && WriteTableActor->IsFinished()) { + CA_LOG_D("Write actor finished"); + Callbacks->OnAsyncOutputFinished(GetOutputIndex()); + } + } + + void RuntimeError(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues = {}) { + NYql::TIssue issue(message); + for (const auto& i : subIssues) { + issue.AddSubIssue(MakeIntrusive(i)); + } + + NYql::TIssues issues; + issues.AddIssue(std::move(issue)); + + Callbacks->OnAsyncOutputError(OutputIndex, std::move(issues), statusCode); + } + + void PassAway() override { + if (WriteTableActor) { + WriteTableActor->Terminate(); + } + TActorBootstrapped::PassAway(); + } + + void 
ResumeExecution() { + CA_LOG_D("Resuming execution."); + WaitingForTableActor = false; + Callbacks->ResumeExecution(); + } + + void OnReady() override { + Process(); + } + + void OnPrepared(IKqpTransactionManager::TPrepareResult&&, ui64) override { + AFL_ENSURE(false); + } + + void OnCommitted(ui64, ui64) override { + AFL_ENSURE(false); + } + + void OnMessageAcknowledged(ui64 dataSize) override { + EgressStats.Bytes += dataSize; + EgressStats.Chunks++; + EgressStats.Splits++; + EgressStats.Resume(); + Process(); + } + + void OnError(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues) override { + RuntimeError(message, statusCode, subIssues); + } + + TString LogPrefix; + const NKikimrKqp::TKqpTableSinkSettings Settings; + TWriteActorSettings MessageSettings; + const ui64 OutputIndex; + NYql::NDq::TDqAsyncStats EgressStats; + NYql::NDq::IDqComputeActorAsyncOutput::ICallbacks * Callbacks = nullptr; + TIntrusivePtr Counters; + const NMiniKQL::TTypeEnvironment& TypeEnv; + std::shared_ptr Alloc; + + const ui64 TxId; + const TTableId TableId; + TKqpTableWriteActor* WriteTableActor = nullptr; + TActorId WriteTableActorId; + + std::optional WriteToken; + + bool Closed = false; + + bool WaitingForTableActor = false; +}; + + +namespace { + +struct TWriteToken { + TTableId TableId; + ui64 Cookie; + + bool IsEmpty() const { + return !TableId; + } +}; + +struct TTransactionSettings { + ui64 TxId = 0; + ui64 LockTxId = 0; + ui64 LockNodeId = 0; + bool InconsistentTx = false; +}; + +struct TWriteSettings { + TTableId TableId; + TString TablePath; // for error messages + NKikimrDataEvents::TEvWrite::TOperation::EOperationType OperationType; + TVector Columns; + TTransactionSettings TransactionSettings; + i64 Priority; +}; + +struct TBufferWriteMessage { + TActorId From; + TWriteToken Token; + bool Close = false; + // TODO: move to serialized data + std::shared_ptr> Data; + std::shared_ptr Alloc; +}; + +struct TEvBufferWrite : 
public TEventLocal { + bool Close = false; + std::optional Token; + std::optional Settings; + std::shared_ptr> Data; + std::shared_ptr Alloc; + + ~TEvBufferWrite() { + if (Alloc) { + TGuard guard(*Alloc); + Data = nullptr; + } + } +}; + +struct TEvBufferWriteResult : public TEventLocal { + TWriteToken Token; +}; + +} + + +class TKqpBufferWriteActor :public TActorBootstrapped, public IKqpTableWriterCallbacks { + using TBase = TActorBootstrapped; + +public: + enum class EState { + WRITING, // Allow to write data to buffer. + FLUSHING, // Force flush (for uncommitted changes visibility). Can't accept any writes in this state. + PREPARING, // Do preparation for commit. All writers are closed. New writes wouldn't be accepted. + COMMITTING, // Do commit. All writers are closed. New writes wouldn't be accepted. + ROLLINGBACK, // Do rollback. New writes wouldn't be accepted. + FINISHED, + }; + +public: + TKqpBufferWriteActor( + TKqpBufferWriterSettings&& settings) + : SessionActorId(settings.SessionActorId) + , MessageSettings(GetWriteActorSettings()) + , TxManager(settings.TxManager) + , Alloc(std::make_shared(__LOCATION__)) + , TypeEnv(*Alloc) + { + State = EState::WRITING; + Alloc->Release(); + } + + void Bootstrap() { + LogPrefix = TStringBuilder() << "SelfId: " << this->SelfId() << ", SessionActorId: " << SessionActorId << ", " << LogPrefix; + Become(&TKqpBufferWriteActor::StateWrite); + } + + static constexpr char ActorName[] = "KQP_BUFFER_WRITE_ACTOR"; + + // TODO: split states + STFUNC(StateWrite) { + try { + switch (ev->GetTypeRewrite()) { + hFunc(TEvKqpBuffer::TEvTerminate, Handle); + hFunc(TEvKqpBuffer::TEvFlush, Handle); + hFunc(TEvKqpBuffer::TEvCommit, Handle); + hFunc(TEvKqpBuffer::TEvRollback, Handle); + hFunc(TEvBufferWrite, Handle); + + hFunc(TEvTxProxy::TEvProposeTransactionStatus, Handle); + hFunc(NKikimr::NEvents::TDataEvents::TEvWriteResult, Handle); + hFunc(TEvPipeCache::TEvDeliveryProblem, Handle); + default: + AFL_ENSURE(false)("unknown message", 
ev->GetTypeRewrite()); + } + } catch (const yexception& e) { + ReplyErrorAndDie(e.what(), NYql::NDqProto::StatusIds::INTERNAL_ERROR, {}); + } + } + + void Handle(TEvBufferWrite::TPtr& ev) { + TWriteToken token; + if (!ev->Get()->Token) { + AFL_ENSURE(ev->Get()->Settings); + auto& settings = *ev->Get()->Settings; + if (!WriteInfos.empty()) { + AFL_ENSURE(LockTxId == settings.TransactionSettings.LockTxId); + AFL_ENSURE(LockNodeId == settings.TransactionSettings.LockNodeId); + AFL_ENSURE(InconsistentTx == settings.TransactionSettings.InconsistentTx); + } else { + LockTxId = settings.TransactionSettings.LockTxId; + LockNodeId = settings.TransactionSettings.LockNodeId; + InconsistentTx = settings.TransactionSettings.InconsistentTx; + } + + auto& writeInfo = WriteInfos[settings.TableId]; + if (!writeInfo.WriteTableActor) { + writeInfo.WriteTableActor = new TKqpTableWriteActor( + this, + settings.TableId, + settings.TablePath, + LockTxId, + LockNodeId, + InconsistentTx, + TypeEnv, + Alloc, + TxManager, + SessionActorId); + writeInfo.WriteTableActorId = RegisterWithSameMailbox(writeInfo.WriteTableActor); + CA_LOG_D("Create new TableWriteActor for table `" << settings.TablePath << "` (" << settings.TableId << "). 
lockId=" << LockTxId << " " << writeInfo.WriteTableActorId); + } + + auto cookie = writeInfo.WriteTableActor->Open(settings.OperationType, std::move(settings.Columns), settings.Priority); + token = TWriteToken{settings.TableId, cookie}; + } else { + token = *ev->Get()->Token; + } + + auto& queue = DataQueues[token.TableId]; + queue.emplace(); + auto& message = queue.back(); + + message.Token = token; + message.From = ev->Sender; + message.Close = ev->Get()->Close; + message.Data = ev->Get()->Data; + message.Alloc = ev->Get()->Alloc; + + ev->Get()->Data = nullptr; + ev->Get()->Alloc = nullptr; + + Process(); + } + + void Process() { + ProcessRequestQueue(); + ProcessWrite(); + ProcessAckQueue(); + + if (State == EState::FLUSHING) { + bool isEmpty = true; + for (auto& [_, info] : WriteInfos) { + isEmpty = isEmpty && info.WriteTableActor->IsReady() && info.WriteTableActor->IsEmpty(); + } + if (isEmpty) { + OnFlushed(); + } + } + } + + void ProcessRequestQueue() { + for (auto& [tableId, queue] : DataQueues) { + auto& writeInfo = WriteInfos.at(tableId); + + if (!writeInfo.WriteTableActor->IsReady()) { + CA_LOG_D("ProcessRequestQueue " << tableId << " NOT READY queue=" << queue.size()); + return; + } + + while (!queue.empty()) { + auto& message = queue.front(); + + if (!message.Data->empty()) { + for (const auto& data : *message.Data) { + writeInfo.WriteTableActor->Write(message.Token.Cookie, data); + } + } + if (message.Close) { + writeInfo.WriteTableActor->Close(message.Token.Cookie); + } + + AckQueue.push(TAckMessage{ + .ForwardActorId = message.From, + .Token = message.Token, + .DataSize = 0, + }); + + { + TGuard guard(*message.Alloc); + message.Data = nullptr; + } + queue.pop(); + } + } + } + + void ProcessAckQueue() { + while (!AckQueue.empty()) { + const auto& item = AckQueue.front(); + if (GetTotalFreeSpace() >= item.DataSize) { + auto result = std::make_unique(); + result->Token = AckQueue.front().Token; + Send(AckQueue.front().ForwardActorId, result.release()); 
+ AckQueue.pop(); + } else { + YQL_ENSURE(false); + return; + } + } + } + + void ProcessWrite() { + const bool needToFlush = GetTotalFreeSpace() <= 0 + || State == EState::FLUSHING + || State == EState::PREPARING + || State == EState::COMMITTING + || State == EState::ROLLINGBACK; + + if (needToFlush) { + CA_LOG_D("Flush data"); + for (auto& [_, info] : WriteInfos) { + if (info.WriteTableActor->IsReady()) { + info.WriteTableActor->Flush(); + } + } + } + } + + void Flush() { + CA_LOG_D("Start flush"); + YQL_ENSURE(State == EState::WRITING); + State = EState::FLUSHING; + for (auto& [_, queue] : DataQueues) { + YQL_ENSURE(queue.empty()); + } + Process(); + } + + void Prepare(const ui64 txId) { + CA_LOG_D("Start prepare for distributed commit"); + YQL_ENSURE(State == EState::WRITING); + State = EState::PREPARING; + for (auto& [_, queue] : DataQueues) { + YQL_ENSURE(queue.empty()); + } + TxId = txId; + for (auto& [_, info] : WriteInfos) { + info.WriteTableActor->SetPrepare(txId); + } + Close(); + Process(); + SendToExternalShards(false); + } + + void ImmediateCommit() { + CA_LOG_D("Start immediate commit"); + YQL_ENSURE(State == EState::WRITING); + State = EState::COMMITTING; + for (auto& [_, queue] : DataQueues) { + YQL_ENSURE(queue.empty()); + } + for (auto& [_, info] : WriteInfos) { + info.WriteTableActor->SetImmediateCommit(); + } + Close(); + Process(); + } + + void DistributedCommit() { + CA_LOG_D("Start distributed commit with TxId=" << *TxId); + YQL_ENSURE(State == EState::PREPARING); + State = EState::COMMITTING; + for (auto& [_, queue] : DataQueues) { + YQL_ENSURE(queue.empty()); + } + for (auto& [_, info] : WriteInfos) { + info.WriteTableActor->SetDistributedCommit(); + } + SendCommitToCoordinator(); + } + + void Rollback() { + CA_LOG_D("Start rollback"); + State = EState::ROLLINGBACK; + SendToExternalShards(true); + } + + void SendToExternalShards(bool isRollback) { + THashSet shards = TxManager->GetShards(); + if (!isRollback) { + for (auto& [_, info] : 
WriteInfos) { + for (const auto& shardId : info.WriteTableActor->GetShardsIds()) { + shards.erase(shardId); + } + } + } + + for (const ui64 shardId : shards) { + if (TxManager->GetLocks(shardId).empty()) { + continue; + } + auto evWrite = std::make_unique(isRollback + ? NKikimrDataEvents::TEvWrite::MODE_IMMEDIATE + : (TxManager->IsVolatile() + ? NKikimrDataEvents::TEvWrite::MODE_VOLATILE_PREPARE + : NKikimrDataEvents::TEvWrite::MODE_PREPARE)); + + if (isRollback) { + FillEvWriteRollback(evWrite.get(), shardId, TxManager); + } else { + YQL_ENSURE(TxId); + FillEvWritePrepare(evWrite.get(), shardId, *TxId, TxManager); + } + + CA_LOG_D("Send EvWrite (external) to ShardID=" << shardId << ", isPrepare=" << !isRollback << ", isImmediateCommit=" << isRollback << ", TxId=" << evWrite->Record.GetTxId() + << ", LockTxId=" << evWrite->Record.GetLockTxId() << ", LockNodeId=" << evWrite->Record.GetLockNodeId() + << ", Locks= " << [&]() { + TStringBuilder builder; + for (const auto& lock : evWrite->Record.GetLocks().GetLocks()) { + builder << lock.ShortDebugString(); + } + return builder; + }() + << ", Size=" << 0 << ", Cookie=" << 0 + << ", OperationsCount=" << 0 << ", IsFinal=" << 1 + << ", Attempts=" << 0); + + Send( + NKikimr::MakePipePerNodeCacheID(false), + new TEvPipeCache::TEvForward(evWrite.release(), shardId, true), + 0, + 0); + } + } + + void SendCommitToCoordinator() { + const auto commitInfo = TxManager->GetCommitInfo(); + + auto ev = MakeHolder(); + + YQL_ENSURE(commitInfo.Coordinator); + ev->Record.SetCoordinatorID(commitInfo.Coordinator); + + auto& transaction = *ev->Record.MutableTransaction(); + auto& affectedSet = *transaction.MutableAffectedSet(); + affectedSet.Reserve(commitInfo.ShardsInfo.size()); + + YQL_ENSURE(TxId); + transaction.SetTxId(*TxId); + transaction.SetMinStep(commitInfo.MinStep); + transaction.SetMaxStep(commitInfo.MaxStep); + if (TxManager->IsVolatile()) { + transaction.SetFlags(TEvTxProxy::TEvProposeTransaction::FlagVolatile); + } + + for 
(const auto& shardInfo : commitInfo.ShardsInfo) { + auto& item = *affectedSet.Add(); + item.SetTabletId(shardInfo.ShardId); + Y_ABORT_UNLESS(shardInfo.AffectedFlags != 0); + item.SetFlags(shardInfo.AffectedFlags); + } + + //TODO: NDataIntegrity + CA_LOG_D("Execute planned transaction, coordinator: " << commitInfo.Coordinator + << ", volitale: " << ((transaction.GetFlags() & TEvTxProxy::TEvProposeTransaction::FlagVolatile) != 0) + << ", shards: " << affectedSet.size()); + Send(MakePipePerNodeCacheID(false), new TEvPipeCache::TEvForward(ev.Release(), commitInfo.Coordinator, /* subscribe */ true)); + } + + void Close() { + for (auto& [_, info] : WriteInfos) { + if (!info.WriteTableActor->IsClosed()) { + info.WriteTableActor->Close(); + } + } + } + + i64 GetFreeSpace(TWriteToken token) const { + auto& info = WriteInfos.at(token.TableId); + return info.WriteTableActor->IsReady() + ? MessageSettings.InFlightMemoryLimitPerActorBytes - info.WriteTableActor->GetMemory() + : std::numeric_limits::min(); // Can't use zero here because compute can use overcommit! + } + + i64 GetTotalFreeSpace() const { + return MessageSettings.InFlightMemoryLimitPerActorBytes - GetTotalMemory(); + } + + i64 GetTotalMemory() const { + i64 totalMemory = 0; + for (auto& [_, info] : WriteInfos) { + totalMemory += info.WriteTableActor->IsReady() + ? 
info.WriteTableActor->GetMemory() + : 0; + } + return totalMemory; + } + + THashSet GetShardsIds() const { + THashSet shardIds; + for (auto& [_, info] : WriteInfos) { + for (const auto& id : info.WriteTableActor->GetShardsIds()) { + shardIds.insert(id); + } + } + return shardIds; + } + + void PassAway() override { + for (auto& [_, queue] : DataQueues) { + while (!queue.empty()) { + auto& message = queue.front(); + { + TGuard guard(*message.Alloc); + message.Data = nullptr; + } + queue.pop(); + } + } + + for (auto& [_, info] : WriteInfos) { + if (info.WriteTableActor) { + info.WriteTableActor->Terminate(); + } + } + TActorBootstrapped::PassAway(); + } + + void Handle(TEvTxProxy::TEvProposeTransactionStatus::TPtr &ev) { + TEvTxProxy::TEvProposeTransactionStatus* res = ev->Get(); + CA_LOG_D("Got transaction status, status: " << res->GetStatus()); + + switch (res->GetStatus()) { + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusAccepted: + // TODO: metrics + break; + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusProcessed: + break; + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusConfirmed: + break; + + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusPlanned: + break; + + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusOutdated: + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusDeclined: + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusDeclinedNoSpace: + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusRestarting: + // TODO: CancelProposal??? 
+ ReplyErrorAndDie(TStringBuilder() << "Failed to plan transaction, status: " << res->GetStatus(), NYql::NDqProto::StatusIds::UNAVAILABLE, {}); + break; + + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusUnknown: + case TEvTxProxy::TEvProposeTransactionStatus::EStatus::StatusAborted: + ReplyErrorAndDie(TStringBuilder() << "Unexpected TEvProposeTransactionStatus status: " << res->GetStatus(), NYql::NDqProto::StatusIds::INTERNAL_ERROR, {}); + break; + } + } + + void Handle(TEvPipeCache::TEvDeliveryProblem::TPtr& ev) { + CA_LOG_W("TEvDeliveryProblem was received from tablet: " << ev->Get()->TabletId); + ReplyErrorAndDie(TStringBuilder() << "Failed to deviler message.", NYql::NDqProto::StatusIds::UNAVAILABLE, {}); + } + + void Handle(TEvKqpBuffer::TEvTerminate::TPtr&) { + PassAway(); + } + + void Handle(TEvKqpBuffer::TEvFlush::TPtr& ev) { + ExecuterActorId = ev->Get()->ExecuterActorId; + for (auto& [_, info] : WriteInfos) { + info.WriteTableActor->FlushBuffers(); + } + Flush(); + } + + void Handle(TEvKqpBuffer::TEvCommit::TPtr& ev) { + ExecuterActorId = ev->Get()->ExecuterActorId; + for (auto& [_, info] : WriteInfos) { + info.WriteTableActor->FlushBuffers(); + } + if (TxManager->IsReadOnly()) { + Rollback(); + State = EState::FINISHED; + Send(ExecuterActorId, new TEvKqpBuffer::TEvResult{}); + } else if (TxManager->IsSingleShard() && !TxManager->HasOlapTable() && !WriteInfos.empty()) { + TxManager->StartExecute(); + ImmediateCommit(); + } else { + TxManager->StartPrepare(); + Prepare(ev->Get()->TxId); + } + } + + void Handle(TEvKqpBuffer::TEvRollback::TPtr& ev) { + ExecuterActorId = ev->Get()->ExecuterActorId; + Rollback(); + State = EState::FINISHED; + Send(ExecuterActorId, new TEvKqpBuffer::TEvResult{}); + } + + void Handle(NKikimr::NEvents::TDataEvents::TEvWriteResult::TPtr& ev) { + auto getIssues = [&ev]() { + NYql::TIssues issues; + NYql::IssuesFromMessage(ev->Get()->Record.GetIssues(), issues); + return issues; + }; + + CA_LOG_D("Recv EvWriteResult 
(external) from ShardID=" << ev->Get()->Record.GetOrigin() + << ", Status=" << NKikimrDataEvents::TEvWriteResult::EStatus_Name(ev->Get()->GetStatus()) + << ", TxId=" << ev->Get()->Record.GetTxId() + << ", Locks= " << [&]() { + TStringBuilder builder; + for (const auto& lock : ev->Get()->Record.GetTxLocks()) { + builder << lock.ShortDebugString(); + } + return builder; + }() + << ", Cookie=" << ev->Cookie); + + // TODO: get rid of copy-paste + switch (ev->Get()->GetStatus()) { + case NKikimrDataEvents::TEvWriteResult::STATUS_UNSPECIFIED: { + CA_LOG_E("Got UNSPECIFIED for table." + << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << getIssues().ToOneLineString()); + ReplyErrorAndDie( + TStringBuilder() << "Unspecified error for table. " + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::UNSPECIFIED, + getIssues()); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_PREPARED: { + ProcessWritePreparedShard(ev); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_COMPLETED: { + ProcessWriteCompletedShard(ev); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_ABORTED: { + CA_LOG_E("Got ABORTED for table." + << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << getIssues().ToOneLineString()); + ReplyErrorAndDie( + TStringBuilder() << "Aborted for table. " + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::ABORTED, + getIssues()); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_INTERNAL_ERROR: { + CA_LOG_E("Got INTERNAL ERROR for table." + << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << getIssues().ToOneLineString()); + + ReplyErrorAndDie( + TStringBuilder() << "Internal error for table. 
" + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::INTERNAL_ERROR, + getIssues()); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_DISK_SPACE_EXHAUSTED: { + CA_LOG_E("Got DISK_SPACE_EXHAUSTED for table." + << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << getIssues().ToOneLineString()); + + ReplyErrorAndDie( + TStringBuilder() << "Disk space exhausted for table. " + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::PRECONDITION_FAILED, + getIssues()); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_OVERLOADED: { + CA_LOG_W("Got OVERLOADED for table ." + << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << " Ignored this error." + << getIssues().ToOneLineString()); + // TODO: support waiting + ReplyErrorAndDie( + TStringBuilder() << "Tablet " << ev->Get()->Record.GetOrigin() << " is overloaded." + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::OVERLOADED, + getIssues()); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_CANCELLED: { + CA_LOG_E("Got CANCELLED for table." + << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << getIssues().ToOneLineString()); + ReplyErrorAndDie( + TStringBuilder() << "Cancelled request to table." + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::CANCELLED, + getIssues()); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_BAD_REQUEST: { + CA_LOG_E("Got BAD REQUEST for table." + << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << getIssues().ToOneLineString()); + ReplyErrorAndDie( + TStringBuilder() << "Bad request. " + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::BAD_REQUEST, + getIssues()); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_SCHEME_CHANGED: { + CA_LOG_E("Got SCHEME CHANGED for table." 
+ << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << getIssues().ToOneLineString()); + ReplyErrorAndDie( + TStringBuilder() << "Scheme changed. " + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::SCHEME_ERROR, + getIssues()); + return; + } + case NKikimrDataEvents::TEvWriteResult::STATUS_LOCKS_BROKEN: { + CA_LOG_E("Got LOCKS BROKEN for table." + << " ShardID=" << ev->Get()->Record.GetOrigin() << "," + << " Sink=" << this->SelfId() << "." + << getIssues().ToOneLineString()); + + TxManager->BreakLock(ev->Get()->Record.GetOrigin()); + YQL_ENSURE(TxManager->BrokenLocks()); + ReplyErrorAndDie( + TStringBuilder() << "Transaction locks invalidated." + << getIssues().ToOneLineString(), + NYql::NDqProto::StatusIds::ABORTED, + getIssues()); + return; + } + } + } + + void ProcessWritePreparedShard(NKikimr::NEvents::TDataEvents::TEvWriteResult::TPtr& ev) { + if (State != EState::PREPARING) { + CA_LOG_D("Ignored write prepared event."); + return; + } + const auto& record = ev->Get()->Record; + IKqpTransactionManager::TPrepareResult preparedInfo; + preparedInfo.ShardId = record.GetOrigin(); + preparedInfo.MinStep = record.GetMinStep(); + preparedInfo.MaxStep = record.GetMaxStep(); + + preparedInfo.Coordinator = 0; + if (record.DomainCoordinatorsSize()) { + auto domainCoordinators = TCoordinators(TVector(record.GetDomainCoordinators().begin(), + record.GetDomainCoordinators().end())); + preparedInfo.Coordinator = domainCoordinators.Select(*TxId); + } + + OnPrepared(std::move(preparedInfo), 0); + } + + void ProcessWriteCompletedShard(NKikimr::NEvents::TDataEvents::TEvWriteResult::TPtr& ev) { + if (State != EState::COMMITTING) { + CA_LOG_D("Ignored write completed event."); + return; + } + CA_LOG_D("Got completed result TxId=" << ev->Get()->Record.GetTxId() + << ", TabletId=" << ev->Get()->Record.GetOrigin() + << ", Cookie=" << ev->Cookie + << ", Locks=" << [&]() { + TStringBuilder builder; + for (const auto& lock : 
ev->Get()->Record.GetTxLocks()) { + builder << lock.ShortDebugString(); + } + return builder; + }()); + + OnCommitted(ev->Get()->Record.GetOrigin(), 0); + } + + void OnReady() override { + Process(); + } + + void OnPrepared(IKqpTransactionManager::TPrepareResult&& preparedInfo, ui64 dataSize) override { + if (State != EState::PREPARING) { + return; + } + Y_UNUSED(preparedInfo, dataSize); + if (TxManager->ConsumePrepareTransactionResult(std::move(preparedInfo))) { + TxManager->StartExecute(); + Y_ABORT_UNLESS(GetTotalMemory() == 0); + DistributedCommit(); + return; + } + Process(); + } + + void OnCommitted(ui64 shardId, ui64 dataSize) override { + if (State != EState::COMMITTING) { + return; + } + Y_UNUSED(dataSize); + if (TxManager->ConsumeCommitResult(shardId)) { + CA_LOG_D("Committed"); + State = EState::FINISHED; + Send(ExecuterActorId, new TEvKqpBuffer::TEvResult{}); + ExecuterActorId = {}; + Y_ABORT_UNLESS(GetTotalMemory() == 0); + return; + } + } + + void OnMessageAcknowledged(ui64 dataSize) override { + Y_UNUSED(dataSize); + Process(); + } + + void OnFlushed() { + CA_LOG_D("Flushed"); + State = EState::WRITING; + Send(ExecuterActorId, new TEvKqpBuffer::TEvResult{}); + ExecuterActorId = {}; + Y_ABORT_UNLESS(GetTotalMemory() == 0); + } + + void OnError(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues) override { + ReplyErrorAndDie(message, statusCode, subIssues); + } + + void ReplyErrorAndDie(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues = {}) { + CA_LOG_E(message << ". statusCode=" << NYql::NDqProto::StatusIds_StatusCode_Name(statusCode) << ". subIssues=" << subIssues.ToString() << ". sessionActorId=" << SessionActorId << ". 
isRollback=" << (State == EState::ROLLINGBACK)); + + Y_ABORT_UNLESS(!HasError); + HasError = true; + if (State != EState::ROLLINGBACK) { + // Rollback can't finish with error + Send(SessionActorId, new TEvKqpBuffer::TEvError{ + message, + statusCode, + subIssues, + }); + } + PassAway(); + } + +private: + TString LogPrefix; + const TActorId SessionActorId; + TWriteActorSettings MessageSettings; + + TActorId ExecuterActorId; + IKqpTransactionManagerPtr TxManager; + + std::optional TxId; + ui64 LockTxId = 0; + ui64 LockNodeId = 0; + bool InconsistentTx = false; + + std::shared_ptr Alloc; + NMiniKQL::TTypeEnvironment TypeEnv; + + struct TWriteInfo { + TKqpTableWriteActor* WriteTableActor = nullptr; + TActorId WriteTableActorId; + }; + + THashMap WriteInfos; + + EState State; + bool HasError = false; + THashMap> DataQueues; + + struct TAckMessage { + TActorId ForwardActorId; + TWriteToken Token; + i64 DataSize; + }; + std::queue AckQueue; + + IShardedWriteControllerPtr ShardedWriteController = nullptr; + + NWilson::TSpan WriteActorSpan; + NWilson::TSpan WriteActorStateSpan; +}; + +class TKqpForwardWriteActor : public TActorBootstrapped, public NYql::NDq::IDqComputeActorAsyncOutput { + using TBase = TActorBootstrapped; + +public: + TKqpForwardWriteActor( + NKikimrKqp::TKqpTableSinkSettings&& settings, + NYql::NDq::TDqAsyncIoFactory::TSinkArguments&& args, + TIntrusivePtr counters) + : LogPrefix(TStringBuilder() << "TxId: " << args.TxId << ", task: " << args.TaskId << ". 
") + , Settings(std::move(settings)) + , MessageSettings(GetWriteActorSettings()) + , OutputIndex(args.OutputIndex) + , Callbacks(args.Callback) + , Counters(counters) + , TypeEnv(args.TypeEnv) + , Alloc(args.Alloc) + , BufferActorId(ActorIdFromProto(Settings.GetBufferActorId())) + , TxId(std::get(args.TxId)) + , TableId( + Settings.GetTable().GetOwnerId(), + Settings.GetTable().GetTableId(), + Settings.GetTable().GetVersion()) + { + EgressStats.Level = args.StatsLevel; + } + + void Bootstrap() { + LogPrefix = TStringBuilder() << "SelfId: " << this->SelfId() << ", " << LogPrefix; + Become(&TKqpForwardWriteActor::StateFuncFwd); + } + + static constexpr char ActorName[] = "KQP_FORWARD_WRITE_ACTOR"; + +private: + STFUNC(StateFuncFwd) { + try { + switch (ev->GetTypeRewrite()) { + hFunc(TEvBufferWriteResult, Handle); + default: + AFL_ENSURE(false)("unknown message", ev->GetTypeRewrite()); + } + } catch (const yexception& e) { + RuntimeError(e.what(), NYql::NDqProto::StatusIds::INTERNAL_ERROR); + } + } + + void Handle(TEvBufferWriteResult::TPtr& result) { + CA_LOG_D("TKqpForwardWriteActor recieve EvBufferWriteResult from " << BufferActorId); + EgressStats.Bytes += DataSize; + EgressStats.Chunks++; + EgressStats.Splits++; + EgressStats.Resume(); + + WriteToken = result->Get()->Token; + DataSize = 0; + { + auto alloc = TypeEnv.BindAllocator(); + Data = nullptr; + } + + if (Closed) { + CA_LOG_D("Finished"); + Callbacks->OnAsyncOutputFinished(GetOutputIndex()); + return; + } + CA_LOG_D("Resume with freeSpace=" << GetFreeSpace()); + Callbacks->ResumeExecution(); + } + + void WriteToBuffer() { + auto ev = std::make_unique(); + + ev->Data = Data; + ev->Close = Closed; + ev->Alloc = Alloc; + + if (!WriteToken.IsEmpty()) { + ev->Token = WriteToken; + } else { + TVector columnsMetadata; + columnsMetadata.reserve(Settings.GetColumns().size()); + for (const auto & column : Settings.GetColumns()) { + columnsMetadata.push_back(column); + } + + ev->Settings = TWriteSettings{ + .TableId 
= TableId, + .TablePath = Settings.GetTable().GetPath(), + .OperationType = GetOperation(Settings.GetType()), + .Columns = std::move(columnsMetadata), + .TransactionSettings = TTransactionSettings{ + .TxId = TxId, + .LockTxId = Settings.GetLockTxId(), + .LockNodeId = Settings.GetLockNodeId(), + .InconsistentTx = Settings.GetInconsistentTx(), + }, + .Priority = Settings.GetPriority(), + }; + } + + CA_LOG_D("Send data=" << DataSize << ", closed=" << Closed << ", bufferActorId=" << BufferActorId); + AFL_ENSURE(Send(BufferActorId, ev.release())); + } + + void CommitState(const NYql::NDqProto::TCheckpoint&) final {}; + void LoadState(const NYql::NDq::TSinkState&) final {}; + + ui64 GetOutputIndex() const final { + return OutputIndex; + } + + const NYql::NDq::TDqAsyncStats& GetEgressStats() const final { + return EgressStats; + } + + i64 GetFreeSpace() const final { + return MessageSettings.MaxForwardedSize - DataSize > 0 + ? MessageSettings.MaxForwardedSize - DataSize + : std::numeric_limits::min(); + } + + TMaybe ExtraData() override { + return {}; + } + + void SendData(NMiniKQL::TUnboxedValueBatch&& data, i64 size, const TMaybe&, bool finished) final { + YQL_ENSURE(!data.IsWide(), "Wide stream is not supported yet"); + Closed |= finished; + if (!Data) { + Data = std::make_shared>(); + } + Data->emplace_back(std::move(data)); + DataSize += size; + + CA_LOG_D("Add data: " << size << " / " << DataSize); + if (Closed || GetFreeSpace() <= 0) { + WriteToBuffer(); + } + } + + void RuntimeError(const TString& message, NYql::NDqProto::StatusIds::StatusCode statusCode, const NYql::TIssues& subIssues = {}) { + CA_LOG_E("RuntimeError: " << message); + NYql::TIssue issue(message); + for (const auto& i : subIssues) { + issue.AddSubIssue(MakeIntrusive(i)); + } + + NYql::TIssues issues; + issues.AddIssue(std::move(issue)); + + Callbacks->OnAsyncOutputError(OutputIndex, std::move(issues), statusCode); + } + + ~TKqpForwardWriteActor() { + { + TGuard guard(*Alloc); + Data = nullptr; + } 
+ } + + void PassAway() override { + TActorBootstrapped::PassAway(); + } + + TString LogPrefix; + const NKikimrKqp::TKqpTableSinkSettings Settings; + TWriteActorSettings MessageSettings; + const ui64 OutputIndex; + NYql::NDq::TDqAsyncStats EgressStats; + NYql::NDq::IDqComputeActorAsyncOutput::ICallbacks * Callbacks = nullptr; + TIntrusivePtr Counters; + const NMiniKQL::TTypeEnvironment& TypeEnv; + std::shared_ptr Alloc; + + TActorId BufferActorId; + + std::shared_ptr> Data; + i64 DataSize = 0; + bool Closed = false; + + const ui64 TxId; + const TTableId TableId; + + TWriteToken WriteToken; +}; + +NActors::IActor* CreateKqpBufferWriterActor(TKqpBufferWriterSettings&& settings) { + return new TKqpBufferWriteActor(std::move(settings)); +} + + +void RegisterKqpWriteActor(NYql::NDq::TDqAsyncIoFactory& factory, TIntrusivePtr counters) { + factory.RegisterSink( + TString(NYql::KqpTableSinkName), + [counters] (NKikimrKqp::TKqpTableSinkSettings&& settings, NYql::NDq::TDqAsyncIoFactory::TSinkArguments&& args) { + if (!ActorIdFromProto(settings.GetBufferActorId())) { + auto* actor = new TKqpDirectWriteActor(std::move(settings), std::move(args), counters); + return std::make_pair(actor, actor); + } else { + auto* actor = new TKqpForwardWriteActor(std::move(settings), std::move(args), counters); + return std::make_pair(actor, actor); + } }); } diff --git a/ydb/core/kqp/runtime/kqp_write_actor.h b/ydb/core/kqp/runtime/kqp_write_actor.h index 844309a70a77..f87062db2289 100644 --- a/ydb/core/kqp/runtime/kqp_write_actor.h +++ b/ydb/core/kqp/runtime/kqp_write_actor.h @@ -1,6 +1,10 @@ #pragma once +#include #include +#include +#include +#include #include namespace NKikimr { diff --git a/ydb/core/kqp/runtime/kqp_write_actor_settings.h b/ydb/core/kqp/runtime/kqp_write_actor_settings.h index 328dcd5120a7..37e8bfe91055 100644 --- a/ydb/core/kqp/runtime/kqp_write_actor_settings.h +++ b/ydb/core/kqp/runtime/kqp_write_actor_settings.h @@ -11,6 +11,7 @@ struct TWriteActorSettings : 
TAtomicRefCount { i64 InFlightMemoryLimitPerActorBytes = 64_MB; i64 MemoryLimitPerMessageBytes = 64_MB; i64 MaxBatchesPerMessage = 1000; + i64 MaxForwardedSize = 64_MB; TDuration StartRetryDelay = TDuration::Seconds(1); TDuration MaxRetryDelay = TDuration::Seconds(10); diff --git a/ydb/core/kqp/runtime/kqp_write_table.cpp b/ydb/core/kqp/runtime/kqp_write_table.cpp index a228163619f8..798cb724fe0e 100644 --- a/ydb/core/kqp/runtime/kqp_write_table.cpp +++ b/ydb/core/kqp/runtime/kqp_write_table.cpp @@ -19,7 +19,6 @@ namespace { constexpr ui64 DataShardMaxOperationBytes = 8_MB; constexpr ui64 ColumnShardMaxOperationBytes = 64_MB; -constexpr ui64 MaxUnshardedBatchBytes = 0_MB; class IPayloadSerializer : public TThrRefBase { public: @@ -32,7 +31,7 @@ class IPayloadSerializer : public TThrRefBase { using IBatchPtr = TIntrusivePtr; - virtual void AddData(NMiniKQL::TUnboxedValueBatch&& data) = 0; + virtual void AddData(const NMiniKQL::TUnboxedValueBatch& data) = 0; virtual void AddBatch(const IBatchPtr& batch) = 0; virtual void Close() = 0; @@ -179,7 +178,7 @@ std::vector> BuildBatchBuilderColumns( result.reserve(columns.size()); for (const auto& column : columns) { if (inputColumnsIds.contains(column.GetId())) { - Y_ABORT_UNLESS(column.HasTypeId()); + YQL_ENSURE(column.HasTypeId()); auto typeInfoMod = NScheme::TypeInfoModFromProtoColumnType(column.GetTypeId(), column.HasTypeInfo() ? 
&column.GetTypeInfo() : nullptr); result.emplace_back(column.GetName(), typeInfoMod.TypeInfo); @@ -394,7 +393,7 @@ class TColumnShardPayloadSerializer : public IPayloadSerializer { Sharding = shardingConclusion.DetachResult(); } - void AddData(NMiniKQL::TUnboxedValueBatch&& data) override { + void AddData(const NMiniKQL::TUnboxedValueBatch& data) override { YQL_ENSURE(!Closed); if (data.empty()) { return; @@ -421,7 +420,7 @@ class TColumnShardPayloadSerializer : public IPayloadSerializer { } void FlushUnsharded(bool force) { - if ((BatchBuilder.Bytes() > 0 && force) || BatchBuilder.Bytes() > MaxUnshardedBatchBytes) { + if (BatchBuilder.Bytes() > 0 && force) { const auto unshardedBatch = BatchBuilder.FlushBatch(true); YQL_ENSURE(unshardedBatch); ShardAndFlushBatch(unshardedBatch, force); @@ -433,14 +432,13 @@ class TColumnShardPayloadSerializer : public IPayloadSerializer { const i64 shardBatchMemory = NArrow::GetBatchDataSize(shardBatch); YQL_ENSURE(shardBatchMemory != 0); + ShardIds.insert(shardId); auto& unpreparedBatch = UnpreparedBatches[shardId]; unpreparedBatch.TotalDataSize += shardBatchMemory; unpreparedBatch.Batches.emplace_back(shardBatch); Memory += shardBatchMemory; FlushUnpreparedBatch(shardId, unpreparedBatch, force); - - ShardIds.insert(shardId); } } @@ -650,7 +648,7 @@ class TDataShardPayloadSerializer : public IPayloadSerializer { } ui64 AddRow(TRowWithData&& rowWithData) { - Y_ABORT_UNLESS(rowWithData.Cells.size() == ColumnCount); + YQL_ENSURE(rowWithData.Cells.size() == ColumnCount); ui64 newMemory = 0; for (const auto& cell : rowWithData.Cells) { newMemory += cell.Size(); @@ -682,10 +680,10 @@ class TDataShardPayloadSerializer : public IPayloadSerializer { public: TDataShardPayloadSerializer( const NSchemeCache::TSchemeCacheNavigate::TEntry& schemeEntry, - NSchemeCache::TSchemeCacheRequest::TEntry&& partitionsEntry, + const NSchemeCache::TSchemeCacheRequest::TEntry& partitionsEntry, const TConstArrayRef inputColumns) : SchemeEntry(schemeEntry) - 
, KeyDescription(std::move(partitionsEntry.KeyDescription)) + , KeyDescription(partitionsEntry.KeyDescription) , Columns(BuildColumns(inputColumns)) , WriteIndex(BuildWriteIndexKeyFirst(SchemeEntry, inputColumns)) , WriteColumnIds(BuildWriteColumnIds(inputColumns, WriteIndex)) @@ -716,7 +714,7 @@ class TDataShardPayloadSerializer : public IPayloadSerializer { ShardIds.insert(shardIter->ShardId); } - void AddData(NMiniKQL::TUnboxedValueBatch&& data) override { + void AddData(const NMiniKQL::TUnboxedValueBatch& data) override { YQL_ENSURE(!Closed); TRowBuilder rowBuilder(Columns.size()); @@ -821,7 +819,7 @@ class TDataShardPayloadSerializer : public IPayloadSerializer { } const NSchemeCache::TSchemeCacheNavigate::TEntry SchemeEntry; - THolder KeyDescription; + const THolder& KeyDescription; const TVector Columns; const std::vector WriteIndex; @@ -836,8 +834,6 @@ class TDataShardPayloadSerializer : public IPayloadSerializer { bool Closed = false; }; -} - bool IPayloadSerializer::IBatch::IsEmpty() const { return GetMemory() == 0; } @@ -851,14 +847,37 @@ IPayloadSerializerPtr CreateColumnShardPayloadSerializer( IPayloadSerializerPtr CreateDataShardPayloadSerializer( const NSchemeCache::TSchemeCacheNavigate::TEntry& schemeEntry, - NSchemeCache::TSchemeCacheRequest::TEntry&& partitionsEntry, + const NSchemeCache::TSchemeCacheRequest::TEntry& partitionsEntry, const TConstArrayRef inputColumns) { return MakeIntrusive( - schemeEntry, std::move(partitionsEntry), inputColumns); + schemeEntry, partitionsEntry, inputColumns); +} + } namespace { +struct TMetadata { + const TTableId TableId; + const NKikimrDataEvents::TEvWrite::TOperation::EOperationType OperationType; + const TVector InputColumnsMetadata; + const i64 Priority; +}; + +struct TBatchWithMetadata { + IShardedWriteController::TWriteToken Token = std::numeric_limits::max(); + IPayloadSerializer::IBatchPtr Data = nullptr; + bool HasRead = false; + + bool IsCoveringBatch() const { + return Data == nullptr; + } + + i64 
GetMemory() const { + return IsCoveringBatch() ? 0 : Data->GetMemory(); + } +}; + class TShardsInfo { public: class TShardInfo { @@ -890,27 +909,31 @@ class TShardsInfo { void MakeNextBatches(i64 maxDataSize, ui64 maxCount) { YQL_ENSURE(BatchesInFlight == 0); YQL_ENSURE(!IsEmpty()); + YQL_ENSURE(maxCount != 0); i64 dataSize = 0; // For columnshard batch can be slightly larger than the limit. while (BatchesInFlight < maxCount && BatchesInFlight < Batches.size() - && (dataSize + GetBatch(BatchesInFlight)->GetMemory() <= maxDataSize || BatchesInFlight == 0)) { - dataSize += GetBatch(BatchesInFlight)->GetMemory(); + && (dataSize + GetBatch(BatchesInFlight).GetMemory() <= maxDataSize || BatchesInFlight == 0)) { + dataSize += GetBatch(BatchesInFlight).GetMemory(); ++BatchesInFlight; } YQL_ENSURE(BatchesInFlight != 0); - YQL_ENSURE(BatchesInFlight == maxCount || BatchesInFlight == Batches.size() || dataSize + GetBatch(BatchesInFlight)->GetMemory() >= maxDataSize); + YQL_ENSURE(BatchesInFlight == Batches.size() || BatchesInFlight >= maxCount || dataSize + GetBatch(BatchesInFlight).GetMemory() > maxDataSize); } - const IPayloadSerializer::IBatchPtr& GetBatch(size_t index) const { + const TBatchWithMetadata& GetBatch(size_t index) const { return Batches.at(index); } - std::optional PopBatches(const ui64 cookie) { + struct TBatchInfo { + ui64 DataSize = 0; + }; + std::optional PopBatches(const ui64 cookie) { if (BatchesInFlight != 0 && Cookie == cookie) { - ui64 dataSize = 0; + TBatchInfo result; for (size_t index = 0; index < BatchesInFlight; ++index) { - dataSize += Batches.front()->GetMemory(); + result.DataSize += Batches.front().GetMemory(); Batches.pop_front(); } @@ -918,16 +941,17 @@ class TShardsInfo { SendAttempts = 0; BatchesInFlight = 0; - Memory -= dataSize; - return dataSize; + Memory -= result.DataSize; + return result; } return std::nullopt; } - void PushBatch(IPayloadSerializer::IBatchPtr&& batch) { + void PushBatch(TBatchWithMetadata&& batch) { 
YQL_ENSURE(!IsClosed()); Batches.emplace_back(std::move(batch)); - Memory += Batches.back()->GetMemory(); + Memory += Batches.back().GetMemory(); + HasReadInBatch |= Batches.back().HasRead; } ui64 GetCookie() const { @@ -950,9 +974,14 @@ class TShardsInfo { SendAttempts = 0; } + bool HasRead() const { + return HasReadInBatch; + } + private: - std::deque Batches; + std::deque Batches; i64& Memory; + bool HasReadInBatch = false; ui64& NextCookie; ui64 Cookie; @@ -973,11 +1002,14 @@ class TShardsInfo { return insertIt->second; } - TVector GetPendingShards() const { - TVector result; + TVector GetPendingShards() const { + TVector result; for (const auto& [id, shard] : ShardsInfo) { if (!shard.IsEmpty() && shard.GetSendAttempts() == 0) { - result.push_back(id); + result.push_back(IShardedWriteController::TPendingShardInfo{ + .ShardId = id, + .HasRead = shard.HasRead(), + }); } } return result; @@ -1009,6 +1041,10 @@ class TShardsInfo { return ShardsInfo; } + const THashMap& GetShards() const { + return ShardsInfo; + } + i64 GetMemory() const { return Memory; } @@ -1033,69 +1069,181 @@ class TShardsInfo { class TShardedWriteController : public IShardedWriteController { public: void OnPartitioningChanged(const NSchemeCache::TSchemeCacheNavigate::TEntry& schemeEntry) override { + IsOlap = true; + SchemeEntry = schemeEntry; BeforePartitioningChanged(); - Serializer = CreateColumnShardPayloadSerializer( - schemeEntry, - InputColumnsMetadata); + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + auto& writeInfo = WriteInfos.at(token); + writeInfo.Serializer = CreateColumnShardPayloadSerializer( + *SchemeEntry, + writeInfo.Metadata.InputColumnsMetadata); + } AfterPartitioningChanged(); } void OnPartitioningChanged( const NSchemeCache::TSchemeCacheNavigate::TEntry& schemeEntry, NSchemeCache::TSchemeCacheRequest::TEntry&& partitionsEntry) override { + IsOlap = false; + SchemeEntry = schemeEntry; + PartitionsEntry = std::move(partitionsEntry); 
BeforePartitioningChanged(); - Serializer = CreateDataShardPayloadSerializer( - schemeEntry, - std::move(partitionsEntry), - InputColumnsMetadata); + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + auto& writeInfo = WriteInfos.at(token); + writeInfo.Serializer = CreateDataShardPayloadSerializer( + *SchemeEntry, + *PartitionsEntry, + writeInfo.Metadata.InputColumnsMetadata); + } AfterPartitioningChanged(); } void BeforePartitioningChanged() { - if (Serializer) { - if (!Closed) { - Serializer->Close(); + if (!Settings.Inconsistent) { + return; + } + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + auto& writeInfo = WriteInfos.at(token); + if (writeInfo.Serializer) { + if (!writeInfo.Closed) { + writeInfo.Serializer->Close(); + } + FlushSerializer(token, true); + writeInfo.Serializer = nullptr; } - FlushSerializer(true); } } void AfterPartitioningChanged() { - ShardsInfo.Close(); - ReshardData(); - ShardsInfo.Clear(); - if (Closed) { - Close(); - } else { - FlushSerializer(GetMemory() >= Settings.MemoryLimitTotal); + if (!Settings.Inconsistent) { + return; + } + if (!WriteInfos.empty()) { + ShardsInfo.Close(); + ReshardData(); + ShardsInfo.Clear(); + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + const auto& writeInfo = WriteInfos.at(token); + if (writeInfo.Closed) { + Close(token); + } else { + FlushSerializer(token, GetMemory() >= Settings.MemoryLimitTotal); + } + } } } - void AddData(NMiniKQL::TUnboxedValueBatch&& data) override { + TWriteToken Open( + const TTableId tableId, + const NKikimrDataEvents::TEvWrite::TOperation::EOperationType operationType, + TVector&& inputColumns, + const i64 priority) override { + auto token = CurrentWriteToken++; + auto iter = WriteInfos.emplace( + token, + TWriteInfo { + .Metadata = TMetadata { + .TableId = tableId, + .OperationType = operationType, + .InputColumnsMetadata = std::move(inputColumns), + .Priority = priority, + }, + .Serializer = nullptr, + .Closed = false, 
+ }).first; + if (PartitionsEntry) { + iter->second.Serializer = CreateDataShardPayloadSerializer( + *SchemeEntry, + *PartitionsEntry, + iter->second.Metadata.InputColumnsMetadata); + } else if (SchemeEntry) { + iter->second.Serializer = CreateColumnShardPayloadSerializer( + *SchemeEntry, + iter->second.Metadata.InputColumnsMetadata); + } + return token; + } + + void Write(TWriteToken token, const NMiniKQL::TUnboxedValueBatch& data) override { YQL_ENSURE(!data.IsWide(), "Wide stream is not supported yet"); - YQL_ENSURE(!Closed); + auto& info = WriteInfos.at(token); + YQL_ENSURE(!info.Closed); auto allocGuard = TypeEnv.BindAllocator(); - YQL_ENSURE(Serializer); - Serializer->AddData(std::move(data)); + YQL_ENSURE(info.Serializer); + info.Serializer->AddData(data); - FlushSerializer(GetMemory() >= Settings.MemoryLimitTotal); + if (info.Metadata.Priority == 0) { + FlushSerializer(token, GetMemory() >= Settings.MemoryLimitTotal); + } else { + YQL_ENSURE(GetMemory() <= Settings.MemoryLimitTotal); + } } - void Close() override { + void Close(TWriteToken token) override { auto allocGuard = TypeEnv.BindAllocator(); - YQL_ENSURE(Serializer); - Closed = true; - Serializer->Close(); - FlushSerializer(true); - YQL_ENSURE(Serializer->IsFinished()); + auto& info = WriteInfos.at(token); + YQL_ENSURE(info.Serializer); + info.Closed = true; + info.Serializer->Close(); + if (info.Metadata.Priority == 0) { + FlushSerializer(token, true); + YQL_ENSURE(info.Serializer->IsFinished()); + } + } + + void FlushBuffers() override { + TVector writeTokensFoFlush; + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + const auto& writeInfo = WriteInfos.at(token); + YQL_ENSURE(writeInfo.Closed); + if (writeInfo.Metadata.Priority != 0) { + if (!writeInfo.Serializer->IsFinished()) { + writeTokensFoFlush.push_back(token); + } + } else { + YQL_ENSURE(writeInfo.Serializer->IsFinished()); + } + } + + std::sort( + std::begin(writeTokensFoFlush), + std::end(writeTokensFoFlush), + 
[&](const TWriteToken& lhs, const TWriteToken& rhs) { + const auto& leftWriteInfo = WriteInfos.at(lhs); + const auto& rightWriteInfo = WriteInfos.at(rhs); + return leftWriteInfo.Metadata.Priority < rightWriteInfo.Metadata.Priority; + }); + + for (const TWriteToken token : writeTokensFoFlush) { + FlushSerializer(token, true); + YQL_ENSURE(WriteInfos.at(token).Serializer->IsFinished()); + } + } + + void Close() override { ShardsInfo.Close(); } - TVector GetPendingShards() const override { + void AddCoveringMessages() override { + for (auto& [_, shardInfo] : ShardsInfo.GetShards()) { + shardInfo.PushBatch(TBatchWithMetadata{}); + } + } + + TVector GetPendingShards() const override { return ShardsInfo.GetPendingShards(); } + TVector GetShardsIds() const override { + TVector result; + result.reserve(ShardsInfo.GetShards().size()); + for (const auto& [id, _] : ShardsInfo.GetShards()) { + result.push_back(id); + } + return result; + } + std::optional GetMessageMetadata(ui64 shardId) override { auto& shardInfo = ShardsInfo.GetShard(shardId); if (shardInfo.IsEmpty()) { @@ -1122,29 +1270,37 @@ class TShardedWriteController : public IShardedWriteController { for (size_t index = 0; index < shardInfo.GetBatchesInFlight(); ++index) { const auto& inFlightBatch = shardInfo.GetBatch(index); - YQL_ENSURE(!inFlightBatch->IsEmpty()); - result.TotalDataSize += inFlightBatch->GetMemory(); - const ui64 payloadIndex = NKikimr::NEvWrite::TPayloadWriter(evWrite) - .AddDataToPayload(inFlightBatch->SerializeToString()); - result.PayloadIndexes.push_back(payloadIndex); + if (inFlightBatch.Data) { + YQL_ENSURE(!inFlightBatch.Data->IsEmpty()); + result.TotalDataSize += inFlightBatch.Data->GetMemory(); + const ui64 payloadIndex = NKikimr::NEvWrite::TPayloadWriter(evWrite) + .AddDataToPayload(inFlightBatch.Data->SerializeToString()); + const auto& writeInfo = WriteInfos.at(inFlightBatch.Token); + evWrite.AddOperation( + writeInfo.Metadata.OperationType, + writeInfo.Metadata.TableId, + 
writeInfo.Serializer->GetWriteColumnIds(), + payloadIndex, + writeInfo.Serializer->GetDataFormat()); + } else { + YQL_ENSURE(index + 1 == shardInfo.GetBatchesInFlight()); + } } return result; } - NKikimrDataEvents::EDataFormat GetDataFormat() override { - return Serializer->GetDataFormat(); - } - - std::vector GetWriteColumnIds() override { - return Serializer->GetWriteColumnIds(); - } - - std::optional OnMessageAcknowledged(ui64 shardId, ui64 cookie) override { + std::optional OnMessageAcknowledged(ui64 shardId, ui64 cookie) override { auto allocGuard = TypeEnv.BindAllocator(); auto& shardInfo = ShardsInfo.GetShard(shardId); - const auto removedDataSize = shardInfo.PopBatches(cookie); - return removedDataSize; + const auto result = shardInfo.PopBatches(cookie); + if (result) { + return TMessageAcknowledgedResult { + .DataSize = result->DataSize, + .IsShardEmpty = shardInfo.IsEmpty(), + }; + } + return std::nullopt; } void OnMessageSent(ui64 shardId, ui64 cookie) override { @@ -1164,29 +1320,66 @@ class TShardedWriteController : public IShardedWriteController { } i64 GetMemory() const override { - YQL_ENSURE(Serializer); - return Serializer->GetMemory() + ShardsInfo.GetMemory(); + i64 total = ShardsInfo.GetMemory(); + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + const auto& writeInfo = WriteInfos.at(token); + if (writeInfo.Serializer) { + total += writeInfo.Serializer->GetMemory(); + } else { + YQL_ENSURE(writeInfo.Closed); + } + } + return total; } - bool IsClosed() const override { - return Closed; + bool IsAllWritesClosed() const override { + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + if (!WriteInfos.at(token).Closed) { + return false; + } + } + return true; } - bool IsFinished() const override { - return IsClosed() && Serializer->IsFinished() && ShardsInfo.IsFinished(); + bool IsAllWritesFinished() const override { + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + const auto& writeInfo = 
WriteInfos.at(token); + if (!writeInfo.Closed || !writeInfo.Serializer->IsFinished()) { + return false; + } + } + return ShardsInfo.IsFinished(); } bool IsReady() const override { - return Serializer != nullptr; + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + const auto& writeInfo = WriteInfos.at(token); + if (!writeInfo.Serializer && !writeInfo.Closed) { + return false; + } + } + return true; + } + + bool IsEmpty() const override { + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + const auto& writeInfo = WriteInfos.at(token); + if (writeInfo.Serializer && !writeInfo.Serializer->IsEmpty()) { + return false; + } + } + return ShardsInfo.IsEmpty(); + } + + ui64 GetShardsCount() const override { + return ShardsInfo.GetShards().size(); } TShardedWriteController( const TShardedWriteControllerSettings settings, - TVector&& inputColumnsMetadata, const NMiniKQL::TTypeEnvironment& typeEnv, std::shared_ptr alloc) : Settings(settings) - , InputColumnsMetadata(std::move(inputColumnsMetadata)) , TypeEnv(typeEnv) , Alloc(alloc) { } @@ -1195,28 +1388,42 @@ class TShardedWriteController : public IShardedWriteController { Y_ABORT_UNLESS(Alloc); TGuard allocGuard(*Alloc); ShardsInfo.Clear(); - Serializer = nullptr; + for (TWriteToken token = 0; token < CurrentWriteToken; ++token) { + WriteInfos.at(token).Serializer = nullptr; + } } private: - void FlushSerializer(bool force) { + void FlushSerializer(TWriteToken token, bool force) { if (force) { - for (auto& [shardId, batches] : Serializer->FlushBatchesForce()) { + const auto& writeInfo = WriteInfos.at(token); + for (auto& [shardId, batches] : writeInfo.Serializer->FlushBatchesForce()) { for (auto& batch : batches) { if (batch && !batch->IsEmpty()) { - ShardsInfo.GetShard(shardId).PushBatch(std::move(batch)); + ShardsInfo.GetShard(shardId).PushBatch(TBatchWithMetadata{ + .Token = token, + .Data = std::move(batch), + .HasRead = (writeInfo.Metadata.OperationType != 
NKikimrDataEvents::TEvWrite::TOperation::OPERATION_REPLACE + && writeInfo.Metadata.OperationType != NKikimrDataEvents::TEvWrite::TOperation::OPERATION_UPSERT), + }); } } } } else { - for (const ui64 shardId : Serializer->GetShardIds()) { + const auto& writeInfo = WriteInfos.at(token); + for (const ui64 shardId : writeInfo.Serializer->GetShardIds()) { auto& shard = ShardsInfo.GetShard(shardId); while (true) { - auto batch = Serializer->FlushBatch(shardId); + auto batch = writeInfo.Serializer->FlushBatch(shardId); if (!batch || batch->IsEmpty()) { break; } - shard.PushBatch(std::move(batch)); + shard.PushBatch(TBatchWithMetadata{ + .Token = token, + .Data = std::move(batch), + .HasRead = (writeInfo.Metadata.OperationType != NKikimrDataEvents::TEvWrite::TOperation::OPERATION_REPLACE + && writeInfo.Metadata.OperationType != NKikimrDataEvents::TEvWrite::TOperation::OPERATION_UPSERT), + }); } } } @@ -1224,29 +1431,45 @@ class TShardedWriteController : public IShardedWriteController { void BuildBatchesForShard(TShardsInfo::TShardInfo& shard) { if (shard.GetBatchesInFlight() == 0) { + YQL_ENSURE(IsOlap != std::nullopt); shard.MakeNextBatches( Settings.MemoryLimitPerMessage, - Settings.MaxBatchesPerMessage); + (*IsOlap) ? 1 : Settings.MaxBatchesPerMessage); } } void ReshardData() { + YQL_ENSURE(!Settings.Inconsistent); for (auto& [_, shardInfo] : ShardsInfo.GetShards()) { for (size_t index = 0; index < shardInfo.Size(); ++index) { - Serializer->AddBatch(shardInfo.GetBatch(index)); + const auto& batch = shardInfo.GetBatch(index); + const auto& writeInfo = WriteInfos.at(batch.Token); + // Resharding supported only for inconsistent write, + // so covering empty batches don't exist in this case. 
+ YQL_ENSURE(batch.Data); + writeInfo.Serializer->AddBatch(batch.Data); } } } TShardedWriteControllerSettings Settings; - TVector InputColumnsMetadata; const NMiniKQL::TTypeEnvironment& TypeEnv; std::shared_ptr Alloc; + struct TWriteInfo { + TMetadata Metadata; + IPayloadSerializerPtr Serializer = nullptr; + bool Closed = false; + }; + + THashMap WriteInfos; + TWriteToken CurrentWriteToken = 0; + TShardsInfo ShardsInfo; - bool Closed = false; - IPayloadSerializerPtr Serializer = nullptr; + std::optional SchemeEntry; + std::optional PartitionsEntry; + std::optional IsOlap; }; } @@ -1254,11 +1477,10 @@ class TShardedWriteController : public IShardedWriteController { IShardedWriteControllerPtr CreateShardedWriteController( const TShardedWriteControllerSettings& settings, - TVector&& inputColumns, const NMiniKQL::TTypeEnvironment& typeEnv, std::shared_ptr alloc) { return MakeIntrusive( - settings, std::move(inputColumns), typeEnv, alloc); + settings, typeEnv, alloc); } } diff --git a/ydb/core/kqp/runtime/kqp_write_table.h b/ydb/core/kqp/runtime/kqp_write_table.h index 46e5ac4f7308..e3e645541310 100644 --- a/ydb/core/kqp/runtime/kqp_write_table.h +++ b/ydb/core/kqp/runtime/kqp_write_table.h @@ -12,15 +12,39 @@ namespace NKqp { class IShardedWriteController : public TThrRefBase { public: - virtual void OnPartitioningChanged(const NSchemeCache::TSchemeCacheNavigate::TEntry& schemeEntry) = 0; + virtual void OnPartitioningChanged( + const NSchemeCache::TSchemeCacheNavigate::TEntry& schemeEntry) = 0; virtual void OnPartitioningChanged( const NSchemeCache::TSchemeCacheNavigate::TEntry& schemeEntry, NSchemeCache::TSchemeCacheRequest::TEntry&& partitionsEntry) = 0; - virtual void AddData(NMiniKQL::TUnboxedValueBatch&& data) = 0; + using TWriteToken = ui64; + + // Data ordering invariant: + // For two writes A and B: + // A happened before B <=> Close(A) happened before Open(B) otherwise Priority(A) < Priority(B). 
+ + virtual TWriteToken Open( + const TTableId TableId, + const NKikimrDataEvents::TEvWrite::TOperation::EOperationType operationType, + TVector&& inputColumns, + const i64 priority) = 0; + virtual void Write(TWriteToken token, const NMiniKQL::TUnboxedValueBatch& data) = 0; + virtual void Close(TWriteToken token) = 0; + + virtual void FlushBuffers() = 0; + virtual void Close() = 0; - virtual TVector GetPendingShards() const = 0; + virtual void AddCoveringMessages() = 0; + + struct TPendingShardInfo { + ui64 ShardId; + bool HasRead; + }; + virtual TVector GetPendingShards() const = 0; + virtual ui64 GetShardsCount() const = 0; + virtual TVector GetShardsIds() const = 0; struct TMessageMetadata { ui64 Cookie = 0; @@ -36,20 +60,24 @@ class IShardedWriteController : public TThrRefBase { }; virtual TSerializationResult SerializeMessageToPayload(ui64 shardId, NKikimr::NEvents::TDataEvents::TEvWrite& evWrite) = 0; - virtual NKikimrDataEvents::EDataFormat GetDataFormat() = 0; - virtual std::vector GetWriteColumnIds() = 0; - virtual std::optional OnMessageAcknowledged(ui64 shardId, ui64 cookie) = 0; + struct TMessageAcknowledgedResult { + ui64 DataSize = 0; + bool IsShardEmpty = 0; + }; + + virtual std::optional OnMessageAcknowledged(ui64 shardId, ui64 cookie) = 0; virtual void OnMessageSent(ui64 shardId, ui64 cookie) = 0; virtual void ResetRetries(ui64 shardId, ui64 cookie) = 0; virtual i64 GetMemory() const = 0; - virtual bool IsClosed() const = 0; - virtual bool IsFinished() const = 0; + virtual bool IsAllWritesClosed() const = 0; + virtual bool IsAllWritesFinished() const = 0; virtual bool IsReady() const = 0; + virtual bool IsEmpty() const = 0; }; using IShardedWriteControllerPtr = TIntrusivePtr; @@ -59,11 +87,11 @@ struct TShardedWriteControllerSettings { i64 MemoryLimitTotal; i64 MemoryLimitPerMessage; i64 MaxBatchesPerMessage; + bool Inconsistent; }; IShardedWriteControllerPtr CreateShardedWriteController( const TShardedWriteControllerSettings& settings, - TVector&& 
inputColumns, const NMiniKQL::TTypeEnvironment& typeEnv, std::shared_ptr alloc); diff --git a/ydb/core/kqp/runtime/ya.make b/ydb/core/kqp/runtime/ya.make index 4fc296d934ca..536aedd23989 100644 --- a/ydb/core/kqp/runtime/ya.make +++ b/ydb/core/kqp/runtime/ya.make @@ -30,23 +30,24 @@ SRCS( PEERDIR( contrib/libs/apache/arrow + library/cpp/threading/hot_swap ydb/core/actorlib_impl ydb/core/base ydb/core/engine ydb/core/engine/minikql ydb/core/formats ydb/core/kqp/common + ydb/core/kqp/common/buffer ydb/core/protos ydb/core/scheme ydb/core/ydb_convert - ydb/library/yql/minikql/computation/llvm14 - ydb/library/yql/minikql/comp_nodes - ydb/library/yql/utils ydb/library/yql/dq/actors/protos ydb/library/yql/dq/actors/spilling ydb/library/yql/dq/common ydb/library/yql/dq/runtime - library/cpp/threading/hot_swap + ydb/library/yql/minikql/comp_nodes + ydb/library/yql/minikql/computation/llvm14 + ydb/library/yql/utils ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/kqp/session_actor/kqp_query_state.h b/ydb/core/kqp/session_actor/kqp_query_state.h index b48bc0e20d4e..9f1c74d3bf23 100644 --- a/ydb/core/kqp/session_actor/kqp_query_state.h +++ b/ydb/core/kqp/session_actor/kqp_query_state.h @@ -347,13 +347,8 @@ class TKqpQueryState : public TNonCopyable { return true; } - if (HasTxSinkInTx(tx)) { - // At current time transactional internal sinks require separate tnx with commit. - return false; - } - if (TxCtx->HasOlapTable) { - // HTAP/OLAP transactions always use separate commit. + // Olap sink results can't be committed with changes return false; } @@ -372,12 +367,13 @@ class TKqpQueryState : public TNonCopyable { } bool ShouldAcquireLocks(const TKqpPhyTxHolder::TConstPtr& tx) { + Y_UNUSED(tx); if (*TxCtx->EffectiveIsolationLevel != NKikimrKqp::ISOLATION_LEVEL_SERIALIZABLE) { return false; } // Inconsistent writes (CTAS) don't require locks. 
- if (IsSplitted() && !HasTxSinkInTx(tx)) { + if (IsSplitted()) { return false; } @@ -416,9 +412,9 @@ class TKqpQueryState : public TNonCopyable { const auto& phyQuery = PreparedQuery->GetPhysicalQuery(); auto tx = PreparedQuery->GetPhyTxOrEmpty(CurrentTx); - if (TxCtx->CanDeferEffects()) { + if (TxCtx->CanDeferEffects()) { // At current time sinks require separate tnx with commit. - while (tx && tx->GetHasEffects() && !HasTxSinkInTx(tx)) { + while (tx && tx->GetHasEffects() && !TxCtx->HasOlapTable) { QueryData->CreateKqpValueMap(tx); bool success = TxCtx->AddDeferredEffect(tx, QueryData); YQL_ENSURE(success); @@ -435,40 +431,6 @@ class TKqpQueryState : public TNonCopyable { return tx; } - bool HasTxSinkInStage(const ::NKqpProto::TKqpPhyStage& stage) const { - for (const auto& sink : stage.GetSinks()) { - if (sink.GetTypeCase() == NKqpProto::TKqpSink::kInternalSink && sink.GetInternalSink().GetSettings().Is()) { - NKikimrKqp::TKqpTableSinkSettings settings; - YQL_ENSURE(sink.GetInternalSink().GetSettings().UnpackTo(&settings), "Failed to unpack settings"); - if (!settings.GetInconsistentTx()) { - return true; - } - } - } - return false; - } - - bool HasTxSink() const { - const auto& query = PreparedQuery->GetPhysicalQuery(); - for (auto& tx : query.GetTransactions()) { - for (const auto& stage : tx.GetStages()) { - if (HasTxSinkInStage(stage)) { - return true; - } - } - } - return false; - } - - bool HasTxSinkInTx(const TKqpPhyTxHolder::TConstPtr& tx) const { - for (const auto& stage : tx->GetStages()) { - if (HasTxSinkInStage(stage)) { - return true; - } - } - return false; - } - bool HasTxControl() const { return RequestEv->HasTxControl(); } diff --git a/ydb/core/kqp/session_actor/kqp_session_actor.cpp b/ydb/core/kqp/session_actor/kqp_session_actor.cpp index 03c9845db51d..c75f08fcb423 100644 --- a/ydb/core/kqp/session_actor/kqp_session_actor.cpp +++ b/ydb/core/kqp/session_actor/kqp_session_actor.cpp @@ -3,6 +3,8 @@ #include "kqp_query_state.h" #include 
"kqp_query_stats.h" +#include +#include #include #include #include @@ -1022,15 +1024,19 @@ class TKqpSessionActor : public TActorBootstrapped { bool CheckTransactionLocks(const TKqpPhyTxHolder::TConstPtr& tx) { auto& txCtx = *QueryState->TxCtx; - if (!txCtx.DeferredEffects.Empty() && txCtx.Locks.Broken()) { + const bool broken = txCtx.TxManager + ? !!txCtx.TxManager->GetLockIssue() + : txCtx.Locks.Broken(); + + if (!txCtx.DeferredEffects.Empty() && broken) { ReplyQueryError(Ydb::StatusIds::ABORTED, "tx has deferred effects, but locks are broken", - MessageFromIssues(std::vector{txCtx.Locks.GetIssue()})); + MessageFromIssues(std::vector{txCtx.TxManager ? *txCtx.TxManager->GetLockIssue() : txCtx.Locks.GetIssue()})); return false; } - if (tx && tx->GetHasEffects() && txCtx.Locks.Broken()) { + if (tx && tx->GetHasEffects() && broken) { ReplyQueryError(Ydb::StatusIds::ABORTED, "tx has effects, but locks are broken", - MessageFromIssues(std::vector{txCtx.Locks.GetIssue()})); + MessageFromIssues(std::vector{txCtx.TxManager ? *txCtx.TxManager->GetLockIssue() : txCtx.Locks.GetIssue()})); return false; } @@ -1138,9 +1144,10 @@ class TKqpSessionActor : public TActorBootstrapped { auto& txCtx = *QueryState->TxCtx; bool literal = tx && tx->IsLiteralTx(); + const bool hasLocks = txCtx.TxManager ? 
txCtx.TxManager->HasLocks() : txCtx.Locks.HasLocks(); if (commit) { - if (txCtx.TxHasEffects() || txCtx.Locks.HasLocks() || txCtx.TopicOperations.HasOperations()) { + if (txCtx.TxHasEffects() || hasLocks || txCtx.TopicOperations.HasOperations()) { // Cannot perform commit in literal execution literal = false; } else if (!tx) { @@ -1204,20 +1211,37 @@ class TKqpSessionActor : public TActorBootstrapped { request.PerShardKeysSizeLimitBytes = Config->_CommitPerShardKeysSizeLimitBytes.Get().GetRef(); } - if (txCtx.Locks.HasLocks() || txCtx.TopicOperations.HasOperations()) { - if (!txCtx.GetSnapshot().IsValid() || txCtx.TxHasEffects() || txCtx.TopicOperations.HasOperations()) { - LOG_D("TExecPhysicalRequest, tx has commit locks"); - request.LocksOp = ELocksOp::Commit; - } else { - LOG_D("TExecPhysicalRequest, tx has rollback locks"); - request.LocksOp = ELocksOp::Rollback; + if (Settings.TableService.GetEnableOltpSink()) { + if (txCtx.TxHasEffects() || hasLocks || txCtx.TopicOperations.HasOperations()) { + request.AcquireLocksTxId = txCtx.Locks.GetLockTxId(); } - for (auto& [lockId, lock] : txCtx.Locks.LocksMap) { - auto dsLock = ExtractLock(lock.GetValueRef(txCtx.Locks.LockType)); - request.DataShardLocks[dsLock.GetDataShard()].emplace_back(dsLock); + if (hasLocks) { + if (!txCtx.GetSnapshot().IsValid() || txCtx.TxHasEffects() || txCtx.TopicOperations.HasOperations()) { + LOG_D("TExecPhysicalRequest, tx has commit locks"); + request.LocksOp = ELocksOp::Commit; + } else { + LOG_D("TExecPhysicalRequest, tx has rollback locks"); + request.LocksOp = ELocksOp::Rollback; + } + } else if (txCtx.TxHasEffects()) { + LOG_D("TExecPhysicalRequest, need commit locks"); + request.LocksOp = ELocksOp::Commit; + } + } else { + if (hasLocks || txCtx.TopicOperations.HasOperations()) { + if (!txCtx.GetSnapshot().IsValid() || txCtx.TxHasEffects() || txCtx.TopicOperations.HasOperations()) { + LOG_D("TExecPhysicalRequest, tx has commit locks"); + request.LocksOp = ELocksOp::Commit; + } else { 
+ LOG_D("TExecPhysicalRequest, tx has rollback locks"); + request.LocksOp = ELocksOp::Rollback; + } + for (auto& [lockId, lock] : txCtx.Locks.LocksMap) { + auto dsLock = ExtractLock(lock.GetValueRef(txCtx.Locks.LockType)); + request.DataShardLocks[dsLock.GetDataShard()].emplace_back(dsLock); + } } - } request.TopicOperations = std::move(txCtx.TopicOperations); @@ -1233,7 +1257,7 @@ class TKqpSessionActor : public TActorBootstrapped { QueryState->Orbit, QueryState->CurrentTx, request.Transactions.size(), - txCtx.Locks.Size(), + (txCtx.TxManager ? txCtx.TxManager->GetShardsCount() : txCtx.Locks.Size()), request.AcquireLocksTxId.Defined()); SendToExecuter(QueryState->TxCtx.Get(), std::move(request)); @@ -1284,19 +1308,31 @@ class TKqpSessionActor : public TActorBootstrapped { request.ResourceManager_ = ResourceManager_; LOG_D("Sending to Executer TraceId: " << request.TraceId.GetTraceId() << " " << request.TraceId.GetSpanIdSize()); + if (Settings.TableService.GetEnableOltpSink() && !txCtx->TxManager) { + txCtx->TxManager = CreateKqpTransactionManager(); + } + if (Settings.TableService.GetEnableOltpSink() && !txCtx->BufferActorId && txCtx->HasTableWrite) { + TKqpBufferWriterSettings settings { + .SessionActorId = SelfId(), + .TxManager = txCtx->TxManager, + }; + auto* actor = CreateKqpBufferWriterActor(std::move(settings)); + txCtx->BufferActorId = RegisterWithSameMailbox(actor); + } auto executerActor = CreateKqpExecuter(std::move(request), Settings.Database, QueryState ? QueryState->UserToken : TIntrusiveConstPtr(), RequestCounters, Settings.TableService, AsyncIoFactory, QueryState ? QueryState->PreparedQuery : nullptr, SelfId(), QueryState ? QueryState->UserRequestContext : MakeIntrusive("", Settings.Database, SessionId), - QueryState ? QueryState->StatementResultIndex : 0, FederatedQuerySetup, GUCSettings, txCtx->ShardIdToTableInfo); + QueryState ? 
QueryState->StatementResultIndex : 0, FederatedQuerySetup, GUCSettings, + txCtx->ShardIdToTableInfo, txCtx->TxManager, txCtx->BufferActorId); auto exId = RegisterWithSameMailbox(executerActor); LOG_D("Created new KQP executer: " << exId << " isRollback: " << isRollback); auto ev = std::make_unique(exId); Send(MakeTxProxyID(), ev.release()); if (!isRollback) { - Y_ABORT_UNLESS(!ExecuterId); + YQL_ENSURE(!ExecuterId); } ExecuterId = exId; } @@ -1446,9 +1482,13 @@ class TKqpSessionActor : public TActorBootstrapped { // Invalidate query cache on scheme/internal errors switch (status) { case Ydb::StatusIds::ABORTED: { - if (ev->BrokenLockPathId) { + if (QueryState->TxCtx->TxManager && QueryState->TxCtx->TxManager->BrokenLocks()) { + issues.AddIssue(*QueryState->TxCtx->TxManager->GetLockIssue()); + } else if (ev->BrokenLockPathId) { + YQL_ENSURE(!QueryState->TxCtx->TxManager); issues.AddIssue(GetLocksInvalidatedIssue(*QueryState->TxCtx, *ev->BrokenLockPathId)); } else if (ev->BrokenLockShardId) { + YQL_ENSURE(!QueryState->TxCtx->TxManager); issues.AddIssue(GetLocksInvalidatedIssue(*QueryState->TxCtx->ShardIdToTableInfo, *ev->BrokenLockShardId)); } break; @@ -1496,7 +1536,7 @@ class TKqpSessionActor : public TActorBootstrapped { QueryState->TxCtx->Locks.LockHandle = std::move(ev->LockHandle); } - if (!MergeLocksWithTxResult(executerResults)) { + if (!QueryState->TxCtx->TxManager && !MergeLocksWithTxResult(executerResults)) { return; } @@ -1542,6 +1582,29 @@ class TKqpSessionActor : public TActorBootstrapped { } } + void Handle(TEvKqpBuffer::TEvError::TPtr& ev) { + const auto& msg = *ev->Get(); + + TString logMsg = TStringBuilder() << "got TEvKqpBuffer::TEvError in " << CurrentStateFuncName() + << ", status: " << NYql::NDqProto::StatusIds_StatusCode_Name(msg.StatusCode) << " send to: " << ExecuterId << " from: " << ev->Sender; + + if (!QueryState || !QueryState->TxCtx || QueryState->TxCtx->BufferActorId != ev->Sender) { + LOG_E(logMsg << ": Old error."); + return; + } else 
{ + LOG_W(logMsg); + } + + TString reason = TStringBuilder() << msg.Message << "; " << msg.SubIssues.ToString(); + + if (ExecuterId) { + auto abortEv = MakeHolder(msg.StatusCode, reason); + Send(ExecuterId, abortEv.Release(), IEventHandle::FlagTrackDelivery); + } else { + ReplyQueryError(NYql::NDq::DqStatusToYdbStatus(msg.StatusCode), logMsg, MessageFromIssues(msg.SubIssues)); + } + } + void CollectSystemViewQueryStats(const TKqpQueryStats* stats, TDuration queryDuration, const TString& database, ui64 requestUnits) { @@ -1892,7 +1955,7 @@ class TKqpSessionActor : public TActorBootstrapped { } void Reply() { - YQL_ENSURE(QueryState); + Y_ABORT_UNLESS(QueryState); YQL_ENSURE(Counters); auto& record = QueryResponse->Record; @@ -2034,10 +2097,12 @@ class TKqpSessionActor : public TActorBootstrapped { request.LocksOp = ELocksOp::Rollback; - // Should tx with empty LocksMap be aborted? - for (auto& [lockId, lock] : txCtx->Locks.LocksMap) { - auto dsLock = ExtractLock(lock.GetValueRef(txCtx->Locks.LockType)); - request.DataShardLocks[dsLock.GetDataShard()].emplace_back(dsLock); + if (!txCtx->TxManager) { + // Should tx with empty LocksMap be aborted? 
+ for (auto& [lockId, lock] : txCtx->Locks.LocksMap) { + auto dsLock = ExtractLock(lock.GetValueRef(txCtx->Locks.LockType)); + request.DataShardLocks[dsLock.GetDataShard()].emplace_back(dsLock); + } } SendToExecuter(txCtx, std::move(request), true); @@ -2047,6 +2112,13 @@ class TKqpSessionActor : public TActorBootstrapped { if (QueryState->TxCtx) { QueryState->TxCtx->ClearDeferredEffects(); QueryState->TxCtx->Locks.Clear(); + QueryState->TxCtx->TxManager.reset(); + + if (QueryState->TxCtx->BufferActorId) { + Send(QueryState->TxCtx->BufferActorId, new TEvKqpBuffer::TEvTerminate{}); + QueryState->TxCtx->BufferActorId = {}; + } + QueryState->TxCtx->Finish(); } } @@ -2311,6 +2383,7 @@ class TKqpSessionActor : public TActorBootstrapped { hFunc(NWorkload::TEvContinueRequest, HandleNoop); // message from KQP proxy in case of our reply just after kqp proxy timer tick hFunc(NYql::NDq::TEvDq::TEvAbortExecution, HandleNoop); + hFunc(TEvKqpBuffer::TEvError, Handle); hFunc(TEvTxUserProxy::TEvAllocateTxIdResult, HandleNoop); default: @@ -2345,6 +2418,7 @@ class TKqpSessionActor : public TActorBootstrapped { hFunc(TEvKqpExecuter::TEvStreamDataAck, HandleExecute); hFunc(NYql::NDq::TEvDq::TEvAbortExecution, HandleExecute); + hFunc(TEvKqpBuffer::TEvError, Handle); hFunc(TEvKqp::TEvCloseSessionRequest, HandleExecute); hFunc(NGRpcService::TEvClientLost, HandleClientLost); @@ -2392,6 +2466,7 @@ class TKqpSessionActor : public TActorBootstrapped { hFunc(TEvKqp::TEvCompileResponse, HandleNoop); hFunc(TEvKqp::TEvSplitResponse, HandleNoop); hFunc(NYql::NDq::TEvDq::TEvAbortExecution, HandleNoop); + hFunc(TEvKqpBuffer::TEvError, Handle); hFunc(TEvTxProxySchemeCache::TEvNavigateKeySetResult, HandleNoop); hFunc(TEvents::TEvUndelivered, HandleNoop); hFunc(TEvTxUserProxy::TEvAllocateTxIdResult, HandleNoop); diff --git a/ydb/core/kqp/session_actor/kqp_session_actor.h b/ydb/core/kqp/session_actor/kqp_session_actor.h index f26fff2b00ca..50ad957d2453 100644 --- 
a/ydb/core/kqp/session_actor/kqp_session_actor.h +++ b/ydb/core/kqp/session_actor/kqp_session_actor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/ydb/core/kqp/ut/service/kqp_qs_queries_ut.cpp b/ydb/core/kqp/ut/service/kqp_qs_queries_ut.cpp index 191ee25c2cb1..df4ca7d943ab 100644 --- a/ydb/core/kqp/ut/service/kqp_qs_queries_ut.cpp +++ b/ydb/core/kqp/ut/service/kqp_qs_queries_ut.cpp @@ -3623,7 +3623,7 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { } } - Y_UNIT_TEST(TableSink_ReplaceColumnShard) { + Y_UNIT_TEST(TableSink_Olap_Replace) { NKikimrConfig::TAppConfig appConfig; appConfig.MutableTableServiceConfig()->SetEnableOlapSink(true); auto settings = TKikimrSettings() @@ -3706,14 +3706,14 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { auto it = client.ExecuteQuery(R"( REPLACE INTO `/Root/DataShard` (Col1, Col2) VALUES (0u, 0); REPLACE INTO `/Root/DataShard` (Col1, Col3) VALUES (1u, 'test'); - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); } { auto it = client.StreamExecuteQuery(R"( SELECT * FROM `/Root/DataShard`; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson(output, R"([[0u;[0];#];[1u;#;["test"]]])"); @@ -3723,7 +3723,7 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { auto it = client.ExecuteQuery(R"( REPLACE INTO `/Root/DataShard` (Col1, Col3) VALUES (0u, 'null'); REPLACE INTO `/Root/DataShard` (Col1) VALUES (1u); - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", 
NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); } @@ -3731,7 +3731,7 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { { auto it = client.StreamExecuteQuery(R"( SELECT * FROM `/Root/DataShard`; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson(output, R"([[0u;#;["null"]];[1u;#;#]])"); @@ -3746,8 +3746,8 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { virtual void DoExecute() = 0; public: void Execute() { - AppConfig.MutableTableServiceConfig()->SetEnableOlapSink(true); - AppConfig.MutableTableServiceConfig()->SetEnableOltpSink(true); + AppConfig.MutableTableServiceConfig()->SetEnableOlapSink(IsOlap); + AppConfig.MutableTableServiceConfig()->SetEnableOltpSink(!IsOlap); AppConfig.MutableTableServiceConfig()->SetEnableKqpDataQueryStreamLookup(true); auto settings = TKikimrSettings().SetAppConfig(AppConfig).SetWithSampleTables(false); @@ -3852,14 +3852,14 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { INSERT INTO `/Root/DataShard` (Col1, Col2) VALUES (0u, 0); INSERT INTO `/Root/DataShard` (Col1, Col3) VALUES (1u, 'test'); INSERT INTO `/Root/DataShard` (Col1, Col3, Col2) VALUES (2u, 't', 3); - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); } { auto it = client.StreamExecuteQuery(R"( SELECT * FROM `/Root/DataShard` ORDER BY Col1; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", 
NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson(output, R"([[0u;[0];#];[1u;#;["test"]];[2u;[3];["t"]]])"); @@ -3868,7 +3868,7 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { { auto it = client.ExecuteQuery(R"( INSERT INTO `/Root/DataShard` (Col1, Col3) VALUES (0u, 'null'); - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(!it.IsSuccess(), it.GetIssues().ToString()); UNIT_ASSERT_C( it.GetIssues().ToString().Contains("Operation is aborting because an duplicate key") @@ -3879,7 +3879,7 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { { auto it = client.StreamExecuteQuery(R"( SELECT * FROM `/Root/DataShard` ORDER BY Col1; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson(output, R"([[0u;[0];#];[1u;#;["test"]];[2u;[3];["t"]]])"); @@ -3984,14 +3984,14 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { INSERT INTO `/Root/DataShard` (Col1, Col2) VALUES (0u, 0); INSERT INTO `/Root/DataShard` (Col1, Col3) VALUES (1u, 'test'); INSERT INTO `/Root/DataShard` (Col1, Col3, Col2) VALUES (2u, 't', 3); - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); } { auto it = 
client.StreamExecuteQuery(R"( SELECT * FROM `/Root/DataShard` ORDER BY Col1; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson(output, R"([[0u;[0];#];[1u;#;["test"]];[2u;[3];["t"]]])"); @@ -4000,21 +4000,21 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { { auto it = client.ExecuteQuery(R"( UPDATE `/Root/DataShard` SET Col2 = 42 WHERE Col3 == 'not found'; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); } { auto it = client.ExecuteQuery(R"( UPDATE `/Root/DataShard` SET Col2 = 42 WHERE Col3 == 't'; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); } { auto it = client.StreamExecuteQuery(R"( SELECT * FROM `/Root/DataShard` ORDER BY Col1; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson(output, R"([[0u;[0];#];[1u;#;["test"]];[2u;[42];["t"]]])"); @@ -4023,20 +4023,20 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { { auto it = client.ExecuteQuery(R"( UPDATE `/Root/DataShard` ON SELECT 0u AS Col1, 1 AS Col2, 'text' AS Col3; - )", 
NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); } { auto it = client.ExecuteQuery(R"( UPDATE `/Root/DataShard` ON SELECT 10u AS Col1, 1 AS Col2, 'text' AS Col3; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); } auto it = client.StreamExecuteQuery(R"( SELECT * FROM `/Root/DataShard` ORDER BY Col1; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson(output, R"([[0u;[1];["text"]];[1u;#;["test"]];[2u;[42];["t"]]])"); @@ -4152,7 +4152,8 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { } } - Y_UNIT_TEST_TWIN(TableSink_ReplaceDataShard, UseSink) { + Y_UNIT_TEST_TWIN(TableSink_Oltp_Replace, UseSink) { + //UseSink = true; NKikimrConfig::TAppConfig appConfig; appConfig.MutableTableServiceConfig()->SetEnableOlapSink(UseSink); appConfig.MutableTableServiceConfig()->SetEnableOltpSink(UseSink); @@ -4198,14 +4199,14 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { auto prepareResult = client.ExecuteQuery(R"( REPLACE INTO `/Root/DataShard` (Col1, Col2, Col3) VALUES (10u, "test1", 10), (20u, "test2", 11), (2147483647u, "test3", 12), (2147483640u, NULL, 13); - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), 
TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(prepareResult.IsSuccess(), prepareResult.GetIssues().ToString()); } { auto it = client.StreamExecuteQuery(R"( SELECT COUNT(*) FROM `/Root/DataShard`; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson( @@ -4216,14 +4217,14 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { { auto prepareResult = client.ExecuteQuery(R"( REPLACE INTO `/Root/DataShard2` SELECT * FROM `/Root/DataShard`; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(prepareResult.IsSuccess(), prepareResult.GetIssues().ToString()); } { auto it = client.StreamExecuteQuery(R"( SELECT COUNT(*) FROM `/Root/DataShard2`; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson( @@ -4236,14 +4237,14 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { REPLACE INTO `/Root/DataShard2` (Col1, Col2, Col3) VALUES (11u, "test1", 10), (21u, "test2", 11), (2147483646u, "test3", 12), (2147483641u, NULL, 13); SELECT COUNT(*) FROM `/Root/DataShard`; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), 
TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_C(prepareResult.IsSuccess(), prepareResult.GetIssues().ToString()); } { auto it = client.StreamExecuteQuery(R"( SELECT COUNT(*) FROM `/Root/DataShard2`; - )", NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync(); + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); TString output = StreamResultToYson(it); CompareYson( @@ -4252,6 +4253,109 @@ Y_UNIT_TEST_SUITE(KqpQueryService) { } } + Y_UNIT_TEST(TableSink_OltpInteractive) { + NKikimrConfig::TAppConfig appConfig; + appConfig.MutableTableServiceConfig()->SetEnableOlapSink(true); + appConfig.MutableTableServiceConfig()->SetEnableOltpSink(true); + auto settings = TKikimrSettings() + .SetAppConfig(appConfig) + .SetWithSampleTables(false); + TKikimrRunner kikimr(settings); + Tests::NCommon::TLoggerInit(kikimr).Initialize(); + + auto session = kikimr.GetTableClient().CreateSession().GetValueSync().GetSession(); + + const TString query = R"( + CREATE TABLE `/Root/DataShard` ( + Col1 Uint32 NOT NULL, + Col2 String, + Col3 Int32 NOT NULL, + PRIMARY KEY (Col1) + ) + WITH ( + AUTO_PARTITIONING_BY_SIZE = DISABLED, + AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 16, + AUTO_PARTITIONING_MAX_PARTITIONS_COUNT = 16, + UNIFORM_PARTITIONS = 16); + + CREATE TABLE `/Root/DataShard2` ( + Col1 Uint32 NOT NULL, + Col2 String, + Col3 Int32 NOT NULL, + PRIMARY KEY (Col1) + ) + WITH ( + AUTO_PARTITIONING_BY_SIZE = DISABLED, + AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 17, + AUTO_PARTITIONING_MAX_PARTITIONS_COUNT = 17, + UNIFORM_PARTITIONS = 17); + )"; + + auto result = session.ExecuteSchemeQuery(query).GetValueSync(); + UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString()); + + auto client = kikimr.GetQueryClient(); + auto 
session2 = client.GetSession().GetValueSync().GetSession(); + + auto tx = session2.BeginTransaction(NYdb::NQuery::TTxSettings::SerializableRW()) + .ExtractValueSync() + .GetTransaction(); + UNIT_ASSERT(tx.IsActive()); + { + auto prepareResult = session2.ExecuteQuery(R"( + REPLACE INTO `/Root/DataShard` (Col1, Col2, Col3) VALUES + (10u, "test1", 10), (20u, "test2", 11), (2147483647u, "test3", 12), (2147483640u, NULL, 13); + )", TTxControl::Tx(tx.GetId()), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); + UNIT_ASSERT_C(prepareResult.IsSuccess(), prepareResult.GetIssues().ToString()); + } + + { + auto prepareResult = session2.ExecuteQuery(R"( + REPLACE INTO `/Root/DataShard2` (Col1, Col2, Col3) VALUES + (11u, "test1", 10), (21u, "test2", 11), (2147483646u, "test3", 12), (2147483641u, NULL, 13); + )", TTxControl::Tx(tx.GetId()), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); + UNIT_ASSERT_C(prepareResult.IsSuccess(), prepareResult.GetIssues().ToString()); + } + + { + auto it = session2.StreamExecuteQuery(R"( + SELECT COUNT(*) FROM `/Root/DataShard`; + )", TTxControl::Tx(tx.GetId()), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(it.GetStatus(), EStatus::SUCCESS, it.GetIssues().ToString()); + TString output = StreamResultToYson(it); + CompareYson( + output, + R"([[4u]])"); + } + + { + auto prepareResult = session2.ExecuteQuery(R"( + SELECT * FROM `/Root/DataShard2`; + )", TTxControl::Tx(tx.GetId()), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); + UNIT_ASSERT_C(prepareResult.IsSuccess(), prepareResult.GetIssues().ToString()); + } + + { + auto prepareResult = session2.ExecuteQuery(R"( + REPLACE INTO `/Root/DataShard2` (Col1, Col2, Col3) VALUES + (11u, "test1", 10), (21u, "test2", 11), (2147483646u, "test3", 12), (2147483641u, NULL, 13); + )", TTxControl::Tx(tx.GetId()), 
TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); + UNIT_ASSERT_C(prepareResult.IsSuccess(), prepareResult.GetIssues().ToString()); + } + + { + auto commitResult = tx.Commit().ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(commitResult.GetStatus(), EStatus::SUCCESS, commitResult.GetIssues().ToString()); + } + + { + auto prepareResult = client.ExecuteQuery(R"( + REPLACE INTO `/Root/DataShard2` SELECT * FROM `/Root/DataShard`; + )", NYdb::NQuery::TTxControl::BeginTx().CommitTx(), TExecuteQuerySettings().ClientTimeout(TDuration::MilliSeconds(1000))).ExtractValueSync(); + UNIT_ASSERT_C(prepareResult.IsSuccess(), prepareResult.GetIssues().ToString()); + } + } + Y_UNIT_TEST(ReadDatashardAndColumnshard) { NKikimrConfig::TAppConfig appConfig; appConfig.MutableTableServiceConfig()->SetEnableOlapSink(true); diff --git a/ydb/core/kqp/ut/tx/kqp_sink_tx_ut.cpp b/ydb/core/kqp/ut/tx/kqp_sink_tx_ut.cpp index bc4f31996137..a780ea2f9260 100644 --- a/ydb/core/kqp/ut/tx/kqp_sink_tx_ut.cpp +++ b/ydb/core/kqp/ut/tx/kqp_sink_tx_ut.cpp @@ -144,11 +144,11 @@ Y_UNIT_TEST_SUITE(KqpSinkTx) { result = session.ExecuteQuery(Q_(R"( UPDATE `/Root/KV` SET Value = "third" WHERE Key = 4; )"), TTxControl::Tx(tx->GetId())).ExtractValueSync(); - UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::ABORTED, result.GetIssues().ToString()); + UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); auto commitResult = tx->Commit().ExtractValueSync(); - UNIT_ASSERT_VALUES_EQUAL_C(commitResult.GetStatus(), EStatus::NOT_FOUND, commitResult.GetIssues().ToString()); + UNIT_ASSERT_VALUES_EQUAL_C(commitResult.GetStatus(), EStatus::ABORTED, commitResult.GetIssues().ToString()); } }; @@ -177,6 +177,7 @@ Y_UNIT_TEST_SUITE(KqpSinkTx) { auto result = session.ExecuteQuery(Q_(R"( INSERT INTO `/Root/KV` (Key, Value) VALUES (1u, "New"); + SELECT COUNT(*) FROM `/Root/KV`; )"), TTxControl::Tx(tx.GetId())).ExtractValueSync(); 
result.GetIssues().PrintTo(Cerr); UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::BAD_REQUEST, result.GetIssues().ToString()); diff --git a/ydb/core/protos/kqp.proto b/ydb/core/protos/kqp.proto index 7fe58c104cd2..21b9bf8bae1e 100644 --- a/ydb/core/protos/kqp.proto +++ b/ydb/core/protos/kqp.proto @@ -695,13 +695,9 @@ message TEvKillScanTablet { message TEvKqpOutputActorResultInfo { repeated NKikimrDataEvents.TLock Locks = 1; + optional bool HasRead = 2; } -message TKqpTableSinkLocks { - repeated NKikimrDataEvents.TLock Locks = 1; - repeated uint64 SendingShards = 2; - repeated uint64 ReceivingShards = 3; -} message TKqpTableSinkSettings { enum EType { @@ -717,11 +713,11 @@ message TKqpTableSinkSettings { repeated TKqpColumnMetadataProto Columns = 5; optional uint64 LockTxId = 6; optional uint64 LockNodeId = 7; - optional bool FinalTx = 8; // If tx is immediate then commit, otherwise send prepare - optional bool ImmediateTx = 9; // Try to be immediate tx - optional bool InconsistentTx = 10; // Write each batch in it's own single immediate tx - optional EType Type = 11; - optional TKqpTableSinkLocks Locks = 12; + optional bool InconsistentTx = 8; // Write each batch in its own single immediate tx + optional EType Type = 9; + optional NActorsProto.TActorId BufferActorId = 10; + optional int64 Priority = 11; + optional bool IsOlap = 12; } message TKqpStreamLookupSettings {