diff --git a/CHANGELOG.md b/CHANGELOG.md index 7baae76ea43..bab54ca68e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## main / unreleased +* [ENHANCEMENT] Query-scheduler: Introduce `query-scheduler.use-multi-algorithm-query-queue`, which allows use of an experimental queue structure, with no change in external queue behavior. #7873 ### Grafana Mimir diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 8da2843a345..44cb63e701d 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -15909,6 +15909,17 @@ "fieldType": "boolean", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "use_multi_algorithm_query_queue", + "required": false, + "desc": "Use an experimental version of the query queue which has the same behavior as the existing queue, but integrates tenant selection into the tree model.", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldFlag": "query-scheduler.use-multi-algorithm-query-queue", + "fieldType": "boolean", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "querier_forget_delay", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 7a6f26ade4b..9357e2e29e2 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2317,6 +2317,8 @@ Usage of ./cmd/mimir/mimir: Backend storage to use for the ring. Supported values are: consul, etcd, inmemory, memberlist, multi. (default "memberlist") -query-scheduler.service-discovery-mode string [experimental] Service discovery mode that query-frontends and queriers use to find query-scheduler instances. When query-scheduler ring-based service discovery is enabled, this option needs be set on query-schedulers, query-frontends and queriers. Supported values are: dns, ring. (default "dns") + -query-scheduler.use-multi-algorithm-query-queue + [experimental] Use an experimental version of the query queue which has the same behavior as the existing queue, but integrates tenant selection into the tree model. -ruler-storage.azure.account-key string Azure storage account key. If unset, Azure managed identities will be used for authentication instead. -ruler-storage.azure.account-name string diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 5f114817834..34e8a50cda2 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -1708,6 +1708,12 @@ The `query_scheduler` block configures the query-scheduler. # CLI flag: -query-scheduler.additional-query-queue-dimensions-enabled [additional_query_queue_dimensions_enabled: | default = false] +# (experimental) Use an experimental version of the query queue which has the +# same behavior as the existing queue, but integrates tenant selection into the +# tree model. +# CLI flag: -query-scheduler.use-multi-algorithm-query-queue +[use_multi_algorithm_query_queue: | default = false] + # (experimental) If a querier disconnects without sending notification about # graceful shutdown, the query-scheduler will keep the querier in the tenant's # shard until the forget delay has passed. This feature is useful to reduce the diff --git a/pkg/frontend/v1/frontend.go b/pkg/frontend/v1/frontend.go index a804b95057c..82bfc98d1e1 100644 --- a/pkg/frontend/v1/frontend.go +++ b/pkg/frontend/v1/frontend.go @@ -128,6 +128,7 @@ func New(cfg Config, limits Limits, log log.Logger, registerer prometheus.Regist log, cfg.MaxOutstandingPerTenant, false, + false, cfg.QuerierForgetDelay, f.queueLength, f.discardedRequests, @@ -251,7 +252,7 @@ func (f *Frontend) Process(server frontendv1pb.Frontend_ProcessServer) error { /* We want to dequeue the next unexpired request from the chosen tenant queue. - The chance of choosing a particular tenant for dequeueing is (1/active_tenants). + The chance of choosing a particular tenant for dequeuing is (1/active_tenants). This is problematic under load, especially with other middleware enabled such as querier.split-by-interval, where one request may fan out into many. If expired requests aren't exhausted before checking another tenant, it would take diff --git a/pkg/scheduler/queue/multi_queuing_algorithm_tree_queue.go b/pkg/scheduler/queue/multi_queuing_algorithm_tree_queue.go new file mode 100644 index 00000000000..3a14944a23d --- /dev/null +++ b/pkg/scheduler/queue/multi_queuing_algorithm_tree_queue.go @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package queue + +import ( + "container/list" + "fmt" +) + +type QueuePath []string //nolint:revive // disallows types beginning with package name +type QueueIndex int //nolint:revive // disallows types beginning with package name + +const localQueueIndex = -1 + +type Tree interface { + EnqueueFrontByPath(QueuePath, any) error + EnqueueBackByPath(QueuePath, any) error + Dequeue() (QueuePath, any) + ItemCount() int + IsEmpty() bool +} + +// MultiQueuingAlgorithmTreeQueue holds metadata and a pointer to the root node of a hierarchical queue implementation. +// The root Node maintains a localQueue and an arbitrary number of child nodes (which themselves +// may have local queues and children). Each Node in MultiQueuingAlgorithmTreeQueue uses a QueuingAlgorithm (determined by +// node depth) to determine dequeue order of that Node's subtree. +// +// Each queuing dimension is modeled as a node in the tree, internally reachable through a QueuePath. +// +// The QueuePath is an ordered array of strings describing the path from the tree root to a Node. +// In addition to child Nodes, each Node contains a local queue (FIFO) of items. +// +// When dequeuing from a given node, a Node will use its QueuingAlgorithm to choose either itself +// or a child node to dequeue from recursively (i.e., a child Node will use its own QueuingAlgorithm +// to determine how to proceed). MultiQueuingAlgorithmTreeQueue will not dequeue from two different Nodes at the same depth +// consecutively, unless the previously-checked Node was empty down to the leaf node. +type MultiQueuingAlgorithmTreeQueue struct { + rootNode *Node + algosByDepth []QueuingAlgorithm +} + +func NewTree(queuingAlgorithms ...QueuingAlgorithm) (*MultiQueuingAlgorithmTreeQueue, error) { + if len(queuingAlgorithms) == 0 { + return nil, fmt.Errorf("cannot create a tree without defined QueuingAlgorithm") + } + root, err := newNode("root", 0, queuingAlgorithms[0]) + if err != nil { + return nil, err + } + root.depth = 0 + return &MultiQueuingAlgorithmTreeQueue{ + rootNode: root, + algosByDepth: queuingAlgorithms, + }, nil +} + +func (t *MultiQueuingAlgorithmTreeQueue) ItemCount() int { + return t.rootNode.ItemCount() +} + +func (t *MultiQueuingAlgorithmTreeQueue) IsEmpty() bool { + return t.rootNode.IsEmpty() +} + +// Dequeue removes and returns an item from the front of the next appropriate Node in the MultiQueuingAlgorithmTreeQueue, as +// well as the path to the Node which that item was dequeued from. +// +// Either the root/self node or a child node is chosen according to the Node's QueuingAlgorithm. If +// the root node is chosen, an item will be dequeued from the front of its localQueue. If a child +// node is chosen, it is recursively dequeued from until a node selects its localQueue. +// +// Nodes that empty down to the leaf after being dequeued from (or which are found to be empty leaf +// nodes during the dequeue operation) are deleted as the recursion returns up the stack. This +// maintains structural guarantees relied upon to make IsEmpty() non-recursive. +func (t *MultiQueuingAlgorithmTreeQueue) Dequeue() (QueuePath, any) { + path, v := t.rootNode.dequeue() + // The returned node dequeue path includes the root node; exclude + // this so that the return path can be used if needed to enqueue. + return path[1:], v +} + +// EnqueueBackByPath enqueues an item in the back of the local queue of the node +// located at a given path through the tree; nodes for the path are created as needed. +// +// path is relative to the root node; providing a QueuePath beginning with "root" +// will create a child node of the root node which is also named "root." +func (t *MultiQueuingAlgorithmTreeQueue) EnqueueBackByPath(path QueuePath, v any) error { + return t.rootNode.enqueueBackByPath(t, path, v) +} + +// EnqueueFrontByPath enqueues an item in the front of the local queue of the Node +// located at a given path through the MultiQueuingAlgorithmTreeQueue; nodes for the path are created as needed. +// +// Enqueueing to the front is intended only for items which were first enqueued to the back +// and then dequeued after reaching the front. +// +// Re-enqueueing to the front is only intended for use in cases where a queue consumer +// fails to complete operations on the dequeued item, but failure is not yet final, and the +// operations should be retried by a subsequent queue consumer. A concrete example is when +// a queue consumer fails or disconnects for unrelated reasons while we are in the process +// of dequeuing a request for it. +// +// path must be relative to the root node; providing a QueuePath beginning with "root" +// will create a child node of root which is also named "root." +func (t *MultiQueuingAlgorithmTreeQueue) EnqueueFrontByPath(path QueuePath, v any) error { + return t.rootNode.enqueueFrontByPath(t, path, v) +} + +func (t *MultiQueuingAlgorithmTreeQueue) GetNode(path QueuePath) *Node { + return t.rootNode.getNode(path) +} + +// Node maintains node-specific information used to enqueue and dequeue to itself, such as a local +// queue, node depth, references to its children, and position in queue. +// Note that the tenantQuerierAssignments QueuingAlgorithm largely disregards Node's queueOrder and +// queuePosition, managing analogous state instead, because shuffle-sharding + fairness requirements +// necessitate input from the querier. +type Node struct { + name string + localQueue *list.List + queuePosition int // next index in queueOrder to dequeue from + queueOrder []string // order for dequeuing from self/children + queueMap map[string]*Node + depth int + queuingAlgorithm QueuingAlgorithm + childrenChecked int +} + +func newNode(name string, depth int, da QueuingAlgorithm) (*Node, error) { + if da == nil { + return nil, fmt.Errorf("cannot create a node without a defined QueuingAlgorithm") + } + return &Node{ + name: name, + localQueue: list.New(), + queuePosition: localQueueIndex, + queueOrder: make([]string, 0), + queueMap: make(map[string]*Node, 1), + depth: depth, + queuingAlgorithm: da, + }, nil +} + +func (n *Node) IsEmpty() bool { + // avoid recursion to make this a cheap operation + // + // Because we dereference empty child nodes during dequeuing, + // we assume that emptiness means there are no child nodes + // and nothing in this tree node's local queue. + // + // In reality a package member could attach empty child queues with getOrAddNode + // in order to get a functionally-empty tree that would report false for IsEmpty. + // We assume this does not occur or is not relevant during normal operation. + return n.localQueue.Len() == 0 && len(n.queueMap) == 0 +} + +// ItemCount counts the queue items in the Node and in all its children, recursively. +func (n *Node) ItemCount() int { + items := n.localQueue.Len() + for _, child := range n.queueMap { + items += child.ItemCount() + } + return items +} + +func (n *Node) Name() string { + return n.name +} + +func (n *Node) getLocalQueue() *list.List { + return n.localQueue +} + +func (n *Node) enqueueFrontByPath(tree *MultiQueuingAlgorithmTreeQueue, pathFromNode QueuePath, v any) error { + childNode, err := n.getOrAddNode(pathFromNode, tree) + if err != nil { + return err + } + childNode.localQueue.PushFront(v) + return nil +} + +func (n *Node) enqueueBackByPath(tree *MultiQueuingAlgorithmTreeQueue, pathFromNode QueuePath, v any) error { + childNode, err := n.getOrAddNode(pathFromNode, tree) + if err != nil { + return err + } + childNode.localQueue.PushBack(v) + return nil +} + +func (n *Node) dequeue() (QueuePath, any) { + var v any + var childPath QueuePath + + path := QueuePath{n.name} + + if n.IsEmpty() { + return path, nil + } + + var checkedAllNodes bool + var dequeueNode *Node + // continue until we've found a value or checked all nodes that need checking + for v == nil && !checkedAllNodes { + dequeueNode, checkedAllNodes = n.queuingAlgorithm.dequeueSelectNode(n) + switch dequeueNode { + // dequeuing from local queue + case n: + if n.localQueue.Len() > 0 { + // dequeueNode is self, local queue non-empty + if elt := n.localQueue.Front(); elt != nil { + n.localQueue.Remove(elt) + v = elt.Value + } + } + // no dequeue-able child found; break out of the loop, + // since we won't find anything to dequeue if we don't + // have a node to dequeue from now + case nil: + checkedAllNodes = true + // dequeue from a child + default: + childPath, v = dequeueNode.dequeue() + } + + if v == nil { + n.childrenChecked++ + } + + n.queuingAlgorithm.dequeueUpdateState(n, dequeueNode) + } + // reset childrenChecked to 0 before completing this dequeue + n.childrenChecked = 0 + return append(path, childPath...), v +} + +func (n *Node) getNode(pathFromNode QueuePath) *Node { + if len(pathFromNode) == 0 { + return n + } + + if n.queueMap == nil { + return nil + } + + if childQueue, ok := n.queueMap[pathFromNode[0]]; ok { + return childQueue.getNode(pathFromNode[1:]) + } + + // no child node matches next path segment + return nil +} + +// getOrAddNode recursively gets or adds tree queue nodes based on given relative child path. It +// checks whether the first node in pathFromNode exists in the Node's children; if no node exists, +// one is created and added to the Node's queueOrder, according to the Node's QueuingAlgorithm. +// +// pathFromNode must be relative to the receiver node; providing a QueuePath beginning with +// the receiver/parent node name will create a child node of the same name as the parent. +func (n *Node) getOrAddNode(pathFromNode QueuePath, tree *MultiQueuingAlgorithmTreeQueue) (*Node, error) { + if len(pathFromNode) == 0 { + return n, nil + } + + var childNode *Node + var ok bool + var err error + if childNode, ok = n.queueMap[pathFromNode[0]]; !ok { + // child does not exist, create it + if n.depth+1 >= len(tree.algosByDepth) { + return nil, fmt.Errorf("cannot add a node beyond max tree depth: %v", len(tree.algosByDepth)) + } + childNode, err = newNode(pathFromNode[0], n.depth+1, tree.algosByDepth[n.depth+1]) + if err != nil { + return nil, err + } + // add the newly created child to the node + n.queuingAlgorithm.addChildNode(n, childNode) + + } + return childNode.getOrAddNode(pathFromNode[1:], tree) +} diff --git a/pkg/scheduler/queue/multi_queuing_algorithm_tree_queue_test.go b/pkg/scheduler/queue/multi_queuing_algorithm_tree_queue_test.go new file mode 100644 index 00000000000..a37faf8c112 --- /dev/null +++ b/pkg/scheduler/queue/multi_queuing_algorithm_tree_queue_test.go @@ -0,0 +1,976 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package queue + +import ( + "fmt" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +// enqueueObj is intended for use in tests; it represents an obj to enqueue to path, or an expected path and obj +// for a given dequeue +type enqueueObj struct { + obj any + path QueuePath +} + +func newTenantQuerierAssignments() *tenantQuerierAssignments { + return &tenantQuerierAssignments{ + tenantQuerierIDs: make(map[TenantID]map[QuerierID]struct{}), + tenantNodes: make(map[string][]*Node), + } +} + +func Test_NewTree(t *testing.T) { + tests := []struct { + name string + treeAlgos []QueuingAlgorithm + state *tenantQuerierAssignments + expectErr bool + }{ + { + name: "create round-robin tree", + treeAlgos: []QueuingAlgorithm{&roundRobinState{}}, + }, + { + name: "create tenant-querier tree", + treeAlgos: []QueuingAlgorithm{&tenantQuerierAssignments{}}, + }, + { + name: "fail to create tree without defined dequeuing algorithm", + treeAlgos: []QueuingAlgorithm{nil}, + expectErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := NewTree(tt.treeAlgos...) + if tt.expectErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + + }) + } +} + +func Test_EnqueueBackByPath(t *testing.T) { + tests := []struct { + name string + treeAlgosByDepth []QueuingAlgorithm + childPathsToEnqueue []QueuePath + expectErr bool + }{ + { + name: "enqueue round-robin node to round-robin node", + treeAlgosByDepth: []QueuingAlgorithm{&roundRobinState{}, &roundRobinState{}}, + childPathsToEnqueue: []QueuePath{{"child-1"}}, + }, + { + name: "enqueue tenant-querier node to round-robin node", + treeAlgosByDepth: []QueuingAlgorithm{&roundRobinState{}, &tenantQuerierAssignments{tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{}}}, + childPathsToEnqueue: []QueuePath{{"child-1"}, {"child-2"}}, + }, + { + name: "enqueue round-robin node to tenant-querier node", + treeAlgosByDepth: []QueuingAlgorithm{ + newTenantQuerierAssignments(), + &roundRobinState{}, + }, + childPathsToEnqueue: []QueuePath{{"child-1"}, {"child-2"}}, + }, + { + name: "enqueue tenant-querier node to tenant-querier node", + treeAlgosByDepth: []QueuingAlgorithm{ + newTenantQuerierAssignments(), + newTenantQuerierAssignments(), + }, + childPathsToEnqueue: []QueuePath{{"child-1"}}, + }, + { + name: "enqueue grandchildren to a tree with max-depth of 2", + treeAlgosByDepth: []QueuingAlgorithm{ + newTenantQuerierAssignments(), + &roundRobinState{}, + newTenantQuerierAssignments(), + }, + childPathsToEnqueue: []QueuePath{{"child"}, {"grandchild"}}, + }, + { + name: "enqueue beyond max-depth", + treeAlgosByDepth: []QueuingAlgorithm{&roundRobinState{}}, + childPathsToEnqueue: []QueuePath{{"child"}, {"child, grandchild"}}, + expectErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tree, err := NewTree(tt.treeAlgosByDepth...) + require.NoError(t, err) + + for _, childPath := range tt.childPathsToEnqueue { + err = tree.EnqueueBackByPath(childPath, "some-object") + } + if tt.expectErr { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, len(tt.childPathsToEnqueue), tree.ItemCount()) + } + }) + } +} + +func Test_EnqueueFrontByPath(t *testing.T) { + someQuerier := QuerierID("placeholder") + tests := []struct { + name string + treeAlgosByDepth []QueuingAlgorithm + enqueueObjs []enqueueObj + expected []any + }{ + { + name: "enqueue to front of round-robin node", + treeAlgosByDepth: []QueuingAlgorithm{&roundRobinState{}}, + enqueueObjs: []enqueueObj{ + {"query-1", QueuePath{}}, + {"query-2", QueuePath{}}, + }, + expected: []any{"query-2", "query-1"}, + }, + { + name: "enqueue to front of tenant-querier node", + treeAlgosByDepth: []QueuingAlgorithm{&tenantQuerierAssignments{ + currentQuerier: someQuerier, + }}, + enqueueObjs: []enqueueObj{ + {"query-1", QueuePath{}}, + {"query-2", QueuePath{}}, + }, + expected: []any{"query-2", "query-1"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tree, err := NewTree(tt.treeAlgosByDepth...) + require.NoError(t, err) + + for _, o := range tt.enqueueObjs { + err = tree.EnqueueFrontByPath(o.path, o.obj) + } + require.NoError(t, err) + + for _, expectedVal := range tt.expected { + _, v := tree.Dequeue() + require.Equal(t, expectedVal, v) + } + _, v := tree.Dequeue() + require.Nil(t, v) + }) + } +} + +func Test_Dequeue_RootNode(t *testing.T) { + tests := []struct { + name string + rootAlgo QueuingAlgorithm + enqueueToRoot []any + }{ + { + name: "dequeue from empty round-robin root node", + rootAlgo: &roundRobinState{}, + }, + { + name: "dequeue from empty tenant-querier root node", + rootAlgo: newTenantQuerierAssignments(), + }, + { + name: "dequeue from non-empty round-robin root node", + rootAlgo: &roundRobinState{}, + enqueueToRoot: []any{"something-in-root"}, + }, + { + name: "dequeue from non-empty tenant-querier root node", + rootAlgo: newTenantQuerierAssignments(), + enqueueToRoot: []any{"something-else-in-root"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tqa, ok := tt.rootAlgo.(*tenantQuerierAssignments); ok { + tqa.updateQueuingAlgorithmState("placeholder", localQueueIndex) + } + tree, err := NewTree(tt.rootAlgo) + require.NoError(t, err) + + path := QueuePath{} + for _, elt := range tt.enqueueToRoot { + err = tree.EnqueueBackByPath(path, elt) + require.NoError(t, err) + } + + for _, elt := range tt.enqueueToRoot { + dequeuePath, v := tree.Dequeue() + require.Equal(t, path, dequeuePath) + require.Equal(t, elt, v) + + } + + dequeuePath, v := tree.Dequeue() + require.Equal(t, path, dequeuePath) + require.Nil(t, v) + + }) + } +} + +func Test_RoundRobinDequeue(t *testing.T) { + tests := []struct { + name string + selfQueueObjects []string + children []string + childQueueObjects map[string][]any + grandchildren []string + grandchildrenQueueObjects map[string]struct { + path QueuePath + objs []any + } + expected []string + }{ + { + name: "dequeue from round-robin child when local queue empty", + children: []string{"child-1"}, + childQueueObjects: map[string][]any{"child-1": {"child-1:some-object"}}, + expected: []string{"child-1:some-object"}, + }, + { + name: "dequeue from round-robin root when on node's turn", + selfQueueObjects: []string{"root:object-1", "root:object-2"}, + children: []string{"child-1"}, + childQueueObjects: map[string][]any{"child-1": {"child-1:object-1"}}, + expected: []string{"root:object-1", "child-1:object-1"}, + }, + { + name: "dequeue from second round-robin child when first child is empty", + children: []string{"child-1", "child-2"}, + childQueueObjects: map[string][]any{"child-1": {nil}, "child-2": {"child-2:some-object"}}, + expected: []string{"child-2:some-object"}, + }, + { + name: "dequeue from round-robin grandchild when non-empty", + children: []string{"child-1", "child-2"}, + childQueueObjects: map[string][]any{"child-1": {"child-1:object-1", "child-1:object-2"}, "child-2": {"child-2:object-1"}}, + expected: []string{"child-1:object-1", "child-2:object-1", "grandchild-1:object-1"}, + grandchildren: []string{"grandchild-1"}, + grandchildrenQueueObjects: map[string]struct { + path QueuePath + objs []any + }{"grandchild-1": { + path: QueuePath{"child-1", "grandchild-1"}, + objs: []any{"grandchild-1:object-1"}, + }}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tree, err := NewTree(&roundRobinState{}, &roundRobinState{}, &roundRobinState{}) + require.NoError(t, err) + + for _, sqo := range tt.selfQueueObjects { + err = tree.EnqueueBackByPath(QueuePath{}, sqo) + require.NoError(t, err) + } + + for _, child := range tt.children { + for _, obj := range tt.childQueueObjects[child] { + _ = tree.EnqueueBackByPath(QueuePath{child}, obj) + } + } + + for _, grandchild := range tt.grandchildren { + gqo := tt.grandchildrenQueueObjects[grandchild] + for _, obj := range gqo.objs { + err = tree.EnqueueBackByPath(gqo.path, obj) + require.NoError(t, err) + } + } + + for _, expected := range tt.expected { + _, val := tree.Dequeue() + v, ok := val.(string) + require.True(t, ok) + require.Equal(t, expected, v) + } + }) + } +} + +func Test_DequeueOrderAfterEnqueue(t *testing.T) { + type opType string + enqueue := opType("enqueue") + dequeue := opType("dequeue") + placeholderQuerier := QuerierID("some-querier") + + type op struct { + kind opType + path QueuePath + obj any + } + + tests := []struct { + name string + treeAlgosByDepth []QueuingAlgorithm + operationOrder []op + }{ + { + name: "round-robin node should dequeue from first child one more time after new node added", + treeAlgosByDepth: []QueuingAlgorithm{&roundRobinState{}, &roundRobinState{}}, + operationOrder: []op{ + {enqueue, QueuePath{"child-1"}, "obj-1"}, + {enqueue, QueuePath{"child-1"}, "obj-2"}, + {dequeue, QueuePath{"child-1"}, "obj-1"}, + {enqueue, QueuePath{"child-2"}, "obj-3"}, + {dequeue, QueuePath{"child-1"}, "obj-2"}, + {dequeue, QueuePath{"child-2"}, "obj-3"}, + {dequeue, QueuePath{}, nil}, + }, + }, + { + name: "should dequeue from new tenant-querier child before repeat-dequeueing", + treeAlgosByDepth: []QueuingAlgorithm{ + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{}, + tenantNodes: map[string][]*Node{}, + currentQuerier: placeholderQuerier, + }, + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{}, + tenantNodes: map[string][]*Node{}, + currentQuerier: placeholderQuerier, + }, + }, + operationOrder: []op{ + {enqueue, QueuePath{"child-1"}, "obj-1"}, + {enqueue, QueuePath{"child-1"}, "obj-2"}, + {dequeue, QueuePath{"child-1"}, "obj-1"}, + {enqueue, QueuePath{"child-2"}, "obj-3"}, + {dequeue, QueuePath{"child-2"}, "obj-3"}, + {dequeue, QueuePath{"child-1"}, "obj-2"}, + {dequeue, QueuePath{}, nil}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tree, err := NewTree(tt.treeAlgosByDepth...) + require.NoError(t, err) + + for _, operation := range tt.operationOrder { + if operation.kind == enqueue { + err = tree.EnqueueBackByPath(operation.path, operation.obj) + require.NoError(t, err) + } + if operation.kind == dequeue { + path, obj := tree.Dequeue() + require.Equal(t, operation.path, path) + require.Equal(t, operation.obj, obj) + } + } + }) + } +} + +func Test_TenantQuerierAssignmentsDequeue(t *testing.T) { + tests := []struct { + name string + treeAlgosByDepth []QueuingAlgorithm + currQuerier []QuerierID + enqueueObjs []enqueueObj + expected []any + expectErr bool + }{ + { + name: "happy path - tenant found in tenant-querier map under first child", + treeAlgosByDepth: []QueuingAlgorithm{ + &roundRobinState{}, + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-1": {}}}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + }, + currQuerier: []QuerierID{"querier-1"}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"query-component-1", "tenant-1"}}, + }, + expected: []any{"query-1"}, + }, + { + name: "tenant exists, but not for querier", + treeAlgosByDepth: []QueuingAlgorithm{ + &roundRobinState{}, + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-2": {}}}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + }, + currQuerier: []QuerierID{"querier-1"}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"query-component-1", "tenant-1"}}, + }, + expected: []any{nil}, + }, + { + name: "1 of 3 tenants exist for querier", + treeAlgosByDepth: []QueuingAlgorithm{ + &roundRobinState{}, + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-2": {}}, "tenant-2": {"querier-2": {}}, "tenant-3": {"querier-1": {}}}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + }, + currQuerier: []QuerierID{"querier-1", "querier-1"}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"query-component-1", "tenant-1"}}, + {obj: "query-2", path: QueuePath{"query-component-1", "tenant-2"}}, + {obj: "query-3", path: QueuePath{"query-component-1", "tenant-3"}}, + }, + expected: []any{"query-3", nil}, + }, + { + name: "tenant exists for querier on next parent node", + treeAlgosByDepth: []QueuingAlgorithm{ + &roundRobinState{}, + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-2": {}}, "tenant-2": {"querier-2": {}}, "tenant-3": {"querier-1": {}}}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + }, + currQuerier: []QuerierID{"querier-1", "querier-1", "querier-1"}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"query-component-1", "tenant-1"}}, + {obj: "query-2", path: QueuePath{"query-component-1", "tenant-2"}}, + {obj: "query-3", path: QueuePath{"query-component-2", "tenant-3"}}, + {obj: "query-4", path: QueuePath{"query-component-1", "tenant-3"}}, + }, + expected: []any{"query-4", "query-3", nil}, + }, + { + name: "2 of 3 tenants exist for querier", + treeAlgosByDepth: []QueuingAlgorithm{ + &roundRobinState{}, + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-1": {}}, "tenant-2": {"querier-2": {}}, "tenant-3": {"querier-1": {}}}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + }, + currQuerier: []QuerierID{"querier-1", "querier-1", "querier-1"}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"query-component-1", "tenant-1"}}, + {obj: "query-2", path: QueuePath{"query-component-1", "tenant-2"}}, + {obj: "query-3", path: QueuePath{"query-component-2", "tenant-3"}}, + }, + expected: []any{"query-1", "query-3", nil}, + }, + { + name: "root node is tenant-querier node", + treeAlgosByDepth: []QueuingAlgorithm{ + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-1": {}}, "tenant-2": {}}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + &roundRobinState{}, + }, + currQuerier: []QuerierID{"querier-1", "querier-1", "querier-1"}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"tenant-2", "query-component-1"}}, + {obj: "query-2", path: QueuePath{"tenant-1", "query-component-2"}}, + {obj: "query-3", path: QueuePath{"tenant-1", "query-component-1"}}, + }, + expected: []any{"query-2", "query-3", nil}, + }, + { + name: "dequeuing for one querier returns nil, but does return for a different querier", + treeAlgosByDepth: []QueuingAlgorithm{ + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-1": {}}}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + &roundRobinState{}, + }, + currQuerier: []QuerierID{"querier-2", "querier-1"}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"tenant-1", "query-component-1"}}, + }, + expected: []any{nil, "query-1"}, + }, + { + name: "no querier set in state", + treeAlgosByDepth: []QueuingAlgorithm{ + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-1": {}}}, + tenantNodes: map[string][]*Node{}, + }, + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-1": {}}}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + }, + currQuerier: []QuerierID{""}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"tenant-1", "query-component-1"}}, + }, + expected: []any{nil}, + }, + { + // This also dequeues if the tenant _is not_ in the tenant querier map; is this expected? (probably) + name: "dequeue from a tenant with a nil tenant-querier map", + treeAlgosByDepth: []QueuingAlgorithm{ + &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{"tenant-1": {"querier-1": {}}, "tenant-2": nil}, + tenantNodes: map[string][]*Node{}, + }, + &roundRobinState{}, + &roundRobinState{}, + }, + currQuerier: []QuerierID{"querier-1", "querier-1", "querier-1"}, + enqueueObjs: []enqueueObj{ + {obj: "query-1", path: QueuePath{"tenant-1", "query-component-1"}}, + {obj: "query-2", path: QueuePath{"tenant-2", "query-component-1"}}, + {obj: "query-3", path: QueuePath{"tenant-3", "query-component-1"}}, + }, + expected: []any{"query-1", "query-2", "query-3"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tqas := make([]*tenantQuerierAssignments, 0) + for _, da := range tt.treeAlgosByDepth { + if tqa, ok := da.(*tenantQuerierAssignments); ok { + tqas = append(tqas, tqa) + } + } + + tree, err := NewTree(tt.treeAlgosByDepth...) + require.NoError(t, err) + + for _, o := range tt.enqueueObjs { + err = tree.EnqueueBackByPath(o.path, o.obj) + require.NoError(t, err) + } + // currQuerier at position i is used to dequeue the expected result at position i + require.Equal(t, len(tt.currQuerier), len(tt.expected)) + for i := 0; i < len(tt.expected); i++ { + for _, tqa := range tqas { + tqa.updateQueuingAlgorithmState(tt.currQuerier[i], i-1) + } + _, v := tree.Dequeue() + require.Equal(t, tt.expected[i], v) + } + }) + } + +} + +// Test_ChangeTenantQuerierAssignments illustrates that we can update a state in tenantQuerierAssignments, +// and the tree dequeue behavior will adjust accordingly. +func Test_ChangeTenantQuerierAssignments(t *testing.T) { + tqa := &tenantQuerierAssignments{ + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{ + "tenant-1": {"querier-1": {}}, + "tenant-2": {"querier-2": {}}, + }, + tenantNodes: map[string][]*Node{}, + currentQuerier: QuerierID(""), + } + + tree, err := NewTree(tqa, &roundRobinState{}, &roundRobinState{}) + require.NoError(t, err) + + enqueueObjs := []enqueueObj{ + {"query-1", QueuePath{"tenant-1", "query-component-1"}}, + {"query-2", QueuePath{"tenant-2", "query-component-1"}}, + {"query-3", QueuePath{"tenant-2", "query-component-1"}}, + {"query-4", QueuePath{"tenant-2", "query-component-1"}}, + {"query-5", QueuePath{"tenant-3", "query-component-1"}}, + } + + for _, eo := range enqueueObjs { + err = tree.EnqueueBackByPath(eo.path, eo.obj) + require.NoError(t, err) + } + + querier1 := QuerierID("querier-1") + querier2 := QuerierID("querier-2") + querier3 := QuerierID("querier-3") + + // set state to querier-2 should dequeue query-2 + tqa.updateQueuingAlgorithmState(querier2, -1) + _, v := tree.Dequeue() + require.Equal(t, "query-2", v) + + // update tqa to querier-1 should dequeue query-1 + tqa.updateQueuingAlgorithmState(querier1, -1) + _, v = tree.Dequeue() + require.Equal(t, "query-1", v) + + // update tqa map to add querier-3 as assigned to tenant-2, then set tqa to querier-3 should dequeue query-3 + tqa.tenantQuerierIDs["tenant-2"]["querier-3"] = struct{}{} + tqa.updateQueuingAlgorithmState(querier3, -1) + _, v = tree.Dequeue() + require.Equal(t, "query-3", v) + + // during reshuffle, we only ever reassign tenant values, we don't assign an entirely new map value + // to tenantQuerierIDs. Reassign tenant-2 to an empty map value, and query-5 (tenant-3), which can be handled + // by any querier, should be dequeued, + tqa.tenantQuerierIDs["tenant-2"] = map[QuerierID]struct{}{} + _, v = tree.Dequeue() + require.Equal(t, "query-5", v) + + // then we should not be able to dequeue query-4 + tqa.tenantQuerierIDs["tenant-2"] = map[QuerierID]struct{}{} + _, v = tree.Dequeue() + require.Nil(t, v) + +} + +// Test_DequeueBalancedRoundRobinTree checks dequeuing behavior from a balanced round-robin tree. +// +// Dequeuing from a balanced tree allows the test to have a simple looped structures +// while running checks to ensure that round-robin order is respected. +func Test_DequeueBalancedRoundRobinTree(t *testing.T) { + firstDimensions := []string{"0", "1", "2"} + secondDimensions := []string{"a", "b", "c"} + itemsPerDimension := 5 + tree := makeBalancedRoundRobinTree(t, firstDimensions, secondDimensions, itemsPerDimension) + require.NotNil(t, tree) + + count := 0 + + // MultiQueuingAlgorithmTreeQueue will fairly dequeue from all levels of the tree + rotationsBeforeRepeat := len(firstDimensions) * len(secondDimensions) + // track dequeued paths to ensure round-robin dequeuing does not repeat before expected + dequeuedPathCache := make([]QueuePath, rotationsBeforeRepeat) + + for !tree.IsEmpty() { + dequeuedPath, _ := tree.Dequeue() + + // require dequeued path has not repeated before the expected number of rotations + require.NotContains(t, dequeuedPathCache, dequeuedPath) + + dequeuedPathCache = append(dequeuedPathCache[1:], dequeuedPath) + count++ + } + + // count items enqueued to nodes at depth 1 + expectedFirstDimensionCount := len(firstDimensions) * itemsPerDimension + // count items enqueued to nodes at depth 2 + expectedSecondDimensionCount := len(firstDimensions) * len(secondDimensions) * itemsPerDimension + + require.Equal(t, expectedFirstDimensionCount+expectedSecondDimensionCount, count) + +} + +// Test_DequeueUnbalancedRoundRobinTree checks dequeuing behavior from an unbalanced tree. +// +// Assertions are done one by one to illustrate and check the behaviors of dequeuing from +// an unbalanced tree, where the same node will be dequeued from twice if the node remains +// nonempty while its sibling nodes have been exhausted and deleted from the tree. +func Test_DequeueUnbalancedRoundRobinTree(t *testing.T) { + tree := makeUnbalancedRoundRobinTree(t) + + // dequeue from root until exhausted + _, v := tree.Dequeue() + require.Equal(t, "root:0:val0", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:1:val0", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:2:0:val0", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:1:0:val0", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:2:1:val0", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:1:val1", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:2:0:val1", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:1:0:val1", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:2:1:val1", v) + + _, v = tree.Dequeue() + require.Equal(t, "root:2:1:val2", v) + + // all items have been dequeued + require.Equal(t, 0, tree.rootNode.ItemCount()) + require.Equal(t, 1, tree.rootNode.nodeCount()) + + // require nothing in local or child queues + require.True(t, tree.IsEmpty()) +} + +func Test_EnqueueDuringDequeueRespectsRoundRobin(t *testing.T) { + tree, err := NewTree(&roundRobinState{}, &roundRobinState{}, &roundRobinState{}) + require.NoError(t, err) + require.NotNil(t, tree) + + root := tree.rootNode + + cache := map[string]struct{}{} + + // enqueue two items to path root:0 + childPath := QueuePath{"0"} + item := makeItemForChildQueue(root, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + item = makeItemForChildQueue(root, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + + // enqueue one item to path root:1 + childPath = QueuePath{"1"} + item = makeItemForChildQueue(root, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + + // enqueue two items to path root:2 + childPath = QueuePath{"2"} + item = makeItemForChildQueue(root, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + item = makeItemForChildQueue(root, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + + require.Equal(t, []string{"0", "1", "2"}, root.queueOrder) + + // dequeue first item + dequeuedPath, _ := tree.Dequeue() + require.Equal(t, QueuePath{"0"}, dequeuedPath) + + // dequeue second item; root:1 is now exhausted and deleted + dequeuedPath, _ = tree.Dequeue() + require.Equal(t, QueuePath{"1"}, dequeuedPath) + require.Nil(t, root.getNode(QueuePath{"1"})) + require.Equal(t, []string{"0", "2"}, root.queueOrder) + + // dequeue third item + dequeuedPath, _ = tree.Dequeue() + require.Equal(t, QueuePath{"2"}, dequeuedPath) + + // root:1 was previously exhausted; root:0, then root:2 will be next in the rotation + // here we insert something new into root:1 to test that it + // does not jump the line in front of root:0 or root:2 + item = makeItemForChildQueue(root, QueuePath{"1"}, cache) + require.NoError(t, tree.EnqueueBackByPath(QueuePath{"1"}, item)) + require.NotNil(t, root.getNode(QueuePath{"1"})) + require.Equal(t, []string{"0", "2", "1"}, root.queueOrder) + + // dequeue fourth item; the newly-enqueued root:1 item + // has not jumped the line in front of root:0 + dequeuedPath, _ = tree.Dequeue() + require.Equal(t, QueuePath{"0"}, dequeuedPath) + + // dequeue fifth item; the newly-enqueued root:1 item + // has not jumped the line in front of root:2 + dequeuedPath, _ = tree.Dequeue() + require.Equal(t, QueuePath{"2"}, dequeuedPath) + + // dequeue sixth item; verifying the order 0->2->1 is being followed + dequeuedPath, _ = tree.Dequeue() + require.Equal(t, QueuePath{"1"}, dequeuedPath) + + // all items have been dequeued + require.Equal(t, 0, root.ItemCount()) + require.Equal(t, 1, root.nodeCount()) + + // require nothing in local or child queues + require.True(t, tree.IsEmpty()) +} + +// Test_NodeCannotDeleteItself creates an empty node, dequeues from it, and ensures that the node still exists +// and has not deleted itself. +func Test_NodeCannotDeleteItself(t *testing.T) { + tests := []struct { + name string + nodeType QueuingAlgorithm + }{ + {"round robin", &roundRobinState{}}, + {"tenant querier assignment", &tenantQuerierAssignments{}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tree, err := NewTree(tt.nodeType) + require.NoError(t, err) + require.NotNil(t, tree) + + _, _ = tree.Dequeue() + + require.NotNil(t, tree.rootNode) + require.Zero(t, tree.rootNode.getLocalQueue().Len()) + require.Empty(t, tree.rootNode.queueMap) + }) + } +} + +func makeBalancedRoundRobinTree(t *testing.T, firstDimensions, secondDimensions []string, itemsPerDimension int) *MultiQueuingAlgorithmTreeQueue { + tree, err := NewTree(&roundRobinState{}, &roundRobinState{}, &roundRobinState{}) + require.NoError(t, err) + require.Equal(t, 1, tree.rootNode.nodeCount()) + require.Equal(t, 0, tree.rootNode.ItemCount()) + + cache := map[string]struct{}{} + + for _, firstDimName := range firstDimensions { + for k := 0; k < itemsPerDimension; k++ { + childPath := QueuePath{firstDimName} + item := makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + } + for _, secondDimName := range secondDimensions { + for k := 0; k < itemsPerDimension; k++ { + childPath := QueuePath{firstDimName, secondDimName} + item := makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + } + } + } + return tree +} + +func makeUnbalancedRoundRobinTree(t *testing.T) *MultiQueuingAlgorithmTreeQueue { + /* + root + ├── child0 + │ └── localQueue + │ └── val0 + ├── child1 + │ ├── child0 + │ │ └── localQueue + │ │ ├── val0 + │ │ └── val1 + │ └── localQueue + │ ├── val0 + │ └── val1 + ├── child2 + │ ├── child0 + │ │ └── localQueue + │ │ ├── val0 + │ │ └── val1 + │ ├── child1 + │ │ └── localQueue + │ │ ├── val0 + │ │ ├── val1 + │ │ └── val2 + │ └── localQueue + └── localQueue + */ + tree, err := NewTree(&roundRobinState{}, &roundRobinState{}, &roundRobinState{}) + require.NoError(t, err) + require.Equal(t, 1, tree.rootNode.nodeCount()) + require.Equal(t, 0, tree.rootNode.ItemCount()) + + cache := map[string]struct{}{} + + // enqueue one item to root:0 + childPath := QueuePath{"0"} + item := makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 2, tree.rootNode.nodeCount()) + require.Equal(t, 1, tree.rootNode.ItemCount()) + + // enqueue two items to root:1 + childPath = QueuePath{"1"} + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 3, tree.rootNode.nodeCount()) + require.Equal(t, 2, tree.rootNode.ItemCount()) + + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 3, tree.rootNode.nodeCount()) + require.Equal(t, 3, tree.rootNode.ItemCount()) + + // enqueue two items to root:1:0 + childPath = QueuePath{"1", "0"} + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 4, tree.rootNode.nodeCount()) + require.Equal(t, 4, tree.rootNode.ItemCount()) + + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 4, tree.rootNode.nodeCount()) + require.Equal(t, 5, tree.rootNode.ItemCount()) + + // enqueue two items to root:2:0 + childPath = QueuePath{"2", "0"} + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 6, tree.rootNode.nodeCount()) + require.Equal(t, 6, tree.rootNode.ItemCount()) + + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 6, tree.rootNode.nodeCount()) + require.Equal(t, 7, tree.rootNode.ItemCount()) + + // enqueue three items to root:2:1 + childPath = QueuePath{"2", "1"} + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 7, tree.rootNode.nodeCount()) + require.Equal(t, 8, tree.rootNode.ItemCount()) + + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 7, tree.rootNode.nodeCount()) + require.Equal(t, 9, tree.rootNode.ItemCount()) + + item = makeItemForChildQueue(tree.rootNode, childPath, cache) + require.NoError(t, tree.EnqueueBackByPath(childPath, item)) + require.Equal(t, 7, tree.rootNode.nodeCount()) + require.Equal(t, 10, tree.rootNode.ItemCount()) + + return tree +} + +// makeItemForChildQueue constructs a queue item to match its enqueued path +// by joining the path components and appending an incrementing value for each path. +// +// e.g. for a tree named "root": +// - childQueuePath{"1", "0"}'s first item will be "root:1:0:val0" +// - childQueuePath{"1", "0"}'s second item will be "root:1:0:val1" +func makeItemForChildQueue( + parent *Node, childPath QueuePath, cache map[string]struct{}, +) string { + path := append(QueuePath{parent.name}, childPath...) + + i := 0 + for { + item := strings.Join(path, ":") + fmt.Sprintf(":val%d", i) + if _, ok := cache[item]; !ok { + cache[item] = struct{}{} + return item + } + i++ + } +} diff --git a/pkg/scheduler/queue/queue.go b/pkg/scheduler/queue/queue.go index 4130ca7c7ac..13c74cf56f4 100644 --- a/pkg/scheduler/queue/queue.go +++ b/pkg/scheduler/queue/queue.go @@ -174,6 +174,7 @@ func NewRequestQueue( log log.Logger, maxOutstandingPerTenant int, additionalQueueDimensionsEnabled bool, + useMultiAlgoQueue bool, forgetDelay time.Duration, queueLength *prometheus.GaugeVec, discardedRequests *prometheus.CounterVec, @@ -210,7 +211,7 @@ func NewRequestQueue( waitingQuerierConnsToDispatch: list.New(), QueryComponentUtilization: queryComponentCapacity, - queueBroker: newQueueBroker(maxOutstandingPerTenant, additionalQueueDimensionsEnabled, forgetDelay), + queueBroker: newQueueBroker(maxOutstandingPerTenant, additionalQueueDimensionsEnabled, useMultiAlgoQueue, forgetDelay), } q.Service = services.NewBasicService(q.starting, q.running, q.stop).WithName("request queue") @@ -371,7 +372,7 @@ func (q *RequestQueue) trySendNextRequestForQuerier(waitingConn *waitingQuerierC exceedsThreshold, queryComponent := q.QueryComponentUtilization.ExceedsThresholdForComponentName( queryComponentName, int(q.connectedQuerierWorkers.Load()), - q.queueBroker.tenantQueuesTree.ItemCount(), + q.queueBroker.tree.ItemCount(), q.waitingQuerierConnsToDispatch.Len(), ) diff --git a/pkg/scheduler/queue/queue_test.go b/pkg/scheduler/queue/queue_test.go index 2ecdf8676ba..0309d43ddff 100644 --- a/pkg/scheduler/queue/queue_test.go +++ b/pkg/scheduler/queue/queue_test.go @@ -29,6 +29,21 @@ import ( util_test "github.com/grafana/mimir/pkg/util/test" ) +// TODO (casie): Write tests for prioritizeQueryComponents is true + +func buildTreeTestsStruct() []struct { + name string + useMultiAlgoTreeQueue bool +} { + return []struct { + name string + useMultiAlgoTreeQueue bool + }{ + {"legacy tree queue", false}, + {"integrated tree queue", true}, + } +} + func TestMain(m *testing.M) { util_test.VerifyNoLeakTestMain(m) } @@ -93,194 +108,209 @@ func makeSchedulerRequest(tenantID string, additionalQueueDimensions []string) * // The queue broker then round-robins between the split queues, which has the effect of alternating between // dequeuing the slow queries and normal queries rather than blocking normal queries behind slow queries. func TestMultiDimensionalQueueFairnessSlowConsumerEffects(t *testing.T) { - promRegistry := prometheus.NewPedanticRegistry() - - maxQueriersPerTenant := 0 // disable shuffle sharding - forgetQuerierDelay := time.Duration(0) - maxOutstandingRequestsPerTenant := 1000 - - totalRequests := 100 - numTenants := 1 - numProducers := 10 - numConsumers := 1 - - normalQueueDimension := "normal-request" - slowConsumerLatency := 20 * time.Millisecond - slowConsumerQueueDimension := "slow-request" - normalQueueDimensionFunc := func() []string { return []string{normalQueueDimension} } - slowQueueDimensionFunc := func() []string { return []string{slowConsumerQueueDimension} } - - additionalQueueDimensionsEnabledCases := []bool{false, true} - queueDurationTotals := map[bool]map[string]float64{ - false: {normalQueueDimension: 0.0, slowConsumerQueueDimension: 0.0}, - true: {normalQueueDimension: 0.0, slowConsumerQueueDimension: 0.0}, - } - - for _, additionalQueueDimensionsEnabled := range additionalQueueDimensionsEnabledCases { - - // Scheduler code uses a histogram for queue duration, but a counter is a more direct metric - // for this test, as we are concerned with the total or average wait time for all queue items. - // Prometheus histograms also lack support for test assertions via prometheus/testutil. - queueDuration := promauto.With(promRegistry).NewCounterVec(prometheus.CounterOpts{ - Name: "test_query_scheduler_queue_duration_total_seconds", - Help: "[test] total time spent by items in queue before getting picked up by a consumer", - }, []string{"additional_queue_dimensions"}) - - queue, err := NewRequestQueue( - log.NewNopLogger(), - maxOutstandingRequestsPerTenant, - additionalQueueDimensionsEnabled, - forgetQuerierDelay, - promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), - promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), - promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), - promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), - ) - require.NoError(t, err) - - ctx := context.Background() - require.NoError(t, queue.starting(ctx)) - t.Cleanup(func() { - require.NoError(t, queue.stop(nil)) - }) - - // fill queue first with the slow queries, then the normal queries - for _, queueDimensionFunc := range []func() []string{slowQueueDimensionFunc, normalQueueDimensionFunc} { - startProducersChan := make(chan struct{}) - producersErrGroup, _ := errgroup.WithContext(ctx) - - runProducer := runQueueProducerIters( - queue, maxQueriersPerTenant, totalRequests/2, numProducers, numTenants, startProducersChan, queueDimensionFunc, - ) - for producerIdx := 0; producerIdx < numProducers; producerIdx++ { - producerIdx := producerIdx - producersErrGroup.Go(func() error { - return runProducer(producerIdx) - }) + treeTypes := buildTreeTestsStruct() + + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + promRegistry := prometheus.NewPedanticRegistry() + + maxQueriersPerTenant := 0 // disable shuffle sharding + forgetQuerierDelay := time.Duration(0) + maxOutstandingRequestsPerTenant := 1000 + + totalRequests := 100 + numTenants := 1 + numProducers := 10 + numConsumers := 1 + + normalQueueDimension := "normal-request" + slowConsumerLatency := 20 * time.Millisecond + slowConsumerQueueDimension := "slow-request" + normalQueueDimensionFunc := func() []string { return []string{normalQueueDimension} } + slowQueueDimensionFunc := func() []string { return []string{slowConsumerQueueDimension} } + + additionalQueueDimensionsEnabledCases := []bool{false, true} + queueDurationTotals := map[bool]map[string]float64{ + false: {normalQueueDimension: 0.0, slowConsumerQueueDimension: 0.0}, + true: {normalQueueDimension: 0.0, slowConsumerQueueDimension: 0.0}, } - close(startProducersChan) - err := producersErrGroup.Wait() - require.NoError(t, err) - } - - // emulate delay when consuming the slow queries - consumeFunc := func(request Request) error { - schedulerRequest := request.(*SchedulerRequest) - if schedulerRequest.AdditionalQueueDimensions[0] == slowConsumerQueueDimension { - time.Sleep(slowConsumerLatency) - } - - queueTime := time.Since(schedulerRequest.EnqueueTime) - additionalQueueDimensionLabels := strings.Join(schedulerRequest.AdditionalQueueDimensions, ":") - queueDuration.With(prometheus.Labels{"additional_queue_dimensions": additionalQueueDimensionLabels}).Add(queueTime.Seconds()) - return nil - } - // consume queries - queueConsumerErrGroup, ctx := errgroup.WithContext(ctx) - startConsumersChan := make(chan struct{}) - runConsumer := runQueueConsumerIters(ctx, queue, totalRequests, numConsumers, startConsumersChan, consumeFunc) - - for consumerIdx := 0; consumerIdx < numConsumers; consumerIdx++ { - consumerIdx := consumerIdx - queueConsumerErrGroup.Go(func() error { - return runConsumer(consumerIdx) - }) - } + for _, additionalQueueDimensionsEnabled := range additionalQueueDimensionsEnabledCases { + + // Scheduler code uses a histogram for queue duration, but a counter is a more direct metric + // for this test, as we are concerned with the total or average wait time for all queue items. + // Prometheus histograms also lack support for test assertions via prometheus/testutil. + queueDuration := promauto.With(promRegistry).NewCounterVec(prometheus.CounterOpts{ + Name: "test_query_scheduler_queue_duration_total_seconds", + Help: "[test] total time spent by items in queue before getting picked up by a consumer", + }, []string{"additional_queue_dimensions"}) + + queue, err := NewRequestQueue( + log.NewNopLogger(), + maxOutstandingRequestsPerTenant, + additionalQueueDimensionsEnabled, + tt.useMultiAlgoTreeQueue, + forgetQuerierDelay, + promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), + promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), + promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), + promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), + ) + require.NoError(t, err) + + ctx := context.Background() + require.NoError(t, queue.starting(ctx)) + t.Cleanup(func() { + require.NoError(t, queue.stop(nil)) + }) - close(startConsumersChan) - err = queueConsumerErrGroup.Wait() - require.NoError(t, err) + // fill queue first with the slow queries, then the normal queries + for _, queueDimensionFunc := range []func() []string{slowQueueDimensionFunc, normalQueueDimensionFunc} { + startProducersChan := make(chan struct{}) + producersErrGroup, _ := errgroup.WithContext(ctx) + + runProducer := runQueueProducerIters( + queue, maxQueriersPerTenant, totalRequests/2, numProducers, numTenants, startProducersChan, queueDimensionFunc, + ) + for producerIdx := 0; producerIdx < numProducers; producerIdx++ { + producerIdx := producerIdx + producersErrGroup.Go(func() error { + return runProducer(producerIdx) + }) + } + close(startProducersChan) + err := producersErrGroup.Wait() + require.NoError(t, err) + } + + // emulate delay when consuming the slow queries + consumeFunc := func(request Request) error { + schedulerRequest := request.(*SchedulerRequest) + if schedulerRequest.AdditionalQueueDimensions[0] == slowConsumerQueueDimension { + time.Sleep(slowConsumerLatency) + } - // record total queue duration by queue dimensions and whether the queue splitting was enabled - for _, queueDimension := range []string{normalQueueDimension, slowConsumerQueueDimension} { - queueDurationTotals[additionalQueueDimensionsEnabled][queueDimension] = promtest.ToFloat64( - queueDuration.With(prometheus.Labels{"additional_queue_dimensions": queueDimension}), - ) - } + queueTime := time.Since(schedulerRequest.EnqueueTime) + additionalQueueDimensionLabels := strings.Join(schedulerRequest.AdditionalQueueDimensions, ":") + queueDuration.With(prometheus.Labels{"additional_queue_dimensions": additionalQueueDimensionLabels}).Add(queueTime.Seconds()) + return nil + } + + // consume queries + queueConsumerErrGroup, ctx := errgroup.WithContext(ctx) + startConsumersChan := make(chan struct{}) + runConsumer := runQueueConsumerIters(ctx, queue, totalRequests, numConsumers, startConsumersChan, consumeFunc) + + for consumerIdx := 0; consumerIdx < numConsumers; consumerIdx++ { + consumerIdx := consumerIdx + queueConsumerErrGroup.Go(func() error { + return runConsumer(consumerIdx) + }) + } + + close(startConsumersChan) + err = queueConsumerErrGroup.Wait() + require.NoError(t, err) + + // record total queue duration by queue dimensions and whether the queue splitting was enabled + for _, queueDimension := range []string{normalQueueDimension, slowConsumerQueueDimension} { + queueDurationTotals[additionalQueueDimensionsEnabled][queueDimension] = promtest.ToFloat64( + queueDuration.With(prometheus.Labels{"additional_queue_dimensions": queueDimension}), + ) + } + + promRegistry.Unregister(queueDuration) + } - promRegistry.Unregister(queueDuration) + // total or average time in queue for a normal queue item should be roughly cut in half + // when queue splitting is enabled, as the average normal queue item waits behind + // half of the slow queue items, instead of waiting behind all the slow queue items. + expected := queueDurationTotals[false][normalQueueDimension] / 2 + actual := queueDurationTotals[true][normalQueueDimension] + // some variance allowed due to actual time processing needed beyond the slow consumer delay; + // variance is also a function of the number of consumers and the consumer delay chosen. + // variance can be tighter if the test runs longer but there is a tradeoff for testing and CI speed + delta := expected * 0.10 + require.InDelta(t, expected, actual, delta) + }) } - // total or average time in queue for a normal queue item should be roughly cut in half - // when queue splitting is enabled, as the average normal queue item waits behind - // half of the slow queue items, instead of waiting behind all the slow queue items. - expected := queueDurationTotals[false][normalQueueDimension] / 2 - actual := queueDurationTotals[true][normalQueueDimension] - // some variance allowed due to actual time processing needed beyond the slow consumer delay; - // variance is also a function of the number of consumers and the consumer delay chosen. - // variance can be tighter if the test runs longer but there is a tradeoff for testing and CI speed - delta := expected * 0.10 - require.InDelta(t, expected, actual, delta) - } func BenchmarkConcurrentQueueOperations(b *testing.B) { - maxQueriersPerTenant := 0 // disable shuffle sharding - forgetQuerierDelay := time.Duration(0) - maxOutstandingRequestsPerTenant := 100 - - for _, numTenants := range []int{1, 10, 1000} { - b.Run(fmt.Sprintf("%v tenants", numTenants), func(b *testing.B) { - - // Query-frontends run 5 parallel streams per scheduler by default, - // and we typically see 2-5 frontends running at any one time. - for _, numProducers := range []int{10, 25} { - b.Run(fmt.Sprintf("%v concurrent producers", numProducers), func(b *testing.B) { - - // Queriers run with parallelism of 16 when query sharding is enabled. - for _, numConsumers := range []int{16, 160, 1600} { - b.Run(fmt.Sprintf("%v concurrent consumers", numConsumers), func(b *testing.B) { - queue, err := NewRequestQueue( - log.NewNopLogger(), - maxOutstandingRequestsPerTenant, - true, - forgetQuerierDelay, - promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), - promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), - promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), - promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), - ) - require.NoError(b, err) - - startSignalChan := make(chan struct{}) - queueActorsErrGroup, ctx := errgroup.WithContext(context.Background()) - - require.NoError(b, queue.starting(ctx)) - b.Cleanup(func() { - require.NoError(b, queue.stop(nil)) - }) - - runProducer := runQueueProducerIters( - queue, maxQueriersPerTenant, b.N, numProducers, numTenants, startSignalChan, nil, - ) - - for producerIdx := 0; producerIdx < numProducers; producerIdx++ { - producerIdx := producerIdx - queueActorsErrGroup.Go(func() error { - return runProducer(producerIdx) - }) - } - - runConsumer := runQueueConsumerIters(ctx, queue, b.N, numConsumers, startSignalChan, nil) - - for consumerIdx := 0; consumerIdx < numConsumers; consumerIdx++ { - consumerIdx := consumerIdx - queueActorsErrGroup.Go(func() error { - return runConsumer(consumerIdx) + treeTypes := buildTreeTestsStruct() + + for _, t := range treeTypes { + b.Run(t.name, func(b *testing.B) { + maxQueriersPerTenant := 0 // disable shuffle sharding + forgetQuerierDelay := time.Duration(0) + maxOutstandingRequestsPerTenant := 100 + + for _, numTenants := range []int{1, 10, 1000} { + b.Run(fmt.Sprintf("%v tenants", numTenants), func(b *testing.B) { + + // Query-frontends run 5 parallel streams per scheduler by default, + // and we typically see 2-5 frontends running at any one time. + for _, numProducers := range []int{10, 25} { + b.Run(fmt.Sprintf("%v concurrent producers", numProducers), func(b *testing.B) { + + // Queriers run with parallelism of 16 when query sharding is enabled. + for _, numConsumers := range []int{16, 160, 1600} { + b.Run(fmt.Sprintf("%v concurrent consumers", numConsumers), func(b *testing.B) { + queue, err := NewRequestQueue( + log.NewNopLogger(), + maxOutstandingRequestsPerTenant, + true, + t.useMultiAlgoTreeQueue, + forgetQuerierDelay, + promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), + promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), + promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), + promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), + ) + require.NoError(b, err) + + startSignalChan := make(chan struct{}) + queueActorsErrGroup, ctx := errgroup.WithContext(context.Background()) + + require.NoError(b, queue.starting(ctx)) + b.Cleanup(func() { + require.NoError(b, queue.stop(nil)) + }) + + runProducer := runQueueProducerIters( + queue, maxQueriersPerTenant, b.N, numProducers, numTenants, startSignalChan, nil, + ) + + for producerIdx := 0; producerIdx < numProducers; producerIdx++ { + producerIdx := producerIdx + queueActorsErrGroup.Go(func() error { + return runProducer(producerIdx) + }) + } + + runConsumer := runQueueConsumerIters(ctx, queue, b.N, numConsumers, startSignalChan, nil) + + for consumerIdx := 0; consumerIdx < numConsumers; consumerIdx++ { + consumerIdx := consumerIdx + queueActorsErrGroup.Go(func() error { + return runConsumer(consumerIdx) + }) + } + + b.ResetTimer() + close(startSignalChan) + err = queueActorsErrGroup.Wait() + if err != nil { + require.NoError(b, err) + } }) } - - b.ResetTimer() - close(startSignalChan) - err = queueActorsErrGroup.Wait() - if err != nil { - require.NoError(b, err) - } }) } }) } + }) } } @@ -404,282 +434,333 @@ func TestRequestQueue_GetNextRequestForQuerier_ShouldGetRequestAfterReshardingBe const forgetDelay = 3 * time.Second const testTimeout = 10 * time.Second - queue, err := NewRequestQueue( - log.NewNopLogger(), - 1, true, - forgetDelay, - promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), - promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), - promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), - promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), - ) - require.NoError(t, err) - - // Start the queue service. - ctx := context.Background() - require.NoError(t, services.StartAndAwaitRunning(ctx, queue)) - t.Cleanup(func() { - // if the test has failed and the queue does not get cleared, - // we must send a shutdown signal for the remaining connected querier - // or else StopAndAwaitTerminated will never complete. - queue.SubmitUnregisterQuerierConnection("querier-2") - require.NoError(t, services.StopAndAwaitTerminated(ctx, queue)) - }) - - // Two queriers connect. - queue.SubmitRegisterQuerierConnection("querier-1") - queue.SubmitRegisterQuerierConnection("querier-2") - - // Querier-2 waits for a new request. - querier2wg := sync.WaitGroup{} - querier2wg.Add(1) - go func() { - defer querier2wg.Done() - _, _, err := queue.WaitForRequestForQuerier(ctx, FirstTenant(), "querier-2") - require.NoError(t, err) - }() - - // Querier-1 crashes (no graceful shutdown notification). - queue.SubmitUnregisterQuerierConnection("querier-1") - - // Enqueue a request from an user which would be assigned to querier-1. - // NOTE: "user-1" shuffle shard always chooses the first querier ("querier-1" in this case) - // when there are only one or two queriers in the sorted list of connected queriers - req := &SchedulerRequest{ - Ctx: context.Background(), - Request: &httpgrpc.HTTPRequest{Method: "GET", Url: "/hello"}, - AdditionalQueueDimensions: randAdditionalQueueDimension(true), - } - require.NoError(t, queue.SubmitRequestToEnqueue("user-1", req, 1, nil)) - - startTime := time.Now() - done := make(chan struct{}) - go func() { - querier2wg.Wait() - close(done) - }() - - select { - case <-done: - waitTime := time.Since(startTime) - // We expect that querier-2 got the request only after forget delay is passed. - assert.GreaterOrEqual(t, waitTime.Milliseconds(), forgetDelay.Milliseconds()) - case <-time.After(testTimeout): - t.Fatal("timeout: querier-2 did not receive the request expected to be resharded to querier-2") + treeTypes := buildTreeTestsStruct() + + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + queue, err := NewRequestQueue( + log.NewNopLogger(), + 1, true, + tt.useMultiAlgoTreeQueue, + forgetDelay, + promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), + promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), + promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), + promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), + ) + require.NoError(t, err) + + // Start the queue service. + ctx := context.Background() + require.NoError(t, services.StartAndAwaitRunning(ctx, queue)) + t.Cleanup(func() { + // if the test has failed and the queue does not get cleared, + // we must send a shutdown signal for the remaining connected querier + // or else StopAndAwaitTerminated will never complete. + queue.SubmitUnregisterQuerierConnection("querier-2") + require.NoError(t, services.StopAndAwaitTerminated(ctx, queue)) + }) + + // Two queriers connect. + queue.SubmitRegisterQuerierConnection("querier-1") + queue.SubmitRegisterQuerierConnection("querier-2") + + // Querier-2 waits for a new request. + querier2wg := sync.WaitGroup{} + querier2wg.Add(1) + go func() { + defer querier2wg.Done() + _, _, err := queue.WaitForRequestForQuerier(ctx, FirstTenant(), "querier-2") + require.NoError(t, err) + }() + + // Querier-1 crashes (no graceful shutdown notification). + queue.SubmitUnregisterQuerierConnection("querier-1") + + // Enqueue a request from an user which would be assigned to querier-1. + // NOTE: "user-1" shuffle shard always chooses the first querier ("querier-1" in this case) + // when there are only one or two queriers in the sorted list of connected queriers + req := &SchedulerRequest{ + Ctx: context.Background(), + Request: &httpgrpc.HTTPRequest{Method: "GET", Url: "/hello"}, + AdditionalQueueDimensions: randAdditionalQueueDimension(true), + } + require.NoError(t, queue.SubmitRequestToEnqueue("user-1", req, 1, nil)) + + startTime := time.Now() + done := make(chan struct{}) + go func() { + querier2wg.Wait() + close(done) + }() + + select { + case <-done: + waitTime := time.Since(startTime) + // We expect that querier-2 got the request only after forget delay is passed. + assert.GreaterOrEqual(t, waitTime.Milliseconds(), forgetDelay.Milliseconds()) + case <-time.After(testTimeout): + t.Fatal("timeout: querier-2 did not receive the request expected to be resharded to querier-2") + } + + }) } } func TestRequestQueue_GetNextRequestForQuerier_ReshardNotifiedCorrectlyForMultipleQuerierForget(t *testing.T) { const forgetDelay = 3 * time.Second const testTimeout = 10 * time.Second + treeTypes := buildTreeTestsStruct() + + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + queue, err := NewRequestQueue( + log.NewNopLogger(), + 1, true, + tt.useMultiAlgoTreeQueue, + forgetDelay, + promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), + promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), + promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), + promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), + ) + require.NoError(t, err) - queue, err := NewRequestQueue( - log.NewNopLogger(), - 1, true, - forgetDelay, - promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), - promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), - promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), - promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), - ) - require.NoError(t, err) - - // Start the queue service. - ctx := context.Background() - require.NoError(t, services.StartAndAwaitRunning(ctx, queue)) - t.Cleanup(func() { - // if the test has failed and the queue does not get cleared, - // we must send a shutdown signal for the remaining connected querier - // or else StopAndAwaitTerminated will never complete. - queue.SubmitUnregisterQuerierConnection("querier-2") - require.NoError(t, services.StopAndAwaitTerminated(ctx, queue)) - }) - - // Three queriers connect. - // We will submit the enqueue request with maxQueriers: 2. - // - // Whenever forgetDisconnectedQueriers runs, all queriers which reached zero connections since the last - // run of forgetDisconnectedQueriers will all be removed in from the shuffle shard in the same run. - // - // In this case two queriers are forgotten in the same run, but only the first forgotten querier triggers a reshard. - // In the first reshard, the tenant goes from a shuffled subset of queriers to a state of - // "tenant can use all queriers", as connected queriers is now <= tenant.maxQueriers. - // The second forgotten querier won't trigger a reshard, as connected queriers is already <= tenant.maxQueriers. - // - // We are testing that the occurrence of a reshard is reported correctly - // when not all querier forget operations in a single run of forgetDisconnectedQueriers caused a reshard. - queue.SubmitRegisterQuerierConnection("querier-1") - queue.SubmitRegisterQuerierConnection("querier-2") - queue.SubmitRegisterQuerierConnection("querier-3") - - // querier-2 waits for a new request. - querier2wg := sync.WaitGroup{} - querier2wg.Add(1) - go func() { - defer querier2wg.Done() - _, _, err := queue.WaitForRequestForQuerier(ctx, FirstTenant(), "querier-2") - require.NoError(t, err) - }() - - // querier-1 and querier-3 crash (no graceful shutdown notification). - queue.SubmitUnregisterQuerierConnection("querier-1") - queue.SubmitUnregisterQuerierConnection("querier-3") - - // Enqueue a request from a tenant which would be assigned to querier-1. - // NOTE: "user-1" shuffle shard always chooses the first querier ("querier-1" in this case) - // when there are only one or two queriers in the sorted list of connected queriers - req := &SchedulerRequest{ - Ctx: context.Background(), - Request: &httpgrpc.HTTPRequest{Method: "GET", Url: "/hello"}, - AdditionalQueueDimensions: randAdditionalQueueDimension(true), - } - require.NoError(t, queue.SubmitRequestToEnqueue("user-1", req, 2, nil)) - - startTime := time.Now() - done := make(chan struct{}) - go func() { - querier2wg.Wait() - close(done) - }() - - select { - case <-done: - waitTime := time.Since(startTime) - // We expect that querier-2 got the request only after forget delay is passed. - assert.GreaterOrEqual(t, waitTime.Milliseconds(), forgetDelay.Milliseconds()) - case <-time.After(testTimeout): - t.Fatal("timeout: querier-2 did not receive the request expected to be resharded to querier-2") + // Start the queue service. + ctx := context.Background() + require.NoError(t, services.StartAndAwaitRunning(ctx, queue)) + t.Cleanup(func() { + // if the test has failed and the queue does not get cleared, + // we must send a shutdown signal for the remaining connected querier + // or else StopAndAwaitTerminated will never complete. + queue.SubmitUnregisterQuerierConnection("querier-2") + require.NoError(t, services.StopAndAwaitTerminated(ctx, queue)) + }) + + // Three queriers connect. + // We will submit the enqueue request with maxQueriers: 2. + // + // Whenever forgetDisconnectedQueriers runs, all queriers which reached zero connections since the last + // run of forgetDisconnectedQueriers will all be removed in from the shuffle shard in the same run. + // + // In this case two queriers are forgotten in the same run, but only the first forgotten querier triggers a reshard. + // In the first reshard, the tenant goes from a shuffled subset of queriers to a state of + // "tenant can use all queriers", as connected queriers is now <= tenant.maxQueriers. + // The second forgotten querier won't trigger a reshard, as connected queriers is already <= tenant.maxQueriers. + // + // We are testing that the occurrence of a reshard is reported correctly + // when not all querier forget operations in a single run of forgetDisconnectedQueriers caused a reshard. + queue.SubmitRegisterQuerierConnection("querier-1") + queue.SubmitRegisterQuerierConnection("querier-2") + queue.SubmitRegisterQuerierConnection("querier-3") + + // querier-2 waits for a new request. + querier2wg := sync.WaitGroup{} + querier2wg.Add(1) + go func() { + defer querier2wg.Done() + _, _, err := queue.WaitForRequestForQuerier(ctx, FirstTenant(), "querier-2") + require.NoError(t, err) + }() + + // querier-1 and querier-3 crash (no graceful shutdown notification). + queue.SubmitUnregisterQuerierConnection("querier-1") + queue.SubmitUnregisterQuerierConnection("querier-3") + + // Enqueue a request from a tenant which would be assigned to querier-1. + // NOTE: "user-1" shuffle shard always chooses the first querier ("querier-1" in this case) + // when there are only one or two queriers in the sorted list of connected queriers + req := &SchedulerRequest{ + Ctx: context.Background(), + Request: &httpgrpc.HTTPRequest{Method: "GET", Url: "/hello"}, + AdditionalQueueDimensions: randAdditionalQueueDimension(true), + } + require.NoError(t, queue.SubmitRequestToEnqueue("user-1", req, 2, nil)) + + startTime := time.Now() + done := make(chan struct{}) + go func() { + querier2wg.Wait() + close(done) + }() + + select { + case <-done: + waitTime := time.Since(startTime) + // We expect that querier-2 got the request only after forget delay is passed. + assert.GreaterOrEqual(t, waitTime.Milliseconds(), forgetDelay.Milliseconds()) + case <-time.After(testTimeout): + t.Fatal("timeout: querier-2 did not receive the request expected to be resharded to querier-2") + } + + }) } } func TestRequestQueue_GetNextRequestForQuerier_ShouldReturnAfterContextCancelled(t *testing.T) { const forgetDelay = 3 * time.Second const querierID = "querier-1" + treeTypes := buildTreeTestsStruct() + + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + queue, err := NewRequestQueue( + log.NewNopLogger(), + 1, + true, + tt.useMultiAlgoTreeQueue, + forgetDelay, + promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), + promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), + promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), + promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), + ) + require.NoError(t, err) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), queue)) + t.Cleanup(func() { + require.NoError(t, services.StopAndAwaitTerminated(context.Background(), queue)) + }) - queue, err := NewRequestQueue( - log.NewNopLogger(), - 1, - true, - forgetDelay, - promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), - promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), - promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), - promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), - ) - require.NoError(t, err) - - require.NoError(t, services.StartAndAwaitRunning(context.Background(), queue)) - t.Cleanup(func() { - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), queue)) - }) - - queue.SubmitRegisterQuerierConnection(querierID) - - // Calling WaitForRequestForQuerier with a context that is already cancelled should fail immediately. - deadCtx, cancel := context.WithCancel(context.Background()) - cancel() - r, tenant, err := queue.WaitForRequestForQuerier(deadCtx, FirstTenant(), querierID) - assert.Nil(t, r) - assert.Equal(t, FirstTenant(), tenant) - assert.ErrorIs(t, err, context.Canceled) - - // Further, a context canceled after WaitForRequestForQuerier publishes a request should also fail. - errChan := make(chan error) - ctx, cancel := context.WithCancel(context.Background()) - - go func() { - _, _, err := queue.WaitForRequestForQuerier(ctx, FirstTenant(), querierID) - errChan <- err - }() - - time.Sleep(20 * time.Millisecond) // Wait for WaitForRequestForQuerier to be waiting for a query. - cancel() - - select { - case err := <-errChan: - require.Equal(t, context.Canceled, err) - case <-time.After(time.Second): - require.Fail(t, "gave up waiting for GetNextRequestForQuerierToReturn") + queue.SubmitRegisterQuerierConnection(querierID) + + // Calling WaitForRequestForQuerier with a context that is already cancelled should fail immediately. + deadCtx, cancel := context.WithCancel(context.Background()) + cancel() + r, tenant, err := queue.WaitForRequestForQuerier(deadCtx, FirstTenant(), querierID) + assert.Nil(t, r) + assert.Equal(t, FirstTenant(), tenant) + assert.ErrorIs(t, err, context.Canceled) + + // Further, a context canceled after WaitForRequestForQuerier publishes a request should also fail. + errChan := make(chan error) + ctx, cancel := context.WithCancel(context.Background()) + + go func() { + _, _, err := queue.WaitForRequestForQuerier(ctx, FirstTenant(), querierID) + errChan <- err + }() + + time.Sleep(20 * time.Millisecond) // Wait for WaitForRequestForQuerier to be waiting for a query. + cancel() + + select { + case err := <-errChan: + require.Equal(t, context.Canceled, err) + case <-time.After(time.Second): + require.Fail(t, "gave up waiting for GetNextRequestForQuerierToReturn") + } + + }) } + } func TestRequestQueue_GetNextRequestForQuerier_ShouldReturnImmediatelyIfQuerierIsAlreadyShuttingDown(t *testing.T) { const forgetDelay = 3 * time.Second const querierID = "querier-1" - queue, err := NewRequestQueue( - log.NewNopLogger(), - 1, - true, - forgetDelay, - promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), - promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), - promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), - promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), - ) - require.NoError(t, err) - - ctx := context.Background() - require.NoError(t, services.StartAndAwaitRunning(ctx, queue)) - t.Cleanup(func() { - require.NoError(t, services.StopAndAwaitTerminated(ctx, queue)) - }) - - queue.SubmitRegisterQuerierConnection(querierID) - queue.SubmitNotifyQuerierShutdown(querierID) - - _, _, err = queue.WaitForRequestForQuerier(context.Background(), FirstTenant(), querierID) - require.EqualError(t, err, "querier has informed the scheduler it is shutting down") + treeTypes := buildTreeTestsStruct() + + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + queue, err := NewRequestQueue( + log.NewNopLogger(), + 1, + true, + tt.useMultiAlgoTreeQueue, + forgetDelay, + promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), + promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), + promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), + promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), + ) + require.NoError(t, err) + + ctx := context.Background() + require.NoError(t, services.StartAndAwaitRunning(ctx, queue)) + t.Cleanup(func() { + require.NoError(t, services.StopAndAwaitTerminated(ctx, queue)) + }) + + queue.SubmitRegisterQuerierConnection(querierID) + queue.SubmitNotifyQuerierShutdown(querierID) + + _, _, err = queue.WaitForRequestForQuerier(context.Background(), FirstTenant(), querierID) + require.EqualError(t, err, "querier has informed the scheduler it is shutting down") + }) + } + } func TestRequestQueue_tryDispatchRequestToQuerier_ShouldReEnqueueAfterFailedSendToQuerier(t *testing.T) { const forgetDelay = 3 * time.Second const querierID = "querier-1" - queue, err := NewRequestQueue( - log.NewNopLogger(), - 1, - true, - forgetDelay, - promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), - promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), - promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), - promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), - ) - require.NoError(t, err) - - // bypassing queue dispatcher loop for direct usage of the queueBroker and - // passing a waitingQuerierConn for a canceled querier connection - queueBroker := newQueueBroker(queue.maxOutstandingPerTenant, queue.additionalQueueDimensionsEnabled, queue.forgetDelay) - queueBroker.addQuerierConnection(querierID) - - tenantMaxQueriers := 0 // no sharding - req := &SchedulerRequest{ - Ctx: context.Background(), - Request: &httpgrpc.HTTPRequest{Method: "GET", Url: "/hello"}, - AdditionalQueueDimensions: randAdditionalQueueDimension(true), - } - tr := tenantRequest{ - tenantID: TenantID("tenant-1"), - req: req, - } + treeTypes := buildTreeTestsStruct() + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + queue, err := NewRequestQueue( + log.NewNopLogger(), + 1, + true, + tt.useMultiAlgoTreeQueue, + forgetDelay, + promauto.With(nil).NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}), + promauto.With(nil).NewCounterVec(prometheus.CounterOpts{}, []string{"user"}), + promauto.With(nil).NewHistogram(prometheus.HistogramOpts{}), + promauto.With(nil).NewSummaryVec(prometheus.SummaryOpts{}, []string{"query_component"}), + ) + require.NoError(t, err) + + // bypassing queue dispatcher loop for direct usage of the queueBroker and + // passing a waitingQuerierConn for a canceled querier connection + queueBroker := newQueueBroker(queue.maxOutstandingPerTenant, queue.additionalQueueDimensionsEnabled, false, queue.forgetDelay) + queueBroker.addQuerierConnection(querierID) + + tenantMaxQueriers := 0 // no sharding + req := &SchedulerRequest{ + Ctx: context.Background(), + Request: &httpgrpc.HTTPRequest{Method: "GET", Url: "/hello"}, + AdditionalQueueDimensions: randAdditionalQueueDimension(true), + } + tr := tenantRequest{ + tenantID: TenantID("tenant-1"), + req: req, + } - require.Nil(t, queueBroker.tenantQueuesTree.getNode(QueuePath{"tenant-1"})) - require.NoError(t, queueBroker.enqueueRequestBack(&tr, tenantMaxQueriers)) - require.False(t, queueBroker.tenantQueuesTree.getNode(QueuePath{"tenant-1"}).IsEmpty()) + // TODO (casie): Clean this up when deprecating legacy tree queue + if tq, ok := queueBroker.tree.(*TreeQueue); ok { + require.Nil(t, tq.getNode(QueuePath{"tenant-1"})) + require.NoError(t, queueBroker.enqueueRequestBack(&tr, tenantMaxQueriers)) + require.False(t, tq.getNode(QueuePath{"tenant-1"}).IsEmpty()) + } else if itq, ok := queueBroker.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + require.Nil(t, itq.GetNode(QueuePath{"tenant-1"})) + require.NoError(t, queueBroker.enqueueRequestBack(&tr, tenantMaxQueriers)) + require.False(t, itq.GetNode(QueuePath{"tenant-1"}).IsEmpty()) + } - ctx, cancel := context.WithCancel(context.Background()) - call := &waitingQuerierConn{ - querierConnCtx: ctx, - querierID: QuerierID(querierID), - lastTenantIndex: FirstTenant(), - recvChan: make(chan requestForQuerier), + ctx, cancel := context.WithCancel(context.Background()) + call := &waitingQuerierConn{ + querierConnCtx: ctx, + querierID: QuerierID(querierID), + lastTenantIndex: FirstTenant(), + recvChan: make(chan requestForQuerier), + } + cancel() // ensure querier context done before send is attempted + + // send to querier will fail but method returns true, + // indicating not to re-submit a request for waitingQuerierConn for the querier + require.True(t, queue.trySendNextRequestForQuerier(call)) + // assert request was re-enqueued for tenant after failed send + // TODO (casie): Clean this up when deprecating legacy tree queue + if tq, ok := queueBroker.tree.(*TreeQueue); ok { + require.False(t, tq.getNode(QueuePath{"tenant-1"}).IsEmpty()) + } else if itq, ok := queueBroker.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + require.False(t, itq.GetNode(QueuePath{"tenant-1"}).IsEmpty()) + } + + }) } - cancel() // ensure querier context done before send is attempted - // send to querier will fail but method returns true, - // indicating not to re-submit a request for waitingQuerierConn for the querier - require.True(t, queue.trySendNextRequestForQuerier(call)) - // assert request was re-enqueued for tenant after failed send - require.False(t, queueBroker.tenantQueuesTree.getNode(QueuePath{"tenant-1"}).IsEmpty()) } diff --git a/pkg/scheduler/queue/tenant_querier_assignment.go b/pkg/scheduler/queue/tenant_querier_assignment.go index 9d3b1bb3440..83ea0473e55 100644 --- a/pkg/scheduler/queue/tenant_querier_assignment.go +++ b/pkg/scheduler/queue/tenant_querier_assignment.go @@ -53,6 +53,8 @@ func (s querierIDSlice) Search(x QuerierID) int { return sort.Search(len(s), func(i int) bool { return s[i] >= x }) } +// tenantQuerierAssignments implements QueuingAlgorithm. In the context of a MultiQueuingAlgorithmTreeQueue, it maintains a mapping of +// tenants to queriers in order to support dequeuing from an appropriate tenant if shuffle-sharding is enabled. type tenantQuerierAssignments struct { // a tenant has many queriers // a tenant has *all* queriers if: @@ -80,14 +82,22 @@ type tenantQuerierAssignments struct { // List of all tenants with queues, used for iteration when searching for next queue to handle. tenantIDOrder []TenantID tenantsByID map[TenantID]*queueTenant + // tenantOrderIndex is the index of the _last_ tenant dequeued from; it is passed by + // the querier, and then updated to the index of the last tenant dequeued from, so it + // can be returned to the querier. Newly connected queriers should pass -1 to start at the + // beginning of tenantIDOrder. + tenantOrderIndex int + tenantNodes map[string][]*Node // Tenant assigned querier ID set as determined by shuffle sharding. // If tenant querier ID set is not nil, only those queriers can handle the tenant's requests, // Tenant querier ID is set to nil if sharding is off or available queriers <= tenant's maxQueriers. tenantQuerierIDs map[TenantID]map[QuerierID]struct{} + currentQuerier QuerierID } // getNextTenantForQuerier gets the next tenant in the tenant order assigned to a given querier. +// It should _only_ be called by the legacy TreeQueue. // // The next tenant for the querier is obtained by rotating through the global tenant order // starting just after the last tenant the querier received a request for, until a tenant @@ -109,7 +119,6 @@ func (tqa *tenantQuerierAssignments) getNextTenantForQuerier(lastTenantIndex int // length of the tenant list, wrapping via modulo would skip the beginning of the list. tenantOrderIndex = 0 } - tenantID := tqa.tenantIDOrder[tenantOrderIndex] if tenantID == emptyTenantID { continue @@ -129,14 +138,6 @@ func (tqa *tenantQuerierAssignments) getNextTenantForQuerier(lastTenantIndex int return nil, lastTenantIndex, nil } -func (tqa *tenantQuerierAssignments) getTenant(tenantID TenantID) (*queueTenant, error) { - if tenantID == emptyTenantID { - return nil, ErrInvalidTenantID - } - tenant := tqa.tenantsByID[tenantID] - return tenant, nil -} - // createOrUpdateTenant creates or updates a tenant into the tenant-querier assignment state. // // New tenants are added to the tenant order list and tenant-querier shards are shuffled if needed. @@ -212,21 +213,31 @@ func (tqa *tenantQuerierAssignments) addQuerierConnection(querierID QuerierID) ( return tqa.recomputeTenantQueriers() } +// removeTenant only manages deletion of a *queueTenant from tenantsByID. All other +// tenant deletion (e.g., from tenantIDOrder, or tenantNodes) is done during the dequeue operation, +// as we cannot remove from those things arbitrarily; we must check whether other tenant +// queues exist for the same tenant before removing. func (tqa *tenantQuerierAssignments) removeTenant(tenantID TenantID) { tenant := tqa.tenantsByID[tenantID] if tenant == nil { return } delete(tqa.tenantsByID, tenantID) - tqa.tenantIDOrder[tenant.orderIndex] = emptyTenantID - - // Shrink tenant list if possible by removing empty tenant IDs. - // We remove only from the end; removing from the middle would re-index all tenant IDs - // and skip tenants when starting iteration from a querier-provided lastTenantIndex. - // Empty tenant IDs stuck in the middle of the slice are handled - // by replacing them when a new tenant ID arrives in the queue. - for i := len(tqa.tenantIDOrder) - 1; i >= 0 && tqa.tenantIDOrder[i] == emptyTenantID; i-- { - tqa.tenantIDOrder = tqa.tenantIDOrder[:i] + + // TODO (casie): When cleaning up old TreeQueue, everything below this point can be removed, since + // tenantIDOrder is updated during tree operations in MultiQueuingAlgorithmTreeQueue. + // Everything below this point should be a no-op on the MultiQueuingAlgorithmTreeQueue + if len(tqa.tenantIDOrder) > tenant.orderIndex && tqa.tenantIDOrder[tenant.orderIndex] == tenantID { + tqa.tenantIDOrder[tenant.orderIndex] = emptyTenantID + + // Shrink tenant list if possible by removing empty tenant IDs. + // We remove only from the end; removing from the middle would re-index all tenant IDs + // and skip tenants when starting iteration from a querier-provided lastTenantIndex. + // Empty tenant IDs stuck in the middle of the slice are handled + // by replacing them when a new tenant ID arrives in the queue. + for i := len(tqa.tenantIDOrder) - 1; i >= 0 && tqa.tenantIDOrder[i] == emptyTenantID; i-- { + tqa.tenantIDOrder = tqa.tenantIDOrder[:i] + } } } @@ -358,3 +369,165 @@ func (tqa *tenantQuerierAssignments) shuffleTenantQueriers(tenantID TenantID, sc tqa.tenantQuerierIDs[tenantID] = querierIDSet return true } + +// dequeueSelectNode chooses the next node to dequeue from based on tenantIDOrder and tenantOrderIndex, which are +// shared across all nodes to maintain an O(n) (where n = # tenants) time-to-dequeue for each tenant. +// If tenant order were maintained by individual nodes, we would end up with O(mn) (where m = # query components) +// time-to-dequeue for a given tenant. +// +// tenantOrderIndex is incremented, checks if the tenant at that index is a child of the current node (it may not be, +// e.g., in the case that a tenant has queries queued for one query component but not others), and if so, returns +// the tenant node if currentQuerier can handle queries for that tenant. +// +// Note that because we use the shared tenantIDOrder and tenantOrderIndex to manage the queue, we functionally +// ignore each Node's individual queueOrder and queuePosition. +func (tqa *tenantQuerierAssignments) dequeueSelectNode(node *Node) (*Node, bool) { + // can't get a tenant if no querier set + if tqa.currentQuerier == "" { + return nil, true + } + + checkedAllNodes := node.childrenChecked == len(node.queueMap)+1 // must check local queue as well + + // advance queue position for dequeue + tqa.tenantOrderIndex++ + if tqa.tenantOrderIndex >= len(tqa.tenantIDOrder) { + tqa.tenantOrderIndex = localQueueIndex + } + + // no children or local queue reached + if len(node.queueMap) == 0 || tqa.tenantOrderIndex == localQueueIndex { + return node, checkedAllNodes + } + + checkIndex := tqa.tenantOrderIndex + + // iterate through the tenant order until we find a tenant that is assigned to the current querier, or + // have checked the entire tenantIDOrder, whichever comes first + for iters := 0; iters < len(tqa.tenantIDOrder); iters++ { + if checkIndex >= len(tqa.tenantIDOrder) { + // do not use modulo to wrap this index; tenantOrderIndex is provided from an outer process + // which does not know if the tenant list has changed since the last dequeue + // wrapping with modulo after shrinking the list could cause us to skip tenants + checkIndex = 0 + } + tenantID := tqa.tenantIDOrder[checkIndex] + tenantName := string(tenantID) + + if _, ok := node.queueMap[tenantName]; !ok { + // tenant not in _this_ node's children, move on + checkIndex++ + continue + } + + checkedAllNodes = node.childrenChecked == len(node.queueMap)+1 + + // if the tenant-querier set is nil, any querier can serve this tenant + if tqa.tenantQuerierIDs[tenantID] == nil { + tqa.tenantOrderIndex = checkIndex + return node.queueMap[tenantName], checkedAllNodes + } + // otherwise, check if the querier is assigned to this tenant + if tenantQuerierSet, ok := tqa.tenantQuerierIDs[tenantID]; ok { + if _, ok := tenantQuerierSet[tqa.currentQuerier]; ok { + tqa.tenantOrderIndex = checkIndex + return node.queueMap[tenantName], checkedAllNodes + } + } + checkIndex++ + } + return nil, checkedAllNodes +} + +// dequeueUpdateState deletes the dequeued-from node from the following locations if it is empty: +// - parent's queueMap, +// - tenantNodes +// - tenantIDOrder iff there are no other nodes by the same name in tenantNodes. If the child is at the end of +// tenantIDOrder, it is removed outright; otherwise, it is replaced with an empty ("") element +// +// dequeueUpdateState would normally also handle incrementing the queue position after performing a dequeue, but +// tenantQuerierAssignments currently expects the caller to handle this by having the querier set tenantOrderIndex. +func (tqa *tenantQuerierAssignments) dequeueUpdateState(node *Node, dequeuedFrom *Node) { + // if dequeuedFrom is nil or is not empty, we don't need to do anything; + // position updates will be handled by the caller, and we don't need to remove any nodes. + if dequeuedFrom == nil || !dequeuedFrom.IsEmpty() { + return + } + + // delete from the node's children + childName := dequeuedFrom.Name() + delete(node.queueMap, childName) + + // delete from shared tenantNodes + for i, tenantNode := range tqa.tenantNodes[childName] { + if tenantNode == dequeuedFrom { + tqa.tenantNodes[childName] = append(tqa.tenantNodes[childName][:i], tqa.tenantNodes[childName][i+1:]...) + } + } + + // check tenantNodes; we only remove from tenantIDOrder if all nodes with this name are gone. + // If the removed child is at the _end_ of tenantIDOrder, we remove it outright; otherwise, + // we replace it with an empty string. Removal of elements from anywhere other than the end of the slice + // would re-index all tenant IDs, resulting in skipped tenants when starting iteration from the + // querier-provided lastTenantIndex. + removeFromSharedQueueOrder := len(tqa.tenantNodes[childName]) == 0 + + if removeFromSharedQueueOrder { + for idx, name := range tqa.tenantIDOrder { + if string(name) == childName { + tqa.tenantIDOrder[idx] = emptyTenantID + } + } + // clear all sequential empty elements from tenantIDOrder + lastElementIndex := len(tqa.tenantIDOrder) - 1 + for i := lastElementIndex; i >= 0 && tqa.tenantIDOrder[i] == ""; i-- { + tqa.tenantIDOrder = tqa.tenantIDOrder[:i] + } + } +} + +// addChildNode adds a child to: +// - the node's own queueMap +// - tenantNodes, which maintains a slice of all nodes with the same name. tenantNodes is checked on node deletion +// to ensure that we only remove a tenant from tenantIDOrder if _all_ nodes with the same name have been removed. +// - tenantIDOrder iff the node did not already exist in tenantNodes or tenantIDOrder. addChildNode will place +// a new tenant in the first empty ("") element it finds in tenantIDOrder, or at the end if no empty elements exist. +func (tqa *tenantQuerierAssignments) addChildNode(parent, child *Node) { + childName := child.Name() + _, tenantHasAnyQueue := tqa.tenantNodes[childName] + + // add childNode to node's queueMap, + // and to the shared tenantNodes map + parent.queueMap[childName] = child + tqa.tenantNodes[childName] = append(tqa.tenantNodes[childName], child) + + // if child has any queue, it should already be in tenantIDOrder, return without altering order + if tenantHasAnyQueue { + return + } + + // otherwise, replace the first empty element in n.tenantIDOrder with childName, or append to the end + for i, elt := range tqa.tenantIDOrder { + // if we encounter a tenant with this name in tenantIDOrder already, return without altering order. + // This is a weak check (childName could exist farther down tenantIDOrder, but be inserted here as well), + // but should be fine, since the only time we hit this case is when createOrUpdateTenant is called, which + // only happens immediately before enqueueing. + if elt == TenantID(childName) { + return + } + if elt == "" { + tqa.tenantIDOrder[i] = TenantID(childName) + return + } + } + // if we get here, we didn't find any empty elements in tenantIDOrder; append + tqa.tenantIDOrder = append(tqa.tenantIDOrder, TenantID(childName)) +} + +// updateQueuingAlgorithmState should be called before attempting to dequeue, and updates inputs required by this +// QueuingAlgorithm to dequeue the appropriate value for the given querier. In some test cases, it need not be called +// before consecutive dequeues for the same querier, but in all operating cases, it should be called ahead of a dequeue. +func (tqa *tenantQuerierAssignments) updateQueuingAlgorithmState(querierID QuerierID, tenantOrderIndex int) { + tqa.currentQuerier = querierID + tqa.tenantOrderIndex = tenantOrderIndex +} diff --git a/pkg/scheduler/queue/tenant_queues.go b/pkg/scheduler/queue/tenant_queues.go index a7439f6a5b6..98c7df0db82 100644 --- a/pkg/scheduler/queue/tenant_queues.go +++ b/pkg/scheduler/queue/tenant_queues.go @@ -6,6 +6,7 @@ package queue import ( + "fmt" "time" ) @@ -23,32 +24,63 @@ type tenantRequest struct { // queueBroker encapsulates access to tenant queues for pending requests // and maintains consistency with the tenant-querier assignments type queueBroker struct { - tenantQueuesTree *TreeQueue + tree Tree - tenantQuerierAssignments tenantQuerierAssignments + tenantQuerierAssignments *tenantQuerierAssignments maxTenantQueueSize int additionalQueueDimensionsEnabled bool + prioritizeQueryComponents bool } -func newQueueBroker(maxTenantQueueSize int, additionalQueueDimensionsEnabled bool, forgetDelay time.Duration) *queueBroker { - return &queueBroker{ - tenantQueuesTree: NewTreeQueue("root"), - tenantQuerierAssignments: tenantQuerierAssignments{ - queriersByID: map[QuerierID]*querierConn{}, - querierIDsSorted: nil, - querierForgetDelay: forgetDelay, - tenantIDOrder: nil, - tenantsByID: map[TenantID]*queueTenant{}, - tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{}, - }, +func newQueueBroker( + maxTenantQueueSize int, + additionalQueueDimensionsEnabled bool, + useMultiAlgoTreeQueue bool, + forgetDelay time.Duration, +) *queueBroker { + currentQuerier := QuerierID("") + tqas := &tenantQuerierAssignments{ + queriersByID: map[QuerierID]*querierConn{}, + querierIDsSorted: nil, + querierForgetDelay: forgetDelay, + tenantIDOrder: nil, + tenantsByID: map[TenantID]*queueTenant{}, + tenantQuerierIDs: map[TenantID]map[QuerierID]struct{}{}, + tenantNodes: map[string][]*Node{}, + currentQuerier: currentQuerier, + tenantOrderIndex: localQueueIndex, + } + + var tree Tree + var err error + if useMultiAlgoTreeQueue { + tree, err = NewTree( + tqas, // root; QueuingAlgorithm selects tenants + &roundRobinState{}, // tenant queues; QueuingAlgorithm selects query component + &roundRobinState{}, // query components; QueuingAlgorithm selects query from local queue + ) + } else { + // by default, use the legacy tree queue + tree = NewTreeQueue("root") + } + + // An error building the tree is fatal; we must panic + if err != nil { + panic(fmt.Sprintf("error creating the tree queue: %v", err)) + } + qb := &queueBroker{ + tree: tree, + tenantQuerierAssignments: tqas, maxTenantQueueSize: maxTenantQueueSize, additionalQueueDimensionsEnabled: additionalQueueDimensionsEnabled, } + + return qb } func (qb *queueBroker) isEmpty() bool { - return qb.tenantQueuesTree.IsEmpty() + return qb.tree.IsEmpty() } // enqueueRequestBack is the standard interface to enqueue requests for dispatch to queriers. @@ -64,13 +96,27 @@ func (qb *queueBroker) enqueueRequestBack(request *tenantRequest, tenantMaxQueri if err != nil { return err } - if tenantQueueNode := qb.tenantQueuesTree.getNode(queuePath[:1]); tenantQueueNode != nil { - if tenantQueueNode.ItemCount()+1 > qb.maxTenantQueueSize { + + // TODO (casie): When deprecating TreeQueue, clean this up. + // Technically, the MultiQueuingAlgorithmTreeQueue approach is adequate for both tree types, but we are temporarily + // maintaining the legacy tree behavior as much as possible for stability reasons. + if tq, ok := qb.tree.(*TreeQueue); ok { + if tenantQueueNode := tq.getNode(queuePath[:1]); tenantQueueNode != nil { + if tenantQueueNode.ItemCount()+1 > qb.maxTenantQueueSize { + return ErrTooManyRequests + } + } + } else if _, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + itemCount := 0 + for _, tenantNode := range qb.tenantQuerierAssignments.tenantNodes[string(request.tenantID)] { + itemCount += tenantNode.ItemCount() + } + if itemCount+1 > qb.maxTenantQueueSize { return ErrTooManyRequests } } - err = qb.tenantQueuesTree.EnqueueBackByPath(queuePath, request) + err = qb.tree.EnqueueBackByPath(queuePath, request) return err } @@ -89,12 +135,15 @@ func (qb *queueBroker) enqueueRequestFront(request *tenantRequest, tenantMaxQuer if err != nil { return err } - return qb.tenantQueuesTree.EnqueueFrontByPath(queuePath, request) + return qb.tree.EnqueueFrontByPath(queuePath, request) } func (qb *queueBroker) makeQueuePath(request *tenantRequest) (QueuePath, error) { if qb.additionalQueueDimensionsEnabled { if schedulerRequest, ok := request.req.(*SchedulerRequest); ok { + if qb.prioritizeQueryComponents { + return append(schedulerRequest.AdditionalQueueDimensions, string(request.tenantID)), nil + } return append(QueuePath{string(request.tenantID)}, schedulerRequest.AdditionalQueueDimensions...), nil } } @@ -103,28 +152,71 @@ func (qb *queueBroker) makeQueuePath(request *tenantRequest) (QueuePath, error) return QueuePath{string(request.tenantID)}, nil } -func (qb *queueBroker) dequeueRequestForQuerier(lastTenantIndex int, querierID QuerierID) (*tenantRequest, *queueTenant, int, error) { - tenant, tenantIndex, err := qb.tenantQuerierAssignments.getNextTenantForQuerier(lastTenantIndex, querierID) - if tenant == nil || err != nil { - return nil, tenant, tenantIndex, err +func (qb *queueBroker) dequeueRequestForQuerier( + lastTenantIndex int, + querierID QuerierID, +) ( + *tenantRequest, + *queueTenant, + int, + error, +) { + // check if querier is registered and is not shutting down + if q := qb.tenantQuerierAssignments.queriersByID[querierID]; q == nil || q.shuttingDown { + return nil, nil, qb.tenantQuerierAssignments.tenantOrderIndex, ErrQuerierShuttingDown } - queuePath := QueuePath{string(tenant.tenantID)} - queueElement := qb.tenantQueuesTree.DequeueByPath(queuePath) + var queuePath QueuePath + var queueElement any + if tq, ok := qb.tree.(*TreeQueue); ok { + tenant, tenantIndex, err := qb.tenantQuerierAssignments.getNextTenantForQuerier(lastTenantIndex, querierID) + if tenant == nil || err != nil { + return nil, tenant, tenantIndex, err + } + qb.tenantQuerierAssignments.tenantOrderIndex = tenantIndex + // We can manually build queuePath here because TreeQueue only supports one tree structure ordering: + // root --> tenant --> (optional: query dimensions) + queuePath = QueuePath{string(tenant.tenantID)} + queueElement = tq.DequeueByPath(queuePath) + } else if itq, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + qb.tenantQuerierAssignments.updateQueuingAlgorithmState(querierID, lastTenantIndex) + queuePath, queueElement = itq.Dequeue() + } - queueNodeAfterDequeue := qb.tenantQueuesTree.getNode(queuePath) - if queueNodeAfterDequeue == nil { - // queue node was deleted due to being empty after dequeue - qb.tenantQuerierAssignments.removeTenant(tenant.tenantID) + if queueElement == nil { + return nil, nil, qb.tenantQuerierAssignments.tenantOrderIndex, nil } var request *tenantRequest - if queueElement != nil { - // re-casting to same type it was enqueued as; panic would indicate a bug - request = queueElement.(*tenantRequest) + var tenantID TenantID + + // re-casting to same type it was enqueued as; panic would indicate a bug + request = queueElement.(*tenantRequest) + tenantID = request.tenantID + + var tenant *queueTenant + if tenantID != "" { + tenant = qb.tenantQuerierAssignments.tenantsByID[tenantID] } - return request, tenant, tenantIndex, nil + // TODO (casie): When deprecating TreeQueue, clean this up. + // This cannot be handled by the Tree interface without defining some other, more expansive interfaces + // between the legacy and integrated tree queues, which would be more overhead than it's worth, given + // that we will eventually retire the legacy tree queue. + if tq, ok := qb.tree.(*TreeQueue); ok { + queueNodeAfterDequeue := tq.getNode(queuePath) + if queueNodeAfterDequeue == nil { + // queue node was deleted due to being empty after dequeue + qb.tenantQuerierAssignments.removeTenant(tenant.tenantID) + } + } else if itq, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + queueNodeAfterDequeue := itq.GetNode(queuePath) + if queueNodeAfterDequeue == nil && len(qb.tenantQuerierAssignments.tenantNodes[string(tenantID)]) == 0 { + // queue node was deleted due to being empty after dequeue + qb.tenantQuerierAssignments.removeTenant(tenantID) + } + } + return request, tenant, qb.tenantQuerierAssignments.tenantOrderIndex, nil } func (qb *queueBroker) addQuerierConnection(querierID QuerierID) (resharded bool) { diff --git a/pkg/scheduler/queue/tenant_queues_test.go b/pkg/scheduler/queue/tenant_queues_test.go index f6b09fedd37..443d5dc763b 100644 --- a/pkg/scheduler/queue/tenant_queues_test.go +++ b/pkg/scheduler/queue/tenant_queues_test.go @@ -19,228 +19,420 @@ import ( "github.com/stretchr/testify/require" ) -func TestQueues(t *testing.T) { - qb := newQueueBroker(0, true, 0) - assert.NotNil(t, qb) - assert.NoError(t, isConsistent(qb)) +func (qb *queueBroker) enqueueObjectsForTests(tenantID TenantID, numObjects int) error { + for i := 0; i < numObjects; i++ { + req := &tenantRequest{ + tenantID: tenantID, + req: fmt.Sprintf("%v: object-%v", tenantID, i), + } + path, err := qb.makeQueuePath(req) + if err != nil { + return err + } + err = qb.tree.EnqueueBackByPath(path, req) + if err != nil { + return err + } + } + return nil +} - qb.addQuerierConnection("querier-1") - qb.addQuerierConnection("querier-2") +func buildExpectedObject(tenantID TenantID, num int) *tenantRequest { + return &tenantRequest{ + tenantID: tenantID, + req: fmt.Sprintf("%v: object-%v", tenantID, num), + } +} - req, tenant, lastTenantIndex, err := qb.dequeueRequestForQuerier(-1, "querier-1") - assert.Nil(t, req) - assert.Nil(t, tenant) - assert.NoError(t, err) +type dequeueVal struct { + req *tenantRequest + tenant *queueTenant +} - // Add queues: [one] - qOne := getOrAdd(t, qb, "one", 0) - lastTenantIndex = confirmOrderForQuerier(t, qb, "querier-1", lastTenantIndex, qOne, qOne) +func assertExpectedValuesOnDequeue(t *testing.T, qb *queueBroker, lastTenantIndex int, querierID QuerierID, expectedVals []dequeueVal) int { + var req *tenantRequest + var tenant *queueTenant + var err error - // [one two] - qTwo := getOrAdd(t, qb, "two", 0) + for _, expected := range expectedVals { + req, tenant, lastTenantIndex, err = qb.dequeueRequestForQuerier(lastTenantIndex, querierID) + assert.Equal(t, expected.req, req) + assert.Equal(t, expected.tenant, tenant) + assert.NoError(t, err) + } + return lastTenantIndex +} - lastTenantIndex = confirmOrderForQuerier(t, qb, "querier-1", lastTenantIndex, qTwo, qOne, qTwo, qOne) - confirmOrderForQuerier(t, qb, "querier-2", -1, qOne, qTwo, qOne) +// TestQueues_NoShuffleSharding tests dequeueing of objects for different queriers, where any querier can +// handle queries for any tenant, as tenant queues are added and removed +func TestQueues_NoShuffleSharding(t *testing.T) { + treeTypes := buildTreeTestsStruct() + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + qb := newQueueBroker(0, true, tt.useMultiAlgoTreeQueue, 0) + assert.NotNil(t, qb) + assert.NoError(t, isConsistent(qb)) - // [one two three] - // confirm fifo by adding a third queue and iterating to it - qThree := getOrAdd(t, qb, "three", 0) + qb.addQuerierConnection("querier-1") + qb.addQuerierConnection("querier-2") - lastTenantIndex = confirmOrderForQuerier(t, qb, "querier-1", lastTenantIndex, qTwo, qThree, qOne) + req, tenant, lastTenantIndexQuerierOne, err := qb.dequeueRequestForQuerier(-1, "querier-1") + assert.Nil(t, req) + assert.Nil(t, tenant) + assert.NoError(t, err) + + // Add tenant queues: tenant "one" + err = qb.tenantQuerierAssignments.createOrUpdateTenant("one", 0) + assert.NoError(t, err) + + tenantOne := qb.tenantQuerierAssignments.tenantsByID["one"] + + // Queue enough objects for tenant "one" + err = qb.enqueueObjectsForTests(tenantOne.tenantID, 10) + assert.NoError(t, err) + assert.NoError(t, isConsistent(qb)) - // Remove one: ["" two three] - qb.removeTenantQueue("one") - assert.NoError(t, isConsistent(qb)) + //queuePathOne := QueuePath{"root", "one"} + expectedDequeueVals := []dequeueVal{ + {buildExpectedObject(tenantOne.tenantID, 0), tenantOne}, + {buildExpectedObject(tenantOne.tenantID, 1), tenantOne}, + } + lastTenantIndexQuerierOne = assertExpectedValuesOnDequeue(t, qb, lastTenantIndexQuerierOne, "querier-1", expectedDequeueVals) - lastTenantIndex = confirmOrderForQuerier(t, qb, "querier-1", lastTenantIndex, qTwo, qThree, qTwo) + // Add tenant two + err = qb.tenantQuerierAssignments.createOrUpdateTenant("two", 0) + assert.NoError(t, err) - // "four" is added at the beginning of the list: [four two three] - qFour := getOrAdd(t, qb, "four", 0) + tenantTwo := qb.tenantQuerierAssignments.tenantsByID["two"] - lastTenantIndex = confirmOrderForQuerier(t, qb, "querier-1", lastTenantIndex, qThree, qFour, qTwo, qThree) + err = qb.enqueueObjectsForTests(tenantTwo.tenantID, 10) + assert.NoError(t, err) + assert.NoError(t, isConsistent(qb)) - // Remove two: [four "" three] - qb.removeTenantQueue("two") - assert.NoError(t, isConsistent(qb)) + expectedDequeueVals = []dequeueVal{ + {buildExpectedObject(tenantTwo.tenantID, 0), tenantTwo}, + {buildExpectedObject(tenantOne.tenantID, 2), tenantOne}, + {buildExpectedObject(tenantTwo.tenantID, 1), tenantTwo}, + {buildExpectedObject(tenantOne.tenantID, 3), tenantOne}, + } - lastTenantIndex = confirmOrderForQuerier(t, qb, "querier-1", lastTenantIndex, qFour, qThree, qFour) + lastTenantIndexQuerierOne = assertExpectedValuesOnDequeue(t, qb, lastTenantIndexQuerierOne, "querier-1", expectedDequeueVals) - // Remove three: [four] - qb.removeTenantQueue("three") - assert.NoError(t, isConsistent(qb)) + expectedDequeueVals = []dequeueVal{ + {buildExpectedObject(tenantOne.tenantID, 4), tenantOne}, + {buildExpectedObject(tenantTwo.tenantID, 2), tenantTwo}, + {buildExpectedObject(tenantOne.tenantID, 5), tenantOne}, + } + lastTenantIndexQuerierTwo := assertExpectedValuesOnDequeue(t, qb, -1, "querier-2", expectedDequeueVals) - // Remove four: [] - qb.removeTenantQueue("four") - assert.NoError(t, isConsistent(qb)) + // [one two three] + // confirm fifo by adding a third tenant queue and iterating to it + err = qb.tenantQuerierAssignments.createOrUpdateTenant("three", 0) + assert.NoError(t, err) + + tenantThree := qb.tenantQuerierAssignments.tenantsByID["three"] + + err = qb.enqueueObjectsForTests(tenantThree.tenantID, 10) + assert.NoError(t, err) + assert.NoError(t, isConsistent(qb)) + + // lastTenantIndexQuerierOne was 0 (tenantOne) for querier-1; appending + expectedDequeueVals = []dequeueVal{ + {buildExpectedObject(tenantTwo.tenantID, 3), tenantTwo}, + {buildExpectedObject(tenantThree.tenantID, 0), tenantThree}, + {buildExpectedObject(tenantOne.tenantID, 6), tenantOne}, + } + + lastTenantIndexQuerierOne = assertExpectedValuesOnDequeue(t, qb, lastTenantIndexQuerierOne, "querier-1", expectedDequeueVals) + + // Remove one: ["" two three] + qb.removeTenantQueue("one") + assert.NoError(t, isConsistent(qb)) + + expectedDequeueVals = []dequeueVal{ + {buildExpectedObject(tenantTwo.tenantID, 4), tenantTwo}, + {buildExpectedObject(tenantThree.tenantID, 1), tenantThree}, + {buildExpectedObject(tenantTwo.tenantID, 5), tenantTwo}, + } + lastTenantIndexQuerierOne = assertExpectedValuesOnDequeue(t, qb, lastTenantIndexQuerierOne, "querier-1", expectedDequeueVals) + + // "four" is added at the beginning of the list: [four two three] + err = qb.tenantQuerierAssignments.createOrUpdateTenant("four", 0) + assert.NoError(t, err) + + tenantFour := qb.tenantQuerierAssignments.tenantsByID["four"] + + err = qb.enqueueObjectsForTests(tenantFour.tenantID, 10) + assert.NoError(t, err) + assert.NoError(t, isConsistent(qb)) + + expectedDequeueVals = []dequeueVal{ + {buildExpectedObject(tenantTwo.tenantID, 6), tenantTwo}, + {buildExpectedObject(tenantThree.tenantID, 2), tenantThree}, + {buildExpectedObject(tenantFour.tenantID, 0), tenantFour}, + {buildExpectedObject(tenantTwo.tenantID, 7), tenantTwo}, + } + + _ = assertExpectedValuesOnDequeue(t, qb, lastTenantIndexQuerierTwo, "querier-2", expectedDequeueVals) + + // Remove two: [four "" three] + qb.removeTenantQueue("two") + assert.NoError(t, isConsistent(qb)) + + expectedDequeueVals = []dequeueVal{ + {buildExpectedObject(tenantThree.tenantID, 3), tenantThree}, + {buildExpectedObject(tenantFour.tenantID, 1), tenantFour}, + {buildExpectedObject(tenantThree.tenantID, 4), tenantThree}, + } + lastTenantIndexQuerierOne = assertExpectedValuesOnDequeue(t, qb, lastTenantIndexQuerierOne, "querier-1", expectedDequeueVals) + + // Remove three: [four ""] ) + qb.removeTenantQueue("three") + assert.NoError(t, isConsistent(qb)) + + // Remove four: [] + qb.removeTenantQueue("four") + assert.NoError(t, isConsistent(qb)) - req, _, _, err = qb.dequeueRequestForQuerier(lastTenantIndex, "querier-1") - assert.Nil(t, req) - assert.NoError(t, err) + req, tenant, _, err = qb.dequeueRequestForQuerier(lastTenantIndexQuerierOne, "querier-1") + assert.Nil(t, req) + assert.Nil(t, tenant) + assert.NoError(t, err) + + }) + } } func TestQueuesRespectMaxTenantQueueSizeWithSubQueues(t *testing.T) { - maxTenantQueueSize := 100 - qb := newQueueBroker(maxTenantQueueSize, true, 0) - additionalQueueDimensions := map[int][]string{ - 0: nil, - 1: {"ingester"}, - 2: {"store-gateway"}, - 3: {"ingester-and-store-gateway"}, - } - req := &SchedulerRequest{ - Ctx: context.Background(), - FrontendAddr: "http://query-frontend:8007", - UserID: "tenant-1", - Request: &httpgrpc.HTTPRequest{}, - } + // TODO (casie): When implementing prioritizeQueryComponents, add tests here, since we will be able to have + // multiple nodes for a single tenant + treeTypes := buildTreeTestsStruct() + + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + maxTenantQueueSize := 100 + qb := newQueueBroker(maxTenantQueueSize, true, tt.useMultiAlgoTreeQueue, 0) + additionalQueueDimensions := map[int][]string{ + 0: nil, + 1: {"ingester"}, + 2: {"store-gateway"}, + 3: {"ingester-and-store-gateway"}, + } + req := &SchedulerRequest{ + Ctx: context.Background(), + FrontendAddr: "http://query-frontend:8007", + UserID: "tenant-1", + Request: &httpgrpc.HTTPRequest{}, + } + + // build queue evenly with either no additional queue dimension or one of 3 additional dimensions + for i := 0; i < len(additionalQueueDimensions); i++ { + for j := 0; j < maxTenantQueueSize/len(additionalQueueDimensions); j++ { + req.AdditionalQueueDimensions = additionalQueueDimensions[i] + tenantReq := &tenantRequest{tenantID: "tenant-1", req: req} + err := qb.enqueueRequestBack(tenantReq, 0) + assert.NoError(t, err) + } + } + // assert item count of tenant node and its subnodes + queuePath := QueuePath{"tenant-1"} + + // TODO (casie): After deprecating legacy tree queue, clean this up + if tq, ok := qb.tree.(*TreeQueue); ok { + assert.Equal(t, maxTenantQueueSize, tq.getNode(queuePath).ItemCount()) + } else if itq, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + assert.Equal(t, maxTenantQueueSize, itq.GetNode(queuePath).ItemCount()) + } + + // assert equal distribution of queue items between tenant node and 3 subnodes + for _, v := range additionalQueueDimensions { + queuePath := append(QueuePath{"tenant-1"}, v...) + // TODO (casie): After deprecating legacy tree queue, clean this up + var itemCount int + if tq, ok := qb.tree.(*TreeQueue); ok { + itemCount = tq.getNode(queuePath).LocalQueueLen() + } else if itq, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + itemCount = itq.GetNode(queuePath).getLocalQueue().Len() + } + assert.Equal(t, maxTenantQueueSize/len(additionalQueueDimensions), itemCount) + } + + // assert error received when hitting a tenant's enqueue limit, + // even though most of the requests are in the subqueues + for _, additionalQueueDimension := range additionalQueueDimensions { + // error should be received no matter if the enqueue attempt + // is for the tenant queue or any of its subqueues + req.AdditionalQueueDimensions = additionalQueueDimension + tenantReq := &tenantRequest{tenantID: "tenant-1", req: req} + err := qb.enqueueRequestBack(tenantReq, 0) + assert.ErrorIs(t, err, ErrTooManyRequests) + } + + // dequeue a request + qb.addQuerierConnection("querier-1") + dequeuedTenantReq, _, _, err := qb.dequeueRequestForQuerier(-1, "querier-1") + assert.NoError(t, err) + assert.NotNil(t, dequeuedTenantReq) - // build queue evenly with either no additional queue dimension or one of 3 additional dimensions - for i := 0; i < len(additionalQueueDimensions); i++ { - for j := 0; j < maxTenantQueueSize/len(additionalQueueDimensions); j++ { - req.AdditionalQueueDimensions = additionalQueueDimensions[i] tenantReq := &tenantRequest{tenantID: "tenant-1", req: req} - err := qb.enqueueRequestBack(tenantReq, 0) + // assert not hitting an error when enqueueing after dequeuing to below the limit + err = qb.enqueueRequestBack(tenantReq, 0) assert.NoError(t, err) - } - } - // assert item count of tenant node and its subnodes - queuePath := QueuePath{"tenant-1"} - assert.Equal(t, maxTenantQueueSize, qb.tenantQueuesTree.getNode(queuePath).ItemCount()) - - // assert equal distribution of queue items between tenant node and 3 subnodes - for _, v := range additionalQueueDimensions { - queuePath := append(QueuePath{"tenant-1"}, v...) - assert.Equal(t, maxTenantQueueSize/len(additionalQueueDimensions), qb.tenantQueuesTree.getNode(queuePath).LocalQueueLen()) - } - // assert error received when hitting a tenant's enqueue limit, - // even though most of the requests are in the subqueues - for _, additionalQueueDimension := range additionalQueueDimensions { - // error should be received no matter if the enqueue attempt - // is for the tenant queue or any of its subqueues - req.AdditionalQueueDimensions = additionalQueueDimension - tenantReq := &tenantRequest{tenantID: "tenant-1", req: req} - err := qb.enqueueRequestBack(tenantReq, 0) - assert.ErrorIs(t, err, ErrTooManyRequests) + // we then hit an error again, as we are back at the limit + err = qb.enqueueRequestBack(tenantReq, 0) + assert.ErrorIs(t, err, ErrTooManyRequests) + }) } +} - // dequeue a request - qb.addQuerierConnection("querier-1") - dequeuedTenantReq, _, _, err := qb.dequeueRequestForQuerier(-1, "querier-1") - assert.NoError(t, err) - assert.NotNil(t, dequeuedTenantReq) +func TestQueuesOnTerminatingQuerier(t *testing.T) { + treeTypes := buildTreeTestsStruct() + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + qb := newQueueBroker(0, true, tt.useMultiAlgoTreeQueue, 0) + assert.NotNil(t, qb) + assert.NoError(t, isConsistent(qb)) - tenantReq := &tenantRequest{tenantID: "tenant-1", req: req} - // assert not hitting an error when enqueueing after dequeuing to below the limit - err = qb.enqueueRequestBack(tenantReq, 0) - assert.NoError(t, err) + qb.addQuerierConnection("querier-1") + qb.addQuerierConnection("querier-2") - // we then hit an error again, as we are back at the limit - err = qb.enqueueRequestBack(tenantReq, 0) - assert.ErrorIs(t, err, ErrTooManyRequests) -} + // Add queues: [one two] + err := qb.tenantQuerierAssignments.createOrUpdateTenant("one", 0) + assert.NoError(t, err) + err = qb.tenantQuerierAssignments.createOrUpdateTenant("two", 0) + assert.NoError(t, err) -func TestQueuesOnTerminatingQuerier(t *testing.T) { - qb := newQueueBroker(0, true, 0) - assert.NotNil(t, qb) - assert.NoError(t, isConsistent(qb)) - - qb.addQuerierConnection("querier-1") - qb.addQuerierConnection("querier-2") - - // Add queues: [one, two] - qOne := getOrAdd(t, qb, "one", 0) - qTwo := getOrAdd(t, qb, "two", 0) - confirmOrderForQuerier(t, qb, "querier-1", -1, qOne, qTwo, qOne, qTwo) - confirmOrderForQuerier(t, qb, "querier-2", -1, qOne, qTwo, qOne, qTwo) - - // After notify shutdown for querier-2, it's expected to own no queue. - qb.notifyQuerierShutdown("querier-2") - tenant, _, err := qb.tenantQuerierAssignments.getNextTenantForQuerier(-1, "querier-2") - assert.Nil(t, tenant) - assert.Equal(t, ErrQuerierShuttingDown, err) - - // However, querier-1 still get queues because it's still running. - confirmOrderForQuerier(t, qb, "querier-1", -1, qOne, qTwo, qOne, qTwo) - - // After disconnecting querier-2, it's expected to own no queue. - qb.tenantQuerierAssignments.removeQuerier("querier-2") - tenant, _, err = qb.tenantQuerierAssignments.getNextTenantForQuerier(-1, "querier-2") - assert.Nil(t, tenant) - assert.Equal(t, ErrQuerierShuttingDown, err) -} + err = qb.enqueueObjectsForTests("one", 10) + assert.NoError(t, err) + tenantOne := qb.tenantQuerierAssignments.tenantsByID["one"] + + err = qb.enqueueObjectsForTests("two", 10) + assert.NoError(t, err) + tenantTwo := qb.tenantQuerierAssignments.tenantsByID["two"] -func TestQueuesWithQueriers(t *testing.T) { - qb := newQueueBroker(0, true, 0) - assert.NotNil(t, qb) - assert.NoError(t, isConsistent(qb)) + expectedDequeueVals := []dequeueVal{ + {buildExpectedObject(tenantOne.tenantID, 0), tenantOne}, + {buildExpectedObject(tenantTwo.tenantID, 0), tenantTwo}, + {buildExpectedObject(tenantOne.tenantID, 1), tenantOne}, + {buildExpectedObject(tenantTwo.tenantID, 1), tenantTwo}, + } + qOneLastTenantIndex := assertExpectedValuesOnDequeue(t, qb, -1, "querier-1", expectedDequeueVals) - queriers := 30 - numTenants := 1000 - maxQueriersPerTenant := 5 + expectedDequeueVals = []dequeueVal{ + {buildExpectedObject(tenantOne.tenantID, 2), tenantOne}, + {buildExpectedObject(tenantTwo.tenantID, 2), tenantTwo}, + {buildExpectedObject(tenantOne.tenantID, 3), tenantOne}, + {buildExpectedObject(tenantTwo.tenantID, 3), tenantTwo}, + } + qTwolastTenantIndex := assertExpectedValuesOnDequeue(t, qb, -1, "querier-2", expectedDequeueVals) + + // After notify shutdown for querier-2, it's expected to own no queue. + qb.notifyQuerierShutdown("querier-2") + req, tenant, qTwolastTenantIndex, err := qb.dequeueRequestForQuerier(qTwolastTenantIndex, "querier-2") + assert.Nil(t, req) + assert.Nil(t, tenant) + assert.Equal(t, ErrQuerierShuttingDown, err) + + // However, querier-1 still get queues because it's still running. + expectedDequeueVals = []dequeueVal{ + {buildExpectedObject(tenantOne.tenantID, 4), tenantOne}, + {buildExpectedObject(tenantTwo.tenantID, 4), tenantTwo}, + {buildExpectedObject(tenantOne.tenantID, 5), tenantOne}, + {buildExpectedObject(tenantTwo.tenantID, 5), tenantTwo}, + } - // Add some queriers. - for ix := 0; ix < queriers; ix++ { - qid := QuerierID(fmt.Sprintf("querier-%d", ix)) - qb.addQuerierConnection(qid) + for _, expected := range expectedDequeueVals { + req, tenant, qOneLastTenantIndex, err = qb.dequeueRequestForQuerier(qOneLastTenantIndex, "querier-1") + assert.Equal(t, expected.req, req) + assert.Equal(t, expected.tenant, tenant) + assert.NoError(t, err) + } - // No querier has any queues yet. - req, tenant, _, err := qb.dequeueRequestForQuerier(-1, qid) - assert.Nil(t, req) - assert.Nil(t, tenant) - assert.NoError(t, err) + // After disconnecting querier-2, it's expected to own no queue. + qb.tenantQuerierAssignments.removeQuerier("querier-2") + req, tenant, _, err = qb.dequeueRequestForQuerier(qTwolastTenantIndex, "querier-2") + assert.Nil(t, req) + assert.Nil(t, tenant) + assert.Equal(t, ErrQuerierShuttingDown, err) + + }) } +} - assert.NoError(t, isConsistent(qb)) +func TestQueues_QuerierDistribution(t *testing.T) { + treeTypes := buildTreeTestsStruct() + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + qb := newQueueBroker(0, true, tt.useMultiAlgoTreeQueue, 0) + assert.NotNil(t, qb) + assert.NoError(t, isConsistent(qb)) - // Add tenant queues. - for i := 0; i < numTenants; i++ { - uid := TenantID(fmt.Sprintf("tenant-%d", i)) - getOrAdd(t, qb, uid, maxQueriersPerTenant) + queriers := 30 + numTenants := 1000 + maxQueriersPerTenant := 5 - // Verify it has maxQueriersPerTenant queriers assigned now. - qs := qb.tenantQuerierAssignments.tenantQuerierIDs[uid] - assert.Equal(t, maxQueriersPerTenant, len(qs)) - } + // Add some queriers. + for ix := 0; ix < queriers; ix++ { + qid := QuerierID(fmt.Sprintf("querier-%d", ix)) + qb.addQuerierConnection(qid) - // After adding all tenants, verify results. For each querier, find out how many different tenants it handles, - // and compute mean and stdDev. - queriersMap := make(map[QuerierID]int) + // No querier has any queues yet. + req, tenant, _, err := qb.dequeueRequestForQuerier(-1, qid) + assert.Nil(t, req) + assert.Nil(t, tenant) + assert.NoError(t, err) + } - for q := 0; q < queriers; q++ { - qid := QuerierID(fmt.Sprintf("querier-%d", q)) + assert.NoError(t, isConsistent(qb)) - lastTenantIndex := -1 - for { - _, newIx, err := qb.tenantQuerierAssignments.getNextTenantForQuerier(lastTenantIndex, qid) - assert.NoError(t, err) - if newIx < lastTenantIndex { - break + // Add tenant queues. + for i := 0; i < numTenants; i++ { + uid := TenantID(fmt.Sprintf("tenant-%d", i)) + err := qb.tenantQuerierAssignments.createOrUpdateTenant(uid, maxQueriersPerTenant) + assert.NoError(t, err) + + //Enqueue some stuff so that the tree queue node exists + err = qb.enqueueObjectsForTests(uid, 1) + assert.NoError(t, err) + + // Verify it has maxQueriersPerTenant queriers assigned now. + qs := qb.tenantQuerierAssignments.tenantQuerierIDs[uid] + assert.Equal(t, maxQueriersPerTenant, len(qs)) } - lastTenantIndex = newIx - queriersMap[qid]++ - } - } - mean := float64(0) - for _, c := range queriersMap { - mean += float64(c) - } - mean = mean / float64(len(queriersMap)) + // After adding all tenants, verify results. For each querier, find out how many different tenants it handles, + // and compute mean and stdDev. + queriersMap := make(map[QuerierID]int) - stdDev := float64(0) - for _, c := range queriersMap { - d := float64(c) - mean - stdDev += (d * d) - } - stdDev = math.Sqrt(stdDev / float64(len(queriersMap))) - t.Log("mean:", mean, "stddev:", stdDev) + for _, querierSet := range qb.tenantQuerierAssignments.tenantQuerierIDs { + for querierID := range querierSet { + queriersMap[querierID]++ + } + } + + mean := float64(0) + for _, c := range queriersMap { + mean += float64(c) + } + mean = mean / float64(len(queriersMap)) - assert.InDelta(t, numTenants*maxQueriersPerTenant/queriers, mean, 1) - assert.InDelta(t, stdDev, 0, mean*0.2) + stdDev := float64(0) + for _, c := range queriersMap { + d := float64(c) - mean + stdDev += (d * d) + } + stdDev = math.Sqrt(stdDev / float64(len(queriersMap))) + t.Log("mean:", mean, "stddev:", stdDev) + + assert.InDelta(t, numTenants*maxQueriersPerTenant/queriers, mean, 1) + assert.InDelta(t, stdDev, 0, mean*0.2) + }) + } } func TestQueuesConsistency(t *testing.T) { + treeTypes := buildTreeTestsStruct() tests := map[string]struct { forgetDelay time.Duration }{ @@ -248,46 +440,49 @@ func TestQueuesConsistency(t *testing.T) { "with forget delay": {forgetDelay: time.Minute}, } - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - qb := newQueueBroker(0, true, testData.forgetDelay) - assert.NotNil(t, qb) - assert.NoError(t, isConsistent(qb)) - - r := rand.New(rand.NewSource(time.Now().Unix())) - - lastTenantIndexes := map[QuerierID]int{} - - conns := map[QuerierID]int{} - - for i := 0; i < 10000; i++ { - switch r.Int() % 6 { - case 0: - queue, err := qb.getOrAddTenantQueue(generateTenant(r), 3) - assert.Nil(t, err) - assert.NotNil(t, queue) - case 1: - querierID := generateQuerier(r) - _, tenantIndex, _ := qb.tenantQuerierAssignments.getNextTenantForQuerier(lastTenantIndexes[querierID], querierID) - lastTenantIndexes[querierID] = tenantIndex - case 2: - qb.removeTenantQueue(generateTenant(r)) - case 3: - q := generateQuerier(r) - qb.addQuerierConnection(q) - conns[q]++ - case 4: - q := generateQuerier(r) - if conns[q] > 0 { - qb.removeQuerierConnection(q, time.Now()) - conns[q]-- + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + qb := newQueueBroker(0, true, tt.useMultiAlgoTreeQueue, testData.forgetDelay) + assert.NotNil(t, qb) + assert.NoError(t, isConsistent(qb)) + + r := rand.New(rand.NewSource(time.Now().Unix())) + + lastTenantIndexes := map[QuerierID]int{} + + conns := map[QuerierID]int{} + + for i := 0; i < 10000; i++ { + switch r.Int() % 6 { + case 0: + err := qb.getOrAddTenantQueue(generateTenant(r), 3) + assert.Nil(t, err) + case 1: + querierID := generateQuerier(r) + _, tenantIndex, _ := qb.tenantQuerierAssignments.getNextTenantForQuerier(lastTenantIndexes[querierID], querierID) + lastTenantIndexes[querierID] = tenantIndex + case 2: + qb.removeTenantQueue(generateTenant(r)) + case 3: + q := generateQuerier(r) + qb.addQuerierConnection(q) + conns[q]++ + case 4: + q := generateQuerier(r) + if conns[q] > 0 { + qb.removeQuerierConnection(q, time.Now()) + conns[q]-- + } + case 5: + q := generateQuerier(r) + qb.notifyQuerierShutdown(q) + } + + assert.NoErrorf(t, isConsistent(qb), "last action %d, rand: %d", i, r.Int()%6) } - case 5: - q := generateQuerier(r) - qb.notifyQuerierShutdown(q) - } - - assert.NoErrorf(t, isConsistent(qb), "last action %d", i) + }) } }) } @@ -300,88 +495,99 @@ func TestQueues_ForgetDelay(t *testing.T) { numTenants = 10 ) - now := time.Now() - qb := newQueueBroker(0, true, forgetDelay) - assert.NotNil(t, qb) - assert.NoError(t, isConsistent(qb)) + treeTypes := buildTreeTestsStruct() - // 3 queriers open 2 connections each. - for i := 1; i <= 3; i++ { - qb.addQuerierConnection(QuerierID(fmt.Sprintf("querier-%d", i))) - qb.addQuerierConnection(QuerierID(fmt.Sprintf("querier-%d", i))) - } + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + now := time.Now() + qb := newQueueBroker(0, true, tt.useMultiAlgoTreeQueue, forgetDelay) + assert.NotNil(t, qb) + assert.NoError(t, isConsistent(qb)) - // Add tenant queues. - for i := 0; i < numTenants; i++ { - tenantID := TenantID(fmt.Sprintf("tenant-%d", i)) - getOrAdd(t, qb, tenantID, maxQueriersPerTenant) - } + // 3 queriers open 2 connections each. + for i := 1; i <= 3; i++ { + qb.addQuerierConnection(QuerierID(fmt.Sprintf("querier-%d", i))) + qb.addQuerierConnection(QuerierID(fmt.Sprintf("querier-%d", i))) + } - // We expect querier-1 to have some tenants. - querier1Tenants := getTenantsByQuerier(qb, "querier-1") - require.NotEmpty(t, querier1Tenants) + // Add tenant queues. + for i := 0; i < numTenants; i++ { + tenantID := TenantID(fmt.Sprintf("tenant-%d", i)) + err := qb.tenantQuerierAssignments.createOrUpdateTenant(tenantID, maxQueriersPerTenant) + assert.NoError(t, err) - // Gracefully shutdown querier-1. - qb.removeQuerierConnection("querier-1", now.Add(20*time.Second)) - qb.removeQuerierConnection("querier-1", now.Add(21*time.Second)) - qb.notifyQuerierShutdown("querier-1") + //Enqueue some stuff so that the tree queue node exists + err = qb.enqueueObjectsForTests(tenantID, 1) + assert.NoError(t, err) + } - // We expect querier-1 has been removed. - assert.NotContains(t, qb.tenantQuerierAssignments.queriersByID, "querier-1") - assert.NoError(t, isConsistent(qb)) + // We expect querier-1 to have some tenants. + querier1Tenants := getTenantsByQuerier(qb, "querier-1") + require.NotEmpty(t, querier1Tenants) - // We expect querier-1 tenants have been shuffled to other queriers. - for _, tenantID := range querier1Tenants { - assert.Contains(t, append(getTenantsByQuerier(qb, "querier-2"), getTenantsByQuerier(qb, "querier-3")...), tenantID) - } + // Gracefully shutdown querier-1. + qb.removeQuerierConnection("querier-1", now.Add(20*time.Second)) + qb.removeQuerierConnection("querier-1", now.Add(21*time.Second)) + qb.notifyQuerierShutdown("querier-1") - // Querier-1 reconnects. - qb.addQuerierConnection("querier-1") - qb.addQuerierConnection("querier-1") + // We expect querier-1 has been removed. + assert.NotContains(t, qb.tenantQuerierAssignments.queriersByID, "querier-1") + assert.NoError(t, isConsistent(qb)) - // We expect the initial querier-1 tenants have got back to querier-1. - for _, tenantID := range querier1Tenants { - assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) - } + // We expect querier-1 tenants have been shuffled to other queriers. + for _, tenantID := range querier1Tenants { + assert.Contains(t, append(getTenantsByQuerier(qb, "querier-2"), getTenantsByQuerier(qb, "querier-3")...), tenantID) + } - // Querier-1 abruptly terminates (no shutdown notification received). - qb.removeQuerierConnection("querier-1", now.Add(40*time.Second)) - qb.removeQuerierConnection("querier-1", now.Add(41*time.Second)) + // Querier-1 reconnects. + qb.addQuerierConnection("querier-1") + qb.addQuerierConnection("querier-1") - // We expect querier-1 has NOT been removed. - assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) - assert.NoError(t, isConsistent(qb)) + // We expect the initial querier-1 tenants have got back to querier-1. + for _, tenantID := range querier1Tenants { + assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) + } - // We expect the querier-1 tenants have not been shuffled to other queriers. - for _, tenantID := range querier1Tenants { - assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) - } + // Querier-1 abruptly terminates (no shutdown notification received). + qb.removeQuerierConnection("querier-1", now.Add(40*time.Second)) + qb.removeQuerierConnection("querier-1", now.Add(41*time.Second)) - // Try to forget disconnected queriers, but querier-1 forget delay hasn't passed yet. - qb.forgetDisconnectedQueriers(now.Add(90 * time.Second)) + // We expect querier-1 has NOT been removed. + assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) + assert.NoError(t, isConsistent(qb)) - assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) - assert.NoError(t, isConsistent(qb)) + // We expect the querier-1 tenants have not been shuffled to other queriers. + for _, tenantID := range querier1Tenants { + assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) + } - for _, tenantID := range querier1Tenants { - assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) - } + // Try to forget disconnected queriers, but querier-1 forget delay hasn't passed yet. + qb.forgetDisconnectedQueriers(now.Add(90 * time.Second)) + + assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) + assert.NoError(t, isConsistent(qb)) + + for _, tenantID := range querier1Tenants { + assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) + } - // Try to forget disconnected queriers. This time querier-1 forget delay has passed. - qb.forgetDisconnectedQueriers(now.Add(105 * time.Second)) + // Try to forget disconnected queriers. This time querier-1 forget delay has passed. + qb.forgetDisconnectedQueriers(now.Add(105 * time.Second)) - assert.NotContains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) - assert.NoError(t, isConsistent(qb)) + assert.NotContains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) + assert.NoError(t, isConsistent(qb)) - // We expect querier-1 tenants have been shuffled to other queriers. - for _, tenantID := range querier1Tenants { - assert.Contains(t, append(getTenantsByQuerier(qb, "querier-2"), getTenantsByQuerier(qb, "querier-3")...), tenantID) + // We expect querier-1 tenants have been shuffled to other queriers. + for _, tenantID := range querier1Tenants { + assert.Contains(t, append(getTenantsByQuerier(qb, "querier-2"), getTenantsByQuerier(qb, "querier-3")...), tenantID) + } + }) } } @@ -392,69 +598,81 @@ func TestQueues_ForgetDelay_ShouldCorrectlyHandleQuerierReconnectingBeforeForget numTenants = 100 ) - now := time.Now() - qb := newQueueBroker(0, true, forgetDelay) - assert.NotNil(t, qb) - assert.NoError(t, isConsistent(qb)) + treeTypes := buildTreeTestsStruct() - // 3 queriers open 2 connections each. - for i := 1; i <= 3; i++ { - qb.addQuerierConnection(QuerierID(fmt.Sprintf("querier-%d", i))) - qb.addQuerierConnection(QuerierID(fmt.Sprintf("querier-%d", i))) - } + for _, tt := range treeTypes { + t.Run(tt.name, func(t *testing.T) { + now := time.Now() + qb := newQueueBroker(0, true, tt.useMultiAlgoTreeQueue, forgetDelay) + assert.NotNil(t, qb) + assert.NoError(t, isConsistent(qb)) - // Add tenant queues. - for i := 0; i < numTenants; i++ { - tenantID := TenantID(fmt.Sprintf("tenant-%d", i)) - getOrAdd(t, qb, tenantID, maxQueriersPerTenant) - } + // 3 queriers open 2 connections each. + for i := 1; i <= 3; i++ { + qb.addQuerierConnection(QuerierID(fmt.Sprintf("querier-%d", i))) + qb.addQuerierConnection(QuerierID(fmt.Sprintf("querier-%d", i))) + } - // We expect querier-1 to have some tenants. - querier1Tenants := getTenantsByQuerier(qb, "querier-1") - require.NotEmpty(t, querier1Tenants) + // Add tenant queues. + for i := 0; i < numTenants; i++ { + tenantID := TenantID(fmt.Sprintf("tenant-%d", i)) + err := qb.tenantQuerierAssignments.createOrUpdateTenant(tenantID, maxQueriersPerTenant) + assert.NoError(t, err) - // Querier-1 abruptly terminates (no shutdown notification received). - qb.removeQuerierConnection("querier-1", now.Add(40*time.Second)) - qb.removeQuerierConnection("querier-1", now.Add(41*time.Second)) + //Enqueue some stuff so that the tree queue node exists + err = qb.enqueueObjectsForTests(tenantID, 1) + assert.NoError(t, err) + } - // We expect querier-1 has NOT been removed. - assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) - assert.NoError(t, isConsistent(qb)) + // We expect querier-1 to have some tenants. + querier1Tenants := getTenantsByQuerier(qb, "querier-1") + require.NotEmpty(t, querier1Tenants) - // We expect the querier-1 tenants have not been shuffled to other queriers. - for _, tenantID := range querier1Tenants { - assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) - } + // Querier-1 abruptly terminates (no shutdown notification received). + qb.removeQuerierConnection("querier-1", now.Add(40*time.Second)) + qb.removeQuerierConnection("querier-1", now.Add(41*time.Second)) + + // We expect querier-1 has NOT been removed. + assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) + assert.NoError(t, isConsistent(qb)) - // Try to forget disconnected queriers, but querier-1 forget delay hasn't passed yet. - qb.forgetDisconnectedQueriers(now.Add(90 * time.Second)) + // We expect the querier-1 tenants have not been shuffled to other queriers. + for _, tenantID := range querier1Tenants { + assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) + } - // Querier-1 reconnects. - qb.addQuerierConnection("querier-1") - qb.addQuerierConnection("querier-1") + // Try to forget disconnected queriers, but querier-1 forget delay hasn't passed yet. + qb.forgetDisconnectedQueriers(now.Add(90 * time.Second)) - assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) - assert.NoError(t, isConsistent(qb)) + // Querier-1 reconnects. + qb.addQuerierConnection("querier-1") + qb.addQuerierConnection("querier-1") - // We expect the querier-1 tenants have not been shuffled to other queriers. - for _, tenantID := range querier1Tenants { - assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) - } + assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) + assert.NoError(t, isConsistent(qb)) + + // We expect the querier-1 tenants have not been shuffled to other queriers. + for _, tenantID := range querier1Tenants { + assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) + } + + // Try to forget disconnected queriers far in the future, but there's no disconnected querier. + qb.forgetDisconnectedQueriers(now.Add(200 * time.Second)) - // Try to forget disconnected queriers far in the future, but there's no disconnected querier. - qb.forgetDisconnectedQueriers(now.Add(200 * time.Second)) + assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) + assert.NoError(t, isConsistent(qb)) - assert.Contains(t, qb.tenantQuerierAssignments.queriersByID, QuerierID("querier-1")) - assert.NoError(t, isConsistent(qb)) + for _, tenantID := range querier1Tenants { + assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) + assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) + } - for _, tenantID := range querier1Tenants { - assert.Contains(t, getTenantsByQuerier(qb, "querier-1"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-2"), tenantID) - assert.NotContains(t, getTenantsByQuerier(qb, "querier-3"), tenantID) + }) } } @@ -466,57 +684,91 @@ func generateQuerier(r *rand.Rand) QuerierID { return QuerierID(fmt.Sprint("querier-", r.Int()%5)) } -func getOrAdd(t *testing.T, qb *queueBroker, tenantID TenantID, maxQueriers int) *list.List { - addedQueue, err := qb.getOrAddTenantQueue(tenantID, maxQueriers) - assert.Nil(t, err) - assert.NotNil(t, addedQueue) - assert.NoError(t, isConsistent(qb)) - reAddedQueue, err := qb.getOrAddTenantQueue(tenantID, maxQueriers) - assert.Nil(t, err) - assert.Equal(t, addedQueue, reAddedQueue) - return addedQueue.localQueue +// getTenantsByQuerier returns the list of tenants handled by the provided QuerierID. +func getTenantsByQuerier(broker *queueBroker, querierID QuerierID) []TenantID { + var tenantIDs []TenantID + for _, tenantID := range broker.tenantQuerierAssignments.tenantIDOrder { + querierSet := broker.tenantQuerierAssignments.tenantQuerierIDs[tenantID] + if querierSet == nil { + // If it's nil then all queriers can handle this tenant. + tenantIDs = append(tenantIDs, tenantID) + continue + } + if _, ok := querierSet[querierID]; ok { + tenantIDs = append(tenantIDs, tenantID) + } + } + return tenantIDs } // getOrAddTenantQueue is a test utility, not intended for use by consumers of queueBroker -func (qb *queueBroker) getOrAddTenantQueue(tenantID TenantID, maxQueriers int) (*TreeQueue, error) { +func (qb *queueBroker) getOrAddTenantQueue(tenantID TenantID, maxQueriers int) error { err := qb.tenantQuerierAssignments.createOrUpdateTenant(tenantID, maxQueriers) if err != nil { - return nil, err + return err } - queuePath := QueuePath{string(tenantID)} - return qb.tenantQueuesTree.getOrAddNode(queuePath) -} - -// getQueue is a test utility, not intended for use by consumers of queueBroker -func (qb *queueBroker) getQueue(tenantID TenantID) *TreeQueue { - tenant, err := qb.tenantQuerierAssignments.getTenant(tenantID) - if tenant == nil || err != nil { - return nil + // TODO (casie): When deprecating legacy tree queue, clean this up + queuePath := qb.makeQueuePathForTests(tenantID) + if tq, ok := qb.tree.(*TreeQueue); ok { + _, err = tq.getOrAddNode(queuePath) + } else if itq, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + _, err = itq.rootNode.getOrAddNode(queuePath, itq) } - - queuePath := QueuePath{string(tenantID)} - tenantQueue := qb.tenantQueuesTree.getNode(queuePath) - return tenantQueue + return err } // removeTenantQueue is a test utility, not intended for use by consumers of queueBroker func (qb *queueBroker) removeTenantQueue(tenantID TenantID) bool { qb.tenantQuerierAssignments.removeTenant(tenantID) queuePath := QueuePath{string(tenantID)} - return qb.tenantQueuesTree.deleteNode(queuePath) + + // TODO (casie): When deprecating legacy tree queue, clean this up + if tq, ok := qb.tree.(*TreeQueue); ok { + // Normally, emptying out a tenant queue would handle removal from tenantIDOrder, but + // in tests, we sometimes remove a tenant with an existing queue outright, in which case + // we need to update tenantIDOrder and tenantNodes manually. + for i, t := range qb.tenantQuerierAssignments.tenantIDOrder { + if tenantID == t { + qb.tenantQuerierAssignments.tenantIDOrder[i] = emptyTenantID + } + } + return tq.deleteNode(queuePath) + + } else if itq, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + return itq.rootNode.deleteNode(queuePath) + } + + return false } -func confirmOrderForQuerier(t *testing.T, qb *queueBroker, querier QuerierID, lastTenantIndex int, queues ...*list.List) int { - for _, queue := range queues { - var err error - tenant, _, err := qb.tenantQuerierAssignments.getNextTenantForQuerier(lastTenantIndex, querier) - tenantQueue := qb.getQueue(tenant.tenantID) - assert.Equal(t, queue, tenantQueue.localQueue) - assert.NoError(t, isConsistent(qb)) - assert.NoError(t, err) +// deleteNode is a test utility, not intended for use by consumers of Node +func (n *Node) deleteNode(pathFromNode QueuePath) bool { + if len(pathFromNode) == 0 { + // node cannot delete itself + return false } - return lastTenantIndex + + // parentPath is everything except the last node, childNode is the last in path + parentPath, deleteNodeName := pathFromNode[:len(pathFromNode)-1], pathFromNode[len(pathFromNode)-1] + + parentNode := n.getNode(parentPath) + if parentNode == nil { + // not found + return false + } + if deleteNode := parentNode.getNode(QueuePath{deleteNodeName}); deleteNode != nil { + // empty the node so that update state properly deletes it + deleteNode.queueMap = map[string]*Node{} + deleteNode.localQueue = list.New() + parentNode.queuingAlgorithm.dequeueUpdateState(parentNode, deleteNode) + return true + } + return false +} + +func (qb *queueBroker) makeQueuePathForTests(tenantID TenantID) QueuePath { + return QueuePath{string(tenantID)} } func isConsistent(qb *queueBroker) error { @@ -525,17 +777,37 @@ func isConsistent(qb *queueBroker) error { } tenantCount := 0 + for ix, tenantID := range qb.tenantQuerierAssignments.tenantIDOrder { - if tenantID != "" && qb.getQueue(tenantID) == nil { - return fmt.Errorf("tenant %s doesn't have queue", tenantID) - } - if tenantID == "" && qb.getQueue(tenantID) != nil { - return fmt.Errorf("tenant %s shouldn't have queue", tenantID) + path := qb.makeQueuePathForTests(tenantID) + + // TODO (casie): After deprecating legacy tree queue, clean this up + if tq, ok := qb.tree.(*TreeQueue); ok { + node := tq.getNode(path) + if tenantID != "" && node == nil { + return fmt.Errorf("tenant %s doesn't have queue in legacy tree", tenantID) + } + if tenantID == "" && node != nil { + return fmt.Errorf("tenant %s shouldn't have queue in legacy tree", tenantID) + } + } else if itq, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + node := itq.rootNode.getNode(path) + if tenantID != "" && node == nil { + return fmt.Errorf("tenant %s doesn't have queue", tenantID) + } + + if tenantID == "" && node != nil { + return fmt.Errorf("tenant %s shouldn't have queue", tenantID) + } } + if tenantID == "" { continue } - tenantCount++ + + if tenantID != "" { + tenantCount++ + } tenant := qb.tenantQuerierAssignments.tenantsByID[tenantID] querierSet := qb.tenantQuerierAssignments.tenantQuerierIDs[tenantID] @@ -557,27 +829,24 @@ func isConsistent(qb *queueBroker) error { } } - tenantQueueCount := qb.tenantQueuesTree.NodeCount() - 1 // exclude root node - if tenantCount != tenantQueueCount { + var tenantQueueCount int + // TODO (casie): After deprecating legacy tree queue, clean this up + if tq, ok := qb.tree.(*TreeQueue); ok { + tenantQueueCount = tq.NodeCount() - 1 + } else if itq, ok := qb.tree.(*MultiQueuingAlgorithmTreeQueue); ok { + tenantQueueCount = itq.rootNode.nodeCount() - 1 + } + if tenantQueueCount != tenantCount { return fmt.Errorf("inconsistent number of tenants list and tenant queues") } return nil } -// getTenantsByQuerier returns the list of tenants handled by the provided QuerierID. -func getTenantsByQuerier(broker *queueBroker, querierID QuerierID) []TenantID { - var tenantIDs []TenantID - for _, tenantID := range broker.tenantQuerierAssignments.tenantIDOrder { - querierSet := broker.tenantQuerierAssignments.tenantQuerierIDs[tenantID] - if querierSet == nil { - // If it's nil then all queriers can handle this tenant. - tenantIDs = append(tenantIDs, tenantID) - continue - } - if _, ok := querierSet[querierID]; ok { - tenantIDs = append(tenantIDs, tenantID) - } +func (n *Node) nodeCount() int { + count := 1 + for _, child := range n.queueMap { + count += child.nodeCount() } - return tenantIDs + return count } diff --git a/pkg/scheduler/queue/tree_queue.go b/pkg/scheduler/queue/tree_queue.go index 1cf136bdc08..994dabedf78 100644 --- a/pkg/scheduler/queue/tree_queue.go +++ b/pkg/scheduler/queue/tree_queue.go @@ -6,11 +6,6 @@ import ( "container/list" ) -type QueuePath []string //nolint:revive // disallows types beginning with package name -type QueueIndex int //nolint:revive // disallows types beginning with package name - -const localQueueIndex = -1 - // TreeQueue is a hierarchical queue implementation with an arbitrary amount of child queues. // // TreeQueue internally maintains round-robin fair queuing across all of its queue dimensions. @@ -201,7 +196,7 @@ func (q *TreeQueue) DequeueByPath(childPath QueuePath) any { return nil } - v := childQueue.Dequeue() + _, v := childQueue.Dequeue() if childQueue.IsEmpty() { // child node will recursively clean up its own empty children during dequeue, @@ -221,9 +216,11 @@ func (q *TreeQueue) DequeueByPath(childPath QueuePath) any { // // Nodes that empty down to the leaf after being dequeued from are deleted as the recursion returns // up the stack. This maintains structural guarantees relied on to make IsEmpty() non-recursive. -func (q *TreeQueue) Dequeue() any { +func (q *TreeQueue) Dequeue() (QueuePath, any) { + var childPath QueuePath var v any initialLen := len(q.childQueueOrder) + path := QueuePath{q.name} for iters := 0; iters <= initialLen && v == nil; iters++ { if q.currentChildQueueIndex == localQueueIndex { @@ -242,7 +239,7 @@ func (q *TreeQueue) Dequeue() any { // pick the child node whose turn it is and recur childQueueName := q.childQueueOrder[q.currentChildQueueIndex] childQueue := q.childQueueMap[childQueueName] - v = childQueue.Dequeue() + childPath, v = childQueue.Dequeue() // perform cleanup if child node is empty after dequeuing recursively if childQueue.IsEmpty() { @@ -253,7 +250,7 @@ func (q *TreeQueue) Dequeue() any { } } } - return v + return append(path, childPath...), v } // deleteNode removes a child node from the tree and the childQueueOrder and corrects the indices. diff --git a/pkg/scheduler/queue/tree_queue_test.go b/pkg/scheduler/queue/tree_queue_test.go index aa0859ca46b..0d333c39a62 100644 --- a/pkg/scheduler/queue/tree_queue_test.go +++ b/pkg/scheduler/queue/tree_queue_test.go @@ -10,6 +10,9 @@ import ( "github.com/stretchr/testify/require" ) +// TODO (casie): When deprecating TreeQueue, it's safe to delete this file wholesale; all relevant test +// functionality has been duplicated for MultiQueuingAlgorithmTreeQueue. + // TestDequeueBalancedTree checks dequeuing behavior from a balanced tree. // // Dequeuing from a balanced tree allows the test to have a simple looped structures @@ -28,7 +31,7 @@ func TestDequeueBalancedTree(t *testing.T) { dequeuedPathCache := make([]QueuePath, rotationsBeforeRepeat) for !root.IsEmpty() { - v := root.Dequeue() + _, v := root.Dequeue() dequeuedPath := getChildPathFromQueueItem(v) // require dequeued path has not repeated before the expected number of rotations @@ -96,7 +99,7 @@ func TestDequeueUnbalancedTree(t *testing.T) { root := makeUnbalancedTreeQueue(t) // dequeue from root until exhausted - v := root.Dequeue() + _, v := root.Dequeue() require.Equal(t, "root:0:val0", v) // root:0 and any subtrees are exhausted @@ -106,25 +109,25 @@ func TestDequeueUnbalancedTree(t *testing.T) { // root:0 was deleted require.Nil(t, root.getNode(path)) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:1:val0", v) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:2:0:val0", v) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:1:0:val0", v) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:2:1:val0", v) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:1:val1", v) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:2:0:val1", v) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:1:0:val1", v) // root:1 and any subtrees are exhausted @@ -134,10 +137,10 @@ func TestDequeueUnbalancedTree(t *testing.T) { // root:1 was deleted require.Nil(t, root.getNode(path)) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:2:1:val1", v) - v = root.Dequeue() + _, v = root.Dequeue() require.Equal(t, "root:2:1:val2", v) // root:2 and any subtrees are exhausted @@ -264,19 +267,19 @@ func TestEnqueueDuringDequeueRespectsRoundRobin(t *testing.T) { require.Equal(t, []string{"0", "1", "2"}, root.childQueueOrder) // dequeue first item - v := root.Dequeue() + _, v := root.Dequeue() dequeuedPath := getChildPathFromQueueItem(v) require.Equal(t, QueuePath{"0"}, dequeuedPath) // dequeue second item; root:1 is now exhausted and deleted - v = root.Dequeue() + _, v = root.Dequeue() dequeuedPath = getChildPathFromQueueItem(v) require.Equal(t, QueuePath{"1"}, dequeuedPath) require.Nil(t, root.getNode(QueuePath{"1"})) require.Equal(t, []string{"0", "2"}, root.childQueueOrder) // dequeue third item - v = root.Dequeue() + _, v = root.Dequeue() dequeuedPath = getChildPathFromQueueItem(v) require.Equal(t, QueuePath{"2"}, dequeuedPath) @@ -290,18 +293,18 @@ func TestEnqueueDuringDequeueRespectsRoundRobin(t *testing.T) { // dequeue fourth item; the newly-enqueued root:1 item // has not jumped the line in front of root:0 - v = root.Dequeue() + _, v = root.Dequeue() dequeuedPath = getChildPathFromQueueItem(v) require.Equal(t, QueuePath{"0"}, dequeuedPath) // dequeue fifth item; the newly-enqueued root:1 item // has not jumped the line in front of root:2 - v = root.Dequeue() + _, v = root.Dequeue() dequeuedPath = getChildPathFromQueueItem(v) require.Equal(t, QueuePath{"2"}, dequeuedPath) // dequeue sixth item; verifying the order 0->2->1 is being followed - v = root.Dequeue() + _, v = root.Dequeue() dequeuedPath = getChildPathFromQueueItem(v) require.Equal(t, QueuePath{"1"}, dequeuedPath) diff --git a/pkg/scheduler/queue/tree_queueing_algorithms.go b/pkg/scheduler/queue/tree_queueing_algorithms.go new file mode 100644 index 00000000000..2c52773d5d2 --- /dev/null +++ b/pkg/scheduler/queue/tree_queueing_algorithms.go @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package queue + +// QueuingAlgorithm represents the set of operations specific to different approaches to queuing/dequeuing. It is +// applied at the layer-level -- every Node at the same depth in a MultiQueuingAlgorithmTreeQueue shares the same QueuingAlgorithm, +// including any state in structs that implement QueuingAlgorithm. +type QueuingAlgorithm interface { + // addChildNode updates a parent's queueMap and other child-tracking structures with a new child Node. QueuePaths + // passed to enqueue functions are allowed to contain node names for nodes which do not yet exist; in those cases, + // we create the missing nodes. Implementers are responsible for adding the newly-created node to whatever state(s) + // keep track of nodes in the subtree, as well as updating any positional pointers that have been shifted by the + // addition of a new node. + addChildNode(parent, child *Node) + + // dequeueSelectNode uses information in the given node and/or the QueuingAlgorithm state to select and return + // the node which an element will be dequeued from next, as well as a bool representing whether the passed node + // and all its children have been checked for dequeue-ability. The returned node may be one of the following: + // - the given node: this triggers the base case for the recursive Node.dequeue, and results in dequeuing from + // the node's local queue. + // - a direct child of the given node: this continues the dequeue recursion. + // The Tree uses the bool returned to determine when no dequeue-able value can be found at this child. If all + // children in the subtree have been checked, but no dequeue-able value is found, the Tree traverses to the next + // child in the given node's order. dequeueSelectNode does *not* update any Node fields. + dequeueSelectNode(node *Node) (*Node, bool) + + // dequeueUpdateState is called after we have finished dequeuing from a node. When a child is left empty after the + // dequeue operation, dequeueUpdateState should perform cleanup by deleting that child from the Node and update + // any other necessary state to reflect the child's removal. It should also update the relevant position to point + // to the next node to dequeue from. + dequeueUpdateState(node *Node, dequeuedFrom *Node) +} + +// roundRobinState is the simplest type of QueuingAlgorithm; nodes which use this QueuingAlgorithm and are at +// the same depth in a MultiQueuingAlgorithmTreeQueue do not share any state. When children are added to these nodes, they are placed at +// the "end" of the order from the perspective of the node's current queuePosition (e.g., if queuePosition is 3, +// a new child will be placed at index 2). Children are dequeued from using a simple round-robin ordering; +// queuePosition is incremented on every dequeue. +type roundRobinState struct { +} + +// addChildNode adds a child Node to the "end" of the parent's queueOrder from the perspective +// of the parent's queuePosition. Thus, If queuePosition is localQueueIndex, the new child is added to the end of +// queueOrder. Otherwise, the new child is added at queuePosition - 1, and queuePosition is incremented to keep +// it pointed to the same child. +func (rrs *roundRobinState) addChildNode(parent, child *Node) { + // add childNode to n.queueMap + parent.queueMap[child.Name()] = child + + // add childNode to n.queueOrder before the next-to-dequeue position, update n.queuePosition to current element + if parent.queuePosition <= localQueueIndex { + parent.queueOrder = append(parent.queueOrder, child.Name()) + } else { + parent.queueOrder = append(parent.queueOrder[:parent.queuePosition], append([]string{child.Name()}, parent.queueOrder[parent.queuePosition:]...)...) + parent.queuePosition++ // keep position pointed at same element + } +} + +// dequeueSelectNode returns the node at the node's queuePosition. queuePosition represents the position of +// the next node to dequeue from, and is incremented in dequeueUpdateState. +func (rrs *roundRobinState) dequeueSelectNode(node *Node) (*Node, bool) { + checkedAllNodes := node.childrenChecked == len(node.queueOrder)+1 // must check local queue as well + if node.queuePosition == localQueueIndex { + return node, checkedAllNodes + } + + currentNodeName := node.queueOrder[node.queuePosition] + if node, ok := node.queueMap[currentNodeName]; ok { + return node, checkedAllNodes + } + return nil, checkedAllNodes +} + +// dequeueUpdateState does the following: +// - deletes the dequeued-from child node if it is empty after the dequeue operation +// - increments queuePosition if no child was deleted +func (rrs *roundRobinState) dequeueUpdateState(node *Node, dequeuedFrom *Node) { + // if the child node is nil, we haven't done anything to the tree; return early + if dequeuedFrom == nil { + return + } + + // corner case: if node == dequeuedFrom and is empty, we will try to delete a "child" (no-op), + // and won't increment position. This is fine, because if the node is empty, there's nothing to + // increment to. + childIsEmpty := dequeuedFrom.IsEmpty() + + // if the child is empty, we should delete it, but not increment queue position, since removing an element + // from queueOrder sets our position to the next element already. + if childIsEmpty { + childName := dequeuedFrom.Name() + delete(node.queueMap, childName) + for idx, name := range node.queueOrder { + if name == childName { + node.queueOrder = append(node.queueOrder[:idx], node.queueOrder[idx+1:]...) + } + } + + } else { + node.queuePosition++ + } + if node.queuePosition >= len(node.queueOrder) { + node.queuePosition = localQueueIndex + } +} diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index a8511fd432c..70fce9050df 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -95,6 +95,7 @@ type connectedFrontend struct { type Config struct { MaxOutstandingPerTenant int `yaml:"max_outstanding_requests_per_tenant"` AdditionalQueryQueueDimensionsEnabled bool `yaml:"additional_query_queue_dimensions_enabled" category:"experimental"` + UseMultiAlgorithmQueryQueue bool `yaml:"use_multi_algorithm_query_queue" category:"experimental"` QuerierForgetDelay time.Duration `yaml:"querier_forget_delay" category:"experimental"` GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=This configures the gRPC client used to report errors back to the query-frontend."` @@ -104,6 +105,7 @@ type Config struct { func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.IntVar(&cfg.MaxOutstandingPerTenant, "query-scheduler.max-outstanding-requests-per-tenant", 100, "Maximum number of outstanding requests per tenant per query-scheduler. In-flight requests above this limit will fail with HTTP response status code 429.") f.BoolVar(&cfg.AdditionalQueryQueueDimensionsEnabled, "query-scheduler.additional-query-queue-dimensions-enabled", false, "Enqueue query requests with additional queue dimensions to split tenant request queues into subqueues. This enables separate requests to proceed from a tenant's subqueues even when other subqueues are blocked on slow query requests. Must be set on both query-frontend and scheduler to take effect. (default false)") + f.BoolVar(&cfg.UseMultiAlgorithmQueryQueue, "query-scheduler.use-multi-algorithm-query-queue", false, "Use an experimental version of the query queue which has the same behavior as the existing queue, but integrates tenant selection into the tree model.") f.DurationVar(&cfg.QuerierForgetDelay, "query-scheduler.querier-forget-delay", 0, "If a querier disconnects without sending notification about graceful shutdown, the query-scheduler will keep the querier in the tenant's shard until the forget delay has passed. This feature is useful to reduce the blast radius when shuffle-sharding is enabled.") cfg.GRPCClientConfig.RegisterFlagsWithPrefix("query-scheduler.grpc-client-config", f) @@ -160,6 +162,7 @@ func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer promethe s.log, cfg.MaxOutstandingPerTenant, cfg.AdditionalQueryQueueDimensionsEnabled, + cfg.UseMultiAlgorithmQueryQueue, cfg.QuerierForgetDelay, s.queueLength, s.discardedRequests, @@ -441,7 +444,7 @@ func (s *Scheduler) QuerierLoop(querier schedulerpb.SchedulerForQuerier_QuerierL /* We want to dequeue the next unexpired request from the chosen tenant queue. - The chance of choosing a particular tenant for dequeueing is (1/active_tenants). + The chance of choosing a particular tenant for dequeuing is (1/active_tenants). This is problematic under load, especially with other middleware enabled such as querier.split-by-interval, where one request may fan out into many. If expired requests aren't exhausted before checking another tenant, it would take