From 7d501729c37124018a1e852a7732396f7d89bd7c Mon Sep 17 00:00:00 2001 From: Dylan Guedes Date: Thu, 13 Jan 2022 05:55:17 -0300 Subject: [PATCH] Add missing forked gateway tests (#5118) * Add missing tests for `storegateway`. * Fork cortex tsdb testutil package. * Fix import lint. --- pkg/storage/tsdb/testutil/block_mock.go | 68 + pkg/storage/tsdb/testutil/objstore.go | 26 + .../bucket_index_metadata_fetcher_test.go | 323 +++++ .../bucket_store_inmemory_server.go | 66 + pkg/storegateway/bucket_store_metrics_test.go | 510 ++++++++ pkg/storegateway/bucket_stores_test.go | 616 +++++++++ pkg/storegateway/chunk_bytes_pool_test.go | 37 + pkg/storegateway/gateway_ring_test.go | 72 ++ pkg/storegateway/gateway_test.go | 1132 +++++++++++++++++ .../metadata_fetcher_filters_test.go | 107 ++ .../metadata_fetcher_metrics_test.go | 111 ++ pkg/storegateway/partitioner_test.go | 58 + pkg/storegateway/sharding_strategy_test.go | 670 ++++++++++ 13 files changed, 3796 insertions(+) create mode 100644 pkg/storage/tsdb/testutil/block_mock.go create mode 100644 pkg/storage/tsdb/testutil/objstore.go create mode 100644 pkg/storegateway/bucket_index_metadata_fetcher_test.go create mode 100644 pkg/storegateway/bucket_store_inmemory_server.go create mode 100644 pkg/storegateway/bucket_store_metrics_test.go create mode 100644 pkg/storegateway/bucket_stores_test.go create mode 100644 pkg/storegateway/chunk_bytes_pool_test.go create mode 100644 pkg/storegateway/gateway_ring_test.go create mode 100644 pkg/storegateway/gateway_test.go create mode 100644 pkg/storegateway/metadata_fetcher_filters_test.go create mode 100644 pkg/storegateway/metadata_fetcher_metrics_test.go create mode 100644 pkg/storegateway/partitioner_test.go create mode 100644 pkg/storegateway/sharding_strategy_test.go diff --git a/pkg/storage/tsdb/testutil/block_mock.go b/pkg/storage/tsdb/testutil/block_mock.go new file mode 100644 index 0000000000000..a2a931aa59cd7 --- /dev/null +++ b/pkg/storage/tsdb/testutil/block_mock.go @@ 
-0,0 +1,68 @@ +package testutil + +import ( + "context" + "crypto/rand" + "encoding/json" + "fmt" + "strings" + "testing" + "time" + + "github.com/oklog/ulid" + "github.com/prometheus/prometheus/tsdb" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/objstore" +) + +func MockStorageBlock(t testing.TB, bucket objstore.Bucket, userID string, minT, maxT int64) tsdb.BlockMeta { + // Generate a block ID whose timestamp matches the maxT (for simplicity we assume it + // has been compacted and shipped in zero time, even if not realistic). + id := ulid.MustNew(uint64(maxT), rand.Reader) + + meta := tsdb.BlockMeta{ + Version: 1, + ULID: id, + MinTime: minT, + MaxTime: maxT, + Compaction: tsdb.BlockMetaCompaction{ + Level: 1, + Sources: []ulid.ULID{id}, + }, + } + + metaContent, err := json.Marshal(meta) + if err != nil { + panic("failed to marshal mocked block meta") + } + + metaContentReader := strings.NewReader(string(metaContent)) + metaPath := fmt.Sprintf("%s/%s/meta.json", userID, id.String()) + require.NoError(t, bucket.Upload(context.Background(), metaPath, metaContentReader)) + + // Upload an empty index, just to make sure the meta.json is not the only object in the block location. 
+ indexPath := fmt.Sprintf("%s/%s/index", userID, id.String()) + require.NoError(t, bucket.Upload(context.Background(), indexPath, strings.NewReader(""))) + + return meta +} + +func MockStorageDeletionMark(t testing.TB, bucket objstore.Bucket, userID string, meta tsdb.BlockMeta) *metadata.DeletionMark { + mark := metadata.DeletionMark{ + ID: meta.ULID, + DeletionTime: time.Now().Add(-time.Minute).Unix(), + Version: metadata.DeletionMarkVersion1, + } + + markContent, err := json.Marshal(mark) + if err != nil { + panic("failed to marshal mocked block meta") + } + + markContentReader := strings.NewReader(string(markContent)) + markPath := fmt.Sprintf("%s/%s/%s", userID, meta.ULID.String(), metadata.DeletionMarkFilename) + require.NoError(t, bucket.Upload(context.Background(), markPath, markContentReader)) + + return &mark +} diff --git a/pkg/storage/tsdb/testutil/objstore.go b/pkg/storage/tsdb/testutil/objstore.go new file mode 100644 index 0000000000000..94d12aed64782 --- /dev/null +++ b/pkg/storage/tsdb/testutil/objstore.go @@ -0,0 +1,26 @@ +package testutil + +import ( + "io/ioutil" + "os" + "testing" + + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/objstore" + + "github.com/cortexproject/cortex/pkg/storage/bucket/filesystem" +) + +func PrepareFilesystemBucket(t testing.TB) (objstore.Bucket, string) { + storageDir, err := ioutil.TempDir(os.TempDir(), "bucket") + require.NoError(t, err) + + t.Cleanup(func() { + require.NoError(t, os.RemoveAll(storageDir)) + }) + + bkt, err := filesystem.NewBucketClient(filesystem.Config{Directory: storageDir}) + require.NoError(t, err) + + return objstore.BucketWithMetrics("test", bkt, nil), storageDir +} diff --git a/pkg/storegateway/bucket_index_metadata_fetcher_test.go b/pkg/storegateway/bucket_index_metadata_fetcher_test.go new file mode 100644 index 0000000000000..617cfb343dd49 --- /dev/null +++ b/pkg/storegateway/bucket_index_metadata_fetcher_test.go @@ -0,0 +1,323 @@ +package storegateway + +import 
( + "bytes" + "context" + "path" + "strings" + "testing" + "time" + + "github.com/cortexproject/cortex/pkg/storage/bucket" + "github.com/cortexproject/cortex/pkg/storage/tsdb/bucketindex" + "github.com/go-kit/log" + "github.com/grafana/dskit/concurrency" + "github.com/oklog/ulid" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/block" + "github.com/thanos-io/thanos/pkg/block/metadata" + + tsdb_testutil "github.com/grafana/loki/pkg/storage/tsdb/testutil" +) + +func TestBucketIndexMetadataFetcher_Fetch(t *testing.T) { + const userID = "user-1" + + bkt, _ := tsdb_testutil.PrepareFilesystemBucket(t) + reg := prometheus.NewPedanticRegistry() + ctx := context.Background() + now := time.Now() + logs := &concurrency.SyncBuffer{} + logger := log.NewLogfmtLogger(logs) + + // Create a bucket index. + block1 := &bucketindex.Block{ID: ulid.MustNew(1, nil)} + block2 := &bucketindex.Block{ID: ulid.MustNew(2, nil)} + block3 := &bucketindex.Block{ID: ulid.MustNew(3, nil)} + mark1 := &bucketindex.BlockDeletionMark{ID: block1.ID, DeletionTime: now.Add(-time.Hour).Unix()} // Below the ignore delay threshold. + mark2 := &bucketindex.BlockDeletionMark{ID: block2.ID, DeletionTime: now.Add(-3 * time.Hour).Unix()} // Above the ignore delay threshold. + + require.NoError(t, bucketindex.WriteIndex(ctx, bkt, userID, nil, &bucketindex.Index{ + Version: bucketindex.IndexVersion1, + Blocks: bucketindex.Blocks{block1, block2, block3}, + BlockDeletionMarks: bucketindex.BlockDeletionMarks{mark1, mark2}, + UpdatedAt: now.Unix(), + })) + + // Create a metadata fetcher with filters. 
+ filters := []block.MetadataFilter{ + NewIgnoreDeletionMarkFilter(logger, bucket.NewUserBucketClient(userID, bkt, nil), 2*time.Hour, 1), + } + + fetcher := NewBucketIndexMetadataFetcher(userID, bkt, NewNoShardingStrategy(), nil, logger, reg, filters, nil) + metas, partials, err := fetcher.Fetch(ctx) + require.NoError(t, err) + assert.Equal(t, map[ulid.ULID]*metadata.Meta{ + block1.ID: block1.ThanosMeta(userID), + block3.ID: block3.ThanosMeta(userID), + }, metas) + assert.Empty(t, partials) + assert.Empty(t, logs) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP blocks_meta_modified Number of blocks whose metadata changed + # TYPE blocks_meta_modified gauge + blocks_meta_modified{modified="replica-label-removed"} 0 + + # HELP blocks_meta_sync_failures_total Total blocks metadata synchronization failures + # TYPE blocks_meta_sync_failures_total counter + blocks_meta_sync_failures_total 0 + + # HELP blocks_meta_synced Number of block metadata synced + # TYPE blocks_meta_synced gauge + blocks_meta_synced{state="corrupted-bucket-index"} 0 + blocks_meta_synced{state="corrupted-meta-json"} 0 + blocks_meta_synced{state="duplicate"} 0 + blocks_meta_synced{state="failed"} 0 + blocks_meta_synced{state="label-excluded"} 0 + blocks_meta_synced{state="loaded"} 2 + blocks_meta_synced{state="marked-for-deletion"} 1 + blocks_meta_synced{state="marked-for-no-compact"} 0 + blocks_meta_synced{state="no-bucket-index"} 0 + blocks_meta_synced{state="no-meta-json"} 0 + blocks_meta_synced{state="time-excluded"} 0 + blocks_meta_synced{state="too-fresh"} 0 + + # HELP blocks_meta_syncs_total Total blocks metadata synchronization attempts + # TYPE blocks_meta_syncs_total counter + blocks_meta_syncs_total 1 + `), + "blocks_meta_modified", + "blocks_meta_sync_failures_total", + "blocks_meta_synced", + "blocks_meta_syncs_total", + )) +} + +func TestBucketIndexMetadataFetcher_Fetch_NoBucketIndex(t *testing.T) { + const userID = "user-1" + + bkt, _ := 
tsdb_testutil.PrepareFilesystemBucket(t) + reg := prometheus.NewPedanticRegistry() + ctx := context.Background() + logs := &concurrency.SyncBuffer{} + logger := log.NewLogfmtLogger(logs) + + fetcher := NewBucketIndexMetadataFetcher(userID, bkt, NewNoShardingStrategy(), nil, logger, reg, nil, nil) + metas, partials, err := fetcher.Fetch(ctx) + require.NoError(t, err) + assert.Empty(t, metas) + assert.Empty(t, partials) + assert.Empty(t, logs) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP blocks_meta_modified Number of blocks whose metadata changed + # TYPE blocks_meta_modified gauge + blocks_meta_modified{modified="replica-label-removed"} 0 + + # HELP blocks_meta_sync_failures_total Total blocks metadata synchronization failures + # TYPE blocks_meta_sync_failures_total counter + blocks_meta_sync_failures_total 0 + + # HELP blocks_meta_synced Number of block metadata synced + # TYPE blocks_meta_synced gauge + blocks_meta_synced{state="corrupted-bucket-index"} 0 + blocks_meta_synced{state="corrupted-meta-json"} 0 + blocks_meta_synced{state="duplicate"} 0 + blocks_meta_synced{state="failed"} 0 + blocks_meta_synced{state="label-excluded"} 0 + blocks_meta_synced{state="loaded"} 0 + blocks_meta_synced{state="marked-for-deletion"} 0 + blocks_meta_synced{state="marked-for-no-compact"} 0 + blocks_meta_synced{state="no-bucket-index"} 1 + blocks_meta_synced{state="no-meta-json"} 0 + blocks_meta_synced{state="time-excluded"} 0 + blocks_meta_synced{state="too-fresh"} 0 + + # HELP blocks_meta_syncs_total Total blocks metadata synchronization attempts + # TYPE blocks_meta_syncs_total counter + blocks_meta_syncs_total 1 + `), + "blocks_meta_modified", + "blocks_meta_sync_failures_total", + "blocks_meta_synced", + "blocks_meta_syncs_total", + )) +} + +func TestBucketIndexMetadataFetcher_Fetch_CorruptedBucketIndex(t *testing.T) { + const userID = "user-1" + + bkt, _ := tsdb_testutil.PrepareFilesystemBucket(t) + reg := 
prometheus.NewPedanticRegistry() + ctx := context.Background() + logs := &concurrency.SyncBuffer{} + logger := log.NewLogfmtLogger(logs) + + // Upload a corrupted bucket index. + require.NoError(t, bkt.Upload(ctx, path.Join(userID, bucketindex.IndexCompressedFilename), strings.NewReader("invalid}!"))) + + fetcher := NewBucketIndexMetadataFetcher(userID, bkt, NewNoShardingStrategy(), nil, logger, reg, nil, nil) + metas, partials, err := fetcher.Fetch(ctx) + require.NoError(t, err) + assert.Empty(t, metas) + assert.Empty(t, partials) + assert.Regexp(t, "corrupted bucket index found", logs) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP blocks_meta_modified Number of blocks whose metadata changed + # TYPE blocks_meta_modified gauge + blocks_meta_modified{modified="replica-label-removed"} 0 + + # HELP blocks_meta_sync_failures_total Total blocks metadata synchronization failures + # TYPE blocks_meta_sync_failures_total counter + blocks_meta_sync_failures_total 0 + + # HELP blocks_meta_synced Number of block metadata synced + # TYPE blocks_meta_synced gauge + blocks_meta_synced{state="corrupted-bucket-index"} 1 + blocks_meta_synced{state="corrupted-meta-json"} 0 + blocks_meta_synced{state="duplicate"} 0 + blocks_meta_synced{state="failed"} 0 + blocks_meta_synced{state="label-excluded"} 0 + blocks_meta_synced{state="loaded"} 0 + blocks_meta_synced{state="marked-for-deletion"} 0 + blocks_meta_synced{state="marked-for-no-compact"} 0 + blocks_meta_synced{state="no-bucket-index"} 0 + blocks_meta_synced{state="no-meta-json"} 0 + blocks_meta_synced{state="time-excluded"} 0 + blocks_meta_synced{state="too-fresh"} 0 + + # HELP blocks_meta_syncs_total Total blocks metadata synchronization attempts + # TYPE blocks_meta_syncs_total counter + blocks_meta_syncs_total 1 + `), + "blocks_meta_modified", + "blocks_meta_sync_failures_total", + "blocks_meta_synced", + "blocks_meta_syncs_total", + )) +} + +func 
TestBucketIndexMetadataFetcher_Fetch_ShouldResetGaugeMetrics(t *testing.T) { + const userID = "user-1" + + bkt, _ := tsdb_testutil.PrepareFilesystemBucket(t) + reg := prometheus.NewPedanticRegistry() + ctx := context.Background() + now := time.Now() + logger := log.NewNopLogger() + strategy := &mockShardingStrategy{} + strategy.On("FilterUsers", mock.Anything, mock.Anything).Return([]string{userID}) + + // Corrupted bucket index. + require.NoError(t, bkt.Upload(ctx, path.Join(userID, bucketindex.IndexCompressedFilename), strings.NewReader("invalid}!"))) + + fetcher := NewBucketIndexMetadataFetcher(userID, bkt, strategy, nil, logger, reg, nil, nil) + metas, _, err := fetcher.Fetch(ctx) + require.NoError(t, err) + assert.Len(t, metas, 0) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP blocks_meta_synced Number of block metadata synced + # TYPE blocks_meta_synced gauge + blocks_meta_synced{state="corrupted-bucket-index"} 1 + blocks_meta_synced{state="corrupted-meta-json"} 0 + blocks_meta_synced{state="duplicate"} 0 + blocks_meta_synced{state="failed"} 0 + blocks_meta_synced{state="label-excluded"} 0 + blocks_meta_synced{state="loaded"} 0 + blocks_meta_synced{state="marked-for-deletion"} 0 + blocks_meta_synced{state="marked-for-no-compact"} 0 + blocks_meta_synced{state="no-bucket-index"} 0 + blocks_meta_synced{state="no-meta-json"} 0 + blocks_meta_synced{state="time-excluded"} 0 + blocks_meta_synced{state="too-fresh"} 0 + `), "blocks_meta_synced")) + + // No bucket index. 
+ require.NoError(t, bucketindex.DeleteIndex(ctx, bkt, userID, nil)) + + metas, _, err = fetcher.Fetch(ctx) + require.NoError(t, err) + assert.Len(t, metas, 0) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP blocks_meta_synced Number of block metadata synced + # TYPE blocks_meta_synced gauge + blocks_meta_synced{state="corrupted-bucket-index"} 0 + blocks_meta_synced{state="corrupted-meta-json"} 0 + blocks_meta_synced{state="duplicate"} 0 + blocks_meta_synced{state="failed"} 0 + blocks_meta_synced{state="label-excluded"} 0 + blocks_meta_synced{state="loaded"} 0 + blocks_meta_synced{state="marked-for-deletion"} 0 + blocks_meta_synced{state="marked-for-no-compact"} 0 + blocks_meta_synced{state="no-bucket-index"} 1 + blocks_meta_synced{state="no-meta-json"} 0 + blocks_meta_synced{state="time-excluded"} 0 + blocks_meta_synced{state="too-fresh"} 0 + `), "blocks_meta_synced")) + + // Create a bucket index. + block1 := &bucketindex.Block{ID: ulid.MustNew(1, nil)} + block2 := &bucketindex.Block{ID: ulid.MustNew(2, nil)} + block3 := &bucketindex.Block{ID: ulid.MustNew(3, nil)} + + require.NoError(t, bucketindex.WriteIndex(ctx, bkt, userID, nil, &bucketindex.Index{ + Version: bucketindex.IndexVersion1, + Blocks: bucketindex.Blocks{block1, block2, block3}, + UpdatedAt: now.Unix(), + })) + + metas, _, err = fetcher.Fetch(ctx) + require.NoError(t, err) + assert.Len(t, metas, 3) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP blocks_meta_synced Number of block metadata synced + # TYPE blocks_meta_synced gauge + blocks_meta_synced{state="corrupted-bucket-index"} 0 + blocks_meta_synced{state="corrupted-meta-json"} 0 + blocks_meta_synced{state="duplicate"} 0 + blocks_meta_synced{state="failed"} 0 + blocks_meta_synced{state="label-excluded"} 0 + blocks_meta_synced{state="loaded"} 3 + blocks_meta_synced{state="marked-for-deletion"} 0 + blocks_meta_synced{state="marked-for-no-compact"} 0 + 
blocks_meta_synced{state="no-bucket-index"} 0 + blocks_meta_synced{state="no-meta-json"} 0 + blocks_meta_synced{state="time-excluded"} 0 + blocks_meta_synced{state="too-fresh"} 0 + `), "blocks_meta_synced")) + + // Remove the tenant from the shard. + strategy = &mockShardingStrategy{} + strategy.On("FilterUsers", mock.Anything, mock.Anything).Return([]string{}) + fetcher.strategy = strategy + + metas, _, err = fetcher.Fetch(ctx) + require.NoError(t, err) + assert.Len(t, metas, 0) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP blocks_meta_synced Number of block metadata synced + # TYPE blocks_meta_synced gauge + blocks_meta_synced{state="corrupted-bucket-index"} 0 + blocks_meta_synced{state="corrupted-meta-json"} 0 + blocks_meta_synced{state="duplicate"} 0 + blocks_meta_synced{state="failed"} 0 + blocks_meta_synced{state="label-excluded"} 0 + blocks_meta_synced{state="loaded"} 0 + blocks_meta_synced{state="marked-for-deletion"} 0 + blocks_meta_synced{state="marked-for-no-compact"} 0 + blocks_meta_synced{state="no-bucket-index"} 0 + blocks_meta_synced{state="no-meta-json"} 0 + blocks_meta_synced{state="time-excluded"} 0 + blocks_meta_synced{state="too-fresh"} 0 + `), "blocks_meta_synced")) +} diff --git a/pkg/storegateway/bucket_store_inmemory_server.go b/pkg/storegateway/bucket_store_inmemory_server.go new file mode 100644 index 0000000000000..ff02afb44a725 --- /dev/null +++ b/pkg/storegateway/bucket_store_inmemory_server.go @@ -0,0 +1,66 @@ +package storegateway + +import ( + "context" + + "github.com/gogo/protobuf/types" + "github.com/pkg/errors" + "github.com/prometheus/prometheus/storage" + "github.com/thanos-io/thanos/pkg/store/hintspb" + "github.com/thanos-io/thanos/pkg/store/storepb" +) + +// bucketStoreSeriesServer is an fake in-memory gRPC server used to +// call Thanos BucketStore.Series() without having to go through the +// gRPC networking stack. 
+type bucketStoreSeriesServer struct { + // This field just exist to pseudo-implement the unused methods of the interface. + storepb.Store_SeriesServer + + ctx context.Context + + SeriesSet []*storepb.Series + Warnings storage.Warnings + Hints hintspb.SeriesResponseHints +} + +func newBucketStoreSeriesServer(ctx context.Context) *bucketStoreSeriesServer { + return &bucketStoreSeriesServer{ctx: ctx} +} + +func (s *bucketStoreSeriesServer) Send(r *storepb.SeriesResponse) error { + if r.GetWarning() != "" { + s.Warnings = append(s.Warnings, errors.New(r.GetWarning())) + } + + if rawHints := r.GetHints(); rawHints != nil { + // We expect only 1 hints entry so we just keep 1. + if err := types.UnmarshalAny(rawHints, &s.Hints); err != nil { + return errors.Wrap(err, "failed to unmarshal series hints") + } + } + + if recvSeries := r.GetSeries(); recvSeries != nil { + // Thanos uses a pool for the chunks and may use other pools in the future. + // Given we need to retain the reference after the pooled slices are recycled, + // we need to do a copy here. We prefer to stay on the safest side at this stage + // so we do a marshal+unmarshal to copy the whole series. 
+ recvSeriesData, err := recvSeries.Marshal() + if err != nil { + return errors.Wrap(err, "marshal received series") + } + + copiedSeries := &storepb.Series{} + if err = copiedSeries.Unmarshal(recvSeriesData); err != nil { + return errors.Wrap(err, "unmarshal received series") + } + + s.SeriesSet = append(s.SeriesSet, copiedSeries) + } + + return nil +} + +func (s *bucketStoreSeriesServer) Context() context.Context { + return s.ctx +} diff --git a/pkg/storegateway/bucket_store_metrics_test.go b/pkg/storegateway/bucket_store_metrics_test.go new file mode 100644 index 0000000000000..2990c20a5c6a3 --- /dev/null +++ b/pkg/storegateway/bucket_store_metrics_test.go @@ -0,0 +1,510 @@ +package storegateway + +import ( + "bytes" + "fmt" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/require" +) + +func TestBucketStoreMetrics(t *testing.T) { + mainReg := prometheus.NewPedanticRegistry() + + tsdbMetrics := NewBucketStoreMetrics() + mainReg.MustRegister(tsdbMetrics) + + tsdbMetrics.AddUserRegistry("user1", populateMockedBucketStoreMetrics(5328)) + tsdbMetrics.AddUserRegistry("user2", populateMockedBucketStoreMetrics(6908)) + tsdbMetrics.AddUserRegistry("user3", populateMockedBucketStoreMetrics(10283)) + + //noinspection ALL + err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` + # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. + # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 22519 + + # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. + # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 45038 + + # HELP cortex_bucket_store_block_load_failures_total Total number of failed remote block loading attempts. 
+ # TYPE cortex_bucket_store_block_load_failures_total counter + cortex_bucket_store_block_load_failures_total 67557 + + # HELP cortex_bucket_store_block_drops_total Total number of local blocks that were dropped. + # TYPE cortex_bucket_store_block_drops_total counter + cortex_bucket_store_block_drops_total 90076 + + # HELP cortex_bucket_store_block_drop_failures_total Total number of local blocks that failed to be dropped. + # TYPE cortex_bucket_store_block_drop_failures_total counter + cortex_bucket_store_block_drop_failures_total 112595 + + # HELP cortex_bucket_store_series_blocks_queried Number of blocks in a bucket store that were touched to satisfy a query. + # TYPE cortex_bucket_store_series_blocks_queried summary + cortex_bucket_store_series_blocks_queried_sum 1.283583e+06 + cortex_bucket_store_series_blocks_queried_count 9 + + # HELP cortex_bucket_store_series_data_fetched How many items of a data type in a block were fetched for a single series request. + # TYPE cortex_bucket_store_series_data_fetched summary + cortex_bucket_store_series_data_fetched_sum{data_type="fetched-a"} 202671 + cortex_bucket_store_series_data_fetched_count{data_type="fetched-a"} 3 + cortex_bucket_store_series_data_fetched_sum{data_type="fetched-b"} 225190 + cortex_bucket_store_series_data_fetched_count{data_type="fetched-b"} 3 + cortex_bucket_store_series_data_fetched_sum{data_type="fetched-c"} 247709 + cortex_bucket_store_series_data_fetched_count{data_type="fetched-c"} 3 + + # HELP cortex_bucket_store_series_data_size_fetched_bytes Size of all items of a data type in a block were fetched for a single series request. 
+ # TYPE cortex_bucket_store_series_data_size_fetched_bytes summary + cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-a"} 337785 + cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-a"} 3 + cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-b"} 360304 + cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-b"} 3 + cortex_bucket_store_series_data_size_fetched_bytes_sum{data_type="size-fetched-c"} 382823 + cortex_bucket_store_series_data_size_fetched_bytes_count{data_type="size-fetched-c"} 3 + + # HELP cortex_bucket_store_series_data_size_touched_bytes Size of all items of a data type in a block were touched for a single series request. + # TYPE cortex_bucket_store_series_data_size_touched_bytes summary + cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-a"} 270228 + cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-a"} 3 + cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-b"} 292747 + cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-b"} 3 + cortex_bucket_store_series_data_size_touched_bytes_sum{data_type="size-touched-c"} 315266 + cortex_bucket_store_series_data_size_touched_bytes_count{data_type="size-touched-c"} 3 + + # HELP cortex_bucket_store_series_data_touched How many items of a data type in a block were touched for a single series request. 
+ # TYPE cortex_bucket_store_series_data_touched summary + cortex_bucket_store_series_data_touched_sum{data_type="touched-a"} 135114 + cortex_bucket_store_series_data_touched_count{data_type="touched-a"} 3 + cortex_bucket_store_series_data_touched_sum{data_type="touched-b"} 157633 + cortex_bucket_store_series_data_touched_count{data_type="touched-b"} 3 + cortex_bucket_store_series_data_touched_sum{data_type="touched-c"} 180152 + cortex_bucket_store_series_data_touched_count{data_type="touched-c"} 3 + + # HELP cortex_bucket_store_series_get_all_duration_seconds Time it takes until all per-block prepares and preloads for a query are finished. + # TYPE cortex_bucket_store_series_get_all_duration_seconds histogram + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.001"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.01"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.1"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.3"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.6"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="1"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="3"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="6"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="9"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="20"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="30"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="60"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="90"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="120"} 0 + cortex_bucket_store_series_get_all_duration_seconds_bucket{le="+Inf"} 9 + cortex_bucket_store_series_get_all_duration_seconds_sum 1.486254e+06 + cortex_bucket_store_series_get_all_duration_seconds_count 9 + + # HELP 
cortex_bucket_store_series_merge_duration_seconds Time it takes to merge sub-results from all queried blocks into a single result. + # TYPE cortex_bucket_store_series_merge_duration_seconds histogram + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.001"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.01"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.1"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.3"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="0.6"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="1"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="3"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="6"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="9"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="20"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="30"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="60"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="90"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="120"} 0 + cortex_bucket_store_series_merge_duration_seconds_bucket{le="+Inf"} 9 + cortex_bucket_store_series_merge_duration_seconds_sum 1.688925e+06 + cortex_bucket_store_series_merge_duration_seconds_count 9 + + # HELP cortex_bucket_store_series_refetches_total Total number of cases where the built-in max series size was not enough to fetch series from index, resulting in refetch. + # TYPE cortex_bucket_store_series_refetches_total counter + cortex_bucket_store_series_refetches_total 743127 + + # HELP cortex_bucket_store_series_result_series Number of series observed in the final result of a query. 
+ # TYPE cortex_bucket_store_series_result_series summary + cortex_bucket_store_series_result_series_sum 1.238545e+06 + cortex_bucket_store_series_result_series_count 6 + + # HELP cortex_bucket_store_queries_dropped_total Number of queries that were dropped due to the max chunks per query limit. + # TYPE cortex_bucket_store_queries_dropped_total counter + cortex_bucket_store_queries_dropped_total 698089 + + # HELP cortex_bucket_store_cached_postings_compressions_total Number of postings compressions and decompressions when storing to index cache. + # TYPE cortex_bucket_store_cached_postings_compressions_total counter + cortex_bucket_store_cached_postings_compressions_total{op="encode"} 1125950 + cortex_bucket_store_cached_postings_compressions_total{op="decode"} 1148469 + + # HELP cortex_bucket_store_cached_postings_compression_errors_total Number of postings compression and decompression errors. + # TYPE cortex_bucket_store_cached_postings_compression_errors_total counter + cortex_bucket_store_cached_postings_compression_errors_total{op="encode"} 1170988 + cortex_bucket_store_cached_postings_compression_errors_total{op="decode"} 1193507 + + # HELP cortex_bucket_store_cached_postings_compression_time_seconds Time spent compressing and decompressing postings when storing to / reading from postings cache. + # TYPE cortex_bucket_store_cached_postings_compression_time_seconds counter + cortex_bucket_store_cached_postings_compression_time_seconds{op="encode"} 1216026 + cortex_bucket_store_cached_postings_compression_time_seconds{op="decode"} 1238545 + + # HELP cortex_bucket_store_cached_postings_original_size_bytes_total Original size of postings stored into cache. + # TYPE cortex_bucket_store_cached_postings_original_size_bytes_total counter + cortex_bucket_store_cached_postings_original_size_bytes_total 1261064 + + # HELP cortex_bucket_store_cached_postings_compressed_size_bytes_total Compressed size of postings stored into cache. 
+ # TYPE cortex_bucket_store_cached_postings_compressed_size_bytes_total counter + cortex_bucket_store_cached_postings_compressed_size_bytes_total 1283583 + + # HELP cortex_bucket_store_cached_series_fetch_duration_seconds Time it takes to fetch series to respond a request sent to store-gateway. It includes both the time to fetch it from cache and from storage in case of cache misses. + # TYPE cortex_bucket_store_cached_series_fetch_duration_seconds histogram + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="0.001"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="0.01"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="0.1"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="0.3"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="0.6"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="1"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="3"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="6"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="9"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="20"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="30"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="60"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="90"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="120"} 0 + cortex_bucket_store_cached_series_fetch_duration_seconds_bucket{le="+Inf"} 3 + cortex_bucket_store_cached_series_fetch_duration_seconds_sum 1.306102e+06 + cortex_bucket_store_cached_series_fetch_duration_seconds_count 3 + + # HELP cortex_bucket_store_cached_postings_fetch_duration_seconds Time it takes to fetch postings to respond a request sent to store-gateway. 
It includes both the time to fetch it from cache and from storage in case of cache misses. + # TYPE cortex_bucket_store_cached_postings_fetch_duration_seconds histogram + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="0.001"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="0.01"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="0.1"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="0.3"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="0.6"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="1"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="3"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="6"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="9"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="20"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="30"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="60"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="90"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="120"} 0 + cortex_bucket_store_cached_postings_fetch_duration_seconds_bucket{le="+Inf"} 3 + cortex_bucket_store_cached_postings_fetch_duration_seconds_sum 1.328621e+06 + cortex_bucket_store_cached_postings_fetch_duration_seconds_count 3 + + # HELP cortex_bucket_store_indexheader_lazy_load_duration_seconds Duration of the index-header lazy loading in seconds. 
+ # TYPE cortex_bucket_store_indexheader_lazy_load_duration_seconds histogram + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="0.01"} 0 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="0.02"} 0 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="0.05"} 0 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="0.1"} 0 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="0.2"} 0 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="0.5"} 0 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="1"} 3 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="2"} 3 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="5"} 3 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_bucket{le="+Inf"} 3 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_sum 1.9500000000000002 + cortex_bucket_store_indexheader_lazy_load_duration_seconds_count 3 + + # HELP cortex_bucket_store_indexheader_lazy_load_failed_total Total number of failed index-header lazy load operations. + # TYPE cortex_bucket_store_indexheader_lazy_load_failed_total counter + cortex_bucket_store_indexheader_lazy_load_failed_total 1.373659e+06 + + # HELP cortex_bucket_store_indexheader_lazy_load_total Total number of index-header lazy load operations. + # TYPE cortex_bucket_store_indexheader_lazy_load_total counter + cortex_bucket_store_indexheader_lazy_load_total 1.35114e+06 + + # HELP cortex_bucket_store_indexheader_lazy_unload_failed_total Total number of failed index-header lazy unload operations. + # TYPE cortex_bucket_store_indexheader_lazy_unload_failed_total counter + cortex_bucket_store_indexheader_lazy_unload_failed_total 1.418697e+06 + + # HELP cortex_bucket_store_indexheader_lazy_unload_total Total number of index-header lazy unload operations. 
+ # TYPE cortex_bucket_store_indexheader_lazy_unload_total counter + cortex_bucket_store_indexheader_lazy_unload_total 1.396178e+06 +`)) + require.NoError(t, err) +} + +func BenchmarkMetricsCollections10(b *testing.B) { + benchmarkMetricsCollection(b, 10) +} + +func BenchmarkMetricsCollections100(b *testing.B) { + benchmarkMetricsCollection(b, 100) +} + +func BenchmarkMetricsCollections1000(b *testing.B) { + benchmarkMetricsCollection(b, 1000) +} + +func BenchmarkMetricsCollections10000(b *testing.B) { + benchmarkMetricsCollection(b, 10000) +} + +func benchmarkMetricsCollection(b *testing.B, users int) { + mainReg := prometheus.NewRegistry() + + tsdbMetrics := NewBucketStoreMetrics() + mainReg.MustRegister(tsdbMetrics) + + base := 123456.0 + for i := 0; i < users; i++ { + tsdbMetrics.AddUserRegistry(fmt.Sprintf("user-%d", i), populateMockedBucketStoreMetrics(base*float64(i))) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = mainReg.Gather() + } +} + +func populateMockedBucketStoreMetrics(base float64) *prometheus.Registry { + reg := prometheus.NewRegistry() + m := newMockedBucketStoreMetrics(reg) + + m.blocksLoaded.Add(1 * base) + m.blockLoads.Add(2 * base) + m.blockLoadFailures.Add(3 * base) + m.blockDrops.Add(4 * base) + m.blockDropFailures.Add(5 * base) + m.seriesDataTouched.WithLabelValues("touched-a").Observe(6 * base) + m.seriesDataTouched.WithLabelValues("touched-b").Observe(7 * base) + m.seriesDataTouched.WithLabelValues("touched-c").Observe(8 * base) + + m.seriesDataFetched.WithLabelValues("fetched-a").Observe(9 * base) + m.seriesDataFetched.WithLabelValues("fetched-b").Observe(10 * base) + m.seriesDataFetched.WithLabelValues("fetched-c").Observe(11 * base) + + m.seriesDataSizeTouched.WithLabelValues("size-touched-a").Observe(12 * base) + m.seriesDataSizeTouched.WithLabelValues("size-touched-b").Observe(13 * base) + m.seriesDataSizeTouched.WithLabelValues("size-touched-c").Observe(14 * base) + + 
m.seriesDataSizeFetched.WithLabelValues("size-fetched-a").Observe(15 * base) + m.seriesDataSizeFetched.WithLabelValues("size-fetched-b").Observe(16 * base) + m.seriesDataSizeFetched.WithLabelValues("size-fetched-c").Observe(17 * base) + + m.seriesBlocksQueried.Observe(18 * base) + m.seriesBlocksQueried.Observe(19 * base) + m.seriesBlocksQueried.Observe(20 * base) + + m.seriesGetAllDuration.Observe(21 * base) + m.seriesGetAllDuration.Observe(22 * base) + m.seriesGetAllDuration.Observe(23 * base) + + m.seriesMergeDuration.Observe(24 * base) + m.seriesMergeDuration.Observe(25 * base) + m.seriesMergeDuration.Observe(26 * base) + + m.resultSeriesCount.Observe(27 * base) + m.resultSeriesCount.Observe(28 * base) + + m.chunkSizeBytes.Observe(29 * base) + m.chunkSizeBytes.Observe(30 * base) + + m.queriesDropped.WithLabelValues("chunks").Add(31 * base) + m.queriesDropped.WithLabelValues("series").Add(0) + + m.seriesRefetches.Add(33 * base) + + m.cachedPostingsCompressions.WithLabelValues("encode").Add(50 * base) + m.cachedPostingsCompressions.WithLabelValues("decode").Add(51 * base) + + m.cachedPostingsCompressionErrors.WithLabelValues("encode").Add(52 * base) + m.cachedPostingsCompressionErrors.WithLabelValues("decode").Add(53 * base) + + m.cachedPostingsCompressionTimeSeconds.WithLabelValues("encode").Add(54 * base) + m.cachedPostingsCompressionTimeSeconds.WithLabelValues("decode").Add(55 * base) + + m.cachedPostingsOriginalSizeBytes.Add(56 * base) + m.cachedPostingsCompressedSizeBytes.Add(57 * base) + + m.seriesFetchDuration.Observe(58 * base) + m.postingsFetchDuration.Observe(59 * base) + + m.indexHeaderLazyLoadCount.Add(60 * base) + m.indexHeaderLazyLoadFailedCount.Add(61 * base) + m.indexHeaderLazyUnloadCount.Add(62 * base) + m.indexHeaderLazyUnloadFailedCount.Add(63 * base) + m.indexHeaderLazyLoadDuration.Observe(0.65) + + return reg +} + +// copied from Thanos, pkg/store/bucket.go +type mockedBucketStoreMetrics struct { + blocksLoaded prometheus.Gauge + blockLoads 
prometheus.Counter + blockLoadFailures prometheus.Counter + blockDrops prometheus.Counter + blockDropFailures prometheus.Counter + seriesDataTouched *prometheus.SummaryVec + seriesDataFetched *prometheus.SummaryVec + seriesDataSizeTouched *prometheus.SummaryVec + seriesDataSizeFetched *prometheus.SummaryVec + seriesBlocksQueried prometheus.Summary + seriesGetAllDuration prometheus.Histogram + seriesMergeDuration prometheus.Histogram + seriesRefetches prometheus.Counter + resultSeriesCount prometheus.Summary + chunkSizeBytes prometheus.Histogram + queriesDropped *prometheus.CounterVec + + cachedPostingsCompressions *prometheus.CounterVec + cachedPostingsCompressionErrors *prometheus.CounterVec + cachedPostingsCompressionTimeSeconds *prometheus.CounterVec + cachedPostingsOriginalSizeBytes prometheus.Counter + cachedPostingsCompressedSizeBytes prometheus.Counter + + seriesFetchDuration prometheus.Histogram + postingsFetchDuration prometheus.Histogram + + indexHeaderLazyLoadCount prometheus.Counter + indexHeaderLazyLoadFailedCount prometheus.Counter + indexHeaderLazyUnloadCount prometheus.Counter + indexHeaderLazyUnloadFailedCount prometheus.Counter + indexHeaderLazyLoadDuration prometheus.Histogram +} + +func newMockedBucketStoreMetrics(reg prometheus.Registerer) *mockedBucketStoreMetrics { + var m mockedBucketStoreMetrics + + m.blockLoads = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_block_loads_total", + Help: "Total number of remote block loading attempts.", + }) + m.blockLoadFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_block_load_failures_total", + Help: "Total number of failed remote block loading attempts.", + }) + m.blockDrops = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_block_drops_total", + Help: "Total number of local blocks that were dropped.", + }) + m.blockDropFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + 
Name: "thanos_bucket_store_block_drop_failures_total", + Help: "Total number of local blocks that failed to be dropped.", + }) + m.blocksLoaded = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Name: "thanos_bucket_store_blocks_loaded", + Help: "Number of currently loaded blocks.", + }) + + m.seriesDataTouched = promauto.With(reg).NewSummaryVec(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_data_touched", + Help: "How many items of a data type in a block were touched for a single series request.", + }, []string{"data_type"}) + m.seriesDataFetched = promauto.With(reg).NewSummaryVec(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_data_fetched", + Help: "How many items of a data type in a block were fetched for a single series request.", + }, []string{"data_type"}) + + m.seriesDataSizeTouched = promauto.With(reg).NewSummaryVec(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_data_size_touched_bytes", + Help: "Size of all items of a data type in a block were touched for a single series request.", + }, []string{"data_type"}) + m.seriesDataSizeFetched = promauto.With(reg).NewSummaryVec(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_data_size_fetched_bytes", + Help: "Size of all items of a data type in a block were fetched for a single series request.", + }, []string{"data_type"}) + + m.seriesBlocksQueried = promauto.With(reg).NewSummary(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_blocks_queried", + Help: "Number of blocks in a bucket store that were touched to satisfy a query.", + }) + m.seriesGetAllDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ + Name: "thanos_bucket_store_series_get_all_duration_seconds", + Help: "Time it takes until all per-block prepares and preloads for a query are finished.", + Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, + }) + m.seriesMergeDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ + Name: 
"thanos_bucket_store_series_merge_duration_seconds", + Help: "Time it takes to merge sub-results from all queried blocks into a single result.", + Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, + }) + m.resultSeriesCount = promauto.With(reg).NewSummary(prometheus.SummaryOpts{ + Name: "thanos_bucket_store_series_result_series", + Help: "Number of series observed in the final result of a query.", + }) + + m.chunkSizeBytes = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ + Name: "thanos_bucket_store_sent_chunk_size_bytes", + Help: "Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", + Buckets: []float64{ + 32, 256, 512, 1024, 32 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 32 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024, + }, + }) + + m.queriesDropped = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_bucket_store_queries_dropped_total", + Help: "Number of queries that were dropped due to the limit.", + }, []string{"reason"}) + m.seriesRefetches = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_series_refetches_total", + Help: fmt.Sprintf("Total number of cases where %v bytes was not enough was to fetch series from index, resulting in refetch.", 64*1024), + }) + + m.cachedPostingsCompressions = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_bucket_store_cached_postings_compressions_total", + Help: "Number of postings compressions before storing to index cache.", + }, []string{"op"}) + m.cachedPostingsCompressionErrors = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_bucket_store_cached_postings_compression_errors_total", + Help: "Number of postings compression errors.", + }, []string{"op"}) + m.cachedPostingsCompressionTimeSeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: 
"thanos_bucket_store_cached_postings_compression_time_seconds_total", + Help: "Time spent compressing postings before storing them into postings cache.", + }, []string{"op"}) + m.cachedPostingsOriginalSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_cached_postings_original_size_bytes_total", + Help: "Original size of postings stored into cache.", + }) + m.cachedPostingsCompressedSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_cached_postings_compressed_size_bytes_total", + Help: "Compressed size of postings stored into cache.", + }) + + m.seriesFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ + Name: "thanos_bucket_store_cached_series_fetch_duration_seconds", + Help: "Time it takes to fetch series from a bucket to respond a query. It also includes the time it takes to cache fetch and store operations.", + Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, + }) + m.postingsFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ + Name: "thanos_bucket_store_cached_postings_fetch_duration_seconds", + Help: "Time it takes to fetch postings from a bucket to respond a query. 
It also includes the time it takes to cache fetch and store operations.", + Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, + }) + + m.indexHeaderLazyLoadCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_indexheader_lazy_load_total", + Help: "Total number of index-header lazy load operations.", + }) + m.indexHeaderLazyLoadFailedCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_indexheader_lazy_load_failed_total", + Help: "Total number of failed index-header lazy load operations.", + }) + m.indexHeaderLazyUnloadCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_indexheader_lazy_unload_total", + Help: "Total number of index-header lazy unload operations.", + }) + m.indexHeaderLazyUnloadFailedCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "thanos_bucket_store_indexheader_lazy_unload_failed_total", + Help: "Total number of failed index-header lazy unload operations.", + }) + m.indexHeaderLazyLoadDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ + Name: "thanos_bucket_store_indexheader_lazy_load_duration_seconds", + Help: "Duration of the index-header lazy loading in seconds.", + Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5}, + }) + + return &m +} diff --git a/pkg/storegateway/bucket_stores_test.go b/pkg/storegateway/bucket_stores_test.go new file mode 100644 index 0000000000000..482b052bebe4a --- /dev/null +++ b/pkg/storegateway/bucket_stores_test.go @@ -0,0 +1,616 @@ +package storegateway + +import ( + "context" + "errors" + "fmt" + "io" + "io/ioutil" + "math" + "os" + "path/filepath" + "sort" + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/flagext" + "github.com/oklog/ulid" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + 
"github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + thanos_metadata "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/extprom" + "github.com/thanos-io/thanos/pkg/objstore" + "github.com/thanos-io/thanos/pkg/store" + "github.com/thanos-io/thanos/pkg/store/labelpb" + "github.com/thanos-io/thanos/pkg/store/storepb" + "github.com/weaveworks/common/logging" + "go.uber.org/atomic" + "google.golang.org/grpc/metadata" + + "github.com/cortexproject/cortex/pkg/storage/bucket" + "github.com/cortexproject/cortex/pkg/storage/bucket/filesystem" + cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" + "github.com/cortexproject/cortex/pkg/util" +) + +func TestBucketStores_InitialSync(t *testing.T) { + userToMetric := map[string]string{ + "user-1": "series_1", + "user-2": "series_2", + } + + ctx := context.Background() + cfg, cleanup := prepareStorageConfig(t) + defer cleanup() + + storageDir, err := ioutil.TempDir(os.TempDir(), "storage-*") + require.NoError(t, err) + + for userID, metricName := range userToMetric { + generateStorageBlock(t, storageDir, userID, metricName, 10, 100, 15) + } + + bucket, err := filesystem.NewBucketClient(filesystem.Config{Directory: storageDir}) + require.NoError(t, err) + + reg := prometheus.NewPedanticRegistry() + stores, err := NewBucketStores(cfg, NewNoShardingStrategy(), bucket, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), reg) + require.NoError(t, err) + + // Query series before the initial sync. 
+ for userID, metricName := range userToMetric { + seriesSet, warnings, err := querySeries(stores, userID, metricName, 20, 40) + require.NoError(t, err) + assert.Empty(t, warnings) + assert.Empty(t, seriesSet) + } + + require.NoError(t, stores.InitialSync(ctx)) + + // Query series after the initial sync. + for userID, metricName := range userToMetric { + seriesSet, warnings, err := querySeries(stores, userID, metricName, 20, 40) + require.NoError(t, err) + assert.Empty(t, warnings) + require.Len(t, seriesSet, 1) + assert.Equal(t, []labelpb.ZLabel{{Name: labels.MetricName, Value: metricName}}, seriesSet[0].Labels) + } + + // Query series of another user. + seriesSet, warnings, err := querySeries(stores, "user-1", "series_2", 20, 40) + require.NoError(t, err) + assert.Empty(t, warnings) + assert.Empty(t, seriesSet) + + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` + # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. + # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 2 + + # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. + # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 2 + + # HELP cortex_bucket_store_block_load_failures_total Total number of failed remote block loading attempts. + # TYPE cortex_bucket_store_block_load_failures_total counter + cortex_bucket_store_block_load_failures_total 0 + + # HELP cortex_bucket_stores_gate_queries_concurrent_max Number of maximum concurrent queries allowed. + # TYPE cortex_bucket_stores_gate_queries_concurrent_max gauge + cortex_bucket_stores_gate_queries_concurrent_max 100 + + # HELP cortex_bucket_stores_gate_queries_in_flight Number of queries that are currently in flight. 
+ # TYPE cortex_bucket_stores_gate_queries_in_flight gauge + cortex_bucket_stores_gate_queries_in_flight 0 + `), + "cortex_bucket_store_blocks_loaded", + "cortex_bucket_store_block_loads_total", + "cortex_bucket_store_block_load_failures_total", + "cortex_bucket_stores_gate_queries_concurrent_max", + "cortex_bucket_stores_gate_queries_in_flight", + )) + + assert.Greater(t, testutil.ToFloat64(stores.syncLastSuccess), float64(0)) +} + +func TestBucketStores_InitialSyncShouldRetryOnFailure(t *testing.T) { + ctx := context.Background() + cfg, cleanup := prepareStorageConfig(t) + defer cleanup() + + storageDir, err := ioutil.TempDir(os.TempDir(), "storage-*") + require.NoError(t, err) + + // Generate a block for the user in the storage. + generateStorageBlock(t, storageDir, "user-1", "series_1", 10, 100, 15) + + bucket, err := filesystem.NewBucketClient(filesystem.Config{Directory: storageDir}) + require.NoError(t, err) + + // Wrap the bucket to fail the 1st Get() request. + bucket = &failFirstGetBucket{Bucket: bucket} + + reg := prometheus.NewPedanticRegistry() + stores, err := NewBucketStores(cfg, NewNoShardingStrategy(), bucket, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), reg) + require.NoError(t, err) + + // Initial sync should succeed even if a transient error occurs. + require.NoError(t, stores.InitialSync(ctx)) + + // Query series after the initial sync. 
+ seriesSet, warnings, err := querySeries(stores, "user-1", "series_1", 20, 40) + require.NoError(t, err) + assert.Empty(t, warnings) + require.Len(t, seriesSet, 1) + assert.Equal(t, []labelpb.ZLabel{{Name: labels.MetricName, Value: "series_1"}}, seriesSet[0].Labels) + + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` + # HELP cortex_blocks_meta_syncs_total Total blocks metadata synchronization attempts + # TYPE cortex_blocks_meta_syncs_total counter + cortex_blocks_meta_syncs_total 2 + + # HELP cortex_blocks_meta_sync_failures_total Total blocks metadata synchronization failures + # TYPE cortex_blocks_meta_sync_failures_total counter + cortex_blocks_meta_sync_failures_total 1 + + # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. + # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 1 + + # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. + # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 1 + + # HELP cortex_bucket_store_block_load_failures_total Total number of failed remote block loading attempts. 
+ # TYPE cortex_bucket_store_block_load_failures_total counter + cortex_bucket_store_block_load_failures_total 0 + `), + "cortex_blocks_meta_syncs_total", + "cortex_blocks_meta_sync_failures_total", + "cortex_bucket_store_block_loads_total", + "cortex_bucket_store_block_load_failures_total", + "cortex_bucket_store_blocks_loaded", + )) + + assert.Greater(t, testutil.ToFloat64(stores.syncLastSuccess), float64(0)) +} + +func TestBucketStores_SyncBlocks(t *testing.T) { + const ( + userID = "user-1" + metricName = "series_1" + ) + + ctx := context.Background() + cfg, cleanup := prepareStorageConfig(t) + defer cleanup() + + storageDir, err := ioutil.TempDir(os.TempDir(), "storage-*") + require.NoError(t, err) + + bucket, err := filesystem.NewBucketClient(filesystem.Config{Directory: storageDir}) + require.NoError(t, err) + + reg := prometheus.NewPedanticRegistry() + stores, err := NewBucketStores(cfg, NewNoShardingStrategy(), bucket, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), reg) + require.NoError(t, err) + + // Run an initial sync to discover 1 block. + generateStorageBlock(t, storageDir, userID, metricName, 10, 100, 15) + require.NoError(t, stores.InitialSync(ctx)) + + // Query a range for which we have no samples. + seriesSet, warnings, err := querySeries(stores, userID, metricName, 150, 180) + require.NoError(t, err) + assert.Empty(t, warnings) + assert.Empty(t, seriesSet) + + // Generate another block and sync blocks again. 
+ generateStorageBlock(t, storageDir, userID, metricName, 100, 200, 15) + require.NoError(t, stores.SyncBlocks(ctx)) + + seriesSet, warnings, err = querySeries(stores, userID, metricName, 150, 180) + require.NoError(t, err) + assert.Empty(t, warnings) + assert.Len(t, seriesSet, 1) + assert.Equal(t, []labelpb.ZLabel{{Name: labels.MetricName, Value: metricName}}, seriesSet[0].Labels) + + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` + # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. + # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 2 + + # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. + # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 2 + + # HELP cortex_bucket_store_block_load_failures_total Total number of failed remote block loading attempts. + # TYPE cortex_bucket_store_block_load_failures_total counter + cortex_bucket_store_block_load_failures_total 0 + + # HELP cortex_bucket_stores_gate_queries_concurrent_max Number of maximum concurrent queries allowed. + # TYPE cortex_bucket_stores_gate_queries_concurrent_max gauge + cortex_bucket_stores_gate_queries_concurrent_max 100 + + # HELP cortex_bucket_stores_gate_queries_in_flight Number of queries that are currently in flight. 
+ # TYPE cortex_bucket_stores_gate_queries_in_flight gauge + cortex_bucket_stores_gate_queries_in_flight 0 + `), + "cortex_bucket_store_blocks_loaded", + "cortex_bucket_store_block_loads_total", + "cortex_bucket_store_block_load_failures_total", + "cortex_bucket_stores_gate_queries_concurrent_max", + "cortex_bucket_stores_gate_queries_in_flight", + )) + + assert.Greater(t, testutil.ToFloat64(stores.syncLastSuccess), float64(0)) +} + +func TestBucketStores_syncUsersBlocks(t *testing.T) { + allUsers := []string{"user-1", "user-2", "user-3"} + + tests := map[string]struct { + shardingStrategy ShardingStrategy + expectedStores int32 + }{ + "when sharding is disabled all users should be synced": { + shardingStrategy: NewNoShardingStrategy(), + expectedStores: 3, + }, + "when sharding is enabled only stores for filtered users should be created": { + shardingStrategy: func() ShardingStrategy { + s := &mockShardingStrategy{} + s.On("FilterUsers", mock.Anything, allUsers).Return([]string{"user-1", "user-2"}) + return s + }(), + expectedStores: 2, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + cfg, cleanup := prepareStorageConfig(t) + cfg.BucketStore.TenantSyncConcurrency = 2 + defer cleanup() + + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", allUsers, nil) + + stores, err := NewBucketStores(cfg, testData.shardingStrategy, bucketClient, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), nil) + require.NoError(t, err) + + // Sync user stores and count the number of times the callback is called. 
+ var storesCount atomic.Int32 + err = stores.syncUsersBlocks(context.Background(), func(ctx context.Context, bs *store.BucketStore) error { + storesCount.Inc() + return nil + }) + + assert.NoError(t, err) + bucketClient.AssertNumberOfCalls(t, "Iter", 1) + assert.Equal(t, storesCount.Load(), testData.expectedStores) + }) + } +} + +func TestBucketStores_Series_ShouldCorrectlyQuerySeriesSpanningMultipleChunks(t *testing.T) { + for _, lazyLoadingEnabled := range []bool{true, false} { + t.Run(fmt.Sprintf("lazy loading enabled = %v", lazyLoadingEnabled), func(t *testing.T) { + testBucketStoresSeriesShouldCorrectlyQuerySeriesSpanningMultipleChunks(t, lazyLoadingEnabled) + }) + } +} + +func testBucketStoresSeriesShouldCorrectlyQuerySeriesSpanningMultipleChunks(t *testing.T, lazyLoadingEnabled bool) { + const ( + userID = "user-1" + metricName = "series_1" + ) + + ctx := context.Background() + cfg, cleanup := prepareStorageConfig(t) + cfg.BucketStore.IndexHeaderLazyLoadingEnabled = lazyLoadingEnabled + cfg.BucketStore.IndexHeaderLazyLoadingIdleTimeout = time.Minute + defer cleanup() + + storageDir, err := ioutil.TempDir(os.TempDir(), "storage-*") + require.NoError(t, err) + + // Generate a single block with 1 series and a lot of samples. 
+ generateStorageBlock(t, storageDir, userID, metricName, 0, 10000, 1) + + bucket, err := filesystem.NewBucketClient(filesystem.Config{Directory: storageDir}) + require.NoError(t, err) + + reg := prometheus.NewPedanticRegistry() + stores, err := NewBucketStores(cfg, NewNoShardingStrategy(), bucket, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), reg) + require.NoError(t, err) + require.NoError(t, stores.InitialSync(ctx)) + + tests := map[string]struct { + reqMinTime int64 + reqMaxTime int64 + expectedSamples int + }{ + "query the entire block": { + reqMinTime: math.MinInt64, + reqMaxTime: math.MaxInt64, + expectedSamples: 10000, + }, + "query the beginning of the block": { + reqMinTime: 0, + reqMaxTime: 100, + expectedSamples: store.MaxSamplesPerChunk, + }, + "query the middle of the block": { + reqMinTime: 4000, + reqMaxTime: 4050, + expectedSamples: store.MaxSamplesPerChunk, + }, + "query the end of the block": { + reqMinTime: 9800, + reqMaxTime: 10000, + expectedSamples: (store.MaxSamplesPerChunk * 2) + (10000 % store.MaxSamplesPerChunk), + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + // Query a range for which we have no samples. + seriesSet, warnings, err := querySeries(stores, userID, metricName, testData.reqMinTime, testData.reqMaxTime) + require.NoError(t, err) + assert.Empty(t, warnings) + assert.Len(t, seriesSet, 1) + + // Count returned samples. 
+ samples, err := readSamplesFromChunks(seriesSet[0].Chunks) + require.NoError(t, err) + assert.Equal(t, testData.expectedSamples, len(samples)) + }) + } +} + +func prepareStorageConfig(t *testing.T) (cortex_tsdb.BlocksStorageConfig, func()) { + tmpDir, err := ioutil.TempDir(os.TempDir(), "blocks-sync-*") + require.NoError(t, err) + + cfg := cortex_tsdb.BlocksStorageConfig{} + flagext.DefaultValues(&cfg) + cfg.BucketStore.SyncDir = tmpDir + + cleanup := func() { + require.NoError(t, os.RemoveAll(tmpDir)) + } + + return cfg, cleanup +} + +func generateStorageBlock(t *testing.T, storageDir, userID string, metricName string, minT, maxT int64, step int) { + // Create a directory for the user (if doesn't already exist). + userDir := filepath.Join(storageDir, userID) + if _, err := os.Stat(userDir); err != nil { + require.NoError(t, os.Mkdir(userDir, os.ModePerm)) + } + + // Create a temporary directory where the TSDB is opened, + // then it will be snapshotted to the storage directory. + tmpDir, err := ioutil.TempDir(os.TempDir(), "tsdb-*") + require.NoError(t, err) + defer func() { + require.NoError(t, os.RemoveAll(tmpDir)) + }() + + db, err := tsdb.Open(tmpDir, log.NewNopLogger(), nil, tsdb.DefaultOptions(), nil) + require.NoError(t, err) + defer func() { + require.NoError(t, db.Close()) + }() + + series := labels.Labels{labels.Label{Name: labels.MetricName, Value: metricName}} + + app := db.Appender(context.Background()) + for ts := minT; ts < maxT; ts += int64(step) { + _, err = app.Append(0, series, ts, 1) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + // Snapshot TSDB to the storage directory. 
+ require.NoError(t, db.Snapshot(userDir, true)) +} + +func querySeries(stores *BucketStores, userID, metricName string, minT, maxT int64) ([]*storepb.Series, storage.Warnings, error) { + req := &storepb.SeriesRequest{ + MinTime: minT, + MaxTime: maxT, + Matchers: []storepb.LabelMatcher{{ + Type: storepb.LabelMatcher_EQ, + Name: labels.MetricName, + Value: metricName, + }}, + PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT, + } + + ctx := setUserIDToGRPCContext(context.Background(), userID) + srv := newBucketStoreSeriesServer(ctx) + err := stores.Series(req, srv) + + return srv.SeriesSet, srv.Warnings, err +} + +func mockLoggingLevel() logging.Level { + level := logging.Level{} + err := level.Set("info") + if err != nil { + panic(err) + } + + return level +} + +func setUserIDToGRPCContext(ctx context.Context, userID string) context.Context { + // We have to store it in the incoming metadata because we have to emulate the + // case it's coming from a gRPC request, while here we're running everything in-memory. 
+ return metadata.NewIncomingContext(ctx, metadata.Pairs(cortex_tsdb.TenantIDExternalLabel, userID)) +} + +func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) { + const ( + user1 = "user-1" + user2 = "user-2" + ) + + userToMetric := map[string]string{ + user1: "series_1", + user2: "series_2", + } + + ctx := context.Background() + cfg, cleanup := prepareStorageConfig(t) + defer cleanup() + + storageDir, err := ioutil.TempDir(os.TempDir(), "storage-*") + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, os.RemoveAll(storageDir)) + }) + + for userID, metricName := range userToMetric { + generateStorageBlock(t, storageDir, userID, metricName, 10, 100, 15) + } + + bucket, err := filesystem.NewBucketClient(filesystem.Config{Directory: storageDir}) + require.NoError(t, err) + + sharding := userShardingStrategy{} + + reg := prometheus.NewPedanticRegistry() + stores, err := NewBucketStores(cfg, &sharding, bucket, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), reg) + require.NoError(t, err) + + // Perform sync. + sharding.users = []string{user1, user2} + require.NoError(t, stores.InitialSync(ctx)) + require.Equal(t, []string{user1, user2}, getUsersInDir(t, cfg.BucketStore.SyncDir)) + + metricNames := []string{"cortex_bucket_store_block_drops_total", "cortex_bucket_store_block_loads_total", "cortex_bucket_store_blocks_loaded"} + + require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` + # HELP cortex_bucket_store_block_drops_total Total number of local blocks that were dropped. + # TYPE cortex_bucket_store_block_drops_total counter + cortex_bucket_store_block_drops_total 0 + # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. + # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 2 + # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. 
+ # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 2 + `), metricNames...)) + + // Single user left in shard. + sharding.users = []string{user1} + require.NoError(t, stores.SyncBlocks(ctx)) + require.Equal(t, []string{user1}, getUsersInDir(t, cfg.BucketStore.SyncDir)) + + require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` + # HELP cortex_bucket_store_block_drops_total Total number of local blocks that were dropped. + # TYPE cortex_bucket_store_block_drops_total counter + cortex_bucket_store_block_drops_total 1 + # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. + # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 2 + # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. + # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 1 + `), metricNames...)) + + // No users left in this shard. + sharding.users = nil + require.NoError(t, stores.SyncBlocks(ctx)) + require.Equal(t, []string(nil), getUsersInDir(t, cfg.BucketStore.SyncDir)) + + require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` + # HELP cortex_bucket_store_block_drops_total Total number of local blocks that were dropped. + # TYPE cortex_bucket_store_block_drops_total counter + cortex_bucket_store_block_drops_total 2 + # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. + # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 2 + # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. + # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 0 + `), metricNames...)) + + // We can always get user back. 
+ sharding.users = []string{user1} + require.NoError(t, stores.SyncBlocks(ctx)) + require.Equal(t, []string{user1}, getUsersInDir(t, cfg.BucketStore.SyncDir)) + + require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` + # HELP cortex_bucket_store_block_drops_total Total number of local blocks that were dropped. + # TYPE cortex_bucket_store_block_drops_total counter + cortex_bucket_store_block_drops_total 2 + # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. + # TYPE cortex_bucket_store_block_loads_total counter + cortex_bucket_store_block_loads_total 3 + # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. + # TYPE cortex_bucket_store_blocks_loaded gauge + cortex_bucket_store_blocks_loaded 1 + `), metricNames...)) +} + +func getUsersInDir(t *testing.T, dir string) []string { + fs, err := ioutil.ReadDir(dir) + require.NoError(t, err) + + var result []string + for _, fi := range fs { + if fi.IsDir() { + result = append(result, fi.Name()) + } + } + sort.Strings(result) + return result +} + +type userShardingStrategy struct { + users []string +} + +func (u *userShardingStrategy) FilterUsers(ctx context.Context, userIDs []string) []string { + return u.users +} + +func (u *userShardingStrategy) FilterBlocks(ctx context.Context, userID string, metas map[ulid.ULID]*thanos_metadata.Meta, loaded map[ulid.ULID]struct{}, synced *extprom.TxGaugeVec) error { + if util.StringsContain(u.users, userID) { + return nil + } + + for k := range metas { + delete(metas, k) + } + return nil +} + +// failFirstGetBucket is an objstore.Bucket wrapper which fails the first Get() request with a mocked error. 
+type failFirstGetBucket struct { + objstore.Bucket + + firstGet atomic.Bool +} + +func (f *failFirstGetBucket) Get(ctx context.Context, name string) (io.ReadCloser, error) { + if f.firstGet.CAS(false, true) { + return nil, errors.New("Get() request mocked error") + } + + return f.Bucket.Get(ctx, name) +} diff --git a/pkg/storegateway/chunk_bytes_pool_test.go b/pkg/storegateway/chunk_bytes_pool_test.go new file mode 100644 index 0000000000000..182bb04c0f2ea --- /dev/null +++ b/pkg/storegateway/chunk_bytes_pool_test.go @@ -0,0 +1,37 @@ +package storegateway + +import ( + "bytes" + "fmt" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/store" + + cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" +) + +func TestChunkBytesPool_Get(t *testing.T) { + reg := prometheus.NewPedanticRegistry() + p, err := newChunkBytesPool(cortex_tsdb.ChunkPoolDefaultMinBucketSize, cortex_tsdb.ChunkPoolDefaultMaxBucketSize, 0, reg) + require.NoError(t, err) + + _, err = p.Get(store.EstimatedMaxChunkSize - 1) + require.NoError(t, err) + + _, err = p.Get(store.EstimatedMaxChunkSize + 1) + require.NoError(t, err) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(fmt.Sprintf(` + # HELP cortex_bucket_store_chunk_pool_requested_bytes_total Total bytes requested to chunk bytes pool. + # TYPE cortex_bucket_store_chunk_pool_requested_bytes_total counter + cortex_bucket_store_chunk_pool_requested_bytes_total %d + + # HELP cortex_bucket_store_chunk_pool_returned_bytes_total Total bytes returned by the chunk bytes pool. 
+ # TYPE cortex_bucket_store_chunk_pool_returned_bytes_total counter + cortex_bucket_store_chunk_pool_returned_bytes_total %d + `, store.EstimatedMaxChunkSize*2, store.EstimatedMaxChunkSize*3)))) +} diff --git a/pkg/storegateway/gateway_ring_test.go b/pkg/storegateway/gateway_ring_test.go new file mode 100644 index 0000000000000..b621a566adf7c --- /dev/null +++ b/pkg/storegateway/gateway_ring_test.go @@ -0,0 +1,72 @@ +package storegateway + +import ( + "testing" + "time" + + "github.com/grafana/dskit/ring" + "github.com/stretchr/testify/assert" +) + +func TestIsHealthyForStoreGatewayOperations(t *testing.T) { + t.Parallel() + + tests := map[string]struct { + instance *ring.InstanceDesc + timeout time.Duration + ownerSyncExpected bool + ownerReadExpected bool + readExpected bool + }{ + "ACTIVE instance with last keepalive newer than timeout": { + instance: &ring.InstanceDesc{State: ring.ACTIVE, Timestamp: time.Now().Add(-30 * time.Second).Unix()}, + timeout: time.Minute, + ownerSyncExpected: true, + ownerReadExpected: true, + readExpected: true, + }, + "ACTIVE instance with last keepalive older than timeout": { + instance: &ring.InstanceDesc{State: ring.ACTIVE, Timestamp: time.Now().Add(-90 * time.Second).Unix()}, + timeout: time.Minute, + ownerSyncExpected: false, + ownerReadExpected: false, + readExpected: false, + }, + "JOINING instance with last keepalive newer than timeout": { + instance: &ring.InstanceDesc{State: ring.JOINING, Timestamp: time.Now().Add(-30 * time.Second).Unix()}, + timeout: time.Minute, + ownerSyncExpected: true, + ownerReadExpected: false, + readExpected: false, + }, + "LEAVING instance with last keepalive newer than timeout": { + instance: &ring.InstanceDesc{State: ring.LEAVING, Timestamp: time.Now().Add(-30 * time.Second).Unix()}, + timeout: time.Minute, + ownerSyncExpected: true, + ownerReadExpected: false, + readExpected: false, + }, + "PENDING instance with last keepalive newer than timeout": { + instance: &ring.InstanceDesc{State: 
ring.PENDING, Timestamp: time.Now().Add(-30 * time.Second).Unix()}, + timeout: time.Minute, + ownerSyncExpected: false, + ownerReadExpected: false, + readExpected: false, + }, + } + + for testName, testData := range tests { + testData := testData + + t.Run(testName, func(t *testing.T) { + actual := testData.instance.IsHealthy(BlocksOwnerSync, testData.timeout, time.Now()) + assert.Equal(t, testData.ownerSyncExpected, actual) + + actual = testData.instance.IsHealthy(BlocksOwnerRead, testData.timeout, time.Now()) + assert.Equal(t, testData.ownerReadExpected, actual) + + actual = testData.instance.IsHealthy(BlocksRead, testData.timeout, time.Now()) + assert.Equal(t, testData.readExpected, actual) + }) + } +} diff --git a/pkg/storegateway/gateway_test.go b/pkg/storegateway/gateway_test.go new file mode 100644 index 0000000000000..ab8a982692bb0 --- /dev/null +++ b/pkg/storegateway/gateway_test.go @@ -0,0 +1,1132 @@ +package storegateway + +import ( + "context" + "fmt" + "io/ioutil" + "math" + "math/rand" + "net/http" + "os" + "path" + "path/filepath" + "sort" + "strconv" + "strings" + "testing" + "time" + + "github.com/cortexproject/cortex/pkg/storage/bucket" + "github.com/cortexproject/cortex/pkg/storage/bucket/filesystem" + cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" + "github.com/cortexproject/cortex/pkg/storage/tsdb/bucketindex" + "github.com/cortexproject/cortex/pkg/util" + "github.com/cortexproject/cortex/pkg/util/validation" + "github.com/go-kit/log" + "github.com/grafana/dskit/flagext" + "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" + "github.com/grafana/dskit/services" + "github.com/oklog/ulid" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/tsdb" + "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + 
"github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/block" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/extprom" + "github.com/thanos-io/thanos/pkg/objstore" + "github.com/thanos-io/thanos/pkg/store/labelpb" + "github.com/thanos-io/thanos/pkg/store/storepb" + "google.golang.org/grpc/status" + + tsdb_testutil "github.com/grafana/loki/pkg/storage/tsdb/testutil" + "github.com/grafana/loki/pkg/util/test" +) + +func TestConfig_Validate(t *testing.T) { + tests := map[string]struct { + setup func(cfg *Config, limits *validation.Limits) + expected error + }{ + "should pass by default": { + setup: func(cfg *Config, limits *validation.Limits) {}, + expected: nil, + }, + "should fail if the sharding strategy is invalid": { + setup: func(cfg *Config, limits *validation.Limits) { + cfg.ShardingEnabled = true + cfg.ShardingStrategy = "xxx" + }, + expected: errInvalidShardingStrategy, + }, + "should fail if the sharding strategy is shuffle-sharding and shard size has not been set": { + setup: func(cfg *Config, limits *validation.Limits) { + cfg.ShardingEnabled = true + cfg.ShardingStrategy = util.ShardingStrategyShuffle + }, + expected: errInvalidTenantShardSize, + }, + "should pass if the sharding strategy is shuffle-sharding and shard size has been set": { + setup: func(cfg *Config, limits *validation.Limits) { + cfg.ShardingEnabled = true + cfg.ShardingStrategy = util.ShardingStrategyShuffle + limits.StoreGatewayTenantShardSize = 3 + }, + expected: nil, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + cfg := &Config{} + limits := &validation.Limits{} + flagext.DefaultValues(cfg, limits) + testData.setup(cfg, limits) + + assert.Equal(t, testData.expected, cfg.Validate(*limits)) + }) + } +} + +func TestStoreGateway_InitialSyncWithDefaultShardingEnabled(t *testing.T) { + tests := map[string]struct { + initialExists bool + initialState ring.InstanceState + initialTokens 
ring.Tokens + }{ + "instance not in the ring": { + initialExists: false, + }, + "instance already in the ring with PENDING state and has no tokens": { + initialExists: true, + initialState: ring.PENDING, + initialTokens: ring.Tokens{}, + }, + "instance already in the ring with JOINING state and has some tokens": { + initialExists: true, + initialState: ring.JOINING, + initialTokens: ring.Tokens{1, 2, 3, 4, 5, 6, 7, 8, 9}, + }, + "instance already in the ring with ACTIVE state and has all tokens": { + initialExists: true, + initialState: ring.ACTIVE, + initialTokens: generateSortedTokens(RingNumTokens), + }, + "instance already in the ring with LEAVING state and has all tokens": { + initialExists: true, + initialState: ring.LEAVING, + initialTokens: generateSortedTokens(RingNumTokens), + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + ctx := context.Background() + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingEnabled = true + storageCfg := mockStorageConfig(t) + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + bucketClient := &bucket.ClientMock{} + + // Setup the initial instance state in the ring. 
+ if testData.initialExists { + require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) { + ringDesc := ring.GetOrCreateRingDesc(in) + ringDesc.AddIngester(gatewayCfg.ShardingRing.InstanceID, gatewayCfg.ShardingRing.InstanceAddr, "", testData.initialTokens, testData.initialState, time.Now()) + return ringDesc, true, nil + })) + } + + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, ringStore, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), nil) + require.NoError(t, err) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + assert.False(t, g.ringLifecycler.IsRegistered()) + + bucketClient.MockIterWithCallback("", []string{"user-1", "user-2"}, nil, func() { + // During the initial sync, we expect the instance to always be in the JOINING + // state within the ring. + assert.True(t, g.ringLifecycler.IsRegistered()) + assert.Equal(t, ring.JOINING, g.ringLifecycler.GetState()) + assert.Equal(t, RingNumTokens, len(g.ringLifecycler.GetTokens())) + assert.Subset(t, g.ringLifecycler.GetTokens(), testData.initialTokens) + }) + bucketClient.MockIter("user-1/", []string{}, nil) + bucketClient.MockIter("user-2/", []string{}, nil) + + // Once successfully started, the instance should be ACTIVE in the ring. 
+ require.NoError(t, services.StartAndAwaitRunning(ctx, g)) + + assert.True(t, g.ringLifecycler.IsRegistered()) + assert.Equal(t, ring.ACTIVE, g.ringLifecycler.GetState()) + assert.Equal(t, RingNumTokens, len(g.ringLifecycler.GetTokens())) + assert.Subset(t, g.ringLifecycler.GetTokens(), testData.initialTokens) + + assert.NotNil(t, g.stores.getStore("user-1")) + assert.NotNil(t, g.stores.getStore("user-2")) + assert.Nil(t, g.stores.getStore("user-unknown")) + }) + } +} + +func TestStoreGateway_InitialSyncWithShardingDisabled(t *testing.T) { + ctx := context.Background() + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingEnabled = false + storageCfg := mockStorageConfig(t) + bucketClient := &bucket.ClientMock{} + + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, nil, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), nil) + require.NoError(t, err) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + + bucketClient.MockIter("", []string{"user-1", "user-2"}, nil) + bucketClient.MockIter("user-1/", []string{}, nil) + bucketClient.MockIter("user-2/", []string{}, nil) + + require.NoError(t, services.StartAndAwaitRunning(ctx, g)) + assert.NotNil(t, g.stores.getStore("user-1")) + assert.NotNil(t, g.stores.getStore("user-2")) + assert.Nil(t, g.stores.getStore("user-unknown")) +} + +func TestStoreGateway_InitialSyncFailure(t *testing.T) { + ctx := context.Background() + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingEnabled = true + storageCfg := mockStorageConfig(t) + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + bucketClient := &bucket.ClientMock{} + + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, ringStore, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), nil) + require.NoError(t, err) + + bucketClient.MockIter("", []string{}, errors.New("network error")) + + 
 require.NoError(t, g.StartAsync(ctx)) + err = g.AwaitRunning(ctx) + assert.Error(t, err) + assert.Equal(t, services.Failed, g.State()) + + // We expect a clean shutdown, including unregistering the instance from the ring. + assert.False(t, g.ringLifecycler.IsRegistered()) +} + +// TestStoreGateway_InitialSyncWithWaitRingStability tests the store-gateway cold start case. +// When several store-gateways start up at once, we expect each store-gateway to only load +// their own blocks, regardless of which store-gateway joined the ring first or last (even if starting +// at the same time, they will join the ring at a slightly different time). +func TestStoreGateway_InitialSyncWithWaitRingStability(t *testing.T) { + bucketClient, storageDir := tsdb_testutil.PrepareFilesystemBucket(t) + + // This test uses real TSDB blocks. 24h time range, 2h block range period, + // 2 users = total (24 / 2) * 2 = 24 blocks. + numUsers := 2 + numBlocks := numUsers * 12 + now := time.Now() + mockTSDB(t, path.Join(storageDir, "user-1"), 24, 12, now.Add(-24*time.Hour).Unix()*1000, now.Unix()*1000) + mockTSDB(t, path.Join(storageDir, "user-2"), 24, 12, now.Add(-24*time.Hour).Unix()*1000, now.Unix()*1000) + + // Write the bucket index. + for _, userID := range []string{"user-1", "user-2"} { + createBucketIndex(t, bucketClient, userID) + } + + tests := map[string]struct { + shardingStrategy string + tenantShardSize int // Used only when the sharding strategy is shuffle-sharding. 
+ replicationFactor int + numGateways int + expectedBlocksLoaded int + }{ + "default sharding strategy, 1 gateway, RF = 1": { + shardingStrategy: util.ShardingStrategyDefault, + replicationFactor: 1, + numGateways: 1, + expectedBlocksLoaded: numBlocks, + }, + "default sharding strategy, 2 gateways, RF = 1": { + shardingStrategy: util.ShardingStrategyDefault, + replicationFactor: 1, + numGateways: 2, + expectedBlocksLoaded: numBlocks, // blocks are sharded across gateways + }, + "default sharding strategy, 3 gateways, RF = 2": { + shardingStrategy: util.ShardingStrategyDefault, + replicationFactor: 2, + numGateways: 3, + expectedBlocksLoaded: 2 * numBlocks, // blocks are replicated 2 times + }, + "default sharding strategy, 5 gateways, RF = 3": { + shardingStrategy: util.ShardingStrategyDefault, + replicationFactor: 3, + numGateways: 5, + expectedBlocksLoaded: 3 * numBlocks, // blocks are replicated 3 times + }, + "shuffle sharding strategy, 1 gateway, RF = 1, SS = 1": { + shardingStrategy: util.ShardingStrategyShuffle, + tenantShardSize: 1, + replicationFactor: 1, + numGateways: 1, + expectedBlocksLoaded: numBlocks, + }, + "shuffle sharding strategy, 5 gateways, RF = 2, SS = 3": { + shardingStrategy: util.ShardingStrategyShuffle, + tenantShardSize: 3, + replicationFactor: 2, + numGateways: 5, + expectedBlocksLoaded: 2 * numBlocks, // blocks are replicated 2 times + }, + "shuffle sharding strategy, 20 gateways, RF = 3, SS = 3": { + shardingStrategy: util.ShardingStrategyShuffle, + tenantShardSize: 3, + replicationFactor: 3, + numGateways: 20, + expectedBlocksLoaded: 3 * numBlocks, // blocks are replicated 3 times + }, + } + + for testName, testData := range tests { + for _, bucketIndexEnabled := range []bool{true, false} { + t.Run(fmt.Sprintf("%s (bucket index enabled = %v)", testName, bucketIndexEnabled), func(t *testing.T) { + // Randomise the seed but log it in case we need to reproduce the test on failure. 
+ seed := time.Now().UnixNano() + rand.Seed(seed) + t.Log("random generator seed:", seed) + + ctx := context.Background() + ringStore, closer := consul.NewInMemoryClientWithConfig(ring.GetCodec(), consul.Config{ + MaxCasRetries: 20, + CasRetryDelay: 500 * time.Millisecond, + }, log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + // Create the configured number of gateways. + var gateways []*StoreGateway + registries := util.NewUserRegistries() + + for i := 1; i <= testData.numGateways; i++ { + instanceID := fmt.Sprintf("gateway-%d", i) + + storageCfg := mockStorageConfig(t) + storageCfg.BucketStore.SyncInterval = time.Hour // Do not trigger the periodic sync in this test. We want the initial sync only. + storageCfg.BucketStore.BucketIndex.Enabled = bucketIndexEnabled + + limits := defaultLimitsConfig() + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingRing.ReplicationFactor = testData.replicationFactor + gatewayCfg.ShardingRing.InstanceID = instanceID + gatewayCfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) + gatewayCfg.ShardingRing.RingCheckPeriod = time.Hour // Do not check the ring topology changes in this test. We want the initial sync only. + gatewayCfg.ShardingRing.WaitStabilityMinDuration = 2 * time.Second + gatewayCfg.ShardingRing.WaitStabilityMaxDuration = 30 * time.Second + gatewayCfg.ShardingEnabled = true + gatewayCfg.ShardingStrategy = testData.shardingStrategy + limits.StoreGatewayTenantShardSize = testData.tenantShardSize + + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(t, err) + + reg := prometheus.NewPedanticRegistry() + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, ringStore, overrides, mockLoggingLevel(), log.NewNopLogger(), reg) + require.NoError(t, err) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + + gateways = append(gateways, g) + registries.AddUserRegistry(instanceID, reg) + } + + // Start all gateways concurrently. 
+ for _, g := range gateways { + require.NoError(t, g.StartAsync(ctx)) + } + + // Wait until all gateways are running. + for _, g := range gateways { + require.NoError(t, g.AwaitRunning(ctx)) + } + + // At this point we expect that all gateways have done the initial sync and + // they have synched only their own blocks, because they waited for a stable + // ring before starting the initial sync. + metrics := registries.BuildMetricFamiliesPerUser() + assert.Equal(t, float64(testData.expectedBlocksLoaded), metrics.GetSumOfGauges("cortex_bucket_store_blocks_loaded")) + assert.Equal(t, float64(2*testData.numGateways), metrics.GetSumOfGauges("cortex_bucket_stores_tenants_discovered")) + + if testData.shardingStrategy == util.ShardingStrategyShuffle { + assert.Equal(t, float64(testData.tenantShardSize*numBlocks), metrics.GetSumOfGauges("cortex_blocks_meta_synced")) + assert.Equal(t, float64(testData.tenantShardSize*numUsers), metrics.GetSumOfGauges("cortex_bucket_stores_tenants_synced")) + } else { + assert.Equal(t, float64(testData.numGateways*numBlocks), metrics.GetSumOfGauges("cortex_blocks_meta_synced")) + assert.Equal(t, float64(testData.numGateways*numUsers), metrics.GetSumOfGauges("cortex_bucket_stores_tenants_synced")) + } + + // We expect that all gateways have only run the initial sync and not the periodic one. + assert.Equal(t, float64(testData.numGateways), metrics.GetSumOfCounters("cortex_storegateway_bucket_sync_total")) + }) + } + } +} + +func TestStoreGateway_BlocksSyncWithDefaultSharding_RingTopologyChangedAfterScaleUp(t *testing.T) { + const ( + numUsers = 2 + numBlocks = numUsers * 12 + shardingStrategy = util.ShardingStrategyDefault + replicationFactor = 3 + numInitialGateways = 4 + numScaleUpGateways = 6 + expectedBlocksLoaded = 3 * numBlocks // blocks are replicated 3 times + ) + + bucketClient, storageDir := tsdb_testutil.PrepareFilesystemBucket(t) + + // This tests uses real TSDB blocks. 
24h time range, 2h block range period, + // 2 users = total (24 / 12) * 2 = 24 blocks. + now := time.Now() + mockTSDB(t, path.Join(storageDir, "user-1"), 24, 12, now.Add(-24*time.Hour).Unix()*1000, now.Unix()*1000) + mockTSDB(t, path.Join(storageDir, "user-2"), 24, 12, now.Add(-24*time.Hour).Unix()*1000, now.Unix()*1000) + + // Write the bucket index. + for _, userID := range []string{"user-1", "user-2"} { + createBucketIndex(t, bucketClient, userID) + } + + // Randomise the seed but log it in case we need to reproduce the test on failure. + seed := time.Now().UnixNano() + rand.Seed(seed) + t.Log("random generator seed:", seed) + + ctx := context.Background() + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + // Create the configured number of gateways. + var initialGateways []*StoreGateway + initialRegistries := util.NewUserRegistries() + allRegistries := util.NewUserRegistries() + + createStoreGateway := func(id int, waitStabilityMin time.Duration) (*StoreGateway, string, *prometheus.Registry) { + instanceID := fmt.Sprintf("gateway-%d", id) + + storageCfg := mockStorageConfig(t) + storageCfg.BucketStore.SyncInterval = time.Hour // Do not trigger the periodic sync in this test. We want it to be triggered by ring topology changed. + storageCfg.BucketStore.BucketIndex.Enabled = true + + limits := defaultLimitsConfig() + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingRing.ReplicationFactor = replicationFactor + gatewayCfg.ShardingRing.InstanceID = instanceID + gatewayCfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", id) + gatewayCfg.ShardingRing.RingCheckPeriod = 100 * time.Millisecond // Check it continuously. Topology will change on scale up. 
+ gatewayCfg.ShardingRing.WaitStabilityMinDuration = waitStabilityMin + gatewayCfg.ShardingRing.WaitStabilityMaxDuration = 30 * time.Second + gatewayCfg.ShardingEnabled = true + gatewayCfg.ShardingStrategy = shardingStrategy + + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(t, err) + + reg := prometheus.NewPedanticRegistry() + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, ringStore, overrides, mockLoggingLevel(), log.NewNopLogger(), reg) + require.NoError(t, err) + + return g, instanceID, reg + } + + for i := 1; i <= numInitialGateways; i++ { + g, instanceID, reg := createStoreGateway(i, 2*time.Second) + initialGateways = append(initialGateways, g) + initialRegistries.AddUserRegistry(instanceID, reg) + allRegistries.AddUserRegistry(instanceID, reg) + } + + // Start all gateways concurrently. + for _, g := range initialGateways { + require.NoError(t, g.StartAsync(ctx)) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + } + + // Wait until all gateways are running. + for _, g := range initialGateways { + require.NoError(t, g.AwaitRunning(ctx)) + } + + // At this point we expect that all gateways have done the initial sync and + // they have synched only their own blocks. + metrics := initialRegistries.BuildMetricFamiliesPerUser() + assert.Equal(t, float64(expectedBlocksLoaded), metrics.GetSumOfGauges("cortex_bucket_store_blocks_loaded")) + assert.Equal(t, float64(2*numInitialGateways), metrics.GetSumOfGauges("cortex_bucket_stores_tenants_discovered")) + + assert.Equal(t, float64(numInitialGateways*numBlocks), metrics.GetSumOfGauges("cortex_blocks_meta_synced")) + assert.Equal(t, float64(numInitialGateways*numUsers), metrics.GetSumOfGauges("cortex_bucket_stores_tenants_synced")) + + // Scale up store-gateways. 
+ var scaleUpGateways []*StoreGateway + scaleUpRegistries := util.NewUserRegistries() + numAllGateways := numInitialGateways + numScaleUpGateways + + for i := numInitialGateways + 1; i <= numAllGateways; i++ { + g, instanceID, reg := createStoreGateway(i, 10*time.Second) // Intentionally high "wait stability min duration". + scaleUpGateways = append(scaleUpGateways, g) + scaleUpRegistries.AddUserRegistry(instanceID, reg) + allRegistries.AddUserRegistry(instanceID, reg) + } + + // Start all new gateways concurrently. + for _, g := range scaleUpGateways { + require.NoError(t, g.StartAsync(ctx)) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + } + + // Since we configured the new store-gateways with a high "wait stability min duration", we expect + them to join the ring at start up (with JOINING state) but then wait at least the min duration + before syncing blocks and becoming ACTIVE. This gives us enough time to check how the initial + store-gateways behave with regards to blocks syncing while other replicas are JOINING. + + // Wait until all the initial store-gateways see all new store-gateways too. + test.Poll(t, 5*time.Second, float64(numAllGateways*numInitialGateways), func() interface{} { + metrics := initialRegistries.BuildMetricFamiliesPerUser() + return metrics.GetSumOfGauges("cortex_ring_members") + }) + + // We expect each block to be available for querying on at least 1 initial store-gateway. 
+ for _, userID := range []string{"user-1", "user-2"} { + idx, err := bucketindex.ReadIndex(ctx, bucketClient, userID, nil, log.NewNopLogger()) + require.NoError(t, err) + + for _, block := range idx.Blocks { + queried := false + + for _, g := range initialGateways { + req := &storepb.SeriesRequest{MinTime: math.MinInt64, MaxTime: math.MaxInt64} + srv := newBucketStoreSeriesServer(setUserIDToGRPCContext(ctx, userID)) + require.NoError(t, g.Series(req, srv)) + + for _, b := range srv.Hints.QueriedBlocks { + if b.Id == block.ID.String() { + queried = true + } + } + } + + assert.True(t, queried, "block %s has been successfully queried on initial store-gateways", block.ID.String()) + } + } + + // Wait until all new gateways are running. + for _, g := range scaleUpGateways { + require.NoError(t, g.AwaitRunning(ctx)) + } + + // At this point the new store-gateways are expected to be ACTIVE in the ring and all the initial + // store-gateways should unload blocks they don't own anymore. + test.Poll(t, 5*time.Second, float64(expectedBlocksLoaded), func() interface{} { + metrics := allRegistries.BuildMetricFamiliesPerUser() + return metrics.GetSumOfGauges("cortex_bucket_store_blocks_loaded") + }) +} + +func TestStoreGateway_ShouldSupportLoadRingTokensFromFile(t *testing.T) { + tests := map[string]struct { + storedTokens ring.Tokens + expectedNumTokens int + }{ + "stored tokens are less than the configured ones": { + storedTokens: generateSortedTokens(RingNumTokens - 10), + expectedNumTokens: RingNumTokens, + }, + "stored tokens are equal to the configured ones": { + storedTokens: generateSortedTokens(RingNumTokens), + expectedNumTokens: RingNumTokens, + }, + "stored tokens are more then the configured ones": { + storedTokens: generateSortedTokens(RingNumTokens + 10), + expectedNumTokens: RingNumTokens + 10, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + tokensFile, err := ioutil.TempFile(os.TempDir(), "tokens-*") + 
require.NoError(t, err) + defer os.Remove(tokensFile.Name()) //nolint:errcheck + + // Store some tokens to the file. + require.NoError(t, testData.storedTokens.StoreToFile(tokensFile.Name())) + + ctx := context.Background() + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingEnabled = true + gatewayCfg.ShardingRing.TokensFilePath = tokensFile.Name() + + storageCfg := mockStorageConfig(t) + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{}, nil) + + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, ringStore, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), nil) + require.NoError(t, err) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + assert.False(t, g.ringLifecycler.IsRegistered()) + + require.NoError(t, services.StartAndAwaitRunning(ctx, g)) + assert.True(t, g.ringLifecycler.IsRegistered()) + assert.Equal(t, ring.ACTIVE, g.ringLifecycler.GetState()) + assert.Len(t, g.ringLifecycler.GetTokens(), testData.expectedNumTokens) + assert.Subset(t, g.ringLifecycler.GetTokens(), testData.storedTokens) + }) + } +} + +func TestStoreGateway_SyncOnRingTopologyChanged(t *testing.T) { + registeredAt := time.Now() + + tests := map[string]struct { + setupRing func(desc *ring.Desc) + updateRing func(desc *ring.Desc) + expectedSync bool + }{ + "should sync when an instance is added to the ring": { + setupRing: func(desc *ring.Desc) { + desc.AddIngester("instance-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) + }, + updateRing: func(desc *ring.Desc) { + desc.AddIngester("instance-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) + }, + expectedSync: true, + }, + "should sync when an instance is removed from the ring": { + setupRing: func(desc *ring.Desc) { + desc.AddIngester("instance-1", "127.0.0.1", "", 
ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) + desc.AddIngester("instance-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) + }, + updateRing: func(desc *ring.Desc) { + desc.RemoveIngester("instance-1") + }, + expectedSync: true, + }, + "should sync when an instance changes state": { + setupRing: func(desc *ring.Desc) { + desc.AddIngester("instance-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) + desc.AddIngester("instance-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.JOINING, registeredAt) + }, + updateRing: func(desc *ring.Desc) { + instance := desc.Ingesters["instance-2"] + instance.State = ring.ACTIVE + desc.Ingesters["instance-2"] = instance + }, + expectedSync: true, + }, + "should sync when an healthy instance becomes unhealthy": { + setupRing: func(desc *ring.Desc) { + desc.AddIngester("instance-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) + desc.AddIngester("instance-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) + }, + updateRing: func(desc *ring.Desc) { + instance := desc.Ingesters["instance-2"] + instance.Timestamp = time.Now().Add(-time.Hour).Unix() + desc.Ingesters["instance-2"] = instance + }, + expectedSync: true, + }, + "should sync when an unhealthy instance becomes healthy": { + setupRing: func(desc *ring.Desc) { + desc.AddIngester("instance-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) + + instance := desc.AddIngester("instance-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) + instance.Timestamp = time.Now().Add(-time.Hour).Unix() + desc.Ingesters["instance-2"] = instance + }, + updateRing: func(desc *ring.Desc) { + instance := desc.Ingesters["instance-2"] + instance.Timestamp = time.Now().Unix() + desc.Ingesters["instance-2"] = instance + }, + expectedSync: true, + }, + "should NOT sync when an instance updates the heartbeat": { + setupRing: func(desc *ring.Desc) { + desc.AddIngester("instance-1", 
"127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) + desc.AddIngester("instance-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) + }, + updateRing: func(desc *ring.Desc) { + instance := desc.Ingesters["instance-2"] + instance.Timestamp = time.Now().Add(time.Second).Unix() + desc.Ingesters["instance-2"] = instance + }, + expectedSync: false, + }, + "should NOT sync when an instance is auto-forgotten in the ring but was already unhealthy in the previous state": { + setupRing: func(desc *ring.Desc) { + desc.AddIngester("instance-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) + desc.AddIngester("instance-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) + + // Set it already unhealthy. + instance := desc.Ingesters["instance-2"] + instance.Timestamp = time.Now().Add(-time.Hour).Unix() + desc.Ingesters["instance-2"] = instance + }, + updateRing: func(desc *ring.Desc) { + // Remove the unhealthy instance from the ring. + desc.RemoveIngester("instance-2") + }, + expectedSync: false, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + ctx := context.Background() + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingEnabled = true + gatewayCfg.ShardingRing.RingCheckPeriod = 100 * time.Millisecond + + storageCfg := mockStorageConfig(t) + storageCfg.BucketStore.SyncInterval = time.Hour // Do not trigger the periodic sync in this test. + + reg := prometheus.NewPedanticRegistry() + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{}, nil) + + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, ringStore, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), reg) + require.NoError(t, err) + + // Store the initial ring state before starting the gateway. 
+ require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) { + ringDesc := ring.GetOrCreateRingDesc(in) + testData.setupRing(ringDesc) + return ringDesc, true, nil + })) + + require.NoError(t, services.StartAndAwaitRunning(ctx, g)) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + + // Assert on the initial state. + regs := util.NewUserRegistries() + regs.AddUserRegistry("test", reg) + metrics := regs.BuildMetricFamiliesPerUser() + assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_storegateway_bucket_sync_total")) + + // Change the ring topology. + require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) { + ringDesc := ring.GetOrCreateRingDesc(in) + testData.updateRing(ringDesc) + return ringDesc, true, nil + })) + + // Assert whether the sync triggered or not. + if testData.expectedSync { + test.Poll(t, time.Second, float64(2), func() interface{} { + metrics := regs.BuildMetricFamiliesPerUser() + return metrics.GetSumOfCounters("cortex_storegateway_bucket_sync_total") + }) + } else { + // Give some time to the store-gateway to trigger the sync (if any). 
+ time.Sleep(250 * time.Millisecond) + + metrics := regs.BuildMetricFamiliesPerUser() + assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_storegateway_bucket_sync_total")) + } + }) + } +} + +func TestStoreGateway_RingLifecyclerShouldAutoForgetUnhealthyInstances(t *testing.T) { + const unhealthyInstanceID = "unhealthy-id" + const heartbeatTimeout = time.Minute + + ctx := context.Background() + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingEnabled = true + gatewayCfg.ShardingRing.HeartbeatPeriod = 100 * time.Millisecond + gatewayCfg.ShardingRing.HeartbeatTimeout = heartbeatTimeout + + storageCfg := mockStorageConfig(t) + + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{}, nil) + + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, ringStore, defaultLimitsOverrides(t), mockLoggingLevel(), log.NewNopLogger(), nil) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(ctx, g)) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + + // Add an unhealthy instance to the ring. + require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) { + ringDesc := ring.GetOrCreateRingDesc(in) + + instance := ringDesc.AddIngester(unhealthyInstanceID, "1.1.1.1", "", generateSortedTokens(RingNumTokens), ring.ACTIVE, time.Now()) + instance.Timestamp = time.Now().Add(-(ringAutoForgetUnhealthyPeriods + 1) * heartbeatTimeout).Unix() + ringDesc.Ingesters[unhealthyInstanceID] = instance + + return ringDesc, true, nil + })) + + // Ensure the unhealthy instance is removed from the ring. 
+ test.Poll(t, time.Second, false, func() interface{} { + d, err := ringStore.Get(ctx, RingKey) + if err != nil { + return err + } + + _, ok := ring.GetOrCreateRingDesc(d).Ingesters[unhealthyInstanceID] + return ok + }) +} + +func TestStoreGateway_SeriesQueryingShouldRemoveExternalLabels(t *testing.T) { + ctx := context.Background() + logger := log.NewNopLogger() + userID := "user-1" + + storageDir, err := ioutil.TempDir(os.TempDir(), "") + require.NoError(t, err) + defer os.RemoveAll(storageDir) //nolint:errcheck + + // Generate 2 TSDB blocks with the same exact series (and data points). + numSeries := 2 + now := time.Now() + minT := now.Add(-1*time.Hour).Unix() * 1000 + maxT := now.Unix() * 1000 + step := (maxT - minT) / int64(numSeries) + mockTSDB(t, path.Join(storageDir, userID), numSeries, 0, minT, maxT) + mockTSDB(t, path.Join(storageDir, userID), numSeries, 0, minT, maxT) + + bucketClient, err := filesystem.NewBucketClient(filesystem.Config{Directory: storageDir}) + require.NoError(t, err) + + createBucketIndex(t, bucketClient, userID) + + // Find the created blocks (we expect 2). + var blockIDs []string + require.NoError(t, bucketClient.Iter(ctx, "user-1/", func(key string) error { + if _, ok := block.IsBlockDir(key); ok { + blockIDs = append(blockIDs, strings.TrimSuffix(strings.TrimPrefix(key, userID+"/"), "/")) + } + return nil + })) + require.Len(t, blockIDs, 2) + + // Inject different external labels for each block. 
+ for idx, blockID := range blockIDs { + meta := metadata.Thanos{ + Labels: map[string]string{ + cortex_tsdb.TenantIDExternalLabel: userID, + cortex_tsdb.IngesterIDExternalLabel: fmt.Sprintf("ingester-%d", idx), + cortex_tsdb.ShardIDExternalLabel: fmt.Sprintf("shard-%d", idx), + }, + Source: metadata.TestSource, + } + + _, err := metadata.InjectThanos(logger, filepath.Join(storageDir, userID, blockID), meta, nil) + require.NoError(t, err) + } + + for _, bucketIndexEnabled := range []bool{true, false} { + t.Run(fmt.Sprintf("bucket index enabled = %v", bucketIndexEnabled), func(t *testing.T) { + // Create a store-gateway used to query back the series from the blocks. + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingEnabled = false + storageCfg := mockStorageConfig(t) + storageCfg.BucketStore.BucketIndex.Enabled = bucketIndexEnabled + + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, nil, defaultLimitsOverrides(t), mockLoggingLevel(), logger, nil) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(ctx, g)) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + + // Query back all series. + req := &storepb.SeriesRequest{ + MinTime: minT, + MaxTime: maxT, + Matchers: []storepb.LabelMatcher{ + {Type: storepb.LabelMatcher_RE, Name: "__name__", Value: ".*"}, + }, + } + + srv := newBucketStoreSeriesServer(setUserIDToGRPCContext(ctx, userID)) + err = g.Series(req, srv) + require.NoError(t, err) + assert.Empty(t, srv.Warnings) + assert.Len(t, srv.SeriesSet, numSeries) + + for seriesID := 0; seriesID < numSeries; seriesID++ { + actual := srv.SeriesSet[seriesID] + + // Ensure Cortex external labels have been removed. + assert.Equal(t, []labelpb.ZLabel{{Name: "series_id", Value: strconv.Itoa(seriesID)}}, actual.Labels) + + // Ensure samples have been correctly queried. The Thanos store also deduplicate samples + // in most cases, but it's not strictly required guaranteeing deduplication at this stage. 
+ samples, err := readSamplesFromChunks(actual.Chunks) + require.NoError(t, err) + assert.Equal(t, []sample{ + {ts: minT + (step * int64(seriesID)), value: float64(seriesID)}, + }, samples) + } + }) + } +} + +func TestStoreGateway_SeriesQueryingShouldEnforceMaxChunksPerQueryLimit(t *testing.T) { + const chunksQueried = 10 + + tests := map[string]struct { + limit int + expectedErr error + }{ + "no limit enforced if zero": { + limit: 0, + expectedErr: nil, + }, + "should return NO error if the actual number of queried chunks is <= limit": { + limit: chunksQueried, + expectedErr: nil, + }, + "should return error if the actual number of queried chunks is > limit": { + limit: chunksQueried - 1, + expectedErr: status.Error(http.StatusUnprocessableEntity, fmt.Sprintf("exceeded chunks limit: rpc error: code = Code(422) desc = limit %d violated (got %d)", chunksQueried-1, chunksQueried)), + }, + } + + ctx := context.Background() + logger := log.NewNopLogger() + userID := "user-1" + + storageDir, err := ioutil.TempDir(os.TempDir(), "") + require.NoError(t, err) + defer os.RemoveAll(storageDir) //nolint:errcheck + + // Generate 1 TSDB block with chunksQueried series. Since each mocked series contains only 1 sample, + // it will also only have 1 chunk. + now := time.Now() + minT := now.Add(-1*time.Hour).Unix() * 1000 + maxT := now.Unix() * 1000 + mockTSDB(t, path.Join(storageDir, userID), chunksQueried, 0, minT, maxT) + + bucketClient, err := filesystem.NewBucketClient(filesystem.Config{Directory: storageDir}) + require.NoError(t, err) + + // Prepare the request to query back all series (1 chunk per series in this test). + req := &storepb.SeriesRequest{ + MinTime: minT, + MaxTime: maxT, + Matchers: []storepb.LabelMatcher{ + {Type: storepb.LabelMatcher_RE, Name: "__name__", Value: ".*"}, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + // Customise the limits. 
+ limits := defaultLimitsConfig() + limits.MaxChunksPerQueryFromStore = testData.limit + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(t, err) + + // Create a store-gateway used to query back the series from the blocks. + gatewayCfg := mockGatewayConfig() + gatewayCfg.ShardingEnabled = false + storageCfg := mockStorageConfig(t) + + g, err := newStoreGateway(gatewayCfg, storageCfg, bucketClient, nil, overrides, mockLoggingLevel(), logger, nil) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(ctx, g)) + defer services.StopAndAwaitTerminated(ctx, g) //nolint:errcheck + + // Query back all the series (1 chunk per series in this test). + srv := newBucketStoreSeriesServer(setUserIDToGRPCContext(ctx, userID)) + err = g.Series(req, srv) + + if testData.expectedErr != nil { + require.Error(t, err) + assert.IsType(t, testData.expectedErr, err) + s1, ok := status.FromError(errors.Cause(err)) + assert.True(t, ok) + s2, ok := status.FromError(errors.Cause(testData.expectedErr)) + assert.True(t, ok) + assert.True(t, strings.Contains(s1.Message(), s2.Message())) + assert.Equal(t, s1.Code(), s2.Code()) + } else { + require.NoError(t, err) + assert.Empty(t, srv.Warnings) + assert.Len(t, srv.SeriesSet, chunksQueried) + } + }) + } +} + +func mockGatewayConfig() Config { + cfg := Config{} + flagext.DefaultValues(&cfg) + + cfg.ShardingRing.InstanceID = "test" + cfg.ShardingRing.InstanceAddr = "127.0.0.1" + cfg.ShardingRing.WaitStabilityMinDuration = 0 + cfg.ShardingRing.WaitStabilityMaxDuration = 0 + + return cfg +} + +func mockStorageConfig(t *testing.T) cortex_tsdb.BlocksStorageConfig { + tmpDir, err := ioutil.TempDir(os.TempDir(), "store-gateway-test-*") + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, os.RemoveAll(tmpDir)) + }) + + cfg := cortex_tsdb.BlocksStorageConfig{} + flagext.DefaultValues(&cfg) + + cfg.BucketStore.ConsistencyDelay = 0 + cfg.BucketStore.SyncDir = tmpDir + + return cfg +} + +// mockTSDB 
create 1+ TSDB blocks storing numSeries of series, each series +// with 1 sample and its timestamp evenly distributed between minT and maxT. +// If numBlocks > 0, then it uses numSeries only to find the distribution of +// samples. +func mockTSDB(t *testing.T, dir string, numSeries, numBlocks int, minT, maxT int64) { + // Create a new TSDB on a temporary directory. The blocks + // will be then snapshotted to the input dir. + tempDir, err := ioutil.TempDir(os.TempDir(), "tsdb") + require.NoError(t, err) + defer os.RemoveAll(tempDir) //nolint:errcheck + + db, err := tsdb.Open(tempDir, nil, nil, &tsdb.Options{ + MinBlockDuration: 2 * time.Hour.Milliseconds(), + MaxBlockDuration: 2 * time.Hour.Milliseconds(), + RetentionDuration: 15 * 24 * time.Hour.Milliseconds(), + }, nil) + require.NoError(t, err) + + db.DisableCompactions() + + step := (maxT - minT) / int64(numSeries) + addSample := func(i int) { + lbls := labels.Labels{labels.Label{Name: "series_id", Value: strconv.Itoa(i)}} + + app := db.Appender(context.Background()) + _, err := app.Append(0, lbls, minT+(step*int64(i)), float64(i)) + require.NoError(t, err) + require.NoError(t, app.Commit()) + require.NoError(t, db.Compact()) + } + if numBlocks > 0 { + i := 0 + // Snapshot adds another block. Hence numBlocks-1. + for len(db.Blocks()) < numBlocks-1 { + addSample(i) + i++ + } + } else { + for i := 0; i < numSeries; i++ { + addSample(i) + } + } + + require.NoError(t, db.Snapshot(dir, true)) + + require.NoError(t, db.Close()) +} + +func generateSortedTokens(numTokens int) ring.Tokens { + tokens := ring.GenerateTokens(numTokens, nil) + + // Ensure generated tokens are sorted. 
+ sort.Slice(tokens, func(i, j int) bool { + return tokens[i] < tokens[j] + }) + + return ring.Tokens(tokens) +} + +func readSamplesFromChunks(rawChunks []storepb.AggrChunk) ([]sample, error) { + var samples []sample + + for _, rawChunk := range rawChunks { + c, err := chunkenc.FromData(chunkenc.EncXOR, rawChunk.Raw.Data) + if err != nil { + return nil, err + } + + it := c.Iterator(nil) + for it.Next() { + if it.Err() != nil { + return nil, it.Err() + } + + ts, v := it.At() + samples = append(samples, sample{ + ts: ts, + value: v, + }) + } + + if it.Err() != nil { + return nil, it.Err() + } + } + + return samples, nil +} + +type sample struct { + ts int64 + value float64 +} + +func defaultLimitsConfig() validation.Limits { + limits := validation.Limits{} + flagext.DefaultValues(&limits) + return limits +} + +func defaultLimitsOverrides(t *testing.T) *validation.Overrides { + overrides, err := validation.NewOverrides(defaultLimitsConfig(), nil) + require.NoError(t, err) + + return overrides +} + +type mockShardingStrategy struct { + mock.Mock +} + +func (m *mockShardingStrategy) FilterUsers(ctx context.Context, userIDs []string) []string { + args := m.Called(ctx, userIDs) + return args.Get(0).([]string) +} + +func (m *mockShardingStrategy) FilterBlocks(ctx context.Context, userID string, metas map[ulid.ULID]*metadata.Meta, loaded map[ulid.ULID]struct{}, synced *extprom.TxGaugeVec) error { + args := m.Called(ctx, userID, metas, loaded, synced) + return args.Error(0) +} + +func createBucketIndex(t *testing.T, bkt objstore.Bucket, userID string) *bucketindex.Index { + updater := bucketindex.NewUpdater(bkt, userID, nil, log.NewNopLogger()) + idx, _, err := updater.UpdateIndex(context.Background(), nil) + require.NoError(t, err) + require.NoError(t, bucketindex.WriteIndex(context.Background(), bkt, userID, nil, idx)) + + return idx +} diff --git a/pkg/storegateway/metadata_fetcher_filters_test.go b/pkg/storegateway/metadata_fetcher_filters_test.go new file mode 100644 
index 0000000000000..7df4d4888822f --- /dev/null +++ b/pkg/storegateway/metadata_fetcher_filters_test.go @@ -0,0 +1,107 @@ +package storegateway + +import ( + "bytes" + "context" + "encoding/json" + "path" + "testing" + "time" + + "github.com/cortexproject/cortex/pkg/storage/bucket" + "github.com/cortexproject/cortex/pkg/storage/tsdb/bucketindex" + "github.com/go-kit/log" + "github.com/oklog/ulid" + "github.com/prometheus/client_golang/prometheus" + promtest "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/block" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/extprom" + "github.com/thanos-io/thanos/pkg/objstore" + + tsdb_testutil "github.com/grafana/loki/pkg/storage/tsdb/testutil" +) + +func TestIgnoreDeletionMarkFilter_Filter(t *testing.T) { + testIgnoreDeletionMarkFilter(t, false) +} + +func TestIgnoreDeletionMarkFilter_FilterWithBucketIndex(t *testing.T) { + testIgnoreDeletionMarkFilter(t, true) +} + +func testIgnoreDeletionMarkFilter(t *testing.T, bucketIndexEnabled bool) { + const userID = "user-1" + + now := time.Now() + ctx := context.Background() + logger := log.NewNopLogger() + + // Create a bucket backed by filesystem. 
+ bkt, _ := tsdb_testutil.PrepareFilesystemBucket(t) + bkt = bucketindex.BucketWithGlobalMarkers(bkt) + userBkt := bucket.NewUserBucketClient(userID, bkt, nil) + + shouldFetch := &metadata.DeletionMark{ + ID: ulid.MustNew(1, nil), + DeletionTime: now.Add(-15 * time.Hour).Unix(), + Version: 1, + } + + shouldIgnore := &metadata.DeletionMark{ + ID: ulid.MustNew(2, nil), + DeletionTime: now.Add(-60 * time.Hour).Unix(), + Version: 1, + } + + var buf bytes.Buffer + require.NoError(t, json.NewEncoder(&buf).Encode(&shouldFetch)) + require.NoError(t, userBkt.Upload(ctx, path.Join(shouldFetch.ID.String(), metadata.DeletionMarkFilename), &buf)) + require.NoError(t, json.NewEncoder(&buf).Encode(&shouldIgnore)) + require.NoError(t, userBkt.Upload(ctx, path.Join(shouldIgnore.ID.String(), metadata.DeletionMarkFilename), &buf)) + require.NoError(t, userBkt.Upload(ctx, path.Join(ulid.MustNew(3, nil).String(), metadata.DeletionMarkFilename), bytes.NewBufferString("not a valid deletion-mark.json"))) + + // Create the bucket index if required. 
+ var idx *bucketindex.Index + if bucketIndexEnabled { + var err error + + u := bucketindex.NewUpdater(bkt, userID, nil, logger) + idx, _, err = u.UpdateIndex(ctx, nil) + require.NoError(t, err) + require.NoError(t, bucketindex.WriteIndex(ctx, bkt, userID, nil, idx)) + } + + inputMetas := map[ulid.ULID]*metadata.Meta{ + ulid.MustNew(1, nil): {}, + ulid.MustNew(2, nil): {}, + ulid.MustNew(3, nil): {}, + ulid.MustNew(4, nil): {}, + } + + expectedMetas := map[ulid.ULID]*metadata.Meta{ + ulid.MustNew(1, nil): {}, + ulid.MustNew(3, nil): {}, + ulid.MustNew(4, nil): {}, + } + + expectedDeletionMarks := map[ulid.ULID]*metadata.DeletionMark{ + ulid.MustNew(1, nil): shouldFetch, + ulid.MustNew(2, nil): shouldIgnore, + } + + synced := extprom.NewTxGaugeVec(nil, prometheus.GaugeOpts{Name: "synced"}, []string{"state"}) + f := NewIgnoreDeletionMarkFilter(logger, objstore.WithNoopInstr(userBkt), 48*time.Hour, 32) + + if bucketIndexEnabled { + require.NoError(t, f.FilterWithBucketIndex(ctx, inputMetas, idx, synced)) + } else { + require.NoError(t, f.Filter(ctx, inputMetas, synced)) + } + + assert.Equal(t, 1.0, promtest.ToFloat64(synced.WithLabelValues(block.MarkedForDeletionMeta))) + assert.Equal(t, expectedMetas, inputMetas) + assert.Equal(t, expectedDeletionMarks, f.DeletionMarkBlocks()) +} diff --git a/pkg/storegateway/metadata_fetcher_metrics_test.go b/pkg/storegateway/metadata_fetcher_metrics_test.go new file mode 100644 index 0000000000000..ea028cb07cd3f --- /dev/null +++ b/pkg/storegateway/metadata_fetcher_metrics_test.go @@ -0,0 +1,111 @@ +package storegateway + +import ( + "bytes" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/require" +) + +func TestMetadataFetcherMetrics(t *testing.T) { + mainReg := prometheus.NewPedanticRegistry() + + metrics := NewMetadataFetcherMetrics() + 
mainReg.MustRegister(metrics) + + metrics.AddUserRegistry("user1", populateMetadataFetcherMetrics(3)) + metrics.AddUserRegistry("user2", populateMetadataFetcherMetrics(5)) + metrics.AddUserRegistry("user3", populateMetadataFetcherMetrics(7)) + + //noinspection ALL + err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` + # HELP cortex_blocks_meta_sync_duration_seconds Duration of the blocks metadata synchronization in seconds + # TYPE cortex_blocks_meta_sync_duration_seconds histogram + cortex_blocks_meta_sync_duration_seconds_bucket{le="0.01"} 0 + cortex_blocks_meta_sync_duration_seconds_bucket{le="1"} 0 + cortex_blocks_meta_sync_duration_seconds_bucket{le="10"} 3 + cortex_blocks_meta_sync_duration_seconds_bucket{le="100"} 3 + cortex_blocks_meta_sync_duration_seconds_bucket{le="1000"} 3 + cortex_blocks_meta_sync_duration_seconds_bucket{le="+Inf"} 3 + cortex_blocks_meta_sync_duration_seconds_sum 9 + cortex_blocks_meta_sync_duration_seconds_count 3 + + # HELP cortex_blocks_meta_sync_failures_total Total blocks metadata synchronization failures + # TYPE cortex_blocks_meta_sync_failures_total counter + cortex_blocks_meta_sync_failures_total 30 + + # HELP cortex_blocks_meta_syncs_total Total blocks metadata synchronization attempts + # TYPE cortex_blocks_meta_syncs_total counter + cortex_blocks_meta_syncs_total 15 + + # HELP cortex_blocks_meta_sync_consistency_delay_seconds Configured consistency delay in seconds. + # TYPE cortex_blocks_meta_sync_consistency_delay_seconds gauge + cortex_blocks_meta_sync_consistency_delay_seconds 300 + + # HELP cortex_blocks_meta_synced Reflects current state of synced blocks (over all tenants). 
+ # TYPE cortex_blocks_meta_synced gauge + cortex_blocks_meta_synced{state="corrupted-meta-json"} 75 + cortex_blocks_meta_synced{state="loaded"} 90 + cortex_blocks_meta_synced{state="too-fresh"} 105 +`)) + require.NoError(t, err) +} + +func populateMetadataFetcherMetrics(base float64) *prometheus.Registry { + reg := prometheus.NewRegistry() + m := newMetadataFetcherMetricsMock(reg) + + m.syncs.Add(base * 1) + m.syncFailures.Add(base * 2) + m.syncDuration.Observe(3) + m.syncConsistencyDelay.Set(300) + + m.synced.WithLabelValues("corrupted-meta-json").Set(base * 5) + m.synced.WithLabelValues("loaded").Set(base * 6) + m.synced.WithLabelValues("too-fresh").Set(base * 7) + + return reg +} + +type metadataFetcherMetricsMock struct { + syncs prometheus.Counter + syncFailures prometheus.Counter + syncDuration prometheus.Histogram + syncConsistencyDelay prometheus.Gauge + synced *prometheus.GaugeVec +} + +func newMetadataFetcherMetricsMock(reg prometheus.Registerer) *metadataFetcherMetricsMock { + var m metadataFetcherMetricsMock + + m.syncs = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Subsystem: "blocks_meta", + Name: "syncs_total", + Help: "Total blocks metadata synchronization attempts", + }) + m.syncFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Subsystem: "blocks_meta", + Name: "sync_failures_total", + Help: "Total blocks metadata synchronization failures", + }) + m.syncDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ + Subsystem: "blocks_meta", + Name: "sync_duration_seconds", + Help: "Duration of the blocks metadata synchronization in seconds", + Buckets: []float64{0.01, 1, 10, 100, 1000}, + }) + m.syncConsistencyDelay = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Name: "consistency_delay_seconds", + Help: "Configured consistency delay in seconds.", + }) + m.synced = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: "blocks_meta", + Name: "synced", + Help: "Number of block metadata synced", 
+ }, []string{"state"}) + + return &m +} diff --git a/pkg/storegateway/partitioner_test.go b/pkg/storegateway/partitioner_test.go new file mode 100644 index 0000000000000..ab659374973c1 --- /dev/null +++ b/pkg/storegateway/partitioner_test.go @@ -0,0 +1,58 @@ +package storegateway + +import ( + "bytes" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/store" +) + +func TestGapBasedPartitioner_Partition(t *testing.T) { + reg := prometheus.NewPedanticRegistry() + p := newGapBasedPartitioner(10, reg) + + parts := p.Partition(5, func(i int) (uint64, uint64) { + switch i { + case 0: + return 10, 12 + case 1: + return 15, 18 + case 2: + return 22, 27 + case 3: + return 38, 41 + case 4: + return 50, 52 + default: + return 0, 0 + } + }) + + expected := []store.Part{ + {Start: 10, End: 27, ElemRng: [2]int{0, 3}}, + {Start: 38, End: 52, ElemRng: [2]int{3, 5}}, + } + require.Equal(t, expected, parts) + + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` + # HELP cortex_bucket_store_partitioner_requested_bytes_total Total size of byte ranges required to fetch from the storage before they are passed to the partitioner. + # TYPE cortex_bucket_store_partitioner_requested_bytes_total counter + cortex_bucket_store_partitioner_requested_bytes_total 15 + + # HELP cortex_bucket_store_partitioner_requested_ranges_total Total number of byte ranges required to fetch from the storage before they are passed to the partitioner. + # TYPE cortex_bucket_store_partitioner_requested_ranges_total counter + cortex_bucket_store_partitioner_requested_ranges_total 5 + + # HELP cortex_bucket_store_partitioner_expanded_bytes_total Total size of byte ranges returned by the partitioner after they've been combined together to reduce the number of bucket API calls. 
+ # TYPE cortex_bucket_store_partitioner_expanded_bytes_total counter + cortex_bucket_store_partitioner_expanded_bytes_total 31 + + # HELP cortex_bucket_store_partitioner_expanded_ranges_total Total number of byte ranges returned by the partitioner after they've been combined together to reduce the number of bucket API calls. + # TYPE cortex_bucket_store_partitioner_expanded_ranges_total counter + cortex_bucket_store_partitioner_expanded_ranges_total 2 + `))) +} diff --git a/pkg/storegateway/sharding_strategy_test.go b/pkg/storegateway/sharding_strategy_test.go new file mode 100644 index 0000000000000..9ba549f62752e --- /dev/null +++ b/pkg/storegateway/sharding_strategy_test.go @@ -0,0 +1,670 @@ +package storegateway + +import ( + "context" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" + "github.com/grafana/dskit/services" + "github.com/oklog/ulid" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/extprom" + + cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" +) + +func TestDefaultShardingStrategy(t *testing.T) { + // The following block IDs have been picked to have increasing hash values + // in order to simplify the tests. 
+ block1 := ulid.MustNew(1, nil) // hash: 283204220 + block2 := ulid.MustNew(2, nil) // hash: 444110359 + block3 := ulid.MustNew(5, nil) // hash: 2931974232 + block4 := ulid.MustNew(6, nil) // hash: 3092880371 + numAllBlocks := 4 + + block1Hash := cortex_tsdb.HashBlockID(block1) + block2Hash := cortex_tsdb.HashBlockID(block2) + block3Hash := cortex_tsdb.HashBlockID(block3) + block4Hash := cortex_tsdb.HashBlockID(block4) + + registeredAt := time.Now() + + tests := map[string]struct { + replicationFactor int + zoneAwarenessEnabled bool + setupRing func(*ring.Desc) + expectedBlocks map[string][]ulid.ULID + }{ + "one ACTIVE instance in the ring with replication factor = 1": { + replicationFactor: 1, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{0}, ring.ACTIVE, registeredAt) + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block2, block3, block4}, + "127.0.0.2": {}, + }, + }, + "two ACTIVE instances in the ring with replication factor = 1": { + replicationFactor: 1, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1, block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block3}, + "127.0.0.2": {block2, block4}, + }, + }, + "one ACTIVE instance in the ring with replication factor = 2": { + replicationFactor: 2, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{0}, ring.ACTIVE, registeredAt) + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block2, block3, block4}, + "127.0.0.2": {}, + }, + }, + "two ACTIVE instances in the ring with replication factor = 2": { + replicationFactor: 2, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, 
registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1, block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block2, block3, block4}, + "127.0.0.2": {block1, block2, block3, block4}, + }, + }, + "multiple ACTIVE instances in the ring with replication factor = 2": { + replicationFactor: 2, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-3", "127.0.0.3", "", []uint32{block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block3 /* replicated: */, block2, block4}, + "127.0.0.2": {block2 /* replicated: */, block1}, + "127.0.0.3": {block4 /* replicated: */, block3}, + }, + }, + "multiple ACTIVE instances in the ring with replication factor = 2 and zone-awareness enabled": { + replicationFactor: 2, + zoneAwarenessEnabled: true, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "zone-a", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "zone-a", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-3", "127.0.0.3", "zone-b", []uint32{block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block3, block4}, + "127.0.0.2": {block2}, + "127.0.0.3": {block1, block2, block3, block4}, + }, + }, + "one unhealthy instance in the ring with replication factor = 1": { + replicationFactor: 1, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + + 
r.Ingesters["instance-3"] = ring.InstanceDesc{ + Addr: "127.0.0.3", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block4Hash + 1}, + } + }, + expectedBlocks: map[string][]ulid.ULID{ + // No shard has the blocks of the unhealthy instance. + "127.0.0.1": {block1, block3}, + "127.0.0.2": {block2}, + "127.0.0.3": {}, + }, + }, + "one unhealthy instance in the ring with replication factor = 2": { + replicationFactor: 2, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + + r.Ingesters["instance-3"] = ring.InstanceDesc{ + Addr: "127.0.0.3", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block4Hash + 1}, + } + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block3 /* replicated: */, block2, block4}, + "127.0.0.2": {block2 /* replicated: */, block1}, + "127.0.0.3": {}, + }, + }, + "two unhealthy instances in the ring with replication factor = 2": { + replicationFactor: 2, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1}, ring.ACTIVE, registeredAt) + + r.Ingesters["instance-2"] = ring.InstanceDesc{ + Addr: "127.0.0.2", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block2Hash + 1, block3Hash + 1}, + } + + r.Ingesters["instance-3"] = ring.InstanceDesc{ + Addr: "127.0.0.3", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block4Hash + 1}, + } + }, + expectedBlocks: map[string][]ulid.ULID{ + // There may be some blocks missing depending if there are shared blocks + // between the two unhealthy nodes. 
+ "127.0.0.1": {block1 /* replicated: */, block4}, + "127.0.0.2": {}, + "127.0.0.3": {}, + }, + }, + "two unhealthy instances in the ring with replication factor = 3": { + replicationFactor: 3, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + + r.Ingesters["instance-3"] = ring.InstanceDesc{ + Addr: "127.0.0.3", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block3Hash + 1}, + } + + r.Ingesters["instance-4"] = ring.InstanceDesc{ + Addr: "127.0.0.4", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block4Hash + 1}, + } + }, + expectedBlocks: map[string][]ulid.ULID{ + // There may be some blocks missing depending if there are shared blocks + // between the two unhealthy nodes. + "127.0.0.1": {block1 /* replicated: */, block3, block4}, + "127.0.0.2": {block2 /* replicated: */, block1, block4}, + "127.0.0.3": {}, + "127.0.0.4": {}, + }, + }, + "LEAVING instance in the ring should continue to keep its shard blocks but they should also be replicated to another instance": { + replicationFactor: 1, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-3", "127.0.0.3", "", []uint32{block4Hash + 1}, ring.LEAVING, registeredAt) + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block3 /* replicated: */, block4}, + "127.0.0.2": {block2}, + "127.0.0.3": {block4}, + }, + }, + "JOINING instance in the ring should get its shard blocks and they should not be replicated to another instance": { + replicationFactor: 1, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", 
"127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-3", "127.0.0.3", "", []uint32{block4Hash + 1}, ring.JOINING, registeredAt) + }, + expectedBlocks: map[string][]ulid.ULID{ + "127.0.0.1": {block1, block3}, + "127.0.0.2": {block2}, + "127.0.0.3": {block4}, + }, + }, + } + + for testName, testData := range tests { + testName := testName + testData := testData + + t.Run(testName, func(t *testing.T) { + t.Parallel() + + ctx := context.Background() + store, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + // Initialize the ring state. + require.NoError(t, store.CAS(ctx, "test", func(in interface{}) (interface{}, bool, error) { + d := ring.NewDesc() + testData.setupRing(d) + return d, true, nil + })) + + cfg := ring.Config{ + ReplicationFactor: testData.replicationFactor, + HeartbeatTimeout: time.Minute, + ZoneAwarenessEnabled: testData.zoneAwarenessEnabled, + } + + r, err := ring.NewWithStoreClientAndStrategy(cfg, "test", "test", store, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), nil, nil) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(ctx, r)) + defer services.StopAndAwaitTerminated(ctx, r) //nolint:errcheck + + // Wait until the ring client has synced. 
+ require.NoError(t, ring.WaitInstanceState(ctx, r, "instance-1", ring.ACTIVE)) + + for instanceAddr, expectedBlocks := range testData.expectedBlocks { + filter := NewDefaultShardingStrategy(r, instanceAddr, log.NewNopLogger()) + synced := extprom.NewTxGaugeVec(nil, prometheus.GaugeOpts{}, []string{"state"}) + synced.WithLabelValues(shardExcludedMeta).Set(0) + + metas := map[ulid.ULID]*metadata.Meta{ + block1: {}, + block2: {}, + block3: {}, + block4: {}, + } + + err = filter.FilterBlocks(ctx, "user-1", metas, map[ulid.ULID]struct{}{}, synced) + require.NoError(t, err) + + var actualBlocks []ulid.ULID + for id := range metas { + actualBlocks = append(actualBlocks, id) + } + + assert.ElementsMatch(t, expectedBlocks, actualBlocks) + + // Assert on the metric used to keep track of the blocks filtered out. + synced.Submit() + assert.Equal(t, float64(numAllBlocks-len(testData.expectedBlocks[instanceAddr])), testutil.ToFloat64(synced)) + } + }) + } +} + +func TestShuffleShardingStrategy(t *testing.T) { + // The following block IDs have been picked to have increasing hash values + // in order to simplify the tests. 
+ block1 := ulid.MustNew(1, nil) // hash: 283204220 + block2 := ulid.MustNew(2, nil) // hash: 444110359 + block3 := ulid.MustNew(5, nil) // hash: 2931974232 + block4 := ulid.MustNew(6, nil) // hash: 3092880371 + numAllBlocks := 4 + + block1Hash := cortex_tsdb.HashBlockID(block1) + block2Hash := cortex_tsdb.HashBlockID(block2) + block3Hash := cortex_tsdb.HashBlockID(block3) + block4Hash := cortex_tsdb.HashBlockID(block4) + + userID := "user-A" + registeredAt := time.Now() + + type usersExpectation struct { + instanceID string + instanceAddr string + users []string + } + + type blocksExpectation struct { + instanceID string + instanceAddr string + blocks []ulid.ULID + } + + tests := map[string]struct { + replicationFactor int + limits ShardingLimits + setupRing func(*ring.Desc) + expectedUsers []usersExpectation + expectedBlocks []blocksExpectation + }{ + "one ACTIVE instance in the ring with RF = 1 and SS = 1": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 1}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{0}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: nil}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, block3, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{}}, + }, + }, + "one ACTIVE instance in the ring with RF = 2 and SS = 1 (should still sync blocks on the only available instance)": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 1}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{0}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", 
users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: nil}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, block3, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{}}, + }, + }, + "one ACTIVE instance in the ring with RF = 2 and SS = 2 (should still sync blocks on the only available instance)": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 2}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{0}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: nil}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, block3, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{}}, + }, + }, + "two ACTIVE instances in the ring with RF = 1 and SS = 1 (should sync blocks on 1 instance because of the shard size)": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 1}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1, block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: nil}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, block3, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: 
[]ulid.ULID{}}, + }, + }, + "two ACTIVE instances in the ring with RF = 1 and SS = 2 (should sync blocks on 2 instances because of the shard size)": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 2}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1, block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block3}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{block2, block4}}, + }, + }, + "two ACTIVE instances in the ring with RF = 2 and SS = 1 (should sync blocks on 1 instance because of the shard size)": { + replicationFactor: 2, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 1}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1, block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: nil}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, block3, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{}}, + }, + }, + "two ACTIVE instances in the ring with RF = 2 and SS = 2 (should sync all blocks on 2 instances)": { + replicationFactor: 2, + 
limits: &shardingLimitsMock{storeGatewayTenantShardSize: 2}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1, block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, block3, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{block1, block2, block3, block4}}, + }, + }, + "multiple ACTIVE instances in the ring with RF = 2 and SS = 3": { + replicationFactor: 2, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 3}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-3", "127.0.0.3", "", []uint32{block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: []string{userID}}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block3 /* replicated: */, block2, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{block2 /* replicated: */, block1}}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", blocks: []ulid.ULID{block4 /* replicated: */, block3}}, + 
}, + }, + "one unhealthy instance in the ring with RF = 1 and SS = 3": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 3}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + + r.Ingesters["instance-3"] = ring.InstanceDesc{ + Addr: "127.0.0.3", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block4Hash + 1}, + } + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: []string{userID}}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + // No shard has the blocks of the unhealthy instance. + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block3}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{block2}}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", blocks: []ulid.ULID{}}, + }, + }, + "one unhealthy instance in the ring with RF = 2 and SS = 3": { + replicationFactor: 2, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 3}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + + r.Ingesters["instance-3"] = ring.InstanceDesc{ + Addr: "127.0.0.3", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block4Hash + 1}, + } + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", 
instanceAddr: "127.0.0.2", users: []string{userID}}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block3 /* replicated: */, block2, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{block2 /* replicated: */, block1}}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", blocks: []ulid.ULID{}}, + }, + }, + "one unhealthy instance in the ring with RF = 2 and SS = 2": { + replicationFactor: 2, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 2}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block4Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + + r.Ingesters["instance-3"] = ring.InstanceDesc{ + Addr: "127.0.0.3", + Timestamp: time.Now().Add(-time.Hour).Unix(), + State: ring.ACTIVE, + Tokens: []uint32{block3Hash + 1}, + } + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: nil}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, block3, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{ /* no blocks because not belonging to the shard */ }}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", blocks: []ulid.ULID{ /* no blocks because unhealthy */ }}, + }, + }, + "LEAVING instance in the ring should continue to keep its shard blocks but they should also be replicated to another instance": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 2}, + 
setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-3", "127.0.0.3", "", []uint32{block4Hash + 1}, ring.LEAVING, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: nil}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, block3 /* replicated: */, block4}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{ /* no blocks because not belonging to the shard */ }}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", blocks: []ulid.ULID{block4}}, + }, + }, + "JOINING instance in the ring should get its shard blocks and they should not be replicated to another instance": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 2}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-3", "127.0.0.3", "", []uint32{block4Hash + 1}, ring.JOINING, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: nil}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block2, 
block3}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{ /* no blocks because not belonging to the shard */ }}, + {instanceID: "instance-3", instanceAddr: "127.0.0.3", blocks: []ulid.ULID{block4}}, + }, + }, + "SS = 0 disables shuffle sharding": { + replicationFactor: 1, + limits: &shardingLimitsMock{storeGatewayTenantShardSize: 0}, + setupRing: func(r *ring.Desc) { + r.AddIngester("instance-1", "127.0.0.1", "", []uint32{block1Hash + 1, block3Hash + 1}, ring.ACTIVE, registeredAt) + r.AddIngester("instance-2", "127.0.0.2", "", []uint32{block2Hash + 1, block4Hash + 1}, ring.ACTIVE, registeredAt) + }, + expectedUsers: []usersExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", users: []string{userID}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", users: []string{userID}}, + }, + expectedBlocks: []blocksExpectation{ + {instanceID: "instance-1", instanceAddr: "127.0.0.1", blocks: []ulid.ULID{block1, block3}}, + {instanceID: "instance-2", instanceAddr: "127.0.0.2", blocks: []ulid.ULID{block2, block4}}, + }, + }, + } + + for testName, testData := range tests { + testName := testName + testData := testData + + t.Run(testName, func(t *testing.T) { + t.Parallel() + + ctx := context.Background() + store, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + // Initialize the ring state. 
+ require.NoError(t, store.CAS(ctx, "test", func(in interface{}) (interface{}, bool, error) { + d := ring.NewDesc() + testData.setupRing(d) + return d, true, nil + })) + + cfg := ring.Config{ + ReplicationFactor: testData.replicationFactor, + HeartbeatTimeout: time.Minute, + SubringCacheDisabled: true, + } + + r, err := ring.NewWithStoreClientAndStrategy(cfg, "test", "test", store, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), nil, nil) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(ctx, r)) + defer services.StopAndAwaitTerminated(ctx, r) //nolint:errcheck + + // Wait until the ring client has synced. + require.NoError(t, ring.WaitInstanceState(ctx, r, "instance-1", ring.ACTIVE)) + + // Assert on filter users. + for _, expected := range testData.expectedUsers { + filter := NewShuffleShardingStrategy(r, expected.instanceID, expected.instanceAddr, testData.limits, log.NewNopLogger()) + assert.Equal(t, expected.users, filter.FilterUsers(ctx, []string{userID})) + } + + // Assert on filter blocks. + for _, expected := range testData.expectedBlocks { + filter := NewShuffleShardingStrategy(r, expected.instanceID, expected.instanceAddr, testData.limits, log.NewNopLogger()) + synced := extprom.NewTxGaugeVec(nil, prometheus.GaugeOpts{}, []string{"state"}) + synced.WithLabelValues(shardExcludedMeta).Set(0) + + metas := map[ulid.ULID]*metadata.Meta{ + block1: {}, + block2: {}, + block3: {}, + block4: {}, + } + + err = filter.FilterBlocks(ctx, userID, metas, map[ulid.ULID]struct{}{}, synced) + require.NoError(t, err) + + var actualBlocks []ulid.ULID + for id := range metas { + actualBlocks = append(actualBlocks, id) + } + + assert.ElementsMatch(t, expected.blocks, actualBlocks) + + // Assert on the metric used to keep track of the blocks filtered out. 
				// Assert on the metric used to keep track of the blocks filtered out.
				synced.Submit()
				assert.Equal(t, float64(numAllBlocks-len(expected.blocks)), testutil.ToFloat64(synced))
			}
		})
	}
}

// shardingLimitsMock is a ShardingLimits implementation that returns a fixed,
// per-test store-gateway shard size for every tenant.
type shardingLimitsMock struct {
	// storeGatewayTenantShardSize is the shard size returned for all tenants.
	storeGatewayTenantShardSize int
}

// StoreGatewayTenantShardSize returns the configured shard size regardless of
// the tenant ID passed in.
func (m *shardingLimitsMock) StoreGatewayTenantShardSize(_ string) int {
	return m.storeGatewayTenantShardSize
}