diff --git a/pkg/dataobj/internal/dataset/column_builder.go b/pkg/dataobj/internal/dataset/column_builder.go index a54d782054ef1..3c67724f44c4e 100644 --- a/pkg/dataobj/internal/dataset/column_builder.go +++ b/pkg/dataobj/internal/dataset/column_builder.go @@ -27,9 +27,19 @@ type BuilderOptions struct { // CompressionOptions holds optional configuration for compression. CompressionOptions CompressionOptions + // StatisticsOptions holds optional configuration for statistics. + Statistics StatisticsOptions +} + +// StatisticsOptions customizes the collection of statistics for a column. +type StatisticsOptions struct { // StoreRangeStats indicates whether to store value range statistics for the // column and pages. StoreRangeStats bool + + // StoreCardinalityStats indicates whether to store cardinality estimations, + // facilitated by hyperloglog + StoreCardinalityStats bool } // CompressionOptions customizes the compressor used when building pages. @@ -48,8 +58,9 @@ type ColumnBuilder struct { rows int // Total number of rows in the column. - pages []*MemPage - builder *pageBuilder + pages []*MemPage + statsBuilder *columnStatsBuilder + pageBuilder *pageBuilder } // NewColumnBuilder creates a new ColumnBuilder from the optional name and @@ -61,11 +72,17 @@ func NewColumnBuilder(name string, opts BuilderOptions) (*ColumnBuilder, error) return nil, fmt.Errorf("creating page builder: %w", err) } + statsBuilder, err := newColumnStatsBuilder(opts.Statistics) + if err != nil { + return nil, fmt.Errorf("creating stats builder: %w", err) + } + return &ColumnBuilder{ name: name, opts: opts, - builder: builder, + pageBuilder: builder, + statsBuilder: statsBuilder, }, nil } @@ -87,6 +104,7 @@ func (cb *ColumnBuilder) Append(row int, value Value) error { for range 2 { if cb.append(row, value) { cb.rows = row + 1 + cb.statsBuilder.Append(value) return nil } @@ -107,7 +125,7 @@ func (cb *ColumnBuilder) EstimatedSize() int { for _, p := range cb.pages { size += p.Info.CompressedSize } - size += cb.builder.EstimatedSize() + size += cb.pageBuilder.EstimatedSize() return size } @@ -132,7 +150,7 @@ func (cb *ColumnBuilder) Backfill(row int) { func (cb *ColumnBuilder) backfill(row int) bool { for row > cb.rows { - if !cb.builder.AppendNull() { + if !cb.pageBuilder.AppendNull() { return false } cb.rows++ @@ -146,7 +164,7 @@ func (cb *ColumnBuilder) append(row int, value Value) bool { if !cb.backfill(row) { return false } - return cb.builder.Append(value) + return cb.pageBuilder.Append(value) } // Flush converts data in cb into a [MemColumn]. Afterwards, cb is reset to a @@ -159,7 +177,7 @@ func (cb *ColumnBuilder) Flush() (*MemColumn, error) { Type: cb.opts.Value, Compression: cb.opts.Compression, - Statistics: cb.buildStats(), + Statistics: cb.statsBuilder.Flush(cb.pages), } for _, page := range cb.pages { @@ -178,54 +196,12 @@ func (cb *ColumnBuilder) Flush() (*MemColumn, error) { return column, nil } -func (cb *ColumnBuilder) buildStats() *datasetmd.Statistics { - if !cb.opts.StoreRangeStats { - return nil - } - - var stats datasetmd.Statistics - - var minValue, maxValue Value - - for i, page := range cb.pages { - if page.Info.Stats == nil { - // This should never hit; if cb.opts.StoreRangeStats is true, then - // page.Info.Stats will be populated. - panic("ColumnBuilder.buildStats: page missing stats") - } - - var pageMin, pageMax Value - - if err := pageMin.UnmarshalBinary(page.Info.Stats.MinValue); err != nil { - panic(fmt.Sprintf("ColumnBuilder.buildStats: failed to unmarshal min value: %s", err)) - } else if err := pageMax.UnmarshalBinary(page.Info.Stats.MaxValue); err != nil { - panic(fmt.Sprintf("ColumnBuilder.buildStats: failed to unmarshal max value: %s", err)) - } - - if i == 0 || CompareValues(pageMin, minValue) < 0 { - minValue = pageMin - } - if i == 0 || CompareValues(pageMax, maxValue) > 0 { - maxValue = pageMax - } - } - - var err error - if stats.MinValue, err = minValue.MarshalBinary(); err != nil { - panic(fmt.Sprintf("ColumnBuilder.buildStats: failed to marshal min value: %s", err)) - } - if stats.MaxValue, err = maxValue.MarshalBinary(); err != nil { - panic(fmt.Sprintf("ColumnBuilder.buildStats: failed to marshal max value: %s", err)) - } - return &stats -} - func (cb *ColumnBuilder) flushPage() { - if cb.builder.Rows() == 0 { + if cb.pageBuilder.Rows() == 0 { return } - page, err := cb.builder.Flush() + page, err := cb.pageBuilder.Flush() if err != nil { // Flush should only return an error when it's empty, which we already // ensure it's not in the lines above. @@ -238,5 +214,5 @@ func (cb *ColumnBuilder) flushPage() { func (cb *ColumnBuilder) Reset() { cb.rows = 0 cb.pages = nil - cb.builder.Reset() + cb.pageBuilder.Reset() } diff --git a/pkg/dataobj/internal/dataset/column_stats.go b/pkg/dataobj/internal/dataset/column_stats.go new file mode 100644 index 0000000000000..94e6f94335378 --- /dev/null +++ b/pkg/dataobj/internal/dataset/column_stats.go @@ -0,0 +1,115 @@ +package dataset + +import ( + "fmt" + + "github.com/axiomhq/hyperloglog" + + "github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd" +) + +// NB: https://engineering.fb.com/2018/12/13/data-infrastructure/hyperloglog/ +// Standard error (SE) = 1.04/sqrt(2^n_registers) +// so +// +// 65% of estimates will be within ±1SE of true value +// 95% of estimates will be within ±2SE of true value +// 99% of estimates will be within ±3SE of true value +// +// e.g. given 2^12 registers +// SE = 1.04/sqrt(2^12) = 0.01625 +// 65% of estimates will be within ±1SE of true value (1.625%) +// 95% of estimates will be within ±2SE of true value (3.25%) +// 99% of estimates will be within ±3SE of true value (4.875%) +// and with 8-bit registers, this is 2^12 = 4KB size. +func newHyperLogLog() (*hyperloglog.Sketch, error) { + return hyperloglog.NewSketch(12, true) +} + +type columnStatsBuilder struct { + opts StatisticsOptions + + // for cardinality + hll *hyperloglog.Sketch +} + +// ColumnStatsBuilder is for column-level statistics +func newColumnStatsBuilder(opts StatisticsOptions) (*columnStatsBuilder, error) { + result := &columnStatsBuilder{ + opts: opts, + } + + if opts.StoreCardinalityStats { + var err error + if result.hll, err = newHyperLogLog(); err != nil { + return nil, fmt.Errorf("failed to create hll: %w", err) + } + } + + return result, nil +} + +func (csb *columnStatsBuilder) Append(value Value) { + if csb.opts.StoreCardinalityStats && !value.IsNil() && !value.IsZero() { + buf, err := value.MarshalBinary() + if err != nil { + panic(fmt.Sprintf( + "failed to marshal value for cardinality stats of type %s: %s", + value.Type(), err, + )) + } + + // TODO(owen-d): improve efficiency, ideally we don't need to marshal + // into an intermediate buffer. + csb.hll.Insert(buf) + } +} + +// Flush builds the column-level stats both from the given pages and any internal +// state +func (csb *columnStatsBuilder) Flush(pages []*MemPage) *datasetmd.Statistics { + var dst datasetmd.Statistics + if csb.opts.StoreCardinalityStats { + dst.CardinalityCount = csb.hll.Estimate() + } + if csb.opts.StoreRangeStats { + csb.buildRangeStats(pages, &dst) + } + + return &dst +} + +func (csb *columnStatsBuilder) buildRangeStats(pages []*MemPage, dst *datasetmd.Statistics) { + var minValue, maxValue Value + + for i, page := range pages { + if page.Info.Stats == nil { + // This should never hit; if cb.opts.StoreRangeStats is true, then + // page.Info.Stats will be populated. + panic("ColumnStatsBuilder.buildStats: page missing stats") + } + + var pageMin, pageMax Value + + if err := pageMin.UnmarshalBinary(page.Info.Stats.MinValue); err != nil { + panic(fmt.Sprintf("ColumnStatsBuilder.buildStats: failed to unmarshal min value: %s", err)) + } else if err := pageMax.UnmarshalBinary(page.Info.Stats.MaxValue); err != nil { + panic(fmt.Sprintf("ColumnStatsBuilder.buildStats: failed to unmarshal max value: %s", err)) + } + + if i == 0 || CompareValues(pageMin, minValue) < 0 { + minValue = pageMin + } + if i == 0 || CompareValues(pageMax, maxValue) > 0 { + maxValue = pageMax + } + } + + var err error + if dst.MinValue, err = minValue.MarshalBinary(); err != nil { + panic(fmt.Sprintf("ColumnStatsBuilder.buildStats: failed to marshal min value: %s", err)) + } + if dst.MaxValue, err = maxValue.MarshalBinary(); err != nil { + panic(fmt.Sprintf("ColumnStatsBuilder.buildStats: failed to marshal max value: %s", err)) + } +} diff --git a/pkg/dataobj/internal/dataset/column_test.go b/pkg/dataobj/internal/dataset/column_test.go index de2f3f3549461..ca8c39cc9ae91 100644 --- a/pkg/dataobj/internal/dataset/column_test.go +++ b/pkg/dataobj/internal/dataset/column_test.go @@ -101,7 +101,9 @@ func TestColumnBuilder_MinMax(t *testing.T) { Compression: datasetmd.COMPRESSION_TYPE_NONE, Encoding: datasetmd.ENCODING_TYPE_PLAIN, - StoreRangeStats: true, + Statistics: StatisticsOptions{ + StoreRangeStats: true, + }, } b, err := NewColumnBuilder("", opts) require.NoError(t, err) @@ -132,6 +134,56 @@ func TestColumnBuilder_MinMax(t *testing.T) { require.Equal(t, fString, page1Max.String()) } +func TestColumnBuilder_Cardinality(t *testing.T) { + var ( + // We include the null string in the test to ensure that it's never + // considered in min/max ranges. + nullString = "" + + aString = strings.Repeat("a", 100) + bString = strings.Repeat("b", 100) + cString = strings.Repeat("c", 100) + ) + + // We store nulls and duplicates (should not be counted in cardinality count) + in := []string{ + nullString, + + aString, + + bString, + bString, + bString, + + cString, + } + + opts := BuilderOptions{ + PageSizeHint: 301, // Slightly larger than the string length of 3 strings per page. + Value: datasetmd.VALUE_TYPE_STRING, + Compression: datasetmd.COMPRESSION_TYPE_NONE, + Encoding: datasetmd.ENCODING_TYPE_PLAIN, + + Statistics: StatisticsOptions{ + StoreCardinalityStats: true, + }, + } + b, err := NewColumnBuilder("", opts) + require.NoError(t, err) + + for i, s := range in { + require.NoError(t, b.Append(i, StringValue(s))) + } + + col, err := b.Flush() + require.NoError(t, err) + require.Equal(t, datasetmd.VALUE_TYPE_STRING, col.Info.Type) + require.NotNil(t, col.Info.Statistics) + // we use sparse hyperloglog reprs until a certain cardinality is reached, + // so this should not be approximate at low counts. + require.Equal(t, uint64(3), col.Info.Statistics.CardinalityCount) +} + func getMinMax(t *testing.T, stats *datasetmd.Statistics) (min, max Value) { t.Helper() require.NotNil(t, stats) diff --git a/pkg/dataobj/internal/dataset/page_builder.go b/pkg/dataobj/internal/dataset/page_builder.go index fa4963db1661b..809902587e84b 100644 --- a/pkg/dataobj/internal/dataset/page_builder.go +++ b/pkg/dataobj/internal/dataset/page_builder.go @@ -99,10 +99,10 @@ func (b *pageBuilder) Append(value Value) bool { return false } - // Update min/max values for stats. We only do this for non-NULL values, + // Update statistics. We only do this for non-NULL values, // otherwise NULL would always be the min for columns that contain a single // NULL. - b.updateMinMax(value) + b.accumulateStatistics(value) // The following calls won't fail; they only return errors when the // underlying writers fail, which ours cannot. @@ -140,14 +140,16 @@ func (b *pageBuilder) AppendNull() bool { return true } -func (b *pageBuilder) updateMinMax(value Value) { +func (b *pageBuilder) accumulateStatistics(value Value) { // As a small optimization, we only update min/max values if we're intending - // on populating the stats. This avoids unnecessary comparisons for very + // on populating them in statistics. This avoids unnecessary comparisons for very // large values. - if !b.opts.StoreRangeStats { - return + if b.opts.Statistics.StoreRangeStats { + b.updateMinMax(value) } +} +func (b *pageBuilder) updateMinMax(value Value) { // We'll init minValue/maxValue if this is our first non-NULL value (b.values == 0). // This allows us to only avoid comparing against NULL values, which would lead to // NULL always being the min. @@ -259,9 +261,15 @@ func (b *pageBuilder) Flush() (*MemPage, error) { } func (b *pageBuilder) buildStats() *datasetmd.Statistics { - if !b.opts.StoreRangeStats { - return nil + var stats datasetmd.Statistics + if b.opts.Statistics.StoreRangeStats { + b.buildRangeStats(&stats) + return &stats } + return nil +} + +func (b *pageBuilder) buildRangeStats(dst *datasetmd.Statistics) { minValueBytes, err := b.minValue.MarshalBinary() if err != nil { @@ -272,10 +280,8 @@ func (b *pageBuilder) buildStats() *datasetmd.Statistics { panic(fmt.Sprintf("pageBuilder.buildStats: failed to marshal max value: %s", err)) } - return &datasetmd.Statistics{ - MinValue: minValueBytes, - MaxValue: maxValueBytes, - } + dst.MinValue = minValueBytes + dst.MaxValue = maxValueBytes } // Reset resets the pageBuilder to a fresh state, allowing it to be reused. diff --git a/pkg/dataobj/internal/metadata/datasetmd/datasetmd.pb.go b/pkg/dataobj/internal/metadata/datasetmd/datasetmd.pb.go index 1920ffde67281..1ee081cd83e2b 100644 --- a/pkg/dataobj/internal/metadata/datasetmd/datasetmd.pb.go +++ b/pkg/dataobj/internal/metadata/datasetmd/datasetmd.pb.go @@ -268,6 +268,11 @@ type Statistics struct { // Applications must not assume that an unset max_value means that the column // is empty; check for values_count == 0 instead. MaxValue []byte `protobuf:"bytes,2,opt,name=max_value,json=maxValue,proto3" json:"max_value,omitempty"` + // Estimated number of distinct values in the column. + // + // Applications must not assume that an unset cardinality_count means that + // the column has no distinct values; check for values_count == 0 instead. + CardinalityCount uint64 `protobuf:"varint,3,opt,name=cardinality_count,json=cardinalityCount,proto3" json:"cardinality_count,omitempty"` } func (m *Statistics) Reset() { *m = Statistics{} } @@ -316,6 +321,13 @@ func (m *Statistics) GetMaxValue() []byte { return nil } +func (m *Statistics) GetCardinalityCount() uint64 { + if m != nil { + return m.CardinalityCount + } + return 0 +} + // Page describes an individual page within a column. type PageInfo struct { // Uncompressed size of the page within the data object. @@ -448,51 +460,52 @@ func init() { } var fileDescriptor_7ab9d5b21b743868 = []byte{ - // 699 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xac, 0x54, 0x41, 0x53, 0xda, 0x5a, - 0x14, 0xe6, 0x02, 0xfa, 0xc8, 0x01, 0x35, 0xde, 0xa7, 0xcf, 0xf8, 0x78, 0xe6, 0x51, 0x3b, 0x53, - 0xa9, 0xed, 0xc0, 0x14, 0x3b, 0xed, 0x1a, 0x21, 0x3a, 0x99, 0xd1, 0x90, 0x49, 0xd0, 0x19, 0xdd, - 0x64, 0x62, 0xb8, 0xd0, 0x54, 0x92, 0x30, 0x24, 0x50, 0x75, 0xd5, 0x95, 0xeb, 0xfe, 0x8c, 0xfe, - 0x94, 0x2e, 0x5d, 0xba, 0xac, 0x71, 0xa6, 0xd3, 0xa5, 0x3f, 0xa1, 0xc3, 0x85, 0x40, 0x04, 0xca, - 0xb8, 0xe8, 0xee, 0xf2, 0x7d, 0xdf, 0x39, 0x27, 0x7c, 0xdf, 0xb9, 0x17, 0xde, 0xb7, 0xce, 0x1b, - 0xf9, 0x9a, 0xee, 0xe9, 0xce, 0xd9, 0xc7, 0xbc, 0x69, 0x7b, 0xa4, 0x6d, 0xeb, 0xcd, 0xbc, 0x45, - 0x3c, 0xbd, 0x07, 0x52, 0xc6, 0x25, 0x9e, 0x55, 0x1b, 0x9d, 0x72, 0xad, 0xb6, 0xe3, 0x39, 0x38, - 0x3d, 0x28, 0xca, 0x05, 0xda, 0xdc, 0x40, 0x91, 0xeb, 0xbe, 0xd9, 0xfc, 0x11, 0x03, 0x28, 0x39, - 0xcd, 0x8e, 0x65, 0x8b, 0x76, 0xdd, 0xc1, 0x18, 0xe2, 0xb6, 0x6e, 0x11, 0x0e, 0x65, 0x50, 0x96, - 0x51, 0xe8, 0x19, 0x0b, 0x00, 0x5d, 0xbd, 0xd9, 0x21, 0x9a, 0x77, 0xd9, 0x22, 0x5c, 0x34, 0x83, - 0xb2, 0x8b, 0x85, 0x17, 0xb9, 0x19, 0x4d, 0x73, 0xc7, 0x3d, 0x79, 0xf5, 0xb2, 0x45, 0x14, 0xa6, - 0x1b, 0x1c, 0xf1, 0x06, 0x40, 0xdb, 0xf9, 0xe4, 0x6a, 0x86, 0xd3, 0xb1, 0x3d, 0x2e, 0x96, 0x41, - 0xd9, 0xb8, 0xc2, 0xf4, 0x90, 0x52, 0x0f, 0xc0, 0x12, 0x24, 0x0d, 0xc7, 0x6a, 0xb5, 0x89, 0xeb, - 0x9a, 0x8e, 0xcd, 0xc5, 0xe9, 0x98, 0xd7, 0x33, 0xc7, 0x94, 0x46, 0x7a, 0x3a, 0x2c, 0xdc, 0x00, - 0xbf, 0x82, 0xe5, 0x8e, 0x1d, 0x00, 0xa4, 0xa6, 0xb9, 0xe6, 0x15, 0xe1, 0xe6, 0xe8, 0x54, 0x36, - 0x4c, 0xa8, 0xe6, 0x15, 0xc1, 0x5b, 0xb0, 0x34, 0x2e, 0x9d, 0xa7, 0xd2, 0xc5, 0x49, 0x61, 0xf0, - 0x25, 0x9a, 0x53, 0xaf, 0xbb, 0xc4, 0xe3, 0xfe, 0xea, 0x0b, 0x03, 0xb8, 0x42, 0x51, 0xfc, 0x1c, - 0x16, 0x86, 0x42, 0xda, 0x2f, 0x41, 0x65, 0xa9, 0x00, 0xa4, 0xdd, 0xf6, 0x01, 0x5c, 0x4f, 0xf7, - 0x4c, 0xd7, 0x33, 0x0d, 0x97, 0x63, 0x32, 0x28, 0x9b, 0x2c, 0x6c, 0xcd, 0xfc, 0xcb, 0xea, 0x50, - 0xae, 0x84, 0x4a, 0xf1, 0x33, 0x48, 0x51, 0xa3, 0x03, 0x77, 0x81, 0x0e, 0x4b, 0xf6, 0x31, 0xea, - 0xef, 0xe6, 0x1e, 0xc0, 0xa8, 0x18, 0xa7, 0x81, 0xb1, 0x4c, 0x5b, 0xa3, 0x02, 0x1a, 0x76, 0x4a, - 0x49, 0x58, 0xa6, 0x4d, 0x83, 0xa3, 0xa4, 0x7e, 0x31, 0x20, 0xa3, 0x03, 0x52, 0xbf, 0xa0, 0xe4, - 0xe6, 0x75, 0x0c, 0x12, 0xb2, 0xde, 0x20, 0x74, 0x5d, 0xa6, 0x9a, 0x8c, 0x9e, 0x6e, 0x72, 0x74, - 0xaa, 0xc9, 0x2b, 0x30, 0x67, 0xb4, 0x8d, 0x9d, 0x02, 0x5d, 0x92, 0x05, 0xa5, 0xff, 0x63, 0x6c, - 0x7f, 0xe2, 0xe3, 0xfb, 0x23, 0x40, 0x82, 0xd8, 0x86, 0x53, 0x33, 0xed, 0x06, 0x8d, 0x79, 0xb1, - 0xf0, 0x72, 0xa6, 0x93, 0xc2, 0x40, 0x4c, 0x37, 0x67, 0x58, 0x8a, 0xff, 0x87, 0x64, 0x38, 0xdc, - 0xfe, 0x16, 0x40, 0x28, 0xd8, 0x34, 0x30, 0xa3, 0x50, 0xfb, 0xd9, 0x27, 0x7e, 0x13, 0x68, 0xe2, - 0xcf, 0x05, 0xca, 0x4c, 0x04, 0xba, 0x7d, 0x0e, 0xcc, 0xf0, 0x9e, 0xe1, 0x7f, 0xe1, 0x9f, 0xe3, - 0xe2, 0xc1, 0x91, 0xa0, 0x55, 0x4f, 0x64, 0x41, 0x3b, 0x92, 0x54, 0x59, 0x28, 0x89, 0x7b, 0xa2, - 0x50, 0x66, 0x23, 0x78, 0x05, 0xd8, 0x10, 0x27, 0x4a, 0xd5, 0x77, 0x6f, 0x59, 0x84, 0x57, 0x61, - 0x39, 0x5c, 0xd1, 0x87, 0xa3, 0x63, 0xb0, 0x5a, 0x55, 0x44, 0x69, 0x9f, 0x8d, 0x6d, 0x5f, 0x23, - 0x58, 0x1a, 0xbb, 0x6e, 0x38, 0x03, 0xff, 0x95, 0x2a, 0x87, 0xb2, 0x22, 0xa8, 0xaa, 0x58, 0x91, - 0xa6, 0x4d, 0x5e, 0x87, 0xd5, 0x09, 0x85, 0x54, 0x91, 0x04, 0x16, 0xe1, 0x34, 0xac, 0x4d, 0x50, - 0xaa, 0x54, 0x94, 0xe5, 0x13, 0x36, 0x3a, 0xb5, 0xee, 0x54, 0xad, 0x96, 0xd9, 0xd8, 0xf6, 0x25, - 0xa4, 0xc2, 0xc9, 0xe1, 0x0d, 0x58, 0x17, 0xa4, 0x52, 0xa5, 0x2c, 0x4a, 0xfb, 0xd3, 0xbe, 0x60, - 0x0d, 0xfe, 0x7e, 0x4c, 0xcb, 0x07, 0x45, 0x51, 0x62, 0xd1, 0x24, 0x51, 0x16, 0x0e, 0xaa, 0x45, - 0x36, 0x8a, 0x39, 0x58, 0x79, 0x4c, 0xec, 0x8a, 0xd5, 0xc3, 0xa2, 0xcc, 0xc6, 0x76, 0x2f, 0x6e, - 0xee, 0xf8, 0xc8, 0xed, 0x1d, 0x1f, 0x79, 0xb8, 0xe3, 0xd1, 0x67, 0x9f, 0x47, 0x5f, 0x7d, 0x1e, - 0x7d, 0xf3, 0x79, 0x74, 0xe3, 0xf3, 0xe8, 0xbb, 0xcf, 0xa3, 0x9f, 0x3e, 0x1f, 0x79, 0xf0, 0x79, - 0xf4, 0xe5, 0x9e, 0x8f, 0xdc, 0xdc, 0xf3, 0x91, 0xdb, 0x7b, 0x3e, 0x72, 0xba, 0xdb, 0x30, 0xbd, - 0x0f, 0x9d, 0xb3, 0x9c, 0xe1, 0x58, 0xf9, 0x46, 0x5b, 0xaf, 0xeb, 0xb6, 0x9e, 0x6f, 0x3a, 0xe7, - 0x66, 0xbe, 0xbb, 0x93, 0x7f, 0xe2, 0x8b, 0x7e, 0x36, 0x4f, 0x1f, 0xf2, 0x9d, 0x5f, 0x01, 0x00, - 0x00, 0xff, 0xff, 0xe6, 0x72, 0xec, 0xb1, 0x03, 0x06, 0x00, 0x00, + // 714 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xac, 0x54, 0x41, 0x53, 0xda, 0x5c, + 0x14, 0xe5, 0x01, 0xfa, 0x91, 0x0b, 0x6a, 0x7c, 0x9f, 0xd6, 0x58, 0x6a, 0x4a, 0xed, 0x4c, 0xa5, + 0xda, 0x81, 0x29, 0x76, 0xda, 0x35, 0x42, 0xea, 0x64, 0x46, 0x43, 0x26, 0x41, 0x67, 0x74, 0xc3, + 0x3c, 0x43, 0xa0, 0xa9, 0x24, 0x61, 0x48, 0xa0, 0xe2, 0xaa, 0x2b, 0xd7, 0xfd, 0x19, 0xfd, 0x29, + 0x5d, 0xba, 0x74, 0x59, 0x71, 0xa6, 0xd3, 0xa5, 0x3f, 0xa1, 0xc3, 0x0b, 0x81, 0x08, 0x94, 0x71, + 0xd1, 0xdd, 0xe3, 0x9c, 0x73, 0xef, 0x7d, 0xdc, 0x73, 0xf2, 0xe0, 0x43, 0xf3, 0xbc, 0x9e, 0xad, + 0x12, 0x97, 0xd8, 0x67, 0x9f, 0xb3, 0x86, 0xe5, 0xea, 0x2d, 0x8b, 0x34, 0xb2, 0xa6, 0xee, 0x92, + 0x3e, 0x48, 0x19, 0x47, 0x77, 0xcd, 0xea, 0xe8, 0x94, 0x69, 0xb6, 0x6c, 0xd7, 0xc6, 0xc9, 0x41, + 0x51, 0xc6, 0xd7, 0x66, 0x06, 0x8a, 0x4c, 0xe7, 0xed, 0xe6, 0xaf, 0x08, 0x40, 0xc1, 0x6e, 0xb4, + 0x4d, 0x4b, 0xb4, 0x6a, 0x36, 0xc6, 0x10, 0xb5, 0x88, 0xa9, 0x73, 0x28, 0x85, 0xd2, 0x8c, 0x42, + 0xcf, 0x58, 0x00, 0xe8, 0x90, 0x46, 0x5b, 0xaf, 0xb8, 0xdd, 0xa6, 0xce, 0x85, 0x53, 0x28, 0xbd, + 0x98, 0x7b, 0x95, 0x99, 0xd1, 0x34, 0x73, 0xdc, 0x97, 0x97, 0xbb, 0x4d, 0x5d, 0x61, 0x3a, 0xfe, + 0x11, 0x6f, 0x00, 0xb4, 0xec, 0x2f, 0x4e, 0x45, 0xb3, 0xdb, 0x96, 0xcb, 0x45, 0x52, 0x28, 0x1d, + 0x55, 0x98, 0x3e, 0x52, 0xe8, 0x03, 0x58, 0x82, 0xb8, 0x66, 0x9b, 0xcd, 0x96, 0xee, 0x38, 0x86, + 0x6d, 0x71, 0x51, 0x3a, 0xe6, 0xcd, 0xcc, 0x31, 0x85, 0x91, 0x9e, 0x0e, 0x0b, 0x36, 0xc0, 0x3b, + 0xb0, 0xdc, 0xb6, 0x7c, 0x40, 0xaf, 0x56, 0x1c, 0xe3, 0x52, 0xe7, 0xe6, 0xe8, 0x54, 0x36, 0x48, + 0xa8, 0xc6, 0xa5, 0x8e, 0xb7, 0x60, 0x69, 0x5c, 0x3a, 0x4f, 0xa5, 0x8b, 0x93, 0x42, 0xff, 0x26, + 0x15, 0xbb, 0x56, 0x73, 0x74, 0x97, 0xfb, 0xcf, 0x13, 0xfa, 0x70, 0x89, 0xa2, 0xf8, 0x25, 0x2c, + 0x0c, 0x85, 0xb4, 0x5f, 0x8c, 0xca, 0x12, 0x3e, 0x48, 0xbb, 0xed, 0x03, 0x38, 0x2e, 0x71, 0x0d, + 0xc7, 0x35, 0x34, 0x87, 0x63, 0x52, 0x28, 0x1d, 0xcf, 0x6d, 0xcd, 0xfc, 0xcb, 0xea, 0x50, 0xae, + 0x04, 0x4a, 0xf1, 0x0b, 0x48, 0xd0, 0x45, 0xfb, 0xdb, 0x05, 0x3a, 0x2c, 0xee, 0x61, 0x74, 0xbf, + 0x9b, 0x0e, 0xc0, 0xa8, 0x18, 0x27, 0x81, 0x31, 0x0d, 0xab, 0x42, 0x05, 0xd4, 0xec, 0x84, 0x12, + 0x33, 0x0d, 0x8b, 0x1a, 0x47, 0x49, 0x72, 0x31, 0x20, 0xc3, 0x03, 0x92, 0x5c, 0x78, 0xe4, 0x0e, + 0x2c, 0x6b, 0xa4, 0x55, 0x35, 0x2c, 0xd2, 0x30, 0xdc, 0xee, 0x03, 0x37, 0xd9, 0x00, 0xe1, 0x0d, + 0xbd, 0x8a, 0x40, 0x4c, 0x26, 0x75, 0x9d, 0x66, 0x6b, 0xaa, 0x23, 0xe8, 0xf1, 0x8e, 0x84, 0xa7, + 0x3a, 0xb2, 0x02, 0x73, 0x5a, 0x4b, 0xdb, 0xcd, 0xd1, 0x3b, 0x2c, 0x28, 0xde, 0x8f, 0xb1, 0xb0, + 0x45, 0xc7, 0xc3, 0x26, 0x40, 0x4c, 0xb7, 0x34, 0xbb, 0x6a, 0x58, 0x75, 0x9a, 0x89, 0xc5, 0xdc, + 0xeb, 0x99, 0x6b, 0x17, 0x06, 0x62, 0x1a, 0xb3, 0x61, 0x29, 0x7e, 0x0e, 0xf1, 0x60, 0x12, 0xbc, + 0xc8, 0x40, 0x20, 0x05, 0x49, 0x60, 0x46, 0x09, 0xf0, 0x82, 0x12, 0xfb, 0x8b, 0xfb, 0xb1, 0x7f, + 0xe7, 0x3e, 0x33, 0xe1, 0xfe, 0xf6, 0x39, 0x30, 0xc3, 0x8f, 0x12, 0x3f, 0x85, 0x27, 0xc7, 0xf9, + 0x83, 0x23, 0xa1, 0x52, 0x3e, 0x91, 0x85, 0xca, 0x91, 0xa4, 0xca, 0x42, 0x41, 0xfc, 0x28, 0x0a, + 0x45, 0x36, 0x84, 0x57, 0x80, 0x0d, 0x70, 0xa2, 0x54, 0x7e, 0xff, 0x8e, 0x45, 0x78, 0x15, 0x96, + 0x83, 0x15, 0x1e, 0x1c, 0x1e, 0x83, 0xd5, 0xb2, 0x22, 0x4a, 0xfb, 0x6c, 0x64, 0xfb, 0x0a, 0xc1, + 0xd2, 0xd8, 0xb7, 0x89, 0x53, 0xf0, 0xac, 0x50, 0x3a, 0x94, 0x15, 0x41, 0x55, 0xc5, 0x92, 0x34, + 0x6d, 0xf2, 0x3a, 0xac, 0x4e, 0x28, 0xa4, 0x92, 0x24, 0xb0, 0x08, 0x27, 0x61, 0x6d, 0x82, 0x52, + 0xa5, 0xbc, 0x2c, 0x9f, 0xb0, 0xe1, 0xa9, 0x75, 0xa7, 0x6a, 0xb9, 0xc8, 0x46, 0xb6, 0xbb, 0x90, + 0x08, 0x3a, 0x87, 0x37, 0x60, 0x5d, 0x90, 0x0a, 0xa5, 0xa2, 0x28, 0xed, 0x4f, 0xbb, 0xc1, 0x1a, + 0xfc, 0xff, 0x90, 0x96, 0x0f, 0xf2, 0xa2, 0xc4, 0xa2, 0x49, 0xa2, 0x28, 0x1c, 0x94, 0xf3, 0x6c, + 0x18, 0x73, 0xb0, 0xf2, 0x90, 0xd8, 0x13, 0xcb, 0x87, 0x79, 0x99, 0x8d, 0xec, 0x5d, 0x5c, 0xdf, + 0xf2, 0xa1, 0x9b, 0x5b, 0x3e, 0x74, 0x7f, 0xcb, 0xa3, 0xaf, 0x3d, 0x1e, 0x7d, 0xef, 0xf1, 0xe8, + 0x47, 0x8f, 0x47, 0xd7, 0x3d, 0x1e, 0xfd, 0xec, 0xf1, 0xe8, 0x77, 0x8f, 0x0f, 0xdd, 0xf7, 0x78, + 0xf4, 0xed, 0x8e, 0x0f, 0x5d, 0xdf, 0xf1, 0xa1, 0x9b, 0x3b, 0x3e, 0x74, 0xba, 0x57, 0x37, 0xdc, + 0x4f, 0xed, 0xb3, 0x8c, 0x66, 0x9b, 0xd9, 0x7a, 0x8b, 0xd4, 0x88, 0x45, 0xb2, 0x0d, 0xfb, 0xdc, + 0xc8, 0x76, 0x76, 0xb3, 0x8f, 0x7c, 0xfe, 0xcf, 0xe6, 0xe9, 0xab, 0xbf, 0xfb, 0x27, 0x00, 0x00, + 0xff, 0xff, 0xbf, 0x4d, 0x54, 0xfd, 0x30, 0x06, 0x00, 0x00, } func (x ValueType) String() string { @@ -592,6 +605,9 @@ func (this *Statistics) Equal(that interface{}) bool { if !bytes.Equal(this.MaxValue, that1.MaxValue) { return false } + if this.CardinalityCount != that1.CardinalityCount { + return false + } return true } func (this *PageInfo) Equal(that interface{}) bool { @@ -667,10 +683,11 @@ func (this *Statistics) GoString() string { if this == nil { return "nil" } - s := make([]string, 0, 6) + s := make([]string, 0, 7) s = append(s, "&datasetmd.Statistics{") s = append(s, "MinValue: "+fmt.Sprintf("%#v", this.MinValue)+",\n") s = append(s, "MaxValue: "+fmt.Sprintf("%#v", this.MaxValue)+",\n") + s = append(s, "CardinalityCount: "+fmt.Sprintf("%#v", this.CardinalityCount)+",\n") s = append(s, "}") return strings.Join(s, "") } @@ -804,6 +821,11 @@ func (m *Statistics) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if m.CardinalityCount != 0 { + i = encodeVarintDatasetmd(dAtA, i, uint64(m.CardinalityCount)) + i-- + dAtA[i] = 0x18 + } if len(m.MaxValue) > 0 { i -= len(m.MaxValue) copy(dAtA[i:], m.MaxValue) @@ -962,6 +984,9 @@ func (m *Statistics) Size() (n int) { if l > 0 { n += 1 + l + sovDatasetmd(uint64(l)) } + if m.CardinalityCount != 0 { + n += 1 + sovDatasetmd(uint64(m.CardinalityCount)) + } return n } @@ -1034,6 +1059,7 @@ func (this *Statistics) String() string { s := strings.Join([]string{`&Statistics{`, `MinValue:` + fmt.Sprintf("%v", this.MinValue) + `,`, `MaxValue:` + fmt.Sprintf("%v", this.MaxValue) + `,`, + `CardinalityCount:` + fmt.Sprintf("%v", this.CardinalityCount) + `,`, `}`, }, "") return s @@ -1434,6 +1460,25 @@ func (m *Statistics) Unmarshal(dAtA []byte) error { m.MaxValue = []byte{} } iNdEx = postIndex + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field CardinalityCount", wireType) + } + m.CardinalityCount = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowDatasetmd + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.CardinalityCount |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } default: iNdEx = preIndex skippy, err := skipDatasetmd(dAtA[iNdEx:]) diff --git a/pkg/dataobj/internal/metadata/datasetmd/datasetmd.proto b/pkg/dataobj/internal/metadata/datasetmd/datasetmd.proto index 450b5c241bcf7..c03e10e041c25 100644 --- a/pkg/dataobj/internal/metadata/datasetmd/datasetmd.proto +++ b/pkg/dataobj/internal/metadata/datasetmd/datasetmd.proto @@ -86,6 +86,12 @@ message Statistics { // Applications must not assume that an unset max_value means that the column // is empty; check for values_count == 0 instead. bytes max_value = 2; + + // Estimated number of distinct values in the column. + // + // Applications must not assume that an unset cardinality_count means that + // the column has no distinct values; check for values_count == 0 instead. + uint64 cardinality_count = 3; } // Page describes an individual page within a column. diff --git a/pkg/dataobj/internal/sections/logs/table.go b/pkg/dataobj/internal/sections/logs/table.go index 82658e967fa40..a2427bda59351 100644 --- a/pkg/dataobj/internal/sections/logs/table.go +++ b/pkg/dataobj/internal/sections/logs/table.go @@ -116,11 +116,13 @@ func (b *tableBuffer) StreamID(pageSize int) *dataset.ColumnBuilder { } col, err := dataset.NewColumnBuilder("", dataset.BuilderOptions{ - PageSizeHint: pageSize, - Value: datasetmd.VALUE_TYPE_INT64, - Encoding: datasetmd.ENCODING_TYPE_DELTA, - Compression: datasetmd.COMPRESSION_TYPE_NONE, - StoreRangeStats: true, + PageSizeHint: pageSize, + Value: datasetmd.VALUE_TYPE_INT64, + Encoding: datasetmd.ENCODING_TYPE_DELTA, + Compression: datasetmd.COMPRESSION_TYPE_NONE, + Statistics: dataset.StatisticsOptions{ + StoreRangeStats: true, + }, }) if err != nil { // We control the Value/Encoding tuple so this can't fail; if it does, @@ -140,11 +142,13 @@ func (b *tableBuffer) Timestamp(pageSize int) *dataset.ColumnBuilder { } col, err := dataset.NewColumnBuilder("", dataset.BuilderOptions{ - PageSizeHint: pageSize, - Value: datasetmd.VALUE_TYPE_INT64, - Encoding: datasetmd.ENCODING_TYPE_DELTA, - Compression: datasetmd.COMPRESSION_TYPE_NONE, - StoreRangeStats: true, + PageSizeHint: pageSize, + Value: datasetmd.VALUE_TYPE_INT64, + Encoding: datasetmd.ENCODING_TYPE_DELTA, + Compression: datasetmd.COMPRESSION_TYPE_NONE, + Statistics: dataset.StatisticsOptions{ + StoreRangeStats: true, + }, }) if err != nil { // We control the Value/Encoding tuple so this can't fail; if it does, @@ -177,7 +181,10 @@ func (b *tableBuffer) Metadata(key string, pageSize int, compressionOpts dataset Encoding: datasetmd.ENCODING_TYPE_PLAIN, Compression: datasetmd.COMPRESSION_TYPE_ZSTD, CompressionOptions: compressionOpts, - StoreRangeStats: true, + Statistics: dataset.StatisticsOptions{ + StoreRangeStats: true, + StoreCardinalityStats: true, + }, }) if err != nil { // We control the Value/Encoding tuple so this can't fail; if it does, @@ -213,7 +220,9 @@ func (b *tableBuffer) Message(pageSize int, compressionOpts dataset.CompressionO // // A "min log line" and "max log line" isn't very valuable, and since log // lines can be quite long, it would consume a fair amount of metadata. - StoreRangeStats: false, + Statistics: dataset.StatisticsOptions{ + StoreRangeStats: false, + }, }) if err != nil { // We control the Value/Encoding tuple so this can't fail; if it does, diff --git a/pkg/dataobj/internal/sections/streams/streams.go b/pkg/dataobj/internal/sections/streams/streams.go index 4d208fb1676d5..05002fb77e345 100644 --- a/pkg/dataobj/internal/sections/streams/streams.go +++ b/pkg/dataobj/internal/sections/streams/streams.go @@ -251,11 +251,13 @@ func (s *Streams) EncodeTo(enc *encoding.Encoder) error { } builder, err := dataset.NewColumnBuilder(name, dataset.BuilderOptions{ - PageSizeHint: s.pageSize, - Value: datasetmd.VALUE_TYPE_STRING, - Encoding: datasetmd.ENCODING_TYPE_PLAIN, - Compression: datasetmd.COMPRESSION_TYPE_ZSTD, - StoreRangeStats: true, + PageSizeHint: s.pageSize, + Value: datasetmd.VALUE_TYPE_STRING, + Encoding: datasetmd.ENCODING_TYPE_PLAIN, + Compression: datasetmd.COMPRESSION_TYPE_ZSTD, + Statistics: dataset.StatisticsOptions{ + StoreRangeStats: true, + }, }) if err != nil { return nil, fmt.Errorf("creating label column: %w", err) @@ -323,11 +325,13 @@ func (s *Streams) EncodeTo(enc *encoding.Encoder) error { func numberColumnBuilder(pageSize int) (*dataset.ColumnBuilder, error) { return dataset.NewColumnBuilder("", dataset.BuilderOptions{ - PageSizeHint: pageSize, - Value: datasetmd.VALUE_TYPE_INT64, - Encoding: datasetmd.ENCODING_TYPE_DELTA, - Compression: datasetmd.COMPRESSION_TYPE_NONE, - StoreRangeStats: true, + PageSizeHint: pageSize, + Value: datasetmd.VALUE_TYPE_INT64, + Encoding: datasetmd.ENCODING_TYPE_DELTA, + Compression: datasetmd.COMPRESSION_TYPE_NONE, + Statistics: dataset.StatisticsOptions{ + StoreRangeStats: true, + }, }) }