Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add btrfs device error stats #2193

Merged
merged 10 commits into from
Sep 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
259 changes: 239 additions & 20 deletions collector/btrfs_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,13 @@ package collector

import (
"fmt"
"path"
"strings"
"syscall"

dennwc "github.com/dennwc/btrfs"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/btrfs"
)
Expand Down Expand Up @@ -52,34 +57,178 @@ func NewBtrfsCollector(logger log.Logger) (Collector, error) {
func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error {
stats, err := c.fs.Stats()
if err != nil {
return fmt.Errorf("failed to retrieve Btrfs stats: %w", err)
return fmt.Errorf("failed to retrieve Btrfs stats from procfs: %w", err)
}

ioctlStatsMap, err := c.getIoctlStats()
if err != nil {
level.Debug(c.logger).Log(
"msg", "Error querying btrfs device stats with ioctl",
"err", err)
ioctlStatsMap = make(map[string]*btrfsIoctlFsStats)
}

for _, s := range stats {
c.updateBtrfsStats(ch, s)
// match up procfs and ioctl info by filesystem UUID (without dashes)
var fsUUID = strings.Replace(s.UUID, "-", "", -1)
ioctlStats := ioctlStatsMap[fsUUID]
c.updateBtrfsStats(ch, s, ioctlStats)
}

return nil
}

type btrfsIoctlFsDevStats struct {
path string
uuid string

bytesUsed uint64
totalBytes uint64

// The error stats below match the following upstream lists:
// https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140
// https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692
writeErrs uint64
readErrs uint64
flushErrs uint64
corruptionErrs uint64
generationErrs uint64
}

type btrfsIoctlFsStats struct {
uuid string
devices []btrfsIoctlFsDevStats
}

func (c *btrfsCollector) getIoctlStats() (map[string]*btrfsIoctlFsStats, error) {
// Instead of introducing more ioctl calls to scan for all btrfs
// filesytems re-use our mount point utils to find known mounts
leth marked this conversation as resolved.
Show resolved Hide resolved
mountsList, err := mountPointDetails(c.logger)
if err != nil {
return nil, err
}

// track devices we have successfully scanned, by device path
devicesDone := make(map[string]struct{})
// filesystems scann results by UUID
fsStats := make(map[string]*btrfsIoctlFsStats)

for _, mount := range mountsList {
if mount.fsType != "btrfs" {
continue
}

if _, found := devicesDone[mount.device]; found {
// We already found this filesystem by another mount point
continue
}

fs, err := dennwc.Open(mount.mountPoint, true)
if err != nil {
// failed to open this mount point, maybe we didn't have permission
// maybe we'll find another mount point for this FS later
level.Debug(c.logger).Log(
"msg", "Error inspecting btrfs mountpoint",
"mountPoint", mount.mountPoint,
"err", err)
continue
}

fsInfo, err := fs.Info()
if err != nil {
// Failed to get the FS info for some reason,
// perhaps it'll work with a different mount point
level.Debug(c.logger).Log(
"msg", "Error querying btrfs filesystem",
"mountPoint", mount.mountPoint,
"err", err)
continue
}

fsID := fsInfo.FSID.String()
if _, found := fsStats[fsID]; found {
// We already found this filesystem by another mount point
continue
}

deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo)
if err != nil {
level.Debug(c.logger).Log(
"msg", "Error querying btrfs device stats",
"mountPoint", mount.mountPoint,
"err", err)
continue
}

devicesDone[mount.device] = struct{}{}
fsStats[fsID] = &btrfsIoctlFsStats{
uuid: fsID,
devices: deviceStats,
}
}

return fsStats, nil
}

func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]btrfsIoctlFsDevStats, error) {
devices := make([]btrfsIoctlFsDevStats, 0, fsInfo.NumDevices)

for i := uint64(0); i <= fsInfo.MaxID; i++ {
deviceInfo, err := fs.GetDevInfo(i)

if err != nil {
if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENODEV {
// device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this
continue
}
return nil, err
}

deviceStats, err := fs.GetDevStats(i)
if err != nil {
return nil, err
}

devices = append(devices, btrfsIoctlFsDevStats{
path: deviceInfo.Path,
uuid: deviceInfo.UUID.String(),
bytesUsed: deviceInfo.BytesUsed,
totalBytes: deviceInfo.TotalBytes,

writeErrs: deviceStats.WriteErrs,
readErrs: deviceStats.ReadErrs,
flushErrs: deviceStats.FlushErrs,
corruptionErrs: deviceStats.CorruptionErrs,
generationErrs: deviceStats.GenerationErrs,
})

if uint64(len(devices)) == fsInfo.NumDevices {
break
}
}

return devices, nil
}

// btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric.
type btrfsMetric struct {
name string
metricType prometheus.ValueType
desc string
value float64
extraLabel []string
extraLabelValue []string
}

// updateBtrfsStats collects statistics for one bcache ID.
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats) {
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) {
const subsystem = "btrfs"

// Basic information about the filesystem.
devLabels := []string{"uuid"}

// Retrieve the metrics.
metrics := c.getMetrics(s)
metrics := c.getMetrics(s, ioctlStats)

// Convert all gathered metrics to Prometheus Metrics and add to channel.
for _, m := range metrics {
Expand All @@ -99,46 +248,112 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.

ch <- prometheus.MustNewConstMetric(
desc,
prometheus.GaugeValue,
m.metricType,
m.value,
labelValues...,
)
}
}

// getMetrics returns metrics for the given Btrfs statistics.
func (c *btrfsCollector) getMetrics(s *btrfs.Stats) []btrfsMetric {
func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) []btrfsMetric {
metrics := []btrfsMetric{
{
name: "info",
desc: "Filesystem information",
value: 1,
metricType: prometheus.GaugeValue,
extraLabel: []string{"label"},
extraLabelValue: []string{s.Label},
},
{
name: "global_rsv_size_bytes",
desc: "Size of global reserve.",
value: float64(s.Allocation.GlobalRsvSize),
name: "global_rsv_size_bytes",
desc: "Size of global reserve.",
metricType: prometheus.GaugeValue,
value: float64(s.Allocation.GlobalRsvSize),
},
}

// Information about devices.
for n, dev := range s.Devices {
metrics = append(metrics, btrfsMetric{
name: "device_size_bytes",
desc: "Size of a device that is part of the filesystem.",
value: float64(dev.Size),
extraLabel: []string{"device"},
extraLabelValue: []string{n},
})
}

// Information about data, metadata and system data.
metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...)
metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...)
metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...)

// Information about devices.
if ioctlStats == nil {
for n, dev := range s.Devices {
metrics = append(metrics, btrfsMetric{
name: "device_size_bytes",
desc: "Size of a device that is part of the filesystem.",
metricType: prometheus.GaugeValue,
value: float64(dev.Size),
extraLabel: []string{"device"},
extraLabelValue: []string{n},
})
}
return metrics
}

for _, dev := range ioctlStats.devices {
// trim the path prefix from the device name so the value should match
// the value used in the fallback branch above.
// e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1
_, device := path.Split(dev.path)

extraLabels := []string{"device", "btrfs_dev_uuid"}
extraLabelValues := []string{device, dev.uuid}

metrics = append(metrics,
btrfsMetric{
name: "device_size_bytes",
desc: "Size of a device that is part of the filesystem.",
metricType: prometheus.GaugeValue,
value: float64(dev.totalBytes),
extraLabel: extraLabels,
extraLabelValue: extraLabelValues,
},
// A bytes available metric is probably more useful than a
// bytes used metric, because large numbers of bytes will
// suffer from floating point representation issues
// and we probably care more about the number when it's low anyway
btrfsMetric{
name: "device_unused_bytes",
desc: "Unused bytes unused on a device that is part of the filesystem.",
metricType: prometheus.GaugeValue,
value: float64(dev.totalBytes - dev.bytesUsed),
extraLabel: extraLabels,
extraLabelValue: extraLabelValues,
})

errorLabels := append([]string{"type"}, extraLabels...)
values := []uint64{
dev.writeErrs,
dev.readErrs,
dev.flushErrs,
dev.corruptionErrs,
dev.generationErrs,
}
btrfsErrorTypeNames := []string{
"write",
"read",
"flush",
"corruption",
"generation",
}

for i, errorType := range btrfsErrorTypeNames {
metrics = append(metrics,
btrfsMetric{
name: "device_errors_total",
desc: "Errors reported for the device",
metricType: prometheus.CounterValue,
value: float64(values[i]),
extraLabel: errorLabels,
extraLabelValue: append([]string{errorType}, extraLabelValues...),
})
}
}

return metrics
}

Expand All @@ -148,6 +363,7 @@ func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats)
{
name: "reserved_bytes",
desc: "Amount of space reserved for a data type",
metricType: prometheus.GaugeValue,
value: float64(s.ReservedBytes),
extraLabel: []string{"block_group_type"},
extraLabelValue: []string{a},
Expand All @@ -168,20 +384,23 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr
{
name: "used_bytes",
desc: "Amount of used space by a layout/data type",
metricType: prometheus.GaugeValue,
value: float64(s.UsedBytes),
extraLabel: []string{"block_group_type", "mode"},
extraLabelValue: []string{a, l},
},
{
name: "size_bytes",
desc: "Amount of space allocated for a layout/data type",
metricType: prometheus.GaugeValue,
value: float64(s.TotalBytes),
extraLabel: []string{"block_group_type", "mode"},
extraLabelValue: []string{a, l},
},
{
name: "allocation_ratio",
desc: "Data allocation ratio for a layout/data type",
metricType: prometheus.GaugeValue,
value: s.Ratio,
extraLabel: []string{"block_group_type", "mode"},
extraLabelValue: []string{a, l},
Expand Down
Loading