Commit 578c53e

Merge pull request #2503 from phillxnet/#2394_Improve_'Maintenance_required'_advice_for_btrfs-single/btrfs-raid0

Improve 'Maintenance required' advice #2394
phillxnet authored Mar 3, 2023
2 parents f397bc9 + f016264 commit 578c53e
Showing 7 changed files with 266 additions and 93 deletions.
88 changes: 67 additions & 21 deletions src/rockstor/fs/btrfs.py
@@ -96,6 +96,35 @@
DefaultSubvol = collections.namedtuple("DefaultSubvol", "id path boot_to_snap")
# Named Tuple for balance status: active (boolean) internal (boolean) status (dict)
BalanceStatusAll = collections.namedtuple("BalanceStatusAll", "active internal status")
# Named Tuple to define raid profile limits
btrfs_profile = collections.namedtuple("btrfs_profile", "min_dev_count max_dev_missing")
# Dict of profiles indexed by their name,
# e.g. PROFILE[raid_level].min_dev_count
# N.B. Mixed profiles are indicated by "-", i.e. DATA-METADATA.
PROFILE = {
# Non-redundant profiles:
"single": btrfs_profile(min_dev_count=1, max_dev_missing=0),
"raid0": btrfs_profile(min_dev_count=2, max_dev_missing=0),
# Mirrored profiles:
"raid1": btrfs_profile(min_dev_count=2, max_dev_missing=1),
"raid1c3": btrfs_profile(min_dev_count=3, max_dev_missing=2),
"raid1c4": btrfs_profile(min_dev_count=4, max_dev_missing=3),
"raid10": btrfs_profile(min_dev_count=4, max_dev_missing=1),
# Parity raid levels
"raid5": btrfs_profile(min_dev_count=2, max_dev_missing=1),
"raid6": btrfs_profile(min_dev_count=3, max_dev_missing=2),
# ------- MIXED PROFILES DATA-METADATA (max 10 chars) -------
# Mixed Mirrored profiles:
"raid1-1c3": btrfs_profile(min_dev_count=3, max_dev_missing=1),
"raid1-1c4": btrfs_profile(min_dev_count=4, max_dev_missing=1),
"raid10-1c3": btrfs_profile(min_dev_count=4, max_dev_missing=1),
"raid10-1c4": btrfs_profile(min_dev_count=4, max_dev_missing=1),
# Parity data - Mirrored metadata
"raid5-1": btrfs_profile(min_dev_count=2, max_dev_missing=1),
"raid5-1c3": btrfs_profile(min_dev_count=3, max_dev_missing=1),
"raid6-1c3": btrfs_profile(min_dev_count=3, max_dev_missing=2),
"raid6-1c4": btrfs_profile(min_dev_count=4, max_dev_missing=2),
}


def add_pool(pool, disks):
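
As an aside, a minimal sketch (not part of this commit) of how the new PROFILE map might be consulted downstream; the redundancy_exceeded helper and its arguments are hypothetical, illustrating only the namedtuple lookup:

# Hypothetical helper, illustrating the PROFILE[raid_level].max_dev_missing lookup.
def redundancy_exceeded(raid_level, missing_count):
    """Return True if more devices are missing than the profile tolerates."""
    return missing_count > PROFILE[raid_level].max_dev_missing

# E.g.: a raid1 pool missing 1 device is degraded but within tolerance,
# while raid0 tolerates no missing devices at all.
# redundancy_exceeded("raid1", 1) -> False
# redundancy_exceeded("raid0", 1) -> True
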
@@ -197,29 +226,46 @@ def get_dev_io_error_stats(target, json_format=True):
return json.dumps(stats)


def is_pool_missing_dev(label):
def pool_missing_dev_count(label):
"""
Simple and fast wrapper around 'btrfs fi show --raw label' to return True /
False depending on if a device is reported missing from the given pool by
label. Works by matching the end of output lines for the string 'missing',
after lower the case of the line.
Parses 'btrfs fi show --raw label' to return the number of missing devices.
Extracts the volume's total device count from e.g.: "\tTotal devices 3 FS bytes used 2.63GiB",
then counts the lines thereafter beginning with "\tdevid" and not ending
in "SING" or "sing" (for "MISSING"/"missing").
Label is used as this is preserved in our Pool db, so it will work if the pool
fails to mount, and thereby allows surfacing this as a potential reason
for the mount failure.
:param label: Pool label.
:return: True if at least one device was found to be missing, False if not.
:return: int for number of missing devices (total - attached).
"""
if label is None:
return False
return 0
# --raw used to minimise pre-processing of irrelevant 'used' info (units).
cmd = [BTRFS, "fi", "show", "--raw", label]
o, e, rc = run_command(cmd)
total_devices = 0
attached_devids = 0
for line in o:
if not line:
continue
if line.lower().endswith("missing"):
return True
return False
# Skip "Label:" line as it has no 'missing' info.
# Skip "warning, device 8 is missing" lines as they only appear when unmounted.
# Skip "(TAB)*** Some devices missing" we count devid lines no ending in MISSING
if line.startswith(("Lab", "war", "\t**")):
continue
if line.startswith("\tTotal"):
total_devices = int(line.split()[2])
continue
if not total_devices == 0:
# Leap 15.4 default & backport kernels (not missing)
# devid 5 size 5.00GiB used 2.12GiB path /dev/sda
# Newer Stable Kernel Backport (e.g. 6.2.0+) adds a MISSING:
# older kernels do not have entries for missing devices.
# devid 1 size 0 used 0 path MISSING
if line.startswith("\tdev") and not line.endswith(("SING", "sing")):
attached_devids += 1
return total_devices - attached_devids


def degraded_pools_found():
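
For illustration, the counting approach described in the docstring can be condensed into a standalone sketch (not part of this commit); the sample lines are drawn from the new unit tests below:

# Condensed sketch of the counting logic in pool_missing_dev_count().
sample_output = [
    "Label: 'test-pool-new-kernel' uuid: 2c680ff8-9687-4356-87db-e48d23749d80",
    "\tTotal devices 3 FS bytes used 2829742080",
    "\tdevid 5 size 5368709120 used 1207959552 path /dev/sda",
    "\tdevid 7 size 5368709120 used 1207959552 path /dev/vdb",
    "\tdevid 8 size 0 used 0 path /dev/vda MISSING",
    "",
]
total_devices = 0
attached_devids = 0
for line in sample_output:
    if not line or line.startswith(("Lab", "war", "\t**")):
        continue  # skip blank, "Label:", "warning, ...", and "*** Some devices missing" lines
    if line.startswith("\tTotal"):
        total_devices = int(line.split()[2])  # "Total devices 3 ..." -> 3
    elif line.startswith("\tdev") and not line.endswith(("SING", "sing")):
        attached_devids += 1  # devid lines not flagged as MISSING
print(total_devices - attached_devids)  # -> 1 missing device
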
@@ -1896,7 +1942,7 @@ def balance_status(pool):
:return: dictionary containing parsed info about the balance status,
i.e. indexed by 'status' and 'percent_done'.
"""
stats = {"status": u"unknown"}
stats = {"status": "unknown"}
# The balance status of an unmounted pool is undetermined / unknown, i.e. it
# could still be mid balance: our balance status command requires a
# relevant active mount path.
@@ -1914,13 +1960,13 @@ def balance_status(pool):
if len(out) > 0:
if re.match("Balance", out[0]) is not None:
if re.search("cancel requested", out[0]) is not None:
stats["status"] = u"cancelling"
stats["status"] = "cancelling"
elif re.search("pause requested", out[0]) is not None:
stats["status"] = u"pausing"
stats["status"] = "pausing"
elif re.search("paused", out[0]) is not None:
stats["status"] = u"paused"
stats["status"] = "paused"
else:
stats["status"] = u"running"
stats["status"] = "running"
# make sure we have a second line before parsing it.
if len(out) > 1 and re.search("chunks balanced", out[1]) is not None:
percent_left = out[1].split()[-2][:-1]
@@ -1930,7 +1976,7 @@ def balance_status(pool):
except:
pass
elif re.match("No balance", out[0]) is not None:
stats["status"] = u"finished"
stats["status"] = "finished"
stats["percent_done"] = 100
return stats
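
For reference, a hedged illustration (not from this commit) of the percent parsing above, assuming a second status line of the usual "chunks balanced ..., N% left" form:

# out[1] as typically reported by 'btrfs balance status' while a balance is running:
line_two = "3 out of about 10 chunks balanced (5 considered), 70% left"
percent_left = line_two.split()[-2][:-1]  # second-to-last token with '%' stripped -> "70"
percent_done = 100 - int(percent_left)    # -> 30, stored as stats["percent_done"]
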

@@ -2000,7 +2046,7 @@ def balance_status_internal(pool):
:return: dictionary containing parsed info about the balance status,
i.e. indexed by 'status' and 'percent_done'.
"""
stats = {"status": u"unknown"}
stats = {"status": "unknown"}
try:
mnt_pt = mount_root(pool)
except Exception as e:
@@ -2020,12 +2066,12 @@ def balance_status_internal(pool):
if fields[0] == "Unallocated:":
unallocated = int(fields[1])
if unallocated < 0:
stats["status"] = u"running"
stats["status"] = "running"
break
if unallocated >= 0:
# We have no 'tell' so report a finished balance as there is no
# evidence of one happening.
stats["status"] = u"finished"
stats["status"] = "finished"
stats["percent_done"] = 100
return stats

@@ -2044,10 +2090,10 @@ def balance_status_all(pool):
active = False
internal = False
status = balance_status(pool)
if status["status"] in [u"unknown", u"finished"]:
if status["status"] in ["unknown", "finished"]:
# Try internal balance detection as we don't have regular balance in-flight.
status_internal = balance_status_internal(pool)
if status_internal["status"] not in [u"unknown", u"finished"]:
if status_internal["status"] not in ["unknown", "finished"]:
internal = active = True
status = status_internal
else:
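
Finally, a hedged sketch of how a caller might consume the result, assuming balance_status_all() returns the BalanceStatusAll named tuple defined earlier in this file; the pool variable here is hypothetical:

# Illustrative caller-side use only; not part of this commit.
result = balance_status_all(pool)  # BalanceStatusAll(active, internal, status)
if result.active:
    kind = "internal" if result.internal else "regular"
    print("{} balance at {}% done".format(kind, result.status.get("percent_done", 0)))
else:
    print("no balance in flight, status: {}".format(result.status["status"]))
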
158 changes: 136 additions & 22 deletions src/rockstor/fs/tests/test_btrfs.py
@@ -35,7 +35,7 @@
balance_status_internal,
balance_status_all,
BalanceStatusAll,
is_pool_missing_dev,
pool_missing_dev_count,
btrfsprogs_legacy,
scrub_status_raw,
scrub_status_extra,
@@ -1665,9 +1665,9 @@ def test_device_scan_parameter(self):
msg="Failed to return results from non btrfs device.",
)

def test_is_pool_missing_dev(self):
def test_pool_missing_dev_count(self):
"""
Test is_pool_missing_dev() across various pool specific btrfs fi show outputs.
Test pool_missing_dev_count() across various pool specific btrfs fi show outputs.
"""
# More modern output where MISSING is now in upper case, and no longer only
# appears on the first or 3rd-last line.
@@ -1688,12 +1688,83 @@ def test_is_pool_missing_dev(self):
]
err = [[""]]
rc = [0]
expected_result = [True]
expected_result = [1]

# Leap 15.4 stable kernel backport 6.2.0-lp154.6.g09a9a65-default
# raid0 0 missing mounted:
pool_label.append("test-pool-new-kernel")
fi_show_out.append(
[
"Label: 'test-pool-new-kernel' uuid: 2c680ff8-9687-4356-87db-e48d23749d80",
"\tTotal devices 3 FS bytes used 2829742080",
"\tdevid 5 size 5368709120 used 1207959552 path /dev/sda",
"\tdevid 7 size 5368709120 used 1207959552 path /dev/vdb",
"\tdevid 8 size 5368709120 used 1207959552 path /dev/vda",
"",
"",
]
)
err.append([""])
rc.append(0)
expected_result.append(0)

# raid0 1 missing mounted (as dev removed live via virtio):
# Note: we have a path entry for the missing device on live removal.
pool_label.append("test-pool-new-kernel")
fi_show_out.append(
[
"Label: 'test-pool-new-kernel' uuid: 2c680ff8-9687-4356-87db-e48d23749d80",
"\tTotal devices 3 FS bytes used 2829742080",
"\tdevid 5 size 5368709120 used 1207959552 path /dev/sda",
"\tdevid 7 size 5368709120 used 1207959552 path /dev/vdb",
"\tdevid 8 size 0 used 0 path /dev/vda MISSING",
"",
"",
]
)
err.append([""])
rc.append(0)
expected_result.append(1)

# raid0 2 missing mounted (as devs removed live via virtio):
pool_label.append("test-pool-new-kernel")
fi_show_out.append(
[
"Label: 'test-pool-new-kernel' uuid: 2c680ff8-9687-4356-87db-e48d23749d80",
"\tTotal devices 3 FS bytes used 2829742080",
"\tdevid 5 size 5368709120 used 1207959552 path /dev/sda",
"\tdevid 7 size 0 used 0 path /dev/vdb MISSING",
"\tdevid 8 size 0 used 0 path /dev/vda MISSING",
"",
"",
]
)
err.append([""])
rc.append(0)
expected_result.append(2)

# raid0 2 missing, unmounted (after a reboot following the previous example):
pool_label.append("test-pool-new-kernel")
fi_show_out.append(
[
"warning, device 7 is missing",
"warning, device 8 is missing",
"Label: 'test-pool-new-kernel' uuid: 2c680ff8-9687-4356-87db-e48d23749d80",
"\tTotal devices 3 FS bytes used 2829742080",
"\tdevid 5 size 5368709120 used 1207959552 path /dev/sda",
"\t*** Some devices missing",
"",
"",
]
)
err.append(["ERROR: cannot read chunk root", ""])
rc.append(0)
expected_result.append(2)

# Leap 15.4 with default kernel of 5.14.21-150400.24.41-default:
# unmounted degraded pool:
pool_label = ["test-pool-default-kernel"]
fi_show_out = [
pool_label.append("test-pool-default-kernel")
fi_show_out.append(
[
"warning, device 1 is missing",
"Label: 'test-pool-default-kernel' uuid: 21345a94-f2bf-48d7-a2be-37734ffd2a48",
@@ -1705,15 +1776,40 @@ def test_is_pool_missing_dev(self):
"",
"",
]
]
err = [[""]]
rc = [0]
expected_result = [True]
)
err.append([""])
rc.append(0)
expected_result.append(1)

# Leap 15.4 with default kernel of 5.14.21-150400.24.41-default:
# unmounted degraded pool with redundancy exceeded
pool_label.append("test-pool-default-kernel")
fi_show_out.append(
[
"warning, device 8 is missing",
"warning, device 7 is missing",
"Label: 'test-pool-default-kernel' uuid: 2c680ff8-9687-4356-87db-e48d23749d80",
"\tTotal devices 3 FS bytes used 2829807616",
"\tdevid 5 size 5368709120 used 2281701376 path /dev/sda",
"\t*** Some devices missing",
"",
"",
]
)
err.append(
[
"bad tree block 41142992896, bytenr mismatch, want=41142992896, have=0",
"ERROR: cannot read chunk root",
"",
]
)
rc.append(0)
expected_result.append(2)

# Leap 15.4 with default kernel of 5.14.21-150400.24.41-default:
# mounted -o ro,degraded
pool_label = ["test-pool-default-kernel"]
fi_show_out = [
pool_label.append("test-pool-default-kernel")
fi_show_out.append(
[
"Label: 'test-pool-default-kernel' uuid: 21345a94-f2bf-48d7-a2be-37734ffd2a48",
"\tTotal devices 4 FS bytes used 4508352512",
@@ -1724,10 +1820,28 @@ def test_is_pool_missing_dev(self):
"",
"",
]
]
err = [[""]]
rc = [0]
expected_result = [True]
)
err.append([""])
rc.append(0)
expected_result.append(1)

# 6.2.0 raid1, 1 missing (removed live), still mounted rw and without -o degraded.
pool_label.append("test-pool-new-kernel")
fi_show_out.append(
[
"Label: 'test-pool-new-kernel' uuid: 2c680ff8-9687-4356-87db-e48d23749d80",
"\tTotal devices 4 FS bytes used 2829578240",
"\tdevid 5 size 5368709120 used 1342177280 path /dev/sda",
"\tdevid 7 size 5368709120 used 1342177280 path /dev/vdb",
"\tdevid 8 size 0 used 0 path /dev/vda MISSING",
"\tdevid 9 size 5368709120 used 3254779904 path /dev/sdc",
"",
"",
]
)
err.append([""])
rc.append(0)
expected_result.append(1)

pool_label.append("ROOT")
fi_show_out.append(
@@ -1741,24 +1855,24 @@ def test_is_pool_missing_dev(self):
)
err.append([""])
rc.append(0)
expected_result.append(False)
expected_result.append(0)

# Test for our return False on label = None.
# Test for our return 0 on label = None.
pool_label.append(None)
fi_show_out.append([""])
err.append([""])
rc.append(0)
expected_result.append(False)
expected_result.append(0)

# Cycle through each of the above mock_run_command data sets.
for label, out, e, r, result in zip(
pool_label, fi_show_out, err, rc, expected_result
):
self.mock_run_command.return_value = (out, e, r)
self.assertEqual(
is_pool_missing_dev(label),
pool_missing_dev_count(label),
result,
msg="Un-expected boolean returned: is_pool_missing_dev. Mock ({}) "
msg="Un-expected boolean returned: pool_missing_dev_count. Mock ({}) "
"return expected ({})".format(out, result),
)

@@ -1958,7 +2072,7 @@ def test_degraded_pools_found(self):
# A non-degraded mount of rock-pool-3 was then successfully achieved
# and the output thereafter showed no signs of issue.
# So we don't count this pool as degraded in degraded_pools_found().
# But it would still be counted as degraded by is_pool_missing_dev()!!
# But it would still be counted as degraded by pool_missing_dev_count()!!
# This diversity in degraded pool assessment may prove useful later.
fi_show_out.append(
[