diff --git a/cmd/zfs_object_agent/server/src/main.rs b/cmd/zfs_object_agent/server/src/main.rs
index 86ebc7dd22f6..be0b24ea4a2b 100644
--- a/cmd/zfs_object_agent/server/src/main.rs
+++ b/cmd/zfs_object_agent/server/src/main.rs
@@ -11,6 +11,7 @@ use util::writeln_stderr;
 use util::TrackingAllocator;
 use util::ALLOCATOR_PRINT_MIN_ALLOCS;
 use util::ALLOCATOR_PRINT_MIN_BYTES;
+use zettacache::base_types::CacheGuid;
 use zettacache::CacheOpenMode;
 use zettaobject::object_access::BlobCredentials;
 use zettaobject::object_access::ObjectAccessProtocol;
@@ -90,7 +91,7 @@ struct Cli {
         value_name = "GUID",
         conflicts_with = "cache-device"
     )]
-    guid: Option<u64>,
+    guid: Option<CacheGuid>,
 
     /// Clear the cache when it has incompatible features
     #[clap(long)]
diff --git a/cmd/zfs_object_agent/zcdb/src/main.rs b/cmd/zfs_object_agent/zcdb/src/main.rs
index a13af5ca2b3c..35e97389b350 100644
--- a/cmd/zfs_object_agent/zcdb/src/main.rs
+++ b/cmd/zfs_object_agent/zcdb/src/main.rs
@@ -3,6 +3,7 @@ use std::path::PathBuf;
 use clap::Parser;
 use clap::Subcommand;
 use git_version::git_version;
+use zettacache::base_types::CacheGuid;
 use zettacache::CacheOpenMode;
 use zettacache::DumpSlabsOptions;
 use zettacache::DumpStructuresOptions;
@@ -33,7 +34,7 @@ struct Cli {
 
     /// Specific cache GUID to look for in cache device directory
    #[clap(short = 'g', long, value_name = "GUID")]
-    guid: Option<u64>,
+    guid: Option<CacheGuid>,
 
     /// Sets the verbosity level for logging and debugging
     #[clap(short = 'v', long, parse(from_occurrences), global = true)]
diff --git a/cmd/zfs_object_agent/zettacache/src/base_types.rs b/cmd/zfs_object_agent/zettacache/src/base_types.rs
index 6afc69ebc949..f0aa6ffcc1d7 100644
--- a/cmd/zfs_object_agent/zettacache/src/base_types.rs
+++ b/cmd/zfs_object_agent/zettacache/src/base_types.rs
@@ -2,8 +2,10 @@ use std::borrow::Borrow;
 use std::cmp::min;
 use std::fmt::*;
 use std::num::NonZeroU64;
+use std::num::ParseIntError;
 use std::ops::Add;
 use std::ops::Sub;
 
 use more_asserts::*;
 use serde::Deserialize;
+use std::str::FromStr;
@@ -18,6 +20,38 @@ impl Display for PoolGuid {
     }
 }
 
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Hash)]
+pub struct CacheGuid(pub u64);
+impl CacheGuid {
+    pub fn new() -> Self {
+        CacheGuid(rand::random())
+    }
+}
+impl Default for CacheGuid {
+    fn default() -> Self {
+        CacheGuid::new()
+    }
+}
+impl FromStr for CacheGuid {
+    type Err = ParseIntError;
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        s.parse().map(CacheGuid)
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub struct DiskGuid(pub u64);
+impl DiskGuid {
+    pub fn new() -> Self {
+        DiskGuid(rand::random())
+    }
+}
+impl Default for DiskGuid {
+    fn default() -> Self {
+        DiskGuid::new()
+    }
+}
+
 #[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)]
 pub struct BlockId(pub u64);
 impl Display for BlockId {
diff --git a/cmd/zfs_object_agent/zettacache/src/features.rs b/cmd/zfs_object_agent/zettacache/src/features.rs
index 9563ce87cc14..362140708411 100644
--- a/cmd/zfs_object_agent/zettacache/src/features.rs
+++ b/cmd/zfs_object_agent/zettacache/src/features.rs
@@ -28,6 +28,7 @@ lazy_static! {
         SLAB_ALLOCATOR.clone(),
         SLAB_SIZE_32MB.clone(),
         TRIMMED_INDEX.clone(),
+        DISK_GUIDS.clone(),
     ]
     .map(|feature: Feature| (feature.name, feature.info))
     .into_iter()
@@ -48,6 +49,10 @@ lazy_static! {
         name: FeatureName("com.delphix:trimmed_index".to_string()),
         info: FeatureType::Upgradeable
     };
+    pub static ref DISK_GUIDS: Feature = Feature {
+        name: FeatureName("com.delphix:disk_guids".to_string()),
+        info: FeatureType::Upgradeable
+    };
 }
 
 #[derive(Debug)]
diff --git a/cmd/zfs_object_agent/zettacache/src/open.rs b/cmd/zfs_object_agent/zettacache/src/open.rs
index 39a968ac9537..c6ab7cb87237 100644
--- a/cmd/zfs_object_agent/zettacache/src/open.rs
+++ b/cmd/zfs_object_agent/zettacache/src/open.rs
@@ -16,6 +16,8 @@ use tokio::fs::File;
 use tokio::io::AsyncReadExt;
 use util::message::SUPERBLOCK_SIZE;
 
+use crate::base_types::CacheGuid;
+use crate::base_types::DiskGuid;
 use crate::base_types::DiskId;
 use crate::block_access::BlockAccess;
 use crate::superblock::SuperblockPhys;
@@ -23,8 +25,8 @@ use crate::superblock::SuperblockPhys;
 #[derive(Debug, Clone)]
 pub enum CacheOpenMode {
     DeviceList(Vec<PathBuf>),
-    DiscoveryDirectory(PathBuf, Option<u64>), // u64 is the cache guid
-    None,                                     // no zettacache
+    DiscoveryDirectory(PathBuf, Option<CacheGuid>),
+    None, // no zettacache
 }
 
 impl CacheOpenMode {
@@ -73,8 +75,8 @@ impl DiscoveredDevice {
     }
 }
 
-async fn discover_devices(dir_path: &Path, target_guid: Option<u64>) -> Result<Vec<PathBuf>> {
-    let mut caches = HashMap::<u64, BTreeMap<DiskId, DiscoveredDevice>>::new();
+async fn discover_devices(dir_path: &Path, target_guid: Option<CacheGuid>) -> Result<Vec<PathBuf>> {
+    let mut caches = HashMap::<_, BTreeMap<_, _>>::new();
     let mut discovery = FuturesUnordered::new();
     let mut canonical_entries = HashSet::new();
 
@@ -121,28 +123,26 @@ async fn discover_devices(dir_path: &Path, target_guid: Option<u64>) -> Result<Vec<PathBuf>> {
         match result {
             Ok(device) => {
                 debug!("discovery: found device: {device:?}");
-                let cache_guid = device.superblock.guid;
+                let cache_guid = device.superblock.cache_guid;
                 let cache = caches.entry(cache_guid).or_default();
-                if let Some(old_device) = cache.insert(device.superblock.disk, device) {
-                    // If we crash in the middle of a zcache-add for one device
-                    // and then do zcache-add for another device, we may get
-                    // into a situation where discovery runs into two devices
-                    // with the same DiskID and cache GUID. Until we make cache
-                    // device import more deterministic (see DLPX-81000) error
-                    // out.
+                let disk_guid = device.superblock.disk_guid;
+                if let Some(old_device) = cache.insert((device.superblock.disk, disk_guid), device)
+                {
                     return Err(anyhow!(
-                        "found two disks with {:?} for cache {cache_guid}",
-                        old_device.superblock.disk
+                        "found two disks with {:?} for {cache_guid:?} with {disk_guid:?}",
+                        old_device.superblock.disk,
                     ));
                 }
             }
             Err(why) => debug!("discovery: error: {why:?}"),
         };
     }
-    filter_invalid_caches(&mut caches);
+
+    caches.retain(|&guid, disks| is_valid_cache(guid, disks));
     if let Some(guid) = target_guid {
         caches.retain(|cache_guid, _| *cache_guid == guid);
     }
+
     match caches.values().next() {
         Some(cache) => {
             if caches.len() > 1 {
@@ -161,25 +161,51 @@ async fn discover_devices(dir_path: &Path, target_guid: Option<u64>) -> Result<Vec<PathBuf>> {
     }
 }
 
-fn filter_invalid_caches(caches: &mut HashMap<u64, BTreeMap<DiskId, DiscoveredDevice>>) {
-    // Only retain caches that have a primary block
-    caches.retain(|_, disks| disks.values().any(|disk| disk.superblock.primary.is_some()));
-
-    // Only retain caches whose devices we've discovered
-    caches.retain(|cache_guid, disks| {
-        let mut disk_ids_from_discovery = HashSet::new();
-        let mut disk_ids_from_primary = HashSet::new();
-        for (&disk_id, disk) in disks {
-            disk_ids_from_discovery.insert(disk_id);
-            if let Some(primary) = &disk.superblock.primary {
-                disk_ids_from_primary = primary.disks.keys().copied().collect();
-            }
+fn is_valid_cache(
+    cache_guid: CacheGuid,
+    disks: &mut BTreeMap<(DiskId, Option<DiskGuid>), DiscoveredDevice>,
+) -> bool {
+    let primary = disks
+        .values()
+        .find(|&disk| disk.superblock.primary.is_some());
+
+    match primary {
+        Some(disk) => {
+            let disks_from_primary = disk
+                .superblock
+                .primary
+                .as_ref()
+                .unwrap()
+                .disks
+                .iter()
+                .map(|(k, v)| (*k, v.guid))
+                .collect::<BTreeMap<_, _>>();
+
+            // Only retain disks with IDs that are part of the primary block
+            disks.retain(
+                |(id, disk_guid), _| match (disks_from_primary.get(id), *disk_guid) {
+                    (Some(&guid_from_primary), Some(guid_from_superblock)) => {
+                        guid_from_superblock == guid_from_primary
+                    }
+                    (Some(_guid_from_primary), None) => true,
+                    (None, _) => false,
+                },
+            );
+
+            // If we are missing any device mentioned in the primary block,
+            // then this cache is invalid
+            let disks_ids_from_discovery = disks
+                .keys()
+                .map(|(id, _)| id)
+                .copied()
+                .collect::<HashSet<_>>();
+            let disk_ids_from_primary = disks_from_primary.keys().copied().collect::<HashSet<_>>();
+            disk_ids_from_primary
+                .difference(&disks_ids_from_discovery)
+                .inspect(|id| info!("cache {cache_guid:?} can't find {id:?}"))
+                .count()
+                == 0
         }
-        assert!(!disk_ids_from_primary.is_empty());
-        disk_ids_from_primary
-            .difference(&disk_ids_from_discovery)
-            .inspect(|id| info!("cache {cache_guid} can't find {id:?}"))
-            .count()
-            == 0
-    });
+        None => false,
+    }
 }
diff --git a/cmd/zfs_object_agent/zettacache/src/superblock.rs b/cmd/zfs_object_agent/zettacache/src/superblock.rs
index 06276f7d2f3f..2738595d8f5b 100644
--- a/cmd/zfs_object_agent/zettacache/src/superblock.rs
+++ b/cmd/zfs_object_agent/zettacache/src/superblock.rs
@@ -25,7 +25,10 @@ pub const SUPERBLOCK_SIZE: u64 = util::message::SUPERBLOCK_SIZE as u64;
 pub struct SuperblockPhys {
     pub primary: Option<PrimaryPhys>,
     pub disk: DiskId,
-    pub guid: u64,
+    #[serde(alias = "guid")]
+    pub cache_guid: CacheGuid,
+    #[serde(default)]
+    pub disk_guid: Option<DiskGuid>,
 }
 
 /// Subset of SuperblockPhys that's needed to get the feature flags.
@@ -33,19 +36,24 @@ pub struct SuperblockPhys {
 struct SuperblockFeaturesPhys {
     primary: Option<PrimaryFeaturesPhys>,
     disk: DiskId,
-    guid: u64,
+    #[serde(alias = "guid")]
+    cache_guid: CacheGuid,
 }
 
 /// State stored about every disk
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct DiskPhys {
     pub size: u64,
-    // XXX put sector size in here too and verify it matches what the disk says now?
+    #[serde(default)]
+    pub guid: DiskGuid,
 }
 
 impl DiskPhys {
     pub fn new(size: u64) -> Self {
-        Self { size }
+        Self {
+            size,
+            guid: DiskGuid::new(),
+        }
     }
 }
 
@@ -55,6 +63,8 @@ pub struct PrimaryPhys {
     pub checkpoint_id: CheckpointId,
     pub feature_flags: Vec<FeatureName>,
     pub disks: BTreeMap<DiskId, DiskPhys>,
+    #[serde(default)]
+    pub sector_size: Option<u64>,
 
     // Each extent is a single slab, but the last extent can be a fraction of a
     // slab. The remainder of that slab is uninitialized padding.
@@ -68,17 +78,27 @@ pub struct PrimaryFeaturesPhys {
 }
 
 impl PrimaryPhys {
-    pub fn new(disks: BTreeMap<DiskId, DiskPhys>, checkpoint_extents: Vec<Extent>) -> Self {
+    pub fn new(
+        disks: BTreeMap<DiskId, DiskPhys>,
+        sector_size: u64,
+        checkpoint_extents: Vec<Extent>,
+    ) -> Self {
         PrimaryPhys {
             checkpoint_id: CheckpointId(0),
             feature_flags: SUPPORTED_FEATURES.keys().cloned().collect(),
             disks,
+            sector_size: Some(sector_size),
             checkpoint: checkpoint_extents,
         }
     }
 
     /// Write superblocks to all disks.
-    pub async fn write_all(&self, primary_disk: DiskId, guid: u64, block_access: &BlockAccess) {
+    pub async fn write_all(
+        &self,
+        primary_disk: DiskId,
+        cache_guid: CacheGuid,
+        block_access: &BlockAccess,
+    ) {
         // Write the non-primary disks first, so that newly-added disks will
         // have their superblocks present before the primary superblock is
         // updated to indicate that they are part of the cache. If we wrote all
@@ -92,7 +112,8 @@ impl PrimaryPhys {
             let phys = SuperblockPhys {
                 primary: None,
                 disk,
-                guid,
+                cache_guid,
+                disk_guid: Some(self.disks.get(&disk).unwrap().guid),
             };
             phys.write(block_access, disk).await;
         })
@@ -103,7 +124,8 @@ impl PrimaryPhys {
         let phys = SuperblockPhys {
             primary: Some(self.clone()),
             disk: primary_disk,
-            guid,
+            cache_guid,
+            disk_guid: Some(self.disks.get(&primary_disk).unwrap().guid),
         };
         phys.write(block_access, primary_disk).await;
     }
@@ -114,17 +136,19 @@ impl PrimaryPhys {
             .map(|phys| phys.feature_flags)
     }
 
-    /// Return value is (Self, primary_disk, guid, extra_disks)
-    pub async fn read(block_access: &BlockAccess) -> Result<(Self, DiskId, u64, Vec<DiskId>)> {
+    /// Return value is (Self, primary_disk, cache_guid, extra_disks)
+    pub async fn read(
+        block_access: &BlockAccess,
+    ) -> Result<(Self, DiskId, CacheGuid, Vec<DiskId>)> {
         let results = SuperblockPhys::read_all(block_access).await;
 
-        let (primary, primary_disk, guid) = results
+        let (mut primary, primary_disk, cache_guid) = results
            .iter()
             .find_map(|result| {
                 if let Ok(phys) = result {
                     phys.primary
                         .as_ref()
-                        .map(|primary| (primary.clone(), phys.disk, phys.guid))
+                        .map(|primary| (primary.clone(), phys.disk, phys.cache_guid))
                 } else {
                     None
                 }
@@ -144,11 +168,21 @@ impl PrimaryPhys {
             // XXX proper error handling
             // XXX we should be able to reorder them?
             if let Ok(phys) = result {
-                assert_eq!(DiskId::new(id), phys.disk);
-                assert_eq!(phys.guid, guid);
+                let disk = DiskId::new(id);
+                assert_eq!(disk, phys.disk);
                 assert!(phys.primary.is_none() || phys.disk == primary_disk);
+                assert_eq!(phys.cache_guid, cache_guid);
+                if let Some(disk_guid) = phys.disk_guid {
+                    assert_eq!(disk_guid, primary.disks.get(&disk).unwrap().guid);
+                }
             }
         }
+        let sector_size = block_access.round_up_to_sector::<u64>(1);
+        if let Some(recorded_sector_size) = primary.sector_size {
+            assert_eq!(recorded_sector_size, sector_size);
+        } else {
+            primary.sector_size = Some(sector_size);
+        }
 
         assert_eq!(
             results.len() - extra_disks.len(),
@@ -159,7 +193,7 @@ impl PrimaryPhys {
             results.len() - extra_disks.len()
         );
 
-        Ok((primary, primary_disk, guid, extra_disks))
+        Ok((primary, primary_disk, cache_guid, extra_disks))
     }
 }
 
@@ -213,11 +247,12 @@ impl SuperblockPhys {
         for block_access_disk_id in block_access.disks() {
             match Self::read_impl(block_access, block_access_disk_id).await {
                 Ok(superblock) => writeln_stdout!(
-                    "{:?} - Path: {:?} Size: {} GUID: {} Primary?: {}",
+                    "{:?} - Path: {:?} Size: {} {:?} {:?} Primary?: {}",
                     superblock.disk,
                     block_access.disk_path(block_access_disk_id),
                     nice_p2size(block_access.disk_size(block_access_disk_id)),
-                    superblock.guid,
+                    superblock.disk_guid,
+                    superblock.cache_guid,
                     match &superblock.primary {
                         Some(primary) => format!("{:#?}", primary),
                         None => "No".to_string(),
@@ -233,17 +268,17 @@ impl SuperblockPhys {
 }
 
 impl PrimaryFeaturesPhys {
-    /// Return value is (Self, primary_disk, guid, extra_disks)
+    /// Return value is (Self, primary_disk, cache_guid, extra_disks)
     pub async fn read(block_access: &BlockAccess) -> Result<Self> {
         let results = SuperblockFeaturesPhys::read_all(block_access).await;
 
-        let (primary, primary_disk, guid) = results
+        let (primary, primary_disk, cache_guid) = results
            .iter()
             .find_map(|result| {
                 if let Ok(phys) = result {
                     phys.primary
                         .as_ref()
-                        .map(|primary| (primary.clone(), phys.disk, phys.guid))
+                        .map(|primary| (primary.clone(), phys.disk, phys.cache_guid))
                 } else {
                     None
                 }
@@ -255,7 +290,7 @@ impl PrimaryFeaturesPhys {
             // XXX we should be able to reorder them?
             if let Ok(phys) = result {
                 assert_eq!(DiskId::new(id), phys.disk);
-                assert_eq!(phys.guid, guid);
+                assert_eq!(phys.cache_guid, cache_guid);
                 assert!(phys.primary.is_none() || phys.disk == primary_disk);
             }
         }
diff --git a/cmd/zfs_object_agent/zettacache/src/zettacache/mod.rs b/cmd/zfs_object_agent/zettacache/src/zettacache/mod.rs
index 284e8220212e..28650812b747 100644
--- a/cmd/zfs_object_agent/zettacache/src/zettacache/mod.rs
+++ b/cmd/zfs_object_agent/zettacache/src/zettacache/mod.rs
@@ -211,7 +211,7 @@ type PendingChanges = BTreeMap<BlockId, PendingChange>;
 struct Locked {
     block_access: Arc<BlockAccess>,
     primary: PrimaryPhys,
-    guid: u64,
+    guid: CacheGuid,
     primary_disk: DiskId,
     block_allocator: BlockAllocator,
     pending_changes: PendingChanges,
@@ -449,7 +449,7 @@ impl Inner {
         }
         let block_access = BlockAccess::new(disks, false);
 
-        let guid: u64 = rand::random();
+        let guid = CacheGuid::new();
         let total_capacity = block_access.total_capacity();
 
         info!("creating cache from {} disks", block_access.disks().count());
@@ -483,6 +483,7 @@ impl Inner {
                 .disks()
                 .map(|disk| (disk, DiskPhys::new(block_access.disk_size(disk))))
                 .collect(),
+            block_access.round_up_to_sector(1),
             checkpoint_extents,
         )
         .write_all(DiskId::new(0), guid, &block_access)
@@ -605,7 +606,7 @@ impl Inner {
         }
 
         info!(
-            "opening ZettaCache {} with {} disks",
+            "opening ZettaCache {:?} with {} disks",
             guid,
             primary.disks.len()
         );
diff --git a/cmd/zfs_object_agent/zettacache/src/zettacache/zcdb.rs b/cmd/zfs_object_agent/zettacache/src/zettacache/zcdb.rs
index 1128a460d8f0..d4ab0c154b3a 100644
--- a/cmd/zfs_object_agent/zettacache/src/zettacache/zcdb.rs
+++ b/cmd/zfs_object_agent/zettacache/src/zettacache/zcdb.rs
@@ -8,6 +8,7 @@ use util::writeln_stderr;
 use util::writeln_stdout;
 
 use super::CheckpointPhys;
+use crate::base_types::CacheGuid;
 use crate::base_types::DiskId;
 use crate::block_access::BlockAccess;
 use crate::block_access::Disk;
@@ -24,7 +25,7 @@ pub struct ZCacheDBHandle {
     block_access: Arc<BlockAccess>,
     primary: PrimaryPhys,
     primary_disk: DiskId,
-    guid: u64,
+    guid: CacheGuid,
     checkpoint: Arc<CheckpointPhys>,
     slab_builder: SlabAllocatorBuilder,
 }
@@ -74,7 +75,7 @@ impl ZCacheDBHandle {
     pub async fn dump_space(&self) {
         writeln_stdout!("Superblock: {}", nice_p2size(SUPERBLOCK_SIZE));
-        writeln_stdout!(" Primary {:?}, GUID: {}", self.primary_disk, self.guid);
+        writeln_stdout!(" Primary {:?}, {:?}", self.primary_disk, self.guid);
         writeln_stdout!();
 
         let slabs_capacity = self
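
The on-disk compatibility in this change comes from serde attributes rather than a format bump: `#[serde(alias = "guid")]` lets the renamed `cache_guid` field still deserialize superblocks written before the rename, and `#[serde(default)]` lets the new `disk_guid` and `sector_size` fields be absent on old caches. Below is a minimal standalone sketch of that behavior; the struct is a simplified stand-in (for example `disk` is a bare `u64` rather than `DiskId`) and JSON is used only to keep the example runnable, not as the agent's actual superblock encoding.

```rust
use serde::Deserialize;
use serde::Serialize;

// Simplified stand-ins for the patch's newtypes; names mirror the diff, but the
// trimmed-down fields and the JSON encoding are assumptions made for this sketch.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)]
struct CacheGuid(u64);

#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)]
struct DiskGuid(u64);

#[derive(Deserialize, Debug)]
struct SuperblockPhys {
    disk: u64,
    // Old superblocks serialized this field as "guid"; the alias keeps them readable.
    #[serde(alias = "guid")]
    cache_guid: CacheGuid,
    // Old superblocks predate disk_guid entirely; default() fills in None.
    #[serde(default)]
    disk_guid: Option<DiskGuid>,
}

fn main() -> Result<(), serde_json::Error> {
    // A superblock written before this change (only "guid", no disk_guid).
    let old: SuperblockPhys = serde_json::from_str(r#"{"disk": 0, "guid": 42}"#)?;
    // A superblock written by the new code.
    let new: SuperblockPhys =
        serde_json::from_str(r#"{"disk": 1, "cache_guid": 42, "disk_guid": 7}"#)?;

    assert_eq!(old.cache_guid, CacheGuid(42));
    assert_eq!(old.disk_guid, None);
    assert_eq!(new.disk_guid, Some(DiskGuid(7)));
    println!("old: {old:?}");
    println!("new: {new:?}");
    Ok(())
}
```

One caveat: `alias` and `default` only help when the underlying encoding is self-describing enough to carry field names; whether the agent's real superblock encoding satisfies that is not shown in this diff.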