From 6ae002460b8474abe7ebc3592605b88f53253387 Mon Sep 17 00:00:00 2001 From: neeraj pratap singh Date: Wed, 26 Jun 2024 22:29:16 +0530 Subject: [PATCH 001/148] mgr/vol : shortening the name of function Fixes: https://tracker.ceph.com/issues/66815 Introduced by:https://github.com/ceph/ceph/pull/55838#discussion_r1573655512 Signed-off-by: Neeraj Pratap Singh --- .../volumes/fs/operations/versions/metadata_manager.py | 2 +- .../mgr/volumes/fs/operations/versions/subvolume_v1.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py index 610a61e6a4c1f..146d6d3f453d6 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py @@ -172,7 +172,7 @@ def list_all_options_from_section(self, section): metadata_dict[option] = self.config.get(section,option) return metadata_dict - def list_all_keys_with_specified_values_from_section(self, section, value): + def filter_keys(self, section, value): keys = [] if self.config.has_section(section): options = self.config.options(section) diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py index 90f35a4c90b39..f037d5d2a1bbd 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py @@ -752,7 +752,7 @@ def get_pending_clones(self, snapname): try: if self.has_pending_clones(snapname): - pending_track_id_list = self.metadata_mgr.list_all_keys_with_specified_values_from_section('clone snaps', snapname) + pending_track_id_list = self.metadata_mgr.filter_keys('clone snaps', snapname) else: return pending_clones_info except MetadataMgrException as me: @@ -774,9 +774,9 @@ def get_pending_clones(self, snapname): raise VolumeException(-e.args[0], e.args[1]) else: try: - # If clone is completed between 'list_all_keys_with_specified_values_from_section' - # and readlink(track_id_path) call then readlink will fail with error ENOENT (2) - # Hence we double check whether track_id is exist in .meta file or not. + # If clone is completed between 'filter_keys' and readlink(track_id_path) call + # then readlink will fail with error ENOENT (2). Hence we double check whether + # track_id exists in .meta file or not. # Edge case scenario. # If track_id for clone exist but path /volumes/_index/clone/{track_id} not found # then clone is orphan. From 9ae2c89511be4b64c17974cd0fc6770641f9af4d Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 9 Jul 2024 18:49:52 +0530 Subject: [PATCH 002/148] qa/cephfs: ignore when specific OSD is reported down during upgrade We already ignore health warning regarding OSD being down during upgrade but health warning regarding specific OSD being down is not added to the ignorelist which causes upgrade jobs to be marked as failed even though they were successful. 
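For illustration only, the per-OSD variant of the warning that the existing OSD_DOWN entry does not match typically looks something like this (host name made up here):

    osd.3 (root=default,host=smithi012) is down

hence the extra "osd.*is down" pattern added below.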
Fixes: https://tracker.ceph.com/issues/66877 Signed-off-by: Rishabh Dave --- .../mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml index 713adb9628ab6..96e4353e99c7a 100644 --- a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml +++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml @@ -2,3 +2,4 @@ overrides: ceph: log-ignorelist: - OSD_DOWN + - osd.*is down From 2268923dd9f18a7ba05f5d04ca39b26be4068a67 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 6 Aug 2024 10:47:53 +0200 Subject: [PATCH 003/148] qa/rados/upgrade: ignore PG_DEGRADED Fixes: https://tracker.ceph.com/issues/67182 Signed-off-by: Pere Diaz Bou --- qa/suites/upgrade/reef-x/parallel/0-start.yaml | 2 ++ qa/suites/upgrade/reef-x/parallel/1-tasks.yaml | 1 + 2 files changed, 3 insertions(+) diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml index 3814ea3efdb50..146bd57960dad 100644 --- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml @@ -31,3 +31,5 @@ overrides: conf: osd: osd shutdown pgref assert: true + log-ignorelist: + - PG_DEGRADED diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml index bf3005fad458f..ce4e0cc228bba 100644 --- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml @@ -6,6 +6,7 @@ overrides: - MON_DOWN - out of quorum - PG_AVAILABILITY + - PG_DEGRADED tasks: - install: branch: reef From 12a9aba43caedf6ea2cb897708e0b31d96ee358f Mon Sep 17 00:00:00 2001 From: Adam King Date: Mon, 9 Sep 2024 18:28:45 -0400 Subject: [PATCH 004/148] cephadm: add ability to continue on failure when applying multiple specs Additionally, add the flag that does so when cephadm applies a spec during bootstrap. Bootstrap will continue to completion regardless of whether applying the spec fails, so we might as well try applying all of it while reporting errors we do see back to the user Fixes: https://tracker.ceph.com/issues/65338 Signed-off-by: Adam King --- src/cephadm/cephadm.py | 2 +- src/pybind/mgr/cephadm/module.py | 16 +++++++- src/pybind/mgr/orchestrator/_interface.py | 7 +++- src/pybind/mgr/orchestrator/module.py | 48 +++++++++++++++++++---- 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index e71addf7bfa5a..33cac4a34032a 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -2946,7 +2946,7 @@ def mgr_has_latest_epoch(): mounts = {} mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:ro' try: - out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts) + out = cli(['orch', 'apply', '--continue-on-error', '-i', '/tmp/spec.yml'], extra_mounts=mounts) logger.info(out) except Exception: ctx.error_code = -errno.EINVAL diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 1a9a10862180e..650d9711bd3be 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -3516,7 +3516,12 @@ def _apply_service_spec(self, spec: ServiceSpec) -> str: return "Scheduled %s update..." 
% spec.service_name() @handle_orch_error - def apply(self, specs: Sequence[GenericSpec], no_overwrite: bool = False) -> List[str]: + def apply( + self, + specs: Sequence[GenericSpec], + no_overwrite: bool = False, + continue_on_error: bool = True + ) -> List[str]: results = [] for spec in specs: if no_overwrite: @@ -3528,7 +3533,14 @@ def apply(self, specs: Sequence[GenericSpec], no_overwrite: bool = False) -> Lis results.append('Skipped %s service spec. To change %s spec omit --no-overwrite flag' % (cast(ServiceSpec, spec).service_name(), cast(ServiceSpec, spec).service_name())) continue - results.append(self._apply(spec)) + try: + res = self._apply(spec) + results.append(res) + except Exception as e: + if continue_on_error: + results.append(f'Failed to apply spec for {spec}: {str(e)}') + else: + raise e return results @handle_orch_error diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index c33f38cfdd470..7b602401a7e61 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -574,7 +574,12 @@ def cert_store_get_key( raise NotImplementedError() @handle_orch_error - def apply(self, specs: Sequence["GenericSpec"], no_overwrite: bool = False) -> List[str]: + def apply( + self, + specs: Sequence["GenericSpec"], + no_overwrite: bool = False, + continue_on_error: bool = False + ) -> List[str]: """ Applies any spec """ diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index d0f3286177ce5..7c943b076f4a8 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -1620,12 +1620,14 @@ def apply_misc(self, format: Format = Format.plain, unmanaged: bool = False, no_overwrite: bool = False, + continue_on_error: bool = False, inbuf: Optional[str] = None) -> HandleCommandResult: """Update the size or placement for a service or apply a large yaml spec""" usage = """Usage: ceph orch apply -i [--dry-run] ceph orch apply [--placement=] [--unmanaged] """ + errs: List[str] = [] if inbuf: if service_type or placement or unmanaged: raise OrchestratorValidationError(usage) @@ -1635,7 +1637,14 @@ def apply_misc(self, # None entries in the output. Let's skip them silently. 
content = [o for o in yaml_objs if o is not None] for s in content: - spec = json_to_generic_spec(s) + try: + spec = json_to_generic_spec(s) + except Exception as e: + if continue_on_error: + errs.append(f'Failed to convert {s} from json object: {str(e)}') + continue + else: + raise e # validate the config (we need MgrModule for that) if isinstance(spec, ServiceSpec) and spec.config: @@ -1643,7 +1652,12 @@ def apply_misc(self, try: self.get_foreign_ceph_option('mon', k) except KeyError: - raise SpecValidationError(f'Invalid config option {k} in spec') + err = SpecValidationError(f'Invalid config option {k} in spec') + if continue_on_error: + errs.append(str(err)) + continue + else: + raise err # There is a general "osd" service with no service id, but we use # that to dump osds created individually with "ceph orch daemon add osd" @@ -1658,7 +1672,12 @@ def apply_misc(self, and spec.service_type == 'osd' and not spec.service_id ): - raise SpecValidationError('Please provide the service_id field in your OSD spec') + err = SpecValidationError('Please provide the service_id field in your OSD spec') + if continue_on_error: + errs.append(str(err)) + continue + else: + raise err if dry_run and not isinstance(spec, HostSpec): spec.preview_only = dry_run @@ -1668,15 +1687,30 @@ def apply_misc(self, continue specs.append(spec) else: + # Note in this case there is only ever one spec + # being applied so there is no need to worry about + # handling of continue_on_error placementspec = PlacementSpec.from_string(placement) if not service_type: raise OrchestratorValidationError(usage) specs = [ServiceSpec(service_type.value, placement=placementspec, unmanaged=unmanaged, preview_only=dry_run)] - return self._apply_misc(specs, dry_run, format, no_overwrite) - - def _apply_misc(self, specs: Sequence[GenericSpec], dry_run: bool, format: Format, no_overwrite: bool = False) -> HandleCommandResult: - completion = self.apply(specs, no_overwrite) + cmd_result = self._apply_misc(specs, dry_run, format, no_overwrite, continue_on_error) + if errs: + # HandleCommandResult is a named tuple, so use + # _replace to modify it. + cmd_result = cmd_result._replace(stdout=cmd_result.stdout + '\n' + '\n'.join(errs)) + return cmd_result + + def _apply_misc( + self, + specs: Sequence[GenericSpec], + dry_run: bool, + format: Format, + no_overwrite: bool = False, + continue_on_error: bool = False + ) -> HandleCommandResult: + completion = self.apply(specs, no_overwrite, continue_on_error) raise_if_exception(completion) out = completion.result_str() if dry_run: From e905fedfccbfc70ae42e0cbac9164a1bf918ad01 Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Tue, 10 Sep 2024 12:09:24 +0000 Subject: [PATCH 005/148] osd/PG: make use of SnapMapper::update_snap_map https://github.com/ceph/ceph/pull/58868 introduced SnapMapper::update_snap_map to be used both by Crimson and Classic. No change in behavior. 
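For reference, the per-entry dispatch that the removed block below used to perform (readable from the deleted lines) is roughly:

    is_delete()               -> snap_mapper.remove_oid(soid)
    is_clone()/is_promote()   -> snap_mapper.add_oid(soid, snaps)
    is_modify()               -> snap_mapper.update_snaps(soid, snaps)

and is presumably what SnapMapper::update_snap_map does internally, so PG::update_snap_map now only filters on soid.snap < CEPH_MAXSNAP and forwards the entry.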
Signed-off-by: Matan Breizman --- src/osd/PG.cc | 42 +++--------------------------------------- 1 file changed, 3 insertions(+), 39 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index f7a5033574f76..ee14f650e5336 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1137,46 +1137,10 @@ void PG::update_snap_map( const vector &log_entries, ObjectStore::Transaction &t) { - for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) { + for (const auto& entry : log_entries) { OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); - if (i->soid.snap < CEPH_MAXSNAP) { - if (i->is_delete()) { - int r = snap_mapper.remove_oid( - i->soid, - &_t); - if (r) - derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl; - // On removal tolerate missing key corruption - ceph_assert(r == 0 || r == -ENOENT); - } else if (i->is_update()) { - ceph_assert(i->snaps.length() > 0); - vector snaps; - bufferlist snapbl = i->snaps; - auto p = snapbl.cbegin(); - try { - decode(snaps, p); - } catch (...) { - derr << __func__ << " decode snaps failure on " << *i << dendl; - snaps.clear(); - } - set _snaps(snaps.begin(), snaps.end()); - - if (i->is_clone() || i->is_promote()) { - snap_mapper.add_oid( - i->soid, - _snaps, - &_t); - } else if (i->is_modify()) { - int r = snap_mapper.update_snaps( - i->soid, - _snaps, - 0, - &_t); - ceph_assert(r == 0); - } else { - ceph_assert(i->is_clean()); - } - } + if (entry.soid.snap < CEPH_MAXSNAP) { + snap_mapper.update_snap_map(entry, &_t); } } } From a79e9a4e7aec195c904505ffdfd4851cb9eba532 Mon Sep 17 00:00:00 2001 From: Kevin Zhao Date: Mon, 12 Aug 2024 14:12:58 +0800 Subject: [PATCH 006/148] doc/rgw/uadk: Add UADK document for compressor zlib Signed-off-by: Kevin Zhao --- doc/radosgw/uadk-accel.rst | 131 +++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 doc/radosgw/uadk-accel.rst diff --git a/doc/radosgw/uadk-accel.rst b/doc/radosgw/uadk-accel.rst new file mode 100644 index 0000000000000..748fe6d3b3b57 --- /dev/null +++ b/doc/radosgw/uadk-accel.rst @@ -0,0 +1,131 @@ +=============================================== +UADK Acceleration for Compression +=============================================== + +UADK is a framework for applications to access hardware accelerators in a +unified, secure, and efficient way. UADK is comprised of UACCE, libwd and many +other algorithm libraries. + +See `Compressor UADK Support`_. + + +UADK in the Software Stack +========================== + +UADK is a general-purpose user space accelerator framework that uses shared +virtual addressing (SVA) to provide a unified programming interface for hardware +acceleration of cryptographic and compression algorithms. + +UADK includes Unified/User-space-access-intended Accelerator Framework (UACCE), +which enables hardware accelerators that support SVA to adapt to UADK. + +Currently, HiSilicon Kunpeng hardware accelerators have been registered with +UACCE. Through the UADK framework, users can run cryptographic and compression +algorithms using hardware accelerators instead of CPUs, freeing up CPU computing +power and improving computing performance. + +A user can access the hardware accelerators by performing user-mode operations on +the character devices, or the use of UADK can be done via frameworks that have +been enabled by others including UADK support (for example, OpenSSL* libcrypto*, +DPDK, and the Linux* Kernel Crypto Framework). + +See `OpenSSL UADK Engine`_. 
+ +UADK Environment Setup +====================== +UADK consists of UACCE, vendors’ drivers, and an algorithm layer. UADK requires the +hardware accelerator to support SVA, and the operating system to support IOMMU and +SVA. Hardware accelerators from different vendors are registered as different character +devices with UACCE by using kernel-mode drivers of the vendors. + +:: + + +----------------------------------+ + | apps | + +----+------------------------+----+ + | | + | | + +-------+--------+ +-------+-------+ + | scheduler | | alg libraries | + +-------+--------+ +-------+-------+ + | | + | | + | | + | +--------+------+ + | | vendor drivers| + | +-+-------------+ + | | + | | + +--+------------------+--+ + | libwd | + User +----+-------------+-----+ + -------------------------------------------------- + Kernel +--+-----+ +------+ + | uacce | | smmu | + +---+----+ +------+ + | + +---+------------------+ + | vendor kernel driver | + +----------------------+ + -------------------------------------------------- + +----------------------+ + | HW Accelerators | + +----------------------+ + +Configuration +============= + +#. Kernel Requirement + +User needs to make sure that UACCE is already supported in Linux kernel. The kernel version +should be at least v5.9 with SVA (Shared Virtual Addressing) enabled. + +UACCE may be built as a module or built into the kernel. Here's an example to build UACCE +with hardware accelerators for the HiSilicon Kunpeng platform. + + .. prompt:: bash $ + + CONFIG_IOMMU_SVA_LIB=y + CONFIG_ARM_SMMU=y + CONFIG_ARM_SMMU_V3=y + CONFIG_ARM_SMMU_V3_SVA=y + CONFIG_PCI_PASID=y + CONFIG_UACCE=y + CONFIG_CRYPTO_DEV_HISI_QM=y + CONFIG_CRYPTO_DEV_HISI_ZIP=y + +Make sure all these above kernel configurations are selected. + +#. UADK enablement +If the architecture is aarch64, it will automatically download the UADK source code to build +the static library. If it runs on other architecture, user can enable it with build parameters +`-DWITH_UADK=true` + +#. Manual Build UADK +As the above paragraph shows, the UADK is enabled automatically, no need to build manually. +For developer who is interested in UADK, you can refer to the below steps for building. + + .. prompt:: bash $ + + git clone https://github.com/Linaro/uadk.git + cd uadk + mkdir build + ./autogen.sh + ./configure --prefix=$PWD/build + make + make install + + .. note:: Without –prefix, UADK will be installed to /usr/local/lib by default. + If get error:"cannot find -lnuma", please install the `libnuma-dev` + +#. Configure + + Edit the Ceph configuration file (usually ``ceph.conf``) to enable UADK + support for *zlib* compression:: + + uadk_compressor_enabled=true + + The default value in `global.yaml.in` for `uadk_compressor_enabled` is false. + +.. _Compressor UADK Support: https://github.com/ceph/ceph/pull/58336 +.. _OpenSSL UADK Engine: https://github.com/Linaro/uadk_engine From ccd58786b90e358b19fa1d5108802856d6b4b237 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Wed, 18 Sep 2024 11:07:02 +0530 Subject: [PATCH 007/148] mgr/vol: use pre-defined timeout period instead of hardcoded value Currently timeout is set to 5. But hardcoding this is unnecessary since the class already defines a attribute for this purpose. Use that instead. 
Signed-off-by: Rishabh Dave --- src/pybind/mgr/volumes/fs/async_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pybind/mgr/volumes/fs/async_job.py b/src/pybind/mgr/volumes/fs/async_job.py index 6834e3e240b33..83a119ca5564c 100644 --- a/src/pybind/mgr/volumes/fs/async_job.py +++ b/src/pybind/mgr/volumes/fs/async_job.py @@ -167,7 +167,7 @@ def run(self): for i in range(c, self.nr_concurrent_jobs): self.threads.append(JobThread(self, self.vc, name="{0}.{1}.{2}".format(self.name_pfx, time.time(), i))) self.threads[-1].start() - self.cv.wait(timeout=5) + self.cv.wait(timeout=self.wakeup_timeout) def shutdown(self): self.stopping.set() From 85dff0d19185fa6dfad723ce80b6b3314de9752c Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 18 Sep 2024 17:34:39 +0800 Subject: [PATCH 008/148] crimson/osd: purge strays when PGs go clean Signed-off-by: Xuehan Xu --- src/crimson/osd/osd.cc | 24 ++++++++++++++++++++++++ src/crimson/osd/osd.h | 2 ++ src/crimson/osd/pg.cc | 18 +++++++++++++++++- src/crimson/osd/pg.h | 12 ++++++++++-- 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index 8d2d10fbd7c45..34ad97ceb068a 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -23,6 +23,7 @@ #include "messages/MOSDOp.h" #include "messages/MOSDPeeringOp.h" #include "messages/MOSDPGCreate2.h" +#include "messages/MOSDPGRemove.h" #include "messages/MOSDPGUpdateLogMissing.h" #include "messages/MOSDPGUpdateLogMissingReply.h" #include "messages/MOSDRepOpReply.h" @@ -863,6 +864,8 @@ OSD::do_ms_dispatch( [[fallthrough]]; case MSG_OSD_PG_LOG: return handle_peering_op(conn, boost::static_pointer_cast(m)); + case MSG_OSD_PG_REMOVE: + return handle_pg_remove(conn, boost::static_pointer_cast(m)); case MSG_OSD_REPOP: return handle_rep_op(conn, boost::static_pointer_cast(m)); case MSG_OSD_REPOPREPLY: @@ -1555,6 +1558,27 @@ seastar::future<> OSD::handle_peering_op( std::move(*evt)).second; } +seastar::future<> OSD::handle_pg_remove( + crimson::net::ConnectionRef conn, + Ref m) +{ + LOG_PREFIX(OSD::handle_pg_remove); + const int from = m->get_source().num(); + std::vector> futs; + for (auto &pg : m->pg_list) { + DEBUG("{} from {}", pg, from); + futs.emplace_back( + pg_shard_manager.start_pg_operation( + conn, + pg_shard_t{from, pg.shard}, + pg, + m->get_epoch(), + m->get_epoch(), + PeeringState::DeleteStart()).second); + } + return seastar::when_all_succeed(std::move(futs)); +} + seastar::future<> OSD::check_osdmap_features() { LOG_PREFIX(OSD::check_osdmap_features); diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h index de39d80827494..d7d54d5d2c3c3 100644 --- a/src/crimson/osd/osd.h +++ b/src/crimson/osd/osd.h @@ -208,6 +208,8 @@ class OSD final : public crimson::net::Dispatcher, Ref m); seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn, Ref m); + seastar::future<> handle_pg_remove(crimson::net::ConnectionRef conn, + Ref m); seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn, Ref m); seastar::future<> handle_scrub_command(crimson::net::ConnectionRef conn, diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 644cc84513d49..c92978fcfc2b9 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -517,7 +517,8 @@ Context *PG::on_clean() { recovery_handler->on_pg_clean(); scrubber.on_primary_active_clean(); - return nullptr; + recovery_finisher = new C_PG_FinishRecovery(*this); + return recovery_finisher; } seastar::future<> PG::clear_temp_objects() @@ -1883,4 
+1884,19 @@ void PG::cancel_pglog_based_recovery_op() { pglog_based_recovery_op->cancel(); reset_pglog_based_recovery_op(); } + +void PG::C_PG_FinishRecovery::finish(int r) { + LOG_PREFIX(PG::C_PG_FinishRecovery::finish); + auto &peering_state = pg.get_peering_state(); + if (peering_state.is_deleting() || !peering_state.is_clean()) { + DEBUGDPP("raced with delete or repair", pg); + return; + } + if (this == pg.recovery_finisher) { + peering_state.purge_strays(); + pg.recovery_finisher = nullptr; + } else { + DEBUGDPP("stale recovery finsher", pg); + } +} } diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 11c0e3668b142..91bd529b95d63 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -375,7 +375,7 @@ class PG : public boost::intrusive_ref_counter< } void check_blocklisted_watchers() final; void clear_primary_state() final { - // Not needed yet + recovery_finisher = nullptr; } void queue_check_readable(epoch_t last_peering_reset, @@ -394,7 +394,7 @@ class PG : public boost::intrusive_ref_counter< void on_replica_activate() final; void on_activate_complete() final; void on_new_interval() final { - // Not needed yet + recovery_finisher = nullptr; } Context *on_clean() final; void on_activate_committed() final { @@ -712,9 +712,17 @@ class PG : public boost::intrusive_ref_counter< } seastar::future<> stop(); private: + class C_PG_FinishRecovery : public Context { + public: + explicit C_PG_FinishRecovery(PG &pg) : pg(pg) {} + void finish(int r) override; + private: + PG& pg; + }; std::unique_ptr backend; std::unique_ptr recovery_backend; std::unique_ptr recovery_handler; + C_PG_FinishRecovery *recovery_finisher; PeeringState peering_state; eversion_t projected_last_update; From 91734345b612b65ef7ccbd8ec6c3b485287294ec Mon Sep 17 00:00:00 2001 From: Zac Dover Date: Wed, 18 Sep 2024 21:02:32 +1000 Subject: [PATCH 009/148] doc/radosgw: correct RST formatting fixup Signed-off-by: Zac Dover --- doc/radosgw/index.rst | 2 +- doc/radosgw/uadk-accel.rst | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/radosgw/index.rst b/doc/radosgw/index.rst index da92692fa8b7b..3085e1a528f9f 100644 --- a/doc/radosgw/index.rst +++ b/doc/radosgw/index.rst @@ -88,4 +88,4 @@ Cluster with one API and then retrieve that data with the other API. D3N Data Cache Cloud Transition Metrics - + UADK Acceleration for Compression diff --git a/doc/radosgw/uadk-accel.rst b/doc/radosgw/uadk-accel.rst index 748fe6d3b3b57..fdf99f891f0a7 100644 --- a/doc/radosgw/uadk-accel.rst +++ b/doc/radosgw/uadk-accel.rst @@ -115,8 +115,9 @@ For developer who is interested in UADK, you can refer to the below steps for bu make make install - .. note:: Without –prefix, UADK will be installed to /usr/local/lib by default. - If get error:"cannot find -lnuma", please install the `libnuma-dev` + .. note:: Without –prefix, UADK will be installed to /usr/local/lib by + default. If get error:"cannot find -lnuma", please install + the `libnuma-dev`. #. 
Configure From a5d0f546807311d3fc37facd0b4acc98009a6271 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 19 Sep 2024 17:30:01 +0800 Subject: [PATCH 010/148] crimson/osd/backfill_state: push peer pg infos' last_backfills only when all objects before them are backfilled Fixes: https://tracker.ceph.com/issues/68147 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 70c43f49faf72..172c5e9cb0c1f 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -570,6 +570,7 @@ void BackfillState::ProgressTracker::complete_to( } else { ceph_abort_msg("completing untracked object shall not happen"); } + auto new_last_backfill = peering_state().earliest_backfill(); for (auto it = std::begin(registry); it != std::end(registry) && it->second.stage != op_stage_t::enqueued_push; @@ -579,6 +580,8 @@ void BackfillState::ProgressTracker::complete_to( peering_state().update_complete_backfill_object_stats( soid, *item.stats); + assert(soid > new_last_backfill); + new_last_backfill = soid; } if (Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, @@ -587,7 +590,7 @@ void BackfillState::ProgressTracker::complete_to( backfill_state().last_backfill_started = hobject_t::get_max(); backfill_listener().update_peers_last_backfill(hobject_t::get_max()); } else { - backfill_listener().update_peers_last_backfill(obj); + backfill_listener().update_peers_last_backfill(new_last_backfill); } } From bfe15f68075c80df9099da52111a40a5c16cfa31 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Fri, 13 Sep 2024 17:35:06 +0800 Subject: [PATCH 011/148] crimson/osd/backfill_state: always go to Enqueuing when object is pushed during Waiting Fixes: https://tracker.ceph.com/issues/68061 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 22 ++++++++-------------- src/crimson/osd/backfill_state.h | 2 +- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 70c43f49faf72..1db852da5ccd5 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -403,7 +403,7 @@ BackfillState::PrimaryScanning::react(ObjectPushed evt) { logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -480,7 +480,7 @@ BackfillState::ReplicasScanning::react(ObjectPushed evt) { logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -496,16 +496,8 @@ BackfillState::Waiting::react(ObjectPushed evt) { logger().debug("Waiting::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); - if (!Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - return transit(); - } else { - // we still have something to wait on - logger().debug("Waiting::react() on ObjectPushed; still waiting"); - return discard_event(); - } + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false); + return 
transit();; } // -- Done @@ -559,7 +551,8 @@ void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj) void BackfillState::ProgressTracker::complete_to( const hobject_t& obj, - const pg_stat_t& stats) + const pg_stat_t& stats, + bool may_push_to_max) { logger().debug("{}: obj={}", __func__, obj); @@ -580,7 +573,8 @@ void BackfillState::ProgressTracker::complete_to( soid, *item.stats); } - if (Enqueuing::all_enqueued(peering_state(), + if (may_push_to_max && + Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, backfill_state().peer_backfill_info) && tracked_objects_completed()) { diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 6c36db81813b7..66ba2307f808a 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -421,7 +421,7 @@ class BackfillState::ProgressTracker { bool enqueue_push(const hobject_t&); void enqueue_drop(const hobject_t&); - void complete_to(const hobject_t&, const pg_stat_t&); + void complete_to(const hobject_t&, const pg_stat_t&, bool may_push_to_max); }; } // namespace crimson::osd From 14c905d0c1f609d438aed3b4a4f600825d07d845 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Fri, 20 Sep 2024 11:35:07 +0800 Subject: [PATCH 012/148] test/crimson/test_backfill: set BackfillState's log level to debug Signed-off-by: Xuehan Xu --- src/test/crimson/test_backfill.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 6648719c61c8f..0abb62a0320e0 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -270,6 +270,9 @@ BackfillFixture::BackfillFixture( this->backfill_targets), std::make_unique(this->backfill_source)) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug + ); backfill_state.process_event(crimson::osd::BackfillState::Triggered{}.intrusive_from_this()); } From ec2af67dba8ba2874f8b60c10e51c75808ebb0a2 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Sat, 21 Sep 2024 13:27:01 +0800 Subject: [PATCH 013/148] crimson/osd/pg: remove snapmapper objects when eventually removing collections at the last moment of pg deleting, just as pg meta objects Fixes: https://tracker.ceph.com/issues/68174 Signed-off-by: Xuehan Xu --- src/crimson/osd/pg.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index d210773ca3031..5ef8c2d97afe4 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -481,6 +481,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) auto [objs_to_rm, next] = fut.get(); if (objs_to_rm.empty()) { logger().info("all objs removed, removing coll for {}", pgid); + t.remove(coll_ref->get_cid(), pgid.make_snapmapper_oid()); t.remove(coll_ref->get_cid(), pgmeta_oid); t.remove_collection(coll_ref->get_cid()); (void) shard_services.get_store().do_transaction( @@ -490,7 +491,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) return {next, false}; } else { for (auto &obj : objs_to_rm) { - if (obj == pgmeta_oid) { + if (obj == pgmeta_oid || obj.is_internal_pg_local()) { continue; } logger().trace("pg {}, removing obj {}", pgid, obj); From 4db3bb77b6458f8b54af7f9279151d616f042d49 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Thu, 12 Sep 2024 17:13:25 +0100 Subject: [PATCH 014/148] mgr/smb: accept public_addrs on cli when creating cluster We can set the public ip address to set for the cluster using the declarative method by 
providing the information in the resource description. The corresponding functionality is not available with the imperative method of creating the smb cluster. This patch adds this functionality by allowing the user the option of providing the a public address on the command line when creating the smb cluster. Signed-off-by: Sachin Prabhu --- doc/mgr/smb.rst | 5 +++++ src/pybind/mgr/smb/module.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/doc/mgr/smb.rst b/doc/mgr/smb.rst index 05e6369ddf107..3252c485a9aa7 100644 --- a/doc/mgr/smb.rst +++ b/doc/mgr/smb.rst @@ -96,6 +96,11 @@ clustering enables clustering regardless of the placement count. A value of ``never`` disables clustering regardless of the placement count. If unspecified, ``default`` is assumed. +public_addrs + Optional. A string in the form of [%]. + Supported only when using Samba's clustering. Assign "virtual" IP + addresses that will be managed by the clustering subsystem and may automatically + move between nodes running Samba containers. Remove Cluster ++++++++++++++ diff --git a/src/pybind/mgr/smb/module.py b/src/pybind/mgr/smb/module.py index 1e71721202e80..e2ec9663af52f 100644 --- a/src/pybind/mgr/smb/module.py +++ b/src/pybind/mgr/smb/module.py @@ -167,6 +167,7 @@ def cluster_create( custom_dns: Optional[List[str]] = None, placement: Optional[str] = None, clustering: Optional[SMBClustering] = None, + public_addrs: Optional[List[str]] = None, ) -> results.Result: """Create an smb cluster""" domain_settings = None @@ -251,6 +252,18 @@ def cluster_create( ) ) + c_public_addrs = [] + if public_addrs: + for pa in public_addrs: + pa_arr = pa.split('%', 1) + address = pa_arr[0] + destination = pa_arr[1] if len(pa_arr) > 1 else None + c_public_addrs.append( + resources.ClusterPublicIPAssignment( + address=address, destination=destination + ) + ) + pspec = resources.WrappedPlacementSpec.wrap( PlacementSpec.from_string(placement) ) @@ -262,6 +275,7 @@ def cluster_create( custom_dns=custom_dns, placement=pspec, clustering=clustering, + public_addrs=c_public_addrs, ) to_apply.append(cluster) return self._handler.apply(to_apply, create_only=True).squash(cluster) From 7d9fe0a5dbc9abf942b26147e6dc17f85529dfe4 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 18 Sep 2024 15:28:46 -0700 Subject: [PATCH 015/148] mgr/smb: use is comparisions for enum values Use `is` based comparisions for two enum related functions as mypy likes this better. Signed-off-by: John Mulligan --- src/pybind/mgr/smb/enums.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/smb/enums.py b/src/pybind/mgr/smb/enums.py index dea45f951f831..3e8544f43cf5a 100644 --- a/src/pybind/mgr/smb/enums.py +++ b/src/pybind/mgr/smb/enums.py @@ -21,7 +21,7 @@ class CephFSStorageProvider(_StrEnum): def expand(self) -> 'CephFSStorageProvider': """Expand abbreviated/default values into the full/expanded form.""" - if self == self.SAMBA_VFS: + if self is self.SAMBA_VFS: # mypy gets confused by enums return self.__class__(self.SAMBA_VFS_NEW) return self @@ -89,9 +89,9 @@ class LoginAccess(_StrEnum): def expand(self) -> 'LoginAccess': """Exapend abbreviated enum values into their full forms.""" # the extra LoginAccess(...) 
calls are to appease mypy - if self == self.READ_ONLY_SHORT: + if self is self.READ_ONLY_SHORT: return LoginAccess(self.READ_ONLY) - if self == self.READ_WRITE_SHORT: + if self is self.READ_WRITE_SHORT: return LoginAccess(self.READ_WRITE) return self From 51516ba146e9602c0dea1de65b040d737d1dab6a Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 18 Sep 2024 17:44:39 -0700 Subject: [PATCH 016/148] python-common: add a utils function to replace distutils.util.strtobool As distutils is removed from python 3.12 ceph can no longer use the simple conversion function once located in that module. Add our own trivial replacement function. Signed-off-by: John Mulligan --- src/python-common/ceph/utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/python-common/ceph/utils.py b/src/python-common/ceph/utils.py index e92a2d1de7db8..0544e9f4173d1 100644 --- a/src/python-common/ceph/utils.py +++ b/src/python-common/ceph/utils.py @@ -167,3 +167,18 @@ def http_req(hostname: str = '', log.error(e) # handle error here if needed raise + + +_TRUE_VALS = {'y', 'yes', 't', 'true', 'on', '1'} +_FALSE_VALS = {'n', 'no', 'f', 'false', 'off', '0'} + + +def strtobool(value: str) -> bool: + """Convert a string to a boolean value. + Based on a simlilar function once available at distutils.util.strtobool. + """ + if value.lower() in _TRUE_VALS: + return True + if value.lower() in _FALSE_VALS: + return False + raise ValueError(f'invalid truth value {value!r}') From ffcc157a694f0e40829b5ecd2692e54f0a763607 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 18 Sep 2024 17:45:58 -0700 Subject: [PATCH 017/148] pybind/mgr: replace imports of distutils.util In python 3.12 distutils is removed. Replace uses of distutils.util.strtobool with our own utility function. Signed-off-by: John Mulligan --- src/pybind/mgr/dashboard/tools.py | 2 +- src/pybind/mgr/volumes/fs/operations/pin_util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/dashboard/tools.py b/src/pybind/mgr/dashboard/tools.py index 51ed9c471aac6..14de970cceb0f 100644 --- a/src/pybind/mgr/dashboard/tools.py +++ b/src/pybind/mgr/dashboard/tools.py @@ -9,9 +9,9 @@ import time import urllib from datetime import datetime, timedelta -from distutils.util import strtobool import cherrypy +from ceph.utils import strtobool from mgr_util import build_url from . import mgr diff --git a/src/pybind/mgr/volumes/fs/operations/pin_util.py b/src/pybind/mgr/volumes/fs/operations/pin_util.py index a12ab5b4d4b28..631fdd8fcaa25 100644 --- a/src/pybind/mgr/volumes/fs/operations/pin_util.py +++ b/src/pybind/mgr/volumes/fs/operations/pin_util.py @@ -3,7 +3,7 @@ import cephfs from ..exception import VolumeException -from distutils.util import strtobool +from ceph.utils import strtobool _pin_value = { "export": lambda x: int(x), From a2cbb40925742d0795ef76e3d8548e34477b8db9 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Thu, 19 Sep 2024 12:44:03 -0700 Subject: [PATCH 018/148] pybind/mgr/telemetry: remove misleading tox env For some reason there's a 'mypy' environment listed in the telemetry tox.ini that always runs pytest. Remove it. We'll see if this causes the CI to fail, as I can't find anywhere that uses it. 
Signed-off-by: John Mulligan --- src/pybind/mgr/telemetry/tox.ini | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pybind/mgr/telemetry/tox.ini b/src/pybind/mgr/telemetry/tox.ini index a887590eed89b..b2210da54eaa8 100644 --- a/src/pybind/mgr/telemetry/tox.ini +++ b/src/pybind/mgr/telemetry/tox.ini @@ -1,7 +1,6 @@ [tox] envlist = py3 - mypy skipsdist = true [testenv] From de90c32240eaa8cd40a22ffc8b24d8d893ad6863 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Thu, 19 Sep 2024 13:07:02 -0700 Subject: [PATCH 019/148] mypy-constrains.txt: bump mypy up to version 1.9 Ceph is still very behind on the version of mypy used in the various tox test dirs. Bump up to version 1.9 as it only requires a few trivial fixes to use. Signed-off-by: John Mulligan --- src/mypy-constrains.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mypy-constrains.txt b/src/mypy-constrains.txt index 7810870804ed2..0a79b8ef4f11d 100644 --- a/src/mypy-constrains.txt +++ b/src/mypy-constrains.txt @@ -2,7 +2,7 @@ # Unfortunately this means we have to manually update those # packages regularly. -mypy==1.1.1 +mypy==1.9 # global types-python-dateutil==0.1.3 From b0f5e1086a64a4ad249fbd27b8fb256de38ca1bd Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 25 Sep 2024 07:44:46 +0800 Subject: [PATCH 020/148] crimson/osd/pg: also trigger callbacks for empty peering transactions Signed-off-by: Xuehan Xu --- src/crimson/osd/shard_services.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index 5f7c4a624471b..df6d10d6aa7d2 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -767,14 +767,20 @@ seastar::future<> ShardServices::dispatch_context_transaction( LOG_PREFIX(OSDSingletonState::dispatch_context_transaction); if (ctx.transaction.empty()) { DEBUG("empty transaction"); - return seastar::now(); + co_await get_store().flush(col); + Context* on_commit( + ceph::os::Transaction::collect_all_contexts(ctx.transaction)); + if (on_commit) { + on_commit->complete(0); + } + co_return; } DEBUG("do_transaction ..."); - auto ret = get_store().do_transaction( + co_await get_store().do_transaction( col, ctx.transaction.claim_and_reset()); - return ret; + co_return; } seastar::future<> ShardServices::dispatch_context_messages( From a88c84aa1ca6ead3b31c291a1ff03077562881e0 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 25 Sep 2024 07:45:10 +0800 Subject: [PATCH 021/148] crimson/osd/pg: correct log messages for ShardServices::dispatch_context_messages Signed-off-by: Xuehan Xu --- src/crimson/osd/shard_services.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index df6d10d6aa7d2..a053d9d5044c5 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -786,7 +786,7 @@ seastar::future<> ShardServices::dispatch_context_transaction( seastar::future<> ShardServices::dispatch_context_messages( BufferedRecoveryMessages &&ctx) { - LOG_PREFIX(OSDSingletonState::dispatch_context_transaction); + LOG_PREFIX(OSDSingletonState::dispatch_context_messages); auto ret = seastar::parallel_for_each(std::move(ctx.message_map), [FNAME, this](auto& osd_messages) { auto& [peer, messages] = osd_messages; From 358f33a148c9a65478e33648f16e8c8af73c98f2 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Fri, 13 Sep 2024 16:39:51 +0000 Subject: [PATCH 022/148] os/bluestore: Fix 
ceph-bluestore-tool allocmap command BlueStore::read_allocation_from_drive_for_bluestore_tool was not informed that multiple bdev labels can exist and reserve space. Comparison of real alloc vs recovered alloc was failing. Fixes: https://tracker.ceph.com/issues/67596 Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueStore.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 236336386728f..069105bc8d67c 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -20527,6 +20527,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() if (ret < 0) { return ret; } + if (bdev_label_multi) { + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + for (uint64_t p : bdev_label_valid_locations) { + if (p != BDEV_FIRST_LABEL_POSITION) { + allocator->init_rm_free(p, lsize); + } + } + } duration = ceph_clock_now() - start; stats.insert_count = 0; From 9e3449995f8c21fd3bde6308517aebcd79478988 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 19 Jul 2024 19:30:39 +0530 Subject: [PATCH 023/148] qa/cephfs: test clone progress reporter after subvol is deleted but... snapshot is retained despite of deletion (using --retain-snapshots option of "subvolume rm" command). Signed-off-by: Rishabh Dave --- qa/tasks/cephfs/test_volumes.py | 69 ++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py index 2baefd72c3fbc..6a4e25948256e 100644 --- a/qa/tasks/cephfs/test_volumes.py +++ b/qa/tasks/cephfs/test_volumes.py @@ -7876,7 +7876,22 @@ def tearDown(self): self.run_ceph_cmd('fs subvolume snapshot rm --force ' f'--format json {v} {sv} {ss}') - self.run_ceph_cmd(f'fs subvolume rm {v} {sv}') + try: + self.run_ceph_cmd(f'fs subvolume rm {v} {sv}') + except CommandFailedError as e: + if e.exitstatus == errno.ENOENT: + log.info( + 'ignoring this error, perhaps subvolume was deleted ' + 'during the test and snapshot deleted above is a ' + 'retained snapshot. when a retained snapshot (which is ' + 'snapshot retained despite of subvolume deletion) is ' + 'deleted, the subvolume directory is also deleted ' + 'along. and before retained snapshot deletion, the ' + 'subvolume is reported by "subvolume ls" command, which' + 'is what probably caused confusion here') + pass + else: + raise # verify trash dir is clean self._wait_for_trash_empty() @@ -8090,6 +8105,58 @@ def test_clone_to_diff_group_and_less_than_cloner_threads(self): # and not cancelling these clone doesnt affect this test case. self.cancel_clones_and_ignore_if_finished(c) + def test_clone_after_subvol_is_removed(self): + ''' + Initiate cloning after source subvolume has been deleted but with + snapshots retained and then test that, when this clone is in progress, + one progress bar is printed in output of command "ceph status" that + shows progress of this clone. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + # XXX: "clone" must be part of clone name for sake of tearDown() + c = 'ss1clone1' + + # XXX: without setting mds_snap_rstat to true rstats are not updated on + # a subvolume snapshot and therefore clone progress bar will not show + # any progress. 
+ self.config_set('mds', 'mds_snap_rstat', 'true') + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 10, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + self.run_ceph_cmd(f'fs subvolume rm {v} {sv} --retain-snapshots') + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}') + + with safe_while(tries=15, sleep=10) as proceed: + while proceed(): + pev = self.get_pevs_from_ceph_status(c) + + if len(pev) < 1: + continue + elif len(pev) > 1: + raise RuntimeError('For 1 clone "ceph status" output has 2 ' + 'progress bars, it should have only 1 ' + f'progress bar.\npev -\n{pev}') + + # ensure that exactly 1 progress bar for cloning is present in + # "ceph status" output + msg = ('"progress_events" dict in "ceph status" output must have ' + f'exactly one entry.\nprogress_event dict -\n{pev}') + self.assertEqual(len(pev), 1, msg) + + pev_msg = tuple(pev.values())[0]['message'] + self.assertIn('1 ongoing clones', pev_msg) + break + + # allowing clone jobs to finish will consume too much time and space + # and not cancelling these clone doesnt affect this test case. + self.cancel_clones_and_ignore_if_finished(c) + def test_clones_equal_to_cloner_threads(self): ''' Test that one progress bar is printed in output of "ceph status" output From 21cf769ae78021cf6968666ab7dc5e779835fd01 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 27 Sep 2024 00:41:25 +0530 Subject: [PATCH 024/148] mgr/mgr_util: don't set event when it is already set In class RTimer in mgr_util.py, "self.finished.set()" is run even though the event self.finished was set just now. If it wasn't set, the while loop the precedes it would've never finished running. Therefore, remove this redundant line of code. 
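A minimal sketch of why the removed call is a no-op (structure mirrors RTimer.run() in the diff below; standard threading.Event semantics):

    while not self.finished.is_set():      # exits only once finished is set
        self.finished.wait(self.interval)
        self.function(*self.args, **self.kwargs)
    self.finished.set()                    # finished is already set here

Event.set() on an already-set event changes nothing, so dropping the trailing call does not change behavior.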
Signed-off-by: Rishabh Dave --- src/pybind/mgr/mgr_util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pybind/mgr/mgr_util.py b/src/pybind/mgr/mgr_util.py index 67246545eea0f..a999b6525e9f1 100644 --- a/src/pybind/mgr/mgr_util.py +++ b/src/pybind/mgr/mgr_util.py @@ -88,7 +88,6 @@ def run(self): while not self.finished.is_set(): self.finished.wait(self.interval) self.function(*self.args, **self.kwargs) - self.finished.set() except Exception as e: logger.error("task exception: %s", e) raise From 4a4fc7bad533a362fb71aee5ea36014efaecf1b9 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 26 Sep 2024 20:46:17 -0400 Subject: [PATCH 025/148] qa: ignore pg availability/degraded warnings Fixes: https://tracker.ceph.com/issues/68284 Signed-off-by: Patrick Donnelly --- qa/cephfs/overrides/pg_health.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/cephfs/overrides/pg_health.yaml b/qa/cephfs/overrides/pg_health.yaml index 1740134a2e01b..07ca62e01fbec 100644 --- a/qa/cephfs/overrides/pg_health.yaml +++ b/qa/cephfs/overrides/pg_health.yaml @@ -9,3 +9,5 @@ overrides: - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded From 8db39bcbdb4eab197fabf7e611379cdc5e182143 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Mon, 23 Sep 2024 10:29:20 +0800 Subject: [PATCH 026/148] crimson/osd/backfill_state: do at least one time of replica scanning if necessary in the Enqueuing state Fixes: https://tracker.ceph.com/issues/68175 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 52 +++++++++++++++++-------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 70c43f49faf72..7f5b869abbf34 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -225,7 +225,7 @@ bool BackfillState::Enqueuing::should_rescan_primary( const BackfillInterval& backfill_info) const { return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) && - !backfill_info.extends_to_end(); + !backfill_info.extends_to_end() && backfill_info.empty(); } void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( @@ -327,16 +327,29 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) } trim_backfill_infos(); - while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. + logger().debug("{}: reached end for current local chunk", __func__); + post_event(RequestPrimaryScanning{}); + return; + } + + do { if (!backfill_listener().budget_available()) { post_event(RequestWaiting{}); return; } else if (should_rescan_replicas(backfill_state().peer_backfill_info, - primary_bi)) { + primary_bi)) { // Count simultaneous scans as a single op and let those complete post_event(RequestReplicasScanning{}); return; } + + if (all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + break; + } // Get object within set of peers to operate on and the set of targets // for which that object applies. 
if (const hobject_t check = \ @@ -355,30 +368,23 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) trim_backfilled_object_from_intervals(std::move(result), backfill_state().last_backfill_started, backfill_state().peer_backfill_info); - primary_bi.pop_front(); + if (!primary_bi.empty()) { + primary_bi.pop_front(); + } } backfill_listener().maybe_flush(); - } + } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)); - if (should_rescan_primary(backfill_state().peer_backfill_info, - primary_bi)) { - // need to grab one another chunk of the object namespace and restart - // the queueing. - logger().debug("{}: reached end for current local chunk", - __func__); - post_event(RequestPrimaryScanning{}); - } else { - if (backfill_state().progress_tracker->tracked_objects_completed() - && Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - backfill_state().last_backfill_started = hobject_t::get_max(); - backfill_listener().update_peers_last_backfill(hobject_t::get_max()); - } - logger().debug("{}: reached end for both local and all peers " - "but still has in-flight operations", __func__); - post_event(RequestWaiting{}); + if (backfill_state().progress_tracker->tracked_objects_completed() + && Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); } + logger().debug("{}: reached end for both local and all peers " + "but still has in-flight operations", __func__); + post_event(RequestWaiting{}); } // -- PrimaryScanning From b96d714b23b3f5294df9c28d1f6f5488c4253853 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 27 Sep 2024 00:43:31 +0530 Subject: [PATCH 027/148] mgr/mgr_util: log traceback when exception occurs in RTimer.run() When an exception occurs in class RTimer of mgr_util.py, only the exception message is logged but not only this is insufficient for debugging but also it is hard to spot in logs. This should not be the case, especially for an occurring exception. Therefore, add code to log traceback and exception name as well along with exception's message. 
Log entry before this patch - 2024-09-27T00:22:38.656+0530 7f05c7e006c0 0 [volumes ERROR mgr_util] task exception: dummy exception for testing Log entry with this patch - 2024-09-27T00:40:26.509+0530 7f61d64006c0 0 [volumes ERROR mgr_util] exception encountered in RTimer instance "": Traceback (most recent call last): File "/home/rishabh/repos/ceph/minor3/src/pybind/mgr/mgr_util.py", line 91, in run self.function(*self.args, **self.kwargs) File "/home/rishabh/repos/ceph/minor3/src/pybind/mgr/volumes/fs/stats_util.py", line 232, in _update_progress_bars raise RuntimeError('dummy exception for testing') RuntimeError: dummy exception for testing Signed-off-by: Rishabh Dave --- src/pybind/mgr/mgr_util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/mgr_util.py b/src/pybind/mgr/mgr_util.py index a999b6525e9f1..5d37d478de7b1 100644 --- a/src/pybind/mgr/mgr_util.py +++ b/src/pybind/mgr/mgr_util.py @@ -22,6 +22,7 @@ from ipaddress import ip_address from threading import Lock, Condition from typing import no_type_check, NewType +from traceback import format_exc as tb_format_exc import urllib from functools import wraps if sys.version_info >= (3, 3): @@ -88,8 +89,9 @@ def run(self): while not self.finished.is_set(): self.finished.wait(self.interval) self.function(*self.args, **self.kwargs) - except Exception as e: - logger.error("task exception: %s", e) + except Exception: + logger.error(f'exception encountered in RTimer instance "{self}":' + f'\n{tb_format_exc()}') raise From 829c857b9e1be3a4133f088f63950e961ecce67e Mon Sep 17 00:00:00 2001 From: myoungwon oh Date: Wed, 11 Sep 2024 06:04:30 +0000 Subject: [PATCH 028/148] crimson/os/seastore: fix data inconsistency during ool writes In RBM, seastore issues ool writes with allocated address. If a transaction conflict occurs at this point, the allocated address is freed, allowing the address to be reused. However, data inconsistency can occur if seastore issues ool writes with freed address before the preceding ool write has not been complete. To fix this issue, this commit frees the allocated address after ool writes is don in the event of the transaction conflict after ool write is issued. 
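Spelled out, the race being closed is roughly the following (transaction names are illustrative):

    T1: allocates address A and issues an OOL write to A    (write in flight)
    T1: conflicts -> A is freed right away
    T2: is handed A again and issues its own write to A
    T1: the still-pending write to A completes last          -> inconsistent data

With this change A is only returned to the allocator once T1's OOL write has actually completed, so a reused address can no longer overlap with a pending write.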
Signed-off-by: Myoungwon Oh --- src/crimson/os/seastore/cache.cc | 8 ++++++-- .../os/seastore/extent_placement_manager.cc | 14 +++++++++++++- src/crimson/os/seastore/transaction.h | 19 +++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index cf8d3c0891d7f..5dcb7514ee1ab 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -990,8 +990,12 @@ void Cache::mark_transaction_conflicted( } efforts.mutate_delta_bytes += delta_stat.bytes; - for (auto &i: t.pre_alloc_list) { - epm.mark_space_free(i->get_paddr(), i->get_length()); + if (t.get_pending_ool()) { + t.get_pending_ool()->is_conflicted = true; + } else { + for (auto &i: t.pre_alloc_list) { + epm.mark_space_free(i->get_paddr(), i->get_length()); + } } auto& ool_stats = t.get_ool_write_stats(); diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc index 34ac199eed8dd..0458fbfed7480 100644 --- a/src/crimson/os/seastore/extent_placement_manager.cc +++ b/src/crimson/os/seastore/extent_placement_manager.cc @@ -987,7 +987,19 @@ RandomBlockOolWriter::alloc_write_ool_extents( return alloc_write_iertr::now(); } return seastar::with_gate(write_guard, [this, &t, &extents] { - return do_write(t, extents); + seastar::lw_shared_ptr ptr = + seastar::make_lw_shared(); + ptr->pending_extents = t.get_pre_alloc_list(); + assert(!t.is_conflicted()); + t.set_pending_ool(ptr); + return do_write(t, extents + ).finally([this, ptr=ptr] { + if (ptr->is_conflicted) { + for (auto &e : ptr->pending_extents) { + rb_cleaner->mark_space_free(e->get_paddr(), e->get_length()); + } + } + }); }); } diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 52515937a9e59..5d8ad00ba228b 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -80,6 +80,11 @@ struct rewrite_stats_t { } }; +struct rbm_pending_ool_t { + bool is_conflicted = false; + std::list pending_extents; +}; + /** * Transaction * @@ -554,6 +559,18 @@ class Transaction { return static_cast(*view); } + void set_pending_ool(seastar::lw_shared_ptr ptr) { + pending_ool = ptr; + } + + seastar::lw_shared_ptr get_pending_ool() { + return pending_ool; + } + + const auto& get_pre_alloc_list() { + return pre_alloc_list; + } + private: friend class Cache; friend Ref make_test_transaction(); @@ -650,6 +667,8 @@ class Transaction { const src_t src; transaction_id_t trans_id = TRANS_ID_NULL; + + seastar::lw_shared_ptr pending_ool; }; using TransactionRef = Transaction::Ref; From 3482ebcd3c7fba37b40b91428f53880d66e4c86f Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 27 Sep 2024 13:50:29 +0530 Subject: [PATCH 029/148] mgr/vol: don't define progress bar ID repeatedly Orignally, when the feature was in development, IDs for clone progress bars were set to randomly generated UUID strings. But, eventually, it was decided to assign fixed strings to them. Unlike UUIDs, these strings stay the same even when progress bars are destroyed and re-created. Therefore, instead of re-assigning the same strings every time initiate_reporting() is called, move them to __init__() so that both the IDs are defined only once. 
Signed-off-by: Rishabh Dave --- src/pybind/mgr/volumes/fs/stats_util.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/pybind/mgr/volumes/fs/stats_util.py b/src/pybind/mgr/volumes/fs/stats_util.py index cec33eaa8873d..3334dc5a3d765 100644 --- a/src/pybind/mgr/volumes/fs/stats_util.py +++ b/src/pybind/mgr/volumes/fs/stats_util.py @@ -106,6 +106,11 @@ def __init__(self, volclient, vol_spec): # reporting has already been initiated by calling RTimer.is_alive(). self.update_task = RTimer(1, self._update_progress_bars) + # progress event ID for ongoing clone jobs + self.on_pev_id: Optional[str] = 'mgr-vol-ongoing-clones' + # progress event ID for ongoing+pending clone jobs + self.onpen_pev_id: Optional[str] = 'mgr-vol-total-clones' + def initiate_reporting(self): if self.update_task.is_alive(): log.info('progress reporting thread is already alive, not ' @@ -113,11 +118,6 @@ def initiate_reporting(self): return log.info('initiating progress reporting for clones...') - # progress event ID for ongoing clone jobs - self.on_pev_id: Optional[str] = 'mgr-vol-ongoing-clones' - # progress event ID for ongoing+pending clone jobs - self.onpen_pev_id: Optional[str] = 'mgr-vol-total-clones' - self.update_task = RTimer(1, self._update_progress_bars) self.update_task.start() log.info('progress reporting for clones has been initiated') @@ -294,10 +294,7 @@ def _finish_progress_events(self): assert self.onpen_pev_id is not None self.volclient.mgr.remote('progress', 'complete', self.on_pev_id) - self.on_pev_id = None - self.volclient.mgr.remote('progress', 'complete', self.onpen_pev_id) - self.onpen_pev_id = None log.info('finished removing progress bars from "ceph status" output') From 706eb26f560bfdd7c34c62445d27c9ebf7f7ad26 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 27 Sep 2024 13:42:48 -0400 Subject: [PATCH 030/148] mds: do not dump empty bufptr Fixes: https://tracker.ceph.com/issues/68243 Signed-off-by: Patrick Donnelly --- src/mds/CInode.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 0e9b6996ad2c5..dfad411d323d8 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -4589,8 +4589,11 @@ void InodeStoreBase::dump(Formatter *f) const for (const auto& [key, val] : *xattrs) { f->open_object_section("xattr"); f->dump_string("key", key); - std::string v(val.c_str(), val.length()); - f->dump_string("val", v); + if (val.length()) { + f->dump_string("val", std::string(val.c_str(), val.length())); + } else { + f->dump_string("val", ""); + } f->close_section(); } } From ee8c7d2e3bb692fb263a9bb6828c7b9a55a44504 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Fri, 27 Sep 2024 15:48:18 +0200 Subject: [PATCH 031/148] mgr/cephadm: adding config to check client cert for internal nginx Fixes: https://tracker.ceph.com/issues/68310 Signed-off-by: Redouane Kachach --- .../templates/services/mgmt-gateway/internal_server.conf.j2 | 3 +++ src/pybind/mgr/cephadm/tests/test_services.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 index f2c32f8797750..0801adebd0844 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 @@ -1,5 +1,8 @@ server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; 
+ listen {{ internal_port }} ssl; listen [::]:{{ internal_port }} ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a9b7da624a0e6..b874161f10959 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -3446,6 +3446,9 @@ def get_services_endpoints(name): }"""), "nginx_internal_server.conf": dedent(""" server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; + listen 29443 ssl; listen [::]:29443 ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; @@ -3760,6 +3763,9 @@ def get_services_endpoints(name): }"""), "nginx_internal_server.conf": dedent(""" server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; + listen 29443 ssl; listen [::]:29443 ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; From c0e05bf36067294420631f33c5e43c32077eeb82 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Mon, 30 Sep 2024 09:17:11 +0000 Subject: [PATCH 032/148] ceph-volume: drop unnecessary call to `get_single_lv()` `Zap.zap_lv()` currently makes a call to `get_single_lv()`: ``` lv = api.get_single_lv(filters={'lv_name': device.lv_name, 'vg_name': device.vg_name}) ``` this isn't needed and redundant as zap_lv() takes an instance of `Device()` as argument which has already a `lv_api` attribute: class Device in device.py: ``` else: vgname, lvname = self.path.split('/') filters = {'lv_name': lvname, 'vg_name': vgname} lv = lvm.get_single_lv(filters=filters) # <---- same call if lv: self.lv_api = lv ``` This implies a duplicate call to `subprocess.Popen()` unnecessarily. Fixes: https://tracker.ceph.com/issues/68312 Signed-off-by: Guillaume Abrioux --- src/ceph-volume/ceph_volume/devices/lvm/zap.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py index c1bef82c10975..2b6925f5b2739 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -191,8 +191,7 @@ def zap_lv(self, device): Device examples: vg-name/lv-name, /dev/vg-name/lv-name Requirements: Must be a logical volume (LV) """ - lv = api.get_single_lv(filters={'lv_name': device.lv_name, 'vg_name': - device.vg_name}) + lv: api.Volume = device.lv_api self.unmount_lv(lv) zap_device(device.path) From a55a75c57e7a42a1317e4d7fc86c1964b71137f0 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 23 Aug 2024 18:19:43 +0530 Subject: [PATCH 033/148] mon,cephfs: require confirmation when changing max_mds on unhealthy cluster User must pass the confirmation flag (--yes-i-really-mean-it) to change the value of CephFS setting variable "max_mds" when the Ceph cluster is unhealthy. This measure was decided upon to prevent users from changing "max_mds" as a measure of troubleshotoing unhealthy cluster. 
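The gate itself is small; written out in Python purely for illustration (the real check is the C++ added to FSCommands.cc, and only the error text is taken from it, the function name and arguments here are made up):

```python
import errno

def check_max_mds_change(var, has_health_warnings, yes_i_really_mean_it=False):
    # illustration only; mirrors the EPERM gate added in FSCommands.cc
    if var == 'max_mds' and has_health_warnings and not yes_i_really_mean_it:
        return -errno.EPERM, (
            'One or more file system health warnings are present. Modifying '
            'the file system setting variable "max_mds" may not help '
            'troubleshoot or recover from these warnings and may further '
            'destabilize the system. If you really wish to proceed, run '
            'again with --yes-i-really-mean-it')
    return 0, ''

# refused on an unhealthy cluster without the flag:
assert check_max_mds_change('max_mds', True)[0] == -errno.EPERM
# allowed once the operator explicitly confirms:
assert check_max_mds_change('max_mds', True, yes_i_really_mean_it=True)[0] == 0
```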
Fixes: https://tracker.ceph.com/issues/66301 Signed-off-by: Rishabh Dave --- src/mon/FSCommands.cc | 11 +++++++++++ src/mon/MDSMonitor.cc | 7 +++++++ src/mon/MDSMonitor.h | 1 + 3 files changed, 19 insertions(+) diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 62d37574ded67..b935ace4affba 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -385,6 +385,17 @@ class SetHandler : public FileSystemCommandHandler return -EINVAL; } + bool confirm = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", confirm); + if (var == "max_mds" && !confirm && mon->mdsmon()->has_any_health_warning()) { + ss << "One or more file system health warnings are present. Modifying " + << "the file system setting variable \"max_mds\" may not help " + << "troubleshoot or recover from these warnings and may further " + << "destabilize the system. If you really wish to proceed, run " + << "again with --yes-i-really-mean-it"; + return -EPERM; + } + return set_val(mon, fsmap, op, cmdmap, ss, fsp->get_fscid(), var, val); } }; diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 76a57ac443de7..d8cca4ceb61b1 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -1557,6 +1557,13 @@ bool MDSMonitor::has_health_warnings(vector warnings) return false; } +bool MDSMonitor::has_any_health_warning() +{ + return std::any_of( + pending_daemon_health.begin(), pending_daemon_health.end(), + [](auto& it) { return !it.second.metrics.empty() ? true : false; }); +} + int MDSMonitor::filesystem_command( FSMap &fsmap, MonOpRequestRef op, diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index b0f88cd31302d..dd2a269009de2 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -53,6 +53,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand bool prepare_update(MonOpRequestRef op) override; bool should_propose(double& delay) override; bool has_health_warnings(std::vector warnings); + bool has_any_health_warning(); bool should_print_status() const { auto& fs = get_fsmap(); From 4d5ec87ab404c2b94aab6865061175eb5870fa33 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 27 Aug 2024 13:04:35 +0530 Subject: [PATCH 034/148] qa/cephfs: add tests for confirmationn required to change max_mds Add tests to ensure that when cluster has any health warning, especially MDS_TRIM, confirmation flag is mandatory to change max_mds. 
Signed-off-by: Rishabh Dave --- qa/tasks/cephfs/filesystem.py | 7 +++-- qa/tasks/cephfs/test_admin.py | 57 +++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index 1c00a49077dff..2b7fd2ee56945 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -640,8 +640,11 @@ def set_down(self, down=True): def set_joinable(self, joinable=True): self.set_var("joinable", joinable) - def set_max_mds(self, max_mds): - self.set_var("max_mds", "%d" % max_mds) + def set_max_mds(self, max_mds, confirm=True): + if confirm: + self.set_var('max_mds', f'{max_mds}', '--yes-i-really-mean-it') + else: + self.set_var("max_mds", f"{max_mds}",) def set_session_timeout(self, timeout): self.set_var("session_timeout", "%d" % timeout) diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py index ff9962e73104d..315d9140119d0 100644 --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@ -2659,3 +2659,60 @@ def test_with_health_warn_with_2_active_MDSs(self): errmsgs=health_warn) self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it') self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it') + + +class TestFSSetMaxMDS(TestAdminCommands): + + def test_when_unhealthy_without_confirm(self): + ''' + Test that command "ceph fs set max_mds " without the + confirmation flag (--yes-i-really-mean-it) fails when cluster is + unhealthy. + ''' + self.gen_health_warn_mds_cache_oversized() + + with self.assertRaises(CommandFailedError) as cfe: + self.fs.set_max_mds(2, confirm=False) + self.assertEqual(cfe.exception.exitstatus, errno.EPERM) + + def test_when_unhealthy_with_confirm(self): + ''' + Test that command "ceph fs set max_mds + --yes-i-really-mean-it" runs successfully when cluster is unhealthy. + ''' + self.gen_health_warn_mds_cache_oversized() + + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + def test_when_mds_trim_without_confirm(self): + ''' + Test that command "ceph fs set max_mds " without the + confirmation flag (--yes-i-really-mean-it) fails when cluster has + MDS_TRIM health warning. + ''' + self.gen_health_warn_mds_trim() + + with self.assertRaises(CommandFailedError) as cfe: + self.fs.set_max_mds(2, confirm=False) + self.assertEqual(cfe.exception.exitstatus, errno.EPERM) + + def test_when_mds_trim_when_with_confirm(self): + ''' + Test that command "ceph fs set max_mds + --yes-i-really-mean-it" runs successfully when cluster has MDS_TRIM + health warning. + ''' + self.gen_health_warn_mds_trim() + + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + def test_when_healthy_with_confirm(self): + ''' + Test that command "ceph fs set max_mds + --yes-i-really-mean-it" runs successfully also when cluster is + healthy. + ''' + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) From 2d28faaeea11988867471a53e40145f309951307 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 27 Aug 2024 13:33:23 +0530 Subject: [PATCH 035/148] doc/cephfs: update about changing max_mds FS setting variable Update the documentation for CephFs admininstration as well troubleshooting. 
Signed-off-by: Rishabh Dave --- doc/cephfs/administration.rst | 11 +++++++++-- doc/cephfs/troubleshooting.rst | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/cephfs/administration.rst b/doc/cephfs/administration.rst index 5760e67f73e64..07646bff06786 100644 --- a/doc/cephfs/administration.rst +++ b/doc/cephfs/administration.rst @@ -61,10 +61,17 @@ is a subset of the same information from the ``ceph fs dump`` command. :: - ceph fs set + ceph fs set [--yes-i-really-mean-it] Change a setting on a file system. These settings are specific to the named -file system and do not affect other file systems. +file system and do not affect other file systems. Confirmation flag is only +needed for changing ``max_mds`` when cluster is unhealthy. + +.. note:: It is mandatory to pass confirmation flag (--yes--i-really-mean-it) + for modifying FS setting variable ``max_mds`` when cluster is unhealthy. + It has been added a precaution to tell users that modifying ``max_mds`` + during troubleshooting or recovery might not help. Instead, it might + further destabilize the cluster. :: diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst index 34de1b7501df9..78d0a8f54d336 100644 --- a/doc/cephfs/troubleshooting.rst +++ b/doc/cephfs/troubleshooting.rst @@ -128,6 +128,11 @@ things to do: That prevents any clients from establishing new sessions with the MDS. +* **Dont tweak max_mds** Modifying the FS setting variable ``max_mds`` is + sometimes perceived as a good step during troubleshooting or recovery effort. + Instead, doing so might further destabilize the cluster. If ``max_mds`` must + be changed in such circumstances, run the command to change ``max_mds`` with + the confirmation flag (``--yes-i-really-mean-it``) Expediting MDS journal trim From a71c8e8d1186823cf5d01f23d7b922c5e2665aa5 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 27 Aug 2024 13:50:49 +0530 Subject: [PATCH 036/148] PendingReleaseNotes: add a release note about confirm flag for max_mds Add a release note for the fact that users now need to pass the confirmation flag for modifying "max_mds" when cluster is unhealthy. Signed-off-by: Rishabh Dave --- PendingReleaseNotes | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 0185d6e54eaf2..c35924c6e8690 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -13,6 +13,14 @@ output is changed from 'STATUS' to 'STATE'. The state of a group snapshot that was shown as 'ok' is now shown as 'complete', which is more descriptive. +* CephFS: Modifying the FS setting variable "max_mds" when a cluster is + unhealthy now requires users to pass the confirmation flag + (--yes-i-really-mean-it). This has been added as a precaution to tell the + users that modifying "max_mds" may not help with troubleshooting or recovery + effort. Instead, it might further destabilize the cluster. + + + >=19.0.0 * cephx: key rotation is now possible using `ceph auth rotate`. 
Previously, From 3e3b7fa8ea22a2dd7f5d5c97d6e096a5e30585f4 Mon Sep 17 00:00:00 2001 From: Adam King Date: Tue, 1 Oct 2024 10:03:13 -0400 Subject: [PATCH 037/148] mgr/cephadm: add "allow_set_io_flusher_fail = true;" to ganesha conf This is necessary for ganesha 6.1 running in a container to start up without hitting a permission failure (without having to run the container as --privileged) and doesn't seem to cause any damage when read in by ganesha v5.9 (the current version in the main branch containers) Signed-off-by: Adam King --- src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 | 1 + src/pybind/mgr/cephadm/tests/test_services.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 index ded403169c976..03ff8a32ca292 100644 --- a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 @@ -4,6 +4,7 @@ NFS_CORE_PARAM { Enable_RQUOTA = false; Protocols = 4; NFS_Port = {{ port }}; + allow_set_io_flusher_fail = true; {% if bind_addr %} Bind_addr = {{ bind_addr }}; {% endif %} diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a9b7da624a0e6..538ca65371489 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -2710,6 +2710,7 @@ def fake_keys(): ' Enable_RQUOTA = false;\n' ' Protocols = 4;\n' ' NFS_Port = 2049;\n' + ' allow_set_io_flusher_fail = true;\n' ' HAProxy_Hosts = 192.168.122.111, 10.10.2.20, 192.168.122.222;\n' '}\n' '\n' From fd895dde9d69ae7be7a78d8db37a2f94ded27080 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Mon, 30 Sep 2024 15:30:25 +0200 Subject: [PATCH 038/148] cephadm: pull container images from quay.io Now that all required images are hosted and/or mirrored on quay.io we can move away from docker.io Fixes: https://tracker.ceph.com/issues/68323 Signed-off-by: Guillaume Abrioux --- src/cephadm/cephadmlib/constants.py | 8 ++--- src/cephadm/cephadmlib/data_utils.py | 12 +++---- src/cephadm/samples/custom_container.json | 2 +- src/cephadm/tests/build/test_cephadm_build.py | 4 +-- src/cephadm/tests/test_cephadm.py | 36 +++++++++---------- src/cephadm/tests/test_custom_container.py | 2 +- src/cephadm/tox.ini | 3 +- src/pybind/mgr/cephadm/module.py | 8 ++--- src/pybind/mgr/cephadm/tests/test_cephadm.py | 2 +- src/pybind/mgr/cephadm/tests/test_spec.py | 20 +++++------ src/pybind/mgr/cephadm/upgrade.py | 12 +++---- .../service-daemon-list.component.spec.ts | 12 +++---- .../orchestrator/tests/test_orchestrator.py | 2 +- src/pybind/mgr/tox.ini | 3 +- 14 files changed, 64 insertions(+), 62 deletions(-) diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py index d25eb1391e0c0..354c378239802 100644 --- a/src/cephadm/cephadmlib/constants.py +++ b/src/cephadm/cephadmlib/constants.py @@ -5,15 +5,15 @@ DEFAULT_IMAGE_IS_MAIN = True DEFAULT_IMAGE_RELEASE = 'squid' DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0' -DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0' -DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0' +DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0' +DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0' DEFAULT_GRAFANA_IMAGE = 
'quay.io/ceph/grafana:10.4.8' DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3' DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4' DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17' -DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1' +DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1' DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23' DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29' @@ -22,7 +22,7 @@ DEFAULT_SMBMETRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest' DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126' DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0' -DEFAULT_REGISTRY = 'docker.io' # normalize unqualified digests to this +DEFAULT_REGISTRY = 'quay.io' # normalize unqualified digests to this # ------------------------------------------------------------------------------ LATEST_STABLE_RELEASE = 'squid' diff --git a/src/cephadm/cephadmlib/data_utils.py b/src/cephadm/cephadmlib/data_utils.py index 2f4674752cc17..0ab8b38d2b518 100644 --- a/src/cephadm/cephadmlib/data_utils.py +++ b/src/cephadm/cephadmlib/data_utils.py @@ -165,17 +165,17 @@ def is_fsid(s): def normalize_image_digest(digest: str) -> str: """ Normal case: - >>> normalize_image_digest('ceph/ceph', 'docker.io') - 'docker.io/ceph/ceph' + >>> normalize_image_digest('ceph/ceph', 'quay.io') + 'quay.io/ceph/ceph' No change: - >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io') + >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io') 'quay.ceph.io/ceph/ceph' - >>> normalize_image_digest('docker.io/ubuntu', 'docker.io') - 'docker.io/ubuntu' + >>> normalize_image_digest('quay.io/ubuntu', 'quay.io') + 'quay.io/ubuntu' - >>> normalize_image_digest('localhost/ceph', 'docker.io') + >>> normalize_image_digest('localhost/ceph', 'quay.io') 'localhost/ceph' """ known_shortnames = [ diff --git a/src/cephadm/samples/custom_container.json b/src/cephadm/samples/custom_container.json index 194a44d2abbf1..210cf1e3e552a 100644 --- a/src/cephadm/samples/custom_container.json +++ b/src/cephadm/samples/custom_container.json @@ -1,5 +1,5 @@ { - "image": "docker.io/prom/alertmanager:v0.20.0", + "image": "quay.io/prometheus/alertmanager:v0.20.0", "ports": [9093, 9094], "args": [ "-p", "9093:9093", diff --git a/src/cephadm/tests/build/test_cephadm_build.py b/src/cephadm/tests/build/test_cephadm_build.py index 1465c2c5efea7..c2995a76d4b15 100644 --- a/src/cephadm/tests/build/test_cephadm_build.py +++ b/src/cephadm/tests/build/test_cephadm_build.py @@ -34,12 +34,12 @@ }, 'ubuntu-20.04': { 'name': 'cephadm-build-test:ubuntu-20-04-py3', - 'base_image': 'docker.io/library/ubuntu:20.04', + 'base_image': 'quay.io/library/ubuntu:20.04', 'script': 'apt update && apt install -y python3-venv', }, 'ubuntu-22.04': { 'name': 'cephadm-build-test:ubuntu-22-04-py3', - 'base_image': 'docker.io/library/ubuntu:22.04', + 'base_image': 'quay.io/library/ubuntu:22.04', 'script': 'apt update && apt install -y python3-venv', }, } diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py index 928982de70b6f..f27b9bcd3625a 100644 --- a/src/cephadm/tests/test_cephadm.py +++ b/src/cephadm/tests/test_cephadm.py @@ -533,12 +533,12 @@ def test_registry_login(self, _logger, _get_parm, _call_throws): def test_get_image_info_from_inspect(self): # podman - out = 
"""204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]""" + out = """204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]""" r = _cephadm.get_image_info_from_inspect(out, 'registry/ceph/ceph:latest') print(r) assert r == { 'image_id': '204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1', - 'repo_digests': ['docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992'] + 'repo_digests': ['quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992'] } # docker @@ -550,13 +550,13 @@ def test_get_image_info_from_inspect(self): } # multiple digests (podman) - out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]""" + out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]""" r = _cephadm.get_image_info_from_inspect(out, 'registry/prom/prometheus:latest') assert r == { 'image_id': 'e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42', 'repo_digests': [ - 'docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4', - 'docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a', + 'quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4', + 'quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a', ] } @@ -604,7 +604,7 @@ def test_infer_local_ceph_image(self, _logger, _listdir): '') out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', return_value=cinfo): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) @@ -613,7 +613,7 @@ def test_infer_local_ceph_image(self, _logger, _listdir): # make sure first valid image is used when no container_info is found out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + 
quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', return_value=None): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) @@ -621,12 +621,12 @@ def test_infer_local_ceph_image(self, _logger, _listdir): # make sure images without digest are discarded (no container_info is found) out = '''quay.ceph.io/ceph-ci/ceph@||| - docker.io/ceph/ceph@||| - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + quay.io/ceph/ceph@||| + quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', return_value=None): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) - assert image == 'docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508' + assert image == 'quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508' @@ -2409,7 +2409,7 @@ class TestSNMPGateway: def test_unit_run_V2c(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V2c_config) ctx.fsid = fsid @@ -2434,11 +2434,11 @@ def test_unit_run_V2c(self, cephadm_fs): ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl') + assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl') def test_unit_run_V3_noPriv(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V3_no_priv_config) ctx.fsid = fsid @@ -2463,11 +2463,11 @@ def test_unit_run_V3_noPriv(self, cephadm_fs): ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000') + assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled 
--snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000') def test_unit_run_V3_Priv(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V3_priv_config) ctx.fsid = fsid @@ -2492,11 +2492,11 @@ def test_unit_run_V3_Priv(self, cephadm_fs): ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES') + assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES') def test_unit_run_no_dest(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.no_destination_config) ctx.fsid = fsid @@ -2512,7 +2512,7 @@ def test_unit_run_no_dest(self, cephadm_fs): def test_unit_run_bad_version(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.bad_version_config) ctx.fsid = fsid diff --git a/src/cephadm/tests/test_custom_container.py b/src/cephadm/tests/test_custom_container.py index c185b0908df6c..197ed38dca3be 100644 --- a/src/cephadm/tests/test_custom_container.py +++ b/src/cephadm/tests/test_custom_container.py @@ -47,7 +47,7 @@ def setUp(self): ] ] }, - image='docker.io/library/hello-world:latest' + image='quay.io/hello-world/hello-world:latest' ) def test_entrypoint(self): diff --git a/src/cephadm/tox.ini b/src/cephadm/tox.ini index 70e9a411238fb..20608c1681ce1 100644 --- a/src/cephadm/tox.ini +++ b/src/cephadm/tox.ini @@ -49,7 +49,8 @@ deps = flake8-quotes commands = flake8 --config=tox.ini {posargs:cephadm.py cephadmlib} - bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 11' + bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 1' + bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "quay.io" | wc -l) == 25' # Downstream distributions may choose to alter this "docker.io" number, # to make sure no new references to docker.io are creeping in unnoticed. 
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 5216c489064c9..178f9cb7ce803 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -135,13 +135,13 @@ def os_exit_noop(status: int) -> None: DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0' DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17' -DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0' -DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0' +DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0' +DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0' DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8' DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3' DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4' -DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1' +DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1' DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23' DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29' @@ -446,7 +446,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, Option( 'default_registry', type='str', - default='docker.io', + default='quay.io', desc='Search-registry to which we should normalize unqualified image names. ' 'This is not the default registry', ), diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 5a485f98be390..975c125225dc8 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -2040,7 +2040,7 @@ def test_blink_device_light_custom_per_host(self, _run_cephadm, cephadm_module): ), CephadmOrchestrator.apply_iscsi), (CustomContainerSpec( service_id='hello-world', - image='docker.io/library/hello-world:latest', + image='quay.io/hello-world/hello-world:latest', uid=65534, gid=65534, dirs=['foo/bar'], diff --git a/src/pybind/mgr/cephadm/tests/test_spec.py b/src/pybind/mgr/cephadm/tests/test_spec.py index 78a2d73118fe7..42e590945cd96 100644 --- a/src/pybind/mgr/cephadm/tests/test_spec.py +++ b/src/pybind/mgr/cephadm/tests/test_spec.py @@ -130,7 +130,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "d94d7969094d", "container_image_id": "0881eb8f169f5556a292b4e2c01d683172b12830a62a9225a98a8e206bb734f0", - "container_image_name": "docker.io/prom/alertmanager:latest", + "container_image_name": "quay.io/prometheus/alertmanager:latest", "daemon_id": "ceph-001", "daemon_type": "alertmanager", "version": "0.20.0", @@ -145,7 +145,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "c4b036202241", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001", "daemon_type": "crash", "version": "15.2.0", @@ -160,7 +160,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "5b7b94b48f31", "container_image_id": "87a51ecf0b1c9a7b187b21c1b071425dafea0d765a96d5bc371c791169b3d7f4", - "container_image_name": "docker.io/ceph/ceph-grafana:latest", + "container_image_name": "quay.io/ceph/ceph-grafana:latest", "daemon_id": "ceph-001", "daemon_type": "grafana", "version": "6.6.2", @@ -175,7 
+175,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "9ca007280456", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001.gkjwqp", "daemon_type": "mgr", "version": "15.2.0", @@ -190,7 +190,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "3d1ba9a2b697", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001", "daemon_type": "mon", "version": "15.2.0", @@ -205,7 +205,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "36d026c68ba1", "container_image_id": "e5a616e4b9cf68dfcad7782b78e118be4310022e874d52da85c55923fb615f87", - "container_image_name": "docker.io/prom/node-exporter:latest", + "container_image_name": "quay.io/prometheus/node-exporter:latest", "daemon_id": "ceph-001", "daemon_type": "node-exporter", "version": "0.18.1", @@ -220,7 +220,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "faf76193cbfe", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "0", "daemon_type": "osd", "version": "15.2.0", @@ -235,7 +235,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "f82505bae0f1", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "1", "daemon_type": "osd", "version": "15.2.0", @@ -250,7 +250,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "2708d84cd484", "container_image_id": "358a0d2395fe711bb8258e8fb4b2d7865c0a9a6463969bcd1452ee8869ea6653", - "container_image_name": "docker.io/prom/prometheus:latest", + "container_image_name": "quay.io/prom/prometheus:latest", "daemon_id": "ceph-001", "daemon_type": "prometheus", "version": "2.17.1", @@ -569,7 +569,7 @@ def convert_to_old_style_json(j): CustomContainerSpec( service_type='container', service_id='hello-world', - image='docker.io/library/hello-world:latest', + image='quay.io/hello-world/hello-world:latest', ), DaemonDescription( daemon_type='container', diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index d8ffab2da5187..ed3d26807e5ce 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -29,17 +29,17 @@ def normalize_image_digest(digest: str, default_registry: str) -> str: """ Normal case: - >>> normalize_image_digest('ceph/ceph', 'docker.io') - 'docker.io/ceph/ceph' + >>> normalize_image_digest('ceph/ceph', 'quay.io') + 'quay.io/ceph/ceph' No change: - >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io') + >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io') 'quay.ceph.io/ceph/ceph' - >>> normalize_image_digest('docker.io/ubuntu', 'docker.io') - 'docker.io/ubuntu' + >>> normalize_image_digest('quay.io/centos', 'quay.io') + 'quay.io/centos' - >>> normalize_image_digest('localhost/ceph', 'docker.io') + >>> normalize_image_digest('localhost/ceph', 'quay.io') 'localhost/ceph' """ known_shortnames = [ diff --git 
a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts index d3ea8c018f66a..367418c752e07 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts @@ -27,7 +27,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: '003c10beafc8c27b635bcdfed1ed832e4c1005be89bb1bb05ad4cc6c2b98e41b', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '3', daemon_type: 'osd', daemon_name: 'osd.3', @@ -47,7 +47,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: 'baeec41a01374b3ed41016d542d19aef4a70d69c27274f271e26381a0cc58e7a', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '4', daemon_type: 'osd', daemon_name: 'osd.4', @@ -63,7 +63,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: '8483de277e365bea4365cee9e1f26606be85c471e4da5d51f57e4b85a42c616e', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '5', daemon_type: 'osd', daemon_name: 'osd.5', @@ -79,7 +79,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'mon0', container_id: '6ca0574f47e300a6979eaf4e7c283a8c4325c2235ae60358482fc4cd58844a21', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: 'a', daemon_name: 'mon.a', daemon_type: 'mon', @@ -99,7 +99,7 @@ describe('ServiceDaemonListComponent', () => { service_name: 'osd', status: { container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', size: 3, running: 3, last_refresh: '2020-02-25T04:33:26.465699' @@ -111,7 +111,7 @@ describe('ServiceDaemonListComponent', () => { service_name: 'crash', status: { container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', size: 1, running: 1, last_refresh: '2020-02-25T04:33:26.465766' diff --git a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py index 726a7ac7937c5..3247b06a3993b 100644 --- a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py +++ b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py @@ -102,7 +102,7 @@ def test_yaml(): host_pattern: '*' status: container_image_id: 
74803e884bea289d2d2d3ebdf6d37cd560499e955595695b1390a89800f4e37a - container_image_name: docker.io/ceph/daemon-base:latest-master-devel + container_image_name: quay.io/ceph/daemon-base:latest-main-devel created: '2020-06-10T10:37:31.051288Z' last_refresh: '2020-06-10T10:57:40.715637Z' running: 1 diff --git a/src/pybind/mgr/tox.ini b/src/pybind/mgr/tox.ini index a8a2d39d01a73..f39ececa93dd5 100644 --- a/src/pybind/mgr/tox.ini +++ b/src/pybind/mgr/tox.ini @@ -160,7 +160,8 @@ modules = commands = flake8 --config=tox.ini {posargs} \ {posargs:{[testenv:flake8]modules}} - bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 13' + bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 3' + bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "quay.io" | wc -l) == 26' [testenv:jinjalint] deps = From 69baa6de2579f0e3ce8298e14b970b8c68deae9c Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 27 Sep 2024 21:21:34 -0400 Subject: [PATCH 039/148] common: assert debug mutex lock is not held if !recursive There's appropriate checks for unlock and post-lock but nothing to stop the undefined behavior of a double-lock on a non-recursive mutex. Signed-off-by: Patrick Donnelly --- src/common/mutex_debug.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/common/mutex_debug.h b/src/common/mutex_debug.h index c1a4ff2a43501..d56d0ebee9987 100644 --- a/src/common/mutex_debug.h +++ b/src/common/mutex_debug.h @@ -169,20 +169,16 @@ class mutex_debug_impl : public mutex_debugging_base } bool try_lock(bool no_lockdep = false) { - bool locked = try_lock_impl(); - if (locked) { - if (enable_lockdep(no_lockdep)) - _locked(); - _post_lock(); - } - return locked; + ceph_assert(recursive || !is_locked_by_me()); + return _try_lock(no_lockdep); } void lock(bool no_lockdep = false) { + ceph_assert(recursive || !is_locked_by_me()); if (enable_lockdep(no_lockdep)) _will_lock(recursive); - if (try_lock(no_lockdep)) + if (_try_lock(no_lockdep)) return; lock_impl(); @@ -198,6 +194,16 @@ class mutex_debug_impl : public mutex_debugging_base unlock_impl(); } +private: + bool _try_lock(bool no_lockdep) { + bool locked = try_lock_impl(); + if (locked) { + if (enable_lockdep(no_lockdep)) + _locked(); + _post_lock(); + } + return locked; + } }; From bec702dad7a1075c482b4e89c2e2f745e4058123 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 27 Sep 2024 21:24:43 -0400 Subject: [PATCH 040/148] common,osdc: remove obsolete ceph::mutex_debugging Now that we confirm a lock is not held in mutex_debug::lock. 
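The contract these two commits rely on can be summarised with a small Python analogue, purely for illustration (`DebugMutex` does not exist anywhere in the tree): a non-recursive debug mutex now asserts if the thread that already holds it tries to lock it again, so callers no longer need their own "not locked by me" assertions.

```python
import threading

class DebugMutex:
    """Illustrative Python analogue of the new ceph::mutex_debug contract."""
    def __init__(self, recursive=False):
        self._recursive = recursive
        self._lock = threading.RLock() if recursive else threading.Lock()
        self._owner = None
        self._depth = 0

    def is_locked_by_me(self):
        return self._owner == threading.get_ident()

    def lock(self):
        # analogue of ceph_assert(recursive || !is_locked_by_me())
        assert self._recursive or not self.is_locked_by_me(), \
            "double lock of a non-recursive mutex by the same thread"
        self._lock.acquire()
        self._owner = threading.get_ident()
        self._depth += 1

    def unlock(self):
        assert self.is_locked_by_me()
        self._depth -= 1
        if self._depth == 0:
            self._owner = None
        self._lock.release()
```

With the assertion living inside lock() itself, per-call-site checks such as the ones removed from Journaler and config_proxy become redundant.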
Signed-off-by: Patrick Donnelly --- src/common/ceph_mutex.h | 5 ----- src/common/config_proxy.h | 1 - src/osdc/Journaler.h | 14 -------------- 3 files changed, 20 deletions(-) diff --git a/src/common/ceph_mutex.h b/src/common/ceph_mutex.h index 059d81f2ac39b..6ed8c56d5dad3 100644 --- a/src/common/ceph_mutex.h +++ b/src/common/ceph_mutex.h @@ -83,7 +83,6 @@ namespace ceph { return {}; } - static constexpr bool mutex_debugging = false; #define ceph_mutex_is_locked(m) true #define ceph_mutex_is_locked_by_me(m) true } @@ -131,8 +130,6 @@ namespace ceph { return {std::forward(args)...}; } - static constexpr bool mutex_debugging = true; - // debug methods #define ceph_mutex_is_locked(m) ((m).is_locked()) #define ceph_mutex_is_not_locked(m) (!(m).is_locked()) @@ -186,8 +183,6 @@ namespace ceph { return {}; } - static constexpr bool mutex_debugging = false; - // debug methods. Note that these can blindly return true // because any code that does anything other than assert these // are true is broken. diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h index b9b47d9cef472..12a273b8c84f7 100644 --- a/src/common/config_proxy.h +++ b/src/common/config_proxy.h @@ -31,7 +31,6 @@ class ConfigProxy { using rev_obs_map_t = ObsMgr::rev_obs_map; void _call_observers(rev_obs_map_t& rev_obs) { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); for (auto& [obs, keys] : rev_obs) { (*obs)->handle_conf_change(*this, keys); } diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h index 4a574ed66d94e..d15862c08ba52 100644 --- a/src/osdc/Journaler.h +++ b/src/osdc/Journaler.h @@ -529,43 +529,35 @@ class Journaler { // =================== Header get_last_committed() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return last_committed; } Header get_last_written() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return last_written; } uint64_t get_layout_period() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return layout.get_period(); } file_layout_t get_layout() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return layout; } bool is_active() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return state == STATE_ACTIVE; } bool is_stopping() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return state == STATE_STOPPING; } int get_error() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return error; } bool is_readonly() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return readonly; } @@ -573,32 +565,26 @@ class Journaler { bool _is_readable(); bool try_read_entry(bufferlist& bl); uint64_t get_write_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return write_pos; } uint64_t get_write_safe_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return safe_pos; } uint64_t get_read_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return read_pos; } uint64_t get_expire_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); 
lock_guard l(lock); return expire_pos; } uint64_t get_trimmed_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return trimmed_pos; } size_t get_journal_envelope_size() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return journal_stream.get_envelope_size(); } From 1213df95915a792be66126acec1e08aa5bf3c795 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 30 Sep 2024 09:51:54 -0400 Subject: [PATCH 041/148] test/common: fix invalid vim mode Signed-off-by: Patrick Donnelly --- src/test/common/test_mutex_debug.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc index 977dfe738a921..565dcd64de9c0 100644 --- a/src/test/common/test_mutex_debug.cc +++ b/src/test/common/test_mutex_debug.cc @@ -1,5 +1,5 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 &smarttab +// vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * From 84ebb30ea94bea60a5ff93f1a6b334000455da82 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 30 Sep 2024 09:53:09 -0400 Subject: [PATCH 042/148] common/test: do not test exception raised from recursive lock The C++ standard does not require that implementations raise std::system_error when double-locking a non-recursive lock. Our implementation of debug_mutex now catches this error with a ceph_assert so it cannot be caught. Signed-off-by: Patrick Donnelly --- src/test/common/test_mutex_debug.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc index 565dcd64de9c0..29eb8076859c9 100644 --- a/src/test/common/test_mutex_debug.cc +++ b/src/test/common/test_mutex_debug.cc @@ -65,7 +65,6 @@ TEST(MutexDebug, NotRecursive) { ASSERT_TRUE(m.is_locked()); ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); - ASSERT_THROW(m.lock(), std::system_error); ASSERT_TRUE(m.is_locked()); ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); From 8d2d54f6c26295e8c5ba5e5fe9ca5e14f8ba7786 Mon Sep 17 00:00:00 2001 From: neeraj pratap singh Date: Thu, 26 Sep 2024 14:25:00 +0530 Subject: [PATCH 043/148] cephfs-shell: fixing the cephfs-shell test failures cephfs-shell is failing in Ubuntu22.04, because it is behaving weirdly with cmd2's version. It is taking cmd2 version as 0.0.0 instead of the correct version. Fixes: https://tracker.ceph.com/issues/63700 Signed-off-by: Neeraj Pratap Singh --- src/tools/cephfs/shell/cephfs-shell | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell index 9449007a80b97..3f19a637e6864 100755 --- a/src/tools/cephfs/shell/cephfs-shell +++ b/src/tools/cephfs/shell/cephfs-shell @@ -15,14 +15,22 @@ import re import shlex import stat import errno +import distro from cmd2 import Cmd from cmd2 import __version__ as cmd2_version from packaging.version import Version +# DFLAG is used to override the checks done by cephfs-shell +# for cmd2 versions due to weird behaviour of Ubuntu22.04 with +# cmd2's version i.e. it always gets the version of cmd2 as +# "0.0.0" instead of the actual cmd2 version. 
+DFLAG = False +if distro.name() == "Ubuntu" and distro.version() == "22.04": + DFLAG = True # XXX: In cmd2 versions < 1.0.1, we'll get SystemExit(2) instead of # Cmd2ArgparseError -if Version(cmd2_version) >= Version("1.0.1"): +if Version(cmd2_version) >= Version("1.0.1") or DFLAG is True: from cmd2.exceptions import Cmd2ArgparseError else: # HACK: so that we don't have check for version everywhere @@ -1700,7 +1708,7 @@ def read_shell_conf(shell, shell_conf_file): sec = 'cephfs-shell' opts = [] - if Version(cmd2_version) >= Version("0.10.0"): + if Version(cmd2_version) >= Version("0.10.0") or DFLAG is True: for attr in shell.settables.keys(): opts.append(attr) else: @@ -1768,7 +1776,7 @@ def manage_args(): args.exe_and_quit = False # Execute and quit, don't launch the shell. if args.batch: - if Version(cmd2_version) <= Version("0.9.13"): + if Version(cmd2_version) <= Version("0.9.13") and DFLAG is not True: args.commands = ['load ' + args.batch, ',quit'] else: args.commands = ['run_script ' + args.batch, ',quit'] @@ -1813,7 +1821,7 @@ def execute_cmds_and_quit(args): # value to indicate whether the execution of the commands should stop, but # since 0.9.7 it returns the return value of do_* methods only if it's # not None. When it is None it returns False instead of None. - if Version(cmd2_version) <= Version("0.9.6"): + if Version(cmd2_version) <= Version("0.9.6") and DFLAG is not True: stop_exec_val = None else: stop_exec_val = False From a48080af3956ae84fb2b3e5da2db1ca16c308c2f Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 2 Oct 2024 10:48:34 -0400 Subject: [PATCH 044/148] test/common: add death test for double !recursive lock Signed-off-by: Patrick Donnelly --- src/test/common/test_mutex_debug.cc | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc index 29eb8076859c9..cee4b427770ae 100644 --- a/src/test/common/test_mutex_debug.cc +++ b/src/test/common/test_mutex_debug.cc @@ -57,20 +57,13 @@ TEST(MutexDebug, Lock) { test_lock(); } -TEST(MutexDebug, NotRecursive) { +TEST(MutexDebugDeathTest, NotRecursive) { ceph::mutex_debug m("foo"); - auto ttl = &test_try_lock; - - ASSERT_NO_THROW(m.lock()); - ASSERT_TRUE(m.is_locked()); - ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); - + // avoid assert during test cleanup where the mutex is locked and cannot be + // pthread_mutex_destroy'd + std::unique_lock locker{m}; ASSERT_TRUE(m.is_locked()); - ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); - - ASSERT_NO_THROW(m.unlock()); - ASSERT_FALSE(m.is_locked()); - ASSERT_TRUE(std::async(std::launch::async, ttl, &m).get()); + ASSERT_DEATH(m.lock(), "FAILED ceph_assert(recursive || !is_locked_by_me())"); } TEST(MutexRecursiveDebug, Lock) { From 5a9c1c06e8dffaadebbe44ed8c329bd5bc34537e Mon Sep 17 00:00:00 2001 From: Anoop C S Date: Fri, 4 Oct 2024 16:04:15 +0530 Subject: [PATCH 045/148] cephadm/smb: Add a provision to specify ctdb log level sambacc already accepts 'log_level' as a field inside ctdb config stub to explicitly set the log level for ctdbd. Make use of this to provide a means when non default log levels are desired in future for ctdb. 
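With the new field set, the stub JSON handed to sambacc simply gains a 'log_level' entry. Roughly, with every value other than the log level abbreviated or made up for the example (the surrounding keys follow _write_ctdb_stub_config):

```python
import json

ctdb_log_level = 'DEBUG'            # e.g. as it might come from the service spec

stub_config = {
    'samba-container-config': 'v0',
    'ctdb': {
        'recovery_lock': '!<reclock command>',     # placeholder
        'cluster_meta_uri': '<cluster meta uri>',  # placeholder
        'nodes_cmd': '<nodes command>',            # placeholder
    },
}
if ctdb_log_level:
    # only emitted when a non-default level is requested
    stub_config['ctdb']['log_level'] = ctdb_log_level

print(json.dumps(stub_config, indent=2))
```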
Signed-off-by: Anoop C S --- src/cephadm/cephadmlib/daemons/smb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py index 74cb13f4ab022..5e400481a857a 100644 --- a/src/cephadm/cephadmlib/daemons/smb.py +++ b/src/cephadm/cephadmlib/daemons/smb.py @@ -72,6 +72,7 @@ class Config: instance_id: str source_config: str samba_debug_level: int + ctdb_log_level: str debug_delay: int domain_member: bool clustered: bool @@ -98,6 +99,7 @@ def __init__( domain_member: bool, clustered: bool, samba_debug_level: int = 0, + ctdb_log_level: str = '', debug_delay: int = 0, join_sources: Optional[List[str]] = None, user_sources: Optional[List[str]] = None, @@ -119,6 +121,7 @@ def __init__( self.domain_member = domain_member self.clustered = clustered self.samba_debug_level = samba_debug_level + self.ctdb_log_level = ctdb_log_level self.debug_delay = debug_delay self.join_sources = join_sources or [] self.user_sources = user_sources or [] @@ -756,7 +759,7 @@ def prepare_data_dir(self, data_dir: str, uid: int, gid: int) -> None: def _write_ctdb_stub_config(self, path: pathlib.Path) -> None: reclock_cmd = ' '.join(_MUTEX_SUBCMD + [self._cfg.cluster_lock_uri]) nodes_cmd = ' '.join(_NODES_SUBCMD) - stub_config = { + stub_config: Dict[str, Any] = { 'samba-container-config': 'v0', 'ctdb': { # recovery_lock is passed directly to ctdb: needs '!' prefix @@ -768,6 +771,8 @@ def _write_ctdb_stub_config(self, path: pathlib.Path) -> None: ), }, } + if self._cfg.ctdb_log_level: + stub_config['ctdb']['log_level'] = self._cfg.ctdb_log_level with file_utils.write_new(path) as fh: json.dump(stub_config, fh) From 6d8f61015f29ef3bfd737d78a0b8734171574c98 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Thu, 26 Sep 2024 21:23:21 +0530 Subject: [PATCH 046/148] qa: Add data read/write test for nfs-ganesha Fixes: https://tracker.ceph.com/issues/68146 Signed-off-by: Kotresh HR --- qa/tasks/cephfs/test_nfs.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py index 932d504d47f3e..19076ea44b3be 100644 --- a/qa/tasks/cephfs/test_nfs.py +++ b/qa/tasks/cephfs/test_nfs.py @@ -8,6 +8,7 @@ from tasks.mgr.mgr_test_case import MgrTestCase from teuthology import contextutil from teuthology.exceptions import CommandFailedError +from teuthology.orchestra.run import Raw log = logging.getLogger(__name__) @@ -319,7 +320,7 @@ def _get_port_ip_info(self): else: log.warning(f'{e}, retrying') - def _test_mnt(self, pseudo_path, port, ip, check=True): + def _test_mnt(self, pseudo_path, port, ip, check=True, datarw=False): ''' Test mounting of created exports :param pseudo_path: It is the pseudo root name @@ -347,12 +348,27 @@ def _test_mnt(self, pseudo_path, port, ip, check=True): self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt']) try: + # Clean up volumes directory created by subvolume create by some tests + self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/volumes']) self.ctx.cluster.run(args=['touch', '/mnt/test']) out_mnt = self._sys_cmd(['ls', '/mnt']) self.assertEqual(out_mnt, b'test\n') + if datarw: + self.ctx.cluster.run(args=['echo', 'test data', Raw('|'), 'tee', '/mnt/test1']) + out_test1 = self._sys_cmd(['cat', '/mnt/test1']) + self.assertEqual(out_test1, b'test data\n') finally: self.ctx.cluster.run(args=['sudo', 'umount', '/mnt']) + def _test_data_read_write(self, pseudo_path, port, ip): + ''' + Check if read/write works fine + 
''' + try: + self._test_mnt(pseudo_path, port, ip, True, True) + except CommandFailedError as e: + self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}") + def _write_to_read_only_export(self, pseudo_path, port, ip): ''' Check if write to read only export fails @@ -599,6 +615,18 @@ def test_write_to_read_only_export(self): self._write_to_read_only_export(self.pseudo_path, port, ip) self._test_delete_cluster() + def test_data_read_write(self): + ''' + Test date read and write on export. + ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path]) + port, ip = self._get_port_ip_info() + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed') + self._test_data_read_write(self.pseudo_path, port, ip) + self._test_delete_cluster() + def test_cluster_info(self): ''' Test cluster info outputs correct ip and hostname From 59b996f0ed022f1bafd77317467d2e18ff0fa710 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Sat, 28 Sep 2024 23:34:50 +0530 Subject: [PATCH 047/148] qa: Add libcephfs client test with objectcacher disabled Fixes: https://tracker.ceph.com/issues/68146 Signed-off-by: Kotresh HR --- qa/suites/fs/libcephfs/tasks/client.yaml | 1 + qa/workunits/client/test_oc_disabled.sh | 5 +++++ 2 files changed, 6 insertions(+) create mode 100755 qa/workunits/client/test_oc_disabled.sh diff --git a/qa/suites/fs/libcephfs/tasks/client.yaml b/qa/suites/fs/libcephfs/tasks/client.yaml index da84137322069..42ca9336c8e7d 100644 --- a/qa/suites/fs/libcephfs/tasks/client.yaml +++ b/qa/suites/fs/libcephfs/tasks/client.yaml @@ -12,3 +12,4 @@ tasks: clients: client.0: - client/test.sh + - client/test_oc_disabled.sh diff --git a/qa/workunits/client/test_oc_disabled.sh b/qa/workunits/client/test_oc_disabled.sh new file mode 100755 index 0000000000000..88552aa50bdc5 --- /dev/null +++ b/qa/workunits/client/test_oc_disabled.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +set -ex + +ceph_test_client --client_oc=false From b5af1c1ffe8786a96c866edcec69c78030e9f2e4 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Sat, 28 Sep 2024 23:19:30 +0530 Subject: [PATCH 048/148] test/client: Fix aio nonblocking test The same bufferlist is used without cleaning for multiple calls. The test 'LlreadvLlwritev' used to fail because of it. Fixed the same. 
Fixes: https://tracker.ceph.com/issues/68146 Signed-off-by: Kotresh HR --- src/test/client/nonblocking.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/test/client/nonblocking.cc b/src/test/client/nonblocking.cc index d4aecb10ffcb4..93bcfabd3fcf1 100644 --- a/src/test/client/nonblocking.cc +++ b/src/test/client/nonblocking.cc @@ -111,6 +111,8 @@ TEST_F(TestClient, LlreadvLlwritev) { writefinish.reset(new C_SaferCond("test-nonblocking-writefinish")); readfinish.reset(new C_SaferCond("test-nonblocking-readfinish")); ssize_t nwritten_a = iov_out_a[0].iov_len + iov_out_a[1].iov_len; + // reset bufferlist + bl.clear(); rc = client->ll_preadv_pwritev(fh, iov_out_a, 2, 100, true, writefinish.get(), nullptr); ASSERT_EQ(0, rc); @@ -130,6 +132,8 @@ TEST_F(TestClient, LlreadvLlwritev) { writefinish.reset(new C_SaferCond("test-nonblocking-writefinish")); readfinish.reset(new C_SaferCond("test-nonblocking-readfinish")); ssize_t nwritten_b = iov_out_b[0].iov_len + iov_out_b[1].iov_len; + // reset bufferlist + bl.clear(); rc = client->ll_preadv_pwritev(fh, iov_out_b, 2, 1000, true, writefinish.get(), nullptr, true, false); ASSERT_EQ(0, rc); From 3ebe97484d26cf5d9cd78636ee4718c075a2897b Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Thu, 26 Sep 2024 10:50:32 +0530 Subject: [PATCH 049/148] client: Fix libcephfs aio metadata corruption. Problem: With cephfs nfs-ganesha, there were following asserts hit while doing write on a file. 1. FAILED ceph_assert((bool)_front == (bool)_size) 2. FAILED ceph_assert(cap_refs[c] > 0) Cause: In aio path, the client_lock was not being held in the internal callback after the io is done where it's expected to be taken leading to corruption. Fix: Take client_lock in the callback Fixes: https://tracker.ceph.com/issues/68146 Signed-off-by: Kotresh HR --- src/client/Client.cc | 19 +++++++++++++++++-- src/client/Client.h | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index e208cf7667577..1bc67ce38bb87 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -11399,10 +11399,18 @@ int64_t Client::_write_success(Fh *f, utime_t start, uint64_t fpos, return r; } +void Client::C_Lock_Client_Finisher::finish(int r) +{ + std::scoped_lock lock(clnt->client_lock); + onfinish->complete(r); +} + void Client::C_Write_Finisher::finish_io(int r) { bool fini; + ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock)); + clnt->put_cap_ref(in, CEPH_CAP_FILE_BUFFER); if (r >= 0) { @@ -11438,6 +11446,8 @@ void Client::C_Write_Finisher::finish_fsync(int r) bool fini; client_t const whoami = clnt->whoami; // For the benefit of ldout prefix + ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock)); + ldout(clnt->cct, 3) << "finish_fsync r = " << r << dendl; fsync_finished = true; @@ -11598,6 +11608,7 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, std::unique_ptr iofinish = nullptr; std::unique_ptr cwf = nullptr; + std::unique_ptr filer_iofinish = nullptr; if (in->inline_version < CEPH_INLINE_NONE) { if (endoff > cct->_conf->client_max_inline_size || @@ -11709,7 +11720,10 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, if (onfinish == nullptr) { // We need a safer condition to wait on. 
cond_iofinish = new C_SaferCond(); - iofinish.reset(cond_iofinish); + filer_iofinish.reset(cond_iofinish); + } else { + //Register a wrapper callback for the C_Write_Finisher which takes 'client_lock' + filer_iofinish.reset(new C_Lock_Client_Finisher(this, iofinish.get())); } get_cap_ref(in, CEPH_CAP_FILE_BUFFER); @@ -11717,11 +11731,12 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(), offset, size, bl, ceph::real_clock::now(), 0, in->truncate_size, in->truncate_seq, - iofinish.get()); + filer_iofinish.get()); if (onfinish) { // handle non-blocking caller (onfinish != nullptr), we can now safely // release all the managed pointers + filer_iofinish.release(); iofinish.release(); onuninline.release(); cwf.release(); diff --git a/src/client/Client.h b/src/client/Client.h index 5a1e69394d02a..f8c39e2fdd6ab 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -1409,6 +1409,21 @@ class Client : public Dispatcher, public md_config_obs_t { void finish(int r) override; }; + // A wrapper callback which takes the 'client_lock' and finishes the context. + // One of the usecase is the filer->write_trunc which doesn't hold client_lock + // in the call back passed. So, use this wrapper in such cases. + class C_Lock_Client_Finisher : public Context { + public: + C_Lock_Client_Finisher(Client *clnt, Context *onfinish) + : clnt(clnt), onfinish(onfinish) {} + + private: + Client *clnt; + Context *onfinish; + + void finish(int r) override; + }; + class C_Write_Finisher : public Context { public: void finish_io(int r); From 10c8330f20cd2e93ce036d0ea2c38552d71b62c6 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Mon, 30 Sep 2024 12:45:04 +0530 Subject: [PATCH 050/148] client: Fix caps_ref[c]<0 assert When libcephfs aio tests (src/test/client) are run with objectcacher disabled (ceph_test_client --client_oc=false), the TestClient.LlreadvLlwritev fails and core dumps. The client hits the assert 'caps_ref[c]<0'. This patch fixes the same. There is no need to give out cap_ref and take it again between multiple read because of short reads. In some cases, the get_caps used to fail in C_Read_Sync_NonBlocking::finish causing cap_ref to go negative when put_cap_ref is done at last in C_Read_Finish::finish_io Fixes: https://tracker.ceph.com/issues/68308 Signed-off-by: Kotresh HR --- src/client/Client.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 1bc67ce38bb87..e73f821438b3a 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -10798,7 +10798,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r) goto success; } - clnt->put_cap_ref(in, CEPH_CAP_FILE_RD); // reverify size { r = clnt->_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms); @@ -10810,14 +10809,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r) if ((uint64_t)pos >= in->size) goto success; - { - int have_caps2 = 0; - r = clnt->get_caps(f, CEPH_CAP_FILE_RD, have_caps, &have_caps2, -1); - if (r < 0) { - goto error; - } - } - wanted = left; retry(); clnt->client_lock.unlock(); From 942474c2f5b4c696364f3b7411ae7d96444edfa8 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Sat, 28 Sep 2024 01:18:23 +0530 Subject: [PATCH 051/148] client: Fix aio zerobyte file read The following test fails when run with objectcacher disabled. 
TestClient.LlreadvLlwritevZeroBytes Failure - nonblocking.cc ceph/src/osdc/Striper.cc: 186: FAILED ceph_assert(len > 0) Traceback: ceph version Development (no_version) squid (dev) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x125) [0x7fc0a340aafe] 2: (ceph::register_assert_context(ceph::common::CephContext*)+0) [0x7fc0a340ad20] 3: (Striper::file_to_extents(ceph::common::CephContext*, file_layout_t const*, ...)+0x184) [0x562727e13ab4] 4: (Striper::file_to_extents(ceph::common::CephContext*, char const*, ...)+0x97) [0x562727e145d1] 5: (Striper::file_to_extents(ceph::common::CephContext*, inodeno_t, ...)+0x75) [0x562727d29520] 6: (Filer::read_trunc(inodeno_t, file_layout_t const*, snapid_t, ...)+0x61) [0x562727d66ea5] 7: (Client::C_Read_Sync_NonBlocking::retry()+0x10c) [0x562727cd8a8e] 8: (Client::_read(Fh*, long, unsigned long, ceph::buffer::v15_2_0::list*, Context*)+0x578) [0x562727d10cb6] 9: (Client::_preadv_pwritev_locked(Fh*, iovec const*, int, long, bool, ...)+0x3a7) [0x562727d18159] 10: (Client::ll_preadv_pwritev(Fh*, iovec const*, int, long, bool, ...)+0x179) [0x562727d18b99] 11: (TestClient_LlreadvLlwritevZeroBytes_Test::TestBody()+0x592) [0x562727ca5352] 12: (void testing::internal::HandleSehExceptionsInMethodIfSupported(testing::Test*, ...)+0x1b) [0x562727d9dea3] 13: (void testing::internal::HandleExceptionsInMethodIfSupported(testing::Test*, ...)+0x80) [0x562727da2b26] 14: (testing::Test::Run()+0xb4) [0x562727d927ae] 15: (testing::TestInfo::Run()+0x104) [0x562727d92988] 16: (testing::TestSuite::Run()+0xb2) [0x562727d92b34] 17: (testing::internal::UnitTestImpl::RunAllTests()+0x36b) [0x562727d95303] 18: (bool testing::internal::HandleSehExceptionsInMethodIfSupported(testing::internal::UnitTestImpl*, ...)(), char const*)+0x1b) [0x562727d9e15f] 19: (bool testing::internal::HandleExceptionsInMethodIfSupported(testing::internal::UnitTestImpl*, ...)+0x80) [0x562727da3083] 20: (testing::UnitTest::Run()+0x63) [0x562727d92813] 21: (RUN_ALL_TESTS()+0x11) [0x562727c828d9] 22: main() The patch fixes the same. Fixes: https://tracker.ceph.com/issues/68309 Signed-off-by: Kotresh HR --- src/client/Client.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/client/Client.cc b/src/client/Client.cc index e73f821438b3a..6577dd575f1fd 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -10962,6 +10962,20 @@ int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl, // branch below but in a non-blocking fashion. The code in _read_sync // is duplicated and modified and exists in // C_Read_Sync_NonBlocking::finish(). + + // trim read based on file size? 
+ if ((offset >= in->size) || (size == 0)) { + // read is requested at the EOF or the read len is zero, therefore just + // release managed pointers and complete the C_Read_Finisher immediately with 0 bytes + + Context *iof = iofinish.release(); + crf.release(); + iof->complete(0); + + // Signal async completion + return 0; + } + C_Read_Sync_NonBlocking *crsa = new C_Read_Sync_NonBlocking(this, iofinish.release(), f, in, f->pos, offset, size, bl, filer.get(), have); From db926acb533ac058090e3bbf1343bba0ca367051 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Oct 2024 15:31:07 -0400 Subject: [PATCH 052/148] doc/dev/radosgw: update paths that moved under src/rgw/driver/rados/ Signed-off-by: Casey Bodley --- doc/dev/radosgw/bucket_index.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/dev/radosgw/bucket_index.rst b/doc/dev/radosgw/bucket_index.rst index 6764641e0f50e..ceff57b58cfc8 100644 --- a/doc/dev/radosgw/bucket_index.rst +++ b/doc/dev/radosgw/bucket_index.rst @@ -32,7 +32,7 @@ For a given bucket, the index may be split into several rados objects, called bu The default shard count for new buckets is 11, but can be overridden in the zonegroup's ``bucket_index_max_shards`` or ceph.conf's ``rgw_override_bucket_index_max_shards``. As the number of objects in a bucket grows, its index shard count will also increase as a result of dynamic resharding. -Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/rgw_reshard.cc``. +Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/driver/rados/rgw_reshard.cc``. ----------------- Index Transaction @@ -46,7 +46,7 @@ To keep the bucket index consistent, all object writes or deletes must also upda Object writes and deletes may race with each other, so a given object may have more than one prepared transaction at a time. RGW considers an object entry to be 'pending' if there are any outstanding transactions, or 'completed' otherwise. -This transaction is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``. +This transaction is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``. ------- Listing @@ -56,7 +56,7 @@ When listing objects, RGW will read all entries (pending and completed) from the If an RGW crashes in the middle of an `Index Transaction`_, an index entry may get stuck in this 'pending' state. When bucket listing encounters these pending entries, it also sends information from the head object back to the bucket index so it can update the entry and resolve its stale transactions. This message is called 'dir suggest', because the bucket index treats it as a hint or suggestion. 
-Bucket listing is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``. +Bucket listing is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``. -------------------- S3 Object Versioning @@ -66,9 +66,9 @@ For versioned buckets, the bucket index contains an entry for each object versio RGW stores a head object in the rgw.buckets.data pool for each object version. This rados object's oid is a combination of the object name and its version id. -In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::follow_olh()``. +In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::follow_olh()``. -To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index. +To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/driver/rados/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index. .. _ListObjectsV2: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjects.html .. 
_ListObjectVersions: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html From cfe254758b1ca9647c4dcfb13b6a3310558b88d2 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Oct 2024 15:50:05 -0400 Subject: [PATCH 053/148] doc/radosgw/multisite: fix Configuring Secondary Zones -> Updating the Period this was copy/pasted from Configuring a Master Zone -> Update the Period but still referred to the 'master zone' Signed-off-by: Casey Bodley --- doc/radosgw/multisite.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/radosgw/multisite.rst b/doc/radosgw/multisite.rst index 6a21b7479e6f6..d6925c8ed9c04 100644 --- a/doc/radosgw/multisite.rst +++ b/doc/radosgw/multisite.rst @@ -507,7 +507,7 @@ For example: Updating the Period ------------------- -After updating the master zone configuration, update the period: +After updating the secondary zone configuration, update the period: .. prompt:: bash # From 485cb051192a6142104756ed88a900a5ba455179 Mon Sep 17 00:00:00 2001 From: Nizamudeen A Date: Mon, 7 Oct 2024 12:11:11 +0530 Subject: [PATCH 054/148] mgr/dashboard: add gw_groups to all nvmeof endpoints This was missed in the previous implementation Signed-off-by: Nizamudeen A --- .../mgr/dashboard/controllers/nvmeof.py | 86 +++++++++++++------ src/pybind/mgr/dashboard/openapi.yaml | 69 +++++++++++++++ 2 files changed, 127 insertions(+), 28 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py index 757b9e8ac02cf..5db6a4f1acfec 100644 --- a/src/pybind/mgr/dashboard/controllers/nvmeof.py +++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py @@ -63,7 +63,10 @@ def list(self, gw_group: Optional[str] = None): @EndpointDoc( "Get information from a specific NVMeoF subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_model(model.Subsystem, first="subsystems") @handle_nvmeof_error @@ -78,6 +81,7 @@ def get(self, nqn: str, gw_group: Optional[str] = None): "nqn": Param(str, "NVMeoF subsystem NQN"), "max_namespaces": Param(int, "Maximum number of namespaces", True, 1024), "enable_ha": Param(bool, "Enable high availability"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -95,6 +99,7 @@ def create(self, nqn: str, enable_ha: bool, max_namespaces: int = 1024, parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "force": Param(bool, "Force delete", "false"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -111,12 +116,15 @@ def delete(self, nqn: str, force: Optional[str] = "false", gw_group: Optional[st class NVMeoFListener(RESTController): @EndpointDoc( "List all NVMeoF listeners", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Listener, pick="listeners") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_listeners( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_listeners( NVMeoFClient.pb2.list_listeners_req(subsystem=nqn) ) @@ -128,6 +136,7 @@ def list(self, nqn: str): "traddr": Param(str, "NVMeoF transport address"), "trsvcid": Param(int, "NVMeoF transport service port", True, 4420), "adrfam": Param(int, "NVMeoF address 
family (0 - IPv4, 1 - IPv6)", True, 0), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -138,9 +147,10 @@ def create( host_name: str, traddr: str, trsvcid: int = 4420, - adrfam: int = 0, # IPv4 + adrfam: int = 0, # IPv4, + gw_group: Optional[str] = None ): - return NVMeoFClient(traddr=traddr).stub.create_listener( + return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.create_listener( NVMeoFClient.pb2.create_listener_req( nqn=nqn, host_name=host_name, @@ -158,6 +168,7 @@ def create( "traddr": Param(str, "NVMeoF transport address"), "trsvcid": Param(int, "NVMeoF transport service port", True, 4420), "adrfam": Param(int, "NVMeoF address family (0 - IPv4, 1 - IPv6)", True, 0), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -170,8 +181,9 @@ def delete( trsvcid: int = 4420, adrfam: int = 0, # IPv4 force: bool = False, + gw_group: Optional[str] = None ): - return NVMeoFClient().stub.delete_listener( + return NVMeoFClient(gw_group=gw_group).stub.delete_listener( NVMeoFClient.pb2.delete_listener_req( nqn=nqn, host_name=host_name, @@ -187,12 +199,15 @@ def delete( class NVMeoFNamespace(RESTController): @EndpointDoc( "List all NVMeoF namespaces in a subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Namespace, pick="namespaces") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_namespaces( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_namespaces( NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn) ) @@ -201,12 +216,13 @@ def list(self, nqn: str): parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.Namespace, first="namespaces") @handle_nvmeof_error - def get(self, nqn: str, nsid: str): - return NVMeoFClient().stub.list_namespaces( + def get(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_namespaces( NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn, nsid=int(nsid)) ) @@ -217,12 +233,13 @@ def get(self, nqn: str, nsid: str): parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.NamespaceIOStats) @handle_nvmeof_error - def io_stats(self, nqn: str, nsid: str): - return NVMeoFClient().stub.namespace_get_io_stats( + def io_stats(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.namespace_get_io_stats( NVMeoFClient.pb2.namespace_get_io_stats_req( subsystem_nqn=nqn, nsid=int(nsid)) ) @@ -237,6 +254,7 @@ def io_stats(self, nqn: str, nsid: str): "size": Param(int, "RBD image size"), "block_size": Param(int, "NVMeoF namespace block size"), "load_balancing_group": Param(int, "Load balancing group"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.NamespaceCreation) @@ -250,8 +268,9 @@ def create( size: Optional[int] = 1024, block_size: int = 512, load_balancing_group: Optional[int] = None, + gw_group: Optional[str] = None, ): - return NVMeoFClient().stub.namespace_add( + return NVMeoFClient(gw_group=gw_group).stub.namespace_add( 
NVMeoFClient.pb2.namespace_add_req( subsystem_nqn=nqn, rbd_image_name=rbd_image_name, @@ -274,6 +293,7 @@ def create( "rw_mbytes_per_second": Param(int, "Read/Write MB/s"), "r_mbytes_per_second": Param(int, "Read MB/s"), "w_mbytes_per_second": Param(int, "Write MB/s"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -288,12 +308,13 @@ def update( rw_mbytes_per_second: Optional[int] = None, r_mbytes_per_second: Optional[int] = None, w_mbytes_per_second: Optional[int] = None, + gw_group: Optional[str] = None ): if rbd_image_size: mib = 1024 * 1024 new_size_mib = int((rbd_image_size + mib - 1) / mib) - response = NVMeoFClient().stub.namespace_resize( + response = NVMeoFClient(gw_group=gw_group).stub.namespace_resize( NVMeoFClient.pb2.namespace_resize_req( subsystem_nqn=nqn, nsid=int(nsid), new_size=new_size_mib ) @@ -336,12 +357,13 @@ def update( parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def delete(self, nqn: str, nsid: str): - return NVMeoFClient().stub.namespace_delete( + def delete(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.namespace_delete( NVMeoFClient.pb2.namespace_delete_req(subsystem_nqn=nqn, nsid=int(nsid)) ) @@ -351,7 +373,10 @@ def delete(self, nqn: str, nsid: str): class NVMeoFHost(RESTController): @EndpointDoc( "List all allowed hosts for an NVMeoF subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection( model.Host, @@ -362,8 +387,8 @@ class NVMeoFHost(RESTController): else o, ) @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_hosts( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_hosts( NVMeoFClient.pb2.list_hosts_req(subsystem=nqn) ) @@ -372,12 +397,13 @@ def list(self, nqn: str): parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'NVMeoF host NQN. Use "*" to allow any host.'), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def create(self, nqn: str, host_nqn: str): - return NVMeoFClient().stub.add_host( + def create(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.add_host( NVMeoFClient.pb2.add_host_req(subsystem_nqn=nqn, host_nqn=host_nqn) ) @@ -386,12 +412,13 @@ def create(self, nqn: str, host_nqn: str): parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'NVMeoF host NQN. 
Use "*" to disallow any host.'), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def delete(self, nqn: str, host_nqn: str): - return NVMeoFClient().stub.remove_host( + def delete(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.remove_host( NVMeoFClient.pb2.remove_host_req(subsystem_nqn=nqn, host_nqn=host_nqn) ) @@ -400,12 +427,15 @@ def delete(self, nqn: str, host_nqn: str): class NVMeoFConnection(RESTController): @EndpointDoc( "List all NVMeoF Subsystem Connections", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Connection, pick="connections") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_connections( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_connections( NVMeoFClient.pb2.list_connections_req(subsystem=nqn) ) diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index e8ab663d0d593..5df80259d9f5d 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -8293,6 +8293,7 @@ paths: description: Enable high availability type: boolean gw_group: + description: NVMeoF gateway group type: string max_namespaces: default: 1024 @@ -8346,6 +8347,7 @@ paths: schema: type: boolean - allowEmptyValue: true + description: NVMeoF gateway group in: query name: gw_group schema: @@ -8384,6 +8386,7 @@ paths: schema: type: string - allowEmptyValue: true + description: NVMeoF gateway group in: query name: gw_group schema: @@ -8417,6 +8420,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8446,6 +8455,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8479,6 +8494,9 @@ paths: application/json: schema: properties: + gw_group: + description: NVMeoF gateway group + type: string host_nqn: description: NVMeoF host NQN. Use "*" to allow any host. 
type: string @@ -8525,6 +8543,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8559,6 +8583,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8596,6 +8626,9 @@ paths: default: 0 description: NVMeoF address family (0 - IPv4, 1 - IPv6) type: integer + gw_group: + description: NVMeoF gateway group + type: string host_name: description: NVMeoF hostname type: string @@ -8673,6 +8706,12 @@ paths: name: force schema: type: boolean + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8707,6 +8746,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8748,6 +8793,9 @@ paths: default: true description: Create RBD image type: boolean + gw_group: + description: NVMeoF gateway group + type: string load_balancing_group: description: Load balancing group type: integer @@ -8805,6 +8853,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8844,6 +8898,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8883,6 +8943,9 @@ paths: application/json: schema: properties: + gw_group: + description: NVMeoF gateway group + type: string load_balancing_group: description: Load balancing group type: integer @@ -8937,6 +9000,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: From ee16b099d540f2a60dd84fcbc69499c1b1e649a3 Mon Sep 17 00:00:00 2001 From: Yuval Lifshitz Date: Tue, 1 Oct 2024 15:19:46 +0000 Subject: [PATCH 055/148] common: missing std include with GCC 14 In file included from src/rgw/driver/posix/bucket_cache.h:19, from src/test/rgw/test_posix_bucket_cache.cc:4: src/common/cohort_lru.h: In member function _void cohort::lru::TreeX::lock()_: src/common/cohort_lru.h:334:14: error: _for_each_ is not a member of _std_ 334 | std::for_each(locks.begin(), locks.end(), | ^~~~~~~~ src/common/cohort_lru.h: In member function _void cohort::lru::TreeX::unlock()_: /home/yuvalif/ceph5/src/common/cohort_lru.h:339:14: error: _for_each_ is not a member of _std_ 339 | std::for_each(locks.begin(), locks.end(), | ^~~~~~~~ Signed-off-by: Yuval Lifshitz --- src/common/cohort_lru.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/common/cohort_lru.h b/src/common/cohort_lru.h index af2baaa5c67bf..86ced8d183c71 100644 --- a/src/common/cohort_lru.h +++ b/src/common/cohort_lru.h @@ -15,6 +15,12 @@ #include #include +#include +#include +#include +#include +#include +#include #ifdef __CEPH__ # include "include/ceph_assert.h" From ea53aceb8d72187f7f8629aa6d3b66c7cca88a86 Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Wed, 25 Sep 2024 18:09:32 +0530 Subject: [PATCH 056/148] mgr/dashboard: show non default realm sync status in rgw overview page Currently, we just show the sync status of 
the default realm in rgw overview page. This PR is to show the sync status of non-default realms as well. Multisite sync status can be viewed for any of the active daemon which runs in default/non-default realm. Fixes: https://tracker.ceph.com/issues/68329 Signed-off-by: Aashish Sharma --- src/pybind/mgr/dashboard/controllers/rgw.py | 4 ++-- .../rgw-overview-dashboard.component.ts | 4 +++- .../src/app/shared/api/rgw-multisite.service.ts | 4 +++- src/pybind/mgr/dashboard/openapi.yaml | 7 ++++++- src/pybind/mgr/dashboard/services/rgw_client.py | 10 +++++++++- 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py index 8667d469060f8..4969d11935d6f 100755 --- a/src/pybind/mgr/dashboard/controllers/rgw.py +++ b/src/pybind/mgr/dashboard/controllers/rgw.py @@ -162,9 +162,9 @@ class RgwMultisiteController(RESTController): @ReadPermission @allow_empty_body # pylint: disable=W0102,W0613 - def get_sync_status(self): + def get_sync_status(self, daemon_name=None): multisite_instance = RgwMultisite() - result = multisite_instance.get_multisite_sync_status() + result = multisite_instance.get_multisite_sync_status(daemon_name) return result @Endpoint(path='/sync-policy') diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts index 8b5901769c357..00037a7235b8e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts @@ -91,7 +91,9 @@ export class RgwOverviewDashboardComponent implements OnInit, OnDestroy { this.totalPoolUsedBytes = data['total_pool_bytes_used']; this.averageObjectSize = data['average_object_size']; }); - this.getSyncStatus(); + setTimeout(() => { + this.getSyncStatus(); + }); }); this.BucketSub = this.rgwBucketService .getTotalBucketsAndUsersLength() diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts index d57cd523a4dfe..e4688358013ab 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts @@ -28,7 +28,9 @@ export class RgwMultisiteService { } getSyncStatus() { - return this.http.get(`${this.url}/sync_status`); + return this.rgwDaemonService.request((params: HttpParams) => { + return this.http.get(`${this.url}/sync_status`, { params: params }); + }); } status() { diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index 8f98f1f62a0a8..7cdc0357ae4f1 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -11653,7 +11653,12 @@ paths: - RgwMultisite /api/rgw/multisite/sync_status: get: - parameters: [] + parameters: + - allowEmptyValue: true + in: query + name: daemon_name + schema: + type: string responses: '200': content: diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py index 2441b73b361be..e1e113a812e7d 100755 --- a/src/pybind/mgr/dashboard/services/rgw_client.py +++ b/src/pybind/mgr/dashboard/services/rgw_client.py @@ -1981,8 +1981,16 @@ def 
get_multisite_status(self):
         is_multisite_configured = False
         return is_multisite_configured
 
-    def get_multisite_sync_status(self):
+    def get_multisite_sync_status(self, daemon_name: str):
         rgw_multisite_sync_status_cmd = ['sync', 'status']
+        daemons = _get_daemons()
+        try:
+            realm_name = daemons[daemon_name].realm_name
+        except (KeyError, AttributeError):
+            raise DashboardException('Unable to get realm name from daemon',
+                                     http_status_code=500, component='rgw')
+        if realm_name:
+            rgw_multisite_sync_status_cmd.extend(['--rgw-realm', realm_name])
         try:
             exit_code, out, _ = mgr.send_rgwadmin_command(rgw_multisite_sync_status_cmd, False)
             if exit_code > 0:

From 8dd9e9dad6a5ad9b427d73c4547286fe46b67d46 Mon Sep 17 00:00:00 2001
From: Aashish Sharma
Date: Thu, 3 Oct 2024 13:58:14 +0530
Subject: [PATCH 057/148] =?UTF-8?q?mgr/dashboard:=20fix=20gateways=20secti?=
 =?UTF-8?q?on=20error:=E2=80=9D404=20-=20Not=20Found=20RGW=20Daemon=20not?=
 =?UTF-8?q?=20found:=20None=E2=80=9D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A case was missed here where we have a default realm created but no
default_zonegroup; in that case, the existing behavior should prevail,
and that was not being handled. If a default realm is created but no
default_zonegroup is there, we should continue getting the keys from
daemon_name = next(iter(daemon_keys))

Fixes: https://tracker.ceph.com/issues/68376

Signed-off-by: Aashish Sharma
---
 .../mgr/dashboard/services/rgw_client.py     | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py
index 2441b73b361be..8846f42e70764 100755
--- a/src/pybind/mgr/dashboard/services/rgw_client.py
+++ b/src/pybind/mgr/dashboard/services/rgw_client.py
@@ -288,21 +288,22 @@ def instance(userid: Optional[str] = None,
         daemon_keys = RgwClient._daemons.keys()
 
         if not daemon_name:
-            if len(daemon_keys) > 1:
-                try:
-                    multiiste = RgwMultisite()
-                    default_zonegroup = multiiste.get_all_zonegroups_info()['default_zonegroup']
-
-                    # Iterate through _daemons.values() to find the daemon with the
-                    # matching zonegroup_id
-                    for daemon in RgwClient._daemons.values():
-                        if daemon.zonegroup_id == default_zonegroup:
-                            daemon_name = daemon.name
-                            break
-                except Exception:  # pylint: disable=broad-except
-                    daemon_name = next(iter(daemon_keys))
-            else:
-                # Handle the case where there is only one or no key in _daemons
+            try:
+                if len(daemon_keys) > 1:
+                    default_zonegroup = (
+                        RgwMultisite()
+                        .get_all_zonegroups_info()['default_zonegroup']
+                    )
+                    if default_zonegroup:
+                        daemon_name = next(
+                            (daemon.name
+                             for daemon in RgwClient._daemons.values()
+                             if daemon.zonegroup_id == default_zonegroup),
+                            None
+                        )
+                daemon_name = daemon_name or next(iter(daemon_keys))
+            except Exception as e:  # pylint: disable=broad-except
+                logger.exception('Failed to determine default RGW daemon: %s', str(e))
                 daemon_name = next(iter(daemon_keys))
 
         # Discard all cached instances if any rgw setting has changed

From 471ebae9240192a4e143c00894e1736dd1921308 Mon Sep 17 00:00:00 2001
From: Nitzan Mordechai
Date: Mon, 7 Oct 2024 10:21:58 +0000
Subject: [PATCH 058/148] qa/suites/crimson-rados/perf: add ssh keys

cbt uses an ssh connection by default; without ssh_keys the task
won't generate a public key and the cbt task will fail.

Fixes: https://tracker.ceph.com/issues/68421

Signed-off-by: Nitzan Mordechai
---
 qa/suites/crimson-rados/perf/deploy/ceph.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git
a/qa/suites/crimson-rados/perf/deploy/ceph.yaml b/qa/suites/crimson-rados/perf/deploy/ceph.yaml index 0f6021975a4a2..50d170f502272 100644 --- a/qa/suites/crimson-rados/perf/deploy/ceph.yaml +++ b/qa/suites/crimson-rados/perf/deploy/ceph.yaml @@ -10,3 +10,4 @@ tasks: osd: debug monc: 20 flavor: crimson +- ssh_keys: From 984a76f5a3ef3556122a6f81d63de756d0d9dc4d Mon Sep 17 00:00:00 2001 From: Yuval Lifshitz Date: Mon, 7 Oct 2024 15:34:15 +0000 Subject: [PATCH 059/148] test/rgw/lua: use stats polling instead of sleep this makes the test more consistent as well as faster fixes: https://tracker.ceph.com/issues/68335 Signed-off-by: Yuval Lifshitz --- src/rgw/rgw_lua_background.h | 5 +- src/test/rgw/test_rgw_lua.cc | 178 ++++++++++++++++++++++------------- 2 files changed, 117 insertions(+), 66 deletions(-) diff --git a/src/rgw/rgw_lua_background.h b/src/rgw/rgw_lua_background.h index 7b8d12599f4e8..2973a753fff63 100644 --- a/src/rgw/rgw_lua_background.h +++ b/src/rgw/rgw_lua_background.h @@ -153,9 +153,8 @@ class Background : public RGWRealmReloader::Pauser { void run(); -protected: std::string rgw_script; - virtual int read_script(); + int read_script(); public: Background(rgw::sal::Driver* _driver, @@ -173,7 +172,7 @@ class Background : public RGWRealmReloader::Pauser { std::unique_lock cond_lock(table_mutex); rgw_map[key] = value; } - + // update the manager after void set_manager(rgw::sal::LuaManager* _lua_manager); void pause() override; diff --git a/src/test/rgw/test_rgw_lua.cc b/src/test/rgw/test_rgw_lua.cc index b2e11e442a28f..ad923023a6d01 100644 --- a/src/test/rgw/test_rgw_lua.cc +++ b/src/test/rgw/test_rgw_lua.cc @@ -9,6 +9,7 @@ #include "rgw_lua_background.h" #include "rgw_lua_data_filter.h" #include "rgw_sal_config.h" +#include "rgw_perf_counters.h" using namespace std; using namespace rgw; @@ -184,9 +185,51 @@ inline std::unique_ptr make_store() { return std::make_unique(std::move(context_pool)); }; +class TestLuaManager : public rgw::sal::StoreLuaManager { + public: + std::string lua_script; + unsigned read_time = 0; + TestLuaManager() { + rgw_perf_start(g_cct); + } + int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override { + std::this_thread::sleep_for(std::chrono::seconds(read_time)); + script = lua_script; + return 0; + } + int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override { + return 0; + } + int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override { + return 0; + } + int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override { + return 0; + } + int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override { + return 0; + } + int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override { + return 0; + } + int reload_packages(const DoutPrefixProvider* dpp, optional_yield y) override { + return 0; + } + ~TestLuaManager() { + rgw_perf_stop(g_cct); + } +}; + +void set_script(rgw::sal::LuaManager* manager, const std::string& script) { + static_cast(manager)->lua_script = script; +} +void set_read_time(rgw::sal::LuaManager* manager, unsigned read_time) { + static_cast(manager)->read_time = read_time; +} + #define DEFINE_REQ_STATE RGWProcessEnv pe; \ auto store = make_store(); \ - pe.lua.manager = store->get_lua_manager(""); \ + pe.lua.manager = std::make_unique(); \ 
RGWEnv e; \ req_state s(g_cct, pe, &e, 0); @@ -850,24 +893,12 @@ TEST(TestRGWLua, OpsLog) } class TestBackground : public rgw::lua::Background { - const unsigned read_time; - -protected: - int read_script() override { - // don't read the object from the store - std::this_thread::sleep_for(std::chrono::seconds(read_time)); - return 0; - } - public: - TestBackground(sal::RadosStore* store, const std::string& script, rgw::sal::LuaManager* manager, unsigned read_time = 0) : + TestBackground(sal::RadosStore* store, rgw::sal::LuaManager* manager) : rgw::lua::Background(store, g_cct, manager, - 1 /* run every second */), - read_time(read_time) { - // the script is passed in the constructor - rgw_script = script; + 1 /* run every second */) { } ~TestBackground() override { @@ -878,20 +909,19 @@ class TestBackground : public rgw::lua::Background { TEST(TestRGWLuaBackground, Start) { auto store = make_store(); - auto manager = store->get_lua_manager(""); + auto manager = std::make_unique(); { // ctr and dtor without running - TestBackground lua_background(store.get(), "", manager.get()); + TestBackground lua_background(store.get(), manager.get()); } { // ctr and dtor with running - TestBackground lua_background(store.get(), "", manager.get()); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); } } - -constexpr auto wait_time = std::chrono::seconds(3); +constexpr auto wait_time = std::chrono::milliseconds(100); template const T& get_table_value(const TestBackground& b, const std::string& index) { @@ -903,6 +933,15 @@ const T& get_table_value(const TestBackground& b, const std::string& index) { } } +#define WAIT_FOR_BACKGROUND \ +{ \ + unsigned max_tries = 100; \ + do { \ + std::this_thread::sleep_for(wait_time); \ + --max_tries; \ + } while (perfcounter->get(l_rgw_lua_script_ok) + perfcounter->get(l_rgw_lua_script_fail) == 0 && max_tries > 0); \ +} + TEST(TestRGWLuaBackground, Script) { const std::string script = R"( @@ -912,10 +951,11 @@ TEST(TestRGWLuaBackground, Script) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value(lua_background, "hello"), "world"); } @@ -928,9 +968,10 @@ TEST(TestRGWLuaBackground, RequestScript) )"; DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), background_script, pe.lua.manager.get()); + set_script(pe.lua.manager.get(), background_script); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const std::string request_script = R"( local key = "hello" @@ -947,8 +988,9 @@ TEST(TestRGWLuaBackground, RequestScript) ASSERT_EQ(rc, 0); EXPECT_EQ(get_table_value(lua_background, "hello"), "from request"); // now we resume and let the background set the value + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value(lua_background, "hello"), "from background"); } @@ -965,14 +1007,16 @@ TEST(TestRGWLuaBackground, Pause) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = 
std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // no change in len EXPECT_EQ(value_len, get_table_value(lua_background, "hello").size()); } @@ -991,15 +1035,17 @@ TEST(TestRGWLuaBackground, PauseWhileReading) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get(), 2); + auto manager = std::make_unique(); + set_script(manager.get(), script); + set_read_time(manager.get(), 2); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - constexpr auto long_wait_time = std::chrono::seconds(6); - std::this_thread::sleep_for(long_wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - std::this_thread::sleep_for(long_wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // one execution might occur after pause EXPECT_TRUE(value_len + 1 >= get_table_value(lua_background, "hello").size()); } @@ -1013,14 +1059,16 @@ TEST(TestRGWLuaBackground, ReadWhilePaused) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.pause(); lua_background.start(); - std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); EXPECT_EQ(get_table_value(lua_background, "hello"), ""); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value(lua_background, "hello"), "world"); } @@ -1037,18 +1085,21 @@ TEST(TestRGWLuaBackground, PauseResume) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // no change in len EXPECT_EQ(value_len, get_table_value(lua_background, "hello").size()); + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; // should be a change in len EXPECT_GT(get_table_value(lua_background, "hello").size(), value_len); } @@ -1066,18 +1117,19 @@ TEST(TestRGWLuaBackground, MultipleStarts) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); 
lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.start(); lua_background.shutdown(); lua_background.shutdown(); - std::this_thread::sleep_for(wait_time); + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; // should be a change in len EXPECT_GT(get_table_value(lua_background, "hello").size(), value_len); } @@ -1085,7 +1137,7 @@ TEST(TestRGWLuaBackground, MultipleStarts) TEST(TestRGWLuaBackground, TableValues) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = "string value" @@ -1107,7 +1159,7 @@ TEST(TestRGWLuaBackground, TableValues) TEST(TestRGWLuaBackground, TablePersist) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( RGW["key1"] = "string value" @@ -1137,7 +1189,7 @@ TEST(TestRGWLuaBackground, TablePersist) TEST(TestRGWLuaBackground, TableValuesFromRequest) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); const std::string request_script = R"( @@ -1165,7 +1217,7 @@ TEST(TestRGWLuaBackground, TableValuesFromRequest) TEST(TestRGWLuaBackground, TableInvalidValue) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); const std::string request_script = R"( @@ -1191,7 +1243,7 @@ TEST(TestRGWLuaBackground, TableInvalidValue) TEST(TestRGWLuaBackground, TableErase) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( RGW["size"] = 0 @@ -1229,7 +1281,7 @@ TEST(TestRGWLuaBackground, TableErase) TEST(TestRGWLuaBackground, TableIterate) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = "string value" @@ -1256,7 +1308,7 @@ TEST(TestRGWLuaBackground, TableIterate) TEST(TestRGWLuaBackground, TableIterateWrite) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["a"] = 1 @@ -1286,7 +1338,7 @@ TEST(TestRGWLuaBackground, TableIterateWrite) TEST(TestRGWLuaBackground, TableIncrement) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1306,7 +1358,7 @@ TEST(TestRGWLuaBackground, TableIncrement) TEST(TestRGWLuaBackground, TableIncrementBy) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1328,7 +1380,7 @@ TEST(TestRGWLuaBackground, 
TableIncrementBy) TEST(TestRGWLuaBackground, TableDecrement) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1348,7 +1400,7 @@ TEST(TestRGWLuaBackground, TableDecrement) TEST(TestRGWLuaBackground, TableDecrementBy) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1370,7 +1422,7 @@ TEST(TestRGWLuaBackground, TableDecrementBy) TEST(TestRGWLuaBackground, TableIncrementValueError) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( -- cannot increment string values @@ -1405,7 +1457,7 @@ TEST(TestRGWLuaBackground, TableIncrementValueError) TEST(TestRGWLuaBackground, TableIncrementError) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( -- missing argument @@ -1494,7 +1546,7 @@ TEST(TestRGWLua, Data) )"; DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); s.host_id = "foo"; pe.lua.background = &lua_background; lua::RGWObjFilter filter(&s, script); From fc537c8d914274791ec179bf08a95dc558d81266 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Fri, 6 Sep 2024 16:54:22 +0800 Subject: [PATCH 060/148] crimson/os/seastore: misc cleanups Signed-off-by: Yingxin Cheng --- src/crimson/os/futurized_store.h | 8 +- src/crimson/os/seastore/seastore.cc | 245 ++++++++++++++-------------- src/crimson/os/seastore/seastore.h | 64 +++----- 3 files changed, 154 insertions(+), 163 deletions(-) diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h index fe09cc5451072..0dca695ba3a1e 100644 --- a/src/crimson/os/futurized_store.h +++ b/src/crimson/os/futurized_store.h @@ -75,14 +75,15 @@ class FuturizedStore { CollectionRef c, const ghobject_t& oid) = 0; - using omap_values_t = std::map>; + using omap_values_t = attrs_t; using omap_keys_t = std::set; virtual read_errorator::future omap_get_values( CollectionRef c, const ghobject_t& oid, const omap_keys_t& keys) = 0; - virtual read_errorator::future> omap_get_values( + using omap_values_paged_t = std::tuple; + virtual read_errorator::future omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid const std::optional &start ///< [in] start, empty for begin @@ -147,7 +148,8 @@ class FuturizedStore { return seastar::now(); } - virtual read_errorator::future> fiemap( + using fiemap_ret_t = std::map; + virtual read_errorator::future fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 1577433237351..d708231b47b1a 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -30,8 +30,6 @@ #include "crimson/os/seastore/onode_manager.h" #include "crimson/os/seastore/object_data_handler.h" - -using std::string; using crimson::common::local_conf; template <> struct fmt::formatter @@ -278,10 +276,10 @@ SeaStore::mount_ertr::future<> SeaStore::mount() return 
set_secondaries(); }); }); - }).safe_then([this] { - return shard_stores.invoke_on_all([](auto &local_store) { - return local_store.mount_managers(); - }); + }); + }).safe_then([this] { + return shard_stores.invoke_on_all([](auto &local_store) { + return local_store.mount_managers(); }); }).handle_error( crimson::ct_error::assert_all{ @@ -345,15 +343,15 @@ seastar::future<> SeaStore::write_fsid(uuid_d new_osd_fsid) auto [ret, fsid] = tuple; std::string str_fsid = stringify(new_osd_fsid); if (ret == -1) { - return write_meta("fsid", stringify(new_osd_fsid)); + return write_meta("fsid", stringify(new_osd_fsid)); } else if (ret == 0 && fsid != str_fsid) { - ERROR("on-disk fsid {} != provided {}", - fsid, stringify(new_osd_fsid)); - throw std::runtime_error("store fsid error"); - } else { + ERROR("on-disk fsid {} != provided {}", + fsid, stringify(new_osd_fsid)); + throw std::runtime_error("store fsid error"); + } else { return seastar::now(); - } - }); + } + }); } seastar::future<> @@ -413,7 +411,8 @@ seastar::future<> SeaStore::set_secondaries() SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid) { ceph_assert(seastar::this_shard_id() == primary_core); - return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) { + return read_meta("mkfs_done" + ).then([this, new_osd_fsid](auto tuple) { auto [done, value] = tuple; if (done == 0) { return seastar::now(); @@ -449,7 +448,8 @@ seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid) SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) { ceph_assert(seastar::this_shard_id() == primary_core); - return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) { + return read_meta("mkfs_done" + ).then([this, new_osd_fsid](auto tuple) { auto [done, value] = tuple; if (done == 0) { return seastar::now(); @@ -542,7 +542,7 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) }); } -using coll_core_t = FuturizedStore::coll_core_t; +using coll_core_t = SeaStore::coll_core_t; seastar::future> SeaStore::list_collections() { @@ -566,9 +566,10 @@ store_statfs_t SeaStore::Shard::stat() const seastar::future SeaStore::stat() const { - ceph_assert(seastar::this_shard_id() == primary_core); LOG_PREFIX(SeaStore::stat); DEBUG(""); + + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.map_reduce0( [](const SeaStore::Shard &local_store) { return local_store.stat(); @@ -914,10 +915,11 @@ SeaStore::Shard::list_objects(CollectionRef ch, seastar::stop_iteration >(seastar::stop_iteration::no); }); - }).si_then([&ret] { - return list_iertr::make_ready_future< - OnodeManager::list_onodes_bare_ret>(std::move(ret)); - }); + } + ).si_then([&ret] { + return list_iertr::make_ready_future< + OnodeManager::list_onodes_bare_ret>(std::move(ret)); + }); } }); }).safe_then([&ret](auto&& _ret) { @@ -949,7 +951,8 @@ SeaStore::Shard::open_collection(const coll_t& cid) { LOG_PREFIX(SeaStore::open_collection); DEBUG("{}", cid); - return list_collections().then([cid, this] (auto colls_cores) { + return list_collections( + ).then([cid, this] (auto colls_cores) { if (auto found = std::find(colls_cores.begin(), colls_cores.end(), std::make_pair(cid, seastar::this_shard_id())); @@ -1032,7 +1035,7 @@ SeaStore::Shard::read( Transaction::src_t::READ, "read_obj", op_type_t::READ, - [=, this](auto &t, auto &onode) -> ObjectDataHandler::read_ret { + [this, offset, len, op_flags](auto &t, auto &onode) -> ObjectDataHandler::read_ret { size_t size = onode.get_layout().size; if (offset >= size) { @@ -1098,10 
+1101,10 @@ SeaStore::Shard::readv( return seastar::do_with( _oid, ceph::bufferlist{}, - [=, this, &m](auto &oid, auto &ret) { + [ch, op_flags, this, &m](auto &oid, auto &ret) { return crimson::do_for_each( m, - [=, this, &oid, &ret](auto &p) { + [ch, op_flags, this, &oid, &ret](auto &p) { return read( ch, oid, p.first, p.second, op_flags ).safe_then([&ret](auto bl) { @@ -1112,7 +1115,6 @@ SeaStore::Shard::readv( (std::move(ret)); }); }); - return read_errorator::make_ready_future(); } using crimson::os::seastore::omap_manager::BtreeOMapManager; @@ -1123,20 +1125,19 @@ SeaStore::Shard::get_attr( const ghobject_t& oid, std::string_view name) const { - auto c = static_cast(ch.get()); LOG_PREFIX(SeaStore::get_attr); - DEBUG("{} {}", c->get_cid(), oid); + DEBUG("{} {}", ch->get_cid(), oid); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); return repeat_with_onode( - c, + ch, oid, Transaction::src_t::READ, "get_attr", op_type_t::GET_ATTR, - [=, this](auto &t, auto& onode) -> _omap_get_value_ret { + [this, name](auto &t, auto& onode) -> _omap_get_value_ret { auto& layout = onode.get_layout(); if (name == OI_ATTR && layout.oi_size) { ceph::bufferlist bl; @@ -1170,19 +1171,18 @@ SeaStore::Shard::get_attrs( const ghobject_t& oid) { LOG_PREFIX(SeaStore::get_attrs); - auto c = static_cast(ch.get()); - DEBUG("{} {}", c->get_cid(), oid); + DEBUG("{} {}", ch->get_cid(), oid); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); return repeat_with_onode( - c, + ch, oid, Transaction::src_t::READ, "get_addrs", op_type_t::GET_ATTRS, - [=, this](auto &t, auto& onode) { + [this](auto &t, auto& onode) { auto& layout = onode.get_layout(); return omap_list(onode, layout.xattr_root, t, std::nullopt, OMapManager::omap_list_config_t() @@ -1202,7 +1202,7 @@ SeaStore::Shard::get_attrs( attrs.emplace(SS_ATTR, std::move(bl)); DEBUGT("set ss from onode layout", t); } - return seastar::make_ready_future(std::move(attrs)); + return seastar::make_ready_future(std::move(attrs)); }); } ).handle_error( @@ -1229,7 +1229,7 @@ seastar::future SeaStore::Shard::stat( Transaction::src_t::READ, "stat", op_type_t::STAT, - [=, this](auto &t, auto &onode) { + [this, oid](auto &t, auto &onode) { struct stat st; auto &olayout = onode.get_layout(); st.st_size = olayout.size; @@ -1266,9 +1266,8 @@ SeaStore::Shard::omap_get_values( ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - auto c = static_cast(ch.get()); return repeat_with_onode( - c, + ch, oid, Transaction::src_t::READ, "omap_get_values", @@ -1298,21 +1297,20 @@ SeaStore::Shard::_omap_get_value( std::move(root), std::string(key), [&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret { - if (root.is_null()) { + if (root.is_null()) { + return crimson::ct_error::enodata::make(); + } + return manager.omap_get_value(root, t, key + ).si_then([](auto opt) -> _omap_get_value_ret { + if (!opt) { return crimson::ct_error::enodata::make(); } - return manager.omap_get_value(root, t, key - ).si_then([](auto opt) -> _omap_get_value_ret { - if (!opt) { - return crimson::ct_error::enodata::make(); - } - return seastar::make_ready_future(std::move(*opt)); - }); - } - ); + return seastar::make_ready_future(std::move(*opt)); + }); + }); } -SeaStore::Shard::_omap_get_values_ret +SeaStore::base_iertr::future SeaStore::Shard::_omap_get_values( Transaction &t, omap_root_t &&omap_root, @@ -1325,31 +1323,29 @@ SeaStore::Shard::_omap_get_values( BtreeOMapManager(*transaction_manager), std::move(omap_root), omap_values_t(), - [&](auto &manager, auto &root, auto &ret) 
{ - return trans_intr::do_for_each( - keys.begin(), - keys.end(), - [&](auto &key) { - return manager.omap_get_value( - root, - t, - key - ).si_then([&ret, &key](auto &&p) { - if (p) { - bufferlist bl; - bl.append(*p); - ret.emplace( - std::move(key), - std::move(bl)); - } - return seastar::now(); - }); + [&t, &keys](auto &manager, auto &root, auto &ret) { + return trans_intr::do_for_each( + keys.begin(), + keys.end(), + [&t, &manager, &root, &ret](auto &key) { + return manager.omap_get_value( + root, + t, + key + ).si_then([&ret, &key](auto &&p) { + if (p) { + bufferlist bl; + bl.append(*p); + ret.emplace( + std::move(key), + std::move(bl)); } - ).si_then([&ret] { - return std::move(ret); + return seastar::now(); }); - } - ); + }).si_then([&ret] { + return std::move(ret); + }); + }); } SeaStore::Shard::omap_list_ret @@ -1377,22 +1373,20 @@ SeaStore::Shard::omap_list( }); } -SeaStore::Shard::omap_get_values_ret_t +SeaStore::Shard::read_errorator::future SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional &start) + const std::optional &start) { - auto c = static_cast(ch.get()); LOG_PREFIX(SeaStore::omap_get_values); - DEBUG("{} {}", c->get_cid(), oid); + DEBUG("{} {}", ch->get_cid(), oid); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - using ret_bare_t = std::tuple; - return repeat_with_onode( - c, + return repeat_with_onode( + ch, oid, Transaction::src_t::READ, "omap_list", @@ -1413,7 +1407,8 @@ SeaStore::Shard::omap_get_values( }); } -SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( +SeaStore::base_iertr::future +SeaStore::Shard::_fiemap( Transaction &t, Onode &onode, uint64_t off, @@ -1421,7 +1416,7 @@ SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( { return seastar::do_with( ObjectDataHandler(max_object_size), - [=, this, &t, &onode] (auto &objhandler) { + [this, off, len, &t, &onode](auto &objhandler) { return objhandler.fiemap( ObjectDataHandler::context_t{ *transaction_manager, @@ -1433,7 +1428,7 @@ SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( }); } -SeaStore::Shard::read_errorator::future> +SeaStore::Shard::read_errorator::future SeaStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, @@ -1446,13 +1441,13 @@ SeaStore::Shard::fiemap( ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - return repeat_with_onode>( + return repeat_with_onode( ch, oid, Transaction::src_t::READ, "fiemap_read", op_type_t::READ, - [=, this](auto &t, auto &onode) -> _fiemap_ret { + [this, off, len](auto &t, auto &onode) -> base_iertr::future { size_t size = onode.get_layout().size; if (off >= size) { INFOT("fiemap offset is over onode size!", t); @@ -1497,7 +1492,7 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( "do_transaction", op_type_t::TRANSACTION, [this](auto &ctx) { - return with_trans_intr(*ctx.transaction, [&, this](auto &t) { + return with_trans_intr(*ctx.transaction, [&ctx, this](auto &t) { LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks); SUBDEBUGT(seastore_t, "start with {} objects", t, ctx.iter.objects.size()); @@ -1578,8 +1573,9 @@ SeaStore::Shard::_do_transaction_step( SUBTRACET(seastore_t, "got op {}", *ctx.transaction, (uint32_t)op->op); using ceph::os::Transaction; - if (op->op == Transaction::OP_NOP) + if (op->op == Transaction::OP_NOP) { return tm_iertr::now(); + } switch (op->op) { case Transaction::OP_RMCOLL: @@ -1611,14 +1607,14 @@ SeaStore::Shard::_do_transaction_step( create = true; } if (!onodes[op->oid]) { + const ghobject_t& oid = i.get_oid(op->oid); if 
(!create) { - fut = onode_manager->get_onode(*ctx.transaction, i.get_oid(op->oid)); + fut = onode_manager->get_onode(*ctx.transaction, oid); } else { - fut = onode_manager->get_or_create_onode( - *ctx.transaction, i.get_oid(op->oid)); + fut = onode_manager->get_or_create_onode(*ctx.transaction, oid); } } - return fut.si_then([&, op](auto get_onode) { + return fut.si_then([&, op, this](auto get_onode) { OnodeRef &o = onodes[op->oid]; if (!o) { assert(get_onode); @@ -1632,7 +1628,7 @@ SeaStore::Shard::_do_transaction_step( // support parallel extents loading return onode_manager->get_or_create_onode( *ctx.transaction, i.get_oid(op->dest_oid) - ).si_then([&, op](auto dest_onode) { + ).si_then([&onodes, &d_onodes, op](auto dest_onode) { assert(dest_onode); auto &d_o = onodes[op->dest_oid]; assert(!d_o); @@ -1644,7 +1640,7 @@ SeaStore::Shard::_do_transaction_step( } else { return OnodeManager::get_or_create_onode_iertr::now(); } - }).si_then([&, op, this]() -> tm_ret { + }).si_then([&ctx, &i, &onodes, &d_onodes, op, this, FNAME]() -> tm_ret { LOG_PREFIX(SeaStore::_do_transaction_step); try { switch (op->op) { @@ -2126,8 +2122,8 @@ SeaStore::Shard::_omap_clear( { LOG_PREFIX(SeaStore::_omap_clear); DEBUGT("{} {} keys", *ctx.transaction, *onode); - return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY)) - .si_then([this, &ctx, &onode]() -> tm_ret { + return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY) + ).si_then([this, &ctx, &onode]() -> tm_ret { if (auto omap_root = onode->get_layout().omap_root.get( onode->get_metadata_hint(device->get_block_size())); omap_root.is_null()) { @@ -2142,8 +2138,8 @@ SeaStore::Shard::_omap_clear( auto &omap_root) { return omap_manager.omap_clear( omap_root, - *ctx.transaction) - .si_then([&] { + *ctx.transaction + ).si_then([&] { if (omap_root.must_update()) { onode->update_omap_root(*ctx.transaction, omap_root); } @@ -2489,6 +2485,21 @@ SeaStore::Shard::_get_collection(const coll_t& cid) return new SeastoreCollection{cid}; } +seastar::future<> SeaStore::write_meta( + const std::string& key, + const std::string& value) { + ceph_assert(seastar::this_shard_id() == primary_core); + return seastar::do_with(key, value, + [this](auto& key, auto& value) { + return shard_stores.local().write_meta(key, value + ).then([this, &key, &value] { + return mdstore->write_meta(key, value); + }).handle_error( + crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} + ); + }); +} + seastar::future<> SeaStore::Shard::write_meta( const std::string& key, const std::string& value) @@ -2501,27 +2512,22 @@ seastar::future<> SeaStore::Shard::write_meta( // For TM::submit_transaction() ++(shard_stats.processing_inlock_io_num); - return seastar::do_with( - key, value, - [this, FNAME](auto& key, auto& value) { - return repeat_eagain([this, FNAME, &key, &value] { - ++(shard_stats.repeat_io_num); - - return transaction_manager->with_transaction_intr( - Transaction::src_t::MUTATE, - "write_meta", - [this, FNAME, &key, &value](auto& t) - { - DEBUGT("Have transaction, key: {}; value: {}", t, key, value); - return transaction_manager->update_root_meta( - t, key, value - ).si_then([this, &t] { - return transaction_manager->submit_transaction(t); - }); - }); - }); - } - ).handle_error( + return repeat_eagain([this, FNAME, &key, &value] { + ++(shard_stats.repeat_io_num); + + return transaction_manager->with_transaction_intr( + Transaction::src_t::MUTATE, + "write_meta", + [this, FNAME, &key, &value](auto& t) + { + DEBUGT("Have transaction, key: {}; value: {}", 
t, key, value); + return transaction_manager->update_root_meta( + t, key, value + ).si_then([this, &t] { + return transaction_manager->submit_transaction(t); + }); + }); + }).handle_error( crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} ).finally([this] { assert(shard_stats.pending_io_num); @@ -2535,10 +2541,11 @@ seastar::future<> SeaStore::Shard::write_meta( seastar::future> SeaStore::read_meta(const std::string& key) { - ceph_assert(seastar::this_shard_id() == primary_core); LOG_PREFIX(SeaStore::read_meta); DEBUG("key: {}", key); - return mdstore->read_meta(key).safe_then([](auto v) { + ceph_assert(seastar::this_shard_id() == primary_core); + return mdstore->read_meta(key + ).safe_then([](auto v) { if (v) { return std::make_tuple(0, std::move(*v)); } else { diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index fb495a422f656..58d4f5e593cbe 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -71,20 +71,19 @@ struct col_obj_ranges_t { class SeaStore final : public FuturizedStore { public: + using base_ertr = TransactionManager::base_ertr; + using base_iertr = TransactionManager::base_iertr; + class MDStore { public: - using base_iertr = crimson::errorator< - crimson::ct_error::input_output_error - >; - - using write_meta_ertr = base_iertr; + using write_meta_ertr = base_ertr; using write_meta_ret = write_meta_ertr::future<>; virtual write_meta_ret write_meta( const std::string &key, const std::string &val ) = 0; - using read_meta_ertr = base_iertr; + using read_meta_ertr = base_ertr; using read_meta_ret = write_meta_ertr::future>; virtual read_meta_ret read_meta(const std::string &key) = 0; @@ -136,10 +135,7 @@ class SeaStore final : public FuturizedStore { const omap_keys_t& keys) final; /// Retrieves paged set of values > start (if present) - using omap_get_values_ret_bare_t = std::tuple; - using omap_get_values_ret_t = read_errorator::future< - omap_get_values_ret_bare_t>; - omap_get_values_ret_t omap_get_values( + read_errorator::future omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid const std::optional &start ///< [in] start, empty for begin @@ -170,7 +166,7 @@ class SeaStore final : public FuturizedStore { * stages and locks as do_transaction. 
*/ seastar::future<> flush(CollectionRef ch) final; - read_errorator::future> fiemap( + read_errorator::future fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, @@ -190,7 +186,6 @@ class SeaStore final : public FuturizedStore { secondaries.emplace_back(&sec_dev); } - using coll_core_t = FuturizedStore::coll_core_t; seastar::future> list_collections(); seastar::future<> write_meta(const std::string& key, @@ -334,14 +329,16 @@ class SeaStore final : public FuturizedStore { }); } - using _fiemap_ret = ObjectDataHandler::fiemap_ret; - _fiemap_ret _fiemap( - Transaction &t, - Onode &onode, - uint64_t off, - uint64_t len) const; + using omap_list_bare_ret = OMapManager::omap_list_bare_ret; + using omap_list_ret = OMapManager::omap_list_ret; + omap_list_ret omap_list( + Onode& onode, + const omap_root_le_t& omap_root, + Transaction& t, + const std::optional& start, + OMapManager::omap_list_config_t config) const; - using _omap_get_value_iertr = OMapManager::base_iertr::extend< + using _omap_get_value_iertr = base_iertr::extend< crimson::ct_error::enodata >; using _omap_get_value_ret = _omap_get_value_iertr::future; @@ -350,25 +347,20 @@ class SeaStore final : public FuturizedStore { omap_root_t &&root, std::string_view key) const; - using _omap_get_values_iertr = OMapManager::base_iertr; - using _omap_get_values_ret = _omap_get_values_iertr::future; - _omap_get_values_ret _omap_get_values( + base_iertr::future _omap_get_values( Transaction &t, omap_root_t &&root, const omap_keys_t &keys) const; friend class SeaStoreOmapIterator; - using omap_list_bare_ret = OMapManager::omap_list_bare_ret; - using omap_list_ret = OMapManager::omap_list_ret; - omap_list_ret omap_list( + base_iertr::future _fiemap( + Transaction &t, Onode &onode, - const omap_root_le_t& omap_root, - Transaction& t, - const std::optional& start, - OMapManager::omap_list_config_t config) const; + uint64_t off, + uint64_t len) const; - using tm_iertr = TransactionManager::base_iertr; + using tm_iertr = base_iertr; using tm_ret = tm_iertr::future<>; tm_ret _do_transaction_step( internal_context_t &ctx, @@ -535,17 +527,7 @@ class SeaStore final : public FuturizedStore { return shard_stores.local().get_fsid(); } - seastar::future<> write_meta( - const std::string& key, - const std::string& value) final { - ceph_assert(seastar::this_shard_id() == primary_core); - return shard_stores.local().write_meta( - key, value).then([this, key, value] { - return mdstore->write_meta(key, value); - }).handle_error( - crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} - ); - } + seastar::future<> write_meta(const std::string& key, const std::string& value) final; seastar::future> read_meta(const std::string& key) final; From a49e49a1fd61914cd6cb2e1281c1733efe20abe7 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Tue, 10 Sep 2024 11:41:15 +0800 Subject: [PATCH 061/148] crimson/os/seastore: convert length logs to the hex format Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/cached_extent.cc | 10 +++++---- src/crimson/os/seastore/cached_extent.h | 2 +- .../onode_manager/staged-fltree/node_layout.h | 2 +- src/crimson/os/seastore/seastore_types.cc | 22 ++++++++++--------- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index cdad6dfb1b03d..76c18bde667a4 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -158,12 +158,14 @@ parent_tracker_t::~parent_tracker_t() { 
std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) { - out << "LBAMapping(" << rhs.get_key() << "~" << rhs.get_length() + out << "LBAMapping(" << rhs.get_key() + << "~0x" << std::hex << rhs.get_length() << std::dec << "->" << rhs.get_val(); if (rhs.is_indirect()) { - out << " indirect(" << rhs.get_intermediate_base() << "~" - << rhs.get_intermediate_key() << "~" - << rhs.get_intermediate_length() << ")"; + out << ",indirect(" << rhs.get_intermediate_base() + << "~0x" << std::hex << rhs.get_intermediate_length() + << "@0x" << rhs.get_intermediate_offset() << std::dec + << ")"; } out << ")"; return out; diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 6c5c6c6fcc292..6025725aa337d 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -350,7 +350,7 @@ class CachedExtent << ", modify_time=" << sea_time_point_printer_t{modify_time} << ", paddr=" << get_paddr() << ", prior_paddr=" << prior_poffset_str - << ", length=" << get_length() + << std::hex << ", length=0x" << get_length() << std::dec << ", state=" << state << ", last_committed_crc=" << last_committed_crc << ", refcount=" << use_count() diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h index 960ea6ba41181..397a014a7c3d2 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -925,7 +925,7 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { std::ostringstream sos; sos << "Node" << NODE_TYPE << FIELD_TYPE << "@" << extent.get_laddr() - << "+" << std::hex << extent.get_length() << std::dec + << "+0x" << std::hex << extent.get_length() << std::dec << "Lv" << (unsigned)level() << (is_level_tail() ? 
"$" : ""); name = sos.str(); diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index e1430b30019a5..f379dd0117c8d 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -54,7 +54,9 @@ std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id) } else if (_id == DEVICE_ID_ROOT) { return out << "Dev(ROOT)"; } else { - return out << "Dev(" << (unsigned)_id << ")"; + return out << "Dev(0x" + << std::hex << (unsigned)_id << std::dec + << ")"; } } @@ -64,7 +66,7 @@ std::ostream &operator<<(std::ostream &out, const segment_id_t &segment) return out << "Seg[NULL]"; } else { return out << "Seg[" << device_id_printer_t{segment.device_id()} - << "," << segment.device_segment_id() + << ",0x" << std::hex << segment.device_segment_id() << std::dec << "]"; } } @@ -93,12 +95,12 @@ std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq) } std::ostream &operator<<(std::ostream &out, const laddr_t &laddr) { - return out << 'L' << std::hex << laddr.value << std::dec; + return out << "L0x" << std::hex << laddr.value << std::dec; } std::ostream &operator<<(std::ostream &out, const laddr_offset_t &laddr_offset) { return out << laddr_offset.get_aligned_laddr() - << "+" << std::hex << laddr_offset.get_offset() << std::dec; + << "+0x" << std::hex << laddr_offset.get_offset() << std::dec; } std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr) @@ -123,18 +125,18 @@ std::ostream &operator<<(std::ostream &out, const paddr_t &rhs) } else if (has_device_off(id)) { auto &s = rhs.as_res_paddr(); out << device_id_printer_t{id} - << "," - << s.get_device_off(); + << ",0x" + << std::hex << s.get_device_off() << std::dec; } else if (rhs.get_addr_type() == paddr_types_t::SEGMENT) { auto &s = rhs.as_seg_paddr(); out << s.get_segment_id() - << "," - << s.get_segment_off(); + << ",0x" + << std::hex << s.get_segment_off() << std::dec; } else if (rhs.get_addr_type() == paddr_types_t::RANDOM_BLOCK) { auto &s = rhs.as_blk_paddr(); out << device_id_printer_t{s.get_device_id()} - << "," - << s.get_device_off(); + << ",0x" + << std::hex << s.get_device_off() << std::dec; } else { out << "INVALID!"; } From d39949e8e9c5db4692d5c5ab7168eb965e0c84e5 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Sun, 29 Sep 2024 11:14:14 +0800 Subject: [PATCH 062/148] crimson/os/seastore: adjust op names Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/seastore.cc | 38 ++++++++++++++--------------- src/crimson/os/seastore/seastore.h | 4 +-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index d708231b47b1a..9206d38035a6c 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -40,8 +40,8 @@ template <> struct fmt::formatter auto format(op_type_t op, FormatContext& ctx) const { std::string_view name = "unknown"; switch (op) { - case op_type_t::TRANSACTION: - name = "transaction"; + case op_type_t::DO_TRANSACTION: + name = "do_transaction"; break; case op_type_t::READ: name = "read"; @@ -61,8 +61,8 @@ template <> struct fmt::formatter case op_type_t::OMAP_GET_VALUES: name = "omap_get_values"; break; - case op_type_t::OMAP_LIST: - name = "omap_list"; + case op_type_t::OMAP_GET_VALUES2: + name = "omap_get_values2"; break; case op_type_t::MAX: name = "unknown"; @@ -141,14 +141,14 @@ void SeaStore::Shard::register_metrics() namespace sm = seastar::metrics; using op_type_t = 
crimson::os::seastore::op_type_t; std::pair labels_by_op_type[] = { - {op_type_t::TRANSACTION, sm::label_instance("latency", "TRANSACTION")}, - {op_type_t::READ, sm::label_instance("latency", "READ")}, - {op_type_t::WRITE, sm::label_instance("latency", "WRITE")}, - {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")}, - {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")}, - {op_type_t::STAT, sm::label_instance("latency", "STAT")}, - {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")}, - {op_type_t::OMAP_LIST, sm::label_instance("latency", "OMAP_LIST")}, + {op_type_t::DO_TRANSACTION, sm::label_instance("latency", "DO_TRANSACTION")}, + {op_type_t::READ, sm::label_instance("latency", "READ")}, + {op_type_t::WRITE, sm::label_instance("latency", "WRITE")}, + {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")}, + {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")}, + {op_type_t::STAT, sm::label_instance("latency", "STAT")}, + {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")}, + {op_type_t::OMAP_GET_VALUES2, sm::label_instance("latency", "OMAP_GET_VALUES2")}, }; for (auto& [op_type, label] : labels_by_op_type) { @@ -1033,7 +1033,7 @@ SeaStore::Shard::read( ch, oid, Transaction::src_t::READ, - "read_obj", + "read", op_type_t::READ, [this, offset, len, op_flags](auto &t, auto &onode) -> ObjectDataHandler::read_ret { size_t size = onode.get_layout().size; @@ -1076,7 +1076,7 @@ SeaStore::Shard::exists( c, oid, Transaction::src_t::READ, - "oid_exists", + "exists", op_type_t::READ, [](auto&, auto&) { return seastar::make_ready_future(true); @@ -1180,7 +1180,7 @@ SeaStore::Shard::get_attrs( ch, oid, Transaction::src_t::READ, - "get_addrs", + "get_attrs", op_type_t::GET_ATTRS, [this](auto &t, auto& onode) { auto& layout = onode.get_layout(); @@ -1389,8 +1389,8 @@ SeaStore::Shard::omap_get_values( ch, oid, Transaction::src_t::READ, - "omap_list", - op_type_t::OMAP_LIST, + "omap_get_values2", + op_type_t::OMAP_GET_VALUES2, [this, start](auto &t, auto &onode) { return omap_list( onode, @@ -1445,7 +1445,7 @@ SeaStore::Shard::fiemap( ch, oid, Transaction::src_t::READ, - "fiemap_read", + "fiemap", op_type_t::READ, [this, off, len](auto &t, auto &onode) -> base_iertr::future { size_t size = onode.get_layout().size; @@ -1490,7 +1490,7 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( std::move(_t), Transaction::src_t::MUTATE, "do_transaction", - op_type_t::TRANSACTION, + op_type_t::DO_TRANSACTION, [this](auto &ctx) { return with_trans_intr(*ctx.transaction, [&ctx, this](auto &t) { LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks); diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 58d4f5e593cbe..611e909619ac4 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -35,14 +35,14 @@ using OnodeRef = boost::intrusive_ptr; class TransactionManager; enum class op_type_t : uint8_t { - TRANSACTION = 0, + DO_TRANSACTION = 0, READ, WRITE, GET_ATTR, GET_ATTRS, STAT, OMAP_GET_VALUES, - OMAP_LIST, + OMAP_GET_VALUES2, MAX }; From 589e9cb9356ad55b7cfbb1b0041e8cbd7fd71e57 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Sun, 29 Sep 2024 11:47:46 +0800 Subject: [PATCH 063/148] crimson/os/seastore: refine logs of seastore.h/cc Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/seastore.cc | 398 ++++++++++++++++++---------- src/crimson/os/seastore/seastore.h | 9 +- src/osd/osd_types_fmt.h | 2 + 3 files changed, 265 
insertions(+), 144 deletions(-) diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 9206d38035a6c..e2dee84637831 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -17,6 +17,7 @@ #include "common/safe_io.h" #include "include/stringify.h" #include "os/Transaction.h" +#include "osd/osd_types_fmt.h" #include "crimson/common/buffer_io.h" @@ -192,6 +193,9 @@ void SeaStore::Shard::register_metrics() seastar::future<> SeaStore::start() { + LOG_PREFIX(SeaStore::start); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); #ifndef NDEBUG bool is_test = true; @@ -212,19 +216,30 @@ seastar::future<> SeaStore::start() }).then([this, is_test] { ceph_assert(device); return shard_stores.start(root, device.get(), is_test); + }).then([FNAME] { + INFO("done"); }); } seastar::future<> SeaStore::test_start(DeviceRef device_obj) { + LOG_PREFIX(SeaStore::test_start); + INFO("..."); + ceph_assert(device_obj); ceph_assert(root == ""); device = std::move(device_obj); - return shard_stores.start_single(root, device.get(), true); + return shard_stores.start_single(root, device.get(), true + ).then([FNAME] { + INFO("done"); + }); } seastar::future<> SeaStore::stop() { + LOG_PREFIX(SeaStore::stop); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return seastar::do_for_each(secondaries, [](auto& sec_dev) { return sec_dev->stop(); @@ -237,17 +252,28 @@ seastar::future<> SeaStore::stop() } }).then([this] { return shard_stores.stop(); + }).then([FNAME] { + INFO("done"); }); } SeaStore::mount_ertr::future<> SeaStore::test_mount() { + LOG_PREFIX(SeaStore::test_mount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); - return shard_stores.local().mount_managers(); + return shard_stores.local().mount_managers( + ).then([FNAME] { + INFO("done"); + }); } SeaStore::mount_ertr::future<> SeaStore::mount() { + LOG_PREFIX(SeaStore::mount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return device->mount( ).safe_then([this] { @@ -281,6 +307,8 @@ SeaStore::mount_ertr::future<> SeaStore::mount() return shard_stores.invoke_on_all([](auto &local_store) { return local_store.mount_managers(); }); + }).safe_then([FNAME] { + INFO("done"); }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStore::mount" @@ -300,9 +328,14 @@ seastar::future<> SeaStore::Shard::mount_managers() seastar::future<> SeaStore::umount() { + LOG_PREFIX(SeaStore::umount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.invoke_on_all([](auto &local_store) { return local_store.umount(); + }).then([FNAME] { + INFO("done"); }); } @@ -330,7 +363,7 @@ seastar::future<> SeaStore::Shard::umount() onode_manager.reset(); }).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::umount" + "Invalid error in SeaStoreS::umount" } ); } @@ -377,6 +410,8 @@ SeaStore::Shard::mkfs_managers() "mkfs_seastore", [this](auto& t) { + LOG_PREFIX(SeaStoreS::mkfs_managers); + DEBUGT("...", t); return onode_manager->mkfs(t ).si_then([this, &t] { return collection_manager->mkfs(t); @@ -410,16 +445,22 @@ seastar::future<> SeaStore::set_secondaries() SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid) { + LOG_PREFIX(SeaStore::test_mkfs); + INFO("uuid={} ...", new_osd_fsid); + ceph_assert(seastar::this_shard_id() == primary_core); return read_meta("mkfs_done" - ).then([this, new_osd_fsid](auto tuple) { + ).then([this, 
new_osd_fsid, FNAME](auto tuple) { auto [done, value] = tuple; if (done == 0) { + ERROR("failed"); return seastar::now(); } return shard_stores.local().mkfs_managers( ).then([this, new_osd_fsid] { return prepare_meta(new_osd_fsid); + }).then([FNAME] { + INFO("done"); }); }); } @@ -447,28 +488,29 @@ seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid) SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) { + LOG_PREFIX(SeaStore::mkfs); + INFO("uuid={}, root={} ...", new_osd_fsid, root); + ceph_assert(seastar::this_shard_id() == primary_core); return read_meta("mkfs_done" - ).then([this, new_osd_fsid](auto tuple) { + ).then([this, new_osd_fsid, FNAME](auto tuple) { auto [done, value] = tuple; if (done == 0) { + ERROR("failed"); return seastar::now(); } else { return seastar::do_with( secondary_device_set_t(), - [this, new_osd_fsid](auto& sds) { + [this, new_osd_fsid, FNAME](auto& sds) { auto fut = seastar::now(); - LOG_PREFIX(SeaStore::mkfs); - DEBUG("root: {}", root); if (!root.empty()) { fut = seastar::open_directory(root - ).then([this, &sds, new_osd_fsid](seastar::file rdir) mutable { + ).then([this, &sds, new_osd_fsid, FNAME](seastar::file rdir) mutable { std::unique_ptr root_f = std::make_unique(std::move(rdir)); auto sub = root_f->list_directory( - [this, &sds, new_osd_fsid](auto de) mutable -> seastar::future<> + [this, &sds, new_osd_fsid, FNAME](auto de) mutable -> seastar::future<> { - LOG_PREFIX(SeaStore::mkfs); DEBUG("found file: {}", de.name); if (de.name.find("block.") == 0 && de.name.length() > 6 /* 6 for "block." */) { @@ -533,6 +575,8 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) return prepare_meta(new_osd_fsid); }).safe_then([this] { return umount(); + }).safe_then([FNAME] { + INFO("done"); }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStore::mkfs" @@ -546,14 +590,18 @@ using coll_core_t = SeaStore::coll_core_t; seastar::future> SeaStore::list_collections() { + LOG_PREFIX(SeaStore::list_collections); + DEBUG("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.map([](auto &local_store) { return local_store.list_collections(); - }).then([](std::vector> results) { + }).then([FNAME](std::vector> results) { std::vector collections; for (auto& colls : results) { collections.insert(collections.end(), colls.begin(), colls.end()); } + DEBUG("got {} collections", collections.size()); return seastar::make_ready_future>( std::move(collections)); }); @@ -561,13 +609,16 @@ SeaStore::list_collections() store_statfs_t SeaStore::Shard::stat() const { - return transaction_manager->store_stat(); + LOG_PREFIX(SeaStoreS::stat); + auto ss = transaction_manager->store_stat(); + DEBUG("stat={}", ss); + return ss; } seastar::future SeaStore::stat() const { LOG_PREFIX(SeaStore::stat); - DEBUG(""); + DEBUG("..."); ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.map_reduce0( @@ -579,19 +630,30 @@ seastar::future SeaStore::stat() const ss.add(ret); return std::move(ss); } - ).then([](store_statfs_t ss) { + ).then([FNAME](store_statfs_t ss) { + DEBUG("done, stat={}", ss); return seastar::make_ready_future(std::move(ss)); }); } seastar::future SeaStore::pool_statfs(int64_t pool_id) const { - //TODO - return SeaStore::stat(); + LOG_PREFIX(SeaStore::pool_statfs); + DEBUG("pool_id={} ...", pool_id); + ceph_assert(seastar::this_shard_id() == primary_core); + //TODO + return SeaStore::stat( + ).then([FNAME, pool_id](store_statfs_t ss) { + DEBUG("done, pool_id={}, ret={}", pool_id, 
ss); + return seastar::make_ready_future(std::move(ss)); + }); } seastar::future<> SeaStore::report_stats() { + LOG_PREFIX(SeaStore::report_stats); + DEBUG("..."); + ceph_assert(seastar::this_shard_id() == primary_core); shard_device_stats.resize(seastar::smp::count); shard_io_stats.resize(seastar::smp::count); @@ -610,8 +672,7 @@ seastar::future<> SeaStore::report_stats() local_store.get_io_stats(report_detail, seconds); shard_cache_stats[seastar::this_shard_id()] = local_store.get_cache_stats(report_detail, seconds); - }).then([this] { - LOG_PREFIX(SeaStore); + }).then([this, FNAME] { auto now = seastar::lowres_clock::now(); if (last_tp == seastar::lowres_clock::time_point::min()) { last_tp = now; @@ -858,24 +919,26 @@ SeaStore::Shard::list_objects(CollectionRef ch, "list_objects", [this, ch, start, end, &limit, &ret](auto &t) { + LOG_PREFIX(SeaStoreS::list_objects); + DEBUGT("cid={} start={} end={} limit={} ...", + t, ch->get_cid(), start, end, limit); return get_coll_bits( ch, t - ).si_then([this, ch, &t, start, end, &limit, &ret](auto bits) { + ).si_then([FNAME, this, ch, &t, start, end, &limit, &ret](auto bits) { if (!bits) { + DEBUGT("no bits, return none", t); return list_iertr::make_ready_future< OnodeManager::list_onodes_bare_ret >(std::make_tuple( std::vector(), ghobject_t::get_max())); } else { - LOG_PREFIX(SeaStore::list_objects); - DEBUGT("start {}, end {}, limit {}, bits {}", - t, start, end, limit, *bits); + DEBUGT("bits={} ...", t, *bits); auto filter = SeaStore::get_objs_range(ch, *bits); using list_iertr = OnodeManager::list_onodes_iertr; using repeat_ret = list_iertr::future; return trans_intr::repeat( - [this, &t, &ret, &limit, end, + [this, FNAME, &t, &ret, &limit, end, filter, ranges = get_ranges(ch, start, end, filter) ]() mutable -> repeat_ret { if (limit == 0 || ranges.empty()) { @@ -887,11 +950,10 @@ SeaStore::Shard::list_objects(CollectionRef ch, auto pstart = ite->first; auto pend = ite->second; ranges.pop_front(); - LOG_PREFIX(SeaStore::list_objects); - DEBUGT("pstart {}, pend {}, limit {}", t, pstart, pend, limit); + DEBUGT("pstart {}, pend {}, limit {} ...", t, pstart, pend, limit); return onode_manager->list_onodes( t, pstart, pend, limit - ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end] + ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end, FNAME] (auto &&_ret) mutable { auto &next_objects = std::get<0>(_ret); auto &ret_objects = std::get<0>(ret); @@ -902,7 +964,6 @@ SeaStore::Shard::list_objects(CollectionRef ch, std::get<1>(ret) = std::get<1>(_ret); assert(limit >= next_objects.size()); limit -= next_objects.size(); - LOG_PREFIX(SeaStore::list_objects); DEBUGT("got {} objects, left limit {}", t, next_objects.size(), limit); assert(limit == 0 || @@ -916,7 +977,9 @@ SeaStore::Shard::list_objects(CollectionRef ch, >(seastar::stop_iteration::no); }); } - ).si_then([&ret] { + ).si_then([&ret, FNAME] { + DEBUG("got {} objects, next={}", + std::get<0>(ret).size(), std::get<1>(ret)); return list_iertr::make_ready_future< OnodeManager::list_onodes_bare_ret>(std::move(ret)); }); @@ -929,7 +992,7 @@ SeaStore::Shard::list_objects(CollectionRef ch, return std::move(ret); }).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::list_objects" + "Invalid error in SeaStoreS::list_objects" } ); }).finally([this] { @@ -941,24 +1004,26 @@ SeaStore::Shard::list_objects(CollectionRef ch, seastar::future SeaStore::Shard::create_new_collection(const coll_t& cid) { - LOG_PREFIX(SeaStore::create_new_collection); - DEBUG("{}", cid); + 
LOG_PREFIX(SeaStoreS::create_new_collection); + DEBUG("cid={}", cid); return seastar::make_ready_future(_get_collection(cid)); } seastar::future SeaStore::Shard::open_collection(const coll_t& cid) { - LOG_PREFIX(SeaStore::open_collection); - DEBUG("{}", cid); + LOG_PREFIX(SeaStoreS::open_collection); + DEBUG("cid={} ...", cid); return list_collections( - ).then([cid, this] (auto colls_cores) { + ).then([cid, this, FNAME] (auto colls_cores) { if (auto found = std::find(colls_cores.begin(), colls_cores.end(), std::make_pair(cid, seastar::this_shard_id())); found != colls_cores.end()) { + DEBUG("cid={} exists", cid); return seastar::make_ready_future(_get_collection(cid)); } else { + DEBUG("cid={} not exists", cid); return seastar::make_ready_future(); } }); @@ -968,6 +1033,8 @@ seastar::future<> SeaStore::Shard::set_collection_opts(CollectionRef c, const pool_opts_t& opts) { + LOG_PREFIX(SeaStoreS::set_collection_opts); + DEBUG("cid={}, opts={} not implemented", c->get_cid(), opts); //TODO return seastar::now(); } @@ -989,6 +1056,8 @@ SeaStore::Shard::list_collections() "list_collections", [this, &ret](auto& t) { + LOG_PREFIX(SeaStoreS::list_collections); + DEBUGT("...", t); return transaction_manager->read_collection_root(t ).si_then([this, &t](auto coll_root) { return collection_manager->list(coll_root, t); @@ -1007,7 +1076,7 @@ SeaStore::Shard::list_collections() } ).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::list_collections" + "Invalid error in SeaStoreS::list_collections" } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1023,9 +1092,6 @@ SeaStore::Shard::read( size_t len, uint32_t op_flags) { - LOG_PREFIX(SeaStore::read); - DEBUG("oid {} offset {} len {}", oid, offset, len); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1036,12 +1102,16 @@ SeaStore::Shard::read( "read", op_type_t::READ, [this, offset, len, op_flags](auto &t, auto &onode) -> ObjectDataHandler::read_ret { + LOG_PREFIX(SeaStoreS::read); size_t size = onode.get_layout().size; - if (offset >= size) { + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none", + t, offset, len, size, op_flags); return seastar::make_ready_future(); } + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...", + t, offset, len, size, op_flags); size_t corrected_len = (len == 0) ? 
size - offset : std::min(size - offset, len); @@ -1053,7 +1123,11 @@ SeaStore::Shard::read( onode, }, offset, - corrected_len); + corrected_len + ).si_then([FNAME, &t](auto bl) { + DEBUGT("got bl length=0x{:x}", t, bl.length()); + return bl; + }); } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1066,9 +1140,7 @@ SeaStore::Shard::exists( CollectionRef c, const ghobject_t& oid) { - LOG_PREFIX(SeaStore::exists); - DEBUG("oid {}", oid); - + LOG_PREFIX(SeaStoreS::exists); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1078,10 +1150,12 @@ SeaStore::Shard::exists( Transaction::src_t::READ, "exists", op_type_t::READ, - [](auto&, auto&) { + [FNAME](auto& t, auto&) { + DEBUGT("exists", t); return seastar::make_ready_future(true); }).handle_error( - crimson::ct_error::enoent::handle([] { + crimson::ct_error::enoent::handle([FNAME] { + DEBUG("not exists"); return seastar::make_ready_future(false); }), crimson::ct_error::assert_all{"unexpected error"} @@ -1098,10 +1172,14 @@ SeaStore::Shard::readv( interval_set& m, uint32_t op_flags) { + LOG_PREFIX(SeaStoreS::readv); + DEBUG("cid={} oid={} op_flags=0x{:x} {} intervals", + ch->get_cid(), _oid, op_flags, m.num_intervals()); + return seastar::do_with( _oid, ceph::bufferlist{}, - [ch, op_flags, this, &m](auto &oid, auto &ret) { + [ch, op_flags, this, FNAME, &m](auto &oid, auto &ret) { return crimson::do_for_each( m, [ch, op_flags, this, &oid, &ret](auto &p) { @@ -1110,7 +1188,8 @@ SeaStore::Shard::readv( ).safe_then([&ret](auto bl) { ret.claim_append(bl); }); - }).safe_then([&ret] { + }).safe_then([&ret, FNAME] { + DEBUG("got bl length=0x{:x}", ret.length()); return read_errorator::make_ready_future (std::move(ret)); }); @@ -1125,9 +1204,6 @@ SeaStore::Shard::get_attr( const ghobject_t& oid, std::string_view name) const { - LOG_PREFIX(SeaStore::get_attr); - DEBUG("{} {}", ch->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1138,17 +1214,21 @@ SeaStore::Shard::get_attr( "get_attr", op_type_t::GET_ATTR, [this, name](auto &t, auto& onode) -> _omap_get_value_ret { + LOG_PREFIX(SeaStoreS::get_attr); auto& layout = onode.get_layout(); if (name == OI_ATTR && layout.oi_size) { ceph::bufferlist bl; bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length()); return seastar::make_ready_future(std::move(bl)); } if (name == SS_ATTR && layout.ss_size) { ceph::bufferlist bl; bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); + DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length()); return seastar::make_ready_future(std::move(bl)); } + DEBUGT("name={} ...", t, name); return _omap_get_value( t, layout.xattr_root.get( @@ -1170,9 +1250,6 @@ SeaStore::Shard::get_attrs( CollectionRef ch, const ghobject_t& oid) { - LOG_PREFIX(SeaStore::get_attrs); - DEBUG("{} {}", ch->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1183,6 +1260,8 @@ SeaStore::Shard::get_attrs( "get_attrs", op_type_t::GET_ATTRS, [this](auto &t, auto& onode) { + LOG_PREFIX(SeaStoreS::get_attrs); + DEBUGT("...", t); auto& layout = onode.get_layout(); return omap_list(onode, layout.xattr_root, t, std::nullopt, OMapManager::omap_list_config_t() @@ -1190,17 +1269,17 @@ SeaStore::Shard::get_attrs( .without_max() ).si_then([&layout, &t, FNAME](auto p) { auto& attrs = std::get<1>(p); + DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}", + t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size); ceph::bufferlist bl; if 
(layout.oi_size) { bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); attrs.emplace(OI_ATTR, std::move(bl)); - DEBUGT("set oi from onode layout", t); } if (layout.ss_size) { bl.clear(); bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); attrs.emplace(SS_ATTR, std::move(bl)); - DEBUGT("set ss from onode layout", t); } return seastar::make_ready_future(std::move(attrs)); }); @@ -1222,7 +1301,6 @@ seastar::future SeaStore::Shard::stat( ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - LOG_PREFIX(SeaStore::stat); return repeat_with_onode( c, oid, @@ -1230,18 +1308,20 @@ seastar::future SeaStore::Shard::stat( "stat", op_type_t::STAT, [this, oid](auto &t, auto &onode) { + LOG_PREFIX(SeaStoreS::stat); struct stat st; auto &olayout = onode.get_layout(); st.st_size = olayout.size; st.st_blksize = device->get_block_size(); st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; st.st_nlink = 1; - DEBUGT("cid {}, oid {}, return size {}", t, c->get_cid(), oid, st.st_size); + DEBUGT("oid={}, size={}, blksize={}", + t, oid, st.st_size, st.st_blksize); return seastar::make_ready_future(st); } ).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::stat" + "Invalid error in SeaStoreS::stat" } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1273,6 +1353,8 @@ SeaStore::Shard::omap_get_values( "omap_get_values", op_type_t::OMAP_GET_VALUES, [this, keys](auto &t, auto &onode) { + LOG_PREFIX(SeaStoreS::omap_get_values); + DEBUGT("{} keys ...", t, keys.size()); omap_root_t omap_root = onode.get_layout().omap_root.get( onode.get_metadata_hint(device->get_block_size())); return _omap_get_values( @@ -1297,14 +1379,18 @@ SeaStore::Shard::_omap_get_value( std::move(root), std::string(key), [&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret { + LOG_PREFIX(SeaStoreS::_omap_get_value); if (root.is_null()) { + DEBUGT("key={} is absent because of null root", t, key); return crimson::ct_error::enodata::make(); } return manager.omap_get_value(root, t, key - ).si_then([](auto opt) -> _omap_get_value_ret { + ).si_then([&key, &t, FNAME](auto opt) -> _omap_get_value_ret { if (!opt) { + DEBUGT("key={} is absent", t, key); return crimson::ct_error::enodata::make(); } + DEBUGT("key={}, value length=0x{:x}", t, key, opt->length()); return seastar::make_ready_future(std::move(*opt)); }); }); @@ -1316,14 +1402,16 @@ SeaStore::Shard::_omap_get_values( omap_root_t &&omap_root, const omap_keys_t &keys) const { + LOG_PREFIX(SeaStoreS::_omap_get_values); if (omap_root.is_null()) { + DEBUGT("{} keys are absent because of null root", t, keys.size()); return seastar::make_ready_future(); } return seastar::do_with( BtreeOMapManager(*transaction_manager), std::move(omap_root), omap_values_t(), - [&t, &keys](auto &manager, auto &root, auto &ret) { + [&t, &keys, FNAME](auto &manager, auto &root, auto &ret) { return trans_intr::do_for_each( keys.begin(), keys.end(), @@ -1342,7 +1430,8 @@ SeaStore::Shard::_omap_get_values( } return seastar::now(); }); - }).si_then([&ret] { + }).si_then([&t, &ret, &keys, FNAME] { + DEBUGT("{} keys got {} values", t, keys.size(), ret.size()); return std::move(ret); }); }); @@ -1379,9 +1468,6 @@ SeaStore::Shard::omap_get_values( const ghobject_t &oid, const std::optional &start) { - LOG_PREFIX(SeaStore::omap_get_values); - DEBUG("{} {}", ch->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1392,6 +1478,8 @@ SeaStore::Shard::omap_get_values( "omap_get_values2", op_type_t::OMAP_GET_VALUES2, [this, 
start](auto &t, auto &onode) { + LOG_PREFIX(SeaStoreS::omap_get_values); + DEBUGT("start={} ...", t, start.has_value() ? *start : ""); return omap_list( onode, onode.get_layout().omap_root, @@ -1399,7 +1487,12 @@ SeaStore::Shard::omap_get_values( start, OMapManager::omap_list_config_t() .with_inclusive(false, false) - .without_max()); + .without_max() + ).si_then([FNAME, &t](omap_values_paged_t ret) { + DEBUGT("got {} values, complete={}", + t, std::get<1>(ret).size(), std::get<0>(ret)); + return ret; + }); } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1414,6 +1507,7 @@ SeaStore::Shard::_fiemap( uint64_t off, uint64_t len) const { + LOG_PREFIX(SeaStoreS::_fiemap); return seastar::do_with( ObjectDataHandler(max_object_size), [this, off, len, &t, &onode](auto &objhandler) { @@ -1425,6 +1519,9 @@ SeaStore::Shard::_fiemap( }, off, len); + }).si_then([FNAME, &t](auto ret) { + DEBUGT("got {} intervals", t, ret.size()); + return ret; }); } @@ -1435,9 +1532,6 @@ SeaStore::Shard::fiemap( uint64_t off, uint64_t len) { - LOG_PREFIX(SeaStore::fiemap); - DEBUG("oid: {}, off: {}, len: {} ", oid, off, len); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1448,11 +1542,15 @@ SeaStore::Shard::fiemap( "fiemap", op_type_t::READ, [this, off, len](auto &t, auto &onode) -> base_iertr::future { + LOG_PREFIX(SeaStoreS::fiemap); size_t size = onode.get_layout().size; if (off >= size) { - INFOT("fiemap offset is over onode size!", t); + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none", + t, off, len, size); return seastar::make_ready_future>(); } + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...", + t, off, len, size); size_t adjust_len = (len == 0) ? size - off: std::min(size - off, len); @@ -1464,7 +1562,7 @@ SeaStore::Shard::fiemap( } void SeaStore::Shard::on_error(ceph::os::Transaction &t) { - LOG_PREFIX(SeaStore::on_error); + LOG_PREFIX(SeaStoreS::on_error); ERROR(" transaction dump:\n"); JSONFormatter f(true); f.open_object_section("transaction"); @@ -1485,17 +1583,22 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( ++(shard_stats.starting_io_num); // repeat_with_internal_context ensures ordering via collection lock + auto num_bytes = _t.get_num_bytes(); return repeat_with_internal_context( _ch, std::move(_t), Transaction::src_t::MUTATE, "do_transaction", op_type_t::DO_TRANSACTION, - [this](auto &ctx) { - return with_trans_intr(*ctx.transaction, [&ctx, this](auto &t) { - LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks); - SUBDEBUGT(seastore_t, "start with {} objects", - t, ctx.iter.objects.size()); + [this, num_bytes](auto &ctx) { + LOG_PREFIX(SeaStoreS::do_transaction_no_callbacks); + return with_trans_intr(*ctx.transaction, [&ctx, this, FNAME, num_bytes](auto &t) { + DEBUGT("cid={}, {} operations, {} bytes, {} colls, {} objects ...", + t, ctx.ch->get_cid(), + ctx.ext_transaction.get_num_ops(), + num_bytes, + ctx.iter.colls.size(), + ctx.iter.objects.size()); #ifndef NDEBUG TRACET(" transaction dump:\n", t); JSONFormatter f(true); @@ -1529,6 +1632,8 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( }).si_then([this, &ctx] { return transaction_manager->submit_transaction(*ctx.transaction); }); + }).safe_then([FNAME, &ctx] { + DEBUGT("done", *ctx.transaction); }); } ).finally([this] { @@ -1568,12 +1673,12 @@ SeaStore::Shard::_do_transaction_step( std::vector &d_onodes, ceph::os::Transaction::iterator &i) { - LOG_PREFIX(SeaStore::Shard::_do_transaction_step); + LOG_PREFIX(SeaStoreS::_do_transaction_step); auto op = i.decode_op(); - 
SUBTRACET(seastore_t, "got op {}", *ctx.transaction, (uint32_t)op->op); using ceph::os::Transaction; if (op->op == Transaction::OP_NOP) { + DEBUGT("op NOP", *ctx.transaction); return tm_iertr::now(); } @@ -1581,15 +1686,18 @@ SeaStore::Shard::_do_transaction_step( case Transaction::OP_RMCOLL: { coll_t cid = i.get_cid(op->cid); + DEBUGT("op RMCOLL, cid={} ...", *ctx.transaction, cid); return _remove_collection(ctx, cid); } case Transaction::OP_MKCOLL: { coll_t cid = i.get_cid(op->cid); + DEBUGT("op MKCOLL, cid={} ...", *ctx.transaction, cid); return _create_collection(ctx, cid, op->split_bits); } case Transaction::OP_COLL_HINT: { + DEBUGT("op COLL_HINT", *ctx.transaction); ceph::bufferlist hint; i.decode_bl(hint); return tm_iertr::now(); @@ -1609,12 +1717,16 @@ SeaStore::Shard::_do_transaction_step( if (!onodes[op->oid]) { const ghobject_t& oid = i.get_oid(op->oid); if (!create) { + DEBUGT("op {}, get oid={} ...", + *ctx.transaction, (uint32_t)op->op, oid); fut = onode_manager->get_onode(*ctx.transaction, oid); } else { + DEBUGT("op {}, get_or_create oid={} ...", + *ctx.transaction, (uint32_t)op->op, oid); fut = onode_manager->get_or_create_onode(*ctx.transaction, oid); } } - return fut.si_then([&, op, this](auto get_onode) { + return fut.si_then([&, op, this, FNAME](auto get_onode) { OnodeRef &o = onodes[op->oid]; if (!o) { assert(get_onode); @@ -1624,10 +1736,12 @@ SeaStore::Shard::_do_transaction_step( if ((op->op == Transaction::OP_CLONE || op->op == Transaction::OP_COLL_MOVE_RENAME) && !d_onodes[op->dest_oid]) { + const ghobject_t& dest_oid = i.get_oid(op->dest_oid); + DEBUGT("op {}, get_or_create dest oid={} ...", + *ctx.transaction, (uint32_t)op->op, dest_oid); //TODO: use when_all_succeed after making onode tree // support parallel extents loading - return onode_manager->get_or_create_onode( - *ctx.transaction, i.get_oid(op->dest_oid) + return onode_manager->get_or_create_onode(*ctx.transaction, dest_oid ).si_then([&onodes, &d_onodes, op](auto dest_onode) { assert(dest_onode); auto &d_o = onodes[op->dest_oid]; @@ -1641,12 +1755,12 @@ SeaStore::Shard::_do_transaction_step( return OnodeManager::get_or_create_onode_iertr::now(); } }).si_then([&ctx, &i, &onodes, &d_onodes, op, this, FNAME]() -> tm_ret { - LOG_PREFIX(SeaStore::_do_transaction_step); + const ghobject_t& oid = i.get_oid(op->oid); try { switch (op->op) { case Transaction::OP_REMOVE: { - TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid)); + DEBUGT("op REMOVE, oid={} ...", *ctx.transaction, oid); return _remove(ctx, onodes[op->oid] ).si_then([&onodes, &d_onodes, op] { onodes[op->oid].reset(); @@ -1656,6 +1770,7 @@ SeaStore::Shard::_do_transaction_step( case Transaction::OP_CREATE: case Transaction::OP_TOUCH: { + DEBUGT("op CREATE/TOUCH, oid={} ...", *ctx.transaction, oid); return _touch(ctx, onodes[op->oid]); } case Transaction::OP_WRITE: @@ -1665,6 +1780,8 @@ SeaStore::Shard::_do_transaction_step( uint32_t fadvise_flags = i.get_fadvise_flags(); ceph::bufferlist bl; i.decode_bl(bl); + DEBUGT("op WRITE, oid={}, 0x{:x}~0x{:x}, flags=0x{:x} ...", + *ctx.transaction, oid, off, len, fadvise_flags); return _write( ctx, onodes[op->oid], off, len, std::move(bl), fadvise_flags); @@ -1672,6 +1789,7 @@ SeaStore::Shard::_do_transaction_step( case Transaction::OP_TRUNCATE: { uint64_t off = op->off; + DEBUGT("op TRUNCATE, oid={}, 0x{:x} ...", *ctx.transaction, oid, off); return _truncate(ctx, onodes[op->oid], off); } case Transaction::OP_SETATTR: @@ -1680,80 +1798,96 @@ SeaStore::Shard::_do_transaction_step( std::map to_set; 
ceph::bufferlist& bl = to_set[name]; i.decode_bl(bl); + DEBUGT("op SETATTR, oid={}, attr name={}, value length=0x{:x} ...", + *ctx.transaction, oid, name, bl.length()); return _setattrs(ctx, onodes[op->oid], std::move(to_set)); } case Transaction::OP_SETATTRS: { std::map to_set; i.decode_attrset(to_set); + DEBUGT("op SETATTRS, oid={}, attrs size={} ...", + *ctx.transaction, oid, to_set.size()); return _setattrs(ctx, onodes[op->oid], std::move(to_set)); } case Transaction::OP_RMATTR: { std::string name = i.decode_string(); + DEBUGT("op RMATTR, oid={}, attr name={} ...", + *ctx.transaction, oid, name); return _rmattr(ctx, onodes[op->oid], name); } case Transaction::OP_RMATTRS: { + DEBUGT("op RMATTRS, oid={} ...", *ctx.transaction, oid); return _rmattrs(ctx, onodes[op->oid]); } case Transaction::OP_OMAP_SETKEYS: { std::map aset; i.decode_attrset(aset); + DEBUGT("op OMAP_SETKEYS, oid={}, omap size={} ...", + *ctx.transaction, oid, aset.size()); return _omap_set_values(ctx, onodes[op->oid], std::move(aset)); } case Transaction::OP_OMAP_SETHEADER: { ceph::bufferlist bl; i.decode_bl(bl); + DEBUGT("op OMAP_SETHEADER, oid={}, length=0x{:x} ...", + *ctx.transaction, oid, bl.length()); return _omap_set_header(ctx, onodes[op->oid], std::move(bl)); } case Transaction::OP_OMAP_RMKEYS: { omap_keys_t keys; i.decode_keyset(keys); + DEBUGT("op OMAP_RMKEYS, oid={}, omap size={} ...", + *ctx.transaction, oid, keys.size()); return _omap_rmkeys(ctx, onodes[op->oid], std::move(keys)); } case Transaction::OP_OMAP_RMKEYRANGE: { - string first, last; + std::string first, last; first = i.decode_string(); last = i.decode_string(); + DEBUGT("op OMAP_RMKEYRANGE, oid={}, first={}, last={} ...", + *ctx.transaction, oid, first, last); return _omap_rmkeyrange( ctx, onodes[op->oid], std::move(first), std::move(last)); } case Transaction::OP_OMAP_CLEAR: { + DEBUGT("op OMAP_CLEAR, oid={} ...", *ctx.transaction, oid); return _omap_clear(ctx, onodes[op->oid]); } case Transaction::OP_ZERO: { objaddr_t off = op->off; extent_len_t len = op->len; + DEBUGT("op ZERO, oid={}, 0x{:x}~0x{:x} ...", + *ctx.transaction, oid, off, len); return _zero(ctx, onodes[op->oid], off, len); } case Transaction::OP_SETALLOCHINT: { + DEBUGT("op SETALLOCHINT, oid={}, not implemented", + *ctx.transaction, oid); // TODO return tm_iertr::now(); } case Transaction::OP_CLONE: { - TRACET("cloning {} to {}", - *ctx.transaction, - i.get_oid(op->oid), - i.get_oid(op->dest_oid)); + DEBUGT("op CLONE, oid={}, dest oid={} ...", + *ctx.transaction, oid, i.get_oid(op->dest_oid)); return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]); } case Transaction::OP_COLL_MOVE_RENAME: { + DEBUGT("op COLL_MOVE_RENAME, oid={}, dest oid={} ...", + *ctx.transaction, oid, i.get_oid(op->dest_oid)); ceph_assert(op->cid == op->dest_cid); - TRACET("renaming {} to {}", - *ctx.transaction, - i.get_oid(op->oid), - i.get_oid(op->dest_oid)); return _rename( ctx, onodes[op->oid], d_onodes[op->dest_oid] ).si_then([&onodes, &d_onodes, op] { @@ -1789,7 +1923,7 @@ SeaStore::Shard::_do_transaction_step( return seastar::now(); }), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::do_transaction_step" + "Invalid error in SeaStoreS::do_transaction_step" } ); } @@ -1825,7 +1959,7 @@ SeaStore::Shard::_rename( ).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_rename"} + "Invalid error in SeaStoreS::_rename"} ); } @@ -1846,7 +1980,7 @@ SeaStore::Shard::_remove_omaps( 
).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_remove" + "Invalid error in SeaStoreS::_remove_omaps" } ); }); @@ -1859,8 +1993,6 @@ SeaStore::Shard::_remove( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_remove); - DEBUGT("onode={}", *ctx.transaction, *onode); return _remove_omaps( ctx, onode, @@ -1888,7 +2020,7 @@ SeaStore::Shard::_remove( }).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all( - "Invalid error in SeaStore::_remove" + "Invalid error in SeaStoreS::_remove" ) ); } @@ -1898,8 +2030,6 @@ SeaStore::Shard::_touch( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_touch); - DEBUGT("onode={}", *ctx.transaction, *onode); return tm_iertr::now(); } @@ -1911,8 +2041,6 @@ SeaStore::Shard::_write( ceph::bufferlist &&_bl, uint32_t fadvise_flags) { - LOG_PREFIX(SeaStore::_write); - DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); const auto &object_size = onode->get_layout().size; if (offset + len > object_size) { onode->update_onode_size( @@ -2003,8 +2131,6 @@ SeaStore::Shard::_clone( OnodeRef &onode, OnodeRef &d_onode) { - LOG_PREFIX(SeaStore::_clone); - DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode); return seastar::do_with( ObjectDataHandler(max_object_size), [this, &ctx, &onode, &d_onode](auto &objHandler) { @@ -2030,9 +2156,10 @@ SeaStore::Shard::_zero( objaddr_t offset, extent_len_t len) { - LOG_PREFIX(SeaStore::_zero); - DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); if (offset + len >= max_object_size) { + LOG_PREFIX(SeaStoreS::_zero); + ERRORT("0x{:x}~0x{:x} >= 0x{:x}", + *ctx.transaction, offset, len, max_object_size); return crimson::ct_error::input_output_error::make(); } const auto &object_size = onode->get_layout().size; @@ -2088,8 +2215,6 @@ SeaStore::Shard::_omap_set_values( OnodeRef &onode, std::map &&aset) { - LOG_PREFIX(SeaStore::_omap_set_values); - DEBUGT("{} {} keys", *ctx.transaction, *onode, aset.size()); return _omap_set_kvs( onode, onode->get_layout().omap_root, @@ -2108,8 +2233,6 @@ SeaStore::Shard::_omap_set_header( OnodeRef &onode, ceph::bufferlist &&header) { - LOG_PREFIX(SeaStore::_omap_set_header); - DEBUGT("{} {} bytes", *ctx.transaction, *onode, header.length()); std::map to_set; to_set[OMAP_HEADER_XATTR_KEY] = header; return _setattrs(ctx, onode,std::move(to_set)); @@ -2120,8 +2243,6 @@ SeaStore::Shard::_omap_clear( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_omap_clear); - DEBUGT("{} {} keys", *ctx.transaction, *onode); return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY) ).si_then([this, &ctx, &onode]() -> tm_ret { if (auto omap_root = onode->get_layout().omap_root.get( @@ -2155,8 +2276,6 @@ SeaStore::Shard::_omap_rmkeys( OnodeRef &onode, omap_keys_t &&keys) { - LOG_PREFIX(SeaStore::_omap_rmkeys); - DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size()); auto omap_root = onode->get_layout().omap_root.get( onode->get_metadata_hint(device->get_block_size())); if (omap_root.is_null()) { @@ -2197,10 +2316,9 @@ SeaStore::Shard::_omap_rmkeyrange( std::string first, std::string last) { - LOG_PREFIX(SeaStore::_omap_rmkeyrange); - DEBUGT("{} first={} last={}", *ctx.transaction, *onode, first, last); if (first > last) { - ERRORT("range error, first: {} > last:{}", *ctx.transaction, first, last); + LOG_PREFIX(SeaStoreS::_omap_rmkeyrange); + ERRORT("range error, 
first:{} > last:{}", *ctx.transaction, first, last); ceph_abort(); } auto omap_root = onode->get_layout().omap_root.get( @@ -2243,8 +2361,6 @@ SeaStore::Shard::_truncate( OnodeRef &onode, uint64_t size) { - LOG_PREFIX(SeaStore::_truncate); - DEBUGT("onode={} size={}", *ctx.transaction, *onode, size); onode->update_onode_size(*ctx.transaction, size); return seastar::do_with( ObjectDataHandler(max_object_size), @@ -2265,9 +2381,7 @@ SeaStore::Shard::_setattrs( OnodeRef &onode, std::map&& aset) { - LOG_PREFIX(SeaStore::_setattrs); - DEBUGT("onode={}", *ctx.transaction, *onode); - + LOG_PREFIX(SeaStoreS::_setattrs); auto fut = tm_iertr::now(); auto& layout = onode->get_layout(); if (auto it = aset.find(OI_ATTR); it != aset.end()) { @@ -2329,8 +2443,6 @@ SeaStore::Shard::_rmattr( OnodeRef &onode, std::string name) { - LOG_PREFIX(SeaStore::_rmattr); - DEBUGT("onode={}", *ctx.transaction, *onode); auto& layout = onode->get_layout(); if ((name == OI_ATTR) && (layout.oi_size > 0)) { onode->clear_object_info(*ctx.transaction); @@ -2352,7 +2464,7 @@ SeaStore::Shard::_xattr_rmattr( OnodeRef &onode, std::string &&name) { - LOG_PREFIX(SeaStore::_xattr_rmattr); + LOG_PREFIX(SeaStoreS::_xattr_rmattr); DEBUGT("onode={}", *ctx.transaction, *onode); auto xattr_root = onode->get_layout().xattr_root.get( onode->get_metadata_hint(device->get_block_size())); @@ -2380,8 +2492,6 @@ SeaStore::Shard::_rmattrs( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_rmattrs); - DEBUGT("onode={}", *ctx.transaction, *onode); onode->clear_object_info(*ctx.transaction); onode->clear_snapset(*ctx.transaction); return _xattr_clear(ctx, onode); @@ -2392,7 +2502,7 @@ SeaStore::Shard::_xattr_clear( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_xattr_clear); + LOG_PREFIX(SeaStoreS::_xattr_clear); DEBUGT("onode={}", *ctx.transaction, *onode); auto xattr_root = onode->get_layout().xattr_root.get( onode->get_metadata_hint(device->get_block_size())); @@ -2442,7 +2552,7 @@ SeaStore::Shard::_create_collection( }).handle_error_interruptible( tm_iertr::pass_further{}, crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_create_collection" + "Invalid error in SeaStoreS::_create_collection" } ); } @@ -2474,7 +2584,7 @@ SeaStore::Shard::_remove_collection( }).handle_error_interruptible( tm_iertr::pass_further{}, crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_create_collection" + "Invalid error in SeaStoreS::_create_collection" } ); } @@ -2488,12 +2598,17 @@ SeaStore::Shard::_get_collection(const coll_t& cid) seastar::future<> SeaStore::write_meta( const std::string& key, const std::string& value) { + LOG_PREFIX(SeaStore::write_meta); + DEBUG("key={} value={} ...", key, value); + ceph_assert(seastar::this_shard_id() == primary_core); return seastar::do_with(key, value, - [this](auto& key, auto& value) { + [this, FNAME](auto& key, auto& value) { return shard_stores.local().write_meta(key, value ).then([this, &key, &value] { return mdstore->write_meta(key, value); + }).safe_then([FNAME, &key, &value] { + DEBUG("key={} value={} done", key, value); }).handle_error( crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} ); @@ -2504,23 +2619,21 @@ seastar::future<> SeaStore::Shard::write_meta( const std::string& key, const std::string& value) { - LOG_PREFIX(SeaStore::write_meta); - DEBUG("key: {}; value: {}", key, value); - ++(shard_stats.io_num); ++(shard_stats.pending_io_num); // For TM::submit_transaction() ++(shard_stats.processing_inlock_io_num); - return 
repeat_eagain([this, FNAME, &key, &value] { + return repeat_eagain([this, &key, &value] { ++(shard_stats.repeat_io_num); return transaction_manager->with_transaction_intr( Transaction::src_t::MUTATE, "write_meta", - [this, FNAME, &key, &value](auto& t) + [this, &key, &value](auto& t) { - DEBUGT("Have transaction, key: {}; value: {}", t, key, value); + LOG_PREFIX(SeaStoreS::write_meta); + DEBUGT("key={} value={} ...", t, key, value); return transaction_manager->update_root_meta( t, key, value ).si_then([this, &t] { @@ -2528,7 +2641,7 @@ seastar::future<> SeaStore::Shard::write_meta( }); }); }).handle_error( - crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} + crimson::ct_error::assert_all{"Invalid error in SeaStoreS::write_meta"} ).finally([this] { assert(shard_stats.pending_io_num); --(shard_stats.pending_io_num); @@ -2542,13 +2655,16 @@ seastar::future> SeaStore::read_meta(const std::string& key) { LOG_PREFIX(SeaStore::read_meta); - DEBUG("key: {}", key); + DEBUG("key={} ...", key); + ceph_assert(seastar::this_shard_id() == primary_core); return mdstore->read_meta(key - ).safe_then([](auto v) { + ).safe_then([key, FNAME](auto v) { if (v) { + DEBUG("key={}, value={}", key, *v); return std::make_tuple(0, std::move(*v)); } else { + ERROR("key={} failed", key); return std::make_tuple(-1, std::string("")); } }).handle_error( @@ -2605,7 +2721,7 @@ shard_stats_t SeaStore::Shard::get_io_stats( ret.minus(last_shard_stats); if (report_detail && seconds != 0) { - LOG_PREFIX(SeaStore::get_io_stats); + LOG_PREFIX(SeaStoreS::get_io_stats); auto calc_conflicts = [](uint64_t ios, uint64_t repeats) { return (double)(repeats-ios)/ios; }; diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 611e909619ac4..f851cedda827c 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -300,18 +300,21 @@ class SeaStore final : public FuturizedStore { auto begin_time = std::chrono::steady_clock::now(); return seastar::do_with( oid, Ret{}, std::forward(f), - [this, src, op_type, begin_time, tname + [this, ch, src, op_type, begin_time, tname ](auto &oid, auto &ret, auto &f) { - return repeat_eagain([&, this, src, tname] { + return repeat_eagain([&, this, ch, src, tname] { assert(src == Transaction::src_t::READ); ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( src, tname, - [&, this](auto& t) + [&, this, ch, tname](auto& t) { + LOG_PREFIX(SeaStoreS::repeat_with_onode); + SUBDEBUGT(seastore, "{} cid={} oid={} ...", + t, tname, ch->get_cid(), oid); return onode_manager->get_onode(t, oid ).si_then([&](auto onode) { return seastar::do_with(std::move(onode), [&](auto& onode) { diff --git a/src/osd/osd_types_fmt.h b/src/osd/osd_types_fmt.h index 04f4d46ee5109..100ce6e4646b3 100644 --- a/src/osd/osd_types_fmt.h +++ b/src/osd/osd_types_fmt.h @@ -392,4 +392,6 @@ inline std::ostream &operator<<(std::ostream &lhs, const object_stat_sum_t &sum) #if FMT_VERSION >= 90000 template struct fmt::formatter> : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; #endif From 14eacf64559c9130977026ba085f1c6887645c7b Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Sun, 29 Sep 2024 13:47:17 +0800 Subject: [PATCH 064/148] crimson/os/seastore: workaround log linkage issue under clang14 The logs printing lambda-captured variables cannot be linked correctly with SeaStore::Shard::repeat_with_onode() under clang14. 
Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/seastore.cc | 311 ++++++++++++++++------------ src/crimson/os/seastore/seastore.h | 31 +++ 2 files changed, 210 insertions(+), 132 deletions(-) diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index e2dee84637831..d90edbb20dbe2 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -1084,6 +1084,42 @@ SeaStore::Shard::list_collections() }); } +SeaStore::base_iertr::future +SeaStore::Shard::_read( + Transaction& t, + Onode& onode, + uint64_t offset, + std::size_t len, + uint32_t op_flags) +{ + LOG_PREFIX(SeaStoreS::_read); + size_t size = onode.get_layout().size; + if (offset >= size) { + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none", + t, offset, len, size, op_flags); + return seastar::make_ready_future(); + } + + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...", + t, offset, len, size, op_flags); + size_t corrected_len = (len == 0) ? + size - offset : + std::min(size - offset, len); + + return ObjectDataHandler(max_object_size).read( + ObjectDataHandler::context_t{ + *transaction_manager, + t, + onode, + }, + offset, + corrected_len + ).si_then([FNAME, &t](auto bl) { + DEBUGT("got bl length=0x{:x}", t, bl.length()); + return bl; + }); +} + SeaStore::Shard::read_errorator::future SeaStore::Shard::read( CollectionRef ch, @@ -1101,35 +1137,9 @@ SeaStore::Shard::read( Transaction::src_t::READ, "read", op_type_t::READ, - [this, offset, len, op_flags](auto &t, auto &onode) -> ObjectDataHandler::read_ret { - LOG_PREFIX(SeaStoreS::read); - size_t size = onode.get_layout().size; - if (offset >= size) { - DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none", - t, offset, len, size, op_flags); - return seastar::make_ready_future(); - } - - DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...", - t, offset, len, size, op_flags); - size_t corrected_len = (len == 0) ? 
- size - offset : - std::min(size - offset, len); - - return ObjectDataHandler(max_object_size).read( - ObjectDataHandler::context_t{ - *transaction_manager, - t, - onode, - }, - offset, - corrected_len - ).si_then([FNAME, &t](auto bl) { - DEBUGT("got bl length=0x{:x}", t, bl.length()); - return bl; - }); - } - ).finally([this] { + [this, offset, len, op_flags](auto &t, auto &onode) { + return _read(t, onode, offset, len, op_flags); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); @@ -1198,6 +1208,34 @@ SeaStore::Shard::readv( using crimson::os::seastore::omap_manager::BtreeOMapManager; +SeaStore::Shard::_omap_get_value_ret +SeaStore::Shard::_get_attr( + Transaction& t, + Onode& onode, + std::string_view name) const +{ + LOG_PREFIX(SeaStoreS::_get_attr); + auto& layout = onode.get_layout(); + if (name == OI_ATTR && layout.oi_size) { + ceph::bufferlist bl; + bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length()); + return seastar::make_ready_future(std::move(bl)); + } + if (name == SS_ATTR && layout.ss_size) { + ceph::bufferlist bl; + bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); + DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length()); + return seastar::make_ready_future(std::move(bl)); + } + DEBUGT("name={} ...", t, name); + return _omap_get_value( + t, + layout.xattr_root.get( + onode.get_metadata_hint(device->get_block_size())), + name); +} + SeaStore::Shard::get_attr_errorator::future SeaStore::Shard::get_attr( CollectionRef ch, @@ -1213,29 +1251,9 @@ SeaStore::Shard::get_attr( Transaction::src_t::READ, "get_attr", op_type_t::GET_ATTR, - [this, name](auto &t, auto& onode) -> _omap_get_value_ret { - LOG_PREFIX(SeaStoreS::get_attr); - auto& layout = onode.get_layout(); - if (name == OI_ATTR && layout.oi_size) { - ceph::bufferlist bl; - bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); - DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length()); - return seastar::make_ready_future(std::move(bl)); - } - if (name == SS_ATTR && layout.ss_size) { - ceph::bufferlist bl; - bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); - DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length()); - return seastar::make_ready_future(std::move(bl)); - } - DEBUGT("name={} ...", t, name); - return _omap_get_value( - t, - layout.xattr_root.get( - onode.get_metadata_hint(device->get_block_size())), - name); - } - ).handle_error( + [this, name](auto &t, auto& onode) { + return _get_attr(t, onode, name); + }).handle_error( crimson::ct_error::input_output_error::assert_failure{ "EIO when getting attrs"}, crimson::ct_error::pass_further_all{} @@ -1245,6 +1263,36 @@ SeaStore::Shard::get_attr( }); } +SeaStore::base_iertr::future +SeaStore::Shard::_get_attrs( + Transaction& t, + Onode& onode) +{ + LOG_PREFIX(SeaStoreS::_get_attrs); + DEBUGT("...", t); + auto& layout = onode.get_layout(); + return omap_list(onode, layout.xattr_root, t, std::nullopt, + OMapManager::omap_list_config_t() + .with_inclusive(false, false) + .without_max() + ).si_then([&layout, &t, FNAME](auto p) { + auto& attrs = std::get<1>(p); + DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}", + t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size); + ceph::bufferlist bl; + if (layout.oi_size) { + bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + attrs.emplace(OI_ATTR, std::move(bl)); + } + if (layout.ss_size) { + bl.clear(); + bl.append(ceph::bufferptr(&layout.ss[0], 
layout.ss_size)); + attrs.emplace(SS_ATTR, std::move(bl)); + } + return seastar::make_ready_future(std::move(attrs)); + }); +} + SeaStore::Shard::get_attrs_ertr::future SeaStore::Shard::get_attrs( CollectionRef ch, @@ -1260,31 +1308,8 @@ SeaStore::Shard::get_attrs( "get_attrs", op_type_t::GET_ATTRS, [this](auto &t, auto& onode) { - LOG_PREFIX(SeaStoreS::get_attrs); - DEBUGT("...", t); - auto& layout = onode.get_layout(); - return omap_list(onode, layout.xattr_root, t, std::nullopt, - OMapManager::omap_list_config_t() - .with_inclusive(false, false) - .without_max() - ).si_then([&layout, &t, FNAME](auto p) { - auto& attrs = std::get<1>(p); - DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}", - t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size); - ceph::bufferlist bl; - if (layout.oi_size) { - bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); - attrs.emplace(OI_ATTR, std::move(bl)); - } - if (layout.ss_size) { - bl.clear(); - bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); - attrs.emplace(SS_ATTR, std::move(bl)); - } - return seastar::make_ready_future(std::move(attrs)); - }); - } - ).handle_error( + return _get_attrs(t, onode); + }).handle_error( crimson::ct_error::input_output_error::assert_failure{ "EIO when getting attrs"}, crimson::ct_error::pass_further_all{} @@ -1294,6 +1319,23 @@ SeaStore::Shard::get_attrs( }); } +seastar::future SeaStore::Shard::_stat( + Transaction& t, + Onode& onode, + const ghobject_t& oid) +{ + LOG_PREFIX(SeaStoreS::_stat); + struct stat st; + auto &olayout = onode.get_layout(); + st.st_size = olayout.size; + st.st_blksize = device->get_block_size(); + st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; + st.st_nlink = 1; + DEBUGT("oid={}, size={}, blksize={}", + t, oid, st.st_size, st.st_blksize); + return seastar::make_ready_future(st); +} + seastar::future SeaStore::Shard::stat( CollectionRef c, const ghobject_t& oid) @@ -1308,18 +1350,8 @@ seastar::future SeaStore::Shard::stat( "stat", op_type_t::STAT, [this, oid](auto &t, auto &onode) { - LOG_PREFIX(SeaStoreS::stat); - struct stat st; - auto &olayout = onode.get_layout(); - st.st_size = olayout.size; - st.st_blksize = device->get_block_size(); - st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; - st.st_nlink = 1; - DEBUGT("oid={}, size={}, blksize={}", - t, oid, st.st_size, st.st_blksize); - return seastar::make_ready_future(st); - } - ).handle_error( + return _stat(t, onode, oid); + }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStoreS::stat" } @@ -1337,6 +1369,22 @@ SeaStore::Shard::omap_get_header( return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY); } +SeaStore::base_iertr::future +SeaStore::Shard::do_omap_get_values( + Transaction& t, + Onode& onode, + const omap_keys_t& keys) +{ + LOG_PREFIX(SeaStoreS::do_omap_get_values); + DEBUGT("{} keys ...", t, keys.size()); + omap_root_t omap_root = onode.get_layout().omap_root.get( + onode.get_metadata_hint(device->get_block_size())); + return _omap_get_values( + t, + std::move(omap_root), + keys); +} + SeaStore::Shard::read_errorator::future SeaStore::Shard::omap_get_values( CollectionRef ch, @@ -1353,16 +1401,8 @@ SeaStore::Shard::omap_get_values( "omap_get_values", op_type_t::OMAP_GET_VALUES, [this, keys](auto &t, auto &onode) { - LOG_PREFIX(SeaStoreS::omap_get_values); - DEBUGT("{} keys ...", t, keys.size()); - omap_root_t omap_root = onode.get_layout().omap_root.get( - onode.get_metadata_hint(device->get_block_size())); - return _omap_get_values( - t, - 
std::move(omap_root), - keys); - } - ).finally([this] { + return do_omap_get_values(t, onode, keys); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); @@ -1462,6 +1502,29 @@ SeaStore::Shard::omap_list( }); } +SeaStore::base_iertr::future +SeaStore::Shard::do_omap_get_values( + Transaction& t, + Onode& onode, + const std::optional& start) +{ + LOG_PREFIX(SeaStoreS::do_omap_get_values); + DEBUGT("start={} ...", t, start.has_value() ? *start : ""); + return omap_list( + onode, + onode.get_layout().omap_root, + t, + start, + OMapManager::omap_list_config_t() + .with_inclusive(false, false) + .without_max() + ).si_then([FNAME, &t](omap_values_paged_t ret) { + DEBUGT("got {} values, complete={}", + t, std::get<1>(ret).size(), std::get<0>(ret)); + return ret; + }); +} + SeaStore::Shard::read_errorator::future SeaStore::Shard::omap_get_values( CollectionRef ch, @@ -1478,23 +1541,8 @@ SeaStore::Shard::omap_get_values( "omap_get_values2", op_type_t::OMAP_GET_VALUES2, [this, start](auto &t, auto &onode) { - LOG_PREFIX(SeaStoreS::omap_get_values); - DEBUGT("start={} ...", t, start.has_value() ? *start : ""); - return omap_list( - onode, - onode.get_layout().omap_root, - t, - start, - OMapManager::omap_list_config_t() - .with_inclusive(false, false) - .without_max() - ).si_then([FNAME, &t](omap_values_paged_t ret) { - DEBUGT("got {} values, complete={}", - t, std::get<1>(ret).size(), std::get<0>(ret)); - return ret; - }); - } - ).finally([this] { + return do_omap_get_values(t, onode, start); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); @@ -1508,9 +1556,20 @@ SeaStore::Shard::_fiemap( uint64_t len) const { LOG_PREFIX(SeaStoreS::_fiemap); + size_t size = onode.get_layout().size; + if (off >= size) { + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none", + t, off, len, size); + return seastar::make_ready_future>(); + } + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...", + t, off, len, size); + size_t adjust_len = (len == 0) ? + size - off: + std::min(size - off, len); return seastar::do_with( ObjectDataHandler(max_object_size), - [this, off, len, &t, &onode](auto &objhandler) { + [this, off, adjust_len, &t, &onode](auto &objhandler) { return objhandler.fiemap( ObjectDataHandler::context_t{ *transaction_manager, @@ -1518,7 +1577,7 @@ SeaStore::Shard::_fiemap( onode, }, off, - len); + adjust_len); }).si_then([FNAME, &t](auto ret) { DEBUGT("got {} intervals", t, ret.size()); return ret; @@ -1541,20 +1600,8 @@ SeaStore::Shard::fiemap( Transaction::src_t::READ, "fiemap", op_type_t::READ, - [this, off, len](auto &t, auto &onode) -> base_iertr::future { - LOG_PREFIX(SeaStoreS::fiemap); - size_t size = onode.get_layout().size; - if (off >= size) { - DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none", - t, off, len, size); - return seastar::make_ready_future>(); - } - DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...", - t, off, len, size); - size_t adjust_len = (len == 0) ? 
- size - off: - std::min(size - off, len); - return _fiemap(t, onode, off, adjust_len); + [this, off, len](auto &t, auto &onode) { + return _fiemap(t, onode, off, len); }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index f851cedda827c..185072744f2d8 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -357,6 +357,37 @@ class SeaStore final : public FuturizedStore { friend class SeaStoreOmapIterator; + base_iertr::future _read( + Transaction& t, + Onode& onode, + uint64_t offset, + std::size_t len, + uint32_t op_flags); + + _omap_get_value_ret _get_attr( + Transaction& t, + Onode& onode, + std::string_view name) const; + + base_iertr::future _get_attrs( + Transaction& t, + Onode& onode); + + seastar::future _stat( + Transaction& t, + Onode& onode, + const ghobject_t& oid); + + base_iertr::future do_omap_get_values( + Transaction& t, + Onode& onode, + const omap_keys_t& keys); + + base_iertr::future do_omap_get_values( + Transaction& t, + Onode& onode, + const std::optional& start); + base_iertr::future _fiemap( Transaction &t, Onode &onode, From 1ee32107df9c641908bc0b908be47f1d5af3bf63 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Tue, 10 Sep 2024 11:52:56 +0800 Subject: [PATCH 065/148] crimson/os/seastore/transaction_manager: misc cleanups Signed-off-by: Yingxin Cheng --- .../os/seastore/transaction_manager.cc | 44 ++++--- src/crimson/os/seastore/transaction_manager.h | 112 ++++++++---------- 2 files changed, 74 insertions(+), 82 deletions(-) diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index a76b7fbe0c96c..0fd23ef6afbbb 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -98,7 +98,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() }); } -TransactionManager::mount_ertr::future<> TransactionManager::mount() +TransactionManager::mount_ertr::future<> +TransactionManager::mount() { LOG_PREFIX(TransactionManager::mount); INFO("enter"); @@ -175,7 +176,8 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() ); } -TransactionManager::close_ertr::future<> TransactionManager::close() { +TransactionManager::close_ertr::future<> +TransactionManager::close() { LOG_PREFIX(TransactionManager::close); INFO("enter"); return epm->stop_background( @@ -241,11 +243,11 @@ TransactionManager::ref_ret TransactionManager::remove( }); } -TransactionManager::ref_ret TransactionManager::_dec_ref( +TransactionManager::ref_ret TransactionManager::remove( Transaction &t, laddr_t offset) { - LOG_PREFIX(TransactionManager::_dec_ref); + LOG_PREFIX(TransactionManager::remove); TRACET("{}", t, offset); return lba_manager->decref_extent(t, offset ).si_then([this, FNAME, offset, &t](auto result) -> ref_ret { @@ -273,17 +275,18 @@ TransactionManager::refs_ret TransactionManager::remove( LOG_PREFIX(TransactionManager::remove); DEBUG("{} offsets", offsets.size()); return seastar::do_with(std::move(offsets), std::vector(), - [this, &t] (auto &&offsets, auto &refcnt) { - return trans_intr::do_for_each(offsets.begin(), offsets.end(), - [this, &t, &refcnt] (auto &laddr) { - return this->remove(t, laddr).si_then([&refcnt] (auto ref) { - refcnt.push_back(ref); - return ref_iertr::now(); - }); - }).si_then([&refcnt] { - return ref_iertr::make_ready_future>(std::move(refcnt)); + [this, &t](auto 
&&offsets, auto &refcnts) { + return trans_intr::do_for_each(offsets.begin(), offsets.end(), + [this, &t, &refcnts](auto &laddr) { + return this->remove(t, laddr + ).si_then([&refcnts](auto ref) { + refcnts.push_back(ref); + return ref_iertr::now(); }); + }).si_then([&refcnts] { + return ref_iertr::make_ready_future>(std::move(refcnts)); }); + }); } TransactionManager::submit_transaction_iertr::future<> @@ -340,6 +343,7 @@ TransactionManager::update_lba_mappings( return; } if (extent->is_logical()) { + assert(is_logical_type(extent->get_type())); // for rewritten extents, last_committed_crc should have been set // because the crc of the original extent may be reused. // also see rewrite_logical_extent() @@ -359,6 +363,7 @@ TransactionManager::update_lba_mappings( #endif lextents.emplace_back(extent->template cast()); } else { + assert(is_physical_type(extent->get_type())); pextents.emplace_back(extent); } }; @@ -566,7 +571,8 @@ TransactionManager::rewrite_logical_extent( 0, lextent->get_length(), extent_ref_count_t(0), - [this, lextent, &t](auto &extents, auto &off, auto &left, auto &refcount) { + [this, lextent, &t] + (auto &extents, auto &off, auto &left, auto &refcount) { return trans_intr::do_for_each( extents, [lextent, this, &t, &off, &left, &refcount](auto &nextent) { @@ -665,11 +671,6 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( t.get_rewrite_stats().account_n_dirty(); } - if (is_backref_node(extent->get_type())) { - DEBUGT("rewriting backref extent -- {}", t, *extent); - return backref_manager->rewrite_extent(t, extent); - } - if (is_root_type(extent->get_type())) { DEBUGT("rewriting root extent -- {}", t, *extent); cache->duplicate_for_write(t, extent); @@ -677,8 +678,13 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( } if (extent->is_logical()) { + assert(is_logical_type(extent->get_type())); return rewrite_logical_extent(t, extent->cast()); + } else if (is_backref_node(extent->get_type())) { + DEBUGT("rewriting backref extent -- {}", t, *extent); + return backref_manager->rewrite_extent(t, extent); } else { + assert(is_lba_node(extent->get_type())); DEBUGT("rewriting physical extent -- {}", t, *extent); return lba_manager->rewrite_extent(t, extent); } diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 828b8a25592fc..6d1b010ab69ea 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -215,49 +215,6 @@ class TransactionManager : public ExtentCallbackInterface { }); } - template - std::variant>> - get_extent_if_linked( - Transaction &t, - LBAMappingRef pin) - { - ceph_assert(pin->is_parent_viewable()); - // checking the lba child must be atomic with creating - // and linking the absent child - auto v = pin->get_logical_extent(t); - if (v.has_child()) { - return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) { -#ifndef NDEBUG - auto lextent = extent->template cast(); - auto pin_laddr = pin->get_key(); - if (pin->is_indirect()) { - pin_laddr = pin->get_intermediate_base(); - } - assert(lextent->get_laddr() == pin_laddr); -#endif - return extent->template cast(); - }); - } else { - return pin; - } - } - - base_iertr::future read_pin_by_type( - Transaction &t, - LBAMappingRef pin, - extent_types_t type) - { - ceph_assert(!pin->parent_modified()); - auto v = pin->get_logical_extent(t); - // checking the lba child must be atomic with creating - // and linking the absent child - if 
(v.has_child()) { - return std::move(v.get_child_fut()); - } else { - return pin_to_extent_by_type(t, std::move(pin), type); - } - } - /// Obtain mutable copy of extent LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { LOG_PREFIX(TransactionManager::get_mutable_extent); @@ -282,7 +239,6 @@ class TransactionManager : public ExtentCallbackInterface { return ret; } - using ref_iertr = LBAManager::ref_iertr; using ref_ret = ref_iertr::future; @@ -302,26 +258,15 @@ class TransactionManager : public ExtentCallbackInterface { * remove * * Remove the extent and the corresponding lba mapping, - * users must make sure that lba mapping's refcount is 1 + * users must make sure that lba mapping's refcount > 1 */ ref_ret remove( Transaction &t, LogicalCachedExtentRef &ref); - /** - * remove - * - * 1. Remove the indirect mapping(s), and if refcount drops to 0, - * also remove the direct mapping and retire the extent. - * - * 2. Remove the direct mapping(s) and retire the extent if - * refcount drops to 0. - */ ref_ret remove( Transaction &t, - laddr_t offset) { - return _dec_ref(t, offset); - } + laddr_t offset); /// remove refcount for list of offset using refs_ret = ref_iertr::future>; @@ -411,7 +356,10 @@ class TransactionManager : public ExtentCallbackInterface { } template - read_extent_ret get_mutable_extent_by_laddr(Transaction &t, laddr_t laddr, extent_len_t len) { + read_extent_ret get_mutable_extent_by_laddr( + Transaction &t, + laddr_t laddr, + extent_len_t len) { return get_pin(t, laddr ).si_then([this, &t, len](auto pin) { ceph_assert(pin->is_data_stable() && !pin->is_zero_reserved()); @@ -853,6 +801,49 @@ class TransactionManager : public ExtentCallbackInterface { shard_stats_t& shard_stats; + template + std::variant>> + get_extent_if_linked( + Transaction &t, + LBAMappingRef pin) + { + ceph_assert(pin->is_parent_viewable()); + // checking the lba child must be atomic with creating + // and linking the absent child + auto v = pin->get_logical_extent(t); + if (v.has_child()) { + return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) { +#ifndef NDEBUG + auto lextent = extent->template cast(); + auto pin_laddr = pin->get_key(); + if (pin->is_indirect()) { + pin_laddr = pin->get_intermediate_base(); + } + assert(lextent->get_laddr() == pin_laddr); +#endif + return extent->template cast(); + }); + } else { + return pin; + } + } + + base_iertr::future read_pin_by_type( + Transaction &t, + LBAMappingRef pin, + extent_types_t type) + { + ceph_assert(!pin->parent_modified()); + auto v = pin->get_logical_extent(t); + // checking the lba child must be atomic with creating + // and linking the absent child + if (v.has_child()) { + return std::move(v.get_child_fut()); + } else { + return pin_to_extent_by_type(t, std::move(pin), type); + } + } + rewrite_extent_ret rewrite_logical_extent( Transaction& t, LogicalCachedExtentRef extent); @@ -862,11 +853,6 @@ class TransactionManager : public ExtentCallbackInterface { ExtentPlacementManager::dispatch_result_t dispatch_result, std::optional seq_to_trim = std::nullopt); - /// Remove refcount for offset - ref_ret _dec_ref( - Transaction &t, - laddr_t offset); - using update_lba_mappings_ret = LBAManager::update_mappings_ret; update_lba_mappings_ret update_lba_mappings( Transaction &t, From ec5c6c5761ed1124d12c2e036262e6135fc99a9b Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Tue, 10 Sep 2024 11:55:05 +0800 Subject: [PATCH 066/148] crimson/os/seastore/transaction_manager: refine logs Signed-off-by: 
Yingxin Cheng --- .../lba_manager/btree/btree_lba_manager.h | 8 +- .../os/seastore/transaction_manager.cc | 82 ++++++------ src/crimson/os/seastore/transaction_manager.h | 122 ++++++++++-------- 3 files changed, 122 insertions(+), 90 deletions(-) diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 5d6fa3cb1b170..ef10ff9623b50 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -173,16 +173,22 @@ class BtreeLBAMapping : public BtreeNodeMapping { if (!parent_modified()) { return; } + LOG_PREFIX(BtreeLBAMapping::maybe_fix_pos); auto &p = static_cast(*parent); p.maybe_fix_mapping_pos(*this); + SUBDEBUGT(seastore_lba, "fixed pin {}", + ctx.trans, static_cast(*this)); } LBAMappingRef refresh_with_pending_parent() final { + LOG_PREFIX(BtreeLBAMapping::refresh_with_pending_parent); assert(is_parent_valid() && !is_parent_viewable()); auto &p = static_cast(*parent); auto &viewable_p = static_cast( *p.find_pending_version(ctx.trans, get_key())); - return viewable_p.get_mapping(ctx, get_key()); + auto new_pin = viewable_p.get_mapping(ctx, get_key()); + SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast(*new_pin)); + return new_pin; } protected: std::unique_ptr> _duplicate( diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 0fd23ef6afbbb..f4e3b0858f2f1 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -48,7 +48,7 @@ TransactionManager::TransactionManager( TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() { LOG_PREFIX(TransactionManager::mkfs); - INFO("enter"); + INFO("..."); return epm->mount( ).safe_then([this] { return journal->open_for_mkfs(); @@ -94,7 +94,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() }).safe_then([this] { return close(); }).safe_then([FNAME] { - INFO("completed"); + INFO("done"); }); } @@ -102,7 +102,7 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() { LOG_PREFIX(TransactionManager::mount); - INFO("enter"); + INFO("..."); cache->init(); return epm->mount( ).safe_then([this] { @@ -169,7 +169,7 @@ TransactionManager::mount() return epm->open_for_write(); }).safe_then([FNAME, this] { epm->start_background(); - INFO("completed"); + INFO("done"); }).handle_error( mount_ertr::pass_further{}, crimson::ct_error::assert_all{"unhandled error"} @@ -179,7 +179,7 @@ TransactionManager::mount() TransactionManager::close_ertr::future<> TransactionManager::close() { LOG_PREFIX(TransactionManager::close); - INFO("enter"); + INFO("..."); return epm->stop_background( ).then([this] { return cache->close(); @@ -189,7 +189,7 @@ TransactionManager::close() { }).safe_then([this] { return epm->close(); }).safe_then([FNAME] { - INFO("completed"); + INFO("done"); return seastar::now(); }); } @@ -231,14 +231,14 @@ TransactionManager::ref_ret TransactionManager::remove( LogicalCachedExtentRef &ref) { LOG_PREFIX(TransactionManager::remove); - TRACET("{}", t, *ref); + DEBUGT("{} ...", t, *ref); return lba_manager->decref_extent(t, ref->get_laddr() ).si_then([this, FNAME, &t, ref](auto result) { - DEBUGT("extent refcount is decremented to {} -- {}", - t, result.refcount, *ref); if (result.refcount == 0) { cache->retire_extent(t, ref); } + DEBUGT("removed {}~0x{:x} refcount={} -- {}", + t, result.addr, result.length, 
result.refcount, *ref); return result.refcount; }); } @@ -248,11 +248,9 @@ TransactionManager::ref_ret TransactionManager::remove( laddr_t offset) { LOG_PREFIX(TransactionManager::remove); - TRACET("{}", t, offset); + DEBUGT("{} ...", t, offset); return lba_manager->decref_extent(t, offset ).si_then([this, FNAME, offset, &t](auto result) -> ref_ret { - DEBUGT("extent refcount is decremented to {} -- {}~{}, {}", - t, result.refcount, offset, result.length, result.addr); auto fut = ref_iertr::now(); if (result.refcount == 0) { if (result.addr.is_paddr() && @@ -261,8 +259,9 @@ TransactionManager::ref_ret TransactionManager::remove( t, result.addr.get_paddr(), result.length); } } - - return fut.si_then([result=std::move(result)] { + return fut.si_then([result=std::move(result), offset, &t, FNAME] { + DEBUGT("removed {}~0x{:x} refcount={} -- offset={}", + t, result.addr, result.length, result.refcount, offset); return result.refcount; }); }); @@ -273,9 +272,9 @@ TransactionManager::refs_ret TransactionManager::remove( std::vector offsets) { LOG_PREFIX(TransactionManager::remove); - DEBUG("{} offsets", offsets.size()); + DEBUGT("{} offsets ...", t, offsets.size()); return seastar::do_with(std::move(offsets), std::vector(), - [this, &t](auto &&offsets, auto &refcnts) { + [this, &t, FNAME](auto &&offsets, auto &refcnts) { return trans_intr::do_for_each(offsets.begin(), offsets.end(), [this, &t, &refcnts](auto &laddr) { return this->remove(t, laddr @@ -283,7 +282,8 @@ TransactionManager::refs_ret TransactionManager::remove( refcnts.push_back(ref); return ref_iertr::now(); }); - }).si_then([&refcnts] { + }).si_then([&refcnts, &t, FNAME] { + DEBUGT("removed {} offsets", t, refcnts.size()); return ref_iertr::make_ready_future>(std::move(refcnts)); }); }); @@ -520,7 +520,6 @@ TransactionManager::rewrite_logical_extent( ERRORT("extent has been invalidated -- {}", t, *extent); ceph_abort(); } - TRACET("rewriting extent -- {}", t, *extent); auto lextent = extent->cast(); cache->retire_extent(t, extent); @@ -534,7 +533,7 @@ TransactionManager::rewrite_logical_extent( lextent->get_rewrite_generation())->cast(); nlextent->rewrite(t, *lextent, 0); - DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent); + DEBUGT("rewriting meta -- {} to {}", t, *lextent, *nlextent); #ifndef NDEBUG if (get_checksum_needed(lextent->get_paddr())) { @@ -571,17 +570,16 @@ TransactionManager::rewrite_logical_extent( 0, lextent->get_length(), extent_ref_count_t(0), - [this, lextent, &t] + [this, FNAME, lextent, &t] (auto &extents, auto &off, auto &left, auto &refcount) { return trans_intr::do_for_each( extents, - [lextent, this, &t, &off, &left, &refcount](auto &nextent) { - LOG_PREFIX(TransactionManager::rewrite_logical_extent); + [lextent, this, FNAME, &t, &off, &left, &refcount](auto &nextent) { bool first_extent = (off == 0); ceph_assert(left >= nextent->get_length()); auto nlextent = nextent->template cast(); nlextent->rewrite(t, *lextent, off); - DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent); + DEBUGT("rewriting data -- {} to {}", t, *lextent, *nlextent); /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc * extents since we're going to do it again once we either do the ool write @@ -635,10 +633,18 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( { auto updated = cache->update_extent_from_transaction(t, extent); if (!updated) { - DEBUGT("extent is already retired, skipping -- {}", t, *extent); + DEBUGT("target={} {} already retired, 
skipping -- {}", t, + rewrite_gen_printer_t{target_generation}, + sea_time_point_printer_t{modify_time}, + *extent); return rewrite_extent_iertr::now(); } + extent = updated; + DEBUGT("target={} {} -- {} ...", t, + rewrite_gen_printer_t{target_generation}, + sea_time_point_printer_t{modify_time}, + *extent); ceph_assert(!extent->is_pending_io()); } @@ -656,9 +662,9 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( // FIXME: is_dirty() is true for mutation pending extents // which shouldn't do inplace rewrite because a pending transaction // may fail. - DEBUGT("delta overwriting extent -- {}", t, *extent); t.add_inplace_rewrite_extent(extent); extent->set_inplace_rewrite_generation(); + DEBUGT("rewritten as inplace rewrite -- {}", t, *extent); return rewrite_extent_iertr::now(); } extent->set_target_rewrite_generation(INIT_GENERATION); @@ -672,22 +678,24 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( } if (is_root_type(extent->get_type())) { - DEBUGT("rewriting root extent -- {}", t, *extent); cache->duplicate_for_write(t, extent); + DEBUGT("rewritten root {}", t, *extent); return rewrite_extent_iertr::now(); } + auto fut = rewrite_extent_iertr::now(); if (extent->is_logical()) { assert(is_logical_type(extent->get_type())); - return rewrite_logical_extent(t, extent->cast()); + fut = rewrite_logical_extent(t, extent->cast()); } else if (is_backref_node(extent->get_type())) { - DEBUGT("rewriting backref extent -- {}", t, *extent); - return backref_manager->rewrite_extent(t, extent); + fut = backref_manager->rewrite_extent(t, extent); } else { assert(is_lba_node(extent->get_type())); - DEBUGT("rewriting physical extent -- {}", t, *extent); - return lba_manager->rewrite_extent(t, extent); + fut = lba_manager->rewrite_extent(t, extent); } + return fut.si_then([FNAME, &t] { + DEBUGT("rewritten", t); + }); } TransactionManager::get_extents_if_live_ret @@ -699,7 +707,7 @@ TransactionManager::get_extents_if_live( extent_len_t len) { LOG_PREFIX(TransactionManager::get_extents_if_live); - TRACET("{} {}~{} {}", t, type, laddr, len, paddr); + DEBUGT("{} {}~0x{:x} {} ...", t, type, laddr, len, paddr); // This only works with segments to check if alive, // as parallel transactions may split the extent at the same time. 
@@ -709,7 +717,7 @@ TransactionManager::get_extents_if_live( ).si_then([=, this, &t](auto extent) -> get_extents_if_live_ret { if (extent && extent->get_length() == len) { - DEBUGT("{} {}~{} {} is live in cache -- {}", + DEBUGT("{} {}~0x{:x} {} is cached and alive -- {}", t, type, laddr, len, paddr, *extent); std::list res; res.emplace_back(std::move(extent)); @@ -763,7 +771,9 @@ TransactionManager::get_extents_if_live( list.emplace_back(std::move(ret)); return seastar::now(); }); - }).si_then([&list] { + }).si_then([&list, &t, FNAME, type, laddr, len, paddr] { + DEBUGT("{} {}~0x{:x} {} is alive as {} extents", + t, type, laddr, len, paddr, list.size()); return get_extents_if_live_ret( interruptible::ready_future_marker{}, std::move(list)); @@ -784,11 +794,11 @@ TransactionManager::get_extents_if_live( ).si_then([=, &t](auto ret) { std::list res; if (ret) { - DEBUGT("{} {}~{} {} is live as physical extent -- {}", + DEBUGT("{} {}~0x{:x} {} is absent and alive as physical extent -- {}", t, type, laddr, len, paddr, *ret); res.emplace_back(std::move(ret)); } else { - DEBUGT("{} {}~{} {} is not live as physical extent", + DEBUGT("{} {}~0x{:x} {} is not alive as physical extent", t, type, laddr, len, paddr); } return get_extents_if_live_ret( diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 6d1b010ab69ea..c7a94a9ef1132 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -106,8 +106,12 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, laddr_t offset) { LOG_PREFIX(TransactionManager::get_pin); - SUBTRACET(seastore_tm, "{}", t, offset); - return lba_manager->get_mapping(t, offset); + SUBDEBUGT(seastore_tm, "{} ...", t, offset); + return lba_manager->get_mapping(t, offset + ).si_then([FNAME, &t](LBAMappingRef pin) { + SUBDEBUGT(seastore_tm, "got {}", t, *pin); + return pin; + }); } /** @@ -122,9 +126,13 @@ class TransactionManager : public ExtentCallbackInterface { laddr_t offset, extent_len_t length) { LOG_PREFIX(TransactionManager::get_pins); - SUBDEBUGT(seastore_tm, "{}~{}", t, offset, length); + SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, offset, length); return lba_manager->get_mappings( - t, offset, length); + t, offset, length + ).si_then([FNAME, &t](lba_pin_list_t pins) { + SUBDEBUGT(seastore_tm, "got {} pins", t, pins.size()); + return pins; + }); } /** @@ -142,15 +150,15 @@ class TransactionManager : public ExtentCallbackInterface { laddr_t offset, extent_len_t length) { LOG_PREFIX(TransactionManager::read_extent); - SUBTRACET(seastore_tm, "{}~{}", t, offset, length); + SUBDEBUGT(seastore_tm, "{}~0x{:x} {} ...", + t, offset, length, T::TYPE); return get_pin( t, offset ).si_then([this, FNAME, &t, offset, length] (auto pin) -> read_extent_ret { if (length != pin->get_length() || !pin->get_val().is_real()) { - SUBERRORT(seastore_tm, - "offset {} len {} got wrong pin {}", - t, offset, length, *pin); + SUBERRORT(seastore_tm, "{}~0x{:x} {} got wrong {}", + t, offset, length, T::TYPE, *pin); ceph_assert(0 == "Should be impossible"); } return this->read_pin(t, std::move(pin)); @@ -167,15 +175,15 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, laddr_t offset) { LOG_PREFIX(TransactionManager::read_extent); - SUBTRACET(seastore_tm, "{}", t, offset); + SUBDEBUGT(seastore_tm, "{} {} ...", + t, offset, T::TYPE); return get_pin( t, offset ).si_then([this, FNAME, &t, offset] (auto pin) -> read_extent_ret { if 
(!pin->get_val().is_real()) { - SUBERRORT(seastore_tm, - "offset {} got wrong pin {}", - t, offset, *pin); + SUBERRORT(seastore_tm, "{} {} got wrong {}", + t, offset, T::TYPE, *pin); ceph_assert(0 == "Should be impossible"); } return this->read_pin(t, std::move(pin)); @@ -187,6 +195,8 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, LBAMappingRef pin) { + LOG_PREFIX(TransactionManager::read_pin); + SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin); auto fut = base_iertr::make_ready_future(); if (!pin->is_parent_viewable()) { if (pin->is_parent_valid()) { @@ -212,6 +222,9 @@ class TransactionManager : public ExtentCallbackInterface { } else { return this->pin_to_extent(t, std::move(std::get<0>(ret))); } + }).si_then([FNAME, &t](TCachedExtentRef ext) { + SUBDEBUGT(seastore_tm, "got {}", t, *ext); + return ext; }); } @@ -222,17 +235,9 @@ class TransactionManager : public ExtentCallbackInterface { t, ref)->cast(); if (!ret->has_laddr()) { - SUBDEBUGT(seastore_tm, - "duplicating extent for write -- {} -> {}", - t, - *ref, - *ret); + SUBDEBUGT(seastore_tm, "duplicate from {}", t, *ref); ret->set_laddr(ref->get_laddr()); } else { - SUBTRACET(seastore_tm, - "extent is already duplicated -- {}", - t, - *ref); assert(ref->is_mutable()); assert(&*ref == &*ret); } @@ -291,23 +296,23 @@ class TransactionManager : public ExtentCallbackInterface { extent_len_t len, placement_hint_t placement_hint = placement_hint_t::HOT) { LOG_PREFIX(TransactionManager::alloc_non_data_extent); - SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}", - t, T::TYPE, len, placement_hint, laddr_hint); + SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...", + t, T::TYPE, laddr_hint, len, placement_hint); auto ext = cache->alloc_new_non_data_extent( t, len, placement_hint, INIT_GENERATION); if (!ext) { + SUBERRORT(seastore_tm, "insufficient space!", t); return crimson::ct_error::enospc::make(); } return lba_manager->alloc_extent( t, laddr_hint, *ext - ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable { - LOG_PREFIX(TransactionManager::alloc_non_data_extent); - SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint); + ).si_then([ext=std::move(ext), &t, FNAME](auto &&) mutable { + SUBDEBUGT(seastore_tm, "allocated {}", t, *ext); return alloc_extent_iertr::make_ready_future>( std::move(ext)); }); @@ -330,14 +335,15 @@ class TransactionManager : public ExtentCallbackInterface { extent_len_t len, placement_hint_t placement_hint = placement_hint_t::HOT) { LOG_PREFIX(TransactionManager::alloc_data_extents); - SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}", - t, T::TYPE, len, placement_hint, laddr_hint); + SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...", + t, T::TYPE, laddr_hint, len, placement_hint); auto exts = cache->alloc_new_data_extents( t, len, placement_hint, INIT_GENERATION); if (exts.empty()) { + SUBERRORT(seastore_tm, "insufficient space!", t); return crimson::ct_error::enospc::make(); } return lba_manager->alloc_extents( @@ -348,7 +354,7 @@ class TransactionManager : public ExtentCallbackInterface { EXTENT_DEFAULT_REF_COUNT ).si_then([exts=std::move(exts), &t, FNAME](auto &&) mutable { for (auto &ext : exts) { - SUBDEBUGT(seastore_tm, "new extent: {}", t, *ext); + SUBDEBUGT(seastore_tm, "allocated {}", t, *ext); } return alloc_extent_iertr::make_ready_future< std::vector>>(std::move(exts)); @@ -360,14 +366,17 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, laddr_t laddr, 
extent_len_t len) { + LOG_PREFIX(TransactionManager::get_mutable_extent_by_laddr); + SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, laddr, len); return get_pin(t, laddr ).si_then([this, &t, len](auto pin) { ceph_assert(pin->is_data_stable() && !pin->is_zero_reserved()); ceph_assert(!pin->is_clone()); ceph_assert(pin->get_length() == len); return this->read_pin(t, std::move(pin)); - }).si_then([this, &t](auto extent) { + }).si_then([this, &t, FNAME](auto extent) { auto ext = get_mutable_extent(t, extent)->template cast(); + SUBDEBUGT(seastore_tm, "got mutable {}", t, *ext); return read_extent_iertr::make_ready_future>( std::move(ext)); }); @@ -424,10 +433,8 @@ class TransactionManager : public ExtentCallbackInterface { extent_len_t original_len = pin->get_length(); paddr_t original_paddr = pin->get_val(); LOG_PREFIX(TransactionManager::remap_pin); - SUBDEBUGT(seastore_tm, - "original laddr: {}, original paddr: {}, original length: {}," - " remap to {} extents", - t, original_laddr, original_paddr, original_len, remaps.size()); + SUBDEBUGT(seastore_tm, "{}~0x{:x} {} into {} remaps ... {}", + t, original_laddr, original_len, original_paddr, remaps.size(), *pin); // The according extent might be stable or pending. auto fut = base_iertr::now(); if (!pin->is_indirect()) { @@ -484,14 +491,13 @@ class TransactionManager : public ExtentCallbackInterface { auto remap_len = remap.len; auto remap_laddr = (original_laddr + remap_offset).checked_to_laddr(); auto remap_paddr = original_paddr.add_offset(remap_offset); + SUBDEBUGT(seastore_tm, "remap direct pin into {}~0x{:x} {} ...", + t, remap_laddr, remap_len, remap_paddr); ceph_assert(remap_len < original_len); ceph_assert(remap_offset + remap_len <= original_len); ceph_assert(remap_len != 0); ceph_assert(remap_offset % cache->get_block_size() == 0); ceph_assert(remap_len % cache->get_block_size() == 0); - SUBDEBUGT(seastore_tm, - "remap laddr: {}, remap paddr: {}, remap length: {}", t, - remap_laddr, remap_paddr, remap_len); auto extent = cache->alloc_remapped_extent( t, remap_laddr, @@ -503,13 +509,15 @@ class TransactionManager : public ExtentCallbackInterface { } }); } - return fut.si_then([this, &t, &pin, &remaps, &extents] { + return fut.si_then([this, &t, &pin, &remaps, &extents, FNAME] { return lba_manager->remap_mappings( t, std::move(pin), std::vector(remaps.begin(), remaps.end()), std::move(extents) - ).si_then([](auto ret) { + ).si_then([FNAME, &t](auto ret) { + SUBDEBUGT(seastore_tm, "remapped {} pins", + t, ret.remapped_mappings.size()); return Cache::retire_extent_iertr::make_ready_future< std::vector>(std::move(ret.remapped_mappings)); }); @@ -529,11 +537,15 @@ class TransactionManager : public ExtentCallbackInterface { laddr_t hint, extent_len_t len) { LOG_PREFIX(TransactionManager::reserve_region); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint); + SUBDEBUGT(seastore_tm, "hint {}~0x{:x} ...", t, hint, len); return lba_manager->reserve_region( t, hint, - len); + len + ).si_then([FNAME, &t](auto pin) { + SUBDEBUGT(seastore_tm, "reserved {}", t, *pin); + return pin; + }); } /* @@ -560,15 +572,17 @@ class TransactionManager : public ExtentCallbackInterface { : mapping.get_key(); LOG_PREFIX(TransactionManager::clone_pin); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}", - t, mapping.get_length(), hint, intermediate_key); + SUBDEBUGT(seastore_tm, "{} clone to hint {} ...", t, mapping, hint); return lba_manager->clone_mapping( t, hint, mapping.get_length(), intermediate_key, intermediate_base - ); + 
).si_then([FNAME, &t](auto pin) { + SUBDEBUGT(seastore_tm, "cloned as {}", t, *pin); + return pin; + }); } /* alloc_extents @@ -583,10 +597,10 @@ class TransactionManager : public ExtentCallbackInterface { extent_len_t len, int num) { LOG_PREFIX(TransactionManager::alloc_extents); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, num={}", - t, len, hint, num); + SUBDEBUGT(seastore_tm, "hint {}~({} * 0x{:x}) ...", + t, hint, num, len); return seastar::do_with(std::vector>(), - [this, &t, hint, len, num] (auto &extents) { + [this, &t, hint, len, num, FNAME](auto &extents) { return trans_intr::do_for_each( boost::make_counting_iterator(0), boost::make_counting_iterator(num), @@ -595,7 +609,8 @@ class TransactionManager : public ExtentCallbackInterface { [&extents](auto &&node) { extents.push_back(node); }); - }).si_then([&extents] { + }).si_then([&extents, &t, FNAME] { + SUBDEBUGT(seastore_tm, "allocated {} extents", t, extents.size()); return alloc_extents_iertr::make_ready_future >>(std::move(extents)); }); @@ -701,7 +716,7 @@ class TransactionManager : public ExtentCallbackInterface { const std::string& key, const std::string& value) { LOG_PREFIX(TransactionManager::update_root_meta); - SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {}", t, key, value); + SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value); return cache->get_root( t ).si_then([this, &t, &key, &value](RootBlockRef root) { @@ -756,7 +771,7 @@ class TransactionManager : public ExtentCallbackInterface { return cache->get_root(t).si_then([&t](auto croot) { LOG_PREFIX(TransactionManager::read_collection_root); auto ret = croot->get_root().collection_root.get(); - SUBTRACET(seastore_tm, "{}~{}", + SUBTRACET(seastore_tm, "{}~0x{:x}", t, ret.get_location(), ret.get_size()); return ret; }); @@ -769,7 +784,7 @@ class TransactionManager : public ExtentCallbackInterface { */ void write_collection_root(Transaction &t, coll_root_t cmroot) { LOG_PREFIX(TransactionManager::write_collection_root); - SUBDEBUGT(seastore_tm, "{}~{}", + SUBDEBUGT(seastore_tm, "{}~0x{:x}", t, cmroot.get_location(), cmroot.get_size()); auto croot = cache->get_root_fast(t); croot = cache->duplicate_for_write(t, croot)->cast(); @@ -872,7 +887,7 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, LBAMappingRef pin) { LOG_PREFIX(TransactionManager::pin_to_extent); - SUBTRACET(seastore_tm, "getting extent {}", t, *pin); + SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin); static_assert(is_logical_type(T::TYPE)); using ret = pin_to_extent_ret; auto &pref = *pin; @@ -936,7 +951,8 @@ class TransactionManager : public ExtentCallbackInterface { extent_types_t type) { LOG_PREFIX(TransactionManager::pin_to_extent_by_type); - SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type); + SUBTRACET(seastore_tm, "getting absent extent from pin {} type {} ...", + t, *pin, type); assert(is_logical_type(type)); auto &pref = *pin; return cache->get_absent_extent_by_type( From 97e68b20aa3bf0d54ca0e10d0e7c9003adb61eb5 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 3 Sep 2024 16:25:25 +0800 Subject: [PATCH 067/148] crimson/osd/backfill_state: add the object to be pushed in the peer missing set of PeeringState Fixes: https://tracker.ceph.com/issues/67874 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_facades.h | 6 ++++++ src/crimson/osd/backfill_state.cc | 27 +++++++++++++++++++++------ src/crimson/osd/backfill_state.h | 7 ++++++- src/crimson/osd/pg_recovery.cc | 8 +++++--- src/crimson/osd/pg_recovery.h | 3 
++- src/test/crimson/test_backfill.cc | 10 ++++++++-- 6 files changed, 48 insertions(+), 13 deletions(-) diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h index 683dc6ea64948..522a93a1ddcbe 100644 --- a/src/crimson/osd/backfill_facades.h +++ b/src/crimson/osd/backfill_facades.h @@ -52,6 +52,12 @@ struct PeeringFacade final : BackfillState::PeeringFacade { return peering_state.is_backfilling(); } + void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const std::vector &peers) override { + return peering_state.prepare_backfill_for_missing(soid, v, peers); + } PeeringFacade(PeeringState& peering_state) : peering_state(peering_state) { } diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index d015a77545cf4..5e4687877c2cd 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -251,6 +251,7 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) logger().debug("{}: check={}", __func__, check); const auto& primary_bi = backfill_state().backfill_info; result_t result { {}, primary_bi.begin }; + std::map>> backfills; for (const auto& bt : peering_state().get_backfill_targets()) { const auto& peer_bi = backfill_state().peer_backfill_info.at(bt); @@ -258,9 +259,13 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) // Find all check peers that have the wrong version if (const eversion_t& obj_v = primary_bi.objects.begin()->second; check == primary_bi.begin && check == peer_bi.begin) { - if(peer_bi.objects.begin()->second != obj_v && - backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { - backfill_listener().enqueue_push(primary_bi.begin, obj_v); + if (peer_bi.objects.begin()->second != obj_v) { + std::ignore = backfill_state().progress_tracker->enqueue_push( + primary_bi.begin); + auto &[v, peers] = backfills[primary_bi.begin]; + assert(v == obj_v || v == eversion_t()); + v = obj_v; + peers.push_back(bt); } else { // it's fine, keep it! OR already recovering } @@ -269,12 +274,22 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) // Only include peers that we've caught up to their backfill line // otherwise, they only appear to be missing this object // because their peer_bi.begin > backfill_info.begin. 
- if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) && - backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { - backfill_listener().enqueue_push(primary_bi.begin, obj_v); + if (primary_bi.begin > peering_state().get_peer_last_backfill(bt)) { + std::ignore = backfill_state().progress_tracker->enqueue_push( + primary_bi.begin); + auto &[v, peers] = backfills[primary_bi.begin]; + assert(v == obj_v || v == eversion_t()); + v = obj_v; + peers.push_back(bt); } } } + for (auto &backfill : backfills) { + auto &soid = backfill.first; + auto &obj_v = backfill.second.first; + auto &peers = backfill.second.second; + backfill_listener().enqueue_push(soid, obj_v, peers); + } return result; } diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 4cdd4daafce6d..da88b611fcf9b 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -315,7 +315,8 @@ struct BackfillState::BackfillListener { virtual void enqueue_push( const hobject_t& obj, - const eversion_t& v) = 0; + const eversion_t& v, + const std::vector &peers) = 0; virtual void enqueue_drop( const pg_shard_t& target, @@ -354,6 +355,10 @@ struct BackfillState::PeeringFacade { virtual void update_complete_backfill_object_stats(const hobject_t &hoid, const pg_stat_t &stats) = 0; virtual bool is_backfilling() const = 0; + virtual void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const std::vector &peers) = 0; virtual ~PeeringFacade() {} }; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index f4a7d8a63db9f..55d64925ec5c0 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -520,10 +520,12 @@ void PGRecovery::request_primary_scan( void PGRecovery::enqueue_push( const hobject_t& obj, - const eversion_t& v) + const eversion_t& v, + const std::vector &peers) { - logger().info("{}: obj={} v={}", - __func__, obj, v); + logger().info("{}: obj={} v={} peers={}", __func__, obj, v, peers); + auto &peering_state = pg->get_peering_state(); + peering_state.prepare_backfill_for_missing(obj, v, peers); auto [recovering, added] = pg->get_recovery_backend()->add_recovering(obj); if (!added) return; diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index f5b8632a38263..eb9c928fe5db6 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -106,7 +106,8 @@ class PGRecovery : public crimson::osd::BackfillState::BackfillListener { const hobject_t& begin) final; void enqueue_push( const hobject_t& obj, - const eversion_t& v) final; + const eversion_t& v, + const std::vector &peers) final; void enqueue_drop( const pg_shard_t& target, const hobject_t& obj, diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 1ce9b42ad381f..30aef449278ba 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -128,7 +128,8 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener { void enqueue_push( const hobject_t& obj, - const eversion_t& v) override; + const eversion_t& v, + const std::vector &peers) override; void enqueue_drop( const pg_shard_t& target, @@ -222,6 +223,10 @@ struct BackfillFixture::PeeringFacade void update_complete_backfill_object_stats(const hobject_t &hoid, const pg_stat_t &stats) override { } + void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const std::vector &peers) override {} bool is_backfilling() const 
override { return true; } @@ -282,7 +287,8 @@ void BackfillFixture::request_primary_scan( void BackfillFixture::enqueue_push( const hobject_t& obj, - const eversion_t& v) + const eversion_t& v, + const std::vector &) { for (auto& [ _, bt ] : backfill_targets) { bt.store.push(obj, v); From 5b90117348d030db16738ae06e9308ade4355bb0 Mon Sep 17 00:00:00 2001 From: JonBailey1993 Date: Tue, 8 Oct 2024 09:43:10 +0100 Subject: [PATCH 068/148] common/io_exerciser: Add version argument to callbacks in ceph_radios_io_sequence Add new version that was missing from ceph_test_rados_io_sequence callbacks due to interface changes Signed-off-by: Jon Bailey --- src/common/io_exerciser/RadosIo.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc index 3f907ccf47416..41be2885f3f67 100644 --- a/src/common/io_exerciser/RadosIo.cc +++ b/src/common/io_exerciser/RadosIo.cc @@ -118,7 +118,8 @@ void RadosIo::applyIoOp(IoOp &op) op_info = std::make_shared(0, op.length1); op_info->bl1 = db->generate_data(0, op.length1); op_info->wop.write_full(op_info->bl1); - auto create_cb = [this] (boost::system::error_code ec) { + auto create_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; @@ -132,7 +133,8 @@ void RadosIo::applyIoOp(IoOp &op) start_io(); op_info = std::make_shared(); op_info->wop.remove(); - auto remove_cb = [this] (boost::system::error_code ec) { + auto remove_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; @@ -148,7 +150,9 @@ void RadosIo::applyIoOp(IoOp &op) op_info->rop.read(op.offset1 * block_size, op.length1 * block_size, &op_info->bl1, nullptr); - auto read_cb = [this, op_info] (boost::system::error_code ec, bufferlist bl) { + auto read_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, + bufferlist bl) { ceph_assert(ec == boost::system::errc::success); db->validate(op_info->bl1, op_info->offset1, op_info->length1); finish_io(); @@ -174,6 +178,7 @@ void RadosIo::applyIoOp(IoOp &op) op.length2 * block_size, &op_info->bl2, nullptr); auto read2_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, bufferlist bl) { ceph_assert(ec == boost::system::errc::success); db->validate(op_info->bl1, op_info->offset1, op_info->length1); @@ -202,6 +207,7 @@ void RadosIo::applyIoOp(IoOp &op) op.length3 * block_size, &op_info->bl3, nullptr); auto read3_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, bufferlist bl) { ceph_assert(ec == boost::system::errc::success); db->validate(op_info->bl1, op_info->offset1, op_info->length1); @@ -222,7 +228,8 @@ void RadosIo::applyIoOp(IoOp &op) op_info->bl1 = db->generate_data(op.offset1, op.length1); op_info->wop.write(op.offset1 * block_size, op_info->bl1); - auto write_cb = [this] (boost::system::error_code ec) { + auto write_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; @@ -241,7 +248,8 @@ void RadosIo::applyIoOp(IoOp &op) op_info->bl2 = db->generate_data(op.offset2, op.length2); op_info->wop.write(op.offset1 * block_size, op_info->bl1); op_info->wop.write(op.offset2 * block_size, op_info->bl2); - auto write2_cb = [this] (boost::system::error_code ec) { + auto write2_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); 
}; @@ -263,7 +271,8 @@ void RadosIo::applyIoOp(IoOp &op) op_info->wop.write(op.offset1 * block_size, op_info->bl1); op_info->wop.write(op.offset2 * block_size, op_info->bl2); op_info->wop.write(op.offset3 * block_size, op_info->bl3); - auto write3_cb = [this] (boost::system::error_code ec) { + auto write3_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; From 256b20de486337dde92fcb2067e0351ea6e67f54 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 26 Sep 2024 20:39:40 -0400 Subject: [PATCH 069/148] qa: do not fail cephfs QA tests for slow bluestore ops Fixes: https://tracker.ceph.com/issues/68283 Signed-off-by: Patrick Donnelly --- qa/cephfs/overrides/ignorelist_health.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml index 678548fe2cc22..94b4257977759 100644 --- a/qa/cephfs/overrides/ignorelist_health.yaml +++ b/qa/cephfs/overrides/ignorelist_health.yaml @@ -21,3 +21,6 @@ overrides: - overall HEALTH_ - Replacing daemon - deprecated feature inline_data + - BLUESTORE_SLOW_OP_ALERT + - slow operation indications in BlueStore + - experiencing slow operations in BlueStore From 9fc65f160cd3764a68fb3697d067c358761fc837 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Mon, 7 Oct 2024 11:45:41 +0000 Subject: [PATCH 070/148] os/bluestore: Make truncate() drop unused allocations Now when truncate() drops unused allocations. Modified Close() in BlueRocksEnv to unconditionally call truncate. Fixes: https://tracker.ceph.com/issues/68385 Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueFS.cc | 65 +++++++++++++++++++++++++------- src/os/bluestore/BlueRocksEnv.cc | 14 ++----- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 3dcd96830c48d..5f4f1a4d48ac2 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -3760,15 +3760,16 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ { auto t0 = mono_clock::now(); std::lock_guard hl(h->lock); + auto& fnode = h->file->fnode; dout(10) << __func__ << " 0x" << std::hex << offset << std::dec - << " file " << h->file->fnode << dendl; + << " file " << fnode << dendl; if (h->file->deleted) { dout(10) << __func__ << " deleted, no-op" << dendl; return 0; } // we never truncate internal log files - ceph_assert(h->file->fnode.ino > 1); + ceph_assert(fnode.ino > 1); // truncate off unflushed data? if (h->pos < offset && @@ -3782,20 +3783,58 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ if (r < 0) return r; } - if (offset == h->file->fnode.size) { - return 0; // no-op! - } - if (offset > h->file->fnode.size) { + if (offset > fnode.size) { ceph_abort_msg("truncate up not supported"); } - ceph_assert(h->file->fnode.size >= offset); + ceph_assert(offset <= fnode.size); _flush_bdev(h); - - std::lock_guard ll(log.lock); - vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset); - h->file->fnode.size = offset; - h->file->is_dirty = true; - log.t.op_file_update_inc(h->file->fnode); + { + std::lock_guard ll(log.lock); + std::lock_guard dl(dirty.lock); + bool changed_extents = false; + vselector->sub_usage(h->file->vselector_hint, fnode); + uint64_t x_off = 0; + auto p = fnode.seek(offset, &x_off); + uint64_t cut_off = + (p == fnode.extents.end()) ? 
0 : p2roundup(x_off, alloc_size[p->bdev]); + uint64_t new_allocated; + if (0 == cut_off) { + // whole pextent to remove + changed_extents = true; + new_allocated = offset; + } else if (cut_off < p->length) { + dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off); + new_allocated = (offset - x_off) + cut_off; + p->length = cut_off; + changed_extents = true; + ++p; + } else { + ceph_assert(cut_off >= p->length); + new_allocated = (offset - x_off) + p->length; + // just leave it here + ++p; + } + while (p != fnode.extents.end()) { + dirty.pending_release[p->bdev].insert(p->offset, p->length); + p = fnode.extents.erase(p); + changed_extents = true; + } + if (changed_extents) { + fnode.size = offset; + fnode.allocated = new_allocated; + fnode.reset_delta(); + log.t.op_file_update(fnode); + // sad, but is_dirty must be set to signal flushing of the log + h->file->is_dirty = true; + } else { + if (offset != fnode.size) { + fnode.size = offset; + //skipping log.t.op_file_update_inc, it will be done by flush() + h->file->is_dirty = true; + } + } + vselector->add_usage(h->file->vselector_hint, fnode); + } logger->tinc(l_bluefs_truncate_lat, mono_clock::now() - t0); return 0; } diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index 68040af428280..7cbe0a1d12146 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -221,18 +221,12 @@ class BlueRocksWritableFile : public rocksdb::WritableFile { } rocksdb::Status Close() override { - fs->fsync(h); - // mimic posix env, here. shrug. - size_t block_size; - size_t last_allocated_block; - GetPreallocationStatus(&block_size, &last_allocated_block); - if (last_allocated_block > 0) { - int r = fs->truncate(h, h->pos); - if (r < 0) - return err_to_status(r); + int r = fs->truncate(h, h->pos); + if (r < 0) { + return err_to_status(r); } - + fs->fsync(h); return rocksdb::Status::OK(); } From 512eea1af52126e9b082744ee7b870c12c23d55d Mon Sep 17 00:00:00 2001 From: Ernesto Puerta <37327689+epuertat@users.noreply.github.com> Date: Wed, 9 Oct 2024 08:12:42 +0200 Subject: [PATCH 071/148] doc: fix email Signed-off-by: Ernesto Puerta <37327689+epuertat@users.noreply.github.com> --- doc/governance.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/governance.rst b/doc/governance.rst index 3105a917f1b01..284a9570397c3 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -82,7 +82,7 @@ Current Members * Casey Bodley * Dan van der Ster * David Orman - * Ernesto Puerta + * Ernesto Puerta * Gregory Farnum * Haomai Wang * Ilya Dryomov From 1b535c011f1d0e50a149e7195d4b50af28c01800 Mon Sep 17 00:00:00 2001 From: Jos Collin Date: Wed, 9 Oct 2024 15:46:00 +0530 Subject: [PATCH 072/148] doc: update 'header get' output in cephfs-journal-tool.rst Signed-off-by: Jos Collin --- doc/cephfs/cephfs-journal-tool.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/cephfs/cephfs-journal-tool.rst b/doc/cephfs/cephfs-journal-tool.rst index 4ad7304481f7f..3ae1139ceac2c 100644 --- a/doc/cephfs/cephfs-journal-tool.rst +++ b/doc/cephfs/cephfs-journal-tool.rst @@ -105,12 +105,12 @@ Example: header get/set "write_pos": 4274947, "expire_pos": 4194304, "trimmed_pos": 4194303, + "stream_format": 1, "layout": { "stripe_unit": 4194304, - "stripe_count": 4194304, + "stripe_count": 1, "object_size": 4194304, - "cas_hash": 4194304, - "object_stripe_unit": 4194304, - "pg_pool": 4194304}} + "pool_id": 2, + "pool_ns": ""}} # cephfs-journal-tool header set 
trimmed_pos 4194303 Updating trimmed_pos 0x400000 -> 0x3fffff From a0c51d0e7f05e84411e3877b5861f3eec26ad934 Mon Sep 17 00:00:00 2001 From: JonBailey1993 Date: Wed, 9 Oct 2024 11:28:42 +0100 Subject: [PATCH 073/148] common/io_exerciser: Modify is_locked_by_me call in ceph_test_rados_io_sequence is_locked_by_me() is a function of ceph::mutex which is only used in debug builds. By using the ceph_mutex_is_locked_by_me macro, we can neatly make sure we only run this function in debug mode, allowing compilation to no longer be affected when running in release mode. Signed-off-by: Jon Bailey --- src/common/io_exerciser/RadosIo.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc index 41be2885f3f67..a28a1e2f488b4 100644 --- a/src/common/io_exerciser/RadosIo.cc +++ b/src/common/io_exerciser/RadosIo.cc @@ -81,7 +81,7 @@ RadosIo::AsyncOpInfo::AsyncOpInfo(uint64_t offset1, uint64_t length1, bool RadosIo::readyForIoOp(IoOp &op) { - ceph_assert(lock.is_locked_by_me()); //Must be called with lock held + ceph_assert(ceph_mutex_is_locked_by_me(lock)); //Must be called with lock held if (!om->readyForIoOp(op)) { return false; } From 78ae3b13509b5cc053b1f5831f0f6a675e99975b Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Wed, 9 Oct 2024 18:31:11 +0530 Subject: [PATCH 074/148] qa/cephfs: update earmark values to valid ones in test_volumes.py smb.test is an invalid earmark now it should be either smb or smb.cluster.. Update the test_volumes.py to set valid earmarks wherever used. Fixes: https://tracker.ceph.com/issues/68448 Signed-off-by: Avan Thakkar --- qa/tasks/cephfs/test_volumes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py index 2baefd72c3fbc..9ca85ee67f9e3 100644 --- a/qa/tasks/cephfs/test_volumes.py +++ b/qa/tasks/cephfs/test_volumes.py @@ -2388,7 +2388,7 @@ def test_subvolume_set_and_get_earmark(self): self._fs_cmd("subvolume", "create", self.volname, subvolume) # set earmark - earmark = "smb.test" + earmark = "smb" self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) # get earmark @@ -2401,7 +2401,7 @@ def test_subvolume_clear_earmark(self): self._fs_cmd("subvolume", "create", self.volname, subvolume) # set earmark - earmark = "smb.test" + earmark = "smb" self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) # remove earmark @@ -2559,7 +2559,7 @@ def test_subvolume_info(self): self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature)) # set earmark - earmark = "smb.test" + earmark = "smb" self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) From d513cc527ca5260ffb957c739cb84f8d3474d728 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Wed, 9 Oct 2024 16:17:04 +0530 Subject: [PATCH 075/148] labeler: add nvmeof labelers Signed-off-by: Vallari Agrawal --- .github/labeler.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index 1b50ff7c5a391..9f2ed1e479019 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -263,6 +263,19 @@ rbd: - systemd/rbdmap.service.in - udev/50-rbd.rules +nvmeof: + - qa/suites/nvmeof/** + - qa/tasks/nvmeof.py + - qa/workunits/nvmeof/** + - src/ceph_nvmeof_monitor_client.cc + - 
src/cephadm/cephadmlib/daemons/nvmeof.py + - src/messages/MNVMeofGw* + - src/mon/NVMeofGw* + - src/nvmeof/** + - src/pybind/mgr/cephadm/services/nvmeof.py + - src/pybind/mgr/cephadm/templates/services/nvmeof/** + - src/tools/ceph-dencoder/nvmeof* + rgw: - qa/suites/rgw/** - qa/tasks/rgw* From 6c419323584103aefed847d8af0a261eda938c29 Mon Sep 17 00:00:00 2001 From: Anoop C S Date: Thu, 10 Oct 2024 11:00:03 +0530 Subject: [PATCH 076/148] cephadm/smb: Add NET_RAW capability to deploy ctdbd CTDB heavily depends on raw sockets to send gratuitous ARPs[1](see the second point from the list of reasons to use gratuitous ARPs). As per the current design it is also inevitable while sending tickle ACKs[2] in the event of an IP failover. man capabilities(7)[3] further mandates CAP_NET_RAW to use raw sockets. Therefore append NET_RAW to the list of capabilties while deploying ctdbd containers. [1] https://wiki.wireshark.org/Gratuitous_ARP [2] https://ctdb.samba.org/manpages/ctdb.1.html [3] https://www.man7.org/linux/man-pages/man7/capabilities.7.html Signed-off-by: Anoop C S --- src/cephadm/cephadmlib/daemons/smb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py index 74cb13f4ab022..a0e648e857cbf 100644 --- a/src/cephadm/cephadmlib/daemons/smb.py +++ b/src/cephadm/cephadmlib/daemons/smb.py @@ -370,6 +370,8 @@ def container_args(self) -> List[str]: # make conditional? # CAP_NET_ADMIN is needed for event script to add public ips to iface cargs.append('--cap-add=NET_ADMIN') + # CAP_NET_RAW allows to send gratuitous ARPs/tickle ACKs via raw sockets + cargs.append('--cap-add=NET_RAW') return cargs From 0bab553399c2c407140f8223f22afb59d5819a10 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Tue, 1 Oct 2024 16:36:21 +0530 Subject: [PATCH 077/148] qa/suites/nvmeof: use "latest" image of gateway and cli Change nvmeof gateway and cli image from 1.2 to "latest". 
Signed-off-by: Vallari Agrawal --- qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml | 4 ++-- qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml | 4 ++-- qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml | 4 ++-- .../thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml index 2e4741e814079..7c97edae552da 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml index 2e873a04bab2a..9ef3700442717 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml index 83d16e4cb2c9a..12cb50b408d49 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml index 6db0c0d4e1829..b4755a6433b0a 100644 --- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml +++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: 
quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 # each subsystem - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 From 303f18b1aff8274d79b1d5e7d84ee3096e4694a1 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Wed, 9 Oct 2024 12:57:32 +0530 Subject: [PATCH 078/148] qa/workunits/nvmeof/setup_subsystem.sh: use --no-group-append In newer versions of the nvmeof cli, "subsystem add" needs this flag to ensure the subsystem name is exactly the value of --subsystem. Otherwise, in newer cli versions, the gateway group is appended at the end of the subsystem name. This fixes the teuthology nvmeof suite (currently all jobs fail because of this). Signed-off-by: Vallari Agrawal --- qa/workunits/nvmeof/setup_subsystem.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/workunits/nvmeof/setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh index fb72e1d6402dd..cc4024323eb87 100755 --- a/qa/workunits/nvmeof/setup_subsystem.sh +++ b/qa/workunits/nvmeof/setup_subsystem.sh @@ -29,7 +29,7 @@ list_subsystems () { # add all subsystems for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" - sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append done list_subsystems From 074dee9cbd63df7529977fc969e6e333fd4312eb Mon Sep 17 00:00:00 2001 From: Lee Sanders Date: Fri, 4 Oct 2024 15:13:57 +0100 Subject: [PATCH 079/148] qa/suites/tasks/cbt.py: Deprecating cosbench from Teuthology in preparation for deletion of cosbench support from CBT. The code being deleted is infrastructure code; no qa test suite uses this function, so it can be safely deleted.
Signed-off-by: Lee Sanders --- qa/tasks/cbt.py | 131 +----------------------------------------------- 1 file changed, 2 insertions(+), 129 deletions(-) diff --git a/qa/tasks/cbt.py b/qa/tasks/cbt.py index 84e096520b40f..e6a9dc8223cf8 100644 --- a/qa/tasks/cbt.py +++ b/qa/tasks/cbt.py @@ -47,22 +47,11 @@ def generate_cbt_config(self): benchmark_config = self.config.get('benchmarks') benchmark_type = next(iter(benchmark_config.keys())) + if benchmark_type in ['librbdfio', 'fio']: testdir = misc.get_testdir(self.ctx) benchmark_config[benchmark_type]['cmd_path'] = os.path.join(testdir, 'fio/fio') - if benchmark_type == 'cosbench': - # create cosbench_dir and cosbench_xml_dir - testdir = misc.get_testdir(self.ctx) - benchmark_config['cosbench']['cosbench_dir'] = os.path.join(testdir, 'cos') - benchmark_config['cosbench']['cosbench_xml_dir'] = os.path.join(testdir, 'xml') - self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', benchmark_config['cosbench']['cosbench_xml_dir']]) - benchmark_config['cosbench']['controller'] = osd_hosts[0] - - # set auth details - remotes_and_roles = self.ctx.cluster.remotes.items() - ips = [host for (host, port) in - (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)] - benchmark_config['cosbench']['auth'] = "username=cosbench:operator;password=intel2012;url=http://%s:80/auth/v1.0;retry=9" %(ips[0]) + client_endpoints_config = self.config.get('client_endpoints', None) monitoring_profiles = self.config.get('monitoring_profiles', {}) @@ -117,77 +106,6 @@ def install_dependencies(self): ] ) - if benchmark_type == 'cosbench': - # install cosbench - self.log.info('install dependencies for cosbench') - if system_type == 'rpm': - cosbench_depends = ['wget', 'unzip', 'java-1.7.0-openjdk', 'curl'] - else: - cosbench_depends = ['wget', 'unzip', 'openjdk-8-jre', 'curl'] - self.first_mon.run(args=install_cmd + cosbench_depends) - testdir = misc.get_testdir(self.ctx) - cosbench_version = '0.4.2.c3' - cosbench_location = 'https://github.com/intel-cloud/cosbench/releases/download/v0.4.2.c3/0.4.2.c3.zip' - os_version = misc.get_system_type(self.first_mon, False, True) - - # additional requirements for bionic - if os_version == '18.04': - self.first_mon.run( - args=['sudo', 'apt-get', '-y', 'purge', 'openjdk-11*']) - # use our own version of cosbench - cosbench_version = 'cosbench-0.4.2.c3.1' - # contains additional parameter "-N" to nc - cosbench_location = 'http://drop.ceph.com/qa/cosbench-0.4.2.c3.1.zip' - cosbench_dir = os.path.join(testdir, cosbench_version) - self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', cosbench_dir]) - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'wget', - cosbench_location, run.Raw('&&'), - 'unzip', '{name}.zip'.format(name=cosbench_version), '-d', cosbench_version - ] - ) - else: - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'wget', - cosbench_location, run.Raw('&&'), - 'unzip', '{name}.zip'.format(name=cosbench_version) - ] - ) - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'ln', '-s', cosbench_version, 'cos', - ] - ) - self.first_mon.run( - args=[ - 'cd', os.path.join(testdir, 'cos'), run.Raw('&&'), - 'chmod', '+x', run.Raw('*.sh'), - ] - ) - - # start cosbench and check info - self.log.info('start cosbench') - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'start-all.sh' - ] - ) - self.log.info('check cosbench info') - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', 
run.Raw('&&'), - 'sh', 'cli.sh', 'info' - ] - ) - def checkout_cbt(self): testdir = misc.get_testdir(self.ctx) repo = self.config.get('repo', 'https://github.com/ceph/cbt.git') @@ -269,51 +187,6 @@ def end(self): ] ) - if benchmark_type == 'cosbench': - os_version = misc.get_system_type(self.first_mon, False, True) - if os_version == '18.04': - cosbench_version = 'cosbench-0.4.2.c3.1' - else: - cosbench_version = '0.4.2.c3' - # note: stop-all requires 'nc' - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'stop-all.sh', - run.Raw('||'), 'true' - ] - ) - self.first_mon.run( - args=[ - 'sudo', 'killall', '-9', 'java', - run.Raw('||'), 'true' - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/cos'.format(tdir=testdir), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/{version}'.format(tdir=testdir, version=cosbench_version), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/{version}.zip'.format(tdir=testdir, version=cosbench_version), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/xml'.format(tdir=testdir), - ] - ) # Collect cbt performance data cbt_performance = CBTperformance() cbt_performance.collect(self.ctx, self.config) From 9b7fb48c2464ddf26271f43cd8dd6ad969a80fe0 Mon Sep 17 00:00:00 2001 From: Lee Sanders Date: Thu, 10 Oct 2024 11:21:46 +0100 Subject: [PATCH 080/148] Add cosbench deprecation warning to qa/README Signed-off-by: Lee Sanders --- qa/README | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/qa/README b/qa/README index f9b8988c6f9f6..a6a95c479bc9c 100644 --- a/qa/README +++ b/qa/README @@ -83,3 +83,8 @@ supported_distros as distros$ will be run just once: either on centos, rhel or ubuntu, chosen randomly. The teuthology code can be found in https://github.com/ceph/teuthology.git + +Note: The performance suites clone CBT from master here: https://github.com/ceph/cbt.git +CBT will not support cosbench beyond release tag v0.3, therefore no qa suite should use cosbench. +cosbench support has been removed from qa/tasks/cbt.py. + From 0317b5f87ac22399f6242d72f0bb9924794687de Mon Sep 17 00:00:00 2001 From: Zac Dover Date: Thu, 10 Oct 2024 22:11:22 +1000 Subject: [PATCH 081/148] doc: SubmittingPatches-backports - remove backports team Remove all references to the "Stable Releases and Backports Team", which as of October 2024 does not exist. Fixes: https://tracker.ceph.com/issues/68471 Co-authored-by: Laura Flores Signed-off-by: Zac Dover --- SubmittingPatches-backports.rst | 51 ++++++--------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/SubmittingPatches-backports.rst b/SubmittingPatches-backports.rst index 0f96aec65c4f8..bb55088cb5fac 100644 --- a/SubmittingPatches-backports.rst +++ b/SubmittingPatches-backports.rst @@ -121,14 +121,11 @@ If you do not have sufficient permissions to modify any field of the tracker issue, just add a comment describing what changes you would like to make. Someone with permissions will make the necessary modifications on your behalf. -For straightforward backports, that's all that you (as the developer of the fix) -need to do. Volunteers from the `Stable Releases and Backports team`_ will -proceed to create Backport issues to track the necessary backports and stage the -backports by opening GitHub PRs with the cherry-picks. 
If you don't want to -wait, and provided you have sufficient permissions at https://tracker.ceph.com, -you can `create Backport tracker issues` and `stage backports`_ yourself. In -that case, read on. - +Authors of pull requests are responsible for creating associated backport pull +requests. As long as you have sufficient permissions at +https://tracker.ceph.com, you can `create Backport tracker issues` and `stage +backports`_ yourself. Read these linked sections to learn how to create +backport tracker issues and how to stage backports: .. _`create backport tracker issues`: .. _`backport tracker issue`: @@ -146,10 +143,7 @@ issues can be created in the backport tracker issue for tracking the backporting Under ordinary circumstances, the developer who merges the ``main`` PR will flag the ``main`` branch tracker issue for backport by changing the Status to "Pending -Backport", and volunteers from the `Stable Releases and Backports team`_ -periodically create backport tracker issues by running the -``backport-create-issue`` script. They also do the actual backporting. But that -does take time and you may not want to wait. +Backport". You might be tempted to forge ahead and create the backport issues yourself. Please don't do that - it is difficult (bordering on impossible) to get all the @@ -360,20 +354,11 @@ Once the backport PR is open, the first order of business is to set the Milestone tag to the stable release the backport PR is targeting. For example, if the PR is targeting "nautilus", set the Milestone tag to "nautilus". -If you don't have sufficient GitHub permissions to set the Milestone, don't -worry. Members of the `Stable Releases and Backports team`_ periodically run -a script (``ceph-backport.sh --milestones``) which scans all PRs targetting stable -branches and automatically adds the correct Milestone tag if it is missing. - Next, check which component label was applied to the ``main`` PR corresponding to this backport, and double-check that that label is applied to the backport PR as well. For example, if the ``main`` PR carries the component label "core", the backport PR should also get that label. -In general, it is the responsibility of the `Stable Releases and Backports -team`_ to ensure that backport PRs are properly labelled. If in doubt, just -leave the labelling to them. - .. _`backport PR reviewing`: .. _`backport PR testing`: .. _`backport PR merging`: @@ -381,9 +366,8 @@ leave the labelling to them. Reviewing, testing, and merging of backport PRs ----------------------------------------------- -Once your backport PR is open and the Milestone is set properly, the -`Stable Releases and Backports team` will take care of getting the PR -reviewed and tested. Once the PR is reviewed and tested, it will be merged. +Once your backport PR is open, it will be reviewed and tested. When the PR has +been reviewed and tested, it will be merged. If you would like to facilitate this process, you can solicit reviews and run integration tests on the PR. In this case, add comments to the PR describing the @@ -394,22 +378,3 @@ it will be merged. Even if you have sufficient GitHub permissions to merge the PR, please do *not* merge it yourself. (Uncontrolled merging to stable branches unnecessarily complicates the release preparation process, which is done by volunteers.) 
- - -Stable Releases and Backports team ----------------------------------- - -Ceph has a `Stable Releases and Backports`_ team, staffed by volunteers, -which is charged with maintaining the stable releases and backporting bugfixes -from the ``main`` branch to them. (That team maintains a wiki, accessible by -clicking the `Stable Releases and Backports`_ link, which describes various -workflows in the backporting lifecycle.) - -.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki - -Ordinarily, it is enough to fill out the "Backport" field in the bug (tracker -issue). The volunteers from the Stable Releases and Backports team will -backport the fix, run regression tests on it, and include it in one or more -future point releases. - - From 95916c91906604c516e78b550010e9de511fc1e9 Mon Sep 17 00:00:00 2001 From: Zac Dover Date: Thu, 10 Oct 2024 22:17:12 +1000 Subject: [PATCH 082/148] doc/dev: remove "Stable Releases and Backports" Remove mention of "Stable Releases and Backports" from doc/dev/developer_guide/essentials.rst. Co-authored-by: Laura Flores Signed-off-by: Zac Dover --- doc/dev/developer_guide/essentials.rst | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/doc/dev/developer_guide/essentials.rst b/doc/dev/developer_guide/essentials.rst index cbde8779a66da..7cce4c6f898ff 100644 --- a/doc/dev/developer_guide/essentials.rst +++ b/doc/dev/developer_guide/essentials.rst @@ -287,16 +287,13 @@ See :ref:`kubernetes-dev` Backporting ----------- -All bugfixes should be merged to the ``main`` branch before being -backported. To flag a bugfix for backporting, make sure it has a -`tracker issue`_ associated with it and set the ``Backport`` field to a -comma-separated list of previous releases (e.g. "hammer,jewel") that you think -need the backport. -The rest (including the actual backporting) will be taken care of by the -`Stable Releases and Backports`_ team. +All bugfixes should be merged to the ``main`` branch before being backported. +To flag a bugfix for backporting, make sure it has a `tracker issue`_ +associated with it and set the ``Backport`` field to a comma-separated list of +previous releases (e.g. "hammer,jewel") that you think need the backport. You +are responsible for the backporting of pull requests that you raise. .. _`tracker issue`: http://tracker.ceph.com/ -.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki Dependabot ---------- From 5a1a21573b92113144060e8475778c669b1de4aa Mon Sep 17 00:00:00 2001 From: Naman Munet Date: Mon, 7 Oct 2024 10:41:29 +0530 Subject: [PATCH 083/148] mgr/dashboard: unable to edit pipe config for bucket level policy of a bucket Fixes: https://tracker.ceph.com/issues/68387 Fixes Includes: 1) Passing additional parameter for 'user' and 'mode' as the user can be either system/dashboard or other values while creating pipe. 2) Previously while removing the src/dest bucket field, we were getting same old values on editing pipe, but now it will become '*' if empty value passed from frontend. 
Signed-off-by: Naman Munet --- src/pybind/mgr/dashboard/controllers/rgw.py | 10 ++--- ...w-multisite-sync-pipe-modal.component.html | 6 +++ ...ultisite-sync-pipe-modal.component.spec.ts | 43 ++++++++++++++++++- ...rgw-multisite-sync-pipe-modal.component.ts | 7 ++- .../app/shared/api/rgw-multisite.service.ts | 11 ++++- src/pybind/mgr/dashboard/openapi.yaml | 11 ++--- .../mgr/dashboard/services/rgw_client.py | 24 ++++++----- 7 files changed, 87 insertions(+), 25 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py index 8667d469060f8..b8e07a708e79d 100755 --- a/src/pybind/mgr/dashboard/controllers/rgw.py +++ b/src/pybind/mgr/dashboard/controllers/rgw.py @@ -244,11 +244,13 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, source_zones: Dict[str, Any], destination_zones: Dict[str, Any], source_bucket: str = '', - destination_bucket: str = '', bucket_name: str = ''): + destination_bucket: str = '', bucket_name: str = '', + user: str = '', mode: str = ''): multisite_instance = RgwMultisite() return multisite_instance.create_sync_pipe(group_id, pipe_id, source_zones, destination_zones, source_bucket, - destination_bucket, bucket_name, True) + destination_bucket, bucket_name, True, + user, mode) @Endpoint(method='DELETE', path='/sync-pipe') @EndpointDoc("Remove the sync pipe") @@ -256,12 +258,10 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, def remove_sync_pipe(self, group_id: str, pipe_id: str, source_zones: Optional[List[str]] = None, destination_zones: Optional[List[str]] = None, - destination_bucket: str = '', bucket_name: str = ''): multisite_instance = RgwMultisite() return multisite_instance.remove_sync_pipe(group_id, pipe_id, source_zones, - destination_zones, destination_bucket, - bucket_name, True) + destination_zones, bucket_name, True) @APIRouter('/rgw/daemon', Scope.RGW) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html index e50666cdeaa96..767305958d4c8 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html @@ -64,6 +64,9 @@ i18n-placeholder placeholder="Source Bucket Name..." formControlName="source_bucket"/> + + {{ allBucketSelectedHelpText }} +
@@ -78,6 +81,9 @@ i18n-placeholder placeholder="Destination Bucket Name..." formControlName="destination_bucket"/> + + {{ allBucketSelectedHelpText }} +
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts index 369658d7d427f..1127db1c59a59 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts @@ -89,6 +89,47 @@ describe('RgwMultisiteSyncPipeModalComponent', () => { component.submit(); expect(spy).toHaveBeenCalled(); expect(putDataSpy).toHaveBeenCalled(); - expect(putDataSpy).toHaveBeenCalledWith(component.pipeForm.getRawValue()); + expect(putDataSpy).toHaveBeenCalledWith({ + ...component.pipeForm.getRawValue(), + mode: '', + user: '' + }); + }); + + it('should pass "user" and "mode" while creating/editing pipe', () => { + component.editing = true; + component.pipeForm.patchValue({ + pipe_id: 'pipe1', + group_id: 's3-bucket-replication:enabled', + source_bucket: '', + source_zones: { added: ['zone1-zg1-realm1'], removed: [] }, + destination_bucket: '', + destination_zones: { added: ['zone2-zg1-realm1'], removed: [] } + }); + component.pipeSelectedRow = { + dest: { bucket: '*', zones: ['zone2-zg1-realm1'] }, + id: 'pipi1', + params: { + dest: {}, + mode: 'user', + priority: 0, + source: { filter: { tags: [] } }, + user: 'dashboard' + }, + source: { bucket: '*', zones: ['zone1-zg1-realm1'] } + }; + + component.sourceZones.data.selected = ['zone1-zg1-realm1']; + component.destZones.data.selected = ['zone2-zg1-realm1']; + const spy = jest.spyOn(component, 'submit'); + const putDataSpy = jest.spyOn(multisiteServiceMock, 'createEditSyncPipe'); + component.submit(); + expect(spy).toHaveBeenCalled(); + expect(putDataSpy).toHaveBeenCalled(); + expect(putDataSpy).toHaveBeenCalledWith({ + ...component.pipeForm.getRawValue(), + mode: 'user', + user: 'dashboard' + }); }); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts index 2f41dbd23c843..43742ef60b839 100755 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts @@ -18,6 +18,8 @@ import { ZoneData } from '../models/rgw-multisite-zone-selector'; import { SucceededActionLabelsI18n } from '~/app/shared/constants/app.constants'; const ALL_ZONES = $localize`All zones (*)`; +const ALL_BUCKET_SELECTED_HELP_TEXT = + 'If no value is provided, all the buckets in the zone group will be selected.'; @Component({ selector: 'cd-rgw-multisite-sync-pipe-modal', @@ -33,6 +35,7 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit { sourceZones = new ZoneData(false, 'Filter Zones'); destZones = new ZoneData(false, 'Filter Zones'); icons = Icons; + allBucketSelectedHelpText = ALL_BUCKET_SELECTED_HELP_TEXT; constructor( public activeModal: NgbActiveModal, @@ -187,7 +190,9 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit { .createEditSyncPipe({ ...this.pipeForm.getRawValue(), source_zones: sourceZones, - destination_zones: destZones + 
destination_zones: destZones, + user: this.editing ? this.pipeSelectedRow?.params?.user : '', + mode: this.editing ? this.pipeSelectedRow?.params?.mode : '' }) .subscribe( () => { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts index d57cd523a4dfe..5e12a00ec95d3 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts @@ -123,8 +123,15 @@ export class RgwMultisiteService { ); } - createEditSyncPipe(payload: any) { - return this.http.put(`${this.url}/sync-pipe`, payload); + createEditSyncPipe(payload: any, user?: string, mode?: string) { + let params = new HttpParams(); + if (user) { + params = params.append('user', user); + } + if (mode) { + params = params.append('mode', mode); + } + return this.http.put(`${this.url}/sync-pipe`, payload, { params }); } removeSyncPipe(pipe_id: string, group_id: string, bucket_name?: string) { diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index e8ab663d0d593..4fac085d1f361 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -11384,6 +11384,9 @@ paths: type: string group_id: type: string + mode: + default: '' + type: string pipe_id: type: string source_bucket: @@ -11391,6 +11394,9 @@ paths: type: string source_zones: type: string + user: + default: '' + type: string required: - group_id - pipe_id @@ -11445,11 +11451,6 @@ paths: name: destination_zones schema: type: string - - default: '' - in: query - name: destination_bucket - schema: - type: string - default: '' in: query name: bucket_name diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py index 2441b73b361be..e45c4fa447b31 100755 --- a/src/pybind/mgr/dashboard/services/rgw_client.py +++ b/src/pybind/mgr/dashboard/services/rgw_client.py @@ -2236,7 +2236,8 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, source_bucket: str = '', destination_bucket: str = '', bucket_name: str = '', - update_period=False): + update_period=False, + user: str = '', mode: str = ''): if source_zones['added'] or destination_zones['added']: rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'create', @@ -2245,11 +2246,9 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, if bucket_name: rgw_sync_policy_cmd += ['--bucket', bucket_name] - if source_bucket: - rgw_sync_policy_cmd += ['--source-bucket', source_bucket] + rgw_sync_policy_cmd += ['--source-bucket', source_bucket] - if destination_bucket: - rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] + rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] if source_zones['added']: rgw_sync_policy_cmd += ['--source-zones', ','.join(source_zones['added'])] @@ -2257,6 +2256,12 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, if destination_zones['added']: rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones['added'])] + if user: + rgw_sync_policy_cmd += ['--uid', user] + + if mode: + rgw_sync_policy_cmd += ['--mode', mode] + logger.info("Creating sync pipe!") try: exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd) @@ -2271,13 +2276,13 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, if ((source_zones['removed'] and '*' not in source_zones['added']) or (destination_zones['removed'] and '*' not in 
destination_zones['added'])): self.remove_sync_pipe(group_id, pipe_id, source_zones['removed'], - destination_zones['removed'], destination_bucket, - bucket_name) + destination_zones['removed'], + bucket_name, True) def remove_sync_pipe(self, group_id: str, pipe_id: str, source_zones: Optional[List[str]] = None, destination_zones: Optional[List[str]] = None, - destination_bucket: str = '', bucket_name: str = '', + bucket_name: str = '', update_period=False): rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'remove', '--group-id', group_id, '--pipe-id', pipe_id] @@ -2291,9 +2296,6 @@ def remove_sync_pipe(self, group_id: str, pipe_id: str, if destination_zones: rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones)] - if destination_bucket: - rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] - logger.info("Removing sync pipe! %s", rgw_sync_policy_cmd) try: exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd) From 47e7a24c7b94cbb677298d26af6ac09519f70161 Mon Sep 17 00:00:00 2001 From: Leonid Chernin Date: Wed, 9 Oct 2024 06:59:09 +0000 Subject: [PATCH 084/148] mon/nvmeofgw*: fix HA usecase when gateway has no listeners: behaves like no-subsystems Signed-off-by: Leonid Chernin --- src/mon/NVMeofGwMap.cc | 3 +-- src/mon/NVMeofGwMap.h | 2 +- src/mon/NVMeofGwMon.cc | 22 ++++++++++++++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 7b1bc9b8e56cf..c01ea9e710321 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -254,7 +254,7 @@ void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key, } } -int NVMeofGwMap::process_gw_map_gw_no_subsystems( +int NVMeofGwMap::process_gw_map_gw_no_subsys_no_listeners( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending) { int rc = 0; @@ -424,7 +424,6 @@ void NVMeofGwMap::find_failback_gw( auto& gws_states = created_gws[group_key]; auto& gw_state = created_gws[group_key][gw_id]; bool do_failback = false; - dout(10) << "Find failback GW for GW " << gw_id << dendl; for (auto& gw_state_it: gws_states) { auto& st = gw_state_it.second; diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index 2971037174218..267d85b10f918 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -54,7 +54,7 @@ class NVMeofGwMap int process_gw_map_gw_down( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending); - int process_gw_map_gw_no_subsystems( + int process_gw_map_gw_no_subsys_no_listeners( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending); void update_active_timers(bool &propose_pending); diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index 734e90defd946..d9e936e27df34 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -367,6 +367,13 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op) std::stringstream sstrm1; sstrm1 << state.availability; f->dump_string("Availability", sstrm1.str()); + uint32_t num_listeners = 0; + if (state.availability == gw_availability_t::GW_AVAILABLE) { + for (auto &subs: state.subsystems) { + num_listeners += subs.listeners.size(); + } + f->dump_unsigned("num-listeners", num_listeners); + } sstrm1.str(""); for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) { sstrm1 << " " << state_itr.first + 1 << ": " @@ -476,7 +483,7 @@ void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id, if (avail == gw_availability_t::GW_UNAVAILABLE) { pending_map.process_gw_map_gw_down(gw_id, group_key, 
propose_pending); } else { - pending_map.process_gw_map_gw_no_subsystems(gw_id, group_key, propose_pending); + pending_map.process_gw_map_gw_no_subsys_no_listeners(gw_id, group_key, propose_pending); } } @@ -600,7 +607,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) if (sub.size() == 0) { avail = gw_availability_t::GW_CREATED; - } + } else { + bool listener_found = false; + for (auto &subs: sub) { + if (subs.listeners.size()) { + listener_found = true; + break; + } + } + if (!listener_found) { + avail = gw_availability_t::GW_CREATED; + } + }// for HA no-subsystems and no-listeners are same usecases if (pending_map.created_gws[group_key][gw_id].subsystems != sub) { dout(10) << "subsystems of GW changed, propose pending " << gw_id << dendl; pending_map.created_gws[group_key][gw_id].subsystems = sub; From e80b7ba4add2d698555112e0ec46328cab703688 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Tue, 1 Oct 2024 10:38:39 +0200 Subject: [PATCH 085/148] mgr/cephadm: move Grafana's subpath handling logic to grafana config Fixes: https://tracker.ceph.com/issues/68315 So far, Grafana's subpath handling has been managed on the Nginx server side using a rewrite rule. Let's move this logic to the Grafana side to make it consistent with the rest of the monitoring services. Signed-off-by: Redouane Kachach --- .../mgr/cephadm/templates/services/grafana/grafana.ini.j2 | 3 ++- .../templates/services/mgmt-gateway/external_server.conf.j2 | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 index 972ef22e7b58e..967f1355af14b 100644 --- a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 +++ b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 @@ -15,7 +15,8 @@ http_port = {{ http_port }} http_addr = {{ http_addr }} {% if mgmt_gw_enabled %} - root_url = %(protocol)s://%(domain)s/grafana/ + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true {% endif %} [snapshots] external_enabled = false diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 index 260e7418e2d7f..b830034a7d4e9 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 @@ -109,7 +109,6 @@ server { {% if grafana_endpoints %} location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass {{ grafana_scheme }}://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services From 38d9cf4ca760c667d105435a714f76dbff926960 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 01:59:30 -0500 Subject: [PATCH 086/148] osd/scrub: introduce ScrubStore::at_level_t to hold the caching and backend details related to the representation of scrub-detected errors as OMap entries of a uniquely-named object. In a followup commit - the ScrubStore is modified to hold two of these objects, one for the shallow errors and one for the deep errors. 
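A minimal, self-contained sketch of the per-level layout described above (plain std:: types only; the names level_db_t and toy_scrub_store_t, and the key and object-name formats, are invented for illustration and are not the Ceph classes): each scrub level gets its own keyed error cache backed by its own uniquely-named error object, so shallow and deep findings can be recorded and cleared independently.

// Toy illustration only -- not Ceph code. Models "one keyed error store per
// scrub level", with the per-level backing-object name kept alongside the cache.
#include <iostream>
#include <map>
#include <string>

struct level_db_t {
  std::string backing_object;                  // e.g. one object per level (name is made up here)
  std::map<std::string, std::string> results;  // object key -> encoded error record
};

struct toy_scrub_store_t {
  level_db_t shallow{"scrub_1.a"};
  level_db_t deep{"scrub_1.a_deep"};

  // record an error under the level that detected it
  void add_error(bool deep_scrub, const std::string& obj_key, const std::string& err) {
    auto& db = deep_scrub ? deep : shallow;
    db.results[obj_key] = err;
  }
};

int main() {
  toy_scrub_store_t store;
  store.add_error(false, "3:obj_a:head", "attr mismatch");
  store.add_error(true,  "3:obj_b:head", "data digest mismatch");
  std::cout << store.shallow.results.size() << " shallow / "
            << store.deep.results.size()    << " deep error(s)\n";
}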
Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 49 +++++++++++++++++++++---------- src/osd/scrubber/ScrubStore.h | 53 +++++++++++++++++++++++++++++----- 2 files changed, 79 insertions(+), 23 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index a00ab2caecee6..af223cb5cdc09 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -109,19 +109,29 @@ Store::create(ObjectStore* store, ceph_assert(t); ghobject_t oid = make_scrub_object(pgid); t->touch(coll, oid); - return new Store{coll, oid, store}; + return new Store{*store, t, pgid, coll}; +} + + +Store::Store( + ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll) + : object_store{osd_store} + , coll{coll} +{ + ceph_assert(t); + + const auto err_obj = pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); + t->touch(coll, err_obj); + errors_db.emplace(pgid, err_obj, OSDriver{&object_store, coll, err_obj}); } -Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store) - : coll(coll), - hoid(oid), - driver(store, coll, hoid), - backend(&driver) -{} Store::~Store() { - ceph_assert(results.empty()); + ceph_assert(!errors_db || errors_db->results.empty()); } void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) @@ -131,11 +141,13 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { + const auto key = to_object_key(pool, e.object); bufferlist bl; e.encode(bl); - results[to_object_key(pool, e.object)] = bl; + errors_db->results[key] = bl; } + void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) { add_snap_error(pool, e); @@ -145,26 +157,28 @@ void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { bufferlist bl; e.encode(bl); - results[to_snap_key(pool, e.object)] = bl; + errors_db->results[to_snap_key(pool, e.object)] = bl; } bool Store::empty() const { - return results.empty(); + return errors_db->results.empty(); } void Store::flush(ObjectStore::Transaction* t) { if (t) { - OSDriver::OSTransaction txn = driver.get_transaction(t); - backend.set_keys(results, &txn); + OSDriver::OSTransaction txn = errors_db->driver.get_transaction(t); + errors_db->backend.set_keys(errors_db->results, &txn); } - results.clear(); + errors_db->results.clear(); } void Store::cleanup(ObjectStore::Transaction* t) { - t->remove(coll, hoid); + ceph_assert(t); + if (errors_db) + t->remove(coll, errors_db->errors_hoid); } std::vector @@ -195,8 +209,11 @@ Store::get_errors(const string& begin, uint64_t max_return) const { vector errors; + if (!errors_db) + return errors; + auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !backend.get_next(next.first, &next)) { + while (max_return && !errors_db->backend.get_next(next.first, &next)) { if (next.first >= end) break; errors.push_back(next.second); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 567badf608b6c..949a976051e67 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -5,6 +5,7 @@ #define CEPH_SCRUB_RESULT_H #include "common/map_cacher.hpp" +#include "osd/osd_types_fmt.h" #include "osd/SnapMapper.h" // for OSDriver namespace librados { @@ -45,18 +46,56 @@ class Store { uint64_t max_return) const; private: - Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store); + /** + * at_level_t + * + * The machinery for 
caching and storing errors at a specific scrub level. + */ + struct at_level_t { + at_level_t(const spg_t& pgid, const ghobject_t& err_obj, OSDriver&& drvr) + : errors_hoid{err_obj} + , driver{std::move(drvr)} + , backend{&driver} + {} + + /// the object in the PG store, where the errors are stored + ghobject_t errors_hoid; + + /// abstracted key fetching + OSDriver driver; + + /// a K,V cache for the errors that are detected during the scrub + /// session. The errors marked for a specific object are stored as + /// an OMap entry with the object's name as the key. + MapCacher::MapCacher backend; + + /// a temp object mapping seq-id to inconsistencies + std::map results; + }; + + Store(ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; private: + /// the OSD's storage backend + ObjectStore& object_store; + + /// the collection (i.e. - the PG store) in which the errors are stored const coll_t coll; - const ghobject_t hoid; - // a temp object holding mappings from seq-id to inconsistencies found in - // scrubbing - OSDriver driver; - mutable MapCacher::MapCacher backend; - std::map results; + + /** + * the machinery (backend details, cache, etc.) for storing both levels + * of errors (note: 'optional' to allow delayed creation w/o dynamic + * allocations; and 'mutable', as the caching mechanism is used in const + * methods) + */ + mutable std::optional errors_db; + // not yet: mutable std::optional deep_db; }; } // namespace Scrub From 571e2f3c193fc0d117cfd577fe90798fc75e98fa Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 03:58:59 -0500 Subject: [PATCH 087/148] osd/scrub: directly create or reinit the ScrubStore The ScrubStore is now directly created or reinitialized by the Scrubber. Note that the store object is not identical to the errors DB: the errors DB is an entity in the OSD store (a collection of OMap entries in a uniquely-named object(s)), while the ScrubSTore object is a cacher and accessor for that entity. That one can be recreated or disposed of at will. We now do not recreate the ScrubStore object for every scrub. 
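The calling pattern this moves the scrubber towards is, schematically: keep the accessor object across scrubs, and only wipe the on-disk DB contents when a new scrub session starts. A minimal stand-alone sketch with illustrative names (not the actual scrubber API):

    #include <memory>

    enum class scrub_level_t { shallow, deep };

    struct store_sketch {
      void reinit(scrub_level_t /*level*/) {
        // clear the relevant on-disk error DB(s); the accessor object itself
        // (cache, key naming) is kept as-is
      }
    };

    void on_scrub_start(std::unique_ptr<store_sketch>& store, scrub_level_t level) {
      if (!store)
        store = std::make_unique<store_sketch>();  // created once, then reused
      store->reinit(level);
    }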
Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 41 ++++++++++++++++++++----------- src/osd/scrubber/ScrubStore.h | 42 ++++++++++++++++++++++++-------- src/osd/scrubber/pg_scrubber.cc | 43 +++++++++++++++++++++++++++------ src/osd/scrubber/pg_scrubber.h | 10 ++++++++ 4 files changed, 104 insertions(+), 32 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index af223cb5cdc09..0c36be6b66b02 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -99,20 +99,6 @@ string last_snap_key(int64_t pool) namespace Scrub { -Store* -Store::create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll) -{ - ceph_assert(store); - ceph_assert(t); - ghobject_t oid = make_scrub_object(pgid); - t->touch(coll, oid); - return new Store{*store, t, pgid, coll}; -} - - Store::Store( ObjectStore& osd_store, ObjectStore::Transaction* t, @@ -174,6 +160,33 @@ void Store::flush(ObjectStore::Transaction* t) errors_db->results.clear(); } + +void Store::clear_level_db( + ObjectStore::Transaction* t, + at_level_t& db) +{ + // easiest way to guarantee that the object representing the DB exists + t->touch(coll, db.errors_hoid); + + // remove all the keys in the DB + t->omap_clear(coll, db.errors_hoid); + + // restart the 'in progress' part of the MapCacher + db.backend.reset(); +} + + +void Store::reinit(ObjectStore::Transaction* t, [[maybe_unused]] scrub_level_t level) +{ + // Note: only one caller, and it creates the transaction passed to reinit(). + // No need to assert on 't' + + if (errors_db) { + clear_level_db(t, *errors_db); + } +} + + void Store::cleanup(ObjectStore::Transaction* t) { ceph_assert(t); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 949a976051e67..600905e85e8a2 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -20,11 +20,16 @@ namespace Scrub { class Store { public: ~Store(); - static Store* create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); + + Store(ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + + + /// mark down detected errors, either shallow or deep void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e); + void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e); // and a variant-friendly interface: @@ -33,8 +38,22 @@ class Store { bool empty() const; void flush(ObjectStore::Transaction*); + + /// remove both shallow and deep errors DBs. Called on interval. void cleanup(ObjectStore::Transaction*); + /** + * prepare the Store object for a new scrub session. + * This involves clearing one or both of the errors DBs, and resetting + * the cache. + * + * @param level: the scrub level to prepare for. Whenever a deep scrub + * is requested, both the shallow and deep errors DBs are cleared. + * If, on the other hand, a shallow scrub is requested, only the shallow + * errors DB is cleared. 
+ */ + void reinit(ObjectStore::Transaction* t, scrub_level_t level); + std::vector get_snap_errors( int64_t pool, const librados::object_id_t& start, @@ -73,15 +92,9 @@ class Store { std::map results; }; - Store(ObjectStore& osd_store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); - std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; - private: /// the OSD's storage backend ObjectStore& object_store; @@ -96,6 +109,15 @@ class Store { */ mutable std::optional errors_db; // not yet: mutable std::optional deep_db; + + /** + * Clear the DB of errors at a specific scrub level by performing an + * omap_clear() on the DB object, and resetting the MapCacher. + */ + void clear_level_db( + ObjectStore::Transaction* t, + at_level_t& db); + }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 555d13ba72b2b..a085481f477ac 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1183,6 +1183,7 @@ void PgScrubber::_request_scrub_map(pg_shard_t replica, m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch()); } +// only called on interval change. Both DBs are to be removed. void PgScrubber::cleanup_store(ObjectStore::Transaction* t) { if (!m_store) @@ -1200,6 +1201,38 @@ void PgScrubber::cleanup_store(ObjectStore::Transaction* t) ceph_assert(!m_store); } + +void PgScrubber::reinit_scrub_store() +{ + // Entering, 0 to 3 of the following objects(*) may exist: + // ((*)'objects' here: both code objects (the ScrubStore object) and + // actual Object Store objects). + // 1. The ScrubStore object itself. + // 2,3. The two special hobjects in the coll (the PG data) holding the last + // scrub's results. <> + // + // The Store object can be deleted and recreated, as a way to guarantee + // no junk is left. We won't do it here, but we will clear the at_level_t + // structures. + // The hobjects: possibly. The shallow DB object is always cleared. The + // deep one - only if running a deep scrub. + ObjectStore::Transaction t; + if (m_store) { + dout(10) << __func__ << " reusing existing store" << dendl; + m_store->flush(&t); + } else { + dout(10) << __func__ << " creating new store" << dendl; + m_store = std::make_unique( + *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); + } + + // regardless of whether the ScrubStore object was recreated or reused, we need to + // (possibly) clear the actual DB objects in the Object Store. + m_store->reinit(&t, m_active_target->level()); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); +} + + void PgScrubber::on_init() { // going upwards from 'inactive' @@ -1217,14 +1250,8 @@ void PgScrubber::on_init() m_is_deep ? 
scrub_level_t::deep : scrub_level_t::shallow, m_pg->get_actingset()); - // create a new store - { - ObjectStore::Transaction t; - cleanup_store(&t); - m_store.reset( - Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll)); - m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); - } + // create or reuse the 'known errors' store + reinit_scrub_store(); m_start = m_pg->info.pgid.pgid.get_hobj_start(); m_active = true; diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index ff8c98d387ea2..1a5813bd9235c 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -771,6 +771,16 @@ class PgScrubber : public ScrubPgIF, std::unique_ptr m_store; + /** + * the ScrubStore sub-object caches and manages the database of known + * scrub errors. reinit_scrub_store() clears the database and re-initializes + * the ScrubStore object. + * + * in the next iteration - reinit_..() potentially deletes only the + * shallow errors part of the database. + */ + void reinit_scrub_store(); + int num_digest_updates_pending{0}; hobject_t m_start, m_end; ///< note: half-closed: [start,end) From ce58c88158381e252ffa432ff855a01570cc98dd Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 05:15:57 -0500 Subject: [PATCH 088/148] osd/scrub: add dout() capability to the ScrubStore now that the ScrubSTore object is directly created by the scrubber, (and has a lifetime that does not extend beyond the scrubber object), we can add the same dout() mechanism used by the other scrubber sub-objects. Note: that mechanism will be changed shortly, so that the sub-objects would use one prefix() creator supplied by the Scrubber object. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 50 +++++++++++++++++++++++++++++---- src/osd/scrubber/ScrubStore.h | 20 +++++++++---- src/osd/scrubber/pg_scrubber.cc | 2 +- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 0c36be6b66b02..dd141d1c38ca4 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -1,11 +1,13 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include "ScrubStore.h" +#include "./ScrubStore.h" #include "osd/osd_types.h" #include "common/scrub_types.h" #include "include/rados/rados_types.hpp" +#include "pg_scrubber.h" + using std::ostringstream; using std::string; using std::vector; @@ -95,16 +97,31 @@ string last_snap_key(int64_t pool) hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } + +} // namespace + +#undef dout_context +#define dout_context (m_scrubber.get_pg_cct()) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix_fn(_dout, this, __func__) + +template +static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "") +{ + return t->gen_prefix(*_dout, fn); } namespace Scrub { Store::Store( + PgScrubber& scrubber, ObjectStore& osd_store, ObjectStore::Transaction* t, const spg_t& pgid, const coll_t& coll) - : object_store{osd_store} + : m_scrubber{scrubber} + , object_store{osd_store} , coll{coll} { ceph_assert(t); @@ -120,6 +137,18 @@ Store::~Store() ceph_assert(!errors_db || errors_db->results.empty()); } + +std::ostream& Store::gen_prefix(std::ostream& out, std::string_view fn) const +{ + if (fn.starts_with("operator")) { + // it's a lambda, and __func__ is not available + return m_scrubber.gen_prefix(out) << "Store::"; + } else { + 
return m_scrubber.gen_prefix(out) << "Store::" << fn << ": "; + } +} + + void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) { add_object_error(pool, e); @@ -163,8 +192,11 @@ void Store::flush(ObjectStore::Transaction* t) void Store::clear_level_db( ObjectStore::Transaction* t, - at_level_t& db) + at_level_t& db, + std::string_view db_name) { + dout(20) << fmt::format("removing (omap) entries for {} error DB", db_name) + << dendl; // easiest way to guarantee that the object representing the DB exists t->touch(coll, db.errors_hoid); @@ -176,19 +208,27 @@ void Store::clear_level_db( } -void Store::reinit(ObjectStore::Transaction* t, [[maybe_unused]] scrub_level_t level) +void Store::reinit( + ObjectStore::Transaction* t, + [[maybe_unused]] scrub_level_t level) { + dout(20) << fmt::format( + "re-initializing the Scrub::Store (for {} scrub)", + (level == scrub_level_t::deep ? "deep" : "shallow")) + << dendl; + // Note: only one caller, and it creates the transaction passed to reinit(). // No need to assert on 't' if (errors_db) { - clear_level_db(t, *errors_db); + clear_level_db(t, *errors_db, "scrub"); } } void Store::cleanup(ObjectStore::Transaction* t) { + dout(20) << "discarding error DBs" << dendl; ceph_assert(t); if (errors_db) t->remove(coll, errors_db->errors_hoid); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 600905e85e8a2..a83841e2cfbb2 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -14,6 +14,7 @@ struct object_id_t; struct inconsistent_obj_wrapper; struct inconsistent_snapset_wrapper; +class PgScrubber; namespace Scrub { @@ -21,10 +22,12 @@ class Store { public: ~Store(); - Store(ObjectStore& osd_store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); + Store( + PgScrubber& scrubber, + ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); /// mark down detected errors, either shallow or deep @@ -64,6 +67,8 @@ class Store { const librados::object_id_t& start, uint64_t max_return) const; + std::ostream& gen_prefix(std::ostream& out, std::string_view fn) const; + private: /** * at_level_t @@ -95,6 +100,10 @@ class Store { std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; + + /// access to the owning Scrubber object, for logging mostly + PgScrubber& m_scrubber; + /// the OSD's storage backend ObjectStore& object_store; @@ -116,7 +125,8 @@ class Store { */ void clear_level_db( ObjectStore::Transaction* t, - at_level_t& db); + at_level_t& db, + std::string_view db_name); }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index a085481f477ac..81093666f91c8 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1223,7 +1223,7 @@ void PgScrubber::reinit_scrub_store() } else { dout(10) << __func__ << " creating new store" << dendl; m_store = std::make_unique( - *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); + *this, *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); } // regardless of whether the ScrubStore object was recreated or reused, we need to From 283f4c258641f86d3de3431bfdfba31856387ea6 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 05:25:05 -0500 Subject: [PATCH 089/148] common: extend MapCacher API to include 'no out' version of get_next() Signed-off-by: Ronen Friedman --- src/common/map_cacher.hpp | 45 +++++++++++++++++++++++++++++++++++ 
src/osd/scrubber/ScrubStore.h | 5 ++++ 2 files changed, 50 insertions(+) diff --git a/src/common/map_cacher.hpp b/src/common/map_cacher.hpp index 4d843be75dc64..95353425de9e6 100644 --- a/src/common/map_cacher.hpp +++ b/src/common/map_cacher.hpp @@ -16,6 +16,7 @@ #define MAPCACHER_H #include "include/Context.h" +#include "include/expected.hpp" #include "common/sharedptr_registry.hpp" namespace MapCacher { @@ -130,6 +131,50 @@ class MapCacher { return -EINVAL; } ///< @return error value, 0 on success, -ENOENT if no more entries + /// Fetch first key/value std::pair after specified key + struct PosAndData { + K last_key; + V data; + }; + using MaybePosAndData = tl::expected; + + MaybePosAndData get_1st_after_key( + K key ///< [in] key after which to get next + ) + { + ceph_assert(driver); + while (true) { + std::pair> cached; + bool got_cached = in_progress.get_next(key, &cached); + + ///\todo a driver->get_next() that returns an expected would be nice + bool got_store{false}; + std::pair store; + int r = driver->get_next(key, &store); + if (r < 0 && r != -ENOENT) { + return tl::unexpected(r); + } else if (r == 0) { + got_store = true; + } + + if (!got_cached && !got_store) { + return tl::unexpected(-ENOENT); + } else if (got_cached && (!got_store || store.first >= cached.first)) { + if (cached.second) { + return PosAndData{cached.first, *cached.second}; + } else { + key = cached.first; + continue; // value was cached as removed, recurse + } + } else { + return PosAndData{store.first, store.second}; + } + } + ceph_abort(); // not reachable + return tl::unexpected(-EINVAL); + } + + /// Adds operation setting keys to Transaction void set_keys( const std::map &keys, ///< [in] keys/values to std::set diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index a83841e2cfbb2..7d590d2d1915e 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -97,6 +97,11 @@ class Store { std::map results; }; + using CacherPosData = + MapCacher::MapCacher::PosAndData; + using ExpCacherPosData = tl::expected; + + std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; From 031580fb662f35daacf61a8aa2a4b4f3b32b7b6b Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 08:51:22 -0500 Subject: [PATCH 090/148] common/scrub,osd/scrub: minor cleanups to ScrubStore Including: - introducing 'no out param' encode() for the inconsistent wrappers; - renaming the ambiguous 'empty()' to 'is_empty()'; - removing unused code; - a few other minor cleanups. 
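The first item, the value-returning encode(), mainly tightens call sites. A self-contained illustration of the shape, using a stand-in type rather than the real inconsistent_*_wrapper and bufferlist:

    #include <map>
    #include <string>

    struct wrapper_sketch {
      std::string payload;
      // existing out-parameter form
      void encode(std::string& out) const { out = payload; }
      // value-returning overload in the style this cleanup introduces
      std::string encode() const { std::string out; encode(out); return out; }
    };

    int main() {
      std::map<std::string, std::string> results;
      wrapper_sketch e{"errors"};
      std::string bl;
      e.encode(bl);                      // before: buffer declared and filled separately
      results["key_before"] = bl;
      results["key_after"] = e.encode(); // after: one line at the call site
    }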
Signed-off-by: Ronen Friedman --- src/common/scrub_types.cc | 14 ++++++++ src/common/scrub_types.h | 2 ++ src/osd/scrubber/ScrubStore.cc | 61 +++++++++++----------------------- src/osd/scrubber/ScrubStore.h | 18 ++++------ 4 files changed, 42 insertions(+), 53 deletions(-) diff --git a/src/common/scrub_types.cc b/src/common/scrub_types.cc index b03a3cab70c84..4b4d191e09c39 100644 --- a/src/common/scrub_types.cc +++ b/src/common/scrub_types.cc @@ -161,6 +161,13 @@ void inconsistent_obj_wrapper::encode(bufferlist& bl) const ENCODE_FINISH(bl); } +bufferlist inconsistent_obj_wrapper::encode() const +{ + bufferlist bl; + encode(bl); + return bl; +} + void inconsistent_obj_wrapper::decode(bufferlist::const_iterator& bp) { DECODE_START(2, bp); @@ -240,6 +247,13 @@ void inconsistent_snapset_wrapper::encode(bufferlist& bl) const ENCODE_FINISH(bl); } +bufferlist inconsistent_snapset_wrapper::encode() const +{ + bufferlist bl; + encode(bl); + return bl; +} + void inconsistent_snapset_wrapper::decode(bufferlist::const_iterator& bp) { DECODE_START(2, bp); diff --git a/src/common/scrub_types.h b/src/common/scrub_types.h index dd206f56f6035..d86fc12b6c8cf 100644 --- a/src/common/scrub_types.h +++ b/src/common/scrub_types.h @@ -152,6 +152,7 @@ struct inconsistent_obj_wrapper : librados::inconsistent_obj_t { const pg_shard_t &primary); void set_version(uint64_t ver) { version = ver; } void encode(ceph::buffer::list& bl) const; + ceph::buffer::list encode() const; void decode(ceph::buffer::list::const_iterator& bp); }; @@ -181,6 +182,7 @@ struct inconsistent_snapset_wrapper : public librados::inconsistent_snapset_t { void set_size_mismatch(); void encode(ceph::buffer::list& bl) const; + ceph::buffer::list encode() const; void decode(ceph::buffer::list::const_iterator& bp); }; diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index dd141d1c38ca4..033ea6b24dfd4 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -15,21 +15,9 @@ using std::vector; using ceph::bufferlist; namespace { -ghobject_t make_scrub_object(const spg_t& pgid) -{ - ostringstream ss; - ss << "scrub_" << pgid; - return pgid.make_temp_ghobject(ss.str()); -} - string first_object_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); + auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0x00000000, pool, ""); hoid.build_hash_cache(); return "SCRUB_OBJ_" + hoid.to_str(); } @@ -49,12 +37,7 @@ string to_object_key(int64_t pool, const librados::object_id_t& oid) string last_object_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); + auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0xffffffff, pool, ""); hoid.build_hash_cache(); return "SCRUB_OBJ_" + hoid.to_str(); } @@ -62,14 +45,9 @@ string last_object_key(int64_t pool) string first_snap_key(int64_t pool) { // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for - // the representing the minimal and maximum keys. and this relies on how + // representing the minimal and maximum keys. and this relies on how // hobject_t::to_str() works: hex(pool).hex(revhash). 
- auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); + auto hoid = hobject_t(object_t(), "", 0, 0x00000000, pool, ""); hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } @@ -88,12 +66,7 @@ string to_snap_key(int64_t pool, const librados::object_id_t& oid) string last_snap_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); + auto hoid = hobject_t(object_t(), "", 0, 0xffffffff, pool, ""); hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } @@ -168,22 +141,23 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) add_snap_error(pool, e); } + void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { - bufferlist bl; - e.encode(bl); - errors_db->results[to_snap_key(pool, e.object)] = bl; + errors_db->results[to_snap_key(pool, e.object)] = e.encode(); } -bool Store::empty() const + +bool Store::is_empty() const { - return errors_db->results.empty(); + return !errors_db || errors_db->results.empty(); } + void Store::flush(ObjectStore::Transaction* t) { if (t) { - OSDriver::OSTransaction txn = errors_db->driver.get_transaction(t); + auto txn = errors_db->driver.get_transaction(t); errors_db->backend.set_keys(errors_db->results, &txn); } errors_db->results.clear(); @@ -234,10 +208,11 @@ void Store::cleanup(ObjectStore::Transaction* t) t->remove(coll, errors_db->errors_hoid); } -std::vector -Store::get_snap_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const + +std::vector Store::get_snap_errors( + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const { const string begin = (start.name.empty() ? first_snap_key(pool) : to_snap_key(pool, start)); @@ -272,6 +247,8 @@ Store::get_errors(const string& begin, errors.push_back(next.second); max_return--; } + + dout(10) << fmt::format("{} errors reported", errors.size()) << dendl; return errors; } diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 7d590d2d1915e..9eb77ab667db7 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -1,8 +1,6 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab - -#ifndef CEPH_SCRUB_RESULT_H -#define CEPH_SCRUB_RESULT_H +#pragma once #include "common/map_cacher.hpp" #include "osd/osd_types_fmt.h" @@ -39,7 +37,7 @@ class Store { void add_error(int64_t pool, const inconsistent_obj_wrapper& e); void add_error(int64_t pool, const inconsistent_snapset_wrapper& e); - bool empty() const; + [[nodiscard]] bool is_empty() const; void flush(ObjectStore::Transaction*); /// remove both shallow and deep errors DBs. Called on interval. @@ -101,11 +99,6 @@ class Store { MapCacher::MapCacher::PosAndData; using ExpCacherPosData = tl::expected; - - std::vector get_errors(const std::string& start, - const std::string& end, - uint64_t max_return) const; - /// access to the owning Scrubber object, for logging mostly PgScrubber& m_scrubber; @@ -124,6 +117,11 @@ class Store { mutable std::optional errors_db; // not yet: mutable std::optional deep_db; + std::vector get_errors( + const std::string& start, + const std::string& end, + uint64_t max_return) const; + /** * Clear the DB of errors at a specific scrub level by performing an * omap_clear() on the DB object, and resetting the MapCacher. 
@@ -135,5 +133,3 @@ class Store { }; } // namespace Scrub - -#endif // CEPH_SCRUB_RESULT_H From daf848fa5afcf4ad86388eade472d2c3a4873826 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 23:09:51 -0500 Subject: [PATCH 091/148] osd/scrub: separate shallow vs deep errors storage The ScrubStore now holds two ScrubStore::at_level_t objects, one for the shallow errors and one for the deep errors. The shallow errors DB is recreated at the start of every scrub, while the deep errors DB is only recreated at the start of a deep scrub. When queried by the operator for known scrub errors, the ScrubStore will return the union of the errors from both DBs. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 285 +++++++++++++++++++++++++++----- src/osd/scrubber/ScrubStore.h | 42 ++++- src/osd/scrubber/pg_scrubber.cc | 2 +- 3 files changed, 285 insertions(+), 44 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 033ea6b24dfd4..9c680da0de16f 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -99,15 +99,30 @@ Store::Store( { ceph_assert(t); - const auto err_obj = pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); - t->touch(coll, err_obj); - errors_db.emplace(pgid, err_obj, OSDriver{&object_store, coll, err_obj}); + // shallow errors DB object + const auto sh_err_obj = + pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); + t->touch(coll, sh_err_obj); + shallow_db.emplace( + pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj}); + + // and the DB for deep errors + const auto dp_err_obj = + pgid.make_temp_ghobject(fmt::format("deep_scrub_{}", pgid)); + t->touch(coll, dp_err_obj); + deep_db.emplace(pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj}); + + dout(20) << fmt::format( + "created Scrub::Store for pg[{}], shallow: {}, deep: {}", + pgid, sh_err_obj, dp_err_obj) + << dendl; } Store::~Store() { - ceph_assert(!errors_db || errors_db->results.empty()); + ceph_assert(!shallow_db || shallow_db->results.empty()); + ceph_assert(!deep_db || deep_db->results.empty()); } @@ -127,12 +142,49 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) add_object_error(pool, e); } +namespace { + +inconsistent_obj_wrapper create_filtered_copy( + const inconsistent_obj_wrapper& obj, + uint64_t obj_err_mask, + uint64_t shard_err_mask) +{ + inconsistent_obj_wrapper dup = obj; + dup.errors &= obj_err_mask; + for (auto& [shard, si] : dup.shards) { + si.errors &= shard_err_mask; + } + return dup; +} + +} // namespace + + void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { const auto key = to_object_key(pool, e.object); - bufferlist bl; - e.encode(bl); - errors_db->results[key] = bl; + dout(20) << fmt::format( + "adding error for object {} ({}). Errors: {} ({}/{}) wr:{}", + e.object, key, librados::err_t{e.errors}, + librados::err_t{e.errors & librados::err_t::SHALLOW_ERRORS}, + librados::err_t{e.errors & librados::err_t::DEEP_ERRORS}, e) + << dendl; + + // divide the errors & shard errors into shallow and deep. 
+ { + bufferlist bl; + create_filtered_copy( + e, librados::obj_err_t::SHALLOW_ERRORS, librados::err_t::SHALLOW_ERRORS) + .encode(bl); + shallow_db->results[key] = bl; + } + { + bufferlist bl; + create_filtered_copy( + e, librados::obj_err_t::DEEP_ERRORS, librados::err_t::DEEP_ERRORS) + .encode(bl); + deep_db->results[key] = bl; + } } @@ -144,23 +196,29 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { - errors_db->results[to_snap_key(pool, e.object)] = e.encode(); + // note: snap errors are only placed in the shallow store + shallow_db->results[to_snap_key(pool, e.object)] = e.encode(); } bool Store::is_empty() const { - return !errors_db || errors_db->results.empty(); + return (!shallow_db || shallow_db->results.empty()) && + (!deep_db || deep_db->results.empty()); } void Store::flush(ObjectStore::Transaction* t) { if (t) { - auto txn = errors_db->driver.get_transaction(t); - errors_db->backend.set_keys(errors_db->results, &txn); + auto txn = shallow_db->driver.get_transaction(t); + shallow_db->backend.set_keys(shallow_db->results, &txn); + txn = deep_db->driver.get_transaction(t); + deep_db->backend.set_keys(deep_db->results, &txn); } - errors_db->results.clear(); + + shallow_db->results.clear(); + deep_db->results.clear(); } @@ -184,18 +242,23 @@ void Store::clear_level_db( void Store::reinit( ObjectStore::Transaction* t, - [[maybe_unused]] scrub_level_t level) + scrub_level_t level) { + // Note: only one caller, and it creates the transaction passed to reinit(). + // No need to assert on 't' dout(20) << fmt::format( "re-initializing the Scrub::Store (for {} scrub)", (level == scrub_level_t::deep ? "deep" : "shallow")) << dendl; - // Note: only one caller, and it creates the transaction passed to reinit(). - // No need to assert on 't' - - if (errors_db) { - clear_level_db(t, *errors_db, "scrub"); + // always clear the known shallow errors DB (as both shallow and deep scrubs + // would recreate it) + if (shallow_db) { + clear_level_db(t, *shallow_db, "shallow"); + } + // only a deep scrub recreates the deep errors DB + if (level == scrub_level_t::deep && deep_db) { + clear_level_db(t, *deep_db, "deep"); } } @@ -204,8 +267,10 @@ void Store::cleanup(ObjectStore::Transaction* t) { dout(20) << "discarding error DBs" << dendl; ceph_assert(t); - if (errors_db) - t->remove(coll, errors_db->errors_hoid); + if (shallow_db) + t->remove(coll, shallow_db->errors_hoid); + if (deep_db) + t->remove(coll, deep_db->errors_hoid); } @@ -214,42 +279,180 @@ std::vector Store::get_snap_errors( const librados::object_id_t& start, uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_snap_key(pool) : to_snap_key(pool, start)); + vector errors; + const string begin = + (start.name.empty() ? 
first_snap_key(pool) : to_snap_key(pool, start)); const string end = last_snap_key(pool); - return get_errors(begin, end, max_return); + + // the snap errors are stored only in the shallow store + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(begin); + + while (max_return-- && latest_sh.has_value() && latest_sh->last_key < end) { + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } + + return errors; } -std::vector -Store::get_object_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const + +std::vector Store::get_object_errors( + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_object_key(pool) : to_object_key(pool, start)); + const string begin = + (start.name.empty() ? first_object_key(pool) + : to_object_key(pool, start)); const string end = last_object_key(pool); + dout(20) << fmt::format("fetching errors, from {} to {}", begin, end) + << dendl; return get_errors(begin, end, max_return); } -std::vector -Store::get_errors(const string& begin, - const string& end, - uint64_t max_return) const + +inline void decode( + librados::inconsistent_obj_t& obj, + ceph::buffer::list::const_iterator& bp) { + reinterpret_cast(obj).decode(bp); +} + + +inconsistent_obj_wrapper decode_wrapper( + hobject_t obj, + ceph::buffer::list::const_iterator bp) +{ + inconsistent_obj_wrapper iow{obj}; + iow.decode(bp); + return iow; +} + + +void Store::collect_specific_store( + MapCacher::MapCacher& backend, + Store::ExpCacherPosData& latest, + std::vector& errors, + std::string_view end_key, + uint64_t max_return) const +{ + while (max_return-- && latest.has_value() && + latest.value().last_key < end_key) { + errors.push_back(latest->data); + latest = backend.get_1st_after_key(latest->last_key); + } +} + + +bufferlist Store::merge_encoded_error_wrappers( + hobject_t obj, + ExpCacherPosData& latest_sh, + ExpCacherPosData& latest_dp) const +{ + // decode both error wrappers + auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin()); + auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin()); + dout(20) << fmt::format( + "merging errors {}. Shallow: {}-({}), Deep: {}-({})", + sh_wrap.object, sh_wrap.errors, dp_wrap.errors, sh_wrap, + dp_wrap) + << dendl; + + // merge the object errors (a simple OR of the two error bit-sets) + sh_wrap.errors |= dp_wrap.errors; + + // merge the two shard error maps + for (const auto& [shard, si] : dp_wrap.shards) { + dout(20) << fmt::format( + "shard {} dp-errors: {} sh-errors:{}", shard, si.errors, + sh_wrap.shards[shard].errors) + << dendl; + // note: we may be creating the shallow shard entry here. This is OK + sh_wrap.shards[shard].errors |= si.errors; + } + + return sh_wrap.encode(); +} + + +// a better way to implement get_errors(): use two generators, one for each store. +// and sort-merge the results. Almost like a merge-sort, but with equal +// keys combined. 'todo' once 'ranges' are really working. 
+ +std::vector Store::get_errors( + const std::string& from_key, + const std::string& end_key, + uint64_t max_return) const +{ + // merge the input from the two sorted DBs into 'errors' (until + // enough errors are collected) vector errors; - if (!errors_db) - return errors; + dout(20) << fmt::format("getting errors from {} to {}", from_key, end_key) + << dendl; - auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !errors_db->backend.get_next(next.first, &next)) { - if (next.first >= end) + ceph_assert(shallow_db); + ceph_assert(deep_db); + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(from_key); + ExpCacherPosData latest_dp = deep_db->backend.get_1st_after_key(from_key); + + while (max_return) { + dout(20) << fmt::format( + "n:{} latest_sh: {}, latest_dp: {}", max_return, + (latest_sh ? latest_sh->last_key : "(none)"), + (latest_dp ? latest_dp->last_key : "(none)")) + << dendl; + + // keys not smaller than end_key are not interesting + if (latest_sh.has_value() && latest_sh->last_key >= end_key) { + latest_sh = tl::unexpected(-EINVAL); + } + if (latest_dp.has_value() && latest_dp->last_key >= end_key) { + latest_dp = tl::unexpected(-EINVAL); + } + + if (!latest_sh && !latest_dp) { + // both stores are exhausted + break; + } + if (!latest_sh.has_value()) { + // continue with the deep store + dout(10) << fmt::format("collecting from deep store") << dendl; + collect_specific_store( + deep_db->backend, latest_dp, errors, end_key, max_return); break; - errors.push_back(next.second); + } + if (!latest_dp.has_value()) { + // continue with the shallow store + dout(10) << fmt::format("collecting from shallow store") << dendl; + collect_specific_store( + shallow_db->backend, latest_sh, errors, end_key, max_return); + break; + } + + // we have results from both stores. Select the one with a lower key. + // If the keys are equal, combine the errors. + if (latest_sh->last_key == latest_dp->last_key) { + auto bl = merge_encoded_error_wrappers( + shallow_db->errors_hoid.hobj, latest_sh, latest_dp); + errors.push_back(bl); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + + } else if (latest_sh->last_key < latest_dp->last_key) { + dout(20) << fmt::format("shallow store element ({})", latest_sh->last_key) + << dendl; + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } else { + dout(20) << fmt::format("deep store element ({})", latest_dp->last_key) + << dendl; + errors.push_back(latest_dp->data); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + } max_return--; } dout(10) << fmt::format("{} errors reported", errors.size()) << dendl; return errors; } - -} // namespace Scrub +} // namespace Scrub diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 9eb77ab667db7..8a30e8daf8569 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -16,6 +16,28 @@ class PgScrubber; namespace Scrub { +/** + * Storing errors detected during scrubbing. + * + * From both functional and internal perspectives, the store is a pair of key-value + * databases: one maps objects to shallow errors detected during their scrubbing, + * and other stores deep errors. + * Note that the first store is updated in both shallow and in deep scrubs. The + * second - only while deep scrubbing. 
+ * + * The DBs can be consulted by the operator, when trying to list 'errors known + * at this point in time'. Whenever a scrub starts - the relevant entries in the + * DBs are removed. Specifically - the shallow errors DB is recreated each scrub, + * while the deep errors DB is recreated only when a deep scrub starts. + * + * When queried - the data from both DBs is merged for each named object, and + * returned to the operator. + * + * Implementation: + * Each of the two DBs is implemented as OMAP entries of a single, uniquely named, + * object. Both DBs are cached using the general KV Cache mechanism. + */ + class Store { public: ~Store(); @@ -114,14 +136,21 @@ class Store { * allocations; and 'mutable', as the caching mechanism is used in const * methods) */ - mutable std::optional errors_db; - // not yet: mutable std::optional deep_db; + mutable std::optional shallow_db; + mutable std::optional deep_db; std::vector get_errors( const std::string& start, const std::string& end, uint64_t max_return) const; + void collect_specific_store( + MapCacher::MapCacher& backend, + ExpCacherPosData& latest, + std::vector& errors, + std::string_view end_key, + uint64_t max_return) const; + /** * Clear the DB of errors at a specific scrub level by performing an * omap_clear() on the DB object, and resetting the MapCacher. @@ -131,5 +160,14 @@ class Store { at_level_t& db, std::string_view db_name); + /** + * merge the two error wrappers - fetched from both DBs for the same object. + * Specifically, the object errors are or'ed, and so are the per-shard + * entries. + */ + bufferlist merge_encoded_error_wrappers( + hobject_t obj, + ExpCacherPosData& latest_sh, + ExpCacherPosData& latest_dp) const; }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 81093666f91c8..594ffb15e2b5b 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1209,7 +1209,7 @@ void PgScrubber::reinit_scrub_store() // actual Object Store objects). // 1. The ScrubStore object itself. // 2,3. The two special hobjects in the coll (the PG data) holding the last - // scrub's results. <> + // scrub's results. // // The Store object can be deleted and recreated, as a way to guarantee // no junk is left. We won't do it here, but we will clear the at_level_t From 47ef574bee6fc43850e9da9c0b9b6c4a34d58dae Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 7 Oct 2024 01:49:18 -0500 Subject: [PATCH 092/148] qa/standalone/scrub: test new ScrubStore implementation The ScrubStore is now comprised of two separate data structures, one for shallow errors and one for deep. A new test is added to verify the main objective of that design change: shallow scrubs should not overwrite deep scrub data. 
Signed-off-by: Ronen Friedman --- qa/standalone/scrub/osd-scrub-repair.sh | 249 +++++++++++++++++++++++- 1 file changed, 248 insertions(+), 1 deletion(-) diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh index 59564f7e37e28..491e46603f72e 100755 --- a/qa/standalone/scrub/osd-scrub-repair.sh +++ b/qa/standalone/scrub/osd-scrub-repair.sh @@ -442,7 +442,6 @@ function TEST_auto_repair_bluestore_basic() { ['pool_name']="testpool" ['extras']=" --osd_scrub_auto_repair=true" ) - local extr_dbg=3 standard_scrub_cluster $dir cluster_conf local poolid=${cluster_conf['pool_id']} local poolname=${cluster_conf['pool_name']} @@ -6252,6 +6251,254 @@ function TEST_request_scrub_priority() { grep "log_channel.*scrub ok" $dir/osd.${primary}.log | grep -v purged_snaps | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1 } +# +# Testing the "split scrub store" feature: shallow scrubs do not +# purge deep errors from the store. +# +# Corrupt one copy of a replicated pool, creating both shallow and deep errors. +# Then shallow-scrub the pool and verify that the deep errors are still present. +# +function TEST_dual_store_replicated_cluster() { + local dir=$1 + local poolname=csr_pool + local total_objs=19 + local extr_dbg=1 # note: 3 and above leave some temp files around + + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " + ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 " + ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_op_queue=wpq --osd_scrub_auto_repair=0 " + for osd in $(seq 0 1) + do + run_osd $dir $osd $ceph_osd_args || return 1 + done + + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_pool foo 1 || return 1 + create_pool $poolname 1 1 || return 1 + wait_for_clean || return 1 + + ceph osd pool set $poolname noscrub 1 + ceph osd pool set $poolname nodeep-scrub 1 + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + add_something $dir $poolname $objname || return 1 + + rados --pool $poolname setomapheader $objname hdr-$objname || return 1 + rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1 + done + + # Increase file 1 MB + 1KB + dd if=/dev/zero of=$dir/new.ROBJ19 bs=1024 count=1025 + rados --pool $poolname put $objname $dir/new.ROBJ19 || return 1 + rm -f $dir/new.ROBJ19 + + local pg=$(get_pg $poolname ROBJ0) + local primary=$(get_primary $poolname ROBJ0) + + # Compute an old omap digest and save oi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \ + config set osd_deep_scrub_update_digest_min_age 0 + CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \ + config set osd_deep_scrub_update_digest_min_age 0 + pg_deep_scrub $pg + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + + # Alternate corruption between osd.0 and osd.1 + local osd=$(expr $i % 2) + + case $i in + 1) + # Size (deep scrub data_digest too) + local payload=UVWXYZZZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 2) + # digest (deep scrub only) + local payload=UVWXYZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 3) + # missing + objectstore_tool $dir $osd $objname remove || return 1 + ;; + + 4) + # Modify omap value (deep scrub only) + objectstore_tool $dir $osd $objname set-omap key-$objname 
$dir/CORRUPT || return 1 + ;; + + 5) + # Delete omap key (deep scrub only) + objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1 + ;; + + 6) + # Add extra omap key (deep scrub only) + echo extra > $dir/extra-val + objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1 + rm $dir/extra-val + ;; + + 7) + # Modify omap header (deep scrub only) + echo -n newheader > $dir/hdr + objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1 + rm $dir/hdr + ;; + + 8) + rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1 + rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1 + + # Break xattrs + echo -n bad-val > $dir/bad-val + objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1 + objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1 + echo -n val3-$objname > $dir/newval + objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1 + rm $dir/bad-val $dir/newval + ;; + + 9) + objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi + echo -n D > $dir/change + rados --pool $poolname put $objname $dir/change + objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi + rm $dir/oi $dir/change + ;; + + # ROBJ10 must be handled after digests are re-computed by a deep scrub below + # ROBJ11 must be handled with config change before deep scrub + # ROBJ12 must be handled with config change before scrubs + # ROBJ13 must be handled before scrubs + + 14) + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1 + objectstore_tool $dir 1 $objname rm-attr _ || return 1 + rm $dir/bad-val + ;; + + 15) + objectstore_tool $dir $osd $objname rm-attr _ || return 1 + ;; + + 16) + objectstore_tool $dir 0 $objname rm-attr snapset || return 1 + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1 + ;; + + 17) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ17 + echo $payload > $dir/new.ROBJ17 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1 + ;; + + 18) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ18 + echo $payload > $dir/new.ROBJ18 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1 + # Make one replica have a different object info, so a full repair must happen too + objectstore_tool $dir $osd $objname corrupt-info || return 1 + ;; + + 19) + # Set osd-max-object-size smaller than this object's size + + esac + done + + local pg=$(get_pg $poolname ROBJ0) + + ceph tell osd.\* injectargs -- --osd-max-object-size=1048576 + + inject_eio rep data $poolname ROBJ11 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + inject_eio rep mdata $poolname ROBJ12 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + inject_eio rep data $poolname ROBJ13 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + + # first sequence: the final shallow scrub should not override any of the deep errors + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1.json + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' 
> /tmp/WQR_1b.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh1_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_1b_s.json + + pg_deep_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_2.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dp_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_2s.json + + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_3.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh2_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_3s.json + + diff -u $dir/dp_results.json $dir/sh2_results.json || return 1 + + # inject a read error, which is a special case: the scrub encountering the read error + # would override the previously collected shard info. + inject_eio rep mdata $poolname ROBJ13 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + + pg_deep_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_4.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_4s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > $dir/dpPart2_w13_results.json + # Remove the entry with "name":"ROBJ13" from the $dir/d*_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dpPart2_wo13_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s.json + + pg_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' 
> /tmp/WQR_5.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_5s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_5s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > \ + $dir/sh2Part2_w13_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/shPart2_wo13_results.json + + # the shallow scrub results should differ from the results of the deep + # scrub preceding it, but the difference should be limited to ROBJ13 + diff -u $dir/dpPart2_w13_results.json $dir/sh2Part2_w13_results.json && return 1 + diff -u $dir/dpPart2_wo13_results.json $dir/shPart2_wo13_results.json || return 1 + + ceph osd pool rm $poolname $poolname --yes-i-really-really-mean-it + return 0 +} + main osd-scrub-repair "$@" From 4f1ef85c7204dec9df9853597024637ac2873762 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Sat, 5 Oct 2024 07:33:49 -0500 Subject: [PATCH 093/148] osd/scrub: modify ScrubStore contents retrieval A separate commit added a simple test to verify the new store implementation (creating both shallow & deep errors), scrubbing (step 1), deep scrubbing (step 2), then shallow scrubbing again (step 3). The test verifies that the results after step 2 include all shallow errors data (*), and that the results after step 3 include all deep errors data. The test highlighted the need to correctly partition and retrieve the "shards inconsistencies" and the "selected shard" data, which was not fully implemented in the previous commit. Thus, this commit adds the following: - add_object_error() no longer filters out data saved during deep scrubbing; it also filters less of the shallow scrubs "shards inconsistencies" data; - merge_encoded_error_wrappers() now merges the "shards inconsistencies" data correctly, handling the multiple scenarios possible. (*) note the special case of not being able to read the object's version during deep scrubbing (due to a read error). In this case - the data collected during the shallow scrub will not be reported. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 154 ++++++++++++++++++++++----------- src/osd/scrubber/ScrubStore.h | 2 + 2 files changed, 106 insertions(+), 50 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 9c680da0de16f..7f28ca2d642a8 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -142,49 +142,30 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) add_object_error(pool, e); } -namespace { - -inconsistent_obj_wrapper create_filtered_copy( - const inconsistent_obj_wrapper& obj, - uint64_t obj_err_mask, - uint64_t shard_err_mask) -{ - inconsistent_obj_wrapper dup = obj; - dup.errors &= obj_err_mask; - for (auto& [shard, si] : dup.shards) { - si.errors &= shard_err_mask; - } - return dup; -} - -} // namespace - void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { + using librados::obj_err_t; const auto key = to_object_key(pool, e.object); dout(20) << fmt::format( - "adding error for object {} ({}). 
Errors: {} ({}/{}) wr:{}", - e.object, key, librados::err_t{e.errors}, - librados::err_t{e.errors & librados::err_t::SHALLOW_ERRORS}, - librados::err_t{e.errors & librados::err_t::DEEP_ERRORS}, e) + "{}: adding error for object {} ({}). Errors: {} ({}/{}) " + "unfiltered:{}", + (current_level == scrub_level_t::deep ? "deep" : "shallow"), + e.object, key, obj_err_t{e.errors}, + obj_err_t{e.errors & obj_err_t::SHALLOW_ERRORS}, + obj_err_t{e.errors & obj_err_t::DEEP_ERRORS}, e) << dendl; - // divide the errors & shard errors into shallow and deep. - { - bufferlist bl; - create_filtered_copy( - e, librados::obj_err_t::SHALLOW_ERRORS, librados::err_t::SHALLOW_ERRORS) - .encode(bl); - shallow_db->results[key] = bl; - } - { - bufferlist bl; - create_filtered_copy( - e, librados::obj_err_t::DEEP_ERRORS, librados::err_t::DEEP_ERRORS) - .encode(bl); - deep_db->results[key] = bl; + if (current_level == scrub_level_t::deep) { + // not overriding the deep errors DB during shallow scrubs + deep_db->results[key] = e.encode(); } + + // only shallow errors are stored in the shallow DB + auto e_copy = e; + e_copy.errors &= librados::obj_err_t::SHALLOW_ERRORS; + e_copy.union_shards.errors &= librados::err_t::SHALLOW_ERRORS; + shallow_db->results[key] = e_copy.encode(); } @@ -251,6 +232,8 @@ void Store::reinit( (level == scrub_level_t::deep ? "deep" : "shallow")) << dendl; + current_level = level; + // always clear the known shallow errors DB (as both shallow and deep scrubs // would recreate it) if (shallow_db) { @@ -344,6 +327,15 @@ void Store::collect_specific_store( } +/* + * Implementation notes: + * - see https://github.com/ceph/ceph/commit/df3ff6dafeadb3822b35c424a890db9a14d7f60f + * for why we encode the shard_info_t in the store. + * - to maintain known shard_info-s created during a deep scrub (but only when + * needed), we use our knowledge of the level of the last scrub performed + * (current_level), and the object user version as encoded in the error + * structure. + */ bufferlist Store::merge_encoded_error_wrappers( hobject_t obj, ExpCacherPosData& latest_sh, @@ -352,26 +344,88 @@ bufferlist Store::merge_encoded_error_wrappers( // decode both error wrappers auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin()); auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin()); - dout(20) << fmt::format( - "merging errors {}. Shallow: {}-({}), Deep: {}-({})", - sh_wrap.object, sh_wrap.errors, dp_wrap.errors, sh_wrap, - dp_wrap) - << dendl; - // merge the object errors (a simple OR of the two error bit-sets) - sh_wrap.errors |= dp_wrap.errors; - - // merge the two shard error maps - for (const auto& [shard, si] : dp_wrap.shards) { + // note: the '20' level is just until we're sure the merging works as + // expected + if (g_conf()->subsys.should_gather()) { + dout(20) << fmt::format( + "merging errors {}. Deep: {:#x}-({})", sh_wrap.object, + dp_wrap.errors, dp_wrap) + << dendl; dout(20) << fmt::format( - "shard {} dp-errors: {} sh-errors:{}", shard, si.errors, - sh_wrap.shards[shard].errors) + "merging errors {}. Shallow: {:#x}-({})", sh_wrap.object, + sh_wrap.errors, sh_wrap) << dendl; - // note: we may be creating the shallow shard entry here. 
This is OK - sh_wrap.shards[shard].errors |= si.errors; + // dev: list the attributes: + for (const auto& [shard, si] : sh_wrap.shards) { + for (const auto& [attr, bl] : si.attrs) { + dout(20) << fmt::format(" shallow: shard {} attr: {}", shard, attr) + << dendl; + } + } + for (const auto& [shard, si] : dp_wrap.shards) { + for (const auto& [attr, bl] : si.attrs) { + dout(20) << fmt::format(" deep: shard {} attr: {}", shard, attr) + << dendl; + } + } + } + + // Actual merging of the shard map entries is only performed if the + // latest version is from the shallow scrub. + // Otherwise, the deep scrub, which (for the shards info) contains all data, + // and the shallow scrub is ignored. + if (current_level == scrub_level_t::shallow) { + // is the object data related to the same object version? + if (sh_wrap.version == dp_wrap.version) { + // combine the error information + dp_wrap.errors |= sh_wrap.errors; + for (const auto& [shard, si] : sh_wrap.shards) { + if (dp_wrap.shards.contains(shard)) { + dout(20) << fmt::format( + "-----> {}-{} combining: sh-errors: {} dp-errors:{}", + sh_wrap.object, shard, si, dp_wrap.shards[shard]) + << dendl; + const auto saved_er = dp_wrap.shards[shard].errors; + dp_wrap.shards[shard].selected_oi = si.selected_oi; + dp_wrap.shards[shard].primary = si.primary; + dp_wrap.shards[shard].errors |= saved_er; + + // the attributes: + for (const auto& [attr, bl] : si.attrs) { + if (!dp_wrap.shards[shard].attrs.contains(attr)) { + dout(20) << fmt::format( + "-----> {}-{} copying shallow attr: attr: {}", + sh_wrap.object, shard, attr) + << dendl; + dp_wrap.shards[shard].attrs[attr] = bl; + } + // otherwise - we'll ignore the shallow attr buffer + } + } else { + // the deep scrub data for this shard is missing. We take the shallow + // scrub data. + dp_wrap.shards[shard] = si; + } + } + } else if (sh_wrap.version > dp_wrap.version) { + if (false && dp_wrap.version == 0) { + // there was a read error in the deep scrub. The deep version + // shows as '0'. That's severe enough for us to ignore the shallow. + dout(10) << fmt::format("{} ignoring deep after read failure", + sh_wrap.object) + << dendl; + } else { + // There is a new shallow version of the object results. + // The deep data is for an older version of that object. + // There are multiple possibilities here, but for now we ignore the + // deep data. + dp_wrap = sh_wrap; + } + } } - return sh_wrap.encode(); + return dp_wrap.encode(); } diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 8a30e8daf8569..0955654d78e91 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -130,6 +130,8 @@ class Store { /// the collection (i.e. - the PG store) in which the errors are stored const coll_t coll; + scrub_level_t current_level; + /** * the machinery (backend details, cache, etc.) for storing both levels * of errors (note: 'optional' to allow delayed creation w/o dynamic From 0c4028a6a356ae8c6e7d6d646e96c8e38a114789 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Tue, 8 Oct 2024 08:25:56 -0500 Subject: [PATCH 094/148] qa/standalone/scrub: remove TEST_recovery_scrub_2 That test does no longer match the actual requirements and implementation of scrubbing. It was already deactivated in https://github.com/ceph/ceph/pull/59590. Here - it is fully removed, mainly for the sake of backporting. 
Signed-off-by: Ronen Friedman --- qa/standalone/scrub/osd-recovery-scrub.sh | 140 ---------------------- 1 file changed, 140 deletions(-) diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh index 4eac1106e8d3a..843e9b9901b9c 100755 --- a/qa/standalone/scrub/osd-recovery-scrub.sh +++ b/qa/standalone/scrub/osd-recovery-scrub.sh @@ -234,146 +234,6 @@ function wait_background_check() { return $return_code } -# osd_scrub_during_recovery=true make sure scrub happens -# update 26.8.24: the test should be redesigned. The current version is not -# reliable, and playing around with the timeouts and such won't fix the -# design issues. -function TEST_recovery_scrub_2() { - local dir=$1 - local poolname=test - return 0 - - TESTDATA="testdata.$$" - OSDS=8 - PGS=32 - OBJECTS=40 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1 - run_mgr $dir x --mgr_stats_period=1 || return 1 - local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0.1 " - ceph_osd_args+="--osd_scrub_backoff_ratio=0 " - ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 " - ceph_osd_args+="--osd_stats_update_period_scrubbing=2 " - ceph_osd_args+="--mgr_stats_period=1" - for osd in $(seq 0 $(expr $OSDS - 1)) - do - run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=1 \ - $ceph_osd_args || return 1 - done - - # Create a pool with $PGS pgs - create_pool $poolname $PGS $PGS - wait_for_clean || return 1 - poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }') - - dd if=/dev/urandom of=$TESTDATA bs=1M count=50 - for i in $(seq 1 $OBJECTS) - do - rados -p $poolname put obj${i} $TESTDATA - done - rm -f $TESTDATA - - ceph osd pool set $poolname size 3 - - ceph pg dump pgs - - # note that the following will be needed if the mclock scheduler is specified - ceph tell osd.* config get osd_mclock_override_recovery_settings - - # the '_max_active' is expected to be 0 - ceph tell osd.1 config get osd_recovery_max_active - # both next parameters are expected to be >=3 - ceph tell osd.1 config set osd_recovery_max_active_hdd 6 - ceph tell osd.1 config set osd_recovery_max_active_ssd 6 - ceph tell osd.1 config get osd_recovery_max_active_hdd - ceph tell osd.1 config get osd_recovery_max_active_ssd - - # Wait for recovery to start - count=0 - while(true) - do - #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]' - ceph pg dump pgs - if test $(ceph --format json pg dump pgs | - jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2 - then - break - fi - sleep 2 - if test "$count" -eq "10" - then - echo "Not enough recovery started simultaneously" - return 1 - fi - count=$(expr $count + 1) - done - ceph pg dump pgs - - pids="" - recov_scrub_count=0 - for pg in $(seq 0 $(expr $PGS - 1)) - do - run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg) - done - wait_background_check pids - return_code=$? - if [ $return_code -ne 0 ]; then return $return_code; fi - - ERRORS=0 - if test $recov_scrub_count -eq 0 - then - echo "No scrubs occurred while PG recovering" - ERRORS=$(expr $ERRORS + 1) - fi - - pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') - pid=$(cat $pidfile) - if ! 
kill -0 $pid - then - echo "OSD crash occurred" - #tail -100 $dir/osd.0.log - ERRORS=$(expr $ERRORS + 1) - fi - - # Work around for http://tracker.ceph.com/issues/38195 - kill_daemons $dir #|| return 1 - - declare -a err_strings - ## we do not expect a refusal to scrub - err_strings[0]="recovery in progress.*scrubs" - for osd in $(seq 0 $(expr $OSDS - 1)) - do - grep "recovery in progress.*scrubs" $dir/osd.${osd}.log - done - for err_string in "${err_strings[@]}" - do - found=false - for osd in $(seq 0 $(expr $OSDS - 1)) - do - if grep "$err_string" $dir/osd.${osd}.log > /dev/null; - then - found=true - fi - done - if [ "$found" = "true" ]; then - echo "Found log message not expected '$err_string'" - ERRORS=$(expr $ERRORS + 1) - fi - done - - teardown $dir || return 1 - - if [ $ERRORS != "0" ]; - then - echo "TEST FAILED WITH $ERRORS ERRORS" - return 1 - fi - - echo "TEST PASSED" - return 0 -} - main osd-recovery-scrub "$@" # Local Variables: From 0a867d149e9f7783b5e703f348b5c7e6afc099fa Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Tue, 1 Oct 2024 10:39:37 +0200 Subject: [PATCH 095/148] mgr/cephadm: adding more UT for mgmt-gateway service Signed-off-by: Redouane Kachach --- src/pybind/mgr/cephadm/tests/test_services.py | 489 +++++++++++++++++- 1 file changed, 483 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a9b7da624a0e6..a05c87ce3c3a9 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -49,9 +49,9 @@ cephadm_root_ca = """-----BEGIN CERTIFICATE-----\\nMIIE7DCCAtSgAwIBAgIUE8b2zZ64geu2ns3Zfn3/4L+Cf6MwDQYJKoZIhvcNAQEL\\nBQAwFzEVMBMGA1UEAwwMY2VwaGFkbS1yb290MB4XDTI0MDYyNjE0NDA1M1oXDTM0\\nMDYyNzE0NDA1M1owFzEVMBMGA1UEAwwMY2VwaGFkbS1yb290MIICIjANBgkqhkiG\\n9w0BAQEFAAOCAg8AMIICCgKCAgEAsZRJsdtTr9GLG1lWFql5SGc46ldFanNJd1Gl\\nqXq5vgZVKRDTmNgAb/XFuNEEmbDAXYIRZolZeYKMHfn0pouPRSel0OsC6/02ZUOW\\nIuN89Wgo3IYleCFpkVIumD8URP3hwdu85plRxYZTtlruBaTRH38lssyCqxaOdEt7\\nAUhvYhcMPJThB17eOSQ73mb8JEC83vB47fosI7IhZuvXvRSuZwUW30rJanWNhyZq\\neS2B8qw2RSO0+77H6gA4ftBnitfsE1Y8/F9Z/f92JOZuSMQXUB07msznPbRJia3f\\nueO8gOc32vxd1A1/Qzp14uX34yEGY9ko2lW226cZO29IVUtXOX+LueQttwtdlpz8\\ne6Npm09pXhXAHxV/OW3M28MdXmobIqT/m9MfkeAErt5guUeC5y8doz6/3VQRjFEn\\nRpN0WkblgnNAQ3DONPc+Qd9Fi/wZV2X7bXoYpNdoWDsEOiE/eLmhG1A2GqU/mneP\\nzQ6u79nbdwTYpwqHpa+PvusXeLfKauzI8lLUJotdXy9EK8iHUofibB61OljYye6B\\nG3b8C4QfGsw8cDb4APZd/6AZYyMx/V3cGZ+GcOV7WvsC8k7yx5Uqasm/kiGQ3EZo\\nuNenNEYoGYrjb8D/8QzqNUTwlEh27/ps80tO7l2GGTvWVZL0PRZbmLDvO77amtOf\\nOiRXMoUCAwEAAaMwMC4wGwYDVR0RBBQwEocQAAAAAAAAAAAAAAAAAAAAATAPBgNV\\nHRMBAf8EBTADAQH/MA0GCSqGSIb3DQEBCwUAA4ICAQAxwzX5AhYEWhTV4VUwUj5+\\nqPdl4Q2tIxRokqyE+cDxoSd+6JfGUefUbNyBxDt0HaBq8obDqqrbcytxnn7mpnDu\\nhtiauY+I4Amt7hqFOiFA4cCLi2mfok6g2vL53tvhd9IrsfflAU2wy7hL76Ejm5El\\nA+nXlkJwps01Whl9pBkUvIbOn3pXX50LT4hb5zN0PSu957rjd2xb4HdfuySm6nW4\\n4GxtVWfmGA6zbC4XMEwvkuhZ7kD2qjkAguGDF01uMglkrkCJT3OROlNBuSTSBGqt\\ntntp5VytHvb7KTF7GttM3ha8/EU2KYaHM6WImQQTrOfiImAktOk4B3lzUZX3HYIx\\n+sByO4P4dCvAoGz1nlWYB2AvCOGbKf0Tgrh4t4jkiF8FHTXGdfvWmjgi1pddCNAy\\nn65WOCmVmLZPERAHOk1oBwqyReSvgoCFo8FxbZcNxJdlhM0Z6hzKggm3O3Dl88Xl\\n5euqJjh2STkBW8Xuowkg1TOs5XyWvKoDFAUzyzeLOL8YSG+gXV22gPTUaPSVAqdb\\nwd0Fx2kjConuC5bgTzQHs8XWA930U3XWZraj21Vaa8UxlBLH4fUro8H5lMSYlZNE\\nJHRNW8BkznAClaFSDG3dybLsrzrBFAu/Qb5zVkT1xyq0YkepGB7leXwq6vjWA5Pw\\nmZbKSphWfh0qipoqxqhfkw==\\n-----END CERTIFICATE-----\\n""" -ceph_generated_cert = """-----BEGIN 
CERTIFICATE-----\nMIICxjCCAa4CEQDIZSujNBlKaLJzmvntjukjMA0GCSqGSIb3DQEBDQUAMCExDTAL\nBgNVBAoMBENlcGgxEDAOBgNVBAMMB2NlcGhhZG0wHhcNMjIwNzEzMTE0NzA3WhcN\nMzIwNzEwMTE0NzA3WjAhMQ0wCwYDVQQKDARDZXBoMRAwDgYDVQQDDAdjZXBoYWRt\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyyMe4DMA+MeYK7BHZMHB\nq7zjliEOcNgxomjU8qbf5USF7Mqrf6+/87XWqj4pCyAW8x0WXEr6A56a+cmBVmt+\nqtWDzl020aoId6lL5EgLLn6/kMDCCJLq++Lg9cEofMSvcZh+lY2f+1p+C+00xent\nrLXvXGOilAZWaQfojT2BpRnNWWIFbpFwlcKrlg2G0cFjV5c1m6a0wpsQ9JHOieq0\nSvwCixajwq3CwAYuuiU1wjI4oJO4Io1+g8yB3nH2Mo/25SApCxMXuXh4kHLQr/T4\n4hqisvG4uJYgKMcSIrWj5o25mclByGi1UI/kZkCUES94i7Z/3ihx4Bad0AMs/9tw\nFwIDAQABMA0GCSqGSIb3DQEBDQUAA4IBAQAf+pwz7Gd7mDwU2LY0TQXsK6/8KGzh\nHuX+ErOb8h5cOAbvCnHjyJFWf6gCITG98k9nxU9NToG0WYuNm/max1y/54f0dtxZ\npUo6KSNl3w6iYCfGOeUIj8isi06xMmeTgMNzv8DYhDt+P2igN6LenqWTVztogkiV\nxQ5ZJFFLEw4sN0CXnrZX3t5ruakxLXLTLKeE0I91YJvjClSBGkVJq26wOKQNHMhx\npWxeydQ5EgPZY+Aviz5Dnxe8aB7oSSovpXByzxURSabOuCK21awW5WJCGNpmqhWK\nZzACBDEstccj57c4OGV0eayHJRsluVr2e9NHRINZA3qdB37e6gsI1xHo\n-----END CERTIFICATE-----\n""" +ceph_generated_cert = """-----BEGIN CERTIFICATE-----\\nMIICxjCCAa4CEQDIZSujNBlKaLJzmvntjukjMA0GCSqGSIb3DQEBDQUAMCExDTAL\\nBgNVBAoMBENlcGgxEDAOBgNVBAMMB2NlcGhhZG0wHhcNMjIwNzEzMTE0NzA3WhcN\\nMzIwNzEwMTE0NzA3WjAhMQ0wCwYDVQQKDARDZXBoMRAwDgYDVQQDDAdjZXBoYWRt\\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyyMe4DMA+MeYK7BHZMHB\\nq7zjliEOcNgxomjU8qbf5USF7Mqrf6+/87XWqj4pCyAW8x0WXEr6A56a+cmBVmt+\\nqtWDzl020aoId6lL5EgLLn6/kMDCCJLq++Lg9cEofMSvcZh+lY2f+1p+C+00xent\\nrLXvXGOilAZWaQfojT2BpRnNWWIFbpFwlcKrlg2G0cFjV5c1m6a0wpsQ9JHOieq0\\nSvwCixajwq3CwAYuuiU1wjI4oJO4Io1+g8yB3nH2Mo/25SApCxMXuXh4kHLQr/T4\\n4hqisvG4uJYgKMcSIrWj5o25mclByGi1UI/kZkCUES94i7Z/3ihx4Bad0AMs/9tw\\nFwIDAQABMA0GCSqGSIb3DQEBDQUAA4IBAQAf+pwz7Gd7mDwU2LY0TQXsK6/8KGzh\\nHuX+ErOb8h5cOAbvCnHjyJFWf6gCITG98k9nxU9NToG0WYuNm/max1y/54f0dtxZ\\npUo6KSNl3w6iYCfGOeUIj8isi06xMmeTgMNzv8DYhDt+P2igN6LenqWTVztogkiV\\nxQ5ZJFFLEw4sN0CXnrZX3t5ruakxLXLTLKeE0I91YJvjClSBGkVJq26wOKQNHMhx\\npWxeydQ5EgPZY+Aviz5Dnxe8aB7oSSovpXByzxURSabOuCK21awW5WJCGNpmqhWK\\nZzACBDEstccj57c4OGV0eayHJRsluVr2e9NHRINZA3qdB37e6gsI1xHo\\n-----END CERTIFICATE-----\\n""" -ceph_generated_key = """-----BEGIN PRIVATE 
KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDLIx7gMwD4x5gr\nsEdkwcGrvOOWIQ5w2DGiaNTypt/lRIXsyqt/r7/ztdaqPikLIBbzHRZcSvoDnpr5\nyYFWa36q1YPOXTbRqgh3qUvkSAsufr+QwMIIkur74uD1wSh8xK9xmH6VjZ/7Wn4L\n7TTF6e2ste9cY6KUBlZpB+iNPYGlGc1ZYgVukXCVwquWDYbRwWNXlzWbprTCmxD0\nkc6J6rRK/AKLFqPCrcLABi66JTXCMjigk7gijX6DzIHecfYyj/blICkLExe5eHiQ\nctCv9PjiGqKy8bi4liAoxxIitaPmjbmZyUHIaLVQj+RmQJQRL3iLtn/eKHHgFp3Q\nAyz/23AXAgMBAAECggEAVoTB3Mm8azlPlaQB9GcV3tiXslSn+uYJ1duCf0sV52dV\nBzKW8s5fGiTjpiTNhGCJhchowqxoaew+o47wmGc2TvqbpeRLuecKrjScD0GkCYyQ\neM2wlshEbz4FhIZdgS6gbuh9WaM1dW/oaZoBNR5aTYo7xYTmNNeyLA/jO2zr7+4W\n5yES1lMSBXpKk7bDGKYY4bsX2b5RLr2Grh2u2bp7hoLABCEvuu8tSQdWXLEXWpXo\njwmV3hc6tabypIa0mj2Dmn2Dmt1ppSO0AZWG/WAizN3f4Z0r/u9HnbVrVmh0IEDw\n3uf2LP5o3msG9qKCbzv3lMgt9mMr70HOKnJ8ohMSKQKBgQDLkNb+0nr152HU9AeJ\nvdz8BeMxcwxCG77iwZphZ1HprmYKvvXgedqWtS6FRU+nV6UuQoPUbQxJBQzrN1Qv\nwKSlOAPCrTJgNgF/RbfxZTrIgCPuK2KM8I89VZv92TSGi362oQA4MazXC8RAWjoJ\nSu1/PHzK3aXOfVNSLrOWvIYeZQKBgQD/dgT6RUXKg0UhmXj7ExevV+c7oOJTDlMl\nvLngrmbjRgPO9VxLnZQGdyaBJeRngU/UXfNgajT/MU8B5fSKInnTMawv/tW7634B\nw3v6n5kNIMIjJmENRsXBVMllDTkT9S7ApV+VoGnXRccbTiDapBThSGd0wri/CuwK\nNWK1YFOeywKBgEDyI/XG114PBUJ43NLQVWm+wx5qszWAPqV/2S5MVXD1qC6zgCSv\nG9NLWN1CIMimCNg6dm7Wn73IM7fzvhNCJgVkWqbItTLG6DFf3/DPODLx1wTMqLOI\nqFqMLqmNm9l1Nec0dKp5BsjRQzq4zp1aX21hsfrTPmwjxeqJZdioqy2VAoGAXR5X\nCCdSHlSlUW8RE2xNOOQw7KJjfWT+WAYoN0c7R+MQplL31rRU7dpm1bLLRBN11vJ8\nMYvlT5RYuVdqQSP6BkrX+hLJNBvOLbRlL+EXOBrVyVxHCkDe+u7+DnC4epbn+N8P\nLYpwqkDMKB7diPVAizIKTBxinXjMu5fkKDs5n+sCgYBbZheYKk5M0sIxiDfZuXGB\nkf4mJdEkTI1KUGRdCwO/O7hXbroGoUVJTwqBLi1tKqLLarwCITje2T200BYOzj82\nqwRkCXGtXPKnxYEEUOiFx9OeDrzsZV00cxsEnX0Zdj+PucQ/J3Cvd0dWUspJfLHJ\n39gnaegswnz9KMQAvzKFdg==\n-----END PRIVATE KEY-----\n""" +ceph_generated_key = """-----BEGIN PRIVATE KEY-----\\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDLIx7gMwD4x5gr\\nsEdkwcGrvOOWIQ5w2DGiaNTypt/lRIXsyqt/r7/ztdaqPikLIBbzHRZcSvoDnpr5\\nyYFWa36q1YPOXTbRqgh3qUvkSAsufr+QwMIIkur74uD1wSh8xK9xmH6VjZ/7Wn4L\\n7TTF6e2ste9cY6KUBlZpB+iNPYGlGc1ZYgVukXCVwquWDYbRwWNXlzWbprTCmxD0\\nkc6J6rRK/AKLFqPCrcLABi66JTXCMjigk7gijX6DzIHecfYyj/blICkLExe5eHiQ\\nctCv9PjiGqKy8bi4liAoxxIitaPmjbmZyUHIaLVQj+RmQJQRL3iLtn/eKHHgFp3Q\\nAyz/23AXAgMBAAECggEAVoTB3Mm8azlPlaQB9GcV3tiXslSn+uYJ1duCf0sV52dV\\nBzKW8s5fGiTjpiTNhGCJhchowqxoaew+o47wmGc2TvqbpeRLuecKrjScD0GkCYyQ\\neM2wlshEbz4FhIZdgS6gbuh9WaM1dW/oaZoBNR5aTYo7xYTmNNeyLA/jO2zr7+4W\\n5yES1lMSBXpKk7bDGKYY4bsX2b5RLr2Grh2u2bp7hoLABCEvuu8tSQdWXLEXWpXo\\njwmV3hc6tabypIa0mj2Dmn2Dmt1ppSO0AZWG/WAizN3f4Z0r/u9HnbVrVmh0IEDw\\n3uf2LP5o3msG9qKCbzv3lMgt9mMr70HOKnJ8ohMSKQKBgQDLkNb+0nr152HU9AeJ\\nvdz8BeMxcwxCG77iwZphZ1HprmYKvvXgedqWtS6FRU+nV6UuQoPUbQxJBQzrN1Qv\\nwKSlOAPCrTJgNgF/RbfxZTrIgCPuK2KM8I89VZv92TSGi362oQA4MazXC8RAWjoJ\\nSu1/PHzK3aXOfVNSLrOWvIYeZQKBgQD/dgT6RUXKg0UhmXj7ExevV+c7oOJTDlMl\\nvLngrmbjRgPO9VxLnZQGdyaBJeRngU/UXfNgajT/MU8B5fSKInnTMawv/tW7634B\\nw3v6n5kNIMIjJmENRsXBVMllDTkT9S7ApV+VoGnXRccbTiDapBThSGd0wri/CuwK\\nNWK1YFOeywKBgEDyI/XG114PBUJ43NLQVWm+wx5qszWAPqV/2S5MVXD1qC6zgCSv\\nG9NLWN1CIMimCNg6dm7Wn73IM7fzvhNCJgVkWqbItTLG6DFf3/DPODLx1wTMqLOI\\nqFqMLqmNm9l1Nec0dKp5BsjRQzq4zp1aX21hsfrTPmwjxeqJZdioqy2VAoGAXR5X\\nCCdSHlSlUW8RE2xNOOQw7KJjfWT+WAYoN0c7R+MQplL31rRU7dpm1bLLRBN11vJ8\\nMYvlT5RYuVdqQSP6BkrX+hLJNBvOLbRlL+EXOBrVyVxHCkDe+u7+DnC4epbn+N8P\\nLYpwqkDMKB7diPVAizIKTBxinXjMu5fkKDs5n+sCgYBbZheYKk5M0sIxiDfZuXGB\\nkf4mJdEkTI1KUGRdCwO/O7hXbroGoUVJTwqBLi1tKqLLarwCITje2T200BYOzj82\\nqwRkCXGtXPKnxYEEUOiFx9OeDrzsZV00cxsEnX0Zdj+PucQ/J3Cvd0dWUspJfLHJ\\n39gnaegswnz9KMQAvzKFdg==\\n-----END PRIVATE KEY-----\\n""" class FakeInventory: @@ -602,6 +602,101 @@ def 
test_alertmanager_config( use_current_daemon_image=False, ) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("socket.getfqdn") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') + @patch("cephadm.services.monitoring.password_hash", lambda password: 'alertmanager_password_hash') + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: 'cephadm_root_cert') + @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: ('mycert', 'mykey')) + def test_alertmanager_config_when_mgmt_gw_enabled(self, _get_fqdn, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) + + fqdn = 'host1.test' + _get_fqdn.return_value = fqdn + + with with_host(cephadm_module, 'test'): + cephadm_module.secure_monitoring_stack = True + cephadm_module.set_store(AlertmanagerService.USER_CFG_KEY, 'alertmanager_user') + cephadm_module.set_store(AlertmanagerService.PASS_CFG_KEY, 'alertmanager_plain_password') + with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, AlertManagerSpec()): + + y = dedent(""" + # This file is generated by cephadm. + # See https://prometheus.io/docs/alerting/configuration/ for documentation. + + global: + resolve_timeout: 5m + http_config: + tls_config: + ca_file: root_cert.pem + + route: + receiver: 'default' + routes: + - group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'ceph-dashboard' + + receivers: + - name: 'default' + webhook_configs: + - name: 'ceph-dashboard' + webhook_configs: + - url: 'https://host_fqdn:29443/internal/dashboard/api/prometheus_receiver' + """).lstrip() + + web_config = dedent(""" + tls_server_config: + cert_file: alertmanager.crt + key_file: alertmanager.key + client_auth_type: RequireAndVerifyClientCert + client_ca_file: root_cert.pem + basic_auth_users: + alertmanager_user: alertmanager_password_hash + """).lstrip() + + _run_cephadm.assert_called_with( + 'test', + "alertmanager.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'alertmanager.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9093, 9094], + }, + "meta": { + 'service_name': 'alertmanager', + 'ports': [9093, 9094], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": { + "alertmanager.yml": y, + 'alertmanager.crt': 'mycert', + 'alertmanager.key': 'mykey', + 'web.yml': web_config, + 'root_cert.pem': 'cephadm_root_cert' + }, + 'peers': [], + 'web_config': '/etc/alertmanager/web.yml', + "use_url_prefix": True, + } + }), + use_current_daemon_image=False, + ) + @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("socket.getfqdn") @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') @@ -738,6 +833,110 @@ def test_ceph_exporter_config_security_enabled(self, _get_fqdn, _run_cephadm, ce "ceph-exporter.key": "mykey"}}}), use_current_daemon_image=False) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("mgr_module.MgrModule.get") + @patch("socket.getfqdn") + def test_node_exporter_config_without_mgmt_gw( + self, + mock_getfqdn, + mock_get, + _run_cephadm, + cephadm_module: CephadmOrchestrator, + ): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + fqdn = 'host1.test' + mock_getfqdn.return_value = fqdn + + with with_host(cephadm_module, "test"): + with 
with_service(cephadm_module, MonitoringSpec('node-exporter')): + _run_cephadm.assert_called_with( + 'test', + "node-exporter.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'node-exporter.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9100], + }, + "meta": { + 'service_name': 'node-exporter', + 'ports': [9100], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": {} + }), + use_current_daemon_image=False, + ) + + @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: (ceph_generated_cert, ceph_generated_key)) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("socket.getfqdn") + def test_node_exporter_config_with_mgmt_gw( + self, + mock_getfqdn, + _run_cephadm, + cephadm_module: CephadmOrchestrator, + ): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + mock_getfqdn.return_value = 'host1.test' + + y = dedent(""" + tls_server_config: + cert_file: node_exporter.crt + key_file: node_exporter.key + client_auth_type: RequireAndVerifyClientCert + client_ca_file: root_cert.pem + """).lstrip() + + with with_host(cephadm_module, "test"): + with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, MonitoringSpec('node-exporter')): + _run_cephadm.assert_called_with( + 'test', + "node-exporter.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'node-exporter.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9100], + }, + "meta": { + 'service_name': 'node-exporter', + 'ports': [9100], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": { + "web.yml": y, + 'root_cert.pem': f"{cephadm_root_ca}", + 'node_exporter.crt': f"{ceph_generated_cert}", + 'node_exporter.key': f"{ceph_generated_key}", + }, + 'web_config': '/etc/node-exporter/web.yml', + } + }), + use_current_daemon_image=False, + ) + @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator): @@ -1240,6 +1439,286 @@ def test_promtail_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator use_current_daemon_image=False, ) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') + @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') + @patch("cephadm.services.monitoring.verify_tls", lambda *_: None) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + def test_grafana_config_with_mgmt_gw_and_ouath2_proxy(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + + y = dedent(f""" + # This file is generated by cephadm. 
+ apiVersion: 1 + + deleteDatasources: + - name: 'Dashboard1' + orgId: 1 + + datasources: + - name: 'Dashboard1' + type: 'prometheus' + access: 'proxy' + orgId: 1 + url: 'https://host_fqdn:29443/internal/prometheus' + basicAuth: true + isDefault: true + editable: false + basicAuthUser: admin + jsonData: + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: true + tlsSkipVerify: false + secureJsonData: + basicAuthPassword: admin + tlsCACert: "{cephadm_root_ca}" + tlsClientCert: "{ceph_generated_cert}" + tlsClientKey: "{ceph_generated_key}" + + - name: 'Loki' + type: 'loki' + access: 'proxy' + url: '' + basicAuth: false + isDefault: false + editable: false""").lstrip() + + oauth2_spec = OAuth2ProxySpec(provider_display_name='my_idp_provider', + client_id='my_client_id', + client_secret='my_client_secret', + oidc_issuer_url='http://192.168.10.10:8888/dex', + cookie_secret='kbAEM9opAmuHskQvt0AW8oeJRaOM2BYy5Loba0kZ0SQ=', + ssl_certificate=ceph_generated_cert, + ssl_certificate_key=ceph_generated_key) + + with with_host(cephadm_module, "test"): + cephadm_module.cert_key_store.save_cert('grafana_cert', ceph_generated_cert, host='test') + cephadm_module.cert_key_store.save_key('grafana_key', ceph_generated_key, host='test') + with with_service(cephadm_module, PrometheusSpec("prometheus")) as _, \ + with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, oauth2_spec) as _, \ + with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service( + cephadm_module, GrafanaSpec("grafana") + ) as _: + files = { + 'grafana.ini': dedent(""" + # This file is generated by cephadm. + [users] + default_theme = light + [auth.anonymous] + enabled = true + org_name = 'Main Org.' + org_role = 'Viewer' + [server] + domain = 'host_fqdn' + protocol = https + cert_file = /etc/grafana/certs/cert_file + cert_key = /etc/grafana/certs/cert_key + http_port = 3000 + http_addr = + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true + [snapshots] + external_enabled = false + [security] + disable_initial_admin_creation = true + cookie_secure = true + cookie_samesite = none + allow_embedding = true + [auth] + disable_login_form = true + [auth.proxy] + enabled = true + header_name = X-WEBAUTH-USER + header_property = username + auto_sign_up = true + sync_ttl = 15 + whitelist = 1::4 + headers_encoded = false + enable_login_token = false + headers = Role:X-WEBAUTH-ROLE\n""").lstrip(), # noqa: W291 + "provisioning/datasources/ceph-dashboard.yml": y, + 'certs/cert_file': dedent(f""" + # generated by cephadm\n{ceph_generated_cert}""").lstrip(), + 'certs/cert_key': dedent(f""" + # generated by cephadm\n{ceph_generated_key}""").lstrip(), + 'provisioning/dashboards/default.yml': dedent(""" + # This file is generated by cephadm. 
+ apiVersion: 1 + + providers: + - name: 'Ceph Dashboard' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 3 + editable: false + options: + path: '/etc/grafana/provisioning/dashboards'""").lstrip(), + } + + _run_cephadm.assert_called_with( + 'test', + "grafana.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'grafana.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [3000], + }, + "meta": { + 'service_name': 'grafana', + 'ports': [3000], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": files, + }, + }), + use_current_daemon_image=False, + ) + + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') + @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') + @patch("cephadm.services.monitoring.verify_tls", lambda *_: None) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + def test_grafana_config_with_mgmt_gw(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + + y = dedent(f""" + # This file is generated by cephadm. + apiVersion: 1 + + deleteDatasources: + - name: 'Dashboard1' + orgId: 1 + + datasources: + - name: 'Dashboard1' + type: 'prometheus' + access: 'proxy' + orgId: 1 + url: 'https://host_fqdn:29443/internal/prometheus' + basicAuth: true + isDefault: true + editable: false + basicAuthUser: admin + jsonData: + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: true + tlsSkipVerify: false + secureJsonData: + basicAuthPassword: admin + tlsCACert: "{cephadm_root_ca}" + tlsClientCert: "{ceph_generated_cert}" + tlsClientKey: "{ceph_generated_key}" + + - name: 'Loki' + type: 'loki' + access: 'proxy' + url: '' + basicAuth: false + isDefault: false + editable: false""").lstrip() + + with with_host(cephadm_module, "test"): + cephadm_module.cert_key_store.save_cert('grafana_cert', ceph_generated_cert, host='test') + cephadm_module.cert_key_store.save_key('grafana_key', ceph_generated_key, host='test') + with with_service( + cephadm_module, PrometheusSpec("prometheus") + ) as _, with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service( + cephadm_module, GrafanaSpec("grafana") + ) as _: + files = { + 'grafana.ini': dedent(""" + # This file is generated by cephadm. + [users] + default_theme = light + [auth.anonymous] + enabled = true + org_name = 'Main Org.' + org_role = 'Viewer' + [server] + domain = 'host_fqdn' + protocol = https + cert_file = /etc/grafana/certs/cert_file + cert_key = /etc/grafana/certs/cert_key + http_port = 3000 + http_addr = + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true + [snapshots] + external_enabled = false + [security] + disable_initial_admin_creation = true + cookie_secure = true + cookie_samesite = none + allow_embedding = true\n""").lstrip(), # noqa: W291 + "provisioning/datasources/ceph-dashboard.yml": y, + 'certs/cert_file': dedent(f""" + # generated by cephadm\n{ceph_generated_cert}""").lstrip(), + 'certs/cert_key': dedent(f""" + # generated by cephadm\n{ceph_generated_key}""").lstrip(), + 'provisioning/dashboards/default.yml': dedent(""" + # This file is generated by cephadm. 
+ apiVersion: 1 + + providers: + - name: 'Ceph Dashboard' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 3 + editable: false + options: + path: '/etc/grafana/provisioning/dashboards'""").lstrip(), + } + + _run_cephadm.assert_called_with( + 'test', + "grafana.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'grafana.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [3000], + }, + "meta": { + 'service_name': 'grafana', + 'ports': [3000], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": files, + }, + }), + use_current_daemon_image=False, + ) + @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') @@ -3296,7 +3775,7 @@ class TestMgmtGateway: @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https")) - def test_mgmt_gateway_config_no_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): + def test_mgmt_gw_config_no_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): def get_services_endpoints(name): if name == 'prometheus': @@ -3417,7 +3896,6 @@ def get_services_endpoints(name): } location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass https://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services @@ -3518,7 +3996,7 @@ def get_services_endpoints(name): @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https")) @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_oauth2_service_url", lambda _: "https://192.168.100.102:4180") - def test_mgmt_gateway_config_with_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): + def test_mgmt_gw_config_with_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): def get_services_endpoints(name): if name == 'prometheus': @@ -3689,7 +4167,6 @@ def get_services_endpoints(name): } location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass https://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services From 7e03ee798f4ed3aa4a0bb1a9e9d62df52e54406d Mon Sep 17 00:00:00 2001 From: Dan Mick Date: Thu, 10 Oct 2024 16:38:58 -0700 Subject: [PATCH 096/148] container/build.sh: fix arm architecture tagging The wrong string was used for comparison, and for tagging, so the arm64 branch and sha1 images overwrote and destroyed the amd64 images. 
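For reference, the intended suffixing rule, sketched in Python purely for
illustration (the real change is to the shell script below; the helper
name and signature here are invented):

    def arch_suffixed(tag: str, arch: str) -> str:
        # build.sh's ARCH uses the container-style name ("arm64" on ARM
        # builds), not the `uname -m` spelling ("aarch64"), so the old
        # comparison against "aarch64" never matched.
        return f"{tag}-arm64" if arch == "arm64" else tag

Because that branch never fired, arm64 builds pushed un-suffixed tags and
overwrote the amd64 images of the same name.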
Signed-off-by: Dan Mick --- container/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/container/build.sh b/container/build.sh index 7c97e2261c16f..5edf469d2d2e4 100755 --- a/container/build.sh +++ b/container/build.sh @@ -136,9 +136,9 @@ if [[ ${CI_CONTAINER} == "true" ]] ; then branch_repo_tag=$repopath/ceph:${BRANCH} sha1_repo_tag=$repopath/ceph:${CEPH_SHA1} - if [[ "${ARCH}" == "aarch64" ]] ; then - branch_repo_tag=${branch_repo_tag}-aarch64 - sha1_repo_tag=${sha1_repo_tag}-aarch64 + if [[ "${ARCH}" == "arm64" ]] ; then + branch_repo_tag=${branch_repo_tag}-arm64 + sha1_repo_tag=${sha1_repo_tag}-arm64 fi podman tag ${image_id} ${full_repo_tag} From ad147f2e8d820ff251e1499c1e4c3fe57d1a2082 Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Wed, 9 Oct 2024 19:32:49 +0530 Subject: [PATCH 097/148] mgr/cephadm: RGW service deployment defaults to 'default' realm/zonegroup/zone despite non-default spec in service When we create an RGW service using the ceph orch apply command, the service is always deployed in the default realm, zonegroup, and zone, even if we specify a different realm, zonegroup, or zone in the service spec. This happens because certain configuration values, like rgw_realm, rgw_zonegroup, and rgw_zone, need to be set for the RGW instances before the daemons are deployed. Currently, these configurations are being applied after the RGW daemons are deployed, which requires a service restart to reflect the correct realm, zonegroup, and zone. Ideally, these configurations should be applied before the RGW daemons are deployed, so they are correctly placed in the desired realm, zonegroup, and zone from the start. Fixes: https://tracker.ceph.com/issues/68461 Signed-off-by: Aashish Sharma --- src/pybind/mgr/cephadm/module.py | 1 + src/pybind/mgr/cephadm/serve.py | 4 ++++ src/pybind/mgr/cephadm/services/cephadmservice.py | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 5216c489064c9..afaf5d7846e3c 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -764,6 +764,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi']) self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof']) self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy']) + self.rgw_service: RgwService = cast(RgwService, self.cephadm_services['rgw']) self.scheduled_async_actions: List[Callable] = [] diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index c6212c9efb83d..611c27c34538a 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -950,6 +950,10 @@ def update_progress() -> None: ) continue + # set multisite config before deploying the rgw daemon + if service_type == 'rgw': + self.mgr.rgw_service.set_realm_zg_zone(cast(RGWSpec, spec)) + # deploy new daemon daemon_id = slot.name diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index eb9a1c838a656..9043577bc5a60 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -984,10 +984,9 @@ class RgwService(CephService): def allow_colo(self) -> bool: return True - def config(self, spec: RGWSpec) -> None: # type: ignore + def set_realm_zg_zone(self, spec: RGWSpec) -> None: assert 
self.TYPE == spec.service_type - # set rgw_realm rgw_zonegroup and rgw_zone, if present if spec.rgw_realm: ret, out, err = self.mgr.check_mon_command({ 'prefix': 'config set', @@ -1010,6 +1009,12 @@ def config(self, spec: RGWSpec) -> None: # type: ignore 'value': spec.rgw_zone, }) + def config(self, spec: RGWSpec) -> None: # type: ignore + assert self.TYPE == spec.service_type + + # set rgw_realm rgw_zonegroup and rgw_zone, if present + self.set_realm_zg_zone(spec) + if spec.generate_cert and not spec.rgw_frontend_ssl_certificate: # generate a self-signed cert for the rgw service cert, key = self.mgr.cert_mgr.ssl_certs.generate_root_cert(custom_san_list=spec.zonegroup_hostnames) From 86378344ab0a381569b116c2112a981404f93671 Mon Sep 17 00:00:00 2001 From: Nizamudeen A Date: Fri, 20 Sep 2024 20:35:38 +0530 Subject: [PATCH 098/148] mgr/dashboard: introduce server side pagination for osds Fixes: https://tracker.ceph.com/issues/56511 Signed-off-by: Nizamudeen A --- qa/tasks/mgr/dashboard/test_osd.py | 5 +- src/pybind/mgr/dashboard/controllers/osd.py | 28 ++++++++- .../osd/osd-list/osd-list.component.html | 6 +- .../osd/osd-list/osd-list.component.spec.ts | 59 +++++++++++-------- .../osd/osd-list/osd-list.component.ts | 14 +++-- .../src/app/shared/api/osd.service.spec.ts | 6 +- .../src/app/shared/api/osd.service.ts | 11 +++- .../src/app/shared/api/paginate.model.ts | 2 +- .../shared/classes/paginate-params.class.ts | 15 +++++ .../models/cd-table-fetch-data-context.ts | 2 +- .../src/app/shared/models/osd.model.ts | 49 +++++++++++++++ src/pybind/mgr/dashboard/openapi.yaml | 24 +++++++- src/pybind/mgr/dashboard/tests/test_osd.py | 3 +- 13 files changed, 178 insertions(+), 46 deletions(-) create mode 100644 src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts create mode 100644 src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts diff --git a/qa/tasks/mgr/dashboard/test_osd.py b/qa/tasks/mgr/dashboard/test_osd.py index 07c69ddc47cb6..be7afccf33176 100644 --- a/qa/tasks/mgr/dashboard/test_osd.py +++ b/qa/tasks/mgr/dashboard/test_osd.py @@ -11,6 +11,7 @@ class OsdTest(DashboardTestCase): AUTH_ROLES = ['cluster-manager'] + _VERSION = '1.1' @classmethod def setUpClass(cls): @@ -24,7 +25,7 @@ def tearDown(self): @DashboardTestCase.RunAs('test', 'test', ['block-manager']) def test_access_permissions(self): - self._get('/api/osd') + self._get('/api/osd', version=self._VERSION) self.assertStatus(403) self._get('/api/osd/0') self.assertStatus(403) @@ -33,7 +34,7 @@ def assert_in_and_not_none(self, data, properties): self.assertSchema(data, JObj({p: JAny(none=False) for p in properties}, allow_unknown=True)) def test_list(self): - data = self._get('/api/osd') + data = self._get('/api/osd', version=self._VERSION) self.assertStatus(200) self.assertGreaterEqual(len(data), 1) diff --git a/src/pybind/mgr/dashboard/controllers/osd.py b/src/pybind/mgr/dashboard/controllers/osd.py index c9d1417720005..07d8db7755b8a 100644 --- a/src/pybind/mgr/dashboard/controllers/osd.py +++ b/src/pybind/mgr/dashboard/controllers/osd.py @@ -5,12 +5,14 @@ import time from typing import Any, Dict, List, Optional, Union +import cherrypy from ceph.deployment.drive_group import DriveGroupSpec, DriveGroupValidationError # type: ignore from mgr_util import get_most_recent_rate from .. 
import mgr from ..exceptions import DashboardException from ..security import Scope +from ..services._paginate import ListPaginator from ..services.ceph_service import CephService, SendCommandError from ..services.exception import handle_orchestrator_error, handle_send_command_error from ..services.orchestrator import OrchClient, OrchFeature @@ -121,8 +123,30 @@ def osd_task(name, metadata, wait_for=2.0): @APIRouter('/osd', Scope.OSD) @APIDoc('OSD management API', 'OSD') class Osd(RESTController): - def list(self): - osds = self.get_osd_map() + @RESTController.MethodMap(version=APIVersion(1, 1)) + def list(self, offset: int = 0, limit: int = 10, + search: str = '', sort: str = ''): + all_osds = self.get_osd_map() + + paginator = ListPaginator(int(offset), int(limit), sort, search, + input_list=all_osds.values(), + searchable_params=['id'], + sortable_params=['id'], + default_sort='+id') + + cherrypy.response.headers['X-Total-Count'] = paginator.get_count() + + paginated_osds_list = list(paginator.list()) + # creating a dictionary to have faster lookups + paginated_osds_by_id = {osd['id']: osd for osd in paginated_osds_list} + try: + osds = { + key: paginated_osds_by_id[int(key)] + for key in all_osds.keys() + if int(key) in paginated_osds_by_id + } + except ValueError as e: + raise DashboardException(e, component='osd', http_status_code=400) # Extending by osd stats information for stat in mgr.get('osd_stats')['osd_stats']: diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html index 5f5f91dd0ed67..a56877512f99a 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html @@ -6,13 +6,15 @@ i18n>OSDs List + [updateSelectionOnRefresh]="'never'" + [serverSide]="true" + [count]="count">
{ let component: OsdListComponent; @@ -141,38 +143,42 @@ describe('OsdListComponent', () => { }); describe('getOsdList', () => { - let osds: any[]; + let osds: Osd[]; let flagsSpy: jasmine.Spy; - const createOsd = (n: number) => - >{ - in: 'in', - up: 'up', - tree: { - device_class: 'ssd' - }, - stats_history: { - op_out_bytes: [ - [n, n], - [n * 2, n * 2] - ], - op_in_bytes: [ - [n * 3, n * 3], - [n * 4, n * 4] - ] - }, - stats: { - stat_bytes_used: n * n, - stat_bytes: n * n * n - }, - state: [] - }; + const createOsd = (n: number): Osd => ({ + id: n, + host: { + id: 0, + name: 'test_host' + }, + in: 1, + up: 1, + tree: { + device_class: 'ssd' + }, + stats_history: { + op_out_bytes: [ + [n, n], + [n * 2, n * 2] + ], + op_in_bytes: [ + [n * 3, n * 3], + [n * 4, n * 4] + ] + }, + stats: { + stat_bytes_used: n * n, + stat_bytes: n * n * n + }, + state: [] + }); const expectAttributeOnEveryOsd = (attr: string) => expect(component.osds.every((osd) => Boolean(_.get(osd, attr)))).toBeTruthy(); beforeEach(() => { - spyOn(osdService, 'getList').and.callFake(() => of(osds)); + spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable(of(osds))); flagsSpy = spyOn(osdService, 'getFlags').and.callFake(() => of([])); osds = [createOsd(1), createOsd(2), createOsd(3)]; component.getOsdList(); @@ -556,8 +562,9 @@ describe('OsdListComponent', () => { beforeEach(() => { component.permissions = fakeAuthStorageService.getPermissions(); - spyOn(osdService, 'getList').and.callFake(() => of(fakeOsds)); + spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable(of(fakeOsds))); spyOn(osdService, 'getFlags').and.callFake(() => of([])); + component.getOsdList(); }); const testTableActions = async ( diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts index 103b61e79f0af..91cb0193f3cce 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts @@ -39,6 +39,8 @@ import { OsdRecvSpeedModalComponent } from '../osd-recv-speed-modal/osd-recv-spe import { OsdReweightModalComponent } from '../osd-reweight-modal/osd-reweight-modal.component'; import { OsdScrubModalComponent } from '../osd-scrub-modal/osd-scrub-modal.component'; import { ModalCdsService } from '~/app/shared/services/modal-cds.service'; +import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; +import { Osd } from '~/app/shared/models/osd.model'; const BASE_URL = 'osd'; @@ -71,6 +73,7 @@ export class OsdListComponent extends ListWithDetails implements OnInit { clusterWideActions: CdTableAction[]; icons = Icons; osdSettings = new OsdSettings(); + count = 0; selection = new CdTableSelection(); osds: any[] = []; @@ -426,10 +429,13 @@ export class OsdListComponent extends ListWithDetails implements OnInit { } } - getOsdList() { - const observables = [this.osdService.getList(), this.osdService.getFlags()]; - observableForkJoin(observables).subscribe((resp: [any[], string[]]) => { - this.osds = resp[0].map((osd) => { + getOsdList(context?: CdTableFetchDataContext) { + if (!context) context = new CdTableFetchDataContext(); + const pagination_obs = this.osdService.getList(context.toParams()); + const observables = [pagination_obs.observable, this.osdService.getFlags()]; + observableForkJoin(observables).subscribe((resp: any) => { 
+ this.osds = resp[0].map((osd: Osd) => { + this.count = pagination_obs.count; osd.collectedStates = OsdListComponent.collectStates(osd); osd.stats_history.out_bytes = osd.stats_history.op_out_bytes.map((i: string) => i[1]); osd.stats_history.in_bytes = osd.stats_history.op_in_bytes.map((i: string) => i[1]); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts index d1f9997791ae0..c81c9193a2e3c 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts @@ -3,6 +3,7 @@ import { TestBed } from '@angular/core/testing'; import { configureTestBed } from '~/testing/unit-test-helper'; import { OsdService } from './osd.service'; +import { CdTableFetchDataContext } from '../models/cd-table-fetch-data-context'; describe('OsdService', () => { let service: OsdService; @@ -64,8 +65,9 @@ describe('OsdService', () => { }); it('should call getList', () => { - service.getList().subscribe(); - const req = httpTesting.expectOne('api/osd'); + const context = new CdTableFetchDataContext(() => {}); + service.getList(context.toParams()).observable.subscribe(); + const req = httpTesting.expectOne('api/osd?offset=0&limit=10&search=&sort=%2Bname'); expect(req.request.method).toBe('GET'); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts index f2ed4d7cc9e76..85a75073deafc 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts @@ -1,4 +1,4 @@ -import { HttpClient } from '@angular/common/http'; +import { HttpClient, HttpParams } from '@angular/common/http'; import { Injectable } from '@angular/core'; import _ from 'lodash'; @@ -12,6 +12,9 @@ import { OsdSettings } from '../models/osd-settings'; import { SmartDataResponseV1 } from '../models/smart'; import { DeviceService } from '../services/device.service'; import { CdFormGroup } from '../forms/cd-form-group'; +import { PaginateObservable } from './paginate.model'; +import { PaginateParams } from '../classes/paginate-params.class'; +import { Osd } from '../models/osd.model'; @Injectable({ providedIn: 'root' @@ -80,8 +83,10 @@ export class OsdService { return this.http.post(this.path, request, { observe: 'response' }); } - getList() { - return this.http.get(`${this.path}`); + getList(params: HttpParams): PaginateObservable { + return new PaginateObservable( + this.http.get(this.path, new PaginateParams(params, 1, 1)) + ); } getOsdSettings(): Observable { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts index 703792a757181..77ec4e43f7cfe 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts @@ -9,7 +9,7 @@ export class PaginateObservable { this.observable = obs.pipe( map((response: any) => { this.count = Number(response.headers?.get('X-Total-Count')); - return response['body']; + return response['body'] || response; }) ); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts new file mode 100644 index 
0000000000000..a1b079b426b9d --- /dev/null +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts @@ -0,0 +1,15 @@ +import { HttpParams } from '@angular/common/http'; + +export class PaginateParams { + constructor(params: HttpParams, majorVersion = 1, minorVersion = 0) { + const options = { + params: params, + headers: { + Accept: `application/vnd.ceph.api.v${majorVersion}.${minorVersion}+json` + } + }; + + options['observe'] = 'response'; + return options; + } +} diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts index 0df2d2ebbe071..6ea415bfee983 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts @@ -18,7 +18,7 @@ export class CdTableFetchDataContext { search = ''; sort = '+name'; - constructor(error: () => void) { + constructor(error?: () => void) { this.error = error; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts new file mode 100644 index 0000000000000..f22987e439ea5 --- /dev/null +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts @@ -0,0 +1,49 @@ +/* We will need to check what are all the value that the + UI need and only make them the mandatory parameters here. + For now based on what I saw in the unit test file; + osd-list.component.spec.ts, I've made the decision to make + things optional and non-optional. This should be re-evaluated. */ + +export interface Osd { + id: number; + host: Host; + stats_history: StatsHistory; + state: string[]; + stats: Stats; + collectedStates?: string[]; + in?: number; + out?: number; + up?: number; + down?: number; + destroyed?: number; + cdIsBinary?: boolean; + cdIndivFlags?: string[]; + cdClusterFlags?: string[]; + cdExecuting?: any; + tree?: Tree; + operational_status?: string; +} + +interface Tree { + device_class: string; +} + +interface Host { + id: number; + name: string; +} + +interface StatsHistory { + op_out_bytes: any[]; + op_in_bytes: any[]; + out_bytes?: any[]; + in_bytes?: any[]; +} + +interface Stats { + stat_bytes_used: number; + stat_bytes: number; + op_w?: number; + op_r?: number; + usage?: number; +} diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index 8f98f1f62a0a8..24422f5b030e3 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -8935,11 +8935,31 @@ paths: - NVMe-oF Subsystem Namespace /api/osd: get: - parameters: [] + parameters: + - default: 0 + in: query + name: offset + schema: + type: integer + - default: 10 + in: query + name: limit + schema: + type: integer + - default: '' + in: query + name: search + schema: + type: string + - default: '' + in: query + name: sort + schema: + type: string responses: '200': content: - application/vnd.ceph.api.v1.0+json: + application/vnd.ceph.api.v1.1+json: type: object description: OK '400': diff --git a/src/pybind/mgr/dashboard/tests/test_osd.py b/src/pybind/mgr/dashboard/tests/test_osd.py index c3cd0dca88dca..9b6dbd10de18d 100644 --- a/src/pybind/mgr/dashboard/tests/test_osd.py +++ b/src/pybind/mgr/dashboard/tests/test_osd.py @@ -8,6 +8,7 @@ from ceph.deployment.service_spec import PlacementSpec from .. 
import mgr +from ..controllers._version import APIVersion from ..controllers.osd import Osd, OsdUi from ..services.osd import OsdDeploymentOptions from ..tests import ControllerTestCase @@ -274,7 +275,7 @@ def test_osd_list_aggregation(self): osds_leftover = [0, 1, 2] with self._mock_osd_list(osd_stat_ids=osds_actual, osdmap_tree_node_ids=osds_leftover, osdmap_ids=osds_actual): - self._get('/api/osd') + self._get('/api/osd', version=APIVersion(1, 1)) self.assertEqual(len(self.json_body()), 2, 'It should display two OSDs without failure') self.assertStatus(200) From f9b50b2e88ae5d9ac4f2cab986527a0a12317da9 Mon Sep 17 00:00:00 2001 From: Nizamudeen A Date: Wed, 9 Oct 2024 20:15:55 +0530 Subject: [PATCH 099/148] mgr/dashboard: fix group name bugs in the nvmeof API there are 2 issues 1. in cephadm, i was always using the first daemon to populate the group in all the services for the dashboard config. 2. in the API, if there are more than 1 gateways listed in the config, rather than chosing a random gateway from the group, raise an exception and warn user to specify the gw_group parameter in the api request Fixes: https://tracker.ceph.com/issues/68463 Signed-off-by: Nizamudeen A --- src/pybind/mgr/cephadm/services/nvmeof.py | 5 ++--- src/pybind/mgr/dashboard/services/nvmeof_conf.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index 8b15aace373c5..162815da24c73 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -123,10 +123,9 @@ def get_set_cmd_dicts(out: str) -> List[dict]: gateways = json.loads(out)['gateways'] cmd_dicts = [] - spec = cast(NvmeofServiceSpec, - self.mgr.spec_store.all_specs.get(daemon_descrs[0].service_name(), None)) - for dd in daemon_descrs: + spec = cast(NvmeofServiceSpec, + self.mgr.spec_store.all_specs.get(dd.service_name(), None)) service_name = dd.service_name() if dd.hostname is None: err_msg = ('Trying to config_dashboard nvmeof but no hostname is defined') diff --git a/src/pybind/mgr/dashboard/services/nvmeof_conf.py b/src/pybind/mgr/dashboard/services/nvmeof_conf.py index 1802f8a5fce9f..2426c59907874 100644 --- a/src/pybind/mgr/dashboard/services/nvmeof_conf.py +++ b/src/pybind/mgr/dashboard/services/nvmeof_conf.py @@ -177,6 +177,18 @@ def _get_running_daemon_svc_config(svc_config, running_daemons): def _get_default_service(gateways): if gateways: - service_name = list(gateways.keys())[0] + gateway_keys = list(gateways.keys()) + # if there are more than 1 gateway, rather than chosing a random gateway + # from any of the group, raise an exception to make it clear that we need + # to specify the group name in the API request. + if len(gateway_keys) > 1: + raise DashboardException( + msg=( + "Multiple NVMe-oF gateway groups are configured. " + "Please specify the 'gw_group' parameter in the request." + ), + component="nvmeof" + ) + service_name = gateway_keys[0] return service_name, gateways[service_name][0]['service_url'] return None From 3dc091dd12c54103b9e93b5c38b86c883d93f242 Mon Sep 17 00:00:00 2001 From: Afreen Misbah Date: Fri, 11 Oct 2024 14:27:24 +0530 Subject: [PATCH 100/148] mgr/dashboard: Fix listener deletion Listener deletion is broken due to passing wrong gateway address. Including `traddr` in DELETE API of listener to choose correct gateway address for deletion. 
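Conceptually, the gateway is now chosen along these lines (a simplified
sketch only, not the actual NVMeoFClient internals; the function and
field names below are assumed for illustration):

    from typing import List, Optional

    def pick_gateway(gateways: List[dict], traddr: Optional[str] = None) -> str:
        # 'gateways' holds the entries of the selected group, each with a
        # 'service_url' such as "10.0.0.1:5500".
        if traddr:
            for gw in gateways:
                if gw['service_url'].split(':')[0] == traddr:
                    return gw['service_url']
        # previous behaviour: fall back to the group's first gateway
        return gateways[0]['service_url']

Without forwarding traddr, the DELETE was always sent to the group's
default gateway, which is not necessarily the gateway that owns the
listener being removed.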
The same fix we did for POST API here: https://github.com/afreen23/ceph/commit/287ff3b3603291763b3cd08f9b1543fe60d5f3b9 Fixes: https://tracker.ceph.com/issues/68506 Signed-off-by: Afreen Misbah --- src/pybind/mgr/dashboard/controllers/nvmeof.py | 2 +- .../nvmeof-listeners-list/nvmeof-listeners-list.component.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py index 5db6a4f1acfec..f199867943d14 100644 --- a/src/pybind/mgr/dashboard/controllers/nvmeof.py +++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py @@ -183,7 +183,7 @@ def delete( force: bool = False, gw_group: Optional[str] = None ): - return NVMeoFClient(gw_group=gw_group).stub.delete_listener( + return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.delete_listener( NVMeoFClient.pb2.delete_listener_req( nqn=nqn, host_name=host_name, diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts index f88442e1bd619..974727ad06260 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts @@ -76,7 +76,7 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges { name: this.actionLabels.DELETE, permission: 'delete', icon: Icons.destroy, - click: () => this.deleteSubsystemModal() + click: () => this.deleteListenerModal() } ]; } @@ -101,7 +101,7 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges { }); } - deleteSubsystemModal() { + deleteListenerModal() { const listener = this.selection.first(); this.modalService.show(CriticalConfirmationModalComponent, { itemDescription: 'Listener', From 517ab013e2a9bd23b482134121bcb85c5a32f028 Mon Sep 17 00:00:00 2001 From: Naman Munet Date: Tue, 8 Oct 2024 14:54:32 +0530 Subject: [PATCH 101/148] mgr/dashboard: sync policy's in Object >> Multi-site >> Sync-policy, does not show the zonegroup to which policy belongs to Fixes: https://tracker.ceph.com/issues/68355 Fixes Includes: Added default zonegroup name with the sync policy details Signed-off-by: Naman Munet --- src/pybind/mgr/dashboard/controllers/rgw.py | 10 ++++++++++ .../rgw-multisite-sync-policy.component.ts | 16 +++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py index 8667d469060f8..75a664dfb4f91 100755 --- a/src/pybind/mgr/dashboard/controllers/rgw.py +++ b/src/pybind/mgr/dashboard/controllers/rgw.py @@ -176,6 +176,15 @@ def get_sync_policy(self, bucket_name='', zonegroup_name='', all_policy=None): if all_policy: sync_policy_list = [] buckets = json.loads(RgwBucket().list(stats=False)) + zonegroups_info = RgwMultisite().get_all_zonegroups_info() + default_zonegroup = '' + if 'zonegroups' in zonegroups_info and 'default_zonegroup' in zonegroups_info: + default_zonegroup = next( + (zonegroup['name'] for zonegroup in zonegroups_info['zonegroups'] + if 'id' in zonegroup and 'name' in zonegroup + and zonegroup['id'] == zonegroups_info['default_zonegroup']), + '' + ) for bucket in buckets: sync_policy = multisite_instance.get_sync_policy(bucket, zonegroup_name) for policy in sync_policy['groups']: @@ -183,6 +192,7 @@ def 
get_sync_policy(self, bucket_name='', zonegroup_name='', all_policy=None): sync_policy_list.append(policy) other_sync_policy = multisite_instance.get_sync_policy(bucket_name, zonegroup_name) for policy in other_sync_policy['groups']: + policy['zonegroup'] = default_zonegroup sync_policy_list.append(policy) return sync_policy_list return multisite_instance.get_sync_policy(bucket_name, zonegroup_name) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts index ee261db5042c3..03228856125d9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts @@ -88,12 +88,22 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements { name: $localize`Zonegroup`, prop: 'zonegroup', - flexGrow: 1 + flexGrow: 1, + cellTransformation: CellTemplate.map, + customTemplateConfig: { + undefined: '-', + '': '-' + } }, { name: $localize`Bucket`, prop: 'bucket', - flexGrow: 1 + flexGrow: 1, + cellTransformation: CellTemplate.map, + customTemplateConfig: { + undefined: '-', + '': '-' + } } ]; this.rgwDaemonService.list().subscribe(); @@ -137,7 +147,7 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements groupName: policy['id'], status: policy['status'], bucket: policy['bucketName'], - zonegroup: '' + zonegroup: policy['zonegroup'] }); }); this.syncPolicyData = [...this.syncPolicyData]; From 88e4484acf198a64c700f18bcc06af1014356c43 Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Mon, 14 Oct 2024 19:32:11 +0530 Subject: [PATCH 102/148] mgr/cephadm: add ok_to_stop func for smb service Fixes: https://tracker.ceph.com/issues/68527 Signed-off-by: Avan Thakkar --- src/pybind/mgr/cephadm/services/smb.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/pybind/mgr/cephadm/services/smb.py b/src/pybind/mgr/cephadm/services/smb.py index dabc202a024bd..e322acb0e3e73 100644 --- a/src/pybind/mgr/cephadm/services/smb.py +++ b/src/pybind/mgr/cephadm/services/smb.py @@ -1,6 +1,9 @@ +import errno import logging from typing import Any, Dict, List, Tuple, cast, Optional +from mgr_module import HandleCommandResult + from ceph.deployment.service_spec import ServiceSpec, SMBSpec from orchestrator import DaemonDescription @@ -117,6 +120,23 @@ def ignore_possible_stray( return True return False + def ok_to_stop( + self, daemon_ids: List[str], force: bool = False, known: Optional[List[str]] = None + ) -> HandleCommandResult: + # if only 1 smb, alert user (this is not passable with --force) + warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, "SMB", 1, True) + if warn: + return HandleCommandResult(-errno.EBUSY, "", warn_message) + + # if reached here, there is > 1 smb daemon. + if force: + return HandleCommandResult(0, warn_message, "") + + # if reached here, > 1 smb daemon and no force flag. + # Provide warning + warn_message = "WARNING: Removing SMB daemons can cause clients to lose connectivity. " + return HandleCommandResult(-errno.EBUSY, "", warn_message) + def _allow_config_key_command(self, name: str) -> str: # permit the samba container config access to the mon config key store # with keys like smb/config//*. 
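For illustration only, and not part of the patch: a minimal sketch, assuming the mgr_module.HandleCommandResult (retval, stdout, stderr) fields, of how a caller might act on the ok_to_stop() result added above. The helper name, its arguments, and the call path are hypothetical and do not reflect the actual cephadm code.

    from mgr_module import HandleCommandResult

    def maybe_stop_smb_daemons(svc, daemon_ids, force=False):
        # 'svc' is assumed to expose the SMBService.ok_to_stop() added above
        res: HandleCommandResult = svc.ok_to_stop(daemon_ids, force=force)
        if res.retval != 0:
            # -EBUSY: stopping would remove the last SMB daemon (or no --force
            # was given); surface the warning instead of stopping anything
            return False, res.stderr
        # retval == 0: force was given and more than one SMB daemon exists
        return True, res.stdout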
From 022b90a75335471a6973cab180f71812c0d6125f Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 14 Oct 2024 12:14:53 -0400 Subject: [PATCH 103/148] doc/governance: add new CSC members Congratulations! Election: https://vote.heliosvoting.org/helios/elections/f276a15a-84c5-11ef-a0e4-b69e035002b0/view Signed-off-by: Patrick Donnelly --- doc/governance.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/governance.rst b/doc/governance.rst index 284a9570397c3..b145116e5566d 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -104,6 +104,17 @@ Current Members * Yingxin Cheng * Yuri Weinstein * Zac Dover + * Laura Flores + * Venky Shankar + * Guillaume Abrioux + * Anthony D'Atri + * Joseph Mundackal + * Gaurav Sitlani + * Afreen Misbah + * Radoslaw Zarzynski + * Matan Breizman + * Yaarit Hatuka + * Adam C. Emerson .. _ctl: From e4177406f9734f1c8af91f8292aa972d01fb77f9 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 14 Oct 2024 14:50:41 -0400 Subject: [PATCH 104/148] mailmap: add my ibm email Signed-off-by: Patrick Donnelly --- .githubmap | 2 +- .mailmap | 3 ++- .organizationmap | 1 + .peoplemap | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.githubmap b/.githubmap index 724de9c002d4c..01785190643a6 100644 --- a/.githubmap +++ b/.githubmap @@ -27,7 +27,7 @@ b-ranto Boris Ranto badone Brad Hubbard baruza Barbora Ančincová bassamtabbara Bassam Tabbara -batrick Patrick Donnelly +batrick Patrick Donnelly bigjust Justin Caratzas bk201 Kiefer Chang BlaineEXE Blaine Gardner diff --git a/.mailmap b/.mailmap index 1c4ac95340abd..9428951b391f0 100644 --- a/.mailmap +++ b/.mailmap @@ -544,7 +544,8 @@ Pan Liu Parth Arora parth-gr Pascal de Bruijn Patience Warnick -Patrick Donnelly +Patrick Donnelly +Patrick Donnelly Patrick McGarry Patrick McGarry Patrick Seidensal diff --git a/.organizationmap b/.organizationmap index 3a601f4e2b2bf..42e639c274d62 100644 --- a/.organizationmap +++ b/.organizationmap @@ -361,6 +361,7 @@ IBM Samuel Matzek IBM Shraddha Agrawal IBM Kushal Deb IBM Shweta Bhosale +IBM Patrick Donnelly IBM Sunil Angadi IBM Teoman Onay IBM Ulrich Weigand diff --git a/.peoplemap b/.peoplemap index 507f50edb43e8..418e8505fb49c 100644 --- a/.peoplemap +++ b/.peoplemap @@ -73,5 +73,5 @@ Yehuda Sadeh Yehuda Sadeh Yuri Weinstein Yuri Weinstein Zhi Zhang Zhi (David) Zhang Zheng Yin Zheng Yin -Patrick Donnelly Patrick Donnelly +Patrick Donnelly Patrick Donnelly Patrick Donnelly Myoungwon Oh Myoungwon Oh Myoungwon Oh From 2f61b2847d92b5156408dbcfa5b6e09e2de404c1 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 14 Oct 2024 14:57:31 -0400 Subject: [PATCH 105/148] doc/governance: update my CSC email Signed-off-by: Patrick Donnelly --- doc/governance.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/governance.rst b/doc/governance.rst index 284a9570397c3..1080948e5cd05 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -96,7 +96,7 @@ Current Members * Mike Perez * Myoungwon Oh * Neha Ojha - * Patrick Donnelly + * Patrick Donnelly * Sam Just * Vikhyat Umrao * Xie Xingguo From c4c647480adbd702296a632707c34b172121b9b0 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Mon, 14 Oct 2024 16:07:38 -0400 Subject: [PATCH 106/148] osdc: remove unused overloads for async::Completion ea67f3dee2a3f8fcdcbb0bc0e80e38ec70378f05 switched to asio::any_completion_handler<> for completions, but left some converting overloads behind for compatibility. 
none of those overloads appear to be used, so remove them Signed-off-by: Casey Bodley --- src/osdc/Objecter.h | 95 --------------------------------------------- 1 file changed, 95 deletions(-) diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 68bd76268ae94..927c7e413296f 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -48,7 +48,6 @@ #include "include/function2.hpp" #include "include/neorados/RADOS_Decodable.hpp" -#include "common/async/completion.h" #include "common/admin_socket.h" #include "common/ceph_time.h" #include "common/ceph_mutex.h" @@ -1968,30 +1967,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { } } - boost::asio::any_completion_handler - OpCompletionVert(std::unique_ptr> c) { - if (c) - return [c = std::move(c)](boost::system::error_code ec) mutable { - c->dispatch(std::move(c), ec); - }; - else - return nullptr; - } - - template - boost::asio::any_completion_handler - OpCompletionVert(std::unique_ptr> c) { - if (c) { - return [c = std::move(c)](boost::system::error_code ec, T t) mutable { - c->dispatch(std::move(c), ec, std::move(t)); - }; - } else { - return nullptr; - } - } - struct Op : public RefCountedObject { OSDSession *session = nullptr; int incarnation = 0; @@ -3268,18 +3243,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { return linger_watch(info, op, snapc, mtime, inbl, OpContextVert(onfinish, nullptr), objver); } - ceph_tid_t linger_watch(LingerOp *info, - ObjectOperation& op, - const SnapContext& snapc, ceph::real_time mtime, - ceph::buffer::list& inbl, - std::unique_ptr> onfinish, - version_t *objver) { - return linger_watch(info, op, snapc, mtime, inbl, - OpCompletionVert( - std::move(onfinish)), objver); - } ceph_tid_t linger_notify(LingerOp *info, ObjectOperation& op, snapid_t snap, ceph::buffer::list& inbl, @@ -3295,17 +3258,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { OpContextVert(onack, poutbl), objver); } - ceph_tid_t linger_notify(LingerOp *info, - ObjectOperation& op, - snapid_t snap, ceph::buffer::list& inbl, - std::unique_ptr> onack, - version_t *objver) { - return linger_notify(info, op, snap, inbl, - OpCompletionVert( - std::move(onack)), objver); - } tl::expected linger_check(LingerOp *info); void linger_cancel(LingerOp *info); // releases a reference @@ -3886,12 +3838,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { create_pool_snap(pool, snapName, OpContextVert(c, nullptr)); } - void create_pool_snap( - int64_t pool, std::string_view snapName, - std::unique_ptr> c) { - create_pool_snap(pool, snapName, - OpCompletionVert(std::move(c))); - } void allocate_selfmanaged_snap(int64_t pool, boost::asio::any_completion_handler< void(boost::system::error_code, @@ -3901,12 +3847,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { allocate_selfmanaged_snap(pool, OpContextVert(c, psnapid)); } - void allocate_selfmanaged_snap(int64_t pool, - std::unique_ptr> c) { - allocate_selfmanaged_snap(pool, - OpCompletionVert(std::move(c))); - } void delete_pool_snap(int64_t pool, std::string_view snapName, decltype(PoolOp::onfinish)&& onfinish); void delete_pool_snap(int64_t pool, std::string_view snapName, @@ -3914,12 +3854,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { delete_pool_snap(pool, snapName, OpContextVert(c, nullptr)); } - void delete_pool_snap(int64_t pool, std::string_view snapName, - std::unique_ptr> c) { - delete_pool_snap(pool, snapName, - OpCompletionVert(std::move(c))); - } void delete_selfmanaged_snap(int64_t pool, 
snapid_t snap, decltype(PoolOp::onfinish)&& onfinish); @@ -3928,12 +3862,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { delete_selfmanaged_snap(pool, snap, OpContextVert(c, nullptr)); } - void delete_selfmanaged_snap(int64_t pool, snapid_t snap, - std::unique_ptr> c) { - delete_selfmanaged_snap(pool, snap, - OpCompletionVert(std::move(c))); - } void create_pool(std::string_view name, @@ -3945,25 +3873,12 @@ class Objecter : public md_config_obs_t, public Dispatcher { OpContextVert(onfinish, nullptr), crush_rule); } - void create_pool(std::string_view name, - std::unique_ptr> c, - int crush_rule=-1) { - create_pool(name, - OpCompletionVert(std::move(c)), - crush_rule); - } void delete_pool(int64_t pool, decltype(PoolOp::onfinish)&& onfinish); void delete_pool(int64_t pool, Context* onfinish) { delete_pool(pool, OpContextVert(onfinish, nullptr)); } - void delete_pool(int64_t pool, - std::unique_ptr> c) { - delete_pool(pool, OpCompletionVert(std::move(c))); - } void delete_pool(std::string_view name, decltype(PoolOp::onfinish)&& onfinish); @@ -3972,11 +3887,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { Context* onfinish) { delete_pool(name, OpContextVert(onfinish, nullptr)); } - void delete_pool(std::string_view name, - std::unique_ptr> c) { - delete_pool(name, OpCompletionVert(std::move(c))); - } void handle_pool_op_reply(MPoolOpReply *m); int pool_op_cancel(ceph_tid_t tid, int r); @@ -4026,11 +3936,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { Context *onfinish) { get_fs_stats_(poolid, OpContextVert(onfinish, result)); } - void get_fs_stats(std::optional poolid, - std::unique_ptr> c) { - get_fs_stats_(poolid, OpCompletionVert(std::move(c))); - } int statfs_op_cancel(ceph_tid_t tid, int r); void _finish_statfs_op(StatfsOp *op, int r); From 7b783876960d39de1b87d55135c4207325c4ce69 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 14 Oct 2024 19:42:59 -0700 Subject: [PATCH 107/148] crimson: remove watchers upon object deletion Fixes: https://tracker.ceph.com/issues/68538 Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.cc | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index df4f73d4077d1..4464466eff0d7 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -678,16 +678,32 @@ OpsExecuter::do_execute_op(OSDOp& osd_op) whiteout = true; } return do_write_op([this, whiteout](auto& backend, auto& os, auto& txn) { - int num_bytes = 0; - // Calculate num_bytes to be removed - if (obc->obs.oi.soid.is_snap()) { - ceph_assert(obc->ssc->snapset.clone_overlap.count(obc->obs.oi.soid.snap)); - num_bytes = obc->ssc->snapset.get_clone_bytes(obc->obs.oi.soid.snap); - } else { - num_bytes = obc->obs.oi.size; - } - return backend.remove(os, txn, *osd_op_params, - delta_stats, whiteout, num_bytes); + struct emptyctx_t {}; + return with_effect_on_obc( + emptyctx_t{}, + [&](auto &ctx) { + int num_bytes = 0; + // Calculate num_bytes to be removed + if (obc->obs.oi.soid.is_snap()) { + ceph_assert(obc->ssc->snapset.clone_overlap.count( + obc->obs.oi.soid.snap)); + num_bytes = obc->ssc->snapset.get_clone_bytes( + obc->obs.oi.soid.snap); + } else { + num_bytes = obc->obs.oi.size; + } + return backend.remove(os, txn, *osd_op_params, + delta_stats, whiteout, num_bytes); + }, + [](auto &&ctx, ObjectContextRef obc, Ref) { + return seastar::do_for_each( + obc->watchers, + [](auto &p) { return 
p.second->remove(); } + ).then([obc] { + obc->watchers.clear(); + return seastar::now(); + }); + }); }); } case CEPH_OSD_OP_CALL: From 1f99108d197f1c579838107d4b57be806b6807e1 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 14 Oct 2024 18:46:37 -0700 Subject: [PATCH 108/148] crimson: add missing field to SUBLOGDPPI and LOGDPPI SUBLOGDPPI and LOGDPPI need an extra {} for the interrupt_cond. Signed-off-by: Samuel Just --- src/crimson/common/log.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h index 4f564ac044d05..c38b225c94b4f 100644 --- a/src/crimson/common/log.h +++ b/src/crimson/common/log.h @@ -90,7 +90,7 @@ static inline seastar::log_level to_log_level(int level) { #define SUBLOGDPP(subname_, level_, MSG, dpp, ...) \ LOGGER(subname_).log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__) #define SUBLOGDPPI(subname_, level_, MSG, dpp, ...) \ - LOGGER(subname_).log(level_, "{} {}: " MSG, \ + LOGGER(subname_).log(level_, "{} {} {}: " MSG, \ interruptor::get_interrupt_cond(), dpp, FNAME , ##__VA_ARGS__) #define SUBTRACEDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::trace, __VA_ARGS__) #define SUBTRACEDPPI(subname_, ...) SUBLOGDPPI(subname_, seastar::log_level::trace, __VA_ARGS__) @@ -106,7 +106,7 @@ static inline seastar::log_level to_log_level(int level) { #define LOGDPP(level_, MSG, dpp, ...) \ LOCAL_LOGGER.log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__) #define LOGDPPI(level_, MSG, dpp, ...) \ - LOCAL_LOGGER.log(level_, "{} {}: " MSG, \ + LOCAL_LOGGER.log(level_, "{} {} {}: " MSG, \ interruptor::get_interrupt_cond(), dpp, FNAME , ##__VA_ARGS__) #define TRACEDPP(...) LOGDPP(seastar::log_level::trace, __VA_ARGS__) #define TRACEDPPI(...) LOGDPPI(seastar::log_level::trace, __VA_ARGS__) From 4bea366e5de5b110086c8174eaf39798448ff77f Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 27 Aug 2024 19:08:10 +0000 Subject: [PATCH 109/148] crimson: fix typo OpsExecutor->OpsExecuter Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.cc | 8 ++++---- src/crimson/osd/ops_executer.h | 4 ++-- src/crimson/osd/pg_backend.cc | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index 4464466eff0d7..0a07fa7ee293e 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -504,7 +504,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_snaps.find(clone); if (p == ss.clone_snaps.end()) { logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_snaps, missing clone {}", os.oi.soid, clone); @@ -518,7 +518,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_overlap.find(clone); if (p == ss.clone_overlap.end()) { logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_overlap, missing clone {}", os.oi.soid, clone); @@ -532,7 +532,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_size.find(clone); if (p == ss.clone_size.end()) { logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_size, missing clone {}", os.oi.soid, clone); @@ -551,7 +551,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( } resp.seq = ss.seq; logger().error( - 
"OpsExecutor::do_list_snaps: {}, resp.clones.size(): {}", + "OpsExecuter::do_list_snaps: {}, resp.clones.size(): {}", os.oi.soid, resp.clones.size()); resp.encode(osd_op.outdata); diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 0dea7d0515e93..e25a035616edd 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -213,10 +213,10 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { * execute_clone * * If snapc contains a snap which occurred logically after the last write - * seen by this object (see OpsExecutor::should_clone()), we first need + * seen by this object (see OpsExecuter::should_clone()), we first need * make a clone of the object at its current state. execute_clone primes * txn with that clone operation and returns an - * OpsExecutor::CloningContext which will allow us to fill in the corresponding + * OpsExecuter::CloningContext which will allow us to fill in the corresponding * metadata and log_entries once the operations have been processed. * * Note that this strategy differs from classic, which instead performs this diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc index fa8201b61c28d..24a381b4cf7e2 100644 --- a/src/crimson/osd/pg_backend.cc +++ b/src/crimson/osd/pg_backend.cc @@ -1289,7 +1289,7 @@ void PGBackend::clone( const ObjectState& d_os, ceph::os::Transaction& txn) { - // See OpsExecutor::execute_clone documentation + // See OpsExecuter::execute_clone documentation txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid}); { ceph::bufferlist bv; From a7812e095c13debcd844883db5888bdf5a185170 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 10 Sep 2024 23:52:32 +0000 Subject: [PATCH 110/148] crimson/.../internal_client_request: remove unnecessary system_shutdown guard Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 150 +++++++++--------- 1 file changed, 74 insertions(+), 76 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index a19bb0826f004..b1224f6e25942 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -53,88 +53,86 @@ CommonPGPipeline& InternalClientRequest::client_pp() seastar::future<> InternalClientRequest::start() { track_event(); - return crimson::common::handle_system_shutdown([this] { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("{}: in repeat", *this); - - return interruptor::with_interruption([this]() mutable { - return enter_stage( - client_pp().wait_for_active - ).then_interruptible([this] { - return with_blocking_event([this] (auto&& trigger) { - return pg->wait_for_active_blocker.wait(std::move(trigger)); - }); - }).then_interruptible([this] { - return enter_stage( - client_pp().recover_missing); - }).then_interruptible([this] { - return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); - }).then_interruptible([this](bool unfound) { - if (unfound) { - throw std::system_error( - std::make_error_code(std::errc::operation_canceled), - fmt::format("{} is unfound, drop it!", get_target_oid())); - } - return enter_stage( - client_pp().get_obc); - }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("{}: getting obc lock", *this); - return seastar::do_with(create_osd_ops(), - [this](auto& osd_ops) mutable { - 
LOG_PREFIX(InternalClientRequest::start); - DEBUGI("InternalClientRequest: got {} OSDOps to execute", - std::size(osd_ops)); + LOG_PREFIX(InternalClientRequest::start); + DEBUGI("{}: in repeat", *this); + + return interruptor::with_interruption([this]() mutable { + return enter_stage( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage( + client_pp().recover_missing); + }).then_interruptible([this] { + return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); + }).then_interruptible([this](bool unfound) { + if (unfound) { + throw std::system_error( + std::make_error_code(std::errc::operation_canceled), + fmt::format("{} is unfound, drop it!", get_target_oid())); + } + return enter_stage( + client_pp().get_obc); + }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { + LOG_PREFIX(InternalClientRequest::start); + DEBUGI("{}: getting obc lock", *this); + return seastar::do_with( + create_osd_ops(), + [this](auto& osd_ops) mutable { + LOG_PREFIX(InternalClientRequest::start); + DEBUGI("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); [[maybe_unused]] const int ret = op_info.set_from_op( std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); assert(ret == 0); // call with_locked_obc() in order, but wait concurrently for loading. enter_stage_sync(client_pp().lock_obc); - return pg->with_locked_obc(get_target_oid(), op_info, - [&osd_ops, this](auto, auto obc) { - return enter_stage(client_pp().process - ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - return interruptor::now(); - }) - ); - }); - }); + return pg->with_locked_obc( + get_target_oid(), op_info, + [&osd_ops, this](auto, auto obc) { + return enter_stage(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this] { + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params() + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + return interruptor::now(); + }) + ); + }); + }); }); - }).si_then([this] { - logger().debug("{}: complete", *this); - return handle.complete(); - }).handle_error_interruptible( - PG::load_obc_ertr::all_same_way([] { - return seastar::now(); - }) - ); - }, [](std::exception_ptr eptr) { - return seastar::now(); - }, pg, start_epoch - - ).then([this] { - track_event(); - }).handle_exception_type([](std::system_error &error) { - logger().debug("error {}, message: {}", error.code(), error.what()); - return seastar::now(); - }).finally([this] { - logger().debug("{}: exit", *this); - handle.exit(); - }); + }).si_then([this] { + logger().debug("{}: complete", *this); + return handle.complete(); + }).handle_error_interruptible( + PG::load_obc_ertr::all_same_way([] { + return seastar::now(); + }) + ); + }, [](std::exception_ptr eptr) { + return seastar::now(); + }, 
pg, start_epoch).then([this] { + track_event(); + }).handle_exception_type([](std::system_error &error) { + logger().debug("error {}, message: {}", error.code(), error.what()); + return seastar::now(); + }).finally([this] { + logger().debug("{}: exit", *this); + handle.exit(); }); } From a091414c67ba9f1407c3756dd75ca2aa3b1074ac Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 11 Sep 2024 01:31:57 +0000 Subject: [PATCH 111/148] crimson/.../internal_client_request: factor out with_interruption Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 138 +++++++++--------- .../osd_operations/internal_client_request.h | 2 + 2 files changed, 74 insertions(+), 66 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index b1224f6e25942..d4213928a3e46 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -50,6 +50,77 @@ CommonPGPipeline& InternalClientRequest::client_pp() return pg->request_pg_pipeline; } +InternalClientRequest::interruptible_future<> +InternalClientRequest::with_interruption() +{ + return enter_stage( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage( + client_pp().recover_missing); + }).then_interruptible([this] { + return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); + }).then_interruptible([this](bool unfound) { + if (unfound) { + throw std::system_error( + std::make_error_code(std::errc::operation_canceled), + fmt::format("{} is unfound, drop it!", get_target_oid())); + } + return enter_stage( + client_pp().get_obc); + }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { + LOG_PREFIX(InternalClientRequest::with_interruption); + DEBUGI("{}: getting obc lock", *this); + return seastar::do_with( + create_osd_ops(), + [this](auto& osd_ops) mutable { + LOG_PREFIX(InternalClientRequest::with_interruption); + DEBUGI("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); + [[maybe_unused]] const int ret = op_info.set_from_op( + std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); + assert(ret == 0); + // call with_locked_obc() in order, but wait concurrently for loading. 
+ enter_stage_sync(client_pp().lock_obc); + return pg->with_locked_obc( + get_target_oid(), op_info, + [&osd_ops, this](auto, auto obc) { + return enter_stage(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this] { + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params() + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + return interruptor::now(); + }) + ); + }); + }); + }); + }).si_then([this] { + logger().debug("{}: complete", *this); + return handle.complete(); + }).handle_error_interruptible( + PG::load_obc_ertr::all_same_way([] { + return seastar::now(); + }) + ); +} + seastar::future<> InternalClientRequest::start() { track_event(); @@ -57,72 +128,7 @@ seastar::future<> InternalClientRequest::start() DEBUGI("{}: in repeat", *this); return interruptor::with_interruption([this]() mutable { - return enter_stage( - client_pp().wait_for_active - ).then_interruptible([this] { - return with_blocking_event([this] (auto&& trigger) { - return pg->wait_for_active_blocker.wait(std::move(trigger)); - }); - }).then_interruptible([this] { - return enter_stage( - client_pp().recover_missing); - }).then_interruptible([this] { - return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); - }).then_interruptible([this](bool unfound) { - if (unfound) { - throw std::system_error( - std::make_error_code(std::errc::operation_canceled), - fmt::format("{} is unfound, drop it!", get_target_oid())); - } - return enter_stage( - client_pp().get_obc); - }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("{}: getting obc lock", *this); - return seastar::do_with( - create_osd_ops(), - [this](auto& osd_ops) mutable { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("InternalClientRequest: got {} OSDOps to execute", - std::size(osd_ops)); - [[maybe_unused]] const int ret = op_info.set_from_op( - std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); - assert(ret == 0); - // call with_locked_obc() in order, but wait concurrently for loading. 
- enter_stage_sync(client_pp().lock_obc); - return pg->with_locked_obc( - get_target_oid(), op_info, - [&osd_ops, this](auto, auto obc) { - return enter_stage(client_pp().process - ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - return interruptor::now(); - }) - ); - }); - }); - }); - }).si_then([this] { - logger().debug("{}: complete", *this); - return handle.complete(); - }).handle_error_interruptible( - PG::load_obc_ertr::all_same_way([] { - return seastar::now(); - }) - ); + return with_interruption(); }, [](std::exception_ptr eptr) { return seastar::now(); }, pg, start_epoch).then([this] { diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index f198e58464338..2f3585013344d 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -41,6 +41,8 @@ class InternalClientRequest : public PhasedOperationT, CommonPGPipeline& client_pp(); + InternalClientRequest::interruptible_future<> with_interruption(); + seastar::future<> do_process(); Ref pg; From 238f3e573d48a082f49713cfa310110190ee521d Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 11 Sep 2024 21:16:51 +0000 Subject: [PATCH 112/148] crimson/.../internal_client_request: convert with_interruption to coroutine Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 123 +++++++++--------- 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index d4213928a3e46..d0aa0822f8030 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -53,72 +53,71 @@ CommonPGPipeline& InternalClientRequest::client_pp() InternalClientRequest::interruptible_future<> InternalClientRequest::with_interruption() { - return enter_stage( + LOG_PREFIX(InternalClientRequest::with_interruption); + co_await enter_stage( client_pp().wait_for_active - ).then_interruptible([this] { - return with_blocking_event([this] (auto&& trigger) { - return pg->wait_for_active_blocker.wait(std::move(trigger)); - }); - }).then_interruptible([this] { - return enter_stage( - client_pp().recover_missing); - }).then_interruptible([this] { - return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); - }).then_interruptible([this](bool unfound) { - if (unfound) { - throw std::system_error( - std::make_error_code(std::errc::operation_canceled), - fmt::format("{} is unfound, drop it!", get_target_oid())); - } - return enter_stage( - client_pp().get_obc); - }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { - LOG_PREFIX(InternalClientRequest::with_interruption); - DEBUGI("{}: getting obc lock", *this); - return seastar::do_with( - create_osd_ops(), - [this](auto& osd_ops) mutable { - LOG_PREFIX(InternalClientRequest::with_interruption); - DEBUGI("InternalClientRequest: got {} OSDOps to execute", - std::size(osd_ops)); - [[maybe_unused]] const int ret = op_info.set_from_op( - std::as_const(osd_ops), pg->get_pgid().pgid, 
*pg->get_osdmap()); - assert(ret == 0); - // call with_locked_obc() in order, but wait concurrently for loading. - enter_stage_sync(client_pp().lock_obc); - return pg->with_locked_obc( - get_target_oid(), op_info, - [&osd_ops, this](auto, auto obc) { - return enter_stage(client_pp().process - ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - return interruptor::now(); - }) - ); - }); - }); + ); + + co_await with_blocking_event([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + + co_await enter_stage(client_pp().recover_missing); + + bool unfound = co_await do_recover_missing( + pg, get_target_oid(), osd_reqid_t()); + + if (unfound) { + throw std::system_error( + std::make_error_code(std::errc::operation_canceled), + fmt::format("{} is unfound, drop it!", get_target_oid())); + } + co_await enter_stage( + client_pp().get_obc); + + DEBUGI("{}: getting obc lock", *this); + + auto osd_ops = create_osd_ops(); + + DEBUGI("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); + [[maybe_unused]] const int ret = op_info.set_from_op( + std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); + assert(ret == 0); + // call with_locked_obc() in order, but wait concurrently for loading. + enter_stage_sync(client_pp().lock_obc); + + auto fut = pg->with_locked_obc( + get_target_oid(), op_info, + [&osd_ops, this](auto, auto obc) { + return enter_stage(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this] { + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params() + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + return interruptor::now(); + }) + ); }); - }).si_then([this] { - logger().debug("{}: complete", *this); - return handle.complete(); }).handle_error_interruptible( - PG::load_obc_ertr::all_same_way([] { - return seastar::now(); - }) + crimson::ct_error::assert_all("unexpected error") ); + co_await std::move(fut); + + logger().debug("{}: complete", *this); + co_await interruptor::make_interruptible(handle.complete()); + co_return; } seastar::future<> InternalClientRequest::start() From 96c771383ae0458de68517f1e1f1757e27367d0d Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 13 Sep 2024 23:55:43 +0000 Subject: [PATCH 113/148] crimson: eliminate get_obc stage f90af12d introduced check_already_complete_get_obc to replace get_obc, but left get_obc and didn't update the other users. 
Signed-off-by: Samuel Just --- src/crimson/osd/osd_operation_external_tracking.h | 11 ----------- src/crimson/osd/osd_operations/client_request.h | 1 - src/crimson/osd/osd_operations/common/pg_pipeline.h | 3 --- .../osd/osd_operations/internal_client_request.cc | 2 +- .../osd/osd_operations/internal_client_request.h | 2 +- src/crimson/osd/osd_operations/snaptrim_event.cc | 2 +- src/crimson/osd/osd_operations/snaptrim_event.h | 2 +- 7 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h index 530732ba71028..d2786a95e4d3c 100644 --- a/src/crimson/osd/osd_operation_external_tracking.h +++ b/src/crimson/osd/osd_operation_external_tracking.h @@ -36,7 +36,6 @@ struct LttngBackend ClientRequest::PGPipeline::RecoverMissing:: BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend, - ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::Process::BlockingEvent::Backend, @@ -117,10 +116,6 @@ struct LttngBackend const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override { } - void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::GetOBC& blocker) override { - } void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev, const Operation& op, @@ -171,7 +166,6 @@ struct HistoricBackend ClientRequest::PGPipeline::RecoverMissing:: BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend, - ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::Process::BlockingEvent::Backend, @@ -252,11 +246,6 @@ struct HistoricBackend const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override { } - void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::GetOBC& blocker) override { - } - void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev, const Operation& op, const ClientRequest::PGPipeline::LockOBC& blocker) override { diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index ea7aade22ac75..f14e76504fcd6 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -104,7 +104,6 @@ class ClientRequest final : public PhasedOperationT, PGPipeline::RecoverMissing::BlockingEvent, scrub::PGScrubber::BlockingEvent, PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, - PGPipeline::GetOBC::BlockingEvent, PGPipeline::LockOBC::BlockingEvent, PGPipeline::Process::BlockingEvent, PGPipeline::WaitRepop::BlockingEvent, diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h index 2b2d03ae4b3ed..0146cb247945f 100644 --- a/src/crimson/osd/osd_operations/common/pg_pipeline.h +++ b/src/crimson/osd/osd_operations/common/pg_pipeline.h @@ -23,9 +23,6 @@ class CommonPGPipeline { struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT { static constexpr auto type_name = 
"CommonPGPipeline::check_already_complete_get_obc"; } check_already_complete_get_obc; - struct GetOBC : OrderedExclusivePhaseT { - static constexpr auto type_name = "CommonPGPipeline::get_obc"; - } get_obc; struct LockOBC : OrderedConcurrentPhaseT { static constexpr auto type_name = "CommonPGPipeline::lock_obc"; } lock_obc; diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index d0aa0822f8030..2bfa4296b2829 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -74,7 +74,7 @@ InternalClientRequest::with_interruption() fmt::format("{} is unfound, drop it!", get_target_oid())); } co_await enter_stage( - client_pp().get_obc); + client_pp().check_already_complete_get_obc); DEBUGI("{}: getting obc lock", *this); diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index 2f3585013344d..6e31ee993b9cb 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -58,7 +58,7 @@ class InternalClientRequest : public PhasedOperationT, CommonPGPipeline::WaitForActive::BlockingEvent, PGActivationBlocker::BlockingEvent, CommonPGPipeline::RecoverMissing::BlockingEvent, - CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, CommonPGPipeline::LockOBC::BlockingEvent, CommonPGPipeline::Process::BlockingEvent, CompletionEvent diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc index 7512b3d108dfc..9ed0b73cfb458 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.cc +++ b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -396,7 +396,7 @@ SnapTrimObjSubEvent::start() }); co_await enter_stage( - client_pp().get_obc); + client_pp().check_already_complete_get_obc); logger().debug("{}: getting obc for {}", *this, coid); // end of commonality diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h index 06d8f43c2f3c9..cdd82cdbf3086 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.h +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -170,7 +170,7 @@ class SnapTrimObjSubEvent : public PhasedOperationT { std::tuple< StartEvent, - CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, CommonPGPipeline::Process::BlockingEvent, CommonPGPipeline::WaitRepop::BlockingEvent, CompletionEvent From 7da7c3d736cebed2233ed836f53219b8dfe85047 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 16 Sep 2024 22:16:37 +0000 Subject: [PATCH 114/148] crimson/osd: move pipelines to osd_operation.h Each of the two existing pipelines are shared across multiple ops. Rather than defining them in a specific op or in osd_operations/common/pg_pipeline.h, just declare them in osd_operation.h. 
Signed-off-by: Samuel Just --- src/crimson/osd/osd_operation.h | 31 ++++++++++++++++ .../osd/osd_operations/client_request.h | 1 - .../osd/osd_operations/common/pg_pipeline.h | 37 ------------------- .../osd_operations/internal_client_request.h | 1 - .../osd/osd_operations/peering_event.h | 9 ----- .../osd/osd_operations/snaptrim_event.h | 1 - 6 files changed, 31 insertions(+), 49 deletions(-) delete mode 100644 src/crimson/osd/osd_operations/common/pg_pipeline.h diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h index fb0432edb8f9a..fd8b049c0bf08 100644 --- a/src/crimson/osd/osd_operation.h +++ b/src/crimson/osd/osd_operation.h @@ -40,6 +40,37 @@ struct PerShardPipeline { } create_or_wait_pg; }; +struct PGPeeringPipeline { + struct AwaitMap : OrderedExclusivePhaseT { + static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map"; + } await_map; + struct Process : OrderedExclusivePhaseT { + static constexpr auto type_name = "PeeringEvent::PGPipeline::process"; + } process; +}; + +struct CommonPGPipeline { + struct WaitForActive : OrderedExclusivePhaseT { + static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; + } wait_for_active; + struct RecoverMissing : OrderedConcurrentPhaseT { + static constexpr auto type_name = "CommonPGPipeline::recover_missing"; + } recover_missing; + struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT { + static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc"; + } check_already_complete_get_obc; + struct LockOBC : OrderedConcurrentPhaseT { + static constexpr auto type_name = "CommonPGPipeline::lock_obc"; + } lock_obc; + struct Process : OrderedExclusivePhaseT { + static constexpr auto type_name = "CommonPGPipeline::process"; + } process; + struct WaitRepop : OrderedConcurrentPhaseT { + static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; + } wait_repop; +}; + + enum class OperationTypeCode { client_request = 0, peering_event, diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index f14e76504fcd6..331cedaadfff2 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -14,7 +14,6 @@ #include "crimson/osd/osdmap_gate.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/osd_operations/client_request_common.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg_activation_blocker.h" #include "crimson/osd/pg_map.h" #include "crimson/osd/scrub/pg_scrubber.h" diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h deleted file mode 100644 index 0146cb247945f..0000000000000 --- a/src/crimson/osd/osd_operations/common/pg_pipeline.h +++ /dev/null @@ -1,37 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include "osd/osd_op_util.h" -#include "crimson/osd/osd_operation.h" - -namespace crimson::osd { - -class CommonPGPipeline { -protected: - friend class InternalClientRequest; - friend class SnapTrimEvent; - friend class SnapTrimObjSubEvent; - - struct WaitForActive : OrderedExclusivePhaseT { - static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; - } wait_for_active; - struct RecoverMissing : OrderedConcurrentPhaseT { - static constexpr auto type_name = "CommonPGPipeline::recover_missing"; - } recover_missing; - struct CheckAlreadyCompleteGetObc 
: OrderedExclusivePhaseT { - static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc"; - } check_already_complete_get_obc; - struct LockOBC : OrderedConcurrentPhaseT { - static constexpr auto type_name = "CommonPGPipeline::lock_obc"; - } lock_obc; - struct Process : OrderedExclusivePhaseT { - static constexpr auto type_name = "CommonPGPipeline::process"; - } process; - struct WaitRepop : OrderedConcurrentPhaseT { - static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; - } wait_repop; -}; - -} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index 6e31ee993b9cb..782fb809042a6 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -6,7 +6,6 @@ #include "crimson/common/type_helpers.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/osd_operations/client_request_common.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg.h" #include "crimson/osd/pg_activation_blocker.h" diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h index 1e6bd957289ff..85de5c711d67c 100644 --- a/src/crimson/osd/osd_operations/peering_event.h +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -23,15 +23,6 @@ class ShardServices; class PG; class BackfillRecovery; - struct PGPeeringPipeline { - struct AwaitMap : OrderedExclusivePhaseT { - static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map"; - } await_map; - struct Process : OrderedExclusivePhaseT { - static constexpr auto type_name = "PeeringEvent::PGPipeline::process"; - } process; - }; - template class PeeringEvent : public PhasedOperationT { T* that() { diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h index cdd82cdbf3086..1164b3169d293 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.h +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -9,7 +9,6 @@ #include "crimson/osd/osdmap_gate.h" #include "crimson/osd/osd_operation.h" #include "crimson/common/subop_blocker.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg.h" #include "crimson/osd/pg_activation_blocker.h" #include "osd/osd_types.h" From 0a83d956e546d7d04c55de34a788234533ed5293 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 19 Sep 2024 00:59:21 +0000 Subject: [PATCH 115/148] crimson: remove the eagain error from PG::do_osd_ops The idea here is that PG::do_osd_ops propogates an eagain after starting a repair upon encountering an eio to indicate that the op should restart from the top of ClientRequest::process_op. However, InternalClientRequest's handler for this error simply ignores it. ClientRequest's handling, while superficially reasonable, doesn't actually work. Re-calling process_op would mean reentering previous stages. This is problematic for at least a few reasons: 1. Reentering a prior stage with the same handler doesn't actually work since the corresponding event entries will already be populated. 2. There might be other ops on the same object waiting on the process stage. They'd need to be sent back as well in order to preserve ordering. Because this mechanism doesn't really seem to be fully baked, let's remove it for now and try to reintroduce it later after do_osd_ops[_execute] are a bit simpler. 
Signed-off-by: Samuel Just --- .../osd/osd_operations/client_request.cc | 23 +++++++++++++------ .../osd/osd_operations/client_request.h | 7 +----- .../osd_operations/internal_client_request.cc | 2 ++ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index 8e9a7c4d7490c..6eed04df6a5ac 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -403,11 +403,6 @@ ClientRequest::process_op( *pg, *this, this_instance_id); return do_process( ihref, pg, obc, this_instance_id - ).handle_error_interruptible( - crimson::ct_error::eagain::handle( - [this, pg, this_instance_id, &ihref]() mutable { - return process_op(ihref, pg, this_instance_id); - }) ); } ); @@ -437,7 +432,7 @@ ClientRequest::process_op( co_await std::move(process); } -ClientRequest::do_process_iertr::future<> +ClientRequest::interruptible_future<> ClientRequest::do_process( instance_handle_t &ihref, Ref pg, crimson::osd::ObjectContextRef obc, @@ -509,12 +504,26 @@ ClientRequest::do_process( auto [submitted, all_completed] = co_await pg->do_osd_ops( m, r_conn, obc, op_info, snapc + ).handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); + return std::make_tuple( + interruptor::now(), + PG::do_osd_ops_iertr::make_ready_future>()); + }) ); co_await std::move(submitted); co_await ihref.enter_stage(client_pp(*pg).wait_repop, *this); - auto reply = co_await std::move(all_completed); + auto reply = co_await std::move( + all_completed + ).handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); + return MURef(); + }) + ); co_await ihref.enter_stage(client_pp(*pg).send_reply, *this); DEBUGDPP("{}.{}: sending response", diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index 331cedaadfff2..6ee57e9874cd1 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -274,12 +274,7 @@ class ClientRequest final : public PhasedOperationT, interruptible_future<> with_sequencer(FuncT&& func); interruptible_future<> reply_op_error(const Ref& pg, int err); - - using do_process_iertr = - ::crimson::interruptible::interruptible_errorator< - ::crimson::osd::IOInterruptCondition, - ::crimson::errorator>; - do_process_iertr::future<> do_process( + interruptible_future<> do_process( instance_handle_t &ihref, Ref pg, crimson::osd::ObjectContextRef obc, diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index 2bfa4296b2829..dabff1a33bdb6 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -103,9 +103,11 @@ InternalClientRequest::with_interruption() [](auto submitted, auto all_completed) { return all_completed.handle_error_interruptible( crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); return seastar::now(); })); }, crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); return interruptor::now(); }) ); From a43452f47ee6f2eb7e2496ee242848acba8f8472 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 19 Sep 2024 23:58:48 +0000 Subject: [PATCH 116/148] crimson: OpsExecutor::flush_clone_metadata no longer needs to return a future Snapmapper updates happen during log commit 
now. Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.cc | 10 ++-------- src/crimson/osd/ops_executer.h | 4 ++-- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index 0a07fa7ee293e..9bf60140374c8 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -973,7 +973,7 @@ void OpsExecuter::CloningContext::apply_to( processed_obc.ssc->snapset = std::move(new_snapset); } -OpsExecuter::interruptible_future> +std::vector OpsExecuter::flush_clone_metadata( std::vector&& log_entries, SnapMapper& snap_mapper, @@ -981,7 +981,6 @@ OpsExecuter::flush_clone_metadata( ceph::os::Transaction& txn) { assert(!txn.empty()); - auto maybe_snap_mapped = interruptor::now(); update_clone_overlap(); if (cloning_ctx) { std::move(*cloning_ctx).apply_to(log_entries, *obc); @@ -993,12 +992,7 @@ OpsExecuter::flush_clone_metadata( } logger().debug("{} done, initial snapset={}, new snapset={}", __func__, obc->obs.oi.soid, obc->ssc->snapset); - return std::move( - maybe_snap_mapped - ).then_interruptible([log_entries=std::move(log_entries)]() mutable { - return interruptor::make_ready_future>( - std::move(log_entries)); - }); + return std::move(log_entries); } ObjectContextRef OpsExecuter::prepare_clone( diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index e25a035616edd..0b61f80b9983b 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -267,7 +267,7 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { */ void update_clone_overlap(); - interruptible_future> flush_clone_metadata( + std::vector flush_clone_metadata( std::vector&& log_entries, SnapMapper& snap_mapper, OSDriver& osdriver, @@ -510,7 +510,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( } if (want_mutate) { - auto log_entries = co_await flush_clone_metadata( + auto log_entries = flush_clone_metadata( prepare_transaction(ops), snap_mapper, osdriver, From 24b7b4f4b5d53927d5cc6689fd0ca1ec2276a5f3 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 20 Sep 2024 02:23:47 +0000 Subject: [PATCH 117/148] crimson: futures from flush_changes_n_do_ops_effects must not fail The return signature previously suggested that the second future returned could be an error. This seemed necessary due to how effects are handled: template OpsExecuter::rep_op_fut_t OpsExecuter::flush_changes_n_do_ops_effects( const std::vector& ops, SnapMapper& snap_mapper, OSDriver& osdriver, MutFunc mut_func) && { ... all_completed = std::move(all_completed).then_interruptible([this, pg=this->pg] { // let's do the cleaning of `op_effects` in destructor return interruptor::do_for_each(op_effects, [pg=std::move(pg)](auto& op_effect) { return op_effect->execute(pg); }); However, all of the actual execute implementations (created via OpsExecuter::with_effect_on_obc) return a bare seastar::future and cannot fail. In a larger sense, it's actually critical that neither future returned from flush_changes_n_do_ops_effects may fail -- they represent applying the transaction locally and remotely. If either portion fails, there would need to be an interval change to recover. 
Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 11 ++++--- src/crimson/osd/pg.cc | 53 +++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 0b61f80b9983b..185ead24e7550 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -179,7 +179,7 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { // should be used. struct effect_t { // an effect can affect PG, i.e. create a watch timeout - virtual osd_op_errorator::future<> execute(Ref pg) = 0; + virtual seastar::future<> execute(Ref pg) = 0; virtual ~effect_t() = default; }; @@ -400,7 +400,7 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { execute_op(OSDOp& osd_op); using rep_op_fut_tuple = - std::tuple, osd_op_ierrorator::future<>>; + std::tuple, interruptible_future<>>; using rep_op_fut_t = interruptible_future; template @@ -475,7 +475,7 @@ auto OpsExecuter::with_effect_on_obc( effect_func(std::move(effect_func)), obc(std::move(obc)) { } - osd_op_errorator::future<> execute(Ref pg) final { + seastar::future<> execute(Ref pg) final { return std::move(effect_func)(std::move(ctx), std::move(obc), std::move(pg)); @@ -502,8 +502,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( assert(obc); auto submitted = interruptor::now(); - auto all_completed = - interruptor::make_interruptible(osd_op_errorator::now()); + auto all_completed = interruptor::now(); if (cloning_ctx) { ceph_assert(want_mutate); @@ -536,7 +535,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( // need extra ref pg due to apply_stats() which can be executed after // informing snap mapper all_completed = - std::move(all_completed).safe_then_interruptible([this, pg=this->pg] { + std::move(all_completed).then_interruptible([this, pg=this->pg] { // let's do the cleaning of `op_effects` in destructor return interruptor::do_for_each(op_effects, [pg=std::move(pg)](auto& op_effect) { diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 97d48c1fa454c..8ab4e4e899b8e 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -999,6 +999,28 @@ PG::do_osd_ops_execute( ceph_osd_op_name(osd_op.op.op)); return ox->execute_op(osd_op); }).safe_then_interruptible([this, ox, &ops] { + /* flush_changes_n_do_ops_effects now returns + * + * interruptible_future< + * tuple, interruptible_future<>>> + * + * Previously, this lambda relied on the second element of that tuple to + * include OpsExecutor::osd_op_errorator in order to propogate the + * following three errors to the next callback. This is actually quite + * awkward as the second future is the completion future, which really + * cannot fail (for it to do so would require an interval change to + * correct). + * + * Rather than reworking this now, I'll leave it as is and refactor it + * later. + */ + using complete_iertr = crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + OpsExecuter::osd_op_errorator>; + using ret_t = std::tuple< + interruptible_future<>, + complete_iertr::future<>>; + logger().debug( "do_osd_ops_execute: object {} all operations successful", ox->get_target()); @@ -1014,22 +1036,22 @@ PG::do_osd_ops_execute( // they tried, they failed. 
logger().info(" full, replying to FULL_TRY op"); if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) - return interruptor::make_ready_future( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::edquot::make())); + return interruptor::make_ready_future( + interruptor::now(), + complete_iertr::future<>( + crimson::ct_error::edquot::make())); else - return interruptor::make_ready_future( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::enospc::make())); + return interruptor::make_ready_future( + interruptor::now(), + complete_iertr::future<>( + crimson::ct_error::enospc::make())); } else { // drop request logger().info(" full, dropping request (bad client)"); - return interruptor::make_ready_future( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::eagain::make())); + return interruptor::make_ready_future( + interruptor::now(), + complete_iertr::future<>( + crimson::ct_error::eagain::make())); } } return std::move(*ox).flush_changes_n_do_ops_effects( @@ -1049,7 +1071,12 @@ PG::do_osd_ops_execute( std::move(txn), std::move(osd_op_p), std::move(log_entries)); - }); + }).then_interruptible([](auto &&futs) { + auto &&[submitted, completed] = std::move(futs); + return interruptor::make_ready_future( + std::move(submitted), + std::move(completed)); + }); }).safe_then_unpack_interruptible( [success_func=std::move(success_func), rollbacker, this, failure_func_ptr, obc] (auto submitted_fut, auto _all_completed_fut) mutable { From 5e28a3bd3b58353ff29cf1cd1b9627575158c290 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 20 Sep 2024 12:56:17 -0700 Subject: [PATCH 118/148] crimson: introduce rollback_obc_if_modified without an error argument Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 185ead24e7550..6986f49ea08a7 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -551,6 +551,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( template struct OpsExecuter::RollbackHelper { + void rollback_obc_if_modified(); void rollback_obc_if_modified(const std::error_code& e); seastar::lw_shared_ptr ox; Func func; @@ -562,6 +563,33 @@ OpsExecuter::create_rollbacker(Func&& func) { return {shared_from_this(), std::forward(func)}; } +template +void OpsExecuter::RollbackHelper::rollback_obc_if_modified() +{ + // Oops, an operation had failed. do_osd_ops() altogether with + // OpsExecuter already dropped the ObjectStore::Transaction if + // there was any. However, this is not enough to completely + // rollback as we gave OpsExecuter the very single copy of `obc` + // we maintain and we did it for both reading and writing. + // Now all modifications must be reverted. + // + // The conditional's purpose is to efficiently handle hot errors + // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or + // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients + // typically append them before any write. If OpsExecuter hasn't + // seen any modifying operation, `obc` is supposed to be kept + // unchanged. 
+ assert(ox); + const auto need_rollback = ox->has_seen_write(); + crimson::get_logger(ceph_subsys_osd).debug( + "{}: object {} got error, need_rollback={}", + __func__, + ox->obc->get_oid(), + need_rollback); + if (need_rollback) { + func(ox->obc); + } +} template void OpsExecuter::RollbackHelper::rollback_obc_if_modified( From 7a826eb86c423e895345557632091a934f7c7d7e Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 19 Sep 2024 19:39:08 -0700 Subject: [PATCH 119/148] crimson: PG::submit_error_log returns eversion_t rather than optional It seems like the motivation here was to allow do_osd_ops_execute to communicate that it didn't submit an error log by making maybe_submit_error_log a std::optional. However, submit_error_log itself always returns a version. Fix submit_error_log and compensate in do_osd_ops_execute. Signed-off-by: Samuel Just --- src/crimson/osd/pg.cc | 10 +++++++--- src/crimson/osd/pg.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 8ab4e4e899b8e..26d1fa883bbce 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -1122,7 +1122,11 @@ PG::do_osd_ops_execute( if constexpr (!std::is_same_v) { if(op_info.may_write()) { maybe_submit_error_log = - submit_error_log(m, op_info, obc, e, rep_tid); + submit_error_log( + m, op_info, obc, e, rep_tid + ).then_interruptible([](auto &&e) { + return std::make_optional(std::move(e)); + }); } } return maybe_submit_error_log.then_interruptible( @@ -1175,7 +1179,7 @@ PG::interruptible_future<> PG::complete_error_log(const ceph_tid_t& rep_tid, return result; } -PG::interruptible_future> PG::submit_error_log( +PG::interruptible_future PG::submit_error_log( Ref m, const OpInfo &op_info, ObjectContextRef obc, @@ -1241,7 +1245,7 @@ PG::interruptible_future> PG::submit_error_log( get_collection_ref(), std::move(t) ).then([this] { peering_state.update_trim_to(); - return seastar::make_ready_future>(projected_last_update); + return seastar::make_ready_future(projected_last_update); }); }); }); diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index d8bbc56abcc46..5bd5c3aeff849 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -621,7 +621,7 @@ class PG : public boost::intrusive_ref_counter< void dump_primary(Formatter*); interruptible_future<> complete_error_log(const ceph_tid_t& rep_tid, const eversion_t& version); - interruptible_future> submit_error_log( + interruptible_future submit_error_log( Ref m, const OpInfo &op_info, ObjectContextRef obc, From 00057b45f03ae9864a83451b498b4e0239496785 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 14:10:06 -0700 Subject: [PATCH 120/148] crimson: introduce PG::run_executer,submit_executer These are intended to replace do_osd_ops*. The implementation is simpler and does not involve passing success and failure callbacks. It also moves responsibility for dealing with the MOSDOpReply and client related error handling over to ClientRequest. do_osd_op* will be removed once users are switched over. 
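For illustration, the intended caller pattern is roughly the following
sketch (the errorator-based error handling and the MOSDOpReply construction
are elided; the ClientRequest conversion later in this series is the
authoritative user, and pg/obc/op_info/m/conn/snapc stand in for the
caller's context):

  auto ox = seastar::make_lw_shared<OpsExecuter>(
    pg, obc, op_info, *m, conn, snapc);

  // phase 1: run the ops; client-visible errors surface here
  // (in the real caller this goes through handle_error_interruptible)
  co_await pg->run_executer(ox, obc, op_info, m->ops);

  // phase 2: flush and submit the transaction; neither returned
  // future is allowed to fail
  auto [submitted, all_completed] =
    co_await pg->submit_executer(std::move(ox), m->ops);
  co_await std::move(submitted);
  co_await std::move(all_completed);
  // the caller now builds and sends the reply itself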
Signed-off-by: Samuel Just --- src/crimson/osd/pg.cc | 79 +++++++++++++++++++++++++++++++++++++++++++ src/crimson/osd/pg.h | 27 +++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 26d1fa883bbce..bb5c1e9000baf 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -13,6 +13,9 @@ #include #include #include + +#include + #include "include/utime_fmt.h" #include "common/hobject.h" @@ -1251,6 +1254,82 @@ PG::interruptible_future PG::submit_error_log( }); } +PG::run_executer_fut PG::run_executer( + seastar::lw_shared_ptr ox, + ObjectContextRef obc, + const OpInfo &op_info, + std::vector& ops) +{ + LOG_PREFIX(PG::run_executer); + auto rollbacker = ox->create_rollbacker( + [stored_obc=duplicate_obc(obc)](auto &obc) mutable { + obc->update_from(*stored_obc); + }); + auto rollback_on_error = seastar::defer([&rollbacker] { + rollbacker.rollback_obc_if_modified(); + }); + + for (auto &op: ops) { + DEBUGDPP("object {} handle op {}", *this, ox->get_target(), op); + co_await ox->execute_op(op); + } + DEBUGDPP("object {} all operations successful", *this, ox->get_target()); + + // check for full + if ((ox->delta_stats.num_bytes > 0 || + ox->delta_stats.num_objects > 0) && + get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { + const auto& m = ox->get_message(); + if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now + m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { + INFODPP("full, but proceeding due to FULL_FORCE, or MDS", *this); + } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + // they tried, they failed. + INFODPP("full, replying to FULL_TRY op", *this); + if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) { + co_await run_executer_fut( + crimson::ct_error::edquot::make()); + } else { + co_await run_executer_fut( + crimson::ct_error::enospc::make()); + } + } else { + // drop request + INFODPP("full, dropping request (bad client)", *this); + co_await run_executer_fut( + crimson::ct_error::eagain::make()); + } + } + rollback_on_error.cancel(); +} + +PG::submit_executer_fut PG::submit_executer( + seastar::lw_shared_ptr ox, + const std::vector& ops) +{ + LOG_PREFIX(PG::submit_executer); + // transaction must commit at this point + return std::move( + *ox + ).flush_changes_n_do_ops_effects( + ops, + snap_mapper, + osdriver, + [FNAME, this](auto&& txn, + auto&& obc, + auto&& osd_op_p, + auto&& log_entries) { + DEBUGDPP("object {} submitting txn", *this, obc->get_oid()); + mutate_object(obc, txn, osd_op_p); + return submit_transaction( + std::move(obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries)); + }); +} + + PG::do_osd_ops_iertr::future>> PG::do_osd_ops( Ref m, diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 5bd5c3aeff849..c91f93171dbc1 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -645,6 +645,33 @@ class PG : public boost::intrusive_ref_counter< } } background_process_lock; + using run_executer_ertr = crimson::compound_errorator_t< + OpsExecuter::osd_op_errorator, + crimson::errorator< + crimson::ct_error::edquot, + crimson::ct_error::eagain, + crimson::ct_error::enospc + > + >; + using run_executer_iertr = crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + run_executer_ertr>; + using run_executer_fut = run_executer_iertr::future<>; + run_executer_fut run_executer( + seastar::lw_shared_ptr ox, + ObjectContextRef obc, + const OpInfo &op_info, + std::vector& ops); + + using submit_executer_ret = std::tuple< + 
interruptible_future<>, + interruptible_future<>>; + using submit_executer_fut = interruptible_future< + submit_executer_ret>; + submit_executer_fut submit_executer( + seastar::lw_shared_ptr ox, + const std::vector& ops); + using do_osd_ops_ertr = crimson::errorator< crimson::ct_error::eagain>; using do_osd_ops_iertr = From 304e20e9bcf6f29b0f0f22089665d78099265fec Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 15:15:48 -0700 Subject: [PATCH 121/148] crimson: switch ClientRequest::do_request to use *_executer rather than do_osd_ops Signed-off-by: Samuel Just --- .../osd/osd_operations/client_request.cc | 141 +++++++++++++++--- 1 file changed, 117 insertions(+), 24 deletions(-) diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index 6eed04df6a5ac..c226222fa0c75 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -502,36 +502,129 @@ ClientRequest::do_process( co_return; } - auto [submitted, all_completed] = co_await pg->do_osd_ops( - m, r_conn, obc, op_info, snapc - ).handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return std::make_tuple( - interruptor::now(), - PG::do_osd_ops_iertr::make_ready_future>()); + auto ox = seastar::make_lw_shared( + pg, obc, op_info, *m, r_conn, snapc); + auto ret = co_await pg->run_executer( + ox, obc, op_info, m->ops + ).si_then([]() -> std::optional { + return std::nullopt; + }).handle_error_interruptible(crimson::ct_error::all_same_way( + [](auto e) -> std::optional { + return e; }) ); - co_await std::move(submitted); - co_await ihref.enter_stage(client_pp(*pg).wait_repop, *this); + auto should_log_error = [](std::error_code e) -> bool { + switch (e.value()) { + case EDQUOT: + case ENOSPC: + case EAGAIN: + return false; + default: + return true; + } + }; - auto reply = co_await std::move( - all_completed - ).handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return MURef(); - }) - ); + if (ret && !should_log_error(*ret)) { + co_await reply_op_error(pg, -ret->value()); + co_return; + } + + { + auto all_completed = interruptor::now(); + if (ret) { + assert(should_log_error(*ret)); + if (op_info.may_write()) { + auto rep_tid = pg->shard_services.get_tid(); + auto version = co_await pg->submit_error_log( + m, op_info, obc, *ret, rep_tid); + + all_completed = pg->complete_error_log( + rep_tid, version); + } + // simply return the error below, leaving all_completed alone + } else { + auto submitted = interruptor::now(); + std::tie(submitted, all_completed) = co_await pg->submit_executer( + std::move(ox), m->ops); + co_await std::move(submitted); + } + co_await ihref.enter_stage(client_pp(*pg).wait_repop, *this); + + co_await std::move(all_completed); + } co_await ihref.enter_stage(client_pp(*pg).send_reply, *this); - DEBUGDPP("{}.{}: sending response", - *pg, *this, this_instance_id); - // TODO: gate the crosscore sending - co_await interruptor::make_interruptible( - get_foreign_connection().send_with_throttling(std::move(reply)) - ); + + if (ret) { + int err = -ret->value(); + DEBUGDPP("{}: replying with error {}", *pg, *this, err); + + auto reply = crimson::make_message( + m.get(), err, pg->get_osdmap_epoch(), 0, false); + + if (!m->ops.empty() && m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) { + reply->set_result(0); + } + + // For all ops except for CMPEXT, the correct error value is encoded + // in 
e. For CMPEXT, osdop.rval has the actual error value. + if (err == -ct_error::cmp_fail_error_value) { + assert(!m->ops.empty()); + for (auto &osdop : m->ops) { + if (osdop.rval < 0) { + reply->set_result(osdop.rval); + break; + } + } + } + + reply->set_enoent_reply_versions( + pg->peering_state.get_info().last_update, + pg->peering_state.get_info().last_user_version); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + + // TODO: gate the crosscore sending + co_await interruptor::make_interruptible( + get_foreign_connection().send_with_throttling(std::move(reply))); + } else { + int result = m->ops.empty() ? 0 : m->ops.back().rval.code; + if (op_info.may_read() && result >= 0) { + for (auto &osdop : m->ops) { + if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = osdop.rval.code; + break; + } + } + } else if (result > 0 && op_info.may_write() && !op_info.allows_returnvec()) { + result = 0; + } else if (result < 0 && + (m->ops.empty() ? + 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = 0; + } + auto reply = crimson::make_message( + m.get(), + result, + pg->get_osdmap_epoch(), + 0, + false); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + if (obc->obs.exists) { + reply->set_reply_versions(pg->peering_state.get_info().last_update, + obc->obs.oi.user_version); + } else { + reply->set_reply_versions(pg->peering_state.get_info().last_update, + pg->peering_state.get_info().last_user_version); + } + + DEBUGDPP("{}.{}: sending response {}", + *pg, *this, this_instance_id, *m); + // TODO: gate the crosscore sending + co_await interruptor::make_interruptible( + get_foreign_connection().send_with_throttling(std::move(reply)) + ); + } } bool ClientRequest::is_misdirected(const PG& pg) const From fc41fcb9d2a7c5b589ea68ad0644ac92d22fe761 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 22:30:59 +0000 Subject: [PATCH 122/148] crimson: factor out InternalClientRequest::do_process Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 44 +++++++++++-------- .../osd_operations/internal_client_request.h | 3 ++ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index dabff1a33bdb6..d0ee392ecb638 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -50,6 +50,30 @@ CommonPGPipeline& InternalClientRequest::client_pp() return pg->request_pg_pipeline; } +InternalClientRequest::interruptible_future<> +InternalClientRequest::do_process( + crimson::osd::ObjectContextRef obc, + std::vector &osd_ops) +{ + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params() + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); + return interruptor::now(); + }) + ); +} + InternalClientRequest::interruptible_future<> InternalClientRequest::with_interruption() { @@ -93,24 +117,8 @@ InternalClientRequest::with_interruption() [&osd_ops, this](auto, auto obc) { return enter_stage(client_pp().process ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - 
osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return interruptor::now(); - }) - ); + [obc=std::move(obc), &osd_ops, this]() mutable { + return do_process(std::move(obc), osd_ops); }); }).handle_error_interruptible( crimson::ct_error::assert_all("unexpected error") diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index 782fb809042a6..6023db0a8dbe2 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -41,6 +41,9 @@ class InternalClientRequest : public PhasedOperationT, CommonPGPipeline& client_pp(); InternalClientRequest::interruptible_future<> with_interruption(); + InternalClientRequest::interruptible_future<> do_process( + crimson::osd::ObjectContextRef obc, + std::vector &osd_ops); seastar::future<> do_process(); From c091f3b2ab6a89762e6fcf5ccaa49b65c9ab6fca Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 22:43:35 +0000 Subject: [PATCH 123/148] crimson: convert InternalClientRequest::do_request to use *_executer rather than do_osd_ops* Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index d0ee392ecb638..6ad447cf32ee4 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -55,23 +55,26 @@ InternalClientRequest::do_process( crimson::osd::ObjectContextRef obc, std::vector &osd_ops) { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return interruptor::now(); - }) + LOG_PREFIX(InternalClientRequest::do_process); + auto params = get_do_osd_ops_params(); + auto ox = seastar::make_lw_shared( + pg, obc, op_info, params, params.get_connection(), SnapContext{}); + co_await pg->run_executer( + ox, obc, op_info, osd_ops + ).handle_error_interruptible( + crimson::ct_error::all_same_way( + [this, FNAME](auto e) { + ERRORDPPI("{}: got unexpected error {}", *pg, *this, e); + ceph_assert(0 == "should not return an error"); + return interruptor::now(); + }) ); + + auto [submitted, completed] = co_await pg->submit_executer( + std::move(ox), osd_ops); + + co_await std::move(submitted); + co_await std::move(completed); } InternalClientRequest::interruptible_future<> From a0efff116cd038b08c0ce31a5c32c4b9df574088 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 10 Oct 2024 16:22:28 +0000 Subject: [PATCH 124/148] crimson: clarify ops_executer.h comment Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/crimson/osd/ops_executer.h 
b/src/crimson/osd/ops_executer.h index 6986f49ea08a7..3a7aaef7cd036 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -170,13 +170,9 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { object_stat_sum_t delta_stats; private: - // an operation can be divided into two stages: main and effect-exposing - // one. The former is performed immediately on call to `do_osd_op()` while - // the later on `submit_changes()` – after successfully processing main - // stages of all involved operations. When any stage fails, none of all - // scheduled effect-exposing stages will be executed. - // when operation requires this division, some variant of `with_effect()` - // should be used. + // with_effect can be used to schedule operations to be performed + // at commit time. effects will be discarded if the operation does + // not commit. struct effect_t { // an effect can affect PG, i.e. create a watch timeout virtual seastar::future<> execute(Ref pg) = 0; From 8f3ac965c310d80270e53644c56f3bca30511240 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 22:49:59 +0000 Subject: [PATCH 125/148] crimson: remove now unused PG::do_osd_ops* and log_reply Signed-off-by: Samuel Just --- src/crimson/osd/pg.cc | 308 ------------------------------------------ src/crimson/osd/pg.h | 35 +---- 2 files changed, 1 insertion(+), 342 deletions(-) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index bb5c1e9000baf..9cdd19d01332f 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -978,181 +978,6 @@ ObjectContextRef duplicate_obc(const ObjectContextRef &obc) { return object_context; } -template -PG::do_osd_ops_iertr::future> -PG::do_osd_ops_execute( - seastar::lw_shared_ptr ox, - ObjectContextRef obc, - const OpInfo &op_info, - Ref m, - std::vector& ops, - SuccessFunc&& success_func, - FailureFunc&& failure_func) -{ - assert(ox); - auto rollbacker = ox->create_rollbacker( - [object_context=duplicate_obc(obc)] (auto& obc) mutable { - obc->update_from(*object_context); - }); - auto failure_func_ptr = seastar::make_lw_shared(std::move(failure_func)); - return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) { - logger().debug( - "do_osd_ops_execute: object {} - handling op {}", - ox->get_target(), - ceph_osd_op_name(osd_op.op.op)); - return ox->execute_op(osd_op); - }).safe_then_interruptible([this, ox, &ops] { - /* flush_changes_n_do_ops_effects now returns - * - * interruptible_future< - * tuple, interruptible_future<>>> - * - * Previously, this lambda relied on the second element of that tuple to - * include OpsExecutor::osd_op_errorator in order to propogate the - * following three errors to the next callback. This is actually quite - * awkward as the second future is the completion future, which really - * cannot fail (for it to do so would require an interval change to - * correct). - * - * Rather than reworking this now, I'll leave it as is and refactor it - * later. 
- */ - using complete_iertr = crimson::interruptible::interruptible_errorator< - ::crimson::osd::IOInterruptCondition, - OpsExecuter::osd_op_errorator>; - using ret_t = std::tuple< - interruptible_future<>, - complete_iertr::future<>>; - - logger().debug( - "do_osd_ops_execute: object {} all operations successful", - ox->get_target()); - // check for full - if ((ox->delta_stats.num_bytes > 0 || - ox->delta_stats.num_objects > 0) && - get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { - const auto& m = ox->get_message(); - if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now - m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { - logger().info(" full, but proceeding due to FULL_FORCE or MDS"); - } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) { - // they tried, they failed. - logger().info(" full, replying to FULL_TRY op"); - if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) - return interruptor::make_ready_future( - interruptor::now(), - complete_iertr::future<>( - crimson::ct_error::edquot::make())); - else - return interruptor::make_ready_future( - interruptor::now(), - complete_iertr::future<>( - crimson::ct_error::enospc::make())); - } else { - // drop request - logger().info(" full, dropping request (bad client)"); - return interruptor::make_ready_future( - interruptor::now(), - complete_iertr::future<>( - crimson::ct_error::eagain::make())); - } - } - return std::move(*ox).flush_changes_n_do_ops_effects( - ops, - snap_mapper, - osdriver, - [this] (auto&& txn, - auto&& obc, - auto&& osd_op_p, - auto&& log_entries) { - logger().debug( - "do_osd_ops_execute: object {} submitting txn", - obc->get_oid()); - mutate_object(obc, txn, osd_op_p); - return submit_transaction( - std::move(obc), - std::move(txn), - std::move(osd_op_p), - std::move(log_entries)); - }).then_interruptible([](auto &&futs) { - auto &&[submitted, completed] = std::move(futs); - return interruptor::make_ready_future( - std::move(submitted), - std::move(completed)); - }); - }).safe_then_unpack_interruptible( - [success_func=std::move(success_func), rollbacker, this, failure_func_ptr, obc] - (auto submitted_fut, auto _all_completed_fut) mutable { - - auto all_completed_fut = _all_completed_fut.safe_then_interruptible_tuple( - std::move(success_func), - crimson::ct_error::object_corrupted::handle( - [rollbacker, this, obc] (const std::error_code& e) mutable { - // this is a path for EIO. it's special because we want to fix the obejct - // and try again. that is, the layer above `PG::do_osd_ops` is supposed to - // restart the execution. 
- rollbacker.rollback_obc_if_modified(e); - return repair_object(obc->obs.oi.soid, - obc->obs.oi.version - ).then_interruptible([] { - return do_osd_ops_iertr::future{crimson::ct_error::eagain::make()}; - }); - }), OpsExecuter::osd_op_errorator::all_same_way( - [rollbacker, failure_func_ptr] - (const std::error_code& e) mutable { - // handle non-fatal errors only - ceph_assert(e.value() == EDQUOT || - e.value() == ENOSPC || - e.value() == EAGAIN); - rollbacker.rollback_obc_if_modified(e); - return (*failure_func_ptr)(e); - })); - - return PG::do_osd_ops_iertr::make_ready_future>( - std::move(submitted_fut), - std::move(all_completed_fut) - ); - }, OpsExecuter::osd_op_errorator::all_same_way( - [this, op_info, m, obc, - rollbacker, failure_func_ptr] - (const std::error_code& e) mutable { - ceph_tid_t rep_tid = shard_services.get_tid(); - rollbacker.rollback_obc_if_modified(e); - // record error log - auto maybe_submit_error_log = - interruptor::make_ready_future>(std::nullopt); - // call submit_error_log only for non-internal clients - if constexpr (!std::is_same_v) { - if(op_info.may_write()) { - maybe_submit_error_log = - submit_error_log( - m, op_info, obc, e, rep_tid - ).then_interruptible([](auto &&e) { - return std::make_optional(std::move(e)); - }); - } - } - return maybe_submit_error_log.then_interruptible( - [this, failure_func_ptr, e, rep_tid] (auto version) { - auto all_completed = - [this, failure_func_ptr, e, rep_tid, version] { - if (version.has_value()) { - return complete_error_log(rep_tid, version.value() - ).then_interruptible([failure_func_ptr, e] { - return (*failure_func_ptr)(e); - }); - } else { - return (*failure_func_ptr)(e); - } - }; - return PG::do_osd_ops_iertr::make_ready_future>( - std::move(seastar::now()), - std::move(all_completed()) - ); - }); - })); -} - PG::interruptible_future<> PG::complete_error_log(const ceph_tid_t& rep_tid, const eversion_t& version) { @@ -1329,139 +1154,6 @@ PG::submit_executer_fut PG::submit_executer( }); } - -PG::do_osd_ops_iertr::future>> -PG::do_osd_ops( - Ref m, - crimson::net::ConnectionXcoreRef conn, - ObjectContextRef obc, - const OpInfo &op_info, - const SnapContext& snapc) -{ - if (__builtin_expect(stopping, false)) { - throw crimson::common::system_shutdown_exception(); - } - return do_osd_ops_execute>( - seastar::make_lw_shared( - Ref{this}, obc, op_info, *m, conn, snapc), - obc, - op_info, - m, - m->ops, - // success_func - [this, m, obc, may_write = op_info.may_write(), - may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] { - // TODO: should stop at the first op which returns a negative retval, - // cmpext uses it for returning the index of first unmatched byte - int result = m->ops.empty() ? 0 : m->ops.back().rval.code; - if (may_read && result >= 0) { - for (auto &osdop : m->ops) { - if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { - result = osdop.rval.code; - break; - } - } - } else if (result > 0 && may_write && !rvec) { - result = 0; - } else if (result < 0 && (m->ops.empty() ? 
- 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { - result = 0; - } - auto reply = crimson::make_message(m.get(), - result, - get_osdmap_epoch(), - 0, - false); - reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); - logger().debug( - "do_osd_ops: {} - object {} sending reply", - *m, - m->get_hobj()); - if (obc->obs.exists) { - reply->set_reply_versions(peering_state.get_info().last_update, - obc->obs.oi.user_version); - } else { - reply->set_reply_versions(peering_state.get_info().last_update, - peering_state.get_info().last_user_version); - } - return do_osd_ops_iertr::make_ready_future>( - std::move(reply)); - }, - // failure_func - [m, this] - (const std::error_code& e) { - logger().error("do_osd_ops_execute::failure_func {} got error: {}", - *m, e); - return log_reply(m, e); - }); -} - -PG::do_osd_ops_iertr::future> -PG::log_reply( - Ref m, - const std::error_code& e) -{ - auto reply = crimson::make_message( - m.get(), -e.value(), get_osdmap_epoch(), 0, false); - if (m->ops.empty() ? 0 : - m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) { - reply->set_result(0); - } - // For all ops except for CMPEXT, the correct error value is encoded - // in e.value(). For CMPEXT, osdop.rval has the actual error value. - if (e.value() == ct_error::cmp_fail_error_value) { - assert(!m->ops.empty()); - for (auto &osdop : m->ops) { - if (osdop.rval < 0) { - reply->set_result(osdop.rval); - break; - } - } - } - reply->set_enoent_reply_versions( - peering_state.get_info().last_update, - peering_state.get_info().last_user_version); - reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); - return do_osd_ops_iertr::make_ready_future>( - std::move(reply)); -} - -PG::do_osd_ops_iertr::future> -PG::do_osd_ops( - ObjectContextRef obc, - std::vector& ops, - const OpInfo &op_info, - const do_osd_ops_params_t &&msg_params) -{ - // This overload is generally used for internal client requests, - // use an empty SnapContext. 
- return seastar::do_with( - std::move(msg_params), - [=, this, &ops, &op_info](auto &msg_params) { - return do_osd_ops_execute( - seastar::make_lw_shared( - Ref{this}, - obc, - op_info, - msg_params, - msg_params.get_connection(), - SnapContext{} - ), - obc, - op_info, - Ref(), - ops, - // success_func - [] { - return do_osd_ops_iertr::now(); - }, - // failure_func - [] (const std::error_code& e) { - return do_osd_ops_iertr::now(); - }); - }); -} - PG::interruptible_future> PG::do_pg_ops(Ref m) { if (__builtin_expect(stopping, false)) { diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index c91f93171dbc1..3a8ddad922a50 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -672,41 +672,8 @@ class PG : public boost::intrusive_ref_counter< seastar::lw_shared_ptr ox, const std::vector& ops); - using do_osd_ops_ertr = crimson::errorator< - crimson::ct_error::eagain>; - using do_osd_ops_iertr = - ::crimson::interruptible::interruptible_errorator< - ::crimson::osd::IOInterruptCondition, - ::crimson::errorator>; - template - using pg_rep_op_fut_t = - std::tuple, - do_osd_ops_iertr::future>; - do_osd_ops_iertr::future>> do_osd_ops( - Ref m, - crimson::net::ConnectionXcoreRef conn, - ObjectContextRef obc, - const OpInfo &op_info, - const SnapContext& snapc); - struct do_osd_ops_params_t; - do_osd_ops_iertr::future> log_reply( - Ref m, - const std::error_code& e); - do_osd_ops_iertr::future> do_osd_ops( - ObjectContextRef obc, - std::vector& ops, - const OpInfo &op_info, - const do_osd_ops_params_t &¶ms); - template - do_osd_ops_iertr::future> do_osd_ops_execute( - seastar::lw_shared_ptr ox, - ObjectContextRef obc, - const OpInfo &op_info, - Ref m, - std::vector& ops, - SuccessFunc&& success_func, - FailureFunc&& failure_func); + interruptible_future> do_pg_ops(Ref m); interruptible_future< std::tuple, interruptible_future<>>> From 7ac64b0b245798b1d4a85b1da86497d2baf2bceb Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 1 Oct 2024 13:05:03 -0700 Subject: [PATCH 126/148] crimson: OpsExecuter no longer needs to be a lw shared ptr ClientRequest and InternalClientRequest can declare them as auto variables. Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 6 ++--- .../osd/osd_operations/client_request.cc | 3 +-- .../osd_operations/internal_client_request.cc | 2 +- src/crimson/osd/pg.cc | 23 +++++++++---------- src/crimson/osd/pg.h | 4 ++-- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 3a7aaef7cd036..068f510d1ef82 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -40,7 +40,7 @@ namespace crimson::osd { class PG; // OpsExecuter -- a class for executing ops targeting a certain object. 
-class OpsExecuter : public seastar::enable_lw_shared_from_this { +class OpsExecuter { friend class SnapTrimObjSubEvent; using call_errorator = crimson::errorator< @@ -549,14 +549,14 @@ template struct OpsExecuter::RollbackHelper { void rollback_obc_if_modified(); void rollback_obc_if_modified(const std::error_code& e); - seastar::lw_shared_ptr ox; + OpsExecuter *ox; Func func; }; template inline OpsExecuter::RollbackHelper OpsExecuter::create_rollbacker(Func&& func) { - return {shared_from_this(), std::forward(func)}; + return {this, std::forward(func)}; } template diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index c226222fa0c75..a89fb2c84bc56 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -502,8 +502,7 @@ ClientRequest::do_process( co_return; } - auto ox = seastar::make_lw_shared( - pg, obc, op_info, *m, r_conn, snapc); + OpsExecuter ox(pg, obc, op_info, *m, r_conn, snapc); auto ret = co_await pg->run_executer( ox, obc, op_info, m->ops ).si_then([]() -> std::optional { diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index 6ad447cf32ee4..9e5867caf8067 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -57,7 +57,7 @@ InternalClientRequest::do_process( { LOG_PREFIX(InternalClientRequest::do_process); auto params = get_do_osd_ops_params(); - auto ox = seastar::make_lw_shared( + OpsExecuter ox( pg, obc, op_info, params, params.get_connection(), SnapContext{}); co_await pg->run_executer( ox, obc, op_info, osd_ops diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 9cdd19d01332f..744a1dbc02b97 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -1080,13 +1080,13 @@ PG::interruptible_future PG::submit_error_log( } PG::run_executer_fut PG::run_executer( - seastar::lw_shared_ptr ox, + OpsExecuter &ox, ObjectContextRef obc, const OpInfo &op_info, std::vector& ops) { LOG_PREFIX(PG::run_executer); - auto rollbacker = ox->create_rollbacker( + auto rollbacker = ox.create_rollbacker( [stored_obc=duplicate_obc(obc)](auto &obc) mutable { obc->update_from(*stored_obc); }); @@ -1095,16 +1095,16 @@ PG::run_executer_fut PG::run_executer( }); for (auto &op: ops) { - DEBUGDPP("object {} handle op {}", *this, ox->get_target(), op); - co_await ox->execute_op(op); + DEBUGDPP("object {} handle op {}", *this, ox.get_target(), op); + co_await ox.execute_op(op); } - DEBUGDPP("object {} all operations successful", *this, ox->get_target()); + DEBUGDPP("object {} all operations successful", *this, ox.get_target()); // check for full - if ((ox->delta_stats.num_bytes > 0 || - ox->delta_stats.num_objects > 0) && + if ((ox.delta_stats.num_bytes > 0 || + ox.delta_stats.num_objects > 0) && get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { - const auto& m = ox->get_message(); + const auto& m = ox.get_message(); if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { INFODPP("full, but proceeding due to FULL_FORCE, or MDS", *this); @@ -1129,13 +1129,12 @@ PG::run_executer_fut PG::run_executer( } PG::submit_executer_fut PG::submit_executer( - seastar::lw_shared_ptr ox, - const std::vector& ops) -{ + OpsExecuter &&ox, + const std::vector& ops) { LOG_PREFIX(PG::submit_executer); // transaction must commit at this point return std::move( - *ox + ox 
).flush_changes_n_do_ops_effects( ops, snap_mapper, diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 3a8ddad922a50..604f49005ff04 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -658,7 +658,7 @@ class PG : public boost::intrusive_ref_counter< run_executer_ertr>; using run_executer_fut = run_executer_iertr::future<>; run_executer_fut run_executer( - seastar::lw_shared_ptr ox, + OpsExecuter &ox, ObjectContextRef obc, const OpInfo &op_info, std::vector& ops); @@ -669,7 +669,7 @@ class PG : public boost::intrusive_ref_counter< using submit_executer_fut = interruptible_future< submit_executer_ret>; submit_executer_fut submit_executer( - seastar::lw_shared_ptr ox, + OpsExecuter &&ox, const std::vector& ops); struct do_osd_ops_params_t; From 2b562b64a64777b1428e9ad3187b50619cbf1a4d Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 1 Oct 2024 13:11:31 -0700 Subject: [PATCH 127/148] crimson: remove unused OpsExecuter::rollback_obc_if_modified overload Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 068f510d1ef82..e770e825b32d0 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -548,7 +548,6 @@ OpsExecuter::flush_changes_n_do_ops_effects( template struct OpsExecuter::RollbackHelper { void rollback_obc_if_modified(); - void rollback_obc_if_modified(const std::error_code& e); OpsExecuter *ox; Func func; }; @@ -587,36 +586,6 @@ void OpsExecuter::RollbackHelper::rollback_obc_if_modified() } } -template -void OpsExecuter::RollbackHelper::rollback_obc_if_modified( - const std::error_code& e) -{ - // Oops, an operation had failed. do_osd_ops() altogether with - // OpsExecuter already dropped the ObjectStore::Transaction if - // there was any. However, this is not enough to completely - // rollback as we gave OpsExecuter the very single copy of `obc` - // we maintain and we did it for both reading and writing. - // Now all modifications must be reverted. - // - // The conditional's purpose is to efficiently handle hot errors - // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or - // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients - // typically append them before any write. If OpsExecuter hasn't - // seen any modifying operation, `obc` is supposed to be kept - // unchanged. - assert(ox); - const auto need_rollback = ox->has_seen_write(); - crimson::get_logger(ceph_subsys_osd).debug( - "{}: object {} got error {}, need_rollback={}", - __func__, - ox->obc->get_oid(), - e, - need_rollback); - if (need_rollback) { - func(ox->obc); - } -} - // PgOpsExecuter -- a class for executing ops targeting a certain PG. 
class PgOpsExecuter { template From e036fde7e4b03241e617cbb3ef8f19a703aae716 Mon Sep 17 00:00:00 2001 From: Dnyaneshwari Date: Mon, 14 Oct 2024 09:56:45 +0530 Subject: [PATCH 128/148] mgr/dashboard: The subvolumes are missing from the dropdown menu on the "Create NFS export" page Fixes: https://tracker.ceph.com/issues/68519 Signed-off-by: Dnyaneshwari Talwekar --- .../src/app/ceph/nfs/nfs-form/nfs-form.component.html | 5 ----- .../frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html index 1a73490175db7..0da4913e9b8a4 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html @@ -106,7 +106,6 @@ [invalid]="nfsForm.controls.fsal.controls.user_id.invalid && (nfsForm.controls.fsal.controls.user_id.dirty)" [invalidText]="userIdError" [skeleton]="allRGWUsers === null" - (valueChange)="pathChangeHandler()" i18n> @@ -223,8 +222,6 @@ name="path" formControlName="path" [ngbTypeahead]="pathDataSource" - (selectItem)="pathChangeHandler()" - (blur)="pathChangeHandler()" [invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)"> @@ -259,8 +256,6 @@ name="path" formControlName="path" [ngbTypeahead]="bucketDataSource" - (selectItem)="pathChangeHandler()" - (blur)="pathChangeHandler()" [invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)"> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts index 2317671b02238..d502524256ee9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts @@ -434,7 +434,7 @@ export class NfsFormComponent extends CdForm implements OnInit { fs_name: this.selectedFsName } }); - this.volumeChangeHandler(); + this.getSubVolGrp(this.selectedFsName); } if (!_.isEmpty(this.selectedSubvolGroup)) { this.nfsForm.patchValue({ From 8443821073b23946a32009106c45581db0d51e8f Mon Sep 17 00:00:00 2001 From: Anoop C S Date: Thu, 29 Aug 2024 11:53:44 +0530 Subject: [PATCH 129/148] client: Fix symlink open with O_PATH and O_NOFOLLOW man open(2)[1] says the following for O_PATH: . . . If pathname is a symbolic link and the O_NOFOLLOW flag is also specified, then the call returns a file descriptor referring to the symbolic link. This file descriptor can be used as the dirfd argument in calls to fchownat(2), fstatat(2), linkat(2), and readlinkat(2) with an empty pathname to have the calls operate on the symbolic link. . . . symlink check within may_open() failed to consider the O_PATH flag resulting in a ELOOP error to the client. In order to return a valid file descriptor we introduce a check for the presence of O_PATH in the client provided flags. 
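For illustration, the resulting client-side usage looks roughly like the
following (modelled on the libcephfs test added below; error handling is
omitted and the path is a placeholder):

  // opening a symlink with O_PATH|O_NOFOLLOW now yields a usable fd
  // instead of failing with ELOOP
  int fd = ceph_open(cmount, "/dir/symlink", O_PATH | O_NOFOLLOW, 0);
  if (fd >= 0) {
    // fd refers to the symlink itself and can be passed as dirfd to the
    // *at() calls, e.g. ceph_readlinkat() with an empty pathname
    ceph_close(cmount, fd);
  }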
Fixes: https://tracker.ceph.com/issues/67833 [1] https://www.man7.org/linux/man-pages/man2/open.2.html Signed-off-by: Anoop C S --- src/client/Client.cc | 4 ++++ src/test/libcephfs/test.cc | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/src/client/Client.cc b/src/client/Client.cc index 6577dd575f1fd..9c6785fe65e5a 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -6125,6 +6125,10 @@ int Client::may_open(Inode *in, int flags, const UserPerm& perms) int r = 0; switch (in->mode & S_IFMT) { case S_IFLNK: +#if defined(__linux__) && defined(O_PATH) + if (flags & O_PATH) + break; +#endif r = -CEPHFS_ELOOP; goto out; case S_IFDIR: diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc index f2c87168633bb..b51689ab2637e 100644 --- a/src/test/libcephfs/test.cc +++ b/src/test/libcephfs/test.cc @@ -976,6 +976,13 @@ TEST(LibCephFS, Symlinks) { fd = ceph_open(cmount, test_symlink, O_NOFOLLOW, 0); ASSERT_EQ(fd, -CEPHFS_ELOOP); +#if defined(__linux__) && defined(O_PATH) + // test the O_NOFOLLOW with O_PATH case + fd = ceph_open(cmount, test_symlink, O_PATH|O_NOFOLLOW, 0); + ASSERT_GT(fd, 0); + ceph_close(cmount, fd); +#endif /* __linux */ + // stat the original file struct ceph_statx stx_orig; ASSERT_EQ(ceph_statx(cmount, test_file, &stx_orig, CEPH_STATX_ALL_STATS, 0), 0); From 24f453dd39c25e00527d0ed0a1e9fefa6295999b Mon Sep 17 00:00:00 2001 From: Anoop C S Date: Tue, 27 Aug 2024 15:50:44 +0530 Subject: [PATCH 130/148] client: Resolve symlink from dirfd for empty pathname man readlinkat(2)[1] points at a special case for readlinkat() syscall as follows: . . . Since Linux 2.6.39, pathname can be an empty string, in which case the call operates on the symbolic link referred to by dirfd (which should have been obtained using open(2) with the O_PATH and O_NOFOLLOW flags). . . . man open(2)[2] further explains the need for such a special case when a symlink is opened with O_PATH and O_NOFOLLOW: . . . If pathname is a symbolic link and the O_NOFOLLOW flag is also specified, then the call returns a file descriptor referring to the symbolic link. This file descriptor can be used as the dirfd argument in calls to fchownat(2), fstatat(2), linkat(2), and readlinkat(2) with an empty pathname to have the calls operate on the symbolic link. . . . Accordingly have a check to resolve symlinks out of dirfd when empty pathnames are encountered within readlinkat(). In addition to that match the standard file system behavior to return ENOENT instead of EINVAL when the inode pointed to by dirfd is not a symbolic link with empty pathnames. 
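For illustration, the usage enabled by this change is roughly the
following (mirroring the libcephfs test added below; buffer sizing and
error checks are kept minimal and the path is a placeholder):

  int fd = ceph_open(cmount, "/dir/symlink", O_PATH | O_NOFOLLOW, 0);
  char target[256];
  // empty pathname: operate on the symbolic link referred to by fd itself
  int n = ceph_readlinkat(cmount, fd, "", target, sizeof(target) - 1);
  if (n >= 0)
    target[n] = '\0';
  ceph_close(cmount, fd);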
Fixes: https://tracker.ceph.com/issues/67833 [1] https://www.man7.org/linux/man-pages/man2/readlinkat.2.html [2] https://www.man7.org/linux/man-pages/man2/open.2.html Signed-off-by: Anoop C S --- src/client/Client.cc | 6 ++++++ src/test/libcephfs/test.cc | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/src/client/Client.cc b/src/client/Client.cc index 9c6785fe65e5a..f8373095b38c1 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -7957,6 +7957,12 @@ int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, c return r; } + if (!strcmp(relpath, "")) { + if (!dirinode.get()->is_symlink()) + return -CEPHFS_ENOENT; + return _readlink(dirinode.get(), buf, size); + } + InodeRef in; filepath path(relpath); r = path_walk(path, &in, perms, false, 0, dirinode); diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc index b51689ab2637e..6f10d2bbd4e06 100644 --- a/src/test/libcephfs/test.cc +++ b/src/test/libcephfs/test.cc @@ -3019,6 +3019,18 @@ TEST(LibCephFS, Readlinkat) { ASSERT_EQ(0, memcmp(target, rel_file_path, target_len)); ASSERT_EQ(0, ceph_close(cmount, fd)); +#if defined(__linux__) && defined(O_PATH) + // test readlinkat with empty pathname relative to O_PATH|O_NOFOLLOW fd + fd = ceph_open(cmount, link_path, O_PATH | O_NOFOLLOW, 0); + ASSERT_LE(0, fd); + size_t link_target_len = strlen(rel_file_path); + char link_target[link_target_len+1]; + ASSERT_EQ(link_target_len, ceph_readlinkat(cmount, fd, "", link_target, link_target_len)); + link_target[link_target_len] = '\0'; + ASSERT_EQ(0, memcmp(link_target, rel_file_path, link_target_len)); + ASSERT_EQ(0, ceph_close(cmount, fd)); +#endif /* __linux */ + ASSERT_EQ(0, ceph_unlink(cmount, link_path)); ASSERT_EQ(0, ceph_unlink(cmount, file_path)); ASSERT_EQ(0, ceph_rmdir(cmount, dir_path)); From 0be8d01c9ddde0d7d24edd34dc75f6cfc861b5ba Mon Sep 17 00:00:00 2001 From: Milind Changire Date: Fri, 27 Sep 2024 16:10:22 +0530 Subject: [PATCH 131/148] log: thread name save/fetch infra * pthread name is saved in a thread_local storage * the thread_local name is copied into Entry object's ctor * Log::dump_recent() reads the thread name from the Entry object's data member when dumping logs Fixes: https://tracker.ceph.com/issues/50743 Signed-off-by: Milind Changire --- src/common/Thread.cc | 4 ++-- src/common/Thread.h | 8 +++++++- src/log/Entry.h | 10 +++++++++- src/log/Log.cc | 11 ++++------- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/common/Thread.cc b/src/common/Thread.cc index 9a7a31923c1b7..3903e8c0ed721 100644 --- a/src/common/Thread.cc +++ b/src/common/Thread.cc @@ -83,7 +83,7 @@ void *Thread::entry_wrapper() if (pid && cpuid >= 0) _set_affinity(cpuid); - ceph_pthread_setname(pthread_self(), thread_name.c_str()); + ceph_pthread_setname(pthread_self(), Thread::thread_name.c_str()); return entry(); } @@ -154,7 +154,7 @@ int Thread::try_create(size_t stacksize) void Thread::create(const char *name, size_t stacksize) { ceph_assert(strlen(name) < 16); - thread_name = name; + Thread::thread_name = name; int ret = try_create(stacksize); if (ret != 0) { diff --git a/src/common/Thread.h b/src/common/Thread.h index 5242fb5f30758..d3892c1b36b71 100644 --- a/src/common/Thread.h +++ b/src/common/Thread.h @@ -20,11 +20,14 @@ #include #include #include +#include #include #include +#include "include/ceph_assert.h" #include "include/compat.h" +#include "include/spinlock.h" extern pid_t ceph_gettid(); @@ -33,7 +36,7 @@ class Thread { pthread_t thread_id; pid_t pid; int cpuid; - 
std::string thread_name; + static inline thread_local std::string thread_name; void *entry_wrapper(); @@ -61,6 +64,9 @@ class Thread { int join(void **prval = 0); int detach(); int set_affinity(int cpuid); + static const std::string get_thread_name() { + return Thread::thread_name; + } }; // Functions for with std::thread diff --git a/src/log/Entry.h b/src/log/Entry.h index 3677c8eb95180..db39eca0ef3ba 100644 --- a/src/log/Entry.h +++ b/src/log/Entry.h @@ -4,9 +4,12 @@ #ifndef __CEPH_LOG_ENTRY_H #define __CEPH_LOG_ENTRY_H +#include "include/compat.h" + #include "log/LogClock.h" #include "common/StackStringStream.h" +#include "common/Thread.h" #include "boost/container/small_vector.hpp" @@ -14,6 +17,7 @@ #include + namespace ceph { namespace logging { @@ -27,7 +31,10 @@ class Entry { m_thread(pthread_self()), m_prio(pr), m_subsys(sub) - {} + { + strncpy(m_thread_name, Thread::get_thread_name().data(), 16); + m_thread_name[15] = '\0'; + } Entry(const Entry &) = default; Entry& operator=(const Entry &) = default; Entry(Entry &&e) = default; @@ -40,6 +47,7 @@ class Entry { time m_stamp; pthread_t m_thread; short m_prio, m_subsys; + char m_thread_name[16]; static log_clock& clock() { static log_clock clock; diff --git a/src/log/Log.cc b/src/log/Log.cc index 69f6df82ecbb7..49dd03c06c096 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -493,13 +493,13 @@ void Log::dump_recent() _flush(m_flush, false); _log_message("--- begin dump of recent events ---", true); - std::set recent_pthread_ids; + std::set> recent_pthread_ids; { EntryVector t; t.insert(t.end(), std::make_move_iterator(m_recent.begin()), std::make_move_iterator(m_recent.end())); m_recent.clear(); for (const auto& e : t) { - recent_pthread_ids.emplace(e.m_thread); + recent_pthread_ids.emplace(std::make_pair(e.m_thread, e.m_thread_name)); } _flush(t, true); } @@ -515,14 +515,11 @@ void Log::dump_recent() m_stderr_log, m_stderr_crash), true); _log_message("--- pthread ID / name mapping for recent threads ---", true); - for (const auto pthread_id : recent_pthread_ids) + for (auto& [pthread_id, pthread_name] : recent_pthread_ids) { - char pthread_name[16] = {0}; //limited by 16B include terminating null byte. - ceph_pthread_getname(pthread_id, pthread_name, sizeof(pthread_name)); // we want the ID to be printed in the same format as we use for a log entry. // The reason is easier grepping. - _log_message(fmt::format(" {:x} / {}", - tid_to_int(pthread_id), pthread_name), true); + _log_message(fmt::format(" {:x} / {}", tid_to_int(pthread_id), pthread_name), true); } _log_message(fmt::format(" max_recent {:9}", m_recent.capacity()), true); From 3ab5d1f67f1cac210f4c7f0540900670c25de80b Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Tue, 15 Oct 2024 13:34:32 +0200 Subject: [PATCH 132/148] mgr/cephadm: disabling nginx buffering for grafana location Disabling Nginx buffering for Grafana, as it may lead to errors or delays while loading the main Grafana page, particularly when receiving JavaScript files. 
Fixes: https://tracker.ceph.com/issues/68315 Signed-off-by: Redouane Kachach --- .../templates/services/mgmt-gateway/external_server.conf.j2 | 1 + src/pybind/mgr/cephadm/tests/test_services.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 index b830034a7d4e9..91efa91a8d50f 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 @@ -113,6 +113,7 @@ server { # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; + proxy_buffering off; {% if oauth2_proxy_url %} auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a05c87ce3c3a9..072f4bec554e1 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -3900,6 +3900,7 @@ def get_services_endpoints(name): # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; + proxy_buffering off; } location /prometheus { @@ -4171,6 +4172,7 @@ def get_services_endpoints(name): # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; + proxy_buffering off; auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; From 2ed1a3bd70c818835c4e7dc521d31cdd9f9e780b Mon Sep 17 00:00:00 2001 From: Ernesto Puerta Date: Tue, 15 Oct 2024 14:09:48 +0200 Subject: [PATCH 133/148] .github: detect GPL license in PRs Signed-off-by: Ernesto Puerta --- .github/workflows/check-license.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/workflows/check-license.yml diff --git a/.github/workflows/check-license.yml b/.github/workflows/check-license.yml new file mode 100644 index 0000000000000..d201ed7135439 --- /dev/null +++ b/.github/workflows/check-license.yml @@ -0,0 +1,13 @@ +--- +name: "Check Incomatible Licenses" +on: [pull_request] + +jobs: + check_pr: + runs-on: ubuntu-latest + steps: + - name: Check PR + uses: JJ/github-pr-contains-action@526dfe784d8604ea1c39b6c26609074de95b1ffd # releases/v14.1 + with: + github-token: ${{github.token}} + diffDoesNotContain: "GNU General Public License" From 629922bf6a0905cc87707f5e2d027f6320aafd99 Mon Sep 17 00:00:00 2001 From: Jos Collin Date: Fri, 11 Oct 2024 10:33:47 +0530 Subject: [PATCH 134/148] doc: update Key Idea in cephfs-mirroring.rst Updates the snapdiff feature and it's url. Signed-off-by: Jos Collin --- doc/dev/cephfs-mirroring.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/dev/cephfs-mirroring.rst b/doc/dev/cephfs-mirroring.rst index a804a0075995f..e09fed213f230 100644 --- a/doc/dev/cephfs-mirroring.rst +++ b/doc/dev/cephfs-mirroring.rst @@ -17,12 +17,10 @@ Key Idea -------- For a given snapshot pair in a directory, `cephfs-mirror` daemon will rely on -readdir diff to identify changes in a directory tree. 
The diffs are applied to +`CephFS Snapdiff Feature` to identify changes in a directory tree. The diffs are applied to directory in the remote file system thereby only synchronizing files that have changed between two snapshots. -This feature is tracked here: https://tracker.ceph.com/issues/47034. - Currently, snapshot data is synchronized by bulk copying to the remote filesystem. @@ -407,3 +405,5 @@ Feature Status -------------- `cephfs-mirror` daemon is built by default (follows `WITH_CEPHFS` CMake rule). + +.. _CephFS Snapdiff Feature: https://croit.io/blog/cephfs-snapdiff-feature From b6cb908e0b7e215def9760f480149fd7f1b881db Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 15 Oct 2024 11:29:53 -0400 Subject: [PATCH 135/148] rgw: document mstart.sh and related scripts Signed-off-by: Casey Bodley --- src/mrgw.sh | 2 ++ src/mrun | 2 ++ src/mstart.sh | 28 ++++++++++++++++++++++++++++ src/mstop.sh | 2 ++ 4 files changed, 34 insertions(+) diff --git a/src/mrgw.sh b/src/mrgw.sh index 05739bf015ebc..86bef336867de 100755 --- a/src/mrgw.sh +++ b/src/mrgw.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Start/restart a radosgw instance on the given mstart.sh cluster. + set -e rgw_frontend=${RGW_FRONTEND:-"beast"} diff --git a/src/mrun b/src/mrun index a85221800218b..df7e3542b93a5 100755 --- a/src/mrun +++ b/src/mrun @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Run a ceph command against the given mstart.sh cluster. + [ $# -lt 2 ] && echo "usage: $0 [params...]" && exit 1 root=`dirname $0` diff --git a/src/mstart.sh b/src/mstart.sh index 34b57e1761125..0c512ca9eb8c3 100755 --- a/src/mstart.sh +++ b/src/mstart.sh @@ -1,5 +1,33 @@ #!/bin/sh +# Deploy a vstart.sh cluster in a named subdirectory. This makes it possible to +# start multiple clusters in different subdirectories. See mstop.sh for cleanup. 
+# +# Example: +# +# ~/ceph/build $ MON=1 OSD=1 RGW=1 MDS=0 MGR=0 ../src/mstart.sh c1 -n -d +# ~/ceph/build $ MON=1 OSD=1 RGW=1 MDS=0 MGR=0 ../src/mstart.sh c2 -n -d +# +# ~/ceph/build $ ls run +# c1 c2 +# ~/ceph/build $ ls run/c1 +# asok ceph.conf dev keyring out +# +# ~/ceph/build $ ../src/mrun c1 radosgw-admin user list +# [ +# "56789abcdef0123456789abcdef0123456789abcdef0123456789abcdef01234", +# "testx$9876543210abcdef0123456789abcdef0123456789abcdef0123456789abcdef", +# "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", +# "testacct1user", +# "test", +# "testacct2root", +# "testacct1root", +# "testid" +# ] +# +# ~/ceph/build $ ../src/mstop.sh c1 +# ~/ceph/build $ ../src/mstop.sh c2 + usage="usage: $0 [vstart options]..\n" usage_exit() { diff --git a/src/mstop.sh b/src/mstop.sh index 702d1765941e5..eec0ca02e42ae 100755 --- a/src/mstop.sh +++ b/src/mstop.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Stop a named cluster started by mstart.sh + set -e script_root=`dirname $0` From c78d1ba668d1ad2364db39ffa07be2f8a3d61a48 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 15 Oct 2024 11:30:37 -0400 Subject: [PATCH 136/148] rgw: add mstart-related scripts to CODEOWNERS and labeler Signed-off-by: Casey Bodley --- .github/CODEOWNERS | 4 ++++ .github/labeler.yml | 3 +++ 2 files changed, 7 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d8d18693efcf6..3e81444ea3d0b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -164,6 +164,10 @@ README* @ceph/doc-writers /src/cls/rgw_gc @ceph/rgw /src/cls/user @ceph/rgw /src/cls/version @ceph/rgw +/src/mrgw.sh @ceph/rgw +/src/mrun @ceph/rgw +/src/mstart.sh @ceph/rgw +/src/mstop.sh @ceph/rgw /src/rgw @ceph/rgw /src/s3select @ceph/rgw /src/spawn @ceph/rgw diff --git a/.github/labeler.yml b/.github/labeler.yml index 9f2ed1e479019..cc32be3850126 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -288,6 +288,9 @@ rgw: - src/cls/rgw_gc/** - src/cls/timeindex/** - src/mrgw.sh + - src/mrun + - src/mstart.sh + - src/mstop.sh - src/rgw/** - src/test/cls_rgw/** - src/test/librgw_* From 67f884d39c31bd7ece3666f8092814ae9dfc29f1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 15 Oct 2024 17:52:45 +0200 Subject: [PATCH 137/148] CodingStyle: allow C++ forward declarations The Google coding guide opposes to forward declarations, but I disagree with that opinion. In my opinion, forward declarations are useful. Ceph build times are miserable due to header dependency bloat and template bloat, both of which can be reduced using forward declarations. All cons listed in https://google.github.io/styleguide/cppguide.html > Forward declarations can hide a dependency, allowing user code to > skip necessary recompilation when headers change. That is a pro, not a con. Skipping (unnecessary) recompilation is a good thing, it's the goal of forward declarations. > A forward declaration as opposed to an #include statement makes it > difficult for automatic tooling to discover the module defining the > symbol. That certainly depends on the tools one uses, but I cannot imagine today's IDEs are limited to one compilation unit. > A forward declaration may be broken by subsequent changes to the > library. True, and that will lead to a compiler error. > Forward declarations of functions and templates can prevent the > header owners from making otherwise-compatible changes to their > APIs, such as widening a parameter type, adding a template parameter > with a default value, or migrating to a new namespace. 
Forward declarations do not prevent any of that. But if you change the "real" declaration, all incompatible forward declarations will cause a compiler error. > Forward declaring symbols from namespace std:: yields undefined > behavior. Sad, but true. But that is not an argument against forward declarations for Ceph's own types. > It can be difficult to determine whether a forward declaration or a > full #include is needed. If it compiles without the `#include`, then the forward declaration is fine. (Or the primary header happened to be already included by somebody else.) > Replacing an #include with a forward declaration can silently change > the meaning of code: [...] If the #include was replaced with forward > decls for B and D, test() would call f(void*). True, but this is a contrived example, and is bad coding style because it is error prone. Casts to `void*` can and should be avoided. There are rare examples where such casts are necessary (boundary to C APIs), and then it's very unusual to pass derived incomplete types. > Forward declaring multiple symbols from a header can be more verbose > than simply #includeing the header. True, but that misses the point of forward declarations. > Structuring code to enable forward declarations (e.g., using pointer > members instead of object members) can make the code slower and more > complex. True, but that is not a property of forward declarations. I don't suggest doing such a thing. Signed-off-by: Max Kellermann --- CodingStyle | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CodingStyle b/CodingStyle index 659298f0e5ae4..019d23c7703dc 100644 --- a/CodingStyle +++ b/CodingStyle @@ -108,6 +108,12 @@ by section. portability since `#pragma once` is widely supported and is known to work on GCC and Clang. +* Header Files -> Forward declarations: + + Forward declarations of structs, unions, classes and enums can be + used to reduce header dependencies. This speeds up compile times + because the compiler has to process less code. 
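[Editor's note] As a hedged illustration of the forward-declaration guideline added above (this block is not part of the patch; the file and class names are invented for the example), a header can depend on a class through a pointer or reference without pulling in its full definition:

```cpp
// widget.h -- needs Foo only as a pointer member, so a forward declaration
// is enough and foo.h does not have to be included here.
#pragma once

class Foo;  // forward declaration

class Widget {
public:
  explicit Widget(Foo* foo) : foo_(foo) {}
  void run();
private:
  Foo* foo_;  // pointers/references to an incomplete type are allowed
};

// widget.cc -- only the implementation file pays for the full definition.
#include "widget.h"
#include "foo.h"

void Widget::run() {
  // calling members of Foo requires the complete type, hence the #include above
}
```

Only translation units that actually use Foo's members have to recompile when foo.h changes, which is the compile-time benefit the guideline refers to.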
+ The following guidelines have not been followed in the legacy code, but are worth mentioning and should be followed strictly for new code: From 56b60c01cacafa40eeb009d76a6854507d33f3a5 Mon Sep 17 00:00:00 2001 From: Afreen Misbah Date: Fri, 11 Oct 2024 20:58:56 +0530 Subject: [PATCH 138/148] mgr/dashboard: Adapt gateway group changes in nvmeof UI - Added gateway group param in namespace request - GET, POST, PATCH, DELETE - Added gateway group param in Listeners request - GET - Added gateway group param in Initiators - GET, POST, DELETE Fixes https://tracker.ceph.com/issues/68510 Signed-off-by: Afreen Misbah --- .../mgr/dashboard/controllers/nvmeof.py | 10 ++-- .../nvmeof-initiators-form.component.ts | 11 +++-- .../nvmeof-initiators-list.component.ts | 25 +++++----- .../nvmeof-listeners-form.component.ts | 7 ++- .../nvmeof-listeners-list.component.ts | 10 ++-- .../nvmeof-namespaces-form.component.ts | 7 ++- .../nvmeof-namespaces-list.component.ts | 48 +++++++++++-------- .../nvmeof-subsystems-details.component.html | 8 +++- .../nvmeof-subsystems-form.component.ts | 4 +- .../src/app/shared/api/nvmeof.service.spec.ts | 9 +++- .../src/app/shared/api/nvmeof.service.ts | 42 ++++++++++------ 11 files changed, 107 insertions(+), 74 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py index f199867943d14..519c310a98bcc 100644 --- a/src/pybind/mgr/dashboard/controllers/nvmeof.py +++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py @@ -463,16 +463,17 @@ def status(self) -> dict: parameters={ 'subsystem_nqn': (str, 'Subsystem NQN'), "host_nqn": Param(str, 'Comma separated list of NVMeoF host NQNs'), + "gw_group": Param(str, "NVMeoF gateway group") }) @empty_response @handle_nvmeof_error @CreatePermission - def add(self, subsystem_nqn: str, host_nqn: str = ""): + def add(self, subsystem_nqn: str, gw_group: str, host_nqn: str = ""): response = None all_host_nqns = host_nqn.split(',') for nqn in all_host_nqns: - response = NVMeoFClient().stub.add_host( + response = NVMeoFClient(gw_group=gw_group).stub.add_host( NVMeoFClient.pb2.add_host_req(subsystem_nqn=subsystem_nqn, host_nqn=nqn) ) if response.status != 0: @@ -484,16 +485,17 @@ def add(self, subsystem_nqn: str, host_nqn: str = ""): parameters={ "subsystem_nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'Comma separated list of NVMeoF host NQN.'), + "gw_group": Param(str, "NVMeoF gateway group") }) @empty_response @handle_nvmeof_error @DeletePermission - def remove(self, subsystem_nqn: str, host_nqn: str): + def remove(self, subsystem_nqn: str, host_nqn: str, gw_group: str): response = None to_delete_nqns = host_nqn.split(',') for del_nqn in to_delete_nqns: - response = NVMeoFClient().stub.remove_host( + response = NVMeoFClient(gw_group=gw_group).stub.remove_host( NVMeoFClient.pb2.remove_host_req(subsystem_nqn=subsystem_nqn, host_nqn=del_nqn) ) if response.status != 0: diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts index 3a143a1a8df90..32f7c76a36282 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts @@ -10,7 +10,7 @@ import { AuthStorageService } from 
'~/app/shared/services/auth-storage.service'; import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; import { FinishedTask } from '~/app/shared/models/finished-task'; import { ActivatedRoute, Router } from '@angular/router'; -import { NvmeofService } from '~/app/shared/api/nvmeof.service'; +import { InitiatorRequest, NvmeofService } from '~/app/shared/api/nvmeof.service'; @Component({ selector: 'cd-nvmeof-initiators-form', @@ -26,6 +26,7 @@ export class NvmeofInitiatorsFormComponent implements OnInit { remove: boolean = false; subsystemNQN: string; removeHosts: { name: string; value: boolean; id: number }[] = []; + group: string; constructor( private authStorageService: AuthStorageService, @@ -52,6 +53,9 @@ export class NvmeofInitiatorsFormComponent implements OnInit { ); ngOnInit() { + this.route.queryParams.subscribe((params) => { + this.group = params?.['group']; + }); this.createForm(); this.action = this.actionLabels.ADD; this.route.params.subscribe((params: { subsystem_nqn: string }) => { @@ -108,8 +112,9 @@ export class NvmeofInitiatorsFormComponent implements OnInit { const hosts: string[] = this.addedHosts.value; let taskUrl = `nvmeof/initiator/${URLVerbs.ADD}`; - const request = { - host_nqn: hosts.join(',') + const request: InitiatorRequest = { + host_nqn: hosts.join(','), + gw_group: this.group }; if (allowAnyHost) { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts index fff38e6985a43..a5575a9c9267e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit, TemplateRef, ViewChild } from '@angular/core'; +import { Component, Input, OnInit, TemplateRef, ViewChild } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -20,9 +20,11 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-initiators-list.component.html', styleUrls: ['./nvmeof-initiators-list.component.scss'] }) -export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { +export class NvmeofInitiatorsListComponent implements OnInit { @Input() subsystemNQN: string; + @Input() + group: string; @ViewChild('hostTpl', { static: true }) hostTpl: TemplateRef; @@ -58,10 +60,10 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { permission: 'create', icon: Icons.add, click: () => - this.router.navigate([ - BASE_URL, - { outlets: { modal: [URLVerbs.ADD, this.subsystemNQN, 'initiator'] } } - ]), + this.router.navigate( + [BASE_URL, { outlets: { modal: [URLVerbs.ADD, this.subsystemNQN, 'initiator'] } }], + { queryParams: { group: this.group } } + ), canBePrimary: (selection: CdTableSelection) => !selection.hasSelection }, { @@ -79,17 +81,13 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { return this.selection.selected.findIndex((selected) => selected.nqn === '*'); } - ngOnChanges() { - this.listInitiators(); - } - updateSelection(selection: CdTableSelection) { this.selection 
= selection; } listInitiators() { this.nvmeofService - .getInitiators(this.subsystemNQN) + .getInitiators(this.subsystemNQN, this.group) .subscribe((initiators: NvmeofSubsystemInitiator[]) => { this.initiators = initiators; }); @@ -118,7 +116,10 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { nqn: this.subsystemNQN, plural: itemNames.length > 1 }), - call: this.nvmeofService.removeInitiators(this.subsystemNQN, { host_nqn }) + call: this.nvmeofService.removeInitiators(this.subsystemNQN, { + host_nqn, + gw_group: this.group + }) }) }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts index cd362bf8abe19..8310e65d203e5 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts @@ -103,7 +103,8 @@ export class NvmeofListenersFormComponent implements OnInit { const host = this.listenerForm.getValue('host'); let trsvcid = Number(this.listenerForm.getValue('trsvcid')); if (!trsvcid) trsvcid = 4420; - const request = { + const request: ListenerRequest = { + gw_group: this.group, host_name: host.hostname, traddr: host.addr, trsvcid @@ -128,9 +129,7 @@ export class NvmeofListenersFormComponent implements OnInit { component.listenerForm.setErrors({ cdSubmitButton: true }); }, complete: () => { - this.router.navigate([this.pageURL, { outlets: { modal: null } }], { - queryParams: { group: this.group } - }); + this.router.navigate([this.pageURL, { outlets: { modal: null } }]); } }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts index 974727ad06260..b49adda7c1b92 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit } from '@angular/core'; +import { Component, Input, OnInit } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -21,7 +21,7 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-listeners-list.component.html', styleUrls: ['./nvmeof-listeners-list.component.scss'] }) -export class NvmeofListenersListComponent implements OnInit, OnChanges { +export class NvmeofListenersListComponent implements OnInit { @Input() subsystemNQN: string; @Input() @@ -81,17 +81,13 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges { ]; } - ngOnChanges() { - this.listListeners(); - } - updateSelection(selection: CdTableSelection) { this.selection = selection; } listListeners() { this.nvmeofService - .listListeners(this.subsystemNQN) + .listListeners(this.subsystemNQN, this.group) .subscribe((listResponse: NvmeofListener[]) => { this.listeners = listResponse.map((listener, index) => { listener['id'] = index; diff --git 
a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts index f5721e11ab6d3..b65ad62bdb4b1 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts @@ -41,6 +41,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { nsid: string; currentBytes: number; invalidSizeError: boolean; + group: string; constructor( public actionLabels: ActionLabelsI18n, @@ -62,6 +63,9 @@ export class NvmeofNamespacesFormComponent implements OnInit { } init() { + this.route.queryParams.subscribe((params) => { + this.group = params?.['group']; + }); this.createForm(); this.action = this.actionLabels.CREATE; this.route.params.subscribe((params: { subsystem_nqn: string; nsid: string }) => { @@ -74,7 +78,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { this.edit = true; this.action = this.actionLabels.EDIT; this.nvmeofService - .getNamespace(this.subsystemNQN, this.nsid) + .getNamespace(this.subsystemNQN, this.nsid, this.group) .subscribe((res: NvmeofSubsystemNamespace) => { const convertedSize = this.dimlessBinaryPipe.transform(res.rbd_image_size).split(' '); this.currentBytes = res.rbd_image_size; @@ -120,6 +124,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { const image_size = this.nsForm.getValue('image_size'); const image_size_unit = this.nsForm.getValue('unit'); const request = {} as NamespaceCreateRequest | NamespaceEditRequest; + request['gw_group'] = this.group; if (image_size) { const key: string = this.edit ? 
'rbd_image_size' : 'size'; const value: number = this.formatterService.toBytes(image_size + image_size_unit); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts index c40b538c82088..8f8f6eb8d0598 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit } from '@angular/core'; +import { Component, Input, OnInit } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -23,9 +23,11 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-namespaces-list.component.html', styleUrls: ['./nvmeof-namespaces-list.component.scss'] }) -export class NvmeofNamespacesListComponent implements OnInit, OnChanges { +export class NvmeofNamespacesListComponent implements OnInit { @Input() subsystemNQN: string; + @Input() + group: string; namespacesColumns: any; tableActions: CdTableAction[]; @@ -117,10 +119,10 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { permission: 'create', icon: Icons.add, click: () => - this.router.navigate([ - BASE_URL, - { outlets: { modal: [URLVerbs.CREATE, this.subsystemNQN, 'namespace'] } } - ]), + this.router.navigate( + [BASE_URL, { outlets: { modal: [URLVerbs.CREATE, this.subsystemNQN, 'namespace'] } }], + { queryParams: { group: this.group } } + ), canBePrimary: (selection: CdTableSelection) => !selection.hasSelection }, { @@ -128,41 +130,45 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { permission: 'update', icon: Icons.edit, click: () => - this.router.navigate([ - BASE_URL, - { - outlets: { - modal: [URLVerbs.EDIT, this.subsystemNQN, 'namespace', this.selection.first().nsid] + this.router.navigate( + [ + BASE_URL, + { + outlets: { + modal: [ + URLVerbs.EDIT, + this.subsystemNQN, + 'namespace', + this.selection.first().nsid + ] + } } - } - ]) + ], + { queryParams: { group: this.group } } + ) }, { name: this.actionLabels.DELETE, permission: 'delete', icon: Icons.destroy, - click: () => this.deleteSubsystemModal() + click: () => this.deleteNamespaceModal() } ]; } - ngOnChanges() { - this.listNamespaces(); - } - updateSelection(selection: CdTableSelection) { this.selection = selection; } listNamespaces() { this.nvmeofService - .listNamespaces(this.subsystemNQN) + .listNamespaces(this.subsystemNQN, this.group) .subscribe((res: NvmeofSubsystemNamespace[]) => { this.namespaces = res; }); } - deleteSubsystemModal() { + deleteNamespaceModal() { const namespace = this.selection.first(); this.modalService.show(CriticalConfirmationModalComponent, { itemDescription: 'Namespace', @@ -174,7 +180,7 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { nqn: this.subsystemNQN, nsid: namespace.nsid }), - call: this.nvmeofService.deleteNamespace(this.subsystemNQN, namespace.nsid) + call: this.nvmeofService.deleteNamespace(this.subsystemNQN, namespace.nsid, this.group) }) }); } diff --git 
a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html index 7f15a1360adc2..58a1e01a52510 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html @@ -24,14 +24,18 @@ Namespaces - + + Initiators - + + diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts index f7b35a2d645ec..7e5b064f37929 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts @@ -118,9 +118,7 @@ export class NvmeofSubsystemsFormComponent implements OnInit { component.subsystemForm.setErrors({ cdSubmitButton: true }); }, complete: () => { - this.router.navigate([this.pageURL, { outlets: { modal: null } }], { - queryParams: { group: this.group } - }); + this.router.navigate([this.pageURL, { outlets: { modal: null } }]); } }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts index 313db3445f298..a5c84e60b6f95 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts @@ -27,6 +27,7 @@ describe('NvmeofService', () => { expect(service).toBeTruthy(); }); + // gateways it('should call listGatewayGroups', () => { service.listGatewayGroups().subscribe(); const req = httpTesting.expectOne('api/nvmeof/gateway/group'); @@ -39,6 +40,7 @@ describe('NvmeofService', () => { expect(req.request.method).toBe('GET'); }); + // subsystems it('should call listSubsystems', () => { service.listSubsystems(mockGroupName).subscribe(); const req = httpTesting.expectOne(`api/nvmeof/subsystem?gw_group=${mockGroupName}`); @@ -69,9 +71,12 @@ describe('NvmeofService', () => { expect(req.request.method).toBe('DELETE'); }); + // initiators it('should call getInitiators', () => { - service.getInitiators(mockNQN).subscribe(); - const req = httpTesting.expectOne(`api/nvmeof/subsystem/${mockNQN}/host`); + service.getInitiators(mockNQN, mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `api/nvmeof/subsystem/${mockNQN}/host?gw_group=${mockGroupName}` + ); expect(req.request.method).toBe('GET'); }); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts index 40202d0d67250..a2bbf507bc345 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts @@ -8,6 +8,7 @@ import { catchError, mapTo } from 'rxjs/operators'; export const MAX_NAMESPACE = 1024; export interface ListenerRequest { + gw_group: string; host_name: string; traddr: string; trsvcid: number; @@ -17,14 +18,17 @@ export interface NamespaceCreateRequest { rbd_image_name: string; rbd_pool: string; size: number; 
+ gw_group: string; } export interface NamespaceEditRequest { rbd_image_size: number; + gw_group: string; } export interface InitiatorRequest { host_nqn: string; + gw_group: string; } const API_PATH = 'api/nvmeof'; @@ -81,8 +85,8 @@ export class NvmeofService { } // Initiators - getInitiators(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/host`); + getInitiators(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/host?gw_group=${group}`); } addInitiators(subsystemNQN: string, request: InitiatorRequest) { @@ -92,14 +96,17 @@ export class NvmeofService { } removeInitiators(subsystemNQN: string, request: InitiatorRequest) { - return this.http.delete(`${UI_API_PATH}/subsystem/${subsystemNQN}/host/${request.host_nqn}`, { - observe: 'response' - }); + return this.http.delete( + `${UI_API_PATH}/subsystem/${subsystemNQN}/host/${request.host_nqn}/${request.gw_group}`, + { + observe: 'response' + } + ); } // Listeners - listListeners(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/listener`); + listListeners(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/listener?gw_group=${group}`); } createListener(subsystemNQN: string, request: ListenerRequest) { @@ -121,12 +128,14 @@ export class NvmeofService { } // Namespaces - listNamespaces(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace`); + listNamespaces(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace?gw_group=${group}`); } - getNamespace(subsystemNQN: string, nsid: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}`); + getNamespace(subsystemNQN: string, nsid: string, group: string) { + return this.http.get( + `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${group}` + ); } createNamespace(subsystemNQN: string, request: NamespaceCreateRequest) { @@ -141,9 +150,12 @@ export class NvmeofService { }); } - deleteNamespace(subsystemNQN: string, nsid: string) { - return this.http.delete(`${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}`, { - observe: 'response' - }); + deleteNamespace(subsystemNQN: string, nsid: string, group: string) { + return this.http.delete( + `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${group}`, + { + observe: 'response' + } + ); } } From 80d0037c2512e696e224e293b6b4153ed6be0350 Mon Sep 17 00:00:00 2001 From: Afreen Misbah Date: Thu, 26 Sep 2024 17:12:23 +0530 Subject: [PATCH 139/148] mailmap: Add dashboard new joinees in maps - Afreen Misbah - Dnyaneshwari Talwekar - Naman Munet - Prachi Goel - Puja Shahu Signed-off-by: Afreen Misbah --- .githubmap | 5 +++++ .mailmap | 5 +++++ .organizationmap | 10 ++++++++++ 3 files changed, 20 insertions(+) diff --git a/.githubmap b/.githubmap index b93132cf1ee4c..68015b4c1a689 100644 --- a/.githubmap +++ b/.githubmap @@ -12,6 +12,7 @@ aaSharma14 Aashish Sharma aclamk Adam Kupczyk adamemerson Adam C. 
Emerson adk3798 Adam King +afreen23 Afreen Misbah ajarr Ramana Raja alfonsomthd Alfonso Martínez alfredodeza Alfredo Deza @@ -47,6 +48,7 @@ Devp00l Stephan Müller dillaman Jason Dillaman djgalloway David Galloway dmick Dan Mick +dnyanee1997 Dnyaneshwari talwekar dragonylffly Li Wang dsavineau Dimitri Savineau dvanders Dan van der Ster @@ -96,6 +98,7 @@ mikechristie Mike Christie mogeb Mohamad Gebai MrFreezeex Arthur Outhenin-Chalandre myoungwon Myoungwon Oh +nmunet Naman Munet Naveenaidu Naveen Naidu neha-ojha Neha Ojha NitzanMordhai Nitzan Mordechai @@ -109,6 +112,8 @@ p-se Patrick Seidensal pcuzner Paul Cuzner Pegonzal Pedro Gonzalez Gomez pereman2 Pere Diaz Bou +prgoel-code Prachi prgoel@redhat.com +pujaoshahu Puja Shahu rchagam Anjaneya Chagam renhwztetecs huanwen ren ricardoasmarques Ricardo Marques diff --git a/.mailmap b/.mailmap index 8359b1473aedb..20aecd0c2321d 100644 --- a/.mailmap +++ b/.mailmap @@ -24,6 +24,7 @@ Adam Kupczyk Adam Kupczyk Adam Twardowski Adir Lev +Afreen Misbah Ahoussi Armand Ailing Zhang Aishwarya Mathuria amathuria @@ -168,6 +169,7 @@ Dhairya Parmar dparmar18 Dingdang Zhang Dmitry Smirnov Dmitry Yatsushkevich +Dnyaneshwari talwekar Dominik Hannen Dongdong Tao Dongdong Tao @@ -508,6 +510,7 @@ Myoungwon Oh Myoungwon Oh Na Xie Nag Pavan Chilakam <55574442+nagpavan-chilakam@users.noreply.github.com> +Naman Munet Nancy Su Nathan Cutler Nathan Cutler @@ -572,6 +575,8 @@ Pooja Gautam Pritha Srivastava Pritha Srivastava Pritha Srivastava +Prachi prgoel@redhat.com +Puja Shahu Qi Liang Hong Qiankun Zheng Qinfei Liu <18138800392@163.com> diff --git a/.organizationmap b/.organizationmap index bc194953d1b88..7a1061a194c70 100644 --- a/.organizationmap +++ b/.organizationmap @@ -346,17 +346,22 @@ Huayun Zheng Yin Huazhong University of Science and Technology Luo Runbing HXT Semiconductor Jiang Yutang IBM Adam Kupczyk +IBM Afreen Misbah IBM Aliaksei Makarau IBM Andrew Solomon +IBM Dnyaneshwari talwekar IBM Guillaume Abrioux IBM Jonas Pfefferle IBM Laura Flores IBM Martin Ohmacht IBM Michel Normand +IBM Naman Munet IBM Naveen Naidu IBM Neeraj Pratap Singh IBM Or Ozeri IBM Paul Cuzner +IBM Prachi Goel +IBM Puja Shahu IBM Samuel Matzek IBM Shraddha Agrawal IBM Sunil Angadi @@ -582,6 +587,7 @@ Red Hat Adam King Red Hat Adam King Red Hat Adam Kupczyk Red Hat Ademar de Souza Reis Jr +Red Hat Afreen Misbah Red Hat Aishwarya Mathuria Red Hat Albin Antony Red Hat Alex Elder @@ -618,6 +624,7 @@ Red Hat Deepika Upadhyay Red Hat Dhairya Parmar Red Hat Dimitri Savineau Red Hat Divyansh Kamboj +Red Hat Dnyaneshwari talwekar Red Hat Douglas Fuller Red Hat Ernesto Puerta Red Hat Erwan Velu @@ -683,6 +690,7 @@ Red Hat Mike Hackett Red Hat Mike Perez Red Hat Milan Broz Red Hat Milind Changire +Red Hat Naman Munet Red Hat Nathan Weinberg Red Hat Neeraj Pratap Singh Red Hat Neha Ojha @@ -706,9 +714,11 @@ Red Hat Pere Diaz Bou Red Hat Pete Zaitcev Red Hat Petr Lautrbach Red Hat Petr Machata +Red Hat Prachi prgoel@redhat.com Red Hat Prasanna Kumar Kalever Red Hat Prashant D Red Hat Pritha Srivastava +Red Hat Puja Shahu Red Hat Radoslaw Zarzynski Red Hat Rafael Quintero Red Hat Ramakrishnan Periyasamy From 7343be720870d4a5f82b55beee4685457a003067 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Tue, 15 Oct 2024 12:41:22 +0000 Subject: [PATCH 140/148] os/bluestore: Fix repair of multilabel when collides with BlueFS The problem was that BDEV_FIRST_LABEL_POSITION was removed from bdev_label_valid_locations set. Now, if label at BDEV_FIRST_LABEL_POSITION is valid, it is in the set. 
Fixes: https://tracker.ceph.com/issues/68528 Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueStore.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 44a171873c08c..6c31639811e6e 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6794,9 +6794,8 @@ void BlueStore::_main_bdev_label_try_reserve() vector candidate_positions; vector accepted_positions; uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); - for (size_t i = 1; i < bdev_label_positions.size(); i++) { - uint64_t location = bdev_label_positions[i]; - if (location + lsize <= bdev->get_size()) { + for (uint64_t location : bdev_label_valid_locations) { + if (location != BDEV_FIRST_LABEL_POSITION) { candidate_positions.push_back(location); } } @@ -11497,9 +11496,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) string p = path + "/block"; _write_bdev_label(cct, bdev, p, bdev_label, bdev_labels_in_repair); for (uint64_t pos : bdev_labels_in_repair) { - if (pos != BDEV_FIRST_LABEL_POSITION) { - bdev_label_valid_locations.push_back(pos); - } + bdev_label_valid_locations.push_back(pos); } repaired += bdev_labels_in_repair.size(); } From dd2a150f40fb11abe6bd1ee51bca03419aaa7d7f Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Wed, 9 Oct 2024 14:59:38 +0000 Subject: [PATCH 141/148] ceph-volume: address test_activate_dmcrypt_tpm This mocks the call to `luks_close()`, otherwise this test fails when run on a system where `cryptsetup` isn't available. Signed-off-by: Guillaume Abrioux --- .../ceph_volume/tests/objectstore/test_rawbluestore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py b/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py index f4f50b06f8a29..fd7c468037c5c 100644 --- a/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py +++ b/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py @@ -159,6 +159,7 @@ def test_activate_osd_id_and_fsid(self, @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.rename_mapper', Mock(return_value=MagicMock())) @patch('ceph_volume.util.disk.get_bluestore_header') + @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.luks_close', Mock(return_value=MagicMock())) @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.luks_open', Mock(return_value=MagicMock())) def test_activate_dmcrypt_tpm(self, m_bs_header, rawbluestore, fake_lsblk_all, mock_raw_direct_report, is_root) -> None: m_bs_header.return_value = { From b5e7008d28a5acd63ea9cd0c6b27f400dad409af Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Wed, 9 Oct 2024 15:00:53 +0000 Subject: [PATCH 142/148] ceph-volume: address mypy errors in disk.py typical error: ``` ceph_volume/util/disk.py:1374: error: Incompatible types in assignment (expression has type "Optional[str]", variable has type "str") [assignment] ``` This commits addresses it. 
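[Editor's note] A minimal sketch, not part of the commit, of the pattern mypy complains about and of the fix the diff below applies (supplying a default so `dict.get()` no longer returns `Optional[str]`); the keys and values are illustrative:

```python
from typing import Dict

environment: Dict[str, str] = {'DM_VG_NAME': 'ceph--vg'}

# Rejected by mypy: dict.get(key) returns Optional[str], but 'vg' is declared str.
# vg: str = environment.get('DM_VG_NAME')

# Accepted: with a default value the return type narrows to str.
vg: str = environment.get('DM_VG_NAME', '')
lv: str = environment.get('DM_LV_NAME', '')
print(f'/dev/{vg}/{lv}')
```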
Signed-off-by: Guillaume Abrioux --- src/ceph-volume/ceph_volume/util/disk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py index 78c140597d653..3ac51c11e3469 100644 --- a/src/ceph-volume/ceph_volume/util/disk.py +++ b/src/ceph-volume/ceph_volume/util/disk.py @@ -1370,8 +1370,8 @@ def slashed_path(self) -> str: """ result: str = self.path if self.is_lvm: - vg: str = self.environment.get('DM_VG_NAME') - lv: str = self.environment.get('DM_LV_NAME') + vg: str = self.environment.get('DM_VG_NAME', '') + lv: str = self.environment.get('DM_LV_NAME', '') result = f'/dev/{vg}/{lv}' return result @@ -1385,6 +1385,6 @@ def dashed_path(self) -> str: """ result: str = self.path if self.is_lvm: - name: str = self.environment.get('DM_NAME') + name: str = self.environment.get('DM_NAME', '') result = f'/dev/mapper/{name}' return result From 212c8740831a7650b5be86c27d14f8c0b6eacbef Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Wed, 31 Jul 2024 14:36:48 +0000 Subject: [PATCH 143/148] orch: disk replacement enhancement This introduces a new `ceph orch device replace` command in order to improve the user experience when it comes to replacing the underlying device of an OSD. Fixes: https://tracker.ceph.com/issues/68456 Signed-off-by: Guillaume Abrioux --- doc/cephadm/operations.rst | 69 +++ src/ceph-volume/ceph_volume/__init__.py | 1 + src/ceph-volume/ceph_volume/api/lvm.py | 17 +- .../ceph_volume/devices/lvm/zap.py | 173 +++++-- src/ceph-volume/ceph_volume/tests/conftest.py | 2 +- .../ceph_volume/tests/devices/lvm/test_zap.py | 23 +- .../ceph_volume/tests/test_inventory.py | 1 + .../ceph_volume/util/arg_validators.py | 14 + src/ceph-volume/ceph_volume/util/device.py | 26 +- src/ceph-volume/ceph_volume/util/disk.py | 17 +- src/pybind/mgr/cephadm/ceph_volume.py | 430 ++++++++++++++++++ src/pybind/mgr/cephadm/module.py | 53 +++ src/pybind/mgr/cephadm/serve.py | 5 +- src/pybind/mgr/cephadm/services/osd.py | 45 +- .../mgr/cephadm/tests/ceph_volume_data.py | 1 + src/pybind/mgr/cephadm/tests/conftest.py | 7 +- src/pybind/mgr/cephadm/tests/fixtures.py | 4 +- .../mgr/cephadm/tests/test_ceph_volume.py | 231 ++++++++++ .../mgr/cephadm/tests/test_replace_device.py | 53 +++ src/pybind/mgr/orchestrator/_interface.py | 15 + src/pybind/mgr/orchestrator/module.py | 20 +- .../deployment/drive_selection/selector.py | 4 + .../ceph/deployment/inventory.py | 10 +- 23 files changed, 1151 insertions(+), 70 deletions(-) create mode 100644 src/pybind/mgr/cephadm/ceph_volume.py create mode 100644 src/pybind/mgr/cephadm/tests/ceph_volume_data.py create mode 100644 src/pybind/mgr/cephadm/tests/test_ceph_volume.py create mode 100644 src/pybind/mgr/cephadm/tests/test_replace_device.py diff --git a/doc/cephadm/operations.rst b/doc/cephadm/operations.rst index 3b117c1bd6a60..420ee655ac8ba 100644 --- a/doc/cephadm/operations.rst +++ b/doc/cephadm/operations.rst @@ -734,3 +734,72 @@ Purge ceph daemons from all hosts in the cluster # For each host: cephadm rm-cluster --force --zap-osds --fsid + + +Replacing a device +================== + +The ``ceph orch device replace`` command automates the process of replacing the underlying device of an OSD. +Previously, this process required manual intervention at various stages. +With this new command, all necessary operations are performed automatically, streamlining the replacement process +and improving the overall user experience. + +.. 
note:: This only supports LVM-based deployed OSD(s) + +.. prompt:: bash # + + ceph orch device replace + +In the case the device being replaced is shared by multiple OSDs (eg: DB/WAL device shared by multiple OSDs), the orchestrator will warn you. + +.. prompt:: bash # + + [ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd + + Error EINVAL: /dev/vdd is a shared device. + Replacing /dev/vdd implies destroying OSD(s): ['0', '1']. + Please, *be very careful*, this can be a very dangerous operation. + If you know what you are doing, pass --yes-i-really-mean-it + +If you know what you are doing, you can go ahead and pass ``--yes-i-really-mean-it``. + +.. prompt:: bash # + + [ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd --yes-i-really-mean-it + Scheduled to destroy osds: ['6', '7', '8'] and mark /dev/vdd as being replaced. + +``cephadm`` will make ``ceph-volume`` zap and destroy all related devices and mark the corresponding OSD as ``destroyed`` so the +different OSD(s) ID(s) will be preserved: + +.. prompt:: bash # + + [ceph: root@ceph-1 /]# ceph osd tree + ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF + -1 0.97659 root default + -3 0.97659 host devel-1 + 0 hdd 0.29300 osd.0 destroyed 1.00000 1.00000 + 1 hdd 0.29300 osd.1 destroyed 1.00000 1.00000 + 2 hdd 0.19530 osd.2 up 1.00000 1.00000 + 3 hdd 0.19530 osd.3 up 1.00000 1.00000 + +The device being replaced is finally seen as ``being replaced`` preventing ``cephadm`` from redeploying the OSDs too fast: + +.. prompt:: bash # + + [ceph: root@ceph-1 /]# ceph orch device ls + HOST PATH TYPE DEVICE ID SIZE AVAILABLE REFRESHED REJECT REASONS + osd-1 /dev/vdb hdd 200G Yes 13s ago + osd-1 /dev/vdc hdd 200G Yes 13s ago + osd-1 /dev/vdd hdd 200G Yes 13s ago Is being replaced + osd-1 /dev/vde hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected + osd-1 /dev/vdf hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected + +If for any reason you need to clear the 'device replace header' on a device, then you can use ``ceph orch device replace --clear``: + +.. prompt:: bash # + + [ceph: root@devel-1 /]# ceph orch device replace devel-1 /dev/vdk --clear + Replacement header cleared on /dev/vdk + [ceph: root@devel-1 /]# + +After that, ``cephadm`` will redeploy the OSD service spec within a few minutes (unless the service is set to ``unmanaged``). 
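[Editor's note] The following shell sketch is not part of the patch; it simply strings together the commands documented in the section above into one end-to-end replacement flow, reusing the illustrative host and device names from its examples:

```bash
# 1. Mark the device as being replaced; the affected OSD(s) are zapped and
#    marked 'destroyed' so their IDs are preserved.
ceph orch device replace osd-1 /dev/vdd
# (add --yes-i-really-mean-it if the device is shared by several OSDs)

# 2. Optionally inspect the intermediate state.
ceph osd tree           # the affected OSD(s) show up as 'destroyed'
ceph orch device ls     # /dev/vdd is reported as 'Is being replaced'

# 3. Physically swap the disk; cephadm then redeploys the OSD service spec
#    automatically within a few minutes (unless the service is unmanaged).

# If the replacement header ever needs to be cleared by hand:
# ceph orch device replace osd-1 /dev/vdd --clear
```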
diff --git a/src/ceph-volume/ceph_volume/__init__.py b/src/ceph-volume/ceph_volume/__init__.py index b10100c02185a..814619cfdddb4 100644 --- a/src/ceph-volume/ceph_volume/__init__.py +++ b/src/ceph-volume/ceph_volume/__init__.py @@ -6,6 +6,7 @@ sys_info = namedtuple('sys_info', ['devices']) sys_info.devices = dict() logger = logging.getLogger(__name__) +BEING_REPLACED_HEADER: str = 'CEPH_DEVICE_BEING_REPLACED' class AllowLoopDevices: diff --git a/src/ceph-volume/ceph_volume/api/lvm.py b/src/ceph-volume/ceph_volume/api/lvm.py index 16cbc08b26254..fc376f891fd25 100644 --- a/src/ceph-volume/ceph_volume/api/lvm.py +++ b/src/ceph-volume/ceph_volume/api/lvm.py @@ -10,6 +10,8 @@ from math import floor from ceph_volume import process, util, conf from ceph_volume.exceptions import SizeAllocationError +from typing import Any, Dict + logger = logging.getLogger(__name__) @@ -807,13 +809,16 @@ def get_all_devices_vgs(name_prefix=''): '--units=b', '--nosuffix'] -class Volume(object): +class Volume: """ Represents a Logical Volume from LVM, with some top-level attributes like ``lv_name`` and parsed tags as a dictionary of key/value pairs. """ - def __init__(self, **kw): + def __init__(self, **kw: str) -> None: + self.lv_path: str = '' + self.lv_name: str = '' + self.lv_uuid: str = '' for k, v in kw.items(): setattr(self, k, v) self.lv_api = kw @@ -824,13 +829,13 @@ def __init__(self, **kw): self.encrypted = self.tags.get('ceph.encrypted', '0') == '1' self.used_by_ceph = 'ceph.osd_id' in self.tags - def __str__(self): + def __str__(self) -> str: return '<%s>' % self.lv_api['lv_path'] - def __repr__(self): + def __repr__(self) -> str: return self.__str__() - def as_dict(self): + def as_dict(self) -> Dict[str, Any]: obj = {} obj.update(self.lv_api) obj['tags'] = self.tags @@ -839,7 +844,7 @@ def as_dict(self): obj['path'] = self.lv_path return obj - def report(self): + def report(self) -> Dict[str, Any]: if not self.used_by_ceph: return { 'name': self.lv_name, diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py index 2b6925f5b2739..388f6aeea2708 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -5,12 +5,12 @@ from textwrap import dedent -from ceph_volume import decorators, terminal, process +from ceph_volume import decorators, terminal, process, BEING_REPLACED_HEADER from ceph_volume.api import lvm as api from ceph_volume.util import system, encryption, disk, arg_validators, str_to_int, merge_dict from ceph_volume.util.device import Device from ceph_volume.systemd import systemctl -from typing import List +from typing import Any, Dict, List logger = logging.getLogger(__name__) mlogger = terminal.MultiLogger(__name__) @@ -95,29 +95,29 @@ def zap_data(path): 'conv=fsync' ]) - -def find_associated_devices(osd_id=None, osd_fsid=None): +def find_associated_devices(osd_id: str = '', osd_fsid: str = '') -> List[api.Volume]: """ From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the system that match those tag values, further detect if any partitions are part of the OSD, and then return the set of LVs and partitions (if any). 
""" lv_tags = {} - if osd_id: - lv_tags['ceph.osd_id'] = osd_id - if osd_fsid: - lv_tags['ceph.osd_fsid'] = osd_fsid + lv_tags = {key: value for key, value in { + 'ceph.osd_id': osd_id, + 'ceph.osd_fsid': osd_fsid + }.items() if value} lvs = api.get_lvs(tags=lv_tags) + if not lvs: raise RuntimeError('Unable to find any LV for zapping OSD: ' - '%s' % osd_id or osd_fsid) - + f'{osd_id or osd_fsid}') devices_to_zap = ensure_associated_lvs(lvs, lv_tags) - return [Device(path) for path in set(devices_to_zap) if path] + return [Device(path) for path in set(devices_to_zap) if path] -def ensure_associated_lvs(lvs, lv_tags={}): +def ensure_associated_lvs(lvs: List[api.Volume], + lv_tags: Dict[str, Any] = {}) -> List[str]: """ Go through each LV and ensure if backing devices (journal, wal, block) are LVs or partitions, so that they can be accurately reported. @@ -166,14 +166,14 @@ def ensure_associated_lvs(lvs, lv_tags={}): return list(set(verified_devices)) -class Zap(object): - +class Zap: help = 'Removes all data and filesystems from a logical volume or partition.' - def __init__(self, argv): + def __init__(self, argv: List[str]) -> None: self.argv = argv + self.osd_ids_to_zap: List[str] = [] - def unmount_lv(self, lv): + def unmount_lv(self, lv: api.Volume) -> None: if lv.tags.get('ceph.cluster_name') and lv.tags.get('ceph.osd_id'): lv_path = "/var/lib/ceph/osd/{}-{}".format(lv.tags['ceph.cluster_name'], lv.tags['ceph.osd_id']) else: @@ -186,39 +186,95 @@ def unmount_lv(self, lv): if dmcrypt and dmcrypt_uuid: self.dmcrypt_close(dmcrypt_uuid) - def zap_lv(self, device): + def _write_replacement_header(self, device: str) -> None: + """Write a replacement header to a device. + + This method writes the string defined in `BEING_REPLACED_HEADER` + to the specified device. This header indicates that the device + is in the process of being replaced. + + Args: + device (str): The path to the device on which the replacement + header will be written. + """ + disk._dd_write(device, + BEING_REPLACED_HEADER) + + def clear_replace_header(self) -> bool: + """Safely erase the replacement header on a device if it is marked as being replaced. + + This method checks whether the given device is marked as being replaced + (`device.is_being_replaced`). If true, it proceeds to erase the replacement header + from the device using the `_erase_replacement_header` method. The method returns + a boolean indicating whether any action was taken. + + Args: + device (Device): The device object, which includes information about the device's + path and status (such as whether it is currently being replaced). + + Returns: + bool: True if the replacement header was successfully erased, False if the + device was not marked as being replaced or no action was necessary. + """ + result: bool = False + device: Device = self.args.clear_replace_header + if device.is_being_replaced: + self._erase_replacement_header(device.path) + result = True + return result + + def _erase_replacement_header(self, device: str) -> None: + """Erase the replacement header on a device. + + This method writes a sequence of null bytes (`0x00`) over the area of the device + where the replacement header is stored, effectively erasing it. + + Args: + device (str): The path to the device from which the replacement header will be erased. 
+ """ + disk._dd_write(device, + b'\x00' * len(BEING_REPLACED_HEADER)) + + def zap_lv(self, device: Device) -> None: """ Device examples: vg-name/lv-name, /dev/vg-name/lv-name Requirements: Must be a logical volume (LV) """ lv: api.Volume = device.lv_api self.unmount_lv(lv) - + self.parent_device: str = disk.get_parent_device_from_mapper(lv.lv_path) zap_device(device.path) if self.args.destroy: lvs = api.get_lvs(filters={'vg_name': device.vg_name}) - if lvs == []: - mlogger.info('No LVs left, exiting', device.vg_name) - return - elif len(lvs) <= 1: + if len(lvs) <= 1: mlogger.info('Only 1 LV left in VG, will proceed to destroy ' 'volume group %s', device.vg_name) pvs = api.get_pvs(filters={'lv_uuid': lv.lv_uuid}) api.remove_vg(device.vg_name) for pv in pvs: api.remove_pv(pv.pv_name) + replacement_args: Dict[str, bool] = { + 'block': self.args.replace_block, + 'db': self.args.replace_db, + 'wal': self.args.replace_wal + } + if replacement_args.get(lv.tags.get('ceph.type'), False): + mlogger.info(f'Marking {self.parent_device} as being replaced') + self._write_replacement_header(self.parent_device) else: mlogger.info('More than 1 LV left in VG, will proceed to ' 'destroy LV only') mlogger.info('Removing LV because --destroy was given: %s', device.path) + if self.args.replace_block: + mlogger.info(f'--replace-block passed but the device still has {str(len(lvs))} LV(s)') api.remove_lv(device.path) elif lv: # just remove all lvm metadata, leaving the LV around lv.clear_tags() - def zap_partition(self, device): + def zap_partition(self, device: Device) -> None: """ Device example: /dev/sda1 Requirements: Must be a partition @@ -246,7 +302,7 @@ def zap_partition(self, device): mlogger.info("Destroying partition since --destroy was used: %s" % device.path) disk.remove_partition(device) - def zap_lvm_member(self, device): + def zap_lvm_member(self, device: Device) -> None: """ An LVM member may have more than one LV and or VG, for example if it is a raw device with multiple partitions each belonging to a different LV @@ -266,7 +322,7 @@ def zap_lvm_member(self, device): - def zap_raw_device(self, device): + def zap_raw_device(self, device: Device) -> None: """ Any whole (raw) device passed in as input will be processed here, checking for LVM membership and partitions (if any). @@ -286,10 +342,19 @@ def zap_raw_device(self, device): self.zap_partition(Device('/dev/%s' % part_name)) zap_device(device.path) + # TODO(guits): I leave this commented out, this should be part of a separate patch in order to + # support device replacement with raw-based OSDs + # if self.args.replace_block: + # disk._dd_write(device.path, 'CEPH_DEVICE_BEING_REPLACED') @decorators.needs_root - def zap(self, devices=None): - devices = devices or self.args.devices + def zap(self) -> None: + """Zap a device. + + Raises: + SystemExit: When the device is a mapper and not a mpath device. 
+ """ + devices = self.args.devices for device in devices: mlogger.info("Zapping: %s", device.path) @@ -316,21 +381,21 @@ def zap(self, devices=None): ) @decorators.needs_root - def zap_osd(self): + def zap_osd(self) -> None: if self.args.osd_id and not self.args.no_systemd: osd_is_running = systemctl.osd_is_active(self.args.osd_id) if osd_is_running: mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id) mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id) raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id) - devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid) - self.zap(devices) + self.args.devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid) + self.zap() - def dmcrypt_close(self, dmcrypt_uuid): + def dmcrypt_close(self, dmcrypt_uuid: str) -> None: mlogger.info("Closing encrypted volume %s", dmcrypt_uuid) encryption.dmcrypt_close(mapping=dmcrypt_uuid, skip_path_check=True) - def main(self): + def main(self) -> None: sub_command_help = dedent(""" Zaps the given logical volume(s), raw device(s) or partition(s) for reuse by ceph-volume. If given a path to a logical volume it must be in the format of vg/lv. Any @@ -418,12 +483,56 @@ def main(self): help='Skip systemd unit checks', ) + parser.add_argument( + '--replace-block', + dest='replace_block', + action='store_true', + help='Mark the block device as unavailable.' + ) + + parser.add_argument( + '--replace-db', + dest='replace_db', + action='store_true', + help='Mark the db device as unavailable.' + ) + + parser.add_argument( + '--replace-wal', + dest='replace_wal', + action='store_true', + help='Mark the wal device as unavailable.' + ) + + parser.add_argument( + '--clear-replace-header', + dest='clear_replace_header', + type=arg_validators.ValidClearReplaceHeaderDevice(), + help='clear the replace header on devices.' 
+ ) + if len(self.argv) == 0: print(sub_command_help) return self.args = parser.parse_args(self.argv) + if self.args.clear_replace_header: + rc: bool = False + try: + rc = self.clear_replace_header() + except Exception as e: + raise SystemExit(e) + if rc: + mlogger.info(f'Replacement header cleared on {self.args.clear_replace_header}') + else: + mlogger.info(f'No replacement header detected on {self.args.clear_replace_header}, nothing to do.') + raise SystemExit(not rc) + + if self.args.replace_block or self.args.replace_db or self.args.replace_wal: + self.args.destroy = True + mlogger.info('--replace-block|db|wal passed, enforcing --destroy.') + if self.args.osd_id or self.args.osd_fsid: self.zap_osd() else: diff --git a/src/ceph-volume/ceph_volume/tests/conftest.py b/src/ceph-volume/ceph_volume/tests/conftest.py index ee58081d97da1..e6bf31737b69c 100644 --- a/src/ceph-volume/ceph_volume/tests/conftest.py +++ b/src/ceph-volume/ceph_volume/tests/conftest.py @@ -360,7 +360,7 @@ def apply(devices=None, lsblk=None, lv=None, blkid=None, udevadm=None, has_bluestore_label=False): if devices: for dev in devices.keys(): - devices[dev]['device_nodes'] = os.path.basename(dev) + devices[dev]['device_nodes'] = [os.path.basename(dev)] else: devices = {} lsblk = lsblk if lsblk else {} diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py index d630a7a6bf887..efe52c053ffc3 100644 --- a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py +++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py @@ -7,11 +7,30 @@ from ceph_volume.devices.lvm import zap -class TestZap(object): - def test_invalid_osd_id_passed(self): +class TestZap: + def test_invalid_osd_id_passed(self) -> None: with pytest.raises(SystemExit): zap.Zap(argv=['--osd-id', 'foo']).main() + @patch('ceph_volume.util.disk._dd_write', Mock()) + @patch('ceph_volume.util.arg_validators.Device') + def test_clear_replace_header_is_being_replaced(self, m_device: Mock) -> None: + m_dev = m_device.return_value + m_dev.is_being_replaced = True + with pytest.raises(SystemExit) as e: + zap.Zap(argv=['--clear', '/dev/foo']).main() + assert e.value.code == 0 + + @patch('ceph_volume.util.disk._dd_write', Mock()) + @patch('ceph_volume.util.arg_validators.Device') + def test_clear_replace_header_is_not_being_replaced(self, m_device: Mock) -> None: + m_dev = m_device.return_value + m_dev.is_being_replaced = False + with pytest.raises(SystemExit) as e: + zap.Zap(argv=['--clear', '/dev/foo']).main() + assert e.value.code == 1 + + class TestFindAssociatedDevices(object): def test_no_lvs_found_that_match_id(self, monkeypatch, device_info): diff --git a/src/ceph-volume/ceph_volume/tests/test_inventory.py b/src/ceph-volume/ceph_volume/tests/test_inventory.py index 785d8b56e86b6..832c083664212 100644 --- a/src/ceph-volume/ceph_volume/tests/test_inventory.py +++ b/src/ceph-volume/ceph_volume/tests/test_inventory.py @@ -126,6 +126,7 @@ class TestInventory(object): 'lvs', 'device_id', 'lsm_data', + 'being_replaced' ] expected_sys_api_keys = [ diff --git a/src/ceph-volume/ceph_volume/util/arg_validators.py b/src/ceph-volume/ceph_volume/util/arg_validators.py index 99e7d039e742b..e75b34e550e3c 100644 --- a/src/ceph-volume/ceph_volume/util/arg_validators.py +++ b/src/ceph-volume/ceph_volume/util/arg_validators.py @@ -7,6 +7,9 @@ from ceph_volume.util.encryption import set_dmcrypt_no_workqueue +mlogger = terminal.MultiLogger(__name__) + + def valid_osd_id(val): return str(int(val)) @@ -70,6 
+73,17 @@ def _is_valid_device(self, raise_sys_exit=True): return self._device +class ValidClearReplaceHeaderDevice(ValidDevice): + def __call__(self, dev_path: str) -> str: + super().get_device(dev_path) + return self._format_device(self._is_valid_device()) + + def _is_valid_device(self) -> Device: + if not self._device.is_being_replaced: + mlogger.info(f'{self.dev_path} has no replacement header.') + return self._device + + class ValidDataDevice(ValidDevice): def __call__(self, dev_path): super().get_device(dev_path) diff --git a/src/ceph-volume/ceph_volume/util/device.py b/src/ceph-volume/ceph_volume/util/device.py index 9c2c11e7f316f..82ee3266e3f1f 100644 --- a/src/ceph-volume/ceph_volume/util/device.py +++ b/src/ceph-volume/ceph_volume/util/device.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- - +# type: ignore import logging import os from functools import total_ordering -from ceph_volume import sys_info, allow_loop_devices +from ceph_volume import sys_info, allow_loop_devices, BEING_REPLACED_HEADER from ceph_volume.api import lvm from ceph_volume.util import disk, system from ceph_volume.util.lsmdisk import LSMDisk from ceph_volume.util.constants import ceph_disk_guids +from typing import List, Tuple logger = logging.getLogger(__name__) @@ -92,6 +93,7 @@ class Device(object): 'sys_api', 'device_id', 'lsm_data', + 'being_replaced' ] pretty_report_sys_fields = [ 'actuators', @@ -136,6 +138,7 @@ def __init__(self, path, with_lsm=False, lvs=None, lsblk_all=None, all_devices_v self._exists = None self._is_lvm_member = None self.ceph_device = False + self.being_replaced: bool = self.is_being_replaced self._parse() if self.path in sys_info.devices.keys(): self.device_nodes = sys_info.devices[self.path]['device_nodes'] @@ -298,7 +301,7 @@ def report(self): rot=self.rotational, available=self.available, model=self.model, - device_nodes=self.device_nodes + device_nodes=','.join(self.device_nodes) ) def json_report(self): @@ -590,7 +593,7 @@ def vg_free(self): return [vg_free] @property - def has_partitions(self): + def has_partitions(self) -> bool: ''' Boolean to determine if a given device has partitions. ''' @@ -598,7 +601,14 @@ def has_partitions(self): return True return False - def _check_generic_reject_reasons(self): + @property + def is_being_replaced(self) -> bool: + ''' + Boolean to indicate if the device is being replaced. 
+ ''' + return disk._dd_read(self.path, 26) == BEING_REPLACED_HEADER + + def _check_generic_reject_reasons(self) -> List[str]: reasons = [ ('id_bus', 'usb', 'id_bus'), ('ro', '1', 'read-only'), @@ -639,9 +649,11 @@ def _check_generic_reject_reasons(self): rejected.append('Has partitions') if self.has_fs: rejected.append('Has a FileSystem') + if self.is_being_replaced: + rejected.append('Is being replaced') return rejected - def _check_lvm_reject_reasons(self): + def _check_lvm_reject_reasons(self) -> Tuple[bool, List[str]]: rejected = [] if self.vgs: available_vgs = [vg for vg in self.vgs if int(vg.vg_free_count) > 10] @@ -654,7 +666,7 @@ def _check_lvm_reject_reasons(self): return len(rejected) == 0, rejected - def _check_raw_reject_reasons(self): + def _check_raw_reject_reasons(self) -> Tuple[bool, List[str]]: rejected = self._check_generic_reject_reasons() if len(self.vgs) > 0: rejected.append('LVM detected') diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py index 3ac51c11e3469..30ee56808c762 100644 --- a/src/ceph-volume/ceph_volume/util/disk.py +++ b/src/ceph-volume/ceph_volume/util/disk.py @@ -7,7 +7,7 @@ from ceph_volume import process, allow_loop_devices from ceph_volume.api import lvm from ceph_volume.util.system import get_file_contents -from typing import Dict, List, Any +from typing import Dict, List, Any, Union logger = logging.getLogger(__name__) @@ -857,13 +857,14 @@ def get_devices(_sys_block_path='/sys/block', device=''): device_slaves = os.listdir(os.path.join(sysdir, 'slaves')) metadata['partitions'] = get_partitions_facts(sysdir) + metadata['device_nodes'] = [] if device_slaves: - metadata['device_nodes'] = ','.join(device_slaves) + metadata['device_nodes'].extend(device_slaves) else: if block[2] == 'part': - metadata['device_nodes'] = block[3] + metadata['device_nodes'].append(block[3]) else: - metadata['device_nodes'] = devname + metadata['device_nodes'].append(devname) metadata['actuators'] = None if os.path.isdir(sysdir + "/queue/independent_access_ranges/"): @@ -979,7 +980,7 @@ def _dd_read(device: str, count: int, skip: int = 0) -> str: return result -def _dd_write(device: str, data: str, skip: int = 0) -> None: +def _dd_write(device: str, data: Union[str, bytes], skip: int = 0) -> None: """Write bytes to a device Args: @@ -991,10 +992,14 @@ def _dd_write(device: str, data: str, skip: int = 0) -> None: OSError: If there is an error opening or writing to the device. Exception: If any other error occurs during the write operation. """ + + if isinstance(data, str): + data = data.encode('utf-8') + try: with open(device, 'r+b') as b: b.seek(skip) - b.write(data.encode('utf-8')) + b.write(data) except OSError: logger.warning(f"Can't write to {device}") raise diff --git a/src/pybind/mgr/cephadm/ceph_volume.py b/src/pybind/mgr/cephadm/ceph_volume.py new file mode 100644 index 0000000000000..a270bb7028f46 --- /dev/null +++ b/src/pybind/mgr/cephadm/ceph_volume.py @@ -0,0 +1,430 @@ +from cephadm.serve import CephadmServe +from typing import List, TYPE_CHECKING, Any, Dict, Set, Tuple +if TYPE_CHECKING: + from cephadm import CephadmOrchestrator + + +class CephVolume: + def __init__(self, mgr: "CephadmOrchestrator", _inheritance: bool = False) -> None: + self.mgr: "CephadmOrchestrator" = mgr + if not _inheritance: + self.lvm_list: "CephVolumeLvmList" = CephVolumeLvmList(mgr) + + def run_json(self, hostname: str, command: List[str]) -> Dict[str, Any]: + """Execute a JSON command on the specified hostname and return the result. 
+
+        This method wraps the asynchronous execution of a JSON command on the
+        specified hostname, waiting for the command to complete. It utilizes the
+        `_run_json` method to perform the actual execution.
+
+        Args:
+            hostname (str): The hostname of the target node where the JSON command
+                            will be executed.
+            command (List[str]): A list of command arguments to be passed to the
+                                 JSON command.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the JSON response from the
+                            executed command, which may include various data
+                            based on the command executed.
+        """
+        return self.mgr.wait_async(self._run_json(hostname, command))
+
+    def run(self, hostname: str, command: List[str], **kw: Any) -> Tuple[List[str], List[str], int]:
+        """Execute a command on the specified hostname and return the result.
+
+        This method wraps the asynchronous execution of a command on the
+        specified hostname, waiting for the command to complete. It utilizes the
+        `_run` method to perform the actual execution.
+
+        Args:
+            hostname (str): The hostname of the target node where the command
+                            will be executed.
+            command (List[str]): A list of command arguments to be passed to the
+                                 command.
+            **kw (Any): Additional keyword arguments to customize the command
+                        execution.
+
+        Returns:
+            Tuple[List[str], List[str], int]: A tuple containing:
+                - A list of strings representing the standard output of the command.
+                - A list of strings representing the standard error output of the command.
+                - An integer representing the return code of the command execution.
+        """
+        return self.mgr.wait_async(self._run(hostname, command, **kw))
+
+    async def _run(self,
+                   hostname: str,
+                   command: List[str],
+                   **kw: Any) -> Tuple[List[str], List[str], int]:
+        """Execute a ceph-volume command on the specified hostname and return the result.
+
+        This asynchronous method constructs a ceph-volume command and then executes
+        it on the specified host. The raw standard output, standard error and
+        return code are returned unmodified.
+
+        Args:
+            hostname (str): The hostname of the target node where the command will be executed.
+            command (List[str]): A list of command arguments to be passed to the Ceph command.
+            **kw (Any): Additional keyword arguments to customize the command execution.
+
+        Returns:
+            Tuple[List[str], List[str], int]: A tuple containing:
+                - A list of strings representing the standard output of the command.
+                - A list of strings representing the standard error output of the command.
+                - An integer representing the return code of the command execution.
+        """
+        cmd: List[str] = ['--']
+        cmd.extend(command)
+        result = await CephadmServe(self.mgr)._run_cephadm(
+            hostname, 'osd', 'ceph-volume',
+            cmd,
+            **kw)
+        return result
+
+    async def _run_json(self,
+                        hostname: str,
+                        command: List[str]) -> Dict[str, Any]:
+        """Execute a ceph-volume command on a specified hostname.
+
+        This asynchronous method constructs a ceph-volume command and then executes
+        it on the specified host.
+        The result of the command is returned in JSON format.
+
+        Args:
+            hostname (str): The hostname of the target node where the command will be executed.
+            command (List[str]): A list of command arguments to be passed to the Ceph command.
+
+        Returns:
+            Dict[str, Any]: The result of the command execution as a dictionary parsed from
+                            the JSON output.
+ """ + cmd: List[str] = ['--'] + cmd.extend(command) + result = await CephadmServe(self.mgr)._run_cephadm_json( + hostname, 'osd', 'ceph-volume', + cmd) + return result + + def clear_replace_header(self, hostname: str, device: str) -> str: + """Clear the replacement header on a specified device for a given hostname. + + This method checks if a replacement header exists on the specified device + and clears it if found. After clearing, it invalidates the cached device + information for the specified hostname and kicks the serve loop. + + Args: + hostname (str): The hostname of the device on which the replacement header + will be cleared. This is used to identify the specific + device within the manager's context. + device (str): The path to the device (e.g., '/dev/sda') from which the + replacement header will be cleared. + + Returns: + str: A message indicating the result of the operation. It will either confirm + that the replacement header was cleared or state that no replacement header + was detected on the device. + """ + output: str = '' + result = self.run(hostname, ['lvm', + 'zap', + '--clear-replace-header', + device], + error_ok=True) + out, err, rc = result + if not rc: + output = f'Replacement header cleared on {device}' + self.mgr.cache.invalidate_host_devices(hostname) + self.mgr._kick_serve_loop() + else: + plain_out: str = '\n'.join(out) + plain_err: str = '\n'.join(err) + output = f'No replacement header could be cleared on {device}.\n{plain_out}\n{plain_err}' + return output + + +class CephVolumeLvmList(CephVolume): + def __init__(self, mgr: "CephadmOrchestrator") -> None: + super().__init__(mgr, True) + self.data: Dict[str, Any] = {} + + def get_data(self, hostname: str) -> None: + """Execute the `ceph-volume lvm list` command to list LVM-based OSDs. + + This asynchronous method interacts with the Ceph manager to retrieve + information about the Logical Volume Manager (LVM) devices associated + with the OSDs. It calls the `ceph-volume lvm list` command in JSON format + to gather relevant data. + + Returns: + None: This method does not return a value. The retrieved data is + stored in the `self.data` attribute for further processing. + """ + self.data = self.run_json(hostname, + ['lvm', 'list', '--format', 'json']) + + def devices_by_type(self, device_type: str) -> List[str]: + """Retrieve a list of devices of a specified type across all OSDs. + + This method iterates through all OSDs and collects devices that match + the specified type (e.g., 'block', 'db', 'wal'). The resulting list + contains unique device paths. + + Args: + device_type (str): The type of devices to retrieve. This should + be one of the recognized device types such as + 'block', 'db', or 'wal'. + + Returns: + List[str]: A list of unique device paths of the specified type + found across all OSDs. If no devices of the specified + type are found, an empty list is returned. + """ + result: Set[str] = set() + for osd in self.osd_ids(): + for lv in self.data.get(osd, []): + if lv.get('type') == device_type: + result.update(lv.get('devices', [])) + return list(result) + + def block_devices(self) -> List[str]: + """List all block devices used by OSDs. + + This method returns a list of devices that are used as 'block' devices + for storing the main OSD data. + + Returns: + List[str]: A list of device paths (strings) that are used as 'block' devices. + """ + return self.devices_by_type('block') + + def db_devices(self) -> List[str]: + """List all database (DB) devices used by OSDs. 
+
+        This method returns a list of devices that are used as 'db' devices
+        for storing the database files associated with OSDs.
+
+        Returns:
+            List[str]: A list of device paths (strings) that are used as 'db' devices.
+        """
+        return self.devices_by_type('db')
+
+    def wal_devices(self) -> List[str]:
+        """List all write-ahead log (WAL) devices used by OSDs.
+
+        This method returns a list of devices that are used as 'wal' devices
+        for storing write-ahead log data associated with OSDs.
+
+        Returns:
+            List[str]: A list of device paths (strings) that are used as 'wal' devices.
+        """
+        return self.devices_by_type('wal')
+
+    def all_devices(self) -> List[str]:
+        """List all devices used by OSDs for 'block', 'db', or 'wal' purposes.
+
+        This method aggregates all devices that are currently used by the OSDs
+        in the system for the following device types:
+        - 'block' devices: Used to store the OSD's data.
+        - 'db' devices: Used for database purposes.
+        - 'wal' devices: Used for Write-Ahead Logging.
+
+        The returned list combines devices from all these categories.
+
+        Returns:
+            List[str]: A list of device paths (strings) that are used as 'block', 'db', or 'wal' devices.
+        """
+        return self.block_devices() + self.db_devices() + self.wal_devices()
+
+    def device_osd_mapping(self, device_type: str = '') -> Dict[str, Dict[str, List[str]]]:
+        """Create a mapping of devices to their corresponding OSD IDs based on device type.
+
+        This method serves as a 'proxy' function, designed to be called by the *_device_osd_mapping() methods.
+
+        This method iterates over the OSDs and their logical volumes to build a
+        dictionary that maps each device of the specified type to the list of
+        OSD IDs that use it. The resulting dictionary can be used to determine
+        which OSDs share a specific device.
+
+        Args:
+            device_type (str): The type of the device to filter by (e.g., 'block', 'db', or 'wal').
+                               If an empty string is provided, devices of all types will be included.
+
+        Returns:
+            Dict[str, Dict[str, List[str]]]: A dictionary where the keys are device
+                names and the values are dictionaries containing a list of OSD IDs
+                that use the corresponding device.
+
+        eg:
+        ```
+        {
+            '/dev/vda': {'osd_ids': ['0', '1']},
+            '/dev/vdb': {'osd_ids': ['2']}
+        }
+        ```
+
+        """
+        result: Dict[str, Dict[str, List[str]]] = {}
+        for osd in self.osd_ids():
+            for lv in self.data.get(osd, []):
+                if lv.get('type') == device_type or not device_type:
+                    for device in lv.get('devices', []):
+                        if device not in result:
+                            result[device] = {'osd_ids': []}
+                        result[device]['osd_ids'].append(osd)
+        return result
+
+    def block_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+        """Get a dictionary mapping all block devices to their corresponding
+        OSD id(s).
+
+        eg:
+        ```
+        {'/dev/vdb': {'osd_ids': ['0']},
+         '/dev/vdc': {'osd_ids': ['1']},
+         '/dev/vdf': {'osd_ids': ['2']},
+         '/dev/vde': {'osd_ids': ['3', '4']}}
+        ```
+
+        Returns:
+            Dict[str, Dict[str, List[str]]]: A dict including all block devices with their corresponding
+            osd id(s).
+        """
+        return self.device_osd_mapping('block')
+
+    def db_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+        """Get a dictionary mapping all db devices to their corresponding
+        OSD id(s).
+
+        eg:
+        ```
+        {'/dev/vdv': {'osd_ids': ['0', '1', '2', '3']},
+         '/dev/vdx': {'osd_ids': ['4']}}
+        ```
+
+        Returns:
+            Dict[str, Dict[str, List[str]]]: A dict including all db devices with their corresponding
+            osd id(s).
+        """
+        return self.device_osd_mapping('db')
+
+    def wal_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+        """Get a dictionary mapping all wal devices to their corresponding
+        OSD id(s).
+
+        eg:
+        ```
+        {'/dev/vdy': {'osd_ids': ['0', '1', '2', '3']},
+         '/dev/vdz': {'osd_ids': ['4']}}
+        ```
+
+        Returns:
+            Dict[str, Dict[str, List[str]]]: A dict including all wal devices with their corresponding
+            osd id(s).
+        """
+        return self.device_osd_mapping('wal')
+
+    def is_shared_device(self, device: str) -> bool:
+        """Determine if a device is shared between multiple OSDs.
+
+        This method checks whether a given device is shared by multiple OSDs,
+        regardless of its type ('block', 'db' or 'wal'). If the device is
+        associated with more than one OSD, it is considered shared.
+
+        Args:
+            device (str): The device path to check (e.g., '/dev/sda').
+
+        Raises:
+            RuntimeError: If the device is not valid or not found in the shared devices mapping.
+
+        Returns:
+            bool: True if the device is shared by more than one OSD, False otherwise.
+        """
+        device_osd_mapping = self.device_osd_mapping()
+        if not device or device not in device_osd_mapping:
+            raise RuntimeError('Not a valid device path.')
+        return len(device_osd_mapping[device]['osd_ids']) > 1
+
+    def is_block_device(self, device: str) -> bool:
+        """Check if a specified device is a block device.
+
+        This method checks if the specified device is included in the
+        list of block devices used by OSDs.
+
+        Args:
+            device (str): The path of the device to check.
+
+        Returns:
+            bool: True if the device is a block device,
+                  False otherwise.
+        """
+        return device in self.block_devices()
+
+    def is_db_device(self, device: str) -> bool:
+        """Check if a specified device is a DB device.
+
+        This method checks if the specified device is included in the
+        list of DB devices used by OSDs.
+
+        Args:
+            device (str): The path of the device to check.
+
+        Returns:
+            bool: True if the device is a DB device,
+                  False otherwise.
+        """
+        return device in self.db_devices()
+
+    def is_wal_device(self, device: str) -> bool:
+        """Check if a specified device is a WAL device.
+
+        This method checks if the specified device is included in the
+        list of WAL devices used by OSDs.
+
+        Args:
+            device (str): The path of the device to check.
+
+        Returns:
+            bool: True if the device is a WAL device,
+                  False otherwise.
+        """
+        return device in self.wal_devices()
+
+    def get_block_devices_from_osd_id(self, osd_id: str) -> List[str]:
+        """Retrieve the list of block devices associated with a given OSD ID.
+
+        This method looks up the specified OSD ID in the `data` attribute
+        and returns a list of devices that are of type 'block'. If there are
+        no devices of type 'block' for the specified OSD ID, an empty list is returned.
+
+        Args:
+            osd_id (str): The OSD ID for which to retrieve block devices.
+
+        Returns:
+            List[str]: A list of block device paths associated with the
+                       specified OSD ID. If no block devices are found,
+                       an empty list is returned.
+        """
+        result: List[str] = []
+        for lv in self.data.get(osd_id, []):
+            if lv.get('type') == 'block':
+                result = lv.get('devices', [])
+        return result
+
+    def osd_ids(self) -> List[str]:
+        """Retrieve the list of OSD IDs.
+
+        This method returns a list of OSD IDs by extracting the keys
+        from the `data` attribute, which is expected to contain
+        information about OSDs. If there is no data available, an
+        empty list is returned.
+ + Returns: + List[str]: A list of OSD IDs. If no data is present, + an empty list is returned. + """ + result: List[str] = [] + if self.data: + result = list(self.data.keys()) + return result diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index f8f0efc9d2831..dc43b48726379 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -101,6 +101,7 @@ from .configchecks import CephadmConfigChecks from .offline_watcher import OfflineHostWatcher from .tuned_profiles import TunedProfileUtils +from .ceph_volume import CephVolume try: import asyncssh @@ -792,6 +793,8 @@ def __init__(self, *args: Any, **kwargs: Any): # as part of the handling of stray daemons self.recently_altered_daemons: Dict[str, datetime.datetime] = {} + self.ceph_volume: CephVolume = CephVolume(self) + def shutdown(self) -> None: self.log.debug('shutdown') self._worker_pool.close() @@ -3828,9 +3831,56 @@ def upgrade_resume(self) -> str: def upgrade_stop(self) -> str: return self.upgrade.upgrade_stop() + @handle_orch_error + def replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> Any: + output: str = '' + + self.ceph_volume.lvm_list.get_data(hostname=hostname) + + if clear: + output = self.ceph_volume.clear_replace_header(hostname, device) + else: + osds_to_zap: List[str] = [] + if hostname not in list(self.inventory.keys()): + raise OrchestratorError(f'{hostname} invalid host.') + + if device not in self.ceph_volume.lvm_list.all_devices(): + raise OrchestratorError(f"{device} doesn't appear to be used for an OSD, not a valid device in {hostname}.") + + device_osd_mapping = self.ceph_volume.lvm_list.device_osd_mapping() + osds_to_zap = device_osd_mapping[device]['osd_ids'] + + if self.ceph_volume.lvm_list.is_shared_device(device): + if not yes_i_really_mean_it: + raise OrchestratorError(f'{device} is a shared device.\n' + f'Replacing {device} implies destroying OSD(s): {osds_to_zap}.\n' + 'Please, *be very careful*, this can be a very dangerous operation.\n' + 'If you know what you are doing, pass --yes-i-really-mean-it') + if not self.to_remove_osds.rm_util.safe_to_destroy([int(osd_id) for osd_id in osds_to_zap]): + raise OrchestratorError(f"Destroying OSD(s) {osds_to_zap} would cause some PGs to be undersized/degraded.\n" + 'Refusing to proceed.') + replace_block: bool = self.ceph_volume.lvm_list.is_block_device(device) + replace_db: bool = self.ceph_volume.lvm_list.is_db_device(device) + replace_wal: bool = self.ceph_volume.lvm_list.is_wal_device(device) + + self.remove_osds(list(osds_to_zap), + replace_block=replace_block, + replace_db=replace_db, + replace_wal=replace_wal) + + output = f'Scheduled to destroy osds: {osds_to_zap} and mark {device} as being replaced.' 
+ return output + @handle_orch_error def remove_osds(self, osd_ids: List[str], replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, zap: bool = False, no_destroy: bool = False) -> str: @@ -3853,6 +3903,9 @@ def remove_osds(self, osd_ids: List[str], try: self.to_remove_osds.enqueue(OSD(osd_id=int(daemon.daemon_id), replace=replace, + replace_block=replace_block, + replace_db=replace_db, + replace_wal=replace_wal, force=force, zap=zap, no_destroy=no_destroy, diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 611c27c34538a..4a7959ae04502 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -96,7 +96,10 @@ def serve(self) -> None: if not self.mgr.paused: self._run_async_actions() - self.mgr.to_remove_osds.process_removal_queue() + removal_queue_result = self.mgr.to_remove_osds.process_removal_queue() + self.log.debug(f'process_removal_queue() returned = {removal_queue_result}') + if removal_queue_result: + continue self.mgr.migration.migrate() if self.mgr.migration.is_migration_ongoing(): diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index 9b09b8c9f4925..80bf92772c49b 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -551,6 +551,12 @@ def zap_osd(self, osd: "OSD") -> str: "Zaps all devices that are associated with an OSD" if osd.hostname is not None: cmd = ['--', 'lvm', 'zap', '--osd-id', str(osd.osd_id)] + if osd.replace_block: + cmd.append('--replace-block') + if osd.replace_db: + cmd.append('--replace-db') + if osd.replace_wal: + cmd.append('--replace-wal') if not osd.no_destroy: cmd.append('--destroy') with self.mgr.async_timeout_handler(osd.hostname, f'cephadm ceph-volume {" ".join(cmd)}'): @@ -618,6 +624,9 @@ def __init__(self, started: bool = False, stopped: bool = False, replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, hostname: Optional[str] = None, zap: bool = False, @@ -649,6 +658,12 @@ def __init__(self, # If this is a replace or remove operation self.replace = replace + # If this is a block device replacement + self.replace_block = replace_block + # If this is a db device replacement + self.replace_db = replace_db + # If this is a wal device replacement + self.replace_wal = replace_wal # If we wait for the osd to be drained self.force = force # The name of the node @@ -676,7 +691,7 @@ def start_draining(self) -> bool: if self.stopped: logger.debug(f"Won't start draining {self}. 
OSD draining is stopped.") return False - if self.replace: + if self.any_replace_params: self.rm_util.set_osd_flag([self], 'out') else: self.rm_util.reweight_osd(self, 0.0) @@ -686,7 +701,7 @@ def start_draining(self) -> bool: return True def stop_draining(self) -> bool: - if self.replace: + if self.any_replace_params: self.rm_util.set_osd_flag([self], 'in') else: if self.original_weight: @@ -764,6 +779,9 @@ def to_json(self) -> dict: out['draining'] = self.draining out['stopped'] = self.stopped out['replace'] = self.replace + out['replace_block'] = self.replace_block + out['replace_db'] = self.replace_db + out['replace_wal'] = self.replace_wal out['force'] = self.force out['zap'] = self.zap out['hostname'] = self.hostname # type: ignore @@ -789,6 +807,13 @@ def from_json(cls, inp: Optional[Dict[str, Any]], rm_util: RemoveUtil) -> Option inp['hostname'] = hostname return cls(**inp) + @property + def any_replace_params(self) -> bool: + return any([self.replace, + self.replace_block, + self.replace_db, + self.replace_wal]) + def __hash__(self) -> int: return hash(self.osd_id) @@ -812,7 +837,7 @@ def __init__(self, mgr: "CephadmOrchestrator") -> None: # network calls, like mon commands. self.lock = Lock() - def process_removal_queue(self) -> None: + def process_removal_queue(self) -> bool: """ Performs actions in the _serve() loop to remove an OSD when criteria is met. @@ -820,6 +845,8 @@ def process_removal_queue(self) -> None: we can't hold self.lock, as we're calling _remove_daemon in the loop """ + result: bool = False + # make sure that we don't run on OSDs that are not in the cluster anymore. self.cleanup() @@ -863,16 +890,23 @@ def process_removal_queue(self) -> None: if self.mgr.cache.has_daemon(f'osd.{osd.osd_id}'): CephadmServe(self.mgr)._remove_daemon(f'osd.{osd.osd_id}', osd.hostname) logger.info(f"Successfully removed {osd} on {osd.hostname}") + result = True else: logger.info(f"Daemon {osd} on {osd.hostname} was already removed") - if osd.replace: + any_replace_params: bool = any([osd.replace, + osd.replace_block, + osd.replace_db, + osd.replace_wal]) + if any_replace_params: # mark destroyed in osdmap if not osd.destroy(): raise orchestrator.OrchestratorError( f"Could not destroy {osd}") logger.info( f"Successfully destroyed old {osd} on {osd.hostname}; ready for replacement") + if any_replace_params: + osd.zap = True else: # purge from osdmap if not osd.purge(): @@ -884,7 +918,7 @@ def process_removal_queue(self) -> None: logger.info(f"Zapping devices for {osd} on {osd.hostname}") osd.do_zap() logger.info(f"Successfully zapped devices for {osd} on {osd.hostname}") - + self.mgr.cache.invalidate_host_devices(osd.hostname) logger.debug(f"Removing {osd} from the queue.") # self could change while this is processing (osds get added from the CLI) @@ -893,6 +927,7 @@ def process_removal_queue(self) -> None: with self.lock: self.osds.intersection_update(new_queue) self._save_to_store() + return result def cleanup(self) -> None: # OSDs can always be cleaned up manually. 
This ensures that we run on existing OSDs diff --git a/src/pybind/mgr/cephadm/tests/ceph_volume_data.py b/src/pybind/mgr/cephadm/tests/ceph_volume_data.py new file mode 100644 index 0000000000000..afd6d89d39e40 --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/ceph_volume_data.py @@ -0,0 +1 @@ +data = '{"0":[{"devices":["/dev/vdb"],"lv_name":"osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","lv_path":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.block_uuid=d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c,ceph.db_uuid=EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX,ceph.encrypted=0,ceph.osd_fsid=8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.osd_id=0,ceph.osdspec_affinity=osd.shared_db,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","name":"osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","path":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","tags":{"ceph.block_device":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.block_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","ceph.db_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","ceph.encrypted":"0","ceph.osd_fsid":"8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.osd_id":"0","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668"},{"devices":["/dev/vdk"],"lv_name":"osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","lv_path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.block_uuid=d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c,ceph.db_uuid=EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX,ceph.encrypted=0,ceph.osd_fsid=8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.osd_id=0,ceph.osdspec_affinity=osd.shared_db,ceph.type=db,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","name":"osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","tags":{"ceph.block_device":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.block_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f
0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","ceph.db_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","ceph.encrypted":"0","ceph.osd_fsid":"8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.osd_id":"0","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"db","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf"}],"1":[{"devices":["/dev/vdc"],"lv_name":"osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","lv_path":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.block_uuid=Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774,ceph.db_uuid=1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC,ceph.encrypted=0,ceph.osd_fsid=aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.osd_id=1,ceph.osdspec_affinity=osd.shared_db,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","name":"osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","path":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","tags":{"ceph.block_device":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.block_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","ceph.db_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","ceph.encrypted":"0","ceph.osd_fsid":"aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.osd_id":"1","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb"},{"devices":["/dev/vdk"],"lv_name":"osd-db-38f53373-7575-4c90-98ca-28f189685774","lv_path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.block_uuid=Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774,ceph.db_uuid=1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC,ceph.encrypted=0,ceph.osd_fsid=aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.osd_id=1,ceph.osdspec_affinity=osd.shared_db,ceph.type=db,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","name":"osd-db-38f53373-7575-4c90-98ca-28f189685774","path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","tags":{"ceph.block_device":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.block_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e
10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","ceph.db_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","ceph.encrypted":"0","ceph.osd_fsid":"aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.osd_id":"1","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"db","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf"}],"2":[{"devices":["/dev/vdf"],"lv_name":"osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","lv_path":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33,ceph.block_uuid=adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=a0434b49-759a-46a4-91dc-d7cc65af3a33,ceph.osd_id=2,ceph.osdspec_affinity=None,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO","name":"osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","path":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","tags":{"ceph.block_device":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","ceph.block_uuid":"adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"a0434b49-759a-46a4-91dc-d7cc65af3a33","ceph.osd_id":"2","ceph.osdspec_affinity":"None","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-3ba7a728-709b-408c-a043-9e48704b5ffb"}],"3":[{"devices":["/dev/vde"],"lv_name":"osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","lv_path":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f,ceph.block_uuid=GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=861ea81a-c24b-4c69-b4f6-e527151b132f,ceph.osd_id=3,ceph.osdspec_affinity=None,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8","name":"osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","path":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","tags":{"ceph.block_device":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","ceph.block_uuid":"GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"861ea81a-c24b-4c69-b4f6-e527151b132f","ceph.osd_id":"3","ceph.osdspec_affinity":"None","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b"}],"4":[{"devices":["/dev/vdg"],"lv_name":"osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","lv_path":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","lv_size":"214744170496","l
v_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16,ceph.db_uuid=5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=block,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","name":"osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","path":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","ceph.db_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"block","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-20acdce8-5548-4707-a38e-b8e925485bc5"},{"devices":["/dev/vdj"],"lv_name":"osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","lv_path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=wal,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","name":"osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"wal","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-254
2dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"wal","vg_name":"ceph-776f980b-152a-4e8f-99b6-bae27ed0b528"},{"devices":["/dev/vdi"],"lv_name":"osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","lv_path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16,ceph.db_uuid=5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=db,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","name":"osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","ceph.db_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"db","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452"}],"5":[{"devices":["/dev/vdj"],"lv_name":"osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","lv_path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=wal,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","name":"osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_
lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"wal","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"wal","vg_name":"ceph-776f980b-152a-4e8f-99b6-bae27ed0b528"},{"devices":["/dev/vdh"],"lv_name":"osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","lv_path":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb,ceph.db_uuid=wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=block,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","name":"osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","path":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","ceph.db_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"block","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351"},{"devices":["/dev/vdi"],"lv_name":"osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","lv_path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb,ceph.db_uuid=wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=db,ceph.vdo=0,ceph.wal_device=/dev/ceph-7
76f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","name":"osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","ceph.db_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"db","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452"}]}' diff --git a/src/pybind/mgr/cephadm/tests/conftest.py b/src/pybind/mgr/cephadm/tests/conftest.py index e8add2c7b834a..5cc2fabaf49b6 100644 --- a/src/pybind/mgr/cephadm/tests/conftest.py +++ b/src/pybind/mgr/cephadm/tests/conftest.py @@ -1,13 +1,14 @@ import pytest from cephadm.services.osd import RemoveUtil, OSD -from tests import mock - +from mock import mock from .fixtures import with_cephadm_module +from cephadm import CephadmOrchestrator +from typing import Generator @pytest.fixture() -def cephadm_module(): +def cephadm_module() -> Generator[CephadmOrchestrator, None, None]: with with_cephadm_module({}) as m: yield m diff --git a/src/pybind/mgr/cephadm/tests/fixtures.py b/src/pybind/mgr/cephadm/tests/fixtures.py index dd858c6c7dabe..dda0c6720ac6c 100644 --- a/src/pybind/mgr/cephadm/tests/fixtures.py +++ b/src/pybind/mgr/cephadm/tests/fixtures.py @@ -35,11 +35,11 @@ def get_module_option_ex(_, module, key, default=None): return None -def _run_cephadm(ret): +def _run_cephadm(ret, rc: int = 0): async def foo(s, host, entity, cmd, e, **kwargs): if cmd == 'gather-facts': return '{}', '', 0 - return [ret], '', 0 + return [ret], '', rc return foo diff --git a/src/pybind/mgr/cephadm/tests/test_ceph_volume.py b/src/pybind/mgr/cephadm/tests/test_ceph_volume.py new file mode 100644 index 0000000000000..cc1378a75753c --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_ceph_volume.py @@ -0,0 +1,231 @@ +import json +import pytest +from .ceph_volume_data import data +from cephadm.serve import CephadmServe +from cephadm import CephadmOrchestrator +from mock import patch +from .fixtures import _run_cephadm, with_host + + +class TestCephVolume: + def test_run(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('fake-output', 0)): + c = cephadm_module.ceph_volume.run('test', ['/bin/foo']) + assert c == (['fake-output'], '', 0) + + def test_run_json(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with 
with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('{"this-is-a-fake-key": "this-is-a-fake-value"}', 0)): + c = cephadm_module.ceph_volume.run_json('test', ['/bin/foo']) + assert c == {"this-is-a-fake-key": "this-is-a-fake-value"} + + def test_clear_replace_header_ok(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('fake-output', 0)): + c = cephadm_module.ceph_volume.clear_replace_header('test', '/dev/foo') + assert c == 'Replacement header cleared on /dev/foo' + + def test_clear_replace_header_nok(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('', 1)): + c = cephadm_module.ceph_volume.clear_replace_header('fake-output', '/dev/foo') + assert c.strip() == 'No replacement header could be cleared on /dev/foo.' + + +class TestCephVolumeList: + def test_get_data(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.data == json.loads(data) + + def test_devices_by_type_block(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.devices_by_type('block')) == set(['/dev/vdb', + '/dev/vdc', + '/dev/vdg', + '/dev/vde', + '/dev/vdf', + '/dev/vdh']) + + def test_devices_by_type_db(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.devices_by_type('db')) == set(['/dev/vdi', + '/dev/vdk']) + + def test_devices_by_type_wal(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.devices_by_type('wal') == ['/dev/vdj'] + + def test_block_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with 
with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.block_devices()) == set(['/dev/vdb', + '/dev/vdc', + '/dev/vdg', + '/dev/vde', + '/dev/vdf', + '/dev/vdh']) + + def test_db_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.db_devices()) == set(['/dev/vdk', + '/dev/vdi']) + + def test_wal_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.wal_devices()) == set(['/dev/vdj']) + + def test_all_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.all_devices()) == set(['/dev/vdg', + '/dev/vdj', + '/dev/vdh', + '/dev/vdi', + '/dev/vdc', + '/dev/vde', + '/dev/vdf', + '/dev/vdb', + '/dev/vdk']) + + def test_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.device_osd_mapping() == {'/dev/vdb': {'osd_ids': ['0']}, + '/dev/vdk': {'osd_ids': ['0', '1']}, + '/dev/vdc': {'osd_ids': ['1']}, + '/dev/vdf': {'osd_ids': ['2']}, + '/dev/vde': {'osd_ids': ['3']}, + '/dev/vdg': {'osd_ids': ['4']}, + '/dev/vdj': {'osd_ids': ['4', '5']}, + '/dev/vdi': {'osd_ids': ['4', '5']}, + '/dev/vdh': {'osd_ids': ['5']}} + + def test_block_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.block_device_osd_mapping() == {'/dev/vdb': {'osd_ids': ['0']}, + '/dev/vdc': {'osd_ids': ['1']}, + '/dev/vdf': {'osd_ids': ['2']}, + '/dev/vde': {'osd_ids': ['3']}, + '/dev/vdg': {'osd_ids': ['4']}, + '/dev/vdh': {'osd_ids': ['5']}} + + def test_db_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", 
_run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.db_device_osd_mapping() == {'/dev/vdk': {'osd_ids': ['0', '1']}, + '/dev/vdi': {'osd_ids': ['4', '5']}} + + def test_wal_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.wal_device_osd_mapping() == {'/dev/vdj': {'osd_ids': ['4', '5']}} + + def test_is_shared_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_shared_device('/dev/vdj') + + def test_is_shared_device_with_invalid_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + with pytest.raises(RuntimeError) as e: + assert cephadm_module.ceph_volume.lvm_list.is_shared_device('/dev/invalid-device') + assert str(e.value) == 'Not a valid device path.' 
+ + def test_is_block_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_block_device('/dev/vdb') + + def test_is_db_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_db_device('/dev/vdk') + + def test_is_wal_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_wal_device('/dev/vdj') + + def test_get_block_devices_from_osd_id(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.get_block_devices_from_osd_id('0') == ['/dev/vdb'] + + def test_osd_ids(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.osd_ids()) == set(['0', '1', '2', '3', '4', '5']) diff --git a/src/pybind/mgr/cephadm/tests/test_replace_device.py b/src/pybind/mgr/cephadm/tests/test_replace_device.py new file mode 100644 index 0000000000000..b4a2c81ad9a76 --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_replace_device.py @@ -0,0 +1,53 @@ +import pytest +from mock import patch +from .fixtures import _run_cephadm, with_host, wait +from .ceph_volume_data import data +from cephadm.serve import CephadmServe +from cephadm import CephadmOrchestrator +from orchestrator import OrchestratorError + + +class TestReplaceDevice: + def test_invalid_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError) as e: + cephadm_module.replace_device('test', '/dev/invalid-device') + assert "/dev/invalid-device doesn't appear to be used for an OSD, not a valid device in test." 
in str(e.value) + + def test_invalid_hostname(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError): + cephadm_module.replace_device('invalid-hostname', '/dev/vdb') + + def test_block_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + c = cephadm_module.replace_device('test', '/dev/vdb') + result = wait(cephadm_module, c) + assert result == "Scheduled to destroy osds: ['0'] and mark /dev/vdb as being replaced." + + def test_shared_db_device_no_ireallymeanit_flag(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError) as e: + cephadm_module.replace_device('test', '/dev/vdk') + assert "/dev/vdk is a shared device.\nReplacing /dev/vdk implies destroying OSD(s): ['0', '1'].\nPlease, *be very careful*, this can be a very dangerous operation.\nIf you know what you are doing, pass --yes-i-really-mean-it" in str(e.value) + + def test_shared_db_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + c = cephadm_module.replace_device('test', '/dev/vdk', yes_i_really_mean_it=True) + result = wait(cephadm_module, c) + assert result == "Scheduled to destroy osds: ['0', '1'] and mark /dev/vdk as being replaced." diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 82a8c13a9c11e..c05332df59a28 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -520,6 +520,15 @@ def rescan_host(self, hostname: str) -> OrchResult: """ raise NotImplementedError() + def replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> OrchResult: + """Perform all required operations in order to replace a device. + """ + raise NotImplementedError() + def get_inventory(self, host_filter: Optional['InventoryFilter'] = None, refresh: bool = False) -> OrchResult[List['InventoryHost']]: """ Returns something that was created by `ceph-volume inventory`. @@ -699,12 +708,18 @@ def preview_osdspecs(self, def remove_osds(self, osd_ids: List[str], replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, zap: bool = False, no_destroy: bool = False) -> OrchResult[str]: """ :param osd_ids: list of OSD IDs :param replace: marks the OSD as being destroyed. See :ref:`orchestrator-osd-replace` + :param replace_block: marks the corresponding block device as being replaced. 
+ :param replace_db: marks the corresponding db device as being replaced. + :param replace_wal: marks the corresponding wal device as being replaced. :param force: Forces the OSD removal process without waiting for the data to be drained first. :param zap: Zap/Erase all devices associated with the OSDs (DESTROYS DATA) :param no_destroy: Do not destroy associated VGs/LVs with the OSD. diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index be0096bb2d96e..7dd8c95af52c7 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -818,6 +818,21 @@ def _host_rescan(self, hostname: str, with_summary: bool = False) -> HandleComma return HandleCommandResult(stdout=completion.result_str()) return HandleCommandResult(stdout=completion.result_str().split('.')[0]) + @_cli_read_command('orch device replace') + def _replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> HandleCommandResult: + """Perform all required operations in order to replace a device. + """ + completion = self.replace_device(hostname=hostname, + device=device, + clear=clear, + yes_i_really_mean_it=yes_i_really_mean_it) + raise_if_exception(completion) + return HandleCommandResult(stdout=completion.result_str()) + @_cli_read_command('orch device ls') def _list_devices(self, hostname: Optional[List[str]] = None, @@ -1415,8 +1430,9 @@ def _osd_rm_start(self, zap: bool = False, no_destroy: bool = False) -> HandleCommandResult: """Remove OSD daemons""" - completion = self.remove_osds(osd_id, replace=replace, force=force, - zap=zap, no_destroy=no_destroy) + completion = self.remove_osds(osd_id, + replace=replace, + force=force, zap=zap, no_destroy=no_destroy) raise_if_exception(completion) return HandleCommandResult(stdout=completion.result_str()) diff --git a/src/python-common/ceph/deployment/drive_selection/selector.py b/src/python-common/ceph/deployment/drive_selection/selector.py index 041f1ed30446f..59ebbb6347e43 100644 --- a/src/python-common/ceph/deployment/drive_selection/selector.py +++ b/src/python-common/ceph/deployment/drive_selection/selector.py @@ -131,6 +131,10 @@ def assign_devices(self, device_filter): for disk in self.disks: logger.debug("Processing disk {}".format(disk.path)) + if disk.being_replaced: + logger.debug('Ignoring disk {} as it is being replaced.'.format(disk.path)) + continue + if not disk.available and not disk.ceph_device: logger.debug( ("Ignoring disk {}. 
" diff --git a/src/python-common/ceph/deployment/inventory.py b/src/python-common/ceph/deployment/inventory.py index a3023882108e3..e2c1a5605f9a6 100644 --- a/src/python-common/ceph/deployment/inventory.py +++ b/src/python-common/ceph/deployment/inventory.py @@ -54,7 +54,8 @@ class Device(object): 'human_readable_type', 'device_id', 'lsm_data', - 'crush_device_class' + 'crush_device_class', + 'being_replaced' ] def __init__(self, @@ -67,7 +68,8 @@ def __init__(self, lsm_data=None, # type: Optional[Dict[str, Dict[str, str]]] created=None, # type: Optional[datetime.datetime] ceph_device=None, # type: Optional[bool] - crush_device_class=None # type: Optional[str] + crush_device_class=None, # type: Optional[str] + being_replaced=None, # type: Optional[bool] ): self.path = path @@ -80,6 +82,7 @@ def __init__(self, self.created = created if created is not None else datetime_now() self.ceph_device = ceph_device self.crush_device_class = crush_device_class + self.being_replaced = being_replaced def __eq__(self, other): # type: (Any) -> bool @@ -129,7 +132,8 @@ def __repr__(self) -> str: 'lvs': self.lvs if self.lvs else 'None', 'available': str(self.available), 'ceph_device': str(self.ceph_device), - 'crush_device_class': str(self.crush_device_class) + 'crush_device_class': str(self.crush_device_class), + 'being_replaced': str(self.being_replaced) } if not self.available and self.rejected_reasons: device_desc['rejection reasons'] = self.rejected_reasons From 3c9b07eb87e67027e9988c1587c07e27ed168657 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Miguel=20Olmo=20Mart=C3=ADnez?= Date: Mon, 7 Oct 2024 16:55:51 +0200 Subject: [PATCH 144/148] exporter: New metric for report ceph daemons health MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ceph exporter provide metrics to report ceph daemons communication health using the admin socket Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2146728 https://tracker.ceph.com/issues/68428 Signed-off-by: Juan Miguel Olmo Martínez --- doc/monitoring/index.rst | 24 ++++++ src/exporter/DaemonMetricCollector.cc | 9 ++- src/exporter/DaemonMetricCollector.h | 2 +- src/test/exporter/test_exporter.cc | 110 +++++++++++++++++++++++++- 4 files changed, 141 insertions(+), 4 deletions(-) diff --git a/doc/monitoring/index.rst b/doc/monitoring/index.rst index 794fdf8419505..afccd9ab16ac3 100644 --- a/doc/monitoring/index.rst +++ b/doc/monitoring/index.rst @@ -64,6 +64,30 @@ in: It is good to outline that the main tool allowing users to observe and monitor a Ceph cluster is the **Ceph dashboard**. It provides graphics where the most important cluster and service metrics are represented. Most of the examples in this document are extracted from the dashboard graphics or extrapolated from the metrics exposed by the Ceph dashboard. +Ceph daemon health metrics +========================== + +The Ceph exporter provides a metric called ``ceph_daemon_socket_up`` that reports the liveness status of each Ceph daemon that exposes an admin socket. + +The ``ceph_daemon_socket_up`` metric indicates the health status of a Ceph daemon based on its ability to respond via the admin socket, where a value of ``1`` means healthy, and ``0`` means unhealthy. Although a Ceph daemon might still be "alive" when it reports ``ceph_daemon_socket_up=0``, this situation highlights a significant issue in its functionality. As such, this metric serves as an excellent tool for detecting problems in any of the main Ceph daemons. 
+ +Labels: +- **``ceph_daemon``**: Identifier of the Ceph daemon exposing an admin socket on the host. +- **``hostname``**: Name of the host where the Ceph daemon is running. + +Example: + +.. code-block:: bash + + ceph_daemon_socket_up{ceph_daemon="mds.a",hostname="testhost"} 1 + ceph_daemon_socket_up{ceph_daemon="osd.1",hostname="testhost"} 0 + +To identify any Ceph daemons that were not responsive at any point in the last 12 hours, you can use the following PromQL expression: + +.. code-block:: bash + + ceph_daemon_socket_up == 0 or min_over_time(ceph_daemon_socket_up[12h]) == 0 + Performance metrics =================== diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc index d4930ea35c0d2..4b8a8131bcfd3 100644 --- a/src/exporter/DaemonMetricCollector.cc +++ b/src/exporter/DaemonMetricCollector.cc @@ -168,10 +168,17 @@ void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter if (sockClientsPing) { bool ok; sock_client.ping(&ok); + std::string ceph_daemon_socket_up_desc( + "Reports the health status of a Ceph daemon, as determined by whether it is able to respond via its admin socket (1 = healthy, 0 = unhealthy)."); + labels_t ceph_daemon_socket_up_labels; + ceph_daemon_socket_up_labels["hostname"] = quote(ceph_get_hostname()); + ceph_daemon_socket_up_labels["ceph_daemon"] = quote(daemon_name); + add_metric(builder, static_cast(ok), "ceph_daemon_socket_up", ceph_daemon_socket_up_desc, + "gauge", ceph_daemon_socket_up_labels); if (!ok) { failures++; continue; - } + } } std::string counter_dump_response = dump_response.size() > 0 ? dump_response : asok_request(sock_client, "counter dump", daemon_name); diff --git a/src/exporter/DaemonMetricCollector.h b/src/exporter/DaemonMetricCollector.h index d2e929b4d670f..3302e95df916c 100644 --- a/src/exporter/DaemonMetricCollector.h +++ b/src/exporter/DaemonMetricCollector.h @@ -42,11 +42,11 @@ class DaemonMetricCollector { std::map clients; std::string metrics; std::pair add_fixed_name_metrics(std::string metric_name); + void update_sockets(); private: std::mutex metrics_mutex; std::unique_ptr builder; - void update_sockets(); void request_loop(boost::asio::steady_timer &timer); void dump_asok_metric(boost::json::object perf_info, diff --git a/src/test/exporter/test_exporter.cc b/src/test/exporter/test_exporter.cc index 907884fe35d60..e24773886bcb3 100644 --- a/src/test/exporter/test_exporter.cc +++ b/src/test/exporter/test_exporter.cc @@ -1,6 +1,8 @@ #include "common/ceph_argparse.h" #include "common/config.h" #include "common/config_proxy.h" +#include "common/admin_socket.h" +#include "common/admin_socket_client.h" #include #include "gtest/gtest.h" #include "common/ceph_context.h" @@ -8,6 +10,7 @@ #include "global/global_init.h" #include "exporter/util.h" #include "exporter/DaemonMetricCollector.h" +#include #include #include @@ -674,6 +677,27 @@ static std::vector> promethize_data = { {"rocksdb.submit_sync_latency_sum", "ceph_rocksdb_submit_sync_latency_sum"} }; + +class AdminSocketTest +{ +public: + explicit AdminSocketTest(AdminSocket *asokc) + : m_asokc(asokc) + { + } + bool init(const std::string &uri) { + return m_asokc->init(uri); + } + std::string bind_and_listen(const std::string &sock_path, int *fd) { + return m_asokc->bind_and_listen(sock_path, fd); + } + bool shutdown() { + m_asokc->shutdown(); + return true; + } + AdminSocket *m_asokc; +}; + int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); @@ -1289,8 +1313,11 @@ 
ceph_mon_session_rm{ceph_daemon="mon.a"} 577 # TYPE ceph_mon_session_trim counter ceph_mon_session_trim{ceph_daemon="mon.a"} 9 )"; - - ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos); + + std::string actualMetrics = collector.metrics; + std::cout << "Actual MON Metrics: " << actualMetrics << std::endl; + ASSERT_TRUE(actualMetrics.find(expectedMetrics) != std::string::npos); + //ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos); // Test for labeled metrics - RGW daemon = "ceph-client.rgw.foo.ceph-node-00.aayrrj.2.93993527376064"; @@ -1452,3 +1479,82 @@ TEST(Exporter, add_fixed_name_metrics) { EXPECT_EQ(new_metric.first, expected_labels); ASSERT_TRUE(new_metric.second == expected_metric_name); } + +TEST(Exporter, UpdateSockets) { + const std::string mock_dir = "/tmp/fake_sock_dir"; + + // Create the mock directory + std::filesystem::create_directories(mock_dir); + + // Create a mix of vstart and real cluster mock .asok files + std::ofstream(mock_dir + "/ceph-osd.0.asok").close(); + std::ofstream(mock_dir + "/ceph-mds.a.asok").close(); + std::ofstream(mock_dir + "/ceph-mgr.chatest-node-00.ijzynn.asok").close(); + std::ofstream(mock_dir + "/ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952.asok").close(); + std::ofstream(mock_dir + "/ceph-client.ceph-exporter.chatest-node-00.asok").close(); + std::ofstream(mock_dir + "/ceph-mon.chatest-node-00.asok").close(); + + g_conf().set_val("exporter_sock_dir", mock_dir); + + DaemonMetricCollector collector; + + // Run the function that interacts with the mock directory + collector.update_sockets(); + + // Verify the expected results + ASSERT_EQ(collector.clients.size(), 4); + ASSERT_TRUE(collector.clients.find("ceph-osd.0") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mds.a") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mon.chatest-node-00") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952") != collector.clients.end()); + + + // Remove the mock directory and files + std::filesystem::remove_all(mock_dir); +} + + +TEST(Exporter, HealthMetrics) { + std::map clients; + DaemonMetricCollector &collector = collector_instance(); + std::string daemon = "test_daemon"; + std::string expectedCounterDump = ""; + std::string expectedCounterSchema = ""; + std::string metricName = "ceph_daemon_socket_up"; + + // Fake admin socket + std::string asok_path = "/tmp/" + daemon + ".asok"; + std::unique_ptr asokc = std::make_unique(g_ceph_context); + AdminSocketClient client(asok_path); + + // Add the daemon clients to the collector + clients.insert({daemon, std::move(client)}); + collector.clients = clients; + + auto verifyMetricValue = [&](const std::string &metricValue, bool shouldInitializeSocket) { + collector.metrics = ""; + + if (shouldInitializeSocket) { + AdminSocketTest asoct(asokc.get()); + ASSERT_TRUE(asoct.init(asok_path)); + } + + collector.dump_asok_metrics(true, 5, true, expectedCounterDump, expectedCounterSchema, false); + + if (shouldInitializeSocket) { + AdminSocketTest asoct(asokc.get()); + ASSERT_TRUE(asoct.shutdown()); + } + + std::string retrievedMetrics = collector.metrics; + std::string pattern = metricName + R"(\{[^}]*ceph_daemon=\")" + daemon + R"(\"[^}]*\}\s+)" + metricValue + R"(\b)"; + std::regex regexPattern(pattern); + ASSERT_TRUE(std::regex_search(retrievedMetrics, regexPattern)); + }; + + // Test an admin socket not answering: metric value should 
be "0" + verifyMetricValue("0", false); + + // Test an admin socket answering: metric value should be "1" + verifyMetricValue("1", true); +} From 82b6a1c5786958bb443d92ee798dd3741f07fdf9 Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Wed, 16 Oct 2024 13:29:34 +0530 Subject: [PATCH 145/148] mgr/smb: rm all `smb dump` commands Fixes: https://tracker.ceph.com/issues/68545 Signed-off-by: Avan Thakkar --- src/pybind/mgr/smb/module.py | 41 +------------ src/pybind/mgr/smb/tests/test_smb.py | 89 ---------------------------- 2 files changed, 1 insertion(+), 129 deletions(-) diff --git a/src/pybind/mgr/smb/module.py b/src/pybind/mgr/smb/module.py index 77a08229cf017..4512ad6add336 100644 --- a/src/pybind/mgr/smb/module.py +++ b/src/pybind/mgr/smb/module.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from typing import TYPE_CHECKING, Any, List, Optional, cast import logging @@ -350,45 +350,6 @@ def show(self, resource_names: Optional[List[str]] = None) -> Simplified: return resources[0].to_simplified() return {'resources': [r.to_simplified() for r in resources]} - @cli.SMBCommand('dump cluster-config', perm='r') - def dump_config(self, cluster_id: str) -> Dict[str, Any]: - """DEBUG: Generate an example configuration""" - # TODO: Remove this command prior to release - return self._handler.generate_config(cluster_id) - - @cli.SMBCommand('dump service-spec', perm='r') - def dump_service_spec(self, cluster_id: str) -> Dict[str, Any]: - """DEBUG: Generate an example smb service spec""" - # TODO: Remove this command prior to release - return dict( - self._handler.generate_smb_service_spec(cluster_id).to_json() - ) - - @cli.SMBCommand('dump everything', perm='r') - def dump_everything(self) -> Dict[str, Any]: - """DEBUG: Show me everything""" - # TODO: Remove this command prior to release - everything: Dict[str, Any] = {} - everything['PUBLIC'] = {} - log.warning('dumping PUBLIC') - for key in self._public_store: - e = self._public_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['PUBLIC'][e.uri] = e.get() - log.warning('dumping PRIV') - everything['PRIV'] = {} - for key in self._priv_store: - e = self._priv_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['PRIV'][e.uri] = e.get() - log.warning('dumping INTERNAL') - everything['INTERNAL'] = {} - for key in self._internal_store: - e = self._internal_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['INTERNAL'][e.uri] = e.get() - return everything - def submit_smb_spec(self, spec: SMBSpec) -> None: """Submit a new or updated smb spec object to ceph orchestration.""" completion = self.apply_smb(spec) diff --git a/src/pybind/mgr/smb/tests/test_smb.py b/src/pybind/mgr/smb/tests/test_smb.py index c9fd02968b904..0d3610326c225 100644 --- a/src/pybind/mgr/smb/tests/test_smb.py +++ b/src/pybind/mgr/smb/tests/test_smb.py @@ -410,72 +410,6 @@ def test_cmd_apply_share(tmodule): assert bdata["results"][0]["state"] == "created" -def test_share_dump_config(tmodule): - _example_cfg_1(tmodule) - - cfg = tmodule.dump_config('foo') - assert cfg == { - 'samba-container-config': "v0", - 'configs': { - 'foo': { - 'instance_name': 'foo', - 'instance_features': [], - 'shares': ['Ess One', 'Ess Two'], - 'globals': ['default', 'foo'], - }, - }, - 'shares': { - 'Ess One': { - 'options': { - 'path': '/', - 'read only': 'No', - 'browseable': 'Yes', - 'kernel share modes': 'no', - 'x:ceph:id': 'foo.s1', - 'vfs objects': 'acl_xattr ceph_new', - 
'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph_new:config_file': '/etc/ceph/ceph.conf', - 'ceph_new:filesystem': 'cephfs', - 'ceph_new:user_id': 'smb.fs.cluster.foo', - }, - }, - 'Ess Two': { - 'options': { - 'path': '/two', - 'read only': 'No', - 'browseable': 'Yes', - 'kernel share modes': 'no', - 'x:ceph:id': 'foo.stwo', - 'vfs objects': 'acl_xattr ceph_new', - 'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph_new:config_file': '/etc/ceph/ceph.conf', - 'ceph_new:filesystem': 'cephfs', - 'ceph_new:user_id': 'smb.fs.cluster.foo', - }, - }, - }, - 'globals': { - 'default': { - 'options': { - 'load printers': 'No', - 'printing': 'bsd', - 'printcap name': '/dev/null', - 'disable spoolss': 'Yes', - }, - }, - 'foo': { - 'options': { - 'idmap config * : backend': 'autorid', - 'idmap config * : range': '2000-9999999', - 'realm': 'dom1.example.com', - 'security': 'ads', - 'workgroup': 'DOM1', - }, - }, - }, - } - - def test_cluster_create_ad1(tmodule): _example_cfg_1(tmodule) @@ -613,29 +547,6 @@ def test_cluster_rm(tmodule): assert result.success -def test_dump_service_spec(tmodule): - _example_cfg_1(tmodule) - tmodule._public_store.overwrite( - { - 'foo.config.smb': '', - } - ) - tmodule._priv_store.overwrite( - { - 'foo.join.2b9902c05d08bcba.json': '', - 'foo.join.08129d4d3b8c37c7.json': '', - } - ) - - cfg = tmodule.dump_service_spec('foo') - assert cfg - assert cfg['service_id'] == 'foo' - assert cfg['spec']['cluster_id'] == 'foo' - assert cfg['spec']['features'] == ['domain'] - assert cfg['spec']['config_uri'] == 'mem:foo/config.smb' - assert len(cfg['spec']['join_sources']) == 2 - - def test_cmd_show_resource_json(tmodule): _example_cfg_1(tmodule) From f7a379fe9bc4d57f23c1f5c00807bf3dfa2851d3 Mon Sep 17 00:00:00 2001 From: Shweta Bhosale Date: Wed, 9 Oct 2024 14:53:30 +0530 Subject: [PATCH 146/148] cephadm: Added new cephadm command to list all the default images Fixes: https://tracker.ceph.com/issues/68438 Signed-off-by: Shweta Bhosale --- src/cephadm/cephadm.py | 12 ++++++ src/cephadm/cephadmlib/container_types.py | 50 ++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 1ab98a0ac4f1e..5520ff52bd5a4 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -29,6 +29,7 @@ from io import StringIO from threading import Thread, Event from pathlib import Path +from configparser import ConfigParser from cephadmlib.constants import ( # default images @@ -142,6 +143,7 @@ SidecarContainer, extract_uid_gid, is_container_running, + get_mgr_images, ) from cephadmlib.decorators import ( deprecated_command, @@ -4679,6 +4681,13 @@ def probe_hba(scan_path: str) -> None: return f'Ok. 
{len(all_scan_files)} adapters detected: {len(scan_files)} rescanned, {len(skipped)} skipped, {len(failures)} failed ({elapsed:.2f}s)'
 
 
+def command_list_images(ctx: CephadmContext) -> None:
+    """List the default container images used by the various services"""
+    cp_obj = ConfigParser()
+    cp_obj['mgr'] = get_mgr_images()
+    # print default images
+    cp_obj.write(sys.stdout)
+
 ##################################
 
@@ -5542,6 +5551,9 @@ def _get_parser():
         'disk-rescan', help='rescan all HBAs to detect new/removed devices')
     parser_disk_rescan.set_defaults(func=command_rescan_disks)
 
+    parser_list_images = subparsers.add_parser(
+        'list-images', help='list all the default images')
+    parser_list_images.set_defaults(func=command_list_images)
     return parser
 
diff --git a/src/cephadm/cephadmlib/container_types.py b/src/cephadm/cephadmlib/container_types.py
index 665c4d89652a6..791a545538a3c 100644
--- a/src/cephadm/cephadmlib/container_types.py
+++ b/src/cephadm/cephadmlib/container_types.py
@@ -8,7 +8,28 @@
 from typing import Dict, List, Optional, Any, Union, Tuple, Iterable, cast
 
 from .call_wrappers import call, call_throws, CallVerbosity
-from .constants import DEFAULT_TIMEOUT
+from .constants import (
+    DEFAULT_TIMEOUT,
+    # default container images
+    DEFAULT_ALERT_MANAGER_IMAGE,
+    DEFAULT_GRAFANA_IMAGE,
+    DEFAULT_LOKI_IMAGE,
+    DEFAULT_NODE_EXPORTER_IMAGE,
+    DEFAULT_PROMETHEUS_IMAGE,
+    DEFAULT_PROMTAIL_IMAGE,
+    DEFAULT_HAPROXY_IMAGE,
+    DEFAULT_KEEPALIVED_IMAGE,
+    DEFAULT_NVMEOF_IMAGE,
+    DEFAULT_SNMP_GATEWAY_IMAGE,
+    DEFAULT_ELASTICSEARCH_IMAGE,
+    DEFAULT_JAEGER_COLLECTOR_IMAGE,
+    DEFAULT_JAEGER_AGENT_IMAGE,
+    DEFAULT_JAEGER_QUERY_IMAGE,
+    DEFAULT_SMB_IMAGE,
+    DEFAULT_SMBMETRICS_IMAGE,
+    DEFAULT_NGINX_IMAGE,
+    DEFAULT_OAUTH2_PROXY_IMAGE,
+)
 from .container_engines import Docker, Podman
 from .context import CephadmContext
 from .daemon_identity import DaemonIdentity, DaemonSubIdentity
@@ -660,3 +681,30 @@ def enable_shared_namespaces(
         cc = f'container:{name}'
         for n in ns:
             _replace_container_arg(args, n.to_option(cc))
+
+
+def get_mgr_images() -> dict:
+    """Return dict of default mgr images"""
+    mgr_prefix = 'mgr/cephadm/container_image_'
+    mgr_images = {}
+    mgr_images[mgr_prefix + 'prometheus'] = DEFAULT_PROMETHEUS_IMAGE
+    mgr_images[mgr_prefix + 'alertmanager'] = DEFAULT_ALERT_MANAGER_IMAGE
+    mgr_images[mgr_prefix + 'grafana'] = DEFAULT_GRAFANA_IMAGE
+    mgr_images[mgr_prefix + 'loki'] = DEFAULT_LOKI_IMAGE
+    mgr_images[mgr_prefix + 'promtail'] = DEFAULT_PROMTAIL_IMAGE
+    mgr_images[mgr_prefix + 'node_exporter'] = DEFAULT_NODE_EXPORTER_IMAGE
+    mgr_images[mgr_prefix + 'haproxy'] = DEFAULT_HAPROXY_IMAGE
+    mgr_images[mgr_prefix + 'keepalived'] = DEFAULT_KEEPALIVED_IMAGE
+    mgr_images[mgr_prefix + 'nvmeof'] = DEFAULT_NVMEOF_IMAGE
+    mgr_images[mgr_prefix + 'snmp_gateway'] = DEFAULT_SNMP_GATEWAY_IMAGE
+    mgr_images[mgr_prefix + 'elasticsearch'] = DEFAULT_ELASTICSEARCH_IMAGE
+    mgr_images[
+        mgr_prefix + 'jaeger_collector'
+    ] = DEFAULT_JAEGER_COLLECTOR_IMAGE
+    mgr_images[mgr_prefix + 'jaeger_agent'] = DEFAULT_JAEGER_AGENT_IMAGE
+    mgr_images[mgr_prefix + 'jaeger_query'] = DEFAULT_JAEGER_QUERY_IMAGE
+    mgr_images[mgr_prefix + 'smb'] = DEFAULT_SMB_IMAGE
+    mgr_images[mgr_prefix + 'smbmetrics'] = DEFAULT_SMBMETRICS_IMAGE
+    mgr_images[mgr_prefix + 'nginx'] = DEFAULT_NGINX_IMAGE
+    mgr_images[mgr_prefix + 'oauth2_proxy'] = DEFAULT_OAUTH2_PROXY_IMAGE
+    return mgr_images

From 87612f499f86c9864c3bf6371cdd46954176e5ab Mon Sep 17 00:00:00 2001
From: Pedro Gonzalez Gomez
Date: Mon, 7 Oct 2024 21:22:20 +0200
Subject: [PATCH
147/148] mgr/dashboard: fix lifecycle issues Fixes: https://tracker.ceph.com/issues/68434 Signed-off-by: Pedro Gonzalez Gomez --- .../rgw-bucket-details.component.html | 10 +++++-- .../frontend/src/app/ceph/rgw/rgw.module.ts | 4 ++- .../frontend/src/app/shared/pipes/xml.pipe.ts | 8 ++++-- .../shared/services/json-to-xml.service.ts | 20 ++++++++++---- .../frontend/src/styles/_carbon-defaults.scss | 7 +++++ .../mgr/dashboard/services/rgw_client.py | 27 +++++++++++++++---- 6 files changed, 61 insertions(+), 15 deletions(-) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html index ddc202152b9f4..463eac88b1e99 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html @@ -158,8 +158,14 @@
-
{{selection.lifecycle | json}}
-
{{ (selection.lifecycle | xml) || '-'}}
+ + {{selection.lifecycle | json}} + + + {{ (selection.lifecycle | xml:{'Rules':'Rule'}) || '-'}} + diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts index 3439562c8e223..5f8c6f50135c2 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts @@ -70,7 +70,8 @@ import { IconModule, LoadingModule, ModalModule, - ProgressIndicatorModule + ProgressIndicatorModule, + CodeSnippetModule } from 'carbon-components-angular'; import { CephSharedModule } from '../shared/ceph-shared.module'; @@ -94,6 +95,7 @@ import { CephSharedModule } from '../shared/ceph-shared.module'; ModalModule, GridModule, ProgressIndicatorModule, + CodeSnippetModule, ButtonModule, LoadingModule, IconModule, diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts index 59d7572e9f004..45cca684dab01 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts @@ -7,9 +7,13 @@ import { JsonToXmlService } from '../services/json-to-xml.service'; export class XmlPipe implements PipeTransform { constructor(private jsonToXmlService: JsonToXmlService) {} - transform(value: string, valueFormat: string = 'json'): string { + transform( + value: string, + replaceKey: Record = {}, + valueFormat: string = 'json' + ): string { if (valueFormat === 'json') { - value = this.jsonToXmlService.format(value); + value = this.jsonToXmlService.format(value, replaceKey); } return value; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts index 8f1d128c0c59c..e9d30f9b7f2f4 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts @@ -6,29 +6,39 @@ import { Injectable } from '@angular/core'; export class JsonToXmlService { constructor() {} - format(json: any, indentSize: number = 2, currentIndent: number = 0): string { + format( + json: any, + replaceKey: Record = null, + indentSize: number = 2, + currentIndent: number = 0 + ): string { if (!json) return null; let xml = ''; if (typeof json === 'string') { json = JSON.parse(json); } - for (const key in json) { + for (let key in json) { if (json.hasOwnProperty(key)) { const value = json[key]; const indentation = ' '.repeat(currentIndent); - + if (replaceKey) { + const [oldKey, newKey] = Object.entries(replaceKey)[0]; + if (key === oldKey) { + key = newKey; + } + } if (Array.isArray(value)) { value.forEach((item) => { xml += `${indentation}<${key}>\n` + - this.format(item, indentSize, currentIndent + indentSize) + + this.format(item, replaceKey, indentSize, currentIndent + indentSize) + `${indentation}\n`; }); } else if (typeof value === 'object') { xml += `${indentation}<${key}>\n` + - this.format(value, indentSize, currentIndent + indentSize) + + this.format(value, replaceKey, indentSize, currentIndent + indentSize) + `${indentation}\n`; } else { xml += `${indentation}<${key}>${value}\n`; diff --git a/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss b/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss index 1d12facaf6a2f..61ca421101e6d 100644 --- 
a/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss +++ b/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss @@ -142,3 +142,10 @@ Dashboard page cd-dashboard { font-size: 12px; } + +/****************************************** +Code snippet +******************************************/ +.cds--snippet { + width: fit-content; +} diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py index 2441b73b361be..340e894d23ae1 100755 --- a/src/pybind/mgr/dashboard/services/rgw_client.py +++ b/src/pybind/mgr/dashboard/services/rgw_client.py @@ -10,6 +10,7 @@ import time import uuid import xml.etree.ElementTree as ET # noqa: N814 +from collections import defaultdict from enum import Enum from subprocess import SubprocessError from urllib.parse import urlparse @@ -700,12 +701,28 @@ def set_tags(self, bucket_name, tags, request=None): raise DashboardException(msg=str(e), component='rgw') return result + @staticmethod + def _handle_rules(pairs): + result = defaultdict(list) + for key, value in pairs: + if key == 'Rule': + result['Rules'].append(value) + else: + result[key] = value + return result + @RestClient.api_get('/{bucket_name}?lifecycle') def get_lifecycle(self, bucket_name, request=None): # pylint: disable=unused-argument try: - result = request() # type: ignore - result = {'LifecycleConfiguration': result} + decoded_request = request(raw_content=True).decode("utf-8") # type: ignore + result = { + 'LifecycleConfiguration': + json.loads( + decoded_request, + object_pairs_hook=RgwClient._handle_rules + ) + } except RequestException as e: if e.content: content = json_str_to_object(e.content) @@ -757,15 +774,15 @@ def set_lifecycle(self, bucket_name, lifecycle, request=None): lifecycle = RgwClient.dict_to_xml(lifecycle) try: if lifecycle and '' not in str(lifecycle): - lifecycle = f'{lifecycle}' + lifecycle = f'\n{lifecycle}\n' result = request(data=lifecycle) # type: ignore except RequestException as e: + msg = '' if e.content: content = json_str_to_object(e.content) if content.get("Code") == "MalformedXML": msg = "Invalid Lifecycle document" - raise DashboardException(msg=msg, component='rgw') - raise DashboardException(msg=str(e), component='rgw') + raise DashboardException(msg=msg or str(e), component='rgw') return result @RestClient.api_delete('/{bucket_name}?lifecycle') From 4b2ba587b7d8090523fc8eddb31893c4ee9c87af Mon Sep 17 00:00:00 2001 From: Teoman ONAY Date: Mon, 17 Jun 2024 13:16:48 +0200 Subject: [PATCH 148/148] mgmt-gateway: add e2e testing Add mgmt-gateway teuthology test scenarios Signed-off-by: Teoman ONAY --- .../workunits/task/test_mgmt_gateway.yaml | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml diff --git a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml new file mode 100644 index 0000000000000..5207fd415b7e6 --- /dev/null +++ b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml @@ -0,0 +1,77 @@ +overrides: + ceph: + log-ignorelist: + - CEPHADM_FAILED_DAEMON + log-only-match: + - CEPHADM_ +roles: +- - host.a + - mon.a + - mgr.a + - osd.0 +- - host.b + - mon.b + - mgr.b + - osd.1 +- - host.c + - mon.c + - osd.2 +tasks: +- install: +- cephadm: +- cephadm.shell: + host.c: + - | + set -ex + # Deploy monitoring stack + ceph orch apply node-exporter + ceph orch apply grafana + ceph orch apply alertmanager + ceph orch apply 
prometheus + sleep 240 + # generate SSL certificate + openssl req -x509 -newkey rsa:4096 -keyout /tmp/key.pem -out /tmp/cert.pem -sha256 -days 30 -nodes -subj "/CN=*" + # Generate a mgmt.spec template + cat << EOT > /tmp/mgmt.spec + service_type: mgmt-gateway + service_id: foo + placement: + hosts: + - ${HOSTNAME} + spec: + ssl_protocols: + - TLSv1.2 + - TLSv1.3 + ssl_ciphers: + - AES128-SHA + - AES256-SHA + enable_health_check_endpoint: True + EOT + # Add generated certificates to spec file + echo " ssl_certificate: |" >> /tmp/mgmt.spec + while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/cert.pem >> /tmp/mgmt.spec + echo " ssl_certificate_key: |" >> /tmp/mgmt.spec + while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/key.pem >> /tmp/mgmt.spec + # Apply spec + ceph orch apply -i /tmp/mgmt.spec +- cephadm.wait_for_service: + service: mgmt-gateway +- cephadm.shell: + host.a: + - | + set -ex + # retrieve mgmt hostname and ip + MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname') + MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr') + # check mgmt-gateway health + curl -k -s https://${MGMT_GTW_IP}/health + curl -k -s https://${MGMT_GTW_IP}:29443/health + # wait for background services to be reconfigured following mgmt-gateway installation + sleep 180 + # check grafana endpoints are responsive and database health is okay + curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"' + # check prometheus endpoints are responsive + curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"' + # check alertmanager endpoints are responsive + curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status +
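For ad-hoc verification outside teuthology, the curl-based probes in the task above can be approximated with a short Python sketch. This is only a rough equivalent under stated assumptions: the gateway address is a placeholder (the task resolves it from `ceph orch host ls`), the admin:admin credentials are copied from the scenario, and the `requests` library is assumed to be available; none of this is part of the patch itself.

    # Minimal sketch: mirror the mgmt-gateway health probes from the task above.
    import requests
    import urllib3

    urllib3.disable_warnings()  # the scenario uses a self-signed certificate (curl -k)

    MGMT_GTW_IP = "10.0.0.1"  # placeholder; use the address of the mgmt-gateway host

    def probe(path: str, auth=None) -> int:
        # verify=False mirrors `curl -k`; any 2xx answer means the endpoint responded
        resp = requests.get(f"https://{MGMT_GTW_IP}{path}", auth=auth, verify=False, timeout=10)
        resp.raise_for_status()
        return resp.status_code

    probe("/health")
    probe("/grafana/api/health")
    probe("/prometheus/api/v1/status/config", auth=("admin", "admin"))
    probe("/alertmanager/api/v2/status", auth=("admin", "admin"))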