From 6ae002460b8474abe7ebc3592605b88f53253387 Mon Sep 17 00:00:00 2001 From: neeraj pratap singh Date: Wed, 26 Jun 2024 22:29:16 +0530 Subject: [PATCH 001/148] mgr/vol : shortening the name of function Fixes: https://tracker.ceph.com/issues/66815 Introduced by:https://github.com/ceph/ceph/pull/55838#discussion_r1573655512 Signed-off-by: Neeraj Pratap Singh --- .../volumes/fs/operations/versions/metadata_manager.py | 2 +- .../mgr/volumes/fs/operations/versions/subvolume_v1.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py index 610a61e6a4c1f..146d6d3f453d6 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py @@ -172,7 +172,7 @@ def list_all_options_from_section(self, section): metadata_dict[option] = self.config.get(section,option) return metadata_dict - def list_all_keys_with_specified_values_from_section(self, section, value): + def filter_keys(self, section, value): keys = [] if self.config.has_section(section): options = self.config.options(section) diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py index 90f35a4c90b39..f037d5d2a1bbd 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py @@ -752,7 +752,7 @@ def get_pending_clones(self, snapname): try: if self.has_pending_clones(snapname): - pending_track_id_list = self.metadata_mgr.list_all_keys_with_specified_values_from_section('clone snaps', snapname) + pending_track_id_list = self.metadata_mgr.filter_keys('clone snaps', snapname) else: return pending_clones_info except MetadataMgrException as me: @@ -774,9 +774,9 @@ def get_pending_clones(self, snapname): raise VolumeException(-e.args[0], e.args[1]) else: try: - # If clone is completed between 'list_all_keys_with_specified_values_from_section' - # and readlink(track_id_path) call then readlink will fail with error ENOENT (2) - # Hence we double check whether track_id is exist in .meta file or not. + # If clone is completed between 'filter_keys' and readlink(track_id_path) call + # then readlink will fail with error ENOENT (2). Hence we double check whether + # track_id exists in .meta file or not. # Edge case scenario. # If track_id for clone exist but path /volumes/_index/clone/{track_id} not found # then clone is orphan. From 9ae2c89511be4b64c17974cd0fc6770641f9af4d Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 9 Jul 2024 18:49:52 +0530 Subject: [PATCH 002/148] qa/cephfs: ignore when specific OSD is reported down during upgrade We already ignore health warning regarding OSD being down during upgrade but health warning regarding specific OSD being down is not added to the ignorelist which causes upgrade jobs to be marked as failed even though they were successful. 
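For illustration only, the per-OSD variant of the warning that the existing OSD_DOWN entry does not match typically looks something like this (host name made up here):

    osd.3 (root=default,host=smithi012) is down

hence the extra "osd.*is down" pattern added below.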
Fixes: https://tracker.ceph.com/issues/66877 Signed-off-by: Rishabh Dave --- .../mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml index 713adb9628ab6..96e4353e99c7a 100644 --- a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml +++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml @@ -2,3 +2,4 @@ overrides: ceph: log-ignorelist: - OSD_DOWN + - osd.*is down From 2268923dd9f18a7ba05f5d04ca39b26be4068a67 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 6 Aug 2024 10:47:53 +0200 Subject: [PATCH 003/148] qa/rados/upgrade: ignore PG_DEGRADED Fixes: https://tracker.ceph.com/issues/67182 Signed-off-by: Pere Diaz Bou --- qa/suites/upgrade/reef-x/parallel/0-start.yaml | 2 ++ qa/suites/upgrade/reef-x/parallel/1-tasks.yaml | 1 + 2 files changed, 3 insertions(+) diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml index 3814ea3efdb50..146bd57960dad 100644 --- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml @@ -31,3 +31,5 @@ overrides: conf: osd: osd shutdown pgref assert: true + log-ignorelist: + - PG_DEGRADED diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml index bf3005fad458f..ce4e0cc228bba 100644 --- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml @@ -6,6 +6,7 @@ overrides: - MON_DOWN - out of quorum - PG_AVAILABILITY + - PG_DEGRADED tasks: - install: branch: reef From 12a9aba43caedf6ea2cb897708e0b31d96ee358f Mon Sep 17 00:00:00 2001 From: Adam King Date: Mon, 9 Sep 2024 18:28:45 -0400 Subject: [PATCH 004/148] cephadm: add ability to continue on failure when applying multiple specs Additionally, add the flag that does so when cephadm applies a spec during bootstrap. Bootstrap will continue to completion regardless of whether applying the spec fails, so we might as well try applying all of it while reporting errors we do see back to the user Fixes: https://tracker.ceph.com/issues/65338 Signed-off-by: Adam King --- src/cephadm/cephadm.py | 2 +- src/pybind/mgr/cephadm/module.py | 16 +++++++- src/pybind/mgr/orchestrator/_interface.py | 7 +++- src/pybind/mgr/orchestrator/module.py | 48 +++++++++++++++++++---- 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index e71addf7bfa5a..33cac4a34032a 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -2946,7 +2946,7 @@ def mgr_has_latest_epoch(): mounts = {} mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:ro' try: - out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts) + out = cli(['orch', 'apply', '--continue-on-error', '-i', '/tmp/spec.yml'], extra_mounts=mounts) logger.info(out) except Exception: ctx.error_code = -errno.EINVAL diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 1a9a10862180e..650d9711bd3be 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -3516,7 +3516,12 @@ def _apply_service_spec(self, spec: ServiceSpec) -> str: return "Scheduled %s update..." 
% spec.service_name() @handle_orch_error - def apply(self, specs: Sequence[GenericSpec], no_overwrite: bool = False) -> List[str]: + def apply( + self, + specs: Sequence[GenericSpec], + no_overwrite: bool = False, + continue_on_error: bool = True + ) -> List[str]: results = [] for spec in specs: if no_overwrite: @@ -3528,7 +3533,14 @@ def apply(self, specs: Sequence[GenericSpec], no_overwrite: bool = False) -> Lis results.append('Skipped %s service spec. To change %s spec omit --no-overwrite flag' % (cast(ServiceSpec, spec).service_name(), cast(ServiceSpec, spec).service_name())) continue - results.append(self._apply(spec)) + try: + res = self._apply(spec) + results.append(res) + except Exception as e: + if continue_on_error: + results.append(f'Failed to apply spec for {spec}: {str(e)}') + else: + raise e return results @handle_orch_error diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index c33f38cfdd470..7b602401a7e61 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -574,7 +574,12 @@ def cert_store_get_key( raise NotImplementedError() @handle_orch_error - def apply(self, specs: Sequence["GenericSpec"], no_overwrite: bool = False) -> List[str]: + def apply( + self, + specs: Sequence["GenericSpec"], + no_overwrite: bool = False, + continue_on_error: bool = False + ) -> List[str]: """ Applies any spec """ diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index d0f3286177ce5..7c943b076f4a8 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -1620,12 +1620,14 @@ def apply_misc(self, format: Format = Format.plain, unmanaged: bool = False, no_overwrite: bool = False, + continue_on_error: bool = False, inbuf: Optional[str] = None) -> HandleCommandResult: """Update the size or placement for a service or apply a large yaml spec""" usage = """Usage: ceph orch apply -i [--dry-run] ceph orch apply [--placement=] [--unmanaged] """ + errs: List[str] = [] if inbuf: if service_type or placement or unmanaged: raise OrchestratorValidationError(usage) @@ -1635,7 +1637,14 @@ def apply_misc(self, # None entries in the output. Let's skip them silently. 
content = [o for o in yaml_objs if o is not None] for s in content: - spec = json_to_generic_spec(s) + try: + spec = json_to_generic_spec(s) + except Exception as e: + if continue_on_error: + errs.append(f'Failed to convert {s} from json object: {str(e)}') + continue + else: + raise e # validate the config (we need MgrModule for that) if isinstance(spec, ServiceSpec) and spec.config: @@ -1643,7 +1652,12 @@ def apply_misc(self, try: self.get_foreign_ceph_option('mon', k) except KeyError: - raise SpecValidationError(f'Invalid config option {k} in spec') + err = SpecValidationError(f'Invalid config option {k} in spec') + if continue_on_error: + errs.append(str(err)) + continue + else: + raise err # There is a general "osd" service with no service id, but we use # that to dump osds created individually with "ceph orch daemon add osd" @@ -1658,7 +1672,12 @@ def apply_misc(self, and spec.service_type == 'osd' and not spec.service_id ): - raise SpecValidationError('Please provide the service_id field in your OSD spec') + err = SpecValidationError('Please provide the service_id field in your OSD spec') + if continue_on_error: + errs.append(str(err)) + continue + else: + raise err if dry_run and not isinstance(spec, HostSpec): spec.preview_only = dry_run @@ -1668,15 +1687,30 @@ def apply_misc(self, continue specs.append(spec) else: + # Note in this case there is only ever one spec + # being applied so there is no need to worry about + # handling of continue_on_error placementspec = PlacementSpec.from_string(placement) if not service_type: raise OrchestratorValidationError(usage) specs = [ServiceSpec(service_type.value, placement=placementspec, unmanaged=unmanaged, preview_only=dry_run)] - return self._apply_misc(specs, dry_run, format, no_overwrite) - - def _apply_misc(self, specs: Sequence[GenericSpec], dry_run: bool, format: Format, no_overwrite: bool = False) -> HandleCommandResult: - completion = self.apply(specs, no_overwrite) + cmd_result = self._apply_misc(specs, dry_run, format, no_overwrite, continue_on_error) + if errs: + # HandleCommandResult is a named tuple, so use + # _replace to modify it. + cmd_result = cmd_result._replace(stdout=cmd_result.stdout + '\n' + '\n'.join(errs)) + return cmd_result + + def _apply_misc( + self, + specs: Sequence[GenericSpec], + dry_run: bool, + format: Format, + no_overwrite: bool = False, + continue_on_error: bool = False + ) -> HandleCommandResult: + completion = self.apply(specs, no_overwrite, continue_on_error) raise_if_exception(completion) out = completion.result_str() if dry_run: From e905fedfccbfc70ae42e0cbac9164a1bf918ad01 Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Tue, 10 Sep 2024 12:09:24 +0000 Subject: [PATCH 005/148] osd/PG: make use of SnapMapper::update_snap_map https://github.com/ceph/ceph/pull/58868 introduced SnapMapper::update_snap_map to be used both by Crimson and Classic. No change in behavior. 
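For reference, the per-entry dispatch that the removed block below used to perform (readable from the deleted lines) is roughly:

    is_delete()               -> snap_mapper.remove_oid(soid)
    is_clone()/is_promote()   -> snap_mapper.add_oid(soid, snaps)
    is_modify()               -> snap_mapper.update_snaps(soid, snaps)

and is presumably what SnapMapper::update_snap_map does internally, so PG::update_snap_map now only filters on soid.snap < CEPH_MAXSNAP and forwards the entry.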
Signed-off-by: Matan Breizman --- src/osd/PG.cc | 42 +++--------------------------------------- 1 file changed, 3 insertions(+), 39 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index f7a5033574f76..ee14f650e5336 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1137,46 +1137,10 @@ void PG::update_snap_map( const vector &log_entries, ObjectStore::Transaction &t) { - for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) { + for (const auto& entry : log_entries) { OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); - if (i->soid.snap < CEPH_MAXSNAP) { - if (i->is_delete()) { - int r = snap_mapper.remove_oid( - i->soid, - &_t); - if (r) - derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl; - // On removal tolerate missing key corruption - ceph_assert(r == 0 || r == -ENOENT); - } else if (i->is_update()) { - ceph_assert(i->snaps.length() > 0); - vector snaps; - bufferlist snapbl = i->snaps; - auto p = snapbl.cbegin(); - try { - decode(snaps, p); - } catch (...) { - derr << __func__ << " decode snaps failure on " << *i << dendl; - snaps.clear(); - } - set _snaps(snaps.begin(), snaps.end()); - - if (i->is_clone() || i->is_promote()) { - snap_mapper.add_oid( - i->soid, - _snaps, - &_t); - } else if (i->is_modify()) { - int r = snap_mapper.update_snaps( - i->soid, - _snaps, - 0, - &_t); - ceph_assert(r == 0); - } else { - ceph_assert(i->is_clean()); - } - } + if (entry.soid.snap < CEPH_MAXSNAP) { + snap_mapper.update_snap_map(entry, &_t); } } } From a79e9a4e7aec195c904505ffdfd4851cb9eba532 Mon Sep 17 00:00:00 2001 From: Kevin Zhao Date: Mon, 12 Aug 2024 14:12:58 +0800 Subject: [PATCH 006/148] doc/rgw/uadk: Add UADK document for compressor zlib Signed-off-by: Kevin Zhao --- doc/radosgw/uadk-accel.rst | 131 +++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 doc/radosgw/uadk-accel.rst diff --git a/doc/radosgw/uadk-accel.rst b/doc/radosgw/uadk-accel.rst new file mode 100644 index 0000000000000..748fe6d3b3b57 --- /dev/null +++ b/doc/radosgw/uadk-accel.rst @@ -0,0 +1,131 @@ +=============================================== +UADK Acceleration for Compression +=============================================== + +UADK is a framework for applications to access hardware accelerators in a +unified, secure, and efficient way. UADK is comprised of UACCE, libwd and many +other algorithm libraries. + +See `Compressor UADK Support`_. + + +UADK in the Software Stack +========================== + +UADK is a general-purpose user space accelerator framework that uses shared +virtual addressing (SVA) to provide a unified programming interface for hardware +acceleration of cryptographic and compression algorithms. + +UADK includes Unified/User-space-access-intended Accelerator Framework (UACCE), +which enables hardware accelerators that support SVA to adapt to UADK. + +Currently, HiSilicon Kunpeng hardware accelerators have been registered with +UACCE. Through the UADK framework, users can run cryptographic and compression +algorithms using hardware accelerators instead of CPUs, freeing up CPU computing +power and improving computing performance. + +A user can access the hardware accelerators by performing user-mode operations on +the character devices, or the use of UADK can be done via frameworks that have +been enabled by others including UADK support (for example, OpenSSL* libcrypto*, +DPDK, and the Linux* Kernel Crypto Framework). + +See `OpenSSL UADK Engine`_. 
+ +UADK Environment Setup +====================== +UADK consists of UACCE, vendors’ drivers, and an algorithm layer. UADK requires the +hardware accelerator to support SVA, and the operating system to support IOMMU and +SVA. Hardware accelerators from different vendors are registered as different character +devices with UACCE by using kernel-mode drivers of the vendors. + +:: + + +----------------------------------+ + | apps | + +----+------------------------+----+ + | | + | | + +-------+--------+ +-------+-------+ + | scheduler | | alg libraries | + +-------+--------+ +-------+-------+ + | | + | | + | | + | +--------+------+ + | | vendor drivers| + | +-+-------------+ + | | + | | + +--+------------------+--+ + | libwd | + User +----+-------------+-----+ + -------------------------------------------------- + Kernel +--+-----+ +------+ + | uacce | | smmu | + +---+----+ +------+ + | + +---+------------------+ + | vendor kernel driver | + +----------------------+ + -------------------------------------------------- + +----------------------+ + | HW Accelerators | + +----------------------+ + +Configuration +============= + +#. Kernel Requirement + +User needs to make sure that UACCE is already supported in Linux kernel. The kernel version +should be at least v5.9 with SVA (Shared Virtual Addressing) enabled. + +UACCE may be built as a module or built into the kernel. Here's an example to build UACCE +with hardware accelerators for the HiSilicon Kunpeng platform. + + .. prompt:: bash $ + + CONFIG_IOMMU_SVA_LIB=y + CONFIG_ARM_SMMU=y + CONFIG_ARM_SMMU_V3=y + CONFIG_ARM_SMMU_V3_SVA=y + CONFIG_PCI_PASID=y + CONFIG_UACCE=y + CONFIG_CRYPTO_DEV_HISI_QM=y + CONFIG_CRYPTO_DEV_HISI_ZIP=y + +Make sure all these above kernel configurations are selected. + +#. UADK enablement +If the architecture is aarch64, it will automatically download the UADK source code to build +the static library. If it runs on other architecture, user can enable it with build parameters +`-DWITH_UADK=true` + +#. Manual Build UADK +As the above paragraph shows, the UADK is enabled automatically, no need to build manually. +For developer who is interested in UADK, you can refer to the below steps for building. + + .. prompt:: bash $ + + git clone https://github.com/Linaro/uadk.git + cd uadk + mkdir build + ./autogen.sh + ./configure --prefix=$PWD/build + make + make install + + .. note:: Without –prefix, UADK will be installed to /usr/local/lib by default. + If get error:"cannot find -lnuma", please install the `libnuma-dev` + +#. Configure + + Edit the Ceph configuration file (usually ``ceph.conf``) to enable UADK + support for *zlib* compression:: + + uadk_compressor_enabled=true + + The default value in `global.yaml.in` for `uadk_compressor_enabled` is false. + +.. _Compressor UADK Support: https://github.com/ceph/ceph/pull/58336 +.. _OpenSSL UADK Engine: https://github.com/Linaro/uadk_engine From ccd58786b90e358b19fa1d5108802856d6b4b237 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Wed, 18 Sep 2024 11:07:02 +0530 Subject: [PATCH 007/148] mgr/vol: use pre-defined timeout period instead of hardcoded value Currently timeout is set to 5. But hardcoding this is unnecessary since the class already defines a attribute for this purpose. Use that instead. 
Signed-off-by: Rishabh Dave --- src/pybind/mgr/volumes/fs/async_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pybind/mgr/volumes/fs/async_job.py b/src/pybind/mgr/volumes/fs/async_job.py index 6834e3e240b33..83a119ca5564c 100644 --- a/src/pybind/mgr/volumes/fs/async_job.py +++ b/src/pybind/mgr/volumes/fs/async_job.py @@ -167,7 +167,7 @@ def run(self): for i in range(c, self.nr_concurrent_jobs): self.threads.append(JobThread(self, self.vc, name="{0}.{1}.{2}".format(self.name_pfx, time.time(), i))) self.threads[-1].start() - self.cv.wait(timeout=5) + self.cv.wait(timeout=self.wakeup_timeout) def shutdown(self): self.stopping.set() From 85dff0d19185fa6dfad723ce80b6b3314de9752c Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 18 Sep 2024 17:34:39 +0800 Subject: [PATCH 008/148] crimson/osd: purge strays when PGs go clean Signed-off-by: Xuehan Xu --- src/crimson/osd/osd.cc | 24 ++++++++++++++++++++++++ src/crimson/osd/osd.h | 2 ++ src/crimson/osd/pg.cc | 18 +++++++++++++++++- src/crimson/osd/pg.h | 12 ++++++++++-- 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index 8d2d10fbd7c45..34ad97ceb068a 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -23,6 +23,7 @@ #include "messages/MOSDOp.h" #include "messages/MOSDPeeringOp.h" #include "messages/MOSDPGCreate2.h" +#include "messages/MOSDPGRemove.h" #include "messages/MOSDPGUpdateLogMissing.h" #include "messages/MOSDPGUpdateLogMissingReply.h" #include "messages/MOSDRepOpReply.h" @@ -863,6 +864,8 @@ OSD::do_ms_dispatch( [[fallthrough]]; case MSG_OSD_PG_LOG: return handle_peering_op(conn, boost::static_pointer_cast(m)); + case MSG_OSD_PG_REMOVE: + return handle_pg_remove(conn, boost::static_pointer_cast(m)); case MSG_OSD_REPOP: return handle_rep_op(conn, boost::static_pointer_cast(m)); case MSG_OSD_REPOPREPLY: @@ -1555,6 +1558,27 @@ seastar::future<> OSD::handle_peering_op( std::move(*evt)).second; } +seastar::future<> OSD::handle_pg_remove( + crimson::net::ConnectionRef conn, + Ref m) +{ + LOG_PREFIX(OSD::handle_pg_remove); + const int from = m->get_source().num(); + std::vector> futs; + for (auto &pg : m->pg_list) { + DEBUG("{} from {}", pg, from); + futs.emplace_back( + pg_shard_manager.start_pg_operation( + conn, + pg_shard_t{from, pg.shard}, + pg, + m->get_epoch(), + m->get_epoch(), + PeeringState::DeleteStart()).second); + } + return seastar::when_all_succeed(std::move(futs)); +} + seastar::future<> OSD::check_osdmap_features() { LOG_PREFIX(OSD::check_osdmap_features); diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h index de39d80827494..d7d54d5d2c3c3 100644 --- a/src/crimson/osd/osd.h +++ b/src/crimson/osd/osd.h @@ -208,6 +208,8 @@ class OSD final : public crimson::net::Dispatcher, Ref m); seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn, Ref m); + seastar::future<> handle_pg_remove(crimson::net::ConnectionRef conn, + Ref m); seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn, Ref m); seastar::future<> handle_scrub_command(crimson::net::ConnectionRef conn, diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 644cc84513d49..c92978fcfc2b9 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -517,7 +517,8 @@ Context *PG::on_clean() { recovery_handler->on_pg_clean(); scrubber.on_primary_active_clean(); - return nullptr; + recovery_finisher = new C_PG_FinishRecovery(*this); + return recovery_finisher; } seastar::future<> PG::clear_temp_objects() @@ -1883,4 
+1884,19 @@ void PG::cancel_pglog_based_recovery_op() { pglog_based_recovery_op->cancel(); reset_pglog_based_recovery_op(); } + +void PG::C_PG_FinishRecovery::finish(int r) { + LOG_PREFIX(PG::C_PG_FinishRecovery::finish); + auto &peering_state = pg.get_peering_state(); + if (peering_state.is_deleting() || !peering_state.is_clean()) { + DEBUGDPP("raced with delete or repair", pg); + return; + } + if (this == pg.recovery_finisher) { + peering_state.purge_strays(); + pg.recovery_finisher = nullptr; + } else { + DEBUGDPP("stale recovery finsher", pg); + } +} } diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 11c0e3668b142..91bd529b95d63 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -375,7 +375,7 @@ class PG : public boost::intrusive_ref_counter< } void check_blocklisted_watchers() final; void clear_primary_state() final { - // Not needed yet + recovery_finisher = nullptr; } void queue_check_readable(epoch_t last_peering_reset, @@ -394,7 +394,7 @@ class PG : public boost::intrusive_ref_counter< void on_replica_activate() final; void on_activate_complete() final; void on_new_interval() final { - // Not needed yet + recovery_finisher = nullptr; } Context *on_clean() final; void on_activate_committed() final { @@ -712,9 +712,17 @@ class PG : public boost::intrusive_ref_counter< } seastar::future<> stop(); private: + class C_PG_FinishRecovery : public Context { + public: + explicit C_PG_FinishRecovery(PG &pg) : pg(pg) {} + void finish(int r) override; + private: + PG& pg; + }; std::unique_ptr backend; std::unique_ptr recovery_backend; std::unique_ptr recovery_handler; + C_PG_FinishRecovery *recovery_finisher; PeeringState peering_state; eversion_t projected_last_update; From 91734345b612b65ef7ccbd8ec6c3b485287294ec Mon Sep 17 00:00:00 2001 From: Zac Dover Date: Wed, 18 Sep 2024 21:02:32 +1000 Subject: [PATCH 009/148] doc/radosgw: correct RST formatting fixup Signed-off-by: Zac Dover --- doc/radosgw/index.rst | 2 +- doc/radosgw/uadk-accel.rst | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/radosgw/index.rst b/doc/radosgw/index.rst index da92692fa8b7b..3085e1a528f9f 100644 --- a/doc/radosgw/index.rst +++ b/doc/radosgw/index.rst @@ -88,4 +88,4 @@ Cluster with one API and then retrieve that data with the other API. D3N Data Cache Cloud Transition Metrics - + UADK Acceleration for Compression diff --git a/doc/radosgw/uadk-accel.rst b/doc/radosgw/uadk-accel.rst index 748fe6d3b3b57..fdf99f891f0a7 100644 --- a/doc/radosgw/uadk-accel.rst +++ b/doc/radosgw/uadk-accel.rst @@ -115,8 +115,9 @@ For developer who is interested in UADK, you can refer to the below steps for bu make make install - .. note:: Without –prefix, UADK will be installed to /usr/local/lib by default. - If get error:"cannot find -lnuma", please install the `libnuma-dev` + .. note:: Without –prefix, UADK will be installed to /usr/local/lib by + default. If get error:"cannot find -lnuma", please install + the `libnuma-dev`. #. 
Configure From a5d0f546807311d3fc37facd0b4acc98009a6271 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 19 Sep 2024 17:30:01 +0800 Subject: [PATCH 010/148] crimson/osd/backfill_state: push peer pg infos' last_backfills only when all objects before them are backfilled Fixes: https://tracker.ceph.com/issues/68147 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 70c43f49faf72..172c5e9cb0c1f 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -570,6 +570,7 @@ void BackfillState::ProgressTracker::complete_to( } else { ceph_abort_msg("completing untracked object shall not happen"); } + auto new_last_backfill = peering_state().earliest_backfill(); for (auto it = std::begin(registry); it != std::end(registry) && it->second.stage != op_stage_t::enqueued_push; @@ -579,6 +580,8 @@ void BackfillState::ProgressTracker::complete_to( peering_state().update_complete_backfill_object_stats( soid, *item.stats); + assert(soid > new_last_backfill); + new_last_backfill = soid; } if (Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, @@ -587,7 +590,7 @@ void BackfillState::ProgressTracker::complete_to( backfill_state().last_backfill_started = hobject_t::get_max(); backfill_listener().update_peers_last_backfill(hobject_t::get_max()); } else { - backfill_listener().update_peers_last_backfill(obj); + backfill_listener().update_peers_last_backfill(new_last_backfill); } } From bfe15f68075c80df9099da52111a40a5c16cfa31 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Fri, 13 Sep 2024 17:35:06 +0800 Subject: [PATCH 011/148] crimson/osd/backfill_state: always go to Enqueuing when object is pushed during Waiting Fixes: https://tracker.ceph.com/issues/68061 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 22 ++++++++-------------- src/crimson/osd/backfill_state.h | 2 +- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 70c43f49faf72..1db852da5ccd5 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -403,7 +403,7 @@ BackfillState::PrimaryScanning::react(ObjectPushed evt) { logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -480,7 +480,7 @@ BackfillState::ReplicasScanning::react(ObjectPushed evt) { logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -496,16 +496,8 @@ BackfillState::Waiting::react(ObjectPushed evt) { logger().debug("Waiting::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); - if (!Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - return transit(); - } else { - // we still have something to wait on - logger().debug("Waiting::react() on ObjectPushed; still waiting"); - return discard_event(); - } + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false); + return 
transit();; } // -- Done @@ -559,7 +551,8 @@ void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj) void BackfillState::ProgressTracker::complete_to( const hobject_t& obj, - const pg_stat_t& stats) + const pg_stat_t& stats, + bool may_push_to_max) { logger().debug("{}: obj={}", __func__, obj); @@ -580,7 +573,8 @@ void BackfillState::ProgressTracker::complete_to( soid, *item.stats); } - if (Enqueuing::all_enqueued(peering_state(), + if (may_push_to_max && + Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, backfill_state().peer_backfill_info) && tracked_objects_completed()) { diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 6c36db81813b7..66ba2307f808a 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -421,7 +421,7 @@ class BackfillState::ProgressTracker { bool enqueue_push(const hobject_t&); void enqueue_drop(const hobject_t&); - void complete_to(const hobject_t&, const pg_stat_t&); + void complete_to(const hobject_t&, const pg_stat_t&, bool may_push_to_max); }; } // namespace crimson::osd From 14c905d0c1f609d438aed3b4a4f600825d07d845 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Fri, 20 Sep 2024 11:35:07 +0800 Subject: [PATCH 012/148] test/crimson/test_backfill: set BackfillState's log level to debug Signed-off-by: Xuehan Xu --- src/test/crimson/test_backfill.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 6648719c61c8f..0abb62a0320e0 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -270,6 +270,9 @@ BackfillFixture::BackfillFixture( this->backfill_targets), std::make_unique(this->backfill_source)) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug + ); backfill_state.process_event(crimson::osd::BackfillState::Triggered{}.intrusive_from_this()); } From ec2af67dba8ba2874f8b60c10e51c75808ebb0a2 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Sat, 21 Sep 2024 13:27:01 +0800 Subject: [PATCH 013/148] crimson/osd/pg: remove snapmapper objects when eventually removing collections at the last moment of pg deleting, just as pg meta objects Fixes: https://tracker.ceph.com/issues/68174 Signed-off-by: Xuehan Xu --- src/crimson/osd/pg.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index d210773ca3031..5ef8c2d97afe4 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -481,6 +481,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) auto [objs_to_rm, next] = fut.get(); if (objs_to_rm.empty()) { logger().info("all objs removed, removing coll for {}", pgid); + t.remove(coll_ref->get_cid(), pgid.make_snapmapper_oid()); t.remove(coll_ref->get_cid(), pgmeta_oid); t.remove_collection(coll_ref->get_cid()); (void) shard_services.get_store().do_transaction( @@ -490,7 +491,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) return {next, false}; } else { for (auto &obj : objs_to_rm) { - if (obj == pgmeta_oid) { + if (obj == pgmeta_oid || obj.is_internal_pg_local()) { continue; } logger().trace("pg {}, removing obj {}", pgid, obj); From 4db3bb77b6458f8b54af7f9279151d616f042d49 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Thu, 12 Sep 2024 17:13:25 +0100 Subject: [PATCH 014/148] mgr/smb: accept public_addrs on cli when creating cluster We can set the public ip address to set for the cluster using the declarative method by 
providing the information in the resource description. The corresponding functionality is not available with the imperative method of creating the smb cluster. This patch adds this functionality by allowing the user the option of providing the a public address on the command line when creating the smb cluster. Signed-off-by: Sachin Prabhu --- doc/mgr/smb.rst | 5 +++++ src/pybind/mgr/smb/module.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/doc/mgr/smb.rst b/doc/mgr/smb.rst index 05e6369ddf107..3252c485a9aa7 100644 --- a/doc/mgr/smb.rst +++ b/doc/mgr/smb.rst @@ -96,6 +96,11 @@ clustering enables clustering regardless of the placement count. A value of ``never`` disables clustering regardless of the placement count. If unspecified, ``default`` is assumed. +public_addrs + Optional. A string in the form of [%]. + Supported only when using Samba's clustering. Assign "virtual" IP + addresses that will be managed by the clustering subsystem and may automatically + move between nodes running Samba containers. Remove Cluster ++++++++++++++ diff --git a/src/pybind/mgr/smb/module.py b/src/pybind/mgr/smb/module.py index 1e71721202e80..e2ec9663af52f 100644 --- a/src/pybind/mgr/smb/module.py +++ b/src/pybind/mgr/smb/module.py @@ -167,6 +167,7 @@ def cluster_create( custom_dns: Optional[List[str]] = None, placement: Optional[str] = None, clustering: Optional[SMBClustering] = None, + public_addrs: Optional[List[str]] = None, ) -> results.Result: """Create an smb cluster""" domain_settings = None @@ -251,6 +252,18 @@ def cluster_create( ) ) + c_public_addrs = [] + if public_addrs: + for pa in public_addrs: + pa_arr = pa.split('%', 1) + address = pa_arr[0] + destination = pa_arr[1] if len(pa_arr) > 1 else None + c_public_addrs.append( + resources.ClusterPublicIPAssignment( + address=address, destination=destination + ) + ) + pspec = resources.WrappedPlacementSpec.wrap( PlacementSpec.from_string(placement) ) @@ -262,6 +275,7 @@ def cluster_create( custom_dns=custom_dns, placement=pspec, clustering=clustering, + public_addrs=c_public_addrs, ) to_apply.append(cluster) return self._handler.apply(to_apply, create_only=True).squash(cluster) From 7d9fe0a5dbc9abf942b26147e6dc17f85529dfe4 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 18 Sep 2024 15:28:46 -0700 Subject: [PATCH 015/148] mgr/smb: use is comparisions for enum values Use `is` based comparisions for two enum related functions as mypy likes this better. Signed-off-by: John Mulligan --- src/pybind/mgr/smb/enums.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/smb/enums.py b/src/pybind/mgr/smb/enums.py index dea45f951f831..3e8544f43cf5a 100644 --- a/src/pybind/mgr/smb/enums.py +++ b/src/pybind/mgr/smb/enums.py @@ -21,7 +21,7 @@ class CephFSStorageProvider(_StrEnum): def expand(self) -> 'CephFSStorageProvider': """Expand abbreviated/default values into the full/expanded form.""" - if self == self.SAMBA_VFS: + if self is self.SAMBA_VFS: # mypy gets confused by enums return self.__class__(self.SAMBA_VFS_NEW) return self @@ -89,9 +89,9 @@ class LoginAccess(_StrEnum): def expand(self) -> 'LoginAccess': """Exapend abbreviated enum values into their full forms.""" # the extra LoginAccess(...) 
calls are to appease mypy - if self == self.READ_ONLY_SHORT: + if self is self.READ_ONLY_SHORT: return LoginAccess(self.READ_ONLY) - if self == self.READ_WRITE_SHORT: + if self is self.READ_WRITE_SHORT: return LoginAccess(self.READ_WRITE) return self From 51516ba146e9602c0dea1de65b040d737d1dab6a Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 18 Sep 2024 17:44:39 -0700 Subject: [PATCH 016/148] python-common: add a utils function to replace distutils.util.strtobool As distutils is removed from python 3.12 ceph can no longer use the simple conversion function once located in that module. Add our own trivial replacement function. Signed-off-by: John Mulligan --- src/python-common/ceph/utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/python-common/ceph/utils.py b/src/python-common/ceph/utils.py index e92a2d1de7db8..0544e9f4173d1 100644 --- a/src/python-common/ceph/utils.py +++ b/src/python-common/ceph/utils.py @@ -167,3 +167,18 @@ def http_req(hostname: str = '', log.error(e) # handle error here if needed raise + + +_TRUE_VALS = {'y', 'yes', 't', 'true', 'on', '1'} +_FALSE_VALS = {'n', 'no', 'f', 'false', 'off', '0'} + + +def strtobool(value: str) -> bool: + """Convert a string to a boolean value. + Based on a simlilar function once available at distutils.util.strtobool. + """ + if value.lower() in _TRUE_VALS: + return True + if value.lower() in _FALSE_VALS: + return False + raise ValueError(f'invalid truth value {value!r}') From ffcc157a694f0e40829b5ecd2692e54f0a763607 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Wed, 18 Sep 2024 17:45:58 -0700 Subject: [PATCH 017/148] pybind/mgr: replace imports of distutils.util In python 3.12 distutils is removed. Replace uses of distutils.util.strtobool with our own utility function. Signed-off-by: John Mulligan --- src/pybind/mgr/dashboard/tools.py | 2 +- src/pybind/mgr/volumes/fs/operations/pin_util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/dashboard/tools.py b/src/pybind/mgr/dashboard/tools.py index 51ed9c471aac6..14de970cceb0f 100644 --- a/src/pybind/mgr/dashboard/tools.py +++ b/src/pybind/mgr/dashboard/tools.py @@ -9,9 +9,9 @@ import time import urllib from datetime import datetime, timedelta -from distutils.util import strtobool import cherrypy +from ceph.utils import strtobool from mgr_util import build_url from . import mgr diff --git a/src/pybind/mgr/volumes/fs/operations/pin_util.py b/src/pybind/mgr/volumes/fs/operations/pin_util.py index a12ab5b4d4b28..631fdd8fcaa25 100644 --- a/src/pybind/mgr/volumes/fs/operations/pin_util.py +++ b/src/pybind/mgr/volumes/fs/operations/pin_util.py @@ -3,7 +3,7 @@ import cephfs from ..exception import VolumeException -from distutils.util import strtobool +from ceph.utils import strtobool _pin_value = { "export": lambda x: int(x), From a2cbb40925742d0795ef76e3d8548e34477b8db9 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Thu, 19 Sep 2024 12:44:03 -0700 Subject: [PATCH 018/148] pybind/mgr/telemetry: remove misleading tox env For some reason there's a 'mypy' environment listed in the telemetry tox.ini that always runs pytest. Remove it. We'll see if this causes the CI to fail, as I can't find anywhere that uses it. 
Signed-off-by: John Mulligan --- src/pybind/mgr/telemetry/tox.ini | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pybind/mgr/telemetry/tox.ini b/src/pybind/mgr/telemetry/tox.ini index a887590eed89b..b2210da54eaa8 100644 --- a/src/pybind/mgr/telemetry/tox.ini +++ b/src/pybind/mgr/telemetry/tox.ini @@ -1,7 +1,6 @@ [tox] envlist = py3 - mypy skipsdist = true [testenv] From de90c32240eaa8cd40a22ffc8b24d8d893ad6863 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Thu, 19 Sep 2024 13:07:02 -0700 Subject: [PATCH 019/148] mypy-constrains.txt: bump mypy up to version 1.9 Ceph is still very behind on the version of mypy used in the various tox test dirs. Bump up to version 1.9 as it only requires a few trivial fixes to use. Signed-off-by: John Mulligan --- src/mypy-constrains.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mypy-constrains.txt b/src/mypy-constrains.txt index 7810870804ed2..0a79b8ef4f11d 100644 --- a/src/mypy-constrains.txt +++ b/src/mypy-constrains.txt @@ -2,7 +2,7 @@ # Unfortunately this means we have to manually update those # packages regularly. -mypy==1.1.1 +mypy==1.9 # global types-python-dateutil==0.1.3 From b0f5e1086a64a4ad249fbd27b8fb256de38ca1bd Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 25 Sep 2024 07:44:46 +0800 Subject: [PATCH 020/148] crimson/osd/pg: also trigger callbacks for empty peering transactions Signed-off-by: Xuehan Xu --- src/crimson/osd/shard_services.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index 5f7c4a624471b..df6d10d6aa7d2 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -767,14 +767,20 @@ seastar::future<> ShardServices::dispatch_context_transaction( LOG_PREFIX(OSDSingletonState::dispatch_context_transaction); if (ctx.transaction.empty()) { DEBUG("empty transaction"); - return seastar::now(); + co_await get_store().flush(col); + Context* on_commit( + ceph::os::Transaction::collect_all_contexts(ctx.transaction)); + if (on_commit) { + on_commit->complete(0); + } + co_return; } DEBUG("do_transaction ..."); - auto ret = get_store().do_transaction( + co_await get_store().do_transaction( col, ctx.transaction.claim_and_reset()); - return ret; + co_return; } seastar::future<> ShardServices::dispatch_context_messages( From a88c84aa1ca6ead3b31c291a1ff03077562881e0 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 25 Sep 2024 07:45:10 +0800 Subject: [PATCH 021/148] crimson/osd/pg: correct log messages for ShardServices::dispatch_context_messages Signed-off-by: Xuehan Xu --- src/crimson/osd/shard_services.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index df6d10d6aa7d2..a053d9d5044c5 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -786,7 +786,7 @@ seastar::future<> ShardServices::dispatch_context_transaction( seastar::future<> ShardServices::dispatch_context_messages( BufferedRecoveryMessages &&ctx) { - LOG_PREFIX(OSDSingletonState::dispatch_context_transaction); + LOG_PREFIX(OSDSingletonState::dispatch_context_messages); auto ret = seastar::parallel_for_each(std::move(ctx.message_map), [FNAME, this](auto& osd_messages) { auto& [peer, messages] = osd_messages; From 358f33a148c9a65478e33648f16e8c8af73c98f2 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Fri, 13 Sep 2024 16:39:51 +0000 Subject: [PATCH 022/148] os/bluestore: Fix 
ceph-bluestore-tool allocmap command BlueStore::read_allocation_from_drive_for_bluestore_tool was not informed that multiple bdev labels can exist and reserve space. Comparison of real alloc vs recovered alloc was failing. Fixes: https://tracker.ceph.com/issues/67596 Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueStore.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 236336386728f..069105bc8d67c 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -20527,6 +20527,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() if (ret < 0) { return ret; } + if (bdev_label_multi) { + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + for (uint64_t p : bdev_label_valid_locations) { + if (p != BDEV_FIRST_LABEL_POSITION) { + allocator->init_rm_free(p, lsize); + } + } + } duration = ceph_clock_now() - start; stats.insert_count = 0; From 9e3449995f8c21fd3bde6308517aebcd79478988 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 19 Jul 2024 19:30:39 +0530 Subject: [PATCH 023/148] qa/cephfs: test clone progress reporter after subvol is deleted but... snapshot is retained despite of deletion (using --retain-snapshots option of "subvolume rm" command). Signed-off-by: Rishabh Dave --- qa/tasks/cephfs/test_volumes.py | 69 ++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py index 2baefd72c3fbc..6a4e25948256e 100644 --- a/qa/tasks/cephfs/test_volumes.py +++ b/qa/tasks/cephfs/test_volumes.py @@ -7876,7 +7876,22 @@ def tearDown(self): self.run_ceph_cmd('fs subvolume snapshot rm --force ' f'--format json {v} {sv} {ss}') - self.run_ceph_cmd(f'fs subvolume rm {v} {sv}') + try: + self.run_ceph_cmd(f'fs subvolume rm {v} {sv}') + except CommandFailedError as e: + if e.exitstatus == errno.ENOENT: + log.info( + 'ignoring this error, perhaps subvolume was deleted ' + 'during the test and snapshot deleted above is a ' + 'retained snapshot. when a retained snapshot (which is ' + 'snapshot retained despite of subvolume deletion) is ' + 'deleted, the subvolume directory is also deleted ' + 'along. and before retained snapshot deletion, the ' + 'subvolume is reported by "subvolume ls" command, which' + 'is what probably caused confusion here') + pass + else: + raise # verify trash dir is clean self._wait_for_trash_empty() @@ -8090,6 +8105,58 @@ def test_clone_to_diff_group_and_less_than_cloner_threads(self): # and not cancelling these clone doesnt affect this test case. self.cancel_clones_and_ignore_if_finished(c) + def test_clone_after_subvol_is_removed(self): + ''' + Initiate cloning after source subvolume has been deleted but with + snapshots retained and then test that, when this clone is in progress, + one progress bar is printed in output of command "ceph status" that + shows progress of this clone. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + # XXX: "clone" must be part of clone name for sake of tearDown() + c = 'ss1clone1' + + # XXX: without setting mds_snap_rstat to true rstats are not updated on + # a subvolume snapshot and therefore clone progress bar will not show + # any progress. 
+ self.config_set('mds', 'mds_snap_rstat', 'true') + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 10, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + self.run_ceph_cmd(f'fs subvolume rm {v} {sv} --retain-snapshots') + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}') + + with safe_while(tries=15, sleep=10) as proceed: + while proceed(): + pev = self.get_pevs_from_ceph_status(c) + + if len(pev) < 1: + continue + elif len(pev) > 1: + raise RuntimeError('For 1 clone "ceph status" output has 2 ' + 'progress bars, it should have only 1 ' + f'progress bar.\npev -\n{pev}') + + # ensure that exactly 1 progress bar for cloning is present in + # "ceph status" output + msg = ('"progress_events" dict in "ceph status" output must have ' + f'exactly one entry.\nprogress_event dict -\n{pev}') + self.assertEqual(len(pev), 1, msg) + + pev_msg = tuple(pev.values())[0]['message'] + self.assertIn('1 ongoing clones', pev_msg) + break + + # allowing clone jobs to finish will consume too much time and space + # and not cancelling these clone doesnt affect this test case. + self.cancel_clones_and_ignore_if_finished(c) + def test_clones_equal_to_cloner_threads(self): ''' Test that one progress bar is printed in output of "ceph status" output From 21cf769ae78021cf6968666ab7dc5e779835fd01 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 27 Sep 2024 00:41:25 +0530 Subject: [PATCH 024/148] mgr/mgr_util: don't set event when it is already set In class RTimer in mgr_util.py, "self.finished.set()" is run even though the event self.finished was set just now. If it wasn't set, the while loop the precedes it would've never finished running. Therefore, remove this redundant line of code. 
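A minimal sketch of why the removed call is a no-op (structure mirrors RTimer.run() in the diff below; standard threading.Event semantics):

    while not self.finished.is_set():      # exits only once finished is set
        self.finished.wait(self.interval)
        self.function(*self.args, **self.kwargs)
    self.finished.set()                    # finished is already set here

Event.set() on an already-set event changes nothing, so dropping the trailing call does not change behavior.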
Signed-off-by: Rishabh Dave --- src/pybind/mgr/mgr_util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pybind/mgr/mgr_util.py b/src/pybind/mgr/mgr_util.py index 67246545eea0f..a999b6525e9f1 100644 --- a/src/pybind/mgr/mgr_util.py +++ b/src/pybind/mgr/mgr_util.py @@ -88,7 +88,6 @@ def run(self): while not self.finished.is_set(): self.finished.wait(self.interval) self.function(*self.args, **self.kwargs) - self.finished.set() except Exception as e: logger.error("task exception: %s", e) raise From 4a4fc7bad533a362fb71aee5ea36014efaecf1b9 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 26 Sep 2024 20:46:17 -0400 Subject: [PATCH 025/148] qa: ignore pg availability/degraded warnings Fixes: https://tracker.ceph.com/issues/68284 Signed-off-by: Patrick Donnelly --- qa/cephfs/overrides/pg_health.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/cephfs/overrides/pg_health.yaml b/qa/cephfs/overrides/pg_health.yaml index 1740134a2e01b..07ca62e01fbec 100644 --- a/qa/cephfs/overrides/pg_health.yaml +++ b/qa/cephfs/overrides/pg_health.yaml @@ -9,3 +9,5 @@ overrides: - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded From 8db39bcbdb4eab197fabf7e611379cdc5e182143 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Mon, 23 Sep 2024 10:29:20 +0800 Subject: [PATCH 026/148] crimson/osd/backfill_state: do at least one time of replica scanning if necessary in the Enqueuing state Fixes: https://tracker.ceph.com/issues/68175 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 52 +++++++++++++++++-------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 70c43f49faf72..7f5b869abbf34 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -225,7 +225,7 @@ bool BackfillState::Enqueuing::should_rescan_primary( const BackfillInterval& backfill_info) const { return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) && - !backfill_info.extends_to_end(); + !backfill_info.extends_to_end() && backfill_info.empty(); } void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( @@ -327,16 +327,29 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) } trim_backfill_infos(); - while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. + logger().debug("{}: reached end for current local chunk", __func__); + post_event(RequestPrimaryScanning{}); + return; + } + + do { if (!backfill_listener().budget_available()) { post_event(RequestWaiting{}); return; } else if (should_rescan_replicas(backfill_state().peer_backfill_info, - primary_bi)) { + primary_bi)) { // Count simultaneous scans as a single op and let those complete post_event(RequestReplicasScanning{}); return; } + + if (all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + break; + } // Get object within set of peers to operate on and the set of targets // for which that object applies. 
if (const hobject_t check = \ @@ -355,30 +368,23 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) trim_backfilled_object_from_intervals(std::move(result), backfill_state().last_backfill_started, backfill_state().peer_backfill_info); - primary_bi.pop_front(); + if (!primary_bi.empty()) { + primary_bi.pop_front(); + } } backfill_listener().maybe_flush(); - } + } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)); - if (should_rescan_primary(backfill_state().peer_backfill_info, - primary_bi)) { - // need to grab one another chunk of the object namespace and restart - // the queueing. - logger().debug("{}: reached end for current local chunk", - __func__); - post_event(RequestPrimaryScanning{}); - } else { - if (backfill_state().progress_tracker->tracked_objects_completed() - && Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - backfill_state().last_backfill_started = hobject_t::get_max(); - backfill_listener().update_peers_last_backfill(hobject_t::get_max()); - } - logger().debug("{}: reached end for both local and all peers " - "but still has in-flight operations", __func__); - post_event(RequestWaiting{}); + if (backfill_state().progress_tracker->tracked_objects_completed() + && Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); } + logger().debug("{}: reached end for both local and all peers " + "but still has in-flight operations", __func__); + post_event(RequestWaiting{}); } // -- PrimaryScanning From b96d714b23b3f5294df9c28d1f6f5488c4253853 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 27 Sep 2024 00:43:31 +0530 Subject: [PATCH 027/148] mgr/mgr_util: log traceback when exception occurs in RTimer.run() When an exception occurs in class RTimer of mgr_util.py, only the exception message is logged but not only this is insufficient for debugging but also it is hard to spot in logs. This should not be the case, especially for an occurring exception. Therefore, add code to log traceback and exception name as well along with exception's message. 
Log entry before this patch - 2024-09-27T00:22:38.656+0530 7f05c7e006c0 0 [volumes ERROR mgr_util] task exception: dummy exception for testing Log entry with this patch - 2024-09-27T00:40:26.509+0530 7f61d64006c0 0 [volumes ERROR mgr_util] exception encountered in RTimer instance "": Traceback (most recent call last): File "/home/rishabh/repos/ceph/minor3/src/pybind/mgr/mgr_util.py", line 91, in run self.function(*self.args, **self.kwargs) File "/home/rishabh/repos/ceph/minor3/src/pybind/mgr/volumes/fs/stats_util.py", line 232, in _update_progress_bars raise RuntimeError('dummy exception for testing') RuntimeError: dummy exception for testing Signed-off-by: Rishabh Dave --- src/pybind/mgr/mgr_util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/mgr_util.py b/src/pybind/mgr/mgr_util.py index a999b6525e9f1..5d37d478de7b1 100644 --- a/src/pybind/mgr/mgr_util.py +++ b/src/pybind/mgr/mgr_util.py @@ -22,6 +22,7 @@ from ipaddress import ip_address from threading import Lock, Condition from typing import no_type_check, NewType +from traceback import format_exc as tb_format_exc import urllib from functools import wraps if sys.version_info >= (3, 3): @@ -88,8 +89,9 @@ def run(self): while not self.finished.is_set(): self.finished.wait(self.interval) self.function(*self.args, **self.kwargs) - except Exception as e: - logger.error("task exception: %s", e) + except Exception: + logger.error(f'exception encountered in RTimer instance "{self}":' + f'\n{tb_format_exc()}') raise From 829c857b9e1be3a4133f088f63950e961ecce67e Mon Sep 17 00:00:00 2001 From: myoungwon oh Date: Wed, 11 Sep 2024 06:04:30 +0000 Subject: [PATCH 028/148] crimson/os/seastore: fix data inconsistency during ool writes In RBM, seastore issues ool writes with allocated address. If a transaction conflict occurs at this point, the allocated address is freed, allowing the address to be reused. However, data inconsistency can occur if seastore issues ool writes with freed address before the preceding ool write has not been complete. To fix this issue, this commit frees the allocated address after ool writes is don in the event of the transaction conflict after ool write is issued. 
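Spelled out, the race being closed is roughly the following (transaction names are illustrative):

    T1: allocates address A and issues an OOL write to A    (write in flight)
    T1: conflicts -> A is freed right away
    T2: is handed A again and issues its own write to A
    T1: the still-pending write to A completes last          -> inconsistent data

With this change A is only returned to the allocator once T1's OOL write has actually completed, so a reused address can no longer overlap with a pending write.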
Signed-off-by: Myoungwon Oh --- src/crimson/os/seastore/cache.cc | 8 ++++++-- .../os/seastore/extent_placement_manager.cc | 14 +++++++++++++- src/crimson/os/seastore/transaction.h | 19 +++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index cf8d3c0891d7f..5dcb7514ee1ab 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -990,8 +990,12 @@ void Cache::mark_transaction_conflicted( } efforts.mutate_delta_bytes += delta_stat.bytes; - for (auto &i: t.pre_alloc_list) { - epm.mark_space_free(i->get_paddr(), i->get_length()); + if (t.get_pending_ool()) { + t.get_pending_ool()->is_conflicted = true; + } else { + for (auto &i: t.pre_alloc_list) { + epm.mark_space_free(i->get_paddr(), i->get_length()); + } } auto& ool_stats = t.get_ool_write_stats(); diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc index 34ac199eed8dd..0458fbfed7480 100644 --- a/src/crimson/os/seastore/extent_placement_manager.cc +++ b/src/crimson/os/seastore/extent_placement_manager.cc @@ -987,7 +987,19 @@ RandomBlockOolWriter::alloc_write_ool_extents( return alloc_write_iertr::now(); } return seastar::with_gate(write_guard, [this, &t, &extents] { - return do_write(t, extents); + seastar::lw_shared_ptr ptr = + seastar::make_lw_shared(); + ptr->pending_extents = t.get_pre_alloc_list(); + assert(!t.is_conflicted()); + t.set_pending_ool(ptr); + return do_write(t, extents + ).finally([this, ptr=ptr] { + if (ptr->is_conflicted) { + for (auto &e : ptr->pending_extents) { + rb_cleaner->mark_space_free(e->get_paddr(), e->get_length()); + } + } + }); }); } diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 52515937a9e59..5d8ad00ba228b 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -80,6 +80,11 @@ struct rewrite_stats_t { } }; +struct rbm_pending_ool_t { + bool is_conflicted = false; + std::list pending_extents; +}; + /** * Transaction * @@ -554,6 +559,18 @@ class Transaction { return static_cast(*view); } + void set_pending_ool(seastar::lw_shared_ptr ptr) { + pending_ool = ptr; + } + + seastar::lw_shared_ptr get_pending_ool() { + return pending_ool; + } + + const auto& get_pre_alloc_list() { + return pre_alloc_list; + } + private: friend class Cache; friend Ref make_test_transaction(); @@ -650,6 +667,8 @@ class Transaction { const src_t src; transaction_id_t trans_id = TRANS_ID_NULL; + + seastar::lw_shared_ptr pending_ool; }; using TransactionRef = Transaction::Ref; From 3482ebcd3c7fba37b40b91428f53880d66e4c86f Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 27 Sep 2024 13:50:29 +0530 Subject: [PATCH 029/148] mgr/vol: don't define progress bar ID repeatedly Orignally, when the feature was in development, IDs for clone progress bars were set to randomly generated UUID strings. But, eventually, it was decided to assign fixed strings to them. Unlike UUIDs, these strings stay the same even when progress bars are destroyed and re-created. Therefore, instead of re-assigning the same strings every time initiate_reporting() is called, move them to __init__() so that both the IDs are defined only once. 
Signed-off-by: Rishabh Dave --- src/pybind/mgr/volumes/fs/stats_util.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/pybind/mgr/volumes/fs/stats_util.py b/src/pybind/mgr/volumes/fs/stats_util.py index cec33eaa8873d..3334dc5a3d765 100644 --- a/src/pybind/mgr/volumes/fs/stats_util.py +++ b/src/pybind/mgr/volumes/fs/stats_util.py @@ -106,6 +106,11 @@ def __init__(self, volclient, vol_spec): # reporting has already been initiated by calling RTimer.is_alive(). self.update_task = RTimer(1, self._update_progress_bars) + # progress event ID for ongoing clone jobs + self.on_pev_id: Optional[str] = 'mgr-vol-ongoing-clones' + # progress event ID for ongoing+pending clone jobs + self.onpen_pev_id: Optional[str] = 'mgr-vol-total-clones' + def initiate_reporting(self): if self.update_task.is_alive(): log.info('progress reporting thread is already alive, not ' @@ -113,11 +118,6 @@ def initiate_reporting(self): return log.info('initiating progress reporting for clones...') - # progress event ID for ongoing clone jobs - self.on_pev_id: Optional[str] = 'mgr-vol-ongoing-clones' - # progress event ID for ongoing+pending clone jobs - self.onpen_pev_id: Optional[str] = 'mgr-vol-total-clones' - self.update_task = RTimer(1, self._update_progress_bars) self.update_task.start() log.info('progress reporting for clones has been initiated') @@ -294,10 +294,7 @@ def _finish_progress_events(self): assert self.onpen_pev_id is not None self.volclient.mgr.remote('progress', 'complete', self.on_pev_id) - self.on_pev_id = None - self.volclient.mgr.remote('progress', 'complete', self.onpen_pev_id) - self.onpen_pev_id = None log.info('finished removing progress bars from "ceph status" output') From 706eb26f560bfdd7c34c62445d27c9ebf7f7ad26 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 27 Sep 2024 13:42:48 -0400 Subject: [PATCH 030/148] mds: do not dump empty bufptr Fixes: https://tracker.ceph.com/issues/68243 Signed-off-by: Patrick Donnelly --- src/mds/CInode.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 0e9b6996ad2c5..dfad411d323d8 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -4589,8 +4589,11 @@ void InodeStoreBase::dump(Formatter *f) const for (const auto& [key, val] : *xattrs) { f->open_object_section("xattr"); f->dump_string("key", key); - std::string v(val.c_str(), val.length()); - f->dump_string("val", v); + if (val.length()) { + f->dump_string("val", std::string(val.c_str(), val.length())); + } else { + f->dump_string("val", ""); + } f->close_section(); } } From ee8c7d2e3bb692fb263a9bb6828c7b9a55a44504 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Fri, 27 Sep 2024 15:48:18 +0200 Subject: [PATCH 031/148] mgr/cephadm: adding config to check client cert for internal nginx Fixes: https://tracker.ceph.com/issues/68310 Signed-off-by: Redouane Kachach --- .../templates/services/mgmt-gateway/internal_server.conf.j2 | 3 +++ src/pybind/mgr/cephadm/tests/test_services.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 index f2c32f8797750..0801adebd0844 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 @@ -1,5 +1,8 @@ server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; 
+ listen {{ internal_port }} ssl; listen [::]:{{ internal_port }} ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a9b7da624a0e6..b874161f10959 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -3446,6 +3446,9 @@ def get_services_endpoints(name): }"""), "nginx_internal_server.conf": dedent(""" server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; + listen 29443 ssl; listen [::]:29443 ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; @@ -3760,6 +3763,9 @@ def get_services_endpoints(name): }"""), "nginx_internal_server.conf": dedent(""" server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; + listen 29443 ssl; listen [::]:29443 ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; From c0e05bf36067294420631f33c5e43c32077eeb82 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Mon, 30 Sep 2024 09:17:11 +0000 Subject: [PATCH 032/148] ceph-volume: drop unnecessary call to `get_single_lv()` `Zap.zap_lv()` currently makes a call to `get_single_lv()`: ``` lv = api.get_single_lv(filters={'lv_name': device.lv_name, 'vg_name': device.vg_name}) ``` this isn't needed and redundant as zap_lv() takes an instance of `Device()` as argument which has already a `lv_api` attribute: class Device in device.py: ``` else: vgname, lvname = self.path.split('/') filters = {'lv_name': lvname, 'vg_name': vgname} lv = lvm.get_single_lv(filters=filters) # <---- same call if lv: self.lv_api = lv ``` This implies a duplicate call to `subprocess.Popen()` unnecessarily. Fixes: https://tracker.ceph.com/issues/68312 Signed-off-by: Guillaume Abrioux --- src/ceph-volume/ceph_volume/devices/lvm/zap.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py index c1bef82c10975..2b6925f5b2739 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -191,8 +191,7 @@ def zap_lv(self, device): Device examples: vg-name/lv-name, /dev/vg-name/lv-name Requirements: Must be a logical volume (LV) """ - lv = api.get_single_lv(filters={'lv_name': device.lv_name, 'vg_name': - device.vg_name}) + lv: api.Volume = device.lv_api self.unmount_lv(lv) zap_device(device.path) From a55a75c57e7a42a1317e4d7fc86c1964b71137f0 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Fri, 23 Aug 2024 18:19:43 +0530 Subject: [PATCH 033/148] mon,cephfs: require confirmation when changing max_mds on unhealthy cluster User must pass the confirmation flag (--yes-i-really-mean-it) to change the value of CephFS setting variable "max_mds" when the Ceph cluster is unhealthy. This measure was decided upon to prevent users from changing "max_mds" as a measure of troubleshotoing unhealthy cluster. 
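The gate itself is small; written out in Python purely for illustration (the real check is the C++ added to FSCommands.cc, and only the error text is taken from it, the function name and arguments here are made up):

```python
import errno

def check_max_mds_change(var, has_health_warnings, yes_i_really_mean_it=False):
    # illustration only; mirrors the EPERM gate added in FSCommands.cc
    if var == 'max_mds' and has_health_warnings and not yes_i_really_mean_it:
        return -errno.EPERM, (
            'One or more file system health warnings are present. Modifying '
            'the file system setting variable "max_mds" may not help '
            'troubleshoot or recover from these warnings and may further '
            'destabilize the system. If you really wish to proceed, run '
            'again with --yes-i-really-mean-it')
    return 0, ''

# refused on an unhealthy cluster without the flag:
assert check_max_mds_change('max_mds', True)[0] == -errno.EPERM
# allowed once the operator explicitly confirms:
assert check_max_mds_change('max_mds', True, yes_i_really_mean_it=True)[0] == 0
```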
Fixes: https://tracker.ceph.com/issues/66301 Signed-off-by: Rishabh Dave --- src/mon/FSCommands.cc | 11 +++++++++++ src/mon/MDSMonitor.cc | 7 +++++++ src/mon/MDSMonitor.h | 1 + 3 files changed, 19 insertions(+) diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 62d37574ded67..b935ace4affba 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -385,6 +385,17 @@ class SetHandler : public FileSystemCommandHandler return -EINVAL; } + bool confirm = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", confirm); + if (var == "max_mds" && !confirm && mon->mdsmon()->has_any_health_warning()) { + ss << "One or more file system health warnings are present. Modifying " + << "the file system setting variable \"max_mds\" may not help " + << "troubleshoot or recover from these warnings and may further " + << "destabilize the system. If you really wish to proceed, run " + << "again with --yes-i-really-mean-it"; + return -EPERM; + } + return set_val(mon, fsmap, op, cmdmap, ss, fsp->get_fscid(), var, val); } }; diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 76a57ac443de7..d8cca4ceb61b1 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -1557,6 +1557,13 @@ bool MDSMonitor::has_health_warnings(vector warnings) return false; } +bool MDSMonitor::has_any_health_warning() +{ + return std::any_of( + pending_daemon_health.begin(), pending_daemon_health.end(), + [](auto& it) { return !it.second.metrics.empty() ? true : false; }); +} + int MDSMonitor::filesystem_command( FSMap &fsmap, MonOpRequestRef op, diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index b0f88cd31302d..dd2a269009de2 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -53,6 +53,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand bool prepare_update(MonOpRequestRef op) override; bool should_propose(double& delay) override; bool has_health_warnings(std::vector warnings); + bool has_any_health_warning(); bool should_print_status() const { auto& fs = get_fsmap(); From 4d5ec87ab404c2b94aab6865061175eb5870fa33 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 27 Aug 2024 13:04:35 +0530 Subject: [PATCH 034/148] qa/cephfs: add tests for confirmationn required to change max_mds Add tests to ensure that when cluster has any health warning, especially MDS_TRIM, confirmation flag is mandatory to change max_mds. 
Signed-off-by: Rishabh Dave --- qa/tasks/cephfs/filesystem.py | 7 +++-- qa/tasks/cephfs/test_admin.py | 57 +++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index 1c00a49077dff..2b7fd2ee56945 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -640,8 +640,11 @@ def set_down(self, down=True): def set_joinable(self, joinable=True): self.set_var("joinable", joinable) - def set_max_mds(self, max_mds): - self.set_var("max_mds", "%d" % max_mds) + def set_max_mds(self, max_mds, confirm=True): + if confirm: + self.set_var('max_mds', f'{max_mds}', '--yes-i-really-mean-it') + else: + self.set_var("max_mds", f"{max_mds}",) def set_session_timeout(self, timeout): self.set_var("session_timeout", "%d" % timeout) diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py index ff9962e73104d..315d9140119d0 100644 --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@ -2659,3 +2659,60 @@ def test_with_health_warn_with_2_active_MDSs(self): errmsgs=health_warn) self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it') self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it') + + +class TestFSSetMaxMDS(TestAdminCommands): + + def test_when_unhealthy_without_confirm(self): + ''' + Test that command "ceph fs set max_mds " without the + confirmation flag (--yes-i-really-mean-it) fails when cluster is + unhealthy. + ''' + self.gen_health_warn_mds_cache_oversized() + + with self.assertRaises(CommandFailedError) as cfe: + self.fs.set_max_mds(2, confirm=False) + self.assertEqual(cfe.exception.exitstatus, errno.EPERM) + + def test_when_unhealthy_with_confirm(self): + ''' + Test that command "ceph fs set max_mds + --yes-i-really-mean-it" runs successfully when cluster is unhealthy. + ''' + self.gen_health_warn_mds_cache_oversized() + + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + def test_when_mds_trim_without_confirm(self): + ''' + Test that command "ceph fs set max_mds " without the + confirmation flag (--yes-i-really-mean-it) fails when cluster has + MDS_TRIM health warning. + ''' + self.gen_health_warn_mds_trim() + + with self.assertRaises(CommandFailedError) as cfe: + self.fs.set_max_mds(2, confirm=False) + self.assertEqual(cfe.exception.exitstatus, errno.EPERM) + + def test_when_mds_trim_when_with_confirm(self): + ''' + Test that command "ceph fs set max_mds + --yes-i-really-mean-it" runs successfully when cluster has MDS_TRIM + health warning. + ''' + self.gen_health_warn_mds_trim() + + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + def test_when_healthy_with_confirm(self): + ''' + Test that command "ceph fs set max_mds + --yes-i-really-mean-it" runs successfully also when cluster is + healthy. + ''' + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) From 2d28faaeea11988867471a53e40145f309951307 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 27 Aug 2024 13:33:23 +0530 Subject: [PATCH 035/148] doc/cephfs: update about changing max_mds FS setting variable Update the documentation for CephFs admininstration as well troubleshooting. 
Signed-off-by: Rishabh Dave --- doc/cephfs/administration.rst | 11 +++++++++-- doc/cephfs/troubleshooting.rst | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/cephfs/administration.rst b/doc/cephfs/administration.rst index 5760e67f73e64..07646bff06786 100644 --- a/doc/cephfs/administration.rst +++ b/doc/cephfs/administration.rst @@ -61,10 +61,17 @@ is a subset of the same information from the ``ceph fs dump`` command. :: - ceph fs set + ceph fs set [--yes-i-really-mean-it] Change a setting on a file system. These settings are specific to the named -file system and do not affect other file systems. +file system and do not affect other file systems. Confirmation flag is only +needed for changing ``max_mds`` when cluster is unhealthy. + +.. note:: It is mandatory to pass confirmation flag (--yes--i-really-mean-it) + for modifying FS setting variable ``max_mds`` when cluster is unhealthy. + It has been added a precaution to tell users that modifying ``max_mds`` + during troubleshooting or recovery might not help. Instead, it might + further destabilize the cluster. :: diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst index 34de1b7501df9..78d0a8f54d336 100644 --- a/doc/cephfs/troubleshooting.rst +++ b/doc/cephfs/troubleshooting.rst @@ -128,6 +128,11 @@ things to do: That prevents any clients from establishing new sessions with the MDS. +* **Dont tweak max_mds** Modifying the FS setting variable ``max_mds`` is + sometimes perceived as a good step during troubleshooting or recovery effort. + Instead, doing so might further destabilize the cluster. If ``max_mds`` must + be changed in such circumstances, run the command to change ``max_mds`` with + the confirmation flag (``--yes-i-really-mean-it``) Expediting MDS journal trim From a71c8e8d1186823cf5d01f23d7b922c5e2665aa5 Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 27 Aug 2024 13:50:49 +0530 Subject: [PATCH 036/148] PendingReleaseNotes: add a release note about confirm flag for max_mds Add a release note for the fact that users now need to pass the confirmation flag for modifying "max_mds" when cluster is unhealthy. Signed-off-by: Rishabh Dave --- PendingReleaseNotes | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 0185d6e54eaf2..c35924c6e8690 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -13,6 +13,14 @@ output is changed from 'STATUS' to 'STATE'. The state of a group snapshot that was shown as 'ok' is now shown as 'complete', which is more descriptive. +* CephFS: Modifying the FS setting variable "max_mds" when a cluster is + unhealthy now requires users to pass the confirmation flag + (--yes-i-really-mean-it). This has been added as a precaution to tell the + users that modifying "max_mds" may not help with troubleshooting or recovery + effort. Instead, it might further destabilize the cluster. + + + >=19.0.0 * cephx: key rotation is now possible using `ceph auth rotate`. 
Previously, From 3e3b7fa8ea22a2dd7f5d5c97d6e096a5e30585f4 Mon Sep 17 00:00:00 2001 From: Adam King Date: Tue, 1 Oct 2024 10:03:13 -0400 Subject: [PATCH 037/148] mgr/cephadm: add "allow_set_io_flusher_fail = true;" to ganesha conf This is necessary for ganesha 6.1 running in a container to start up without hitting a permission failure (without having to run the container as --privileged) and doesn't seem to cause any damage when read in by ganesha v5.9 (the current version in the main branch containers) Signed-off-by: Adam King --- src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 | 1 + src/pybind/mgr/cephadm/tests/test_services.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 index ded403169c976..03ff8a32ca292 100644 --- a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 @@ -4,6 +4,7 @@ NFS_CORE_PARAM { Enable_RQUOTA = false; Protocols = 4; NFS_Port = {{ port }}; + allow_set_io_flusher_fail = true; {% if bind_addr %} Bind_addr = {{ bind_addr }}; {% endif %} diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a9b7da624a0e6..538ca65371489 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -2710,6 +2710,7 @@ def fake_keys(): ' Enable_RQUOTA = false;\n' ' Protocols = 4;\n' ' NFS_Port = 2049;\n' + ' allow_set_io_flusher_fail = true;\n' ' HAProxy_Hosts = 192.168.122.111, 10.10.2.20, 192.168.122.222;\n' '}\n' '\n' From fd895dde9d69ae7be7a78d8db37a2f94ded27080 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Mon, 30 Sep 2024 15:30:25 +0200 Subject: [PATCH 038/148] cephadm: pull container images from quay.io Now that all required images are hosted and/or mirrored on quay.io we can move away from docker.io Fixes: https://tracker.ceph.com/issues/68323 Signed-off-by: Guillaume Abrioux --- src/cephadm/cephadmlib/constants.py | 8 ++--- src/cephadm/cephadmlib/data_utils.py | 12 +++---- src/cephadm/samples/custom_container.json | 2 +- src/cephadm/tests/build/test_cephadm_build.py | 4 +-- src/cephadm/tests/test_cephadm.py | 36 +++++++++---------- src/cephadm/tests/test_custom_container.py | 2 +- src/cephadm/tox.ini | 3 +- src/pybind/mgr/cephadm/module.py | 8 ++--- src/pybind/mgr/cephadm/tests/test_cephadm.py | 2 +- src/pybind/mgr/cephadm/tests/test_spec.py | 20 +++++------ src/pybind/mgr/cephadm/upgrade.py | 12 +++---- .../service-daemon-list.component.spec.ts | 12 +++---- .../orchestrator/tests/test_orchestrator.py | 2 +- src/pybind/mgr/tox.ini | 3 +- 14 files changed, 64 insertions(+), 62 deletions(-) diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py index d25eb1391e0c0..354c378239802 100644 --- a/src/cephadm/cephadmlib/constants.py +++ b/src/cephadm/cephadmlib/constants.py @@ -5,15 +5,15 @@ DEFAULT_IMAGE_IS_MAIN = True DEFAULT_IMAGE_RELEASE = 'squid' DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0' -DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0' -DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0' +DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0' +DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0' DEFAULT_GRAFANA_IMAGE = 
'quay.io/ceph/grafana:10.4.8' DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3' DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4' DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17' -DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1' +DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1' DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23' DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29' @@ -22,7 +22,7 @@ DEFAULT_SMBMETRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest' DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126' DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0' -DEFAULT_REGISTRY = 'docker.io' # normalize unqualified digests to this +DEFAULT_REGISTRY = 'quay.io' # normalize unqualified digests to this # ------------------------------------------------------------------------------ LATEST_STABLE_RELEASE = 'squid' diff --git a/src/cephadm/cephadmlib/data_utils.py b/src/cephadm/cephadmlib/data_utils.py index 2f4674752cc17..0ab8b38d2b518 100644 --- a/src/cephadm/cephadmlib/data_utils.py +++ b/src/cephadm/cephadmlib/data_utils.py @@ -165,17 +165,17 @@ def is_fsid(s): def normalize_image_digest(digest: str) -> str: """ Normal case: - >>> normalize_image_digest('ceph/ceph', 'docker.io') - 'docker.io/ceph/ceph' + >>> normalize_image_digest('ceph/ceph', 'quay.io') + 'quay.io/ceph/ceph' No change: - >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io') + >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io') 'quay.ceph.io/ceph/ceph' - >>> normalize_image_digest('docker.io/ubuntu', 'docker.io') - 'docker.io/ubuntu' + >>> normalize_image_digest('quay.io/ubuntu', 'quay.io') + 'quay.io/ubuntu' - >>> normalize_image_digest('localhost/ceph', 'docker.io') + >>> normalize_image_digest('localhost/ceph', 'quay.io') 'localhost/ceph' """ known_shortnames = [ diff --git a/src/cephadm/samples/custom_container.json b/src/cephadm/samples/custom_container.json index 194a44d2abbf1..210cf1e3e552a 100644 --- a/src/cephadm/samples/custom_container.json +++ b/src/cephadm/samples/custom_container.json @@ -1,5 +1,5 @@ { - "image": "docker.io/prom/alertmanager:v0.20.0", + "image": "quay.io/prometheus/alertmanager:v0.20.0", "ports": [9093, 9094], "args": [ "-p", "9093:9093", diff --git a/src/cephadm/tests/build/test_cephadm_build.py b/src/cephadm/tests/build/test_cephadm_build.py index 1465c2c5efea7..c2995a76d4b15 100644 --- a/src/cephadm/tests/build/test_cephadm_build.py +++ b/src/cephadm/tests/build/test_cephadm_build.py @@ -34,12 +34,12 @@ }, 'ubuntu-20.04': { 'name': 'cephadm-build-test:ubuntu-20-04-py3', - 'base_image': 'docker.io/library/ubuntu:20.04', + 'base_image': 'quay.io/library/ubuntu:20.04', 'script': 'apt update && apt install -y python3-venv', }, 'ubuntu-22.04': { 'name': 'cephadm-build-test:ubuntu-22-04-py3', - 'base_image': 'docker.io/library/ubuntu:22.04', + 'base_image': 'quay.io/library/ubuntu:22.04', 'script': 'apt update && apt install -y python3-venv', }, } diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py index 928982de70b6f..f27b9bcd3625a 100644 --- a/src/cephadm/tests/test_cephadm.py +++ b/src/cephadm/tests/test_cephadm.py @@ -533,12 +533,12 @@ def test_registry_login(self, _logger, _get_parm, _call_throws): def test_get_image_info_from_inspect(self): # podman - out = 
"""204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]""" + out = """204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]""" r = _cephadm.get_image_info_from_inspect(out, 'registry/ceph/ceph:latest') print(r) assert r == { 'image_id': '204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1', - 'repo_digests': ['docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992'] + 'repo_digests': ['quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992'] } # docker @@ -550,13 +550,13 @@ def test_get_image_info_from_inspect(self): } # multiple digests (podman) - out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]""" + out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]""" r = _cephadm.get_image_info_from_inspect(out, 'registry/prom/prometheus:latest') assert r == { 'image_id': 'e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42', 'repo_digests': [ - 'docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4', - 'docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a', + 'quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4', + 'quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a', ] } @@ -604,7 +604,7 @@ def test_infer_local_ceph_image(self, _logger, _listdir): '') out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', return_value=cinfo): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) @@ -613,7 +613,7 @@ def test_infer_local_ceph_image(self, _logger, _listdir): # make sure first valid image is used when no container_info is found out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + 
quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', return_value=None): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) @@ -621,12 +621,12 @@ def test_infer_local_ceph_image(self, _logger, _listdir): # make sure images without digest are discarded (no container_info is found) out = '''quay.ceph.io/ceph-ci/ceph@||| - docker.io/ceph/ceph@||| - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + quay.io/ceph/ceph@||| + quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', return_value=None): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) - assert image == 'docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508' + assert image == 'quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508' @@ -2409,7 +2409,7 @@ class TestSNMPGateway: def test_unit_run_V2c(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V2c_config) ctx.fsid = fsid @@ -2434,11 +2434,11 @@ def test_unit_run_V2c(self, cephadm_fs): ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl') + assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl') def test_unit_run_V3_noPriv(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V3_no_priv_config) ctx.fsid = fsid @@ -2463,11 +2463,11 @@ def test_unit_run_V3_noPriv(self, cephadm_fs): ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000') + assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled 
--snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000') def test_unit_run_V3_Priv(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V3_priv_config) ctx.fsid = fsid @@ -2492,11 +2492,11 @@ def test_unit_run_V3_Priv(self, cephadm_fs): ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES') + assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES') def test_unit_run_no_dest(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.no_destination_config) ctx.fsid = fsid @@ -2512,7 +2512,7 @@ def test_unit_run_no_dest(self, cephadm_fs): def test_unit_run_bad_version(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.bad_version_config) ctx.fsid = fsid diff --git a/src/cephadm/tests/test_custom_container.py b/src/cephadm/tests/test_custom_container.py index c185b0908df6c..197ed38dca3be 100644 --- a/src/cephadm/tests/test_custom_container.py +++ b/src/cephadm/tests/test_custom_container.py @@ -47,7 +47,7 @@ def setUp(self): ] ] }, - image='docker.io/library/hello-world:latest' + image='quay.io/hello-world/hello-world:latest' ) def test_entrypoint(self): diff --git a/src/cephadm/tox.ini b/src/cephadm/tox.ini index 70e9a411238fb..20608c1681ce1 100644 --- a/src/cephadm/tox.ini +++ b/src/cephadm/tox.ini @@ -49,7 +49,8 @@ deps = flake8-quotes commands = flake8 --config=tox.ini {posargs:cephadm.py cephadmlib} - bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 11' + bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 1' + bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "quay.io" | wc -l) == 25' # Downstream distributions may choose to alter this "docker.io" number, # to make sure no new references to docker.io are creeping in unnoticed. 
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 5216c489064c9..178f9cb7ce803 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -135,13 +135,13 @@ def os_exit_noop(status: int) -> None: DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0' DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17' -DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0' -DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0' +DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0' +DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0' DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8' DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3' DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4' -DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1' +DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1' DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23' DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29' @@ -446,7 +446,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, Option( 'default_registry', type='str', - default='docker.io', + default='quay.io', desc='Search-registry to which we should normalize unqualified image names. ' 'This is not the default registry', ), diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 5a485f98be390..975c125225dc8 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -2040,7 +2040,7 @@ def test_blink_device_light_custom_per_host(self, _run_cephadm, cephadm_module): ), CephadmOrchestrator.apply_iscsi), (CustomContainerSpec( service_id='hello-world', - image='docker.io/library/hello-world:latest', + image='quay.io/hello-world/hello-world:latest', uid=65534, gid=65534, dirs=['foo/bar'], diff --git a/src/pybind/mgr/cephadm/tests/test_spec.py b/src/pybind/mgr/cephadm/tests/test_spec.py index 78a2d73118fe7..42e590945cd96 100644 --- a/src/pybind/mgr/cephadm/tests/test_spec.py +++ b/src/pybind/mgr/cephadm/tests/test_spec.py @@ -130,7 +130,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "d94d7969094d", "container_image_id": "0881eb8f169f5556a292b4e2c01d683172b12830a62a9225a98a8e206bb734f0", - "container_image_name": "docker.io/prom/alertmanager:latest", + "container_image_name": "quay.io/prometheus/alertmanager:latest", "daemon_id": "ceph-001", "daemon_type": "alertmanager", "version": "0.20.0", @@ -145,7 +145,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "c4b036202241", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001", "daemon_type": "crash", "version": "15.2.0", @@ -160,7 +160,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "5b7b94b48f31", "container_image_id": "87a51ecf0b1c9a7b187b21c1b071425dafea0d765a96d5bc371c791169b3d7f4", - "container_image_name": "docker.io/ceph/ceph-grafana:latest", + "container_image_name": "quay.io/ceph/ceph-grafana:latest", "daemon_id": "ceph-001", "daemon_type": "grafana", "version": "6.6.2", @@ -175,7 
+175,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "9ca007280456", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001.gkjwqp", "daemon_type": "mgr", "version": "15.2.0", @@ -190,7 +190,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "3d1ba9a2b697", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001", "daemon_type": "mon", "version": "15.2.0", @@ -205,7 +205,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "36d026c68ba1", "container_image_id": "e5a616e4b9cf68dfcad7782b78e118be4310022e874d52da85c55923fb615f87", - "container_image_name": "docker.io/prom/node-exporter:latest", + "container_image_name": "quay.io/prometheus/node-exporter:latest", "daemon_id": "ceph-001", "daemon_type": "node-exporter", "version": "0.18.1", @@ -220,7 +220,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "faf76193cbfe", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "0", "daemon_type": "osd", "version": "15.2.0", @@ -235,7 +235,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "f82505bae0f1", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "1", "daemon_type": "osd", "version": "15.2.0", @@ -250,7 +250,7 @@ def convert_to_old_style_json(j): "hostname": "ceph-001", "container_id": "2708d84cd484", "container_image_id": "358a0d2395fe711bb8258e8fb4b2d7865c0a9a6463969bcd1452ee8869ea6653", - "container_image_name": "docker.io/prom/prometheus:latest", + "container_image_name": "quay.io/prom/prometheus:latest", "daemon_id": "ceph-001", "daemon_type": "prometheus", "version": "2.17.1", @@ -569,7 +569,7 @@ def convert_to_old_style_json(j): CustomContainerSpec( service_type='container', service_id='hello-world', - image='docker.io/library/hello-world:latest', + image='quay.io/hello-world/hello-world:latest', ), DaemonDescription( daemon_type='container', diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index d8ffab2da5187..ed3d26807e5ce 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -29,17 +29,17 @@ def normalize_image_digest(digest: str, default_registry: str) -> str: """ Normal case: - >>> normalize_image_digest('ceph/ceph', 'docker.io') - 'docker.io/ceph/ceph' + >>> normalize_image_digest('ceph/ceph', 'quay.io') + 'quay.io/ceph/ceph' No change: - >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io') + >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io') 'quay.ceph.io/ceph/ceph' - >>> normalize_image_digest('docker.io/ubuntu', 'docker.io') - 'docker.io/ubuntu' + >>> normalize_image_digest('quay.io/centos', 'quay.io') + 'quay.io/centos' - >>> normalize_image_digest('localhost/ceph', 'docker.io') + >>> normalize_image_digest('localhost/ceph', 'quay.io') 'localhost/ceph' """ known_shortnames = [ diff --git 
a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts index d3ea8c018f66a..367418c752e07 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts @@ -27,7 +27,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: '003c10beafc8c27b635bcdfed1ed832e4c1005be89bb1bb05ad4cc6c2b98e41b', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '3', daemon_type: 'osd', daemon_name: 'osd.3', @@ -47,7 +47,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: 'baeec41a01374b3ed41016d542d19aef4a70d69c27274f271e26381a0cc58e7a', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '4', daemon_type: 'osd', daemon_name: 'osd.4', @@ -63,7 +63,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: '8483de277e365bea4365cee9e1f26606be85c471e4da5d51f57e4b85a42c616e', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '5', daemon_type: 'osd', daemon_name: 'osd.5', @@ -79,7 +79,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'mon0', container_id: '6ca0574f47e300a6979eaf4e7c283a8c4325c2235ae60358482fc4cd58844a21', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: 'a', daemon_name: 'mon.a', daemon_type: 'mon', @@ -99,7 +99,7 @@ describe('ServiceDaemonListComponent', () => { service_name: 'osd', status: { container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', size: 3, running: 3, last_refresh: '2020-02-25T04:33:26.465699' @@ -111,7 +111,7 @@ describe('ServiceDaemonListComponent', () => { service_name: 'crash', status: { container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', size: 1, running: 1, last_refresh: '2020-02-25T04:33:26.465766' diff --git a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py index 726a7ac7937c5..3247b06a3993b 100644 --- a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py +++ b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py @@ -102,7 +102,7 @@ def test_yaml(): host_pattern: '*' status: container_image_id: 
74803e884bea289d2d2d3ebdf6d37cd560499e955595695b1390a89800f4e37a - container_image_name: docker.io/ceph/daemon-base:latest-master-devel + container_image_name: quay.io/ceph/daemon-base:latest-main-devel created: '2020-06-10T10:37:31.051288Z' last_refresh: '2020-06-10T10:57:40.715637Z' running: 1 diff --git a/src/pybind/mgr/tox.ini b/src/pybind/mgr/tox.ini index a8a2d39d01a73..f39ececa93dd5 100644 --- a/src/pybind/mgr/tox.ini +++ b/src/pybind/mgr/tox.ini @@ -160,7 +160,8 @@ modules = commands = flake8 --config=tox.ini {posargs} \ {posargs:{[testenv:flake8]modules}} - bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 13' + bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 3' + bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "quay.io" | wc -l) == 26' [testenv:jinjalint] deps = From 69baa6de2579f0e3ce8298e14b970b8c68deae9c Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 27 Sep 2024 21:21:34 -0400 Subject: [PATCH 039/148] common: assert debug mutex lock is not held if !recursive There's appropriate checks for unlock and post-lock but nothing to stop the undefined behavior of a double-lock on a non-recursive mutex. Signed-off-by: Patrick Donnelly --- src/common/mutex_debug.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/common/mutex_debug.h b/src/common/mutex_debug.h index c1a4ff2a43501..d56d0ebee9987 100644 --- a/src/common/mutex_debug.h +++ b/src/common/mutex_debug.h @@ -169,20 +169,16 @@ class mutex_debug_impl : public mutex_debugging_base } bool try_lock(bool no_lockdep = false) { - bool locked = try_lock_impl(); - if (locked) { - if (enable_lockdep(no_lockdep)) - _locked(); - _post_lock(); - } - return locked; + ceph_assert(recursive || !is_locked_by_me()); + return _try_lock(no_lockdep); } void lock(bool no_lockdep = false) { + ceph_assert(recursive || !is_locked_by_me()); if (enable_lockdep(no_lockdep)) _will_lock(recursive); - if (try_lock(no_lockdep)) + if (_try_lock(no_lockdep)) return; lock_impl(); @@ -198,6 +194,16 @@ class mutex_debug_impl : public mutex_debugging_base unlock_impl(); } +private: + bool _try_lock(bool no_lockdep) { + bool locked = try_lock_impl(); + if (locked) { + if (enable_lockdep(no_lockdep)) + _locked(); + _post_lock(); + } + return locked; + } }; From bec702dad7a1075c482b4e89c2e2f745e4058123 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 27 Sep 2024 21:24:43 -0400 Subject: [PATCH 040/148] common,osdc: remove obsolete ceph::mutex_debugging Now that we confirm a lock is not held in mutex_debug::lock. 
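The contract these two commits rely on can be summarised with a small Python analogue, purely for illustration (`DebugMutex` does not exist anywhere in the tree): a non-recursive debug mutex now asserts if the thread that already holds it tries to lock it again, so callers no longer need their own "not locked by me" assertions.

```python
import threading

class DebugMutex:
    """Illustrative Python analogue of the new ceph::mutex_debug contract."""
    def __init__(self, recursive=False):
        self._recursive = recursive
        self._lock = threading.RLock() if recursive else threading.Lock()
        self._owner = None
        self._depth = 0

    def is_locked_by_me(self):
        return self._owner == threading.get_ident()

    def lock(self):
        # analogue of ceph_assert(recursive || !is_locked_by_me())
        assert self._recursive or not self.is_locked_by_me(), \
            "double lock of a non-recursive mutex by the same thread"
        self._lock.acquire()
        self._owner = threading.get_ident()
        self._depth += 1

    def unlock(self):
        assert self.is_locked_by_me()
        self._depth -= 1
        if self._depth == 0:
            self._owner = None
        self._lock.release()
```

With the assertion living inside lock() itself, per-call-site checks such as the ones removed from Journaler and config_proxy become redundant.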
Signed-off-by: Patrick Donnelly --- src/common/ceph_mutex.h | 5 ----- src/common/config_proxy.h | 1 - src/osdc/Journaler.h | 14 -------------- 3 files changed, 20 deletions(-) diff --git a/src/common/ceph_mutex.h b/src/common/ceph_mutex.h index 059d81f2ac39b..6ed8c56d5dad3 100644 --- a/src/common/ceph_mutex.h +++ b/src/common/ceph_mutex.h @@ -83,7 +83,6 @@ namespace ceph { return {}; } - static constexpr bool mutex_debugging = false; #define ceph_mutex_is_locked(m) true #define ceph_mutex_is_locked_by_me(m) true } @@ -131,8 +130,6 @@ namespace ceph { return {std::forward(args)...}; } - static constexpr bool mutex_debugging = true; - // debug methods #define ceph_mutex_is_locked(m) ((m).is_locked()) #define ceph_mutex_is_not_locked(m) (!(m).is_locked()) @@ -186,8 +183,6 @@ namespace ceph { return {}; } - static constexpr bool mutex_debugging = false; - // debug methods. Note that these can blindly return true // because any code that does anything other than assert these // are true is broken. diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h index b9b47d9cef472..12a273b8c84f7 100644 --- a/src/common/config_proxy.h +++ b/src/common/config_proxy.h @@ -31,7 +31,6 @@ class ConfigProxy { using rev_obs_map_t = ObsMgr::rev_obs_map; void _call_observers(rev_obs_map_t& rev_obs) { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); for (auto& [obs, keys] : rev_obs) { (*obs)->handle_conf_change(*this, keys); } diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h index 4a574ed66d94e..d15862c08ba52 100644 --- a/src/osdc/Journaler.h +++ b/src/osdc/Journaler.h @@ -529,43 +529,35 @@ class Journaler { // =================== Header get_last_committed() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return last_committed; } Header get_last_written() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return last_written; } uint64_t get_layout_period() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return layout.get_period(); } file_layout_t get_layout() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return layout; } bool is_active() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return state == STATE_ACTIVE; } bool is_stopping() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return state == STATE_STOPPING; } int get_error() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return error; } bool is_readonly() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return readonly; } @@ -573,32 +565,26 @@ class Journaler { bool _is_readable(); bool try_read_entry(bufferlist& bl); uint64_t get_write_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return write_pos; } uint64_t get_write_safe_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return safe_pos; } uint64_t get_read_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return read_pos; } uint64_t get_expire_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); 
lock_guard l(lock); return expire_pos; } uint64_t get_trimmed_pos() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return trimmed_pos; } size_t get_journal_envelope_size() const { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return journal_stream.get_envelope_size(); } From 1213df95915a792be66126acec1e08aa5bf3c795 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 30 Sep 2024 09:51:54 -0400 Subject: [PATCH 041/148] test/common: fix invalid vim mode Signed-off-by: Patrick Donnelly --- src/test/common/test_mutex_debug.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc index 977dfe738a921..565dcd64de9c0 100644 --- a/src/test/common/test_mutex_debug.cc +++ b/src/test/common/test_mutex_debug.cc @@ -1,5 +1,5 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 &smarttab +// vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * From 84ebb30ea94bea60a5ff93f1a6b334000455da82 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 30 Sep 2024 09:53:09 -0400 Subject: [PATCH 042/148] common/test: do not test exception raised from recursive lock The C++ standard does not require that implementations raise std::system_error when double-locking a non-recursive lock. Our implementation of debug_mutex now catches this error with a ceph_assert so it cannot be caught. Signed-off-by: Patrick Donnelly --- src/test/common/test_mutex_debug.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc index 565dcd64de9c0..29eb8076859c9 100644 --- a/src/test/common/test_mutex_debug.cc +++ b/src/test/common/test_mutex_debug.cc @@ -65,7 +65,6 @@ TEST(MutexDebug, NotRecursive) { ASSERT_TRUE(m.is_locked()); ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); - ASSERT_THROW(m.lock(), std::system_error); ASSERT_TRUE(m.is_locked()); ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); From 8d2d54f6c26295e8c5ba5e5fe9ca5e14f8ba7786 Mon Sep 17 00:00:00 2001 From: neeraj pratap singh Date: Thu, 26 Sep 2024 14:25:00 +0530 Subject: [PATCH 043/148] cephfs-shell: fixing the cephfs-shell test failures cephfs-shell is failing in Ubuntu22.04, because it is behaving weirdly with cmd2's version. It is taking cmd2 version as 0.0.0 instead of the correct version. Fixes: https://tracker.ceph.com/issues/63700 Signed-off-by: Neeraj Pratap Singh --- src/tools/cephfs/shell/cephfs-shell | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell index 9449007a80b97..3f19a637e6864 100755 --- a/src/tools/cephfs/shell/cephfs-shell +++ b/src/tools/cephfs/shell/cephfs-shell @@ -15,14 +15,22 @@ import re import shlex import stat import errno +import distro from cmd2 import Cmd from cmd2 import __version__ as cmd2_version from packaging.version import Version +# DFLAG is used to override the checks done by cephfs-shell +# for cmd2 versions due to weird behaviour of Ubuntu22.04 with +# cmd2's version i.e. it always gets the version of cmd2 as +# "0.0.0" instead of the actual cmd2 version. 
+DFLAG = False +if distro.name() == "Ubuntu" and distro.version() == "22.04": + DFLAG = True # XXX: In cmd2 versions < 1.0.1, we'll get SystemExit(2) instead of # Cmd2ArgparseError -if Version(cmd2_version) >= Version("1.0.1"): +if Version(cmd2_version) >= Version("1.0.1") or DFLAG is True: from cmd2.exceptions import Cmd2ArgparseError else: # HACK: so that we don't have check for version everywhere @@ -1700,7 +1708,7 @@ def read_shell_conf(shell, shell_conf_file): sec = 'cephfs-shell' opts = [] - if Version(cmd2_version) >= Version("0.10.0"): + if Version(cmd2_version) >= Version("0.10.0") or DFLAG is True: for attr in shell.settables.keys(): opts.append(attr) else: @@ -1768,7 +1776,7 @@ def manage_args(): args.exe_and_quit = False # Execute and quit, don't launch the shell. if args.batch: - if Version(cmd2_version) <= Version("0.9.13"): + if Version(cmd2_version) <= Version("0.9.13") and DFLAG is not True: args.commands = ['load ' + args.batch, ',quit'] else: args.commands = ['run_script ' + args.batch, ',quit'] @@ -1813,7 +1821,7 @@ def execute_cmds_and_quit(args): # value to indicate whether the execution of the commands should stop, but # since 0.9.7 it returns the return value of do_* methods only if it's # not None. When it is None it returns False instead of None. - if Version(cmd2_version) <= Version("0.9.6"): + if Version(cmd2_version) <= Version("0.9.6") and DFLAG is not True: stop_exec_val = None else: stop_exec_val = False From a48080af3956ae84fb2b3e5da2db1ca16c308c2f Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 2 Oct 2024 10:48:34 -0400 Subject: [PATCH 044/148] test/common: add death test for double !recursive lock Signed-off-by: Patrick Donnelly --- src/test/common/test_mutex_debug.cc | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc index 29eb8076859c9..cee4b427770ae 100644 --- a/src/test/common/test_mutex_debug.cc +++ b/src/test/common/test_mutex_debug.cc @@ -57,20 +57,13 @@ TEST(MutexDebug, Lock) { test_lock(); } -TEST(MutexDebug, NotRecursive) { +TEST(MutexDebugDeathTest, NotRecursive) { ceph::mutex_debug m("foo"); - auto ttl = &test_try_lock; - - ASSERT_NO_THROW(m.lock()); - ASSERT_TRUE(m.is_locked()); - ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); - + // avoid assert during test cleanup where the mutex is locked and cannot be + // pthread_mutex_destroy'd + std::unique_lock locker{m}; ASSERT_TRUE(m.is_locked()); - ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); - - ASSERT_NO_THROW(m.unlock()); - ASSERT_FALSE(m.is_locked()); - ASSERT_TRUE(std::async(std::launch::async, ttl, &m).get()); + ASSERT_DEATH(m.lock(), "FAILED ceph_assert(recursive || !is_locked_by_me())"); } TEST(MutexRecursiveDebug, Lock) { From 5a9c1c06e8dffaadebbe44ed8c329bd5bc34537e Mon Sep 17 00:00:00 2001 From: Anoop C S Date: Fri, 4 Oct 2024 16:04:15 +0530 Subject: [PATCH 045/148] cephadm/smb: Add a provision to specify ctdb log level sambacc already accepts 'log_level' as a field inside ctdb config stub to explicitly set the log level for ctdbd. Make use of this to provide a means when non default log levels are desired in future for ctdb. 
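With the new field set, the stub JSON handed to sambacc simply gains a 'log_level' entry. Roughly, with every value other than the log level abbreviated or made up for the example (the surrounding keys follow _write_ctdb_stub_config):

```python
import json

ctdb_log_level = 'DEBUG'            # e.g. as it might come from the service spec

stub_config = {
    'samba-container-config': 'v0',
    'ctdb': {
        'recovery_lock': '!<reclock command>',     # placeholder
        'cluster_meta_uri': '<cluster meta uri>',  # placeholder
        'nodes_cmd': '<nodes command>',            # placeholder
    },
}
if ctdb_log_level:
    # only emitted when a non-default level is requested
    stub_config['ctdb']['log_level'] = ctdb_log_level

print(json.dumps(stub_config, indent=2))
```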
Signed-off-by: Anoop C S --- src/cephadm/cephadmlib/daemons/smb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py index 74cb13f4ab022..5e400481a857a 100644 --- a/src/cephadm/cephadmlib/daemons/smb.py +++ b/src/cephadm/cephadmlib/daemons/smb.py @@ -72,6 +72,7 @@ class Config: instance_id: str source_config: str samba_debug_level: int + ctdb_log_level: str debug_delay: int domain_member: bool clustered: bool @@ -98,6 +99,7 @@ def __init__( domain_member: bool, clustered: bool, samba_debug_level: int = 0, + ctdb_log_level: str = '', debug_delay: int = 0, join_sources: Optional[List[str]] = None, user_sources: Optional[List[str]] = None, @@ -119,6 +121,7 @@ def __init__( self.domain_member = domain_member self.clustered = clustered self.samba_debug_level = samba_debug_level + self.ctdb_log_level = ctdb_log_level self.debug_delay = debug_delay self.join_sources = join_sources or [] self.user_sources = user_sources or [] @@ -756,7 +759,7 @@ def prepare_data_dir(self, data_dir: str, uid: int, gid: int) -> None: def _write_ctdb_stub_config(self, path: pathlib.Path) -> None: reclock_cmd = ' '.join(_MUTEX_SUBCMD + [self._cfg.cluster_lock_uri]) nodes_cmd = ' '.join(_NODES_SUBCMD) - stub_config = { + stub_config: Dict[str, Any] = { 'samba-container-config': 'v0', 'ctdb': { # recovery_lock is passed directly to ctdb: needs '!' prefix @@ -768,6 +771,8 @@ def _write_ctdb_stub_config(self, path: pathlib.Path) -> None: ), }, } + if self._cfg.ctdb_log_level: + stub_config['ctdb']['log_level'] = self._cfg.ctdb_log_level with file_utils.write_new(path) as fh: json.dump(stub_config, fh) From 6d8f61015f29ef3bfd737d78a0b8734171574c98 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Thu, 26 Sep 2024 21:23:21 +0530 Subject: [PATCH 046/148] qa: Add data read/write test for nfs-ganesha Fixes: https://tracker.ceph.com/issues/68146 Signed-off-by: Kotresh HR --- qa/tasks/cephfs/test_nfs.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py index 932d504d47f3e..19076ea44b3be 100644 --- a/qa/tasks/cephfs/test_nfs.py +++ b/qa/tasks/cephfs/test_nfs.py @@ -8,6 +8,7 @@ from tasks.mgr.mgr_test_case import MgrTestCase from teuthology import contextutil from teuthology.exceptions import CommandFailedError +from teuthology.orchestra.run import Raw log = logging.getLogger(__name__) @@ -319,7 +320,7 @@ def _get_port_ip_info(self): else: log.warning(f'{e}, retrying') - def _test_mnt(self, pseudo_path, port, ip, check=True): + def _test_mnt(self, pseudo_path, port, ip, check=True, datarw=False): ''' Test mounting of created exports :param pseudo_path: It is the pseudo root name @@ -347,12 +348,27 @@ def _test_mnt(self, pseudo_path, port, ip, check=True): self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt']) try: + # Clean up volumes directory created by subvolume create by some tests + self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/volumes']) self.ctx.cluster.run(args=['touch', '/mnt/test']) out_mnt = self._sys_cmd(['ls', '/mnt']) self.assertEqual(out_mnt, b'test\n') + if datarw: + self.ctx.cluster.run(args=['echo', 'test data', Raw('|'), 'tee', '/mnt/test1']) + out_test1 = self._sys_cmd(['cat', '/mnt/test1']) + self.assertEqual(out_test1, b'test data\n') finally: self.ctx.cluster.run(args=['sudo', 'umount', '/mnt']) + def _test_data_read_write(self, pseudo_path, port, ip): + ''' + Check if read/write works fine + 
''' + try: + self._test_mnt(pseudo_path, port, ip, True, True) + except CommandFailedError as e: + self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}") + def _write_to_read_only_export(self, pseudo_path, port, ip): ''' Check if write to read only export fails @@ -599,6 +615,18 @@ def test_write_to_read_only_export(self): self._write_to_read_only_export(self.pseudo_path, port, ip) self._test_delete_cluster() + def test_data_read_write(self): + ''' + Test date read and write on export. + ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path]) + port, ip = self._get_port_ip_info() + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed') + self._test_data_read_write(self.pseudo_path, port, ip) + self._test_delete_cluster() + def test_cluster_info(self): ''' Test cluster info outputs correct ip and hostname From 59b996f0ed022f1bafd77317467d2e18ff0fa710 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Sat, 28 Sep 2024 23:34:50 +0530 Subject: [PATCH 047/148] qa: Add libcephfs client test with objectcacher disabled Fixes: https://tracker.ceph.com/issues/68146 Signed-off-by: Kotresh HR --- qa/suites/fs/libcephfs/tasks/client.yaml | 1 + qa/workunits/client/test_oc_disabled.sh | 5 +++++ 2 files changed, 6 insertions(+) create mode 100755 qa/workunits/client/test_oc_disabled.sh diff --git a/qa/suites/fs/libcephfs/tasks/client.yaml b/qa/suites/fs/libcephfs/tasks/client.yaml index da84137322069..42ca9336c8e7d 100644 --- a/qa/suites/fs/libcephfs/tasks/client.yaml +++ b/qa/suites/fs/libcephfs/tasks/client.yaml @@ -12,3 +12,4 @@ tasks: clients: client.0: - client/test.sh + - client/test_oc_disabled.sh diff --git a/qa/workunits/client/test_oc_disabled.sh b/qa/workunits/client/test_oc_disabled.sh new file mode 100755 index 0000000000000..88552aa50bdc5 --- /dev/null +++ b/qa/workunits/client/test_oc_disabled.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +set -ex + +ceph_test_client --client_oc=false From b5af1c1ffe8786a96c866edcec69c78030e9f2e4 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Sat, 28 Sep 2024 23:19:30 +0530 Subject: [PATCH 048/148] test/client: Fix aio nonblocking test The same bufferlist is used without cleaning for multiple calls. The test 'LlreadvLlwritev' used to fail because of it. Fixed the same. 
Fixes: https://tracker.ceph.com/issues/68146 Signed-off-by: Kotresh HR --- src/test/client/nonblocking.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/test/client/nonblocking.cc b/src/test/client/nonblocking.cc index d4aecb10ffcb4..93bcfabd3fcf1 100644 --- a/src/test/client/nonblocking.cc +++ b/src/test/client/nonblocking.cc @@ -111,6 +111,8 @@ TEST_F(TestClient, LlreadvLlwritev) { writefinish.reset(new C_SaferCond("test-nonblocking-writefinish")); readfinish.reset(new C_SaferCond("test-nonblocking-readfinish")); ssize_t nwritten_a = iov_out_a[0].iov_len + iov_out_a[1].iov_len; + // reset bufferlist + bl.clear(); rc = client->ll_preadv_pwritev(fh, iov_out_a, 2, 100, true, writefinish.get(), nullptr); ASSERT_EQ(0, rc); @@ -130,6 +132,8 @@ TEST_F(TestClient, LlreadvLlwritev) { writefinish.reset(new C_SaferCond("test-nonblocking-writefinish")); readfinish.reset(new C_SaferCond("test-nonblocking-readfinish")); ssize_t nwritten_b = iov_out_b[0].iov_len + iov_out_b[1].iov_len; + // reset bufferlist + bl.clear(); rc = client->ll_preadv_pwritev(fh, iov_out_b, 2, 1000, true, writefinish.get(), nullptr, true, false); ASSERT_EQ(0, rc); From 3ebe97484d26cf5d9cd78636ee4718c075a2897b Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Thu, 26 Sep 2024 10:50:32 +0530 Subject: [PATCH 049/148] client: Fix libcephfs aio metadata corruption. Problem: With cephfs nfs-ganesha, there were following asserts hit while doing write on a file. 1. FAILED ceph_assert((bool)_front == (bool)_size) 2. FAILED ceph_assert(cap_refs[c] > 0) Cause: In aio path, the client_lock was not being held in the internal callback after the io is done where it's expected to be taken leading to corruption. Fix: Take client_lock in the callback Fixes: https://tracker.ceph.com/issues/68146 Signed-off-by: Kotresh HR --- src/client/Client.cc | 19 +++++++++++++++++-- src/client/Client.h | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index e208cf7667577..1bc67ce38bb87 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -11399,10 +11399,18 @@ int64_t Client::_write_success(Fh *f, utime_t start, uint64_t fpos, return r; } +void Client::C_Lock_Client_Finisher::finish(int r) +{ + std::scoped_lock lock(clnt->client_lock); + onfinish->complete(r); +} + void Client::C_Write_Finisher::finish_io(int r) { bool fini; + ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock)); + clnt->put_cap_ref(in, CEPH_CAP_FILE_BUFFER); if (r >= 0) { @@ -11438,6 +11446,8 @@ void Client::C_Write_Finisher::finish_fsync(int r) bool fini; client_t const whoami = clnt->whoami; // For the benefit of ldout prefix + ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock)); + ldout(clnt->cct, 3) << "finish_fsync r = " << r << dendl; fsync_finished = true; @@ -11598,6 +11608,7 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, std::unique_ptr iofinish = nullptr; std::unique_ptr cwf = nullptr; + std::unique_ptr filer_iofinish = nullptr; if (in->inline_version < CEPH_INLINE_NONE) { if (endoff > cct->_conf->client_max_inline_size || @@ -11709,7 +11720,10 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, if (onfinish == nullptr) { // We need a safer condition to wait on. 
cond_iofinish = new C_SaferCond(); - iofinish.reset(cond_iofinish); + filer_iofinish.reset(cond_iofinish); + } else { + //Register a wrapper callback for the C_Write_Finisher which takes 'client_lock' + filer_iofinish.reset(new C_Lock_Client_Finisher(this, iofinish.get())); } get_cap_ref(in, CEPH_CAP_FILE_BUFFER); @@ -11717,11 +11731,12 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(), offset, size, bl, ceph::real_clock::now(), 0, in->truncate_size, in->truncate_seq, - iofinish.get()); + filer_iofinish.get()); if (onfinish) { // handle non-blocking caller (onfinish != nullptr), we can now safely // release all the managed pointers + filer_iofinish.release(); iofinish.release(); onuninline.release(); cwf.release(); diff --git a/src/client/Client.h b/src/client/Client.h index 5a1e69394d02a..f8c39e2fdd6ab 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -1409,6 +1409,21 @@ class Client : public Dispatcher, public md_config_obs_t { void finish(int r) override; }; + // A wrapper callback which takes the 'client_lock' and finishes the context. + // One of the usecase is the filer->write_trunc which doesn't hold client_lock + // in the call back passed. So, use this wrapper in such cases. + class C_Lock_Client_Finisher : public Context { + public: + C_Lock_Client_Finisher(Client *clnt, Context *onfinish) + : clnt(clnt), onfinish(onfinish) {} + + private: + Client *clnt; + Context *onfinish; + + void finish(int r) override; + }; + class C_Write_Finisher : public Context { public: void finish_io(int r); From 10c8330f20cd2e93ce036d0ea2c38552d71b62c6 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Mon, 30 Sep 2024 12:45:04 +0530 Subject: [PATCH 050/148] client: Fix caps_ref[c]<0 assert When libcephfs aio tests (src/test/client) are run with objectcacher disabled (ceph_test_client --client_oc=false), the TestClient.LlreadvLlwritev fails and core dumps. The client hits the assert 'caps_ref[c]<0'. This patch fixes the same. There is no need to give out cap_ref and take it again between multiple read because of short reads. In some cases, the get_caps used to fail in C_Read_Sync_NonBlocking::finish causing cap_ref to go negative when put_cap_ref is done at last in C_Read_Finish::finish_io Fixes: https://tracker.ceph.com/issues/68308 Signed-off-by: Kotresh HR --- src/client/Client.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 1bc67ce38bb87..e73f821438b3a 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -10798,7 +10798,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r) goto success; } - clnt->put_cap_ref(in, CEPH_CAP_FILE_RD); // reverify size { r = clnt->_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms); @@ -10810,14 +10809,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r) if ((uint64_t)pos >= in->size) goto success; - { - int have_caps2 = 0; - r = clnt->get_caps(f, CEPH_CAP_FILE_RD, have_caps, &have_caps2, -1); - if (r < 0) { - goto error; - } - } - wanted = left; retry(); clnt->client_lock.unlock(); From 942474c2f5b4c696364f3b7411ae7d96444edfa8 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Sat, 28 Sep 2024 01:18:23 +0530 Subject: [PATCH 051/148] client: Fix aio zerobyte file read The following test fails when run with objectcacher disabled. 
TestClient.LlreadvLlwritevZeroBytes Failure - nonblocking.cc ceph/src/osdc/Striper.cc: 186: FAILED ceph_assert(len > 0) Traceback: ceph version Development (no_version) squid (dev) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x125) [0x7fc0a340aafe] 2: (ceph::register_assert_context(ceph::common::CephContext*)+0) [0x7fc0a340ad20] 3: (Striper::file_to_extents(ceph::common::CephContext*, file_layout_t const*, ...)+0x184) [0x562727e13ab4] 4: (Striper::file_to_extents(ceph::common::CephContext*, char const*, ...)+0x97) [0x562727e145d1] 5: (Striper::file_to_extents(ceph::common::CephContext*, inodeno_t, ...)+0x75) [0x562727d29520] 6: (Filer::read_trunc(inodeno_t, file_layout_t const*, snapid_t, ...)+0x61) [0x562727d66ea5] 7: (Client::C_Read_Sync_NonBlocking::retry()+0x10c) [0x562727cd8a8e] 8: (Client::_read(Fh*, long, unsigned long, ceph::buffer::v15_2_0::list*, Context*)+0x578) [0x562727d10cb6] 9: (Client::_preadv_pwritev_locked(Fh*, iovec const*, int, long, bool, ...)+0x3a7) [0x562727d18159] 10: (Client::ll_preadv_pwritev(Fh*, iovec const*, int, long, bool, ...)+0x179) [0x562727d18b99] 11: (TestClient_LlreadvLlwritevZeroBytes_Test::TestBody()+0x592) [0x562727ca5352] 12: (void testing::internal::HandleSehExceptionsInMethodIfSupported(testing::Test*, ...)+0x1b) [0x562727d9dea3] 13: (void testing::internal::HandleExceptionsInMethodIfSupported(testing::Test*, ...)+0x80) [0x562727da2b26] 14: (testing::Test::Run()+0xb4) [0x562727d927ae] 15: (testing::TestInfo::Run()+0x104) [0x562727d92988] 16: (testing::TestSuite::Run()+0xb2) [0x562727d92b34] 17: (testing::internal::UnitTestImpl::RunAllTests()+0x36b) [0x562727d95303] 18: (bool testing::internal::HandleSehExceptionsInMethodIfSupported(testing::internal::UnitTestImpl*, ...)(), char const*)+0x1b) [0x562727d9e15f] 19: (bool testing::internal::HandleExceptionsInMethodIfSupported(testing::internal::UnitTestImpl*, ...)+0x80) [0x562727da3083] 20: (testing::UnitTest::Run()+0x63) [0x562727d92813] 21: (RUN_ALL_TESTS()+0x11) [0x562727c828d9] 22: main() The patch fixes the same. Fixes: https://tracker.ceph.com/issues/68309 Signed-off-by: Kotresh HR --- src/client/Client.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/client/Client.cc b/src/client/Client.cc index e73f821438b3a..6577dd575f1fd 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -10962,6 +10962,20 @@ int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl, // branch below but in a non-blocking fashion. The code in _read_sync // is duplicated and modified and exists in // C_Read_Sync_NonBlocking::finish(). + + // trim read based on file size? 
+ if ((offset >= in->size) || (size == 0)) { + // read is requested at the EOF or the read len is zero, therefore just + // release managed pointers and complete the C_Read_Finisher immediately with 0 bytes + + Context *iof = iofinish.release(); + crf.release(); + iof->complete(0); + + // Signal async completion + return 0; + } + C_Read_Sync_NonBlocking *crsa = new C_Read_Sync_NonBlocking(this, iofinish.release(), f, in, f->pos, offset, size, bl, filer.get(), have); From db926acb533ac058090e3bbf1343bba0ca367051 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Oct 2024 15:31:07 -0400 Subject: [PATCH 052/148] doc/dev/radosgw: update paths that moved under src/rgw/driver/rados/ Signed-off-by: Casey Bodley --- doc/dev/radosgw/bucket_index.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/dev/radosgw/bucket_index.rst b/doc/dev/radosgw/bucket_index.rst index 6764641e0f50e..ceff57b58cfc8 100644 --- a/doc/dev/radosgw/bucket_index.rst +++ b/doc/dev/radosgw/bucket_index.rst @@ -32,7 +32,7 @@ For a given bucket, the index may be split into several rados objects, called bu The default shard count for new buckets is 11, but can be overridden in the zonegroup's ``bucket_index_max_shards`` or ceph.conf's ``rgw_override_bucket_index_max_shards``. As the number of objects in a bucket grows, its index shard count will also increase as a result of dynamic resharding. -Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/rgw_reshard.cc``. +Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/driver/rados/rgw_reshard.cc``. ----------------- Index Transaction @@ -46,7 +46,7 @@ To keep the bucket index consistent, all object writes or deletes must also upda Object writes and deletes may race with each other, so a given object may have more than one prepared transaction at a time. RGW considers an object entry to be 'pending' if there are any outstanding transactions, or 'completed' otherwise. -This transaction is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``. +This transaction is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``. ------- Listing @@ -56,7 +56,7 @@ When listing objects, RGW will read all entries (pending and completed) from the If an RGW crashes in the middle of an `Index Transaction`_, an index entry may get stuck in this 'pending' state. When bucket listing encounters these pending entries, it also sends information from the head object back to the bucket index so it can update the entry and resolve its stale transactions. This message is called 'dir suggest', because the bucket index treats it as a hint or suggestion. 
-Bucket listing is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``. +Bucket listing is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``. -------------------- S3 Object Versioning @@ -66,9 +66,9 @@ For versioned buckets, the bucket index contains an entry for each object versio RGW stores a head object in the rgw.buckets.data pool for each object version. This rados object's oid is a combination of the object name and its version id. -In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::follow_olh()``. +In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::follow_olh()``. -To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index. +To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/driver/rados/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index. .. _ListObjectsV2: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjects.html .. 
_ListObjectVersions: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html From cfe254758b1ca9647c4dcfb13b6a3310558b88d2 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 4 Oct 2024 15:50:05 -0400 Subject: [PATCH 053/148] doc/radosgw/multisite: fix Configuring Secondary Zones -> Updating the Period this was copy/pasted from Configuring a Master Zone -> Update the Period but still referred to the 'master zone' Signed-off-by: Casey Bodley --- doc/radosgw/multisite.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/radosgw/multisite.rst b/doc/radosgw/multisite.rst index 6a21b7479e6f6..d6925c8ed9c04 100644 --- a/doc/radosgw/multisite.rst +++ b/doc/radosgw/multisite.rst @@ -507,7 +507,7 @@ For example: Updating the Period ------------------- -After updating the master zone configuration, update the period: +After updating the secondary zone configuration, update the period: .. prompt:: bash # From 485cb051192a6142104756ed88a900a5ba455179 Mon Sep 17 00:00:00 2001 From: Nizamudeen A Date: Mon, 7 Oct 2024 12:11:11 +0530 Subject: [PATCH 054/148] mgr/dashboard: add gw_groups to all nvmeof endpoints This was missed in the previous implementation Signed-off-by: Nizamudeen A --- .../mgr/dashboard/controllers/nvmeof.py | 86 +++++++++++++------ src/pybind/mgr/dashboard/openapi.yaml | 69 +++++++++++++++ 2 files changed, 127 insertions(+), 28 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py index 757b9e8ac02cf..5db6a4f1acfec 100644 --- a/src/pybind/mgr/dashboard/controllers/nvmeof.py +++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py @@ -63,7 +63,10 @@ def list(self, gw_group: Optional[str] = None): @EndpointDoc( "Get information from a specific NVMeoF subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_model(model.Subsystem, first="subsystems") @handle_nvmeof_error @@ -78,6 +81,7 @@ def get(self, nqn: str, gw_group: Optional[str] = None): "nqn": Param(str, "NVMeoF subsystem NQN"), "max_namespaces": Param(int, "Maximum number of namespaces", True, 1024), "enable_ha": Param(bool, "Enable high availability"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -95,6 +99,7 @@ def create(self, nqn: str, enable_ha: bool, max_namespaces: int = 1024, parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "force": Param(bool, "Force delete", "false"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -111,12 +116,15 @@ def delete(self, nqn: str, force: Optional[str] = "false", gw_group: Optional[st class NVMeoFListener(RESTController): @EndpointDoc( "List all NVMeoF listeners", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Listener, pick="listeners") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_listeners( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_listeners( NVMeoFClient.pb2.list_listeners_req(subsystem=nqn) ) @@ -128,6 +136,7 @@ def list(self, nqn: str): "traddr": Param(str, "NVMeoF transport address"), "trsvcid": Param(int, "NVMeoF transport service port", True, 4420), "adrfam": Param(int, "NVMeoF address 
family (0 - IPv4, 1 - IPv6)", True, 0), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -138,9 +147,10 @@ def create( host_name: str, traddr: str, trsvcid: int = 4420, - adrfam: int = 0, # IPv4 + adrfam: int = 0, # IPv4, + gw_group: Optional[str] = None ): - return NVMeoFClient(traddr=traddr).stub.create_listener( + return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.create_listener( NVMeoFClient.pb2.create_listener_req( nqn=nqn, host_name=host_name, @@ -158,6 +168,7 @@ def create( "traddr": Param(str, "NVMeoF transport address"), "trsvcid": Param(int, "NVMeoF transport service port", True, 4420), "adrfam": Param(int, "NVMeoF address family (0 - IPv4, 1 - IPv6)", True, 0), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -170,8 +181,9 @@ def delete( trsvcid: int = 4420, adrfam: int = 0, # IPv4 force: bool = False, + gw_group: Optional[str] = None ): - return NVMeoFClient().stub.delete_listener( + return NVMeoFClient(gw_group=gw_group).stub.delete_listener( NVMeoFClient.pb2.delete_listener_req( nqn=nqn, host_name=host_name, @@ -187,12 +199,15 @@ def delete( class NVMeoFNamespace(RESTController): @EndpointDoc( "List all NVMeoF namespaces in a subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Namespace, pick="namespaces") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_namespaces( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_namespaces( NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn) ) @@ -201,12 +216,13 @@ def list(self, nqn: str): parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.Namespace, first="namespaces") @handle_nvmeof_error - def get(self, nqn: str, nsid: str): - return NVMeoFClient().stub.list_namespaces( + def get(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_namespaces( NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn, nsid=int(nsid)) ) @@ -217,12 +233,13 @@ def get(self, nqn: str, nsid: str): parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.NamespaceIOStats) @handle_nvmeof_error - def io_stats(self, nqn: str, nsid: str): - return NVMeoFClient().stub.namespace_get_io_stats( + def io_stats(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.namespace_get_io_stats( NVMeoFClient.pb2.namespace_get_io_stats_req( subsystem_nqn=nqn, nsid=int(nsid)) ) @@ -237,6 +254,7 @@ def io_stats(self, nqn: str, nsid: str): "size": Param(int, "RBD image size"), "block_size": Param(int, "NVMeoF namespace block size"), "load_balancing_group": Param(int, "Load balancing group"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.NamespaceCreation) @@ -250,8 +268,9 @@ def create( size: Optional[int] = 1024, block_size: int = 512, load_balancing_group: Optional[int] = None, + gw_group: Optional[str] = None, ): - return NVMeoFClient().stub.namespace_add( + return NVMeoFClient(gw_group=gw_group).stub.namespace_add( 
NVMeoFClient.pb2.namespace_add_req( subsystem_nqn=nqn, rbd_image_name=rbd_image_name, @@ -274,6 +293,7 @@ def create( "rw_mbytes_per_second": Param(int, "Read/Write MB/s"), "r_mbytes_per_second": Param(int, "Read MB/s"), "w_mbytes_per_second": Param(int, "Write MB/s"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -288,12 +308,13 @@ def update( rw_mbytes_per_second: Optional[int] = None, r_mbytes_per_second: Optional[int] = None, w_mbytes_per_second: Optional[int] = None, + gw_group: Optional[str] = None ): if rbd_image_size: mib = 1024 * 1024 new_size_mib = int((rbd_image_size + mib - 1) / mib) - response = NVMeoFClient().stub.namespace_resize( + response = NVMeoFClient(gw_group=gw_group).stub.namespace_resize( NVMeoFClient.pb2.namespace_resize_req( subsystem_nqn=nqn, nsid=int(nsid), new_size=new_size_mib ) @@ -336,12 +357,13 @@ def update( parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def delete(self, nqn: str, nsid: str): - return NVMeoFClient().stub.namespace_delete( + def delete(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.namespace_delete( NVMeoFClient.pb2.namespace_delete_req(subsystem_nqn=nqn, nsid=int(nsid)) ) @@ -351,7 +373,10 @@ def delete(self, nqn: str, nsid: str): class NVMeoFHost(RESTController): @EndpointDoc( "List all allowed hosts for an NVMeoF subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection( model.Host, @@ -362,8 +387,8 @@ class NVMeoFHost(RESTController): else o, ) @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_hosts( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_hosts( NVMeoFClient.pb2.list_hosts_req(subsystem=nqn) ) @@ -372,12 +397,13 @@ def list(self, nqn: str): parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'NVMeoF host NQN. Use "*" to allow any host.'), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def create(self, nqn: str, host_nqn: str): - return NVMeoFClient().stub.add_host( + def create(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.add_host( NVMeoFClient.pb2.add_host_req(subsystem_nqn=nqn, host_nqn=host_nqn) ) @@ -386,12 +412,13 @@ def create(self, nqn: str, host_nqn: str): parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'NVMeoF host NQN. 
Use "*" to disallow any host.'), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def delete(self, nqn: str, host_nqn: str): - return NVMeoFClient().stub.remove_host( + def delete(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.remove_host( NVMeoFClient.pb2.remove_host_req(subsystem_nqn=nqn, host_nqn=host_nqn) ) @@ -400,12 +427,15 @@ def delete(self, nqn: str, host_nqn: str): class NVMeoFConnection(RESTController): @EndpointDoc( "List all NVMeoF Subsystem Connections", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Connection, pick="connections") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_connections( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_connections( NVMeoFClient.pb2.list_connections_req(subsystem=nqn) ) diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index e8ab663d0d593..5df80259d9f5d 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -8293,6 +8293,7 @@ paths: description: Enable high availability type: boolean gw_group: + description: NVMeoF gateway group type: string max_namespaces: default: 1024 @@ -8346,6 +8347,7 @@ paths: schema: type: boolean - allowEmptyValue: true + description: NVMeoF gateway group in: query name: gw_group schema: @@ -8384,6 +8386,7 @@ paths: schema: type: string - allowEmptyValue: true + description: NVMeoF gateway group in: query name: gw_group schema: @@ -8417,6 +8420,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8446,6 +8455,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8479,6 +8494,9 @@ paths: application/json: schema: properties: + gw_group: + description: NVMeoF gateway group + type: string host_nqn: description: NVMeoF host NQN. Use "*" to allow any host. 
type: string @@ -8525,6 +8543,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8559,6 +8583,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8596,6 +8626,9 @@ paths: default: 0 description: NVMeoF address family (0 - IPv4, 1 - IPv6) type: integer + gw_group: + description: NVMeoF gateway group + type: string host_name: description: NVMeoF hostname type: string @@ -8673,6 +8706,12 @@ paths: name: force schema: type: boolean + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8707,6 +8746,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8748,6 +8793,9 @@ paths: default: true description: Create RBD image type: boolean + gw_group: + description: NVMeoF gateway group + type: string load_balancing_group: description: Load balancing group type: integer @@ -8805,6 +8853,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8844,6 +8898,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8883,6 +8943,9 @@ paths: application/json: schema: properties: + gw_group: + description: NVMeoF gateway group + type: string load_balancing_group: description: Load balancing group type: integer @@ -8937,6 +9000,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: From ee16b099d540f2a60dd84fcbc69499c1b1e649a3 Mon Sep 17 00:00:00 2001 From: Yuval Lifshitz Date: Tue, 1 Oct 2024 15:19:46 +0000 Subject: [PATCH 055/148] common: missing std include with GCC 14 In file included from src/rgw/driver/posix/bucket_cache.h:19, from src/test/rgw/test_posix_bucket_cache.cc:4: src/common/cohort_lru.h: In member function _void cohort::lru::TreeX::lock()_: src/common/cohort_lru.h:334:14: error: _for_each_ is not a member of _std_ 334 | std::for_each(locks.begin(), locks.end(), | ^~~~~~~~ src/common/cohort_lru.h: In member function _void cohort::lru::TreeX::unlock()_: /home/yuvalif/ceph5/src/common/cohort_lru.h:339:14: error: _for_each_ is not a member of _std_ 339 | std::for_each(locks.begin(), locks.end(), | ^~~~~~~~ Signed-off-by: Yuval Lifshitz --- src/common/cohort_lru.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/common/cohort_lru.h b/src/common/cohort_lru.h index af2baaa5c67bf..86ced8d183c71 100644 --- a/src/common/cohort_lru.h +++ b/src/common/cohort_lru.h @@ -15,6 +15,12 @@ #include #include +#include +#include +#include +#include +#include +#include #ifdef __CEPH__ # include "include/ceph_assert.h" From ea53aceb8d72187f7f8629aa6d3b66c7cca88a86 Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Wed, 25 Sep 2024 18:09:32 +0530 Subject: [PATCH 056/148] mgr/dashboard: show non default realm sync status in rgw overview page Currently, we just show the sync status of 
the default realm in rgw overview page. This PR is to show the sync status of non-default realms as well. Multisite sync status can be viewed for any of the active daemon which runs in default/non-default realm. Fixes: https://tracker.ceph.com/issues/68329 Signed-off-by: Aashish Sharma --- src/pybind/mgr/dashboard/controllers/rgw.py | 4 ++-- .../rgw-overview-dashboard.component.ts | 4 +++- .../src/app/shared/api/rgw-multisite.service.ts | 4 +++- src/pybind/mgr/dashboard/openapi.yaml | 7 ++++++- src/pybind/mgr/dashboard/services/rgw_client.py | 10 +++++++++- 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py index 8667d469060f8..4969d11935d6f 100755 --- a/src/pybind/mgr/dashboard/controllers/rgw.py +++ b/src/pybind/mgr/dashboard/controllers/rgw.py @@ -162,9 +162,9 @@ class RgwMultisiteController(RESTController): @ReadPermission @allow_empty_body # pylint: disable=W0102,W0613 - def get_sync_status(self): + def get_sync_status(self, daemon_name=None): multisite_instance = RgwMultisite() - result = multisite_instance.get_multisite_sync_status() + result = multisite_instance.get_multisite_sync_status(daemon_name) return result @Endpoint(path='/sync-policy') diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts index 8b5901769c357..00037a7235b8e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts @@ -91,7 +91,9 @@ export class RgwOverviewDashboardComponent implements OnInit, OnDestroy { this.totalPoolUsedBytes = data['total_pool_bytes_used']; this.averageObjectSize = data['average_object_size']; }); - this.getSyncStatus(); + setTimeout(() => { + this.getSyncStatus(); + }); }); this.BucketSub = this.rgwBucketService .getTotalBucketsAndUsersLength() diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts index d57cd523a4dfe..e4688358013ab 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts @@ -28,7 +28,9 @@ export class RgwMultisiteService { } getSyncStatus() { - return this.http.get(`${this.url}/sync_status`); + return this.rgwDaemonService.request((params: HttpParams) => { + return this.http.get(`${this.url}/sync_status`, { params: params }); + }); } status() { diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index 8f98f1f62a0a8..7cdc0357ae4f1 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -11653,7 +11653,12 @@ paths: - RgwMultisite /api/rgw/multisite/sync_status: get: - parameters: [] + parameters: + - allowEmptyValue: true + in: query + name: daemon_name + schema: + type: string responses: '200': content: diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py index 2441b73b361be..e1e113a812e7d 100755 --- a/src/pybind/mgr/dashboard/services/rgw_client.py +++ b/src/pybind/mgr/dashboard/services/rgw_client.py @@ -1981,8 +1981,16 @@ def 
get_multisite_status(self):
         is_multisite_configured = False
         return is_multisite_configured
 
-    def get_multisite_sync_status(self):
+    def get_multisite_sync_status(self, daemon_name: str):
         rgw_multisite_sync_status_cmd = ['sync', 'status']
+        daemons = _get_daemons()
+        try:
+            realm_name = daemons[daemon_name].realm_name
+        except (KeyError, AttributeError):
+            raise DashboardException('Unable to get realm name from daemon',
+                                     http_status_code=500, component='rgw')
+        if realm_name:
+            rgw_multisite_sync_status_cmd.extend(['--rgw-realm', realm_name])
         try:
             exit_code, out, _ = mgr.send_rgwadmin_command(rgw_multisite_sync_status_cmd, False)
             if exit_code > 0:

From 8dd9e9dad6a5ad9b427d73c4547286fe46b67d46 Mon Sep 17 00:00:00 2001
From: Aashish Sharma
Date: Thu, 3 Oct 2024 13:58:14 +0530
Subject: [PATCH 057/148] =?UTF-8?q?mgr/dashboard:=20fix=20gateways=20secti?=
 =?UTF-8?q?on=20error:=E2=80=9D404=20-=20Not=20Found=20RGW=20Daemon=20not?=
 =?UTF-8?q?=20found:=20None=E2=80=9D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A case was missed here where we have a default realm created but no
default_zonegroup; in that case, the existing behavior should prevail,
and that was not being handled. If a default realm is created but no
default_zonegroup is there, we should continue getting the keys from
daemon_name = next(iter(daemon_keys))

Fixes: https://tracker.ceph.com/issues/68376

Signed-off-by: Aashish Sharma
---
 .../mgr/dashboard/services/rgw_client.py     | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py
index 2441b73b361be..8846f42e70764 100755
--- a/src/pybind/mgr/dashboard/services/rgw_client.py
+++ b/src/pybind/mgr/dashboard/services/rgw_client.py
@@ -288,21 +288,22 @@ def instance(userid: Optional[str] = None,
         daemon_keys = RgwClient._daemons.keys()
 
         if not daemon_name:
-            if len(daemon_keys) > 1:
-                try:
-                    multiiste = RgwMultisite()
-                    default_zonegroup = multiiste.get_all_zonegroups_info()['default_zonegroup']
-
-                    # Iterate through _daemons.values() to find the daemon with the
-                    # matching zonegroup_id
-                    for daemon in RgwClient._daemons.values():
-                        if daemon.zonegroup_id == default_zonegroup:
-                            daemon_name = daemon.name
-                            break
-                except Exception:  # pylint: disable=broad-except
-                    daemon_name = next(iter(daemon_keys))
-            else:
-                # Handle the case where there is only one or no key in _daemons
+            try:
+                if len(daemon_keys) > 1:
+                    default_zonegroup = (
+                        RgwMultisite()
+                        .get_all_zonegroups_info()['default_zonegroup']
+                    )
+                    if default_zonegroup:
+                        daemon_name = next(
+                            (daemon.name
+                             for daemon in RgwClient._daemons.values()
+                             if daemon.zonegroup_id == default_zonegroup),
+                            None
+                        )
+                daemon_name = daemon_name or next(iter(daemon_keys))
+            except Exception as e:  # pylint: disable=broad-except
+                logger.exception('Failed to determine default RGW daemon: %s', str(e))
                 daemon_name = next(iter(daemon_keys))
 
         # Discard all cached instances if any rgw setting has changed

From 471ebae9240192a4e143c00894e1736dd1921308 Mon Sep 17 00:00:00 2001
From: Nitzan Mordechai
Date: Mon, 7 Oct 2024 10:21:58 +0000
Subject: [PATCH 058/148] qa/suites/crimson-rados/perf: add ssh keys

cbt uses an ssh connection by default; without ssh_keys the task
won't generate a public key and the cbt task will fail.

Fixes: https://tracker.ceph.com/issues/68421

Signed-off-by: Nitzan Mordechai
---
 qa/suites/crimson-rados/perf/deploy/ceph.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git
a/qa/suites/crimson-rados/perf/deploy/ceph.yaml b/qa/suites/crimson-rados/perf/deploy/ceph.yaml index 0f6021975a4a2..50d170f502272 100644 --- a/qa/suites/crimson-rados/perf/deploy/ceph.yaml +++ b/qa/suites/crimson-rados/perf/deploy/ceph.yaml @@ -10,3 +10,4 @@ tasks: osd: debug monc: 20 flavor: crimson +- ssh_keys: From 984a76f5a3ef3556122a6f81d63de756d0d9dc4d Mon Sep 17 00:00:00 2001 From: Yuval Lifshitz Date: Mon, 7 Oct 2024 15:34:15 +0000 Subject: [PATCH 059/148] test/rgw/lua: use stats polling instead of sleep this makes the test more consistent as well as faster fixes: https://tracker.ceph.com/issues/68335 Signed-off-by: Yuval Lifshitz --- src/rgw/rgw_lua_background.h | 5 +- src/test/rgw/test_rgw_lua.cc | 178 ++++++++++++++++++++++------------- 2 files changed, 117 insertions(+), 66 deletions(-) diff --git a/src/rgw/rgw_lua_background.h b/src/rgw/rgw_lua_background.h index 7b8d12599f4e8..2973a753fff63 100644 --- a/src/rgw/rgw_lua_background.h +++ b/src/rgw/rgw_lua_background.h @@ -153,9 +153,8 @@ class Background : public RGWRealmReloader::Pauser { void run(); -protected: std::string rgw_script; - virtual int read_script(); + int read_script(); public: Background(rgw::sal::Driver* _driver, @@ -173,7 +172,7 @@ class Background : public RGWRealmReloader::Pauser { std::unique_lock cond_lock(table_mutex); rgw_map[key] = value; } - + // update the manager after void set_manager(rgw::sal::LuaManager* _lua_manager); void pause() override; diff --git a/src/test/rgw/test_rgw_lua.cc b/src/test/rgw/test_rgw_lua.cc index b2e11e442a28f..ad923023a6d01 100644 --- a/src/test/rgw/test_rgw_lua.cc +++ b/src/test/rgw/test_rgw_lua.cc @@ -9,6 +9,7 @@ #include "rgw_lua_background.h" #include "rgw_lua_data_filter.h" #include "rgw_sal_config.h" +#include "rgw_perf_counters.h" using namespace std; using namespace rgw; @@ -184,9 +185,51 @@ inline std::unique_ptr make_store() { return std::make_unique(std::move(context_pool)); }; +class TestLuaManager : public rgw::sal::StoreLuaManager { + public: + std::string lua_script; + unsigned read_time = 0; + TestLuaManager() { + rgw_perf_start(g_cct); + } + int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override { + std::this_thread::sleep_for(std::chrono::seconds(read_time)); + script = lua_script; + return 0; + } + int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override { + return 0; + } + int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override { + return 0; + } + int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override { + return 0; + } + int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override { + return 0; + } + int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override { + return 0; + } + int reload_packages(const DoutPrefixProvider* dpp, optional_yield y) override { + return 0; + } + ~TestLuaManager() { + rgw_perf_stop(g_cct); + } +}; + +void set_script(rgw::sal::LuaManager* manager, const std::string& script) { + static_cast(manager)->lua_script = script; +} +void set_read_time(rgw::sal::LuaManager* manager, unsigned read_time) { + static_cast(manager)->read_time = read_time; +} + #define DEFINE_REQ_STATE RGWProcessEnv pe; \ auto store = make_store(); \ - pe.lua.manager = store->get_lua_manager(""); \ + pe.lua.manager = std::make_unique(); \ 
RGWEnv e; \ req_state s(g_cct, pe, &e, 0); @@ -850,24 +893,12 @@ TEST(TestRGWLua, OpsLog) } class TestBackground : public rgw::lua::Background { - const unsigned read_time; - -protected: - int read_script() override { - // don't read the object from the store - std::this_thread::sleep_for(std::chrono::seconds(read_time)); - return 0; - } - public: - TestBackground(sal::RadosStore* store, const std::string& script, rgw::sal::LuaManager* manager, unsigned read_time = 0) : + TestBackground(sal::RadosStore* store, rgw::sal::LuaManager* manager) : rgw::lua::Background(store, g_cct, manager, - 1 /* run every second */), - read_time(read_time) { - // the script is passed in the constructor - rgw_script = script; + 1 /* run every second */) { } ~TestBackground() override { @@ -878,20 +909,19 @@ class TestBackground : public rgw::lua::Background { TEST(TestRGWLuaBackground, Start) { auto store = make_store(); - auto manager = store->get_lua_manager(""); + auto manager = std::make_unique(); { // ctr and dtor without running - TestBackground lua_background(store.get(), "", manager.get()); + TestBackground lua_background(store.get(), manager.get()); } { // ctr and dtor with running - TestBackground lua_background(store.get(), "", manager.get()); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); } } - -constexpr auto wait_time = std::chrono::seconds(3); +constexpr auto wait_time = std::chrono::milliseconds(100); template const T& get_table_value(const TestBackground& b, const std::string& index) { @@ -903,6 +933,15 @@ const T& get_table_value(const TestBackground& b, const std::string& index) { } } +#define WAIT_FOR_BACKGROUND \ +{ \ + unsigned max_tries = 100; \ + do { \ + std::this_thread::sleep_for(wait_time); \ + --max_tries; \ + } while (perfcounter->get(l_rgw_lua_script_ok) + perfcounter->get(l_rgw_lua_script_fail) == 0 && max_tries > 0); \ +} + TEST(TestRGWLuaBackground, Script) { const std::string script = R"( @@ -912,10 +951,11 @@ TEST(TestRGWLuaBackground, Script) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value(lua_background, "hello"), "world"); } @@ -928,9 +968,10 @@ TEST(TestRGWLuaBackground, RequestScript) )"; DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), background_script, pe.lua.manager.get()); + set_script(pe.lua.manager.get(), background_script); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const std::string request_script = R"( local key = "hello" @@ -947,8 +988,9 @@ TEST(TestRGWLuaBackground, RequestScript) ASSERT_EQ(rc, 0); EXPECT_EQ(get_table_value(lua_background, "hello"), "from request"); // now we resume and let the background set the value + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value(lua_background, "hello"), "from background"); } @@ -965,14 +1007,16 @@ TEST(TestRGWLuaBackground, Pause) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = 
std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // no change in len EXPECT_EQ(value_len, get_table_value(lua_background, "hello").size()); } @@ -991,15 +1035,17 @@ TEST(TestRGWLuaBackground, PauseWhileReading) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get(), 2); + auto manager = std::make_unique(); + set_script(manager.get(), script); + set_read_time(manager.get(), 2); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - constexpr auto long_wait_time = std::chrono::seconds(6); - std::this_thread::sleep_for(long_wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - std::this_thread::sleep_for(long_wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // one execution might occur after pause EXPECT_TRUE(value_len + 1 >= get_table_value(lua_background, "hello").size()); } @@ -1013,14 +1059,16 @@ TEST(TestRGWLuaBackground, ReadWhilePaused) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.pause(); lua_background.start(); - std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); EXPECT_EQ(get_table_value(lua_background, "hello"), ""); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value(lua_background, "hello"), "world"); } @@ -1037,18 +1085,21 @@ TEST(TestRGWLuaBackground, PauseResume) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // no change in len EXPECT_EQ(value_len, get_table_value(lua_background, "hello").size()); + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; // should be a change in len EXPECT_GT(get_table_value(lua_background, "hello").size(), value_len); } @@ -1066,18 +1117,19 @@ TEST(TestRGWLuaBackground, MultipleStarts) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); 
lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.start(); lua_background.shutdown(); lua_background.shutdown(); - std::this_thread::sleep_for(wait_time); + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; // should be a change in len EXPECT_GT(get_table_value(lua_background, "hello").size(), value_len); } @@ -1085,7 +1137,7 @@ TEST(TestRGWLuaBackground, MultipleStarts) TEST(TestRGWLuaBackground, TableValues) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = "string value" @@ -1107,7 +1159,7 @@ TEST(TestRGWLuaBackground, TableValues) TEST(TestRGWLuaBackground, TablePersist) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( RGW["key1"] = "string value" @@ -1137,7 +1189,7 @@ TEST(TestRGWLuaBackground, TablePersist) TEST(TestRGWLuaBackground, TableValuesFromRequest) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); const std::string request_script = R"( @@ -1165,7 +1217,7 @@ TEST(TestRGWLuaBackground, TableValuesFromRequest) TEST(TestRGWLuaBackground, TableInvalidValue) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); const std::string request_script = R"( @@ -1191,7 +1243,7 @@ TEST(TestRGWLuaBackground, TableInvalidValue) TEST(TestRGWLuaBackground, TableErase) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( RGW["size"] = 0 @@ -1229,7 +1281,7 @@ TEST(TestRGWLuaBackground, TableErase) TEST(TestRGWLuaBackground, TableIterate) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = "string value" @@ -1256,7 +1308,7 @@ TEST(TestRGWLuaBackground, TableIterate) TEST(TestRGWLuaBackground, TableIterateWrite) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["a"] = 1 @@ -1286,7 +1338,7 @@ TEST(TestRGWLuaBackground, TableIterateWrite) TEST(TestRGWLuaBackground, TableIncrement) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1306,7 +1358,7 @@ TEST(TestRGWLuaBackground, TableIncrement) TEST(TestRGWLuaBackground, TableIncrementBy) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1328,7 +1380,7 @@ TEST(TestRGWLuaBackground, 
TableIncrementBy) TEST(TestRGWLuaBackground, TableDecrement) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1348,7 +1400,7 @@ TEST(TestRGWLuaBackground, TableDecrement) TEST(TestRGWLuaBackground, TableDecrementBy) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1370,7 +1422,7 @@ TEST(TestRGWLuaBackground, TableDecrementBy) TEST(TestRGWLuaBackground, TableIncrementValueError) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( -- cannot increment string values @@ -1405,7 +1457,7 @@ TEST(TestRGWLuaBackground, TableIncrementValueError) TEST(TestRGWLuaBackground, TableIncrementError) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( -- missing argument @@ -1494,7 +1546,7 @@ TEST(TestRGWLua, Data) )"; DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); s.host_id = "foo"; pe.lua.background = &lua_background; lua::RGWObjFilter filter(&s, script); From fc537c8d914274791ec179bf08a95dc558d81266 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Fri, 6 Sep 2024 16:54:22 +0800 Subject: [PATCH 060/148] crimson/os/seastore: misc cleanups Signed-off-by: Yingxin Cheng --- src/crimson/os/futurized_store.h | 8 +- src/crimson/os/seastore/seastore.cc | 245 ++++++++++++++-------------- src/crimson/os/seastore/seastore.h | 64 +++----- 3 files changed, 154 insertions(+), 163 deletions(-) diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h index fe09cc5451072..0dca695ba3a1e 100644 --- a/src/crimson/os/futurized_store.h +++ b/src/crimson/os/futurized_store.h @@ -75,14 +75,15 @@ class FuturizedStore { CollectionRef c, const ghobject_t& oid) = 0; - using omap_values_t = std::map>; + using omap_values_t = attrs_t; using omap_keys_t = std::set; virtual read_errorator::future omap_get_values( CollectionRef c, const ghobject_t& oid, const omap_keys_t& keys) = 0; - virtual read_errorator::future> omap_get_values( + using omap_values_paged_t = std::tuple; + virtual read_errorator::future omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid const std::optional &start ///< [in] start, empty for begin @@ -147,7 +148,8 @@ class FuturizedStore { return seastar::now(); } - virtual read_errorator::future> fiemap( + using fiemap_ret_t = std::map; + virtual read_errorator::future fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 1577433237351..d708231b47b1a 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -30,8 +30,6 @@ #include "crimson/os/seastore/onode_manager.h" #include "crimson/os/seastore/object_data_handler.h" - -using std::string; using crimson::common::local_conf; template <> struct fmt::formatter @@ -278,10 +276,10 @@ SeaStore::mount_ertr::future<> SeaStore::mount() return 
set_secondaries(); }); }); - }).safe_then([this] { - return shard_stores.invoke_on_all([](auto &local_store) { - return local_store.mount_managers(); - }); + }); + }).safe_then([this] { + return shard_stores.invoke_on_all([](auto &local_store) { + return local_store.mount_managers(); }); }).handle_error( crimson::ct_error::assert_all{ @@ -345,15 +343,15 @@ seastar::future<> SeaStore::write_fsid(uuid_d new_osd_fsid) auto [ret, fsid] = tuple; std::string str_fsid = stringify(new_osd_fsid); if (ret == -1) { - return write_meta("fsid", stringify(new_osd_fsid)); + return write_meta("fsid", stringify(new_osd_fsid)); } else if (ret == 0 && fsid != str_fsid) { - ERROR("on-disk fsid {} != provided {}", - fsid, stringify(new_osd_fsid)); - throw std::runtime_error("store fsid error"); - } else { + ERROR("on-disk fsid {} != provided {}", + fsid, stringify(new_osd_fsid)); + throw std::runtime_error("store fsid error"); + } else { return seastar::now(); - } - }); + } + }); } seastar::future<> @@ -413,7 +411,8 @@ seastar::future<> SeaStore::set_secondaries() SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid) { ceph_assert(seastar::this_shard_id() == primary_core); - return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) { + return read_meta("mkfs_done" + ).then([this, new_osd_fsid](auto tuple) { auto [done, value] = tuple; if (done == 0) { return seastar::now(); @@ -449,7 +448,8 @@ seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid) SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) { ceph_assert(seastar::this_shard_id() == primary_core); - return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) { + return read_meta("mkfs_done" + ).then([this, new_osd_fsid](auto tuple) { auto [done, value] = tuple; if (done == 0) { return seastar::now(); @@ -542,7 +542,7 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) }); } -using coll_core_t = FuturizedStore::coll_core_t; +using coll_core_t = SeaStore::coll_core_t; seastar::future> SeaStore::list_collections() { @@ -566,9 +566,10 @@ store_statfs_t SeaStore::Shard::stat() const seastar::future SeaStore::stat() const { - ceph_assert(seastar::this_shard_id() == primary_core); LOG_PREFIX(SeaStore::stat); DEBUG(""); + + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.map_reduce0( [](const SeaStore::Shard &local_store) { return local_store.stat(); @@ -914,10 +915,11 @@ SeaStore::Shard::list_objects(CollectionRef ch, seastar::stop_iteration >(seastar::stop_iteration::no); }); - }).si_then([&ret] { - return list_iertr::make_ready_future< - OnodeManager::list_onodes_bare_ret>(std::move(ret)); - }); + } + ).si_then([&ret] { + return list_iertr::make_ready_future< + OnodeManager::list_onodes_bare_ret>(std::move(ret)); + }); } }); }).safe_then([&ret](auto&& _ret) { @@ -949,7 +951,8 @@ SeaStore::Shard::open_collection(const coll_t& cid) { LOG_PREFIX(SeaStore::open_collection); DEBUG("{}", cid); - return list_collections().then([cid, this] (auto colls_cores) { + return list_collections( + ).then([cid, this] (auto colls_cores) { if (auto found = std::find(colls_cores.begin(), colls_cores.end(), std::make_pair(cid, seastar::this_shard_id())); @@ -1032,7 +1035,7 @@ SeaStore::Shard::read( Transaction::src_t::READ, "read_obj", op_type_t::READ, - [=, this](auto &t, auto &onode) -> ObjectDataHandler::read_ret { + [this, offset, len, op_flags](auto &t, auto &onode) -> ObjectDataHandler::read_ret { size_t size = onode.get_layout().size; if (offset >= size) { @@ -1098,10 
+1101,10 @@ SeaStore::Shard::readv( return seastar::do_with( _oid, ceph::bufferlist{}, - [=, this, &m](auto &oid, auto &ret) { + [ch, op_flags, this, &m](auto &oid, auto &ret) { return crimson::do_for_each( m, - [=, this, &oid, &ret](auto &p) { + [ch, op_flags, this, &oid, &ret](auto &p) { return read( ch, oid, p.first, p.second, op_flags ).safe_then([&ret](auto bl) { @@ -1112,7 +1115,6 @@ SeaStore::Shard::readv( (std::move(ret)); }); }); - return read_errorator::make_ready_future(); } using crimson::os::seastore::omap_manager::BtreeOMapManager; @@ -1123,20 +1125,19 @@ SeaStore::Shard::get_attr( const ghobject_t& oid, std::string_view name) const { - auto c = static_cast(ch.get()); LOG_PREFIX(SeaStore::get_attr); - DEBUG("{} {}", c->get_cid(), oid); + DEBUG("{} {}", ch->get_cid(), oid); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); return repeat_with_onode( - c, + ch, oid, Transaction::src_t::READ, "get_attr", op_type_t::GET_ATTR, - [=, this](auto &t, auto& onode) -> _omap_get_value_ret { + [this, name](auto &t, auto& onode) -> _omap_get_value_ret { auto& layout = onode.get_layout(); if (name == OI_ATTR && layout.oi_size) { ceph::bufferlist bl; @@ -1170,19 +1171,18 @@ SeaStore::Shard::get_attrs( const ghobject_t& oid) { LOG_PREFIX(SeaStore::get_attrs); - auto c = static_cast(ch.get()); - DEBUG("{} {}", c->get_cid(), oid); + DEBUG("{} {}", ch->get_cid(), oid); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); return repeat_with_onode( - c, + ch, oid, Transaction::src_t::READ, "get_addrs", op_type_t::GET_ATTRS, - [=, this](auto &t, auto& onode) { + [this](auto &t, auto& onode) { auto& layout = onode.get_layout(); return omap_list(onode, layout.xattr_root, t, std::nullopt, OMapManager::omap_list_config_t() @@ -1202,7 +1202,7 @@ SeaStore::Shard::get_attrs( attrs.emplace(SS_ATTR, std::move(bl)); DEBUGT("set ss from onode layout", t); } - return seastar::make_ready_future(std::move(attrs)); + return seastar::make_ready_future(std::move(attrs)); }); } ).handle_error( @@ -1229,7 +1229,7 @@ seastar::future SeaStore::Shard::stat( Transaction::src_t::READ, "stat", op_type_t::STAT, - [=, this](auto &t, auto &onode) { + [this, oid](auto &t, auto &onode) { struct stat st; auto &olayout = onode.get_layout(); st.st_size = olayout.size; @@ -1266,9 +1266,8 @@ SeaStore::Shard::omap_get_values( ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - auto c = static_cast(ch.get()); return repeat_with_onode( - c, + ch, oid, Transaction::src_t::READ, "omap_get_values", @@ -1298,21 +1297,20 @@ SeaStore::Shard::_omap_get_value( std::move(root), std::string(key), [&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret { - if (root.is_null()) { + if (root.is_null()) { + return crimson::ct_error::enodata::make(); + } + return manager.omap_get_value(root, t, key + ).si_then([](auto opt) -> _omap_get_value_ret { + if (!opt) { return crimson::ct_error::enodata::make(); } - return manager.omap_get_value(root, t, key - ).si_then([](auto opt) -> _omap_get_value_ret { - if (!opt) { - return crimson::ct_error::enodata::make(); - } - return seastar::make_ready_future(std::move(*opt)); - }); - } - ); + return seastar::make_ready_future(std::move(*opt)); + }); + }); } -SeaStore::Shard::_omap_get_values_ret +SeaStore::base_iertr::future SeaStore::Shard::_omap_get_values( Transaction &t, omap_root_t &&omap_root, @@ -1325,31 +1323,29 @@ SeaStore::Shard::_omap_get_values( BtreeOMapManager(*transaction_manager), std::move(omap_root), omap_values_t(), - [&](auto &manager, auto &root, auto &ret) 
{ - return trans_intr::do_for_each( - keys.begin(), - keys.end(), - [&](auto &key) { - return manager.omap_get_value( - root, - t, - key - ).si_then([&ret, &key](auto &&p) { - if (p) { - bufferlist bl; - bl.append(*p); - ret.emplace( - std::move(key), - std::move(bl)); - } - return seastar::now(); - }); + [&t, &keys](auto &manager, auto &root, auto &ret) { + return trans_intr::do_for_each( + keys.begin(), + keys.end(), + [&t, &manager, &root, &ret](auto &key) { + return manager.omap_get_value( + root, + t, + key + ).si_then([&ret, &key](auto &&p) { + if (p) { + bufferlist bl; + bl.append(*p); + ret.emplace( + std::move(key), + std::move(bl)); } - ).si_then([&ret] { - return std::move(ret); + return seastar::now(); }); - } - ); + }).si_then([&ret] { + return std::move(ret); + }); + }); } SeaStore::Shard::omap_list_ret @@ -1377,22 +1373,20 @@ SeaStore::Shard::omap_list( }); } -SeaStore::Shard::omap_get_values_ret_t +SeaStore::Shard::read_errorator::future SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional &start) + const std::optional &start) { - auto c = static_cast(ch.get()); LOG_PREFIX(SeaStore::omap_get_values); - DEBUG("{} {}", c->get_cid(), oid); + DEBUG("{} {}", ch->get_cid(), oid); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - using ret_bare_t = std::tuple; - return repeat_with_onode( - c, + return repeat_with_onode( + ch, oid, Transaction::src_t::READ, "omap_list", @@ -1413,7 +1407,8 @@ SeaStore::Shard::omap_get_values( }); } -SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( +SeaStore::base_iertr::future +SeaStore::Shard::_fiemap( Transaction &t, Onode &onode, uint64_t off, @@ -1421,7 +1416,7 @@ SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( { return seastar::do_with( ObjectDataHandler(max_object_size), - [=, this, &t, &onode] (auto &objhandler) { + [this, off, len, &t, &onode](auto &objhandler) { return objhandler.fiemap( ObjectDataHandler::context_t{ *transaction_manager, @@ -1433,7 +1428,7 @@ SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( }); } -SeaStore::Shard::read_errorator::future> +SeaStore::Shard::read_errorator::future SeaStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, @@ -1446,13 +1441,13 @@ SeaStore::Shard::fiemap( ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - return repeat_with_onode>( + return repeat_with_onode( ch, oid, Transaction::src_t::READ, "fiemap_read", op_type_t::READ, - [=, this](auto &t, auto &onode) -> _fiemap_ret { + [this, off, len](auto &t, auto &onode) -> base_iertr::future { size_t size = onode.get_layout().size; if (off >= size) { INFOT("fiemap offset is over onode size!", t); @@ -1497,7 +1492,7 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( "do_transaction", op_type_t::TRANSACTION, [this](auto &ctx) { - return with_trans_intr(*ctx.transaction, [&, this](auto &t) { + return with_trans_intr(*ctx.transaction, [&ctx, this](auto &t) { LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks); SUBDEBUGT(seastore_t, "start with {} objects", t, ctx.iter.objects.size()); @@ -1578,8 +1573,9 @@ SeaStore::Shard::_do_transaction_step( SUBTRACET(seastore_t, "got op {}", *ctx.transaction, (uint32_t)op->op); using ceph::os::Transaction; - if (op->op == Transaction::OP_NOP) + if (op->op == Transaction::OP_NOP) { return tm_iertr::now(); + } switch (op->op) { case Transaction::OP_RMCOLL: @@ -1611,14 +1607,14 @@ SeaStore::Shard::_do_transaction_step( create = true; } if (!onodes[op->oid]) { + const ghobject_t& oid = i.get_oid(op->oid); if 
(!create) { - fut = onode_manager->get_onode(*ctx.transaction, i.get_oid(op->oid)); + fut = onode_manager->get_onode(*ctx.transaction, oid); } else { - fut = onode_manager->get_or_create_onode( - *ctx.transaction, i.get_oid(op->oid)); + fut = onode_manager->get_or_create_onode(*ctx.transaction, oid); } } - return fut.si_then([&, op](auto get_onode) { + return fut.si_then([&, op, this](auto get_onode) { OnodeRef &o = onodes[op->oid]; if (!o) { assert(get_onode); @@ -1632,7 +1628,7 @@ SeaStore::Shard::_do_transaction_step( // support parallel extents loading return onode_manager->get_or_create_onode( *ctx.transaction, i.get_oid(op->dest_oid) - ).si_then([&, op](auto dest_onode) { + ).si_then([&onodes, &d_onodes, op](auto dest_onode) { assert(dest_onode); auto &d_o = onodes[op->dest_oid]; assert(!d_o); @@ -1644,7 +1640,7 @@ SeaStore::Shard::_do_transaction_step( } else { return OnodeManager::get_or_create_onode_iertr::now(); } - }).si_then([&, op, this]() -> tm_ret { + }).si_then([&ctx, &i, &onodes, &d_onodes, op, this, FNAME]() -> tm_ret { LOG_PREFIX(SeaStore::_do_transaction_step); try { switch (op->op) { @@ -2126,8 +2122,8 @@ SeaStore::Shard::_omap_clear( { LOG_PREFIX(SeaStore::_omap_clear); DEBUGT("{} {} keys", *ctx.transaction, *onode); - return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY)) - .si_then([this, &ctx, &onode]() -> tm_ret { + return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY) + ).si_then([this, &ctx, &onode]() -> tm_ret { if (auto omap_root = onode->get_layout().omap_root.get( onode->get_metadata_hint(device->get_block_size())); omap_root.is_null()) { @@ -2142,8 +2138,8 @@ SeaStore::Shard::_omap_clear( auto &omap_root) { return omap_manager.omap_clear( omap_root, - *ctx.transaction) - .si_then([&] { + *ctx.transaction + ).si_then([&] { if (omap_root.must_update()) { onode->update_omap_root(*ctx.transaction, omap_root); } @@ -2489,6 +2485,21 @@ SeaStore::Shard::_get_collection(const coll_t& cid) return new SeastoreCollection{cid}; } +seastar::future<> SeaStore::write_meta( + const std::string& key, + const std::string& value) { + ceph_assert(seastar::this_shard_id() == primary_core); + return seastar::do_with(key, value, + [this](auto& key, auto& value) { + return shard_stores.local().write_meta(key, value + ).then([this, &key, &value] { + return mdstore->write_meta(key, value); + }).handle_error( + crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} + ); + }); +} + seastar::future<> SeaStore::Shard::write_meta( const std::string& key, const std::string& value) @@ -2501,27 +2512,22 @@ seastar::future<> SeaStore::Shard::write_meta( // For TM::submit_transaction() ++(shard_stats.processing_inlock_io_num); - return seastar::do_with( - key, value, - [this, FNAME](auto& key, auto& value) { - return repeat_eagain([this, FNAME, &key, &value] { - ++(shard_stats.repeat_io_num); - - return transaction_manager->with_transaction_intr( - Transaction::src_t::MUTATE, - "write_meta", - [this, FNAME, &key, &value](auto& t) - { - DEBUGT("Have transaction, key: {}; value: {}", t, key, value); - return transaction_manager->update_root_meta( - t, key, value - ).si_then([this, &t] { - return transaction_manager->submit_transaction(t); - }); - }); - }); - } - ).handle_error( + return repeat_eagain([this, FNAME, &key, &value] { + ++(shard_stats.repeat_io_num); + + return transaction_manager->with_transaction_intr( + Transaction::src_t::MUTATE, + "write_meta", + [this, FNAME, &key, &value](auto& t) + { + DEBUGT("Have transaction, key: {}; value: {}", 
t, key, value); + return transaction_manager->update_root_meta( + t, key, value + ).si_then([this, &t] { + return transaction_manager->submit_transaction(t); + }); + }); + }).handle_error( crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} ).finally([this] { assert(shard_stats.pending_io_num); @@ -2535,10 +2541,11 @@ seastar::future<> SeaStore::Shard::write_meta( seastar::future> SeaStore::read_meta(const std::string& key) { - ceph_assert(seastar::this_shard_id() == primary_core); LOG_PREFIX(SeaStore::read_meta); DEBUG("key: {}", key); - return mdstore->read_meta(key).safe_then([](auto v) { + ceph_assert(seastar::this_shard_id() == primary_core); + return mdstore->read_meta(key + ).safe_then([](auto v) { if (v) { return std::make_tuple(0, std::move(*v)); } else { diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index fb495a422f656..58d4f5e593cbe 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -71,20 +71,19 @@ struct col_obj_ranges_t { class SeaStore final : public FuturizedStore { public: + using base_ertr = TransactionManager::base_ertr; + using base_iertr = TransactionManager::base_iertr; + class MDStore { public: - using base_iertr = crimson::errorator< - crimson::ct_error::input_output_error - >; - - using write_meta_ertr = base_iertr; + using write_meta_ertr = base_ertr; using write_meta_ret = write_meta_ertr::future<>; virtual write_meta_ret write_meta( const std::string &key, const std::string &val ) = 0; - using read_meta_ertr = base_iertr; + using read_meta_ertr = base_ertr; using read_meta_ret = write_meta_ertr::future>; virtual read_meta_ret read_meta(const std::string &key) = 0; @@ -136,10 +135,7 @@ class SeaStore final : public FuturizedStore { const omap_keys_t& keys) final; /// Retrieves paged set of values > start (if present) - using omap_get_values_ret_bare_t = std::tuple; - using omap_get_values_ret_t = read_errorator::future< - omap_get_values_ret_bare_t>; - omap_get_values_ret_t omap_get_values( + read_errorator::future omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid const std::optional &start ///< [in] start, empty for begin @@ -170,7 +166,7 @@ class SeaStore final : public FuturizedStore { * stages and locks as do_transaction. 
*/ seastar::future<> flush(CollectionRef ch) final; - read_errorator::future> fiemap( + read_errorator::future fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, @@ -190,7 +186,6 @@ class SeaStore final : public FuturizedStore { secondaries.emplace_back(&sec_dev); } - using coll_core_t = FuturizedStore::coll_core_t; seastar::future> list_collections(); seastar::future<> write_meta(const std::string& key, @@ -334,14 +329,16 @@ class SeaStore final : public FuturizedStore { }); } - using _fiemap_ret = ObjectDataHandler::fiemap_ret; - _fiemap_ret _fiemap( - Transaction &t, - Onode &onode, - uint64_t off, - uint64_t len) const; + using omap_list_bare_ret = OMapManager::omap_list_bare_ret; + using omap_list_ret = OMapManager::omap_list_ret; + omap_list_ret omap_list( + Onode& onode, + const omap_root_le_t& omap_root, + Transaction& t, + const std::optional& start, + OMapManager::omap_list_config_t config) const; - using _omap_get_value_iertr = OMapManager::base_iertr::extend< + using _omap_get_value_iertr = base_iertr::extend< crimson::ct_error::enodata >; using _omap_get_value_ret = _omap_get_value_iertr::future; @@ -350,25 +347,20 @@ class SeaStore final : public FuturizedStore { omap_root_t &&root, std::string_view key) const; - using _omap_get_values_iertr = OMapManager::base_iertr; - using _omap_get_values_ret = _omap_get_values_iertr::future; - _omap_get_values_ret _omap_get_values( + base_iertr::future _omap_get_values( Transaction &t, omap_root_t &&root, const omap_keys_t &keys) const; friend class SeaStoreOmapIterator; - using omap_list_bare_ret = OMapManager::omap_list_bare_ret; - using omap_list_ret = OMapManager::omap_list_ret; - omap_list_ret omap_list( + base_iertr::future _fiemap( + Transaction &t, Onode &onode, - const omap_root_le_t& omap_root, - Transaction& t, - const std::optional& start, - OMapManager::omap_list_config_t config) const; + uint64_t off, + uint64_t len) const; - using tm_iertr = TransactionManager::base_iertr; + using tm_iertr = base_iertr; using tm_ret = tm_iertr::future<>; tm_ret _do_transaction_step( internal_context_t &ctx, @@ -535,17 +527,7 @@ class SeaStore final : public FuturizedStore { return shard_stores.local().get_fsid(); } - seastar::future<> write_meta( - const std::string& key, - const std::string& value) final { - ceph_assert(seastar::this_shard_id() == primary_core); - return shard_stores.local().write_meta( - key, value).then([this, key, value] { - return mdstore->write_meta(key, value); - }).handle_error( - crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} - ); - } + seastar::future<> write_meta(const std::string& key, const std::string& value) final; seastar::future> read_meta(const std::string& key) final; From a49e49a1fd61914cd6cb2e1281c1733efe20abe7 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Tue, 10 Sep 2024 11:41:15 +0800 Subject: [PATCH 061/148] crimson/os/seastore: convert length logs to the hex format Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/cached_extent.cc | 10 +++++---- src/crimson/os/seastore/cached_extent.h | 2 +- .../onode_manager/staged-fltree/node_layout.h | 2 +- src/crimson/os/seastore/seastore_types.cc | 22 ++++++++++--------- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index cdad6dfb1b03d..76c18bde667a4 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -158,12 +158,14 @@ parent_tracker_t::~parent_tracker_t() { 
std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) { - out << "LBAMapping(" << rhs.get_key() << "~" << rhs.get_length() + out << "LBAMapping(" << rhs.get_key() + << "~0x" << std::hex << rhs.get_length() << std::dec << "->" << rhs.get_val(); if (rhs.is_indirect()) { - out << " indirect(" << rhs.get_intermediate_base() << "~" - << rhs.get_intermediate_key() << "~" - << rhs.get_intermediate_length() << ")"; + out << ",indirect(" << rhs.get_intermediate_base() + << "~0x" << std::hex << rhs.get_intermediate_length() + << "@0x" << rhs.get_intermediate_offset() << std::dec + << ")"; } out << ")"; return out; diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 6c5c6c6fcc292..6025725aa337d 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -350,7 +350,7 @@ class CachedExtent << ", modify_time=" << sea_time_point_printer_t{modify_time} << ", paddr=" << get_paddr() << ", prior_paddr=" << prior_poffset_str - << ", length=" << get_length() + << std::hex << ", length=0x" << get_length() << std::dec << ", state=" << state << ", last_committed_crc=" << last_committed_crc << ", refcount=" << use_count() diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h index 960ea6ba41181..397a014a7c3d2 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -925,7 +925,7 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { std::ostringstream sos; sos << "Node" << NODE_TYPE << FIELD_TYPE << "@" << extent.get_laddr() - << "+" << std::hex << extent.get_length() << std::dec + << "+0x" << std::hex << extent.get_length() << std::dec << "Lv" << (unsigned)level() << (is_level_tail() ? 
"$" : ""); name = sos.str(); diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index e1430b30019a5..f379dd0117c8d 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -54,7 +54,9 @@ std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id) } else if (_id == DEVICE_ID_ROOT) { return out << "Dev(ROOT)"; } else { - return out << "Dev(" << (unsigned)_id << ")"; + return out << "Dev(0x" + << std::hex << (unsigned)_id << std::dec + << ")"; } } @@ -64,7 +66,7 @@ std::ostream &operator<<(std::ostream &out, const segment_id_t &segment) return out << "Seg[NULL]"; } else { return out << "Seg[" << device_id_printer_t{segment.device_id()} - << "," << segment.device_segment_id() + << ",0x" << std::hex << segment.device_segment_id() << std::dec << "]"; } } @@ -93,12 +95,12 @@ std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq) } std::ostream &operator<<(std::ostream &out, const laddr_t &laddr) { - return out << 'L' << std::hex << laddr.value << std::dec; + return out << "L0x" << std::hex << laddr.value << std::dec; } std::ostream &operator<<(std::ostream &out, const laddr_offset_t &laddr_offset) { return out << laddr_offset.get_aligned_laddr() - << "+" << std::hex << laddr_offset.get_offset() << std::dec; + << "+0x" << std::hex << laddr_offset.get_offset() << std::dec; } std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr) @@ -123,18 +125,18 @@ std::ostream &operator<<(std::ostream &out, const paddr_t &rhs) } else if (has_device_off(id)) { auto &s = rhs.as_res_paddr(); out << device_id_printer_t{id} - << "," - << s.get_device_off(); + << ",0x" + << std::hex << s.get_device_off() << std::dec; } else if (rhs.get_addr_type() == paddr_types_t::SEGMENT) { auto &s = rhs.as_seg_paddr(); out << s.get_segment_id() - << "," - << s.get_segment_off(); + << ",0x" + << std::hex << s.get_segment_off() << std::dec; } else if (rhs.get_addr_type() == paddr_types_t::RANDOM_BLOCK) { auto &s = rhs.as_blk_paddr(); out << device_id_printer_t{s.get_device_id()} - << "," - << s.get_device_off(); + << ",0x" + << std::hex << s.get_device_off() << std::dec; } else { out << "INVALID!"; } From d39949e8e9c5db4692d5c5ab7168eb965e0c84e5 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Sun, 29 Sep 2024 11:14:14 +0800 Subject: [PATCH 062/148] crimson/os/seastore: adjust op names Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/seastore.cc | 38 ++++++++++++++--------------- src/crimson/os/seastore/seastore.h | 4 +-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index d708231b47b1a..9206d38035a6c 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -40,8 +40,8 @@ template <> struct fmt::formatter auto format(op_type_t op, FormatContext& ctx) const { std::string_view name = "unknown"; switch (op) { - case op_type_t::TRANSACTION: - name = "transaction"; + case op_type_t::DO_TRANSACTION: + name = "do_transaction"; break; case op_type_t::READ: name = "read"; @@ -61,8 +61,8 @@ template <> struct fmt::formatter case op_type_t::OMAP_GET_VALUES: name = "omap_get_values"; break; - case op_type_t::OMAP_LIST: - name = "omap_list"; + case op_type_t::OMAP_GET_VALUES2: + name = "omap_get_values2"; break; case op_type_t::MAX: name = "unknown"; @@ -141,14 +141,14 @@ void SeaStore::Shard::register_metrics() namespace sm = seastar::metrics; using op_type_t = 
crimson::os::seastore::op_type_t; std::pair labels_by_op_type[] = { - {op_type_t::TRANSACTION, sm::label_instance("latency", "TRANSACTION")}, - {op_type_t::READ, sm::label_instance("latency", "READ")}, - {op_type_t::WRITE, sm::label_instance("latency", "WRITE")}, - {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")}, - {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")}, - {op_type_t::STAT, sm::label_instance("latency", "STAT")}, - {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")}, - {op_type_t::OMAP_LIST, sm::label_instance("latency", "OMAP_LIST")}, + {op_type_t::DO_TRANSACTION, sm::label_instance("latency", "DO_TRANSACTION")}, + {op_type_t::READ, sm::label_instance("latency", "READ")}, + {op_type_t::WRITE, sm::label_instance("latency", "WRITE")}, + {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")}, + {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")}, + {op_type_t::STAT, sm::label_instance("latency", "STAT")}, + {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")}, + {op_type_t::OMAP_GET_VALUES2, sm::label_instance("latency", "OMAP_GET_VALUES2")}, }; for (auto& [op_type, label] : labels_by_op_type) { @@ -1033,7 +1033,7 @@ SeaStore::Shard::read( ch, oid, Transaction::src_t::READ, - "read_obj", + "read", op_type_t::READ, [this, offset, len, op_flags](auto &t, auto &onode) -> ObjectDataHandler::read_ret { size_t size = onode.get_layout().size; @@ -1076,7 +1076,7 @@ SeaStore::Shard::exists( c, oid, Transaction::src_t::READ, - "oid_exists", + "exists", op_type_t::READ, [](auto&, auto&) { return seastar::make_ready_future(true); @@ -1180,7 +1180,7 @@ SeaStore::Shard::get_attrs( ch, oid, Transaction::src_t::READ, - "get_addrs", + "get_attrs", op_type_t::GET_ATTRS, [this](auto &t, auto& onode) { auto& layout = onode.get_layout(); @@ -1389,8 +1389,8 @@ SeaStore::Shard::omap_get_values( ch, oid, Transaction::src_t::READ, - "omap_list", - op_type_t::OMAP_LIST, + "omap_get_values2", + op_type_t::OMAP_GET_VALUES2, [this, start](auto &t, auto &onode) { return omap_list( onode, @@ -1445,7 +1445,7 @@ SeaStore::Shard::fiemap( ch, oid, Transaction::src_t::READ, - "fiemap_read", + "fiemap", op_type_t::READ, [this, off, len](auto &t, auto &onode) -> base_iertr::future { size_t size = onode.get_layout().size; @@ -1490,7 +1490,7 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( std::move(_t), Transaction::src_t::MUTATE, "do_transaction", - op_type_t::TRANSACTION, + op_type_t::DO_TRANSACTION, [this](auto &ctx) { return with_trans_intr(*ctx.transaction, [&ctx, this](auto &t) { LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks); diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 58d4f5e593cbe..611e909619ac4 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -35,14 +35,14 @@ using OnodeRef = boost::intrusive_ptr; class TransactionManager; enum class op_type_t : uint8_t { - TRANSACTION = 0, + DO_TRANSACTION = 0, READ, WRITE, GET_ATTR, GET_ATTRS, STAT, OMAP_GET_VALUES, - OMAP_LIST, + OMAP_GET_VALUES2, MAX }; From 589e9cb9356ad55b7cfbb1b0041e8cbd7fd71e57 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Sun, 29 Sep 2024 11:47:46 +0800 Subject: [PATCH 063/148] crimson/os/seastore: refine logs of seastore.h/cc Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/seastore.cc | 398 ++++++++++++++++++---------- src/crimson/os/seastore/seastore.h | 9 +- src/osd/osd_types_fmt.h | 2 + 3 files changed, 265 
insertions(+), 144 deletions(-) diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 9206d38035a6c..e2dee84637831 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -17,6 +17,7 @@ #include "common/safe_io.h" #include "include/stringify.h" #include "os/Transaction.h" +#include "osd/osd_types_fmt.h" #include "crimson/common/buffer_io.h" @@ -192,6 +193,9 @@ void SeaStore::Shard::register_metrics() seastar::future<> SeaStore::start() { + LOG_PREFIX(SeaStore::start); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); #ifndef NDEBUG bool is_test = true; @@ -212,19 +216,30 @@ seastar::future<> SeaStore::start() }).then([this, is_test] { ceph_assert(device); return shard_stores.start(root, device.get(), is_test); + }).then([FNAME] { + INFO("done"); }); } seastar::future<> SeaStore::test_start(DeviceRef device_obj) { + LOG_PREFIX(SeaStore::test_start); + INFO("..."); + ceph_assert(device_obj); ceph_assert(root == ""); device = std::move(device_obj); - return shard_stores.start_single(root, device.get(), true); + return shard_stores.start_single(root, device.get(), true + ).then([FNAME] { + INFO("done"); + }); } seastar::future<> SeaStore::stop() { + LOG_PREFIX(SeaStore::stop); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return seastar::do_for_each(secondaries, [](auto& sec_dev) { return sec_dev->stop(); @@ -237,17 +252,28 @@ seastar::future<> SeaStore::stop() } }).then([this] { return shard_stores.stop(); + }).then([FNAME] { + INFO("done"); }); } SeaStore::mount_ertr::future<> SeaStore::test_mount() { + LOG_PREFIX(SeaStore::test_mount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); - return shard_stores.local().mount_managers(); + return shard_stores.local().mount_managers( + ).then([FNAME] { + INFO("done"); + }); } SeaStore::mount_ertr::future<> SeaStore::mount() { + LOG_PREFIX(SeaStore::mount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return device->mount( ).safe_then([this] { @@ -281,6 +307,8 @@ SeaStore::mount_ertr::future<> SeaStore::mount() return shard_stores.invoke_on_all([](auto &local_store) { return local_store.mount_managers(); }); + }).safe_then([FNAME] { + INFO("done"); }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStore::mount" @@ -300,9 +328,14 @@ seastar::future<> SeaStore::Shard::mount_managers() seastar::future<> SeaStore::umount() { + LOG_PREFIX(SeaStore::umount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.invoke_on_all([](auto &local_store) { return local_store.umount(); + }).then([FNAME] { + INFO("done"); }); } @@ -330,7 +363,7 @@ seastar::future<> SeaStore::Shard::umount() onode_manager.reset(); }).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::umount" + "Invalid error in SeaStoreS::umount" } ); } @@ -377,6 +410,8 @@ SeaStore::Shard::mkfs_managers() "mkfs_seastore", [this](auto& t) { + LOG_PREFIX(SeaStoreS::mkfs_managers); + DEBUGT("...", t); return onode_manager->mkfs(t ).si_then([this, &t] { return collection_manager->mkfs(t); @@ -410,16 +445,22 @@ seastar::future<> SeaStore::set_secondaries() SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid) { + LOG_PREFIX(SeaStore::test_mkfs); + INFO("uuid={} ...", new_osd_fsid); + ceph_assert(seastar::this_shard_id() == primary_core); return read_meta("mkfs_done" - ).then([this, new_osd_fsid](auto tuple) { + ).then([this, 
new_osd_fsid, FNAME](auto tuple) { auto [done, value] = tuple; if (done == 0) { + ERROR("failed"); return seastar::now(); } return shard_stores.local().mkfs_managers( ).then([this, new_osd_fsid] { return prepare_meta(new_osd_fsid); + }).then([FNAME] { + INFO("done"); }); }); } @@ -447,28 +488,29 @@ seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid) SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) { + LOG_PREFIX(SeaStore::mkfs); + INFO("uuid={}, root={} ...", new_osd_fsid, root); + ceph_assert(seastar::this_shard_id() == primary_core); return read_meta("mkfs_done" - ).then([this, new_osd_fsid](auto tuple) { + ).then([this, new_osd_fsid, FNAME](auto tuple) { auto [done, value] = tuple; if (done == 0) { + ERROR("failed"); return seastar::now(); } else { return seastar::do_with( secondary_device_set_t(), - [this, new_osd_fsid](auto& sds) { + [this, new_osd_fsid, FNAME](auto& sds) { auto fut = seastar::now(); - LOG_PREFIX(SeaStore::mkfs); - DEBUG("root: {}", root); if (!root.empty()) { fut = seastar::open_directory(root - ).then([this, &sds, new_osd_fsid](seastar::file rdir) mutable { + ).then([this, &sds, new_osd_fsid, FNAME](seastar::file rdir) mutable { std::unique_ptr root_f = std::make_unique(std::move(rdir)); auto sub = root_f->list_directory( - [this, &sds, new_osd_fsid](auto de) mutable -> seastar::future<> + [this, &sds, new_osd_fsid, FNAME](auto de) mutable -> seastar::future<> { - LOG_PREFIX(SeaStore::mkfs); DEBUG("found file: {}", de.name); if (de.name.find("block.") == 0 && de.name.length() > 6 /* 6 for "block." */) { @@ -533,6 +575,8 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) return prepare_meta(new_osd_fsid); }).safe_then([this] { return umount(); + }).safe_then([FNAME] { + INFO("done"); }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStore::mkfs" @@ -546,14 +590,18 @@ using coll_core_t = SeaStore::coll_core_t; seastar::future> SeaStore::list_collections() { + LOG_PREFIX(SeaStore::list_collections); + DEBUG("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.map([](auto &local_store) { return local_store.list_collections(); - }).then([](std::vector> results) { + }).then([FNAME](std::vector> results) { std::vector collections; for (auto& colls : results) { collections.insert(collections.end(), colls.begin(), colls.end()); } + DEBUG("got {} collections", collections.size()); return seastar::make_ready_future>( std::move(collections)); }); @@ -561,13 +609,16 @@ SeaStore::list_collections() store_statfs_t SeaStore::Shard::stat() const { - return transaction_manager->store_stat(); + LOG_PREFIX(SeaStoreS::stat); + auto ss = transaction_manager->store_stat(); + DEBUG("stat={}", ss); + return ss; } seastar::future SeaStore::stat() const { LOG_PREFIX(SeaStore::stat); - DEBUG(""); + DEBUG("..."); ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.map_reduce0( @@ -579,19 +630,30 @@ seastar::future SeaStore::stat() const ss.add(ret); return std::move(ss); } - ).then([](store_statfs_t ss) { + ).then([FNAME](store_statfs_t ss) { + DEBUG("done, stat={}", ss); return seastar::make_ready_future(std::move(ss)); }); } seastar::future SeaStore::pool_statfs(int64_t pool_id) const { - //TODO - return SeaStore::stat(); + LOG_PREFIX(SeaStore::pool_statfs); + DEBUG("pool_id={} ...", pool_id); + ceph_assert(seastar::this_shard_id() == primary_core); + //TODO + return SeaStore::stat( + ).then([FNAME, pool_id](store_statfs_t ss) { + DEBUG("done, pool_id={}, ret={}", pool_id, 
ss); + return seastar::make_ready_future(std::move(ss)); + }); } seastar::future<> SeaStore::report_stats() { + LOG_PREFIX(SeaStore::report_stats); + DEBUG("..."); + ceph_assert(seastar::this_shard_id() == primary_core); shard_device_stats.resize(seastar::smp::count); shard_io_stats.resize(seastar::smp::count); @@ -610,8 +672,7 @@ seastar::future<> SeaStore::report_stats() local_store.get_io_stats(report_detail, seconds); shard_cache_stats[seastar::this_shard_id()] = local_store.get_cache_stats(report_detail, seconds); - }).then([this] { - LOG_PREFIX(SeaStore); + }).then([this, FNAME] { auto now = seastar::lowres_clock::now(); if (last_tp == seastar::lowres_clock::time_point::min()) { last_tp = now; @@ -858,24 +919,26 @@ SeaStore::Shard::list_objects(CollectionRef ch, "list_objects", [this, ch, start, end, &limit, &ret](auto &t) { + LOG_PREFIX(SeaStoreS::list_objects); + DEBUGT("cid={} start={} end={} limit={} ...", + t, ch->get_cid(), start, end, limit); return get_coll_bits( ch, t - ).si_then([this, ch, &t, start, end, &limit, &ret](auto bits) { + ).si_then([FNAME, this, ch, &t, start, end, &limit, &ret](auto bits) { if (!bits) { + DEBUGT("no bits, return none", t); return list_iertr::make_ready_future< OnodeManager::list_onodes_bare_ret >(std::make_tuple( std::vector(), ghobject_t::get_max())); } else { - LOG_PREFIX(SeaStore::list_objects); - DEBUGT("start {}, end {}, limit {}, bits {}", - t, start, end, limit, *bits); + DEBUGT("bits={} ...", t, *bits); auto filter = SeaStore::get_objs_range(ch, *bits); using list_iertr = OnodeManager::list_onodes_iertr; using repeat_ret = list_iertr::future; return trans_intr::repeat( - [this, &t, &ret, &limit, end, + [this, FNAME, &t, &ret, &limit, end, filter, ranges = get_ranges(ch, start, end, filter) ]() mutable -> repeat_ret { if (limit == 0 || ranges.empty()) { @@ -887,11 +950,10 @@ SeaStore::Shard::list_objects(CollectionRef ch, auto pstart = ite->first; auto pend = ite->second; ranges.pop_front(); - LOG_PREFIX(SeaStore::list_objects); - DEBUGT("pstart {}, pend {}, limit {}", t, pstart, pend, limit); + DEBUGT("pstart {}, pend {}, limit {} ...", t, pstart, pend, limit); return onode_manager->list_onodes( t, pstart, pend, limit - ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end] + ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end, FNAME] (auto &&_ret) mutable { auto &next_objects = std::get<0>(_ret); auto &ret_objects = std::get<0>(ret); @@ -902,7 +964,6 @@ SeaStore::Shard::list_objects(CollectionRef ch, std::get<1>(ret) = std::get<1>(_ret); assert(limit >= next_objects.size()); limit -= next_objects.size(); - LOG_PREFIX(SeaStore::list_objects); DEBUGT("got {} objects, left limit {}", t, next_objects.size(), limit); assert(limit == 0 || @@ -916,7 +977,9 @@ SeaStore::Shard::list_objects(CollectionRef ch, >(seastar::stop_iteration::no); }); } - ).si_then([&ret] { + ).si_then([&ret, FNAME] { + DEBUG("got {} objects, next={}", + std::get<0>(ret).size(), std::get<1>(ret)); return list_iertr::make_ready_future< OnodeManager::list_onodes_bare_ret>(std::move(ret)); }); @@ -929,7 +992,7 @@ SeaStore::Shard::list_objects(CollectionRef ch, return std::move(ret); }).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::list_objects" + "Invalid error in SeaStoreS::list_objects" } ); }).finally([this] { @@ -941,24 +1004,26 @@ SeaStore::Shard::list_objects(CollectionRef ch, seastar::future SeaStore::Shard::create_new_collection(const coll_t& cid) { - LOG_PREFIX(SeaStore::create_new_collection); - DEBUG("{}", cid); + 
LOG_PREFIX(SeaStoreS::create_new_collection); + DEBUG("cid={}", cid); return seastar::make_ready_future(_get_collection(cid)); } seastar::future SeaStore::Shard::open_collection(const coll_t& cid) { - LOG_PREFIX(SeaStore::open_collection); - DEBUG("{}", cid); + LOG_PREFIX(SeaStoreS::open_collection); + DEBUG("cid={} ...", cid); return list_collections( - ).then([cid, this] (auto colls_cores) { + ).then([cid, this, FNAME] (auto colls_cores) { if (auto found = std::find(colls_cores.begin(), colls_cores.end(), std::make_pair(cid, seastar::this_shard_id())); found != colls_cores.end()) { + DEBUG("cid={} exists", cid); return seastar::make_ready_future(_get_collection(cid)); } else { + DEBUG("cid={} not exists", cid); return seastar::make_ready_future(); } }); @@ -968,6 +1033,8 @@ seastar::future<> SeaStore::Shard::set_collection_opts(CollectionRef c, const pool_opts_t& opts) { + LOG_PREFIX(SeaStoreS::set_collection_opts); + DEBUG("cid={}, opts={} not implemented", c->get_cid(), opts); //TODO return seastar::now(); } @@ -989,6 +1056,8 @@ SeaStore::Shard::list_collections() "list_collections", [this, &ret](auto& t) { + LOG_PREFIX(SeaStoreS::list_collections); + DEBUGT("...", t); return transaction_manager->read_collection_root(t ).si_then([this, &t](auto coll_root) { return collection_manager->list(coll_root, t); @@ -1007,7 +1076,7 @@ SeaStore::Shard::list_collections() } ).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::list_collections" + "Invalid error in SeaStoreS::list_collections" } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1023,9 +1092,6 @@ SeaStore::Shard::read( size_t len, uint32_t op_flags) { - LOG_PREFIX(SeaStore::read); - DEBUG("oid {} offset {} len {}", oid, offset, len); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1036,12 +1102,16 @@ SeaStore::Shard::read( "read", op_type_t::READ, [this, offset, len, op_flags](auto &t, auto &onode) -> ObjectDataHandler::read_ret { + LOG_PREFIX(SeaStoreS::read); size_t size = onode.get_layout().size; - if (offset >= size) { + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none", + t, offset, len, size, op_flags); return seastar::make_ready_future(); } + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...", + t, offset, len, size, op_flags); size_t corrected_len = (len == 0) ? 
size - offset : std::min(size - offset, len); @@ -1053,7 +1123,11 @@ SeaStore::Shard::read( onode, }, offset, - corrected_len); + corrected_len + ).si_then([FNAME, &t](auto bl) { + DEBUGT("got bl length=0x{:x}", t, bl.length()); + return bl; + }); } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1066,9 +1140,7 @@ SeaStore::Shard::exists( CollectionRef c, const ghobject_t& oid) { - LOG_PREFIX(SeaStore::exists); - DEBUG("oid {}", oid); - + LOG_PREFIX(SeaStoreS::exists); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1078,10 +1150,12 @@ SeaStore::Shard::exists( Transaction::src_t::READ, "exists", op_type_t::READ, - [](auto&, auto&) { + [FNAME](auto& t, auto&) { + DEBUGT("exists", t); return seastar::make_ready_future(true); }).handle_error( - crimson::ct_error::enoent::handle([] { + crimson::ct_error::enoent::handle([FNAME] { + DEBUG("not exists"); return seastar::make_ready_future(false); }), crimson::ct_error::assert_all{"unexpected error"} @@ -1098,10 +1172,14 @@ SeaStore::Shard::readv( interval_set& m, uint32_t op_flags) { + LOG_PREFIX(SeaStoreS::readv); + DEBUG("cid={} oid={} op_flags=0x{:x} {} intervals", + ch->get_cid(), _oid, op_flags, m.num_intervals()); + return seastar::do_with( _oid, ceph::bufferlist{}, - [ch, op_flags, this, &m](auto &oid, auto &ret) { + [ch, op_flags, this, FNAME, &m](auto &oid, auto &ret) { return crimson::do_for_each( m, [ch, op_flags, this, &oid, &ret](auto &p) { @@ -1110,7 +1188,8 @@ SeaStore::Shard::readv( ).safe_then([&ret](auto bl) { ret.claim_append(bl); }); - }).safe_then([&ret] { + }).safe_then([&ret, FNAME] { + DEBUG("got bl length=0x{:x}", ret.length()); return read_errorator::make_ready_future (std::move(ret)); }); @@ -1125,9 +1204,6 @@ SeaStore::Shard::get_attr( const ghobject_t& oid, std::string_view name) const { - LOG_PREFIX(SeaStore::get_attr); - DEBUG("{} {}", ch->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1138,17 +1214,21 @@ SeaStore::Shard::get_attr( "get_attr", op_type_t::GET_ATTR, [this, name](auto &t, auto& onode) -> _omap_get_value_ret { + LOG_PREFIX(SeaStoreS::get_attr); auto& layout = onode.get_layout(); if (name == OI_ATTR && layout.oi_size) { ceph::bufferlist bl; bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length()); return seastar::make_ready_future(std::move(bl)); } if (name == SS_ATTR && layout.ss_size) { ceph::bufferlist bl; bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); + DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length()); return seastar::make_ready_future(std::move(bl)); } + DEBUGT("name={} ...", t, name); return _omap_get_value( t, layout.xattr_root.get( @@ -1170,9 +1250,6 @@ SeaStore::Shard::get_attrs( CollectionRef ch, const ghobject_t& oid) { - LOG_PREFIX(SeaStore::get_attrs); - DEBUG("{} {}", ch->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1183,6 +1260,8 @@ SeaStore::Shard::get_attrs( "get_attrs", op_type_t::GET_ATTRS, [this](auto &t, auto& onode) { + LOG_PREFIX(SeaStoreS::get_attrs); + DEBUGT("...", t); auto& layout = onode.get_layout(); return omap_list(onode, layout.xattr_root, t, std::nullopt, OMapManager::omap_list_config_t() @@ -1190,17 +1269,17 @@ SeaStore::Shard::get_attrs( .without_max() ).si_then([&layout, &t, FNAME](auto p) { auto& attrs = std::get<1>(p); + DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}", + t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size); ceph::bufferlist bl; if 
(layout.oi_size) { bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); attrs.emplace(OI_ATTR, std::move(bl)); - DEBUGT("set oi from onode layout", t); } if (layout.ss_size) { bl.clear(); bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); attrs.emplace(SS_ATTR, std::move(bl)); - DEBUGT("set ss from onode layout", t); } return seastar::make_ready_future(std::move(attrs)); }); @@ -1222,7 +1301,6 @@ seastar::future SeaStore::Shard::stat( ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - LOG_PREFIX(SeaStore::stat); return repeat_with_onode( c, oid, @@ -1230,18 +1308,20 @@ seastar::future SeaStore::Shard::stat( "stat", op_type_t::STAT, [this, oid](auto &t, auto &onode) { + LOG_PREFIX(SeaStoreS::stat); struct stat st; auto &olayout = onode.get_layout(); st.st_size = olayout.size; st.st_blksize = device->get_block_size(); st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; st.st_nlink = 1; - DEBUGT("cid {}, oid {}, return size {}", t, c->get_cid(), oid, st.st_size); + DEBUGT("oid={}, size={}, blksize={}", + t, oid, st.st_size, st.st_blksize); return seastar::make_ready_future(st); } ).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::stat" + "Invalid error in SeaStoreS::stat" } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1273,6 +1353,8 @@ SeaStore::Shard::omap_get_values( "omap_get_values", op_type_t::OMAP_GET_VALUES, [this, keys](auto &t, auto &onode) { + LOG_PREFIX(SeaStoreS::omap_get_values); + DEBUGT("{} keys ...", t, keys.size()); omap_root_t omap_root = onode.get_layout().omap_root.get( onode.get_metadata_hint(device->get_block_size())); return _omap_get_values( @@ -1297,14 +1379,18 @@ SeaStore::Shard::_omap_get_value( std::move(root), std::string(key), [&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret { + LOG_PREFIX(SeaStoreS::_omap_get_value); if (root.is_null()) { + DEBUGT("key={} is absent because of null root", t, key); return crimson::ct_error::enodata::make(); } return manager.omap_get_value(root, t, key - ).si_then([](auto opt) -> _omap_get_value_ret { + ).si_then([&key, &t, FNAME](auto opt) -> _omap_get_value_ret { if (!opt) { + DEBUGT("key={} is absent", t, key); return crimson::ct_error::enodata::make(); } + DEBUGT("key={}, value length=0x{:x}", t, key, opt->length()); return seastar::make_ready_future(std::move(*opt)); }); }); @@ -1316,14 +1402,16 @@ SeaStore::Shard::_omap_get_values( omap_root_t &&omap_root, const omap_keys_t &keys) const { + LOG_PREFIX(SeaStoreS::_omap_get_values); if (omap_root.is_null()) { + DEBUGT("{} keys are absent because of null root", t, keys.size()); return seastar::make_ready_future(); } return seastar::do_with( BtreeOMapManager(*transaction_manager), std::move(omap_root), omap_values_t(), - [&t, &keys](auto &manager, auto &root, auto &ret) { + [&t, &keys, FNAME](auto &manager, auto &root, auto &ret) { return trans_intr::do_for_each( keys.begin(), keys.end(), @@ -1342,7 +1430,8 @@ SeaStore::Shard::_omap_get_values( } return seastar::now(); }); - }).si_then([&ret] { + }).si_then([&t, &ret, &keys, FNAME] { + DEBUGT("{} keys got {} values", t, keys.size(), ret.size()); return std::move(ret); }); }); @@ -1379,9 +1468,6 @@ SeaStore::Shard::omap_get_values( const ghobject_t &oid, const std::optional &start) { - LOG_PREFIX(SeaStore::omap_get_values); - DEBUG("{} {}", ch->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1392,6 +1478,8 @@ SeaStore::Shard::omap_get_values( "omap_get_values2", op_type_t::OMAP_GET_VALUES2, [this, 
start](auto &t, auto &onode) { + LOG_PREFIX(SeaStoreS::omap_get_values); + DEBUGT("start={} ...", t, start.has_value() ? *start : ""); return omap_list( onode, onode.get_layout().omap_root, @@ -1399,7 +1487,12 @@ SeaStore::Shard::omap_get_values( start, OMapManager::omap_list_config_t() .with_inclusive(false, false) - .without_max()); + .without_max() + ).si_then([FNAME, &t](omap_values_paged_t ret) { + DEBUGT("got {} values, complete={}", + t, std::get<1>(ret).size(), std::get<0>(ret)); + return ret; + }); } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1414,6 +1507,7 @@ SeaStore::Shard::_fiemap( uint64_t off, uint64_t len) const { + LOG_PREFIX(SeaStoreS::_fiemap); return seastar::do_with( ObjectDataHandler(max_object_size), [this, off, len, &t, &onode](auto &objhandler) { @@ -1425,6 +1519,9 @@ SeaStore::Shard::_fiemap( }, off, len); + }).si_then([FNAME, &t](auto ret) { + DEBUGT("got {} intervals", t, ret.size()); + return ret; }); } @@ -1435,9 +1532,6 @@ SeaStore::Shard::fiemap( uint64_t off, uint64_t len) { - LOG_PREFIX(SeaStore::fiemap); - DEBUG("oid: {}, off: {}, len: {} ", oid, off, len); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1448,11 +1542,15 @@ SeaStore::Shard::fiemap( "fiemap", op_type_t::READ, [this, off, len](auto &t, auto &onode) -> base_iertr::future { + LOG_PREFIX(SeaStoreS::fiemap); size_t size = onode.get_layout().size; if (off >= size) { - INFOT("fiemap offset is over onode size!", t); + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none", + t, off, len, size); return seastar::make_ready_future>(); } + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...", + t, off, len, size); size_t adjust_len = (len == 0) ? size - off: std::min(size - off, len); @@ -1464,7 +1562,7 @@ SeaStore::Shard::fiemap( } void SeaStore::Shard::on_error(ceph::os::Transaction &t) { - LOG_PREFIX(SeaStore::on_error); + LOG_PREFIX(SeaStoreS::on_error); ERROR(" transaction dump:\n"); JSONFormatter f(true); f.open_object_section("transaction"); @@ -1485,17 +1583,22 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( ++(shard_stats.starting_io_num); // repeat_with_internal_context ensures ordering via collection lock + auto num_bytes = _t.get_num_bytes(); return repeat_with_internal_context( _ch, std::move(_t), Transaction::src_t::MUTATE, "do_transaction", op_type_t::DO_TRANSACTION, - [this](auto &ctx) { - return with_trans_intr(*ctx.transaction, [&ctx, this](auto &t) { - LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks); - SUBDEBUGT(seastore_t, "start with {} objects", - t, ctx.iter.objects.size()); + [this, num_bytes](auto &ctx) { + LOG_PREFIX(SeaStoreS::do_transaction_no_callbacks); + return with_trans_intr(*ctx.transaction, [&ctx, this, FNAME, num_bytes](auto &t) { + DEBUGT("cid={}, {} operations, {} bytes, {} colls, {} objects ...", + t, ctx.ch->get_cid(), + ctx.ext_transaction.get_num_ops(), + num_bytes, + ctx.iter.colls.size(), + ctx.iter.objects.size()); #ifndef NDEBUG TRACET(" transaction dump:\n", t); JSONFormatter f(true); @@ -1529,6 +1632,8 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( }).si_then([this, &ctx] { return transaction_manager->submit_transaction(*ctx.transaction); }); + }).safe_then([FNAME, &ctx] { + DEBUGT("done", *ctx.transaction); }); } ).finally([this] { @@ -1568,12 +1673,12 @@ SeaStore::Shard::_do_transaction_step( std::vector &d_onodes, ceph::os::Transaction::iterator &i) { - LOG_PREFIX(SeaStore::Shard::_do_transaction_step); + LOG_PREFIX(SeaStoreS::_do_transaction_step); auto op = i.decode_op(); - 
SUBTRACET(seastore_t, "got op {}", *ctx.transaction, (uint32_t)op->op); using ceph::os::Transaction; if (op->op == Transaction::OP_NOP) { + DEBUGT("op NOP", *ctx.transaction); return tm_iertr::now(); } @@ -1581,15 +1686,18 @@ SeaStore::Shard::_do_transaction_step( case Transaction::OP_RMCOLL: { coll_t cid = i.get_cid(op->cid); + DEBUGT("op RMCOLL, cid={} ...", *ctx.transaction, cid); return _remove_collection(ctx, cid); } case Transaction::OP_MKCOLL: { coll_t cid = i.get_cid(op->cid); + DEBUGT("op MKCOLL, cid={} ...", *ctx.transaction, cid); return _create_collection(ctx, cid, op->split_bits); } case Transaction::OP_COLL_HINT: { + DEBUGT("op COLL_HINT", *ctx.transaction); ceph::bufferlist hint; i.decode_bl(hint); return tm_iertr::now(); @@ -1609,12 +1717,16 @@ SeaStore::Shard::_do_transaction_step( if (!onodes[op->oid]) { const ghobject_t& oid = i.get_oid(op->oid); if (!create) { + DEBUGT("op {}, get oid={} ...", + *ctx.transaction, (uint32_t)op->op, oid); fut = onode_manager->get_onode(*ctx.transaction, oid); } else { + DEBUGT("op {}, get_or_create oid={} ...", + *ctx.transaction, (uint32_t)op->op, oid); fut = onode_manager->get_or_create_onode(*ctx.transaction, oid); } } - return fut.si_then([&, op, this](auto get_onode) { + return fut.si_then([&, op, this, FNAME](auto get_onode) { OnodeRef &o = onodes[op->oid]; if (!o) { assert(get_onode); @@ -1624,10 +1736,12 @@ SeaStore::Shard::_do_transaction_step( if ((op->op == Transaction::OP_CLONE || op->op == Transaction::OP_COLL_MOVE_RENAME) && !d_onodes[op->dest_oid]) { + const ghobject_t& dest_oid = i.get_oid(op->dest_oid); + DEBUGT("op {}, get_or_create dest oid={} ...", + *ctx.transaction, (uint32_t)op->op, dest_oid); //TODO: use when_all_succeed after making onode tree // support parallel extents loading - return onode_manager->get_or_create_onode( - *ctx.transaction, i.get_oid(op->dest_oid) + return onode_manager->get_or_create_onode(*ctx.transaction, dest_oid ).si_then([&onodes, &d_onodes, op](auto dest_onode) { assert(dest_onode); auto &d_o = onodes[op->dest_oid]; @@ -1641,12 +1755,12 @@ SeaStore::Shard::_do_transaction_step( return OnodeManager::get_or_create_onode_iertr::now(); } }).si_then([&ctx, &i, &onodes, &d_onodes, op, this, FNAME]() -> tm_ret { - LOG_PREFIX(SeaStore::_do_transaction_step); + const ghobject_t& oid = i.get_oid(op->oid); try { switch (op->op) { case Transaction::OP_REMOVE: { - TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid)); + DEBUGT("op REMOVE, oid={} ...", *ctx.transaction, oid); return _remove(ctx, onodes[op->oid] ).si_then([&onodes, &d_onodes, op] { onodes[op->oid].reset(); @@ -1656,6 +1770,7 @@ SeaStore::Shard::_do_transaction_step( case Transaction::OP_CREATE: case Transaction::OP_TOUCH: { + DEBUGT("op CREATE/TOUCH, oid={} ...", *ctx.transaction, oid); return _touch(ctx, onodes[op->oid]); } case Transaction::OP_WRITE: @@ -1665,6 +1780,8 @@ SeaStore::Shard::_do_transaction_step( uint32_t fadvise_flags = i.get_fadvise_flags(); ceph::bufferlist bl; i.decode_bl(bl); + DEBUGT("op WRITE, oid={}, 0x{:x}~0x{:x}, flags=0x{:x} ...", + *ctx.transaction, oid, off, len, fadvise_flags); return _write( ctx, onodes[op->oid], off, len, std::move(bl), fadvise_flags); @@ -1672,6 +1789,7 @@ SeaStore::Shard::_do_transaction_step( case Transaction::OP_TRUNCATE: { uint64_t off = op->off; + DEBUGT("op TRUNCATE, oid={}, 0x{:x} ...", *ctx.transaction, oid, off); return _truncate(ctx, onodes[op->oid], off); } case Transaction::OP_SETATTR: @@ -1680,80 +1798,96 @@ SeaStore::Shard::_do_transaction_step( std::map to_set; 
ceph::bufferlist& bl = to_set[name]; i.decode_bl(bl); + DEBUGT("op SETATTR, oid={}, attr name={}, value length=0x{:x} ...", + *ctx.transaction, oid, name, bl.length()); return _setattrs(ctx, onodes[op->oid], std::move(to_set)); } case Transaction::OP_SETATTRS: { std::map to_set; i.decode_attrset(to_set); + DEBUGT("op SETATTRS, oid={}, attrs size={} ...", + *ctx.transaction, oid, to_set.size()); return _setattrs(ctx, onodes[op->oid], std::move(to_set)); } case Transaction::OP_RMATTR: { std::string name = i.decode_string(); + DEBUGT("op RMATTR, oid={}, attr name={} ...", + *ctx.transaction, oid, name); return _rmattr(ctx, onodes[op->oid], name); } case Transaction::OP_RMATTRS: { + DEBUGT("op RMATTRS, oid={} ...", *ctx.transaction, oid); return _rmattrs(ctx, onodes[op->oid]); } case Transaction::OP_OMAP_SETKEYS: { std::map aset; i.decode_attrset(aset); + DEBUGT("op OMAP_SETKEYS, oid={}, omap size={} ...", + *ctx.transaction, oid, aset.size()); return _omap_set_values(ctx, onodes[op->oid], std::move(aset)); } case Transaction::OP_OMAP_SETHEADER: { ceph::bufferlist bl; i.decode_bl(bl); + DEBUGT("op OMAP_SETHEADER, oid={}, length=0x{:x} ...", + *ctx.transaction, oid, bl.length()); return _omap_set_header(ctx, onodes[op->oid], std::move(bl)); } case Transaction::OP_OMAP_RMKEYS: { omap_keys_t keys; i.decode_keyset(keys); + DEBUGT("op OMAP_RMKEYS, oid={}, omap size={} ...", + *ctx.transaction, oid, keys.size()); return _omap_rmkeys(ctx, onodes[op->oid], std::move(keys)); } case Transaction::OP_OMAP_RMKEYRANGE: { - string first, last; + std::string first, last; first = i.decode_string(); last = i.decode_string(); + DEBUGT("op OMAP_RMKEYRANGE, oid={}, first={}, last={} ...", + *ctx.transaction, oid, first, last); return _omap_rmkeyrange( ctx, onodes[op->oid], std::move(first), std::move(last)); } case Transaction::OP_OMAP_CLEAR: { + DEBUGT("op OMAP_CLEAR, oid={} ...", *ctx.transaction, oid); return _omap_clear(ctx, onodes[op->oid]); } case Transaction::OP_ZERO: { objaddr_t off = op->off; extent_len_t len = op->len; + DEBUGT("op ZERO, oid={}, 0x{:x}~0x{:x} ...", + *ctx.transaction, oid, off, len); return _zero(ctx, onodes[op->oid], off, len); } case Transaction::OP_SETALLOCHINT: { + DEBUGT("op SETALLOCHINT, oid={}, not implemented", + *ctx.transaction, oid); // TODO return tm_iertr::now(); } case Transaction::OP_CLONE: { - TRACET("cloning {} to {}", - *ctx.transaction, - i.get_oid(op->oid), - i.get_oid(op->dest_oid)); + DEBUGT("op CLONE, oid={}, dest oid={} ...", + *ctx.transaction, oid, i.get_oid(op->dest_oid)); return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]); } case Transaction::OP_COLL_MOVE_RENAME: { + DEBUGT("op COLL_MOVE_RENAME, oid={}, dest oid={} ...", + *ctx.transaction, oid, i.get_oid(op->dest_oid)); ceph_assert(op->cid == op->dest_cid); - TRACET("renaming {} to {}", - *ctx.transaction, - i.get_oid(op->oid), - i.get_oid(op->dest_oid)); return _rename( ctx, onodes[op->oid], d_onodes[op->dest_oid] ).si_then([&onodes, &d_onodes, op] { @@ -1789,7 +1923,7 @@ SeaStore::Shard::_do_transaction_step( return seastar::now(); }), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::do_transaction_step" + "Invalid error in SeaStoreS::do_transaction_step" } ); } @@ -1825,7 +1959,7 @@ SeaStore::Shard::_rename( ).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_rename"} + "Invalid error in SeaStoreS::_rename"} ); } @@ -1846,7 +1980,7 @@ SeaStore::Shard::_remove_omaps( 
).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_remove" + "Invalid error in SeaStoreS::_remove_omaps" } ); }); @@ -1859,8 +1993,6 @@ SeaStore::Shard::_remove( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_remove); - DEBUGT("onode={}", *ctx.transaction, *onode); return _remove_omaps( ctx, onode, @@ -1888,7 +2020,7 @@ SeaStore::Shard::_remove( }).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all( - "Invalid error in SeaStore::_remove" + "Invalid error in SeaStoreS::_remove" ) ); } @@ -1898,8 +2030,6 @@ SeaStore::Shard::_touch( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_touch); - DEBUGT("onode={}", *ctx.transaction, *onode); return tm_iertr::now(); } @@ -1911,8 +2041,6 @@ SeaStore::Shard::_write( ceph::bufferlist &&_bl, uint32_t fadvise_flags) { - LOG_PREFIX(SeaStore::_write); - DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); const auto &object_size = onode->get_layout().size; if (offset + len > object_size) { onode->update_onode_size( @@ -2003,8 +2131,6 @@ SeaStore::Shard::_clone( OnodeRef &onode, OnodeRef &d_onode) { - LOG_PREFIX(SeaStore::_clone); - DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode); return seastar::do_with( ObjectDataHandler(max_object_size), [this, &ctx, &onode, &d_onode](auto &objHandler) { @@ -2030,9 +2156,10 @@ SeaStore::Shard::_zero( objaddr_t offset, extent_len_t len) { - LOG_PREFIX(SeaStore::_zero); - DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); if (offset + len >= max_object_size) { + LOG_PREFIX(SeaStoreS::_zero); + ERRORT("0x{:x}~0x{:x} >= 0x{:x}", + *ctx.transaction, offset, len, max_object_size); return crimson::ct_error::input_output_error::make(); } const auto &object_size = onode->get_layout().size; @@ -2088,8 +2215,6 @@ SeaStore::Shard::_omap_set_values( OnodeRef &onode, std::map &&aset) { - LOG_PREFIX(SeaStore::_omap_set_values); - DEBUGT("{} {} keys", *ctx.transaction, *onode, aset.size()); return _omap_set_kvs( onode, onode->get_layout().omap_root, @@ -2108,8 +2233,6 @@ SeaStore::Shard::_omap_set_header( OnodeRef &onode, ceph::bufferlist &&header) { - LOG_PREFIX(SeaStore::_omap_set_header); - DEBUGT("{} {} bytes", *ctx.transaction, *onode, header.length()); std::map to_set; to_set[OMAP_HEADER_XATTR_KEY] = header; return _setattrs(ctx, onode,std::move(to_set)); @@ -2120,8 +2243,6 @@ SeaStore::Shard::_omap_clear( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_omap_clear); - DEBUGT("{} {} keys", *ctx.transaction, *onode); return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY) ).si_then([this, &ctx, &onode]() -> tm_ret { if (auto omap_root = onode->get_layout().omap_root.get( @@ -2155,8 +2276,6 @@ SeaStore::Shard::_omap_rmkeys( OnodeRef &onode, omap_keys_t &&keys) { - LOG_PREFIX(SeaStore::_omap_rmkeys); - DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size()); auto omap_root = onode->get_layout().omap_root.get( onode->get_metadata_hint(device->get_block_size())); if (omap_root.is_null()) { @@ -2197,10 +2316,9 @@ SeaStore::Shard::_omap_rmkeyrange( std::string first, std::string last) { - LOG_PREFIX(SeaStore::_omap_rmkeyrange); - DEBUGT("{} first={} last={}", *ctx.transaction, *onode, first, last); if (first > last) { - ERRORT("range error, first: {} > last:{}", *ctx.transaction, first, last); + LOG_PREFIX(SeaStoreS::_omap_rmkeyrange); + ERRORT("range error, 
first:{} > last:{}", *ctx.transaction, first, last); ceph_abort(); } auto omap_root = onode->get_layout().omap_root.get( @@ -2243,8 +2361,6 @@ SeaStore::Shard::_truncate( OnodeRef &onode, uint64_t size) { - LOG_PREFIX(SeaStore::_truncate); - DEBUGT("onode={} size={}", *ctx.transaction, *onode, size); onode->update_onode_size(*ctx.transaction, size); return seastar::do_with( ObjectDataHandler(max_object_size), @@ -2265,9 +2381,7 @@ SeaStore::Shard::_setattrs( OnodeRef &onode, std::map&& aset) { - LOG_PREFIX(SeaStore::_setattrs); - DEBUGT("onode={}", *ctx.transaction, *onode); - + LOG_PREFIX(SeaStoreS::_setattrs); auto fut = tm_iertr::now(); auto& layout = onode->get_layout(); if (auto it = aset.find(OI_ATTR); it != aset.end()) { @@ -2329,8 +2443,6 @@ SeaStore::Shard::_rmattr( OnodeRef &onode, std::string name) { - LOG_PREFIX(SeaStore::_rmattr); - DEBUGT("onode={}", *ctx.transaction, *onode); auto& layout = onode->get_layout(); if ((name == OI_ATTR) && (layout.oi_size > 0)) { onode->clear_object_info(*ctx.transaction); @@ -2352,7 +2464,7 @@ SeaStore::Shard::_xattr_rmattr( OnodeRef &onode, std::string &&name) { - LOG_PREFIX(SeaStore::_xattr_rmattr); + LOG_PREFIX(SeaStoreS::_xattr_rmattr); DEBUGT("onode={}", *ctx.transaction, *onode); auto xattr_root = onode->get_layout().xattr_root.get( onode->get_metadata_hint(device->get_block_size())); @@ -2380,8 +2492,6 @@ SeaStore::Shard::_rmattrs( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_rmattrs); - DEBUGT("onode={}", *ctx.transaction, *onode); onode->clear_object_info(*ctx.transaction); onode->clear_snapset(*ctx.transaction); return _xattr_clear(ctx, onode); @@ -2392,7 +2502,7 @@ SeaStore::Shard::_xattr_clear( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_xattr_clear); + LOG_PREFIX(SeaStoreS::_xattr_clear); DEBUGT("onode={}", *ctx.transaction, *onode); auto xattr_root = onode->get_layout().xattr_root.get( onode->get_metadata_hint(device->get_block_size())); @@ -2442,7 +2552,7 @@ SeaStore::Shard::_create_collection( }).handle_error_interruptible( tm_iertr::pass_further{}, crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_create_collection" + "Invalid error in SeaStoreS::_create_collection" } ); } @@ -2474,7 +2584,7 @@ SeaStore::Shard::_remove_collection( }).handle_error_interruptible( tm_iertr::pass_further{}, crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_create_collection" + "Invalid error in SeaStoreS::_create_collection" } ); } @@ -2488,12 +2598,17 @@ SeaStore::Shard::_get_collection(const coll_t& cid) seastar::future<> SeaStore::write_meta( const std::string& key, const std::string& value) { + LOG_PREFIX(SeaStore::write_meta); + DEBUG("key={} value={} ...", key, value); + ceph_assert(seastar::this_shard_id() == primary_core); return seastar::do_with(key, value, - [this](auto& key, auto& value) { + [this, FNAME](auto& key, auto& value) { return shard_stores.local().write_meta(key, value ).then([this, &key, &value] { return mdstore->write_meta(key, value); + }).safe_then([FNAME, &key, &value] { + DEBUG("key={} value={} done", key, value); }).handle_error( crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} ); @@ -2504,23 +2619,21 @@ seastar::future<> SeaStore::Shard::write_meta( const std::string& key, const std::string& value) { - LOG_PREFIX(SeaStore::write_meta); - DEBUG("key: {}; value: {}", key, value); - ++(shard_stats.io_num); ++(shard_stats.pending_io_num); // For TM::submit_transaction() ++(shard_stats.processing_inlock_io_num); - return 
repeat_eagain([this, FNAME, &key, &value] { + return repeat_eagain([this, &key, &value] { ++(shard_stats.repeat_io_num); return transaction_manager->with_transaction_intr( Transaction::src_t::MUTATE, "write_meta", - [this, FNAME, &key, &value](auto& t) + [this, &key, &value](auto& t) { - DEBUGT("Have transaction, key: {}; value: {}", t, key, value); + LOG_PREFIX(SeaStoreS::write_meta); + DEBUGT("key={} value={} ...", t, key, value); return transaction_manager->update_root_meta( t, key, value ).si_then([this, &t] { @@ -2528,7 +2641,7 @@ seastar::future<> SeaStore::Shard::write_meta( }); }); }).handle_error( - crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} + crimson::ct_error::assert_all{"Invalid error in SeaStoreS::write_meta"} ).finally([this] { assert(shard_stats.pending_io_num); --(shard_stats.pending_io_num); @@ -2542,13 +2655,16 @@ seastar::future> SeaStore::read_meta(const std::string& key) { LOG_PREFIX(SeaStore::read_meta); - DEBUG("key: {}", key); + DEBUG("key={} ...", key); + ceph_assert(seastar::this_shard_id() == primary_core); return mdstore->read_meta(key - ).safe_then([](auto v) { + ).safe_then([key, FNAME](auto v) { if (v) { + DEBUG("key={}, value={}", key, *v); return std::make_tuple(0, std::move(*v)); } else { + ERROR("key={} failed", key); return std::make_tuple(-1, std::string("")); } }).handle_error( @@ -2605,7 +2721,7 @@ shard_stats_t SeaStore::Shard::get_io_stats( ret.minus(last_shard_stats); if (report_detail && seconds != 0) { - LOG_PREFIX(SeaStore::get_io_stats); + LOG_PREFIX(SeaStoreS::get_io_stats); auto calc_conflicts = [](uint64_t ios, uint64_t repeats) { return (double)(repeats-ios)/ios; }; diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 611e909619ac4..f851cedda827c 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -300,18 +300,21 @@ class SeaStore final : public FuturizedStore { auto begin_time = std::chrono::steady_clock::now(); return seastar::do_with( oid, Ret{}, std::forward(f), - [this, src, op_type, begin_time, tname + [this, ch, src, op_type, begin_time, tname ](auto &oid, auto &ret, auto &f) { - return repeat_eagain([&, this, src, tname] { + return repeat_eagain([&, this, ch, src, tname] { assert(src == Transaction::src_t::READ); ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( src, tname, - [&, this](auto& t) + [&, this, ch, tname](auto& t) { + LOG_PREFIX(SeaStoreS::repeat_with_onode); + SUBDEBUGT(seastore, "{} cid={} oid={} ...", + t, tname, ch->get_cid(), oid); return onode_manager->get_onode(t, oid ).si_then([&](auto onode) { return seastar::do_with(std::move(onode), [&](auto& onode) { diff --git a/src/osd/osd_types_fmt.h b/src/osd/osd_types_fmt.h index 04f4d46ee5109..100ce6e4646b3 100644 --- a/src/osd/osd_types_fmt.h +++ b/src/osd/osd_types_fmt.h @@ -392,4 +392,6 @@ inline std::ostream &operator<<(std::ostream &lhs, const object_stat_sum_t &sum) #if FMT_VERSION >= 90000 template struct fmt::formatter> : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; #endif From 14eacf64559c9130977026ba085f1c6887645c7b Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Sun, 29 Sep 2024 13:47:17 +0800 Subject: [PATCH 064/148] crimson/os/seastore: workaround log linkage issue under clang14 The logs printing lambda-captured variables cannot be linked correctly with SeaStore::Shard::repeat_with_onode() under clang14. 
Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/seastore.cc | 311 ++++++++++++++++------------ src/crimson/os/seastore/seastore.h | 31 +++ 2 files changed, 210 insertions(+), 132 deletions(-) diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index e2dee84637831..d90edbb20dbe2 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -1084,6 +1084,42 @@ SeaStore::Shard::list_collections() }); } +SeaStore::base_iertr::future +SeaStore::Shard::_read( + Transaction& t, + Onode& onode, + uint64_t offset, + std::size_t len, + uint32_t op_flags) +{ + LOG_PREFIX(SeaStoreS::_read); + size_t size = onode.get_layout().size; + if (offset >= size) { + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none", + t, offset, len, size, op_flags); + return seastar::make_ready_future(); + } + + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...", + t, offset, len, size, op_flags); + size_t corrected_len = (len == 0) ? + size - offset : + std::min(size - offset, len); + + return ObjectDataHandler(max_object_size).read( + ObjectDataHandler::context_t{ + *transaction_manager, + t, + onode, + }, + offset, + corrected_len + ).si_then([FNAME, &t](auto bl) { + DEBUGT("got bl length=0x{:x}", t, bl.length()); + return bl; + }); +} + SeaStore::Shard::read_errorator::future SeaStore::Shard::read( CollectionRef ch, @@ -1101,35 +1137,9 @@ SeaStore::Shard::read( Transaction::src_t::READ, "read", op_type_t::READ, - [this, offset, len, op_flags](auto &t, auto &onode) -> ObjectDataHandler::read_ret { - LOG_PREFIX(SeaStoreS::read); - size_t size = onode.get_layout().size; - if (offset >= size) { - DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none", - t, offset, len, size, op_flags); - return seastar::make_ready_future(); - } - - DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...", - t, offset, len, size, op_flags); - size_t corrected_len = (len == 0) ? 
- size - offset : - std::min(size - offset, len); - - return ObjectDataHandler(max_object_size).read( - ObjectDataHandler::context_t{ - *transaction_manager, - t, - onode, - }, - offset, - corrected_len - ).si_then([FNAME, &t](auto bl) { - DEBUGT("got bl length=0x{:x}", t, bl.length()); - return bl; - }); - } - ).finally([this] { + [this, offset, len, op_flags](auto &t, auto &onode) { + return _read(t, onode, offset, len, op_flags); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); @@ -1198,6 +1208,34 @@ SeaStore::Shard::readv( using crimson::os::seastore::omap_manager::BtreeOMapManager; +SeaStore::Shard::_omap_get_value_ret +SeaStore::Shard::_get_attr( + Transaction& t, + Onode& onode, + std::string_view name) const +{ + LOG_PREFIX(SeaStoreS::_get_attr); + auto& layout = onode.get_layout(); + if (name == OI_ATTR && layout.oi_size) { + ceph::bufferlist bl; + bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length()); + return seastar::make_ready_future(std::move(bl)); + } + if (name == SS_ATTR && layout.ss_size) { + ceph::bufferlist bl; + bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); + DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length()); + return seastar::make_ready_future(std::move(bl)); + } + DEBUGT("name={} ...", t, name); + return _omap_get_value( + t, + layout.xattr_root.get( + onode.get_metadata_hint(device->get_block_size())), + name); +} + SeaStore::Shard::get_attr_errorator::future SeaStore::Shard::get_attr( CollectionRef ch, @@ -1213,29 +1251,9 @@ SeaStore::Shard::get_attr( Transaction::src_t::READ, "get_attr", op_type_t::GET_ATTR, - [this, name](auto &t, auto& onode) -> _omap_get_value_ret { - LOG_PREFIX(SeaStoreS::get_attr); - auto& layout = onode.get_layout(); - if (name == OI_ATTR && layout.oi_size) { - ceph::bufferlist bl; - bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); - DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length()); - return seastar::make_ready_future(std::move(bl)); - } - if (name == SS_ATTR && layout.ss_size) { - ceph::bufferlist bl; - bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); - DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length()); - return seastar::make_ready_future(std::move(bl)); - } - DEBUGT("name={} ...", t, name); - return _omap_get_value( - t, - layout.xattr_root.get( - onode.get_metadata_hint(device->get_block_size())), - name); - } - ).handle_error( + [this, name](auto &t, auto& onode) { + return _get_attr(t, onode, name); + }).handle_error( crimson::ct_error::input_output_error::assert_failure{ "EIO when getting attrs"}, crimson::ct_error::pass_further_all{} @@ -1245,6 +1263,36 @@ SeaStore::Shard::get_attr( }); } +SeaStore::base_iertr::future +SeaStore::Shard::_get_attrs( + Transaction& t, + Onode& onode) +{ + LOG_PREFIX(SeaStoreS::_get_attrs); + DEBUGT("...", t); + auto& layout = onode.get_layout(); + return omap_list(onode, layout.xattr_root, t, std::nullopt, + OMapManager::omap_list_config_t() + .with_inclusive(false, false) + .without_max() + ).si_then([&layout, &t, FNAME](auto p) { + auto& attrs = std::get<1>(p); + DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}", + t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size); + ceph::bufferlist bl; + if (layout.oi_size) { + bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + attrs.emplace(OI_ATTR, std::move(bl)); + } + if (layout.ss_size) { + bl.clear(); + bl.append(ceph::bufferptr(&layout.ss[0], 
layout.ss_size)); + attrs.emplace(SS_ATTR, std::move(bl)); + } + return seastar::make_ready_future(std::move(attrs)); + }); +} + SeaStore::Shard::get_attrs_ertr::future SeaStore::Shard::get_attrs( CollectionRef ch, @@ -1260,31 +1308,8 @@ SeaStore::Shard::get_attrs( "get_attrs", op_type_t::GET_ATTRS, [this](auto &t, auto& onode) { - LOG_PREFIX(SeaStoreS::get_attrs); - DEBUGT("...", t); - auto& layout = onode.get_layout(); - return omap_list(onode, layout.xattr_root, t, std::nullopt, - OMapManager::omap_list_config_t() - .with_inclusive(false, false) - .without_max() - ).si_then([&layout, &t, FNAME](auto p) { - auto& attrs = std::get<1>(p); - DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}", - t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size); - ceph::bufferlist bl; - if (layout.oi_size) { - bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); - attrs.emplace(OI_ATTR, std::move(bl)); - } - if (layout.ss_size) { - bl.clear(); - bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); - attrs.emplace(SS_ATTR, std::move(bl)); - } - return seastar::make_ready_future(std::move(attrs)); - }); - } - ).handle_error( + return _get_attrs(t, onode); + }).handle_error( crimson::ct_error::input_output_error::assert_failure{ "EIO when getting attrs"}, crimson::ct_error::pass_further_all{} @@ -1294,6 +1319,23 @@ SeaStore::Shard::get_attrs( }); } +seastar::future SeaStore::Shard::_stat( + Transaction& t, + Onode& onode, + const ghobject_t& oid) +{ + LOG_PREFIX(SeaStoreS::_stat); + struct stat st; + auto &olayout = onode.get_layout(); + st.st_size = olayout.size; + st.st_blksize = device->get_block_size(); + st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; + st.st_nlink = 1; + DEBUGT("oid={}, size={}, blksize={}", + t, oid, st.st_size, st.st_blksize); + return seastar::make_ready_future(st); +} + seastar::future SeaStore::Shard::stat( CollectionRef c, const ghobject_t& oid) @@ -1308,18 +1350,8 @@ seastar::future SeaStore::Shard::stat( "stat", op_type_t::STAT, [this, oid](auto &t, auto &onode) { - LOG_PREFIX(SeaStoreS::stat); - struct stat st; - auto &olayout = onode.get_layout(); - st.st_size = olayout.size; - st.st_blksize = device->get_block_size(); - st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; - st.st_nlink = 1; - DEBUGT("oid={}, size={}, blksize={}", - t, oid, st.st_size, st.st_blksize); - return seastar::make_ready_future(st); - } - ).handle_error( + return _stat(t, onode, oid); + }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStoreS::stat" } @@ -1337,6 +1369,22 @@ SeaStore::Shard::omap_get_header( return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY); } +SeaStore::base_iertr::future +SeaStore::Shard::do_omap_get_values( + Transaction& t, + Onode& onode, + const omap_keys_t& keys) +{ + LOG_PREFIX(SeaStoreS::do_omap_get_values); + DEBUGT("{} keys ...", t, keys.size()); + omap_root_t omap_root = onode.get_layout().omap_root.get( + onode.get_metadata_hint(device->get_block_size())); + return _omap_get_values( + t, + std::move(omap_root), + keys); +} + SeaStore::Shard::read_errorator::future SeaStore::Shard::omap_get_values( CollectionRef ch, @@ -1353,16 +1401,8 @@ SeaStore::Shard::omap_get_values( "omap_get_values", op_type_t::OMAP_GET_VALUES, [this, keys](auto &t, auto &onode) { - LOG_PREFIX(SeaStoreS::omap_get_values); - DEBUGT("{} keys ...", t, keys.size()); - omap_root_t omap_root = onode.get_layout().omap_root.get( - onode.get_metadata_hint(device->get_block_size())); - return _omap_get_values( - t, - 
std::move(omap_root), - keys); - } - ).finally([this] { + return do_omap_get_values(t, onode, keys); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); @@ -1462,6 +1502,29 @@ SeaStore::Shard::omap_list( }); } +SeaStore::base_iertr::future +SeaStore::Shard::do_omap_get_values( + Transaction& t, + Onode& onode, + const std::optional& start) +{ + LOG_PREFIX(SeaStoreS::do_omap_get_values); + DEBUGT("start={} ...", t, start.has_value() ? *start : ""); + return omap_list( + onode, + onode.get_layout().omap_root, + t, + start, + OMapManager::omap_list_config_t() + .with_inclusive(false, false) + .without_max() + ).si_then([FNAME, &t](omap_values_paged_t ret) { + DEBUGT("got {} values, complete={}", + t, std::get<1>(ret).size(), std::get<0>(ret)); + return ret; + }); +} + SeaStore::Shard::read_errorator::future SeaStore::Shard::omap_get_values( CollectionRef ch, @@ -1478,23 +1541,8 @@ SeaStore::Shard::omap_get_values( "omap_get_values2", op_type_t::OMAP_GET_VALUES2, [this, start](auto &t, auto &onode) { - LOG_PREFIX(SeaStoreS::omap_get_values); - DEBUGT("start={} ...", t, start.has_value() ? *start : ""); - return omap_list( - onode, - onode.get_layout().omap_root, - t, - start, - OMapManager::omap_list_config_t() - .with_inclusive(false, false) - .without_max() - ).si_then([FNAME, &t](omap_values_paged_t ret) { - DEBUGT("got {} values, complete={}", - t, std::get<1>(ret).size(), std::get<0>(ret)); - return ret; - }); - } - ).finally([this] { + return do_omap_get_values(t, onode, start); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); @@ -1508,9 +1556,20 @@ SeaStore::Shard::_fiemap( uint64_t len) const { LOG_PREFIX(SeaStoreS::_fiemap); + size_t size = onode.get_layout().size; + if (off >= size) { + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none", + t, off, len, size); + return seastar::make_ready_future>(); + } + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...", + t, off, len, size); + size_t adjust_len = (len == 0) ? + size - off: + std::min(size - off, len); return seastar::do_with( ObjectDataHandler(max_object_size), - [this, off, len, &t, &onode](auto &objhandler) { + [this, off, adjust_len, &t, &onode](auto &objhandler) { return objhandler.fiemap( ObjectDataHandler::context_t{ *transaction_manager, @@ -1518,7 +1577,7 @@ SeaStore::Shard::_fiemap( onode, }, off, - len); + adjust_len); }).si_then([FNAME, &t](auto ret) { DEBUGT("got {} intervals", t, ret.size()); return ret; @@ -1541,20 +1600,8 @@ SeaStore::Shard::fiemap( Transaction::src_t::READ, "fiemap", op_type_t::READ, - [this, off, len](auto &t, auto &onode) -> base_iertr::future { - LOG_PREFIX(SeaStoreS::fiemap); - size_t size = onode.get_layout().size; - if (off >= size) { - DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none", - t, off, len, size); - return seastar::make_ready_future>(); - } - DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...", - t, off, len, size); - size_t adjust_len = (len == 0) ? 
- size - off: - std::min(size - off, len); - return _fiemap(t, onode, off, adjust_len); + [this, off, len](auto &t, auto &onode) { + return _fiemap(t, onode, off, len); }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index f851cedda827c..185072744f2d8 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -357,6 +357,37 @@ class SeaStore final : public FuturizedStore { friend class SeaStoreOmapIterator; + base_iertr::future _read( + Transaction& t, + Onode& onode, + uint64_t offset, + std::size_t len, + uint32_t op_flags); + + _omap_get_value_ret _get_attr( + Transaction& t, + Onode& onode, + std::string_view name) const; + + base_iertr::future _get_attrs( + Transaction& t, + Onode& onode); + + seastar::future _stat( + Transaction& t, + Onode& onode, + const ghobject_t& oid); + + base_iertr::future do_omap_get_values( + Transaction& t, + Onode& onode, + const omap_keys_t& keys); + + base_iertr::future do_omap_get_values( + Transaction& t, + Onode& onode, + const std::optional& start); + base_iertr::future _fiemap( Transaction &t, Onode &onode, From 1ee32107df9c641908bc0b908be47f1d5af3bf63 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Tue, 10 Sep 2024 11:52:56 +0800 Subject: [PATCH 065/148] crimson/os/seastore/transaction_manager: misc cleanups Signed-off-by: Yingxin Cheng --- .../os/seastore/transaction_manager.cc | 44 ++++--- src/crimson/os/seastore/transaction_manager.h | 112 ++++++++---------- 2 files changed, 74 insertions(+), 82 deletions(-) diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index a76b7fbe0c96c..0fd23ef6afbbb 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -98,7 +98,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() }); } -TransactionManager::mount_ertr::future<> TransactionManager::mount() +TransactionManager::mount_ertr::future<> +TransactionManager::mount() { LOG_PREFIX(TransactionManager::mount); INFO("enter"); @@ -175,7 +176,8 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() ); } -TransactionManager::close_ertr::future<> TransactionManager::close() { +TransactionManager::close_ertr::future<> +TransactionManager::close() { LOG_PREFIX(TransactionManager::close); INFO("enter"); return epm->stop_background( @@ -241,11 +243,11 @@ TransactionManager::ref_ret TransactionManager::remove( }); } -TransactionManager::ref_ret TransactionManager::_dec_ref( +TransactionManager::ref_ret TransactionManager::remove( Transaction &t, laddr_t offset) { - LOG_PREFIX(TransactionManager::_dec_ref); + LOG_PREFIX(TransactionManager::remove); TRACET("{}", t, offset); return lba_manager->decref_extent(t, offset ).si_then([this, FNAME, offset, &t](auto result) -> ref_ret { @@ -273,17 +275,18 @@ TransactionManager::refs_ret TransactionManager::remove( LOG_PREFIX(TransactionManager::remove); DEBUG("{} offsets", offsets.size()); return seastar::do_with(std::move(offsets), std::vector(), - [this, &t] (auto &&offsets, auto &refcnt) { - return trans_intr::do_for_each(offsets.begin(), offsets.end(), - [this, &t, &refcnt] (auto &laddr) { - return this->remove(t, laddr).si_then([&refcnt] (auto ref) { - refcnt.push_back(ref); - return ref_iertr::now(); - }); - }).si_then([&refcnt] { - return ref_iertr::make_ready_future>(std::move(refcnt)); + [this, &t](auto 
&&offsets, auto &refcnts) { + return trans_intr::do_for_each(offsets.begin(), offsets.end(), + [this, &t, &refcnts](auto &laddr) { + return this->remove(t, laddr + ).si_then([&refcnts](auto ref) { + refcnts.push_back(ref); + return ref_iertr::now(); }); + }).si_then([&refcnts] { + return ref_iertr::make_ready_future>(std::move(refcnts)); }); + }); } TransactionManager::submit_transaction_iertr::future<> @@ -340,6 +343,7 @@ TransactionManager::update_lba_mappings( return; } if (extent->is_logical()) { + assert(is_logical_type(extent->get_type())); // for rewritten extents, last_committed_crc should have been set // because the crc of the original extent may be reused. // also see rewrite_logical_extent() @@ -359,6 +363,7 @@ TransactionManager::update_lba_mappings( #endif lextents.emplace_back(extent->template cast()); } else { + assert(is_physical_type(extent->get_type())); pextents.emplace_back(extent); } }; @@ -566,7 +571,8 @@ TransactionManager::rewrite_logical_extent( 0, lextent->get_length(), extent_ref_count_t(0), - [this, lextent, &t](auto &extents, auto &off, auto &left, auto &refcount) { + [this, lextent, &t] + (auto &extents, auto &off, auto &left, auto &refcount) { return trans_intr::do_for_each( extents, [lextent, this, &t, &off, &left, &refcount](auto &nextent) { @@ -665,11 +671,6 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( t.get_rewrite_stats().account_n_dirty(); } - if (is_backref_node(extent->get_type())) { - DEBUGT("rewriting backref extent -- {}", t, *extent); - return backref_manager->rewrite_extent(t, extent); - } - if (is_root_type(extent->get_type())) { DEBUGT("rewriting root extent -- {}", t, *extent); cache->duplicate_for_write(t, extent); @@ -677,8 +678,13 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( } if (extent->is_logical()) { + assert(is_logical_type(extent->get_type())); return rewrite_logical_extent(t, extent->cast()); + } else if (is_backref_node(extent->get_type())) { + DEBUGT("rewriting backref extent -- {}", t, *extent); + return backref_manager->rewrite_extent(t, extent); } else { + assert(is_lba_node(extent->get_type())); DEBUGT("rewriting physical extent -- {}", t, *extent); return lba_manager->rewrite_extent(t, extent); } diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 828b8a25592fc..6d1b010ab69ea 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -215,49 +215,6 @@ class TransactionManager : public ExtentCallbackInterface { }); } - template - std::variant>> - get_extent_if_linked( - Transaction &t, - LBAMappingRef pin) - { - ceph_assert(pin->is_parent_viewable()); - // checking the lba child must be atomic with creating - // and linking the absent child - auto v = pin->get_logical_extent(t); - if (v.has_child()) { - return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) { -#ifndef NDEBUG - auto lextent = extent->template cast(); - auto pin_laddr = pin->get_key(); - if (pin->is_indirect()) { - pin_laddr = pin->get_intermediate_base(); - } - assert(lextent->get_laddr() == pin_laddr); -#endif - return extent->template cast(); - }); - } else { - return pin; - } - } - - base_iertr::future read_pin_by_type( - Transaction &t, - LBAMappingRef pin, - extent_types_t type) - { - ceph_assert(!pin->parent_modified()); - auto v = pin->get_logical_extent(t); - // checking the lba child must be atomic with creating - // and linking the absent child - if 
(v.has_child()) { - return std::move(v.get_child_fut()); - } else { - return pin_to_extent_by_type(t, std::move(pin), type); - } - } - /// Obtain mutable copy of extent LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { LOG_PREFIX(TransactionManager::get_mutable_extent); @@ -282,7 +239,6 @@ class TransactionManager : public ExtentCallbackInterface { return ret; } - using ref_iertr = LBAManager::ref_iertr; using ref_ret = ref_iertr::future; @@ -302,26 +258,15 @@ class TransactionManager : public ExtentCallbackInterface { * remove * * Remove the extent and the corresponding lba mapping, - * users must make sure that lba mapping's refcount is 1 + * users must make sure that lba mapping's refcount > 1 */ ref_ret remove( Transaction &t, LogicalCachedExtentRef &ref); - /** - * remove - * - * 1. Remove the indirect mapping(s), and if refcount drops to 0, - * also remove the direct mapping and retire the extent. - * - * 2. Remove the direct mapping(s) and retire the extent if - * refcount drops to 0. - */ ref_ret remove( Transaction &t, - laddr_t offset) { - return _dec_ref(t, offset); - } + laddr_t offset); /// remove refcount for list of offset using refs_ret = ref_iertr::future>; @@ -411,7 +356,10 @@ class TransactionManager : public ExtentCallbackInterface { } template - read_extent_ret get_mutable_extent_by_laddr(Transaction &t, laddr_t laddr, extent_len_t len) { + read_extent_ret get_mutable_extent_by_laddr( + Transaction &t, + laddr_t laddr, + extent_len_t len) { return get_pin(t, laddr ).si_then([this, &t, len](auto pin) { ceph_assert(pin->is_data_stable() && !pin->is_zero_reserved()); @@ -853,6 +801,49 @@ class TransactionManager : public ExtentCallbackInterface { shard_stats_t& shard_stats; + template + std::variant>> + get_extent_if_linked( + Transaction &t, + LBAMappingRef pin) + { + ceph_assert(pin->is_parent_viewable()); + // checking the lba child must be atomic with creating + // and linking the absent child + auto v = pin->get_logical_extent(t); + if (v.has_child()) { + return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) { +#ifndef NDEBUG + auto lextent = extent->template cast(); + auto pin_laddr = pin->get_key(); + if (pin->is_indirect()) { + pin_laddr = pin->get_intermediate_base(); + } + assert(lextent->get_laddr() == pin_laddr); +#endif + return extent->template cast(); + }); + } else { + return pin; + } + } + + base_iertr::future read_pin_by_type( + Transaction &t, + LBAMappingRef pin, + extent_types_t type) + { + ceph_assert(!pin->parent_modified()); + auto v = pin->get_logical_extent(t); + // checking the lba child must be atomic with creating + // and linking the absent child + if (v.has_child()) { + return std::move(v.get_child_fut()); + } else { + return pin_to_extent_by_type(t, std::move(pin), type); + } + } + rewrite_extent_ret rewrite_logical_extent( Transaction& t, LogicalCachedExtentRef extent); @@ -862,11 +853,6 @@ class TransactionManager : public ExtentCallbackInterface { ExtentPlacementManager::dispatch_result_t dispatch_result, std::optional seq_to_trim = std::nullopt); - /// Remove refcount for offset - ref_ret _dec_ref( - Transaction &t, - laddr_t offset); - using update_lba_mappings_ret = LBAManager::update_mappings_ret; update_lba_mappings_ret update_lba_mappings( Transaction &t, From ec5c6c5761ed1124d12c2e036262e6135fc99a9b Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Tue, 10 Sep 2024 11:55:05 +0800 Subject: [PATCH 066/148] crimson/os/seastore/transaction_manager: refine logs Signed-off-by: 
Yingxin Cheng --- .../lba_manager/btree/btree_lba_manager.h | 8 +- .../os/seastore/transaction_manager.cc | 82 ++++++------ src/crimson/os/seastore/transaction_manager.h | 122 ++++++++++-------- 3 files changed, 122 insertions(+), 90 deletions(-) diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 5d6fa3cb1b170..ef10ff9623b50 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -173,16 +173,22 @@ class BtreeLBAMapping : public BtreeNodeMapping { if (!parent_modified()) { return; } + LOG_PREFIX(BtreeLBAMapping::maybe_fix_pos); auto &p = static_cast(*parent); p.maybe_fix_mapping_pos(*this); + SUBDEBUGT(seastore_lba, "fixed pin {}", + ctx.trans, static_cast(*this)); } LBAMappingRef refresh_with_pending_parent() final { + LOG_PREFIX(BtreeLBAMapping::refresh_with_pending_parent); assert(is_parent_valid() && !is_parent_viewable()); auto &p = static_cast(*parent); auto &viewable_p = static_cast( *p.find_pending_version(ctx.trans, get_key())); - return viewable_p.get_mapping(ctx, get_key()); + auto new_pin = viewable_p.get_mapping(ctx, get_key()); + SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast(*new_pin)); + return new_pin; } protected: std::unique_ptr> _duplicate( diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 0fd23ef6afbbb..f4e3b0858f2f1 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -48,7 +48,7 @@ TransactionManager::TransactionManager( TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() { LOG_PREFIX(TransactionManager::mkfs); - INFO("enter"); + INFO("..."); return epm->mount( ).safe_then([this] { return journal->open_for_mkfs(); @@ -94,7 +94,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() }).safe_then([this] { return close(); }).safe_then([FNAME] { - INFO("completed"); + INFO("done"); }); } @@ -102,7 +102,7 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() { LOG_PREFIX(TransactionManager::mount); - INFO("enter"); + INFO("..."); cache->init(); return epm->mount( ).safe_then([this] { @@ -169,7 +169,7 @@ TransactionManager::mount() return epm->open_for_write(); }).safe_then([FNAME, this] { epm->start_background(); - INFO("completed"); + INFO("done"); }).handle_error( mount_ertr::pass_further{}, crimson::ct_error::assert_all{"unhandled error"} @@ -179,7 +179,7 @@ TransactionManager::mount() TransactionManager::close_ertr::future<> TransactionManager::close() { LOG_PREFIX(TransactionManager::close); - INFO("enter"); + INFO("..."); return epm->stop_background( ).then([this] { return cache->close(); @@ -189,7 +189,7 @@ TransactionManager::close() { }).safe_then([this] { return epm->close(); }).safe_then([FNAME] { - INFO("completed"); + INFO("done"); return seastar::now(); }); } @@ -231,14 +231,14 @@ TransactionManager::ref_ret TransactionManager::remove( LogicalCachedExtentRef &ref) { LOG_PREFIX(TransactionManager::remove); - TRACET("{}", t, *ref); + DEBUGT("{} ...", t, *ref); return lba_manager->decref_extent(t, ref->get_laddr() ).si_then([this, FNAME, &t, ref](auto result) { - DEBUGT("extent refcount is decremented to {} -- {}", - t, result.refcount, *ref); if (result.refcount == 0) { cache->retire_extent(t, ref); } + DEBUGT("removed {}~0x{:x} refcount={} -- {}", + t, result.addr, result.length, 
result.refcount, *ref); return result.refcount; }); } @@ -248,11 +248,9 @@ TransactionManager::ref_ret TransactionManager::remove( laddr_t offset) { LOG_PREFIX(TransactionManager::remove); - TRACET("{}", t, offset); + DEBUGT("{} ...", t, offset); return lba_manager->decref_extent(t, offset ).si_then([this, FNAME, offset, &t](auto result) -> ref_ret { - DEBUGT("extent refcount is decremented to {} -- {}~{}, {}", - t, result.refcount, offset, result.length, result.addr); auto fut = ref_iertr::now(); if (result.refcount == 0) { if (result.addr.is_paddr() && @@ -261,8 +259,9 @@ TransactionManager::ref_ret TransactionManager::remove( t, result.addr.get_paddr(), result.length); } } - - return fut.si_then([result=std::move(result)] { + return fut.si_then([result=std::move(result), offset, &t, FNAME] { + DEBUGT("removed {}~0x{:x} refcount={} -- offset={}", + t, result.addr, result.length, result.refcount, offset); return result.refcount; }); }); @@ -273,9 +272,9 @@ TransactionManager::refs_ret TransactionManager::remove( std::vector offsets) { LOG_PREFIX(TransactionManager::remove); - DEBUG("{} offsets", offsets.size()); + DEBUGT("{} offsets ...", t, offsets.size()); return seastar::do_with(std::move(offsets), std::vector(), - [this, &t](auto &&offsets, auto &refcnts) { + [this, &t, FNAME](auto &&offsets, auto &refcnts) { return trans_intr::do_for_each(offsets.begin(), offsets.end(), [this, &t, &refcnts](auto &laddr) { return this->remove(t, laddr @@ -283,7 +282,8 @@ TransactionManager::refs_ret TransactionManager::remove( refcnts.push_back(ref); return ref_iertr::now(); }); - }).si_then([&refcnts] { + }).si_then([&refcnts, &t, FNAME] { + DEBUGT("removed {} offsets", t, refcnts.size()); return ref_iertr::make_ready_future>(std::move(refcnts)); }); }); @@ -520,7 +520,6 @@ TransactionManager::rewrite_logical_extent( ERRORT("extent has been invalidated -- {}", t, *extent); ceph_abort(); } - TRACET("rewriting extent -- {}", t, *extent); auto lextent = extent->cast(); cache->retire_extent(t, extent); @@ -534,7 +533,7 @@ TransactionManager::rewrite_logical_extent( lextent->get_rewrite_generation())->cast(); nlextent->rewrite(t, *lextent, 0); - DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent); + DEBUGT("rewriting meta -- {} to {}", t, *lextent, *nlextent); #ifndef NDEBUG if (get_checksum_needed(lextent->get_paddr())) { @@ -571,17 +570,16 @@ TransactionManager::rewrite_logical_extent( 0, lextent->get_length(), extent_ref_count_t(0), - [this, lextent, &t] + [this, FNAME, lextent, &t] (auto &extents, auto &off, auto &left, auto &refcount) { return trans_intr::do_for_each( extents, - [lextent, this, &t, &off, &left, &refcount](auto &nextent) { - LOG_PREFIX(TransactionManager::rewrite_logical_extent); + [lextent, this, FNAME, &t, &off, &left, &refcount](auto &nextent) { bool first_extent = (off == 0); ceph_assert(left >= nextent->get_length()); auto nlextent = nextent->template cast(); nlextent->rewrite(t, *lextent, off); - DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent); + DEBUGT("rewriting data -- {} to {}", t, *lextent, *nlextent); /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc * extents since we're going to do it again once we either do the ool write @@ -635,10 +633,18 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( { auto updated = cache->update_extent_from_transaction(t, extent); if (!updated) { - DEBUGT("extent is already retired, skipping -- {}", t, *extent); + DEBUGT("target={} {} already retired, 
skipping -- {}", t, + rewrite_gen_printer_t{target_generation}, + sea_time_point_printer_t{modify_time}, + *extent); return rewrite_extent_iertr::now(); } + extent = updated; + DEBUGT("target={} {} -- {} ...", t, + rewrite_gen_printer_t{target_generation}, + sea_time_point_printer_t{modify_time}, + *extent); ceph_assert(!extent->is_pending_io()); } @@ -656,9 +662,9 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( // FIXME: is_dirty() is true for mutation pending extents // which shouldn't do inplace rewrite because a pending transaction // may fail. - DEBUGT("delta overwriting extent -- {}", t, *extent); t.add_inplace_rewrite_extent(extent); extent->set_inplace_rewrite_generation(); + DEBUGT("rewritten as inplace rewrite -- {}", t, *extent); return rewrite_extent_iertr::now(); } extent->set_target_rewrite_generation(INIT_GENERATION); @@ -672,22 +678,24 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( } if (is_root_type(extent->get_type())) { - DEBUGT("rewriting root extent -- {}", t, *extent); cache->duplicate_for_write(t, extent); + DEBUGT("rewritten root {}", t, *extent); return rewrite_extent_iertr::now(); } + auto fut = rewrite_extent_iertr::now(); if (extent->is_logical()) { assert(is_logical_type(extent->get_type())); - return rewrite_logical_extent(t, extent->cast()); + fut = rewrite_logical_extent(t, extent->cast()); } else if (is_backref_node(extent->get_type())) { - DEBUGT("rewriting backref extent -- {}", t, *extent); - return backref_manager->rewrite_extent(t, extent); + fut = backref_manager->rewrite_extent(t, extent); } else { assert(is_lba_node(extent->get_type())); - DEBUGT("rewriting physical extent -- {}", t, *extent); - return lba_manager->rewrite_extent(t, extent); + fut = lba_manager->rewrite_extent(t, extent); } + return fut.si_then([FNAME, &t] { + DEBUGT("rewritten", t); + }); } TransactionManager::get_extents_if_live_ret @@ -699,7 +707,7 @@ TransactionManager::get_extents_if_live( extent_len_t len) { LOG_PREFIX(TransactionManager::get_extents_if_live); - TRACET("{} {}~{} {}", t, type, laddr, len, paddr); + DEBUGT("{} {}~0x{:x} {} ...", t, type, laddr, len, paddr); // This only works with segments to check if alive, // as parallel transactions may split the extent at the same time. 
@@ -709,7 +717,7 @@ TransactionManager::get_extents_if_live( ).si_then([=, this, &t](auto extent) -> get_extents_if_live_ret { if (extent && extent->get_length() == len) { - DEBUGT("{} {}~{} {} is live in cache -- {}", + DEBUGT("{} {}~0x{:x} {} is cached and alive -- {}", t, type, laddr, len, paddr, *extent); std::list res; res.emplace_back(std::move(extent)); @@ -763,7 +771,9 @@ TransactionManager::get_extents_if_live( list.emplace_back(std::move(ret)); return seastar::now(); }); - }).si_then([&list] { + }).si_then([&list, &t, FNAME, type, laddr, len, paddr] { + DEBUGT("{} {}~0x{:x} {} is alive as {} extents", + t, type, laddr, len, paddr, list.size()); return get_extents_if_live_ret( interruptible::ready_future_marker{}, std::move(list)); @@ -784,11 +794,11 @@ TransactionManager::get_extents_if_live( ).si_then([=, &t](auto ret) { std::list res; if (ret) { - DEBUGT("{} {}~{} {} is live as physical extent -- {}", + DEBUGT("{} {}~0x{:x} {} is absent and alive as physical extent -- {}", t, type, laddr, len, paddr, *ret); res.emplace_back(std::move(ret)); } else { - DEBUGT("{} {}~{} {} is not live as physical extent", + DEBUGT("{} {}~0x{:x} {} is not alive as physical extent", t, type, laddr, len, paddr); } return get_extents_if_live_ret( diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 6d1b010ab69ea..c7a94a9ef1132 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -106,8 +106,12 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, laddr_t offset) { LOG_PREFIX(TransactionManager::get_pin); - SUBTRACET(seastore_tm, "{}", t, offset); - return lba_manager->get_mapping(t, offset); + SUBDEBUGT(seastore_tm, "{} ...", t, offset); + return lba_manager->get_mapping(t, offset + ).si_then([FNAME, &t](LBAMappingRef pin) { + SUBDEBUGT(seastore_tm, "got {}", t, *pin); + return pin; + }); } /** @@ -122,9 +126,13 @@ class TransactionManager : public ExtentCallbackInterface { laddr_t offset, extent_len_t length) { LOG_PREFIX(TransactionManager::get_pins); - SUBDEBUGT(seastore_tm, "{}~{}", t, offset, length); + SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, offset, length); return lba_manager->get_mappings( - t, offset, length); + t, offset, length + ).si_then([FNAME, &t](lba_pin_list_t pins) { + SUBDEBUGT(seastore_tm, "got {} pins", t, pins.size()); + return pins; + }); } /** @@ -142,15 +150,15 @@ class TransactionManager : public ExtentCallbackInterface { laddr_t offset, extent_len_t length) { LOG_PREFIX(TransactionManager::read_extent); - SUBTRACET(seastore_tm, "{}~{}", t, offset, length); + SUBDEBUGT(seastore_tm, "{}~0x{:x} {} ...", + t, offset, length, T::TYPE); return get_pin( t, offset ).si_then([this, FNAME, &t, offset, length] (auto pin) -> read_extent_ret { if (length != pin->get_length() || !pin->get_val().is_real()) { - SUBERRORT(seastore_tm, - "offset {} len {} got wrong pin {}", - t, offset, length, *pin); + SUBERRORT(seastore_tm, "{}~0x{:x} {} got wrong {}", + t, offset, length, T::TYPE, *pin); ceph_assert(0 == "Should be impossible"); } return this->read_pin(t, std::move(pin)); @@ -167,15 +175,15 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, laddr_t offset) { LOG_PREFIX(TransactionManager::read_extent); - SUBTRACET(seastore_tm, "{}", t, offset); + SUBDEBUGT(seastore_tm, "{} {} ...", + t, offset, T::TYPE); return get_pin( t, offset ).si_then([this, FNAME, &t, offset] (auto pin) -> read_extent_ret { if 
(!pin->get_val().is_real()) { - SUBERRORT(seastore_tm, - "offset {} got wrong pin {}", - t, offset, *pin); + SUBERRORT(seastore_tm, "{} {} got wrong {}", + t, offset, T::TYPE, *pin); ceph_assert(0 == "Should be impossible"); } return this->read_pin(t, std::move(pin)); @@ -187,6 +195,8 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, LBAMappingRef pin) { + LOG_PREFIX(TransactionManager::read_pin); + SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin); auto fut = base_iertr::make_ready_future(); if (!pin->is_parent_viewable()) { if (pin->is_parent_valid()) { @@ -212,6 +222,9 @@ class TransactionManager : public ExtentCallbackInterface { } else { return this->pin_to_extent(t, std::move(std::get<0>(ret))); } + }).si_then([FNAME, &t](TCachedExtentRef ext) { + SUBDEBUGT(seastore_tm, "got {}", t, *ext); + return ext; }); } @@ -222,17 +235,9 @@ class TransactionManager : public ExtentCallbackInterface { t, ref)->cast(); if (!ret->has_laddr()) { - SUBDEBUGT(seastore_tm, - "duplicating extent for write -- {} -> {}", - t, - *ref, - *ret); + SUBDEBUGT(seastore_tm, "duplicate from {}", t, *ref); ret->set_laddr(ref->get_laddr()); } else { - SUBTRACET(seastore_tm, - "extent is already duplicated -- {}", - t, - *ref); assert(ref->is_mutable()); assert(&*ref == &*ret); } @@ -291,23 +296,23 @@ class TransactionManager : public ExtentCallbackInterface { extent_len_t len, placement_hint_t placement_hint = placement_hint_t::HOT) { LOG_PREFIX(TransactionManager::alloc_non_data_extent); - SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}", - t, T::TYPE, len, placement_hint, laddr_hint); + SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...", + t, T::TYPE, laddr_hint, len, placement_hint); auto ext = cache->alloc_new_non_data_extent( t, len, placement_hint, INIT_GENERATION); if (!ext) { + SUBERRORT(seastore_tm, "insufficient space!", t); return crimson::ct_error::enospc::make(); } return lba_manager->alloc_extent( t, laddr_hint, *ext - ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable { - LOG_PREFIX(TransactionManager::alloc_non_data_extent); - SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint); + ).si_then([ext=std::move(ext), &t, FNAME](auto &&) mutable { + SUBDEBUGT(seastore_tm, "allocated {}", t, *ext); return alloc_extent_iertr::make_ready_future>( std::move(ext)); }); @@ -330,14 +335,15 @@ class TransactionManager : public ExtentCallbackInterface { extent_len_t len, placement_hint_t placement_hint = placement_hint_t::HOT) { LOG_PREFIX(TransactionManager::alloc_data_extents); - SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}", - t, T::TYPE, len, placement_hint, laddr_hint); + SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...", + t, T::TYPE, laddr_hint, len, placement_hint); auto exts = cache->alloc_new_data_extents( t, len, placement_hint, INIT_GENERATION); if (exts.empty()) { + SUBERRORT(seastore_tm, "insufficient space!", t); return crimson::ct_error::enospc::make(); } return lba_manager->alloc_extents( @@ -348,7 +354,7 @@ class TransactionManager : public ExtentCallbackInterface { EXTENT_DEFAULT_REF_COUNT ).si_then([exts=std::move(exts), &t, FNAME](auto &&) mutable { for (auto &ext : exts) { - SUBDEBUGT(seastore_tm, "new extent: {}", t, *ext); + SUBDEBUGT(seastore_tm, "allocated {}", t, *ext); } return alloc_extent_iertr::make_ready_future< std::vector>>(std::move(exts)); @@ -360,14 +366,17 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, laddr_t laddr, 
extent_len_t len) { + LOG_PREFIX(TransactionManager::get_mutable_extent_by_laddr); + SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, laddr, len); return get_pin(t, laddr ).si_then([this, &t, len](auto pin) { ceph_assert(pin->is_data_stable() && !pin->is_zero_reserved()); ceph_assert(!pin->is_clone()); ceph_assert(pin->get_length() == len); return this->read_pin(t, std::move(pin)); - }).si_then([this, &t](auto extent) { + }).si_then([this, &t, FNAME](auto extent) { auto ext = get_mutable_extent(t, extent)->template cast(); + SUBDEBUGT(seastore_tm, "got mutable {}", t, *ext); return read_extent_iertr::make_ready_future>( std::move(ext)); }); @@ -424,10 +433,8 @@ class TransactionManager : public ExtentCallbackInterface { extent_len_t original_len = pin->get_length(); paddr_t original_paddr = pin->get_val(); LOG_PREFIX(TransactionManager::remap_pin); - SUBDEBUGT(seastore_tm, - "original laddr: {}, original paddr: {}, original length: {}," - " remap to {} extents", - t, original_laddr, original_paddr, original_len, remaps.size()); + SUBDEBUGT(seastore_tm, "{}~0x{:x} {} into {} remaps ... {}", + t, original_laddr, original_len, original_paddr, remaps.size(), *pin); // The according extent might be stable or pending. auto fut = base_iertr::now(); if (!pin->is_indirect()) { @@ -484,14 +491,13 @@ class TransactionManager : public ExtentCallbackInterface { auto remap_len = remap.len; auto remap_laddr = (original_laddr + remap_offset).checked_to_laddr(); auto remap_paddr = original_paddr.add_offset(remap_offset); + SUBDEBUGT(seastore_tm, "remap direct pin into {}~0x{:x} {} ...", + t, remap_laddr, remap_len, remap_paddr); ceph_assert(remap_len < original_len); ceph_assert(remap_offset + remap_len <= original_len); ceph_assert(remap_len != 0); ceph_assert(remap_offset % cache->get_block_size() == 0); ceph_assert(remap_len % cache->get_block_size() == 0); - SUBDEBUGT(seastore_tm, - "remap laddr: {}, remap paddr: {}, remap length: {}", t, - remap_laddr, remap_paddr, remap_len); auto extent = cache->alloc_remapped_extent( t, remap_laddr, @@ -503,13 +509,15 @@ class TransactionManager : public ExtentCallbackInterface { } }); } - return fut.si_then([this, &t, &pin, &remaps, &extents] { + return fut.si_then([this, &t, &pin, &remaps, &extents, FNAME] { return lba_manager->remap_mappings( t, std::move(pin), std::vector(remaps.begin(), remaps.end()), std::move(extents) - ).si_then([](auto ret) { + ).si_then([FNAME, &t](auto ret) { + SUBDEBUGT(seastore_tm, "remapped {} pins", + t, ret.remapped_mappings.size()); return Cache::retire_extent_iertr::make_ready_future< std::vector>(std::move(ret.remapped_mappings)); }); @@ -529,11 +537,15 @@ class TransactionManager : public ExtentCallbackInterface { laddr_t hint, extent_len_t len) { LOG_PREFIX(TransactionManager::reserve_region); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint); + SUBDEBUGT(seastore_tm, "hint {}~0x{:x} ...", t, hint, len); return lba_manager->reserve_region( t, hint, - len); + len + ).si_then([FNAME, &t](auto pin) { + SUBDEBUGT(seastore_tm, "reserved {}", t, *pin); + return pin; + }); } /* @@ -560,15 +572,17 @@ class TransactionManager : public ExtentCallbackInterface { : mapping.get_key(); LOG_PREFIX(TransactionManager::clone_pin); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}", - t, mapping.get_length(), hint, intermediate_key); + SUBDEBUGT(seastore_tm, "{} clone to hint {} ...", t, mapping, hint); return lba_manager->clone_mapping( t, hint, mapping.get_length(), intermediate_key, intermediate_base - ); + 
).si_then([FNAME, &t](auto pin) { + SUBDEBUGT(seastore_tm, "cloned as {}", t, *pin); + return pin; + }); } /* alloc_extents @@ -583,10 +597,10 @@ class TransactionManager : public ExtentCallbackInterface { extent_len_t len, int num) { LOG_PREFIX(TransactionManager::alloc_extents); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, num={}", - t, len, hint, num); + SUBDEBUGT(seastore_tm, "hint {}~({} * 0x{:x}) ...", + t, hint, num, len); return seastar::do_with(std::vector>(), - [this, &t, hint, len, num] (auto &extents) { + [this, &t, hint, len, num, FNAME](auto &extents) { return trans_intr::do_for_each( boost::make_counting_iterator(0), boost::make_counting_iterator(num), @@ -595,7 +609,8 @@ class TransactionManager : public ExtentCallbackInterface { [&extents](auto &&node) { extents.push_back(node); }); - }).si_then([&extents] { + }).si_then([&extents, &t, FNAME] { + SUBDEBUGT(seastore_tm, "allocated {} extents", t, extents.size()); return alloc_extents_iertr::make_ready_future >>(std::move(extents)); }); @@ -701,7 +716,7 @@ class TransactionManager : public ExtentCallbackInterface { const std::string& key, const std::string& value) { LOG_PREFIX(TransactionManager::update_root_meta); - SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {}", t, key, value); + SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value); return cache->get_root( t ).si_then([this, &t, &key, &value](RootBlockRef root) { @@ -756,7 +771,7 @@ class TransactionManager : public ExtentCallbackInterface { return cache->get_root(t).si_then([&t](auto croot) { LOG_PREFIX(TransactionManager::read_collection_root); auto ret = croot->get_root().collection_root.get(); - SUBTRACET(seastore_tm, "{}~{}", + SUBTRACET(seastore_tm, "{}~0x{:x}", t, ret.get_location(), ret.get_size()); return ret; }); @@ -769,7 +784,7 @@ class TransactionManager : public ExtentCallbackInterface { */ void write_collection_root(Transaction &t, coll_root_t cmroot) { LOG_PREFIX(TransactionManager::write_collection_root); - SUBDEBUGT(seastore_tm, "{}~{}", + SUBDEBUGT(seastore_tm, "{}~0x{:x}", t, cmroot.get_location(), cmroot.get_size()); auto croot = cache->get_root_fast(t); croot = cache->duplicate_for_write(t, croot)->cast(); @@ -872,7 +887,7 @@ class TransactionManager : public ExtentCallbackInterface { Transaction &t, LBAMappingRef pin) { LOG_PREFIX(TransactionManager::pin_to_extent); - SUBTRACET(seastore_tm, "getting extent {}", t, *pin); + SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin); static_assert(is_logical_type(T::TYPE)); using ret = pin_to_extent_ret; auto &pref = *pin; @@ -936,7 +951,8 @@ class TransactionManager : public ExtentCallbackInterface { extent_types_t type) { LOG_PREFIX(TransactionManager::pin_to_extent_by_type); - SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type); + SUBTRACET(seastore_tm, "getting absent extent from pin {} type {} ...", + t, *pin, type); assert(is_logical_type(type)); auto &pref = *pin; return cache->get_absent_extent_by_type( From 97e68b20aa3bf0d54ca0e10d0e7c9003adb61eb5 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 3 Sep 2024 16:25:25 +0800 Subject: [PATCH 067/148] crimson/osd/backfill_state: add the object to be pushed in the peer missing set of PeeringState Fixes: https://tracker.ceph.com/issues/67874 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_facades.h | 6 ++++++ src/crimson/osd/backfill_state.cc | 27 +++++++++++++++++++++------ src/crimson/osd/backfill_state.h | 7 ++++++- src/crimson/osd/pg_recovery.cc | 8 +++++--- src/crimson/osd/pg_recovery.h | 3 
++- src/test/crimson/test_backfill.cc | 10 ++++++++-- 6 files changed, 48 insertions(+), 13 deletions(-) diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h index 683dc6ea64948..522a93a1ddcbe 100644 --- a/src/crimson/osd/backfill_facades.h +++ b/src/crimson/osd/backfill_facades.h @@ -52,6 +52,12 @@ struct PeeringFacade final : BackfillState::PeeringFacade { return peering_state.is_backfilling(); } + void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const std::vector &peers) override { + return peering_state.prepare_backfill_for_missing(soid, v, peers); + } PeeringFacade(PeeringState& peering_state) : peering_state(peering_state) { } diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index d015a77545cf4..5e4687877c2cd 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -251,6 +251,7 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) logger().debug("{}: check={}", __func__, check); const auto& primary_bi = backfill_state().backfill_info; result_t result { {}, primary_bi.begin }; + std::map>> backfills; for (const auto& bt : peering_state().get_backfill_targets()) { const auto& peer_bi = backfill_state().peer_backfill_info.at(bt); @@ -258,9 +259,13 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) // Find all check peers that have the wrong version if (const eversion_t& obj_v = primary_bi.objects.begin()->second; check == primary_bi.begin && check == peer_bi.begin) { - if(peer_bi.objects.begin()->second != obj_v && - backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { - backfill_listener().enqueue_push(primary_bi.begin, obj_v); + if (peer_bi.objects.begin()->second != obj_v) { + std::ignore = backfill_state().progress_tracker->enqueue_push( + primary_bi.begin); + auto &[v, peers] = backfills[primary_bi.begin]; + assert(v == obj_v || v == eversion_t()); + v = obj_v; + peers.push_back(bt); } else { // it's fine, keep it! OR already recovering } @@ -269,12 +274,22 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) // Only include peers that we've caught up to their backfill line // otherwise, they only appear to be missing this object // because their peer_bi.begin > backfill_info.begin. 
- if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) && - backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { - backfill_listener().enqueue_push(primary_bi.begin, obj_v); + if (primary_bi.begin > peering_state().get_peer_last_backfill(bt)) { + std::ignore = backfill_state().progress_tracker->enqueue_push( + primary_bi.begin); + auto &[v, peers] = backfills[primary_bi.begin]; + assert(v == obj_v || v == eversion_t()); + v = obj_v; + peers.push_back(bt); } } } + for (auto &backfill : backfills) { + auto &soid = backfill.first; + auto &obj_v = backfill.second.first; + auto &peers = backfill.second.second; + backfill_listener().enqueue_push(soid, obj_v, peers); + } return result; } diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 4cdd4daafce6d..da88b611fcf9b 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -315,7 +315,8 @@ struct BackfillState::BackfillListener { virtual void enqueue_push( const hobject_t& obj, - const eversion_t& v) = 0; + const eversion_t& v, + const std::vector &peers) = 0; virtual void enqueue_drop( const pg_shard_t& target, @@ -354,6 +355,10 @@ struct BackfillState::PeeringFacade { virtual void update_complete_backfill_object_stats(const hobject_t &hoid, const pg_stat_t &stats) = 0; virtual bool is_backfilling() const = 0; + virtual void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const std::vector &peers) = 0; virtual ~PeeringFacade() {} }; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index f4a7d8a63db9f..55d64925ec5c0 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -520,10 +520,12 @@ void PGRecovery::request_primary_scan( void PGRecovery::enqueue_push( const hobject_t& obj, - const eversion_t& v) + const eversion_t& v, + const std::vector &peers) { - logger().info("{}: obj={} v={}", - __func__, obj, v); + logger().info("{}: obj={} v={} peers={}", __func__, obj, v, peers); + auto &peering_state = pg->get_peering_state(); + peering_state.prepare_backfill_for_missing(obj, v, peers); auto [recovering, added] = pg->get_recovery_backend()->add_recovering(obj); if (!added) return; diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index f5b8632a38263..eb9c928fe5db6 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -106,7 +106,8 @@ class PGRecovery : public crimson::osd::BackfillState::BackfillListener { const hobject_t& begin) final; void enqueue_push( const hobject_t& obj, - const eversion_t& v) final; + const eversion_t& v, + const std::vector &peers) final; void enqueue_drop( const pg_shard_t& target, const hobject_t& obj, diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 1ce9b42ad381f..30aef449278ba 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -128,7 +128,8 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener { void enqueue_push( const hobject_t& obj, - const eversion_t& v) override; + const eversion_t& v, + const std::vector &peers) override; void enqueue_drop( const pg_shard_t& target, @@ -222,6 +223,10 @@ struct BackfillFixture::PeeringFacade void update_complete_backfill_object_stats(const hobject_t &hoid, const pg_stat_t &stats) override { } + void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const std::vector &peers) override {} bool is_backfilling() const 
override { return true; } @@ -282,7 +287,8 @@ void BackfillFixture::request_primary_scan( void BackfillFixture::enqueue_push( const hobject_t& obj, - const eversion_t& v) + const eversion_t& v, + const std::vector &) { for (auto& [ _, bt ] : backfill_targets) { bt.store.push(obj, v); From 5b90117348d030db16738ae06e9308ade4355bb0 Mon Sep 17 00:00:00 2001 From: JonBailey1993 Date: Tue, 8 Oct 2024 09:43:10 +0100 Subject: [PATCH 068/148] common/io_exerciser: Add version argument to callbacks in ceph_radios_io_sequence Add new version that was missing from ceph_test_rados_io_sequence callbacks due to interface changes Signed-off-by: Jon Bailey --- src/common/io_exerciser/RadosIo.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc index 3f907ccf47416..41be2885f3f67 100644 --- a/src/common/io_exerciser/RadosIo.cc +++ b/src/common/io_exerciser/RadosIo.cc @@ -118,7 +118,8 @@ void RadosIo::applyIoOp(IoOp &op) op_info = std::make_shared(0, op.length1); op_info->bl1 = db->generate_data(0, op.length1); op_info->wop.write_full(op_info->bl1); - auto create_cb = [this] (boost::system::error_code ec) { + auto create_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; @@ -132,7 +133,8 @@ void RadosIo::applyIoOp(IoOp &op) start_io(); op_info = std::make_shared(); op_info->wop.remove(); - auto remove_cb = [this] (boost::system::error_code ec) { + auto remove_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; @@ -148,7 +150,9 @@ void RadosIo::applyIoOp(IoOp &op) op_info->rop.read(op.offset1 * block_size, op.length1 * block_size, &op_info->bl1, nullptr); - auto read_cb = [this, op_info] (boost::system::error_code ec, bufferlist bl) { + auto read_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, + bufferlist bl) { ceph_assert(ec == boost::system::errc::success); db->validate(op_info->bl1, op_info->offset1, op_info->length1); finish_io(); @@ -174,6 +178,7 @@ void RadosIo::applyIoOp(IoOp &op) op.length2 * block_size, &op_info->bl2, nullptr); auto read2_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, bufferlist bl) { ceph_assert(ec == boost::system::errc::success); db->validate(op_info->bl1, op_info->offset1, op_info->length1); @@ -202,6 +207,7 @@ void RadosIo::applyIoOp(IoOp &op) op.length3 * block_size, &op_info->bl3, nullptr); auto read3_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, bufferlist bl) { ceph_assert(ec == boost::system::errc::success); db->validate(op_info->bl1, op_info->offset1, op_info->length1); @@ -222,7 +228,8 @@ void RadosIo::applyIoOp(IoOp &op) op_info->bl1 = db->generate_data(op.offset1, op.length1); op_info->wop.write(op.offset1 * block_size, op_info->bl1); - auto write_cb = [this] (boost::system::error_code ec) { + auto write_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; @@ -241,7 +248,8 @@ void RadosIo::applyIoOp(IoOp &op) op_info->bl2 = db->generate_data(op.offset2, op.length2); op_info->wop.write(op.offset1 * block_size, op_info->bl1); op_info->wop.write(op.offset2 * block_size, op_info->bl2); - auto write2_cb = [this] (boost::system::error_code ec) { + auto write2_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); 
}; @@ -263,7 +271,8 @@ void RadosIo::applyIoOp(IoOp &op) op_info->wop.write(op.offset1 * block_size, op_info->bl1); op_info->wop.write(op.offset2 * block_size, op_info->bl2); op_info->wop.write(op.offset3 * block_size, op_info->bl3); - auto write3_cb = [this] (boost::system::error_code ec) { + auto write3_cb = [this] (boost::system::error_code ec, + version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; From 256b20de486337dde92fcb2067e0351ea6e67f54 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 26 Sep 2024 20:39:40 -0400 Subject: [PATCH 069/148] qa: do not fail cephfs QA tests for slow bluestore ops Fixes: https://tracker.ceph.com/issues/68283 Signed-off-by: Patrick Donnelly --- qa/cephfs/overrides/ignorelist_health.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml index 678548fe2cc22..94b4257977759 100644 --- a/qa/cephfs/overrides/ignorelist_health.yaml +++ b/qa/cephfs/overrides/ignorelist_health.yaml @@ -21,3 +21,6 @@ overrides: - overall HEALTH_ - Replacing daemon - deprecated feature inline_data + - BLUESTORE_SLOW_OP_ALERT + - slow operation indications in BlueStore + - experiencing slow operations in BlueStore From 9fc65f160cd3764a68fb3697d067c358761fc837 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Mon, 7 Oct 2024 11:45:41 +0000 Subject: [PATCH 070/148] os/bluestore: Make truncate() drop unused allocations Now when truncate() drops unused allocations. Modified Close() in BlueRocksEnv to unconditionally call truncate. Fixes: https://tracker.ceph.com/issues/68385 Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueFS.cc | 65 +++++++++++++++++++++++++------- src/os/bluestore/BlueRocksEnv.cc | 14 ++----- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 3dcd96830c48d..5f4f1a4d48ac2 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -3760,15 +3760,16 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ { auto t0 = mono_clock::now(); std::lock_guard hl(h->lock); + auto& fnode = h->file->fnode; dout(10) << __func__ << " 0x" << std::hex << offset << std::dec - << " file " << h->file->fnode << dendl; + << " file " << fnode << dendl; if (h->file->deleted) { dout(10) << __func__ << " deleted, no-op" << dendl; return 0; } // we never truncate internal log files - ceph_assert(h->file->fnode.ino > 1); + ceph_assert(fnode.ino > 1); // truncate off unflushed data? if (h->pos < offset && @@ -3782,20 +3783,58 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ if (r < 0) return r; } - if (offset == h->file->fnode.size) { - return 0; // no-op! - } - if (offset > h->file->fnode.size) { + if (offset > fnode.size) { ceph_abort_msg("truncate up not supported"); } - ceph_assert(h->file->fnode.size >= offset); + ceph_assert(offset <= fnode.size); _flush_bdev(h); - - std::lock_guard ll(log.lock); - vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset); - h->file->fnode.size = offset; - h->file->is_dirty = true; - log.t.op_file_update_inc(h->file->fnode); + { + std::lock_guard ll(log.lock); + std::lock_guard dl(dirty.lock); + bool changed_extents = false; + vselector->sub_usage(h->file->vselector_hint, fnode); + uint64_t x_off = 0; + auto p = fnode.seek(offset, &x_off); + uint64_t cut_off = + (p == fnode.extents.end()) ? 
0 : p2roundup(x_off, alloc_size[p->bdev]); + uint64_t new_allocated; + if (0 == cut_off) { + // whole pextent to remove + changed_extents = true; + new_allocated = offset; + } else if (cut_off < p->length) { + dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off); + new_allocated = (offset - x_off) + cut_off; + p->length = cut_off; + changed_extents = true; + ++p; + } else { + ceph_assert(cut_off >= p->length); + new_allocated = (offset - x_off) + p->length; + // just leave it here + ++p; + } + while (p != fnode.extents.end()) { + dirty.pending_release[p->bdev].insert(p->offset, p->length); + p = fnode.extents.erase(p); + changed_extents = true; + } + if (changed_extents) { + fnode.size = offset; + fnode.allocated = new_allocated; + fnode.reset_delta(); + log.t.op_file_update(fnode); + // sad, but is_dirty must be set to signal flushing of the log + h->file->is_dirty = true; + } else { + if (offset != fnode.size) { + fnode.size = offset; + //skipping log.t.op_file_update_inc, it will be done by flush() + h->file->is_dirty = true; + } + } + vselector->add_usage(h->file->vselector_hint, fnode); + } logger->tinc(l_bluefs_truncate_lat, mono_clock::now() - t0); return 0; } diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index 68040af428280..7cbe0a1d12146 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -221,18 +221,12 @@ class BlueRocksWritableFile : public rocksdb::WritableFile { } rocksdb::Status Close() override { - fs->fsync(h); - // mimic posix env, here. shrug. - size_t block_size; - size_t last_allocated_block; - GetPreallocationStatus(&block_size, &last_allocated_block); - if (last_allocated_block > 0) { - int r = fs->truncate(h, h->pos); - if (r < 0) - return err_to_status(r); + int r = fs->truncate(h, h->pos); + if (r < 0) { + return err_to_status(r); } - + fs->fsync(h); return rocksdb::Status::OK(); } From 512eea1af52126e9b082744ee7b870c12c23d55d Mon Sep 17 00:00:00 2001 From: Ernesto Puerta <37327689+epuertat@users.noreply.github.com> Date: Wed, 9 Oct 2024 08:12:42 +0200 Subject: [PATCH 071/148] doc: fix email Signed-off-by: Ernesto Puerta <37327689+epuertat@users.noreply.github.com> --- doc/governance.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/governance.rst b/doc/governance.rst index 3105a917f1b01..284a9570397c3 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -82,7 +82,7 @@ Current Members * Casey Bodley * Dan van der Ster * David Orman - * Ernesto Puerta + * Ernesto Puerta * Gregory Farnum * Haomai Wang * Ilya Dryomov From 1b535c011f1d0e50a149e7195d4b50af28c01800 Mon Sep 17 00:00:00 2001 From: Jos Collin Date: Wed, 9 Oct 2024 15:46:00 +0530 Subject: [PATCH 072/148] doc: update 'header get' output in cephfs-journal-tool.rst Signed-off-by: Jos Collin --- doc/cephfs/cephfs-journal-tool.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/cephfs/cephfs-journal-tool.rst b/doc/cephfs/cephfs-journal-tool.rst index 4ad7304481f7f..3ae1139ceac2c 100644 --- a/doc/cephfs/cephfs-journal-tool.rst +++ b/doc/cephfs/cephfs-journal-tool.rst @@ -105,12 +105,12 @@ Example: header get/set "write_pos": 4274947, "expire_pos": 4194304, "trimmed_pos": 4194303, + "stream_format": 1, "layout": { "stripe_unit": 4194304, - "stripe_count": 4194304, + "stripe_count": 1, "object_size": 4194304, - "cas_hash": 4194304, - "object_stripe_unit": 4194304, - "pg_pool": 4194304}} + "pool_id": 2, + "pool_ns": ""}} # cephfs-journal-tool header set 
trimmed_pos 4194303 Updating trimmed_pos 0x400000 -> 0x3fffff From a0c51d0e7f05e84411e3877b5861f3eec26ad934 Mon Sep 17 00:00:00 2001 From: JonBailey1993 Date: Wed, 9 Oct 2024 11:28:42 +0100 Subject: [PATCH 073/148] common/io_exerciser: Modify is_locked_by_me call in ceph_test_rados_io_sequence is_locked_by_me() is a function of ceph::mutex which is only used in debug builds. By using the ceph_mutex_is_locked_by_me macro, we can neatly make sure we only run this function in debug mode, allowing compilation to no longer be affected when running in release mode. Signed-off-by: Jon Bailey --- src/common/io_exerciser/RadosIo.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc index 41be2885f3f67..a28a1e2f488b4 100644 --- a/src/common/io_exerciser/RadosIo.cc +++ b/src/common/io_exerciser/RadosIo.cc @@ -81,7 +81,7 @@ RadosIo::AsyncOpInfo::AsyncOpInfo(uint64_t offset1, uint64_t length1, bool RadosIo::readyForIoOp(IoOp &op) { - ceph_assert(lock.is_locked_by_me()); //Must be called with lock held + ceph_assert(ceph_mutex_is_locked_by_me(lock)); //Must be called with lock held if (!om->readyForIoOp(op)) { return false; } From 78ae3b13509b5cc053b1f5831f0f6a675e99975b Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Wed, 9 Oct 2024 18:31:11 +0530 Subject: [PATCH 074/148] qa/cephfs: update earmark values to valid ones in test_volumes.py smb.test is an invalid earmark now it should be either smb or smb.cluster.. Update the test_volumes.py to set valid earmarks wherever used. Fixes: https://tracker.ceph.com/issues/68448 Signed-off-by: Avan Thakkar --- qa/tasks/cephfs/test_volumes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py index 2baefd72c3fbc..9ca85ee67f9e3 100644 --- a/qa/tasks/cephfs/test_volumes.py +++ b/qa/tasks/cephfs/test_volumes.py @@ -2388,7 +2388,7 @@ def test_subvolume_set_and_get_earmark(self): self._fs_cmd("subvolume", "create", self.volname, subvolume) # set earmark - earmark = "smb.test" + earmark = "smb" self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) # get earmark @@ -2401,7 +2401,7 @@ def test_subvolume_clear_earmark(self): self._fs_cmd("subvolume", "create", self.volname, subvolume) # set earmark - earmark = "smb.test" + earmark = "smb" self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) # remove earmark @@ -2559,7 +2559,7 @@ def test_subvolume_info(self): self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature)) # set earmark - earmark = "smb.test" + earmark = "smb" self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) From d513cc527ca5260ffb957c739cb84f8d3474d728 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Wed, 9 Oct 2024 16:17:04 +0530 Subject: [PATCH 075/148] labeler: add nvmeof labelers Signed-off-by: Vallari Agrawal --- .github/labeler.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index 1b50ff7c5a391..9f2ed1e479019 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -263,6 +263,19 @@ rbd: - systemd/rbdmap.service.in - udev/50-rbd.rules +nvmeof: + - qa/suites/nvmeof/** + - qa/tasks/nvmeof.py + - qa/workunits/nvmeof/** + - src/ceph_nvmeof_monitor_client.cc + - 
src/cephadm/cephadmlib/daemons/nvmeof.py + - src/messages/MNVMeofGw* + - src/mon/NVMeofGw* + - src/nvmeof/** + - src/pybind/mgr/cephadm/services/nvmeof.py + - src/pybind/mgr/cephadm/templates/services/nvmeof/** + - src/tools/ceph-dencoder/nvmeof* + rgw: - qa/suites/rgw/** - qa/tasks/rgw* From 6c419323584103aefed847d8af0a261eda938c29 Mon Sep 17 00:00:00 2001 From: Anoop C S Date: Thu, 10 Oct 2024 11:00:03 +0530 Subject: [PATCH 076/148] cephadm/smb: Add NET_RAW capability to deploy ctdbd CTDB heavily depends on raw sockets to send gratuitous ARPs[1](see the second point from the list of reasons to use gratuitous ARPs). As per the current design it is also inevitable while sending tickle ACKs[2] in the event of an IP failover. man capabilities(7)[3] further mandates CAP_NET_RAW to use raw sockets. Therefore append NET_RAW to the list of capabilties while deploying ctdbd containers. [1] https://wiki.wireshark.org/Gratuitous_ARP [2] https://ctdb.samba.org/manpages/ctdb.1.html [3] https://www.man7.org/linux/man-pages/man7/capabilities.7.html Signed-off-by: Anoop C S --- src/cephadm/cephadmlib/daemons/smb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py index 74cb13f4ab022..a0e648e857cbf 100644 --- a/src/cephadm/cephadmlib/daemons/smb.py +++ b/src/cephadm/cephadmlib/daemons/smb.py @@ -370,6 +370,8 @@ def container_args(self) -> List[str]: # make conditional? # CAP_NET_ADMIN is needed for event script to add public ips to iface cargs.append('--cap-add=NET_ADMIN') + # CAP_NET_RAW allows to send gratuitous ARPs/tickle ACKs via raw sockets + cargs.append('--cap-add=NET_RAW') return cargs From 0bab553399c2c407140f8223f22afb59d5819a10 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Tue, 1 Oct 2024 16:36:21 +0530 Subject: [PATCH 077/148] qa/suites/nvmeof: use "latest" image of gateway and cli Change nvmeof gateway and cli image from 1.2 to "latest". 
Signed-off-by: Vallari Agrawal --- qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml | 4 ++-- qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml | 4 ++-- qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml | 4 ++-- .../thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml index 2e4741e814079..7c97edae552da 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml index 2e873a04bab2a..9ef3700442717 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml index 83d16e4cb2c9a..12cb50b408d49 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml index 6db0c0d4e1829..b4755a6433b0a 100644 --- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml +++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: 
quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 # each subsystem - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 From 303f18b1aff8274d79b1d5e7d84ee3096e4694a1 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Wed, 9 Oct 2024 12:57:32 +0530 Subject: [PATCH 078/148] qa/workunits/nvmeof/setup_subsystem.sh: use --no-group-append In newer versions of the nvmeof cli, "subsystem add" needs this flag to ensure the subsystem name is exactly the value of --subsystem. Otherwise, in newer cli versions, the gateway group is appended at the end of the subsystem name. This fixes the teuthology nvmeof suite (currently all jobs fail because of this). Signed-off-by: Vallari Agrawal --- qa/workunits/nvmeof/setup_subsystem.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/workunits/nvmeof/setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh index fb72e1d6402dd..cc4024323eb87 100755 --- a/qa/workunits/nvmeof/setup_subsystem.sh +++ b/qa/workunits/nvmeof/setup_subsystem.sh @@ -29,7 +29,7 @@ list_subsystems () { # add all subsystems for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" - sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append done list_subsystems From 074dee9cbd63df7529977fc969e6e333fd4312eb Mon Sep 17 00:00:00 2001 From: Lee Sanders Date: Fri, 4 Oct 2024 15:13:57 +0100 Subject: [PATCH 079/148] qa/suites/tasks/cbt.py: Deprecating cosbench from Teuthology in preparation for deletion of cosbench support from CBT. The code being deleted is infrastructure code; no qa test suite uses this function, so it can be safely deleted.
Signed-off-by: Lee Sanders --- qa/tasks/cbt.py | 131 +----------------------------------------------- 1 file changed, 2 insertions(+), 129 deletions(-) diff --git a/qa/tasks/cbt.py b/qa/tasks/cbt.py index 84e096520b40f..e6a9dc8223cf8 100644 --- a/qa/tasks/cbt.py +++ b/qa/tasks/cbt.py @@ -47,22 +47,11 @@ def generate_cbt_config(self): benchmark_config = self.config.get('benchmarks') benchmark_type = next(iter(benchmark_config.keys())) + if benchmark_type in ['librbdfio', 'fio']: testdir = misc.get_testdir(self.ctx) benchmark_config[benchmark_type]['cmd_path'] = os.path.join(testdir, 'fio/fio') - if benchmark_type == 'cosbench': - # create cosbench_dir and cosbench_xml_dir - testdir = misc.get_testdir(self.ctx) - benchmark_config['cosbench']['cosbench_dir'] = os.path.join(testdir, 'cos') - benchmark_config['cosbench']['cosbench_xml_dir'] = os.path.join(testdir, 'xml') - self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', benchmark_config['cosbench']['cosbench_xml_dir']]) - benchmark_config['cosbench']['controller'] = osd_hosts[0] - - # set auth details - remotes_and_roles = self.ctx.cluster.remotes.items() - ips = [host for (host, port) in - (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)] - benchmark_config['cosbench']['auth'] = "username=cosbench:operator;password=intel2012;url=http://%s:80/auth/v1.0;retry=9" %(ips[0]) + client_endpoints_config = self.config.get('client_endpoints', None) monitoring_profiles = self.config.get('monitoring_profiles', {}) @@ -117,77 +106,6 @@ def install_dependencies(self): ] ) - if benchmark_type == 'cosbench': - # install cosbench - self.log.info('install dependencies for cosbench') - if system_type == 'rpm': - cosbench_depends = ['wget', 'unzip', 'java-1.7.0-openjdk', 'curl'] - else: - cosbench_depends = ['wget', 'unzip', 'openjdk-8-jre', 'curl'] - self.first_mon.run(args=install_cmd + cosbench_depends) - testdir = misc.get_testdir(self.ctx) - cosbench_version = '0.4.2.c3' - cosbench_location = 'https://github.com/intel-cloud/cosbench/releases/download/v0.4.2.c3/0.4.2.c3.zip' - os_version = misc.get_system_type(self.first_mon, False, True) - - # additional requirements for bionic - if os_version == '18.04': - self.first_mon.run( - args=['sudo', 'apt-get', '-y', 'purge', 'openjdk-11*']) - # use our own version of cosbench - cosbench_version = 'cosbench-0.4.2.c3.1' - # contains additional parameter "-N" to nc - cosbench_location = 'http://drop.ceph.com/qa/cosbench-0.4.2.c3.1.zip' - cosbench_dir = os.path.join(testdir, cosbench_version) - self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', cosbench_dir]) - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'wget', - cosbench_location, run.Raw('&&'), - 'unzip', '{name}.zip'.format(name=cosbench_version), '-d', cosbench_version - ] - ) - else: - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'wget', - cosbench_location, run.Raw('&&'), - 'unzip', '{name}.zip'.format(name=cosbench_version) - ] - ) - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'ln', '-s', cosbench_version, 'cos', - ] - ) - self.first_mon.run( - args=[ - 'cd', os.path.join(testdir, 'cos'), run.Raw('&&'), - 'chmod', '+x', run.Raw('*.sh'), - ] - ) - - # start cosbench and check info - self.log.info('start cosbench') - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'start-all.sh' - ] - ) - self.log.info('check cosbench info') - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', 
run.Raw('&&'), - 'sh', 'cli.sh', 'info' - ] - ) - def checkout_cbt(self): testdir = misc.get_testdir(self.ctx) repo = self.config.get('repo', 'https://github.com/ceph/cbt.git') @@ -269,51 +187,6 @@ def end(self): ] ) - if benchmark_type == 'cosbench': - os_version = misc.get_system_type(self.first_mon, False, True) - if os_version == '18.04': - cosbench_version = 'cosbench-0.4.2.c3.1' - else: - cosbench_version = '0.4.2.c3' - # note: stop-all requires 'nc' - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'stop-all.sh', - run.Raw('||'), 'true' - ] - ) - self.first_mon.run( - args=[ - 'sudo', 'killall', '-9', 'java', - run.Raw('||'), 'true' - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/cos'.format(tdir=testdir), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/{version}'.format(tdir=testdir, version=cosbench_version), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/{version}.zip'.format(tdir=testdir, version=cosbench_version), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/xml'.format(tdir=testdir), - ] - ) # Collect cbt performance data cbt_performance = CBTperformance() cbt_performance.collect(self.ctx, self.config) From 9b7fb48c2464ddf26271f43cd8dd6ad969a80fe0 Mon Sep 17 00:00:00 2001 From: Lee Sanders Date: Thu, 10 Oct 2024 11:21:46 +0100 Subject: [PATCH 080/148] Add cosbench deprecation warning to qa/README Signed-off-by: Lee Sanders --- qa/README | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/qa/README b/qa/README index f9b8988c6f9f6..a6a95c479bc9c 100644 --- a/qa/README +++ b/qa/README @@ -83,3 +83,8 @@ supported_distros as distros$ will be run just once: either on centos, rhel or ubuntu, chosen randomly. The teuthology code can be found in https://github.com/ceph/teuthology.git + +Note: The performance suites clone CBT from master here: https://github.com/ceph/cbt.git +CBT will not support cosbench beyond release tag v0.3, therefore no qa suite should use cosbench. +cosbench support has been removed from qa/tasks/cbt.py. + From 0317b5f87ac22399f6242d72f0bb9924794687de Mon Sep 17 00:00:00 2001 From: Zac Dover Date: Thu, 10 Oct 2024 22:11:22 +1000 Subject: [PATCH 081/148] doc: SubmittingPatches-backports - remove backports team Remove all references to the "Stable Releases and Backports Team", which as of October 2024 does not exist. Fixes: https://tracker.ceph.com/issues/68471 Co-authored-by: Laura Flores Signed-off-by: Zac Dover --- SubmittingPatches-backports.rst | 51 ++++++--------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/SubmittingPatches-backports.rst b/SubmittingPatches-backports.rst index 0f96aec65c4f8..bb55088cb5fac 100644 --- a/SubmittingPatches-backports.rst +++ b/SubmittingPatches-backports.rst @@ -121,14 +121,11 @@ If you do not have sufficient permissions to modify any field of the tracker issue, just add a comment describing what changes you would like to make. Someone with permissions will make the necessary modifications on your behalf. -For straightforward backports, that's all that you (as the developer of the fix) -need to do. Volunteers from the `Stable Releases and Backports team`_ will -proceed to create Backport issues to track the necessary backports and stage the -backports by opening GitHub PRs with the cherry-picks. 
If you don't want to -wait, and provided you have sufficient permissions at https://tracker.ceph.com, -you can `create Backport tracker issues` and `stage backports`_ yourself. In -that case, read on. - +Authors of pull requests are responsible for creating associated backport pull +requests. As long as you have sufficient permissions at +https://tracker.ceph.com, you can `create Backport tracker issues` and `stage +backports`_ yourself. Read these linked sections to learn how to create +backport tracker issues and how to stage backports: .. _`create backport tracker issues`: .. _`backport tracker issue`: @@ -146,10 +143,7 @@ issues can be created in the backport tracker issue for tracking the backporting Under ordinary circumstances, the developer who merges the ``main`` PR will flag the ``main`` branch tracker issue for backport by changing the Status to "Pending -Backport", and volunteers from the `Stable Releases and Backports team`_ -periodically create backport tracker issues by running the -``backport-create-issue`` script. They also do the actual backporting. But that -does take time and you may not want to wait. +Backport". You might be tempted to forge ahead and create the backport issues yourself. Please don't do that - it is difficult (bordering on impossible) to get all the @@ -360,20 +354,11 @@ Once the backport PR is open, the first order of business is to set the Milestone tag to the stable release the backport PR is targeting. For example, if the PR is targeting "nautilus", set the Milestone tag to "nautilus". -If you don't have sufficient GitHub permissions to set the Milestone, don't -worry. Members of the `Stable Releases and Backports team`_ periodically run -a script (``ceph-backport.sh --milestones``) which scans all PRs targetting stable -branches and automatically adds the correct Milestone tag if it is missing. - Next, check which component label was applied to the ``main`` PR corresponding to this backport, and double-check that that label is applied to the backport PR as well. For example, if the ``main`` PR carries the component label "core", the backport PR should also get that label. -In general, it is the responsibility of the `Stable Releases and Backports -team`_ to ensure that backport PRs are properly labelled. If in doubt, just -leave the labelling to them. - .. _`backport PR reviewing`: .. _`backport PR testing`: .. _`backport PR merging`: @@ -381,9 +366,8 @@ leave the labelling to them. Reviewing, testing, and merging of backport PRs ----------------------------------------------- -Once your backport PR is open and the Milestone is set properly, the -`Stable Releases and Backports team` will take care of getting the PR -reviewed and tested. Once the PR is reviewed and tested, it will be merged. +Once your backport PR is open, it will be reviewed and tested. When the PR has +been reviewed and tested, it will be merged. If you would like to facilitate this process, you can solicit reviews and run integration tests on the PR. In this case, add comments to the PR describing the @@ -394,22 +378,3 @@ it will be merged. Even if you have sufficient GitHub permissions to merge the PR, please do *not* merge it yourself. (Uncontrolled merging to stable branches unnecessarily complicates the release preparation process, which is done by volunteers.) 
- - -Stable Releases and Backports team ----------------------------------- - -Ceph has a `Stable Releases and Backports`_ team, staffed by volunteers, -which is charged with maintaining the stable releases and backporting bugfixes -from the ``main`` branch to them. (That team maintains a wiki, accessible by -clicking the `Stable Releases and Backports`_ link, which describes various -workflows in the backporting lifecycle.) - -.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki - -Ordinarily, it is enough to fill out the "Backport" field in the bug (tracker -issue). The volunteers from the Stable Releases and Backports team will -backport the fix, run regression tests on it, and include it in one or more -future point releases. - - From 95916c91906604c516e78b550010e9de511fc1e9 Mon Sep 17 00:00:00 2001 From: Zac Dover Date: Thu, 10 Oct 2024 22:17:12 +1000 Subject: [PATCH 082/148] doc/dev: remove "Stable Releases and Backports" Remove mention of "Stable Releases and Backports" from doc/dev/developer_guide/essentials.rst. Co-authored-by: Laura Flores Signed-off-by: Zac Dover --- doc/dev/developer_guide/essentials.rst | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/doc/dev/developer_guide/essentials.rst b/doc/dev/developer_guide/essentials.rst index cbde8779a66da..7cce4c6f898ff 100644 --- a/doc/dev/developer_guide/essentials.rst +++ b/doc/dev/developer_guide/essentials.rst @@ -287,16 +287,13 @@ See :ref:`kubernetes-dev` Backporting ----------- -All bugfixes should be merged to the ``main`` branch before being -backported. To flag a bugfix for backporting, make sure it has a -`tracker issue`_ associated with it and set the ``Backport`` field to a -comma-separated list of previous releases (e.g. "hammer,jewel") that you think -need the backport. -The rest (including the actual backporting) will be taken care of by the -`Stable Releases and Backports`_ team. +All bugfixes should be merged to the ``main`` branch before being backported. +To flag a bugfix for backporting, make sure it has a `tracker issue`_ +associated with it and set the ``Backport`` field to a comma-separated list of +previous releases (e.g. "hammer,jewel") that you think need the backport. You +are responsible for the backporting of pull requests that you raise. .. _`tracker issue`: http://tracker.ceph.com/ -.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki Dependabot ---------- From 5a1a21573b92113144060e8475778c669b1de4aa Mon Sep 17 00:00:00 2001 From: Naman Munet Date: Mon, 7 Oct 2024 10:41:29 +0530 Subject: [PATCH 083/148] mgr/dashboard: unable to edit pipe config for bucket level policy of a bucket Fixes: https://tracker.ceph.com/issues/68387 Fixes Includes: 1) Passing additional parameter for 'user' and 'mode' as the user can be either system/dashboard or other values while creating pipe. 2) Previously while removing the src/dest bucket field, we were getting same old values on editing pipe, but now it will become '*' if empty value passed from frontend. 
Signed-off-by: Naman Munet --- src/pybind/mgr/dashboard/controllers/rgw.py | 10 ++--- ...w-multisite-sync-pipe-modal.component.html | 6 +++ ...ultisite-sync-pipe-modal.component.spec.ts | 43 ++++++++++++++++++- ...rgw-multisite-sync-pipe-modal.component.ts | 7 ++- .../app/shared/api/rgw-multisite.service.ts | 11 ++++- src/pybind/mgr/dashboard/openapi.yaml | 11 ++--- .../mgr/dashboard/services/rgw_client.py | 24 ++++++----- 7 files changed, 87 insertions(+), 25 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py index 8667d469060f8..b8e07a708e79d 100755 --- a/src/pybind/mgr/dashboard/controllers/rgw.py +++ b/src/pybind/mgr/dashboard/controllers/rgw.py @@ -244,11 +244,13 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, source_zones: Dict[str, Any], destination_zones: Dict[str, Any], source_bucket: str = '', - destination_bucket: str = '', bucket_name: str = ''): + destination_bucket: str = '', bucket_name: str = '', + user: str = '', mode: str = ''): multisite_instance = RgwMultisite() return multisite_instance.create_sync_pipe(group_id, pipe_id, source_zones, destination_zones, source_bucket, - destination_bucket, bucket_name, True) + destination_bucket, bucket_name, True, + user, mode) @Endpoint(method='DELETE', path='/sync-pipe') @EndpointDoc("Remove the sync pipe") @@ -256,12 +258,10 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, def remove_sync_pipe(self, group_id: str, pipe_id: str, source_zones: Optional[List[str]] = None, destination_zones: Optional[List[str]] = None, - destination_bucket: str = '', bucket_name: str = ''): multisite_instance = RgwMultisite() return multisite_instance.remove_sync_pipe(group_id, pipe_id, source_zones, - destination_zones, destination_bucket, - bucket_name, True) + destination_zones, bucket_name, True) @APIRouter('/rgw/daemon', Scope.RGW) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html index e50666cdeaa96..767305958d4c8 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html @@ -64,6 +64,9 @@ i18n-placeholder placeholder="Source Bucket Name..." formControlName="source_bucket"/> + + {{ allBucketSelectedHelpText }} +
@@ -78,6 +81,9 @@ i18n-placeholder placeholder="Destination Bucket Name..." formControlName="destination_bucket"/> + + {{ allBucketSelectedHelpText }} +
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts index 369658d7d427f..1127db1c59a59 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts @@ -89,6 +89,47 @@ describe('RgwMultisiteSyncPipeModalComponent', () => { component.submit(); expect(spy).toHaveBeenCalled(); expect(putDataSpy).toHaveBeenCalled(); - expect(putDataSpy).toHaveBeenCalledWith(component.pipeForm.getRawValue()); + expect(putDataSpy).toHaveBeenCalledWith({ + ...component.pipeForm.getRawValue(), + mode: '', + user: '' + }); + }); + + it('should pass "user" and "mode" while creating/editing pipe', () => { + component.editing = true; + component.pipeForm.patchValue({ + pipe_id: 'pipe1', + group_id: 's3-bucket-replication:enabled', + source_bucket: '', + source_zones: { added: ['zone1-zg1-realm1'], removed: [] }, + destination_bucket: '', + destination_zones: { added: ['zone2-zg1-realm1'], removed: [] } + }); + component.pipeSelectedRow = { + dest: { bucket: '*', zones: ['zone2-zg1-realm1'] }, + id: 'pipi1', + params: { + dest: {}, + mode: 'user', + priority: 0, + source: { filter: { tags: [] } }, + user: 'dashboard' + }, + source: { bucket: '*', zones: ['zone1-zg1-realm1'] } + }; + + component.sourceZones.data.selected = ['zone1-zg1-realm1']; + component.destZones.data.selected = ['zone2-zg1-realm1']; + const spy = jest.spyOn(component, 'submit'); + const putDataSpy = jest.spyOn(multisiteServiceMock, 'createEditSyncPipe'); + component.submit(); + expect(spy).toHaveBeenCalled(); + expect(putDataSpy).toHaveBeenCalled(); + expect(putDataSpy).toHaveBeenCalledWith({ + ...component.pipeForm.getRawValue(), + mode: 'user', + user: 'dashboard' + }); }); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts index 2f41dbd23c843..43742ef60b839 100755 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts @@ -18,6 +18,8 @@ import { ZoneData } from '../models/rgw-multisite-zone-selector'; import { SucceededActionLabelsI18n } from '~/app/shared/constants/app.constants'; const ALL_ZONES = $localize`All zones (*)`; +const ALL_BUCKET_SELECTED_HELP_TEXT = + 'If no value is provided, all the buckets in the zone group will be selected.'; @Component({ selector: 'cd-rgw-multisite-sync-pipe-modal', @@ -33,6 +35,7 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit { sourceZones = new ZoneData(false, 'Filter Zones'); destZones = new ZoneData(false, 'Filter Zones'); icons = Icons; + allBucketSelectedHelpText = ALL_BUCKET_SELECTED_HELP_TEXT; constructor( public activeModal: NgbActiveModal, @@ -187,7 +190,9 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit { .createEditSyncPipe({ ...this.pipeForm.getRawValue(), source_zones: sourceZones, - destination_zones: destZones + 
destination_zones: destZones, + user: this.editing ? this.pipeSelectedRow?.params?.user : '', + mode: this.editing ? this.pipeSelectedRow?.params?.mode : '' }) .subscribe( () => { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts index d57cd523a4dfe..5e12a00ec95d3 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts @@ -123,8 +123,15 @@ export class RgwMultisiteService { ); } - createEditSyncPipe(payload: any) { - return this.http.put(`${this.url}/sync-pipe`, payload); + createEditSyncPipe(payload: any, user?: string, mode?: string) { + let params = new HttpParams(); + if (user) { + params = params.append('user', user); + } + if (mode) { + params = params.append('mode', mode); + } + return this.http.put(`${this.url}/sync-pipe`, payload, { params }); } removeSyncPipe(pipe_id: string, group_id: string, bucket_name?: string) { diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index e8ab663d0d593..4fac085d1f361 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -11384,6 +11384,9 @@ paths: type: string group_id: type: string + mode: + default: '' + type: string pipe_id: type: string source_bucket: @@ -11391,6 +11394,9 @@ paths: type: string source_zones: type: string + user: + default: '' + type: string required: - group_id - pipe_id @@ -11445,11 +11451,6 @@ paths: name: destination_zones schema: type: string - - default: '' - in: query - name: destination_bucket - schema: - type: string - default: '' in: query name: bucket_name diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py index 2441b73b361be..e45c4fa447b31 100755 --- a/src/pybind/mgr/dashboard/services/rgw_client.py +++ b/src/pybind/mgr/dashboard/services/rgw_client.py @@ -2236,7 +2236,8 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, source_bucket: str = '', destination_bucket: str = '', bucket_name: str = '', - update_period=False): + update_period=False, + user: str = '', mode: str = ''): if source_zones['added'] or destination_zones['added']: rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'create', @@ -2245,11 +2246,9 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, if bucket_name: rgw_sync_policy_cmd += ['--bucket', bucket_name] - if source_bucket: - rgw_sync_policy_cmd += ['--source-bucket', source_bucket] + rgw_sync_policy_cmd += ['--source-bucket', source_bucket] - if destination_bucket: - rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] + rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] if source_zones['added']: rgw_sync_policy_cmd += ['--source-zones', ','.join(source_zones['added'])] @@ -2257,6 +2256,12 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, if destination_zones['added']: rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones['added'])] + if user: + rgw_sync_policy_cmd += ['--uid', user] + + if mode: + rgw_sync_policy_cmd += ['--mode', mode] + logger.info("Creating sync pipe!") try: exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd) @@ -2271,13 +2276,13 @@ def create_sync_pipe(self, group_id: str, pipe_id: str, if ((source_zones['removed'] and '*' not in source_zones['added']) or (destination_zones['removed'] and '*' not in 
destination_zones['added'])): self.remove_sync_pipe(group_id, pipe_id, source_zones['removed'], - destination_zones['removed'], destination_bucket, - bucket_name) + destination_zones['removed'], + bucket_name, True) def remove_sync_pipe(self, group_id: str, pipe_id: str, source_zones: Optional[List[str]] = None, destination_zones: Optional[List[str]] = None, - destination_bucket: str = '', bucket_name: str = '', + bucket_name: str = '', update_period=False): rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'remove', '--group-id', group_id, '--pipe-id', pipe_id] @@ -2291,9 +2296,6 @@ def remove_sync_pipe(self, group_id: str, pipe_id: str, if destination_zones: rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones)] - if destination_bucket: - rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] - logger.info("Removing sync pipe! %s", rgw_sync_policy_cmd) try: exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd) From 47e7a24c7b94cbb677298d26af6ac09519f70161 Mon Sep 17 00:00:00 2001 From: Leonid Chernin Date: Wed, 9 Oct 2024 06:59:09 +0000 Subject: [PATCH 084/148] mon/nvmeofgw*: fix HA usecase when gateway has no listeners: behaves like no-subsystems Signed-off-by: Leonid Chernin --- src/mon/NVMeofGwMap.cc | 3 +-- src/mon/NVMeofGwMap.h | 2 +- src/mon/NVMeofGwMon.cc | 22 ++++++++++++++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 7b1bc9b8e56cf..c01ea9e710321 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -254,7 +254,7 @@ void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key, } } -int NVMeofGwMap::process_gw_map_gw_no_subsystems( +int NVMeofGwMap::process_gw_map_gw_no_subsys_no_listeners( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending) { int rc = 0; @@ -424,7 +424,6 @@ void NVMeofGwMap::find_failback_gw( auto& gws_states = created_gws[group_key]; auto& gw_state = created_gws[group_key][gw_id]; bool do_failback = false; - dout(10) << "Find failback GW for GW " << gw_id << dendl; for (auto& gw_state_it: gws_states) { auto& st = gw_state_it.second; diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index 2971037174218..267d85b10f918 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -54,7 +54,7 @@ class NVMeofGwMap int process_gw_map_gw_down( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending); - int process_gw_map_gw_no_subsystems( + int process_gw_map_gw_no_subsys_no_listeners( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending); void update_active_timers(bool &propose_pending); diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index 734e90defd946..d9e936e27df34 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -367,6 +367,13 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op) std::stringstream sstrm1; sstrm1 << state.availability; f->dump_string("Availability", sstrm1.str()); + uint32_t num_listeners = 0; + if (state.availability == gw_availability_t::GW_AVAILABLE) { + for (auto &subs: state.subsystems) { + num_listeners += subs.listeners.size(); + } + f->dump_unsigned("num-listeners", num_listeners); + } sstrm1.str(""); for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) { sstrm1 << " " << state_itr.first + 1 << ": " @@ -476,7 +483,7 @@ void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id, if (avail == gw_availability_t::GW_UNAVAILABLE) { pending_map.process_gw_map_gw_down(gw_id, group_key, 
propose_pending); } else { - pending_map.process_gw_map_gw_no_subsystems(gw_id, group_key, propose_pending); + pending_map.process_gw_map_gw_no_subsys_no_listeners(gw_id, group_key, propose_pending); } } @@ -600,7 +607,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) if (sub.size() == 0) { avail = gw_availability_t::GW_CREATED; - } + } else { + bool listener_found = false; + for (auto &subs: sub) { + if (subs.listeners.size()) { + listener_found = true; + break; + } + } + if (!listener_found) { + avail = gw_availability_t::GW_CREATED; + } + }// for HA no-subsystems and no-listeners are same usecases if (pending_map.created_gws[group_key][gw_id].subsystems != sub) { dout(10) << "subsystems of GW changed, propose pending " << gw_id << dendl; pending_map.created_gws[group_key][gw_id].subsystems = sub; From e80b7ba4add2d698555112e0ec46328cab703688 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Tue, 1 Oct 2024 10:38:39 +0200 Subject: [PATCH 085/148] mgr/cephadm: move Grafana's subpath handling logic to grafana config Fixes: https://tracker.ceph.com/issues/68315 So far, Grafana's subpath handling has been managed on the Nginx server side using a rewrite rule. Let's move this logic to the Grafana side to make it consistent with the rest of the monitoring services. Signed-off-by: Redouane Kachach --- .../mgr/cephadm/templates/services/grafana/grafana.ini.j2 | 3 ++- .../templates/services/mgmt-gateway/external_server.conf.j2 | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 index 972ef22e7b58e..967f1355af14b 100644 --- a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 +++ b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 @@ -15,7 +15,8 @@ http_port = {{ http_port }} http_addr = {{ http_addr }} {% if mgmt_gw_enabled %} - root_url = %(protocol)s://%(domain)s/grafana/ + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true {% endif %} [snapshots] external_enabled = false diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 index 260e7418e2d7f..b830034a7d4e9 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 @@ -109,7 +109,6 @@ server { {% if grafana_endpoints %} location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass {{ grafana_scheme }}://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services From 38d9cf4ca760c667d105435a714f76dbff926960 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 01:59:30 -0500 Subject: [PATCH 086/148] osd/scrub: introduce ScrubStore::at_level_t to hold the caching and backend details related to the representation of scrub-detected errors as OMap entries of a uniquely-named object. In a followup commit - the ScrubStore is modified to hold two of these objects, one for the shallow errors and one for the deep errors. 
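A minimal, self-contained sketch of the per-level layout described above (plain std:: types only; the names level_db_t and toy_scrub_store_t, and the key and object-name formats, are invented for illustration and are not the Ceph classes): each scrub level gets its own keyed error cache backed by its own uniquely-named error object, so shallow and deep findings can be recorded and cleared independently.

// Toy illustration only -- not Ceph code. Models "one keyed error store per
// scrub level", with the per-level backing-object name kept alongside the cache.
#include <iostream>
#include <map>
#include <string>

struct level_db_t {
  std::string backing_object;                  // e.g. one object per level (name is made up here)
  std::map<std::string, std::string> results;  // object key -> encoded error record
};

struct toy_scrub_store_t {
  level_db_t shallow{"scrub_1.a"};
  level_db_t deep{"scrub_1.a_deep"};

  // record an error under the level that detected it
  void add_error(bool deep_scrub, const std::string& obj_key, const std::string& err) {
    auto& db = deep_scrub ? deep : shallow;
    db.results[obj_key] = err;
  }
};

int main() {
  toy_scrub_store_t store;
  store.add_error(false, "3:obj_a:head", "attr mismatch");
  store.add_error(true,  "3:obj_b:head", "data digest mismatch");
  std::cout << store.shallow.results.size() << " shallow / "
            << store.deep.results.size()    << " deep error(s)\n";
}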
Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 49 +++++++++++++++++++++---------- src/osd/scrubber/ScrubStore.h | 53 +++++++++++++++++++++++++++++----- 2 files changed, 79 insertions(+), 23 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index a00ab2caecee6..af223cb5cdc09 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -109,19 +109,29 @@ Store::create(ObjectStore* store, ceph_assert(t); ghobject_t oid = make_scrub_object(pgid); t->touch(coll, oid); - return new Store{coll, oid, store}; + return new Store{*store, t, pgid, coll}; +} + + +Store::Store( + ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll) + : object_store{osd_store} + , coll{coll} +{ + ceph_assert(t); + + const auto err_obj = pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); + t->touch(coll, err_obj); + errors_db.emplace(pgid, err_obj, OSDriver{&object_store, coll, err_obj}); } -Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store) - : coll(coll), - hoid(oid), - driver(store, coll, hoid), - backend(&driver) -{} Store::~Store() { - ceph_assert(results.empty()); + ceph_assert(!errors_db || errors_db->results.empty()); } void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) @@ -131,11 +141,13 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { + const auto key = to_object_key(pool, e.object); bufferlist bl; e.encode(bl); - results[to_object_key(pool, e.object)] = bl; + errors_db->results[key] = bl; } + void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) { add_snap_error(pool, e); @@ -145,26 +157,28 @@ void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { bufferlist bl; e.encode(bl); - results[to_snap_key(pool, e.object)] = bl; + errors_db->results[to_snap_key(pool, e.object)] = bl; } bool Store::empty() const { - return results.empty(); + return errors_db->results.empty(); } void Store::flush(ObjectStore::Transaction* t) { if (t) { - OSDriver::OSTransaction txn = driver.get_transaction(t); - backend.set_keys(results, &txn); + OSDriver::OSTransaction txn = errors_db->driver.get_transaction(t); + errors_db->backend.set_keys(errors_db->results, &txn); } - results.clear(); + errors_db->results.clear(); } void Store::cleanup(ObjectStore::Transaction* t) { - t->remove(coll, hoid); + ceph_assert(t); + if (errors_db) + t->remove(coll, errors_db->errors_hoid); } std::vector @@ -195,8 +209,11 @@ Store::get_errors(const string& begin, uint64_t max_return) const { vector errors; + if (!errors_db) + return errors; + auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !backend.get_next(next.first, &next)) { + while (max_return && !errors_db->backend.get_next(next.first, &next)) { if (next.first >= end) break; errors.push_back(next.second); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 567badf608b6c..949a976051e67 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -5,6 +5,7 @@ #define CEPH_SCRUB_RESULT_H #include "common/map_cacher.hpp" +#include "osd/osd_types_fmt.h" #include "osd/SnapMapper.h" // for OSDriver namespace librados { @@ -45,18 +46,56 @@ class Store { uint64_t max_return) const; private: - Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store); + /** + * at_level_t + * + * The machinery for 
caching and storing errors at a specific scrub level. + */ + struct at_level_t { + at_level_t(const spg_t& pgid, const ghobject_t& err_obj, OSDriver&& drvr) + : errors_hoid{err_obj} + , driver{std::move(drvr)} + , backend{&driver} + {} + + /// the object in the PG store, where the errors are stored + ghobject_t errors_hoid; + + /// abstracted key fetching + OSDriver driver; + + /// a K,V cache for the errors that are detected during the scrub + /// session. The errors marked for a specific object are stored as + /// an OMap entry with the object's name as the key. + MapCacher::MapCacher backend; + + /// a temp object mapping seq-id to inconsistencies + std::map results; + }; + + Store(ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; private: + /// the OSD's storage backend + ObjectStore& object_store; + + /// the collection (i.e. - the PG store) in which the errors are stored const coll_t coll; - const ghobject_t hoid; - // a temp object holding mappings from seq-id to inconsistencies found in - // scrubbing - OSDriver driver; - mutable MapCacher::MapCacher backend; - std::map results; + + /** + * the machinery (backend details, cache, etc.) for storing both levels + * of errors (note: 'optional' to allow delayed creation w/o dynamic + * allocations; and 'mutable', as the caching mechanism is used in const + * methods) + */ + mutable std::optional errors_db; + // not yet: mutable std::optional deep_db; }; } // namespace Scrub From 571e2f3c193fc0d117cfd577fe90798fc75e98fa Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 03:58:59 -0500 Subject: [PATCH 087/148] osd/scrub: directly create or reinit the ScrubStore The ScrubStore is now directly created or reinitialized by the Scrubber. Note that the store object is not identical to the errors DB: the errors DB is an entity in the OSD store (a collection of OMap entries in a uniquely-named object(s)), while the ScrubSTore object is a cacher and accessor for that entity. That one can be recreated or disposed of at will. We now do not recreate the ScrubStore object for every scrub. 
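The calling pattern this moves the scrubber towards is, schematically: keep the accessor object across scrubs, and only wipe the on-disk DB contents when a new scrub session starts. A minimal stand-alone sketch with illustrative names (not the actual scrubber API):

    #include <memory>

    enum class scrub_level_t { shallow, deep };

    struct store_sketch {
      void reinit(scrub_level_t /*level*/) {
        // clear the relevant on-disk error DB(s); the accessor object itself
        // (cache, key naming) is kept as-is
      }
    };

    void on_scrub_start(std::unique_ptr<store_sketch>& store, scrub_level_t level) {
      if (!store)
        store = std::make_unique<store_sketch>();  // created once, then reused
      store->reinit(level);
    }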
Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 41 ++++++++++++++++++++----------- src/osd/scrubber/ScrubStore.h | 42 ++++++++++++++++++++++++-------- src/osd/scrubber/pg_scrubber.cc | 43 +++++++++++++++++++++++++++------ src/osd/scrubber/pg_scrubber.h | 10 ++++++++ 4 files changed, 104 insertions(+), 32 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index af223cb5cdc09..0c36be6b66b02 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -99,20 +99,6 @@ string last_snap_key(int64_t pool) namespace Scrub { -Store* -Store::create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll) -{ - ceph_assert(store); - ceph_assert(t); - ghobject_t oid = make_scrub_object(pgid); - t->touch(coll, oid); - return new Store{*store, t, pgid, coll}; -} - - Store::Store( ObjectStore& osd_store, ObjectStore::Transaction* t, @@ -174,6 +160,33 @@ void Store::flush(ObjectStore::Transaction* t) errors_db->results.clear(); } + +void Store::clear_level_db( + ObjectStore::Transaction* t, + at_level_t& db) +{ + // easiest way to guarantee that the object representing the DB exists + t->touch(coll, db.errors_hoid); + + // remove all the keys in the DB + t->omap_clear(coll, db.errors_hoid); + + // restart the 'in progress' part of the MapCacher + db.backend.reset(); +} + + +void Store::reinit(ObjectStore::Transaction* t, [[maybe_unused]] scrub_level_t level) +{ + // Note: only one caller, and it creates the transaction passed to reinit(). + // No need to assert on 't' + + if (errors_db) { + clear_level_db(t, *errors_db); + } +} + + void Store::cleanup(ObjectStore::Transaction* t) { ceph_assert(t); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 949a976051e67..600905e85e8a2 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -20,11 +20,16 @@ namespace Scrub { class Store { public: ~Store(); - static Store* create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); + + Store(ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + + + /// mark down detected errors, either shallow or deep void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e); + void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e); // and a variant-friendly interface: @@ -33,8 +38,22 @@ class Store { bool empty() const; void flush(ObjectStore::Transaction*); + + /// remove both shallow and deep errors DBs. Called on interval. void cleanup(ObjectStore::Transaction*); + /** + * prepare the Store object for a new scrub session. + * This involves clearing one or both of the errors DBs, and resetting + * the cache. + * + * @param level: the scrub level to prepare for. Whenever a deep scrub + * is requested, both the shallow and deep errors DBs are cleared. + * If, on the other hand, a shallow scrub is requested, only the shallow + * errors DB is cleared. 
+ */ + void reinit(ObjectStore::Transaction* t, scrub_level_t level); + std::vector get_snap_errors( int64_t pool, const librados::object_id_t& start, @@ -73,15 +92,9 @@ class Store { std::map results; }; - Store(ObjectStore& osd_store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); - std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; - private: /// the OSD's storage backend ObjectStore& object_store; @@ -96,6 +109,15 @@ class Store { */ mutable std::optional errors_db; // not yet: mutable std::optional deep_db; + + /** + * Clear the DB of errors at a specific scrub level by performing an + * omap_clear() on the DB object, and resetting the MapCacher. + */ + void clear_level_db( + ObjectStore::Transaction* t, + at_level_t& db); + }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 555d13ba72b2b..a085481f477ac 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1183,6 +1183,7 @@ void PgScrubber::_request_scrub_map(pg_shard_t replica, m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch()); } +// only called on interval change. Both DBs are to be removed. void PgScrubber::cleanup_store(ObjectStore::Transaction* t) { if (!m_store) @@ -1200,6 +1201,38 @@ void PgScrubber::cleanup_store(ObjectStore::Transaction* t) ceph_assert(!m_store); } + +void PgScrubber::reinit_scrub_store() +{ + // Entering, 0 to 3 of the following objects(*) may exist: + // ((*)'objects' here: both code objects (the ScrubStore object) and + // actual Object Store objects). + // 1. The ScrubStore object itself. + // 2,3. The two special hobjects in the coll (the PG data) holding the last + // scrub's results. <> + // + // The Store object can be deleted and recreated, as a way to guarantee + // no junk is left. We won't do it here, but we will clear the at_level_t + // structures. + // The hobjects: possibly. The shallow DB object is always cleared. The + // deep one - only if running a deep scrub. + ObjectStore::Transaction t; + if (m_store) { + dout(10) << __func__ << " reusing existing store" << dendl; + m_store->flush(&t); + } else { + dout(10) << __func__ << " creating new store" << dendl; + m_store = std::make_unique( + *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); + } + + // regardless of whether the ScrubStore object was recreated or reused, we need to + // (possibly) clear the actual DB objects in the Object Store. + m_store->reinit(&t, m_active_target->level()); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); +} + + void PgScrubber::on_init() { // going upwards from 'inactive' @@ -1217,14 +1250,8 @@ void PgScrubber::on_init() m_is_deep ? 
scrub_level_t::deep : scrub_level_t::shallow, m_pg->get_actingset()); - // create a new store - { - ObjectStore::Transaction t; - cleanup_store(&t); - m_store.reset( - Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll)); - m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); - } + // create or reuse the 'known errors' store + reinit_scrub_store(); m_start = m_pg->info.pgid.pgid.get_hobj_start(); m_active = true; diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index ff8c98d387ea2..1a5813bd9235c 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -771,6 +771,16 @@ class PgScrubber : public ScrubPgIF, std::unique_ptr m_store; + /** + * the ScrubStore sub-object caches and manages the database of known + * scrub errors. reinit_scrub_store() clears the database and re-initializes + * the ScrubStore object. + * + * in the next iteration - reinit_..() potentially deletes only the + * shallow errors part of the database. + */ + void reinit_scrub_store(); + int num_digest_updates_pending{0}; hobject_t m_start, m_end; ///< note: half-closed: [start,end) From ce58c88158381e252ffa432ff855a01570cc98dd Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 05:15:57 -0500 Subject: [PATCH 088/148] osd/scrub: add dout() capability to the ScrubStore now that the ScrubSTore object is directly created by the scrubber, (and has a lifetime that does not extend beyond the scrubber object), we can add the same dout() mechanism used by the other scrubber sub-objects. Note: that mechanism will be changed shortly, so that the sub-objects would use one prefix() creator supplied by the Scrubber object. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 50 +++++++++++++++++++++++++++++---- src/osd/scrubber/ScrubStore.h | 20 +++++++++---- src/osd/scrubber/pg_scrubber.cc | 2 +- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 0c36be6b66b02..dd141d1c38ca4 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -1,11 +1,13 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include "ScrubStore.h" +#include "./ScrubStore.h" #include "osd/osd_types.h" #include "common/scrub_types.h" #include "include/rados/rados_types.hpp" +#include "pg_scrubber.h" + using std::ostringstream; using std::string; using std::vector; @@ -95,16 +97,31 @@ string last_snap_key(int64_t pool) hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } + +} // namespace + +#undef dout_context +#define dout_context (m_scrubber.get_pg_cct()) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix_fn(_dout, this, __func__) + +template +static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "") +{ + return t->gen_prefix(*_dout, fn); } namespace Scrub { Store::Store( + PgScrubber& scrubber, ObjectStore& osd_store, ObjectStore::Transaction* t, const spg_t& pgid, const coll_t& coll) - : object_store{osd_store} + : m_scrubber{scrubber} + , object_store{osd_store} , coll{coll} { ceph_assert(t); @@ -120,6 +137,18 @@ Store::~Store() ceph_assert(!errors_db || errors_db->results.empty()); } + +std::ostream& Store::gen_prefix(std::ostream& out, std::string_view fn) const +{ + if (fn.starts_with("operator")) { + // it's a lambda, and __func__ is not available + return m_scrubber.gen_prefix(out) << "Store::"; + } else { + 
return m_scrubber.gen_prefix(out) << "Store::" << fn << ": "; + } +} + + void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) { add_object_error(pool, e); @@ -163,8 +192,11 @@ void Store::flush(ObjectStore::Transaction* t) void Store::clear_level_db( ObjectStore::Transaction* t, - at_level_t& db) + at_level_t& db, + std::string_view db_name) { + dout(20) << fmt::format("removing (omap) entries for {} error DB", db_name) + << dendl; // easiest way to guarantee that the object representing the DB exists t->touch(coll, db.errors_hoid); @@ -176,19 +208,27 @@ void Store::clear_level_db( } -void Store::reinit(ObjectStore::Transaction* t, [[maybe_unused]] scrub_level_t level) +void Store::reinit( + ObjectStore::Transaction* t, + [[maybe_unused]] scrub_level_t level) { + dout(20) << fmt::format( + "re-initializing the Scrub::Store (for {} scrub)", + (level == scrub_level_t::deep ? "deep" : "shallow")) + << dendl; + // Note: only one caller, and it creates the transaction passed to reinit(). // No need to assert on 't' if (errors_db) { - clear_level_db(t, *errors_db); + clear_level_db(t, *errors_db, "scrub"); } } void Store::cleanup(ObjectStore::Transaction* t) { + dout(20) << "discarding error DBs" << dendl; ceph_assert(t); if (errors_db) t->remove(coll, errors_db->errors_hoid); diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 600905e85e8a2..a83841e2cfbb2 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -14,6 +14,7 @@ struct object_id_t; struct inconsistent_obj_wrapper; struct inconsistent_snapset_wrapper; +class PgScrubber; namespace Scrub { @@ -21,10 +22,12 @@ class Store { public: ~Store(); - Store(ObjectStore& osd_store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); + Store( + PgScrubber& scrubber, + ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); /// mark down detected errors, either shallow or deep @@ -64,6 +67,8 @@ class Store { const librados::object_id_t& start, uint64_t max_return) const; + std::ostream& gen_prefix(std::ostream& out, std::string_view fn) const; + private: /** * at_level_t @@ -95,6 +100,10 @@ class Store { std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; + + /// access to the owning Scrubber object, for logging mostly + PgScrubber& m_scrubber; + /// the OSD's storage backend ObjectStore& object_store; @@ -116,7 +125,8 @@ class Store { */ void clear_level_db( ObjectStore::Transaction* t, - at_level_t& db); + at_level_t& db, + std::string_view db_name); }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index a085481f477ac..81093666f91c8 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1223,7 +1223,7 @@ void PgScrubber::reinit_scrub_store() } else { dout(10) << __func__ << " creating new store" << dendl; m_store = std::make_unique( - *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); + *this, *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); } // regardless of whether the ScrubStore object was recreated or reused, we need to From 283f4c258641f86d3de3431bfdfba31856387ea6 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 05:25:05 -0500 Subject: [PATCH 089/148] common: extend MapCacher API to include 'no out' version of get_next() Signed-off-by: Ronen Friedman --- src/common/map_cacher.hpp | 45 +++++++++++++++++++++++++++++++++++ 
src/osd/scrubber/ScrubStore.h | 5 ++++ 2 files changed, 50 insertions(+) diff --git a/src/common/map_cacher.hpp b/src/common/map_cacher.hpp index 4d843be75dc64..95353425de9e6 100644 --- a/src/common/map_cacher.hpp +++ b/src/common/map_cacher.hpp @@ -16,6 +16,7 @@ #define MAPCACHER_H #include "include/Context.h" +#include "include/expected.hpp" #include "common/sharedptr_registry.hpp" namespace MapCacher { @@ -130,6 +131,50 @@ class MapCacher { return -EINVAL; } ///< @return error value, 0 on success, -ENOENT if no more entries + /// Fetch first key/value std::pair after specified key + struct PosAndData { + K last_key; + V data; + }; + using MaybePosAndData = tl::expected; + + MaybePosAndData get_1st_after_key( + K key ///< [in] key after which to get next + ) + { + ceph_assert(driver); + while (true) { + std::pair> cached; + bool got_cached = in_progress.get_next(key, &cached); + + ///\todo a driver->get_next() that returns an expected would be nice + bool got_store{false}; + std::pair store; + int r = driver->get_next(key, &store); + if (r < 0 && r != -ENOENT) { + return tl::unexpected(r); + } else if (r == 0) { + got_store = true; + } + + if (!got_cached && !got_store) { + return tl::unexpected(-ENOENT); + } else if (got_cached && (!got_store || store.first >= cached.first)) { + if (cached.second) { + return PosAndData{cached.first, *cached.second}; + } else { + key = cached.first; + continue; // value was cached as removed, recurse + } + } else { + return PosAndData{store.first, store.second}; + } + } + ceph_abort(); // not reachable + return tl::unexpected(-EINVAL); + } + + /// Adds operation setting keys to Transaction void set_keys( const std::map &keys, ///< [in] keys/values to std::set diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index a83841e2cfbb2..7d590d2d1915e 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -97,6 +97,11 @@ class Store { std::map results; }; + using CacherPosData = + MapCacher::MapCacher::PosAndData; + using ExpCacherPosData = tl::expected; + + std::vector get_errors(const std::string& start, const std::string& end, uint64_t max_return) const; From 031580fb662f35daacf61a8aa2a4b4f3b32b7b6b Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 08:51:22 -0500 Subject: [PATCH 090/148] common/scrub,osd/scrub: minor cleanups to ScrubStore Including: - introducing 'no out param' encode() for the inconsistent wrappers; - renaming the ambiguous 'empty()' to 'is_empty()'; - removing unused code; - a few other minor cleanups. 
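The first item, the value-returning encode(), mainly tightens call sites. A self-contained illustration of the shape, using a stand-in type rather than the real inconsistent_*_wrapper and bufferlist:

    #include <map>
    #include <string>

    struct wrapper_sketch {
      std::string payload;
      // existing out-parameter form
      void encode(std::string& out) const { out = payload; }
      // value-returning overload in the style this cleanup introduces
      std::string encode() const { std::string out; encode(out); return out; }
    };

    int main() {
      std::map<std::string, std::string> results;
      wrapper_sketch e{"errors"};
      std::string bl;
      e.encode(bl);                      // before: buffer declared and filled separately
      results["key_before"] = bl;
      results["key_after"] = e.encode(); // after: one line at the call site
    }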
Signed-off-by: Ronen Friedman --- src/common/scrub_types.cc | 14 ++++++++ src/common/scrub_types.h | 2 ++ src/osd/scrubber/ScrubStore.cc | 61 +++++++++++----------------------- src/osd/scrubber/ScrubStore.h | 18 ++++------ 4 files changed, 42 insertions(+), 53 deletions(-) diff --git a/src/common/scrub_types.cc b/src/common/scrub_types.cc index b03a3cab70c84..4b4d191e09c39 100644 --- a/src/common/scrub_types.cc +++ b/src/common/scrub_types.cc @@ -161,6 +161,13 @@ void inconsistent_obj_wrapper::encode(bufferlist& bl) const ENCODE_FINISH(bl); } +bufferlist inconsistent_obj_wrapper::encode() const +{ + bufferlist bl; + encode(bl); + return bl; +} + void inconsistent_obj_wrapper::decode(bufferlist::const_iterator& bp) { DECODE_START(2, bp); @@ -240,6 +247,13 @@ void inconsistent_snapset_wrapper::encode(bufferlist& bl) const ENCODE_FINISH(bl); } +bufferlist inconsistent_snapset_wrapper::encode() const +{ + bufferlist bl; + encode(bl); + return bl; +} + void inconsistent_snapset_wrapper::decode(bufferlist::const_iterator& bp) { DECODE_START(2, bp); diff --git a/src/common/scrub_types.h b/src/common/scrub_types.h index dd206f56f6035..d86fc12b6c8cf 100644 --- a/src/common/scrub_types.h +++ b/src/common/scrub_types.h @@ -152,6 +152,7 @@ struct inconsistent_obj_wrapper : librados::inconsistent_obj_t { const pg_shard_t &primary); void set_version(uint64_t ver) { version = ver; } void encode(ceph::buffer::list& bl) const; + ceph::buffer::list encode() const; void decode(ceph::buffer::list::const_iterator& bp); }; @@ -181,6 +182,7 @@ struct inconsistent_snapset_wrapper : public librados::inconsistent_snapset_t { void set_size_mismatch(); void encode(ceph::buffer::list& bl) const; + ceph::buffer::list encode() const; void decode(ceph::buffer::list::const_iterator& bp); }; diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index dd141d1c38ca4..033ea6b24dfd4 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -15,21 +15,9 @@ using std::vector; using ceph::bufferlist; namespace { -ghobject_t make_scrub_object(const spg_t& pgid) -{ - ostringstream ss; - ss << "scrub_" << pgid; - return pgid.make_temp_ghobject(ss.str()); -} - string first_object_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); + auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0x00000000, pool, ""); hoid.build_hash_cache(); return "SCRUB_OBJ_" + hoid.to_str(); } @@ -49,12 +37,7 @@ string to_object_key(int64_t pool, const librados::object_id_t& oid) string last_object_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); + auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0xffffffff, pool, ""); hoid.build_hash_cache(); return "SCRUB_OBJ_" + hoid.to_str(); } @@ -62,14 +45,9 @@ string last_object_key(int64_t pool) string first_snap_key(int64_t pool) { // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for - // the representing the minimal and maximum keys. and this relies on how + // representing the minimal and maximum keys. and this relies on how // hobject_t::to_str() works: hex(pool).hex(revhash). 
- auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); + auto hoid = hobject_t(object_t(), "", 0, 0x00000000, pool, ""); hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } @@ -88,12 +66,7 @@ string to_snap_key(int64_t pool, const librados::object_id_t& oid) string last_snap_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); + auto hoid = hobject_t(object_t(), "", 0, 0xffffffff, pool, ""); hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } @@ -168,22 +141,23 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) add_snap_error(pool, e); } + void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { - bufferlist bl; - e.encode(bl); - errors_db->results[to_snap_key(pool, e.object)] = bl; + errors_db->results[to_snap_key(pool, e.object)] = e.encode(); } -bool Store::empty() const + +bool Store::is_empty() const { - return errors_db->results.empty(); + return !errors_db || errors_db->results.empty(); } + void Store::flush(ObjectStore::Transaction* t) { if (t) { - OSDriver::OSTransaction txn = errors_db->driver.get_transaction(t); + auto txn = errors_db->driver.get_transaction(t); errors_db->backend.set_keys(errors_db->results, &txn); } errors_db->results.clear(); @@ -234,10 +208,11 @@ void Store::cleanup(ObjectStore::Transaction* t) t->remove(coll, errors_db->errors_hoid); } -std::vector -Store::get_snap_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const + +std::vector Store::get_snap_errors( + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const { const string begin = (start.name.empty() ? first_snap_key(pool) : to_snap_key(pool, start)); @@ -272,6 +247,8 @@ Store::get_errors(const string& begin, errors.push_back(next.second); max_return--; } + + dout(10) << fmt::format("{} errors reported", errors.size()) << dendl; return errors; } diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 7d590d2d1915e..9eb77ab667db7 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -1,8 +1,6 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab - -#ifndef CEPH_SCRUB_RESULT_H -#define CEPH_SCRUB_RESULT_H +#pragma once #include "common/map_cacher.hpp" #include "osd/osd_types_fmt.h" @@ -39,7 +37,7 @@ class Store { void add_error(int64_t pool, const inconsistent_obj_wrapper& e); void add_error(int64_t pool, const inconsistent_snapset_wrapper& e); - bool empty() const; + [[nodiscard]] bool is_empty() const; void flush(ObjectStore::Transaction*); /// remove both shallow and deep errors DBs. Called on interval. @@ -101,11 +99,6 @@ class Store { MapCacher::MapCacher::PosAndData; using ExpCacherPosData = tl::expected; - - std::vector get_errors(const std::string& start, - const std::string& end, - uint64_t max_return) const; - /// access to the owning Scrubber object, for logging mostly PgScrubber& m_scrubber; @@ -124,6 +117,11 @@ class Store { mutable std::optional errors_db; // not yet: mutable std::optional deep_db; + std::vector get_errors( + const std::string& start, + const std::string& end, + uint64_t max_return) const; + /** * Clear the DB of errors at a specific scrub level by performing an * omap_clear() on the DB object, and resetting the MapCacher. 
@@ -135,5 +133,3 @@ class Store { }; } // namespace Scrub - -#endif // CEPH_SCRUB_RESULT_H From daf848fa5afcf4ad86388eade472d2c3a4873826 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 23:09:51 -0500 Subject: [PATCH 091/148] osd/scrub: separate shallow vs deep errors storage The ScrubStore now holds two ScrubStore::at_level_t objects, one for the shallow errors and one for the deep errors. The shallow errors DB is recreated at the start of every scrub, while the deep errors DB is only recreated at the start of a deep scrub. When queried by the operator for known scrub errors, the ScrubStore will return the union of the errors from both DBs. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 285 +++++++++++++++++++++++++++----- src/osd/scrubber/ScrubStore.h | 42 ++++- src/osd/scrubber/pg_scrubber.cc | 2 +- 3 files changed, 285 insertions(+), 44 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 033ea6b24dfd4..9c680da0de16f 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -99,15 +99,30 @@ Store::Store( { ceph_assert(t); - const auto err_obj = pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); - t->touch(coll, err_obj); - errors_db.emplace(pgid, err_obj, OSDriver{&object_store, coll, err_obj}); + // shallow errors DB object + const auto sh_err_obj = + pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); + t->touch(coll, sh_err_obj); + shallow_db.emplace( + pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj}); + + // and the DB for deep errors + const auto dp_err_obj = + pgid.make_temp_ghobject(fmt::format("deep_scrub_{}", pgid)); + t->touch(coll, dp_err_obj); + deep_db.emplace(pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj}); + + dout(20) << fmt::format( + "created Scrub::Store for pg[{}], shallow: {}, deep: {}", + pgid, sh_err_obj, dp_err_obj) + << dendl; } Store::~Store() { - ceph_assert(!errors_db || errors_db->results.empty()); + ceph_assert(!shallow_db || shallow_db->results.empty()); + ceph_assert(!deep_db || deep_db->results.empty()); } @@ -127,12 +142,49 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) add_object_error(pool, e); } +namespace { + +inconsistent_obj_wrapper create_filtered_copy( + const inconsistent_obj_wrapper& obj, + uint64_t obj_err_mask, + uint64_t shard_err_mask) +{ + inconsistent_obj_wrapper dup = obj; + dup.errors &= obj_err_mask; + for (auto& [shard, si] : dup.shards) { + si.errors &= shard_err_mask; + } + return dup; +} + +} // namespace + + void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { const auto key = to_object_key(pool, e.object); - bufferlist bl; - e.encode(bl); - errors_db->results[key] = bl; + dout(20) << fmt::format( + "adding error for object {} ({}). Errors: {} ({}/{}) wr:{}", + e.object, key, librados::err_t{e.errors}, + librados::err_t{e.errors & librados::err_t::SHALLOW_ERRORS}, + librados::err_t{e.errors & librados::err_t::DEEP_ERRORS}, e) + << dendl; + + // divide the errors & shard errors into shallow and deep. 
+ { + bufferlist bl; + create_filtered_copy( + e, librados::obj_err_t::SHALLOW_ERRORS, librados::err_t::SHALLOW_ERRORS) + .encode(bl); + shallow_db->results[key] = bl; + } + { + bufferlist bl; + create_filtered_copy( + e, librados::obj_err_t::DEEP_ERRORS, librados::err_t::DEEP_ERRORS) + .encode(bl); + deep_db->results[key] = bl; + } } @@ -144,23 +196,29 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { - errors_db->results[to_snap_key(pool, e.object)] = e.encode(); + // note: snap errors are only placed in the shallow store + shallow_db->results[to_snap_key(pool, e.object)] = e.encode(); } bool Store::is_empty() const { - return !errors_db || errors_db->results.empty(); + return (!shallow_db || shallow_db->results.empty()) && + (!deep_db || deep_db->results.empty()); } void Store::flush(ObjectStore::Transaction* t) { if (t) { - auto txn = errors_db->driver.get_transaction(t); - errors_db->backend.set_keys(errors_db->results, &txn); + auto txn = shallow_db->driver.get_transaction(t); + shallow_db->backend.set_keys(shallow_db->results, &txn); + txn = deep_db->driver.get_transaction(t); + deep_db->backend.set_keys(deep_db->results, &txn); } - errors_db->results.clear(); + + shallow_db->results.clear(); + deep_db->results.clear(); } @@ -184,18 +242,23 @@ void Store::clear_level_db( void Store::reinit( ObjectStore::Transaction* t, - [[maybe_unused]] scrub_level_t level) + scrub_level_t level) { + // Note: only one caller, and it creates the transaction passed to reinit(). + // No need to assert on 't' dout(20) << fmt::format( "re-initializing the Scrub::Store (for {} scrub)", (level == scrub_level_t::deep ? "deep" : "shallow")) << dendl; - // Note: only one caller, and it creates the transaction passed to reinit(). - // No need to assert on 't' - - if (errors_db) { - clear_level_db(t, *errors_db, "scrub"); + // always clear the known shallow errors DB (as both shallow and deep scrubs + // would recreate it) + if (shallow_db) { + clear_level_db(t, *shallow_db, "shallow"); + } + // only a deep scrub recreates the deep errors DB + if (level == scrub_level_t::deep && deep_db) { + clear_level_db(t, *deep_db, "deep"); } } @@ -204,8 +267,10 @@ void Store::cleanup(ObjectStore::Transaction* t) { dout(20) << "discarding error DBs" << dendl; ceph_assert(t); - if (errors_db) - t->remove(coll, errors_db->errors_hoid); + if (shallow_db) + t->remove(coll, shallow_db->errors_hoid); + if (deep_db) + t->remove(coll, deep_db->errors_hoid); } @@ -214,42 +279,180 @@ std::vector Store::get_snap_errors( const librados::object_id_t& start, uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_snap_key(pool) : to_snap_key(pool, start)); + vector errors; + const string begin = + (start.name.empty() ? 
first_snap_key(pool) : to_snap_key(pool, start)); const string end = last_snap_key(pool); - return get_errors(begin, end, max_return); + + // the snap errors are stored only in the shallow store + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(begin); + + while (max_return-- && latest_sh.has_value() && latest_sh->last_key < end) { + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } + + return errors; } -std::vector -Store::get_object_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const + +std::vector Store::get_object_errors( + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_object_key(pool) : to_object_key(pool, start)); + const string begin = + (start.name.empty() ? first_object_key(pool) + : to_object_key(pool, start)); const string end = last_object_key(pool); + dout(20) << fmt::format("fetching errors, from {} to {}", begin, end) + << dendl; return get_errors(begin, end, max_return); } -std::vector -Store::get_errors(const string& begin, - const string& end, - uint64_t max_return) const + +inline void decode( + librados::inconsistent_obj_t& obj, + ceph::buffer::list::const_iterator& bp) { + reinterpret_cast(obj).decode(bp); +} + + +inconsistent_obj_wrapper decode_wrapper( + hobject_t obj, + ceph::buffer::list::const_iterator bp) +{ + inconsistent_obj_wrapper iow{obj}; + iow.decode(bp); + return iow; +} + + +void Store::collect_specific_store( + MapCacher::MapCacher& backend, + Store::ExpCacherPosData& latest, + std::vector& errors, + std::string_view end_key, + uint64_t max_return) const +{ + while (max_return-- && latest.has_value() && + latest.value().last_key < end_key) { + errors.push_back(latest->data); + latest = backend.get_1st_after_key(latest->last_key); + } +} + + +bufferlist Store::merge_encoded_error_wrappers( + hobject_t obj, + ExpCacherPosData& latest_sh, + ExpCacherPosData& latest_dp) const +{ + // decode both error wrappers + auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin()); + auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin()); + dout(20) << fmt::format( + "merging errors {}. Shallow: {}-({}), Deep: {}-({})", + sh_wrap.object, sh_wrap.errors, dp_wrap.errors, sh_wrap, + dp_wrap) + << dendl; + + // merge the object errors (a simple OR of the two error bit-sets) + sh_wrap.errors |= dp_wrap.errors; + + // merge the two shard error maps + for (const auto& [shard, si] : dp_wrap.shards) { + dout(20) << fmt::format( + "shard {} dp-errors: {} sh-errors:{}", shard, si.errors, + sh_wrap.shards[shard].errors) + << dendl; + // note: we may be creating the shallow shard entry here. This is OK + sh_wrap.shards[shard].errors |= si.errors; + } + + return sh_wrap.encode(); +} + + +// a better way to implement get_errors(): use two generators, one for each store. +// and sort-merge the results. Almost like a merge-sort, but with equal +// keys combined. 'todo' once 'ranges' are really working. 
+ +std::vector Store::get_errors( + const std::string& from_key, + const std::string& end_key, + uint64_t max_return) const +{ + // merge the input from the two sorted DBs into 'errors' (until + // enough errors are collected) vector errors; - if (!errors_db) - return errors; + dout(20) << fmt::format("getting errors from {} to {}", from_key, end_key) + << dendl; - auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !errors_db->backend.get_next(next.first, &next)) { - if (next.first >= end) + ceph_assert(shallow_db); + ceph_assert(deep_db); + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(from_key); + ExpCacherPosData latest_dp = deep_db->backend.get_1st_after_key(from_key); + + while (max_return) { + dout(20) << fmt::format( + "n:{} latest_sh: {}, latest_dp: {}", max_return, + (latest_sh ? latest_sh->last_key : "(none)"), + (latest_dp ? latest_dp->last_key : "(none)")) + << dendl; + + // keys not smaller than end_key are not interesting + if (latest_sh.has_value() && latest_sh->last_key >= end_key) { + latest_sh = tl::unexpected(-EINVAL); + } + if (latest_dp.has_value() && latest_dp->last_key >= end_key) { + latest_dp = tl::unexpected(-EINVAL); + } + + if (!latest_sh && !latest_dp) { + // both stores are exhausted + break; + } + if (!latest_sh.has_value()) { + // continue with the deep store + dout(10) << fmt::format("collecting from deep store") << dendl; + collect_specific_store( + deep_db->backend, latest_dp, errors, end_key, max_return); break; - errors.push_back(next.second); + } + if (!latest_dp.has_value()) { + // continue with the shallow store + dout(10) << fmt::format("collecting from shallow store") << dendl; + collect_specific_store( + shallow_db->backend, latest_sh, errors, end_key, max_return); + break; + } + + // we have results from both stores. Select the one with a lower key. + // If the keys are equal, combine the errors. + if (latest_sh->last_key == latest_dp->last_key) { + auto bl = merge_encoded_error_wrappers( + shallow_db->errors_hoid.hobj, latest_sh, latest_dp); + errors.push_back(bl); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + + } else if (latest_sh->last_key < latest_dp->last_key) { + dout(20) << fmt::format("shallow store element ({})", latest_sh->last_key) + << dendl; + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } else { + dout(20) << fmt::format("deep store element ({})", latest_dp->last_key) + << dendl; + errors.push_back(latest_dp->data); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + } max_return--; } dout(10) << fmt::format("{} errors reported", errors.size()) << dendl; return errors; } - -} // namespace Scrub +} // namespace Scrub diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 9eb77ab667db7..8a30e8daf8569 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -16,6 +16,28 @@ class PgScrubber; namespace Scrub { +/** + * Storing errors detected during scrubbing. + * + * From both functional and internal perspectives, the store is a pair of key-value + * databases: one maps objects to shallow errors detected during their scrubbing, + * and other stores deep errors. + * Note that the first store is updated in both shallow and in deep scrubs. The + * second - only while deep scrubbing. 
+ * + * The DBs can be consulted by the operator, when trying to list 'errors known + * at this point in time'. Whenever a scrub starts - the relevant entries in the + * DBs are removed. Specifically - the shallow errors DB is recreated each scrub, + * while the deep errors DB is recreated only when a deep scrub starts. + * + * When queried - the data from both DBs is merged for each named object, and + * returned to the operator. + * + * Implementation: + * Each of the two DBs is implemented as OMAP entries of a single, uniquely named, + * object. Both DBs are cached using the general KV Cache mechanism. + */ + class Store { public: ~Store(); @@ -114,14 +136,21 @@ class Store { * allocations; and 'mutable', as the caching mechanism is used in const * methods) */ - mutable std::optional errors_db; - // not yet: mutable std::optional deep_db; + mutable std::optional shallow_db; + mutable std::optional deep_db; std::vector get_errors( const std::string& start, const std::string& end, uint64_t max_return) const; + void collect_specific_store( + MapCacher::MapCacher& backend, + ExpCacherPosData& latest, + std::vector& errors, + std::string_view end_key, + uint64_t max_return) const; + /** * Clear the DB of errors at a specific scrub level by performing an * omap_clear() on the DB object, and resetting the MapCacher. @@ -131,5 +160,14 @@ class Store { at_level_t& db, std::string_view db_name); + /** + * merge the two error wrappers - fetched from both DBs for the same object. + * Specifically, the object errors are or'ed, and so are the per-shard + * entries. + */ + bufferlist merge_encoded_error_wrappers( + hobject_t obj, + ExpCacherPosData& latest_sh, + ExpCacherPosData& latest_dp) const; }; } // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 81093666f91c8..594ffb15e2b5b 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1209,7 +1209,7 @@ void PgScrubber::reinit_scrub_store() // actual Object Store objects). // 1. The ScrubStore object itself. // 2,3. The two special hobjects in the coll (the PG data) holding the last - // scrub's results. <> + // scrub's results. // // The Store object can be deleted and recreated, as a way to guarantee // no junk is left. We won't do it here, but we will clear the at_level_t From 47ef574bee6fc43850e9da9c0b9b6c4a34d58dae Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 7 Oct 2024 01:49:18 -0500 Subject: [PATCH 092/148] qa/standalone/scrub: test new ScrubStore implementation The ScrubStore is now comprised of two separate data structures, one for shallow errors and one for deep. A new test is added to verify the main objective of that design change: shallow scrubs should not overwrite deep scrub data. 
Signed-off-by: Ronen Friedman --- qa/standalone/scrub/osd-scrub-repair.sh | 249 +++++++++++++++++++++++- 1 file changed, 248 insertions(+), 1 deletion(-) diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh index 59564f7e37e28..491e46603f72e 100755 --- a/qa/standalone/scrub/osd-scrub-repair.sh +++ b/qa/standalone/scrub/osd-scrub-repair.sh @@ -442,7 +442,6 @@ function TEST_auto_repair_bluestore_basic() { ['pool_name']="testpool" ['extras']=" --osd_scrub_auto_repair=true" ) - local extr_dbg=3 standard_scrub_cluster $dir cluster_conf local poolid=${cluster_conf['pool_id']} local poolname=${cluster_conf['pool_name']} @@ -6252,6 +6251,254 @@ function TEST_request_scrub_priority() { grep "log_channel.*scrub ok" $dir/osd.${primary}.log | grep -v purged_snaps | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1 } +# +# Testing the "split scrub store" feature: shallow scrubs do not +# purge deep errors from the store. +# +# Corrupt one copy of a replicated pool, creating both shallow and deep errors. +# Then shallow-scrub the pool and verify that the deep errors are still present. +# +function TEST_dual_store_replicated_cluster() { + local dir=$1 + local poolname=csr_pool + local total_objs=19 + local extr_dbg=1 # note: 3 and above leave some temp files around + + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " + ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 " + ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_op_queue=wpq --osd_scrub_auto_repair=0 " + for osd in $(seq 0 1) + do + run_osd $dir $osd $ceph_osd_args || return 1 + done + + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_pool foo 1 || return 1 + create_pool $poolname 1 1 || return 1 + wait_for_clean || return 1 + + ceph osd pool set $poolname noscrub 1 + ceph osd pool set $poolname nodeep-scrub 1 + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + add_something $dir $poolname $objname || return 1 + + rados --pool $poolname setomapheader $objname hdr-$objname || return 1 + rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1 + done + + # Increase file 1 MB + 1KB + dd if=/dev/zero of=$dir/new.ROBJ19 bs=1024 count=1025 + rados --pool $poolname put $objname $dir/new.ROBJ19 || return 1 + rm -f $dir/new.ROBJ19 + + local pg=$(get_pg $poolname ROBJ0) + local primary=$(get_primary $poolname ROBJ0) + + # Compute an old omap digest and save oi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \ + config set osd_deep_scrub_update_digest_min_age 0 + CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \ + config set osd_deep_scrub_update_digest_min_age 0 + pg_deep_scrub $pg + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + + # Alternate corruption between osd.0 and osd.1 + local osd=$(expr $i % 2) + + case $i in + 1) + # Size (deep scrub data_digest too) + local payload=UVWXYZZZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 2) + # digest (deep scrub only) + local payload=UVWXYZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 3) + # missing + objectstore_tool $dir $osd $objname remove || return 1 + ;; + + 4) + # Modify omap value (deep scrub only) + objectstore_tool $dir $osd $objname set-omap key-$objname 
$dir/CORRUPT || return 1 + ;; + + 5) + # Delete omap key (deep scrub only) + objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1 + ;; + + 6) + # Add extra omap key (deep scrub only) + echo extra > $dir/extra-val + objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1 + rm $dir/extra-val + ;; + + 7) + # Modify omap header (deep scrub only) + echo -n newheader > $dir/hdr + objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1 + rm $dir/hdr + ;; + + 8) + rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1 + rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1 + + # Break xattrs + echo -n bad-val > $dir/bad-val + objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1 + objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1 + echo -n val3-$objname > $dir/newval + objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1 + rm $dir/bad-val $dir/newval + ;; + + 9) + objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi + echo -n D > $dir/change + rados --pool $poolname put $objname $dir/change + objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi + rm $dir/oi $dir/change + ;; + + # ROBJ10 must be handled after digests are re-computed by a deep scrub below + # ROBJ11 must be handled with config change before deep scrub + # ROBJ12 must be handled with config change before scrubs + # ROBJ13 must be handled before scrubs + + 14) + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1 + objectstore_tool $dir 1 $objname rm-attr _ || return 1 + rm $dir/bad-val + ;; + + 15) + objectstore_tool $dir $osd $objname rm-attr _ || return 1 + ;; + + 16) + objectstore_tool $dir 0 $objname rm-attr snapset || return 1 + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1 + ;; + + 17) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ17 + echo $payload > $dir/new.ROBJ17 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1 + ;; + + 18) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ18 + echo $payload > $dir/new.ROBJ18 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1 + # Make one replica have a different object info, so a full repair must happen too + objectstore_tool $dir $osd $objname corrupt-info || return 1 + ;; + + 19) + # Set osd-max-object-size smaller than this object's size + + esac + done + + local pg=$(get_pg $poolname ROBJ0) + + ceph tell osd.\* injectargs -- --osd-max-object-size=1048576 + + inject_eio rep data $poolname ROBJ11 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + inject_eio rep mdata $poolname ROBJ12 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + inject_eio rep data $poolname ROBJ13 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + + # first sequence: the final shallow scrub should not override any of the deep errors + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1.json + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' 
> /tmp/WQR_1b.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh1_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_1b_s.json + + pg_deep_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_2.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dp_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_2s.json + + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_3.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh2_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_3s.json + + diff -u $dir/dp_results.json $dir/sh2_results.json || return 1 + + # inject a read error, which is a special case: the scrub encountering the read error + # would override the previously collected shard info. + inject_eio rep mdata $poolname ROBJ13 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + + pg_deep_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_4.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_4s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > $dir/dpPart2_w13_results.json + # Remove the entry with "name":"ROBJ13" from the $dir/d*_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dpPart2_wo13_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s.json + + pg_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' 
> /tmp/WQR_5.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_5s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_5s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > \ + $dir/sh2Part2_w13_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/shPart2_wo13_results.json + + # the shallow scrub results should differ from the results of the deep + # scrub preceding it, but the difference should be limited to ROBJ13 + diff -u $dir/dpPart2_w13_results.json $dir/sh2Part2_w13_results.json && return 1 + diff -u $dir/dpPart2_wo13_results.json $dir/shPart2_wo13_results.json || return 1 + + ceph osd pool rm $poolname $poolname --yes-i-really-really-mean-it + return 0 +} + main osd-scrub-repair "$@" From 4f1ef85c7204dec9df9853597024637ac2873762 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Sat, 5 Oct 2024 07:33:49 -0500 Subject: [PATCH 093/148] osd/scrub: modify ScrubStore contents retrieval A separate commit added a simple test to verify the new store implementation (creating both shallow & deep errors), scrubbing (step 1), deep scrubbing (step 2), then shallow scrubbing again (step 3). The test verifies that the results after step 2 include all shallow errors data (*), and that the results after step 3 include all deep errors data. The test highlighted the need to correctly partition and retrieve the "shards inconsistencies" and the "selected shard" data, which was not fully implemented in the previous commit. Thus, this commit adds the following: - add_object_error() no longer filters out data saved during deep scrubbing; it also filters less of the shallow scrubs "shards inconsistencies" data; - merge_encoded_error_wrappers() now merges the "shards inconsistencies" data correctly, handling the multiple scenarios possible. (*) note the special case of not being able to read the object's version during deep scrubbing (due to a read error). In this case - the data collected during the shallow scrub will not be reported. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 154 ++++++++++++++++++++++----------- src/osd/scrubber/ScrubStore.h | 2 + 2 files changed, 106 insertions(+), 50 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 9c680da0de16f..7f28ca2d642a8 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -142,49 +142,30 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) add_object_error(pool, e); } -namespace { - -inconsistent_obj_wrapper create_filtered_copy( - const inconsistent_obj_wrapper& obj, - uint64_t obj_err_mask, - uint64_t shard_err_mask) -{ - inconsistent_obj_wrapper dup = obj; - dup.errors &= obj_err_mask; - for (auto& [shard, si] : dup.shards) { - si.errors &= shard_err_mask; - } - return dup; -} - -} // namespace - void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { + using librados::obj_err_t; const auto key = to_object_key(pool, e.object); dout(20) << fmt::format( - "adding error for object {} ({}). 
Errors: {} ({}/{}) wr:{}", - e.object, key, librados::err_t{e.errors}, - librados::err_t{e.errors & librados::err_t::SHALLOW_ERRORS}, - librados::err_t{e.errors & librados::err_t::DEEP_ERRORS}, e) + "{}: adding error for object {} ({}). Errors: {} ({}/{}) " + "unfiltered:{}", + (current_level == scrub_level_t::deep ? "deep" : "shallow"), + e.object, key, obj_err_t{e.errors}, + obj_err_t{e.errors & obj_err_t::SHALLOW_ERRORS}, + obj_err_t{e.errors & obj_err_t::DEEP_ERRORS}, e) << dendl; - // divide the errors & shard errors into shallow and deep. - { - bufferlist bl; - create_filtered_copy( - e, librados::obj_err_t::SHALLOW_ERRORS, librados::err_t::SHALLOW_ERRORS) - .encode(bl); - shallow_db->results[key] = bl; - } - { - bufferlist bl; - create_filtered_copy( - e, librados::obj_err_t::DEEP_ERRORS, librados::err_t::DEEP_ERRORS) - .encode(bl); - deep_db->results[key] = bl; + if (current_level == scrub_level_t::deep) { + // not overriding the deep errors DB during shallow scrubs + deep_db->results[key] = e.encode(); } + + // only shallow errors are stored in the shallow DB + auto e_copy = e; + e_copy.errors &= librados::obj_err_t::SHALLOW_ERRORS; + e_copy.union_shards.errors &= librados::err_t::SHALLOW_ERRORS; + shallow_db->results[key] = e_copy.encode(); } @@ -251,6 +232,8 @@ void Store::reinit( (level == scrub_level_t::deep ? "deep" : "shallow")) << dendl; + current_level = level; + // always clear the known shallow errors DB (as both shallow and deep scrubs // would recreate it) if (shallow_db) { @@ -344,6 +327,15 @@ void Store::collect_specific_store( } +/* + * Implementation notes: + * - see https://github.com/ceph/ceph/commit/df3ff6dafeadb3822b35c424a890db9a14d7f60f + * for why we encode the shard_info_t in the store. + * - to maintain known shard_info-s created during a deep scrub (but only when + * needed), we use our knowledge of the level of the last scrub performed + * (current_level), and the object user version as encoded in the error + * structure. + */ bufferlist Store::merge_encoded_error_wrappers( hobject_t obj, ExpCacherPosData& latest_sh, @@ -352,26 +344,88 @@ bufferlist Store::merge_encoded_error_wrappers( // decode both error wrappers auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin()); auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin()); - dout(20) << fmt::format( - "merging errors {}. Shallow: {}-({}), Deep: {}-({})", - sh_wrap.object, sh_wrap.errors, dp_wrap.errors, sh_wrap, - dp_wrap) - << dendl; - // merge the object errors (a simple OR of the two error bit-sets) - sh_wrap.errors |= dp_wrap.errors; - - // merge the two shard error maps - for (const auto& [shard, si] : dp_wrap.shards) { + // note: the '20' level is just until we're sure the merging works as + // expected + if (g_conf()->subsys.should_gather()) { + dout(20) << fmt::format( + "merging errors {}. Deep: {:#x}-({})", sh_wrap.object, + dp_wrap.errors, dp_wrap) + << dendl; dout(20) << fmt::format( - "shard {} dp-errors: {} sh-errors:{}", shard, si.errors, - sh_wrap.shards[shard].errors) + "merging errors {}. Shallow: {:#x}-({})", sh_wrap.object, + sh_wrap.errors, sh_wrap) << dendl; - // note: we may be creating the shallow shard entry here. 
This is OK - sh_wrap.shards[shard].errors |= si.errors; + // dev: list the attributes: + for (const auto& [shard, si] : sh_wrap.shards) { + for (const auto& [attr, bl] : si.attrs) { + dout(20) << fmt::format(" shallow: shard {} attr: {}", shard, attr) + << dendl; + } + } + for (const auto& [shard, si] : dp_wrap.shards) { + for (const auto& [attr, bl] : si.attrs) { + dout(20) << fmt::format(" deep: shard {} attr: {}", shard, attr) + << dendl; + } + } + } + + // Actual merging of the shard map entries is only performed if the + // latest version is from the shallow scrub. + // Otherwise, the deep scrub, which (for the shards info) contains all data, + // and the shallow scrub is ignored. + if (current_level == scrub_level_t::shallow) { + // is the object data related to the same object version? + if (sh_wrap.version == dp_wrap.version) { + // combine the error information + dp_wrap.errors |= sh_wrap.errors; + for (const auto& [shard, si] : sh_wrap.shards) { + if (dp_wrap.shards.contains(shard)) { + dout(20) << fmt::format( + "-----> {}-{} combining: sh-errors: {} dp-errors:{}", + sh_wrap.object, shard, si, dp_wrap.shards[shard]) + << dendl; + const auto saved_er = dp_wrap.shards[shard].errors; + dp_wrap.shards[shard].selected_oi = si.selected_oi; + dp_wrap.shards[shard].primary = si.primary; + dp_wrap.shards[shard].errors |= saved_er; + + // the attributes: + for (const auto& [attr, bl] : si.attrs) { + if (!dp_wrap.shards[shard].attrs.contains(attr)) { + dout(20) << fmt::format( + "-----> {}-{} copying shallow attr: attr: {}", + sh_wrap.object, shard, attr) + << dendl; + dp_wrap.shards[shard].attrs[attr] = bl; + } + // otherwise - we'll ignore the shallow attr buffer + } + } else { + // the deep scrub data for this shard is missing. We take the shallow + // scrub data. + dp_wrap.shards[shard] = si; + } + } + } else if (sh_wrap.version > dp_wrap.version) { + if (false && dp_wrap.version == 0) { + // there was a read error in the deep scrub. The deep version + // shows as '0'. That's severe enough for us to ignore the shallow. + dout(10) << fmt::format("{} ignoring deep after read failure", + sh_wrap.object) + << dendl; + } else { + // There is a new shallow version of the object results. + // The deep data is for an older version of that object. + // There are multiple possibilities here, but for now we ignore the + // deep data. + dp_wrap = sh_wrap; + } + } } - return sh_wrap.encode(); + return dp_wrap.encode(); } diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 8a30e8daf8569..0955654d78e91 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -130,6 +130,8 @@ class Store { /// the collection (i.e. - the PG store) in which the errors are stored const coll_t coll; + scrub_level_t current_level; + /** * the machinery (backend details, cache, etc.) for storing both levels * of errors (note: 'optional' to allow delayed creation w/o dynamic From 0c4028a6a356ae8c6e7d6d646e96c8e38a114789 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Tue, 8 Oct 2024 08:25:56 -0500 Subject: [PATCH 094/148] qa/standalone/scrub: remove TEST_recovery_scrub_2 That test does no longer match the actual requirements and implementation of scrubbing. It was already deactivated in https://github.com/ceph/ceph/pull/59590. Here - it is fully removed, mainly for the sake of backporting. 
Signed-off-by: Ronen Friedman --- qa/standalone/scrub/osd-recovery-scrub.sh | 140 ---------------------- 1 file changed, 140 deletions(-) diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh index 4eac1106e8d3a..843e9b9901b9c 100755 --- a/qa/standalone/scrub/osd-recovery-scrub.sh +++ b/qa/standalone/scrub/osd-recovery-scrub.sh @@ -234,146 +234,6 @@ function wait_background_check() { return $return_code } -# osd_scrub_during_recovery=true make sure scrub happens -# update 26.8.24: the test should be redesigned. The current version is not -# reliable, and playing around with the timeouts and such won't fix the -# design issues. -function TEST_recovery_scrub_2() { - local dir=$1 - local poolname=test - return 0 - - TESTDATA="testdata.$$" - OSDS=8 - PGS=32 - OBJECTS=40 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1 - run_mgr $dir x --mgr_stats_period=1 || return 1 - local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0.1 " - ceph_osd_args+="--osd_scrub_backoff_ratio=0 " - ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 " - ceph_osd_args+="--osd_stats_update_period_scrubbing=2 " - ceph_osd_args+="--mgr_stats_period=1" - for osd in $(seq 0 $(expr $OSDS - 1)) - do - run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=1 \ - $ceph_osd_args || return 1 - done - - # Create a pool with $PGS pgs - create_pool $poolname $PGS $PGS - wait_for_clean || return 1 - poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }') - - dd if=/dev/urandom of=$TESTDATA bs=1M count=50 - for i in $(seq 1 $OBJECTS) - do - rados -p $poolname put obj${i} $TESTDATA - done - rm -f $TESTDATA - - ceph osd pool set $poolname size 3 - - ceph pg dump pgs - - # note that the following will be needed if the mclock scheduler is specified - ceph tell osd.* config get osd_mclock_override_recovery_settings - - # the '_max_active' is expected to be 0 - ceph tell osd.1 config get osd_recovery_max_active - # both next parameters are expected to be >=3 - ceph tell osd.1 config set osd_recovery_max_active_hdd 6 - ceph tell osd.1 config set osd_recovery_max_active_ssd 6 - ceph tell osd.1 config get osd_recovery_max_active_hdd - ceph tell osd.1 config get osd_recovery_max_active_ssd - - # Wait for recovery to start - count=0 - while(true) - do - #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]' - ceph pg dump pgs - if test $(ceph --format json pg dump pgs | - jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2 - then - break - fi - sleep 2 - if test "$count" -eq "10" - then - echo "Not enough recovery started simultaneously" - return 1 - fi - count=$(expr $count + 1) - done - ceph pg dump pgs - - pids="" - recov_scrub_count=0 - for pg in $(seq 0 $(expr $PGS - 1)) - do - run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg) - done - wait_background_check pids - return_code=$? - if [ $return_code -ne 0 ]; then return $return_code; fi - - ERRORS=0 - if test $recov_scrub_count -eq 0 - then - echo "No scrubs occurred while PG recovering" - ERRORS=$(expr $ERRORS + 1) - fi - - pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') - pid=$(cat $pidfile) - if ! 
kill -0 $pid - then - echo "OSD crash occurred" - #tail -100 $dir/osd.0.log - ERRORS=$(expr $ERRORS + 1) - fi - - # Work around for http://tracker.ceph.com/issues/38195 - kill_daemons $dir #|| return 1 - - declare -a err_strings - ## we do not expect a refusal to scrub - err_strings[0]="recovery in progress.*scrubs" - for osd in $(seq 0 $(expr $OSDS - 1)) - do - grep "recovery in progress.*scrubs" $dir/osd.${osd}.log - done - for err_string in "${err_strings[@]}" - do - found=false - for osd in $(seq 0 $(expr $OSDS - 1)) - do - if grep "$err_string" $dir/osd.${osd}.log > /dev/null; - then - found=true - fi - done - if [ "$found" = "true" ]; then - echo "Found log message not expected '$err_string'" - ERRORS=$(expr $ERRORS + 1) - fi - done - - teardown $dir || return 1 - - if [ $ERRORS != "0" ]; - then - echo "TEST FAILED WITH $ERRORS ERRORS" - return 1 - fi - - echo "TEST PASSED" - return 0 -} - main osd-recovery-scrub "$@" # Local Variables: From 0a867d149e9f7783b5e703f348b5c7e6afc099fa Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Tue, 1 Oct 2024 10:39:37 +0200 Subject: [PATCH 095/148] mgr/cephadm: adding more UT for mgmt-gateway service Signed-off-by: Redouane Kachach --- src/pybind/mgr/cephadm/tests/test_services.py | 489 +++++++++++++++++- 1 file changed, 483 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a9b7da624a0e6..a05c87ce3c3a9 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -49,9 +49,9 @@ cephadm_root_ca = """-----BEGIN CERTIFICATE-----\\nMIIE7DCCAtSgAwIBAgIUE8b2zZ64geu2ns3Zfn3/4L+Cf6MwDQYJKoZIhvcNAQEL\\nBQAwFzEVMBMGA1UEAwwMY2VwaGFkbS1yb290MB4XDTI0MDYyNjE0NDA1M1oXDTM0\\nMDYyNzE0NDA1M1owFzEVMBMGA1UEAwwMY2VwaGFkbS1yb290MIICIjANBgkqhkiG\\n9w0BAQEFAAOCAg8AMIICCgKCAgEAsZRJsdtTr9GLG1lWFql5SGc46ldFanNJd1Gl\\nqXq5vgZVKRDTmNgAb/XFuNEEmbDAXYIRZolZeYKMHfn0pouPRSel0OsC6/02ZUOW\\nIuN89Wgo3IYleCFpkVIumD8URP3hwdu85plRxYZTtlruBaTRH38lssyCqxaOdEt7\\nAUhvYhcMPJThB17eOSQ73mb8JEC83vB47fosI7IhZuvXvRSuZwUW30rJanWNhyZq\\neS2B8qw2RSO0+77H6gA4ftBnitfsE1Y8/F9Z/f92JOZuSMQXUB07msznPbRJia3f\\nueO8gOc32vxd1A1/Qzp14uX34yEGY9ko2lW226cZO29IVUtXOX+LueQttwtdlpz8\\ne6Npm09pXhXAHxV/OW3M28MdXmobIqT/m9MfkeAErt5guUeC5y8doz6/3VQRjFEn\\nRpN0WkblgnNAQ3DONPc+Qd9Fi/wZV2X7bXoYpNdoWDsEOiE/eLmhG1A2GqU/mneP\\nzQ6u79nbdwTYpwqHpa+PvusXeLfKauzI8lLUJotdXy9EK8iHUofibB61OljYye6B\\nG3b8C4QfGsw8cDb4APZd/6AZYyMx/V3cGZ+GcOV7WvsC8k7yx5Uqasm/kiGQ3EZo\\nuNenNEYoGYrjb8D/8QzqNUTwlEh27/ps80tO7l2GGTvWVZL0PRZbmLDvO77amtOf\\nOiRXMoUCAwEAAaMwMC4wGwYDVR0RBBQwEocQAAAAAAAAAAAAAAAAAAAAATAPBgNV\\nHRMBAf8EBTADAQH/MA0GCSqGSIb3DQEBCwUAA4ICAQAxwzX5AhYEWhTV4VUwUj5+\\nqPdl4Q2tIxRokqyE+cDxoSd+6JfGUefUbNyBxDt0HaBq8obDqqrbcytxnn7mpnDu\\nhtiauY+I4Amt7hqFOiFA4cCLi2mfok6g2vL53tvhd9IrsfflAU2wy7hL76Ejm5El\\nA+nXlkJwps01Whl9pBkUvIbOn3pXX50LT4hb5zN0PSu957rjd2xb4HdfuySm6nW4\\n4GxtVWfmGA6zbC4XMEwvkuhZ7kD2qjkAguGDF01uMglkrkCJT3OROlNBuSTSBGqt\\ntntp5VytHvb7KTF7GttM3ha8/EU2KYaHM6WImQQTrOfiImAktOk4B3lzUZX3HYIx\\n+sByO4P4dCvAoGz1nlWYB2AvCOGbKf0Tgrh4t4jkiF8FHTXGdfvWmjgi1pddCNAy\\nn65WOCmVmLZPERAHOk1oBwqyReSvgoCFo8FxbZcNxJdlhM0Z6hzKggm3O3Dl88Xl\\n5euqJjh2STkBW8Xuowkg1TOs5XyWvKoDFAUzyzeLOL8YSG+gXV22gPTUaPSVAqdb\\nwd0Fx2kjConuC5bgTzQHs8XWA930U3XWZraj21Vaa8UxlBLH4fUro8H5lMSYlZNE\\nJHRNW8BkznAClaFSDG3dybLsrzrBFAu/Qb5zVkT1xyq0YkepGB7leXwq6vjWA5Pw\\nmZbKSphWfh0qipoqxqhfkw==\\n-----END CERTIFICATE-----\\n""" -ceph_generated_cert = """-----BEGIN 
CERTIFICATE-----\nMIICxjCCAa4CEQDIZSujNBlKaLJzmvntjukjMA0GCSqGSIb3DQEBDQUAMCExDTAL\nBgNVBAoMBENlcGgxEDAOBgNVBAMMB2NlcGhhZG0wHhcNMjIwNzEzMTE0NzA3WhcN\nMzIwNzEwMTE0NzA3WjAhMQ0wCwYDVQQKDARDZXBoMRAwDgYDVQQDDAdjZXBoYWRt\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyyMe4DMA+MeYK7BHZMHB\nq7zjliEOcNgxomjU8qbf5USF7Mqrf6+/87XWqj4pCyAW8x0WXEr6A56a+cmBVmt+\nqtWDzl020aoId6lL5EgLLn6/kMDCCJLq++Lg9cEofMSvcZh+lY2f+1p+C+00xent\nrLXvXGOilAZWaQfojT2BpRnNWWIFbpFwlcKrlg2G0cFjV5c1m6a0wpsQ9JHOieq0\nSvwCixajwq3CwAYuuiU1wjI4oJO4Io1+g8yB3nH2Mo/25SApCxMXuXh4kHLQr/T4\n4hqisvG4uJYgKMcSIrWj5o25mclByGi1UI/kZkCUES94i7Z/3ihx4Bad0AMs/9tw\nFwIDAQABMA0GCSqGSIb3DQEBDQUAA4IBAQAf+pwz7Gd7mDwU2LY0TQXsK6/8KGzh\nHuX+ErOb8h5cOAbvCnHjyJFWf6gCITG98k9nxU9NToG0WYuNm/max1y/54f0dtxZ\npUo6KSNl3w6iYCfGOeUIj8isi06xMmeTgMNzv8DYhDt+P2igN6LenqWTVztogkiV\nxQ5ZJFFLEw4sN0CXnrZX3t5ruakxLXLTLKeE0I91YJvjClSBGkVJq26wOKQNHMhx\npWxeydQ5EgPZY+Aviz5Dnxe8aB7oSSovpXByzxURSabOuCK21awW5WJCGNpmqhWK\nZzACBDEstccj57c4OGV0eayHJRsluVr2e9NHRINZA3qdB37e6gsI1xHo\n-----END CERTIFICATE-----\n""" +ceph_generated_cert = """-----BEGIN CERTIFICATE-----\\nMIICxjCCAa4CEQDIZSujNBlKaLJzmvntjukjMA0GCSqGSIb3DQEBDQUAMCExDTAL\\nBgNVBAoMBENlcGgxEDAOBgNVBAMMB2NlcGhhZG0wHhcNMjIwNzEzMTE0NzA3WhcN\\nMzIwNzEwMTE0NzA3WjAhMQ0wCwYDVQQKDARDZXBoMRAwDgYDVQQDDAdjZXBoYWRt\\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyyMe4DMA+MeYK7BHZMHB\\nq7zjliEOcNgxomjU8qbf5USF7Mqrf6+/87XWqj4pCyAW8x0WXEr6A56a+cmBVmt+\\nqtWDzl020aoId6lL5EgLLn6/kMDCCJLq++Lg9cEofMSvcZh+lY2f+1p+C+00xent\\nrLXvXGOilAZWaQfojT2BpRnNWWIFbpFwlcKrlg2G0cFjV5c1m6a0wpsQ9JHOieq0\\nSvwCixajwq3CwAYuuiU1wjI4oJO4Io1+g8yB3nH2Mo/25SApCxMXuXh4kHLQr/T4\\n4hqisvG4uJYgKMcSIrWj5o25mclByGi1UI/kZkCUES94i7Z/3ihx4Bad0AMs/9tw\\nFwIDAQABMA0GCSqGSIb3DQEBDQUAA4IBAQAf+pwz7Gd7mDwU2LY0TQXsK6/8KGzh\\nHuX+ErOb8h5cOAbvCnHjyJFWf6gCITG98k9nxU9NToG0WYuNm/max1y/54f0dtxZ\\npUo6KSNl3w6iYCfGOeUIj8isi06xMmeTgMNzv8DYhDt+P2igN6LenqWTVztogkiV\\nxQ5ZJFFLEw4sN0CXnrZX3t5ruakxLXLTLKeE0I91YJvjClSBGkVJq26wOKQNHMhx\\npWxeydQ5EgPZY+Aviz5Dnxe8aB7oSSovpXByzxURSabOuCK21awW5WJCGNpmqhWK\\nZzACBDEstccj57c4OGV0eayHJRsluVr2e9NHRINZA3qdB37e6gsI1xHo\\n-----END CERTIFICATE-----\\n""" -ceph_generated_key = """-----BEGIN PRIVATE 
KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDLIx7gMwD4x5gr\nsEdkwcGrvOOWIQ5w2DGiaNTypt/lRIXsyqt/r7/ztdaqPikLIBbzHRZcSvoDnpr5\nyYFWa36q1YPOXTbRqgh3qUvkSAsufr+QwMIIkur74uD1wSh8xK9xmH6VjZ/7Wn4L\n7TTF6e2ste9cY6KUBlZpB+iNPYGlGc1ZYgVukXCVwquWDYbRwWNXlzWbprTCmxD0\nkc6J6rRK/AKLFqPCrcLABi66JTXCMjigk7gijX6DzIHecfYyj/blICkLExe5eHiQ\nctCv9PjiGqKy8bi4liAoxxIitaPmjbmZyUHIaLVQj+RmQJQRL3iLtn/eKHHgFp3Q\nAyz/23AXAgMBAAECggEAVoTB3Mm8azlPlaQB9GcV3tiXslSn+uYJ1duCf0sV52dV\nBzKW8s5fGiTjpiTNhGCJhchowqxoaew+o47wmGc2TvqbpeRLuecKrjScD0GkCYyQ\neM2wlshEbz4FhIZdgS6gbuh9WaM1dW/oaZoBNR5aTYo7xYTmNNeyLA/jO2zr7+4W\n5yES1lMSBXpKk7bDGKYY4bsX2b5RLr2Grh2u2bp7hoLABCEvuu8tSQdWXLEXWpXo\njwmV3hc6tabypIa0mj2Dmn2Dmt1ppSO0AZWG/WAizN3f4Z0r/u9HnbVrVmh0IEDw\n3uf2LP5o3msG9qKCbzv3lMgt9mMr70HOKnJ8ohMSKQKBgQDLkNb+0nr152HU9AeJ\nvdz8BeMxcwxCG77iwZphZ1HprmYKvvXgedqWtS6FRU+nV6UuQoPUbQxJBQzrN1Qv\nwKSlOAPCrTJgNgF/RbfxZTrIgCPuK2KM8I89VZv92TSGi362oQA4MazXC8RAWjoJ\nSu1/PHzK3aXOfVNSLrOWvIYeZQKBgQD/dgT6RUXKg0UhmXj7ExevV+c7oOJTDlMl\nvLngrmbjRgPO9VxLnZQGdyaBJeRngU/UXfNgajT/MU8B5fSKInnTMawv/tW7634B\nw3v6n5kNIMIjJmENRsXBVMllDTkT9S7ApV+VoGnXRccbTiDapBThSGd0wri/CuwK\nNWK1YFOeywKBgEDyI/XG114PBUJ43NLQVWm+wx5qszWAPqV/2S5MVXD1qC6zgCSv\nG9NLWN1CIMimCNg6dm7Wn73IM7fzvhNCJgVkWqbItTLG6DFf3/DPODLx1wTMqLOI\nqFqMLqmNm9l1Nec0dKp5BsjRQzq4zp1aX21hsfrTPmwjxeqJZdioqy2VAoGAXR5X\nCCdSHlSlUW8RE2xNOOQw7KJjfWT+WAYoN0c7R+MQplL31rRU7dpm1bLLRBN11vJ8\nMYvlT5RYuVdqQSP6BkrX+hLJNBvOLbRlL+EXOBrVyVxHCkDe+u7+DnC4epbn+N8P\nLYpwqkDMKB7diPVAizIKTBxinXjMu5fkKDs5n+sCgYBbZheYKk5M0sIxiDfZuXGB\nkf4mJdEkTI1KUGRdCwO/O7hXbroGoUVJTwqBLi1tKqLLarwCITje2T200BYOzj82\nqwRkCXGtXPKnxYEEUOiFx9OeDrzsZV00cxsEnX0Zdj+PucQ/J3Cvd0dWUspJfLHJ\n39gnaegswnz9KMQAvzKFdg==\n-----END PRIVATE KEY-----\n""" +ceph_generated_key = """-----BEGIN PRIVATE KEY-----\\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDLIx7gMwD4x5gr\\nsEdkwcGrvOOWIQ5w2DGiaNTypt/lRIXsyqt/r7/ztdaqPikLIBbzHRZcSvoDnpr5\\nyYFWa36q1YPOXTbRqgh3qUvkSAsufr+QwMIIkur74uD1wSh8xK9xmH6VjZ/7Wn4L\\n7TTF6e2ste9cY6KUBlZpB+iNPYGlGc1ZYgVukXCVwquWDYbRwWNXlzWbprTCmxD0\\nkc6J6rRK/AKLFqPCrcLABi66JTXCMjigk7gijX6DzIHecfYyj/blICkLExe5eHiQ\\nctCv9PjiGqKy8bi4liAoxxIitaPmjbmZyUHIaLVQj+RmQJQRL3iLtn/eKHHgFp3Q\\nAyz/23AXAgMBAAECggEAVoTB3Mm8azlPlaQB9GcV3tiXslSn+uYJ1duCf0sV52dV\\nBzKW8s5fGiTjpiTNhGCJhchowqxoaew+o47wmGc2TvqbpeRLuecKrjScD0GkCYyQ\\neM2wlshEbz4FhIZdgS6gbuh9WaM1dW/oaZoBNR5aTYo7xYTmNNeyLA/jO2zr7+4W\\n5yES1lMSBXpKk7bDGKYY4bsX2b5RLr2Grh2u2bp7hoLABCEvuu8tSQdWXLEXWpXo\\njwmV3hc6tabypIa0mj2Dmn2Dmt1ppSO0AZWG/WAizN3f4Z0r/u9HnbVrVmh0IEDw\\n3uf2LP5o3msG9qKCbzv3lMgt9mMr70HOKnJ8ohMSKQKBgQDLkNb+0nr152HU9AeJ\\nvdz8BeMxcwxCG77iwZphZ1HprmYKvvXgedqWtS6FRU+nV6UuQoPUbQxJBQzrN1Qv\\nwKSlOAPCrTJgNgF/RbfxZTrIgCPuK2KM8I89VZv92TSGi362oQA4MazXC8RAWjoJ\\nSu1/PHzK3aXOfVNSLrOWvIYeZQKBgQD/dgT6RUXKg0UhmXj7ExevV+c7oOJTDlMl\\nvLngrmbjRgPO9VxLnZQGdyaBJeRngU/UXfNgajT/MU8B5fSKInnTMawv/tW7634B\\nw3v6n5kNIMIjJmENRsXBVMllDTkT9S7ApV+VoGnXRccbTiDapBThSGd0wri/CuwK\\nNWK1YFOeywKBgEDyI/XG114PBUJ43NLQVWm+wx5qszWAPqV/2S5MVXD1qC6zgCSv\\nG9NLWN1CIMimCNg6dm7Wn73IM7fzvhNCJgVkWqbItTLG6DFf3/DPODLx1wTMqLOI\\nqFqMLqmNm9l1Nec0dKp5BsjRQzq4zp1aX21hsfrTPmwjxeqJZdioqy2VAoGAXR5X\\nCCdSHlSlUW8RE2xNOOQw7KJjfWT+WAYoN0c7R+MQplL31rRU7dpm1bLLRBN11vJ8\\nMYvlT5RYuVdqQSP6BkrX+hLJNBvOLbRlL+EXOBrVyVxHCkDe+u7+DnC4epbn+N8P\\nLYpwqkDMKB7diPVAizIKTBxinXjMu5fkKDs5n+sCgYBbZheYKk5M0sIxiDfZuXGB\\nkf4mJdEkTI1KUGRdCwO/O7hXbroGoUVJTwqBLi1tKqLLarwCITje2T200BYOzj82\\nqwRkCXGtXPKnxYEEUOiFx9OeDrzsZV00cxsEnX0Zdj+PucQ/J3Cvd0dWUspJfLHJ\\n39gnaegswnz9KMQAvzKFdg==\\n-----END PRIVATE KEY-----\\n""" class FakeInventory: @@ -602,6 +602,101 @@ def 
test_alertmanager_config( use_current_daemon_image=False, ) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("socket.getfqdn") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') + @patch("cephadm.services.monitoring.password_hash", lambda password: 'alertmanager_password_hash') + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: 'cephadm_root_cert') + @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: ('mycert', 'mykey')) + def test_alertmanager_config_when_mgmt_gw_enabled(self, _get_fqdn, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) + + fqdn = 'host1.test' + _get_fqdn.return_value = fqdn + + with with_host(cephadm_module, 'test'): + cephadm_module.secure_monitoring_stack = True + cephadm_module.set_store(AlertmanagerService.USER_CFG_KEY, 'alertmanager_user') + cephadm_module.set_store(AlertmanagerService.PASS_CFG_KEY, 'alertmanager_plain_password') + with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, AlertManagerSpec()): + + y = dedent(""" + # This file is generated by cephadm. + # See https://prometheus.io/docs/alerting/configuration/ for documentation. + + global: + resolve_timeout: 5m + http_config: + tls_config: + ca_file: root_cert.pem + + route: + receiver: 'default' + routes: + - group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'ceph-dashboard' + + receivers: + - name: 'default' + webhook_configs: + - name: 'ceph-dashboard' + webhook_configs: + - url: 'https://host_fqdn:29443/internal/dashboard/api/prometheus_receiver' + """).lstrip() + + web_config = dedent(""" + tls_server_config: + cert_file: alertmanager.crt + key_file: alertmanager.key + client_auth_type: RequireAndVerifyClientCert + client_ca_file: root_cert.pem + basic_auth_users: + alertmanager_user: alertmanager_password_hash + """).lstrip() + + _run_cephadm.assert_called_with( + 'test', + "alertmanager.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'alertmanager.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9093, 9094], + }, + "meta": { + 'service_name': 'alertmanager', + 'ports': [9093, 9094], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": { + "alertmanager.yml": y, + 'alertmanager.crt': 'mycert', + 'alertmanager.key': 'mykey', + 'web.yml': web_config, + 'root_cert.pem': 'cephadm_root_cert' + }, + 'peers': [], + 'web_config': '/etc/alertmanager/web.yml', + "use_url_prefix": True, + } + }), + use_current_daemon_image=False, + ) + @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("socket.getfqdn") @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') @@ -738,6 +833,110 @@ def test_ceph_exporter_config_security_enabled(self, _get_fqdn, _run_cephadm, ce "ceph-exporter.key": "mykey"}}}), use_current_daemon_image=False) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("mgr_module.MgrModule.get") + @patch("socket.getfqdn") + def test_node_exporter_config_without_mgmt_gw( + self, + mock_getfqdn, + mock_get, + _run_cephadm, + cephadm_module: CephadmOrchestrator, + ): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + fqdn = 'host1.test' + mock_getfqdn.return_value = fqdn + + with with_host(cephadm_module, "test"): + with 
with_service(cephadm_module, MonitoringSpec('node-exporter')): + _run_cephadm.assert_called_with( + 'test', + "node-exporter.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'node-exporter.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9100], + }, + "meta": { + 'service_name': 'node-exporter', + 'ports': [9100], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": {} + }), + use_current_daemon_image=False, + ) + + @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: (ceph_generated_cert, ceph_generated_key)) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("socket.getfqdn") + def test_node_exporter_config_with_mgmt_gw( + self, + mock_getfqdn, + _run_cephadm, + cephadm_module: CephadmOrchestrator, + ): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + mock_getfqdn.return_value = 'host1.test' + + y = dedent(""" + tls_server_config: + cert_file: node_exporter.crt + key_file: node_exporter.key + client_auth_type: RequireAndVerifyClientCert + client_ca_file: root_cert.pem + """).lstrip() + + with with_host(cephadm_module, "test"): + with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, MonitoringSpec('node-exporter')): + _run_cephadm.assert_called_with( + 'test', + "node-exporter.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'node-exporter.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9100], + }, + "meta": { + 'service_name': 'node-exporter', + 'ports': [9100], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": { + "web.yml": y, + 'root_cert.pem': f"{cephadm_root_ca}", + 'node_exporter.crt': f"{ceph_generated_cert}", + 'node_exporter.key': f"{ceph_generated_key}", + }, + 'web_config': '/etc/node-exporter/web.yml', + } + }), + use_current_daemon_image=False, + ) + @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator): @@ -1240,6 +1439,286 @@ def test_promtail_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator use_current_daemon_image=False, ) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') + @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') + @patch("cephadm.services.monitoring.verify_tls", lambda *_: None) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + def test_grafana_config_with_mgmt_gw_and_ouath2_proxy(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + + y = dedent(f""" + # This file is generated by cephadm. 
+ apiVersion: 1 + + deleteDatasources: + - name: 'Dashboard1' + orgId: 1 + + datasources: + - name: 'Dashboard1' + type: 'prometheus' + access: 'proxy' + orgId: 1 + url: 'https://host_fqdn:29443/internal/prometheus' + basicAuth: true + isDefault: true + editable: false + basicAuthUser: admin + jsonData: + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: true + tlsSkipVerify: false + secureJsonData: + basicAuthPassword: admin + tlsCACert: "{cephadm_root_ca}" + tlsClientCert: "{ceph_generated_cert}" + tlsClientKey: "{ceph_generated_key}" + + - name: 'Loki' + type: 'loki' + access: 'proxy' + url: '' + basicAuth: false + isDefault: false + editable: false""").lstrip() + + oauth2_spec = OAuth2ProxySpec(provider_display_name='my_idp_provider', + client_id='my_client_id', + client_secret='my_client_secret', + oidc_issuer_url='http://192.168.10.10:8888/dex', + cookie_secret='kbAEM9opAmuHskQvt0AW8oeJRaOM2BYy5Loba0kZ0SQ=', + ssl_certificate=ceph_generated_cert, + ssl_certificate_key=ceph_generated_key) + + with with_host(cephadm_module, "test"): + cephadm_module.cert_key_store.save_cert('grafana_cert', ceph_generated_cert, host='test') + cephadm_module.cert_key_store.save_key('grafana_key', ceph_generated_key, host='test') + with with_service(cephadm_module, PrometheusSpec("prometheus")) as _, \ + with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, oauth2_spec) as _, \ + with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service( + cephadm_module, GrafanaSpec("grafana") + ) as _: + files = { + 'grafana.ini': dedent(""" + # This file is generated by cephadm. + [users] + default_theme = light + [auth.anonymous] + enabled = true + org_name = 'Main Org.' + org_role = 'Viewer' + [server] + domain = 'host_fqdn' + protocol = https + cert_file = /etc/grafana/certs/cert_file + cert_key = /etc/grafana/certs/cert_key + http_port = 3000 + http_addr = + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true + [snapshots] + external_enabled = false + [security] + disable_initial_admin_creation = true + cookie_secure = true + cookie_samesite = none + allow_embedding = true + [auth] + disable_login_form = true + [auth.proxy] + enabled = true + header_name = X-WEBAUTH-USER + header_property = username + auto_sign_up = true + sync_ttl = 15 + whitelist = 1::4 + headers_encoded = false + enable_login_token = false + headers = Role:X-WEBAUTH-ROLE\n""").lstrip(), # noqa: W291 + "provisioning/datasources/ceph-dashboard.yml": y, + 'certs/cert_file': dedent(f""" + # generated by cephadm\n{ceph_generated_cert}""").lstrip(), + 'certs/cert_key': dedent(f""" + # generated by cephadm\n{ceph_generated_key}""").lstrip(), + 'provisioning/dashboards/default.yml': dedent(""" + # This file is generated by cephadm. 
+ apiVersion: 1 + + providers: + - name: 'Ceph Dashboard' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 3 + editable: false + options: + path: '/etc/grafana/provisioning/dashboards'""").lstrip(), + } + + _run_cephadm.assert_called_with( + 'test', + "grafana.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'grafana.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [3000], + }, + "meta": { + 'service_name': 'grafana', + 'ports': [3000], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": files, + }, + }), + use_current_daemon_image=False, + ) + + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') + @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') + @patch("cephadm.services.monitoring.verify_tls", lambda *_: None) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + def test_grafana_config_with_mgmt_gw(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + + y = dedent(f""" + # This file is generated by cephadm. + apiVersion: 1 + + deleteDatasources: + - name: 'Dashboard1' + orgId: 1 + + datasources: + - name: 'Dashboard1' + type: 'prometheus' + access: 'proxy' + orgId: 1 + url: 'https://host_fqdn:29443/internal/prometheus' + basicAuth: true + isDefault: true + editable: false + basicAuthUser: admin + jsonData: + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: true + tlsSkipVerify: false + secureJsonData: + basicAuthPassword: admin + tlsCACert: "{cephadm_root_ca}" + tlsClientCert: "{ceph_generated_cert}" + tlsClientKey: "{ceph_generated_key}" + + - name: 'Loki' + type: 'loki' + access: 'proxy' + url: '' + basicAuth: false + isDefault: false + editable: false""").lstrip() + + with with_host(cephadm_module, "test"): + cephadm_module.cert_key_store.save_cert('grafana_cert', ceph_generated_cert, host='test') + cephadm_module.cert_key_store.save_key('grafana_key', ceph_generated_key, host='test') + with with_service( + cephadm_module, PrometheusSpec("prometheus") + ) as _, with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service( + cephadm_module, GrafanaSpec("grafana") + ) as _: + files = { + 'grafana.ini': dedent(""" + # This file is generated by cephadm. + [users] + default_theme = light + [auth.anonymous] + enabled = true + org_name = 'Main Org.' + org_role = 'Viewer' + [server] + domain = 'host_fqdn' + protocol = https + cert_file = /etc/grafana/certs/cert_file + cert_key = /etc/grafana/certs/cert_key + http_port = 3000 + http_addr = + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true + [snapshots] + external_enabled = false + [security] + disable_initial_admin_creation = true + cookie_secure = true + cookie_samesite = none + allow_embedding = true\n""").lstrip(), # noqa: W291 + "provisioning/datasources/ceph-dashboard.yml": y, + 'certs/cert_file': dedent(f""" + # generated by cephadm\n{ceph_generated_cert}""").lstrip(), + 'certs/cert_key': dedent(f""" + # generated by cephadm\n{ceph_generated_key}""").lstrip(), + 'provisioning/dashboards/default.yml': dedent(""" + # This file is generated by cephadm. 
+ apiVersion: 1 + + providers: + - name: 'Ceph Dashboard' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 3 + editable: false + options: + path: '/etc/grafana/provisioning/dashboards'""").lstrip(), + } + + _run_cephadm.assert_called_with( + 'test', + "grafana.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'grafana.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [3000], + }, + "meta": { + 'service_name': 'grafana', + 'ports': [3000], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": files, + }, + }), + use_current_daemon_image=False, + ) + @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') @@ -3296,7 +3775,7 @@ class TestMgmtGateway: @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https")) - def test_mgmt_gateway_config_no_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): + def test_mgmt_gw_config_no_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): def get_services_endpoints(name): if name == 'prometheus': @@ -3417,7 +3896,6 @@ def get_services_endpoints(name): } location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass https://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services @@ -3518,7 +3996,7 @@ def get_services_endpoints(name): @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https")) @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_oauth2_service_url", lambda _: "https://192.168.100.102:4180") - def test_mgmt_gateway_config_with_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): + def test_mgmt_gw_config_with_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): def get_services_endpoints(name): if name == 'prometheus': @@ -3689,7 +4167,6 @@ def get_services_endpoints(name): } location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass https://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services From 7e03ee798f4ed3aa4a0bb1a9e9d62df52e54406d Mon Sep 17 00:00:00 2001 From: Dan Mick Date: Thu, 10 Oct 2024 16:38:58 -0700 Subject: [PATCH 096/148] container/build.sh: fix arm architecture tagging The wrong string was used for comparison, and for tagging, so the arm64 branch and sha1 images overwrote and destroyed the amd64 images. 
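For reference, the intended suffixing rule, sketched in Python purely for
illustration (the real change is to the shell script below; the helper
name and signature here are invented):

    def arch_suffixed(tag: str, arch: str) -> str:
        # build.sh's ARCH uses the container-style name ("arm64" on ARM
        # builds), not the `uname -m` spelling ("aarch64"), so the old
        # comparison against "aarch64" never matched.
        return f"{tag}-arm64" if arch == "arm64" else tag

Because that branch never fired, arm64 builds pushed un-suffixed tags and
overwrote the amd64 images of the same name.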
Signed-off-by: Dan Mick --- container/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/container/build.sh b/container/build.sh index 7c97e2261c16f..5edf469d2d2e4 100755 --- a/container/build.sh +++ b/container/build.sh @@ -136,9 +136,9 @@ if [[ ${CI_CONTAINER} == "true" ]] ; then branch_repo_tag=$repopath/ceph:${BRANCH} sha1_repo_tag=$repopath/ceph:${CEPH_SHA1} - if [[ "${ARCH}" == "aarch64" ]] ; then - branch_repo_tag=${branch_repo_tag}-aarch64 - sha1_repo_tag=${sha1_repo_tag}-aarch64 + if [[ "${ARCH}" == "arm64" ]] ; then + branch_repo_tag=${branch_repo_tag}-arm64 + sha1_repo_tag=${sha1_repo_tag}-arm64 fi podman tag ${image_id} ${full_repo_tag} From ad147f2e8d820ff251e1499c1e4c3fe57d1a2082 Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Wed, 9 Oct 2024 19:32:49 +0530 Subject: [PATCH 097/148] mgr/cephadm: RGW service deployment defaults to 'default' realm/zonegroup/zone despite non-default spec in service When we create an RGW service using the ceph orch apply command, the service is always deployed in the default realm, zonegroup, and zone, even if we specify a different realm, zonegroup, or zone in the service spec. This happens because certain configuration values, like rgw_realm, rgw_zonegroup, and rgw_zone, need to be set for the RGW instances before the daemons are deployed. Currently, these configurations are being applied after the RGW daemons are deployed, which requires a service restart to reflect the correct realm, zonegroup, and zone. Ideally, these configurations should be applied before the RGW daemons are deployed, so they are correctly placed in the desired realm, zonegroup, and zone from the start. Fixes: https://tracker.ceph.com/issues/68461 Signed-off-by: Aashish Sharma --- src/pybind/mgr/cephadm/module.py | 1 + src/pybind/mgr/cephadm/serve.py | 4 ++++ src/pybind/mgr/cephadm/services/cephadmservice.py | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 5216c489064c9..afaf5d7846e3c 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -764,6 +764,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi']) self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof']) self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy']) + self.rgw_service: RgwService = cast(RgwService, self.cephadm_services['rgw']) self.scheduled_async_actions: List[Callable] = [] diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index c6212c9efb83d..611c27c34538a 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -950,6 +950,10 @@ def update_progress() -> None: ) continue + # set multisite config before deploying the rgw daemon + if service_type == 'rgw': + self.mgr.rgw_service.set_realm_zg_zone(cast(RGWSpec, spec)) + # deploy new daemon daemon_id = slot.name diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index eb9a1c838a656..9043577bc5a60 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -984,10 +984,9 @@ class RgwService(CephService): def allow_colo(self) -> bool: return True - def config(self, spec: RGWSpec) -> None: # type: ignore + def set_realm_zg_zone(self, spec: RGWSpec) -> None: assert 
self.TYPE == spec.service_type - # set rgw_realm rgw_zonegroup and rgw_zone, if present if spec.rgw_realm: ret, out, err = self.mgr.check_mon_command({ 'prefix': 'config set', @@ -1010,6 +1009,12 @@ def config(self, spec: RGWSpec) -> None: # type: ignore 'value': spec.rgw_zone, }) + def config(self, spec: RGWSpec) -> None: # type: ignore + assert self.TYPE == spec.service_type + + # set rgw_realm rgw_zonegroup and rgw_zone, if present + self.set_realm_zg_zone(spec) + if spec.generate_cert and not spec.rgw_frontend_ssl_certificate: # generate a self-signed cert for the rgw service cert, key = self.mgr.cert_mgr.ssl_certs.generate_root_cert(custom_san_list=spec.zonegroup_hostnames) From 86378344ab0a381569b116c2112a981404f93671 Mon Sep 17 00:00:00 2001 From: Nizamudeen A Date: Fri, 20 Sep 2024 20:35:38 +0530 Subject: [PATCH 098/148] mgr/dashboard: introduce server side pagination for osds Fixes: https://tracker.ceph.com/issues/56511 Signed-off-by: Nizamudeen A --- qa/tasks/mgr/dashboard/test_osd.py | 5 +- src/pybind/mgr/dashboard/controllers/osd.py | 28 ++++++++- .../osd/osd-list/osd-list.component.html | 6 +- .../osd/osd-list/osd-list.component.spec.ts | 59 +++++++++++-------- .../osd/osd-list/osd-list.component.ts | 14 +++-- .../src/app/shared/api/osd.service.spec.ts | 6 +- .../src/app/shared/api/osd.service.ts | 11 +++- .../src/app/shared/api/paginate.model.ts | 2 +- .../shared/classes/paginate-params.class.ts | 15 +++++ .../models/cd-table-fetch-data-context.ts | 2 +- .../src/app/shared/models/osd.model.ts | 49 +++++++++++++++ src/pybind/mgr/dashboard/openapi.yaml | 24 +++++++- src/pybind/mgr/dashboard/tests/test_osd.py | 3 +- 13 files changed, 178 insertions(+), 46 deletions(-) create mode 100644 src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts create mode 100644 src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts diff --git a/qa/tasks/mgr/dashboard/test_osd.py b/qa/tasks/mgr/dashboard/test_osd.py index 07c69ddc47cb6..be7afccf33176 100644 --- a/qa/tasks/mgr/dashboard/test_osd.py +++ b/qa/tasks/mgr/dashboard/test_osd.py @@ -11,6 +11,7 @@ class OsdTest(DashboardTestCase): AUTH_ROLES = ['cluster-manager'] + _VERSION = '1.1' @classmethod def setUpClass(cls): @@ -24,7 +25,7 @@ def tearDown(self): @DashboardTestCase.RunAs('test', 'test', ['block-manager']) def test_access_permissions(self): - self._get('/api/osd') + self._get('/api/osd', version=self._VERSION) self.assertStatus(403) self._get('/api/osd/0') self.assertStatus(403) @@ -33,7 +34,7 @@ def assert_in_and_not_none(self, data, properties): self.assertSchema(data, JObj({p: JAny(none=False) for p in properties}, allow_unknown=True)) def test_list(self): - data = self._get('/api/osd') + data = self._get('/api/osd', version=self._VERSION) self.assertStatus(200) self.assertGreaterEqual(len(data), 1) diff --git a/src/pybind/mgr/dashboard/controllers/osd.py b/src/pybind/mgr/dashboard/controllers/osd.py index c9d1417720005..07d8db7755b8a 100644 --- a/src/pybind/mgr/dashboard/controllers/osd.py +++ b/src/pybind/mgr/dashboard/controllers/osd.py @@ -5,12 +5,14 @@ import time from typing import Any, Dict, List, Optional, Union +import cherrypy from ceph.deployment.drive_group import DriveGroupSpec, DriveGroupValidationError # type: ignore from mgr_util import get_most_recent_rate from .. 
import mgr from ..exceptions import DashboardException from ..security import Scope +from ..services._paginate import ListPaginator from ..services.ceph_service import CephService, SendCommandError from ..services.exception import handle_orchestrator_error, handle_send_command_error from ..services.orchestrator import OrchClient, OrchFeature @@ -121,8 +123,30 @@ def osd_task(name, metadata, wait_for=2.0): @APIRouter('/osd', Scope.OSD) @APIDoc('OSD management API', 'OSD') class Osd(RESTController): - def list(self): - osds = self.get_osd_map() + @RESTController.MethodMap(version=APIVersion(1, 1)) + def list(self, offset: int = 0, limit: int = 10, + search: str = '', sort: str = ''): + all_osds = self.get_osd_map() + + paginator = ListPaginator(int(offset), int(limit), sort, search, + input_list=all_osds.values(), + searchable_params=['id'], + sortable_params=['id'], + default_sort='+id') + + cherrypy.response.headers['X-Total-Count'] = paginator.get_count() + + paginated_osds_list = list(paginator.list()) + # creating a dictionary to have faster lookups + paginated_osds_by_id = {osd['id']: osd for osd in paginated_osds_list} + try: + osds = { + key: paginated_osds_by_id[int(key)] + for key in all_osds.keys() + if int(key) in paginated_osds_by_id + } + except ValueError as e: + raise DashboardException(e, component='osd', http_status_code=400) # Extending by osd stats information for stat in mgr.get('osd_stats')['osd_stats']: diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html index 5f5f91dd0ed67..a56877512f99a 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html @@ -6,13 +6,15 @@ i18n>OSDs List + [updateSelectionOnRefresh]="'never'" + [serverSide]="true" + [count]="count">
{ let component: OsdListComponent; @@ -141,38 +143,42 @@ describe('OsdListComponent', () => { }); describe('getOsdList', () => { - let osds: any[]; + let osds: Osd[]; let flagsSpy: jasmine.Spy; - const createOsd = (n: number) => - >{ - in: 'in', - up: 'up', - tree: { - device_class: 'ssd' - }, - stats_history: { - op_out_bytes: [ - [n, n], - [n * 2, n * 2] - ], - op_in_bytes: [ - [n * 3, n * 3], - [n * 4, n * 4] - ] - }, - stats: { - stat_bytes_used: n * n, - stat_bytes: n * n * n - }, - state: [] - }; + const createOsd = (n: number): Osd => ({ + id: n, + host: { + id: 0, + name: 'test_host' + }, + in: 1, + up: 1, + tree: { + device_class: 'ssd' + }, + stats_history: { + op_out_bytes: [ + [n, n], + [n * 2, n * 2] + ], + op_in_bytes: [ + [n * 3, n * 3], + [n * 4, n * 4] + ] + }, + stats: { + stat_bytes_used: n * n, + stat_bytes: n * n * n + }, + state: [] + }); const expectAttributeOnEveryOsd = (attr: string) => expect(component.osds.every((osd) => Boolean(_.get(osd, attr)))).toBeTruthy(); beforeEach(() => { - spyOn(osdService, 'getList').and.callFake(() => of(osds)); + spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable(of(osds))); flagsSpy = spyOn(osdService, 'getFlags').and.callFake(() => of([])); osds = [createOsd(1), createOsd(2), createOsd(3)]; component.getOsdList(); @@ -556,8 +562,9 @@ describe('OsdListComponent', () => { beforeEach(() => { component.permissions = fakeAuthStorageService.getPermissions(); - spyOn(osdService, 'getList').and.callFake(() => of(fakeOsds)); + spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable(of(fakeOsds))); spyOn(osdService, 'getFlags').and.callFake(() => of([])); + component.getOsdList(); }); const testTableActions = async ( diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts index 103b61e79f0af..91cb0193f3cce 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts @@ -39,6 +39,8 @@ import { OsdRecvSpeedModalComponent } from '../osd-recv-speed-modal/osd-recv-spe import { OsdReweightModalComponent } from '../osd-reweight-modal/osd-reweight-modal.component'; import { OsdScrubModalComponent } from '../osd-scrub-modal/osd-scrub-modal.component'; import { ModalCdsService } from '~/app/shared/services/modal-cds.service'; +import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; +import { Osd } from '~/app/shared/models/osd.model'; const BASE_URL = 'osd'; @@ -71,6 +73,7 @@ export class OsdListComponent extends ListWithDetails implements OnInit { clusterWideActions: CdTableAction[]; icons = Icons; osdSettings = new OsdSettings(); + count = 0; selection = new CdTableSelection(); osds: any[] = []; @@ -426,10 +429,13 @@ export class OsdListComponent extends ListWithDetails implements OnInit { } } - getOsdList() { - const observables = [this.osdService.getList(), this.osdService.getFlags()]; - observableForkJoin(observables).subscribe((resp: [any[], string[]]) => { - this.osds = resp[0].map((osd) => { + getOsdList(context?: CdTableFetchDataContext) { + if (!context) context = new CdTableFetchDataContext(); + const pagination_obs = this.osdService.getList(context.toParams()); + const observables = [pagination_obs.observable, this.osdService.getFlags()]; + observableForkJoin(observables).subscribe((resp: any) => { 
+ this.osds = resp[0].map((osd: Osd) => { + this.count = pagination_obs.count; osd.collectedStates = OsdListComponent.collectStates(osd); osd.stats_history.out_bytes = osd.stats_history.op_out_bytes.map((i: string) => i[1]); osd.stats_history.in_bytes = osd.stats_history.op_in_bytes.map((i: string) => i[1]); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts index d1f9997791ae0..c81c9193a2e3c 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts @@ -3,6 +3,7 @@ import { TestBed } from '@angular/core/testing'; import { configureTestBed } from '~/testing/unit-test-helper'; import { OsdService } from './osd.service'; +import { CdTableFetchDataContext } from '../models/cd-table-fetch-data-context'; describe('OsdService', () => { let service: OsdService; @@ -64,8 +65,9 @@ describe('OsdService', () => { }); it('should call getList', () => { - service.getList().subscribe(); - const req = httpTesting.expectOne('api/osd'); + const context = new CdTableFetchDataContext(() => {}); + service.getList(context.toParams()).observable.subscribe(); + const req = httpTesting.expectOne('api/osd?offset=0&limit=10&search=&sort=%2Bname'); expect(req.request.method).toBe('GET'); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts index f2ed4d7cc9e76..85a75073deafc 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts @@ -1,4 +1,4 @@ -import { HttpClient } from '@angular/common/http'; +import { HttpClient, HttpParams } from '@angular/common/http'; import { Injectable } from '@angular/core'; import _ from 'lodash'; @@ -12,6 +12,9 @@ import { OsdSettings } from '../models/osd-settings'; import { SmartDataResponseV1 } from '../models/smart'; import { DeviceService } from '../services/device.service'; import { CdFormGroup } from '../forms/cd-form-group'; +import { PaginateObservable } from './paginate.model'; +import { PaginateParams } from '../classes/paginate-params.class'; +import { Osd } from '../models/osd.model'; @Injectable({ providedIn: 'root' @@ -80,8 +83,10 @@ export class OsdService { return this.http.post(this.path, request, { observe: 'response' }); } - getList() { - return this.http.get(`${this.path}`); + getList(params: HttpParams): PaginateObservable { + return new PaginateObservable( + this.http.get(this.path, new PaginateParams(params, 1, 1)) + ); } getOsdSettings(): Observable { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts index 703792a757181..77ec4e43f7cfe 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts @@ -9,7 +9,7 @@ export class PaginateObservable { this.observable = obs.pipe( map((response: any) => { this.count = Number(response.headers?.get('X-Total-Count')); - return response['body']; + return response['body'] || response; }) ); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts new file mode 100644 index 
0000000000000..a1b079b426b9d --- /dev/null +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts @@ -0,0 +1,15 @@ +import { HttpParams } from '@angular/common/http'; + +export class PaginateParams { + constructor(params: HttpParams, majorVersion = 1, minorVersion = 0) { + const options = { + params: params, + headers: { + Accept: `application/vnd.ceph.api.v${majorVersion}.${minorVersion}+json` + } + }; + + options['observe'] = 'response'; + return options; + } +} diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts index 0df2d2ebbe071..6ea415bfee983 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts @@ -18,7 +18,7 @@ export class CdTableFetchDataContext { search = ''; sort = '+name'; - constructor(error: () => void) { + constructor(error?: () => void) { this.error = error; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts new file mode 100644 index 0000000000000..f22987e439ea5 --- /dev/null +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts @@ -0,0 +1,49 @@ +/* We will need to check what are all the value that the + UI need and only make them the mandatory parameters here. + For now based on what I saw in the unit test file; + osd-list.component.spec.ts, I've made the decision to make + things optional and non-optional. This should be re-evaluated. */ + +export interface Osd { + id: number; + host: Host; + stats_history: StatsHistory; + state: string[]; + stats: Stats; + collectedStates?: string[]; + in?: number; + out?: number; + up?: number; + down?: number; + destroyed?: number; + cdIsBinary?: boolean; + cdIndivFlags?: string[]; + cdClusterFlags?: string[]; + cdExecuting?: any; + tree?: Tree; + operational_status?: string; +} + +interface Tree { + device_class: string; +} + +interface Host { + id: number; + name: string; +} + +interface StatsHistory { + op_out_bytes: any[]; + op_in_bytes: any[]; + out_bytes?: any[]; + in_bytes?: any[]; +} + +interface Stats { + stat_bytes_used: number; + stat_bytes: number; + op_w?: number; + op_r?: number; + usage?: number; +} diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index 8f98f1f62a0a8..24422f5b030e3 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -8935,11 +8935,31 @@ paths: - NVMe-oF Subsystem Namespace /api/osd: get: - parameters: [] + parameters: + - default: 0 + in: query + name: offset + schema: + type: integer + - default: 10 + in: query + name: limit + schema: + type: integer + - default: '' + in: query + name: search + schema: + type: string + - default: '' + in: query + name: sort + schema: + type: string responses: '200': content: - application/vnd.ceph.api.v1.0+json: + application/vnd.ceph.api.v1.1+json: type: object description: OK '400': diff --git a/src/pybind/mgr/dashboard/tests/test_osd.py b/src/pybind/mgr/dashboard/tests/test_osd.py index c3cd0dca88dca..9b6dbd10de18d 100644 --- a/src/pybind/mgr/dashboard/tests/test_osd.py +++ b/src/pybind/mgr/dashboard/tests/test_osd.py @@ -8,6 +8,7 @@ from ceph.deployment.service_spec import PlacementSpec from .. 
import mgr +from ..controllers._version import APIVersion from ..controllers.osd import Osd, OsdUi from ..services.osd import OsdDeploymentOptions from ..tests import ControllerTestCase @@ -274,7 +275,7 @@ def test_osd_list_aggregation(self): osds_leftover = [0, 1, 2] with self._mock_osd_list(osd_stat_ids=osds_actual, osdmap_tree_node_ids=osds_leftover, osdmap_ids=osds_actual): - self._get('/api/osd') + self._get('/api/osd', version=APIVersion(1, 1)) self.assertEqual(len(self.json_body()), 2, 'It should display two OSDs without failure') self.assertStatus(200) From f9b50b2e88ae5d9ac4f2cab986527a0a12317da9 Mon Sep 17 00:00:00 2001 From: Nizamudeen A Date: Wed, 9 Oct 2024 20:15:55 +0530 Subject: [PATCH 099/148] mgr/dashboard: fix group name bugs in the nvmeof API there are 2 issues 1. in cephadm, i was always using the first daemon to populate the group in all the services for the dashboard config. 2. in the API, if there are more than 1 gateways listed in the config, rather than chosing a random gateway from the group, raise an exception and warn user to specify the gw_group parameter in the api request Fixes: https://tracker.ceph.com/issues/68463 Signed-off-by: Nizamudeen A --- src/pybind/mgr/cephadm/services/nvmeof.py | 5 ++--- src/pybind/mgr/dashboard/services/nvmeof_conf.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index 8b15aace373c5..162815da24c73 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -123,10 +123,9 @@ def get_set_cmd_dicts(out: str) -> List[dict]: gateways = json.loads(out)['gateways'] cmd_dicts = [] - spec = cast(NvmeofServiceSpec, - self.mgr.spec_store.all_specs.get(daemon_descrs[0].service_name(), None)) - for dd in daemon_descrs: + spec = cast(NvmeofServiceSpec, + self.mgr.spec_store.all_specs.get(dd.service_name(), None)) service_name = dd.service_name() if dd.hostname is None: err_msg = ('Trying to config_dashboard nvmeof but no hostname is defined') diff --git a/src/pybind/mgr/dashboard/services/nvmeof_conf.py b/src/pybind/mgr/dashboard/services/nvmeof_conf.py index 1802f8a5fce9f..2426c59907874 100644 --- a/src/pybind/mgr/dashboard/services/nvmeof_conf.py +++ b/src/pybind/mgr/dashboard/services/nvmeof_conf.py @@ -177,6 +177,18 @@ def _get_running_daemon_svc_config(svc_config, running_daemons): def _get_default_service(gateways): if gateways: - service_name = list(gateways.keys())[0] + gateway_keys = list(gateways.keys()) + # if there are more than 1 gateway, rather than chosing a random gateway + # from any of the group, raise an exception to make it clear that we need + # to specify the group name in the API request. + if len(gateway_keys) > 1: + raise DashboardException( + msg=( + "Multiple NVMe-oF gateway groups are configured. " + "Please specify the 'gw_group' parameter in the request." + ), + component="nvmeof" + ) + service_name = gateway_keys[0] return service_name, gateways[service_name][0]['service_url'] return None From 3dc091dd12c54103b9e93b5c38b86c883d93f242 Mon Sep 17 00:00:00 2001 From: Afreen Misbah Date: Fri, 11 Oct 2024 14:27:24 +0530 Subject: [PATCH 100/148] mgr/dashboard: Fix listener deletion Listener deletion is broken due to passing wrong gateway address. Including `traddr` in DELETE API of listener to choose correct gateway address for deletion. 
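Conceptually, the gateway is now chosen along these lines (a simplified
sketch only, not the actual NVMeoFClient internals; the function and
field names below are assumed for illustration):

    from typing import List, Optional

    def pick_gateway(gateways: List[dict], traddr: Optional[str] = None) -> str:
        # 'gateways' holds the entries of the selected group, each with a
        # 'service_url' such as "10.0.0.1:5500".
        if traddr:
            for gw in gateways:
                if gw['service_url'].split(':')[0] == traddr:
                    return gw['service_url']
        # previous behaviour: fall back to the group's first gateway
        return gateways[0]['service_url']

Without forwarding traddr, the DELETE was always sent to the group's
default gateway, which is not necessarily the gateway that owns the
listener being removed.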
The same fix we did for POST API here: https://github.com/afreen23/ceph/commit/287ff3b3603291763b3cd08f9b1543fe60d5f3b9 Fixes: https://tracker.ceph.com/issues/68506 Signed-off-by: Afreen Misbah --- src/pybind/mgr/dashboard/controllers/nvmeof.py | 2 +- .../nvmeof-listeners-list/nvmeof-listeners-list.component.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py index 5db6a4f1acfec..f199867943d14 100644 --- a/src/pybind/mgr/dashboard/controllers/nvmeof.py +++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py @@ -183,7 +183,7 @@ def delete( force: bool = False, gw_group: Optional[str] = None ): - return NVMeoFClient(gw_group=gw_group).stub.delete_listener( + return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.delete_listener( NVMeoFClient.pb2.delete_listener_req( nqn=nqn, host_name=host_name, diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts index f88442e1bd619..974727ad06260 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts @@ -76,7 +76,7 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges { name: this.actionLabels.DELETE, permission: 'delete', icon: Icons.destroy, - click: () => this.deleteSubsystemModal() + click: () => this.deleteListenerModal() } ]; } @@ -101,7 +101,7 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges { }); } - deleteSubsystemModal() { + deleteListenerModal() { const listener = this.selection.first(); this.modalService.show(CriticalConfirmationModalComponent, { itemDescription: 'Listener', From 517ab013e2a9bd23b482134121bcb85c5a32f028 Mon Sep 17 00:00:00 2001 From: Naman Munet Date: Tue, 8 Oct 2024 14:54:32 +0530 Subject: [PATCH 101/148] mgr/dashboard: sync policy's in Object >> Multi-site >> Sync-policy, does not show the zonegroup to which policy belongs to Fixes: https://tracker.ceph.com/issues/68355 Fixes Includes: Added default zonegroup name with the sync policy details Signed-off-by: Naman Munet --- src/pybind/mgr/dashboard/controllers/rgw.py | 10 ++++++++++ .../rgw-multisite-sync-policy.component.ts | 16 +++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py index 8667d469060f8..75a664dfb4f91 100755 --- a/src/pybind/mgr/dashboard/controllers/rgw.py +++ b/src/pybind/mgr/dashboard/controllers/rgw.py @@ -176,6 +176,15 @@ def get_sync_policy(self, bucket_name='', zonegroup_name='', all_policy=None): if all_policy: sync_policy_list = [] buckets = json.loads(RgwBucket().list(stats=False)) + zonegroups_info = RgwMultisite().get_all_zonegroups_info() + default_zonegroup = '' + if 'zonegroups' in zonegroups_info and 'default_zonegroup' in zonegroups_info: + default_zonegroup = next( + (zonegroup['name'] for zonegroup in zonegroups_info['zonegroups'] + if 'id' in zonegroup and 'name' in zonegroup + and zonegroup['id'] == zonegroups_info['default_zonegroup']), + '' + ) for bucket in buckets: sync_policy = multisite_instance.get_sync_policy(bucket, zonegroup_name) for policy in sync_policy['groups']: @@ -183,6 +192,7 @@ def 
get_sync_policy(self, bucket_name='', zonegroup_name='', all_policy=None): sync_policy_list.append(policy) other_sync_policy = multisite_instance.get_sync_policy(bucket_name, zonegroup_name) for policy in other_sync_policy['groups']: + policy['zonegroup'] = default_zonegroup sync_policy_list.append(policy) return sync_policy_list return multisite_instance.get_sync_policy(bucket_name, zonegroup_name) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts index ee261db5042c3..03228856125d9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts @@ -88,12 +88,22 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements { name: $localize`Zonegroup`, prop: 'zonegroup', - flexGrow: 1 + flexGrow: 1, + cellTransformation: CellTemplate.map, + customTemplateConfig: { + undefined: '-', + '': '-' + } }, { name: $localize`Bucket`, prop: 'bucket', - flexGrow: 1 + flexGrow: 1, + cellTransformation: CellTemplate.map, + customTemplateConfig: { + undefined: '-', + '': '-' + } } ]; this.rgwDaemonService.list().subscribe(); @@ -137,7 +147,7 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements groupName: policy['id'], status: policy['status'], bucket: policy['bucketName'], - zonegroup: '' + zonegroup: policy['zonegroup'] }); }); this.syncPolicyData = [...this.syncPolicyData]; From 88e4484acf198a64c700f18bcc06af1014356c43 Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Mon, 14 Oct 2024 19:32:11 +0530 Subject: [PATCH 102/148] mgr/cephadm: add ok_to_stop func for smb service Fixes: https://tracker.ceph.com/issues/68527 Signed-off-by: Avan Thakkar --- src/pybind/mgr/cephadm/services/smb.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/pybind/mgr/cephadm/services/smb.py b/src/pybind/mgr/cephadm/services/smb.py index dabc202a024bd..e322acb0e3e73 100644 --- a/src/pybind/mgr/cephadm/services/smb.py +++ b/src/pybind/mgr/cephadm/services/smb.py @@ -1,6 +1,9 @@ +import errno import logging from typing import Any, Dict, List, Tuple, cast, Optional +from mgr_module import HandleCommandResult + from ceph.deployment.service_spec import ServiceSpec, SMBSpec from orchestrator import DaemonDescription @@ -117,6 +120,23 @@ def ignore_possible_stray( return True return False + def ok_to_stop( + self, daemon_ids: List[str], force: bool = False, known: Optional[List[str]] = None + ) -> HandleCommandResult: + # if only 1 smb, alert user (this is not passable with --force) + warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, "SMB", 1, True) + if warn: + return HandleCommandResult(-errno.EBUSY, "", warn_message) + + # if reached here, there is > 1 smb daemon. + if force: + return HandleCommandResult(0, warn_message, "") + + # if reached here, > 1 smb daemon and no force flag. + # Provide warning + warn_message = "WARNING: Removing SMB daemons can cause clients to lose connectivity. " + return HandleCommandResult(-errno.EBUSY, "", warn_message) + def _allow_config_key_command(self, name: str) -> str: # permit the samba container config access to the mon config key store # with keys like smb/config//*. 
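For illustration only, and not part of the patch: a minimal sketch, assuming the mgr_module.HandleCommandResult (retval, stdout, stderr) fields, of how a caller might act on the ok_to_stop() result added above. The helper name, its arguments, and the call path are hypothetical and do not reflect the actual cephadm code.

    from mgr_module import HandleCommandResult

    def maybe_stop_smb_daemons(svc, daemon_ids, force=False):
        # 'svc' is assumed to expose the SMBService.ok_to_stop() added above
        res: HandleCommandResult = svc.ok_to_stop(daemon_ids, force=force)
        if res.retval != 0:
            # -EBUSY: stopping would remove the last SMB daemon (or no --force
            # was given); surface the warning instead of stopping anything
            return False, res.stderr
        # retval == 0: force was given and more than one SMB daemon exists
        return True, res.stdout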
From 022b90a75335471a6973cab180f71812c0d6125f Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 14 Oct 2024 12:14:53 -0400 Subject: [PATCH 103/148] doc/governance: add new CSC members Congratulations! Election: https://vote.heliosvoting.org/helios/elections/f276a15a-84c5-11ef-a0e4-b69e035002b0/view Signed-off-by: Patrick Donnelly --- doc/governance.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/governance.rst b/doc/governance.rst index 284a9570397c3..b145116e5566d 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -104,6 +104,17 @@ Current Members * Yingxin Cheng * Yuri Weinstein * Zac Dover + * Laura Flores + * Venky Shankar + * Guillaume Abrioux + * Anthony D'Atri + * Joseph Mundackal + * Gaurav Sitlani + * Afreen Misbah + * Radoslaw Zarzynski + * Matan Breizman + * Yaarit Hatuka + * Adam C. Emerson .. _ctl: From e4177406f9734f1c8af91f8292aa972d01fb77f9 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 14 Oct 2024 14:50:41 -0400 Subject: [PATCH 104/148] mailmap: add my ibm email Signed-off-by: Patrick Donnelly --- .githubmap | 2 +- .mailmap | 3 ++- .organizationmap | 1 + .peoplemap | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.githubmap b/.githubmap index 724de9c002d4c..01785190643a6 100644 --- a/.githubmap +++ b/.githubmap @@ -27,7 +27,7 @@ b-ranto Boris Ranto badone Brad Hubbard baruza Barbora Ančincová bassamtabbara Bassam Tabbara -batrick Patrick Donnelly +batrick Patrick Donnelly bigjust Justin Caratzas bk201 Kiefer Chang BlaineEXE Blaine Gardner diff --git a/.mailmap b/.mailmap index 1c4ac95340abd..9428951b391f0 100644 --- a/.mailmap +++ b/.mailmap @@ -544,7 +544,8 @@ Pan Liu Parth Arora parth-gr Pascal de Bruijn Patience Warnick -Patrick Donnelly +Patrick Donnelly +Patrick Donnelly Patrick McGarry Patrick McGarry Patrick Seidensal diff --git a/.organizationmap b/.organizationmap index 3a601f4e2b2bf..42e639c274d62 100644 --- a/.organizationmap +++ b/.organizationmap @@ -361,6 +361,7 @@ IBM Samuel Matzek IBM Shraddha Agrawal IBM Kushal Deb IBM Shweta Bhosale +IBM Patrick Donnelly IBM Sunil Angadi IBM Teoman Onay IBM Ulrich Weigand diff --git a/.peoplemap b/.peoplemap index 507f50edb43e8..418e8505fb49c 100644 --- a/.peoplemap +++ b/.peoplemap @@ -73,5 +73,5 @@ Yehuda Sadeh Yehuda Sadeh Yuri Weinstein Yuri Weinstein Zhi Zhang Zhi (David) Zhang Zheng Yin Zheng Yin -Patrick Donnelly Patrick Donnelly +Patrick Donnelly Patrick Donnelly Patrick Donnelly Myoungwon Oh Myoungwon Oh Myoungwon Oh From 2f61b2847d92b5156408dbcfa5b6e09e2de404c1 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 14 Oct 2024 14:57:31 -0400 Subject: [PATCH 105/148] doc/governance: update my CSC email Signed-off-by: Patrick Donnelly --- doc/governance.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/governance.rst b/doc/governance.rst index 284a9570397c3..1080948e5cd05 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -96,7 +96,7 @@ Current Members * Mike Perez * Myoungwon Oh * Neha Ojha - * Patrick Donnelly + * Patrick Donnelly * Sam Just * Vikhyat Umrao * Xie Xingguo From c4c647480adbd702296a632707c34b172121b9b0 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Mon, 14 Oct 2024 16:07:38 -0400 Subject: [PATCH 106/148] osdc: remove unused overloads for async::Completion ea67f3dee2a3f8fcdcbb0bc0e80e38ec70378f05 switched to asio::any_completion_handler<> for completions, but left some converting overloads behind for compatibility. 
none of those overloads appear to be used, so remove them Signed-off-by: Casey Bodley --- src/osdc/Objecter.h | 95 --------------------------------------------- 1 file changed, 95 deletions(-) diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 68bd76268ae94..927c7e413296f 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -48,7 +48,6 @@ #include "include/function2.hpp" #include "include/neorados/RADOS_Decodable.hpp" -#include "common/async/completion.h" #include "common/admin_socket.h" #include "common/ceph_time.h" #include "common/ceph_mutex.h" @@ -1968,30 +1967,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { } } - boost::asio::any_completion_handler - OpCompletionVert(std::unique_ptr> c) { - if (c) - return [c = std::move(c)](boost::system::error_code ec) mutable { - c->dispatch(std::move(c), ec); - }; - else - return nullptr; - } - - template - boost::asio::any_completion_handler - OpCompletionVert(std::unique_ptr> c) { - if (c) { - return [c = std::move(c)](boost::system::error_code ec, T t) mutable { - c->dispatch(std::move(c), ec, std::move(t)); - }; - } else { - return nullptr; - } - } - struct Op : public RefCountedObject { OSDSession *session = nullptr; int incarnation = 0; @@ -3268,18 +3243,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { return linger_watch(info, op, snapc, mtime, inbl, OpContextVert(onfinish, nullptr), objver); } - ceph_tid_t linger_watch(LingerOp *info, - ObjectOperation& op, - const SnapContext& snapc, ceph::real_time mtime, - ceph::buffer::list& inbl, - std::unique_ptr> onfinish, - version_t *objver) { - return linger_watch(info, op, snapc, mtime, inbl, - OpCompletionVert( - std::move(onfinish)), objver); - } ceph_tid_t linger_notify(LingerOp *info, ObjectOperation& op, snapid_t snap, ceph::buffer::list& inbl, @@ -3295,17 +3258,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { OpContextVert(onack, poutbl), objver); } - ceph_tid_t linger_notify(LingerOp *info, - ObjectOperation& op, - snapid_t snap, ceph::buffer::list& inbl, - std::unique_ptr> onack, - version_t *objver) { - return linger_notify(info, op, snap, inbl, - OpCompletionVert( - std::move(onack)), objver); - } tl::expected linger_check(LingerOp *info); void linger_cancel(LingerOp *info); // releases a reference @@ -3886,12 +3838,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { create_pool_snap(pool, snapName, OpContextVert(c, nullptr)); } - void create_pool_snap( - int64_t pool, std::string_view snapName, - std::unique_ptr> c) { - create_pool_snap(pool, snapName, - OpCompletionVert(std::move(c))); - } void allocate_selfmanaged_snap(int64_t pool, boost::asio::any_completion_handler< void(boost::system::error_code, @@ -3901,12 +3847,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { allocate_selfmanaged_snap(pool, OpContextVert(c, psnapid)); } - void allocate_selfmanaged_snap(int64_t pool, - std::unique_ptr> c) { - allocate_selfmanaged_snap(pool, - OpCompletionVert(std::move(c))); - } void delete_pool_snap(int64_t pool, std::string_view snapName, decltype(PoolOp::onfinish)&& onfinish); void delete_pool_snap(int64_t pool, std::string_view snapName, @@ -3914,12 +3854,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { delete_pool_snap(pool, snapName, OpContextVert(c, nullptr)); } - void delete_pool_snap(int64_t pool, std::string_view snapName, - std::unique_ptr> c) { - delete_pool_snap(pool, snapName, - OpCompletionVert(std::move(c))); - } void delete_selfmanaged_snap(int64_t pool, 
snapid_t snap, decltype(PoolOp::onfinish)&& onfinish); @@ -3928,12 +3862,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { delete_selfmanaged_snap(pool, snap, OpContextVert(c, nullptr)); } - void delete_selfmanaged_snap(int64_t pool, snapid_t snap, - std::unique_ptr> c) { - delete_selfmanaged_snap(pool, snap, - OpCompletionVert(std::move(c))); - } void create_pool(std::string_view name, @@ -3945,25 +3873,12 @@ class Objecter : public md_config_obs_t, public Dispatcher { OpContextVert(onfinish, nullptr), crush_rule); } - void create_pool(std::string_view name, - std::unique_ptr> c, - int crush_rule=-1) { - create_pool(name, - OpCompletionVert(std::move(c)), - crush_rule); - } void delete_pool(int64_t pool, decltype(PoolOp::onfinish)&& onfinish); void delete_pool(int64_t pool, Context* onfinish) { delete_pool(pool, OpContextVert(onfinish, nullptr)); } - void delete_pool(int64_t pool, - std::unique_ptr> c) { - delete_pool(pool, OpCompletionVert(std::move(c))); - } void delete_pool(std::string_view name, decltype(PoolOp::onfinish)&& onfinish); @@ -3972,11 +3887,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { Context* onfinish) { delete_pool(name, OpContextVert(onfinish, nullptr)); } - void delete_pool(std::string_view name, - std::unique_ptr> c) { - delete_pool(name, OpCompletionVert(std::move(c))); - } void handle_pool_op_reply(MPoolOpReply *m); int pool_op_cancel(ceph_tid_t tid, int r); @@ -4026,11 +3936,6 @@ class Objecter : public md_config_obs_t, public Dispatcher { Context *onfinish) { get_fs_stats_(poolid, OpContextVert(onfinish, result)); } - void get_fs_stats(std::optional poolid, - std::unique_ptr> c) { - get_fs_stats_(poolid, OpCompletionVert(std::move(c))); - } int statfs_op_cancel(ceph_tid_t tid, int r); void _finish_statfs_op(StatfsOp *op, int r); From 7b783876960d39de1b87d55135c4207325c4ce69 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 14 Oct 2024 19:42:59 -0700 Subject: [PATCH 107/148] crimson: remove watchers upon object deletion Fixes: https://tracker.ceph.com/issues/68538 Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.cc | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index df4f73d4077d1..4464466eff0d7 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -678,16 +678,32 @@ OpsExecuter::do_execute_op(OSDOp& osd_op) whiteout = true; } return do_write_op([this, whiteout](auto& backend, auto& os, auto& txn) { - int num_bytes = 0; - // Calculate num_bytes to be removed - if (obc->obs.oi.soid.is_snap()) { - ceph_assert(obc->ssc->snapset.clone_overlap.count(obc->obs.oi.soid.snap)); - num_bytes = obc->ssc->snapset.get_clone_bytes(obc->obs.oi.soid.snap); - } else { - num_bytes = obc->obs.oi.size; - } - return backend.remove(os, txn, *osd_op_params, - delta_stats, whiteout, num_bytes); + struct emptyctx_t {}; + return with_effect_on_obc( + emptyctx_t{}, + [&](auto &ctx) { + int num_bytes = 0; + // Calculate num_bytes to be removed + if (obc->obs.oi.soid.is_snap()) { + ceph_assert(obc->ssc->snapset.clone_overlap.count( + obc->obs.oi.soid.snap)); + num_bytes = obc->ssc->snapset.get_clone_bytes( + obc->obs.oi.soid.snap); + } else { + num_bytes = obc->obs.oi.size; + } + return backend.remove(os, txn, *osd_op_params, + delta_stats, whiteout, num_bytes); + }, + [](auto &&ctx, ObjectContextRef obc, Ref) { + return seastar::do_for_each( + obc->watchers, + [](auto &p) { return 
p.second->remove(); } + ).then([obc] { + obc->watchers.clear(); + return seastar::now(); + }); + }); }); } case CEPH_OSD_OP_CALL: From 1f99108d197f1c579838107d4b57be806b6807e1 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 14 Oct 2024 18:46:37 -0700 Subject: [PATCH 108/148] crimson: add missing field to SUBLOGDPPI and LOGDPPI SUBLOGDPPI and LOGDPPI need an extra {} for the interrupt_cond. Signed-off-by: Samuel Just --- src/crimson/common/log.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h index 4f564ac044d05..c38b225c94b4f 100644 --- a/src/crimson/common/log.h +++ b/src/crimson/common/log.h @@ -90,7 +90,7 @@ static inline seastar::log_level to_log_level(int level) { #define SUBLOGDPP(subname_, level_, MSG, dpp, ...) \ LOGGER(subname_).log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__) #define SUBLOGDPPI(subname_, level_, MSG, dpp, ...) \ - LOGGER(subname_).log(level_, "{} {}: " MSG, \ + LOGGER(subname_).log(level_, "{} {} {}: " MSG, \ interruptor::get_interrupt_cond(), dpp, FNAME , ##__VA_ARGS__) #define SUBTRACEDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::trace, __VA_ARGS__) #define SUBTRACEDPPI(subname_, ...) SUBLOGDPPI(subname_, seastar::log_level::trace, __VA_ARGS__) @@ -106,7 +106,7 @@ static inline seastar::log_level to_log_level(int level) { #define LOGDPP(level_, MSG, dpp, ...) \ LOCAL_LOGGER.log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__) #define LOGDPPI(level_, MSG, dpp, ...) \ - LOCAL_LOGGER.log(level_, "{} {}: " MSG, \ + LOCAL_LOGGER.log(level_, "{} {} {}: " MSG, \ interruptor::get_interrupt_cond(), dpp, FNAME , ##__VA_ARGS__) #define TRACEDPP(...) LOGDPP(seastar::log_level::trace, __VA_ARGS__) #define TRACEDPPI(...) LOGDPPI(seastar::log_level::trace, __VA_ARGS__) From 4bea366e5de5b110086c8174eaf39798448ff77f Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 27 Aug 2024 19:08:10 +0000 Subject: [PATCH 109/148] crimson: fix typo OpsExecutor->OpsExecuter Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.cc | 8 ++++---- src/crimson/osd/ops_executer.h | 4 ++-- src/crimson/osd/pg_backend.cc | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index 4464466eff0d7..0a07fa7ee293e 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -504,7 +504,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_snaps.find(clone); if (p == ss.clone_snaps.end()) { logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_snaps, missing clone {}", os.oi.soid, clone); @@ -518,7 +518,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_overlap.find(clone); if (p == ss.clone_overlap.end()) { logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_overlap, missing clone {}", os.oi.soid, clone); @@ -532,7 +532,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_size.find(clone); if (p == ss.clone_size.end()) { logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_size, missing clone {}", os.oi.soid, clone); @@ -551,7 +551,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( } resp.seq = ss.seq; logger().error( - 
"OpsExecutor::do_list_snaps: {}, resp.clones.size(): {}", + "OpsExecuter::do_list_snaps: {}, resp.clones.size(): {}", os.oi.soid, resp.clones.size()); resp.encode(osd_op.outdata); diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 0dea7d0515e93..e25a035616edd 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -213,10 +213,10 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { * execute_clone * * If snapc contains a snap which occurred logically after the last write - * seen by this object (see OpsExecutor::should_clone()), we first need + * seen by this object (see OpsExecuter::should_clone()), we first need * make a clone of the object at its current state. execute_clone primes * txn with that clone operation and returns an - * OpsExecutor::CloningContext which will allow us to fill in the corresponding + * OpsExecuter::CloningContext which will allow us to fill in the corresponding * metadata and log_entries once the operations have been processed. * * Note that this strategy differs from classic, which instead performs this diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc index fa8201b61c28d..24a381b4cf7e2 100644 --- a/src/crimson/osd/pg_backend.cc +++ b/src/crimson/osd/pg_backend.cc @@ -1289,7 +1289,7 @@ void PGBackend::clone( const ObjectState& d_os, ceph::os::Transaction& txn) { - // See OpsExecutor::execute_clone documentation + // See OpsExecuter::execute_clone documentation txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid}); { ceph::bufferlist bv; From a7812e095c13debcd844883db5888bdf5a185170 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 10 Sep 2024 23:52:32 +0000 Subject: [PATCH 110/148] crimson/.../internal_client_request: remove unnecessary system_shutdown guard Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 150 +++++++++--------- 1 file changed, 74 insertions(+), 76 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index a19bb0826f004..b1224f6e25942 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -53,88 +53,86 @@ CommonPGPipeline& InternalClientRequest::client_pp() seastar::future<> InternalClientRequest::start() { track_event(); - return crimson::common::handle_system_shutdown([this] { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("{}: in repeat", *this); - - return interruptor::with_interruption([this]() mutable { - return enter_stage( - client_pp().wait_for_active - ).then_interruptible([this] { - return with_blocking_event([this] (auto&& trigger) { - return pg->wait_for_active_blocker.wait(std::move(trigger)); - }); - }).then_interruptible([this] { - return enter_stage( - client_pp().recover_missing); - }).then_interruptible([this] { - return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); - }).then_interruptible([this](bool unfound) { - if (unfound) { - throw std::system_error( - std::make_error_code(std::errc::operation_canceled), - fmt::format("{} is unfound, drop it!", get_target_oid())); - } - return enter_stage( - client_pp().get_obc); - }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("{}: getting obc lock", *this); - return seastar::do_with(create_osd_ops(), - [this](auto& osd_ops) mutable { - 
LOG_PREFIX(InternalClientRequest::start); - DEBUGI("InternalClientRequest: got {} OSDOps to execute", - std::size(osd_ops)); + LOG_PREFIX(InternalClientRequest::start); + DEBUGI("{}: in repeat", *this); + + return interruptor::with_interruption([this]() mutable { + return enter_stage( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage( + client_pp().recover_missing); + }).then_interruptible([this] { + return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); + }).then_interruptible([this](bool unfound) { + if (unfound) { + throw std::system_error( + std::make_error_code(std::errc::operation_canceled), + fmt::format("{} is unfound, drop it!", get_target_oid())); + } + return enter_stage( + client_pp().get_obc); + }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { + LOG_PREFIX(InternalClientRequest::start); + DEBUGI("{}: getting obc lock", *this); + return seastar::do_with( + create_osd_ops(), + [this](auto& osd_ops) mutable { + LOG_PREFIX(InternalClientRequest::start); + DEBUGI("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); [[maybe_unused]] const int ret = op_info.set_from_op( std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); assert(ret == 0); // call with_locked_obc() in order, but wait concurrently for loading. enter_stage_sync(client_pp().lock_obc); - return pg->with_locked_obc(get_target_oid(), op_info, - [&osd_ops, this](auto, auto obc) { - return enter_stage(client_pp().process - ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - return interruptor::now(); - }) - ); - }); - }); + return pg->with_locked_obc( + get_target_oid(), op_info, + [&osd_ops, this](auto, auto obc) { + return enter_stage(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this] { + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params() + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + return interruptor::now(); + }) + ); + }); + }); }); - }).si_then([this] { - logger().debug("{}: complete", *this); - return handle.complete(); - }).handle_error_interruptible( - PG::load_obc_ertr::all_same_way([] { - return seastar::now(); - }) - ); - }, [](std::exception_ptr eptr) { - return seastar::now(); - }, pg, start_epoch - - ).then([this] { - track_event(); - }).handle_exception_type([](std::system_error &error) { - logger().debug("error {}, message: {}", error.code(), error.what()); - return seastar::now(); - }).finally([this] { - logger().debug("{}: exit", *this); - handle.exit(); - }); + }).si_then([this] { + logger().debug("{}: complete", *this); + return handle.complete(); + }).handle_error_interruptible( + PG::load_obc_ertr::all_same_way([] { + return seastar::now(); + }) + ); + }, [](std::exception_ptr eptr) { + return seastar::now(); + }, 
pg, start_epoch).then([this] { + track_event(); + }).handle_exception_type([](std::system_error &error) { + logger().debug("error {}, message: {}", error.code(), error.what()); + return seastar::now(); + }).finally([this] { + logger().debug("{}: exit", *this); + handle.exit(); }); } From a091414c67ba9f1407c3756dd75ca2aa3b1074ac Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 11 Sep 2024 01:31:57 +0000 Subject: [PATCH 111/148] crimson/.../internal_client_request: factor out with_interruption Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 138 +++++++++--------- .../osd_operations/internal_client_request.h | 2 + 2 files changed, 74 insertions(+), 66 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index b1224f6e25942..d4213928a3e46 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -50,6 +50,77 @@ CommonPGPipeline& InternalClientRequest::client_pp() return pg->request_pg_pipeline; } +InternalClientRequest::interruptible_future<> +InternalClientRequest::with_interruption() +{ + return enter_stage( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage( + client_pp().recover_missing); + }).then_interruptible([this] { + return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); + }).then_interruptible([this](bool unfound) { + if (unfound) { + throw std::system_error( + std::make_error_code(std::errc::operation_canceled), + fmt::format("{} is unfound, drop it!", get_target_oid())); + } + return enter_stage( + client_pp().get_obc); + }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { + LOG_PREFIX(InternalClientRequest::with_interruption); + DEBUGI("{}: getting obc lock", *this); + return seastar::do_with( + create_osd_ops(), + [this](auto& osd_ops) mutable { + LOG_PREFIX(InternalClientRequest::with_interruption); + DEBUGI("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); + [[maybe_unused]] const int ret = op_info.set_from_op( + std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); + assert(ret == 0); + // call with_locked_obc() in order, but wait concurrently for loading. 
+ enter_stage_sync(client_pp().lock_obc); + return pg->with_locked_obc( + get_target_oid(), op_info, + [&osd_ops, this](auto, auto obc) { + return enter_stage(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this] { + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params() + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + return interruptor::now(); + }) + ); + }); + }); + }); + }).si_then([this] { + logger().debug("{}: complete", *this); + return handle.complete(); + }).handle_error_interruptible( + PG::load_obc_ertr::all_same_way([] { + return seastar::now(); + }) + ); +} + seastar::future<> InternalClientRequest::start() { track_event(); @@ -57,72 +128,7 @@ seastar::future<> InternalClientRequest::start() DEBUGI("{}: in repeat", *this); return interruptor::with_interruption([this]() mutable { - return enter_stage( - client_pp().wait_for_active - ).then_interruptible([this] { - return with_blocking_event([this] (auto&& trigger) { - return pg->wait_for_active_blocker.wait(std::move(trigger)); - }); - }).then_interruptible([this] { - return enter_stage( - client_pp().recover_missing); - }).then_interruptible([this] { - return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); - }).then_interruptible([this](bool unfound) { - if (unfound) { - throw std::system_error( - std::make_error_code(std::errc::operation_canceled), - fmt::format("{} is unfound, drop it!", get_target_oid())); - } - return enter_stage( - client_pp().get_obc); - }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("{}: getting obc lock", *this); - return seastar::do_with( - create_osd_ops(), - [this](auto& osd_ops) mutable { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("InternalClientRequest: got {} OSDOps to execute", - std::size(osd_ops)); - [[maybe_unused]] const int ret = op_info.set_from_op( - std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); - assert(ret == 0); - // call with_locked_obc() in order, but wait concurrently for loading. 
- enter_stage_sync(client_pp().lock_obc); - return pg->with_locked_obc( - get_target_oid(), op_info, - [&osd_ops, this](auto, auto obc) { - return enter_stage(client_pp().process - ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - return interruptor::now(); - }) - ); - }); - }); - }); - }).si_then([this] { - logger().debug("{}: complete", *this); - return handle.complete(); - }).handle_error_interruptible( - PG::load_obc_ertr::all_same_way([] { - return seastar::now(); - }) - ); + return with_interruption(); }, [](std::exception_ptr eptr) { return seastar::now(); }, pg, start_epoch).then([this] { diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index f198e58464338..2f3585013344d 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -41,6 +41,8 @@ class InternalClientRequest : public PhasedOperationT, CommonPGPipeline& client_pp(); + InternalClientRequest::interruptible_future<> with_interruption(); + seastar::future<> do_process(); Ref pg; From 238f3e573d48a082f49713cfa310110190ee521d Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 11 Sep 2024 21:16:51 +0000 Subject: [PATCH 112/148] crimson/.../internal_client_request: convert with_interruption to coroutine Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 123 +++++++++--------- 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index d4213928a3e46..d0aa0822f8030 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -53,72 +53,71 @@ CommonPGPipeline& InternalClientRequest::client_pp() InternalClientRequest::interruptible_future<> InternalClientRequest::with_interruption() { - return enter_stage( + LOG_PREFIX(InternalClientRequest::with_interruption); + co_await enter_stage( client_pp().wait_for_active - ).then_interruptible([this] { - return with_blocking_event([this] (auto&& trigger) { - return pg->wait_for_active_blocker.wait(std::move(trigger)); - }); - }).then_interruptible([this] { - return enter_stage( - client_pp().recover_missing); - }).then_interruptible([this] { - return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); - }).then_interruptible([this](bool unfound) { - if (unfound) { - throw std::system_error( - std::make_error_code(std::errc::operation_canceled), - fmt::format("{} is unfound, drop it!", get_target_oid())); - } - return enter_stage( - client_pp().get_obc); - }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { - LOG_PREFIX(InternalClientRequest::with_interruption); - DEBUGI("{}: getting obc lock", *this); - return seastar::do_with( - create_osd_ops(), - [this](auto& osd_ops) mutable { - LOG_PREFIX(InternalClientRequest::with_interruption); - DEBUGI("InternalClientRequest: got {} OSDOps to execute", - std::size(osd_ops)); - [[maybe_unused]] const int ret = op_info.set_from_op( - std::as_const(osd_ops), pg->get_pgid().pgid, 
*pg->get_osdmap()); - assert(ret == 0); - // call with_locked_obc() in order, but wait concurrently for loading. - enter_stage_sync(client_pp().lock_obc); - return pg->with_locked_obc( - get_target_oid(), op_info, - [&osd_ops, this](auto, auto obc) { - return enter_stage(client_pp().process - ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - return interruptor::now(); - }) - ); - }); - }); + ); + + co_await with_blocking_event([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + + co_await enter_stage(client_pp().recover_missing); + + bool unfound = co_await do_recover_missing( + pg, get_target_oid(), osd_reqid_t()); + + if (unfound) { + throw std::system_error( + std::make_error_code(std::errc::operation_canceled), + fmt::format("{} is unfound, drop it!", get_target_oid())); + } + co_await enter_stage( + client_pp().get_obc); + + DEBUGI("{}: getting obc lock", *this); + + auto osd_ops = create_osd_ops(); + + DEBUGI("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); + [[maybe_unused]] const int ret = op_info.set_from_op( + std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); + assert(ret == 0); + // call with_locked_obc() in order, but wait concurrently for loading. + enter_stage_sync(client_pp().lock_obc); + + auto fut = pg->with_locked_obc( + get_target_oid(), op_info, + [&osd_ops, this](auto, auto obc) { + return enter_stage(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this] { + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params() + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + return interruptor::now(); + }) + ); }); - }).si_then([this] { - logger().debug("{}: complete", *this); - return handle.complete(); }).handle_error_interruptible( - PG::load_obc_ertr::all_same_way([] { - return seastar::now(); - }) + crimson::ct_error::assert_all("unexpected error") ); + co_await std::move(fut); + + logger().debug("{}: complete", *this); + co_await interruptor::make_interruptible(handle.complete()); + co_return; } seastar::future<> InternalClientRequest::start() From 96c771383ae0458de68517f1e1f1757e27367d0d Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 13 Sep 2024 23:55:43 +0000 Subject: [PATCH 113/148] crimson: eliminate get_obc stage f90af12d introduced check_already_complete_get_obc to replace get_obc, but left get_obc and didn't update the other users. 
Signed-off-by: Samuel Just --- src/crimson/osd/osd_operation_external_tracking.h | 11 ----------- src/crimson/osd/osd_operations/client_request.h | 1 - src/crimson/osd/osd_operations/common/pg_pipeline.h | 3 --- .../osd/osd_operations/internal_client_request.cc | 2 +- .../osd/osd_operations/internal_client_request.h | 2 +- src/crimson/osd/osd_operations/snaptrim_event.cc | 2 +- src/crimson/osd/osd_operations/snaptrim_event.h | 2 +- 7 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h index 530732ba71028..d2786a95e4d3c 100644 --- a/src/crimson/osd/osd_operation_external_tracking.h +++ b/src/crimson/osd/osd_operation_external_tracking.h @@ -36,7 +36,6 @@ struct LttngBackend ClientRequest::PGPipeline::RecoverMissing:: BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend, - ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::Process::BlockingEvent::Backend, @@ -117,10 +116,6 @@ struct LttngBackend const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override { } - void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::GetOBC& blocker) override { - } void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev, const Operation& op, @@ -171,7 +166,6 @@ struct HistoricBackend ClientRequest::PGPipeline::RecoverMissing:: BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend, - ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::Process::BlockingEvent::Backend, @@ -252,11 +246,6 @@ struct HistoricBackend const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override { } - void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::GetOBC& blocker) override { - } - void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev, const Operation& op, const ClientRequest::PGPipeline::LockOBC& blocker) override { diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index ea7aade22ac75..f14e76504fcd6 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -104,7 +104,6 @@ class ClientRequest final : public PhasedOperationT, PGPipeline::RecoverMissing::BlockingEvent, scrub::PGScrubber::BlockingEvent, PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, - PGPipeline::GetOBC::BlockingEvent, PGPipeline::LockOBC::BlockingEvent, PGPipeline::Process::BlockingEvent, PGPipeline::WaitRepop::BlockingEvent, diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h index 2b2d03ae4b3ed..0146cb247945f 100644 --- a/src/crimson/osd/osd_operations/common/pg_pipeline.h +++ b/src/crimson/osd/osd_operations/common/pg_pipeline.h @@ -23,9 +23,6 @@ class CommonPGPipeline { struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT { static constexpr auto type_name = 
"CommonPGPipeline::check_already_complete_get_obc"; } check_already_complete_get_obc; - struct GetOBC : OrderedExclusivePhaseT { - static constexpr auto type_name = "CommonPGPipeline::get_obc"; - } get_obc; struct LockOBC : OrderedConcurrentPhaseT { static constexpr auto type_name = "CommonPGPipeline::lock_obc"; } lock_obc; diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index d0aa0822f8030..2bfa4296b2829 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -74,7 +74,7 @@ InternalClientRequest::with_interruption() fmt::format("{} is unfound, drop it!", get_target_oid())); } co_await enter_stage( - client_pp().get_obc); + client_pp().check_already_complete_get_obc); DEBUGI("{}: getting obc lock", *this); diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index 2f3585013344d..6e31ee993b9cb 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -58,7 +58,7 @@ class InternalClientRequest : public PhasedOperationT, CommonPGPipeline::WaitForActive::BlockingEvent, PGActivationBlocker::BlockingEvent, CommonPGPipeline::RecoverMissing::BlockingEvent, - CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, CommonPGPipeline::LockOBC::BlockingEvent, CommonPGPipeline::Process::BlockingEvent, CompletionEvent diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc index 7512b3d108dfc..9ed0b73cfb458 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.cc +++ b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -396,7 +396,7 @@ SnapTrimObjSubEvent::start() }); co_await enter_stage( - client_pp().get_obc); + client_pp().check_already_complete_get_obc); logger().debug("{}: getting obc for {}", *this, coid); // end of commonality diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h index 06d8f43c2f3c9..cdd82cdbf3086 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.h +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -170,7 +170,7 @@ class SnapTrimObjSubEvent : public PhasedOperationT { std::tuple< StartEvent, - CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, CommonPGPipeline::Process::BlockingEvent, CommonPGPipeline::WaitRepop::BlockingEvent, CompletionEvent From 7da7c3d736cebed2233ed836f53219b8dfe85047 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 16 Sep 2024 22:16:37 +0000 Subject: [PATCH 114/148] crimson/osd: move pipelines to osd_operation.h Each of the two existing pipelines are shared across multiple ops. Rather than defining them in a specific op or in osd_operations/common/pg_pipeline.h, just declare them in osd_operation.h. 
Signed-off-by: Samuel Just --- src/crimson/osd/osd_operation.h | 31 ++++++++++++++++ .../osd/osd_operations/client_request.h | 1 - .../osd/osd_operations/common/pg_pipeline.h | 37 ------------------- .../osd_operations/internal_client_request.h | 1 - .../osd/osd_operations/peering_event.h | 9 ----- .../osd/osd_operations/snaptrim_event.h | 1 - 6 files changed, 31 insertions(+), 49 deletions(-) delete mode 100644 src/crimson/osd/osd_operations/common/pg_pipeline.h diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h index fb0432edb8f9a..fd8b049c0bf08 100644 --- a/src/crimson/osd/osd_operation.h +++ b/src/crimson/osd/osd_operation.h @@ -40,6 +40,37 @@ struct PerShardPipeline { } create_or_wait_pg; }; +struct PGPeeringPipeline { + struct AwaitMap : OrderedExclusivePhaseT { + static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map"; + } await_map; + struct Process : OrderedExclusivePhaseT { + static constexpr auto type_name = "PeeringEvent::PGPipeline::process"; + } process; +}; + +struct CommonPGPipeline { + struct WaitForActive : OrderedExclusivePhaseT { + static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; + } wait_for_active; + struct RecoverMissing : OrderedConcurrentPhaseT { + static constexpr auto type_name = "CommonPGPipeline::recover_missing"; + } recover_missing; + struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT { + static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc"; + } check_already_complete_get_obc; + struct LockOBC : OrderedConcurrentPhaseT { + static constexpr auto type_name = "CommonPGPipeline::lock_obc"; + } lock_obc; + struct Process : OrderedExclusivePhaseT { + static constexpr auto type_name = "CommonPGPipeline::process"; + } process; + struct WaitRepop : OrderedConcurrentPhaseT { + static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; + } wait_repop; +}; + + enum class OperationTypeCode { client_request = 0, peering_event, diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index f14e76504fcd6..331cedaadfff2 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -14,7 +14,6 @@ #include "crimson/osd/osdmap_gate.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/osd_operations/client_request_common.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg_activation_blocker.h" #include "crimson/osd/pg_map.h" #include "crimson/osd/scrub/pg_scrubber.h" diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h deleted file mode 100644 index 0146cb247945f..0000000000000 --- a/src/crimson/osd/osd_operations/common/pg_pipeline.h +++ /dev/null @@ -1,37 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include "osd/osd_op_util.h" -#include "crimson/osd/osd_operation.h" - -namespace crimson::osd { - -class CommonPGPipeline { -protected: - friend class InternalClientRequest; - friend class SnapTrimEvent; - friend class SnapTrimObjSubEvent; - - struct WaitForActive : OrderedExclusivePhaseT { - static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; - } wait_for_active; - struct RecoverMissing : OrderedConcurrentPhaseT { - static constexpr auto type_name = "CommonPGPipeline::recover_missing"; - } recover_missing; - struct CheckAlreadyCompleteGetObc 
: OrderedExclusivePhaseT { - static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc"; - } check_already_complete_get_obc; - struct LockOBC : OrderedConcurrentPhaseT { - static constexpr auto type_name = "CommonPGPipeline::lock_obc"; - } lock_obc; - struct Process : OrderedExclusivePhaseT { - static constexpr auto type_name = "CommonPGPipeline::process"; - } process; - struct WaitRepop : OrderedConcurrentPhaseT { - static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; - } wait_repop; -}; - -} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index 6e31ee993b9cb..782fb809042a6 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -6,7 +6,6 @@ #include "crimson/common/type_helpers.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/osd_operations/client_request_common.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg.h" #include "crimson/osd/pg_activation_blocker.h" diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h index 1e6bd957289ff..85de5c711d67c 100644 --- a/src/crimson/osd/osd_operations/peering_event.h +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -23,15 +23,6 @@ class ShardServices; class PG; class BackfillRecovery; - struct PGPeeringPipeline { - struct AwaitMap : OrderedExclusivePhaseT { - static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map"; - } await_map; - struct Process : OrderedExclusivePhaseT { - static constexpr auto type_name = "PeeringEvent::PGPipeline::process"; - } process; - }; - template class PeeringEvent : public PhasedOperationT { T* that() { diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h index cdd82cdbf3086..1164b3169d293 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.h +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -9,7 +9,6 @@ #include "crimson/osd/osdmap_gate.h" #include "crimson/osd/osd_operation.h" #include "crimson/common/subop_blocker.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg.h" #include "crimson/osd/pg_activation_blocker.h" #include "osd/osd_types.h" From 0a83d956e546d7d04c55de34a788234533ed5293 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 19 Sep 2024 00:59:21 +0000 Subject: [PATCH 115/148] crimson: remove the eagain error from PG::do_osd_ops The idea here is that PG::do_osd_ops propogates an eagain after starting a repair upon encountering an eio to indicate that the op should restart from the top of ClientRequest::process_op. However, InternalClientRequest's handler for this error simply ignores it. ClientRequest's handling, while superficially reasonable, doesn't actually work. Re-calling process_op would mean reentering previous stages. This is problematic for at least a few reasons: 1. Reentering a prior stage with the same handler doesn't actually work since the corresponding event entries will already be populated. 2. There might be other ops on the same object waiting on the process stage. They'd need to be sent back as well in order to preserve ordering. Because this mechanism doesn't really seem to be fully baked, let's remove it for now and try to reintroduce it later after do_osd_ops[_execute] are a bit simpler. 
Signed-off-by: Samuel Just --- .../osd/osd_operations/client_request.cc | 23 +++++++++++++------ .../osd/osd_operations/client_request.h | 7 +----- .../osd_operations/internal_client_request.cc | 2 ++ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index 8e9a7c4d7490c..6eed04df6a5ac 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -403,11 +403,6 @@ ClientRequest::process_op( *pg, *this, this_instance_id); return do_process( ihref, pg, obc, this_instance_id - ).handle_error_interruptible( - crimson::ct_error::eagain::handle( - [this, pg, this_instance_id, &ihref]() mutable { - return process_op(ihref, pg, this_instance_id); - }) ); } ); @@ -437,7 +432,7 @@ ClientRequest::process_op( co_await std::move(process); } -ClientRequest::do_process_iertr::future<> +ClientRequest::interruptible_future<> ClientRequest::do_process( instance_handle_t &ihref, Ref pg, crimson::osd::ObjectContextRef obc, @@ -509,12 +504,26 @@ ClientRequest::do_process( auto [submitted, all_completed] = co_await pg->do_osd_ops( m, r_conn, obc, op_info, snapc + ).handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); + return std::make_tuple( + interruptor::now(), + PG::do_osd_ops_iertr::make_ready_future>()); + }) ); co_await std::move(submitted); co_await ihref.enter_stage(client_pp(*pg).wait_repop, *this); - auto reply = co_await std::move(all_completed); + auto reply = co_await std::move( + all_completed + ).handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); + return MURef(); + }) + ); co_await ihref.enter_stage(client_pp(*pg).send_reply, *this); DEBUGDPP("{}.{}: sending response", diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index 331cedaadfff2..6ee57e9874cd1 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -274,12 +274,7 @@ class ClientRequest final : public PhasedOperationT, interruptible_future<> with_sequencer(FuncT&& func); interruptible_future<> reply_op_error(const Ref& pg, int err); - - using do_process_iertr = - ::crimson::interruptible::interruptible_errorator< - ::crimson::osd::IOInterruptCondition, - ::crimson::errorator>; - do_process_iertr::future<> do_process( + interruptible_future<> do_process( instance_handle_t &ihref, Ref pg, crimson::osd::ObjectContextRef obc, diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index 2bfa4296b2829..dabff1a33bdb6 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -103,9 +103,11 @@ InternalClientRequest::with_interruption() [](auto submitted, auto all_completed) { return all_completed.handle_error_interruptible( crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); return seastar::now(); })); }, crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); return interruptor::now(); }) ); From a43452f47ee6f2eb7e2496ee242848acba8f8472 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 19 Sep 2024 23:58:48 +0000 Subject: [PATCH 116/148] crimson: OpsExecutor::flush_clone_metadata no longer needs to return a future Snapmapper updates happen during log commit 
now. Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.cc | 10 ++-------- src/crimson/osd/ops_executer.h | 4 ++-- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index 0a07fa7ee293e..9bf60140374c8 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -973,7 +973,7 @@ void OpsExecuter::CloningContext::apply_to( processed_obc.ssc->snapset = std::move(new_snapset); } -OpsExecuter::interruptible_future> +std::vector OpsExecuter::flush_clone_metadata( std::vector&& log_entries, SnapMapper& snap_mapper, @@ -981,7 +981,6 @@ OpsExecuter::flush_clone_metadata( ceph::os::Transaction& txn) { assert(!txn.empty()); - auto maybe_snap_mapped = interruptor::now(); update_clone_overlap(); if (cloning_ctx) { std::move(*cloning_ctx).apply_to(log_entries, *obc); @@ -993,12 +992,7 @@ OpsExecuter::flush_clone_metadata( } logger().debug("{} done, initial snapset={}, new snapset={}", __func__, obc->obs.oi.soid, obc->ssc->snapset); - return std::move( - maybe_snap_mapped - ).then_interruptible([log_entries=std::move(log_entries)]() mutable { - return interruptor::make_ready_future>( - std::move(log_entries)); - }); + return std::move(log_entries); } ObjectContextRef OpsExecuter::prepare_clone( diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index e25a035616edd..0b61f80b9983b 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -267,7 +267,7 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { */ void update_clone_overlap(); - interruptible_future> flush_clone_metadata( + std::vector flush_clone_metadata( std::vector&& log_entries, SnapMapper& snap_mapper, OSDriver& osdriver, @@ -510,7 +510,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( } if (want_mutate) { - auto log_entries = co_await flush_clone_metadata( + auto log_entries = flush_clone_metadata( prepare_transaction(ops), snap_mapper, osdriver, From 24b7b4f4b5d53927d5cc6689fd0ca1ec2276a5f3 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 20 Sep 2024 02:23:47 +0000 Subject: [PATCH 117/148] crimson: futures from flush_changes_n_do_ops_effects must not fail The return signature previously suggested that the second future returned could be an error. This seemed necessary due to how effects are handled: template OpsExecuter::rep_op_fut_t OpsExecuter::flush_changes_n_do_ops_effects( const std::vector& ops, SnapMapper& snap_mapper, OSDriver& osdriver, MutFunc mut_func) && { ... all_completed = std::move(all_completed).then_interruptible([this, pg=this->pg] { // let's do the cleaning of `op_effects` in destructor return interruptor::do_for_each(op_effects, [pg=std::move(pg)](auto& op_effect) { return op_effect->execute(pg); }); However, all of the actual execute implementations (created via OpsExecuter::with_effect_on_obc) return a bare seastar::future and cannot fail. In a larger sense, it's actually critical that neither future returned from flush_changes_n_do_ops_effects may fail -- they represent applying the transaction locally and remotely. If either portion fails, there would need to be an interval change to recover. 
Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 11 ++++--- src/crimson/osd/pg.cc | 53 +++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 0b61f80b9983b..185ead24e7550 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -179,7 +179,7 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { // should be used. struct effect_t { // an effect can affect PG, i.e. create a watch timeout - virtual osd_op_errorator::future<> execute(Ref pg) = 0; + virtual seastar::future<> execute(Ref pg) = 0; virtual ~effect_t() = default; }; @@ -400,7 +400,7 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { execute_op(OSDOp& osd_op); using rep_op_fut_tuple = - std::tuple, osd_op_ierrorator::future<>>; + std::tuple, interruptible_future<>>; using rep_op_fut_t = interruptible_future; template @@ -475,7 +475,7 @@ auto OpsExecuter::with_effect_on_obc( effect_func(std::move(effect_func)), obc(std::move(obc)) { } - osd_op_errorator::future<> execute(Ref pg) final { + seastar::future<> execute(Ref pg) final { return std::move(effect_func)(std::move(ctx), std::move(obc), std::move(pg)); @@ -502,8 +502,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( assert(obc); auto submitted = interruptor::now(); - auto all_completed = - interruptor::make_interruptible(osd_op_errorator::now()); + auto all_completed = interruptor::now(); if (cloning_ctx) { ceph_assert(want_mutate); @@ -536,7 +535,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( // need extra ref pg due to apply_stats() which can be executed after // informing snap mapper all_completed = - std::move(all_completed).safe_then_interruptible([this, pg=this->pg] { + std::move(all_completed).then_interruptible([this, pg=this->pg] { // let's do the cleaning of `op_effects` in destructor return interruptor::do_for_each(op_effects, [pg=std::move(pg)](auto& op_effect) { diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 97d48c1fa454c..8ab4e4e899b8e 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -999,6 +999,28 @@ PG::do_osd_ops_execute( ceph_osd_op_name(osd_op.op.op)); return ox->execute_op(osd_op); }).safe_then_interruptible([this, ox, &ops] { + /* flush_changes_n_do_ops_effects now returns + * + * interruptible_future< + * tuple, interruptible_future<>>> + * + * Previously, this lambda relied on the second element of that tuple to + * include OpsExecutor::osd_op_errorator in order to propogate the + * following three errors to the next callback. This is actually quite + * awkward as the second future is the completion future, which really + * cannot fail (for it to do so would require an interval change to + * correct). + * + * Rather than reworking this now, I'll leave it as is and refactor it + * later. + */ + using complete_iertr = crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + OpsExecuter::osd_op_errorator>; + using ret_t = std::tuple< + interruptible_future<>, + complete_iertr::future<>>; + logger().debug( "do_osd_ops_execute: object {} all operations successful", ox->get_target()); @@ -1014,22 +1036,22 @@ PG::do_osd_ops_execute( // they tried, they failed. 
logger().info(" full, replying to FULL_TRY op"); if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) - return interruptor::make_ready_future( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::edquot::make())); + return interruptor::make_ready_future( + interruptor::now(), + complete_iertr::future<>( + crimson::ct_error::edquot::make())); else - return interruptor::make_ready_future( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::enospc::make())); + return interruptor::make_ready_future( + interruptor::now(), + complete_iertr::future<>( + crimson::ct_error::enospc::make())); } else { // drop request logger().info(" full, dropping request (bad client)"); - return interruptor::make_ready_future( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::eagain::make())); + return interruptor::make_ready_future( + interruptor::now(), + complete_iertr::future<>( + crimson::ct_error::eagain::make())); } } return std::move(*ox).flush_changes_n_do_ops_effects( @@ -1049,7 +1071,12 @@ PG::do_osd_ops_execute( std::move(txn), std::move(osd_op_p), std::move(log_entries)); - }); + }).then_interruptible([](auto &&futs) { + auto &&[submitted, completed] = std::move(futs); + return interruptor::make_ready_future( + std::move(submitted), + std::move(completed)); + }); }).safe_then_unpack_interruptible( [success_func=std::move(success_func), rollbacker, this, failure_func_ptr, obc] (auto submitted_fut, auto _all_completed_fut) mutable { From 5e28a3bd3b58353ff29cf1cd1b9627575158c290 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 20 Sep 2024 12:56:17 -0700 Subject: [PATCH 118/148] crimson: introduce rollback_obc_if_modified without an error argument Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 185ead24e7550..6986f49ea08a7 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -551,6 +551,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( template struct OpsExecuter::RollbackHelper { + void rollback_obc_if_modified(); void rollback_obc_if_modified(const std::error_code& e); seastar::lw_shared_ptr ox; Func func; @@ -562,6 +563,33 @@ OpsExecuter::create_rollbacker(Func&& func) { return {shared_from_this(), std::forward(func)}; } +template +void OpsExecuter::RollbackHelper::rollback_obc_if_modified() +{ + // Oops, an operation had failed. do_osd_ops() altogether with + // OpsExecuter already dropped the ObjectStore::Transaction if + // there was any. However, this is not enough to completely + // rollback as we gave OpsExecuter the very single copy of `obc` + // we maintain and we did it for both reading and writing. + // Now all modifications must be reverted. + // + // The conditional's purpose is to efficiently handle hot errors + // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or + // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients + // typically append them before any write. If OpsExecuter hasn't + // seen any modifying operation, `obc` is supposed to be kept + // unchanged. 
+ assert(ox); + const auto need_rollback = ox->has_seen_write(); + crimson::get_logger(ceph_subsys_osd).debug( + "{}: object {} got error, need_rollback={}", + __func__, + ox->obc->get_oid(), + need_rollback); + if (need_rollback) { + func(ox->obc); + } +} template void OpsExecuter::RollbackHelper::rollback_obc_if_modified( From 7a826eb86c423e895345557632091a934f7c7d7e Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 19 Sep 2024 19:39:08 -0700 Subject: [PATCH 119/148] crimson: PG::submit_error_log returns eversion_t rather than optional It seems like the motivation here was to allow do_osd_ops_execute to communicate that it didn't submit an error log by making maybe_submit_error_log a std::optional. However, submit_error_log itself always returns a version. Fix submit_error_log and compensate in do_osd_ops_execute. Signed-off-by: Samuel Just --- src/crimson/osd/pg.cc | 10 +++++++--- src/crimson/osd/pg.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 8ab4e4e899b8e..26d1fa883bbce 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -1122,7 +1122,11 @@ PG::do_osd_ops_execute( if constexpr (!std::is_same_v) { if(op_info.may_write()) { maybe_submit_error_log = - submit_error_log(m, op_info, obc, e, rep_tid); + submit_error_log( + m, op_info, obc, e, rep_tid + ).then_interruptible([](auto &&e) { + return std::make_optional(std::move(e)); + }); } } return maybe_submit_error_log.then_interruptible( @@ -1175,7 +1179,7 @@ PG::interruptible_future<> PG::complete_error_log(const ceph_tid_t& rep_tid, return result; } -PG::interruptible_future> PG::submit_error_log( +PG::interruptible_future PG::submit_error_log( Ref m, const OpInfo &op_info, ObjectContextRef obc, @@ -1241,7 +1245,7 @@ PG::interruptible_future> PG::submit_error_log( get_collection_ref(), std::move(t) ).then([this] { peering_state.update_trim_to(); - return seastar::make_ready_future>(projected_last_update); + return seastar::make_ready_future(projected_last_update); }); }); }); diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index d8bbc56abcc46..5bd5c3aeff849 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -621,7 +621,7 @@ class PG : public boost::intrusive_ref_counter< void dump_primary(Formatter*); interruptible_future<> complete_error_log(const ceph_tid_t& rep_tid, const eversion_t& version); - interruptible_future> submit_error_log( + interruptible_future submit_error_log( Ref m, const OpInfo &op_info, ObjectContextRef obc, From 00057b45f03ae9864a83451b498b4e0239496785 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 14:10:06 -0700 Subject: [PATCH 120/148] crimson: introduce PG::run_executer,submit_executer These are intended to replace do_osd_ops*. The implementation is simpler and does not involve passing success and failure callbacks. It also moves responsibility for dealing with the MOSDOpReply and client related error handling over to ClientRequest. do_osd_op* will be removed once users are switched over. 
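For illustration, the intended caller pattern is roughly the following
sketch (the errorator-based error handling and the MOSDOpReply construction
are elided; the ClientRequest conversion later in this series is the
authoritative user, and pg/obc/op_info/m/conn/snapc stand in for the
caller's context):

  auto ox = seastar::make_lw_shared<OpsExecuter>(
    pg, obc, op_info, *m, conn, snapc);

  // phase 1: run the ops; client-visible errors surface here
  // (in the real caller this goes through handle_error_interruptible)
  co_await pg->run_executer(ox, obc, op_info, m->ops);

  // phase 2: flush and submit the transaction; neither returned
  // future is allowed to fail
  auto [submitted, all_completed] =
    co_await pg->submit_executer(std::move(ox), m->ops);
  co_await std::move(submitted);
  co_await std::move(all_completed);
  // the caller now builds and sends the reply itself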
Signed-off-by: Samuel Just --- src/crimson/osd/pg.cc | 79 +++++++++++++++++++++++++++++++++++++++++++ src/crimson/osd/pg.h | 27 +++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 26d1fa883bbce..bb5c1e9000baf 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -13,6 +13,9 @@ #include #include #include + +#include + #include "include/utime_fmt.h" #include "common/hobject.h" @@ -1251,6 +1254,82 @@ PG::interruptible_future PG::submit_error_log( }); } +PG::run_executer_fut PG::run_executer( + seastar::lw_shared_ptr ox, + ObjectContextRef obc, + const OpInfo &op_info, + std::vector& ops) +{ + LOG_PREFIX(PG::run_executer); + auto rollbacker = ox->create_rollbacker( + [stored_obc=duplicate_obc(obc)](auto &obc) mutable { + obc->update_from(*stored_obc); + }); + auto rollback_on_error = seastar::defer([&rollbacker] { + rollbacker.rollback_obc_if_modified(); + }); + + for (auto &op: ops) { + DEBUGDPP("object {} handle op {}", *this, ox->get_target(), op); + co_await ox->execute_op(op); + } + DEBUGDPP("object {} all operations successful", *this, ox->get_target()); + + // check for full + if ((ox->delta_stats.num_bytes > 0 || + ox->delta_stats.num_objects > 0) && + get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { + const auto& m = ox->get_message(); + if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now + m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { + INFODPP("full, but proceeding due to FULL_FORCE, or MDS", *this); + } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + // they tried, they failed. + INFODPP("full, replying to FULL_TRY op", *this); + if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) { + co_await run_executer_fut( + crimson::ct_error::edquot::make()); + } else { + co_await run_executer_fut( + crimson::ct_error::enospc::make()); + } + } else { + // drop request + INFODPP("full, dropping request (bad client)", *this); + co_await run_executer_fut( + crimson::ct_error::eagain::make()); + } + } + rollback_on_error.cancel(); +} + +PG::submit_executer_fut PG::submit_executer( + seastar::lw_shared_ptr ox, + const std::vector& ops) +{ + LOG_PREFIX(PG::submit_executer); + // transaction must commit at this point + return std::move( + *ox + ).flush_changes_n_do_ops_effects( + ops, + snap_mapper, + osdriver, + [FNAME, this](auto&& txn, + auto&& obc, + auto&& osd_op_p, + auto&& log_entries) { + DEBUGDPP("object {} submitting txn", *this, obc->get_oid()); + mutate_object(obc, txn, osd_op_p); + return submit_transaction( + std::move(obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries)); + }); +} + + PG::do_osd_ops_iertr::future>> PG::do_osd_ops( Ref m, diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 5bd5c3aeff849..c91f93171dbc1 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -645,6 +645,33 @@ class PG : public boost::intrusive_ref_counter< } } background_process_lock; + using run_executer_ertr = crimson::compound_errorator_t< + OpsExecuter::osd_op_errorator, + crimson::errorator< + crimson::ct_error::edquot, + crimson::ct_error::eagain, + crimson::ct_error::enospc + > + >; + using run_executer_iertr = crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + run_executer_ertr>; + using run_executer_fut = run_executer_iertr::future<>; + run_executer_fut run_executer( + seastar::lw_shared_ptr ox, + ObjectContextRef obc, + const OpInfo &op_info, + std::vector& ops); + + using submit_executer_ret = std::tuple< + 
interruptible_future<>, + interruptible_future<>>; + using submit_executer_fut = interruptible_future< + submit_executer_ret>; + submit_executer_fut submit_executer( + seastar::lw_shared_ptr ox, + const std::vector& ops); + using do_osd_ops_ertr = crimson::errorator< crimson::ct_error::eagain>; using do_osd_ops_iertr = From 304e20e9bcf6f29b0f0f22089665d78099265fec Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 15:15:48 -0700 Subject: [PATCH 121/148] crimson: switch ClientRequest::do_request to use *_executer rather than do_osd_ops Signed-off-by: Samuel Just --- .../osd/osd_operations/client_request.cc | 141 +++++++++++++++--- 1 file changed, 117 insertions(+), 24 deletions(-) diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index 6eed04df6a5ac..c226222fa0c75 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -502,36 +502,129 @@ ClientRequest::do_process( co_return; } - auto [submitted, all_completed] = co_await pg->do_osd_ops( - m, r_conn, obc, op_info, snapc - ).handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return std::make_tuple( - interruptor::now(), - PG::do_osd_ops_iertr::make_ready_future>()); + auto ox = seastar::make_lw_shared( + pg, obc, op_info, *m, r_conn, snapc); + auto ret = co_await pg->run_executer( + ox, obc, op_info, m->ops + ).si_then([]() -> std::optional { + return std::nullopt; + }).handle_error_interruptible(crimson::ct_error::all_same_way( + [](auto e) -> std::optional { + return e; }) ); - co_await std::move(submitted); - co_await ihref.enter_stage(client_pp(*pg).wait_repop, *this); + auto should_log_error = [](std::error_code e) -> bool { + switch (e.value()) { + case EDQUOT: + case ENOSPC: + case EAGAIN: + return false; + default: + return true; + } + }; - auto reply = co_await std::move( - all_completed - ).handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return MURef(); - }) - ); + if (ret && !should_log_error(*ret)) { + co_await reply_op_error(pg, -ret->value()); + co_return; + } + + { + auto all_completed = interruptor::now(); + if (ret) { + assert(should_log_error(*ret)); + if (op_info.may_write()) { + auto rep_tid = pg->shard_services.get_tid(); + auto version = co_await pg->submit_error_log( + m, op_info, obc, *ret, rep_tid); + + all_completed = pg->complete_error_log( + rep_tid, version); + } + // simply return the error below, leaving all_completed alone + } else { + auto submitted = interruptor::now(); + std::tie(submitted, all_completed) = co_await pg->submit_executer( + std::move(ox), m->ops); + co_await std::move(submitted); + } + co_await ihref.enter_stage(client_pp(*pg).wait_repop, *this); + + co_await std::move(all_completed); + } co_await ihref.enter_stage(client_pp(*pg).send_reply, *this); - DEBUGDPP("{}.{}: sending response", - *pg, *this, this_instance_id); - // TODO: gate the crosscore sending - co_await interruptor::make_interruptible( - get_foreign_connection().send_with_throttling(std::move(reply)) - ); + + if (ret) { + int err = -ret->value(); + DEBUGDPP("{}: replying with error {}", *pg, *this, err); + + auto reply = crimson::make_message( + m.get(), err, pg->get_osdmap_epoch(), 0, false); + + if (!m->ops.empty() && m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) { + reply->set_result(0); + } + + // For all ops except for CMPEXT, the correct error value is encoded + // in 
e. For CMPEXT, osdop.rval has the actual error value. + if (err == -ct_error::cmp_fail_error_value) { + assert(!m->ops.empty()); + for (auto &osdop : m->ops) { + if (osdop.rval < 0) { + reply->set_result(osdop.rval); + break; + } + } + } + + reply->set_enoent_reply_versions( + pg->peering_state.get_info().last_update, + pg->peering_state.get_info().last_user_version); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + + // TODO: gate the crosscore sending + co_await interruptor::make_interruptible( + get_foreign_connection().send_with_throttling(std::move(reply))); + } else { + int result = m->ops.empty() ? 0 : m->ops.back().rval.code; + if (op_info.may_read() && result >= 0) { + for (auto &osdop : m->ops) { + if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = osdop.rval.code; + break; + } + } + } else if (result > 0 && op_info.may_write() && !op_info.allows_returnvec()) { + result = 0; + } else if (result < 0 && + (m->ops.empty() ? + 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = 0; + } + auto reply = crimson::make_message( + m.get(), + result, + pg->get_osdmap_epoch(), + 0, + false); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + if (obc->obs.exists) { + reply->set_reply_versions(pg->peering_state.get_info().last_update, + obc->obs.oi.user_version); + } else { + reply->set_reply_versions(pg->peering_state.get_info().last_update, + pg->peering_state.get_info().last_user_version); + } + + DEBUGDPP("{}.{}: sending response {}", + *pg, *this, this_instance_id, *m); + // TODO: gate the crosscore sending + co_await interruptor::make_interruptible( + get_foreign_connection().send_with_throttling(std::move(reply)) + ); + } } bool ClientRequest::is_misdirected(const PG& pg) const From fc41fcb9d2a7c5b589ea68ad0644ac92d22fe761 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 22:30:59 +0000 Subject: [PATCH 122/148] crimson: factor out InternalClientRequest::do_process Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 44 +++++++++++-------- .../osd_operations/internal_client_request.h | 3 ++ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index dabff1a33bdb6..d0ee392ecb638 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -50,6 +50,30 @@ CommonPGPipeline& InternalClientRequest::client_pp() return pg->request_pg_pipeline; } +InternalClientRequest::interruptible_future<> +InternalClientRequest::do_process( + crimson::osd::ObjectContextRef obc, + std::vector &osd_ops) +{ + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params() + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "not handled"); + return interruptor::now(); + }) + ); +} + InternalClientRequest::interruptible_future<> InternalClientRequest::with_interruption() { @@ -93,24 +117,8 @@ InternalClientRequest::with_interruption() [&osd_ops, this](auto, auto obc) { return enter_stage(client_pp().process ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - 
osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return interruptor::now(); - }) - ); + [obc=std::move(obc), &osd_ops, this]() mutable { + return do_process(std::move(obc), osd_ops); }); }).handle_error_interruptible( crimson::ct_error::assert_all("unexpected error") diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index 782fb809042a6..6023db0a8dbe2 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -41,6 +41,9 @@ class InternalClientRequest : public PhasedOperationT, CommonPGPipeline& client_pp(); InternalClientRequest::interruptible_future<> with_interruption(); + InternalClientRequest::interruptible_future<> do_process( + crimson::osd::ObjectContextRef obc, + std::vector &osd_ops); seastar::future<> do_process(); From c091f3b2ab6a89762e6fcf5ccaa49b65c9ab6fca Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 22:43:35 +0000 Subject: [PATCH 123/148] crimson: convert InternalClientRequest::do_request to use *_executer rather than do_osd_ops* Signed-off-by: Samuel Just --- .../osd_operations/internal_client_request.cc | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index d0ee392ecb638..6ad447cf32ee4 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -55,23 +55,26 @@ InternalClientRequest::do_process( crimson::osd::ObjectContextRef obc, std::vector &osd_ops) { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - ceph_assert(0 == "not handled"); - return interruptor::now(); - }) + LOG_PREFIX(InternalClientRequest::do_process); + auto params = get_do_osd_ops_params(); + auto ox = seastar::make_lw_shared( + pg, obc, op_info, params, params.get_connection(), SnapContext{}); + co_await pg->run_executer( + ox, obc, op_info, osd_ops + ).handle_error_interruptible( + crimson::ct_error::all_same_way( + [this, FNAME](auto e) { + ERRORDPPI("{}: got unexpected error {}", *pg, *this, e); + ceph_assert(0 == "should not return an error"); + return interruptor::now(); + }) ); + + auto [submitted, completed] = co_await pg->submit_executer( + std::move(ox), osd_ops); + + co_await std::move(submitted); + co_await std::move(completed); } InternalClientRequest::interruptible_future<> From a0efff116cd038b08c0ce31a5c32c4b9df574088 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 10 Oct 2024 16:22:28 +0000 Subject: [PATCH 124/148] crimson: clarify ops_executer.h comment Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/crimson/osd/ops_executer.h 
b/src/crimson/osd/ops_executer.h index 6986f49ea08a7..3a7aaef7cd036 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -170,13 +170,9 @@ class OpsExecuter : public seastar::enable_lw_shared_from_this { object_stat_sum_t delta_stats; private: - // an operation can be divided into two stages: main and effect-exposing - // one. The former is performed immediately on call to `do_osd_op()` while - // the later on `submit_changes()` – after successfully processing main - // stages of all involved operations. When any stage fails, none of all - // scheduled effect-exposing stages will be executed. - // when operation requires this division, some variant of `with_effect()` - // should be used. + // with_effect can be used to schedule operations to be performed + // at commit time. effects will be discarded if the operation does + // not commit. struct effect_t { // an effect can affect PG, i.e. create a watch timeout virtual seastar::future<> execute(Ref pg) = 0; From 8f3ac965c310d80270e53644c56f3bca30511240 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2024 22:49:59 +0000 Subject: [PATCH 125/148] crimson: remove now unused PG::do_osd_ops* and log_reply Signed-off-by: Samuel Just --- src/crimson/osd/pg.cc | 308 ------------------------------------------ src/crimson/osd/pg.h | 35 +---- 2 files changed, 1 insertion(+), 342 deletions(-) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index bb5c1e9000baf..9cdd19d01332f 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -978,181 +978,6 @@ ObjectContextRef duplicate_obc(const ObjectContextRef &obc) { return object_context; } -template -PG::do_osd_ops_iertr::future> -PG::do_osd_ops_execute( - seastar::lw_shared_ptr ox, - ObjectContextRef obc, - const OpInfo &op_info, - Ref m, - std::vector& ops, - SuccessFunc&& success_func, - FailureFunc&& failure_func) -{ - assert(ox); - auto rollbacker = ox->create_rollbacker( - [object_context=duplicate_obc(obc)] (auto& obc) mutable { - obc->update_from(*object_context); - }); - auto failure_func_ptr = seastar::make_lw_shared(std::move(failure_func)); - return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) { - logger().debug( - "do_osd_ops_execute: object {} - handling op {}", - ox->get_target(), - ceph_osd_op_name(osd_op.op.op)); - return ox->execute_op(osd_op); - }).safe_then_interruptible([this, ox, &ops] { - /* flush_changes_n_do_ops_effects now returns - * - * interruptible_future< - * tuple, interruptible_future<>>> - * - * Previously, this lambda relied on the second element of that tuple to - * include OpsExecutor::osd_op_errorator in order to propogate the - * following three errors to the next callback. This is actually quite - * awkward as the second future is the completion future, which really - * cannot fail (for it to do so would require an interval change to - * correct). - * - * Rather than reworking this now, I'll leave it as is and refactor it - * later. 
- */ - using complete_iertr = crimson::interruptible::interruptible_errorator< - ::crimson::osd::IOInterruptCondition, - OpsExecuter::osd_op_errorator>; - using ret_t = std::tuple< - interruptible_future<>, - complete_iertr::future<>>; - - logger().debug( - "do_osd_ops_execute: object {} all operations successful", - ox->get_target()); - // check for full - if ((ox->delta_stats.num_bytes > 0 || - ox->delta_stats.num_objects > 0) && - get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { - const auto& m = ox->get_message(); - if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now - m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { - logger().info(" full, but proceeding due to FULL_FORCE or MDS"); - } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) { - // they tried, they failed. - logger().info(" full, replying to FULL_TRY op"); - if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) - return interruptor::make_ready_future( - interruptor::now(), - complete_iertr::future<>( - crimson::ct_error::edquot::make())); - else - return interruptor::make_ready_future( - interruptor::now(), - complete_iertr::future<>( - crimson::ct_error::enospc::make())); - } else { - // drop request - logger().info(" full, dropping request (bad client)"); - return interruptor::make_ready_future( - interruptor::now(), - complete_iertr::future<>( - crimson::ct_error::eagain::make())); - } - } - return std::move(*ox).flush_changes_n_do_ops_effects( - ops, - snap_mapper, - osdriver, - [this] (auto&& txn, - auto&& obc, - auto&& osd_op_p, - auto&& log_entries) { - logger().debug( - "do_osd_ops_execute: object {} submitting txn", - obc->get_oid()); - mutate_object(obc, txn, osd_op_p); - return submit_transaction( - std::move(obc), - std::move(txn), - std::move(osd_op_p), - std::move(log_entries)); - }).then_interruptible([](auto &&futs) { - auto &&[submitted, completed] = std::move(futs); - return interruptor::make_ready_future( - std::move(submitted), - std::move(completed)); - }); - }).safe_then_unpack_interruptible( - [success_func=std::move(success_func), rollbacker, this, failure_func_ptr, obc] - (auto submitted_fut, auto _all_completed_fut) mutable { - - auto all_completed_fut = _all_completed_fut.safe_then_interruptible_tuple( - std::move(success_func), - crimson::ct_error::object_corrupted::handle( - [rollbacker, this, obc] (const std::error_code& e) mutable { - // this is a path for EIO. it's special because we want to fix the obejct - // and try again. that is, the layer above `PG::do_osd_ops` is supposed to - // restart the execution. 
- rollbacker.rollback_obc_if_modified(e); - return repair_object(obc->obs.oi.soid, - obc->obs.oi.version - ).then_interruptible([] { - return do_osd_ops_iertr::future{crimson::ct_error::eagain::make()}; - }); - }), OpsExecuter::osd_op_errorator::all_same_way( - [rollbacker, failure_func_ptr] - (const std::error_code& e) mutable { - // handle non-fatal errors only - ceph_assert(e.value() == EDQUOT || - e.value() == ENOSPC || - e.value() == EAGAIN); - rollbacker.rollback_obc_if_modified(e); - return (*failure_func_ptr)(e); - })); - - return PG::do_osd_ops_iertr::make_ready_future>( - std::move(submitted_fut), - std::move(all_completed_fut) - ); - }, OpsExecuter::osd_op_errorator::all_same_way( - [this, op_info, m, obc, - rollbacker, failure_func_ptr] - (const std::error_code& e) mutable { - ceph_tid_t rep_tid = shard_services.get_tid(); - rollbacker.rollback_obc_if_modified(e); - // record error log - auto maybe_submit_error_log = - interruptor::make_ready_future>(std::nullopt); - // call submit_error_log only for non-internal clients - if constexpr (!std::is_same_v) { - if(op_info.may_write()) { - maybe_submit_error_log = - submit_error_log( - m, op_info, obc, e, rep_tid - ).then_interruptible([](auto &&e) { - return std::make_optional(std::move(e)); - }); - } - } - return maybe_submit_error_log.then_interruptible( - [this, failure_func_ptr, e, rep_tid] (auto version) { - auto all_completed = - [this, failure_func_ptr, e, rep_tid, version] { - if (version.has_value()) { - return complete_error_log(rep_tid, version.value() - ).then_interruptible([failure_func_ptr, e] { - return (*failure_func_ptr)(e); - }); - } else { - return (*failure_func_ptr)(e); - } - }; - return PG::do_osd_ops_iertr::make_ready_future>( - std::move(seastar::now()), - std::move(all_completed()) - ); - }); - })); -} - PG::interruptible_future<> PG::complete_error_log(const ceph_tid_t& rep_tid, const eversion_t& version) { @@ -1329,139 +1154,6 @@ PG::submit_executer_fut PG::submit_executer( }); } - -PG::do_osd_ops_iertr::future>> -PG::do_osd_ops( - Ref m, - crimson::net::ConnectionXcoreRef conn, - ObjectContextRef obc, - const OpInfo &op_info, - const SnapContext& snapc) -{ - if (__builtin_expect(stopping, false)) { - throw crimson::common::system_shutdown_exception(); - } - return do_osd_ops_execute>( - seastar::make_lw_shared( - Ref{this}, obc, op_info, *m, conn, snapc), - obc, - op_info, - m, - m->ops, - // success_func - [this, m, obc, may_write = op_info.may_write(), - may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] { - // TODO: should stop at the first op which returns a negative retval, - // cmpext uses it for returning the index of first unmatched byte - int result = m->ops.empty() ? 0 : m->ops.back().rval.code; - if (may_read && result >= 0) { - for (auto &osdop : m->ops) { - if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { - result = osdop.rval.code; - break; - } - } - } else if (result > 0 && may_write && !rvec) { - result = 0; - } else if (result < 0 && (m->ops.empty() ? 
- 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { - result = 0; - } - auto reply = crimson::make_message(m.get(), - result, - get_osdmap_epoch(), - 0, - false); - reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); - logger().debug( - "do_osd_ops: {} - object {} sending reply", - *m, - m->get_hobj()); - if (obc->obs.exists) { - reply->set_reply_versions(peering_state.get_info().last_update, - obc->obs.oi.user_version); - } else { - reply->set_reply_versions(peering_state.get_info().last_update, - peering_state.get_info().last_user_version); - } - return do_osd_ops_iertr::make_ready_future>( - std::move(reply)); - }, - // failure_func - [m, this] - (const std::error_code& e) { - logger().error("do_osd_ops_execute::failure_func {} got error: {}", - *m, e); - return log_reply(m, e); - }); -} - -PG::do_osd_ops_iertr::future> -PG::log_reply( - Ref m, - const std::error_code& e) -{ - auto reply = crimson::make_message( - m.get(), -e.value(), get_osdmap_epoch(), 0, false); - if (m->ops.empty() ? 0 : - m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) { - reply->set_result(0); - } - // For all ops except for CMPEXT, the correct error value is encoded - // in e.value(). For CMPEXT, osdop.rval has the actual error value. - if (e.value() == ct_error::cmp_fail_error_value) { - assert(!m->ops.empty()); - for (auto &osdop : m->ops) { - if (osdop.rval < 0) { - reply->set_result(osdop.rval); - break; - } - } - } - reply->set_enoent_reply_versions( - peering_state.get_info().last_update, - peering_state.get_info().last_user_version); - reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); - return do_osd_ops_iertr::make_ready_future>( - std::move(reply)); -} - -PG::do_osd_ops_iertr::future> -PG::do_osd_ops( - ObjectContextRef obc, - std::vector& ops, - const OpInfo &op_info, - const do_osd_ops_params_t &&msg_params) -{ - // This overload is generally used for internal client requests, - // use an empty SnapContext. 
- return seastar::do_with( - std::move(msg_params), - [=, this, &ops, &op_info](auto &msg_params) { - return do_osd_ops_execute( - seastar::make_lw_shared( - Ref{this}, - obc, - op_info, - msg_params, - msg_params.get_connection(), - SnapContext{} - ), - obc, - op_info, - Ref(), - ops, - // success_func - [] { - return do_osd_ops_iertr::now(); - }, - // failure_func - [] (const std::error_code& e) { - return do_osd_ops_iertr::now(); - }); - }); -} - PG::interruptible_future> PG::do_pg_ops(Ref m) { if (__builtin_expect(stopping, false)) { diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index c91f93171dbc1..3a8ddad922a50 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -672,41 +672,8 @@ class PG : public boost::intrusive_ref_counter< seastar::lw_shared_ptr ox, const std::vector& ops); - using do_osd_ops_ertr = crimson::errorator< - crimson::ct_error::eagain>; - using do_osd_ops_iertr = - ::crimson::interruptible::interruptible_errorator< - ::crimson::osd::IOInterruptCondition, - ::crimson::errorator>; - template - using pg_rep_op_fut_t = - std::tuple, - do_osd_ops_iertr::future>; - do_osd_ops_iertr::future>> do_osd_ops( - Ref m, - crimson::net::ConnectionXcoreRef conn, - ObjectContextRef obc, - const OpInfo &op_info, - const SnapContext& snapc); - struct do_osd_ops_params_t; - do_osd_ops_iertr::future> log_reply( - Ref m, - const std::error_code& e); - do_osd_ops_iertr::future> do_osd_ops( - ObjectContextRef obc, - std::vector& ops, - const OpInfo &op_info, - const do_osd_ops_params_t &¶ms); - template - do_osd_ops_iertr::future> do_osd_ops_execute( - seastar::lw_shared_ptr ox, - ObjectContextRef obc, - const OpInfo &op_info, - Ref m, - std::vector& ops, - SuccessFunc&& success_func, - FailureFunc&& failure_func); + interruptible_future> do_pg_ops(Ref m); interruptible_future< std::tuple, interruptible_future<>>> From 7ac64b0b245798b1d4a85b1da86497d2baf2bceb Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 1 Oct 2024 13:05:03 -0700 Subject: [PATCH 126/148] crimson: OpsExecuter no longer needs to be a lw shared ptr ClientRequest and InternalClientRequest can declare them as auto variables. Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 6 ++--- .../osd/osd_operations/client_request.cc | 3 +-- .../osd_operations/internal_client_request.cc | 2 +- src/crimson/osd/pg.cc | 23 +++++++++---------- src/crimson/osd/pg.h | 4 ++-- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 3a7aaef7cd036..068f510d1ef82 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -40,7 +40,7 @@ namespace crimson::osd { class PG; // OpsExecuter -- a class for executing ops targeting a certain object. 
-class OpsExecuter : public seastar::enable_lw_shared_from_this { +class OpsExecuter { friend class SnapTrimObjSubEvent; using call_errorator = crimson::errorator< @@ -549,14 +549,14 @@ template struct OpsExecuter::RollbackHelper { void rollback_obc_if_modified(); void rollback_obc_if_modified(const std::error_code& e); - seastar::lw_shared_ptr ox; + OpsExecuter *ox; Func func; }; template inline OpsExecuter::RollbackHelper OpsExecuter::create_rollbacker(Func&& func) { - return {shared_from_this(), std::forward(func)}; + return {this, std::forward(func)}; } template diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index c226222fa0c75..a89fb2c84bc56 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -502,8 +502,7 @@ ClientRequest::do_process( co_return; } - auto ox = seastar::make_lw_shared( - pg, obc, op_info, *m, r_conn, snapc); + OpsExecuter ox(pg, obc, op_info, *m, r_conn, snapc); auto ret = co_await pg->run_executer( ox, obc, op_info, m->ops ).si_then([]() -> std::optional { diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index 6ad447cf32ee4..9e5867caf8067 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -57,7 +57,7 @@ InternalClientRequest::do_process( { LOG_PREFIX(InternalClientRequest::do_process); auto params = get_do_osd_ops_params(); - auto ox = seastar::make_lw_shared( + OpsExecuter ox( pg, obc, op_info, params, params.get_connection(), SnapContext{}); co_await pg->run_executer( ox, obc, op_info, osd_ops diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 9cdd19d01332f..744a1dbc02b97 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -1080,13 +1080,13 @@ PG::interruptible_future PG::submit_error_log( } PG::run_executer_fut PG::run_executer( - seastar::lw_shared_ptr ox, + OpsExecuter &ox, ObjectContextRef obc, const OpInfo &op_info, std::vector& ops) { LOG_PREFIX(PG::run_executer); - auto rollbacker = ox->create_rollbacker( + auto rollbacker = ox.create_rollbacker( [stored_obc=duplicate_obc(obc)](auto &obc) mutable { obc->update_from(*stored_obc); }); @@ -1095,16 +1095,16 @@ PG::run_executer_fut PG::run_executer( }); for (auto &op: ops) { - DEBUGDPP("object {} handle op {}", *this, ox->get_target(), op); - co_await ox->execute_op(op); + DEBUGDPP("object {} handle op {}", *this, ox.get_target(), op); + co_await ox.execute_op(op); } - DEBUGDPP("object {} all operations successful", *this, ox->get_target()); + DEBUGDPP("object {} all operations successful", *this, ox.get_target()); // check for full - if ((ox->delta_stats.num_bytes > 0 || - ox->delta_stats.num_objects > 0) && + if ((ox.delta_stats.num_bytes > 0 || + ox.delta_stats.num_objects > 0) && get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { - const auto& m = ox->get_message(); + const auto& m = ox.get_message(); if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { INFODPP("full, but proceeding due to FULL_FORCE, or MDS", *this); @@ -1129,13 +1129,12 @@ PG::run_executer_fut PG::run_executer( } PG::submit_executer_fut PG::submit_executer( - seastar::lw_shared_ptr ox, - const std::vector& ops) -{ + OpsExecuter &&ox, + const std::vector& ops) { LOG_PREFIX(PG::submit_executer); // transaction must commit at this point return std::move( - *ox + ox 
).flush_changes_n_do_ops_effects( ops, snap_mapper, diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 3a8ddad922a50..604f49005ff04 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -658,7 +658,7 @@ class PG : public boost::intrusive_ref_counter< run_executer_ertr>; using run_executer_fut = run_executer_iertr::future<>; run_executer_fut run_executer( - seastar::lw_shared_ptr ox, + OpsExecuter &ox, ObjectContextRef obc, const OpInfo &op_info, std::vector& ops); @@ -669,7 +669,7 @@ class PG : public boost::intrusive_ref_counter< using submit_executer_fut = interruptible_future< submit_executer_ret>; submit_executer_fut submit_executer( - seastar::lw_shared_ptr ox, + OpsExecuter &&ox, const std::vector& ops); struct do_osd_ops_params_t; From 2b562b64a64777b1428e9ad3187b50619cbf1a4d Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 1 Oct 2024 13:11:31 -0700 Subject: [PATCH 127/148] crimson: remove unused OpsExecuter::rollback_obc_if_modified overload Signed-off-by: Samuel Just --- src/crimson/osd/ops_executer.h | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 068f510d1ef82..e770e825b32d0 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -548,7 +548,6 @@ OpsExecuter::flush_changes_n_do_ops_effects( template struct OpsExecuter::RollbackHelper { void rollback_obc_if_modified(); - void rollback_obc_if_modified(const std::error_code& e); OpsExecuter *ox; Func func; }; @@ -587,36 +586,6 @@ void OpsExecuter::RollbackHelper::rollback_obc_if_modified() } } -template -void OpsExecuter::RollbackHelper::rollback_obc_if_modified( - const std::error_code& e) -{ - // Oops, an operation had failed. do_osd_ops() altogether with - // OpsExecuter already dropped the ObjectStore::Transaction if - // there was any. However, this is not enough to completely - // rollback as we gave OpsExecuter the very single copy of `obc` - // we maintain and we did it for both reading and writing. - // Now all modifications must be reverted. - // - // The conditional's purpose is to efficiently handle hot errors - // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or - // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients - // typically append them before any write. If OpsExecuter hasn't - // seen any modifying operation, `obc` is supposed to be kept - // unchanged. - assert(ox); - const auto need_rollback = ox->has_seen_write(); - crimson::get_logger(ceph_subsys_osd).debug( - "{}: object {} got error {}, need_rollback={}", - __func__, - ox->obc->get_oid(), - e, - need_rollback); - if (need_rollback) { - func(ox->obc); - } -} - // PgOpsExecuter -- a class for executing ops targeting a certain PG. 
class PgOpsExecuter { template From e036fde7e4b03241e617cbb3ef8f19a703aae716 Mon Sep 17 00:00:00 2001 From: Dnyaneshwari Date: Mon, 14 Oct 2024 09:56:45 +0530 Subject: [PATCH 128/148] mgr/dashboard: The subvolumes are missing from the dropdown menu on the "Create NFS export" page Fixes: https://tracker.ceph.com/issues/68519 Signed-off-by: Dnyaneshwari Talwekar --- .../src/app/ceph/nfs/nfs-form/nfs-form.component.html | 5 ----- .../frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html index 1a73490175db7..0da4913e9b8a4 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html @@ -106,7 +106,6 @@ [invalid]="nfsForm.controls.fsal.controls.user_id.invalid && (nfsForm.controls.fsal.controls.user_id.dirty)" [invalidText]="userIdError" [skeleton]="allRGWUsers === null" - (valueChange)="pathChangeHandler()" i18n> @@ -223,8 +222,6 @@ name="path" formControlName="path" [ngbTypeahead]="pathDataSource" - (selectItem)="pathChangeHandler()" - (blur)="pathChangeHandler()" [invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)"> @@ -259,8 +256,6 @@ name="path" formControlName="path" [ngbTypeahead]="bucketDataSource" - (selectItem)="pathChangeHandler()" - (blur)="pathChangeHandler()" [invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)"> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts index 2317671b02238..d502524256ee9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts @@ -434,7 +434,7 @@ export class NfsFormComponent extends CdForm implements OnInit { fs_name: this.selectedFsName } }); - this.volumeChangeHandler(); + this.getSubVolGrp(this.selectedFsName); } if (!_.isEmpty(this.selectedSubvolGroup)) { this.nfsForm.patchValue({ From 8443821073b23946a32009106c45581db0d51e8f Mon Sep 17 00:00:00 2001 From: Anoop C S Date: Thu, 29 Aug 2024 11:53:44 +0530 Subject: [PATCH 129/148] client: Fix symlink open with O_PATH and O_NOFOLLOW man open(2)[1] says the following for O_PATH: . . . If pathname is a symbolic link and the O_NOFOLLOW flag is also specified, then the call returns a file descriptor referring to the symbolic link. This file descriptor can be used as the dirfd argument in calls to fchownat(2), fstatat(2), linkat(2), and readlinkat(2) with an empty pathname to have the calls operate on the symbolic link. . . . symlink check within may_open() failed to consider the O_PATH flag resulting in a ELOOP error to the client. In order to return a valid file descriptor we introduce a check for the presence of O_PATH in the client provided flags. 
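For illustration, the resulting client-side usage looks roughly like the
following (modelled on the libcephfs test added below; error handling is
omitted and the path is a placeholder):

  // opening a symlink with O_PATH|O_NOFOLLOW now yields a usable fd
  // instead of failing with ELOOP
  int fd = ceph_open(cmount, "/dir/symlink", O_PATH | O_NOFOLLOW, 0);
  if (fd >= 0) {
    // fd refers to the symlink itself and can be passed as dirfd to the
    // *at() calls, e.g. ceph_readlinkat() with an empty pathname
    ceph_close(cmount, fd);
  }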
Fixes: https://tracker.ceph.com/issues/67833 [1] https://www.man7.org/linux/man-pages/man2/open.2.html Signed-off-by: Anoop C S --- src/client/Client.cc | 4 ++++ src/test/libcephfs/test.cc | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/src/client/Client.cc b/src/client/Client.cc index 6577dd575f1fd..9c6785fe65e5a 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -6125,6 +6125,10 @@ int Client::may_open(Inode *in, int flags, const UserPerm& perms) int r = 0; switch (in->mode & S_IFMT) { case S_IFLNK: +#if defined(__linux__) && defined(O_PATH) + if (flags & O_PATH) + break; +#endif r = -CEPHFS_ELOOP; goto out; case S_IFDIR: diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc index f2c87168633bb..b51689ab2637e 100644 --- a/src/test/libcephfs/test.cc +++ b/src/test/libcephfs/test.cc @@ -976,6 +976,13 @@ TEST(LibCephFS, Symlinks) { fd = ceph_open(cmount, test_symlink, O_NOFOLLOW, 0); ASSERT_EQ(fd, -CEPHFS_ELOOP); +#if defined(__linux__) && defined(O_PATH) + // test the O_NOFOLLOW with O_PATH case + fd = ceph_open(cmount, test_symlink, O_PATH|O_NOFOLLOW, 0); + ASSERT_GT(fd, 0); + ceph_close(cmount, fd); +#endif /* __linux */ + // stat the original file struct ceph_statx stx_orig; ASSERT_EQ(ceph_statx(cmount, test_file, &stx_orig, CEPH_STATX_ALL_STATS, 0), 0); From 24f453dd39c25e00527d0ed0a1e9fefa6295999b Mon Sep 17 00:00:00 2001 From: Anoop C S Date: Tue, 27 Aug 2024 15:50:44 +0530 Subject: [PATCH 130/148] client: Resolve symlink from dirfd for empty pathname man readlinkat(2)[1] points at a special case for readlinkat() syscall as follows: . . . Since Linux 2.6.39, pathname can be an empty string, in which case the call operates on the symbolic link referred to by dirfd (which should have been obtained using open(2) with the O_PATH and O_NOFOLLOW flags). . . . man open(2)[2] further explains the need for such a special case when a symlink is opened with O_PATH and O_NOFOLLOW: . . . If pathname is a symbolic link and the O_NOFOLLOW flag is also specified, then the call returns a file descriptor referring to the symbolic link. This file descriptor can be used as the dirfd argument in calls to fchownat(2), fstatat(2), linkat(2), and readlinkat(2) with an empty pathname to have the calls operate on the symbolic link. . . . Accordingly have a check to resolve symlinks out of dirfd when empty pathnames are encountered within readlinkat(). In addition to that match the standard file system behavior to return ENOENT instead of EINVAL when the inode pointed to by dirfd is not a symbolic link with empty pathnames. 
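For illustration, the usage enabled by this change is roughly the
following (mirroring the libcephfs test added below; buffer sizing and
error checks are kept minimal and the path is a placeholder):

  int fd = ceph_open(cmount, "/dir/symlink", O_PATH | O_NOFOLLOW, 0);
  char target[256];
  // empty pathname: operate on the symbolic link referred to by fd itself
  int n = ceph_readlinkat(cmount, fd, "", target, sizeof(target) - 1);
  if (n >= 0)
    target[n] = '\0';
  ceph_close(cmount, fd);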
Fixes: https://tracker.ceph.com/issues/67833 [1] https://www.man7.org/linux/man-pages/man2/readlinkat.2.html [2] https://www.man7.org/linux/man-pages/man2/open.2.html Signed-off-by: Anoop C S --- src/client/Client.cc | 6 ++++++ src/test/libcephfs/test.cc | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/src/client/Client.cc b/src/client/Client.cc index 9c6785fe65e5a..f8373095b38c1 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -7957,6 +7957,12 @@ int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, c return r; } + if (!strcmp(relpath, "")) { + if (!dirinode.get()->is_symlink()) + return -CEPHFS_ENOENT; + return _readlink(dirinode.get(), buf, size); + } + InodeRef in; filepath path(relpath); r = path_walk(path, &in, perms, false, 0, dirinode); diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc index b51689ab2637e..6f10d2bbd4e06 100644 --- a/src/test/libcephfs/test.cc +++ b/src/test/libcephfs/test.cc @@ -3019,6 +3019,18 @@ TEST(LibCephFS, Readlinkat) { ASSERT_EQ(0, memcmp(target, rel_file_path, target_len)); ASSERT_EQ(0, ceph_close(cmount, fd)); +#if defined(__linux__) && defined(O_PATH) + // test readlinkat with empty pathname relative to O_PATH|O_NOFOLLOW fd + fd = ceph_open(cmount, link_path, O_PATH | O_NOFOLLOW, 0); + ASSERT_LE(0, fd); + size_t link_target_len = strlen(rel_file_path); + char link_target[link_target_len+1]; + ASSERT_EQ(link_target_len, ceph_readlinkat(cmount, fd, "", link_target, link_target_len)); + link_target[link_target_len] = '\0'; + ASSERT_EQ(0, memcmp(link_target, rel_file_path, link_target_len)); + ASSERT_EQ(0, ceph_close(cmount, fd)); +#endif /* __linux */ + ASSERT_EQ(0, ceph_unlink(cmount, link_path)); ASSERT_EQ(0, ceph_unlink(cmount, file_path)); ASSERT_EQ(0, ceph_rmdir(cmount, dir_path)); From 0be8d01c9ddde0d7d24edd34dc75f6cfc861b5ba Mon Sep 17 00:00:00 2001 From: Milind Changire Date: Fri, 27 Sep 2024 16:10:22 +0530 Subject: [PATCH 131/148] log: thread name save/fetch infra * pthread name is saved in a thread_local storage * the thread_local name is copied into Entry object's ctor * Log::dump_recent() reads the thread name from the Entry object's data member when dumping logs Fixes: https://tracker.ceph.com/issues/50743 Signed-off-by: Milind Changire --- src/common/Thread.cc | 4 ++-- src/common/Thread.h | 8 +++++++- src/log/Entry.h | 10 +++++++++- src/log/Log.cc | 11 ++++------- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/common/Thread.cc b/src/common/Thread.cc index 9a7a31923c1b7..3903e8c0ed721 100644 --- a/src/common/Thread.cc +++ b/src/common/Thread.cc @@ -83,7 +83,7 @@ void *Thread::entry_wrapper() if (pid && cpuid >= 0) _set_affinity(cpuid); - ceph_pthread_setname(pthread_self(), thread_name.c_str()); + ceph_pthread_setname(pthread_self(), Thread::thread_name.c_str()); return entry(); } @@ -154,7 +154,7 @@ int Thread::try_create(size_t stacksize) void Thread::create(const char *name, size_t stacksize) { ceph_assert(strlen(name) < 16); - thread_name = name; + Thread::thread_name = name; int ret = try_create(stacksize); if (ret != 0) { diff --git a/src/common/Thread.h b/src/common/Thread.h index 5242fb5f30758..d3892c1b36b71 100644 --- a/src/common/Thread.h +++ b/src/common/Thread.h @@ -20,11 +20,14 @@ #include #include #include +#include #include #include +#include "include/ceph_assert.h" #include "include/compat.h" +#include "include/spinlock.h" extern pid_t ceph_gettid(); @@ -33,7 +36,7 @@ class Thread { pthread_t thread_id; pid_t pid; int cpuid; - 
std::string thread_name; + static inline thread_local std::string thread_name; void *entry_wrapper(); @@ -61,6 +64,9 @@ class Thread { int join(void **prval = 0); int detach(); int set_affinity(int cpuid); + static const std::string get_thread_name() { + return Thread::thread_name; + } }; // Functions for with std::thread diff --git a/src/log/Entry.h b/src/log/Entry.h index 3677c8eb95180..db39eca0ef3ba 100644 --- a/src/log/Entry.h +++ b/src/log/Entry.h @@ -4,9 +4,12 @@ #ifndef __CEPH_LOG_ENTRY_H #define __CEPH_LOG_ENTRY_H +#include "include/compat.h" + #include "log/LogClock.h" #include "common/StackStringStream.h" +#include "common/Thread.h" #include "boost/container/small_vector.hpp" @@ -14,6 +17,7 @@ #include + namespace ceph { namespace logging { @@ -27,7 +31,10 @@ class Entry { m_thread(pthread_self()), m_prio(pr), m_subsys(sub) - {} + { + strncpy(m_thread_name, Thread::get_thread_name().data(), 16); + m_thread_name[15] = '\0'; + } Entry(const Entry &) = default; Entry& operator=(const Entry &) = default; Entry(Entry &&e) = default; @@ -40,6 +47,7 @@ class Entry { time m_stamp; pthread_t m_thread; short m_prio, m_subsys; + char m_thread_name[16]; static log_clock& clock() { static log_clock clock; diff --git a/src/log/Log.cc b/src/log/Log.cc index 69f6df82ecbb7..49dd03c06c096 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -493,13 +493,13 @@ void Log::dump_recent() _flush(m_flush, false); _log_message("--- begin dump of recent events ---", true); - std::set recent_pthread_ids; + std::set> recent_pthread_ids; { EntryVector t; t.insert(t.end(), std::make_move_iterator(m_recent.begin()), std::make_move_iterator(m_recent.end())); m_recent.clear(); for (const auto& e : t) { - recent_pthread_ids.emplace(e.m_thread); + recent_pthread_ids.emplace(std::make_pair(e.m_thread, e.m_thread_name)); } _flush(t, true); } @@ -515,14 +515,11 @@ void Log::dump_recent() m_stderr_log, m_stderr_crash), true); _log_message("--- pthread ID / name mapping for recent threads ---", true); - for (const auto pthread_id : recent_pthread_ids) + for (auto& [pthread_id, pthread_name] : recent_pthread_ids) { - char pthread_name[16] = {0}; //limited by 16B include terminating null byte. - ceph_pthread_getname(pthread_id, pthread_name, sizeof(pthread_name)); // we want the ID to be printed in the same format as we use for a log entry. // The reason is easier grepping. - _log_message(fmt::format(" {:x} / {}", - tid_to_int(pthread_id), pthread_name), true); + _log_message(fmt::format(" {:x} / {}", tid_to_int(pthread_id), pthread_name), true); } _log_message(fmt::format(" max_recent {:9}", m_recent.capacity()), true); From 3ab5d1f67f1cac210f4c7f0540900670c25de80b Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Tue, 15 Oct 2024 13:34:32 +0200 Subject: [PATCH 132/148] mgr/cephadm: disabling nginx buffering for grafana location Disabling Nginx buffering for Grafana, as it may lead to errors or delays while loading the main Grafana page, particularly when receiving JavaScript files. 
Fixes: https://tracker.ceph.com/issues/68315 Signed-off-by: Redouane Kachach --- .../templates/services/mgmt-gateway/external_server.conf.j2 | 1 + src/pybind/mgr/cephadm/tests/test_services.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 index b830034a7d4e9..91efa91a8d50f 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 @@ -113,6 +113,7 @@ server { # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; + proxy_buffering off; {% if oauth2_proxy_url %} auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a05c87ce3c3a9..072f4bec554e1 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -3900,6 +3900,7 @@ def get_services_endpoints(name): # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; + proxy_buffering off; } location /prometheus { @@ -4171,6 +4172,7 @@ def get_services_endpoints(name): # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; + proxy_buffering off; auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; From 2ed1a3bd70c818835c4e7dc521d31cdd9f9e780b Mon Sep 17 00:00:00 2001 From: Ernesto Puerta Date: Tue, 15 Oct 2024 14:09:48 +0200 Subject: [PATCH 133/148] .github: detect GPL license in PRs Signed-off-by: Ernesto Puerta --- .github/workflows/check-license.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/workflows/check-license.yml diff --git a/.github/workflows/check-license.yml b/.github/workflows/check-license.yml new file mode 100644 index 0000000000000..d201ed7135439 --- /dev/null +++ b/.github/workflows/check-license.yml @@ -0,0 +1,13 @@ +--- +name: "Check Incomatible Licenses" +on: [pull_request] + +jobs: + check_pr: + runs-on: ubuntu-latest + steps: + - name: Check PR + uses: JJ/github-pr-contains-action@526dfe784d8604ea1c39b6c26609074de95b1ffd # releases/v14.1 + with: + github-token: ${{github.token}} + diffDoesNotContain: "GNU General Public License" From 629922bf6a0905cc87707f5e2d027f6320aafd99 Mon Sep 17 00:00:00 2001 From: Jos Collin Date: Fri, 11 Oct 2024 10:33:47 +0530 Subject: [PATCH 134/148] doc: update Key Idea in cephfs-mirroring.rst Updates the snapdiff feature and it's url. Signed-off-by: Jos Collin --- doc/dev/cephfs-mirroring.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/dev/cephfs-mirroring.rst b/doc/dev/cephfs-mirroring.rst index a804a0075995f..e09fed213f230 100644 --- a/doc/dev/cephfs-mirroring.rst +++ b/doc/dev/cephfs-mirroring.rst @@ -17,12 +17,10 @@ Key Idea -------- For a given snapshot pair in a directory, `cephfs-mirror` daemon will rely on -readdir diff to identify changes in a directory tree. 
The diffs are applied to +`CephFS Snapdiff Feature` to identify changes in a directory tree. The diffs are applied to directory in the remote file system thereby only synchronizing files that have changed between two snapshots. -This feature is tracked here: https://tracker.ceph.com/issues/47034. - Currently, snapshot data is synchronized by bulk copying to the remote filesystem. @@ -407,3 +405,5 @@ Feature Status -------------- `cephfs-mirror` daemon is built by default (follows `WITH_CEPHFS` CMake rule). + +.. _CephFS Snapdiff Feature: https://croit.io/blog/cephfs-snapdiff-feature From b6cb908e0b7e215def9760f480149fd7f1b881db Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 15 Oct 2024 11:29:53 -0400 Subject: [PATCH 135/148] rgw: document mstart.sh and related scripts Signed-off-by: Casey Bodley --- src/mrgw.sh | 2 ++ src/mrun | 2 ++ src/mstart.sh | 28 ++++++++++++++++++++++++++++ src/mstop.sh | 2 ++ 4 files changed, 34 insertions(+) diff --git a/src/mrgw.sh b/src/mrgw.sh index 05739bf015ebc..86bef336867de 100755 --- a/src/mrgw.sh +++ b/src/mrgw.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Start/restart a radosgw instance on the given mstart.sh cluster. + set -e rgw_frontend=${RGW_FRONTEND:-"beast"} diff --git a/src/mrun b/src/mrun index a85221800218b..df7e3542b93a5 100755 --- a/src/mrun +++ b/src/mrun @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Run a ceph command against the given mstart.sh cluster. + [ $# -lt 2 ] && echo "usage: $0 [params...]" && exit 1 root=`dirname $0` diff --git a/src/mstart.sh b/src/mstart.sh index 34b57e1761125..0c512ca9eb8c3 100755 --- a/src/mstart.sh +++ b/src/mstart.sh @@ -1,5 +1,33 @@ #!/bin/sh +# Deploy a vstart.sh cluster in a named subdirectory. This makes it possible to +# start multiple clusters in different subdirectories. See mstop.sh for cleanup. 
+# +# Example: +# +# ~/ceph/build $ MON=1 OSD=1 RGW=1 MDS=0 MGR=0 ../src/mstart.sh c1 -n -d +# ~/ceph/build $ MON=1 OSD=1 RGW=1 MDS=0 MGR=0 ../src/mstart.sh c2 -n -d +# +# ~/ceph/build $ ls run +# c1 c2 +# ~/ceph/build $ ls run/c1 +# asok ceph.conf dev keyring out +# +# ~/ceph/build $ ../src/mrun c1 radosgw-admin user list +# [ +# "56789abcdef0123456789abcdef0123456789abcdef0123456789abcdef01234", +# "testx$9876543210abcdef0123456789abcdef0123456789abcdef0123456789abcdef", +# "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", +# "testacct1user", +# "test", +# "testacct2root", +# "testacct1root", +# "testid" +# ] +# +# ~/ceph/build $ ../src/mstop.sh c1 +# ~/ceph/build $ ../src/mstop.sh c2 + usage="usage: $0 [vstart options]..\n" usage_exit() { diff --git a/src/mstop.sh b/src/mstop.sh index 702d1765941e5..eec0ca02e42ae 100755 --- a/src/mstop.sh +++ b/src/mstop.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Stop a named cluster started by mstart.sh + set -e script_root=`dirname $0` From c78d1ba668d1ad2364db39ffa07be2f8a3d61a48 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 15 Oct 2024 11:30:37 -0400 Subject: [PATCH 136/148] rgw: add mstart-related scripts to CODEOWNERS and labeler Signed-off-by: Casey Bodley --- .github/CODEOWNERS | 4 ++++ .github/labeler.yml | 3 +++ 2 files changed, 7 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d8d18693efcf6..3e81444ea3d0b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -164,6 +164,10 @@ README* @ceph/doc-writers /src/cls/rgw_gc @ceph/rgw /src/cls/user @ceph/rgw /src/cls/version @ceph/rgw +/src/mrgw.sh @ceph/rgw +/src/mrun @ceph/rgw +/src/mstart.sh @ceph/rgw +/src/mstop.sh @ceph/rgw /src/rgw @ceph/rgw /src/s3select @ceph/rgw /src/spawn @ceph/rgw diff --git a/.github/labeler.yml b/.github/labeler.yml index 9f2ed1e479019..cc32be3850126 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -288,6 +288,9 @@ rgw: - src/cls/rgw_gc/** - src/cls/timeindex/** - src/mrgw.sh + - src/mrun + - src/mstart.sh + - src/mstop.sh - src/rgw/** - src/test/cls_rgw/** - src/test/librgw_* From 67f884d39c31bd7ece3666f8092814ae9dfc29f1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 15 Oct 2024 17:52:45 +0200 Subject: [PATCH 137/148] CodingStyle: allow C++ forward declarations The Google coding guide opposes to forward declarations, but I disagree with that opinion. In my opinion, forward declarations are useful. Ceph build times are miserable due to header dependency bloat and template bloat, both of which can be reduced using forward declarations. All cons listed in https://google.github.io/styleguide/cppguide.html > Forward declarations can hide a dependency, allowing user code to > skip necessary recompilation when headers change. That is a pro, not a con. Skipping (unnecessary) recompilation is a good thing, it's the goal of forward declarations. > A forward declaration as opposed to an #include statement makes it > difficult for automatic tooling to discover the module defining the > symbol. That certainly depends on the tools one uses, but I cannot imagine today's IDEs are limited to one compilation unit. > A forward declaration may be broken by subsequent changes to the > library. True, and that will lead to a compiler error. > Forward declarations of functions and templates can prevent the > header owners from making otherwise-compatible changes to their > APIs, such as widening a parameter type, adding a template parameter > with a default value, or migrating to a new namespace. 
Forward declarations do not prevent any of that. But if you change the "real" declaration, all incompatible forward declarations will cause a compiler error. > Forward declaring symbols from namespace std:: yields undefined > behavior. Sad, but true. But that is not an argument against forward declarations for Ceph's own types. > It can be difficult to determine whether a forward declaration or a > full #include is needed. If it compiles without the `#include`, then the forward declaration is fine. (Or the primary header happened to be already included by somebody else.) > Replacing an #include with a forward declaration can silently change > the meaning of code: [...] If the #include was replaced with forward > decls for B and D, test() would call f(void*). True, but this is a contrived example, and is bad coding style because it is error prone. Casts to `void*` can and should be avoided. There are rare examples where such casts are necessary (boundary to C APIs), and then it's very unusual to pass derived incomplete types. > Forward declaring multiple symbols from a header can be more verbose > than simply #includeing the header. True, but that misses the point of forward declarations. > Structuring code to enable forward declarations (e.g., using pointer > members instead of object members) can make the code slower and more > complex. True, but that is not a property of forward declarations. I don't suggest doing such a thing. Signed-off-by: Max Kellermann --- CodingStyle | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CodingStyle b/CodingStyle index 659298f0e5ae4..019d23c7703dc 100644 --- a/CodingStyle +++ b/CodingStyle @@ -108,6 +108,12 @@ by section. portability since `#pragma once` is widely supported and is known to work on GCC and Clang. +* Header Files -> Forward declarations: + + Forward declarations of structs, unions, classes and enums can be + used to reduce header dependencies. This speeds up compile times + because the compiler has to process less code. 
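[Editor's note] As a hedged illustration of the forward-declaration guideline added above (this block is not part of the patch; the file and class names are invented for the example), a header can depend on a class through a pointer or reference without pulling in its full definition:

```cpp
// widget.h -- needs Foo only as a pointer member, so a forward declaration
// is enough and foo.h does not have to be included here.
#pragma once

class Foo;  // forward declaration

class Widget {
public:
  explicit Widget(Foo* foo) : foo_(foo) {}
  void run();
private:
  Foo* foo_;  // pointers/references to an incomplete type are allowed
};

// widget.cc -- only the implementation file pays for the full definition.
#include "widget.h"
#include "foo.h"

void Widget::run() {
  // calling members of Foo requires the complete type, hence the #include above
}
```

Only translation units that actually use Foo's members have to recompile when foo.h changes, which is the compile-time benefit the guideline refers to.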
+ The following guidelines have not been followed in the legacy code, but are worth mentioning and should be followed strictly for new code: From 56b60c01cacafa40eeb009d76a6854507d33f3a5 Mon Sep 17 00:00:00 2001 From: Afreen Misbah Date: Fri, 11 Oct 2024 20:58:56 +0530 Subject: [PATCH 138/148] mgr/dashboard: Adapt gateway group changes in nvmeof UI - Added gateway group param in namespace request - GET, POST, PATCH, DELETE - Added gateway group param in Listeners request - GET - Added gateway group param in Initiators - GET, POST, DELETE Fixes https://tracker.ceph.com/issues/68510 Signed-off-by: Afreen Misbah --- .../mgr/dashboard/controllers/nvmeof.py | 10 ++-- .../nvmeof-initiators-form.component.ts | 11 +++-- .../nvmeof-initiators-list.component.ts | 25 +++++----- .../nvmeof-listeners-form.component.ts | 7 ++- .../nvmeof-listeners-list.component.ts | 10 ++-- .../nvmeof-namespaces-form.component.ts | 7 ++- .../nvmeof-namespaces-list.component.ts | 48 +++++++++++-------- .../nvmeof-subsystems-details.component.html | 8 +++- .../nvmeof-subsystems-form.component.ts | 4 +- .../src/app/shared/api/nvmeof.service.spec.ts | 9 +++- .../src/app/shared/api/nvmeof.service.ts | 42 ++++++++++------ 11 files changed, 107 insertions(+), 74 deletions(-) diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py index f199867943d14..519c310a98bcc 100644 --- a/src/pybind/mgr/dashboard/controllers/nvmeof.py +++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py @@ -463,16 +463,17 @@ def status(self) -> dict: parameters={ 'subsystem_nqn': (str, 'Subsystem NQN'), "host_nqn": Param(str, 'Comma separated list of NVMeoF host NQNs'), + "gw_group": Param(str, "NVMeoF gateway group") }) @empty_response @handle_nvmeof_error @CreatePermission - def add(self, subsystem_nqn: str, host_nqn: str = ""): + def add(self, subsystem_nqn: str, gw_group: str, host_nqn: str = ""): response = None all_host_nqns = host_nqn.split(',') for nqn in all_host_nqns: - response = NVMeoFClient().stub.add_host( + response = NVMeoFClient(gw_group=gw_group).stub.add_host( NVMeoFClient.pb2.add_host_req(subsystem_nqn=subsystem_nqn, host_nqn=nqn) ) if response.status != 0: @@ -484,16 +485,17 @@ def add(self, subsystem_nqn: str, host_nqn: str = ""): parameters={ "subsystem_nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'Comma separated list of NVMeoF host NQN.'), + "gw_group": Param(str, "NVMeoF gateway group") }) @empty_response @handle_nvmeof_error @DeletePermission - def remove(self, subsystem_nqn: str, host_nqn: str): + def remove(self, subsystem_nqn: str, host_nqn: str, gw_group: str): response = None to_delete_nqns = host_nqn.split(',') for del_nqn in to_delete_nqns: - response = NVMeoFClient().stub.remove_host( + response = NVMeoFClient(gw_group=gw_group).stub.remove_host( NVMeoFClient.pb2.remove_host_req(subsystem_nqn=subsystem_nqn, host_nqn=del_nqn) ) if response.status != 0: diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts index 3a143a1a8df90..32f7c76a36282 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts @@ -10,7 +10,7 @@ import { AuthStorageService } from 
'~/app/shared/services/auth-storage.service'; import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; import { FinishedTask } from '~/app/shared/models/finished-task'; import { ActivatedRoute, Router } from '@angular/router'; -import { NvmeofService } from '~/app/shared/api/nvmeof.service'; +import { InitiatorRequest, NvmeofService } from '~/app/shared/api/nvmeof.service'; @Component({ selector: 'cd-nvmeof-initiators-form', @@ -26,6 +26,7 @@ export class NvmeofInitiatorsFormComponent implements OnInit { remove: boolean = false; subsystemNQN: string; removeHosts: { name: string; value: boolean; id: number }[] = []; + group: string; constructor( private authStorageService: AuthStorageService, @@ -52,6 +53,9 @@ export class NvmeofInitiatorsFormComponent implements OnInit { ); ngOnInit() { + this.route.queryParams.subscribe((params) => { + this.group = params?.['group']; + }); this.createForm(); this.action = this.actionLabels.ADD; this.route.params.subscribe((params: { subsystem_nqn: string }) => { @@ -108,8 +112,9 @@ export class NvmeofInitiatorsFormComponent implements OnInit { const hosts: string[] = this.addedHosts.value; let taskUrl = `nvmeof/initiator/${URLVerbs.ADD}`; - const request = { - host_nqn: hosts.join(',') + const request: InitiatorRequest = { + host_nqn: hosts.join(','), + gw_group: this.group }; if (allowAnyHost) { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts index fff38e6985a43..a5575a9c9267e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit, TemplateRef, ViewChild } from '@angular/core'; +import { Component, Input, OnInit, TemplateRef, ViewChild } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -20,9 +20,11 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-initiators-list.component.html', styleUrls: ['./nvmeof-initiators-list.component.scss'] }) -export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { +export class NvmeofInitiatorsListComponent implements OnInit { @Input() subsystemNQN: string; + @Input() + group: string; @ViewChild('hostTpl', { static: true }) hostTpl: TemplateRef; @@ -58,10 +60,10 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { permission: 'create', icon: Icons.add, click: () => - this.router.navigate([ - BASE_URL, - { outlets: { modal: [URLVerbs.ADD, this.subsystemNQN, 'initiator'] } } - ]), + this.router.navigate( + [BASE_URL, { outlets: { modal: [URLVerbs.ADD, this.subsystemNQN, 'initiator'] } }], + { queryParams: { group: this.group } } + ), canBePrimary: (selection: CdTableSelection) => !selection.hasSelection }, { @@ -79,17 +81,13 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { return this.selection.selected.findIndex((selected) => selected.nqn === '*'); } - ngOnChanges() { - this.listInitiators(); - } - updateSelection(selection: CdTableSelection) { this.selection 
= selection; } listInitiators() { this.nvmeofService - .getInitiators(this.subsystemNQN) + .getInitiators(this.subsystemNQN, this.group) .subscribe((initiators: NvmeofSubsystemInitiator[]) => { this.initiators = initiators; }); @@ -118,7 +116,10 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { nqn: this.subsystemNQN, plural: itemNames.length > 1 }), - call: this.nvmeofService.removeInitiators(this.subsystemNQN, { host_nqn }) + call: this.nvmeofService.removeInitiators(this.subsystemNQN, { + host_nqn, + gw_group: this.group + }) }) }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts index cd362bf8abe19..8310e65d203e5 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts @@ -103,7 +103,8 @@ export class NvmeofListenersFormComponent implements OnInit { const host = this.listenerForm.getValue('host'); let trsvcid = Number(this.listenerForm.getValue('trsvcid')); if (!trsvcid) trsvcid = 4420; - const request = { + const request: ListenerRequest = { + gw_group: this.group, host_name: host.hostname, traddr: host.addr, trsvcid @@ -128,9 +129,7 @@ export class NvmeofListenersFormComponent implements OnInit { component.listenerForm.setErrors({ cdSubmitButton: true }); }, complete: () => { - this.router.navigate([this.pageURL, { outlets: { modal: null } }], { - queryParams: { group: this.group } - }); + this.router.navigate([this.pageURL, { outlets: { modal: null } }]); } }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts index 974727ad06260..b49adda7c1b92 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit } from '@angular/core'; +import { Component, Input, OnInit } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -21,7 +21,7 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-listeners-list.component.html', styleUrls: ['./nvmeof-listeners-list.component.scss'] }) -export class NvmeofListenersListComponent implements OnInit, OnChanges { +export class NvmeofListenersListComponent implements OnInit { @Input() subsystemNQN: string; @Input() @@ -81,17 +81,13 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges { ]; } - ngOnChanges() { - this.listListeners(); - } - updateSelection(selection: CdTableSelection) { this.selection = selection; } listListeners() { this.nvmeofService - .listListeners(this.subsystemNQN) + .listListeners(this.subsystemNQN, this.group) .subscribe((listResponse: NvmeofListener[]) => { this.listeners = listResponse.map((listener, index) => { listener['id'] = index; diff --git 
a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts index f5721e11ab6d3..b65ad62bdb4b1 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts @@ -41,6 +41,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { nsid: string; currentBytes: number; invalidSizeError: boolean; + group: string; constructor( public actionLabels: ActionLabelsI18n, @@ -62,6 +63,9 @@ export class NvmeofNamespacesFormComponent implements OnInit { } init() { + this.route.queryParams.subscribe((params) => { + this.group = params?.['group']; + }); this.createForm(); this.action = this.actionLabels.CREATE; this.route.params.subscribe((params: { subsystem_nqn: string; nsid: string }) => { @@ -74,7 +78,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { this.edit = true; this.action = this.actionLabels.EDIT; this.nvmeofService - .getNamespace(this.subsystemNQN, this.nsid) + .getNamespace(this.subsystemNQN, this.nsid, this.group) .subscribe((res: NvmeofSubsystemNamespace) => { const convertedSize = this.dimlessBinaryPipe.transform(res.rbd_image_size).split(' '); this.currentBytes = res.rbd_image_size; @@ -120,6 +124,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { const image_size = this.nsForm.getValue('image_size'); const image_size_unit = this.nsForm.getValue('unit'); const request = {} as NamespaceCreateRequest | NamespaceEditRequest; + request['gw_group'] = this.group; if (image_size) { const key: string = this.edit ? 
'rbd_image_size' : 'size'; const value: number = this.formatterService.toBytes(image_size + image_size_unit); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts index c40b538c82088..8f8f6eb8d0598 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit } from '@angular/core'; +import { Component, Input, OnInit } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -23,9 +23,11 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-namespaces-list.component.html', styleUrls: ['./nvmeof-namespaces-list.component.scss'] }) -export class NvmeofNamespacesListComponent implements OnInit, OnChanges { +export class NvmeofNamespacesListComponent implements OnInit { @Input() subsystemNQN: string; + @Input() + group: string; namespacesColumns: any; tableActions: CdTableAction[]; @@ -117,10 +119,10 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { permission: 'create', icon: Icons.add, click: () => - this.router.navigate([ - BASE_URL, - { outlets: { modal: [URLVerbs.CREATE, this.subsystemNQN, 'namespace'] } } - ]), + this.router.navigate( + [BASE_URL, { outlets: { modal: [URLVerbs.CREATE, this.subsystemNQN, 'namespace'] } }], + { queryParams: { group: this.group } } + ), canBePrimary: (selection: CdTableSelection) => !selection.hasSelection }, { @@ -128,41 +130,45 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { permission: 'update', icon: Icons.edit, click: () => - this.router.navigate([ - BASE_URL, - { - outlets: { - modal: [URLVerbs.EDIT, this.subsystemNQN, 'namespace', this.selection.first().nsid] + this.router.navigate( + [ + BASE_URL, + { + outlets: { + modal: [ + URLVerbs.EDIT, + this.subsystemNQN, + 'namespace', + this.selection.first().nsid + ] + } } - } - ]) + ], + { queryParams: { group: this.group } } + ) }, { name: this.actionLabels.DELETE, permission: 'delete', icon: Icons.destroy, - click: () => this.deleteSubsystemModal() + click: () => this.deleteNamespaceModal() } ]; } - ngOnChanges() { - this.listNamespaces(); - } - updateSelection(selection: CdTableSelection) { this.selection = selection; } listNamespaces() { this.nvmeofService - .listNamespaces(this.subsystemNQN) + .listNamespaces(this.subsystemNQN, this.group) .subscribe((res: NvmeofSubsystemNamespace[]) => { this.namespaces = res; }); } - deleteSubsystemModal() { + deleteNamespaceModal() { const namespace = this.selection.first(); this.modalService.show(CriticalConfirmationModalComponent, { itemDescription: 'Namespace', @@ -174,7 +180,7 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { nqn: this.subsystemNQN, nsid: namespace.nsid }), - call: this.nvmeofService.deleteNamespace(this.subsystemNQN, namespace.nsid) + call: this.nvmeofService.deleteNamespace(this.subsystemNQN, namespace.nsid, this.group) }) }); } diff --git 
a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html index 7f15a1360adc2..58a1e01a52510 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html @@ -24,14 +24,18 @@ Namespaces - + + Initiators - + + diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts index f7b35a2d645ec..7e5b064f37929 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts @@ -118,9 +118,7 @@ export class NvmeofSubsystemsFormComponent implements OnInit { component.subsystemForm.setErrors({ cdSubmitButton: true }); }, complete: () => { - this.router.navigate([this.pageURL, { outlets: { modal: null } }], { - queryParams: { group: this.group } - }); + this.router.navigate([this.pageURL, { outlets: { modal: null } }]); } }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts index 313db3445f298..a5c84e60b6f95 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts @@ -27,6 +27,7 @@ describe('NvmeofService', () => { expect(service).toBeTruthy(); }); + // gateways it('should call listGatewayGroups', () => { service.listGatewayGroups().subscribe(); const req = httpTesting.expectOne('api/nvmeof/gateway/group'); @@ -39,6 +40,7 @@ describe('NvmeofService', () => { expect(req.request.method).toBe('GET'); }); + // subsystems it('should call listSubsystems', () => { service.listSubsystems(mockGroupName).subscribe(); const req = httpTesting.expectOne(`api/nvmeof/subsystem?gw_group=${mockGroupName}`); @@ -69,9 +71,12 @@ describe('NvmeofService', () => { expect(req.request.method).toBe('DELETE'); }); + // initiators it('should call getInitiators', () => { - service.getInitiators(mockNQN).subscribe(); - const req = httpTesting.expectOne(`api/nvmeof/subsystem/${mockNQN}/host`); + service.getInitiators(mockNQN, mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `api/nvmeof/subsystem/${mockNQN}/host?gw_group=${mockGroupName}` + ); expect(req.request.method).toBe('GET'); }); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts index 40202d0d67250..a2bbf507bc345 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts @@ -8,6 +8,7 @@ import { catchError, mapTo } from 'rxjs/operators'; export const MAX_NAMESPACE = 1024; export interface ListenerRequest { + gw_group: string; host_name: string; traddr: string; trsvcid: number; @@ -17,14 +18,17 @@ export interface NamespaceCreateRequest { rbd_image_name: string; rbd_pool: string; size: number; 
+ gw_group: string; } export interface NamespaceEditRequest { rbd_image_size: number; + gw_group: string; } export interface InitiatorRequest { host_nqn: string; + gw_group: string; } const API_PATH = 'api/nvmeof'; @@ -81,8 +85,8 @@ export class NvmeofService { } // Initiators - getInitiators(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/host`); + getInitiators(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/host?gw_group=${group}`); } addInitiators(subsystemNQN: string, request: InitiatorRequest) { @@ -92,14 +96,17 @@ export class NvmeofService { } removeInitiators(subsystemNQN: string, request: InitiatorRequest) { - return this.http.delete(`${UI_API_PATH}/subsystem/${subsystemNQN}/host/${request.host_nqn}`, { - observe: 'response' - }); + return this.http.delete( + `${UI_API_PATH}/subsystem/${subsystemNQN}/host/${request.host_nqn}/${request.gw_group}`, + { + observe: 'response' + } + ); } // Listeners - listListeners(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/listener`); + listListeners(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/listener?gw_group=${group}`); } createListener(subsystemNQN: string, request: ListenerRequest) { @@ -121,12 +128,14 @@ export class NvmeofService { } // Namespaces - listNamespaces(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace`); + listNamespaces(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace?gw_group=${group}`); } - getNamespace(subsystemNQN: string, nsid: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}`); + getNamespace(subsystemNQN: string, nsid: string, group: string) { + return this.http.get( + `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${group}` + ); } createNamespace(subsystemNQN: string, request: NamespaceCreateRequest) { @@ -141,9 +150,12 @@ export class NvmeofService { }); } - deleteNamespace(subsystemNQN: string, nsid: string) { - return this.http.delete(`${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}`, { - observe: 'response' - }); + deleteNamespace(subsystemNQN: string, nsid: string, group: string) { + return this.http.delete( + `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${group}`, + { + observe: 'response' + } + ); } } From 80d0037c2512e696e224e293b6b4153ed6be0350 Mon Sep 17 00:00:00 2001 From: Afreen Misbah Date: Thu, 26 Sep 2024 17:12:23 +0530 Subject: [PATCH 139/148] mailmap: Add dashboard new joinees in maps - Afreen Misbah - Dnyaneshwari Talwekar - Naman Munet - Prachi Goel - Puja Shahu Signed-off-by: Afreen Misbah --- .githubmap | 5 +++++ .mailmap | 5 +++++ .organizationmap | 10 ++++++++++ 3 files changed, 20 insertions(+) diff --git a/.githubmap b/.githubmap index b93132cf1ee4c..68015b4c1a689 100644 --- a/.githubmap +++ b/.githubmap @@ -12,6 +12,7 @@ aaSharma14 Aashish Sharma aclamk Adam Kupczyk adamemerson Adam C. 
Emerson adk3798 Adam King +afreen23 Afreen Misbah ajarr Ramana Raja alfonsomthd Alfonso Martínez alfredodeza Alfredo Deza @@ -47,6 +48,7 @@ Devp00l Stephan Müller dillaman Jason Dillaman djgalloway David Galloway dmick Dan Mick +dnyanee1997 Dnyaneshwari talwekar dragonylffly Li Wang dsavineau Dimitri Savineau dvanders Dan van der Ster @@ -96,6 +98,7 @@ mikechristie Mike Christie mogeb Mohamad Gebai MrFreezeex Arthur Outhenin-Chalandre myoungwon Myoungwon Oh +nmunet Naman Munet Naveenaidu Naveen Naidu neha-ojha Neha Ojha NitzanMordhai Nitzan Mordechai @@ -109,6 +112,8 @@ p-se Patrick Seidensal pcuzner Paul Cuzner Pegonzal Pedro Gonzalez Gomez pereman2 Pere Diaz Bou +prgoel-code Prachi prgoel@redhat.com +pujaoshahu Puja Shahu rchagam Anjaneya Chagam renhwztetecs huanwen ren ricardoasmarques Ricardo Marques diff --git a/.mailmap b/.mailmap index 8359b1473aedb..20aecd0c2321d 100644 --- a/.mailmap +++ b/.mailmap @@ -24,6 +24,7 @@ Adam Kupczyk Adam Kupczyk Adam Twardowski Adir Lev +Afreen Misbah Ahoussi Armand Ailing Zhang Aishwarya Mathuria amathuria @@ -168,6 +169,7 @@ Dhairya Parmar dparmar18 Dingdang Zhang Dmitry Smirnov Dmitry Yatsushkevich +Dnyaneshwari talwekar Dominik Hannen Dongdong Tao Dongdong Tao @@ -508,6 +510,7 @@ Myoungwon Oh Myoungwon Oh Na Xie Nag Pavan Chilakam <55574442+nagpavan-chilakam@users.noreply.github.com> +Naman Munet Nancy Su Nathan Cutler Nathan Cutler @@ -572,6 +575,8 @@ Pooja Gautam Pritha Srivastava Pritha Srivastava Pritha Srivastava +Prachi prgoel@redhat.com +Puja Shahu Qi Liang Hong Qiankun Zheng Qinfei Liu <18138800392@163.com> diff --git a/.organizationmap b/.organizationmap index bc194953d1b88..7a1061a194c70 100644 --- a/.organizationmap +++ b/.organizationmap @@ -346,17 +346,22 @@ Huayun Zheng Yin Huazhong University of Science and Technology Luo Runbing HXT Semiconductor Jiang Yutang IBM Adam Kupczyk +IBM Afreen Misbah IBM Aliaksei Makarau IBM Andrew Solomon +IBM Dnyaneshwari talwekar IBM Guillaume Abrioux IBM Jonas Pfefferle IBM Laura Flores IBM Martin Ohmacht IBM Michel Normand +IBM Naman Munet IBM Naveen Naidu IBM Neeraj Pratap Singh IBM Or Ozeri IBM Paul Cuzner +IBM Prachi Goel +IBM Puja Shahu IBM Samuel Matzek IBM Shraddha Agrawal IBM Sunil Angadi @@ -582,6 +587,7 @@ Red Hat Adam King Red Hat Adam King Red Hat Adam Kupczyk Red Hat Ademar de Souza Reis Jr +Red Hat Afreen Misbah Red Hat Aishwarya Mathuria Red Hat Albin Antony Red Hat Alex Elder @@ -618,6 +624,7 @@ Red Hat Deepika Upadhyay Red Hat Dhairya Parmar Red Hat Dimitri Savineau Red Hat Divyansh Kamboj +Red Hat Dnyaneshwari talwekar Red Hat Douglas Fuller Red Hat Ernesto Puerta Red Hat Erwan Velu @@ -683,6 +690,7 @@ Red Hat Mike Hackett Red Hat Mike Perez Red Hat Milan Broz Red Hat Milind Changire +Red Hat Naman Munet Red Hat Nathan Weinberg Red Hat Neeraj Pratap Singh Red Hat Neha Ojha @@ -706,9 +714,11 @@ Red Hat Pere Diaz Bou Red Hat Pete Zaitcev Red Hat Petr Lautrbach Red Hat Petr Machata +Red Hat Prachi prgoel@redhat.com Red Hat Prasanna Kumar Kalever Red Hat Prashant D Red Hat Pritha Srivastava +Red Hat Puja Shahu Red Hat Radoslaw Zarzynski Red Hat Rafael Quintero Red Hat Ramakrishnan Periyasamy From 7343be720870d4a5f82b55beee4685457a003067 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Tue, 15 Oct 2024 12:41:22 +0000 Subject: [PATCH 140/148] os/bluestore: Fix repair of multilabel when collides with BlueFS The problem was that BDEV_FIRST_LABEL_POSITION was removed from bdev_label_valid_locations set. Now, if label at BDEV_FIRST_LABEL_POSITION is valid, it is in the set. 
Fixes: https://tracker.ceph.com/issues/68528 Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueStore.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 44a171873c08c..6c31639811e6e 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6794,9 +6794,8 @@ void BlueStore::_main_bdev_label_try_reserve() vector candidate_positions; vector accepted_positions; uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); - for (size_t i = 1; i < bdev_label_positions.size(); i++) { - uint64_t location = bdev_label_positions[i]; - if (location + lsize <= bdev->get_size()) { + for (uint64_t location : bdev_label_valid_locations) { + if (location != BDEV_FIRST_LABEL_POSITION) { candidate_positions.push_back(location); } } @@ -11497,9 +11496,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) string p = path + "/block"; _write_bdev_label(cct, bdev, p, bdev_label, bdev_labels_in_repair); for (uint64_t pos : bdev_labels_in_repair) { - if (pos != BDEV_FIRST_LABEL_POSITION) { - bdev_label_valid_locations.push_back(pos); - } + bdev_label_valid_locations.push_back(pos); } repaired += bdev_labels_in_repair.size(); } From dd2a150f40fb11abe6bd1ee51bca03419aaa7d7f Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Wed, 9 Oct 2024 14:59:38 +0000 Subject: [PATCH 141/148] ceph-volume: address test_activate_dmcrypt_tpm This mocks the call to `luks_close()`, otherwise this test fails when run on a system where `cryptsetup` isn't available. Signed-off-by: Guillaume Abrioux --- .../ceph_volume/tests/objectstore/test_rawbluestore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py b/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py index f4f50b06f8a29..fd7c468037c5c 100644 --- a/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py +++ b/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py @@ -159,6 +159,7 @@ def test_activate_osd_id_and_fsid(self, @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.rename_mapper', Mock(return_value=MagicMock())) @patch('ceph_volume.util.disk.get_bluestore_header') + @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.luks_close', Mock(return_value=MagicMock())) @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.luks_open', Mock(return_value=MagicMock())) def test_activate_dmcrypt_tpm(self, m_bs_header, rawbluestore, fake_lsblk_all, mock_raw_direct_report, is_root) -> None: m_bs_header.return_value = { From b5e7008d28a5acd63ea9cd0c6b27f400dad409af Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Wed, 9 Oct 2024 15:00:53 +0000 Subject: [PATCH 142/148] ceph-volume: address mypy errors in disk.py typical error: ``` ceph_volume/util/disk.py:1374: error: Incompatible types in assignment (expression has type "Optional[str]", variable has type "str") [assignment] ``` This commits addresses it. 
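[Editor's note] A minimal sketch, not part of the commit, of the pattern mypy complains about and of the fix the diff below applies (supplying a default so `dict.get()` no longer returns `Optional[str]`); the keys and values are illustrative:

```python
from typing import Dict

environment: Dict[str, str] = {'DM_VG_NAME': 'ceph--vg'}

# Rejected by mypy: dict.get(key) returns Optional[str], but 'vg' is declared str.
# vg: str = environment.get('DM_VG_NAME')

# Accepted: with a default value the return type narrows to str.
vg: str = environment.get('DM_VG_NAME', '')
lv: str = environment.get('DM_LV_NAME', '')
print(f'/dev/{vg}/{lv}')
```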
Signed-off-by: Guillaume Abrioux --- src/ceph-volume/ceph_volume/util/disk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py index 78c140597d653..3ac51c11e3469 100644 --- a/src/ceph-volume/ceph_volume/util/disk.py +++ b/src/ceph-volume/ceph_volume/util/disk.py @@ -1370,8 +1370,8 @@ def slashed_path(self) -> str: """ result: str = self.path if self.is_lvm: - vg: str = self.environment.get('DM_VG_NAME') - lv: str = self.environment.get('DM_LV_NAME') + vg: str = self.environment.get('DM_VG_NAME', '') + lv: str = self.environment.get('DM_LV_NAME', '') result = f'/dev/{vg}/{lv}' return result @@ -1385,6 +1385,6 @@ def dashed_path(self) -> str: """ result: str = self.path if self.is_lvm: - name: str = self.environment.get('DM_NAME') + name: str = self.environment.get('DM_NAME', '') result = f'/dev/mapper/{name}' return result From 212c8740831a7650b5be86c27d14f8c0b6eacbef Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Wed, 31 Jul 2024 14:36:48 +0000 Subject: [PATCH 143/148] orch: disk replacement enhancement This introduces a new `ceph orch device replace` command in order to improve the user experience when it comes to replacing the underlying device of an OSD. Fixes: https://tracker.ceph.com/issues/68456 Signed-off-by: Guillaume Abrioux --- doc/cephadm/operations.rst | 69 +++ src/ceph-volume/ceph_volume/__init__.py | 1 + src/ceph-volume/ceph_volume/api/lvm.py | 17 +- .../ceph_volume/devices/lvm/zap.py | 173 +++++-- src/ceph-volume/ceph_volume/tests/conftest.py | 2 +- .../ceph_volume/tests/devices/lvm/test_zap.py | 23 +- .../ceph_volume/tests/test_inventory.py | 1 + .../ceph_volume/util/arg_validators.py | 14 + src/ceph-volume/ceph_volume/util/device.py | 26 +- src/ceph-volume/ceph_volume/util/disk.py | 17 +- src/pybind/mgr/cephadm/ceph_volume.py | 430 ++++++++++++++++++ src/pybind/mgr/cephadm/module.py | 53 +++ src/pybind/mgr/cephadm/serve.py | 5 +- src/pybind/mgr/cephadm/services/osd.py | 45 +- .../mgr/cephadm/tests/ceph_volume_data.py | 1 + src/pybind/mgr/cephadm/tests/conftest.py | 7 +- src/pybind/mgr/cephadm/tests/fixtures.py | 4 +- .../mgr/cephadm/tests/test_ceph_volume.py | 231 ++++++++++ .../mgr/cephadm/tests/test_replace_device.py | 53 +++ src/pybind/mgr/orchestrator/_interface.py | 15 + src/pybind/mgr/orchestrator/module.py | 20 +- .../deployment/drive_selection/selector.py | 4 + .../ceph/deployment/inventory.py | 10 +- 23 files changed, 1151 insertions(+), 70 deletions(-) create mode 100644 src/pybind/mgr/cephadm/ceph_volume.py create mode 100644 src/pybind/mgr/cephadm/tests/ceph_volume_data.py create mode 100644 src/pybind/mgr/cephadm/tests/test_ceph_volume.py create mode 100644 src/pybind/mgr/cephadm/tests/test_replace_device.py diff --git a/doc/cephadm/operations.rst b/doc/cephadm/operations.rst index 3b117c1bd6a60..420ee655ac8ba 100644 --- a/doc/cephadm/operations.rst +++ b/doc/cephadm/operations.rst @@ -734,3 +734,72 @@ Purge ceph daemons from all hosts in the cluster # For each host: cephadm rm-cluster --force --zap-osds --fsid + + +Replacing a device +================== + +The ``ceph orch device replace`` command automates the process of replacing the underlying device of an OSD. +Previously, this process required manual intervention at various stages. +With this new command, all necessary operations are performed automatically, streamlining the replacement process +and improving the overall user experience. + +.. 
note:: This only supports LVM-based deployed OSD(s) + +.. prompt:: bash # + + ceph orch device replace + +In the case the device being replaced is shared by multiple OSDs (eg: DB/WAL device shared by multiple OSDs), the orchestrator will warn you. + +.. prompt:: bash # + + [ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd + + Error EINVAL: /dev/vdd is a shared device. + Replacing /dev/vdd implies destroying OSD(s): ['0', '1']. + Please, *be very careful*, this can be a very dangerous operation. + If you know what you are doing, pass --yes-i-really-mean-it + +If you know what you are doing, you can go ahead and pass ``--yes-i-really-mean-it``. + +.. prompt:: bash # + + [ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd --yes-i-really-mean-it + Scheduled to destroy osds: ['6', '7', '8'] and mark /dev/vdd as being replaced. + +``cephadm`` will make ``ceph-volume`` zap and destroy all related devices and mark the corresponding OSD as ``destroyed`` so the +different OSD(s) ID(s) will be preserved: + +.. prompt:: bash # + + [ceph: root@ceph-1 /]# ceph osd tree + ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF + -1 0.97659 root default + -3 0.97659 host devel-1 + 0 hdd 0.29300 osd.0 destroyed 1.00000 1.00000 + 1 hdd 0.29300 osd.1 destroyed 1.00000 1.00000 + 2 hdd 0.19530 osd.2 up 1.00000 1.00000 + 3 hdd 0.19530 osd.3 up 1.00000 1.00000 + +The device being replaced is finally seen as ``being replaced`` preventing ``cephadm`` from redeploying the OSDs too fast: + +.. prompt:: bash # + + [ceph: root@ceph-1 /]# ceph orch device ls + HOST PATH TYPE DEVICE ID SIZE AVAILABLE REFRESHED REJECT REASONS + osd-1 /dev/vdb hdd 200G Yes 13s ago + osd-1 /dev/vdc hdd 200G Yes 13s ago + osd-1 /dev/vdd hdd 200G Yes 13s ago Is being replaced + osd-1 /dev/vde hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected + osd-1 /dev/vdf hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected + +If for any reason you need to clear the 'device replace header' on a device, then you can use ``ceph orch device replace --clear``: + +.. prompt:: bash # + + [ceph: root@devel-1 /]# ceph orch device replace devel-1 /dev/vdk --clear + Replacement header cleared on /dev/vdk + [ceph: root@devel-1 /]# + +After that, ``cephadm`` will redeploy the OSD service spec within a few minutes (unless the service is set to ``unmanaged``). 
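[Editor's note] The following shell sketch is not part of the patch; it simply strings together the commands documented in the section above into one end-to-end replacement flow, reusing the illustrative host and device names from its examples:

```bash
# 1. Mark the device as being replaced; the affected OSD(s) are zapped and
#    marked 'destroyed' so their IDs are preserved.
ceph orch device replace osd-1 /dev/vdd
# (add --yes-i-really-mean-it if the device is shared by several OSDs)

# 2. Optionally inspect the intermediate state.
ceph osd tree           # the affected OSD(s) show up as 'destroyed'
ceph orch device ls     # /dev/vdd is reported as 'Is being replaced'

# 3. Physically swap the disk; cephadm then redeploys the OSD service spec
#    automatically within a few minutes (unless the service is unmanaged).

# If the replacement header ever needs to be cleared by hand:
# ceph orch device replace osd-1 /dev/vdd --clear
```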
diff --git a/src/ceph-volume/ceph_volume/__init__.py b/src/ceph-volume/ceph_volume/__init__.py index b10100c02185a..814619cfdddb4 100644 --- a/src/ceph-volume/ceph_volume/__init__.py +++ b/src/ceph-volume/ceph_volume/__init__.py @@ -6,6 +6,7 @@ sys_info = namedtuple('sys_info', ['devices']) sys_info.devices = dict() logger = logging.getLogger(__name__) +BEING_REPLACED_HEADER: str = 'CEPH_DEVICE_BEING_REPLACED' class AllowLoopDevices: diff --git a/src/ceph-volume/ceph_volume/api/lvm.py b/src/ceph-volume/ceph_volume/api/lvm.py index 16cbc08b26254..fc376f891fd25 100644 --- a/src/ceph-volume/ceph_volume/api/lvm.py +++ b/src/ceph-volume/ceph_volume/api/lvm.py @@ -10,6 +10,8 @@ from math import floor from ceph_volume import process, util, conf from ceph_volume.exceptions import SizeAllocationError +from typing import Any, Dict + logger = logging.getLogger(__name__) @@ -807,13 +809,16 @@ def get_all_devices_vgs(name_prefix=''): '--units=b', '--nosuffix'] -class Volume(object): +class Volume: """ Represents a Logical Volume from LVM, with some top-level attributes like ``lv_name`` and parsed tags as a dictionary of key/value pairs. """ - def __init__(self, **kw): + def __init__(self, **kw: str) -> None: + self.lv_path: str = '' + self.lv_name: str = '' + self.lv_uuid: str = '' for k, v in kw.items(): setattr(self, k, v) self.lv_api = kw @@ -824,13 +829,13 @@ def __init__(self, **kw): self.encrypted = self.tags.get('ceph.encrypted', '0') == '1' self.used_by_ceph = 'ceph.osd_id' in self.tags - def __str__(self): + def __str__(self) -> str: return '<%s>' % self.lv_api['lv_path'] - def __repr__(self): + def __repr__(self) -> str: return self.__str__() - def as_dict(self): + def as_dict(self) -> Dict[str, Any]: obj = {} obj.update(self.lv_api) obj['tags'] = self.tags @@ -839,7 +844,7 @@ def as_dict(self): obj['path'] = self.lv_path return obj - def report(self): + def report(self) -> Dict[str, Any]: if not self.used_by_ceph: return { 'name': self.lv_name, diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py index 2b6925f5b2739..388f6aeea2708 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -5,12 +5,12 @@ from textwrap import dedent -from ceph_volume import decorators, terminal, process +from ceph_volume import decorators, terminal, process, BEING_REPLACED_HEADER from ceph_volume.api import lvm as api from ceph_volume.util import system, encryption, disk, arg_validators, str_to_int, merge_dict from ceph_volume.util.device import Device from ceph_volume.systemd import systemctl -from typing import List +from typing import Any, Dict, List logger = logging.getLogger(__name__) mlogger = terminal.MultiLogger(__name__) @@ -95,29 +95,29 @@ def zap_data(path): 'conv=fsync' ]) - -def find_associated_devices(osd_id=None, osd_fsid=None): +def find_associated_devices(osd_id: str = '', osd_fsid: str = '') -> List[api.Volume]: """ From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the system that match those tag values, further detect if any partitions are part of the OSD, and then return the set of LVs and partitions (if any). 
""" lv_tags = {} - if osd_id: - lv_tags['ceph.osd_id'] = osd_id - if osd_fsid: - lv_tags['ceph.osd_fsid'] = osd_fsid + lv_tags = {key: value for key, value in { + 'ceph.osd_id': osd_id, + 'ceph.osd_fsid': osd_fsid + }.items() if value} lvs = api.get_lvs(tags=lv_tags) + if not lvs: raise RuntimeError('Unable to find any LV for zapping OSD: ' - '%s' % osd_id or osd_fsid) - + f'{osd_id or osd_fsid}') devices_to_zap = ensure_associated_lvs(lvs, lv_tags) - return [Device(path) for path in set(devices_to_zap) if path] + return [Device(path) for path in set(devices_to_zap) if path] -def ensure_associated_lvs(lvs, lv_tags={}): +def ensure_associated_lvs(lvs: List[api.Volume], + lv_tags: Dict[str, Any] = {}) -> List[str]: """ Go through each LV and ensure if backing devices (journal, wal, block) are LVs or partitions, so that they can be accurately reported. @@ -166,14 +166,14 @@ def ensure_associated_lvs(lvs, lv_tags={}): return list(set(verified_devices)) -class Zap(object): - +class Zap: help = 'Removes all data and filesystems from a logical volume or partition.' - def __init__(self, argv): + def __init__(self, argv: List[str]) -> None: self.argv = argv + self.osd_ids_to_zap: List[str] = [] - def unmount_lv(self, lv): + def unmount_lv(self, lv: api.Volume) -> None: if lv.tags.get('ceph.cluster_name') and lv.tags.get('ceph.osd_id'): lv_path = "/var/lib/ceph/osd/{}-{}".format(lv.tags['ceph.cluster_name'], lv.tags['ceph.osd_id']) else: @@ -186,39 +186,95 @@ def unmount_lv(self, lv): if dmcrypt and dmcrypt_uuid: self.dmcrypt_close(dmcrypt_uuid) - def zap_lv(self, device): + def _write_replacement_header(self, device: str) -> None: + """Write a replacement header to a device. + + This method writes the string defined in `BEING_REPLACED_HEADER` + to the specified device. This header indicates that the device + is in the process of being replaced. + + Args: + device (str): The path to the device on which the replacement + header will be written. + """ + disk._dd_write(device, + BEING_REPLACED_HEADER) + + def clear_replace_header(self) -> bool: + """Safely erase the replacement header on a device if it is marked as being replaced. + + This method checks whether the given device is marked as being replaced + (`device.is_being_replaced`). If true, it proceeds to erase the replacement header + from the device using the `_erase_replacement_header` method. The method returns + a boolean indicating whether any action was taken. + + Args: + device (Device): The device object, which includes information about the device's + path and status (such as whether it is currently being replaced). + + Returns: + bool: True if the replacement header was successfully erased, False if the + device was not marked as being replaced or no action was necessary. + """ + result: bool = False + device: Device = self.args.clear_replace_header + if device.is_being_replaced: + self._erase_replacement_header(device.path) + result = True + return result + + def _erase_replacement_header(self, device: str) -> None: + """Erase the replacement header on a device. + + This method writes a sequence of null bytes (`0x00`) over the area of the device + where the replacement header is stored, effectively erasing it. + + Args: + device (str): The path to the device from which the replacement header will be erased. 
+ """ + disk._dd_write(device, + b'\x00' * len(BEING_REPLACED_HEADER)) + + def zap_lv(self, device: Device) -> None: """ Device examples: vg-name/lv-name, /dev/vg-name/lv-name Requirements: Must be a logical volume (LV) """ lv: api.Volume = device.lv_api self.unmount_lv(lv) - + self.parent_device: str = disk.get_parent_device_from_mapper(lv.lv_path) zap_device(device.path) if self.args.destroy: lvs = api.get_lvs(filters={'vg_name': device.vg_name}) - if lvs == []: - mlogger.info('No LVs left, exiting', device.vg_name) - return - elif len(lvs) <= 1: + if len(lvs) <= 1: mlogger.info('Only 1 LV left in VG, will proceed to destroy ' 'volume group %s', device.vg_name) pvs = api.get_pvs(filters={'lv_uuid': lv.lv_uuid}) api.remove_vg(device.vg_name) for pv in pvs: api.remove_pv(pv.pv_name) + replacement_args: Dict[str, bool] = { + 'block': self.args.replace_block, + 'db': self.args.replace_db, + 'wal': self.args.replace_wal + } + if replacement_args.get(lv.tags.get('ceph.type'), False): + mlogger.info(f'Marking {self.parent_device} as being replaced') + self._write_replacement_header(self.parent_device) else: mlogger.info('More than 1 LV left in VG, will proceed to ' 'destroy LV only') mlogger.info('Removing LV because --destroy was given: %s', device.path) + if self.args.replace_block: + mlogger.info(f'--replace-block passed but the device still has {str(len(lvs))} LV(s)') api.remove_lv(device.path) elif lv: # just remove all lvm metadata, leaving the LV around lv.clear_tags() - def zap_partition(self, device): + def zap_partition(self, device: Device) -> None: """ Device example: /dev/sda1 Requirements: Must be a partition @@ -246,7 +302,7 @@ def zap_partition(self, device): mlogger.info("Destroying partition since --destroy was used: %s" % device.path) disk.remove_partition(device) - def zap_lvm_member(self, device): + def zap_lvm_member(self, device: Device) -> None: """ An LVM member may have more than one LV and or VG, for example if it is a raw device with multiple partitions each belonging to a different LV @@ -266,7 +322,7 @@ def zap_lvm_member(self, device): - def zap_raw_device(self, device): + def zap_raw_device(self, device: Device) -> None: """ Any whole (raw) device passed in as input will be processed here, checking for LVM membership and partitions (if any). @@ -286,10 +342,19 @@ def zap_raw_device(self, device): self.zap_partition(Device('/dev/%s' % part_name)) zap_device(device.path) + # TODO(guits): I leave this commented out, this should be part of a separate patch in order to + # support device replacement with raw-based OSDs + # if self.args.replace_block: + # disk._dd_write(device.path, 'CEPH_DEVICE_BEING_REPLACED') @decorators.needs_root - def zap(self, devices=None): - devices = devices or self.args.devices + def zap(self) -> None: + """Zap a device. + + Raises: + SystemExit: When the device is a mapper and not a mpath device. 
+ """ + devices = self.args.devices for device in devices: mlogger.info("Zapping: %s", device.path) @@ -316,21 +381,21 @@ def zap(self, devices=None): ) @decorators.needs_root - def zap_osd(self): + def zap_osd(self) -> None: if self.args.osd_id and not self.args.no_systemd: osd_is_running = systemctl.osd_is_active(self.args.osd_id) if osd_is_running: mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id) mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id) raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id) - devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid) - self.zap(devices) + self.args.devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid) + self.zap() - def dmcrypt_close(self, dmcrypt_uuid): + def dmcrypt_close(self, dmcrypt_uuid: str) -> None: mlogger.info("Closing encrypted volume %s", dmcrypt_uuid) encryption.dmcrypt_close(mapping=dmcrypt_uuid, skip_path_check=True) - def main(self): + def main(self) -> None: sub_command_help = dedent(""" Zaps the given logical volume(s), raw device(s) or partition(s) for reuse by ceph-volume. If given a path to a logical volume it must be in the format of vg/lv. Any @@ -418,12 +483,56 @@ def main(self): help='Skip systemd unit checks', ) + parser.add_argument( + '--replace-block', + dest='replace_block', + action='store_true', + help='Mark the block device as unavailable.' + ) + + parser.add_argument( + '--replace-db', + dest='replace_db', + action='store_true', + help='Mark the db device as unavailable.' + ) + + parser.add_argument( + '--replace-wal', + dest='replace_wal', + action='store_true', + help='Mark the wal device as unavailable.' + ) + + parser.add_argument( + '--clear-replace-header', + dest='clear_replace_header', + type=arg_validators.ValidClearReplaceHeaderDevice(), + help='clear the replace header on devices.' 
+ ) + if len(self.argv) == 0: print(sub_command_help) return self.args = parser.parse_args(self.argv) + if self.args.clear_replace_header: + rc: bool = False + try: + rc = self.clear_replace_header() + except Exception as e: + raise SystemExit(e) + if rc: + mlogger.info(f'Replacement header cleared on {self.args.clear_replace_header}') + else: + mlogger.info(f'No replacement header detected on {self.args.clear_replace_header}, nothing to do.') + raise SystemExit(not rc) + + if self.args.replace_block or self.args.replace_db or self.args.replace_wal: + self.args.destroy = True + mlogger.info('--replace-block|db|wal passed, enforcing --destroy.') + if self.args.osd_id or self.args.osd_fsid: self.zap_osd() else: diff --git a/src/ceph-volume/ceph_volume/tests/conftest.py b/src/ceph-volume/ceph_volume/tests/conftest.py index ee58081d97da1..e6bf31737b69c 100644 --- a/src/ceph-volume/ceph_volume/tests/conftest.py +++ b/src/ceph-volume/ceph_volume/tests/conftest.py @@ -360,7 +360,7 @@ def apply(devices=None, lsblk=None, lv=None, blkid=None, udevadm=None, has_bluestore_label=False): if devices: for dev in devices.keys(): - devices[dev]['device_nodes'] = os.path.basename(dev) + devices[dev]['device_nodes'] = [os.path.basename(dev)] else: devices = {} lsblk = lsblk if lsblk else {} diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py index d630a7a6bf887..efe52c053ffc3 100644 --- a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py +++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py @@ -7,11 +7,30 @@ from ceph_volume.devices.lvm import zap -class TestZap(object): - def test_invalid_osd_id_passed(self): +class TestZap: + def test_invalid_osd_id_passed(self) -> None: with pytest.raises(SystemExit): zap.Zap(argv=['--osd-id', 'foo']).main() + @patch('ceph_volume.util.disk._dd_write', Mock()) + @patch('ceph_volume.util.arg_validators.Device') + def test_clear_replace_header_is_being_replaced(self, m_device: Mock) -> None: + m_dev = m_device.return_value + m_dev.is_being_replaced = True + with pytest.raises(SystemExit) as e: + zap.Zap(argv=['--clear', '/dev/foo']).main() + assert e.value.code == 0 + + @patch('ceph_volume.util.disk._dd_write', Mock()) + @patch('ceph_volume.util.arg_validators.Device') + def test_clear_replace_header_is_not_being_replaced(self, m_device: Mock) -> None: + m_dev = m_device.return_value + m_dev.is_being_replaced = False + with pytest.raises(SystemExit) as e: + zap.Zap(argv=['--clear', '/dev/foo']).main() + assert e.value.code == 1 + + class TestFindAssociatedDevices(object): def test_no_lvs_found_that_match_id(self, monkeypatch, device_info): diff --git a/src/ceph-volume/ceph_volume/tests/test_inventory.py b/src/ceph-volume/ceph_volume/tests/test_inventory.py index 785d8b56e86b6..832c083664212 100644 --- a/src/ceph-volume/ceph_volume/tests/test_inventory.py +++ b/src/ceph-volume/ceph_volume/tests/test_inventory.py @@ -126,6 +126,7 @@ class TestInventory(object): 'lvs', 'device_id', 'lsm_data', + 'being_replaced' ] expected_sys_api_keys = [ diff --git a/src/ceph-volume/ceph_volume/util/arg_validators.py b/src/ceph-volume/ceph_volume/util/arg_validators.py index 99e7d039e742b..e75b34e550e3c 100644 --- a/src/ceph-volume/ceph_volume/util/arg_validators.py +++ b/src/ceph-volume/ceph_volume/util/arg_validators.py @@ -7,6 +7,9 @@ from ceph_volume.util.encryption import set_dmcrypt_no_workqueue +mlogger = terminal.MultiLogger(__name__) + + def valid_osd_id(val): return str(int(val)) @@ -70,6 
+73,17 @@ def _is_valid_device(self, raise_sys_exit=True): return self._device +class ValidClearReplaceHeaderDevice(ValidDevice): + def __call__(self, dev_path: str) -> str: + super().get_device(dev_path) + return self._format_device(self._is_valid_device()) + + def _is_valid_device(self) -> Device: + if not self._device.is_being_replaced: + mlogger.info(f'{self.dev_path} has no replacement header.') + return self._device + + class ValidDataDevice(ValidDevice): def __call__(self, dev_path): super().get_device(dev_path) diff --git a/src/ceph-volume/ceph_volume/util/device.py b/src/ceph-volume/ceph_volume/util/device.py index 9c2c11e7f316f..82ee3266e3f1f 100644 --- a/src/ceph-volume/ceph_volume/util/device.py +++ b/src/ceph-volume/ceph_volume/util/device.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- - +# type: ignore import logging import os from functools import total_ordering -from ceph_volume import sys_info, allow_loop_devices +from ceph_volume import sys_info, allow_loop_devices, BEING_REPLACED_HEADER from ceph_volume.api import lvm from ceph_volume.util import disk, system from ceph_volume.util.lsmdisk import LSMDisk from ceph_volume.util.constants import ceph_disk_guids +from typing import List, Tuple logger = logging.getLogger(__name__) @@ -92,6 +93,7 @@ class Device(object): 'sys_api', 'device_id', 'lsm_data', + 'being_replaced' ] pretty_report_sys_fields = [ 'actuators', @@ -136,6 +138,7 @@ def __init__(self, path, with_lsm=False, lvs=None, lsblk_all=None, all_devices_v self._exists = None self._is_lvm_member = None self.ceph_device = False + self.being_replaced: bool = self.is_being_replaced self._parse() if self.path in sys_info.devices.keys(): self.device_nodes = sys_info.devices[self.path]['device_nodes'] @@ -298,7 +301,7 @@ def report(self): rot=self.rotational, available=self.available, model=self.model, - device_nodes=self.device_nodes + device_nodes=','.join(self.device_nodes) ) def json_report(self): @@ -590,7 +593,7 @@ def vg_free(self): return [vg_free] @property - def has_partitions(self): + def has_partitions(self) -> bool: ''' Boolean to determine if a given device has partitions. ''' @@ -598,7 +601,14 @@ def has_partitions(self): return True return False - def _check_generic_reject_reasons(self): + @property + def is_being_replaced(self) -> bool: + ''' + Boolean to indicate if the device is being replaced. 
+ ''' + return disk._dd_read(self.path, 26) == BEING_REPLACED_HEADER + + def _check_generic_reject_reasons(self) -> List[str]: reasons = [ ('id_bus', 'usb', 'id_bus'), ('ro', '1', 'read-only'), @@ -639,9 +649,11 @@ def _check_generic_reject_reasons(self): rejected.append('Has partitions') if self.has_fs: rejected.append('Has a FileSystem') + if self.is_being_replaced: + rejected.append('Is being replaced') return rejected - def _check_lvm_reject_reasons(self): + def _check_lvm_reject_reasons(self) -> Tuple[bool, List[str]]: rejected = [] if self.vgs: available_vgs = [vg for vg in self.vgs if int(vg.vg_free_count) > 10] @@ -654,7 +666,7 @@ def _check_lvm_reject_reasons(self): return len(rejected) == 0, rejected - def _check_raw_reject_reasons(self): + def _check_raw_reject_reasons(self) -> Tuple[bool, List[str]]: rejected = self._check_generic_reject_reasons() if len(self.vgs) > 0: rejected.append('LVM detected') diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py index 3ac51c11e3469..30ee56808c762 100644 --- a/src/ceph-volume/ceph_volume/util/disk.py +++ b/src/ceph-volume/ceph_volume/util/disk.py @@ -7,7 +7,7 @@ from ceph_volume import process, allow_loop_devices from ceph_volume.api import lvm from ceph_volume.util.system import get_file_contents -from typing import Dict, List, Any +from typing import Dict, List, Any, Union logger = logging.getLogger(__name__) @@ -857,13 +857,14 @@ def get_devices(_sys_block_path='/sys/block', device=''): device_slaves = os.listdir(os.path.join(sysdir, 'slaves')) metadata['partitions'] = get_partitions_facts(sysdir) + metadata['device_nodes'] = [] if device_slaves: - metadata['device_nodes'] = ','.join(device_slaves) + metadata['device_nodes'].extend(device_slaves) else: if block[2] == 'part': - metadata['device_nodes'] = block[3] + metadata['device_nodes'].append(block[3]) else: - metadata['device_nodes'] = devname + metadata['device_nodes'].append(devname) metadata['actuators'] = None if os.path.isdir(sysdir + "/queue/independent_access_ranges/"): @@ -979,7 +980,7 @@ def _dd_read(device: str, count: int, skip: int = 0) -> str: return result -def _dd_write(device: str, data: str, skip: int = 0) -> None: +def _dd_write(device: str, data: Union[str, bytes], skip: int = 0) -> None: """Write bytes to a device Args: @@ -991,10 +992,14 @@ def _dd_write(device: str, data: str, skip: int = 0) -> None: OSError: If there is an error opening or writing to the device. Exception: If any other error occurs during the write operation. """ + + if isinstance(data, str): + data = data.encode('utf-8') + try: with open(device, 'r+b') as b: b.seek(skip) - b.write(data.encode('utf-8')) + b.write(data) except OSError: logger.warning(f"Can't write to {device}") raise diff --git a/src/pybind/mgr/cephadm/ceph_volume.py b/src/pybind/mgr/cephadm/ceph_volume.py new file mode 100644 index 0000000000000..a270bb7028f46 --- /dev/null +++ b/src/pybind/mgr/cephadm/ceph_volume.py @@ -0,0 +1,430 @@ +from cephadm.serve import CephadmServe +from typing import List, TYPE_CHECKING, Any, Dict, Set, Tuple +if TYPE_CHECKING: + from cephadm import CephadmOrchestrator + + +class CephVolume: + def __init__(self, mgr: "CephadmOrchestrator", _inheritance: bool = False) -> None: + self.mgr: "CephadmOrchestrator" = mgr + if not _inheritance: + self.lvm_list: "CephVolumeLvmList" = CephVolumeLvmList(mgr) + + def run_json(self, hostname: str, command: List[str]) -> Dict[str, Any]: + """Execute a JSON command on the specified hostname and return the result. 
+
+        This method wraps the asynchronous execution of a JSON command on the
+        specified hostname, waiting for the command to complete. It utilizes the
+        `_run_json` method to perform the actual execution.
+
+        Args:
+            hostname (str): The hostname of the target node where the JSON command
+                            will be executed.
+            command (List[str]): A list of command arguments to be passed to the
+                                 JSON command.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the JSON response from the
+                            executed command, which may include various data
+                            based on the command executed.
+        """
+        return self.mgr.wait_async(self._run_json(hostname, command))
+
+    def run(self, hostname: str, command: List[str], **kw: Any) -> Tuple[List[str], List[str], int]:
+        """Execute a command on the specified hostname and return the result.
+
+        This method wraps the asynchronous execution of a command on the
+        specified hostname, waiting for the command to complete. It utilizes the
+        `_run` method to perform the actual execution.
+
+        Args:
+            hostname (str): The hostname of the target node where the command
+                            will be executed.
+            command (List[str]): A list of command arguments to be passed to the
+                                 command.
+            **kw (Any): Additional keyword arguments to customize the command
+                        execution.
+
+        Returns:
+            Tuple[List[str], List[str], int]: A tuple containing:
+                - A list of strings representing the standard output of the command.
+                - A list of strings representing the standard error output of the command.
+                - An integer representing the return code of the command execution.
+        """
+        return self.mgr.wait_async(self._run(hostname, command, **kw))
+
+    async def _run(self,
+                   hostname: str,
+                   command: List[str],
+                   **kw: Any) -> Tuple[List[str], List[str], int]:
+        """Execute a ceph-volume command on the specified hostname and return the result.
+
+        This asynchronous method constructs a ceph-volume command and then executes
+        it on the specified host. The raw standard output, standard error and
+        return code are returned unmodified.
+
+        Args:
+            hostname (str): The hostname of the target node where the command will be executed.
+            command (List[str]): A list of command arguments to be passed to the Ceph command.
+            **kw (Any): Additional keyword arguments to customize the command execution.
+
+        Returns:
+            Tuple[List[str], List[str], int]: A tuple containing:
+                - A list of strings representing the standard output of the command.
+                - A list of strings representing the standard error output of the command.
+                - An integer representing the return code of the command execution.
+        """
+        cmd: List[str] = ['--']
+        cmd.extend(command)
+        result = await CephadmServe(self.mgr)._run_cephadm(
+            hostname, 'osd', 'ceph-volume',
+            cmd,
+            **kw)
+        return result
+
+    async def _run_json(self,
+                        hostname: str,
+                        command: List[str]) -> Dict[str, Any]:
+        """Execute a ceph-volume command on a specified hostname.
+
+        This asynchronous method constructs a ceph-volume command and then executes
+        it on the specified host.
+        The result of the command is returned in JSON format.
+
+        Args:
+            hostname (str): The hostname of the target node where the command will be executed.
+            command (List[str]): A list of command arguments to be passed to the Ceph command.
+
+        Returns:
+            Dict[str, Any]: The result of the command execution as a dictionary parsed from
+                            the JSON output.
+ """ + cmd: List[str] = ['--'] + cmd.extend(command) + result = await CephadmServe(self.mgr)._run_cephadm_json( + hostname, 'osd', 'ceph-volume', + cmd) + return result + + def clear_replace_header(self, hostname: str, device: str) -> str: + """Clear the replacement header on a specified device for a given hostname. + + This method checks if a replacement header exists on the specified device + and clears it if found. After clearing, it invalidates the cached device + information for the specified hostname and kicks the serve loop. + + Args: + hostname (str): The hostname of the device on which the replacement header + will be cleared. This is used to identify the specific + device within the manager's context. + device (str): The path to the device (e.g., '/dev/sda') from which the + replacement header will be cleared. + + Returns: + str: A message indicating the result of the operation. It will either confirm + that the replacement header was cleared or state that no replacement header + was detected on the device. + """ + output: str = '' + result = self.run(hostname, ['lvm', + 'zap', + '--clear-replace-header', + device], + error_ok=True) + out, err, rc = result + if not rc: + output = f'Replacement header cleared on {device}' + self.mgr.cache.invalidate_host_devices(hostname) + self.mgr._kick_serve_loop() + else: + plain_out: str = '\n'.join(out) + plain_err: str = '\n'.join(err) + output = f'No replacement header could be cleared on {device}.\n{plain_out}\n{plain_err}' + return output + + +class CephVolumeLvmList(CephVolume): + def __init__(self, mgr: "CephadmOrchestrator") -> None: + super().__init__(mgr, True) + self.data: Dict[str, Any] = {} + + def get_data(self, hostname: str) -> None: + """Execute the `ceph-volume lvm list` command to list LVM-based OSDs. + + This asynchronous method interacts with the Ceph manager to retrieve + information about the Logical Volume Manager (LVM) devices associated + with the OSDs. It calls the `ceph-volume lvm list` command in JSON format + to gather relevant data. + + Returns: + None: This method does not return a value. The retrieved data is + stored in the `self.data` attribute for further processing. + """ + self.data = self.run_json(hostname, + ['lvm', 'list', '--format', 'json']) + + def devices_by_type(self, device_type: str) -> List[str]: + """Retrieve a list of devices of a specified type across all OSDs. + + This method iterates through all OSDs and collects devices that match + the specified type (e.g., 'block', 'db', 'wal'). The resulting list + contains unique device paths. + + Args: + device_type (str): The type of devices to retrieve. This should + be one of the recognized device types such as + 'block', 'db', or 'wal'. + + Returns: + List[str]: A list of unique device paths of the specified type + found across all OSDs. If no devices of the specified + type are found, an empty list is returned. + """ + result: Set[str] = set() + for osd in self.osd_ids(): + for lv in self.data.get(osd, []): + if lv.get('type') == device_type: + result.update(lv.get('devices', [])) + return list(result) + + def block_devices(self) -> List[str]: + """List all block devices used by OSDs. + + This method returns a list of devices that are used as 'block' devices + for storing the main OSD data. + + Returns: + List[str]: A list of device paths (strings) that are used as 'block' devices. + """ + return self.devices_by_type('block') + + def db_devices(self) -> List[str]: + """List all database (DB) devices used by OSDs. 
+
+        This method returns a list of devices that are used as 'db' devices
+        for storing the database files associated with OSDs.
+
+        Returns:
+            List[str]: A list of device paths (strings) that are used as 'db' devices.
+        """
+        return self.devices_by_type('db')
+
+    def wal_devices(self) -> List[str]:
+        """List all write-ahead log (WAL) devices used by OSDs.
+
+        This method returns a list of devices that are used as 'wal' devices
+        for storing write-ahead log data associated with OSDs.
+
+        Returns:
+            List[str]: A list of device paths (strings) that are used as 'wal' devices.
+        """
+        return self.devices_by_type('wal')
+
+    def all_devices(self) -> List[str]:
+        """List all devices used by OSDs for 'block', 'db', or 'wal' purposes.
+
+        This method aggregates all devices that are currently used by the OSDs
+        in the system for the following device types:
+        - 'block' devices: Used to store the OSD's data.
+        - 'db' devices: Used for database purposes.
+        - 'wal' devices: Used for Write-Ahead Logging.
+
+        The returned list combines devices from all these categories.
+
+        Returns:
+            List[str]: A list of device paths (strings) that are used as 'block', 'db', or 'wal' devices.
+        """
+        return self.block_devices() + self.db_devices() + self.wal_devices()
+
+    def device_osd_mapping(self, device_type: str = '') -> Dict[str, Dict[str, List[str]]]:
+        """Create a mapping of devices to their corresponding OSD IDs based on device type.
+
+        This method serves as a 'proxy' function, designed to be called by the *_device_osd_mapping() methods.
+
+        This method iterates over the OSDs and their logical volumes to build a
+        dictionary that maps each device of the specified type to the list of
+        OSD IDs that use it. The resulting dictionary can be used to determine
+        which OSDs share a specific device.
+
+        Args:
+            device_type (str): The type of the device to filter by (e.g., 'block', 'db', or 'wal').
+                               If an empty string is provided, devices of all types will be included.
+
+        Returns:
+            Dict[str, Dict[str, List[str]]]: A dictionary where the keys are device
+                names and the values are dictionaries containing a list of OSD IDs
+                that use the corresponding device.
+
+        eg:
+        ```
+        {
+            '/dev/vda': {'osd_ids': ['0', '1']},
+            '/dev/vdb': {'osd_ids': ['2']}
+        }
+        ```
+
+        """
+        result: Dict[str, Dict[str, List[str]]] = {}
+        for osd in self.osd_ids():
+            for lv in self.data.get(osd, []):
+                if lv.get('type') == device_type or not device_type:
+                    for device in lv.get('devices', []):
+                        if device not in result:
+                            result[device] = {'osd_ids': []}
+                        result[device]['osd_ids'].append(osd)
+        return result
+
+    def block_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+        """Get a dictionary mapping all block devices to their corresponding
+        OSD id(s).
+
+        eg:
+        ```
+        {'/dev/vdb': {'osd_ids': ['0']},
+         '/dev/vdc': {'osd_ids': ['1']},
+         '/dev/vdf': {'osd_ids': ['2']},
+         '/dev/vde': {'osd_ids': ['3', '4']}}
+        ```
+
+        Returns:
+            Dict[str, Dict[str, List[str]]]: A dict including all block devices with their corresponding
+            osd id(s).
+        """
+        return self.device_osd_mapping('block')
+
+    def db_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+        """Get a dictionary mapping all db devices to their corresponding
+        OSD id(s).
+
+        eg:
+        ```
+        {'/dev/vdv': {'osd_ids': ['0', '1', '2', '3']},
+         '/dev/vdx': {'osd_ids': ['4']}}
+        ```
+
+        Returns:
+            Dict[str, Dict[str, List[str]]]: A dict including all db devices with their corresponding
+            osd id(s).
+        """
+        return self.device_osd_mapping('db')
+
+    def wal_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+        """Get a dictionary mapping all wal devices to their corresponding
+        OSD id(s).
+
+        eg:
+        ```
+        {'/dev/vdy': {'osd_ids': ['0', '1', '2', '3']},
+         '/dev/vdz': {'osd_ids': ['4']}}
+        ```
+
+        Returns:
+            Dict[str, Dict[str, List[str]]]: A dict including all wal devices with their corresponding
+            osd id(s).
+        """
+        return self.device_osd_mapping('wal')
+
+    def is_shared_device(self, device: str) -> bool:
+        """Determine if a device is shared between multiple OSDs.
+
+        This method checks whether a given device is shared by multiple OSDs,
+        regardless of its type ('block', 'db' or 'wal'). If the device is
+        associated with more than one OSD, it is considered shared.
+
+        Args:
+            device (str): The device path to check (e.g., '/dev/sda').
+
+        Raises:
+            RuntimeError: If the device is not valid or not found in the shared devices mapping.
+
+        Returns:
+            bool: True if the device is shared by more than one OSD, False otherwise.
+        """
+        device_osd_mapping = self.device_osd_mapping()
+        if not device or device not in device_osd_mapping:
+            raise RuntimeError('Not a valid device path.')
+        return len(device_osd_mapping[device]['osd_ids']) > 1
+
+    def is_block_device(self, device: str) -> bool:
+        """Check if a specified device is a block device.
+
+        This method checks if the specified device is included in the
+        list of block devices used by OSDs.
+
+        Args:
+            device (str): The path of the device to check.
+
+        Returns:
+            bool: True if the device is a block device,
+                  False otherwise.
+        """
+        return device in self.block_devices()
+
+    def is_db_device(self, device: str) -> bool:
+        """Check if a specified device is a DB device.
+
+        This method checks if the specified device is included in the
+        list of DB devices used by OSDs.
+
+        Args:
+            device (str): The path of the device to check.
+
+        Returns:
+            bool: True if the device is a DB device,
+                  False otherwise.
+        """
+        return device in self.db_devices()
+
+    def is_wal_device(self, device: str) -> bool:
+        """Check if a specified device is a WAL device.
+
+        This method checks if the specified device is included in the
+        list of WAL devices used by OSDs.
+
+        Args:
+            device (str): The path of the device to check.
+
+        Returns:
+            bool: True if the device is a WAL device,
+                  False otherwise.
+        """
+        return device in self.wal_devices()
+
+    def get_block_devices_from_osd_id(self, osd_id: str) -> List[str]:
+        """Retrieve the list of block devices associated with a given OSD ID.
+
+        This method looks up the specified OSD ID in the `data` attribute
+        and returns a list of devices that are of type 'block'. If there are
+        no devices of type 'block' for the specified OSD ID, an empty list is returned.
+
+        Args:
+            osd_id (str): The OSD ID for which to retrieve block devices.
+
+        Returns:
+            List[str]: A list of block device paths associated with the
+                       specified OSD ID. If no block devices are found,
+                       an empty list is returned.
+        """
+        result: List[str] = []
+        for lv in self.data.get(osd_id, []):
+            if lv.get('type') == 'block':
+                result = lv.get('devices', [])
+        return result
+
+    def osd_ids(self) -> List[str]:
+        """Retrieve the list of OSD IDs.
+
+        This method returns a list of OSD IDs by extracting the keys
+        from the `data` attribute, which is expected to contain
+        information about OSDs. If there is no data available, an
+        empty list is returned.
+ + Returns: + List[str]: A list of OSD IDs. If no data is present, + an empty list is returned. + """ + result: List[str] = [] + if self.data: + result = list(self.data.keys()) + return result diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index f8f0efc9d2831..dc43b48726379 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -101,6 +101,7 @@ from .configchecks import CephadmConfigChecks from .offline_watcher import OfflineHostWatcher from .tuned_profiles import TunedProfileUtils +from .ceph_volume import CephVolume try: import asyncssh @@ -792,6 +793,8 @@ def __init__(self, *args: Any, **kwargs: Any): # as part of the handling of stray daemons self.recently_altered_daemons: Dict[str, datetime.datetime] = {} + self.ceph_volume: CephVolume = CephVolume(self) + def shutdown(self) -> None: self.log.debug('shutdown') self._worker_pool.close() @@ -3828,9 +3831,56 @@ def upgrade_resume(self) -> str: def upgrade_stop(self) -> str: return self.upgrade.upgrade_stop() + @handle_orch_error + def replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> Any: + output: str = '' + + self.ceph_volume.lvm_list.get_data(hostname=hostname) + + if clear: + output = self.ceph_volume.clear_replace_header(hostname, device) + else: + osds_to_zap: List[str] = [] + if hostname not in list(self.inventory.keys()): + raise OrchestratorError(f'{hostname} invalid host.') + + if device not in self.ceph_volume.lvm_list.all_devices(): + raise OrchestratorError(f"{device} doesn't appear to be used for an OSD, not a valid device in {hostname}.") + + device_osd_mapping = self.ceph_volume.lvm_list.device_osd_mapping() + osds_to_zap = device_osd_mapping[device]['osd_ids'] + + if self.ceph_volume.lvm_list.is_shared_device(device): + if not yes_i_really_mean_it: + raise OrchestratorError(f'{device} is a shared device.\n' + f'Replacing {device} implies destroying OSD(s): {osds_to_zap}.\n' + 'Please, *be very careful*, this can be a very dangerous operation.\n' + 'If you know what you are doing, pass --yes-i-really-mean-it') + if not self.to_remove_osds.rm_util.safe_to_destroy([int(osd_id) for osd_id in osds_to_zap]): + raise OrchestratorError(f"Destroying OSD(s) {osds_to_zap} would cause some PGs to be undersized/degraded.\n" + 'Refusing to proceed.') + replace_block: bool = self.ceph_volume.lvm_list.is_block_device(device) + replace_db: bool = self.ceph_volume.lvm_list.is_db_device(device) + replace_wal: bool = self.ceph_volume.lvm_list.is_wal_device(device) + + self.remove_osds(list(osds_to_zap), + replace_block=replace_block, + replace_db=replace_db, + replace_wal=replace_wal) + + output = f'Scheduled to destroy osds: {osds_to_zap} and mark {device} as being replaced.' 
+ return output + @handle_orch_error def remove_osds(self, osd_ids: List[str], replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, zap: bool = False, no_destroy: bool = False) -> str: @@ -3853,6 +3903,9 @@ def remove_osds(self, osd_ids: List[str], try: self.to_remove_osds.enqueue(OSD(osd_id=int(daemon.daemon_id), replace=replace, + replace_block=replace_block, + replace_db=replace_db, + replace_wal=replace_wal, force=force, zap=zap, no_destroy=no_destroy, diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 611c27c34538a..4a7959ae04502 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -96,7 +96,10 @@ def serve(self) -> None: if not self.mgr.paused: self._run_async_actions() - self.mgr.to_remove_osds.process_removal_queue() + removal_queue_result = self.mgr.to_remove_osds.process_removal_queue() + self.log.debug(f'process_removal_queue() returned = {removal_queue_result}') + if removal_queue_result: + continue self.mgr.migration.migrate() if self.mgr.migration.is_migration_ongoing(): diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index 9b09b8c9f4925..80bf92772c49b 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -551,6 +551,12 @@ def zap_osd(self, osd: "OSD") -> str: "Zaps all devices that are associated with an OSD" if osd.hostname is not None: cmd = ['--', 'lvm', 'zap', '--osd-id', str(osd.osd_id)] + if osd.replace_block: + cmd.append('--replace-block') + if osd.replace_db: + cmd.append('--replace-db') + if osd.replace_wal: + cmd.append('--replace-wal') if not osd.no_destroy: cmd.append('--destroy') with self.mgr.async_timeout_handler(osd.hostname, f'cephadm ceph-volume {" ".join(cmd)}'): @@ -618,6 +624,9 @@ def __init__(self, started: bool = False, stopped: bool = False, replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, hostname: Optional[str] = None, zap: bool = False, @@ -649,6 +658,12 @@ def __init__(self, # If this is a replace or remove operation self.replace = replace + # If this is a block device replacement + self.replace_block = replace_block + # If this is a db device replacement + self.replace_db = replace_db + # If this is a wal device replacement + self.replace_wal = replace_wal # If we wait for the osd to be drained self.force = force # The name of the node @@ -676,7 +691,7 @@ def start_draining(self) -> bool: if self.stopped: logger.debug(f"Won't start draining {self}. 
OSD draining is stopped.") return False - if self.replace: + if self.any_replace_params: self.rm_util.set_osd_flag([self], 'out') else: self.rm_util.reweight_osd(self, 0.0) @@ -686,7 +701,7 @@ def start_draining(self) -> bool: return True def stop_draining(self) -> bool: - if self.replace: + if self.any_replace_params: self.rm_util.set_osd_flag([self], 'in') else: if self.original_weight: @@ -764,6 +779,9 @@ def to_json(self) -> dict: out['draining'] = self.draining out['stopped'] = self.stopped out['replace'] = self.replace + out['replace_block'] = self.replace_block + out['replace_db'] = self.replace_db + out['replace_wal'] = self.replace_wal out['force'] = self.force out['zap'] = self.zap out['hostname'] = self.hostname # type: ignore @@ -789,6 +807,13 @@ def from_json(cls, inp: Optional[Dict[str, Any]], rm_util: RemoveUtil) -> Option inp['hostname'] = hostname return cls(**inp) + @property + def any_replace_params(self) -> bool: + return any([self.replace, + self.replace_block, + self.replace_db, + self.replace_wal]) + def __hash__(self) -> int: return hash(self.osd_id) @@ -812,7 +837,7 @@ def __init__(self, mgr: "CephadmOrchestrator") -> None: # network calls, like mon commands. self.lock = Lock() - def process_removal_queue(self) -> None: + def process_removal_queue(self) -> bool: """ Performs actions in the _serve() loop to remove an OSD when criteria is met. @@ -820,6 +845,8 @@ def process_removal_queue(self) -> None: we can't hold self.lock, as we're calling _remove_daemon in the loop """ + result: bool = False + # make sure that we don't run on OSDs that are not in the cluster anymore. self.cleanup() @@ -863,16 +890,23 @@ def process_removal_queue(self) -> None: if self.mgr.cache.has_daemon(f'osd.{osd.osd_id}'): CephadmServe(self.mgr)._remove_daemon(f'osd.{osd.osd_id}', osd.hostname) logger.info(f"Successfully removed {osd} on {osd.hostname}") + result = True else: logger.info(f"Daemon {osd} on {osd.hostname} was already removed") - if osd.replace: + any_replace_params: bool = any([osd.replace, + osd.replace_block, + osd.replace_db, + osd.replace_wal]) + if any_replace_params: # mark destroyed in osdmap if not osd.destroy(): raise orchestrator.OrchestratorError( f"Could not destroy {osd}") logger.info( f"Successfully destroyed old {osd} on {osd.hostname}; ready for replacement") + if any_replace_params: + osd.zap = True else: # purge from osdmap if not osd.purge(): @@ -884,7 +918,7 @@ def process_removal_queue(self) -> None: logger.info(f"Zapping devices for {osd} on {osd.hostname}") osd.do_zap() logger.info(f"Successfully zapped devices for {osd} on {osd.hostname}") - + self.mgr.cache.invalidate_host_devices(osd.hostname) logger.debug(f"Removing {osd} from the queue.") # self could change while this is processing (osds get added from the CLI) @@ -893,6 +927,7 @@ def process_removal_queue(self) -> None: with self.lock: self.osds.intersection_update(new_queue) self._save_to_store() + return result def cleanup(self) -> None: # OSDs can always be cleaned up manually. 
This ensures that we run on existing OSDs diff --git a/src/pybind/mgr/cephadm/tests/ceph_volume_data.py b/src/pybind/mgr/cephadm/tests/ceph_volume_data.py new file mode 100644 index 0000000000000..afd6d89d39e40 --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/ceph_volume_data.py @@ -0,0 +1 @@ +data = '{"0":[{"devices":["/dev/vdb"],"lv_name":"osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","lv_path":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.block_uuid=d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c,ceph.db_uuid=EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX,ceph.encrypted=0,ceph.osd_fsid=8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.osd_id=0,ceph.osdspec_affinity=osd.shared_db,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","name":"osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","path":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","tags":{"ceph.block_device":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.block_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","ceph.db_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","ceph.encrypted":"0","ceph.osd_fsid":"8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.osd_id":"0","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668"},{"devices":["/dev/vdk"],"lv_name":"osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","lv_path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.block_uuid=d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c,ceph.db_uuid=EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX,ceph.encrypted=0,ceph.osd_fsid=8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.osd_id=0,ceph.osdspec_affinity=osd.shared_db,ceph.type=db,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","name":"osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","tags":{"ceph.block_device":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.block_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f
0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","ceph.db_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","ceph.encrypted":"0","ceph.osd_fsid":"8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.osd_id":"0","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"db","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf"}],"1":[{"devices":["/dev/vdc"],"lv_name":"osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","lv_path":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.block_uuid=Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774,ceph.db_uuid=1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC,ceph.encrypted=0,ceph.osd_fsid=aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.osd_id=1,ceph.osdspec_affinity=osd.shared_db,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","name":"osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","path":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","tags":{"ceph.block_device":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.block_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","ceph.db_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","ceph.encrypted":"0","ceph.osd_fsid":"aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.osd_id":"1","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb"},{"devices":["/dev/vdk"],"lv_name":"osd-db-38f53373-7575-4c90-98ca-28f189685774","lv_path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.block_uuid=Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774,ceph.db_uuid=1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC,ceph.encrypted=0,ceph.osd_fsid=aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.osd_id=1,ceph.osdspec_affinity=osd.shared_db,ceph.type=db,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","name":"osd-db-38f53373-7575-4c90-98ca-28f189685774","path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","tags":{"ceph.block_device":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.block_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e
10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","ceph.db_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","ceph.encrypted":"0","ceph.osd_fsid":"aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.osd_id":"1","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"db","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf"}],"2":[{"devices":["/dev/vdf"],"lv_name":"osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","lv_path":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33,ceph.block_uuid=adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=a0434b49-759a-46a4-91dc-d7cc65af3a33,ceph.osd_id=2,ceph.osdspec_affinity=None,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO","name":"osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","path":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","tags":{"ceph.block_device":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","ceph.block_uuid":"adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"a0434b49-759a-46a4-91dc-d7cc65af3a33","ceph.osd_id":"2","ceph.osdspec_affinity":"None","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-3ba7a728-709b-408c-a043-9e48704b5ffb"}],"3":[{"devices":["/dev/vde"],"lv_name":"osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","lv_path":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f,ceph.block_uuid=GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=861ea81a-c24b-4c69-b4f6-e527151b132f,ceph.osd_id=3,ceph.osdspec_affinity=None,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8","name":"osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","path":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","tags":{"ceph.block_device":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","ceph.block_uuid":"GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"861ea81a-c24b-4c69-b4f6-e527151b132f","ceph.osd_id":"3","ceph.osdspec_affinity":"None","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b"}],"4":[{"devices":["/dev/vdg"],"lv_name":"osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","lv_path":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","lv_size":"214744170496","l
v_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16,ceph.db_uuid=5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=block,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","name":"osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","path":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","ceph.db_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"block","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-20acdce8-5548-4707-a38e-b8e925485bc5"},{"devices":["/dev/vdj"],"lv_name":"osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","lv_path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=wal,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","name":"osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"wal","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-254
2dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"wal","vg_name":"ceph-776f980b-152a-4e8f-99b6-bae27ed0b528"},{"devices":["/dev/vdi"],"lv_name":"osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","lv_path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16,ceph.db_uuid=5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=db,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","name":"osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","ceph.db_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"db","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452"}],"5":[{"devices":["/dev/vdj"],"lv_name":"osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","lv_path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=wal,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","name":"osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_
lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"wal","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"wal","vg_name":"ceph-776f980b-152a-4e8f-99b6-bae27ed0b528"},{"devices":["/dev/vdh"],"lv_name":"osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","lv_path":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb,ceph.db_uuid=wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=block,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","name":"osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","path":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","ceph.db_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"block","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351"},{"devices":["/dev/vdi"],"lv_name":"osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","lv_path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb,ceph.db_uuid=wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=db,ceph.vdo=0,ceph.wal_device=/dev/ceph-7
76f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","name":"osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","ceph.db_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"db","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452"}]}' diff --git a/src/pybind/mgr/cephadm/tests/conftest.py b/src/pybind/mgr/cephadm/tests/conftest.py index e8add2c7b834a..5cc2fabaf49b6 100644 --- a/src/pybind/mgr/cephadm/tests/conftest.py +++ b/src/pybind/mgr/cephadm/tests/conftest.py @@ -1,13 +1,14 @@ import pytest from cephadm.services.osd import RemoveUtil, OSD -from tests import mock - +from mock import mock from .fixtures import with_cephadm_module +from cephadm import CephadmOrchestrator +from typing import Generator @pytest.fixture() -def cephadm_module(): +def cephadm_module() -> Generator[CephadmOrchestrator, None, None]: with with_cephadm_module({}) as m: yield m diff --git a/src/pybind/mgr/cephadm/tests/fixtures.py b/src/pybind/mgr/cephadm/tests/fixtures.py index dd858c6c7dabe..dda0c6720ac6c 100644 --- a/src/pybind/mgr/cephadm/tests/fixtures.py +++ b/src/pybind/mgr/cephadm/tests/fixtures.py @@ -35,11 +35,11 @@ def get_module_option_ex(_, module, key, default=None): return None -def _run_cephadm(ret): +def _run_cephadm(ret, rc: int = 0): async def foo(s, host, entity, cmd, e, **kwargs): if cmd == 'gather-facts': return '{}', '', 0 - return [ret], '', 0 + return [ret], '', rc return foo diff --git a/src/pybind/mgr/cephadm/tests/test_ceph_volume.py b/src/pybind/mgr/cephadm/tests/test_ceph_volume.py new file mode 100644 index 0000000000000..cc1378a75753c --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_ceph_volume.py @@ -0,0 +1,231 @@ +import json +import pytest +from .ceph_volume_data import data +from cephadm.serve import CephadmServe +from cephadm import CephadmOrchestrator +from mock import patch +from .fixtures import _run_cephadm, with_host + + +class TestCephVolume: + def test_run(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('fake-output', 0)): + c = cephadm_module.ceph_volume.run('test', ['/bin/foo']) + assert c == (['fake-output'], '', 0) + + def test_run_json(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with 
with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('{"this-is-a-fake-key": "this-is-a-fake-value"}', 0)): + c = cephadm_module.ceph_volume.run_json('test', ['/bin/foo']) + assert c == {"this-is-a-fake-key": "this-is-a-fake-value"} + + def test_clear_replace_header_ok(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('fake-output', 0)): + c = cephadm_module.ceph_volume.clear_replace_header('test', '/dev/foo') + assert c == 'Replacement header cleared on /dev/foo' + + def test_clear_replace_header_nok(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('', 1)): + c = cephadm_module.ceph_volume.clear_replace_header('fake-output', '/dev/foo') + assert c.strip() == 'No replacement header could be cleared on /dev/foo.' + + +class TestCephVolumeList: + def test_get_data(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.data == json.loads(data) + + def test_devices_by_type_block(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.devices_by_type('block')) == set(['/dev/vdb', + '/dev/vdc', + '/dev/vdg', + '/dev/vde', + '/dev/vdf', + '/dev/vdh']) + + def test_devices_by_type_db(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.devices_by_type('db')) == set(['/dev/vdi', + '/dev/vdk']) + + def test_devices_by_type_wal(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.devices_by_type('wal') == ['/dev/vdj'] + + def test_block_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with 
with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.block_devices()) == set(['/dev/vdb', + '/dev/vdc', + '/dev/vdg', + '/dev/vde', + '/dev/vdf', + '/dev/vdh']) + + def test_db_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.db_devices()) == set(['/dev/vdk', + '/dev/vdi']) + + def test_wal_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.wal_devices()) == set(['/dev/vdj']) + + def test_all_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.all_devices()) == set(['/dev/vdg', + '/dev/vdj', + '/dev/vdh', + '/dev/vdi', + '/dev/vdc', + '/dev/vde', + '/dev/vdf', + '/dev/vdb', + '/dev/vdk']) + + def test_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.device_osd_mapping() == {'/dev/vdb': {'osd_ids': ['0']}, + '/dev/vdk': {'osd_ids': ['0', '1']}, + '/dev/vdc': {'osd_ids': ['1']}, + '/dev/vdf': {'osd_ids': ['2']}, + '/dev/vde': {'osd_ids': ['3']}, + '/dev/vdg': {'osd_ids': ['4']}, + '/dev/vdj': {'osd_ids': ['4', '5']}, + '/dev/vdi': {'osd_ids': ['4', '5']}, + '/dev/vdh': {'osd_ids': ['5']}} + + def test_block_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.block_device_osd_mapping() == {'/dev/vdb': {'osd_ids': ['0']}, + '/dev/vdc': {'osd_ids': ['1']}, + '/dev/vdf': {'osd_ids': ['2']}, + '/dev/vde': {'osd_ids': ['3']}, + '/dev/vdg': {'osd_ids': ['4']}, + '/dev/vdh': {'osd_ids': ['5']}} + + def test_db_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", 
_run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.db_device_osd_mapping() == {'/dev/vdk': {'osd_ids': ['0', '1']}, + '/dev/vdi': {'osd_ids': ['4', '5']}} + + def test_wal_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.wal_device_osd_mapping() == {'/dev/vdj': {'osd_ids': ['4', '5']}} + + def test_is_shared_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_shared_device('/dev/vdj') + + def test_is_shared_device_with_invalid_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + with pytest.raises(RuntimeError) as e: + assert cephadm_module.ceph_volume.lvm_list.is_shared_device('/dev/invalid-device') + assert str(e.value) == 'Not a valid device path.' 
+ + def test_is_block_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_block_device('/dev/vdb') + + def test_is_db_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_db_device('/dev/vdk') + + def test_is_wal_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_wal_device('/dev/vdj') + + def test_get_block_devices_from_osd_id(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.get_block_devices_from_osd_id('0') == ['/dev/vdb'] + + def test_osd_ids(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.osd_ids()) == set(['0', '1', '2', '3', '4', '5']) diff --git a/src/pybind/mgr/cephadm/tests/test_replace_device.py b/src/pybind/mgr/cephadm/tests/test_replace_device.py new file mode 100644 index 0000000000000..b4a2c81ad9a76 --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_replace_device.py @@ -0,0 +1,53 @@ +import pytest +from mock import patch +from .fixtures import _run_cephadm, with_host, wait +from .ceph_volume_data import data +from cephadm.serve import CephadmServe +from cephadm import CephadmOrchestrator +from orchestrator import OrchestratorError + + +class TestReplaceDevice: + def test_invalid_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError) as e: + cephadm_module.replace_device('test', '/dev/invalid-device') + assert "/dev/invalid-device doesn't appear to be used for an OSD, not a valid device in test." 
in str(e.value) + + def test_invalid_hostname(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError): + cephadm_module.replace_device('invalid-hostname', '/dev/vdb') + + def test_block_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + c = cephadm_module.replace_device('test', '/dev/vdb') + result = wait(cephadm_module, c) + assert result == "Scheduled to destroy osds: ['0'] and mark /dev/vdb as being replaced." + + def test_shared_db_device_no_ireallymeanit_flag(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError) as e: + cephadm_module.replace_device('test', '/dev/vdk') + assert "/dev/vdk is a shared device.\nReplacing /dev/vdk implies destroying OSD(s): ['0', '1'].\nPlease, *be very careful*, this can be a very dangerous operation.\nIf you know what you are doing, pass --yes-i-really-mean-it" in str(e.value) + + def test_shared_db_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + c = cephadm_module.replace_device('test', '/dev/vdk', yes_i_really_mean_it=True) + result = wait(cephadm_module, c) + assert result == "Scheduled to destroy osds: ['0', '1'] and mark /dev/vdk as being replaced." diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 82a8c13a9c11e..c05332df59a28 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -520,6 +520,15 @@ def rescan_host(self, hostname: str) -> OrchResult: """ raise NotImplementedError() + def replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> OrchResult: + """Perform all required operations in order to replace a device. + """ + raise NotImplementedError() + def get_inventory(self, host_filter: Optional['InventoryFilter'] = None, refresh: bool = False) -> OrchResult[List['InventoryHost']]: """ Returns something that was created by `ceph-volume inventory`. @@ -699,12 +708,18 @@ def preview_osdspecs(self, def remove_osds(self, osd_ids: List[str], replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, zap: bool = False, no_destroy: bool = False) -> OrchResult[str]: """ :param osd_ids: list of OSD IDs :param replace: marks the OSD as being destroyed. See :ref:`orchestrator-osd-replace` + :param replace_block: marks the corresponding block device as being replaced. 
+ :param replace_db: marks the corresponding db device as being replaced. + :param replace_wal: marks the corresponding wal device as being replaced. :param force: Forces the OSD removal process without waiting for the data to be drained first. :param zap: Zap/Erase all devices associated with the OSDs (DESTROYS DATA) :param no_destroy: Do not destroy associated VGs/LVs with the OSD. diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index be0096bb2d96e..7dd8c95af52c7 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -818,6 +818,21 @@ def _host_rescan(self, hostname: str, with_summary: bool = False) -> HandleComma return HandleCommandResult(stdout=completion.result_str()) return HandleCommandResult(stdout=completion.result_str().split('.')[0]) + @_cli_read_command('orch device replace') + def _replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> HandleCommandResult: + """Perform all required operations in order to replace a device. + """ + completion = self.replace_device(hostname=hostname, + device=device, + clear=clear, + yes_i_really_mean_it=yes_i_really_mean_it) + raise_if_exception(completion) + return HandleCommandResult(stdout=completion.result_str()) + @_cli_read_command('orch device ls') def _list_devices(self, hostname: Optional[List[str]] = None, @@ -1415,8 +1430,9 @@ def _osd_rm_start(self, zap: bool = False, no_destroy: bool = False) -> HandleCommandResult: """Remove OSD daemons""" - completion = self.remove_osds(osd_id, replace=replace, force=force, - zap=zap, no_destroy=no_destroy) + completion = self.remove_osds(osd_id, + replace=replace, + force=force, zap=zap, no_destroy=no_destroy) raise_if_exception(completion) return HandleCommandResult(stdout=completion.result_str()) diff --git a/src/python-common/ceph/deployment/drive_selection/selector.py b/src/python-common/ceph/deployment/drive_selection/selector.py index 041f1ed30446f..59ebbb6347e43 100644 --- a/src/python-common/ceph/deployment/drive_selection/selector.py +++ b/src/python-common/ceph/deployment/drive_selection/selector.py @@ -131,6 +131,10 @@ def assign_devices(self, device_filter): for disk in self.disks: logger.debug("Processing disk {}".format(disk.path)) + if disk.being_replaced: + logger.debug('Ignoring disk {} as it is being replaced.'.format(disk.path)) + continue + if not disk.available and not disk.ceph_device: logger.debug( ("Ignoring disk {}. 
" diff --git a/src/python-common/ceph/deployment/inventory.py b/src/python-common/ceph/deployment/inventory.py index a3023882108e3..e2c1a5605f9a6 100644 --- a/src/python-common/ceph/deployment/inventory.py +++ b/src/python-common/ceph/deployment/inventory.py @@ -54,7 +54,8 @@ class Device(object): 'human_readable_type', 'device_id', 'lsm_data', - 'crush_device_class' + 'crush_device_class', + 'being_replaced' ] def __init__(self, @@ -67,7 +68,8 @@ def __init__(self, lsm_data=None, # type: Optional[Dict[str, Dict[str, str]]] created=None, # type: Optional[datetime.datetime] ceph_device=None, # type: Optional[bool] - crush_device_class=None # type: Optional[str] + crush_device_class=None, # type: Optional[str] + being_replaced=None, # type: Optional[bool] ): self.path = path @@ -80,6 +82,7 @@ def __init__(self, self.created = created if created is not None else datetime_now() self.ceph_device = ceph_device self.crush_device_class = crush_device_class + self.being_replaced = being_replaced def __eq__(self, other): # type: (Any) -> bool @@ -129,7 +132,8 @@ def __repr__(self) -> str: 'lvs': self.lvs if self.lvs else 'None', 'available': str(self.available), 'ceph_device': str(self.ceph_device), - 'crush_device_class': str(self.crush_device_class) + 'crush_device_class': str(self.crush_device_class), + 'being_replaced': str(self.being_replaced) } if not self.available and self.rejected_reasons: device_desc['rejection reasons'] = self.rejected_reasons From 3c9b07eb87e67027e9988c1587c07e27ed168657 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Miguel=20Olmo=20Mart=C3=ADnez?= Date: Mon, 7 Oct 2024 16:55:51 +0200 Subject: [PATCH 144/148] exporter: New metric for report ceph daemons health MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ceph exporter provide metrics to report ceph daemons communication health using the admin socket Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2146728 https://tracker.ceph.com/issues/68428 Signed-off-by: Juan Miguel Olmo Martínez --- doc/monitoring/index.rst | 24 ++++++ src/exporter/DaemonMetricCollector.cc | 9 ++- src/exporter/DaemonMetricCollector.h | 2 +- src/test/exporter/test_exporter.cc | 110 +++++++++++++++++++++++++- 4 files changed, 141 insertions(+), 4 deletions(-) diff --git a/doc/monitoring/index.rst b/doc/monitoring/index.rst index 794fdf8419505..afccd9ab16ac3 100644 --- a/doc/monitoring/index.rst +++ b/doc/monitoring/index.rst @@ -64,6 +64,30 @@ in: It is good to outline that the main tool allowing users to observe and monitor a Ceph cluster is the **Ceph dashboard**. It provides graphics where the most important cluster and service metrics are represented. Most of the examples in this document are extracted from the dashboard graphics or extrapolated from the metrics exposed by the Ceph dashboard. +Ceph daemon health metrics +========================== + +The Ceph exporter provides a metric called ``ceph_daemon_socket_up`` that reports the liveness status of each Ceph daemon that exposes an admin socket. + +The ``ceph_daemon_socket_up`` metric indicates the health status of a Ceph daemon based on its ability to respond via the admin socket, where a value of ``1`` means healthy, and ``0`` means unhealthy. Although a Ceph daemon might still be "alive" when it reports ``ceph_daemon_socket_up=0``, this situation highlights a significant issue in its functionality. As such, this metric serves as an excellent tool for detecting problems in any of the main Ceph daemons. 
+ +Labels: +- **``ceph_daemon``**: Identifier of the Ceph daemon exposing an admin socket on the host. +- **``hostname``**: Name of the host where the Ceph daemon is running. + +Example: + +.. code-block:: bash + + ceph_daemon_socket_up{ceph_daemon="mds.a",hostname="testhost"} 1 + ceph_daemon_socket_up{ceph_daemon="osd.1",hostname="testhost"} 0 + +To identify any Ceph daemons that were not responsive at any point in the last 12 hours, you can use the following PromQL expression: + +.. code-block:: bash + + ceph_daemon_socket_up == 0 or min_over_time(ceph_daemon_socket_up[12h]) == 0 + Performance metrics =================== diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc index d4930ea35c0d2..4b8a8131bcfd3 100644 --- a/src/exporter/DaemonMetricCollector.cc +++ b/src/exporter/DaemonMetricCollector.cc @@ -168,10 +168,17 @@ void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter if (sockClientsPing) { bool ok; sock_client.ping(&ok); + std::string ceph_daemon_socket_up_desc( + "Reports the health status of a Ceph daemon, as determined by whether it is able to respond via its admin socket (1 = healthy, 0 = unhealthy)."); + labels_t ceph_daemon_socket_up_labels; + ceph_daemon_socket_up_labels["hostname"] = quote(ceph_get_hostname()); + ceph_daemon_socket_up_labels["ceph_daemon"] = quote(daemon_name); + add_metric(builder, static_cast(ok), "ceph_daemon_socket_up", ceph_daemon_socket_up_desc, + "gauge", ceph_daemon_socket_up_labels); if (!ok) { failures++; continue; - } + } } std::string counter_dump_response = dump_response.size() > 0 ? dump_response : asok_request(sock_client, "counter dump", daemon_name); diff --git a/src/exporter/DaemonMetricCollector.h b/src/exporter/DaemonMetricCollector.h index d2e929b4d670f..3302e95df916c 100644 --- a/src/exporter/DaemonMetricCollector.h +++ b/src/exporter/DaemonMetricCollector.h @@ -42,11 +42,11 @@ class DaemonMetricCollector { std::map clients; std::string metrics; std::pair add_fixed_name_metrics(std::string metric_name); + void update_sockets(); private: std::mutex metrics_mutex; std::unique_ptr builder; - void update_sockets(); void request_loop(boost::asio::steady_timer &timer); void dump_asok_metric(boost::json::object perf_info, diff --git a/src/test/exporter/test_exporter.cc b/src/test/exporter/test_exporter.cc index 907884fe35d60..e24773886bcb3 100644 --- a/src/test/exporter/test_exporter.cc +++ b/src/test/exporter/test_exporter.cc @@ -1,6 +1,8 @@ #include "common/ceph_argparse.h" #include "common/config.h" #include "common/config_proxy.h" +#include "common/admin_socket.h" +#include "common/admin_socket_client.h" #include #include "gtest/gtest.h" #include "common/ceph_context.h" @@ -8,6 +10,7 @@ #include "global/global_init.h" #include "exporter/util.h" #include "exporter/DaemonMetricCollector.h" +#include #include #include @@ -674,6 +677,27 @@ static std::vector> promethize_data = { {"rocksdb.submit_sync_latency_sum", "ceph_rocksdb_submit_sync_latency_sum"} }; + +class AdminSocketTest +{ +public: + explicit AdminSocketTest(AdminSocket *asokc) + : m_asokc(asokc) + { + } + bool init(const std::string &uri) { + return m_asokc->init(uri); + } + std::string bind_and_listen(const std::string &sock_path, int *fd) { + return m_asokc->bind_and_listen(sock_path, fd); + } + bool shutdown() { + m_asokc->shutdown(); + return true; + } + AdminSocket *m_asokc; +}; + int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); @@ -1289,8 +1313,11 @@ 
ceph_mon_session_rm{ceph_daemon="mon.a"} 577 # TYPE ceph_mon_session_trim counter ceph_mon_session_trim{ceph_daemon="mon.a"} 9 )"; - - ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos); + + std::string actualMetrics = collector.metrics; + std::cout << "Actual MON Metrics: " << actualMetrics << std::endl; + ASSERT_TRUE(actualMetrics.find(expectedMetrics) != std::string::npos); + //ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos); // Test for labeled metrics - RGW daemon = "ceph-client.rgw.foo.ceph-node-00.aayrrj.2.93993527376064"; @@ -1452,3 +1479,82 @@ TEST(Exporter, add_fixed_name_metrics) { EXPECT_EQ(new_metric.first, expected_labels); ASSERT_TRUE(new_metric.second == expected_metric_name); } + +TEST(Exporter, UpdateSockets) { + const std::string mock_dir = "/tmp/fake_sock_dir"; + + // Create the mock directory + std::filesystem::create_directories(mock_dir); + + // Create a mix of vstart and real cluster mock .asok files + std::ofstream(mock_dir + "/ceph-osd.0.asok").close(); + std::ofstream(mock_dir + "/ceph-mds.a.asok").close(); + std::ofstream(mock_dir + "/ceph-mgr.chatest-node-00.ijzynn.asok").close(); + std::ofstream(mock_dir + "/ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952.asok").close(); + std::ofstream(mock_dir + "/ceph-client.ceph-exporter.chatest-node-00.asok").close(); + std::ofstream(mock_dir + "/ceph-mon.chatest-node-00.asok").close(); + + g_conf().set_val("exporter_sock_dir", mock_dir); + + DaemonMetricCollector collector; + + // Run the function that interacts with the mock directory + collector.update_sockets(); + + // Verify the expected results + ASSERT_EQ(collector.clients.size(), 4); + ASSERT_TRUE(collector.clients.find("ceph-osd.0") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mds.a") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mon.chatest-node-00") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952") != collector.clients.end()); + + + // Remove the mock directory and files + std::filesystem::remove_all(mock_dir); +} + + +TEST(Exporter, HealthMetrics) { + std::map clients; + DaemonMetricCollector &collector = collector_instance(); + std::string daemon = "test_daemon"; + std::string expectedCounterDump = ""; + std::string expectedCounterSchema = ""; + std::string metricName = "ceph_daemon_socket_up"; + + // Fake admin socket + std::string asok_path = "/tmp/" + daemon + ".asok"; + std::unique_ptr asokc = std::make_unique(g_ceph_context); + AdminSocketClient client(asok_path); + + // Add the daemon clients to the collector + clients.insert({daemon, std::move(client)}); + collector.clients = clients; + + auto verifyMetricValue = [&](const std::string &metricValue, bool shouldInitializeSocket) { + collector.metrics = ""; + + if (shouldInitializeSocket) { + AdminSocketTest asoct(asokc.get()); + ASSERT_TRUE(asoct.init(asok_path)); + } + + collector.dump_asok_metrics(true, 5, true, expectedCounterDump, expectedCounterSchema, false); + + if (shouldInitializeSocket) { + AdminSocketTest asoct(asokc.get()); + ASSERT_TRUE(asoct.shutdown()); + } + + std::string retrievedMetrics = collector.metrics; + std::string pattern = metricName + R"(\{[^}]*ceph_daemon=\")" + daemon + R"(\"[^}]*\}\s+)" + metricValue + R"(\b)"; + std::regex regexPattern(pattern); + ASSERT_TRUE(std::regex_search(retrievedMetrics, regexPattern)); + }; + + // Test an admin socket not answering: metric value should 
be "0" + verifyMetricValue("0", false); + + // Test an admin socket answering: metric value should be "1" + verifyMetricValue("1", true); +} From 82b6a1c5786958bb443d92ee798dd3741f07fdf9 Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Wed, 16 Oct 2024 13:29:34 +0530 Subject: [PATCH 145/148] mgr/smb: rm all `smb dump` commands Fixes: https://tracker.ceph.com/issues/68545 Signed-off-by: Avan Thakkar --- src/pybind/mgr/smb/module.py | 41 +------------ src/pybind/mgr/smb/tests/test_smb.py | 89 ---------------------------- 2 files changed, 1 insertion(+), 129 deletions(-) diff --git a/src/pybind/mgr/smb/module.py b/src/pybind/mgr/smb/module.py index 77a08229cf017..4512ad6add336 100644 --- a/src/pybind/mgr/smb/module.py +++ b/src/pybind/mgr/smb/module.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from typing import TYPE_CHECKING, Any, List, Optional, cast import logging @@ -350,45 +350,6 @@ def show(self, resource_names: Optional[List[str]] = None) -> Simplified: return resources[0].to_simplified() return {'resources': [r.to_simplified() for r in resources]} - @cli.SMBCommand('dump cluster-config', perm='r') - def dump_config(self, cluster_id: str) -> Dict[str, Any]: - """DEBUG: Generate an example configuration""" - # TODO: Remove this command prior to release - return self._handler.generate_config(cluster_id) - - @cli.SMBCommand('dump service-spec', perm='r') - def dump_service_spec(self, cluster_id: str) -> Dict[str, Any]: - """DEBUG: Generate an example smb service spec""" - # TODO: Remove this command prior to release - return dict( - self._handler.generate_smb_service_spec(cluster_id).to_json() - ) - - @cli.SMBCommand('dump everything', perm='r') - def dump_everything(self) -> Dict[str, Any]: - """DEBUG: Show me everything""" - # TODO: Remove this command prior to release - everything: Dict[str, Any] = {} - everything['PUBLIC'] = {} - log.warning('dumping PUBLIC') - for key in self._public_store: - e = self._public_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['PUBLIC'][e.uri] = e.get() - log.warning('dumping PRIV') - everything['PRIV'] = {} - for key in self._priv_store: - e = self._priv_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['PRIV'][e.uri] = e.get() - log.warning('dumping INTERNAL') - everything['INTERNAL'] = {} - for key in self._internal_store: - e = self._internal_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['INTERNAL'][e.uri] = e.get() - return everything - def submit_smb_spec(self, spec: SMBSpec) -> None: """Submit a new or updated smb spec object to ceph orchestration.""" completion = self.apply_smb(spec) diff --git a/src/pybind/mgr/smb/tests/test_smb.py b/src/pybind/mgr/smb/tests/test_smb.py index c9fd02968b904..0d3610326c225 100644 --- a/src/pybind/mgr/smb/tests/test_smb.py +++ b/src/pybind/mgr/smb/tests/test_smb.py @@ -410,72 +410,6 @@ def test_cmd_apply_share(tmodule): assert bdata["results"][0]["state"] == "created" -def test_share_dump_config(tmodule): - _example_cfg_1(tmodule) - - cfg = tmodule.dump_config('foo') - assert cfg == { - 'samba-container-config': "v0", - 'configs': { - 'foo': { - 'instance_name': 'foo', - 'instance_features': [], - 'shares': ['Ess One', 'Ess Two'], - 'globals': ['default', 'foo'], - }, - }, - 'shares': { - 'Ess One': { - 'options': { - 'path': '/', - 'read only': 'No', - 'browseable': 'Yes', - 'kernel share modes': 'no', - 'x:ceph:id': 'foo.s1', - 'vfs objects': 'acl_xattr ceph_new', - 
'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph_new:config_file': '/etc/ceph/ceph.conf', - 'ceph_new:filesystem': 'cephfs', - 'ceph_new:user_id': 'smb.fs.cluster.foo', - }, - }, - 'Ess Two': { - 'options': { - 'path': '/two', - 'read only': 'No', - 'browseable': 'Yes', - 'kernel share modes': 'no', - 'x:ceph:id': 'foo.stwo', - 'vfs objects': 'acl_xattr ceph_new', - 'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph_new:config_file': '/etc/ceph/ceph.conf', - 'ceph_new:filesystem': 'cephfs', - 'ceph_new:user_id': 'smb.fs.cluster.foo', - }, - }, - }, - 'globals': { - 'default': { - 'options': { - 'load printers': 'No', - 'printing': 'bsd', - 'printcap name': '/dev/null', - 'disable spoolss': 'Yes', - }, - }, - 'foo': { - 'options': { - 'idmap config * : backend': 'autorid', - 'idmap config * : range': '2000-9999999', - 'realm': 'dom1.example.com', - 'security': 'ads', - 'workgroup': 'DOM1', - }, - }, - }, - } - - def test_cluster_create_ad1(tmodule): _example_cfg_1(tmodule) @@ -613,29 +547,6 @@ def test_cluster_rm(tmodule): assert result.success -def test_dump_service_spec(tmodule): - _example_cfg_1(tmodule) - tmodule._public_store.overwrite( - { - 'foo.config.smb': '', - } - ) - tmodule._priv_store.overwrite( - { - 'foo.join.2b9902c05d08bcba.json': '', - 'foo.join.08129d4d3b8c37c7.json': '', - } - ) - - cfg = tmodule.dump_service_spec('foo') - assert cfg - assert cfg['service_id'] == 'foo' - assert cfg['spec']['cluster_id'] == 'foo' - assert cfg['spec']['features'] == ['domain'] - assert cfg['spec']['config_uri'] == 'mem:foo/config.smb' - assert len(cfg['spec']['join_sources']) == 2 - - def test_cmd_show_resource_json(tmodule): _example_cfg_1(tmodule) From f7a379fe9bc4d57f23c1f5c00807bf3dfa2851d3 Mon Sep 17 00:00:00 2001 From: Shweta Bhosale Date: Wed, 9 Oct 2024 14:53:30 +0530 Subject: [PATCH 146/148] cephadm: Added new cephadm command to list all the default images Fixes: https://tracker.ceph.com/issues/68438 Signed-off-by: Shweta Bhosale --- src/cephadm/cephadm.py | 12 ++++++ src/cephadm/cephadmlib/container_types.py | 50 ++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 1ab98a0ac4f1e..5520ff52bd5a4 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -29,6 +29,7 @@ from io import StringIO from threading import Thread, Event from pathlib import Path +from configparser import ConfigParser from cephadmlib.constants import ( # default images @@ -142,6 +143,7 @@ SidecarContainer, extract_uid_gid, is_container_running, + get_mgr_images, ) from cephadmlib.decorators import ( deprecated_command, @@ -4679,6 +4681,13 @@ def probe_hba(scan_path: str) -> None: return f'Ok. 
{len(all_scan_files)} adapters detected: {len(scan_files)} rescanned, {len(skipped)} skipped, {len(failures)} failed ({elapsed:.2f}s)'
 
 
+def command_list_images(ctx: CephadmContext) -> None:
+    """List the default container images used by the various services"""
+    cp_obj = ConfigParser()
+    cp_obj['mgr'] = get_mgr_images()
+    # print default images
+    cp_obj.write(sys.stdout)
+
 ##################################
 
@@ -5542,6 +5551,9 @@ def _get_parser():
         'disk-rescan', help='rescan all HBAs to detect new/removed devices')
     parser_disk_rescan.set_defaults(func=command_rescan_disks)
 
+    parser_list_images = subparsers.add_parser(
+        'list-images', help='list all the default images')
+    parser_list_images.set_defaults(func=command_list_images)
     return parser
 
diff --git a/src/cephadm/cephadmlib/container_types.py b/src/cephadm/cephadmlib/container_types.py
index 665c4d89652a6..791a545538a3c 100644
--- a/src/cephadm/cephadmlib/container_types.py
+++ b/src/cephadm/cephadmlib/container_types.py
@@ -8,7 +8,28 @@
 from typing import Dict, List, Optional, Any, Union, Tuple, Iterable, cast
 
 from .call_wrappers import call, call_throws, CallVerbosity
-from .constants import DEFAULT_TIMEOUT
+from .constants import (
+    DEFAULT_TIMEOUT,
+    # default container images
+    DEFAULT_ALERT_MANAGER_IMAGE,
+    DEFAULT_GRAFANA_IMAGE,
+    DEFAULT_LOKI_IMAGE,
+    DEFAULT_NODE_EXPORTER_IMAGE,
+    DEFAULT_PROMETHEUS_IMAGE,
+    DEFAULT_PROMTAIL_IMAGE,
+    DEFAULT_HAPROXY_IMAGE,
+    DEFAULT_KEEPALIVED_IMAGE,
+    DEFAULT_NVMEOF_IMAGE,
+    DEFAULT_SNMP_GATEWAY_IMAGE,
+    DEFAULT_ELASTICSEARCH_IMAGE,
+    DEFAULT_JAEGER_COLLECTOR_IMAGE,
+    DEFAULT_JAEGER_AGENT_IMAGE,
+    DEFAULT_JAEGER_QUERY_IMAGE,
+    DEFAULT_SMB_IMAGE,
+    DEFAULT_SMBMETRICS_IMAGE,
+    DEFAULT_NGINX_IMAGE,
+    DEFAULT_OAUTH2_PROXY_IMAGE,
+)
 from .container_engines import Docker, Podman
 from .context import CephadmContext
 from .daemon_identity import DaemonIdentity, DaemonSubIdentity
@@ -660,3 +681,30 @@ def enable_shared_namespaces(
         cc = f'container:{name}'
         for n in ns:
             _replace_container_arg(args, n.to_option(cc))
+
+
+def get_mgr_images() -> dict:
+    """Return dict of default mgr images"""
+    mgr_prefix = 'mgr/cephadm/container_image_'
+    mgr_images = {}
+    mgr_images[mgr_prefix + 'prometheus'] = DEFAULT_PROMETHEUS_IMAGE
+    mgr_images[mgr_prefix + 'alertmanager'] = DEFAULT_ALERT_MANAGER_IMAGE
+    mgr_images[mgr_prefix + 'grafana'] = DEFAULT_GRAFANA_IMAGE
+    mgr_images[mgr_prefix + 'loki'] = DEFAULT_LOKI_IMAGE
+    mgr_images[mgr_prefix + 'promtail'] = DEFAULT_PROMTAIL_IMAGE
+    mgr_images[mgr_prefix + 'node_exporter'] = DEFAULT_NODE_EXPORTER_IMAGE
+    mgr_images[mgr_prefix + 'haproxy'] = DEFAULT_HAPROXY_IMAGE
+    mgr_images[mgr_prefix + 'keepalived'] = DEFAULT_KEEPALIVED_IMAGE
+    mgr_images[mgr_prefix + 'nvmeof'] = DEFAULT_NVMEOF_IMAGE
+    mgr_images[mgr_prefix + 'snmp_gateway'] = DEFAULT_SNMP_GATEWAY_IMAGE
+    mgr_images[mgr_prefix + 'elasticsearch'] = DEFAULT_ELASTICSEARCH_IMAGE
+    mgr_images[
+        mgr_prefix + 'jaeger_collector'
+    ] = DEFAULT_JAEGER_COLLECTOR_IMAGE
+    mgr_images[mgr_prefix + 'jaeger_agent'] = DEFAULT_JAEGER_AGENT_IMAGE
+    mgr_images[mgr_prefix + 'jaeger_query'] = DEFAULT_JAEGER_QUERY_IMAGE
+    mgr_images[mgr_prefix + 'smb'] = DEFAULT_SMB_IMAGE
+    mgr_images[mgr_prefix + 'smbmetrics'] = DEFAULT_SMBMETRICS_IMAGE
+    mgr_images[mgr_prefix + 'nginx'] = DEFAULT_NGINX_IMAGE
+    mgr_images[mgr_prefix + 'oauth2_proxy'] = DEFAULT_OAUTH2_PROXY_IMAGE
+    return mgr_images

From 87612f499f86c9864c3bf6371cdd46954176e5ab Mon Sep 17 00:00:00 2001
From: Pedro Gonzalez Gomez
Date: Mon, 7 Oct 2024 21:22:20 +0200
Subject: [PATCH
147/148] mgr/dashboard: fix lifecycle issues Fixes: https://tracker.ceph.com/issues/68434 Signed-off-by: Pedro Gonzalez Gomez --- .../rgw-bucket-details.component.html | 10 +++++-- .../frontend/src/app/ceph/rgw/rgw.module.ts | 4 ++- .../frontend/src/app/shared/pipes/xml.pipe.ts | 8 ++++-- .../shared/services/json-to-xml.service.ts | 20 ++++++++++---- .../frontend/src/styles/_carbon-defaults.scss | 7 +++++ .../mgr/dashboard/services/rgw_client.py | 27 +++++++++++++++---- 6 files changed, 61 insertions(+), 15 deletions(-) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html index ddc202152b9f4..463eac88b1e99 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html @@ -158,8 +158,14 @@
-
{{selection.lifecycle | json}}
-
{{ (selection.lifecycle | xml) || '-'}}
+ + {{selection.lifecycle | json}} + + + {{ (selection.lifecycle | xml:{'Rules':'Rule'}) || '-'}} + diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts index 3439562c8e223..5f8c6f50135c2 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts @@ -70,7 +70,8 @@ import { IconModule, LoadingModule, ModalModule, - ProgressIndicatorModule + ProgressIndicatorModule, + CodeSnippetModule } from 'carbon-components-angular'; import { CephSharedModule } from '../shared/ceph-shared.module'; @@ -94,6 +95,7 @@ import { CephSharedModule } from '../shared/ceph-shared.module'; ModalModule, GridModule, ProgressIndicatorModule, + CodeSnippetModule, ButtonModule, LoadingModule, IconModule, diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts index 59d7572e9f004..45cca684dab01 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts @@ -7,9 +7,13 @@ import { JsonToXmlService } from '../services/json-to-xml.service'; export class XmlPipe implements PipeTransform { constructor(private jsonToXmlService: JsonToXmlService) {} - transform(value: string, valueFormat: string = 'json'): string { + transform( + value: string, + replaceKey: Record = {}, + valueFormat: string = 'json' + ): string { if (valueFormat === 'json') { - value = this.jsonToXmlService.format(value); + value = this.jsonToXmlService.format(value, replaceKey); } return value; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts index 8f1d128c0c59c..e9d30f9b7f2f4 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts @@ -6,29 +6,39 @@ import { Injectable } from '@angular/core'; export class JsonToXmlService { constructor() {} - format(json: any, indentSize: number = 2, currentIndent: number = 0): string { + format( + json: any, + replaceKey: Record = null, + indentSize: number = 2, + currentIndent: number = 0 + ): string { if (!json) return null; let xml = ''; if (typeof json === 'string') { json = JSON.parse(json); } - for (const key in json) { + for (let key in json) { if (json.hasOwnProperty(key)) { const value = json[key]; const indentation = ' '.repeat(currentIndent); - + if (replaceKey) { + const [oldKey, newKey] = Object.entries(replaceKey)[0]; + if (key === oldKey) { + key = newKey; + } + } if (Array.isArray(value)) { value.forEach((item) => { xml += `${indentation}<${key}>\n` + - this.format(item, indentSize, currentIndent + indentSize) + + this.format(item, replaceKey, indentSize, currentIndent + indentSize) + `${indentation}\n`; }); } else if (typeof value === 'object') { xml += `${indentation}<${key}>\n` + - this.format(value, indentSize, currentIndent + indentSize) + + this.format(value, replaceKey, indentSize, currentIndent + indentSize) + `${indentation}\n`; } else { xml += `${indentation}<${key}>${value}\n`; diff --git a/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss b/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss index 1d12facaf6a2f..61ca421101e6d 100644 --- 
a/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss +++ b/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss @@ -142,3 +142,10 @@ Dashboard page cd-dashboard { font-size: 12px; } + +/****************************************** +Code snippet +******************************************/ +.cds--snippet { + width: fit-content; +} diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py index 2441b73b361be..340e894d23ae1 100755 --- a/src/pybind/mgr/dashboard/services/rgw_client.py +++ b/src/pybind/mgr/dashboard/services/rgw_client.py @@ -10,6 +10,7 @@ import time import uuid import xml.etree.ElementTree as ET # noqa: N814 +from collections import defaultdict from enum import Enum from subprocess import SubprocessError from urllib.parse import urlparse @@ -700,12 +701,28 @@ def set_tags(self, bucket_name, tags, request=None): raise DashboardException(msg=str(e), component='rgw') return result + @staticmethod + def _handle_rules(pairs): + result = defaultdict(list) + for key, value in pairs: + if key == 'Rule': + result['Rules'].append(value) + else: + result[key] = value + return result + @RestClient.api_get('/{bucket_name}?lifecycle') def get_lifecycle(self, bucket_name, request=None): # pylint: disable=unused-argument try: - result = request() # type: ignore - result = {'LifecycleConfiguration': result} + decoded_request = request(raw_content=True).decode("utf-8") # type: ignore + result = { + 'LifecycleConfiguration': + json.loads( + decoded_request, + object_pairs_hook=RgwClient._handle_rules + ) + } except RequestException as e: if e.content: content = json_str_to_object(e.content) @@ -757,15 +774,15 @@ def set_lifecycle(self, bucket_name, lifecycle, request=None): lifecycle = RgwClient.dict_to_xml(lifecycle) try: if lifecycle and '' not in str(lifecycle): - lifecycle = f'{lifecycle}' + lifecycle = f'\n{lifecycle}\n' result = request(data=lifecycle) # type: ignore except RequestException as e: + msg = '' if e.content: content = json_str_to_object(e.content) if content.get("Code") == "MalformedXML": msg = "Invalid Lifecycle document" - raise DashboardException(msg=msg, component='rgw') - raise DashboardException(msg=str(e), component='rgw') + raise DashboardException(msg=msg or str(e), component='rgw') return result @RestClient.api_delete('/{bucket_name}?lifecycle') From 4b2ba587b7d8090523fc8eddb31893c4ee9c87af Mon Sep 17 00:00:00 2001 From: Teoman ONAY Date: Mon, 17 Jun 2024 13:16:48 +0200 Subject: [PATCH 148/148] mgmt-gateway: add e2e testing Add mgmt-gateway teuthology test scenarios Signed-off-by: Teoman ONAY --- .../workunits/task/test_mgmt_gateway.yaml | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml diff --git a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml new file mode 100644 index 0000000000000..5207fd415b7e6 --- /dev/null +++ b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml @@ -0,0 +1,77 @@ +overrides: + ceph: + log-ignorelist: + - CEPHADM_FAILED_DAEMON + log-only-match: + - CEPHADM_ +roles: +- - host.a + - mon.a + - mgr.a + - osd.0 +- - host.b + - mon.b + - mgr.b + - osd.1 +- - host.c + - mon.c + - osd.2 +tasks: +- install: +- cephadm: +- cephadm.shell: + host.c: + - | + set -ex + # Deploy monitoring stack + ceph orch apply node-exporter + ceph orch apply grafana + ceph orch apply alertmanager + ceph orch apply 
prometheus + sleep 240 + # generate SSL certificate + openssl req -x509 -newkey rsa:4096 -keyout /tmp/key.pem -out /tmp/cert.pem -sha256 -days 30 -nodes -subj "/CN=*" + # Generate a mgmt.spec template + cat << EOT > /tmp/mgmt.spec + service_type: mgmt-gateway + service_id: foo + placement: + hosts: + - ${HOSTNAME} + spec: + ssl_protocols: + - TLSv1.2 + - TLSv1.3 + ssl_ciphers: + - AES128-SHA + - AES256-SHA + enable_health_check_endpoint: True + EOT + # Add generated certificates to spec file + echo " ssl_certificate: |" >> /tmp/mgmt.spec + while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/cert.pem >> /tmp/mgmt.spec + echo " ssl_certificate_key: |" >> /tmp/mgmt.spec + while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/key.pem >> /tmp/mgmt.spec + # Apply spec + ceph orch apply -i /tmp/mgmt.spec +- cephadm.wait_for_service: + service: mgmt-gateway +- cephadm.shell: + host.a: + - | + set -ex + # retrieve mgmt hostname and ip + MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname') + MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr') + # check mgmt-gateway health + curl -k -s https://${MGMT_GTW_IP}/health + curl -k -s https://${MGMT_GTW_IP}:29443/health + # wait for background services to be reconfigured following mgmt-gateway installation + sleep 180 + # check grafana endpoints are responsive and database health is okay + curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"' + # check prometheus endpoints are responsive + curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"' + # check alertmanager endpoints are responsive + curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status +
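For ad-hoc verification outside teuthology, the curl-based probes in the task above can be approximated with a short Python sketch. This is only a rough equivalent under stated assumptions: the gateway address is a placeholder (the task resolves it from `ceph orch host ls`), the admin:admin credentials are copied from the scenario, and the `requests` library is assumed to be available; none of this is part of the patch itself.

    # Minimal sketch: mirror the mgmt-gateway health probes from the task above.
    import requests
    import urllib3

    urllib3.disable_warnings()  # the scenario uses a self-signed certificate (curl -k)

    MGMT_GTW_IP = "10.0.0.1"  # placeholder; use the address of the mgmt-gateway host

    def probe(path: str, auth=None) -> int:
        # verify=False mirrors `curl -k`; any 2xx answer means the endpoint responded
        resp = requests.get(f"https://{MGMT_GTW_IP}{path}", auth=auth, verify=False, timeout=10)
        resp.raise_for_status()
        return resp.status_code

    probe("/health")
    probe("/grafana/api/health")
    probe("/prometheus/api/v1/status/config", auth=("admin", "admin"))
    probe("/alertmanager/api/v2/status", auth=("admin", "admin"))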