From 86acdbd0500469460f608fbd6c96b75a8ba84752 Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Mon, 5 Feb 2024 22:14:28 -0800
Subject: [PATCH] Suppress infinite-recursion warning (#2307)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2307

- Suppress the infinite-recursion warning in a location that is known not
  to cause infinite recursion, as it is an implementation of a pure
  virtual method using CRTP

Reviewed By: r-barnes

Differential Revision: D53359145

fbshipit-source-id: 4a06134efecbda49d353a17fe60b9a6496ee1b32
---
 bench/EmbeddingSpMDMNBit2Benchmark.cc         |  2 +-
 .../split_embedding_inference_converter.py    | 30 ++++++++++-------
 ...lit_table_batched_embeddings_ops_common.py |  8 +++--
 ..._table_batched_embeddings_ops_inference.py | 32 ++++++++++++-------
 ...t_table_batched_embeddings_ops_training.py | 32 ++++++++++++-------
 .../ssd_split_table_batched_embeddings_ops.py |  4 +++
 include/fbgemm/Fbgemm.h                       |  9 ++++++
 7 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/bench/EmbeddingSpMDMNBit2Benchmark.cc b/bench/EmbeddingSpMDMNBit2Benchmark.cc
index 98a4e79a0d..8400112bc4 100644
--- a/bench/EmbeddingSpMDMNBit2Benchmark.cc
+++ b/bench/EmbeddingSpMDMNBit2Benchmark.cc
@@ -153,7 +153,7 @@ static void print_benchmark_results() {
        << "autovec b/w (GB/s), autovec effective b/w (GB/s), autovec time, "
        << "ref b/w (GB/s), ref effective b/w (GB/s), ref time, "
        << "asmjit speedup ratio, autovec speedup ratio" << std::endl;
-  for (int i = 0; i < benchmarks.size(); ++i) {
+  for (size_t i = 0; i < benchmarks.size(); ++i) {
     BenchmarkSpec& spec = benchmarks[i].first;
     BenchmarkResult& res = benchmarks[i].second;
     float asmjit_speedup = res.ref_bw > 0.0 ? res.asmjit_bw / res.ref_bw : 0;
diff --git a/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py b/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py
index a2dd34ade5..a67fa1afc0 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py
@@ -130,9 +130,11 @@ def _process_split_embs(self, model: torch.nn.Module) -> None:
                         pruned_weight.size()[0],
                         D,
                         weight_ty,
-                        EmbeddingLocation.HOST
-                        if use_cpu
-                        else EmbeddingLocation.DEVICE,
+                        (
+                            EmbeddingLocation.HOST
+                            if use_cpu
+                            else EmbeddingLocation.DEVICE
+                        ),
                     )
                 )
                 index_remapping_list.append(index_remapping)
@@ -144,19 +146,23 @@ def _process_split_embs(self, model: torch.nn.Module) -> None:

                 q_child = IntNBitTableBatchedEmbeddingBagsCodegen(
                     embedding_specs=new_embedding_specs,
-                    index_remapping=index_remapping_list
-                    if self.pruning_ratio is not None
-                    else None,
+                    index_remapping=(
+                        index_remapping_list if self.pruning_ratio is not None else None
+                    ),
                     pooling_mode=child.pooling_mode,
                     device="cpu" if use_cpu else torch.cuda.current_device(),
                     weight_lists=weight_lists,
                     use_array_for_index_remapping=self.use_array_for_index_remapping,
-                    fp8_exponent_bits=self._get_quantization_config("exponent_bits")
-                    if is_fp8_weight
-                    else None,
-                    fp8_exponent_bias=self._get_quantization_config("exponent_bias")
-                    if is_fp8_weight
-                    else None,
+                    fp8_exponent_bits=(
+                        self._get_quantization_config("exponent_bits")
+                        if is_fp8_weight
+                        else None
+                    ),
+                    fp8_exponent_bias=(
+                        self._get_quantization_config("exponent_bias")
+                        if is_fp8_weight
+                        else None
+                    ),
                 )
                 setattr(model, name, q_child)
             else:
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py
index 506a689845..341b9e3707 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py
@@ -98,9 +98,11 @@ def construct_cache_state(
         start, end = _cache_hash_size_cumsum[t_], _cache_hash_size_cumsum[t_ + 1]
         cache_index_table_map[start:end] = [t] * (end - start)
     cache_hash_size_cumsum = [
-        _cache_hash_size_cumsum[t_]
-        if location_list[t_] == EmbeddingLocation.MANAGED_CACHING
-        else -1
+        (
+            _cache_hash_size_cumsum[t_]
+            if location_list[t_] == EmbeddingLocation.MANAGED_CACHING
+            else -1
+        )
         for t_ in feature_table_map
     ]
     cache_hash_size_cumsum.append(total_cache_hash_size)
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py
index e91e4e36e8..a60728da4d 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_inference.py
@@ -947,7 +947,9 @@ def _apply_cache_state(
         ], "Only 1-way or 32-way(64-way for AMD) implmeneted for now"

         self.cache_algorithm = cache_algorithm
+        # pyre-ignore[16]
         self.timestep_counter = torch.classes.fbgemm.AtomicCounter()
+        # pyre-ignore[16]
         self.timestep_prefetch_size = torch.classes.fbgemm.AtomicCounter()
         self.max_prefetch_depth = MAX_PREFETCH_DEPTH
@@ -959,6 +961,7 @@ def _apply_cache_state(
         lxu_cache_locations_empty = torch.empty(
             0, device=self.current_device, dtype=torch.int32
         ).fill_(-1)
+        # pyre-ignore[16]
         self.lxu_cache_locations_list = torch.classes.fbgemm.TensorQueue(
             lxu_cache_locations_empty
         )
@@ -1100,9 +1103,11 @@ def _apply_cache_state(
         self.register_buffer(
             "lxu_state",
             torch.zeros(
-                size=(self.total_cache_hash_size + 1,)
-                if cache_algorithm == CacheAlgorithm.LFU
-                else (cache_sets, self.cache_assoc),
+                size=(
+                    (self.total_cache_hash_size + 1,)
+                    if cache_algorithm == CacheAlgorithm.LFU
+                    else (cache_sets, self.cache_assoc)
+                ),
                 device=self.current_device,
                 dtype=torch.int64,
             ),
@@ -1294,7 +1299,7 @@ def split_embedding_weights_with_scale_bias(
     @torch.jit.export
     def split_embedding_weights(
         self,
-        split_scale_shifts: bool = True
+        split_scale_shifts: bool = True,
         # When true, return list of two tensors, the first with weights and
         # the second with scale_bias.
         # This should've been named as split_scale_bias.
@@ -1303,11 +1308,13 @@ def split_embedding_weights(
         """
         Returns a list of weights, split by table
         """
-        splits: List[
-            Tuple[Tensor, Optional[Tensor], Optional[Tensor]]
-        ] = self.split_embedding_weights_with_scale_bias(
-            split_scale_bias_mode=(1 if split_scale_shifts else 0)
+        # fmt: off
+        splits: List[Tuple[Tensor, Optional[Tensor], Optional[Tensor]]] = (
+            self.split_embedding_weights_with_scale_bias(
+                split_scale_bias_mode=(1 if split_scale_shifts else 0)
+            )
         )
+        # fmt: on
         return [
             (split_weight_scale_bias[0], split_weight_scale_bias[1])
             for split_weight_scale_bias in splits
@@ -1411,9 +1418,11 @@ def set_index_remappings(
         # Hash mapping pruning
         if not use_array_for_index_remapping:
             capacities = [
-                round_up(int(row * 1.0 / pruning_hash_load_factor), 32)
-                if index_remap is not None
-                else 0
+                (
+                    round_up(int(row * 1.0 / pruning_hash_load_factor), 32)
+                    if index_remap is not None
+                    else 0
+                )
                 for (index_remap, row) in zip(index_remapping, rows)
             ]
             hash_table = torch.empty(
@@ -1445,6 +1454,7 @@ def set_index_remappings(

             if self.use_cpu:
                 self.index_remapping_hash_table_cpu = (
+                    # pyre-ignore[16]
                     torch.classes.fbgemm.PrunedMapCPU()
                 )
                 self.index_remapping_hash_table_cpu.insert(
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
index fc246084bd..59046cb394 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
@@ -646,9 +646,11 @@ def __init__(  # noqa C901
                     embedding_specs,
                     rowwise=rowwise,
                     cacheable=False,
-                    placement=EmbeddingLocation.MANAGED
-                    if ((not rowwise) and uvm_non_rowwise_momentum)
-                    else None,
+                    placement=(
+                        EmbeddingLocation.MANAGED
+                        if ((not rowwise) and uvm_non_rowwise_momentum)
+                        else None
+                    ),
                 ),
                 prefix="momentum1",
                 # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param
@@ -671,9 +673,11 @@ def __init__(  # noqa C901
                     embedding_specs,
                     rowwise=rowwise,
                     cacheable=False,
-                    placement=EmbeddingLocation.MANAGED
-                    if ((not rowwise) and uvm_non_rowwise_momentum)
-                    else None,
+                    placement=(
+                        EmbeddingLocation.MANAGED
+                        if ((not rowwise) and uvm_non_rowwise_momentum)
+                        else None
+                    ),
                 ),
                 prefix="momentum2",
                 # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param
@@ -1411,9 +1415,11 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]:
             or self.optimizer == OptimType.EXACT_ADAGRAD
         ):
             list_of_state_dict = [
-                {"sum": states[0], "prev_iter": states[1], "row_counter": states[2]}
-                if self._used_rowwise_adagrad_with_counter
-                else {"sum": states[0]}
+                (
+                    {"sum": states[0], "prev_iter": states[1], "row_counter": states[2]}
+                    if self._used_rowwise_adagrad_with_counter
+                    else {"sum": states[0]}
+                )
                 for states in split_optimizer_states
             ]
         elif self.optimizer == OptimType.SGD or self.optimizer == OptimType.EXACT_SGD:
@@ -1741,9 +1747,11 @@ def _apply_cache_state(
         self.register_buffer(
             "lxu_state",
             torch.zeros(
-                size=(self.total_cache_hash_size + 1,)
-                if cache_algorithm == CacheAlgorithm.LFU
-                else (cache_sets, DEFAULT_ASSOC),
+                size=(
+                    (self.total_cache_hash_size + 1,)
+                    if cache_algorithm == CacheAlgorithm.LFU
+                    else (cache_sets, DEFAULT_ASSOC)
+                ),
                 device=self.current_device,
                 dtype=torch.int64,
             ),
diff --git a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py
index 832f306b98..45b568d739 100644
--- a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py
+++ b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py
@@ -213,6 +213,7 @@ def __init__(
             prefix="ssd_table_batched_embeddings", dir=ssd_storage_directory
         )
         # pyre-fixme[4]: Attribute must be annotated.
+        # pyre-ignore[16]
         self.ssd_db = torch.classes.fbgemm.EmbeddingRocksDBWrapper(
             ssd_directory,
             ssd_shards,
@@ -770,6 +771,7 @@ def max_ty_D(ty: SparseType) -> int:
             prefix="ssd_table_batched_embeddings", dir=ssd_storage_directory
         )
         # pyre-fixme[4]: Attribute must be annotated.
+        # pyre-ignore[16]
         self.ssd_db = torch.classes.fbgemm.EmbeddingRocksDBWrapper(
             ssd_directory,
             ssd_shards,
@@ -794,8 +796,10 @@ def max_ty_D(ty: SparseType) -> int:
         self.ssd_set_end = torch.cuda.Event()

         # pyre-fixme[4]: Attribute must be annotated.
+        # pyre-ignore[16]
         self.timestep_counter = torch.classes.fbgemm.AtomicCounter()
         # pyre-fixme[4]: Attribute must be annotated.
+        # pyre-ignore[16]
         self.timestep_prefetch_size = torch.classes.fbgemm.AtomicCounter()

         self.weights_dev: torch.Tensor = torch.empty(
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index eb1f3a01b7..d694903c6a 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -139,6 +139,11 @@ class PackMatrix {
       int cols = 0,
       const BlockingFactors* params = nullptr);

+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winfinite-recursion"
+#endif
+
   /**
    * @return Pointer to a buffer containing row offset results. Some packing
    * objects fuse row offset computation for later requantization step.
@@ -147,6 +152,10 @@ class PackMatrix {
     return static_cast<const PT*>(this)->getRowOffsetBuffer();
   }

+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
   /**
    * @brief When k loop is also tiled/blocked, this function is used to check if
    * have executed computations for the last k block so that we can perform
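
Note on the headline change in include/fbgemm/Fbgemm.h: PackMatrix is a CRTP base, and its getRowOffsetBuffer() is a "pure virtual implemented via CRTP", meaning the base forwards the call to the derived packing class through a static_cast. When that forwarder is instantiated for a derived class that does not shadow the method, the forwarded call resolves back to the forwarder itself, which is exactly what clang's -Winfinite-recursion flags; the commit suppresses the warning because the affected path is known not to be reached. Below is a minimal, self-contained sketch of the pattern. The names PackBase, PackWithRowOffsets, and PackWithoutRowOffsets are hypothetical, for illustration only, and are not the actual FBGEMM class hierarchy.

// Minimal sketch of the CRTP forwarding pattern behind the warning.
// Hypothetical names; not the FBGEMM classes.
#include <cstdint>

template <typename Derived>
class PackBase {
 public:
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winfinite-recursion"
#endif
  // "Pure virtual via CRTP": the base expects Derived to shadow this method.
  // If it is instantiated for a Derived that does not, the call below
  // resolves back to this same function, which clang flags as infinite
  // recursion; the pragma scopes the suppression to just this forwarder.
  const std::int32_t* rowOffsetBuffer() const {
    return static_cast<const Derived*>(this)->rowOffsetBuffer();
  }
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
};

// Derived class that provides the method: the forwarder dispatches here,
// so there is no recursion at runtime.
class PackWithRowOffsets : public PackBase<PackWithRowOffsets> {
 public:
  const std::int32_t* rowOffsetBuffer() const {
    return buffer_;
  }

 private:
  const std::int32_t* buffer_ = nullptr;
};

// Derived class that never uses row offsets: the inherited forwarder would
// recurse if it were ever called, which is what the warning is about, but
// callers are known never to ask this type for a row offset buffer.
class PackWithoutRowOffsets : public PackBase<PackWithoutRowOffsets> {};

The push/pop pair keeps the suppression scoped to the one forwarder rather than the whole header, and the defined(__clang__) guard means compilers that do not understand clang pragmas never see the directive.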