Add the query planner options to the query plan cache key #5100

Merged · 23 commits · May 14, 2024
23 changes: 23 additions & 0 deletions .changesets/fix_geal_cache_key_qp_options.md
@@ -0,0 +1,23 @@
### Prevent query plan cache collision when planning options change ([Issue #5093](https://github.com/apollographql/router/issues/5093))

> [!IMPORTANT]
> If you have enabled [Distributed query plan caching](https://www.apollographql.com/docs/router/configuration/distributed-caching/#distributed-query-plan-caching), this release changes the hashing algorithm used for the cache keys. As a result, expect additional cache regeneration cost when updating between these versions while the new hashing algorithm takes effect.

When query planning takes place, a number of options affect the generated query plans, including:
* `defer_support`
* `generate_query_fragments`
* `experimental_reuse_query_fragments`
* `experimental_type_conditioned_fetching`
* `experimental_query_planner_mode`

If distributed query plan caching is also enabled, then changing any of these will result in different query plans being generated and entering the cache.

This could cause issues in the following scenarios:
1. The Router configuration changes and a query plan that is incompatible with the new configuration is loaded from the cache.
2. Routers with differing configurations share the same cache, causing them to store and load incompatible query plans.

Now a hash of the entire query planner configuration is included in the cache key to prevent this from happening (see the sketch below).

By [@Geal](https://github.com/Geal) in https://github.com/apollographql/router/pull/5100
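To illustrate the fix, here is a minimal sketch of how folding the planner options into the SHA-256 cache key prevents collisions. The `PlannerOptions` type is a hypothetical stand-in; the real key also covers the schema, operation, authorization metadata, and plan options:

```rust
use serde::Serialize;
use sha2::{Digest, Sha256};

// Hypothetical, simplified stand-in for the router's planner options.
#[derive(Serialize)]
struct PlannerOptions {
    defer_support: bool,
    generate_query_fragments: bool,
}

// The key changes whenever the query *or* the options change, so two
// differently-configured routers can never collide on one cache entry.
fn cache_key(query: &str, options: &PlannerOptions) -> String {
    let mut hasher = Sha256::new();
    hasher.update(query.as_bytes());
    hasher.update(serde_json::to_vec(options).expect("serialization should not fail"));
    hex::encode(hasher.finalize())
}

fn main() {
    let defer_on = PlannerOptions { defer_support: true, generate_query_fragments: false };
    let defer_off = PlannerOptions { defer_support: false, generate_query_fragments: false };
    // Same query, different options: the keys must differ.
    assert_ne!(cache_key("{ me { id } }", &defer_on), cache_key("{ me { id } }", &defer_off));
}
```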
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -492,7 +492,7 @@ commands:
environment:
# Use the settings from the "ci" profile in nextest configuration.
NEXTEST_PROFILE: ci
command: xtask test --workspace --locked
command: xtask test --workspace --locked --features ci
- run:
name: Delete large files from cache
command: |
3 changes: 3 additions & 0 deletions apollo-router/Cargo.toml
@@ -57,6 +57,9 @@ docs_rs = ["router-bridge/docs_rs"]
# and not yet ready for production use.
telemetry_next = []

# Set when CI builds take place. Allows us to disable some tests when CI is running on certain platforms.
ci = []

[package.metadata.docs.rs]
features = ["docs_rs"]

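For context, a cargo feature like this is typically consumed in test code with `cfg` attributes. The example below is hypothetical and not part of this diff:

```rust
#[cfg(test)]
mod tests {
    // Hypothetical test that is compiled out when building with `--features ci`,
    // e.g. because it is flaky on certain CI platforms.
    #[cfg(not(feature = "ci"))]
    #[test]
    fn depends_on_local_network() {
        assert_eq!(2 + 2, 4);
    }
}
```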
21 changes: 21 additions & 0 deletions apollo-router/src/configuration/mod.rs
@@ -414,6 +414,27 @@ impl Configuration {
.build()
).build())
}

pub(crate) fn js_query_planner_config(&self) -> router_bridge::planner::QueryPlannerConfig {
router_bridge::planner::QueryPlannerConfig {
reuse_query_fragments: self.supergraph.reuse_query_fragments,
generate_query_fragments: Some(self.supergraph.generate_query_fragments),
incremental_delivery: Some(router_bridge::planner::IncrementalDeliverySupport {
enable_defer: Some(self.supergraph.defer_support),
}),
graphql_validation: false,
debug: Some(router_bridge::planner::QueryPlannerDebugConfig {
bypass_planner_for_single_subgraph: None,
max_evaluated_plans: self
.supergraph
.query_planning
.experimental_plans_limit
.or(Some(10000)),
paths_limit: self.supergraph.query_planning.experimental_paths_limit,
}),
type_conditioned_fetching: self.experimental_type_conditioned_fetching,
}
}
}

impl Default for Configuration {
25 changes: 1 addition & 24 deletions apollo-router/src/query_planner/bridge_query_planner.rs
@@ -15,12 +15,9 @@ use futures::future::BoxFuture;
use opentelemetry_api::metrics::MeterProvider as _;
use opentelemetry_api::metrics::ObservableGauge;
use opentelemetry_api::KeyValue;
use router_bridge::planner::IncrementalDeliverySupport;
use router_bridge::planner::PlanOptions;
use router_bridge::planner::PlanSuccess;
use router_bridge::planner::Planner;
use router_bridge::planner::QueryPlannerConfig;
use router_bridge::planner::QueryPlannerDebugConfig;
use router_bridge::planner::UsageReporting;
use serde::Deserialize;
use serde_json_bytes::Map;
@@ -159,27 +156,7 @@ impl PlannerMode {
configuration: &Configuration,
old_planner: Option<Arc<Planner<QueryPlanResult>>>,
) -> Result<Arc<Planner<QueryPlanResult>>, ServiceBuildError> {
let query_planner_configuration = QueryPlannerConfig {
reuse_query_fragments: configuration.supergraph.reuse_query_fragments,
generate_query_fragments: Some(configuration.supergraph.generate_query_fragments),
incremental_delivery: Some(IncrementalDeliverySupport {
enable_defer: Some(configuration.supergraph.defer_support),
}),
graphql_validation: false,
debug: Some(QueryPlannerDebugConfig {
bypass_planner_for_single_subgraph: None,
max_evaluated_plans: configuration
.supergraph
.query_planning
.experimental_plans_limit
.or(Some(10000)),
paths_limit: configuration
.supergraph
.query_planning
.experimental_paths_limit,
}),
type_conditioned_fetching: configuration.experimental_type_conditioned_fetching,
};
let query_planner_configuration = configuration.js_query_planner_config();
let planner = match old_planner {
None => Planner::new(sdl.to_owned(), query_planner_configuration).await?,
Some(old_planner) => {
58 changes: 55 additions & 3 deletions apollo-router/src/query_planner/caching_query_planner.rs
@@ -12,7 +12,9 @@ use rand::seq::SliceRandom;
use rand::thread_rng;
use router_bridge::planner::PlanOptions;
use router_bridge::planner::Planner;
use router_bridge::planner::QueryPlannerConfig;
use router_bridge::planner::UsageReporting;
use serde::Serialize;
use sha2::Digest;
use sha2::Sha256;
use tower::BoxError;
@@ -50,6 +52,15 @@ pub(crate) type Plugins = IndexMap<String, Box<dyn QueryPlannerPlugin>>;
pub(crate) type InMemoryCachePlanner =
InMemoryCache<CachingQueryKey, Result<QueryPlannerContent, Arc<QueryPlannerError>>>;

#[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize)]
pub(crate) enum ConfigMode {
//FIXME: add the Rust planner structure once it is hashable and serializable;
// for now, use the JS config as it is expected to be identical to the Rust one
Rust(Arc<QueryPlannerConfig>),
Both(Arc<QueryPlannerConfig>),
Js(Arc<QueryPlannerConfig>),
}
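Note that all three variants currently wrap the same JS `QueryPlannerConfig`, yet they still yield distinct cache keys: serde's default externally tagged representation includes the variant name in the serialized output. A minimal sketch (with simplified payloads) of that behavior:

```rust
use serde::Serialize;

// Simplified payloads; the real enum wraps Arc<QueryPlannerConfig>.
#[derive(Serialize)]
enum ConfigMode {
    Rust(u32),
    Js(u32),
}

fn main() {
    // The variant name is part of the serialization, so identical payloads
    // under different variants hash to different cache keys.
    assert_eq!(serde_json::to_string(&ConfigMode::Rust(1)).unwrap(), r#"{"Rust":1}"#);
    assert_eq!(serde_json::to_string(&ConfigMode::Js(1)).unwrap(), r#"{"Js":1}"#);
}
```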

/// A query planner wrapper that caches results.
///
/// The query planner performs LRU caching.
@@ -62,6 +73,8 @@ pub(crate) struct CachingQueryPlanner<T: Clone> {
schema: Arc<Schema>,
plugins: Arc<Plugins>,
enable_authorization_directives: bool,
config_mode: ConfigMode,
introspection: bool,
}

impl<T: Clone + 'static> CachingQueryPlanner<T>
@@ -90,12 +103,26 @@ where

let enable_authorization_directives =
AuthorizationPlugin::enable_directives(configuration, &schema).unwrap_or(false);

let config_mode = match configuration.experimental_query_planner_mode {
crate::configuration::QueryPlannerMode::New => {
ConfigMode::Rust(Arc::new(configuration.js_query_planner_config()))
}
crate::configuration::QueryPlannerMode::Legacy => {
ConfigMode::Js(Arc::new(configuration.js_query_planner_config()))
}
crate::configuration::QueryPlannerMode::Both => {
ConfigMode::Both(Arc::new(configuration.js_query_planner_config()))
}
};
Ok(Self {
cache,
delegate,
schema,
plugins: Arc::new(plugins),
enable_authorization_directives,
config_mode,
introspection: configuration.supergraph.introspection,
})
}

@@ -141,7 +168,9 @@ where
hash,
metadata,
plan_options,
..
config_mode: _,
sdl: _,
introspection: _,
},
_,
)| WarmUpCachingQueryKey {
@@ -150,6 +179,8 @@
hash: Some(hash.clone()),
metadata: metadata.clone(),
plan_options: plan_options.clone(),
config_mode: self.config_mode.clone(),
introspection: self.introspection,
},
)
.take(count)
@@ -181,6 +212,8 @@ where
hash: None,
metadata: CacheKeyMetadata::default(),
plan_options: PlanOptions::default(),
config_mode: self.config_mode.clone(),
introspection: self.introspection,
});
}
}
@@ -195,6 +228,8 @@ where
hash,
metadata,
plan_options,
config_mode: _,
introspection: _,
} in all_cache_keys
{
let context = Context::new();
@@ -210,6 +245,8 @@ where
sdl: Arc::clone(&self.schema.raw_sdl),
metadata,
plan_options,
config_mode: self.config_mode.clone(),
introspection: self.introspection,
};

if experimental_reuse_query_plans {
@@ -391,6 +428,8 @@ where
sdl: Arc::clone(&self.schema.raw_sdl),
metadata,
plan_options,
config_mode: self.config_mode.clone(),
introspection: self.introspection,
};

let context = request.context.clone();
@@ -530,8 +569,13 @@ pub(crate) struct CachingQueryKey {
pub(crate) hash: Arc<QueryHash>,
pub(crate) metadata: CacheKeyMetadata,
pub(crate) plan_options: PlanOptions,
pub(crate) config_mode: ConfigMode,
pub(crate) introspection: bool,
}

// Update this key every time the cache key or the query plan format has to change.
// When changed, it MUST BE CALLED OUT PROMINENTLY IN THE CHANGELOG.
const CACHE_KEY_VERSION: usize = 0;
const FEDERATION_VERSION: &str = std::env!("FEDERATION_VERSION");

impl std::fmt::Display for CachingQueryKey {
@@ -545,23 +589,29 @@ impl std::fmt::Display for CachingQueryKey {
hasher.update(
&serde_json::to_vec(&self.plan_options).expect("serialization should not fail"),
);
hasher
.update(&serde_json::to_vec(&self.config_mode).expect("serialization should not fail"));
hasher.update(&serde_json::to_vec(&self.sdl).expect("serialization should not fail"));
hasher.update([self.introspection as u8]);
> **Review thread**
>
> **Contributor Author:** Why is introspection added here but not in the `Hash` implementation below?
>
> **Contributor:** Nice catch: 43ccd9d. I am wondering, though, why we are using a manual implementation of `Hash` rather than deriving it? There seems to be nothing special about this structure that would warrant a custom impl.
>
> **Contributor Author:** I'd have to dig it up, but I think there was something in the key that could not implement `Hash` in the past.
>
> **Member:** Once we figure that out, is there somewhere we could add a comment to make that more intuitive?
>
> **Contributor:** If we don't need a custom implementation now, let's remove it and use derive; that way we don't have to worry about it in the future. If we do eventually need a custom version, we would need to add a comment and possibly fuzz test.
let metadata = hex::encode(hasher.finalize());

write!(
f,
"plan:{}:{}:{}:{}",
FEDERATION_VERSION, self.hash, operation, metadata,
"plan:{}:{}:{}:{}:{}",
CACHE_KEY_VERSION, FEDERATION_VERSION, self.hash, operation, metadata,
> **Review thread**
>
> **Member:** It's unrelated to this particular PR, but it took me a while to understand that `operation` is a hash of `self.operation`, which is actually an operation name.
>
> **Member:** Also, `self.hash` is a hash of the query and the relevant parts of the schema, taking into account only the operation matching `operation_name`; at the same time, `self.hash` doesn't include a hash of `operation_name` itself.
>
> **Member:** And `self.metadata` is actually authorization metadata. P.S. Same as the previous comment, this is unrelated to this PR; just notes to help others understand what is happening in this code.
>
> **Contributor:** I'd like to see this refactored so that the keys become self-describing.
>
> **Member:** When/where does this change get itemized/made?
>
> **Contributor:** Unrelated to this PR: why are `operation` and `metadata` hashed separately instead of combined in the same hasher?
>
> **Contributor Author:** That was a way to see how many variants there can be in the cache for a single query.
)
}
}
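Putting the pieces together, the key produced by this `Display` impl has the shape sketched below; the version and hash values are illustrative, not taken from a real deployment:

```rust
fn main() {
    // Hypothetical, truncated values shown only to illustrate the key shape:
    // plan:<cache key version>:<federation version>:<query hash>:<operation hash>:<options/config hash>
    let (version, federation, hash, operation, metadata) =
        (0, "2.7.2", "5ab4...", "d1f3...", "9c0e...");
    let key = format!("plan:{version}:{federation}:{hash}:{operation}:{metadata}");
    assert_eq!(key, "plan:0:2.7.2:5ab4...:d1f3...:9c0e...");
}
```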

impl Hash for CachingQueryKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.sdl.hash(state);
self.hash.0.hash(state);
self.operation.hash(state);
self.metadata.hash(state);
self.plan_options.hash(state);
self.config_mode.hash(state);
self.introspection.hash(state);
}
}
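As the review thread above suggests, the manual impl now hashes every field, so a derived implementation would behave the same, provided each field's own `Hash` impl covers what the manual version hashes (for example, a derived `Hash` on `QueryHash` hashes its inner bytes, matching the manual `self.hash.0.hash(state)`). A self-contained sketch with illustrative field types:

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Illustrative stand-in for the real query hash type.
#[derive(Hash)]
struct QueryHash(Vec<u8>);

// Deriving Hash hashes every field in declaration order.
#[derive(Hash)]
struct CachingQueryKey {
    operation: Option<String>,
    hash: QueryHash,
    introspection: bool,
}

fn main() {
    let key = CachingQueryKey {
        operation: Some("MyQuery".into()),
        hash: QueryHash(vec![1, 2, 3]),
        introspection: false,
    };
    let mut hasher = DefaultHasher::new();
    key.hash(&mut hasher);
    println!("{:x}", hasher.finish());
}
```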

@@ -572,6 +622,8 @@ pub(crate) struct WarmUpCachingQueryKey {
pub(crate) hash: Option<Arc<QueryHash>>,
pub(crate) metadata: CacheKeyMetadata,
pub(crate) plan_options: PlanOptions,
pub(crate) config_mode: ConfigMode,
pub(crate) introspection: bool,
}

#[cfg(test)]