Introduce new optional scheduler, using Morsel-driven Parallelism + rayon (#2199) (#2226)

* Morsel-driven Parallelism using rayon (#2199)

* Fix LIFO spawn ordering

* Further docs for ExecutionPipeline

* Deduplicate concurrent wakes

* Add license headers

* Sort Cargo.toml

* Revert accidental change to ParquetExec

* Handle wakeups triggered by other threads

* Use SeqCst memory ordering

* Review feedback

* Add panic handler

* Cleanup structs

Add test of tokio interoperation

* Review feedback

* Use BatchPartitioner

Cleanup error handling

* Clarify shutdown characteristics

* Fix racy test_panic

* Don't overload Query nomenclature

* Rename QueryResults to ExecutionResults

* Further review feedback

* Merge scheduler into datafusion/core

* Review feedback

* Fix partitioned execution

* Format

* Format Cargo.toml

* Fix doc link
tustvold authored May 4, 2022
1 parent e8ba45c commit dc76ec1
Showing 11 changed files with 1,905 additions and 19 deletions.
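The headline change is an opt-in scheduler built on morsel-driven parallelism: rather than leaving one long-running tokio task per partition, execution is chopped into small units of work that a fixed pool of worker threads pulls from dynamically, so skewed partitions no longer leave cores idle. As a rough illustration of the general pattern only (not this commit's scheduler, which pipelines DataFusion operators via rayon, per the diffs below):

```rust
// Illustrative sketch of morsel-driven parallelism in general, using rayon's
// work-stealing pool; DataFusion's actual scheduler is more elaborate.
use rayon::prelude::*;

fn main() {
    // A worker pool with a fixed thread count, analogous to Scheduler::new(4).
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(4)
        .build()
        .unwrap();

    let data: Vec<u64> = (0..1_000_000).collect();

    // Each fixed-size chunk is a "morsel": idle workers steal pending morsels,
    // so uneven per-chunk cost does not serialize execution.
    let sum: u64 = pool.install(|| {
        data.par_chunks(64 * 1024)
            .map(|morsel| morsel.iter().sum::<u64>())
            .sum()
    });

    assert_eq!(sum, 499_999_500_000);
}
```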
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -150,7 +150,7 @@ The parquet SQL benchmarks can be run with
cargo bench --bench parquet_query_sql
```

-These randomly generate a parquet file, and then benchmark queries sourced from [parquet_query_sql.sql](./datafusion/benches/parquet_query_sql.sql) against it. This can therefore be a quick way to add coverage of particular query and/or data paths.
+These randomly generate a parquet file, and then benchmark queries sourced from [parquet_query_sql.sql](./datafusion/core/benches/parquet_query_sql.sql) against it. This can therefore be a quick way to add coverage of particular query and/or data paths.

If the environment variable `PARQUET_FILE` is set, the benchmark will run queries against this file instead of a randomly generated one. This can be useful for performing multiple runs, potentially with different code, against the same source data, or for testing against a custom dataset.

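The `PARQUET_FILE` override described above usually boils down to an environment-variable check with a fallback; a minimal sketch (`resolve_parquet_path` is a hypothetical helper, not the benchmark's actual code):

```rust
use std::env;

// Hypothetical helper: prefer a user-supplied dataset from PARQUET_FILE,
// otherwise fall back to the randomly generated temporary file.
fn resolve_parquet_path(generate: impl FnOnce() -> String) -> String {
    match env::var("PARQUET_FILE") {
        Ok(path) if !path.is_empty() => path,
        _ => generate(),
    }
}

fn main() {
    let path = resolve_parquet_path(|| "/tmp/generated.parquet".to_owned());
    println!("benchmarking against {}", path);
}
```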
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -17,8 +17,8 @@

[workspace]
members = [
"datafusion/core",
"datafusion/common",
"datafusion/core",
"datafusion/expr",
"datafusion/jit",
"datafusion/physical-expr",
9 changes: 7 additions & 2 deletions datafusion/core/Cargo.toml
@@ -24,7 +24,7 @@ repository = "https://github.com/apache/arrow-datafusion"
readme = "../README.md"
authors = ["Apache Arrow <dev@arrow.apache.org>"]
license = "Apache-2.0"
-keywords = [ "arrow", "query", "sql" ]
+keywords = ["arrow", "query", "sql"]
include = [
"benches/*.rs",
"src/**/*.rs",
@@ -50,6 +50,8 @@ pyarrow = ["pyo3", "arrow/pyarrow", "datafusion-common/pyarrow"]
regex_expressions = ["datafusion-physical-expr/regex_expressions"]
# Used to enable row format experiment
row = ["datafusion-row"]
+# Used to enable scheduler
+scheduler = ["rayon"]
simd = ["arrow/simd"]
unicode_expressions = ["datafusion-physical-expr/regex_expressions"]

@@ -75,9 +77,10 @@ ordered-float = "3.0"
parking_lot = "0.12"
parquet = { version = "13", features = ["arrow"] }
paste = "^1.0"
-pin-project-lite= "^0.2.7"
+pin-project-lite = "^0.2.7"
pyo3 = { version = "0.16", optional = true }
rand = "0.8"
+rayon = { version = "1.5", optional = true }
smallvec = { version = "1.6", features = ["union"] }
sqlparser = "0.16"
tempfile = "3"
@@ -88,6 +91,7 @@ uuid = { version = "1.0", features = ["v4"] }
[dev-dependencies]
criterion = "0.3"
doc-comment = "0.3"
+env_logger = "0.9"
fuzz-utils = { path = "fuzz-utils" }

[[bench]]
@@ -121,6 +125,7 @@ name = "physical_plan"
[[bench]]
harness = false
name = "parquet_query_sql"
+required-features = ["scheduler"]

[[bench]]
harness = false
65 changes: 50 additions & 15 deletions datafusion/core/benches/parquet_query_sql.rs
@@ -24,7 +24,9 @@ use arrow::datatypes::{
};
use arrow::record_batch::RecordBatch;
use criterion::{criterion_group, criterion_main, Criterion};
-use datafusion::prelude::{ParquetReadOptions, SessionContext};
+use datafusion::prelude::{SessionConfig, SessionContext};
+use datafusion::scheduler::Scheduler;
+use futures::stream::StreamExt;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::{WriterProperties, WriterVersion};
use rand::distributions::uniform::SampleUniform;
@@ -37,7 +39,6 @@ use std::path::Path;
use std::sync::Arc;
use std::time::Instant;
use tempfile::NamedTempFile;
-use tokio_stream::StreamExt;

/// The number of batches to write
const NUM_BATCHES: usize = 2048;
@@ -193,15 +194,24 @@ fn criterion_benchmark(c: &mut Criterion) {
     assert!(Path::new(&file_path).exists(), "path not found");
     println!("Using parquet file {}", file_path);

-    let context = SessionContext::new();
+    let partitions = 4;
+    let config = SessionConfig::new().with_target_partitions(partitions);
+    let context = SessionContext::with_config(config);

-    let rt = tokio::runtime::Builder::new_multi_thread().build().unwrap();
-    rt.block_on(context.register_parquet(
-        "t",
-        file_path.as_str(),
-        ParquetReadOptions::default(),
-    ))
-    .unwrap();
+    let scheduler = Scheduler::new(partitions);
+
+    let local_rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .unwrap();
+
+    let query_rt = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(partitions)
+        .build()
+        .unwrap();
+
+    local_rt
+        .block_on(context.register_parquet("t", file_path.as_str(), Default::default()))
+        .unwrap();

     // We read the queries from a file so they can be changed without recompiling the benchmark
     let mut queries_file = File::open("benches/parquet_query_sql.sql").unwrap();
@@ -220,17 +230,42 @@
             continue;
         }

-        let query = query.as_str();
-        c.bench_function(query, |b| {
+        c.bench_function(&format!("tokio: {}", query), |b| {
             b.iter(|| {
                 let query = query.clone();
                 let context = context.clone();
-                rt.block_on(async move {
-                    let query = context.sql(query).await.unwrap();
+                let (sender, mut receiver) = futures::channel::mpsc::unbounded();
+
+                // Spawn work to a separate tokio thread pool
+                query_rt.spawn(async move {
+                    let query = context.sql(&query).await.unwrap();
                     let mut stream = query.execute_stream().await.unwrap();
-                    while criterion::black_box(stream.next().await).is_some() {}
+
+                    while let Some(next) = stream.next().await {
+                        sender.unbounded_send(next).unwrap();
+                    }
+                });
+
+                local_rt.block_on(async {
+                    while receiver.next().await.transpose().unwrap().is_some() {}
+                })
             });
         });

+        c.bench_function(&format!("scheduled: {}", query), |b| {
+            b.iter(|| {
+                let query = query.clone();
+                let context = context.clone();
+
+                local_rt.block_on(async {
+                    let query = context.sql(&query).await.unwrap();
+                    let plan = query.create_physical_plan().await.unwrap();
+                    let mut stream =
+                        scheduler.schedule(plan, context.task_ctx()).unwrap();
+                    while stream.next().await.transpose().unwrap().is_some() {}
+                });
+            });
+        });
     }

     // Temporary file must outlive the benchmarks, it is deleted when dropped
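The benchmark now runs every query twice, once polled on a multi-threaded tokio runtime (`tokio: ...`) and once through the new scheduler (`scheduled: ...`), so the two execution paths can be compared like-for-like. Distilled from the diff above, driving the scheduler end-to-end looks roughly like this, a sketch assuming the crate is built with the `scheduler` feature (the benchmark itself demands it via `required-features`) and with table registration elided:

```rust
use datafusion::error::Result;
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion::scheduler::Scheduler;
use futures::stream::StreamExt;

async fn run_scheduled(sql: &str) -> Result<()> {
    let partitions = 4;
    let context = SessionContext::with_config(
        SessionConfig::new().with_target_partitions(partitions),
    );
    let scheduler = Scheduler::new(partitions);

    // ... register tables on `context` here, e.g. via register_parquet ...

    // Plan with DataFusion as usual, then hand the physical plan to the
    // scheduler instead of polling it on the tokio runtime.
    let plan = context.sql(sql).await?.create_physical_plan().await?;
    let mut stream = scheduler.schedule(plan, context.task_ctx())?;

    // The result is still a stream of Result<RecordBatch> to drain.
    while let Some(batch) = stream.next().await {
        let _batch = batch?;
    }
    Ok(())
}
```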
2 changes: 2 additions & 0 deletions datafusion/core/src/lib.rs
@@ -218,6 +218,8 @@ pub mod physical_optimizer;
pub mod physical_plan;
pub mod prelude;
pub mod scalar;
+#[cfg(feature = "scheduler")]
+pub mod scheduler;
pub mod sql;
pub mod variable;
