Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid serialization of pyarrow tables using PyArrowConvert in pre_transform_spec #111

Merged
merged 14 commits into from
May 19, 2022
Merged
3 changes: 3 additions & 0 deletions .github/workflows/build_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ jobs:
uses: actions/setup-node@v2
with:
node-version: '17'
- name: Install protoc
run: |
sudo snap install protobuf --classic
- name: Build package
working-directory: vegafusion-wasm/
run: |
Expand Down
21 changes: 21 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 10 additions & 10 deletions python/vegafusion-jupyter/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* VegaFusion
* Copyright (C) 2022 Jon Mease
* Copyright (C) 2022 VegaFusion Technologies LLC
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
Expand Down
24 changes: 18 additions & 6 deletions python/vegafusion/vegafusion/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
# this program the details of the active license.
import json
import psutil
from .transformer import to_arrow_ipc_bytes
import pyarrow as pa


class VegaFusionRuntime:
def __init__(self, cache_capacity, memory_limit, worker_threads):
Expand Down Expand Up @@ -71,9 +72,9 @@ def pre_transform_spec(self, spec, local_tz, row_limit=None, inline_datasets=Non
:param row_limit: Maximum number of dataset rows to include in the returned
specification. If exceeded, datasets will be truncated to this number of rows
and a RowLimitExceeded warning will be included in the resulting warnings list
:param inline_datasets: A dict from dataset names to pandas DataFrames. Inline
datasets may be referenced by the input specification using the following
url syntax 'vegafusion+dataset://{dataset_name}'.
:param inline_datasets: A dict from dataset names to pandas DataFrames or pyarrow
Tables. Inline datasets may be referenced by the input specification using
the following url syntax 'vegafusion+dataset://{dataset_name}'.
:return:
Two-element tuple:
0. A string containing the JSON representation of a Vega specification
Expand All @@ -88,14 +89,25 @@ def pre_transform_spec(self, spec, local_tz, row_limit=None, inline_datasets=Non
'Unsupported': No transforms in the provided Vega specification were
eligible for pre-transforming
"""
from .transformer import to_arrow_table

if self._grpc_channel:
raise ValueError("pre_transform_spec not yet supported over gRPC")
else:
# Preprocess inline_dataset
inline_datasets = inline_datasets or dict()
inline_datasets = {name: to_arrow_ipc_bytes(value, stream=True) for name, value in inline_datasets.items()}
inline_batches = dict()
for name, value in inline_datasets.items():
if isinstance(value, pa.Table):
table = value
else:
table = to_arrow_table(value)
schema = table.schema
batches = table.to_batches(max_chunksize=8096)
inline_batches[name] = (schema, batches)

new_spec, warnings = self.embedded_runtime.pre_transform_spec(
spec, local_tz=local_tz, row_limit=row_limit, inline_datasets=inline_datasets
spec, local_tz=local_tz, row_limit=row_limit, inline_datasets=inline_batches
)
warnings = json.loads(warnings)
return new_spec, warnings
Expand Down
25 changes: 19 additions & 6 deletions python/vegafusion/vegafusion/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@
import pandas as pd


def to_arrow_ipc_bytes(data, stream=False):
def to_arrow_table(data):
"""
Helper to convert a Pandas DataFrame to the Arrow IPC binary format
Helper to convert a Pandas DataFrame to a PyArrow Table

:param data: Pandas DataFrame
:param stream: If True, write IPC Stream format. If False (default), write ipc file format.
:return: bytes
:return: pyarrow.Table
"""
import pyarrow as pa

Expand All @@ -36,7 +35,7 @@ def to_arrow_ipc_bytes(data, stream=False):
cat = data[col].cat
data[col] = cat.categories[cat.codes]

# Serialize DataFrame to bytes in the arrow file format
# Convert DataFrame to table
try:
table = pa.Table.from_pandas(data)
except pa.ArrowTypeError as e:
Expand All @@ -47,8 +46,22 @@ def to_arrow_ipc_bytes(data, stream=False):
if dtype.kind == "O":
mapping[col] = data[col].astype(str)
data = data.assign(**mapping)
# Try again, allowing exception to propagate
# Try again, allowing exception to propagate this time
table = pa.Table.from_pandas(data)
return table


def to_arrow_ipc_bytes(data, stream=False):
"""
Helper to convert a Pandas DataFrame to the Arrow IPC binary format

:param data: Pandas DataFrame
:param stream: If True, write IPC Stream format. If False (default), write ipc file format.
:return: bytes
"""
import pyarrow as pa

table = to_arrow_table(data)

# Next we write the Arrow table as a feather file (The Arrow IPC format on disk).
# Write it in memory first so we can hash the contents before touching disk.
Expand Down
1 change: 1 addition & 0 deletions vegafusion-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ license = "AGPL-3.0-or-later"

[features]
tonic_support = [ "tonic", "tonic-build",]
pyarrow = ["pyo3", "arrow/pyarrow", "datafusion-common/pyarrow"]

[dependencies]
thiserror = "^1.0.29"
Expand Down
5 changes: 5 additions & 0 deletions vegafusion-python-embed/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ features = [ "pyo3",]

[dependencies.vegafusion-rt-datafusion]
path = "../vegafusion-rt-datafusion"
features = ["pyarrow"]

[dependencies.tokio]
version = "1.18.1"
Expand All @@ -29,3 +30,7 @@ features = [ "macros", "rt-multi-thread",]
[dependencies.pyo3]
version = "0.16.4"
features = [ "extension-module",]

[dependencies.mimalloc]
version = "*"
default-features = false
21 changes: 18 additions & 3 deletions vegafusion-python-embed/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,26 @@
*/
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyDict, PyString};
use pyo3::types::{PyBytes, PyDict, PyList, PyString, PyTuple};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::runtime::Runtime;
use vegafusion_core::error::ToExternalError;
use vegafusion_core::proto::gen::pretransform::pre_transform_warning::WarningType;
use vegafusion_core::proto::gen::services::pre_transform_result;
use vegafusion_rt_datafusion::task_graph::runtime::TaskGraphRuntime;

use serde::{Deserialize, Serialize};
use vegafusion_core::arrow::datatypes::Schema;
use vegafusion_core::arrow::pyarrow::PyArrowConvert;
use vegafusion_core::arrow::record_batch::RecordBatch;
use vegafusion_core::data::table::VegaFusionTable;

use mimalloc::MiMalloc;

#[global_allocator]
static GLOBAL: MiMalloc = MiMalloc;

#[derive(Clone, Serialize, Deserialize)]
struct PreTransformWarning {
#[serde(rename = "type")]
Expand Down Expand Up @@ -76,10 +85,16 @@ impl PyTaskGraphRuntime {
.iter()
.map(|(name, table_bytes)| {
let name = name.cast_as::<PyString>()?;
let table_bytes = table_bytes.cast_as::<PyBytes>()?;
let tuple = table_bytes.cast_as::<PyTuple>()?;
let schema = Schema::from_pyarrow(tuple.get_item(0)?)?;
let list = tuple.get_item(1)?.cast_as::<PyList>()?;
let batches: Vec<_> = list
.iter()
.map(RecordBatch::from_pyarrow)
.collect::<PyResult<Vec<_>>>()?;
Ok((
name.to_string(),
VegaFusionTable::from_ipc_bytes(table_bytes.as_bytes())?,
VegaFusionTable::try_new(Arc::new(schema), batches)?,
))
})
.collect::<PyResult<HashMap<_, _>>>()?;
Expand Down
3 changes: 3 additions & 0 deletions vegafusion-rt-datafusion/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ version = "0.4.0"
edition = "2021"
license = "AGPL-3.0-or-later"

[features]
pyarrow = ["vegafusion-core/pyarrow"]

[dependencies]
regex = "^1.5.5"
lazy_static = "^1.4.0"
Expand Down