Skip to content

Commit

Permalink
Reorganize python scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey committed Sep 19, 2024
1 parent 93df0d8 commit 679be88
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 40 deletions.
17 changes: 17 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
## Generate data

Set up a virtual environment with the PyArrow, PySpark, and PyOrc dependencies
required to generate the reference data:

```bash
# Run once
./scripts/setup-venv.sh
./scripts/prepare-test-data.sh
```

Then execute the tests:

```bash
cargo test
```

Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
import glob
from pyarrow import orc, feather

files = glob.glob("data/expected/*")
files = [file.removeprefix("data/expected/").removesuffix(".jsn.gz") for file in files]
dir = "tests/integration/data"

files = glob.glob(f"{dir}/expected/*")
files = [file.removeprefix(f"{dir}/expected/").removesuffix(".jsn.gz") for file in files]

ignore_files = [
"TestOrcFile.testTimestamp" # Root data type isn't struct
Expand All @@ -30,5 +32,5 @@

for file in files:
print(f"Converting {file} from ORC to feather")
table = orc.read_table(f"data/{file}.orc")
feather.write_feather(table, f"data/expected_arrow/{file}.feather")
table = orc.read_table(f"{dir}/{file}.orc")
feather.write_feather(table, f"{dir}/expected_arrow/{file}.feather")
10 changes: 6 additions & 4 deletions tests/basic/data/generate_orc.py → scripts/generate_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from pyspark.sql import SparkSession
from pyspark.sql.types import *

dir = "tests/basic/data"

# We're using Spark because it supports lzo compression writing
# (PyArrow supports all except lzo writing)

Expand Down Expand Up @@ -65,9 +67,9 @@
df.write.format("orc")\
.option("compression", c)\
.mode("overwrite")\
.save(f"./alltypes.{c}")
.save(f"{dir}/alltypes.{c}")
# Since Spark saves into a directory
# Move out and rename the expected single ORC file (because of coalesce above)
orc_file = glob.glob(f"./alltypes.{c}/*.orc")[0]
shutil.move(orc_file, f"./alltypes.{c}.orc")
shutil.rmtree(f"./alltypes.{c}")
orc_file = glob.glob(f"{dir}/alltypes.{c}/*.orc")[0]
shutil.move(orc_file, f"{dir}/alltypes.{c}.orc")
shutil.rmtree(f"{dir}/alltypes.{c}")
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from pyarrow import parquet
import pyorc

dir = "tests/basic/data"

schema = pa.schema([
pa.field('timestamp_notz', pa.timestamp("ns")),
pa.field('timestamp_utc', pa.timestamp("ns", tz="UTC")),
Expand All @@ -38,7 +40,7 @@
dttm(1900, 1, 1, 14, 25, 14),
])
table = pa.Table.from_arrays([arr, arr], schema=schema)
orc.write_table(table, "pyarrow_timestamps.orc")
orc.write_table(table, f"{dir}/pyarrow_timestamps.orc")


# pyarrow overflows when trying to write this, so we have to use pyorc instead
Expand All @@ -53,7 +55,7 @@ def to_orc(obj, tz):
id=pyorc.Int(),
timestamp=pyorc.Timestamp()
)
with open("overflowing_timestamps.orc", "wb") as f:
with open(f"{dir}/overflowing_timestamps.orc", "wb") as f:
with pyorc.Writer(
f,
schema,
Expand Down
31 changes: 31 additions & 0 deletions scripts/prepare-test-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Generate the ORC/Arrow reference test data using the project venv
# (create it first with ./scripts/setup-venv.sh).

# Fail fast: abort on any command failure, unset variable, or pipe error,
# so a failed cd/generator cannot leave later steps running in the wrong
# directory or silently producing partial data.
set -euo pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE_DIR=$SCRIPT_DIR/..
VENV_BIN=$BASE_DIR/venv/bin

# Run from the repo root so the generators' relative output paths
# (e.g. tests/basic/data) resolve correctly. Quote all expansions in
# case the checkout path contains spaces.
cd "$BASE_DIR"
"$VENV_BIN/python" "$SCRIPT_DIR/write.py"
"$VENV_BIN/python" "$SCRIPT_DIR/generate_orc.py"
"$VENV_BIN/python" "$SCRIPT_DIR/generate_orc_timestamps.py"
"$VENV_BIN/python" "$SCRIPT_DIR/generate_arrow.py"

echo "Done"

29 changes: 29 additions & 0 deletions scripts/setup-venv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# One-time setup: create a Python virtual environment at <repo>/venv and
# install the packages needed by the test-data generation scripts.

# Fail fast on any error or unset variable so a failed venv creation is
# not followed by installing packages into the wrong interpreter.
set -euo pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE_DIR=$SCRIPT_DIR/..
VENV_BIN=$BASE_DIR/venv/bin

# Quote expansions so paths containing spaces work.
python3 -m venv "$BASE_DIR/venv"

"$VENV_BIN/pip" install -U pyorc pyspark pyarrow

echo "Done"

32 changes: 17 additions & 15 deletions tests/basic/data/write.py → scripts/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import datetime
import pyorc

dir = "tests/basic/data"

data = {
"a": [1.0, 2.0, None, 4.0, 5.0],
"b": [True, False, None, True, False],
Expand Down Expand Up @@ -113,7 +115,7 @@ def _write(
],
}

_write("struct<nest:struct<a:float,b:boolean>>", nested_struct, "nested_struct.orc")
_write("struct<nest:struct<a:float,b:boolean>>", nested_struct, f"{dir}/nested_struct.orc")


nested_array = {
Expand All @@ -126,7 +128,7 @@ def _write(
],
}

_write("struct<value:array<int>>", nested_array, "nested_array.orc")
_write("struct<value:array<int>>", nested_array, f"{dir}/nested_array.orc")


nested_array_float = {
Expand All @@ -136,7 +138,7 @@ def _write(
],
}

_write("struct<value:array<float>>", nested_array_float, "nested_array_float.orc")
_write("struct<value:array<float>>", nested_array_float, f"{dir}/nested_array_float.orc")

nested_array_struct = {
"value": [
Expand All @@ -145,7 +147,7 @@ def _write(
],
}

_write("struct<value:array<struct<a:float,b:int,c:string>>>", nested_array_struct, "nested_array_struct.orc")
_write("struct<value:array<struct<a:float,b:int,c:string>>>", nested_array_struct, f"{dir}/nested_array_struct.orc")

nested_map = {
"map": [
Expand All @@ -156,7 +158,7 @@ def _write(
],
}

_write("struct<map:map<string,int>>", nested_map, "nested_map.orc")
_write("struct<map:map<string,int>>", nested_map, f"{dir}/nested_map.orc")

nested_map_struct = {
"map": [
Expand All @@ -166,46 +168,46 @@ def _write(
],
}

_write("struct<value:map<string,struct<a:float,b:int,c:string>>>", nested_map_struct, "nested_map_struct.orc")
_write("struct<value:map<string,struct<a:float,b:int,c:string>>>", nested_map_struct, f"{dir}/nested_map_struct.orc")


_write(
infer_schema(data),
data,
"test.orc",
f"{dir}/test.orc",
)

data_boolean = {
"long": [True] * 32,
}

_write("struct<long:boolean>", data_boolean, "long_bool.orc")
_write("struct<long:boolean>", data_boolean, f"{dir}/long_bool.orc")

_write("struct<long:boolean>", data_boolean, "long_bool_gzip.orc", pyorc.CompressionKind.ZLIB)
_write("struct<long:boolean>", data_boolean, f"{dir}/long_bool_gzip.orc", pyorc.CompressionKind.ZLIB)

data_dict = {
"dict": ["abcd", "efgh"] * 32,
}

_write("struct<dict:string>", data_dict, "string_long.orc")
_write("struct<dict:string>", data_dict, f"{dir}/string_long.orc")

data_dict = {
"dict": ["abc", "efgh"] * 32,
}

_write("struct<dict:string>", data_dict, "string_dict.orc", dict_key_size_threshold=0.1)
_write("struct<dict:string>", data_dict, f"{dir}/string_dict.orc", dict_key_size_threshold=0.1)

_write("struct<dict:string>", data_dict, "string_dict_gzip.orc", pyorc.CompressionKind.ZLIB)
_write("struct<dict:string>", data_dict, f"{dir}/string_dict_gzip.orc", pyorc.CompressionKind.ZLIB)

data_dict = {
"dict": ["abcd", "efgh"] * (10**4 // 2),
}

_write("struct<dict:string>", data_dict, "string_long_long.orc")
_write("struct<dict:string>", data_dict, "string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
_write("struct<dict:string>", data_dict, f"{dir}/string_long_long.orc")
_write("struct<dict:string>", data_dict, f"{dir}/string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)

long_f32 = {
"dict": [random.uniform(0, 1) for _ in range(10**6)],
}

_write("struct<dict:float>", long_f32, "f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
_write("struct<dict:float>", long_f32, f"{dir}/f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
15 changes: 0 additions & 15 deletions tests/basic/data/README.md

This file was deleted.

0 comments on commit 679be88

Please sign in to comment.