Skip to content

Commit

Permalink
Reorganize python scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey committed Sep 19, 2024
1 parent 93df0d8 commit 679be88
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 40 deletions.
17 changes: 17 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
## Generate data

Set up a virtual environment with the PyArrow, PySpark, and PyOrc dependencies
required to generate the reference data:

```bash
# Run once
./scripts/setup-venv.sh
./scripts/prepare-test-data.sh
```

Then execute the tests:

```bash
cargo test
```

Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
import glob
from pyarrow import orc, feather

files = glob.glob("data/expected/*")
files = [file.removeprefix("data/expected/").removesuffix(".jsn.gz") for file in files]
dir = "tests/integration/data"

files = glob.glob(f"{dir}/expected/*")
files = [file.removeprefix(f"{dir}/expected/").removesuffix(".jsn.gz") for file in files]

ignore_files = [
"TestOrcFile.testTimestamp" # Root data type isn't struct
Expand All @@ -30,5 +32,5 @@

for file in files:
print(f"Converting {file} from ORC to feather")
table = orc.read_table(f"data/{file}.orc")
feather.write_feather(table, f"data/expected_arrow/{file}.feather")
table = orc.read_table(f"{dir}/{file}.orc")
feather.write_feather(table, f"{dir}/expected_arrow/{file}.feather")
10 changes: 6 additions & 4 deletions tests/basic/data/generate_orc.py → scripts/generate_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from pyspark.sql import SparkSession
from pyspark.sql.types import *

dir = "tests/basic/data"

# We're using Spark because it supports lzo compression writing
# (PyArrow supports all except lzo writing)

Expand Down Expand Up @@ -65,9 +67,9 @@
df.write.format("orc")\
.option("compression", c)\
.mode("overwrite")\
.save(f"./alltypes.{c}")
.save(f"{dir}/alltypes.{c}")
# Since Spark saves into a directory
# Move out and rename the expected single ORC file (because of coalesce above)
orc_file = glob.glob(f"./alltypes.{c}/*.orc")[0]
shutil.move(orc_file, f"./alltypes.{c}.orc")
shutil.rmtree(f"./alltypes.{c}")
orc_file = glob.glob(f"{dir}/alltypes.{c}/*.orc")[0]
shutil.move(orc_file, f"{dir}/alltypes.{c}.orc")
shutil.rmtree(f"{dir}/alltypes.{c}")
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from pyarrow import parquet
import pyorc

dir = "tests/basic/data"

schema = pa.schema([
pa.field('timestamp_notz', pa.timestamp("ns")),
pa.field('timestamp_utc', pa.timestamp("ns", tz="UTC")),
Expand All @@ -38,7 +40,7 @@
dttm(1900, 1, 1, 14, 25, 14),
])
table = pa.Table.from_arrays([arr, arr], schema=schema)
orc.write_table(table, "pyarrow_timestamps.orc")
orc.write_table(table, f"{dir}/pyarrow_timestamps.orc")


# pyarrow overflows when trying to write this, so we have to use pyorc instead
Expand All @@ -53,7 +55,7 @@ def to_orc(obj, tz):
id=pyorc.Int(),
timestamp=pyorc.Timestamp()
)
with open("overflowing_timestamps.orc", "wb") as f:
with open(f"{dir}/overflowing_timestamps.orc", "wb") as f:
with pyorc.Writer(
f,
schema,
Expand Down
31 changes: 31 additions & 0 deletions scripts/prepare-test-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Generate the ORC/Arrow reference test data using the project venv
# (create it first with ./scripts/setup-venv.sh).

# Fail fast: abort on any command failure, unset variable, or pipe error,
# so a failed cd/generator cannot leave later steps running in the wrong
# directory or silently producing partial data.
set -euo pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE_DIR=$SCRIPT_DIR/..
VENV_BIN=$BASE_DIR/venv/bin

# Run from the repo root so the generators' relative output paths
# (e.g. tests/basic/data) resolve correctly. Quote all expansions in
# case the checkout path contains spaces.
cd "$BASE_DIR"
"$VENV_BIN/python" "$SCRIPT_DIR/write.py"
"$VENV_BIN/python" "$SCRIPT_DIR/generate_orc.py"
"$VENV_BIN/python" "$SCRIPT_DIR/generate_orc_timestamps.py"
"$VENV_BIN/python" "$SCRIPT_DIR/generate_arrow.py"

echo "Done"

29 changes: 29 additions & 0 deletions scripts/setup-venv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# One-time setup: create a Python virtual environment at <repo>/venv and
# install the packages needed by the test-data generation scripts.

# Fail fast on any error or unset variable so a failed venv creation is
# not followed by installing packages into the wrong interpreter.
set -euo pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE_DIR=$SCRIPT_DIR/..
VENV_BIN=$BASE_DIR/venv/bin

# Quote expansions so paths containing spaces work.
python3 -m venv "$BASE_DIR/venv"

"$VENV_BIN/pip" install -U pyorc pyspark pyarrow

echo "Done"

32 changes: 17 additions & 15 deletions tests/basic/data/write.py → scripts/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import datetime
import pyorc

dir = "tests/basic/data"

data = {
"a": [1.0, 2.0, None, 4.0, 5.0],
"b": [True, False, None, True, False],
Expand Down Expand Up @@ -113,7 +115,7 @@ def _write(
],
}

_write("struct<nest:struct<a:float,b:boolean>>", nested_struct, "nested_struct.orc")
_write("struct<nest:struct<a:float,b:boolean>>", nested_struct, f"{dir}/nested_struct.orc")


nested_array = {
Expand All @@ -126,7 +128,7 @@ def _write(
],
}

_write("struct<value:array<int>>", nested_array, "nested_array.orc")
_write("struct<value:array<int>>", nested_array, f"{dir}/nested_array.orc")


nested_array_float = {
Expand All @@ -136,7 +138,7 @@ def _write(
],
}

_write("struct<value:array<float>>", nested_array_float, "nested_array_float.orc")
_write("struct<value:array<float>>", nested_array_float, f"{dir}/nested_array_float.orc")

nested_array_struct = {
"value": [
Expand All @@ -145,7 +147,7 @@ def _write(
],
}

_write("struct<value:array<struct<a:float,b:int,c:string>>>", nested_array_struct, "nested_array_struct.orc")
_write("struct<value:array<struct<a:float,b:int,c:string>>>", nested_array_struct, f"{dir}/nested_array_struct.orc")

nested_map = {
"map": [
Expand All @@ -156,7 +158,7 @@ def _write(
],
}

_write("struct<map:map<string,int>>", nested_map, "nested_map.orc")
_write("struct<map:map<string,int>>", nested_map, f"{dir}/nested_map.orc")

nested_map_struct = {
"map": [
Expand All @@ -166,46 +168,46 @@ def _write(
],
}

_write("struct<value:map<string,struct<a:float,b:int,c:string>>>", nested_map_struct, "nested_map_struct.orc")
_write("struct<value:map<string,struct<a:float,b:int,c:string>>>", nested_map_struct, f"{dir}/nested_map_struct.orc")


_write(
infer_schema(data),
data,
"test.orc",
f"{dir}/test.orc",
)

data_boolean = {
"long": [True] * 32,
}

_write("struct<long:boolean>", data_boolean, "long_bool.orc")
_write("struct<long:boolean>", data_boolean, f"{dir}/long_bool.orc")

_write("struct<long:boolean>", data_boolean, "long_bool_gzip.orc", pyorc.CompressionKind.ZLIB)
_write("struct<long:boolean>", data_boolean, f"{dir}/long_bool_gzip.orc", pyorc.CompressionKind.ZLIB)

data_dict = {
"dict": ["abcd", "efgh"] * 32,
}

_write("struct<dict:string>", data_dict, "string_long.orc")
_write("struct<dict:string>", data_dict, f"{dir}/string_long.orc")

data_dict = {
"dict": ["abc", "efgh"] * 32,
}

_write("struct<dict:string>", data_dict, "string_dict.orc", dict_key_size_threshold=0.1)
_write("struct<dict:string>", data_dict, f"{dir}/string_dict.orc", dict_key_size_threshold=0.1)

_write("struct<dict:string>", data_dict, "string_dict_gzip.orc", pyorc.CompressionKind.ZLIB)
_write("struct<dict:string>", data_dict, f"{dir}/string_dict_gzip.orc", pyorc.CompressionKind.ZLIB)

data_dict = {
"dict": ["abcd", "efgh"] * (10**4 // 2),
}

_write("struct<dict:string>", data_dict, "string_long_long.orc")
_write("struct<dict:string>", data_dict, "string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
_write("struct<dict:string>", data_dict, f"{dir}/string_long_long.orc")
_write("struct<dict:string>", data_dict, f"{dir}/string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)

long_f32 = {
"dict": [random.uniform(0, 1) for _ in range(10**6)],
}

_write("struct<dict:float>", long_f32, "f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
_write("struct<dict:float>", long_f32, f"{dir}/f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
15 changes: 0 additions & 15 deletions tests/basic/data/README.md

This file was deleted.

0 comments on commit 679be88

Please sign in to comment.