From 5dbeb4fa022cc765c5328b9588a91fd546f59e3e Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 3 Apr 2023 17:07:14 -0700 Subject: [PATCH] GH-34737: [C#] C Data interface for schemas and types (#34133) ### Rationale for this change This starts the C Data Interface implementation for C# with integration for `ArrowSchema`. `ArrowArray` will come in a follow-up PR. ### What changes are included in this PR? * Adds classes `CArrowSchema` and `ImportedArrowSchema` which allow interacting with the `CArrowSchema`. * Adds integration tests with PyArrow, inspired by the similar integration tests in [arrow-rs](https://github.com/apache/arrow-rs/blob/master/arrow/src/pyarrow.rs) ### Are these changes tested? Yes, the PyArrow integration tests validate the functionality. ### Are there any user-facing changes? This only adds new APIs, and doesn't change any existing ones. * Closes: #33856 * Closes: #34737 Lead-authored-by: Will Jones Co-authored-by: Weston Pace Signed-off-by: Eric Erhardt --- ci/scripts/csharp_test.sh | 11 + csharp/src/Apache.Arrow/C/CArrowSchema.cs | 124 +++++++ .../Apache.Arrow/C/CArrowSchemaExporter.cs | 278 +++++++++++++++ .../Apache.Arrow/C/CArrowSchemaImporter.cs | 301 ++++++++++++++++ csharp/src/Apache.Arrow/C/StringUtil.cs | 63 ++++ .../Apache.Arrow.Tests.csproj | 2 + .../CDataInterfacePythonTests.cs | 329 ++++++++++++++++++ .../CDataInterfaceSchemaTests.cs | 119 +++++++ 8 files changed, 1227 insertions(+) create mode 100644 csharp/src/Apache.Arrow/C/CArrowSchema.cs create mode 100644 csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs create mode 100644 csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs create mode 100644 csharp/src/Apache.Arrow/C/StringUtil.cs create mode 100644 csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs create mode 100644 csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs diff --git a/ci/scripts/csharp_test.sh b/ci/scripts/csharp_test.sh index 9e4e35dd40d12..e4bed4e35e3d7 100755 --- 
a/ci/scripts/csharp_test.sh
+++ b/ci/scripts/csharp_test.sh
@@ -21,6 +21,17 @@ set -ex
 
 source_dir=${1}/csharp
 
+# Python and PyArrow are required for C Data Interface tests.
+if [ -z "${PYTHON}" ]; then
+  if type python3 > /dev/null 2>&1; then
+    export PYTHON=python3
+  else
+    export PYTHON=python
+  fi
+fi
+${PYTHON} -m pip install pyarrow find-libpython
+export PYTHONNET_PYDLL=$(${PYTHON} -m find_libpython)
+
 pushd ${source_dir}
 dotnet test
 for pdb in artifacts/Apache.Arrow/*/*/Apache.Arrow.pdb; do
diff --git a/csharp/src/Apache.Arrow/C/CArrowSchema.cs b/csharp/src/Apache.Arrow/C/CArrowSchema.cs
new file mode 100644
index 0000000000000..af01247800655
--- /dev/null
+++ b/csharp/src/Apache.Arrow/C/CArrowSchema.cs
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using System.Runtime.InteropServices;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow.C
+{
+    /// <summary>
+    /// An Arrow C Data Interface Schema, which represents a type, field, or schema.
+    /// </summary>
+    /// <remarks>
+    /// This is used to export <see cref="ArrowType"/>, <see cref="Field"/>, or
+    /// <see cref="Schema"/> to other languages. It matches the layout of the
+    /// ArrowSchema struct described in https://github.com/apache/arrow/blob/main/cpp/src/arrow/c/abi.h.
+    /// </remarks>
+    [StructLayout(LayoutKind.Sequential)]
+    public unsafe struct CArrowSchema
+    {
+        // The order and types of these fields are the sequential layout of the struct;
+        // they must not be reordered.
+        public byte* format;
+        public byte* name;
+        public byte* metadata;
+        public long flags;
+        public long n_children;
+        public CArrowSchema** children;
+        public CArrowSchema* dictionary;
+        public delegate* unmanaged[Stdcall]<CArrowSchema*, void> release;
+        public void* private_data;
+
+        /// <summary>
+        /// Allocate and zero-initialize an unmanaged pointer of this type.
+        /// </summary>
+        /// <remarks>
+        /// This pointer must later be freed by <see cref="Free"/>.
+        /// </remarks>
+        /// <returns>A pointer to a struct whose fields are all null/zero.</returns>
+        public static CArrowSchema* Create()
+        {
+            var ptr = (CArrowSchema*)Marshal.AllocHGlobal(sizeof(CArrowSchema));
+
+            // Clear every field explicitly: a null release pointer is what marks
+            // the struct as "not exported / already released".
+            *ptr = default;
+
+            return ptr;
+        }
+
+        /// <summary>
+        /// Free a pointer that was allocated in <see cref="Create"/>.
+        /// </summary>
+        /// <remarks>
+        /// Do not call this on a pointer that was allocated elsewhere.
+        /// Passing a null pointer is a no-op.
+        /// </remarks>
+        /// <param name="schema">The pointer to release and free.</param>
+        public static void Free(CArrowSchema* schema)
+        {
+            if (schema == null)
+            {
+                return;
+            }
+
+            if (schema->release != null)
+            {
+                // Call release if not already called.
+                schema->release(schema);
+            }
+            Marshal.FreeHGlobal((IntPtr)schema);
+        }
+
+
+        /// <summary>
+        /// For dictionary-encoded types, whether the ordering of dictionary indices is semantically meaningful.
+        /// </summary>
+        public const long ArrowFlagDictionaryOrdered = 1;
+        /// <summary>
+        /// Whether this field is semantically nullable (regardless of whether it actually has null values)
+        /// </summary>
+        public const long ArrowFlagNullable = 2;
+        /// <summary>
+        /// For map types, whether the keys within each map value are sorted.
+        /// </summary>
+        public const long ArrowFlagMapKeysSorted = 4;
+
+        /// <summary>
+        /// Get the value of a particular flag.
+        /// </summary>
+        /// <remarks>
+        /// Known valid flags are <see cref="ArrowFlagDictionaryOrdered"/>,
+        /// <see cref="ArrowFlagNullable"/>, and <see cref="ArrowFlagMapKeysSorted"/>.
+        /// </remarks>
+        /// <param name="flag">The flag bit(s) to test.</param>
+        /// <returns>True when every bit of <paramref name="flag"/> is set in <c>flags</c>.</returns>
+        public readonly bool GetFlag(long flag)
+        {
+            return (flags & flag) == flag;
+        }
+
+        /// <summary>
+        /// Get the child schema pointer at index <paramref name="i"/>.
+        /// </summary>
+        /// <param name="i">Zero-based child index; must be less than <c>n_children</c>.</param>
+        /// <returns>The stored child pointer (which may itself be null).</returns>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// The index is negative or not less than <c>n_children</c>, or the children array is null.
+        /// </exception>
+        internal readonly CArrowSchema* GetChild(long i)
+        {
+            // The unsigned comparison rejects negative indices as well as too-large ones.
+            if ((ulong)i >= (ulong)n_children)
+            {
+                throw new ArgumentOutOfRangeException(nameof(i), i, "Child index out of bounds.");
+            }
+            if (children == null)
+            {
+                throw new ArgumentOutOfRangeException(nameof(i), i, $"Child index '{i}' out of bounds.");
+            }
+
+            return children[i];
+        }
+    }
+}
diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs
new file mode 100644
index 0000000000000..5c517f418503a
--- /dev/null
+++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs
@@ -0,0 +1,278 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Runtime.InteropServices;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow.C
+{
+    public static class CArrowSchemaExporter
+    {
+        /// <summary>
+        /// Export a type to a <see cref="CArrowSchema"/>.
+        /// </summary>
+        /// <param name="datatype">The datatype to export</param>
+        /// <param name="schema">An allocated but uninitialized CArrowSchema pointer.</param>
+        /// <example>
+        /// <code>
+        /// CArrowSchema* exportPtr = CArrowSchema.Create();
+        /// CArrowSchemaExporter.ExportType(dataType, exportPtr);
+        /// foreign_import_function(exportPtr);
+        /// CArrowSchema.Free(exportPtr);
+        /// </code>
+        /// </example>
+        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
+        /// <exception cref="ArgumentException">The target struct already has a release callback set.</exception>
+        public static unsafe void ExportType(IArrowType datatype, CArrowSchema* schema)
+        {
+            if (datatype == null)
+            {
+                throw new ArgumentNullException(nameof(datatype));
+            }
+            if (schema == null)
+            {
+                throw new ArgumentNullException(nameof(schema));
+            }
+            if (schema->release != null)
+            {
+                throw new ArgumentException("Cannot export schema to a struct that is already initialized.");
+            }
+
+            // Resolve everything that can reject the type before allocating anything,
+            // so an unsupported type leaves the target struct untouched.
+            string format = GetFormat(datatype);
+            long flags = GetFlags(datatype);
+
+            schema->format = StringUtil.ToCStringUtf8(format);
+            schema->name = null;
+            schema->metadata = null;
+            schema->flags = flags;
+            schema->n_children = 0;
+            schema->children = null;
+            schema->dictionary = null;
+            schema->private_data = null;
+            schema->release = (delegate* unmanaged[Stdcall]<CArrowSchema*, void>)Marshal.GetFunctionPointerForDelegate(
+                s_releaseSchema);
+
+            try
+            {
+                schema->children = ConstructChildren(datatype, out long numChildren);
+                schema->n_children = numChildren;
+
+                schema->dictionary = ConstructDictionary(datatype);
+            }
+            catch
+            {
+                // Roll back the partial export so nothing allocated above is leaked
+                // and the struct is left in the released state.
+                ReleaseCArrowSchema(schema);
+                throw;
+            }
+        }
+
+        /// <summary>
+        /// Export a field to a <see cref="CArrowSchema"/>.
+        /// </summary>
+        /// <param name="field">The field to export</param>
+        /// <param name="schema">An allocated but uninitialized CArrowSchema pointer.</param>
+        /// <example>
+        /// <code>
+        /// CArrowSchema* exportPtr = CArrowSchema.Create();
+        /// CArrowSchemaExporter.ExportField(field, exportPtr);
+        /// foreign_import_function(exportPtr);
+        /// CArrowSchema.Free(exportPtr);
+        /// </code>
+        /// </example>
+        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
+        public static unsafe void ExportField(Field field, CArrowSchema* schema)
+        {
+            if (field == null)
+            {
+                throw new ArgumentNullException(nameof(field));
+            }
+
+            ExportType(field.DataType, schema);
+            schema->name = StringUtil.ToCStringUtf8(field.Name);
+            // TODO: field metadata
+            schema->metadata = null;
+            // ExportType assumed "nullable"; replace the flags with the field's real nullability.
+            schema->flags = GetFlags(field.DataType, field.IsNullable);
+        }
+
+        /// <summary>
+        /// Export a schema to a <see cref="CArrowSchema"/>.
+        /// </summary>
+        /// <remarks>
+        /// The schema is exported as a struct type whose children are the schema's fields.
+        /// </remarks>
+        /// <param name="schema">The schema to export</param>
+        /// <param name="out_schema">An allocated but uninitialized CArrowSchema pointer.</param>
+        /// <example>
+        /// <code>
+        /// CArrowSchema* exportPtr = CArrowSchema.Create();
+        /// CArrowSchemaExporter.ExportSchema(schema, exportPtr);
+        /// foreign_import_function(exportPtr);
+        /// CArrowSchema.Free(exportPtr);
+        /// </code>
+        /// </example>
+        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
+        public static unsafe void ExportSchema(Schema schema, CArrowSchema* out_schema)
+        {
+            if (schema == null)
+            {
+                throw new ArgumentNullException(nameof(schema));
+            }
+
+            var structType = new StructType(schema.FieldsList);
+            // TODO: top-level metadata
+            ExportType(structType, out_schema);
+        }
+
+        // Maps a time unit to the single-character code used inside format strings.
+        private static char FormatTimeUnit(TimeUnit unit) => unit switch
+        {
+            TimeUnit.Second => 's',
+            TimeUnit.Millisecond => 'm',
+            TimeUnit.Microsecond => 'u',
+            TimeUnit.Nanosecond => 'n',
+            _ => throw new InvalidDataException($"Unsupported time unit for export: {unit}"),
+        };
+
+        // Builds the C Data Interface format string for a type.
+        // Throws NotImplementedException for types that have no case below.
+        private static string GetFormat(IArrowType datatype)
+        {
+            switch (datatype)
+            {
+                case NullType _: return "n";
+                case BooleanType _: return "b";
+                // Integers
+                case Int8Type _: return "c";
+                case UInt8Type _: return "C";
+                case Int16Type _: return "s";
+                case UInt16Type _: return "S";
+                case Int32Type _: return "i";
+                case UInt32Type _: return "I";
+                case Int64Type _: return "l";
+                case UInt64Type _: return "L";
+                // Floats
+                case HalfFloatType _: return "e";
+                case FloatType _: return "f";
+                case DoubleType _: return "g";
+                // Decimal
+                case Decimal128Type decimalType:
+                    return $"d:{decimalType.Precision},{decimalType.Scale}";
+                case Decimal256Type decimalType:
+                    return $"d:{decimalType.Precision},{decimalType.Scale},256";
+                // Binary
+                case BinaryType _: return "z";
+                case StringType _: return "u";
+                case FixedSizeBinaryType binaryType:
+                    return $"w:{binaryType.ByteWidth}";
+                // Date
+                case Date32Type _: return "tdD";
+                case Date64Type _: return "tdm";
+                // Time
+                case Time32Type timeType:
+                    return $"tt{FormatTimeUnit(timeType.Unit)}";
+                case Time64Type timeType:
+                    // Same prefix as Time32, but allowed time units are different.
+                    return $"tt{FormatTimeUnit(timeType.Unit)}";
+                // Timestamp
+                case TimestampType timestampType:
+                    return $"ts{FormatTimeUnit(timestampType.Unit)}:{timestampType.Timezone}";
+                // Nested
+                case ListType _: return "+l";
+                case StructType _: return "+s";
+                // Dictionary: the format string describes the index type; the value
+                // type is exported separately through the dictionary member.
+                case DictionaryType dictionaryType:
+                    return GetFormat(dictionaryType.IndexType);
+                default: throw new NotImplementedException($"Exporting {datatype.Name} not implemented");
+            }
+        }
+
+        // Computes the flags bitfield for a type, optionally marking it nullable.
+        private static long GetFlags(IArrowType datatype, bool nullable = true)
+        {
+            long flags = 0;
+
+            if (nullable)
+            {
+                flags |= CArrowSchema.ArrowFlagNullable;
+            }
+
+            if (datatype is DictionaryType dictionaryType)
+            {
+                if (dictionaryType.Ordered)
+                {
+                    flags |= CArrowSchema.ArrowFlagDictionaryOrdered;
+                }
+            }
+
+            if (datatype.TypeId == ArrowTypeId.Map)
+            {
+                // TODO: when we implement MapType, make sure to set the KEYS_SORTED flag.
+                throw new NotSupportedException("Exporting MapTypes is not supported.");
+            }
+
+            return flags;
+        }
+
+        // Exports each field of a nested type into a freshly allocated child struct and
+        // returns the unmanaged array of child pointers (null for non-nested types).
+        private static unsafe CArrowSchema** ConstructChildren(IArrowType datatype, out long numChildren)
+        {
+            if (!(datatype is NestedType nestedType))
+            {
+                numChildren = 0;
+                return null;
+            }
+
+            IReadOnlyList<Field> fields = nestedType.Fields;
+            int numFields = fields.Count;
+            numChildren = numFields;
+            if (numFields == 0)
+            {
+                throw new NotSupportedException("Exporting nested data types with zero children.");
+            }
+
+            var pointerList = (CArrowSchema**)Marshal.AllocHGlobal(numFields * IntPtr.Size);
+            int created = 0;
+
+            try
+            {
+                for (int i = 0; i < numFields; i++)
+                {
+                    CArrowSchema* cSchema = CArrowSchema.Create();
+                    pointerList[i] = cSchema;
+                    created = i + 1;
+                    ExportField(fields[i], cSchema);
+                }
+            }
+            catch
+            {
+                // A child failed to export: free the children created so far
+                // (Free also runs release on those that were fully exported).
+                for (int i = 0; i < created; i++)
+                {
+                    CArrowSchema.Free(pointerList[i]);
+                }
+                Marshal.FreeHGlobal((IntPtr)pointerList);
+                throw;
+            }
+
+            return pointerList;
+        }
+
+        // Exports the value type of a dictionary type into a new struct
+        // (returns null for non-dictionary types).
+        private static unsafe CArrowSchema* ConstructDictionary(IArrowType datatype)
+        {
+            if (!(datatype is DictionaryType dictType))
+            {
+                return null;
+            }
+
+            CArrowSchema* cSchema = CArrowSchema.Create();
+            try
+            {
+                ExportType(dictType.ValueType, cSchema);
+            }
+            catch
+            {
+                CArrowSchema.Free(cSchema);
+                throw;
+            }
+            return cSchema;
+        }
+
+        private unsafe delegate void ReleaseArrowSchema(CArrowSchema* schema);
+
+        // The delegate behind the exported release function pointer is held in a static
+        // field so the garbage collector cannot reclaim it while native code may still
+        // call the pointer.
+        private static unsafe readonly ReleaseArrowSchema s_releaseSchema = ReleaseCArrowSchema;
+
+        // Release callback installed on every exported struct: frees the strings, the
+        // children and the dictionary, then marks the struct released (release = null).
+        private static unsafe void ReleaseCArrowSchema(CArrowSchema* schema)
+        {
+            if (schema == null) return;
+            if (schema->release == null) return;
+
+            Marshal.FreeHGlobal((IntPtr)schema->format);
+            Marshal.FreeHGlobal((IntPtr)schema->name);
+            Marshal.FreeHGlobal((IntPtr)schema->metadata);
+            schema->format = null;
+            schema->name = null;
+            schema->metadata = null;
+
+            if (schema->n_children > 0)
+            {
+                for (long i = 0; i < schema->n_children; i++)
+                {
+                    CArrowSchema.Free(schema->GetChild(i));
+                }
+                Marshal.FreeHGlobal((IntPtr)schema->children);
+            }
+
+            if (schema->dictionary != null)
+            {
+                CArrowSchema.Free(schema->dictionary);
+            }
+
+            schema->flags = 0;
+            schema->n_children = 0;
+            schema->dictionary = null;
+            schema->children = null;
+            schema->release = null;
+        }
+    }
+}
\ No newline at end of file
diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs
new file mode 100644
index 0000000000000..8e0b5e21b2383
--- /dev/null
+++ b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs
@@ -0,0 +1,301 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow.C
+{
+    public static class CArrowSchemaImporter
+    {
+        /// <summary>
+        /// Import C pointer as an <see cref="ArrowType"/>.
+        /// </summary>
+        /// <remarks>
+        /// This will call the release callback on the passed struct, even if
+        /// this function fails.
+        /// </remarks>
+        /// <example>
+        /// Typically, you will allocate an uninitialized CArrowSchema pointer,
+        /// pass that to external function, and then use this method to import
+        /// the result.
+        ///
+        /// <code>
+        /// CArrowSchema* importedPtr = CArrowSchema.Create();
+        /// foreign_export_function(importedPtr);
+        /// ArrowType importedType = CArrowSchemaImporter.ImportType(importedPtr);
+        /// CArrowSchema.Free(importedPtr);
+        /// </code>
+        /// </example>
+        public static unsafe ArrowType ImportType(CArrowSchema* ptr)
+        {
+            using var importedType = new ImportedArrowSchema(ptr);
+            return importedType.GetAsType();
+        }
+
+        /// <summary>
+        /// Import C pointer as a <see cref="Field"/>.
+        /// </summary>
+        /// <remarks>
+        /// This will call the release callback on the passed struct, even if
+        /// this function fails.
+        /// </remarks>
+        /// <example>
+        /// Typically, you will allocate an uninitialized CArrowSchema pointer,
+        /// pass that to external function, and then use this method to import
+        /// the result.
+        ///
+        /// <code>
+        /// CArrowSchema* importedPtr = CArrowSchema.Create();
+        /// foreign_export_function(importedPtr);
+        /// Field importedField = CArrowSchemaImporter.ImportField(importedPtr);
+        /// CArrowSchema.Free(importedPtr);
+        /// </code>
+        /// </example>
+        public static unsafe Field ImportField(CArrowSchema* ptr)
+        {
+            using var importedField = new ImportedArrowSchema(ptr);
+            return importedField.GetAsField();
+        }
+
+        /// <summary>
+        /// Import C pointer as a <see cref="Schema"/>.
+        /// </summary>
+        /// <remarks>
+        /// This will call the release callback on the passed struct, even if
+        /// this function fails.
+        /// </remarks>
+        /// <example>
+        /// Typically, you will allocate an uninitialized CArrowSchema pointer,
+        /// pass that to external function, and then use this method to import
+        /// the result.
+        ///
+        /// <code>
+        /// CArrowSchema* importedPtr = CArrowSchema.Create();
+        /// foreign_export_function(importedPtr);
+        /// Schema importedSchema = CArrowSchemaImporter.ImportSchema(importedPtr);
+        /// CArrowSchema.Free(importedPtr);
+        /// </code>
+        /// </example>
+        public static unsafe Schema ImportSchema(CArrowSchema* ptr)
+        {
+            using var importedSchema = new ImportedArrowSchema(ptr);
+            return importedSchema.GetAsSchema();
+        }
+
+        // Wraps a CArrowSchema pointer for the duration of an import. Disposing a
+        // root wrapper invokes the struct's release callback; child wrappers never do.
+        private sealed unsafe class ImportedArrowSchema : IDisposable
+        {
+            private readonly CArrowSchema* _cSchema;
+            private readonly bool _isRoot;
+
+            public ImportedArrowSchema(CArrowSchema* cSchema)
+            {
+                if (cSchema == null)
+                {
+                    throw new ArgumentException("Passed null pointer for cSchema.");
+                }
+                _cSchema = cSchema;
+                if (_cSchema->release == null)
+                {
+                    throw new ArgumentException("Tried to import a schema that has already been released.");
+                }
+                _isRoot = true;
+            }
+
+            public ImportedArrowSchema(CArrowSchema* handle, bool isRoot) : this(handle)
+            {
+                _isRoot = isRoot;
+            }
+
+            public void Dispose()
+            {
+                // We only call release on a root-level schema, not child ones.
+                if (_isRoot && _cSchema->release != null)
+                {
+                    _cSchema->release(_cSchema);
+                }
+            }
+
+            /// <summary>
+            /// Convert the wrapped struct to an <see cref="ArrowType"/> based on its
+            /// format string, children and dictionary.
+            /// </summary>
+            /// <exception cref="InvalidDataException">The struct is malformed.</exception>
+            /// <exception cref="NotSupportedException">The format string is not handled by this importer.</exception>
+            public ArrowType GetAsType()
+            {
+                string format = StringUtil.PtrToStringUtf8(_cSchema->format);
+                if (format == null)
+                {
+                    throw new InvalidDataException("Imported schema has a null format string.");
+                }
+
+                if (_cSchema->dictionary != null)
+                {
+                    return ImportDictionaryType(format);
+                }
+
+                // Special handling for nested types
+                if (format == "+l")
+                {
+                    return ImportListType();
+                }
+                if (format == "+s")
+                {
+                    return ImportStructType();
+                }
+                // TODO: Map type and large list type
+
+                // Decimals
+                if (format.StartsWith("d:", StringComparison.Ordinal))
+                {
+                    return ParseDecimalType(format);
+                }
+
+                // Timestamps
+                if (format.StartsWith("ts", StringComparison.Ordinal))
+                {
+                    return ParseTimestampType(format);
+                }
+
+                // Fixed-width binary
+                if (format.StartsWith("w:", StringComparison.Ordinal))
+                {
+                    if (!int.TryParse(format.Substring(2), NumberStyles.Integer, CultureInfo.InvariantCulture, out int width))
+                    {
+                        throw new InvalidDataException($"Malformed fixed-size binary format string: {format}");
+                    }
+                    return new FixedSizeBinaryType(width);
+                }
+
+                return format switch
+                {
+                    // Primitives
+                    "n" => NullType.Default,
+                    "b" => BooleanType.Default,
+                    "c" => Int8Type.Default,
+                    "C" => UInt8Type.Default,
+                    "s" => Int16Type.Default,
+                    "S" => UInt16Type.Default,
+                    "i" => Int32Type.Default,
+                    "I" => UInt32Type.Default,
+                    "l" => Int64Type.Default,
+                    "L" => UInt64Type.Default,
+                    "e" => HalfFloatType.Default,
+                    "f" => FloatType.Default,
+                    "g" => DoubleType.Default,
+                    // Binary data
+                    "z" => BinaryType.Default,
+                    //"Z" => new LargeBinaryType() // Not yet implemented
+                    "u" => StringType.Default,
+                    //"U" => new LargeStringType(), // Not yet implemented
+                    // Date and time
+                    "tdD" => Date32Type.Default,
+                    "tdm" => Date64Type.Default,
+                    "tts" => new Time32Type(TimeUnit.Second),
+                    "ttm" => new Time32Type(TimeUnit.Millisecond),
+                    "ttu" => new Time64Type(TimeUnit.Microsecond),
+                    "ttn" => new Time64Type(TimeUnit.Nanosecond),
+                    // TODO: duration not yet implemented
+                    "tiM" => new IntervalType(IntervalUnit.YearMonth),
+                    "tiD" => new IntervalType(IntervalUnit.DayTime),
+                    //"tin" => new IntervalType(IntervalUnit.MonthDayNanosecond), // Not yet implemented
+                    _ => throw new NotSupportedException("Data type is not yet supported in import.")
+                };
+            }
+
+            // Dictionary-encoded type: the format string names the index type and the
+            // dictionary member carries the value type.
+            private ArrowType ImportDictionaryType(string format)
+            {
+                ArrowType indicesType = format switch
+                {
+                    "c" => Int8Type.Default,
+                    "C" => UInt8Type.Default,
+                    "s" => Int16Type.Default,
+                    "S" => UInt16Type.Default,
+                    "i" => Int32Type.Default,
+                    "I" => UInt32Type.Default,
+                    "l" => Int64Type.Default,
+                    "L" => UInt64Type.Default,
+                    _ => throw new InvalidDataException($"Indices must be an integer, but got format string {format}"),
+                };
+
+                var dictionarySchema = new ImportedArrowSchema(_cSchema->dictionary, isRoot: false);
+                ArrowType dictionaryType = dictionarySchema.GetAsType();
+
+                bool ordered = _cSchema->GetFlag(CArrowSchema.ArrowFlagDictionaryOrdered);
+
+                return new DictionaryType(indicesType, dictionaryType, ordered);
+            }
+
+            // List type ("+l"): exactly one non-null child holding the element field.
+            private ArrowType ImportListType()
+            {
+                if (_cSchema->n_children != 1)
+                {
+                    throw new InvalidDataException("Expected list type to have exactly one child.");
+                }
+
+                CArrowSchema* child = _cSchema->GetChild(0);
+                if (child == null)
+                {
+                    throw new InvalidDataException("Expected list type child to be non-null.");
+                }
+
+                Field childField = new ImportedArrowSchema(child, isRoot: false).GetAsField();
+
+                return new ListType(childField);
+            }
+
+            // Struct type ("+s"): one field per child, in order.
+            private ArrowType ImportStructType()
+            {
+                long childCount = _cSchema->n_children;
+                if (childCount < 0 || childCount > int.MaxValue)
+                {
+                    throw new InvalidDataException($"Invalid number of struct children: {childCount}");
+                }
+
+                var childFields = new List<Field>((int)childCount);
+                for (long i = 0; i < childCount; i++)
+                {
+                    CArrowSchema* child = _cSchema->GetChild(i);
+                    if (child == null)
+                    {
+                        throw new InvalidDataException("Expected struct type child to be non-null.");
+                    }
+                    childFields.Add(new ImportedArrowSchema(child, isRoot: false).GetAsField());
+                }
+
+                return new StructType(childFields);
+            }
+
+            // Parses "d:precision,scale" or "d:precision,scale,bitWidth".
+            // A missing bit width means 128; only 128 and 256 are accepted.
+            private static ArrowType ParseDecimalType(string format)
+            {
+                string[] parameters = format.Substring(2).Split(',');
+                int bitWidth = 128;
+
+                bool wellFormed =
+                    (parameters.Length == 2 || parameters.Length == 3)
+                    && int.TryParse(parameters[0], NumberStyles.Integer, CultureInfo.InvariantCulture, out int precision)
+                    & int.TryParse(parameters[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out int scale)
+                    & (parameters.Length == 2
+                        || int.TryParse(parameters[2], NumberStyles.Integer, CultureInfo.InvariantCulture, out bitWidth));
+
+                if (!wellFormed)
+                {
+                    throw new InvalidDataException($"Malformed decimal format string: {format}");
+                }
+
+                if (bitWidth == 256)
+                {
+                    return new Decimal256Type(ParsePart(parameters[0]), ParsePart(parameters[1]));
+                }
+                if (bitWidth == 128)
+                {
+                    return new Decimal128Type(ParsePart(parameters[0]), ParsePart(parameters[1]));
+                }
+
+                throw new InvalidDataException($"Unsupported decimal bit width in format string: {format}");
+            }
+
+            // Culture-invariant integer parse of an already validated format component.
+            private static int ParsePart(string part)
+            {
+                return int.Parse(part, NumberStyles.Integer, CultureInfo.InvariantCulture);
+            }
+
+            // Parses "ts" + unit + ":" + timezone. Everything after the first colon is
+            // the timezone; it may be empty and may itself contain colons ("+05:30").
+            private static ArrowType ParseTimestampType(string format)
+            {
+                if (format.Length < 4 || format[3] != ':')
+                {
+                    throw new InvalidDataException($"Malformed timestamp format string: {format}");
+                }
+
+                TimeUnit timeUnit = format[2] switch
+                {
+                    's' => TimeUnit.Second,
+                    'm' => TimeUnit.Millisecond,
+                    'u' => TimeUnit.Microsecond,
+                    'n' => TimeUnit.Nanosecond,
+                    _ => throw new InvalidDataException($"Unsupported time unit for import: {format[2]}"),
+                };
+
+                string timezone = format.Substring(4);
+                return new TimestampType(timeUnit, timezone);
+            }
+
+            /// <summary>
+            /// Convert the wrapped struct to a <see cref="Field"/>, using its name
+            /// (empty when absent) and its nullable flag.
+            /// </summary>
+            public Field GetAsField()
+            {
+                string name = StringUtil.PtrToStringUtf8(_cSchema->name);
+                string fieldName = string.IsNullOrEmpty(name) ? "" : name;
+
+                bool nullable = _cSchema->GetFlag(CArrowSchema.ArrowFlagNullable);
+
+                return new Field(fieldName, GetAsType(), nullable);
+            }
+
+            /// <summary>
+            /// Convert the wrapped struct to a <see cref="Schema"/>; the imported type
+            /// must be a struct type, whose fields become the schema's fields.
+            /// </summary>
+            /// <exception cref="ArgumentException">The imported type is not a struct type.</exception>
+            public Schema GetAsSchema()
+            {
+                ArrowType fullType = GetAsType();
+                if (fullType is StructType structType)
+                {
+                    return new Schema(structType.Fields, default);
+                }
+                else
+                {
+                    throw new ArgumentException("Imported type is not a struct type, so it cannot be converted to a schema.");
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/csharp/src/Apache.Arrow/C/StringUtil.cs b/csharp/src/Apache.Arrow/C/StringUtil.cs
new file mode 100644
index 0000000000000..9c16493fbb312
--- /dev/null
+++ b/csharp/src/Apache.Arrow/C/StringUtil.cs
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Runtime.InteropServices;
+
+// Adapted from:
+// https://github.com/G-Research/ParquetSharp/blob/467d99298fb5a5b9d5935b9c8dbde95e63954dd3/csharp/StringUtil.cs
+
+namespace Apache.Arrow.C
+{
+
+    internal static class StringUtil
+    {
+        /// <summary>
+        /// Copy a managed string into a newly allocated, null-terminated UTF-8 buffer.
+        /// </summary>
+        /// <remarks>
+        /// The buffer is allocated with <see cref="Marshal.AllocHGlobal(int)"/> and must be
+        /// released by the caller with <see cref="Marshal.FreeHGlobal(IntPtr)"/>.
+        /// </remarks>
+        /// <param name="str">The string to encode; must not be null.</param>
+        /// <returns>Pointer to the encoded bytes followed by a single zero byte.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
+        public static unsafe byte* ToCStringUtf8(string str)
+        {
+            if (str == null)
+            {
+                throw new ArgumentNullException(nameof(str));
+            }
+
+            var utf8 = System.Text.Encoding.UTF8;
+            int byteCount = utf8.GetByteCount(str);
+            byte* byteArray = (byte*)Marshal.AllocHGlobal(byteCount + 1);
+
+            fixed (char* chars = str)
+            {
+                utf8.GetBytes(chars, str.Length, byteArray, byteCount);
+            }
+
+            // Need to make sure it is null-terminated.
+            byteArray[byteCount] = 0;
+
+            return byteArray;
+        }
+
+        /// <summary>
+        /// Decode a null-terminated UTF-8 buffer into a managed string.
+        /// </summary>
+        /// <param name="ptr">Pointer to the first byte, or null.</param>
+        /// <returns>The decoded string, or null when <paramref name="ptr"/> is null.</returns>
+        public static unsafe string PtrToStringUtf8(byte* ptr)
+        {
+#if NETSTANDARD2_1_OR_GREATER
+            // Marshal.PtrToStringUTF8 takes an IntPtr; a byte* needs an explicit conversion.
+            return Marshal.PtrToStringUTF8((IntPtr)ptr);
+#else
+            if (ptr == null)
+            {
+                return null;
+            }
+
+            // Scan for the terminating zero byte to find the encoded length.
+            int length;
+            for (length = 0; ptr[length] != '\0'; ++length)
+            {
+            }
+
+            return System.Text.Encoding.UTF8.GetString(ptr, length);
+#endif
+        }
+    }
+}
diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj
index a34eaca7a6c5c..cdbfe479470a4 100644
--- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj
+++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj
@@ -13,6 +13,8 @@
 all runtime; build; native; contentfiles; analyzers
+
+
diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
new file mode 100644
index 0000000000000..82e0f37dd7278
--- /dev/null
+++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
@@ -0,0 +1,329 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.Linq; +using Apache.Arrow.C; +using Apache.Arrow.Types; +using Python.Runtime; +using Xunit; + +namespace Apache.Arrow.Tests +{ + public class CDataSchemaPythonTest + { + public CDataSchemaPythonTest() + { + bool inCIJob = Environment.GetEnvironmentVariable("GITHUB_ACTIONS") == "true"; + bool pythonSet = Environment.GetEnvironmentVariable("PYTHONNET_PYDLL") != null; + // We only skip if this is not in CI + Skip.If(!pythonSet && !inCIJob, "PYTHONNET_PYDLL not set; skipping C Data Interface tests."); + + PythonEngine.Initialize(); + } + + private static Schema GetTestSchema() + { + using (Py.GIL()) + { + var schema = new Schema.Builder() + .Field(f => f.Name("null").DataType(NullType.Default).Nullable(true)) + .Field(f => f.Name("bool").DataType(BooleanType.Default).Nullable(true)) + .Field(f => f.Name("i8").DataType(Int8Type.Default).Nullable(true)) + .Field(f => f.Name("u8").DataType(UInt8Type.Default).Nullable(true)) + .Field(f => f.Name("i16").DataType(Int16Type.Default).Nullable(true)) + .Field(f => f.Name("u16").DataType(UInt16Type.Default).Nullable(true)) + .Field(f => f.Name("i32").DataType(Int32Type.Default).Nullable(true)) + .Field(f => f.Name("u32").DataType(UInt32Type.Default).Nullable(true)) + .Field(f => f.Name("i64").DataType(Int64Type.Default).Nullable(true)) + .Field(f => 
f.Name("u64").DataType(UInt64Type.Default).Nullable(true)) + + .Field(f => f.Name("f16").DataType(HalfFloatType.Default).Nullable(true)) + .Field(f => f.Name("f32").DataType(FloatType.Default).Nullable(true)) + .Field(f => f.Name("f64").DataType(DoubleType.Default).Nullable(true)) + + .Field(f => f.Name("decimal128_19_3").DataType(new Decimal128Type(19, 3)).Nullable(true)) + .Field(f => f.Name("decimal256_19_3").DataType(new Decimal256Type(19, 3)).Nullable(true)) + .Field(f => f.Name("decimal256_40_2").DataType(new Decimal256Type(40, 2)).Nullable(false)) + + .Field(f => f.Name("binary").DataType(BinaryType.Default).Nullable(false)) + .Field(f => f.Name("string").DataType(StringType.Default).Nullable(false)) + .Field(f => f.Name("fw_binary_10").DataType(new FixedSizeBinaryType(10)).Nullable(false)) + + .Field(f => f.Name("date32").DataType(Date32Type.Default).Nullable(false)) + .Field(f => f.Name("date64").DataType(Date64Type.Default).Nullable(false)) + .Field(f => f.Name("time32_s").DataType(new Time32Type(TimeUnit.Second)).Nullable(false)) + .Field(f => f.Name("time32_ms").DataType(new Time32Type(TimeUnit.Millisecond)).Nullable(false)) + .Field(f => f.Name("time64_us").DataType(new Time64Type(TimeUnit.Microsecond)).Nullable(false)) + .Field(f => f.Name("time64_ns").DataType(new Time64Type(TimeUnit.Nanosecond)).Nullable(false)) + + .Field(f => f.Name("timestamp_ns").DataType(new TimestampType(TimeUnit.Nanosecond, "")).Nullable(false)) + .Field(f => f.Name("timestamp_us").DataType(new TimestampType(TimeUnit.Microsecond, "")).Nullable(false)) + .Field(f => f.Name("timestamp_us_paris").DataType(new TimestampType(TimeUnit.Microsecond, "Europe/Paris")).Nullable(true)) + + .Field(f => f.Name("list_string").DataType(new ListType(StringType.Default)).Nullable(false)) + .Field(f => f.Name("list_list_i32").DataType(new ListType(new ListType(Int32Type.Default))).Nullable(false)) + + .Field(f => f.Name("dict_string").DataType(new DictionaryType(Int32Type.Default, 
StringType.Default, false)).Nullable(false))
+                    .Field(f => f.Name("dict_string_ordered").DataType(new DictionaryType(Int32Type.Default, StringType.Default, true)).Nullable(false))
+                    .Field(f => f.Name("list_dict_string").DataType(new ListType(new DictionaryType(Int32Type.Default, StringType.Default, false))).Nullable(false))
+
+                    // Checking wider characters.
+                    .Field(f => f.Name("hello 你好 😄").DataType(BooleanType.Default).Nullable(true))
+
+                    .Build();
+                return schema;
+            }
+        }
+
+        // Yields the pyarrow fields that the tests below pair positionally
+        // (via Zip) with the fields of GetTestSchema(), so the order here
+        // matters.
+        //
+        // This is a lazy iterator: the `using (Py.GIL())` scope is entered on
+        // the first MoveNext() and stays open until the iterator completes or
+        // is disposed.
+        //
+        // NOTE(review): the element type was missing from the copy under
+        // review; `dynamic` is inferred from the `(Field field, dynamic pyField)`
+        // deconstruction used by the callers.
+        private static IEnumerable<dynamic> GetPythonFields()
+        {
+            using (Py.GIL())
+            {
+                dynamic pa = Py.Import("pyarrow");
+                // `null` is fetched with GetAttr rather than as a dynamic
+                // member, unlike every other factory below.
+                yield return pa.field("null", pa.GetAttr("null").Invoke(), true);
+                yield return pa.field("bool", pa.bool_(), true);
+                yield return pa.field("i8", pa.int8(), true);
+                yield return pa.field("u8", pa.uint8(), true);
+                yield return pa.field("i16", pa.int16(), true);
+                yield return pa.field("u16", pa.uint16(), true);
+                yield return pa.field("i32", pa.int32(), true);
+                yield return pa.field("u32", pa.uint32(), true);
+                yield return pa.field("i64", pa.int64(), true);
+                yield return pa.field("u64", pa.uint64(), true);
+
+                yield return pa.field("f16", pa.float16(), true);
+                yield return pa.field("f32", pa.float32(), true);
+                yield return pa.field("f64", pa.float64(), true);
+
+                yield return pa.field("decimal128_19_3", pa.decimal128(19, 3), true);
+                yield return pa.field("decimal256_19_3", pa.decimal256(19, 3), true);
+                yield return pa.field("decimal256_40_2", pa.decimal256(40, 2), false);
+
+                yield return pa.field("binary", pa.binary(), false);
+                yield return pa.field("string", pa.utf8(), false);
+                yield return pa.field("fw_binary_10", pa.binary(10), false);
+
+                yield return pa.field("date32", pa.date32(), false);
+                yield return pa.field("date64", pa.date64(), false);
+                yield return pa.field("time32_s", pa.time32("s"), false);
+                yield return pa.field("time32_ms", pa.time32("ms"), false);
+                yield return pa.field("time64_us", pa.time64("us"), false);
+                yield return pa.field("time64_ns", pa.time64("ns"), false);
+
+                yield return pa.field("timestamp_ns", pa.timestamp("ns"), false);
+                yield return pa.field("timestamp_us", pa.timestamp("us"), false);
+                yield return pa.field("timestamp_us_paris", pa.timestamp("us", "Europe/Paris"), true);
+
+                yield return pa.field("list_string", pa.list_(pa.utf8()), false);
+                yield return pa.field("list_list_i32", pa.list_(pa.list_(pa.int32())), false);
+
+                yield return pa.field("dict_string", pa.dictionary(pa.int32(), pa.utf8(), false), false);
+                yield return pa.field("dict_string_ordered", pa.dictionary(pa.int32(), pa.utf8(), true), false);
+                yield return pa.field("list_dict_string", pa.list_(pa.dictionary(pa.int32(), pa.utf8(), false)), false);
+
+                yield return pa.field("hello 你好 😄", pa.bool_(), true);
+            }
+        }
+
+        // Builds a pyarrow schema from all of the fields produced by
+        // GetPythonFields(), in the same order.
+        private static dynamic GetPythonSchema()
+        {
+            using (Py.GIL())
+            {
+                dynamic pa = Py.Import("pyarrow");
+                return pa.schema(GetPythonFields().ToList());
+            }
+        }
+
+        // Schemas created in Python, used in CSharp
+        //
+        // For each field pair: pyarrow exports the field's data type into a
+        // freshly allocated CArrowSchema, C# imports it, and the result is
+        // compared with the data type of the matching C# field.
+        [SkippableFact]
+        public unsafe void ImportType()
+        {
+            Schema schema = GetTestSchema();
+            IEnumerable<dynamic> pyFields = GetPythonFields();
+
+            foreach ((Field field, dynamic pyField) in schema.FieldsList
+                .Zip(pyFields))
+            {
+                CArrowSchema* cSchema = CArrowSchema.Create();
+                try
+                {
+                    using (Py.GIL())
+                    {
+                        dynamic pyDatatype = pyField.type;
+                        // Python expects the pointer as an integer
+                        long longPtr = ((IntPtr)cSchema).ToInt64();
+                        pyDatatype._export_to_c(longPtr);
+                    }
+
+                    var dataTypeComparer = new ArrayTypeComparer(field.DataType);
+                    ArrowType importedType = CArrowSchemaImporter.ImportType(cSchema);
+                    dataTypeComparer.Visit(importedType);
+
+                    // The `Ordered` flag of dictionary types is checked
+                    // explicitly, in addition to the comparer visit above.
+                    if (importedType is DictionaryType importedDictType)
+                    {
+                        Assert.Equal(((DictionaryType)field.DataType).Ordered, importedDictType.Ordered);
+                    }
+                }
+                finally
+                {
+                    // Since we allocated, we are responsible for freeing the pointer,
+                    // including when an assertion or the Python call above throws.
+                    CArrowSchema.Free(cSchema);
+                }
+            }
+        }
+
+        // For each field pair: pyarrow exports the whole field, C# imports it
+        // as a Field, and FieldComparer checks it against the C# field.
+        [SkippableFact]
+        public unsafe void ImportField()
+        {
+            Schema schema = GetTestSchema();
+            IEnumerable<dynamic> pyFields = GetPythonFields();
+
+            foreach ((Field field, dynamic pyField) in schema.FieldsList
+                .Zip(pyFields))
+            {
+                CArrowSchema* cSchema = CArrowSchema.Create();
+                try
+                {
+                    using (Py.GIL())
+                    {
+                        // Python expects the pointer as an integer
+                        long longPtr = ((IntPtr)cSchema).ToInt64();
+                        pyField._export_to_c(longPtr);
+                    }
+
+                    Field importedField = CArrowSchemaImporter.ImportField(cSchema);
+                    FieldComparer.Compare(field, importedField);
+                }
+                finally
+                {
+                    // Since we allocated, we are responsible for freeing the pointer,
+                    // including when an assertion or the Python call above throws.
+                    CArrowSchema.Free(cSchema);
+                }
+            }
+        }
+
+        // pyarrow exports the complete schema, C# imports it, and
+        // SchemaComparer checks it against the C# test schema.
+        [SkippableFact]
+        public unsafe void ImportSchema()
+        {
+            Schema schema = GetTestSchema();
+            dynamic pySchema = GetPythonSchema();
+
+            CArrowSchema* cSchema = CArrowSchema.Create();
+            try
+            {
+                using (Py.GIL())
+                {
+                    // Python expects the pointer as an integer
+                    long longPtr = ((IntPtr)cSchema).ToInt64();
+                    pySchema._export_to_c(longPtr);
+                }
+
+                Schema importedSchema = CArrowSchemaImporter.ImportSchema(cSchema);
+                SchemaComparer.Compare(schema, importedSchema);
+            }
+            finally
+            {
+                // Since we allocated, we are responsible for freeing the pointer,
+                // including when an assertion or the Python call above throws.
+                CArrowSchema.Free(cSchema);
+            }
+        }
+
+
+        // Schemas created in CSharp, exported to Python
+        //
+        // For each field pair: C# exports the field's data type, pyarrow
+        // imports it, and the imported type is compared with the expected
+        // pyarrow type. The struct is then checked to have been released.
+        [SkippableFact]
+        public unsafe void ExportType()
+        {
+            Schema schema = GetTestSchema();
+            IEnumerable<dynamic> pyFields = GetPythonFields();
+
+            foreach ((Field field, dynamic pyField) in schema.FieldsList
+                .Zip(pyFields))
+            {
+                IArrowType datatype = field.DataType;
+                CArrowSchema* cSchema = CArrowSchema.Create();
+                try
+                {
+                    CArrowSchemaExporter.ExportType(datatype, cSchema);
+
+                    // For Python, we need to provide the pointer
+                    long longPtr = ((IntPtr)cSchema).ToInt64();
+
+                    using (Py.GIL())
+                    {
+                        dynamic pa = Py.Import("pyarrow");
+                        dynamic expectedPyType = pyField.type;
+                        dynamic exportedPyType = pa.DataType._import_from_c(longPtr);
+                        Assert.True(exportedPyType == expectedPyType);
+
+                        if (pa.types.is_dictionary(exportedPyType))
+                        {
+                            Assert.Equal(expectedPyType.ordered, exportedPyType.ordered);
+                        }
+                    }
+
+                    // Python should have called release once `exportedPyType` went out-of-scope.
+                    Assert.True(cSchema->release == null);
+                    Assert.True(cSchema->format == null);
+                    Assert.Equal(0, cSchema->flags);
+                    Assert.Equal(0, cSchema->n_children);
+                    Assert.True(cSchema->dictionary == null);
+                }
+                finally
+                {
+                    // Since we allocated, we are responsible for freeing the pointer,
+                    // including when an assertion or the Python call above throws.
+                    CArrowSchema.Free(cSchema);
+                }
+            }
+        }
+
+        // For each field pair: C# exports the whole field, pyarrow imports it,
+        // and the imported field is compared with the expected pyarrow field.
+        [SkippableFact]
+        public unsafe void ExportField()
+        {
+            Schema schema = GetTestSchema();
+            IEnumerable<dynamic> pyFields = GetPythonFields();
+
+            foreach ((Field field, dynamic pyField) in schema.FieldsList
+                .Zip(pyFields))
+            {
+                CArrowSchema* cSchema = CArrowSchema.Create();
+                try
+                {
+                    CArrowSchemaExporter.ExportField(field, cSchema);
+
+                    // For Python, we need to provide the pointer
+                    long longPtr = ((IntPtr)cSchema).ToInt64();
+
+                    using (Py.GIL())
+                    {
+                        dynamic pa = Py.Import("pyarrow");
+                        dynamic exportedPyField = pa.Field._import_from_c(longPtr);
+                        Assert.True(exportedPyField == pyField);
+                    }
+
+                    // Python should have called release once `exportedPyField` went out-of-scope.
+                    Assert.True(cSchema->name == null);
+                    Assert.True(cSchema->release == null);
+                    Assert.True(cSchema->format == null);
+                }
+                finally
+                {
+                    // Since we allocated, we are responsible for freeing the pointer,
+                    // including when an assertion or the Python call above throws.
+                    CArrowSchema.Free(cSchema);
+                }
+            }
+        }
+
+        // C# exports the complete test schema, pyarrow imports it, and the
+        // imported schema is compared with the one built by GetPythonSchema().
+        [SkippableFact]
+        public unsafe void ExportSchema()
+        {
+            Schema schema = GetTestSchema();
+            dynamic pySchema = GetPythonSchema();
+
+            CArrowSchema* cSchema = CArrowSchema.Create();
+            try
+            {
+                CArrowSchemaExporter.ExportSchema(schema, cSchema);
+
+                // For Python, we need to provide the pointer
+                long longPtr = ((IntPtr)cSchema).ToInt64();
+
+                using (Py.GIL())
+                {
+                    dynamic pa = Py.Import("pyarrow");
+                    dynamic exportedPySchema = pa.Schema._import_from_c(longPtr);
+                    Assert.True(exportedPySchema == pySchema);
+                }
+            }
+            finally
+            {
+                // Since we allocated, we are responsible for freeing the pointer,
+                // including when an assertion or the Python call above throws.
+                CArrowSchema.Free(cSchema);
+            }
+        }
+    }
+}
diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs
new file mode 100644
index 0000000000000..357a18816cacc
--- /dev/null
+++ b/csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Runtime.InteropServices;
+using Apache.Arrow.C;
+using Apache.Arrow.Types;
+using Xunit;
+
+namespace Apache.Arrow.Tests
+{
+    // Tests for CArrowSchema and the schema exporter/importer that need no
+    // foreign runtime. Every test allocates its struct with
+    // CArrowSchema.Create() and frees it in a `finally` block, so a failing
+    // assertion does not leak the unmanaged allocation.
+    public class CDataSchemaTest
+    {
+        // A freshly created CArrowSchema must have every member null or zero.
+        [Fact]
+        public unsafe void InitializeZeroed()
+        {
+            CArrowSchema* cSchema = CArrowSchema.Create();
+            try
+            {
+                Assert.True(cSchema->format == null);
+                Assert.True(cSchema->name == null);
+                Assert.True(cSchema->metadata == null);
+                Assert.Equal(0, cSchema->flags);
+                Assert.Equal(0, cSchema->n_children);
+                Assert.True(cSchema->children == null);
+                Assert.True(cSchema->dictionary == null);
+                Assert.True(cSchema->release == null);
+                Assert.True(cSchema->private_data == null);
+            }
+            finally
+            {
+                CArrowSchema.Free(cSchema);
+            }
+        }
+
+        // Exporting must set the nullable flag from Field nullability and the
+        // dictionary-ordered flag from DictionaryType ordering.
+        [Fact]
+        public unsafe void FlagsSet()
+        {
+            // Non-nullable field
+            {
+                var nonNullField = new Field("non_null", Int32Type.Default, false);
+                CArrowSchema* cSchema = CArrowSchema.Create();
+                try
+                {
+                    CArrowSchemaExporter.ExportField(nonNullField, cSchema);
+                    Assert.False(cSchema->GetFlag(CArrowSchema.ArrowFlagNullable));
+                }
+                finally
+                {
+                    CArrowSchema.Free(cSchema);
+                }
+            }
+
+            // Nullable field
+            {
+                var nullableField = new Field("nullable", Int32Type.Default, true);
+                CArrowSchema* cSchema = CArrowSchema.Create();
+                try
+                {
+                    CArrowSchemaExporter.ExportField(nullableField, cSchema);
+                    Assert.True(cSchema->GetFlag(CArrowSchema.ArrowFlagNullable));
+                }
+                finally
+                {
+                    CArrowSchema.Free(cSchema);
+                }
+            }
+
+            // dictionary ordered
+            {
+                var orderedDictionary = new DictionaryType(Int32Type.Default, StringType.Default, true);
+                CArrowSchema* cSchema = CArrowSchema.Create();
+                try
+                {
+                    CArrowSchemaExporter.ExportType(orderedDictionary, cSchema);
+                    Assert.True(cSchema->GetFlag(CArrowSchema.ArrowFlagDictionaryOrdered));
+                }
+                finally
+                {
+                    CArrowSchema.Free(cSchema);
+                }
+            }
+
+            // dictionary unordered
+            {
+                var unorderedDictionary = new DictionaryType(Int32Type.Default, StringType.Default, false);
+                CArrowSchema* cSchema = CArrowSchema.Create();
+                try
+                {
+                    CArrowSchemaExporter.ExportType(unorderedDictionary, cSchema);
+                    Assert.False(cSchema->GetFlag(CArrowSchema.ArrowFlagDictionaryOrdered));
+                }
+                finally
+                {
+                    CArrowSchema.Free(cSchema);
+                }
+            }
+        }
+
+        // Importing a valid exported schema must leave `release` null on the
+        // source struct.
+        [Fact]
+        public unsafe void CallsReleaseForValid()
+        {
+            CArrowSchema* cSchema = CArrowSchema.Create();
+            try
+            {
+                CArrowSchemaExporter.ExportType(Int32Type.Default, cSchema);
+                Assert.False(cSchema->release == null);
+                CArrowSchemaImporter.ImportType(cSchema);
+                Assert.True(cSchema->release == null);
+            }
+            finally
+            {
+                CArrowSchema.Free(cSchema);
+            }
+        }
+
+        [Fact]
+        public unsafe void CallsReleaseForInvalid()
+        {
+            // Make sure we call release callback, even if the imported schema
+            // is invalid.
+            CArrowSchema* cSchema = CArrowSchema.Create();
+
+            bool wasCalled = false;
+            // The only member set on the struct is `release`; everything else
+            // is left as Create() produced it.
+            var releaseCallback = (CArrowSchema* released) =>
+            {
+                wasCalled = true;
+                released->release = null;
+            };
+            try
+            {
+                // NOTE(review): the type arguments of this cast and of the
+                // Assert.Throws call below were missing from the copy under
+                // review. The cast is reconstructed from the callback's
+                // signature. The exception type is a best reconstruction and
+                // should be confirmed against CArrowSchemaImporter.
+                cSchema->release = (delegate* unmanaged[Stdcall]<CArrowSchema*, void>)Marshal.GetFunctionPointerForDelegate(
+                    releaseCallback);
+
+                Assert.Throws<NullReferenceException>(() =>
+                {
+                    CArrowSchemaImporter.ImportType(cSchema);
+                });
+                Assert.True(wasCalled);
+            }
+            finally
+            {
+                CArrowSchema.Free(cSchema);
+                // The function pointer stored in the struct does not keep the
+                // delegate alive. Without this call the GC could collect
+                // `releaseCallback` after the GetFunctionPointerForDelegate
+                // call, leaving a dangling pointer. It is placed after Free
+                // so the delegate outlives every use of the struct.
+                GC.KeepAlive(releaseCallback);
+            }
+        }
+    }
+}