diff --git a/src/lib.rs b/src/lib.rs index 46eb2a1..829d8a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,8 +44,8 @@ mod date_time; mod decimal; mod error; - mod odbc_writer; +mod quirks; mod reader; mod schema; @@ -57,6 +57,7 @@ pub use odbc_api; pub use self::{ error::Error, odbc_writer::{insert_into_table, insert_statement_from_schema, OdbcWriter, WriterError}, + quirks::Quirks, reader::{ BufferAllocationOptions, ColumnFailure, ConcurrentOdbcReader, OdbcReader, OdbcReaderBuilder, }, diff --git a/src/quirks.rs b/src/quirks.rs new file mode 100644 index 0000000..21fc6ff --- /dev/null +++ b/src/quirks.rs @@ -0,0 +1,34 @@ +/// A (non exhaustive) description of the non ODBC API conformant behavior of ODBC drivers. +/// Workarounds which are intended to help application developers separate the decision of +/// how to deal with non conformity from the knowledge of which driver behaves weirdly in exactly which +/// way. +/// +/// For example it wants to avoid an if statement specifying "if the database is DB2 please use +/// terminating zeroes instead of indicators to determine string lengths" and separate this into +/// "IBM DB2 returns memory garbage for indicators" and another part of the application decides, +/// this is how I deal with memory garbage indicators. +#[non_exhaustive] +#[derive(Clone, PartialEq, Eq)] +pub struct Quirks { + /// For IBM DB2 it has been observed that the length indicators returned from memory are garbage for + /// strings. It seems to be preferable to rely on the terminating zero exclusively to determine + /// string length. This behavior seems to so far only manifest with variadic string fields. 
+ /// See: and also + /// + pub indicators_returned_from_bulk_fetch_are_memory_garbage: bool, +} + +impl Quirks { + /// A new instance describing an ODBC driver without quirks + pub fn new() -> Self { + Quirks { + indicators_returned_from_bulk_fetch_are_memory_garbage: false, + } + } +} + +impl Default for Quirks { + fn default() -> Self { + Self::new() + } +} diff --git a/src/reader.rs b/src/reader.rs index 5a0c05e..a462bee 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -27,8 +27,11 @@ mod to_record_batch; use self::{decimal::Decimal, map_odbc_to_arrow::MapOdbcToArrow}; -use crate::date_time::{ - days_since_epoch, ms_since_epoch, ns_since_epoch, seconds_since_epoch, us_since_epoch, +use crate::{ + date_time::{ + days_since_epoch, ms_since_epoch, ns_since_epoch, seconds_since_epoch, us_since_epoch, + }, + Quirks, }; pub use self::{ @@ -120,6 +123,7 @@ pub fn choose_column_strategy( query_metadata: &mut impl ResultSetMetadata, col_index: u16, buffer_allocation_options: BufferAllocationOptions, + quirks: &Quirks, ) -> Result, ColumnFailure> { let strat: Box = match field.data_type() { ArrowDataType::Boolean => { @@ -152,6 +156,7 @@ pub fn choose_column_strategy( sql_type, lazy_display_size, buffer_allocation_options.max_text_size, + quirks.indicators_returned_from_bulk_fetch_are_memory_garbage, )? } ArrowDataType::Decimal128(precision, scale @ 0..) 
=> { diff --git a/src/reader/odbc_reader.rs b/src/reader/odbc_reader.rs index 7b0640d..41c026f 100644 --- a/src/reader/odbc_reader.rs +++ b/src/reader/odbc_reader.rs @@ -7,7 +7,7 @@ use arrow::{ }; use odbc_api::{buffers::ColumnarAnyBuffer, BlockCursor, Cursor}; -use crate::{BufferAllocationOptions, ConcurrentOdbcReader, Error}; +use crate::{BufferAllocationOptions, ConcurrentOdbcReader, Error, Quirks}; use super::{odbc_batch_stream::OdbcBatchStream, to_record_batch::ToRecordBatch}; @@ -359,6 +359,7 @@ pub struct OdbcReaderBuilder { max_text_size: Option, max_binary_size: Option, fallibale_allocations: bool, + quirks: Quirks, } impl OdbcReaderBuilder { @@ -379,6 +380,7 @@ impl OdbcReaderBuilder { max_text_size: None, max_binary_size: None, fallibale_allocations: false, + quirks: Quirks::new(), } } @@ -458,6 +460,13 @@ impl OdbcReaderBuilder { self } + /// Shims are workarounds which can make arrow ODBC use different implementations in order to + /// compensate for ODBC drivers which violate the ODBC specification. + pub fn with_shims(&mut self, quirks: Quirks) -> &mut Self { + self.quirks = quirks; + self + } + /// No matter if the user explicitly specified a limit in row size, a memory limit, both or /// neither. In order to construct a reader we need to decide on the buffer size in rows. 
fn buffer_size_in_rows(&self, bytes_per_row: usize) -> Result { @@ -495,8 +504,12 @@ impl OdbcReaderBuilder { max_binary_size: self.max_binary_size, fallibale_allocations: self.fallibale_allocations, }; - let converter = - ToRecordBatch::new(&mut cursor, self.schema.clone(), buffer_allocation_options)?; + let converter = ToRecordBatch::new( + &mut cursor, + self.schema.clone(), + buffer_allocation_options, + &self.quirks, + )?; let bytes_per_row = converter.row_size_in_bytes(); let buffer_size_in_rows = self.buffer_size_in_rows(bytes_per_row)?; let row_set_buffer = diff --git a/src/reader/text.rs b/src/reader/text.rs index d044679..e54ac84 100644 --- a/src/reader/text.rs +++ b/src/reader/text.rs @@ -1,4 +1,4 @@ -use std::{char::decode_utf16, cmp::min, num::NonZeroUsize, sync::Arc}; +use std::{char::decode_utf16, cmp::min, ffi::CStr, num::NonZeroUsize, sync::Arc}; use arrow::array::{ArrayRef, StringBuilder}; use odbc_api::{ @@ -16,6 +16,7 @@ pub fn choose_text_strategy( sql_type: OdbcDataType, lazy_display_size: impl FnOnce() -> Result, odbc_api::Error>, max_text_size: Option, + assume_indicators_are_memory_garbage: bool, ) -> Result, ColumnFailure> { let apply_buffer_limit = |len| match (len, max_text_size) { (None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }), @@ -40,7 +41,10 @@ pub fn choose_text_strategy( .transpose() .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?; let octet_len = apply_buffer_limit(octet_len.map(NonZeroUsize::get))?; - narrow_text_strategy(octet_len) + // So far only Linux users seemed to have complained about panics due to garbage indices? 
+ // Linux usually would use UTF-8, so we only invest work in working around this for narrow + // strategies + narrow_text_strategy(octet_len, assume_indicators_are_memory_garbage) }; Ok(strategy) @@ -50,8 +54,15 @@ fn wide_text_strategy(u16_len: usize) -> Box { Box::new(WideText::new(u16_len)) } -fn narrow_text_strategy(octet_len: usize) -> Box { - Box::new(NarrowText::new(octet_len)) +fn narrow_text_strategy( + octet_len: usize, + assume_indicators_are_memory_garbage: bool, +) -> Box { + if assume_indicators_are_memory_garbage { + Box::new(NarrowUseTerminatingZero::new(octet_len)) + } else { + Box::new(NarrowText::new(octet_len)) + } } /// Strategy requesting the text from the database as UTF-16 (Wide characters) and emmitting it as @@ -125,9 +136,45 @@ impl ReadStrategy for NarrowText { for value in view.iter() { builder.append_option(value.map(|bytes| { std::str::from_utf8(bytes) - .expect("ODBC column had been expected to return valid utf8, but did not.") + .expect("ODBC driver had been expected to return valid utf8, but did not.") })); } Ok(Arc::new(builder.finish())) } } + +pub struct NarrowUseTerminatingZero { + /// Maximum string length in u8, excluding terminating zero + max_str_len: usize, +} + +impl NarrowUseTerminatingZero { + pub fn new(max_str_len: usize) -> Self { + Self { max_str_len } + } +} + +impl ReadStrategy for NarrowUseTerminatingZero { + fn buffer_desc(&self) -> BufferDesc { + BufferDesc::Text { + max_str_len: self.max_str_len, + } + } + + fn fill_arrow_array(&self, column_view: AnySlice) -> Result { + let view = column_view.as_text_view().unwrap(); + let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len()); + // We can not use view.iter() since its implementation relies on the indicator buffer being + // correct. This read strategy is a workaround for the indicators being incorrect, though. 
+ for bytes in view.raw_value_buffer().chunks_exact(self.max_str_len + 1) { + let c_str = CStr::from_bytes_until_nul(bytes) + .expect("ODBC driver must return strings terminated by zero"); + let str = c_str + .to_str() + .expect("ODBC driver had been expected to return valid utf8, but did not."); + let option = (!str.is_empty()).then_some(str); + builder.append_option(option); + } + Ok(Arc::new(builder.finish())) + } +} diff --git a/src/reader/to_record_batch.rs b/src/reader/to_record_batch.rs index 9b967b3..3558d96 100644 --- a/src/reader/to_record_batch.rs +++ b/src/reader/to_record_batch.rs @@ -7,7 +7,7 @@ use arrow::{ use log::info; use odbc_api::{buffers::ColumnarAnyBuffer, ResultSetMetadata}; -use crate::{arrow_schema_from, BufferAllocationOptions, ColumnFailure, Error}; +use crate::{arrow_schema_from, BufferAllocationOptions, ColumnFailure, Error, Quirks}; use super::{choose_column_strategy, MappingError, ReadStrategy}; @@ -27,6 +27,7 @@ impl ToRecordBatch { cursor: &mut impl ResultSetMetadata, schema: Option, buffer_allocation_options: BufferAllocationOptions, + quirks: &Quirks, ) -> Result { // Infer schema if not given by the user let schema = if let Some(schema) = schema { @@ -41,7 +42,7 @@ impl ToRecordBatch { .enumerate() .map(|(index, field)| { let col_index = (index + 1).try_into().unwrap(); - choose_column_strategy(field, cursor, col_index, buffer_allocation_options) + choose_column_strategy(field, cursor, col_index, buffer_allocation_options, quirks) .map_err(|cause| cause.into_crate_error(field.name().clone(), index)) }) .collect::>()?; diff --git a/tests/integration.rs b/tests/integration.rs index 0c11f73..eb4f0d3 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -33,7 +33,7 @@ use arrow_odbc::{ Connection, ConnectionOptions, Cursor, CursorImpl, Environment, IntoParameter, StatementConnection, }, - ColumnFailure, Error, OdbcReaderBuilder, OdbcWriter, WriterError, + ColumnFailure, Error, OdbcReaderBuilder, OdbcWriter, Quirks, 
WriterError, }; use stdext::function_name; @@ -300,6 +300,38 @@ fn fetch_varchar() { assert!(array_vals.is_null(2)); } +/// IBM DB2 ODBC driver reports memory garbage instead of indicators. We want to activate a +/// workaround which utilizes the terminating zeroes instead of indicators to determine string +/// length and NULL. +/// Ideally this test would run against a DB2 database. Yet the test setup proves to be a lot of +/// effort. Instead this test verifies that the workaround also works with Microsoft SQL Server. +#[test] +fn fetch_varchar_using_terminating_zeroes_to_indicate_string_length() { + let table_name = function_name!().rsplit_once(':').unwrap().1; + let cursor = cursor_over(table_name, "VARCHAR(50)", "('Hello'),('Bonjour'),(NULL)"); + + let mut quirks = Quirks::new(); + quirks.indicators_returned_from_bulk_fetch_are_memory_garbage = true; + let mut reader = OdbcReaderBuilder::new() + .with_max_num_rows_per_batch(3) + .with_shims(quirks) + .build(cursor) + .unwrap() + .into_concurrent() + .unwrap(); + let record_batch = reader.next().unwrap().unwrap(); + let array_vals = record_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // Assert that the correct values are found within the arrow batch + assert_eq!("Hello", array_vals.value(0)); + assert_eq!("Bonjour", array_vals.value(1)); + assert!(array_vals.is_null(2)); +} + /// Fill a record batch of Strings from a nvarchar source column #[test] fn fetch_nvarchar() {