diff --git a/bin/test.sh b/bin/test.sh
index b16d7e2..6cf58f9 100644
--- a/bin/test.sh
+++ b/bin/test.sh
@@ -24,7 +24,14 @@ function teardown {
 }
 
 # Build the code
+
+# uninstall biobear if it's installed
+if pip show biobear; then
+    pip uninstall -y biobear
+fi
+
 cargo build
+maturin develop
 
 # check docker and aws cli are installed
 if ! command -v docker &> /dev/null
diff --git a/python/tests/test_session.py b/python/tests/test_session.py
index 8b139ff..4d01eed 100644
--- a/python/tests/test_session.py
+++ b/python/tests/test_session.py
@@ -137,6 +137,21 @@ def test_read_fastq():
 
     assert len(df) == 2
 
+@pytest.mark.skipif(
+    not importlib.util.find_spec("polars"), reason="polars not installed"
+)
+def test_read_fastq_no_options():
+    """Read gzipped and plain FASTQ files without passing read options."""
+    session = connect()
+
+    gzipped = session.read_fastq_file(str(DATA / "test.fq.gz")).to_polars()
+
+    assert len(gzipped) == 2
+
+    plain = session.read_fastq_file(str(DATA / "test.fq")).to_polars()
+
+    assert len(plain) == 2
+
 
 @pytest.mark.skipif(
     not importlib.util.find_spec("polars"), reason="polars not installed"
@@ -221,6 +236,21 @@ def test_read_fasta_fa():
 
     assert len(df) == 2
 
+@pytest.mark.skipif(
+    not importlib.util.find_spec("polars"), reason="polars not installed"
+)
+def test_read_fasta_fa_no_options():
+    """Read plain and gzipped FASTA files without passing read options."""
+    session = connect()
+
+    plain = session.read_fasta_file(str(DATA / "test.fa")).to_polars()
+
+    assert len(plain) == 2
+
+    gzipped = session.read_fasta_file(str(DATA / "test.fa.gz")).to_polars()
+
+    assert len(gzipped) == 2
+
 
 @pytest.mark.skipif(
     not importlib.util.find_spec("polars"), reason="polars not installed"
diff --git a/src/datasources/fasta.rs b/src/datasources/fasta.rs
index 80a15f7..efcd421 100644
--- a/src/datasources/fasta.rs
+++ b/src/datasources/fasta.rs
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::{error::BioBearResult, file_compression_type::FileCompressionType};
+use crate::{
+    error::BioBearResult, file_compression_type::FileCompressionType, file_options::FileOptions,
+};
 use exon::datasources::fasta::{table_provider::ListingFASTATableOptions, SequenceDataType};
 use pyo3::{pyclass, pymethods};
 
@@ -67,20 +69,11 @@ impl From<FastaSequenceDataType> for SequenceDataType {
 /// let options = FASTAReadOptions::default();
 /// assert_eq!(options.file_extension, "fasta");
 /// ```
+#[derive(Default)]
 pub struct FASTAReadOptions {
-    file_extension: String,
-    file_compression_type: FileCompressionType,
-    fasta_sequence_data_type: FastaSequenceDataType,
-}
-
-impl Default for FASTAReadOptions {
-    fn default() -> Self {
-        Self {
-            file_extension: String::from(DEFAULT_FASTA_FILE_EXTENSION),
-            file_compression_type: FileCompressionType::UNCOMPRESSED,
-            fasta_sequence_data_type: FastaSequenceDataType::UTF8,
-        }
-    }
+    file_extension: Option<String>,
+    file_compression_type: Option<FileCompressionType>,
+    fasta_sequence_data_type: Option<FastaSequenceDataType>,
 }
 
 #[pymethods]
@@ -105,25 +98,60 @@ impl FASTAReadOptions {
         file_extension: Option<String>,
         file_compression_type: Option<FileCompressionType>,
         fasta_sequence_data_type: Option<FastaSequenceDataType>,
-    ) -> BioBearResult<Self> {
-        let file_compression_type =
-            file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED);
-
-        let fasta_sequence_data_type =
-            fasta_sequence_data_type.unwrap_or(FastaSequenceDataType::UTF8);
-
-        Ok(Self {
+    ) -> Self {
+        Self {
             file_compression_type,
-            file_extension: file_extension.unwrap_or(DEFAULT_FASTA_FILE_EXTENSION.to_string()),
+            file_extension,
             fasta_sequence_data_type,
-        })
+        }
+    }
+}
+
+impl FASTAReadOptions {
+    /// Fills in options the caller left unset with values inferred from the file path.
+    ///
+    /// Options that were set explicitly take precedence: a field is only
+    /// populated from `file_options` while it is still `None`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the inferred compression type cannot be converted
+    /// into this crate's `FileCompressionType`.
+    pub(crate) fn update_from_file_options(
+        &mut self,
+        file_options: &FileOptions,
+    ) -> BioBearResult<()> {
+        if self.file_extension.is_none() {
+            self.file_extension = file_options.file_extension().map(String::from);
+        }
+
+        if self.file_compression_type.is_none() {
+            if let Some(file_compression_type) = file_options.file_compression_type() {
+                let fct = FileCompressionType::try_from(file_compression_type)?;
+                self.file_compression_type = Some(fct);
+            }
+        }
+
+        Ok(())
     }
 }
 
 impl From<FASTAReadOptions> for ListingFASTATableOptions {
     fn from(options: FASTAReadOptions) -> Self {
-        ListingFASTATableOptions::new(options.file_compression_type.into())
-            .with_sequence_data_type(options.fasta_sequence_data_type.into())
-            .with_some_file_extension(Some(&options.file_extension))
+        let file_compression_type = options
+            .file_compression_type
+            .unwrap_or(FileCompressionType::UNCOMPRESSED);
+        let fasta_sequence_data_type = options
+            .fasta_sequence_data_type
+            .unwrap_or(FastaSequenceDataType::UTF8);
+        // Build the default lazily so no `String` is allocated when an
+        // extension is already set.
+        let file_extension = options
+            .file_extension
+            .unwrap_or_else(|| DEFAULT_FASTA_FILE_EXTENSION.to_string());
+
+        ListingFASTATableOptions::new(file_compression_type.into())
+            .with_sequence_data_type(fasta_sequence_data_type.into())
+            .with_some_file_extension(Some(&file_extension))
     }
 }
diff --git a/src/datasources/fastq.rs b/src/datasources/fastq.rs
index 4ff4436..c4eba36 100644
--- a/src/datasources/fastq.rs
+++ b/src/datasources/fastq.rs
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::file_compression_type::FileCompressionType;
+use crate::{
+    error::BioBearResult, file_compression_type::FileCompressionType, file_options::FileOptions,
+};
 use exon::datasources::fastq::table_provider::ListingFASTQTableOptions;
 use pyo3::{pyclass, pymethods};
 
@@ -44,18 +46,10 @@ const DEFAULT_FASTQ_FILE_EXTENSION: &str = "fastq";
 /// let options = FASTQReadOptions::default();
 /// assert_eq!(options.file_extension, "fastq");
 /// ```
+#[derive(Default)]
 pub struct FASTQReadOptions {
-    file_extension: String,
-    file_compression_type: FileCompressionType,
-}
-
-impl Default for FASTQReadOptions {
-    fn default() -> Self {
-        Self {
-            file_extension: DEFAULT_FASTQ_FILE_EXTENSION.to_string(),
-            file_compression_type: FileCompressionType::UNCOMPRESSED,
-        }
-    }
+    file_extension: Option<String>,
+    file_compression_type: Option<FileCompressionType>,
 }
 
 #[pymethods]
@@ -80,11 +74,6 @@ impl FASTQReadOptions {
         file_extension: Option<String>,
         file_compression_type: Option<FileCompressionType>,
     ) -> Self {
-        let file_compression_type =
-            file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED);
-
-        let file_extension = file_extension.unwrap_or(DEFAULT_FASTQ_FILE_EXTENSION.to_string());
-
         Self {
             file_extension,
             file_compression_type,
@@ -96,9 +85,47 @@ impl FASTQReadOptions {
     }
 }
 
+impl FASTQReadOptions {
+    /// Fills in options the caller left unset with values inferred from the file path.
+    ///
+    /// Options that were set explicitly take precedence: a field is only
+    /// populated from `file_options` while it is still `None`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the inferred compression type cannot be converted
+    /// into this crate's `FileCompressionType`.
+    pub(crate) fn update_from_file_options(
+        &mut self,
+        file_options: &FileOptions,
+    ) -> BioBearResult<()> {
+        if self.file_extension.is_none() {
+            self.file_extension = file_options.file_extension().map(String::from);
+        }
+
+        if self.file_compression_type.is_none() {
+            if let Some(file_compression_type) = file_options.file_compression_type() {
+                let fct = FileCompressionType::try_from(file_compression_type)?;
+                self.file_compression_type = Some(fct);
+            }
+        }
+
+        Ok(())
+    }
+}
+
 impl From<FASTQReadOptions> for ListingFASTQTableOptions {
     fn from(options: FASTQReadOptions) -> Self {
-        ListingFASTQTableOptions::new(options.file_compression_type.into())
-            .with_some_file_extension(Some(&options.file_extension))
+        let file_compression_type = options
+            .file_compression_type
+            .unwrap_or(FileCompressionType::UNCOMPRESSED);
+
+        let file_extension = options
+            .file_extension
+            .as_deref()
+            .unwrap_or(DEFAULT_FASTQ_FILE_EXTENSION);
+
+        ListingFASTQTableOptions::new(file_compression_type.into())
+            .with_some_file_extension(Some(file_extension))
     }
 }
diff --git a/src/file_options.rs b/src/file_options.rs
new file mode 100644
index 0000000..2b63515
--- /dev/null
+++ b/src/file_options.rs
@@ -0,0 +1,86 @@
+// Copyright 2024 WHERE TRUE Technologies.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::{path::Path, str::FromStr};
+
+use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
+
+/// File extension and compression type inferred from a file path.
+#[derive(Debug, Clone, Default)]
+pub(crate) struct FileOptions {
+    file_extension: Option<String>,
+    file_compression_type: Option<FileCompressionType>,
+}
+
+impl FileOptions {
+    /// Extension of the data file itself (`csv` for `test.csv.gz`), if one was found.
+    pub fn file_extension(&self) -> Option<&str> {
+        self.file_extension.as_deref()
+    }
+
+    /// Compression type named by the path's final extension, if any.
+    pub fn file_compression_type(&self) -> Option<FileCompressionType> {
+        self.file_compression_type
+    }
+}
+
+impl From<&str> for FileOptions {
+    /// Infers the options from a path, e.g. `test.csv.gz` gives extension `csv` and GZIP.
+    fn from(s: &str) -> Self {
+        let path = Path::new(s);
+
+        let extension = match path.extension().and_then(|ext| ext.to_str()) {
+            Some(ext) => ext,
+            None => return Self::default(),
+        };
+
+        match FileCompressionType::from_str(extension) {
+            Ok(file_compression_type) => Self {
+                file_extension: path
+                    .file_stem()
+                    .and_then(|stem| Path::new(stem).extension())
+                    .and_then(|ext| ext.to_str())
+                    .map(String::from),
+                file_compression_type: Some(file_compression_type),
+            },
+            Err(_) => Self {
+                file_extension: Some(extension.to_string()),
+                file_compression_type: None,
+            },
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_from_str() {
+        let file_options = FileOptions::from("test.csv");
+        assert_eq!(file_options.file_extension(), Some("csv"));
+        assert_eq!(file_options.file_compression_type(), None);
+
+        let file_options = FileOptions::from("test.csv.gz");
+        assert_eq!(file_options.file_extension(), Some("csv"));
+        assert_eq!(
+            file_options.file_compression_type(),
+            Some(FileCompressionType::GZIP)
+        );
+
+        let file_options = FileOptions::from("test");
+        assert_eq!(file_options.file_extension(), None);
+        assert_eq!(file_options.file_compression_type(), None);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index f937ea9..99d45cd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,6 +21,7 @@ mod exon_reader;
 mod vcf_reader;
 
 mod file_compression_type;
+mod file_options;
 
 pub use file_compression_type::FileCompressionType;
 
diff --git a/src/session_context.rs b/src/session_context.rs
index 2b619ab..e218916 100644
--- a/src/session_context.rs
+++ b/src/session_context.rs
@@ -25,6 +25,7 @@ use crate::datasources::hmm_dom_tab::HMMDomTabReadOptions;
 use crate::datasources::mzml::MzMLReadOptions;
 use crate::error;
 use crate::execution_result::ExecutionResult;
+use crate::file_options::FileOptions;
 use crate::runtime::wait_for_future;
 
 #[pyclass]
@@ -127,7 +128,9 @@ impl BioBearSessionContext {
         options: Option<FASTQReadOptions>,
         py: Python,
     ) -> PyResult<ExecutionResult> {
-        let options = options.unwrap_or_default();
+        let file_options = FileOptions::from(file_path);
+        let mut options = options.unwrap_or_default();
+        options.update_from_file_options(&file_options)?;
 
         let result = self.ctx.read_fastq(file_path, options.into());
         let df = wait_for_future(py, result).map_err(error::BioBearError::from)?;
@@ -217,7 +220,9 @@ impl BioBearSessionContext {
         options: Option<FASTAReadOptions>,
         py: Python,
     ) -> PyResult<ExecutionResult> {
-        let options = options.unwrap_or_default();
+        let file_options = FileOptions::from(file_path);
+        let mut options = options.unwrap_or_default();
+        options.update_from_file_options(&file_options)?;
 
         let result = self.ctx.read_fasta(file_path, options.into());
         let df = wait_for_future(py, result).map_err(error::BioBearError::from)?;