Skip to content

Commit

Permalink
feat: infer options from file name (#153)
Browse files Browse the repository at this point in the history
* feat: infer options from file name
* style: clippy
* feat: update tests
* refactor: use file options struct
* feat: infer for fastq
  • Loading branch information
tshauck authored Jun 21, 2024
1 parent 884b36f commit 075e5df
Show file tree
Hide file tree
Showing 7 changed files with 212 additions and 48 deletions.
7 changes: 7 additions & 0 deletions bin/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,14 @@ function teardown {
}

# Build the code

# Uninstall biobear if it's installed. `pip show` is used as the presence
# check: the `if` only runs the uninstall when that command succeeds.
if pip show biobear; then
pip uninstall -y biobear
fi

# Compile the crate, then run maturin's develop build.
cargo build
maturin develop

# check docker and aws cli are installed
if ! command -v docker &> /dev/null
Expand Down
34 changes: 34 additions & 0 deletions python/tests/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,23 @@ def test_read_fastq():

assert len(df) == 2

@pytest.mark.skipif(
    not importlib.util.find_spec("polars"), reason="polars not installed"
)
def test_read_fastq_no_options():
    """Read `.fq.gz` and `.fq` files without passing any read options.

    Each file is expected to yield two records.
    """
    session = connect()

    # Same order as before: the compressed file first, then the plain one.
    for file_name in ("test.fq.gz", "test.fq"):
        frame = session.read_fastq_file(str(DATA / file_name)).to_polars()

        assert len(frame) == 2


@pytest.mark.skipif(
not importlib.util.find_spec("polars"), reason="polars not installed"
Expand Down Expand Up @@ -221,6 +238,23 @@ def test_read_fasta_fa():

assert len(df) == 2

@pytest.mark.skipif(
    not importlib.util.find_spec("polars"), reason="polars not installed"
)
def test_read_fasta_fa_no_options():
    """Read `.fa` and `.fa.gz` files without passing any read options.

    Each file is expected to yield two records.
    """
    session = connect()

    # Same order as before: the plain file first, then the compressed one.
    for file_name in ("test.fa", "test.fa.gz"):
        frame = session.read_fasta_file(str(DATA / file_name)).to_polars()

        assert len(frame) == 2


@pytest.mark.skipif(
not importlib.util.find_spec("polars"), reason="polars not installed"
Expand Down
69 changes: 42 additions & 27 deletions src/datasources/fasta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::{error::BioBearResult, file_compression_type::FileCompressionType};
use crate::{
error::BioBearResult, file_compression_type::FileCompressionType, file_options::FileOptions,
};
use exon::datasources::fasta::{table_provider::ListingFASTATableOptions, SequenceDataType};
use pyo3::{pyclass, pymethods};

Expand Down Expand Up @@ -67,20 +69,11 @@ impl From<FastaSequenceDataType> for SequenceDataType {
/// let options = FASTAReadOptions::default();
/// assert!(options.file_extension.is_none());
/// ```
#[derive(Default)]
pub struct FASTAReadOptions {
file_extension: String,
file_compression_type: FileCompressionType,
fasta_sequence_data_type: FastaSequenceDataType,
}

impl Default for FASTAReadOptions {
fn default() -> Self {
Self {
file_extension: String::from(DEFAULT_FASTA_FILE_EXTENSION),
file_compression_type: FileCompressionType::UNCOMPRESSED,
fasta_sequence_data_type: FastaSequenceDataType::UTF8,
}
}
file_extension: Option<String>,
file_compression_type: Option<FileCompressionType>,
fasta_sequence_data_type: Option<FastaSequenceDataType>,
}

#[pymethods]
Expand All @@ -105,25 +98,47 @@ impl FASTAReadOptions {
file_extension: Option<String>,
file_compression_type: Option<FileCompressionType>,
fasta_sequence_data_type: Option<FastaSequenceDataType>,
) -> BioBearResult<Self> {
let file_compression_type =
file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED);

let fasta_sequence_data_type =
fasta_sequence_data_type.unwrap_or(FastaSequenceDataType::UTF8);

Ok(Self {
) -> Self {
Self {
file_compression_type,
file_extension: file_extension.unwrap_or(DEFAULT_FASTA_FILE_EXTENSION.to_string()),
file_extension,
fasta_sequence_data_type,
})
}
}
}

impl FASTAReadOptions {
    /// Copies the settings carried by `file_options` onto these read options.
    ///
    /// A file extension and/or compression type present in `file_options`
    /// replaces the corresponding field here; fields for which `file_options`
    /// holds nothing are left as they are. `fasta_sequence_data_type` is never
    /// touched.
    ///
    /// NOTE(review): the replacement is unconditional, so values already set
    /// on `self` are overwritten as well — confirm that is intended.
    ///
    /// # Errors
    ///
    /// Returns an error when the compression type in `file_options` cannot be
    /// converted into this crate's [`FileCompressionType`]. The file extension
    /// has already been applied by that point.
    pub(crate) fn update_from_file_options(
        &mut self,
        file_options: &FileOptions,
    ) -> BioBearResult<()> {
        if let Some(inferred_extension) = file_options.file_extension().map(str::to_owned) {
            self.file_extension = Some(inferred_extension);
        }

        // Convert first, bail out on failure, and only then store the result.
        let inferred_compression = file_options
            .file_compression_type()
            .map(FileCompressionType::try_from)
            .transpose()?;
        if let Some(compression) = inferred_compression {
            self.file_compression_type = Some(compression);
        }

        Ok(())
    }
}

impl From<FASTAReadOptions> for ListingFASTATableOptions {
fn from(options: FASTAReadOptions) -> Self {
ListingFASTATableOptions::new(options.file_compression_type.into())
.with_sequence_data_type(options.fasta_sequence_data_type.into())
.with_some_file_extension(Some(&options.file_extension))
let file_compression_type = options
.file_compression_type
.unwrap_or(FileCompressionType::UNCOMPRESSED);
let fasta_sequence_data_type = options
.fasta_sequence_data_type
.unwrap_or(FastaSequenceDataType::UTF8);
let file_extension = options
.file_extension
.unwrap_or(DEFAULT_FASTA_FILE_EXTENSION.to_string());

ListingFASTATableOptions::new(file_compression_type.into())
.with_sequence_data_type(fasta_sequence_data_type.into())
.with_some_file_extension(Some(&file_extension))
}
}
54 changes: 35 additions & 19 deletions src/datasources/fastq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::file_compression_type::FileCompressionType;
use crate::{
error::BioBearResult, file_compression_type::FileCompressionType, file_options::FileOptions,
};
use exon::datasources::fastq::table_provider::ListingFASTQTableOptions;
use pyo3::{pyclass, pymethods};

Expand Down Expand Up @@ -44,18 +46,10 @@ const DEFAULT_FASTQ_FILE_EXTENSION: &str = "fastq";
/// let options = FASTQReadOptions::default();
/// assert!(options.file_extension.is_none());
/// ```
#[derive(Default)]
pub struct FASTQReadOptions {
file_extension: String,
file_compression_type: FileCompressionType,
}

impl Default for FASTQReadOptions {
fn default() -> Self {
Self {
file_extension: DEFAULT_FASTQ_FILE_EXTENSION.to_string(),
file_compression_type: FileCompressionType::UNCOMPRESSED,
}
}
file_extension: Option<String>,
file_compression_type: Option<FileCompressionType>,
}

#[pymethods]
Expand All @@ -80,11 +74,6 @@ impl FASTQReadOptions {
file_extension: Option<String>,
file_compression_type: Option<FileCompressionType>,
) -> Self {
let file_compression_type =
file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED);

let file_extension = file_extension.unwrap_or(DEFAULT_FASTQ_FILE_EXTENSION.to_string());

Self {
file_extension,
file_compression_type,
Expand All @@ -96,9 +85,36 @@ impl FASTQReadOptions {
}
}

impl FASTQReadOptions {
    /// Copies the settings carried by `file_options` onto these read options.
    ///
    /// A file extension and/or compression type present in `file_options`
    /// replaces the corresponding field here; fields for which `file_options`
    /// holds nothing are left as they are.
    ///
    /// NOTE(review): the replacement is unconditional, so values already set
    /// on `self` are overwritten as well — confirm that is intended.
    ///
    /// # Errors
    ///
    /// Returns an error when the compression type in `file_options` cannot be
    /// converted into this crate's [`FileCompressionType`]. The file extension
    /// has already been applied by that point.
    pub(crate) fn update_from_file_options(
        &mut self,
        file_options: &FileOptions,
    ) -> BioBearResult<()> {
        if let Some(inferred_extension) = file_options.file_extension().map(str::to_owned) {
            self.file_extension = Some(inferred_extension);
        }

        // Convert first, bail out on failure, and only then store the result.
        let inferred_compression = file_options
            .file_compression_type()
            .map(FileCompressionType::try_from)
            .transpose()?;
        if let Some(compression) = inferred_compression {
            self.file_compression_type = Some(compression);
        }

        Ok(())
    }
}

impl From<FASTQReadOptions> for ListingFASTQTableOptions {
fn from(options: FASTQReadOptions) -> Self {
ListingFASTQTableOptions::new(options.file_compression_type.into())
.with_some_file_extension(Some(&options.file_extension))
let file_compression_type = options
.file_compression_type
.unwrap_or(FileCompressionType::UNCOMPRESSED);

let file_extension = options
.file_extension
.as_deref()
.unwrap_or(DEFAULT_FASTQ_FILE_EXTENSION);

ListingFASTQTableOptions::new(file_compression_type.into())
.with_some_file_extension(Some(file_extension))
}
}
86 changes: 86 additions & 0 deletions src/file_options.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2024 WHERE TRUE Technologies.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{path::Path, str::FromStr};

use datafusion::datasource::file_format::file_compression_type::FileCompressionType;

/// File-level settings derived from a path string: an optional file
/// extension and an optional compression type.
///
/// Both fields are `None` in the `Default` value.
#[derive(Debug, Clone, Default)]
pub(crate) struct FileOptions {
// Extension of the data file, stored without a leading dot.
file_extension: Option<String>,
// Compression type recognised from the path, if any.
file_compression_type: Option<FileCompressionType>,
}

impl FileOptions {
/// Returns the file extension as a borrowed string slice, if one is set.
pub fn file_extension(&self) -> Option<&str> {
self.file_extension.as_deref()
}

/// Returns a copy of the compression type, if one is set.
pub fn file_compression_type(&self) -> Option<FileCompressionType> {
self.file_compression_type
}
}

impl From<&str> for FileOptions {
    /// Infers [`FileOptions`] from the trailing extension(s) of a path string.
    ///
    /// * No extension: both fields are `None` (the `Default` value).
    /// * The last extension parses as a [`FileCompressionType`]: that
    ///   compression type is recorded, and the extension of the remaining
    ///   file stem (if any) becomes the file extension, e.g. `test.csv.gz`
    ///   gives `csv` + gzip.
    /// * Otherwise: the last extension is the file extension and no
    ///   compression type is recorded, e.g. `test.csv` gives `csv`.
    fn from(s: &str) -> Self {
        let path = Path::new(s);

        let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
            return Self::default();
        };

        match FileCompressionType::from_str(extension) {
            Ok(file_compression_type) => {
                // The path has an extension, so it also has a file stem; look
                // one level further in for the data-file extension. No separate
                // "stem missing" fallback is needed.
                let file_extension = path
                    .file_stem()
                    .and_then(|stem| Path::new(stem).extension())
                    .and_then(|ext| ext.to_str())
                    .map(str::to_string);

                Self {
                    file_extension,
                    file_compression_type: Some(file_compression_type),
                }
            }
            // Not a compression suffix: treat it as the data-file extension.
            Err(_) => Self {
                file_extension: Some(extension.to_string()),
                file_compression_type: None,
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks extension and compression inference for a plain extension, a
    /// compressed file with an inner extension, and a name without extension.
    #[test]
    fn test_from_str() {
        // (input path, expected file extension, expected compression type)
        let cases = [
            ("test.csv", Some("csv"), None),
            ("test.csv.gz", Some("csv"), Some(FileCompressionType::GZIP)),
            ("test", None, None),
        ];

        for (input, expected_extension, expected_compression) in cases {
            let inferred = FileOptions::from(input);

            assert_eq!(inferred.file_extension(), expected_extension);
            assert_eq!(inferred.file_compression_type(), expected_compression);
        }
    }
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ mod exon_reader;
mod vcf_reader;

mod file_compression_type;
mod file_options;

pub use file_compression_type::FileCompressionType;

Expand Down
9 changes: 7 additions & 2 deletions src/session_context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use crate::datasources::hmm_dom_tab::HMMDomTabReadOptions;
use crate::datasources::mzml::MzMLReadOptions;
use crate::error;
use crate::execution_result::ExecutionResult;
use crate::file_options::FileOptions;
use crate::runtime::wait_for_future;

#[pyclass]
Expand Down Expand Up @@ -127,7 +128,9 @@ impl BioBearSessionContext {
options: Option<FASTQReadOptions>,
py: Python,
) -> PyResult<ExecutionResult> {
let options = options.unwrap_or_default();
let file_options = FileOptions::from(file_path);
let mut options = options.unwrap_or_default();
options.update_from_file_options(&file_options)?;

let result = self.ctx.read_fastq(file_path, options.into());
let df = wait_for_future(py, result).map_err(error::BioBearError::from)?;
Expand Down Expand Up @@ -217,7 +220,9 @@ impl BioBearSessionContext {
options: Option<FASTAReadOptions>,
py: Python,
) -> PyResult<ExecutionResult> {
let options = options.unwrap_or_default();
let file_options = FileOptions::from(file_path);
let mut options = options.unwrap_or_default();
options.update_from_file_options(&file_options)?;

let result = self.ctx.read_fasta(file_path, options.into());
let df = wait_for_future(py, result).map_err(error::BioBearError::from)?;
Expand Down

0 comments on commit 075e5df

Please sign in to comment.