-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #610 from hechth/hechth/issue608
Adding new generation of table manipulation tools
- Loading branch information
Showing
21 changed files
with
2,350 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
.vscode | ||
**/tool_test_output.html | ||
**/tool_test_output.json | ||
**/tmp* | ||
**/__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
name: tables | ||
owner: recetox | ||
remote_repository_url: "https://github.com/RECETOX/galaxytools/tree/master/tools/tables" | ||
homepage_url: "https://github.com/RECETOX/galaxytools" | ||
categories: | ||
- Metabolomics | ||
- Statistics | ||
description: "Tools to manipulate and analyze data tables." | ||
long_description: "Tools to manipulate and analyze data tables. Current tools include interpolation using scipy and arithmetic operations on tables with pandas." | ||
auto_tool_repositories: | ||
name_template: "{{ tool_id }}" | ||
description_template: "{{ tool_name }} tool from the general purpose data analysis suite developed by RECETOX." | ||
suite: | ||
name: suite_table_tools | ||
description: This tool suite contains tools for general purpose data analysis built on top of pandas, scipy, dplyr and others. | ||
type: repository_suite_definition |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
<macros> | ||
<token name="@PANDAS_VERSION@">2.2.3</token> | ||
<token name="@SCIPY_VERSION@">1.14.1</token> | ||
|
||
<xml name="requirement_pandas_pyarrow"> | ||
<requirement type="package" version="@PANDAS_VERSION@">pandas</requirement> | ||
<requirement type="package" version="18.0.0">pyarrow</requirement> | ||
</xml> | ||
|
||
<xml name="creator"> | ||
<creator> | ||
<person | ||
givenName="Kristina" | ||
familyName="Gomoryova" | ||
url="https://github.com/KristinaGomoryova" | ||
identifier="0000-0003-4407-3917" /> | ||
<person | ||
givenName="Helge" | ||
familyName="Hecht" | ||
url="https://github.com/hechth" | ||
identifier="0000-0001-6744-996X" /> | ||
<organization | ||
url="https://www.recetox.muni.cz/" | ||
email="GalaxyToolsDevelopmentandDeployment@space.muni.cz" | ||
name="RECETOX MUNI" /> | ||
</creator> | ||
</xml> | ||
|
||
<xml name="regex_sanitizer"> | ||
<sanitizer> | ||
<valid initial="string.ascii_letters,string.digits"> | ||
<add value="^"/> | ||
<add value="$"/> | ||
<add value="("/> | ||
<add value=")"/> | ||
<add value="|"/> | ||
<add value="?"/> | ||
<add value="*"/> | ||
<add value="+"/> | ||
<add value="{"/> | ||
<add value="}"/> | ||
<add value="\"/> | ||
<add value="["/> | ||
<add value="]"/> | ||
<add value="."/> | ||
<add value=","/> | ||
<add value="_"/> | ||
<add value="-"/> | ||
</valid> | ||
</sanitizer> | ||
<validator type="empty_field" /> | ||
<validator type="regex" message="Pattern must not end with backslash.">.*[^\\]$</validator> | ||
</xml> | ||
</macros> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import argparse | ||
import logging | ||
from typing import List, Tuple | ||
|
||
|
||
import numpy as np | ||
import pandas as pd | ||
from utils import LoadDataAction, SplitColumnIndicesAction, StoreOutputAction | ||
|
||
|
||
# Supported arithmetic operations, keyed by the name accepted on the command line.
OPERATIONS = {
    "mul": np.multiply,
    "sub": np.subtract,
    "div": np.divide,
    "add": np.add,
    "pow": np.power,
}


def perform_operation(df: pd.DataFrame, column_indices: List[int], operation: str, operand: float) -> pd.DataFrame:
    """
    Apply an arithmetic operation with a scalar operand to selected columns.

    Columns are read and written strictly by position, so a table whose
    header contains duplicate names only has the selected column changed
    (label-based access would touch every column sharing that name).

    Parameters:
        df (pd.DataFrame): The input dataframe. It is modified in place.
        column_indices (list): The 0-based indices of the columns to perform the operation on.
            Negative indices count from the last column.
        operation (str): The arithmetic operation to perform; one of the keys of OPERATIONS.
        operand (float): The operand for the arithmetic operation.

    Returns:
        pd.DataFrame: The same dataframe object, with the operation applied.

    Raises:
        KeyError: If ``operation`` is not a key of OPERATIONS.
        IndexError: If a column index is outside the dataframe's columns.
    """
    apply_operation = OPERATIONS[operation]
    positions = range(df.shape[1])
    for column_index in column_indices:
        # Indexing the range normalises negative indices and raises
        # IndexError for out-of-bounds ones, as df.columns[...] would.
        position = positions[column_index]
        # isetitem replaces the column at this position with a new array,
        # independent of whether the column label is unique.
        df.isetitem(position, apply_operation(df.iloc[:, position], operand))
    return df
|
||
|
||
def main(input_dataset: pd.DataFrame, column_indices: List[int], operation: str, operand: float, output_dataset: Tuple[callable, str]):
    """
    Apply the requested arithmetic operation and write the result out.

    Parameters:
        input_dataset (pd.DataFrame): The dataframe to operate on.
        column_indices (list): The 0-based indices of the columns to perform the operation on.
        operation (str): The arithmetic operation to perform.
        operand (float): The operand for the arithmetic operation.
        output_dataset (tuple): A ``(writer, path)`` pair; the writer is called
            as ``writer(dataframe, path)`` to store the result.

    Any exception is logged and then re-raised unchanged.
    """
    try:
        result = perform_operation(input_dataset, column_indices, operation, operand)
        writer, destination = output_dataset
        writer(result, destination)
    except Exception as error:
        logging.error(f"Error in main function: {error}")
        raise
|
||
|
||
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    cli = argparse.ArgumentParser(description="Perform arithmetic operations on dataframe columns.")

    # Dataset arguments take two values each: a path and a file extension.
    # The custom actions come from utils; main() expects a dataframe for the
    # input and a (writer, path) pair for the output.
    cli.add_argument(
        "--input_dataset",
        required=True,
        nargs=2,
        action=LoadDataAction,
        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
    )
    cli.add_argument(
        "--output_dataset",
        required=True,
        nargs=2,
        action=StoreOutputAction,
        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
    )

    # Column selection and the operation to apply to it.
    # NOTE(review): presumably SplitColumnIndicesAction converts the 1-based
    # indices to the 0-based ones perform_operation expects; confirm in utils.
    cli.add_argument(
        "--columns",
        required=True,
        action=SplitColumnIndicesAction,
        help="Comma-separated list of 1-based indices of the columns to perform the operation on",
    )
    cli.add_argument(
        "--operation",
        required=True,
        type=str,
        choices=OPERATIONS.keys(),
        help="Arithmetic operation to perform",
    )
    cli.add_argument(
        "--operand",
        required=True,
        type=float,
        help="Operand for the arithmetic operation",
    )

    parsed = cli.parse_args()
    main(
        input_dataset=parsed.input_dataset,
        column_indices=parsed.columns,
        operation=parsed.operation,
        operand=parsed.operand,
        output_dataset=parsed.output_dataset,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
<tool id="table_pandas_arithmetics" name="table arithmetics" version="@PANDAS_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT"> | ||
<description>perform arithmetic operations on a dataframe column</description> | ||
<macros> | ||
<import>macros.xml</import> | ||
<token name="@VERSION_SUFFIX@">0</token> | ||
</macros> | ||
<requirements> | ||
<expand macro="requirement_pandas_pyarrow"/> | ||
</requirements> | ||
<required_files> | ||
<include path="table_pandas_arithmetics.py" /> | ||
<include path="utils.py" /> | ||
</required_files> | ||
<expand macro="creator" /> | ||
<!-- The script declares the option as "columns"; use its exact name rather than
     relying on argparse's implicit prefix abbreviation of long options. -->
<command detect_errors="exit_code"><![CDATA[
    python3 '$__tool_directory__/table_pandas_arithmetics.py'
        --input_dataset '$input_dataset' '$input_dataset.ext'
        --columns '$column'
        --operation '$operation'
        --operand '$operand'
        --output_dataset '$output_dataset' '$output_dataset.ext'
]]></command>
<inputs> | ||
<param name="input_dataset" type="data" format="csv,tsv,tabular,parquet" label="Input Dataset" help="The input dataset in CSV, TSV, tabular, or Parquet format."/> | ||
<param name="column" type="data_column" data_ref="input_dataset" use_header_names="true" label="Column" help="The column from the dataset to perform the arithmetic operation on."/> | ||
<param name="operation" type="select" label="Arithmetic Operation" help="The arithmetic operation to perform on the selected column. Choose from Multiply, Subtract, Divide, Add, or Power."> | ||
<option value="mul">Multiply</option> | ||
<option value="sub">Subtract</option> | ||
<option value="div">Divide</option> | ||
<option value="add">Add</option> | ||
<option value="pow">Power</option> | ||
</param> | ||
<param name="operand" type="float" label="Operand" help="The operand value to use in the arithmetic operation. This value will be applied to each element in the selected column."/> | ||
</inputs> | ||
<outputs> | ||
<data name="output_dataset" format_source="input_dataset" label="${tool.name} on ${on_string}"> | ||
<change_format> | ||
<when input="input_dataset.ext" value="tsv" format="tabular" /> | ||
</change_format> | ||
</data> | ||
</outputs> | ||
<tests> | ||
<test> | ||
<param name="input_dataset" value="query.tabular" ftype="tabular"/> | ||
<param name="column" value="3"/> | ||
<param name="operation" value="div"/> | ||
<param name="operand" value="100"/> | ||
<output name="output_dataset" file="arithmetics/query_divide_ri.tabular" ftype="tabular"/> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
This tool performs arithmetic operations on a specified column of a dataframe. | ||
Supported operations are: multiply, subtract, divide, add, and power. | ||
Inputs | ||
------ | ||
- **Input Dataset**: The input dataset in CSV, TSV, tabular, or Parquet format. | ||
- **Column**: The column from the dataset to perform the arithmetic operation on. Select the column by its header name. | ||
- **Arithmetic Operation**: The arithmetic operation to perform on the selected column. Choose from Multiply, Subtract, Divide, Add, or Power. | ||
- **Operand**: The operand value to use in the arithmetic operation. This value will be applied to each element in the selected column. | ||
Outputs | ||
------- | ||
- **Output Dataset**: The output dataset with the arithmetic operation applied to the specified column. | ||
]]></help> | ||
<citations> | ||
<citation type="doi">10.5281/zenodo.3509134</citation> | ||
<citation type="doi">10.25080/Majora-92bf1922-00a</citation> | ||
</citations> | ||
</tool> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import argparse | ||
import logging | ||
from typing import Tuple | ||
|
||
import pandas as pd | ||
from utils import KeyValuePairsAction, LoadDataAction, StoreOutputAction | ||
|
||
|
||
def rename_columns(df: pd.DataFrame, rename_dict: dict) -> pd.DataFrame:
    """
    Rename columns in the dataframe based on the provided dictionary.

    Columns are addressed strictly by position, so renaming one of several
    columns that share a name only affects the selected one.

    Parameters:
        df (pd.DataFrame): The input dataframe. It is not modified.
        rename_dict (dict): A dictionary with 1-based column index as key and new column name as value.

    Returns:
        pd.DataFrame: A new dataframe with the renamed columns.

    Raises:
        IndexError: If a key is smaller than 1 or larger than the number of columns.
    """
    try:
        labels = list(df.columns)
        for key, value in rename_dict.items():
            # Reject 0 and negative keys explicitly: after the 1-based to
            # 0-based shift they would silently address columns from the end.
            if not 1 <= key <= len(labels):
                raise IndexError(
                    f"column index {key} is out of range for a table with {len(labels)} columns"
                )
            labels[key - 1] = value
        # Keep the name of the column index, as a label-based rename would.
        new_columns = pd.Index(labels, name=df.columns.name)
        return df.set_axis(new_columns, axis="columns")
    except IndexError as e:
        logging.error(f"Invalid column index: {e}")
        raise
    except Exception as e:
        logging.error(f"Error renaming columns: {e}")
        raise
|
||
|
||
def main(input_dataset: pd.DataFrame, rename_dict: dict, output_dataset: Tuple[callable, str]):
    """
    Rename the requested columns and write the resulting table out.

    Parameters:
        input_dataset (pd.DataFrame): The input dataset.
        rename_dict (dict): A dictionary with 1-based column index as key and new column name as value.
        output_dataset (tuple): A ``(writer, path)`` pair; the writer is called
            as ``writer(dataframe, path)`` to store the result.

    Any exception is logged and then re-raised unchanged.
    """
    try:
        writer, destination = output_dataset
        renamed = rename_columns(input_dataset, rename_dict)
        writer(renamed, destination)
    except Exception as error:
        logging.error(f"Error in main function: {error}")
        raise
|
||
|
||
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    cli = argparse.ArgumentParser(description="Rename columns in a dataframe.")

    # Dataset arguments take two values each: a path and a file extension.
    # The custom actions come from utils; main() expects a dataframe for the
    # input and a (writer, path) pair for the output.
    cli.add_argument(
        "--input_dataset",
        required=True,
        nargs=2,
        action=LoadDataAction,
        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
    )
    # One or more key=value pairs, collected by the custom action from utils.
    cli.add_argument(
        "--rename",
        required=True,
        nargs="+",
        action=KeyValuePairsAction,
        help="List of key=value pairs with 1-based column index as key and new column name as value",
    )
    cli.add_argument(
        "--output_dataset",
        required=True,
        nargs=2,
        action=StoreOutputAction,
        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
    )

    parsed = cli.parse_args()
    main(
        input_dataset=parsed.input_dataset,
        rename_dict=parsed.rename,
        output_dataset=parsed.output_dataset,
    )
Oops, something went wrong.