Skip to content

Commit

Permalink
Merge pull request #610 from hechth/hechth/issue608
Browse files Browse the repository at this point in the history
Adding new generation of table manipulation tools
  • Loading branch information
hechth authored Jan 29, 2025
2 parents 8a94b99 + 319f8be commit d0ff40e
Show file tree
Hide file tree
Showing 21 changed files with 2,350 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.vscode
**/tool_test_output.html
**/tool_test_output.json
**/tmp*
**/__pycache__
16 changes: 16 additions & 0 deletions tools/tables/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: tables
owner: recetox
remote_repository_url: "https://github.com/RECETOX/galaxytools/tree/master/tools/tables"
homepage_url: "https://github.com/RECETOX/galaxytools"
categories:
- Metabolomics
- Statistics
description: "Tools to manipulate and analyze data tables."
long_description: "Tools to manipulate and analyze data tables. Current tools include interpolation using scipy and arithmetic operations on tables with pandas."
auto_tool_repositories:
name_template: "{{ tool_id }}"
description_template: "{{ tool_name }} tool from the general purpose data analysis suite developed by RECETOX."
suite:
name: suite_table_tools
description: This tool suite contains tools for general purpose data analysis built on top of pandas, scipy, dplyr and others.
type: repository_suite_definition
54 changes: 54 additions & 0 deletions tools/tables/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<macros>
<token name="@PANDAS_VERSION@">2.2.3</token>
<token name="@SCIPY_VERSION@">1.14.1</token>

<xml name="requirement_pandas_pyarrow">
<requirement type="package" version="@PANDAS_VERSION@">pandas</requirement>
<requirement type="package" version="18.0.0">pyarrow</requirement>
</xml>

<xml name="creator">
<creator>
<person
givenName="Kristina"
familyName="Gomoryova"
url="https://github.com/KristinaGomoryova"
identifier="0000-0003-4407-3917" />
<person
givenName="Helge"
familyName="Hecht"
url="https://github.com/hechth"
identifier="0000-0001-6744-996X" />
<organization
url="https://www.recetox.muni.cz/"
email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"
name="RECETOX MUNI" />
</creator>
</xml>

<!-- Macro "regex_sanitizer": input handling for parameters that hold a regular expression.
     The sanitizer whitelist starts from ASCII letters and digits and adds the listed
     regex metacharacters and punctuation. Two validators follow: the value must not be
     empty, and its last character must be something other than a backslash. -->
<xml name="regex_sanitizer">
<sanitizer>
<valid initial="string.ascii_letters,string.digits">
<add value="^"/>
<add value="$"/>
<add value="("/>
<add value=")"/>
<add value="|"/>
<add value="?"/>
<add value="*"/>
<add value="+"/>
<add value="{"/>
<add value="}"/>
<add value="\"/>
<add value="["/>
<add value="]"/>
<add value="."/>
<add value=","/>
<add value="_"/>
<add value="-"/>
</valid>
</sanitizer>
<validator type="empty_field" />
<validator type="regex" message="Pattern must not end with backslash.">.*[^\\]$</validator>
</xml>
</macros>
106 changes: 106 additions & 0 deletions tools/tables/table_pandas_arithmetics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import argparse
import logging
from typing import List, Tuple


import numpy as np
import pandas as pd
from utils import LoadDataAction, SplitColumnIndicesAction, StoreOutputAction


# Supported operations: CLI keyword -> NumPy ufunc applied element-wise.
OPERATIONS = {
    "mul": np.multiply,
    "sub": np.subtract,
    "div": np.divide,
    "add": np.add,
    "pow": np.power,
}


def perform_operation(df: pd.DataFrame, column_indices: List[int], operation: str, operand: float):
    """
    Apply an arithmetic operation to the selected columns of the dataframe.

    Columns are read and written strictly by position, so a table whose header
    contains duplicate names only has the selected column changed (a label-based
    lookup would touch every column sharing that name).

    Parameters:
    df (pd.DataFrame): The input dataframe. It is modified in place.
    column_indices (list): The 0-based indices of the columns to perform the operation on.
    operation (str): The arithmetic operation to perform; must be a key of OPERATIONS.
    operand (float): The operand for the arithmetic operation.

    Returns:
    pd.DataFrame: The same dataframe object with the operation applied.

    Raises:
    KeyError: If `operation` is not a key of OPERATIONS.
    IndexError: If a column index is out of bounds.
    """
    apply_operation = OPERATIONS[operation]
    column_positions = range(df.shape[1])
    for column_index in column_indices:
        # Indexing the range normalises negative indices and raises IndexError
        # for positions that do not exist.
        position = column_positions[column_index]
        # isetitem swaps in a new column at this position, so the result may
        # have a different dtype than the original (e.g. int column -> float).
        df.isetitem(position, apply_operation(df.iloc[:, position], operand))
    return df


def main(input_dataset: pd.DataFrame, column_indices: List[int], operation: str, operand: float, output_dataset: Tuple[callable, str]):
    """
    Apply the arithmetic operation to the loaded table and write the result.

    Parameters:
    input_dataset (pd.DataFrame): The already loaded input table.
    column_indices (list): The 0-based indices of the columns to perform the operation on.
    operation (str): The arithmetic operation to perform.
    operand (float): The operand for the arithmetic operation.
    output_dataset (tuple): A (writer, path) pair; the writer is called as writer(dataframe, path).

    Raises:
    Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        result = perform_operation(input_dataset, column_indices, operation, operand)
        writer, destination = output_dataset
        writer(result, destination)
    except Exception as error:
        logging.error(f"Error in main function: {error}")
        raise


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Command-line interface. The custom actions come from utils; judging by how
    # main() consumes the parsed values they presumably load the table, split the
    # column list and resolve the output writer (confirm against utils.py).
    cli = argparse.ArgumentParser(description="Perform arithmetic operations on dataframe columns.")
    cli.add_argument(
        "--input_dataset", nargs=2, action=LoadDataAction, required=True,
        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
    )
    cli.add_argument(
        "--columns", action=SplitColumnIndicesAction, required=True,
        help="Comma-separated list of 1-based indices of the columns to perform the operation on",
    )
    cli.add_argument(
        "--operation", type=str, choices=OPERATIONS.keys(), required=True,
        help="Arithmetic operation to perform",
    )
    cli.add_argument(
        "--operand", type=float, required=True,
        help="Operand for the arithmetic operation",
    )
    cli.add_argument(
        "--output_dataset", nargs=2, action=StoreOutputAction, required=True,
        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
    )

    options = cli.parse_args()
    main(
        input_dataset=options.input_dataset,
        column_indices=options.columns,
        operation=options.operation,
        operand=options.operand,
        output_dataset=options.output_dataset,
    )
72 changes: 72 additions & 0 deletions tools/tables/table_pandas_arithmetics.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
<tool id="table_pandas_arithmetics" name="table arithmetics" version="@PANDAS_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
<description>perform arithmetic operations on a dataframe column</description>
<macros>
<import>macros.xml</import>
<token name="@VERSION_SUFFIX@">0</token>
</macros>
<requirements>
<expand macro="requirement_pandas_pyarrow"/>
</requirements>
<required_files>
<include path="table_pandas_arithmetics.py" />
<include path="utils.py" />
</required_files>
<expand macro="creator" />
<!-- The script declares its column option as "columns"; the full name is used here
     so the call does not depend on argparse prefix abbreviation. -->
<command detect_errors="exit_code"><![CDATA[
python3 '$__tool_directory__/table_pandas_arithmetics.py'
--input_dataset '$input_dataset' '$input_dataset.ext'
--columns '$column'
--operation '$operation'
--operand '$operand'
--output_dataset '$output_dataset' '$output_dataset.ext'
]]></command>
<inputs>
<param name="input_dataset" type="data" format="csv,tsv,tabular,parquet" label="Input Dataset" help="The input dataset in CSV, TSV, tabular, or Parquet format."/>
<param name="column" type="data_column" data_ref="input_dataset" use_header_names="true" label="Column" help="The column from the dataset to perform the arithmetic operation on."/>
<param name="operation" type="select" label="Arithmetic Operation" help="The arithmetic operation to perform on the selected column. Choose from Multiply, Subtract, Divide, Add, or Power.">
<option value="mul">Multiply</option>
<option value="sub">Subtract</option>
<option value="div">Divide</option>
<option value="add">Add</option>
<option value="pow">Power</option>
</param>
<param name="operand" type="float" label="Operand" help="The operand value to use in the arithmetic operation. This value will be applied to each element in the selected column."/>
</inputs>
<outputs>
<data name="output_dataset" format_source="input_dataset" label="${tool.name} on ${on_string}">
<change_format>
<when input="input_dataset.ext" value="tsv" format="tabular" />
</change_format>
</data>
</outputs>
<tests>
<test>
<param name="input_dataset" value="query.tabular" ftype="tabular"/>
<param name="column" value="3"/>
<param name="operation" value="div"/>
<param name="operand" value="100"/>
<output name="output_dataset" file="arithmetics/query_divide_ri.tabular" ftype="tabular"/>
</test>
</tests>
<help><![CDATA[
This tool performs arithmetic operations on a specified column of a dataframe.
Supported operations are: multiply, subtract, divide, add, and power.
Inputs
------
- **Input Dataset**: The input dataset in CSV, TSV, tabular, or Parquet format.
- **Column**: The column from the dataset to perform the arithmetic operation on. Select the column by its header name.
- **Arithmetic Operation**: The arithmetic operation to perform on the selected column. Choose from Multiply, Subtract, Divide, Add, or Power.
- **Operand**: The operand value to use in the arithmetic operation. This value will be applied to each element in the selected column.
Outputs
-------
- **Output Dataset**: The output dataset with the arithmetic operation applied to the specified column.
]]></help>
<citations>
<citation type="doi">10.5281/zenodo.3509134</citation>
<citation type="doi">10.25080/Majora-92bf1922-00a</citation>
</citations>
</tool>
76 changes: 76 additions & 0 deletions tools/tables/table_pandas_rename_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import argparse
import logging
from typing import Tuple

import pandas as pd
from utils import KeyValuePairsAction, LoadDataAction, StoreOutputAction


def rename_columns(df: pd.DataFrame, rename_dict: dict):
    """
    Rename columns in the dataframe based on the provided dictionary.

    Columns are relabelled strictly by position, so a table whose header
    contains duplicate names only has the selected column renamed.

    Parameters:
    df (pd.DataFrame): The input dataframe. It is not modified.
    rename_dict (dict): A dictionary with 1-based column index as key and new column name as value.

    Returns:
    pd.DataFrame: A new dataframe with the renamed columns.

    Raises:
    IndexError: If a key is not within 1..number of columns.
    """
    try:
        new_names = list(df.columns)
        column_count = len(new_names)
        for key, value in rename_dict.items():
            # Reject 0 and negative keys explicitly: after the 1-based -> 0-based
            # shift they would silently address columns counted from the end.
            if not 1 <= key <= column_count:
                raise IndexError(
                    f"column index {key} is out of range for a table with {column_count} columns"
                )
            new_names[key - 1] = value
        # Keep the name of the column index, as a label-based rename would.
        relabelled = pd.Index(new_names, name=df.columns.name)
        return df.set_axis(relabelled, axis="columns")
    except IndexError as e:
        logging.error(f"Invalid column index: {e}")
        raise
    except Exception as e:
        logging.error(f"Error renaming columns: {e}")
        raise


def main(input_dataset: pd.DataFrame, rename_dict: dict, output_dataset: Tuple[callable, str]):
    """
    Rename columns of the loaded table and write the result.

    Parameters:
    input_dataset (pd.DataFrame): The already loaded input table.
    rename_dict (dict): A dictionary with 1-based column index as key and new column name as value.
    output_dataset (tuple): A (writer, path) pair; the writer is called as writer(dataframe, path).

    Raises:
    Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        writer, destination = output_dataset
        renamed = rename_columns(input_dataset, rename_dict)
        writer(renamed, destination)
    except Exception as error:
        logging.error(f"Error in main function: {error}")
        raise


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Command-line interface. The custom actions come from utils; judging by how
    # main() consumes the parsed values they presumably load the table, parse the
    # key=value pairs and resolve the output writer (confirm against utils.py).
    cli = argparse.ArgumentParser(description="Rename columns in a dataframe.")
    cli.add_argument(
        "--input_dataset", nargs=2, action=LoadDataAction, required=True,
        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
    )
    cli.add_argument(
        "--rename", nargs="+", action=KeyValuePairsAction, required=True,
        help="List of key=value pairs with 1-based column index as key and new column name as value",
    )
    cli.add_argument(
        "--output_dataset", nargs=2, action=StoreOutputAction, required=True,
        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
    )

    options = cli.parse_args()
    main(
        input_dataset=options.input_dataset,
        rename_dict=options.rename,
        output_dataset=options.output_dataset,
    )
Loading

0 comments on commit d0ff40e

Please sign in to comment.