Merge pull request #610 from hechth/hechth/issue608

Adding new generation of table manipulation tools
RECETOX · Jan 29, 2025 · d0ff40e · d0ff40e
2 parents 8a94b99 + 319f8be
commit d0ff40e
Show file tree

Hide file tree

Showing 21 changed files with 2,350 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
 .vscode
 **/tool_test_output.html
 **/tool_test_output.json
+**/tmp*
+**/__pycache__
diff --git a/tools/tables/.shed.yml b/tools/tables/.shed.yml
@@ -0,0 +1,16 @@
+name: tables
+owner: recetox
+remote_repository_url: "https://github.com/RECETOX/galaxytools/tree/master/tools/tables"
+homepage_url: "https://github.com/RECETOX/galaxytools"
+categories:
+  - Metabolomics
+  - Statistics
+description: "Tools to manipulate and analyze data tables."
+long_description: "Tools to manipulate and analyze data tables. Current tools include interpolation using scipy and arithmetic operations on tables with pandas."
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "{{ tool_name }} tool from the general purpose data analysis suite developed by RECETOX."
+suite:
+  name: suite_table_tools
+  description: This tool suite contains tools for general purpose data analysis built on top of pandas, scipy, dplyr and others.
+  type: repository_suite_definition
diff --git a/tools/tables/macros.xml b/tools/tables/macros.xml
@@ -0,0 +1,54 @@
+<macros>
+    <token name="@PANDAS_VERSION@">2.2.3</token>
+    <token name="@SCIPY_VERSION@">1.14.1</token>
+
+    <!-- Package requirements for the pandas-based table tools: pandas at the version given by the
+         @PANDAS_VERSION@ token, plus pyarrow 18.0.0 (presumably needed for Parquet input/output;
+         confirm against utils.py). -->
+    <xml name="requirement_pandas_pyarrow">
+        <requirement type="package" version="@PANDAS_VERSION@">pandas</requirement>
+        <requirement type="package" version="18.0.0">pyarrow</requirement>
+    </xml>
+
+    <!-- Creator metadata (two persons and the RECETOX MUNI organization) for tools that expand
+         this macro. -->
+    <xml name="creator">
+        <creator>
+            <person
+                givenName="Kristina"
+                familyName="Gomoryova"
+                url="https://github.com/KristinaGomoryova"
+                identifier="0000-0003-4407-3917" />
+            <person
+                givenName="Helge"
+                familyName="Hecht"
+                url="https://github.com/hechth"
+                identifier="0000-0001-6744-996X" />
+            <organization
+                url="https://www.recetox.muni.cz/"
+                email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"
+                name="RECETOX MUNI" />
+        </creator>
+    </xml>
+
+    <!-- Reusable sanitizer and validator set for text parameters that hold a regular expression.
+         The list of valid characters starts from ASCII letters and digits and adds the regex
+         metacharacters listed below. The validators reject an empty field and any pattern
+         that ends with a backslash. -->
+    <xml name="regex_sanitizer">
+        <sanitizer>
+            <valid initial="string.ascii_letters,string.digits">
+                <add value="^"/>
+                <add value="$"/>
+                <add value="("/>
+                <add value=")"/>
+                <add value="|"/>
+                <add value="?"/>
+                <add value="*"/>
+                <add value="+"/>
+                <add value="{"/>
+                <add value="}"/>
+                <add value="\"/>
+                <add value="["/>
+                <add value="]"/>
+                <add value="."/>
+                <add value=","/>
+                <add value="_"/>
+                <add value="-"/>
+            </valid>
+        </sanitizer>
+        <validator type="empty_field" />
+        <validator type="regex" message="Pattern must not end with backslash.">.*[^\\]$</validator>
+    </xml>
+</macros>
diff --git a/tools/tables/table_pandas_arithmetics.py b/tools/tables/table_pandas_arithmetics.py
@@ -0,0 +1,106 @@
+import argparse
+import logging
+from typing import List, Tuple
+
+
+import numpy as np
+import pandas as pd
+from utils import LoadDataAction, SplitColumnIndicesAction, StoreOutputAction
+
+
+# Dispatch table mapping each operation keyword to the NumPy function that implements it.
+# The keys are also used as the allowed values of the --operation command line argument.
+OPERATIONS = {
+    "mul": np.multiply,
+    "sub": np.subtract,
+    "div": np.divide,
+    "add": np.add,
+    "pow": np.power,
+}
+
+
+def perform_operation(df: pd.DataFrame, column_indices: List[int], operation: str, operand: float):
+    """
+    Perform the specified arithmetic operation on the given columns of the dataframe.
+
+    Parameters:
+    df (pd.DataFrame): The input dataframe.
+    column_indices (list): The 0-based indices of the columns to perform the operation on.
+    operation (str): The arithmetic operation to perform.
+    operand (float): The operand for the arithmetic operation.
+
+    Returns:
+    pd.DataFrame: The dataframe with the operation applied.
+    """
+    for column_index in column_indices:
+        column_name = df.columns[column_index]
+        df[column_name] = OPERATIONS[operation](df[column_name], operand)
+    return df
+
+
+def main(input_dataset: pd.DataFrame, column_indices: List[int], operation: str, operand: float, output_dataset: Tuple[callable, str]):
+    """
+    Main function to load the dataset, perform the operation, and save the result.
+
+    Parameters:
+    input_dataset (tuple): The input dataset and its file extension.
+    column_indices (list): The 0-based indices of the columns to perform the operation on.
+    operation (str): The arithmetic operation to perform.
+    operand (float): The operand for the arithmetic operation.
+    output_dataset (tuple): The output dataset and its file extension.
+    """
+    try:
+        df = perform_operation(input_dataset, column_indices, operation, operand)
+        write_func, file_path = output_dataset
+        write_func(df, file_path)
+    except Exception as e:
+        logging.error(f"Error in main function: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    parser = argparse.ArgumentParser(
+        description="Perform arithmetic operations on dataframe columns."
+    )
+    parser.add_argument(
+        "--input_dataset",
+        nargs=2,
+        action=LoadDataAction,
+        required=True,
+        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
+    )
+    parser.add_argument(
+        "--columns",
+        action=SplitColumnIndicesAction,
+        required=True,
+        help="Comma-separated list of 1-based indices of the columns to perform the operation on",
+    )
+    parser.add_argument(
+        "--operation",
+        type=str,
+        choices=OPERATIONS.keys(),
+        required=True,
+        help="Arithmetic operation to perform",
+    )
+    parser.add_argument(
+        "--operand",
+        type=float,
+        required=True,
+        help="Operand for the arithmetic operation",
+    )
+    parser.add_argument(
+        "--output_dataset",
+        nargs=2,
+        action=StoreOutputAction,
+        required=True,
+        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
+    )
+
+    args = parser.parse_args()
+    main(
+        args.input_dataset,
+        args.columns,
+        args.operation,
+        args.operand,
+        args.output_dataset,
+    )
diff --git a/tools/tables/table_pandas_arithmetics.xml b/tools/tables/table_pandas_arithmetics.xml
@@ -0,0 +1,72 @@
+<tool id="table_pandas_arithmetics" name="table arithmetics" version="@PANDAS_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description>perform arithmetic operations on a dataframe column</description>
+    <macros>
+        <import>macros.xml</import>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <requirements>
+        <expand macro="requirement_pandas_pyarrow"/>
+    </requirements>
+    <required_files>
+        <include path="table_pandas_arithmetics.py" />
+        <include path="utils.py" />
+    </required_files>
+    <expand macro="creator" />
+    <!-- Run the arithmetic script on the selected column. The column option is written with its
+         full name "columns", as declared by the argument parser in table_pandas_arithmetics.py,
+         instead of relying on argparse accepting the unambiguous prefix "column". -->
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '$__tool_directory__/table_pandas_arithmetics.py'
+            --input_dataset '$input_dataset' '$input_dataset.ext'
+            --columns '$column'
+            --operation '$operation'
+            --operand '$operand'
+            --output_dataset '$output_dataset' '$output_dataset.ext'
+    ]]></command>
+    <inputs>
+        <param name="input_dataset" type="data" format="csv,tsv,tabular,parquet" label="Input Dataset" help="The input dataset in CSV, TSV, tabular, or Parquet format."/>
+        <param name="column" type="data_column" data_ref="input_dataset" use_header_names="true" label="Column" help="The column from the dataset to perform the arithmetic operation on."/>
+        <param name="operation" type="select" label="Arithmetic Operation" help="The arithmetic operation to perform on the selected column. Choose from Multiply, Subtract, Divide, Add, or Power.">
+            <option value="mul">Multiply</option>
+            <option value="sub">Subtract</option>
+            <option value="div">Divide</option>
+            <option value="add">Add</option>
+            <option value="pow">Power</option>
+        </param>
+        <param name="operand" type="float" label="Operand" help="The operand value to use in the arithmetic operation. This value will be applied to each element in the selected column."/>
+    </inputs>
+    <outputs>
+        <data name="output_dataset" format_source="input_dataset" label="${tool.name} on ${on_string}">
+            <change_format>
+                <when input="input_dataset.ext" value="tsv" format="tabular" />
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_dataset" value="query.tabular" ftype="tabular"/>
+            <param name="column" value="3"/>
+            <param name="operation" value="div"/>
+            <param name="operand" value="100"/>
+            <output name="output_dataset" file="arithmetics/query_divide_ri.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+This tool performs arithmetic operations on a specified column of a dataframe.
+Supported operations are: multiply, subtract, divide, add, and power.
+
+Inputs
+------
+
+- **Input Dataset**: The input dataset in CSV, TSV, tabular, or Parquet format.
+- **Column**: The column from the dataset to perform the arithmetic operation on. Select the column by its header name.
+- **Arithmetic Operation**: The arithmetic operation to perform on the selected column. Choose from Multiply, Subtract, Divide, Add, or Power.
+- **Operand**: The operand value to use in the arithmetic operation. This value will be applied to each element in the selected column.
+
+Outputs
+-------
+
+- **Output Dataset**: The output dataset with the arithmetic operation applied to the specified column.
+        ]]></help>
+    <citations>
+        <citation type="doi">10.5281/zenodo.3509134</citation>
+        <citation type="doi">10.25080/Majora-92bf1922-00a</citation>
+    </citations>
+</tool>
diff --git a/tools/tables/table_pandas_rename_column.py b/tools/tables/table_pandas_rename_column.py
@@ -0,0 +1,76 @@
+import argparse
+import logging
+from typing import Tuple
+
+import pandas as pd
+from utils import KeyValuePairsAction, LoadDataAction, StoreOutputAction
+
+
+def rename_columns(df: pd.DataFrame, rename_dict: dict):
+    """
+    Rename columns in the dataframe based on the provided dictionary.
+
+    Parameters:
+    df (pd.DataFrame): The input dataframe.
+    rename_dict (dict): A dictionary with 1-based column index as key and new column name as value.
+
+    Returns:
+    pd.DataFrame: The dataframe with renamed columns.
+    """
+    try:
+        rename_map = {
+            df.columns[key - 1]: value for key, value in rename_dict.items()
+        }  # Convert 1-based index to column name
+        return df.rename(columns=rename_map)
+    except IndexError as e:
+        logging.error(f"Invalid column index: {e}")
+        raise
+    except Exception as e:
+        logging.error(f"Error renaming columns: {e}")
+        raise
+
+
+def main(input_dataset: pd.DataFrame, rename_dict: dict, output_dataset: Tuple[callable, str]):
+    """
+    Main function to load the dataset, rename columns, and save the result.
+
+    Parameters:
+    input_dataset (pd.DataFrame): The input dataset .
+    rename_dict (dict): A dictionary with 1-based column index as key and new column name as value.
+    output_dataset (tuple): The function to store the output dataset and the path.
+    """
+    try:
+        write_func, file_path = output_dataset
+        write_func(rename_columns(input_dataset, rename_dict), file_path)
+    except Exception as e:
+        logging.error(f"Error in main function: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    parser = argparse.ArgumentParser(description="Rename columns in a dataframe.")
+    parser.add_argument(
+        "--input_dataset",
+        nargs=2,
+        action=LoadDataAction,
+        required=True,
+        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
+    )
+    parser.add_argument(
+        "--rename",
+        nargs="+",
+        action=KeyValuePairsAction,
+        required=True,
+        help="List of key=value pairs with 1-based column index as key and new column name as value",
+    )
+    parser.add_argument(
+        "--output_dataset",
+        nargs=2,
+        action=StoreOutputAction,
+        required=True,
+        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
+    )
+
+    args = parser.parse_args()
+    main(args.input_dataset, args.rename, args.output_dataset)