From c29b9570437c83426198d8c679557e623eea8100 Mon Sep 17 00:00:00 2001 From: iquasere Date: Wed, 20 Dec 2023 11:30:01 +0000 Subject: [PATCH] Added test for "-iq" and "-it" parameters Added test for "--include-missing-genomes" and "--map-all" parameters Pretty json for differential maps too Updated documentation for additional outputs information --- .github/workflows/main.yml | 49 +++++++++++++++++++++++++++++++++++++- README.md | 40 ++++++++++++++++++++----------- keggpathway_map.py | 5 ++-- 3 files changed, 76 insertions(+), 18 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 0ced04b..85485bd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -43,7 +43,6 @@ jobs: - name: Run KEGGCharter run: docker run keggcharter /bin/bash -c "keggcharter --show-available-maps" - run-keggcharter: runs-on: ubuntu-latest needs: build @@ -59,3 +58,51 @@ jobs: docker image ls -a - name: Run KEGGCharter run: docker run keggcharter /bin/bash -c "keggcharter -f KEGGCharter/cicd/keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -qcol MP1,MP2,MP3,MP4 -tc 'Taxonomic lineage (SPECIES)' -mm 00680" + + input-quantification-and-taxonomy: + runs-on: ubuntu-latest + needs: build + steps: + - name: Download artifact + uses: actions/download-artifact@v2 + with: + name: keggcharter + path: /tmp + - name: Load Docker image + run: | + docker load --input /tmp/keggcharter.tar + docker image ls -a + - name: Run KEGGCharter + run: docker run keggcharter /bin/bash -c "keggcharter -f KEGGCharter/cicd/keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -iq -it 'My community' -mm 00680" + + include-missing-genomes: + runs-on: ubuntu-latest + needs: build + steps: + - name: Download artifact + uses: actions/download-artifact@v2 + with: + name: keggcharter + path: /tmp + - name: Load Docker image + run: | + docker load --input 
/tmp/keggcharter.tar + docker image ls -a + - name: Run KEGGCharter + run: docker run keggcharter /bin/bash -c "keggcharter -f KEGGCharter/cicd/keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -qcol MP1,MP2,MP3,MP4 -tc 'Taxonomic lineage (SPECIES)' -mm 00680 --include-missing-genomes" + + map-all: + runs-on: ubuntu-latest + needs: build + steps: + - name: Download artifact + uses: actions/download-artifact@v2 + with: + name: keggcharter + path: /tmp + - name: Load Docker image + run: | + docker load --input /tmp/keggcharter.tar + docker image ls -a + - name: Run KEGGCharter + run: docker run keggcharter /bin/bash -c "keggcharter -f KEGGCharter/cicd/keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -qcol MP1,MP2,MP3,MP4 -tc 'Taxonomic lineage (SPECIES)' -mm 00680 --map-all" diff --git a/README.md b/README.md index d344e1e..ce69e0c 100644 --- a/README.md +++ b/README.md @@ -32,10 +32,20 @@ To run KEGGCharter, an input file must be supplied. This file only needs to cont An example input file is available [here](https://github.com/iquasere/KEGGCharter/blob/master/cicd/keggcharter_input.tsv). It contains all fields referenced above, and should be used as guidance for building inputs for KEGGCharter. 
-To obtain metabolic representations for "Methane Metabolism" and "Fatty Acid Degradation" with KEGGCharter, for this input file, KEGGCharter can be run with the following command: +The following command will obtain metabolic representations for "Methane Metabolism" (KEGG map00680) with KEGGCharter: ``` -keggcharter -f keggcharter_input.tsv -o test_keggcharter -qcol mt_0.01a,mt_1a,mt_100a,mt_0.01b,mt_1b,mt_100b,mt_0.01c,mt_1c,mt_100c -keggc "KEGG ID" -tc "Species" -mm 00680,00071 +keggcharter -f keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -iq -it "My community" -mm 00680 -o first_time_running_KC ``` +After it is over, you should have, inside the `first_time_running_KC` folder: +* additional information concerning your data in the file `KEGGCharter_results.tsv` +* maps in PNG format inside a `maps` folder +* JSONs with the information painted on the maps inside a `json` folder + +Additionally, you should have the `data_for_charting.tsv` and `taxon_to_mmap_to_orthologs` files. These are there so KEGGCharter can be run again, for other maps, by running the same command as before, but with the additional parameter `--resume`. With this parameter, KEGGCharter will look for those files, and new maps can be generated by changing the `--metabolic-maps` parameter. No need for repeated KOs and EC numbers retrieval! + +### What maps are available? + +You can see what maps are available for the `--metabolic-maps` parameter by running `keggcharter --show-available-maps`. ### First time KEGGCharter runs it will take a long time @@ -63,18 +73,16 @@ Fig. 2 - KEGG metabolic map of methane metabolism, with differential analysis of KEGGCharter provides several options for customizing its workflow. 
``` -options: -h, --help show this help message and exit + -f FILE, --file FILE TSV or EXCEL table with information to chart -o OUTPUT, --output OUTPUT Output directory -rd RESOURCES_DIRECTORY, --resources-directory RESOURCES_DIRECTORY Directory for storing KGML and CSV files. -mm METABOLIC_MAPS, --metabolic-maps METABOLIC_MAPS IDs of metabolic maps to output - -gcol GENOMIC_COLUMNS, --genomic-columns GENOMIC_COLUMNS - Names of columns with genomic identification - -tcol TRANSCRIPTOMIC_COLUMNS, --transcriptomic-columns TRANSCRIPTOMIC_COLUMNS - Names of columns with transcriptomics quantification + -qcol QUANTIFICATION_COLUMNS, --quantification-columns QUANTIFICATION_COLUMNS + Names of columns with quantification -tls TAXA_LIST, --taxa-list TAXA_LIST List of taxa to represent in genomic potential charts (comma separated) -not NUMBER_OF_TAXA, --number-of-taxa NUMBER_OF_TAXA @@ -85,19 +93,23 @@ options: Column with KOs. -ecc EC_COLUMN, --ec-column EC_COLUMN Column with EC numbers. + -cogc COG_COLUMN, --cog-column COG_COLUMN + Column with COG IDs. + -tc TAXA_COLUMN, --taxa-column TAXA_COLUMN + Column with the taxa designations to represent with KEGGCharter. NOTE - for valid taxonomies, check: https://www.genome.jp/kegg/catalog/org_list.html -iq, --input-quantification If input table has no quantification, will create a mock quantification column -it INPUT_TAXONOMY, --input-taxonomy INPUT_TAXONOMY If no taxonomy column exists and there is only one taxon in question. - -tc TAXA_COLUMN, --taxa-column TAXA_COLUMN - Column with the taxa designations to represent with KEGGCharter + -t THREADS, --threads THREADS + Number of threads to run KEGGCharter with [max available] + --step STEP Number of IDs to submit per request through the KEGG API [40] + --map-all Ignore KEGG taxonomic information. All functions for all KOs will be represented, even if they aren't attributed by KEGG to the specific species. 
+ --include-missing-genomes + Map the functions for KOs identified for organisms not present in KEGG Genomes. --resume If data inputed has already been analyzed by KEGGCharter. - --step STEP Number of IDs to submit per request through the KEGG API. -v, --version show program's version number and exit -required named arguments: - -f FILE, --file FILE TSV or EXCEL table with information to chart - Special functions: --show-available-maps Outputs KEGG maps IDs and descriptions to the console (so you may pick the ones you want!) @@ -106,7 +118,7 @@ Special functions: ### Mock imputation of quantification and taxonomy Sometimes, not all information required for KEGGCharter will be available. -In this cases, KEGGCharter may use mock imputations of quantification and/or taxonomy. +In these cases, KEGGCharter may use mock imputations of quantification and/or taxonomy. To input mock quantification, run with the ```--input-quantification``` parameter. This will attribute a quantification of 1 to every protein in the input dataset. 
diff --git a/keggpathway_map.py b/keggpathway_map.py index 57cb0e9..5e898ca 100644 --- a/keggpathway_map.py +++ b/keggpathway_map.py @@ -418,7 +418,7 @@ def genomic_potential_taxa( name = self.name.split(':')[-1] # Write JSON data to a file with open(f'{output}/json/potential_{name}.json', 'w') as file: - json.dump(box2taxon, file, indent=4) + json.dump(box2taxon, file, indent=2) self.pathway_box_list(box2taxon, dic_colors) # for every box with KOs identified from the most abundant taxa, sub-boxes are created with colours of the corresponding taxa self.to_pdf(f'{output}/maps/potential_{name}.pdf') self.create_potential_legend( @@ -461,10 +461,9 @@ def differential_expression_sample( box2colors = self.pathway_boxes_differential(df) name = self.name.split(':')[-1] - json_data = json.dumps(box2colors) # Write JSON data to a file with open(f'{output}/json/differential_{name}.json', 'w') as file: - file.write(json_data) + json.dump(box2colors, file, indent=2) self.to_pdf(f'{output}/maps/differential_{name}.pdf') self.differential_colorbar(df, f'{output}/maps/differential_{name}_legend.png')