From c29b9570437c83426198d8c679557e623eea8100 Mon Sep 17 00:00:00 2001
From: iquasere <maildosequeira@gmail.com>
Date: Wed, 20 Dec 2023 11:30:01 +0000
Subject: [PATCH] Added test for "-iq" and "-it" parameters Added test for
 "--include-missing-genomes" and "--map-all" parameters Pretty json for
 differential maps too Updated documentation for additional outputs
 information

---
 .github/workflows/main.yml | 49 +++++++++++++++++++++++++++++++++++++-
 README.md                  | 40 ++++++++++++++++++++-----------
 keggpathway_map.py         |  5 ++--
 3 files changed, 76 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0ced04b..85485bd 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -43,7 +43,6 @@ jobs:
       - name: Run KEGGCharter
         run: docker run keggcharter /bin/bash -c "keggcharter --show-available-maps"
 
-
   run-keggcharter:
     runs-on: ubuntu-latest
     needs: build
@@ -59,3 +58,51 @@ jobs:
           docker image ls -a
       - name: Run KEGGCharter
         run: docker run keggcharter /bin/bash -c "keggcharter -f KEGGCharter/cicd/keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -qcol MP1,MP2,MP3,MP4 -tc 'Taxonomic lineage (SPECIES)' -mm 00680"
+
+  input-quantification-and-taxonomy:  # CI test for the "-iq" (mock quantification) and "-it" (single input taxon) parameters
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Download artifact
+        uses: actions/download-artifact@v2
+        with:
+          name: keggcharter
+          path: /tmp  # the next step loads the image from /tmp/keggcharter.tar
+      - name: Load Docker image
+        run: |
+          docker load --input /tmp/keggcharter.tar
+          docker image ls -a
+      - name: Run KEGGCharter  # the taxon name is single-quoted: nested double quotes would end the bash -c "..." string early and drop "-mm 00680"
+        run: docker run keggcharter /bin/bash -c "keggcharter -f KEGGCharter/cicd/keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -iq -it 'My community' -mm 00680"
+
+  include-missing-genomes:  # CI test for the "--include-missing-genomes" parameter
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Download artifact
+        uses: actions/download-artifact@v2
+        with:
+          name: keggcharter
+          path: /tmp  # the next step loads the image from /tmp/keggcharter.tar
+      - name: Load Docker image
+        run: |
+          docker load --input /tmp/keggcharter.tar
+          docker image ls -a
+      - name: Run KEGGCharter  # same command as the run-keggcharter job, plus --include-missing-genomes
+        run: docker run keggcharter /bin/bash -c "keggcharter -f KEGGCharter/cicd/keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -qcol MP1,MP2,MP3,MP4 -tc 'Taxonomic lineage (SPECIES)' -mm 00680 --include-missing-genomes"
+
+  map-all:  # CI test for the "--map-all" parameter
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Download artifact
+        uses: actions/download-artifact@v2
+        with:
+          name: keggcharter
+          path: /tmp  # the next step loads the image from /tmp/keggcharter.tar
+      - name: Load Docker image
+        run: |
+          docker load --input /tmp/keggcharter.tar
+          docker image ls -a
+      - name: Run KEGGCharter  # same command as the run-keggcharter job, plus --map-all
+        run: docker run keggcharter /bin/bash -c "keggcharter -f KEGGCharter/cicd/keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -qcol MP1,MP2,MP3,MP4 -tc 'Taxonomic lineage (SPECIES)' -mm 00680 --map-all"
diff --git a/README.md b/README.md
index d344e1e..ce69e0c 100644
--- a/README.md
+++ b/README.md
@@ -32,10 +32,20 @@ To run KEGGCharter, an input file must be supplied. This file only needs to cont
 
 An example input file is available [here](https://github.com/iquasere/KEGGCharter/blob/master/cicd/keggcharter_input.tsv). 
 It contains all fields referenced above, and should be used as guidance for building inputs for KEGGCharter.
-To obtain metabolic representations for "Methane Metabolism" and "Fatty Acid Degradation" with KEGGCharter, for this input file, KEGGCharter can be run with the following command:
+The following command will obtain metabolic representations for "Methane Metabolism" (KEGG map00680) with KEGGCharter:
 ```
-keggcharter -f keggcharter_input.tsv -o test_keggcharter -qcol mt_0.01a,mt_1a,mt_100a,mt_0.01b,mt_1b,mt_100b,mt_0.01c,mt_1c,mt_100c -keggc "KEGG ID" -tc "Species" -mm 00680,00071
+keggcharter -f keggcharter_input.tsv -rd resources_directory -keggc 'KEGG' -koc 'KO' -ecc 'EC number' -cogc 'COG ID' -iq -it "My community" -mm 00680 -o first_time_running_KC
 ```
+Once the run finishes, the `first_time_running_KC` folder should contain:
+* additional information concerning your data in the file `KEGGCharter_results.tsv`
+* maps in PNG format inside a `maps` folder
+* JSONs with the information painted on the maps inside a `json` folder 
+
+Additionally, you should have the `data_for_charting.tsv` and `taxon_to_mmap_to_orthologs` files. These exist so that KEGGCharter can be run again for other maps: run the same command as before with the additional `--resume` parameter, and KEGGCharter will look for those files. New maps can then be generated by changing the `--metabolic-maps` parameter, with no need to retrieve KOs and EC numbers again!
+
+### What maps are available?
+
+You can see what maps are available for the `--metabolic-maps` parameter by running `keggcharter --show-available-maps`.
 
 ### First time KEGGCharter runs it will take a long time
 
@@ -63,18 +73,16 @@ Fig. 2 - KEGG metabolic map of methane metabolism, with differential analysis of
 
 KEGGCharter provides several options for customizing its workflow.
 ```
-options:
   -h, --help            show this help message and exit
+  -f FILE, --file FILE  TSV or EXCEL table with information to chart
   -o OUTPUT, --output OUTPUT
                         Output directory
   -rd RESOURCES_DIRECTORY, --resources-directory RESOURCES_DIRECTORY
                         Directory for storing KGML and CSV files.
   -mm METABOLIC_MAPS, --metabolic-maps METABOLIC_MAPS
                         IDs of metabolic maps to output
-  -gcol GENOMIC_COLUMNS, --genomic-columns GENOMIC_COLUMNS
-                        Names of columns with genomic identification
-  -tcol TRANSCRIPTOMIC_COLUMNS, --transcriptomic-columns TRANSCRIPTOMIC_COLUMNS
-                        Names of columns with transcriptomics quantification
+  -qcol QUANTIFICATION_COLUMNS, --quantification-columns QUANTIFICATION_COLUMNS
+                        Names of columns with quantification
   -tls TAXA_LIST, --taxa-list TAXA_LIST
                         List of taxa to represent in genomic potential charts (comma separated)
   -not NUMBER_OF_TAXA, --number-of-taxa NUMBER_OF_TAXA
@@ -85,19 +93,23 @@ options:
                         Column with KOs.
   -ecc EC_COLUMN, --ec-column EC_COLUMN
                         Column with EC numbers.
+  -cogc COG_COLUMN, --cog-column COG_COLUMN
+                        Column with COG IDs.
+  -tc TAXA_COLUMN, --taxa-column TAXA_COLUMN
+                        Column with the taxa designations to represent with KEGGCharter. NOTE - for valid taxonomies, check: https://www.genome.jp/kegg/catalog/org_list.html
   -iq, --input-quantification
                         If input table has no quantification, will create a mock quantification column
   -it INPUT_TAXONOMY, --input-taxonomy INPUT_TAXONOMY
                         If no taxonomy column exists and there is only one taxon in question.
-  -tc TAXA_COLUMN, --taxa-column TAXA_COLUMN
-                        Column with the taxa designations to represent with KEGGCharter
+  -t THREADS, --threads THREADS
+                        Number of threads to run KEGGCharter with [max available]
+  --step STEP           Number of IDs to submit per request through the KEGG API [40]
+  --map-all             Ignore KEGG taxonomic information. All functions for all KOs will be represented, even if they aren't attributed by KEGG to the specific species.
+  --include-missing-genomes
+                        Map the functions for KOs identified for organisms not present in KEGG Genomes.
   --resume              If data inputed has already been analyzed by KEGGCharter.
-  --step STEP           Number of IDs to submit per request through the KEGG API.
   -v, --version         show program's version number and exit
 
-required named arguments:
-  -f FILE, --file FILE  TSV or EXCEL table with information to chart
-
 Special functions:
   --show-available-maps
                         Outputs KEGG maps IDs and descriptions to the console (so you may pick the ones you want!)
@@ -106,7 +118,7 @@ Special functions:
 ### Mock imputation of quantification and taxonomy
 
 Sometimes, not all information required for KEGGCharter will be available. 
-In this cases, KEGGCharter may use mock imputations of quantification and/or taxonomy.
+In these cases, KEGGCharter may use mock imputations of quantification and/or taxonomy.
 
 To input mock quantification, run with the ```--input-quantification``` parameter. This will attribute a quantification 
 of 1 to every protein in the input dataset.
diff --git a/keggpathway_map.py b/keggpathway_map.py
index 57cb0e9..5e898ca 100644
--- a/keggpathway_map.py
+++ b/keggpathway_map.py
@@ -418,7 +418,7 @@ def genomic_potential_taxa(
         name = self.name.split(':')[-1]
         # Write JSON data to a file
         with open(f'{output}/json/potential_{name}.json', 'w') as file:
-            json.dump(box2taxon, file, indent=4)
+            json.dump(box2taxon, file, indent=2)
         self.pathway_box_list(box2taxon, dic_colors)  # for every box with KOs identified from the most abundant taxa, sub-boxes are created with colours of the corresponding taxa
         self.to_pdf(f'{output}/maps/potential_{name}.pdf')
         self.create_potential_legend(
@@ -461,10 +461,9 @@ def differential_expression_sample(
         box2colors = self.pathway_boxes_differential(df)
 
         name = self.name.split(':')[-1]
-        json_data = json.dumps(box2colors)
         # Write JSON data to a file
         with open(f'{output}/json/differential_{name}.json', 'w') as file:
-            file.write(json_data)
+            json.dump(box2colors, file, indent=2)
         self.to_pdf(f'{output}/maps/differential_{name}.pdf')
 
         self.differential_colorbar(df, f'{output}/maps/differential_{name}_legend.png')