Change templated insertion code

Chemical118 · Aug 16, 2023 · f58ad33 · f58ad33
1 parent e8778e5
commit f58ad33
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,7 @@
 GDBr : Genome identification tool for Double-strand Break Repair
 ================
-<img src="logo/gdbr.svg" alt="GDBr logo" align="right" height="160" style="display: inline-block;"> GDBr (pronounced _Genome Debugger_) is tool that identify Double-strand Break Repair (DSBR) using genome and variant. GDBr goes through three processes to identify DSBR. First step is preprocess the genome using [`RagTag`](https://github.com/malonge/RagTag) and [`svim-asm`](https://github.com/eldariont/svim-asm) and make sure they have same chromosome name with reference. Second step is correcting the variant using [`BLAST`](https://blast.ncbi.nlm.nih.gov/Blast.cgi) and filtering the variant which have repeat bt [`TRF`](https://github.com/Benson-Genomics-Lab/TRF) and [`RepeatMasker`](https://github.com/rmhubley/RepeatMasker), then save a csv file. Last step is to segregate the corrected variants into the appropriate DSBRs. 
+
+<img src="logo/gdbr.svg" alt="GDBr logo" align="right" height="160" style="display: inline-block;"> GDBr (pronounced _Genome Debugger_) is tool that identify Double-strand Break Repair (DSBR) using genome and variant. GDBr goes through three processes to identify DSBR. First step is preprocess the genome using [`RagTag`](https://github.com/malonge/RagTag) and [`svim-asm`](https://github.com/eldariont/svim-asm) and make sure they have same chromosome name with reference. Second step is correcting the variant using [`BLAST`](https://blast.ncbi.nlm.nih.gov/Blast.cgi) and filtering the variant which have repeat bt [`TRF`](https://github.com/Benson-Genomics-Lab/TRF) and [`RepeatMasker`](https://github.com/rmhubley/RepeatMasker), then save a csv file. Last step is to segregate the corrected variants into the appropriate DSBRs.
 
 [![CI](https://github.com/Chemical118/GDBr/workflows/CI/badge.svg)](https://github.com/Chemical118/GDBr/actions?query=workflow%3ACI)
 [![codecov](https://codecov.io/gh/Chemical118/GDBr/branch/master/graph/badge.svg?token=NA5V5H52M6)](https://codecov.io/gh/Chemical118/GDBr)
@@ -9,40 +10,53 @@ GDBr : Genome identification tool for Double-strand Break Repair
 You need only reference sequence and query sequences file to use `GDBr`.
 
 ### Install
+
 We strongly recommend using `conda` package manager to install `GDBr`.
 However, `rmblast` used by `RepeatMasker` are not compatible with `blast` at conda environment. So you may remove `blast` to insatall `GDBr`.
+
 ```sh
 conda remove blast
 conda install -c bioconda -c chemical118 -c conda-forge gdbr
 ```
+
 ### Quick Start
+
 ```sh
 gdbr analysis -r <reference.fa> -q <query1.fa query2.fa ...> -s <species of data> -t <number of threads>
 ```
 
 ### Steps of GDBr
+
 The above command executes the following three processes simultaneously. If you want to redo some of the processes, you can manually run the command below.
+
 #### Preprocess
+
 By using `RagTag` and `svim-asm`, `GDBr` preprocess data and return properly scaffolded query `.fa` sequence file and variant `.vcf` file.
+
 ```sh
 gdbr preprocess -r <reference.fa> -q <query1.fa query2.fa ...> -o prepro -t <number of threads>
 ```
-> Preprocess step needs lots of memory, turn on `--low_memory` if you run out of memory
 
+> Preprocess step needs lots of memory, turn on `--low_memory` if you run out of memory
 
 #### Correct
+
 By using `BLAST`, `GDBr` correct the variant file to analysis DSBR accurately. And, filter the repeat by using `TRF`, `RepeatMasker`.
+
 ```sh
 gdbr correct -r <reference.fa> -q prepro/query/*.GDBr.preprocess.fa -v prepro/vcf/*.GDBr.preprocess.vcf -s <species of data> -o sv -t <number of threads>
 ```
 
 #### Analysis
+
 `GDBr` analysis the variant and identify DSBR mechanism.
+
 ```sh
 gdbr analysis -r <reference.fa> -q prepro/query/*.GDBr.preprocess.fa -v sv/*.GDBr.correct.csv -o dsbr -t <number of threads>
 ```
 
 ### Final output
+
 `GDBr`'s final ouput is `<query basename>.GDBr.result.tsv`. This is simple description of the final output.
 
 | Field             | Description                                          |
@@ -62,4 +76,4 @@ gdbr analysis -r <reference.fa> -q prepro/query/*.GDBr.preprocess.fa -v sv/*.GDB
 | DSBR_START        | different locus DSBR start                           |
 | DSBR_END          | different locus DSBR end                             |
 | HOM_SEQ/HOM_START_SEQ | INDEL : homology sequence / SUB : left homology sequence|
-| HOM_END_SEQ           | SUB : right homology sequence                              |
+| HOM_END_SEQ           | SUB : right homology sequence                              |
diff --git a/gdbr/annotate.py b/gdbr/annotate.py
@@ -420,7 +420,7 @@ def get_homology(sv_data, ref_loc, qry_loc, qryworkdir, hom_find_len=2000, temp_
 
         if ref_lef_hom > 0 or qry_lef_hom > 0:
             left_hom, right_hom, left_hom_seq, right_hom_seq = (ref_lef_hom, ref_rht_hom, ref_lef_hom_seq, ref_rht_hom_seq) if ref_lef_hom + ref_rht_hom >= qry_lef_hom + qry_rht_hom else (qry_lef_hom, qry_rht_hom, qry_lef_hom_seq, qry_rht_hom_seq)
-            dsb_repair_type = 'SUB_HOM_GT_SV_90' if max(left_hom, right_hom) > max(ref_len, qry_len) * 0.9 else 'SUB_HOM_DUP'
+            dsb_repair_type = 'TEMP_INS_GT_SV_90' if max(left_hom, right_hom) > max(ref_len, qry_len) * 0.9 else 'TEMP_INS'
 
         else:
             dsb_repair_type = 'SUB_NOT_SPECIFIED'
@@ -509,7 +509,7 @@ def annotate_main(ref_loc, qry_loc_list, sv_loc_list, hom_find_len=2000, diff_lo
 
         sv_list = get_sv_list(sv_loc)
         tar_sv_list = list(filter(lambda t: t[2] in {'DEL', 'INS', 'SUB'}, sv_list))
-
+        
         hom_list = p_map(partial(get_homology, ref_loc=ref_loc, qry_loc=qry_loc, qryworkdir=qryworkdir,
                                  hom_find_len=hom_find_len, temp_indel_find_len=temp_indel_find_len,
                                  near_gap_find_len=near_gap_find_len, user_gap_baseline=user_gap_baseline),
@@ -551,7 +551,7 @@ def annotate_main(ref_loc, qry_loc_list, sv_loc_list, hom_find_len=2000, diff_lo
 
             if dsb_repair_type == 'HOM':
                 indel_hom_cnt[hom[3]] += 1
-            elif dsb_repair_type == 'SUB_HOM_DUP':
+            elif dsb_repair_type == 'TEMP_INS':
                 temp_ins_hom_cnt[hom[3]] += 1
                 temp_ins_hom_cnt[hom[4]] += 1
             elif dsb_repair_type == 'DIFF_LOCUS_DSBR':

diff --git a/gdbr/utilities.py b/gdbr/utilities.py
@@ -622,14 +622,14 @@ def draw_result(savedir, pre_type_cnt, cor_type_cnt, del_type_cnt, ins_type_cnt,
                         ('REPEAT', 7),
                         ('EXCEPT', 6),
                         ('HOM', 0),
-                        ('HOM_GT_SV_90', 1),
-                        ('NO_HOM', 6),
-                        ('SUB_HOM_DUP', 0),
-                        ('SUB_HOM_GT_SV_90', 1),
+                        ('HOM_GT_SV_90', 6),
+                        ('NO_HOM', 1),
+                        ('TEMP_INS', 0),
+                        ('TEMP_INS_GT_SV_90', 6),
                         ('DIFF_LOCUS_DSBR', 2),
                         ('SUB_UNIQUE_NO_HOM', 3),
                         ('SUB_REPEAT', 8),
-                        ('SUB_NOT_SPECIFIED', 7),
+                        ('SUB_NOT_SPECIFIED', 1),
                         ('REPEAT:TRF_FIRST', 7),
                         ('REPEAT:BLAST', 5),
                         ('REPEAT:TRF_LAST', 4),
@@ -746,7 +746,7 @@ def draw_result(savedir, pre_type_cnt, cor_type_cnt, del_type_cnt, ins_type_cnt,
 
     # Substitution classification
     cnt = sub_type_cnt
-    target_list = ['SUB_HOM_DUP', 'SUB_HOM_GT_SV_90', 'SUB_UNIQUE_NO_HOM', 'DIFF_LOCUS_DSBR', 'SUB_REPEAT', 'SUB_NOT_SPECIFIED'] if diff_locus_dsbr_analysis else ['SUB_HOM_DUP', 'SUB_HOM_GT_SV_90', 'SUB_NOT_SPECIFIED']
+    target_list = ['TEMP_INS', 'DIFF_LOCUS_DSBR', 'SUB_UNIQUE_NO_HOM', 'SUB_NOT_SPECIFIED', 'SUB_REPEAT', 'TEMP_INS_GT_SV_90'] if diff_locus_dsbr_analysis else ['TEMP_INS', 'SUB_NOT_SPECIFIED', 'TEMP_INS_GT_SV_90']
     target_data = [(k, cnt[k] / query_num if k in cnt else 0) for k in target_list]
     fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6.4, 9.6))
     pd.DataFrame(dict(target_data), index=['']).plot.barh(color=code_palette_dict, stacked=True, ax=ax1, width=width)
@@ -841,11 +841,11 @@ def draw_result(savedir, pre_type_cnt, cor_type_cnt, del_type_cnt, ins_type_cnt,
     # Draw chromosome distribution
     merge_bed_df.columns = ['chrom', 'st', 'nd', 'gdbr_type', 'hom_l', 'hom_r', 'merge_id']
     merge_bed_df['sv_loc'] = [round(i) for i in (merge_bed_df['st'] + merge_bed_df['nd']) / 2]
-    merge_bed_df['Homology type'] = ['Homology' if i in {'HOM', 'SUB_HOM_DUP'} else 'No homology' for i in merge_bed_df['gdbr_type']]
+    merge_bed_df['Homology type'] = ['Homology' if i in {'HOM', 'TEMP_INS'} else 'No homology' for i in merge_bed_df['gdbr_type']]
     if a_ej_baseline is not None:
         dsb_repair_type_list = []
         for gt, h in zip(merge_bed_df['gdbr_type'], merge_bed_df['hom_l']):
-            if gt == 'SUB_HOM_DUP':
+            if gt == 'TEMP_INS':
                 dsb_repair_type_list.append('a-EJ')
             elif gt == 'HOM':
                 if h <= a_ej_baseline:

diff --git a/gdbr/version.py b/gdbr/version.py
@@ -1,2 +1,2 @@
 def get_version():
-    return '0.4.1'
+    return '0.5.0'