conf_for_big_genome.txt

## Set the detailed parameters of the modules used in the GETA pipeline through this config file.
# The parameters set below override the default thresholds.
# The template conf.txt in the GETA software holds the default values.
# Annotated lines starting with # are ignored. Blank lines are ignored.

[RepeatMasker]
-e ncbi -gff
# During the RepeatMasker step: -species is taken from --RM_species; -pa is taken from --CPU.

[trimmomatic]
TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50 TOPHRED33
# Note: this parameter line must start with the adapter file name.

[hisat2-build]
-p 1
# Note: the multi-threading parameter -p is set to 1 by default; unlike the other steps, it is not taken from --CPU.

[hisat2]
--min-intronlen 20 --max-intronlen 20000 --dta --score-min L,0.0,-0.4
# During the hisat2 step: -x is set to "genome"; -p is taken from --CPU.

[sam2transfrag]
--fraction 0.05 --min_expressed_base_depth 2 --max_expressed_base_depth 50 --min_junction_depth 2 --max_junction_depth 50 --min_fragment_count_per_transfrags 10 --min_intron_length 20
# During the sam2transfrag step: --no_strand_specific is derived, inverted, from --strand_specific.
# If the RNA-Seq reads contain single-end data, --no_PE_sequencing should be added. Otherwise, the single-end alignments will be ignored.
# The default values mean: the dynamic coverage threshold of each mapped region (taken from the SAM file) is determined by the maximum base depth of that region * 0.05, and this threshold is kept between 2 and 50.

[TransDecoder.LongOrfs]
-m 100 -G universal
# Note: the strand-specific parameter -S is set automatically if the transcripts have introns that determine their direction.

[TransDecoder.Predict]
--retain_long_orfs_mode dynamic
# Note: when predicting the ORFs of single-exon transfrags, the parameter --train is set automatically to the result obtained from multi-exon transfrags.

[homolog_genewise]
--coverage_ratio 0.4 --evalue 1e-9
# During the homolog_genewise step: --cpu is taken from --CPU; --max_gene_length, --segmentSize and --overlapSize are calculated automatically.

[homolog_genewiseGFF2GFF3]
--min_score 15 --gene_prefix genewise --filterMiddleStopCodon
# During the homolog_genewiseGFF2GFF3 step: --input_genewise_start_info is fixed to "genewise.start_info.txt"; --output_start_and_stop_hints_of_augustus is fixed to "genewise.start_stop_hints.gff".

[geneModels2AugusutsTrainingInput]
--min_evalue 1e-9 --min_identity 0.8 --min_coverage_ratio 0.8 --min_cds_num 2 --min_cds_length 600 --min_cds_exon_ratio 0.60
# During the geneModels2AugusutsTrainingInput step: --cpu is taken from --CPU; --out_prefix is set to ati.
# If ati.filter2.gff3 contains too few gene models, it is recommended to decrease --min_cds_exon_ratio and increase --min_coverage_ratio.

[BGM2AT]
--min_gene_number_for_augustus_training 500 --gene_number_for_accuracy_detection 200 --min_gene_number_of_optimize_augustus_chunk 50 --max_gene_number_of_optimize_augustus_chunk 200
# During the BGM2AT step: --flanking_length is calculated automatically; --stopAfterFirstEtraining is set or not set depending on the step; --onlytrain_GFF3 is set to an intermediate file.

[prepareAugusutusHints]
--margin 20
# Note: to obtain the exonpart or cdspart hints, a margin of 20 bp is removed.

[paraAugusutusWithHints]
--gene_prefix augustus --min_intron_len 30 --alternatives_from_evidence
# During the paraAugusutusWithHints step: --species is taken from --augustus_species; --cpu is taken from --CPU; --segmentSize and --overlapSize are calculated automatically.

[paraCombineGeneModels]
--overlap 30 --min_augustus_transcriptSupport_percentage 50.0 --min_augustus_intronSupport_number 1 --min_augustus_intronSupport_ratio 0.5
# During the paraCombineGeneModels step: --cpu is taken from --CPU.

[PfamValidateABinitio]
--CDS_length 1200 --CDS_num 4 --evalue 1e-9 --coverage 0.4
# During the PfamValidateABinitio step: --pfam_db is taken from --pfam_db; --cpu is taken from --CPU; --out_prefix is fixed to "combine2".

[remove_genes_in_repeats]
--ratio 0.2
# Gene models overlapping with repeats are removed if the overlap ratio of the CDS region >= the value of --ratio (0.2 in this file). If this value is set > 1.0, gene models overlapping with repeats are not removed.
# NOTE(review): this comment previously stated the threshold as 0.8 while --ratio is 0.2 here - confirm that 0.2 is the intended value.
# During the remove_genes_in_repeats step: --filtered_gene_models is fixed to "genome.completed.genes_in_repeats.gff3".

[remove_short_genes]
--cds_length 450
# Finally, the short gene models are filtered by PfamValidateABinitio.