Expand NCBI BLASTDB datamanager to cover protein and protein domains.
Based on a branch by @mike8115, rebased with trivial changes
by @peterjc, such as preserving the old functional test.
mike8115 authored and peterjc committed Apr 9, 2014
1 parent 8f269fd commit 42cb875
Showing 10 changed files with 285 additions and 88 deletions.
10 changes: 7 additions & 3 deletions data_managers/ncbi_blastdb/README.rst
@@ -4,9 +4,9 @@ Galaxy Data Manager for NCBI BLAST databases
Copyright 2014 by Daniel Blankenberg (Penn State University, PA 16802, USA),
and additional contributors. All rights reserved. See the licence text below.

Downloads and populates blastdb data table. This is just a simple example to
demonstrate the use of Data Managers for processing BLAST databases, and
uses the NCBI's ``update_blastdb.pl`` script internally. See:
Downloads preformatted NCBI BLAST databases and updates the ``blastdb``,
``blastdb_p`` and ``blastdb_d`` data tables accordingly. Uses the NCBI's
``update_blastdb.pl`` script internally (protein domain databases are fetched
directly from the NCBI FTP site). See:

Blankenberg et al. (2014) Wrangling Galaxy's reference data
http://dx.doi.org/10.1093/bioinformatics/btu119
@@ -27,6 +27,10 @@ v0.0.2 - Development moved to GitHub, https://github.com/peterjc/galaxy_blast
- Updated citation information (Blankenberg et al. 2014).
- Adopted standard MIT License.
- Now depends on ``package_blast_plus_2_2_29`` in ToolShed.
v0.0.3 - Updated fetch_blast_db.py to use the current date in the default ID
- Tool and script now also update the ``blastdb_p`` data table as needed
- Tool and script now also update the ``blastdb_d`` data table as needed
- Tool now uses a dropdown menu to select the desired database
======= ======================================================================


113 changes: 105 additions & 8 deletions data_managers/ncbi_blastdb/blastdb.xml
@@ -1,14 +1,65 @@
<tool id="data_manager_blast_db" name="Blast DB" version="0.0.2" tool_type="manage_data">
<description>Downloader</description>
<command interpreter="python">fetch_blast_db.py --filename "${out_file}" --tool_data_table_name "blastdb"</command>
<tool id="ncbi_blast_plus_update_db" name="NCBI BLAST databases Data Manager" version="0.0.3" tool_type="manage_data">
<description>Download a pre-formatted database from the NCBI FTP website</description>
<command interpreter="python">fetch_blast_db.py --filename "${out_file}" --tool_data_table_name $db_type.blastdb_type</command>
<requirements>
<requirement type="package" version="2.2.29">blast+</requirement>
</requirements>
<stdio>
<exit_code range="1:" level="fatal" description="Tool exception" />
</stdio>
<inputs>
<param name="blastdb_name" type="text" label="Blast DB Name" help="try &quot;nt&quot; as an example" optional="False"/>
<conditional name="db_type">
<param name="blastdb_type" type="select" label="Choose database type">
<option value="blastdb" selected="true">Nucleotide</option>
<option value="blastdb_p">Protein</option>
<option value="blastdb_d">Protein Domains</option>
</param>
<when value="blastdb">
<param name="blastdb_name" type="select" label="Blast DB to download" optional="false">
<option value="env_nt">Environmental nucleotide sequences (env_nt)</option>
<option value="16SMicrobial">Microbial 16S sequences (16SMicrobial)</option>
<option value="est">EST sequences (est)</option>
<option value="est_human">Human EST sequences (est_human)</option>
<option value="est_mouse">Mouse EST sequences (est_mouse)</option>
<option value="est_others">Other EST sequences (est_others)</option>
<option value="gss">Genome Survey Sequences (gss)</option>
<option value="gss_annot">Annotated Genome Survey Sequences (gss_annot)</option>
<option value="htgs">High-Throughput Genome Sequences (htgs)</option>
<option value="human_genomic">Human RefSeq chromosome records (human_genomic)</option>
<option value="nt">Nucleotide sequence database from GenBank, EMBL, DDBJ, excludes gss sts, pat, est and htg (nt)</option>
<option value="other_genomic">RefSeq chromosome records other than human (other_genomic)</option>
<option value="patnt">Patent nucleotide sequences (patnt)</option>
<option value="pdbnt">Protein Data Bank Nucleic acid structures (pdbnt)</option>
<option value="refseq_genomic">NCBI genomic reference sequences (refseq_genomic)</option>
<option value="refseq_rna">NCBI transcript reference sequences (refseq_rna)</option>
<option value="refseqgene">NCBI gene reference sequences (refseqgene)</option>
<option value="sts">Sequence Tagged Sites (sts)</option>
<option value="tsa_nt">Transcriptome Shotgun Assembly Sequence (tsa_nt)</option>
<option value="wgs">Whole Genome Shotgun Sequences Assemblies (wgs)</option>
</param>
</when>
<when value="blastdb_p">
<param name="blastdb_name" type="select" label="BlastDB to download" optional="false">
<option value="env_nr">Environmental protein sequences (env_nr)</option>
<option value="nr">Non-redundant protein sequence database from GenPept, Swissprot, PIR, PDF, PDB, NCBI RefSeq (nr)</option>
<option value="pataa">Patent protein sequences (pataa)</option>
<option value="pdbaa">Protein Data Bank protein structures (pdbnt)</option>
<option value="refseq_protein">NCBI protein reference sequences (refseq_protein)</option>
<option value="swissprot">Sequences from SwissProt database (swissprot)</option>
</param>
</when>
<when value="blastdb_d">
<param name="blastdb_name" type="select" label="BlastDB to download" optional="false">
<option value="Cdd">CDD</option>
<option value="Cog">COG</option>
<option value="Kog">KOG</option>
<option value="Pfam">Pfam</option>
<option value="Prk">PRK</option>
<option value="Smart">Smart</option>
<option value="Tigr">TIGRFAM</option>
</param>
</when>
</conditional>
<conditional name="advanced">
<param name="advanced_selector" type="select" label="Advanced Options">
<option value="basic" selected="True">Basic</option>
@@ -27,22 +78,68 @@
</outputs>
<tests>
<test>
<param name="blastdb_type" value="blastdb"/>
<param name="blastdb_name" value="est"/>
<param name="advanced_selector" value="basic"/>
<output name="out_file" file="est_out.json"/>
</test>
<test>
<param name="blastdb_type" value="blastdb"/>
<param name="blastdb_name" value="patnt"/>
<param name="advanced_selector" value="basic"/>
<output name="out_file" file="patnt.out" compare="re_match"/>
</test>
<test>
<param name="blastdb_type" value="blastdb_p"/>
<param name="blastdb_name" value="pataa"/>
<param name="advanced_selector" value="basic"/>
<output name="out_file" file="pataa.out" compare="re_match"/>
</test>
<test>
<param name="blastdb_type" value="blastdb_d"/>
<param name="blastdb_name" value="cog.out"/>
<param name="advanced_selector" value="basic"/>
<output name="out_file" file="cog.out" compare="re_match"/>
</test>
</tests>
<help>
**What it does**

Downloads Blast DBs and updates blastdb tool data tables.
Downloads preformatted BLAST databases and updates the blastdb, blastdb_p and blastdb_d data tables accordingly.
Nucleotide and protein databases are retrieved using the update_blastdb.pl script that comes with BLAST+.
Protein domain databases are downloaded from ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/.

For more information about these databases refer to:

ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/README (Protein domains)

ftp://ftp.ncbi.nih.gov/blast/db/README (Nucleotides and proteins)
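
For illustration only, here is a rough sketch of the equivalent manual steps the data
manager automates, assuming ``update_blastdb.pl`` is on the ``$PATH`` and using
``swissprot`` and ``Cdd`` as example database names::

    # Rough illustration only -- the data manager performs these steps for you.
    import subprocess
    import tarfile
    from ftplib import FTP

    # Nucleotide/protein databases: update_blastdb.pl fetches and decompresses them.
    # It exits with code 1 when files were downloaded, so a non-zero exit code is
    # not necessarily a failure here.
    subprocess.call(["update_blastdb.pl", "--decompress", "swissprot"])

    # Protein domain databases: fetch the little-endian archive over FTP and unpack it.
    with open("Cdd_LE.tar.gz", "wb") as handle:
        ftp = FTP("ftp.ncbi.nih.gov")
        ftp.login()
        ftp.cwd("pub/mmdb/cdd/little_endian")
        ftp.retrbinary("RETR Cdd_LE.tar.gz", handle.write)
        ftp.quit()
    tarfile.open("Cdd_LE.tar.gz", mode="r").extractall(".")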

**Output**

The data manager will download the database from NCBI and add an
entry to the corresponding data table. The database will be stored at
$galaxy_data_manager_data_path/$database_type/$blastdb_name/$database_id where:

- $galaxy_data_manager_data_path is defined in universe_wsgi.ini

- $database_type is either blastdb, blastdb_p or blastdb_d

- $blastdb_name is the name of the database you are downloading

- $database_id is either an ID generated as (name)_YYYY_MM_DD or a user-defined ID; see the example below.
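
For example (hypothetical values for illustration only, assuming the protein
database pataa was downloaded on 2014-04-09 with no custom ID given)::

    # Hypothetical values for illustration only
    data_id = "pataa_2014_04_09"  # (name)_YYYY_MM_DD
    path = "$galaxy_data_manager_data_path/blastdb_p/pataa/pataa_2014_04_09"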

------

The data manager will attempt to read the database's alias file to generate the
description; if that fails (protein domain databases have no alias file), the ID is used as the description instead.

.. class:: infomark
.. class:: warningmark

**Notice:** This is a functional, but basic, tool for fetching preformatted BLAST databases.
Galaxy and the data manager won't stop you from downloading the same
database over and over again, even if you use the same ID.
Duplicate entries in the data table should go away upon restart, and
since the IDs are the same, the download would simply overwrite the database you
already have.


-------
28 changes: 27 additions & 1 deletion data_managers/ncbi_blastdb/data_manager_conf.xml
@@ -9,7 +9,33 @@
<move type="directory">
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb/${path}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${nucleotide_alias_name}</value_translation>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${database_alias_name}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
<data_table name="blastdb_p">
<output>
<column name="value" />
<column name="name" />
<column name="path" output_ref="out_file" >
<move type="directory">
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb_p/${path}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb_p/${path}/${database_alias_name}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
<data_table name="blastdb_d">
<output>
<column name="value" />
<column name="name" />
<column name="path" output_ref="out_file" >
<move type="directory">
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb_d/${path}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb_d/${path}/${database_alias_name}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
136 changes: 85 additions & 51 deletions data_managers/ncbi_blastdb/fetch_blast_db.py
@@ -1,42 +1,16 @@
#!/usr/bin/env python
#Dan Blankenberg
#Script that calls update_blastdb.pl to download preformatted databases
#Adapted from Dan Blankenberg's data_manager_example_blastdb_ncbi_update_blastdb
#Michael Li, Microbial Biodiversity Bioinformatics group, Agriculture and Agri-Food Canada, April 2014
#Script that downloads preformatted databases from NCBI.

import optparse
import os
import sys
import subprocess
import hashlib

import time
import tarfile
import ftplib  # needed for ftplib.all_errors in the error handling below
from ftplib import FTP
from galaxy.util.json import from_json_string, to_json_string
DEFAULT_ALGORITHM = hashlib.sha512
CHUNK_SIZE = 2**20 #1mb

def get_dir_hash( directory, algorithm=None, followlinks=True, chunk_size=None ):
chunk_size = chunk_size or CHUNK_SIZE
algorithm = algorithm or DEFAULT_ALGORITHM
if isinstance( algorithm, basestring ):
hash = hashlib.new( algorithm )
else:
hash = algorithm()
#we hash a directory by taking names of directories, files and their contents
for dirpath, dirnames, filenames in os.walk( directory, followlinks=followlinks ):
dirnames.sort()
filenames.sort()
for name in dirnames:
hash.update( os.path.relpath( os.path.join( dirpath, name ), directory ) )
for name in filenames:
filename = os.path.join( dirpath, name )
hash.update( os.path.relpath( filename, directory ) )
fh = open( filename, 'rb' )
while True:
data = fh.read( chunk_size )
if not data:
break
hash.update( data )
fh.close()

return hash.hexdigest()

def main():
#Parse Command Line
@@ -45,44 +19,104 @@ def main():
parser.add_option( '-t', '--tool_data_table_name', dest='tool_data_table_name', action='store', type='string', default=None, help='tool_data_table_name' )
(options, args) = parser.parse_args()

#Take the JSON input file for parsing
params = from_json_string( open( options.filename ).read() )
target_directory = params[ 'output_data' ][0]['extra_files_path']
os.mkdir( target_directory )

blastdb_name = params['param_dict']['blastdb_name'] #value

#Fetch parameters from input JSON file
blastdb_name = params['param_dict']['db_type'].get( 'blastdb_name' )
blastdb_type = params['param_dict']['db_type'].get( 'blastdb_type' )
data_description = params['param_dict']['advanced'].get( 'data_description', None )
data_id = params['param_dict']['advanced'].get( 'data_id', None )

cmd_options = [ '--decompress' ]

args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ]
proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
return_code = proc.wait()
if return_code != 1:
print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code
sys.exit( 1 )
#update_blastdb.pl doesn't download protein domains, so we use ftp
if blastdb_type == 'blastdb_d':
try:
archive_name = blastdb_name + '_LE.tar.gz'
tar_file = open( os.path.join( target_directory, archive_name ), "wb" )

#Connect via ftp and download
ftp = FTP('ftp.ncbi.nih.gov')
ftp.login()
ftp.cwd('pub/mmdb/cdd/little_endian')
ftp.retrbinary('RETR %s' % archive_name, tar_file.write)
tar_file.close()

#Extract contents
tar_file = tarfile.open(os.path.join( target_directory, archive_name ), mode='r')
tar_file.extractall( target_directory )
tar_file.close()

#If the download fails, ftplib should generate an error in ftplib.all_errors
#Likewise, tarfile.ReadError should catch any errors when reading from the tar
#And other possible errors that can occur here...
except IOError, e:
print >> sys.stderr, "Cannot create file: %s: %s" % ( archive_name, e )
sys.exit( 1 )

except os.error, e:
print "Error while joining %s and %s: %s" % ( target_directory, archive_name, e )
sys.exit( 1 )

except ftplib.all_errors, e:
print >> sys.stderr, "Error while downloading protein domain database: %s" % ( e )
sys.exit( 1 )

except tarfile.TarError, e:
print >> sys.stderr, "Error while opening/extracting the tar file: %s" % ( e )
sys.exit( 1 )

else:
#Run update_blastdb.pl
cmd_options = [ '--decompress' ]
args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ]
proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
return_code = proc.wait()

#update_blastdb.pl returns exit code 1 when files were downloaded successfully
if return_code != 1:
print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code
sys.exit( 1 )

#Set id and description if not provided in the advanced settings
if not data_id:
data_id = "%s_%s" % ( blastdb_name, get_dir_hash( target_directory ) )
#Use the download date to create a unique ID
localtime = time.localtime()
timeString = time.strftime("%Y_%m_%d", localtime)
data_id = "%s_%s" % ( blastdb_name, timeString )

if not data_description:
# Attempt to automatically set description from alias file
# Protein domain databases don't have an alias file
if not data_description and blastdb_type != 'blastdb_d':
alias_date = None
alias_file = None
try:
for line in open( os.path.join( target_directory, "%s.nal" % ( blastdb_name ) ) ):
if line.startswith( '# Alias file created ' ):
alias_date = line.split( '# Alias file created ', 1 )[1].strip()
if line.startswith( 'TITLE' ):
data_description = line.split( None, 1 )[1].strip()
break
if blastdb_type == 'blastdb':
alias_file = "%s.nal" % ( blastdb_name )
if blastdb_type == 'blastdb_p':
alias_file = "%s.pal" % ( blastdb_name )
if alias_file:
for line in open( os.path.join( target_directory, alias_file ) ):
if line.startswith( '# Alias file created ' ):
alias_date = line.split( '# Alias file created ', 1 )[1].strip()
if line.startswith( '# Date created: ' ):
alias_date = line.split( '# Date created: ', 1)[1].strip()
if line.startswith( 'TITLE' ):
data_description = line.split( None, 1 )[1].strip()
break
except Exception, e:
print >> sys.stderr, "Error Parsing Alias file for TITLE and date: %s" % ( e )
#If both the title and creation date were parsed, append the date to the description
if alias_date and data_description:
data_description = "%s (%s)" % ( data_description, alias_date )

#Fall back to using the ID if no description could be parsed from the nal or pal file
if not data_description:
data_description = data_id

data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'nucleotide_alias_name': blastdb_name }
#Prepare the data table entry for the JSON output
data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'database_alias_name': blastdb_name }
data_manager_dict = { 'data_tables': { options.tool_data_table_name: [ data_table_entry ] } }
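#For illustration, with hypothetical values: for the protein database 'pataa'
#downloaded on 2014-04-09, data_manager_dict might look like:
#  { 'data_tables': { 'blastdb_p': [ { 'value': 'pataa_2014_04_09',
#                                      'name': 'Patent protein sequences (Apr 9, 2014)',
#                                      'path': 'pataa/pataa_2014_04_09',
#                                      'database_alias_name': 'pataa' } ] } }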

#save info to json file