Expand NCBI BLASTDB datamanager to cover protein and protein domains.
Based on a branch by @mike8115, rebased with trivial changes
by @peterjc, such as preserving the old functional test.
mike8115 authored and peterjc committed Apr 9, 2014
1 parent 8f269fd commit 42cb875
Showing 10 changed files with 285 additions and 88 deletions.
10 changes: 7 additions & 3 deletions data_managers/ncbi_blastdb/README.rst
@@ -4,9 +4,9 @@ Galaxy Data Manager for NCBI BLAST databases
Copyright 2014 by Daniel Blankenberg (Penn State University, PA 16802, USA),
and additional contributors. All rights reserved. See the licence text below.

Downloads and populates blastdb data table. This is just a simple example to
demonstrate the use of Data Managers for processing BLAST databases, and
uses the NCBI's ``update_blastdb.pl`` script internally. See:
Downloads preformatted NCBI BLAST databases and updates the ``blastdb``,
``blastdb_p`` and ``blastdb_d`` data tables accordingly. Uses the NCBI's
``update_blastdb.pl`` script internally (protein domain databases are fetched
directly from the NCBI FTP site). See:

Blankenberg et al. (2014) Wrangling Galaxy's reference data
http://dx.doi.org/10.1093/bioinformatics/btu119
@@ -27,6 +27,10 @@ v0.0.2 - Development moved to GitHub, https://github.com/peterjc/galaxy_blast
- Updated citation information (Blankenberg et al. 2014).
- Adopted standard MIT License.
- Now depends on ``package_blast_plus_2_2_29`` in ToolShed.
v0.0.3 - Updated fetch_blast_db.py to use the current date in the default ID
- Tool and script now also update the ``blastdb_p`` data table as needed
- Tool and script now also update the ``blastdb_d`` data table as needed
- Tool now uses a dropdown menu to select the desired database
======= ======================================================================


113 changes: 105 additions & 8 deletions data_managers/ncbi_blastdb/blastdb.xml
@@ -1,14 +1,65 @@
<tool id="data_manager_blast_db" name="Blast DB" version="0.0.2" tool_type="manage_data">
<description>Downloader</description>
<command interpreter="python">fetch_blast_db.py --filename "${out_file}" --tool_data_table_name "blastdb"</command>
<tool id="ncbi_blast_plus_update_db" name="NCBI BLAST databases Data Manager" version="0.0.3" tool_type="manage_data">
<description>Download a pre-formatted database from the NCBI FTP website</description>
<command interpreter="python">fetch_blast_db.py --filename "${out_file}" --tool_data_table_name $db_type.blastdb_type</command>
<requirements>
<requirement type="package" version="2.2.29">blast+</requirement>
</requirements>
<stdio>
<exit_code range="1:" level="fatal" description="Tool exception" />
</stdio>
<inputs>
<param name="blastdb_name" type="text" label="Blast DB Name" help="try &quot;nt&quot; as an example" optional="False"/>
<conditional name="db_type">
<param name="blastdb_type" type="select" label="Choose database type">
<option value="blastdb" selected="true">Nucleotide</option>
<option value="blastdb_p">Protein</option>
<option value="blastdb_d">Protein Domains</option>
</param>
<when value="blastdb">
<param name="blastdb_name" type="select" label="Blast DB to download" optional="false">
<option value="env_nt">Environmental nucleotide sequences (env_nt)</option>
<option value="16SMicrobial">Microbial 16S sequences (16SMicrobial)</option>
<option value="est">EST sequences (est)</option>
<option value="est_human">Human EST sequences (est_human)</option>
<option value="est_mouse">Mouse EST sequences (est_mouse)</option>
<option value="est_others">Other EST sequences (est_others)</option>
<option value="gss">Genome Survey Sequences (gss)</option>
<option value="gss_annot">Annotated Genome Survey Sequences (gss_annot)</option>
<option value="htgs">High-Throughput Genome Sequences (htgs)</option>
<option value="human_genomic">Human RefSeq chromosome records (human_genomic)</option>
<option value="nt">Nucleotide sequence database from GenBank, EMBL, DDBJ, excludes gss sts, pat, est and htg (nt)</option>
<option value="other_genomic">RefSeq chromosome records other than human (other_genomic)</option>
<option value="patnt">Patent nucleotide sequences (patnt)</option>
<option value="pdbnt">Protein Data Bank Nucleic acid structures (pdbnt)</option>
<option value="refseq_genomic">NCBI genomic reference sequences (refseq_genomic)</option>
<option value="refseq_rna">NCBI transcript reference sequences (refseq_rna)</option>
<option value="refseqgene">NCBI gene reference sequences (refseqgene)</option>
<option value="sts">Sequence Tagged Sites (sts)</option>
<option value="tsa_nt">Transcriptome Shotgun Assembly Sequence (tsa_nt)</option>
<option value="wgs">Whole Genome Shotgun Sequences Assemblies (wgs)</option>
</param>
</when>
<when value="blastdb_p">
<param name="blastdb_name" type="select" label="BlastDB to download" optional="false">
<option value="env_nr">Environmental protein sequences (env_nr)</option>
<option value="nr">Non-redundant protein sequence database from GenPept, Swissprot, PIR, PDF, PDB, NCBI RefSeq (nr)</option>
<option value="pataa">Patent protein sequences (pataa)</option>
<option value="pdbaa">Protein Data Bank protein structures (pdbnt)</option>
<option value="refseq_protein">NCBI protein reference sequences (refseq_protein)</option>
<option value="swissprot">Sequences from SwissProt database (swissprot)</option>
</param>
</when>
<when value="blastdb_d">
<param name="blastdb_name" type="select" label="BlastDB to download" optional="false">
<option value="Cdd">CDD</option>
<option value="Cog">COG</option>
<option value="Kog">KOG</option>
<option value="Pfam">Pfam</option>
<option value="Prk">PRK</option>
<option value="Smart">Smart</option>
<option value="Tigr">TIGRFAM</option>
</param>
</when>
</conditional>
<conditional name="advanced">
<param name="advanced_selector" type="select" label="Advanced Options">
<option value="basic" selected="True">Basic</option>
@@ -27,22 +78,68 @@
</outputs>
<tests>
<test>
<param name="blastdb_type" value="blastdb"/>
<param name="blastdb_name" value="est"/>
<param name="advanced_selector" value="basic"/>
<output name="out_file" file="est_out.json"/>
</test>
<test>
<param name="blastdb_type" value="blastdb"/>
<param name="blastdb_name" value="patnt"/>
<param name="advanced_selector" value="basic"/>
<output name="out_file" file="patnt.out" compare="re_match"/>
</test>
<test>
<param name="blastdb_type" value="blastdb_p"/>
<param name="blastdb_name" value="pataa"/>
<param name="advanced_selector" value="basic"/>
<output name="out_file" file="pataa.out" compare="re_match"/>
</test>
<test>
<param name="blastdb_type" value="blastdb_d"/>
<param name="blastdb_name" value="cog.out"/>
<param name="advanced_selector" value="basic"/>
<output name="out_file" file="cog.out" compare="re_match"/>
</test>
</tests>
<help>
**What it does**

Downloads Blast DBs and updates blastdb tool data tables.
Downloads preformatted BLAST databases and updates the blastdb, blastdb_p and blastdb_d data tables accordingly.
Nucleotide and protein databases are retrieved using the update_blastdb.pl script that comes with BLAST+.
Protein domain databases are downloaded from ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/.

For more information about these databases refer to:

ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/README (Protein domains)

ftp://ftp.ncbi.nih.gov/blast/db/README (Nucleotides and proteins)
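
For illustration only, here is a rough sketch of the equivalent manual steps the data
manager automates, assuming ``update_blastdb.pl`` is on the ``$PATH`` and using
``swissprot`` and ``Cdd`` as example database names::

    # Rough illustration only -- the data manager performs these steps for you.
    import subprocess
    import tarfile
    from ftplib import FTP

    # Nucleotide/protein databases: update_blastdb.pl fetches and decompresses them.
    # It exits with code 1 when files were downloaded, so a non-zero exit code is
    # not necessarily a failure here.
    subprocess.call(["update_blastdb.pl", "--decompress", "swissprot"])

    # Protein domain databases: fetch the little-endian archive over FTP and unpack it.
    with open("Cdd_LE.tar.gz", "wb") as handle:
        ftp = FTP("ftp.ncbi.nih.gov")
        ftp.login()
        ftp.cwd("pub/mmdb/cdd/little_endian")
        ftp.retrbinary("RETR Cdd_LE.tar.gz", handle.write)
        ftp.quit()
    tarfile.open("Cdd_LE.tar.gz", mode="r").extractall(".")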

**Output**

The data manager will download the database from NCBI and add an
entry to the corresponding data table. The database will be stored at
$galaxy_data_manager_data_path/$database_type/$blastdb_name/$database_id where:

- $galaxy_data_manager_data_path is defined in universe_wsgi.ini

- $database_type is either blastdb, blastdb_p or blastdb_d

- $blastdb_name is the name of the database you are downloading

- $database_id is either an ID generated as (name)_YYYY_MM_DD or a user-defined ID; see the example below.
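
For example (hypothetical values for illustration only, assuming the protein
database pataa was downloaded on 2014-04-09 with no custom ID given)::

    # Hypothetical values for illustration only
    data_id = "pataa_2014_04_09"  # (name)_YYYY_MM_DD
    path = "$galaxy_data_manager_data_path/blastdb_p/pataa/pataa_2014_04_09"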

------

The data manager will attempt to read the database's alias file to generate the
description; if that fails (protein domain databases have no alias file), the ID is used as the description instead.

.. class:: infomark
.. class:: warningmark

**Notice:** This is a functional, but basic, tool for fetching preformatted BLAST databases.
Galaxy and the data manager won't stop you from downloading the same
database over and over again, even if you use the same ID.
Duplicate entries in the data table should go away upon restart, and
since the IDs are the same, the download would simply overwrite the database you
already have.


-------
28 changes: 27 additions & 1 deletion data_managers/ncbi_blastdb/data_manager_conf.xml
@@ -9,7 +9,33 @@
<move type="directory">
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb/${path}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${nucleotide_alias_name}</value_translation>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${database_alias_name}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
<data_table name="blastdb_p">
<output>
<column name="value" />
<column name="name" />
<column name="path" output_ref="out_file" >
<move type="directory">
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb_p/${path}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb_p/${path}/${database_alias_name}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
<data_table name="blastdb_d">
<output>
<column name="value" />
<column name="name" />
<column name="path" output_ref="out_file" >
<move type="directory">
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb_d/${path}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb_d/${path}/${database_alias_name}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
136 changes: 85 additions & 51 deletions data_managers/ncbi_blastdb/fetch_blast_db.py
@@ -1,42 +1,16 @@
#!/usr/bin/env python
#Dan Blankenberg
#Script that calls update_blastdb.pl to download preformatted databases
#Adapted from Dan Blankenberg's data_manager_example_blastdb_ncbi_update_blastdb
#Michael Li, Microbial Biodiversity Bioinformatics group, Agriculture and Agri-Food Canada, April 2014
#Script that downloads preformatted databases from NCBI.

import optparse
import os
import sys
import subprocess
import hashlib

import time
import tarfile
import ftplib  # needed for ftplib.all_errors in the error handling below
from ftplib import FTP
from galaxy.util.json import from_json_string, to_json_string
DEFAULT_ALGORITHM = hashlib.sha512
CHUNK_SIZE = 2**20 #1mb

def get_dir_hash( directory, algorithm=None, followlinks=True, chunk_size=None ):
chunk_size = chunk_size or CHUNK_SIZE
algorithm = algorithm or DEFAULT_ALGORITHM
if isinstance( algorithm, basestring ):
hash = hashlib.new( algorithm )
else:
hash = algorithm()
#we hash a directory by taking names of directories, files and their contents
for dirpath, dirnames, filenames in os.walk( directory, followlinks=followlinks ):
dirnames.sort()
filenames.sort()
for name in dirnames:
hash.update( os.path.relpath( os.path.join( dirpath, name ), directory ) )
for name in filenames:
filename = os.path.join( dirpath, name )
hash.update( os.path.relpath( filename, directory ) )
fh = open( filename, 'rb' )
while True:
data = fh.read( chunk_size )
if not data:
break
hash.update( data )
fh.close()

return hash.hexdigest()

def main():
#Parse Command Line
@@ -45,44 +19,104 @@ def main():
parser.add_option( '-t', '--tool_data_table_name', dest='tool_data_table_name', action='store', type='string', default=None, help='tool_data_table_name' )
(options, args) = parser.parse_args()

#Take the JSON input file for parsing
params = from_json_string( open( options.filename ).read() )
target_directory = params[ 'output_data' ][0]['extra_files_path']
os.mkdir( target_directory )

blastdb_name = params['param_dict']['blastdb_name'] #value

#Fetch parameters from input JSON file
blastdb_name = params['param_dict']['db_type'].get( 'blastdb_name' )
blastdb_type = params['param_dict']['db_type'].get( 'blastdb_type' )
data_description = params['param_dict']['advanced'].get( 'data_description', None )
data_id = params['param_dict']['advanced'].get( 'data_id', None )

cmd_options = [ '--decompress' ]

args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ]
proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
return_code = proc.wait()
if return_code != 1:
print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code
sys.exit( 1 )
#update_blastdb.pl doesn't download protein domains, so we use ftp
if blastdb_type == 'blastdb_d':
try:
archive_name = blastdb_name + '_LE.tar.gz'
tar_file = open( os.path.join( target_directory, archive_name ), "wb" )

#Connect via ftp and download
ftp = FTP('ftp.ncbi.nih.gov')
ftp.login()
ftp.cwd('pub/mmdb/cdd/little_endian')
ftp.retrbinary('RETR %s' % archive_name, tar_file.write)
tar_file.close()

#Extract contents
tar_file = tarfile.open(os.path.join( target_directory, archive_name ), mode='r')
tar_file.extractall( target_directory )
tar_file.close()

#If the download fails, ftplib should generate an error in ftplib.all_errors
#Likewise, tarfile.ReadError should catch any errors when reading from the tar
#And other possible errors that can occur here...
except IOError, e:
print >> sys.stderr, "Cannot create file: %s: %s" % ( archive_name, e )
sys.exit( 1 )

except os.error, e:
print "Error while joining %s and %s: %s" % ( target_directory, archive_name, e )
sys.exit( 1 )

except ftplib.all_errors, e:
print >> sys.stderr, "Error while downloading protein domain database: %s" % ( e )
sys.exit( 1 )

except tarfile.TarError, e:
print >> sys.stderr, "Error while opening/extracting the tar file: %s" % ( e )
sys.exit( 1 )

else:
#Run update_blastdb.pl
cmd_options = [ '--decompress' ]
args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ]
proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
return_code = proc.wait()

#update_blastdb.pl returns exit code 1 when files were downloaded successfully
if return_code != 1:
print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code
sys.exit( 1 )

#Set id and description if not provided in the advanced settings
if not data_id:
data_id = "%s_%s" % ( blastdb_name, get_dir_hash( target_directory ) )
#Use the download date to create a unique ID
localtime = time.localtime()
timeString = time.strftime("%Y_%m_%d", localtime)
data_id = "%s_%s" % ( blastdb_name, timeString )

if not data_description:
# Attempt to automatically set description from alias file
# Protein domain databases don't have an alias file
if not data_description and blastdb_type != 'blastdb_d':
alias_date = None
alias_file = None
try:
for line in open( os.path.join( target_directory, "%s.nal" % ( blastdb_name ) ) ):
if line.startswith( '# Alias file created ' ):
alias_date = line.split( '# Alias file created ', 1 )[1].strip()
if line.startswith( 'TITLE' ):
data_description = line.split( None, 1 )[1].strip()
break
if blastdb_type == 'blastdb':
alias_file = "%s.nal" % ( blastdb_name )
if blastdb_type == 'blastdb_p':
alias_file = "%s.pal" % ( blastdb_name )
if alias_file:
for line in open( os.path.join( target_directory, alias_file ) ):
if line.startswith( '# Alias file created ' ):
alias_date = line.split( '# Alias file created ', 1 )[1].strip()
if line.startswith( '# Date created: ' ):
alias_date = line.split( '# Date created: ', 1)[1].strip()
if line.startswith( 'TITLE' ):
data_description = line.split( None, 1 )[1].strip()
break
except Exception, e:
print >> sys.stderr, "Error Parsing Alias file for TITLE and date: %s" % ( e )
#If both the title and creation date were parsed, append the date to the description
if alias_date and data_description:
data_description = "%s (%s)" % ( data_description, alias_date )

#Fall back to using the ID if no description could be parsed from the nal or pal file
if not data_description:
data_description = data_id

data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'nucleotide_alias_name': blastdb_name }
#Prepare the data table entry for the JSON output
data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'database_alias_name': blastdb_name }
data_manager_dict = { 'data_tables': { options.tool_data_table_name: [ data_table_entry ] } }
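#For illustration, with hypothetical values: for the protein database 'pataa'
#downloaded on 2014-04-09, data_manager_dict might look like:
#  { 'data_tables': { 'blastdb_p': [ { 'value': 'pataa_2014_04_09',
#                                      'name': 'Patent protein sequences (Apr 9, 2014)',
#                                      'path': 'pataa/pataa_2014_04_09',
#                                      'database_alias_name': 'pataa' } ] } }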

#save info to json file