diff --git a/data_managers/ncbi_blastdb/README.rst b/data_managers/ncbi_blastdb/README.rst index dc71afc0..f77900ee 100644 --- a/data_managers/ncbi_blastdb/README.rst +++ b/data_managers/ncbi_blastdb/README.rst @@ -4,9 +4,9 @@ Galaxy Data Manager for NCBI BLAST databases Copyright 2014 by Daniel Blankenberg (Penn State University, PA 16802, USA), and additional contributors. All rights reserved. See the licence text below. -Downloads and populates blastdb data table. This is just a simple example to -demonstrate the use of Data Managers for processing BLAST databases, and -uses the NCBI's ``update_blast.pl`` script internally. See: +Downloads preformatted NCBI BLAST databases and updates ``blastdb`` and +``blastdb_p`` data tables accordingly. Uses the NCBI's ``update_blast.pl`` +script internally. See: Blankenberg et al. (2014) Wrangling Galaxy's reference data http://dx.doi.org/10.1093/bioinformatics/btu119 @@ -27,6 +27,10 @@ v0.0.2 - Development moved to GitHub, https://github.com/peterjc/galaxy_blast - Updated citation information (Blankenberg et al. 2014). - Adopted standard MIT License. - Now depends on ``package_blast_plus_2_2_29`` in ToolShed. 
+v0.0.3 - Updated fetch_blast_db.py to use the current date as the ID + - Tool and script now also updates ``blastdb_p`` data tables as needed + - Tool and script now also updates ``blastdb_d`` data tables as needed + - Tool now uses a dropdown menu to select the desired database ======= ====================================================================== diff --git a/data_managers/ncbi_blastdb/blastdb.xml b/data_managers/ncbi_blastdb/blastdb.xml index 65f94a6b..707769a0 100644 --- a/data_managers/ncbi_blastdb/blastdb.xml +++ b/data_managers/ncbi_blastdb/blastdb.xml @@ -1,6 +1,6 @@ - - Downloader - fetch_blast_db.py --filename "${out_file}" --tool_data_table_name "blastdb" + + Download a pre-formatted database from NCBI FTP website + fetch_blast_db.py --filename "${out_file}" --tool_data_table_name $db_type.blastdb_type blast+ @@ -8,7 +8,58 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -27,22 +78,68 @@ + + + + + + + + + + + + + + + + + + + **What it does** -Downloads Blast DBs and updates blastdb tool data tables. +Downloads preformatted Blast DBs and updates blastdb, blastdp_p and blastdb_d tables accordingly. +Protein and nucleotide databases are retrieved using the update_blastdb.pl script that comes with BLAST. +Protein domain databases are downloaded from ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/. + +For more information about these databases refer to: + +ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/README (Protein domains) + +ftp://ftp.ncbi.nih.gov/blast/db/README (Nucleotides and proteins) + +**Output** + +The data manager will download the database from NCBI and add an +entry to the corresponding data table. 
The database will be stored at +$galaxy_data_manager_data_path/$database_type/$blastdb_name/$database_id where, + +- $galaxy_data_manager_data_path is defined in universe_wsgi.ini + +- $database_type is either blastdb, blastdb_p or blastdb_d + +- $blastdb_name is the name of the database you are downloading + +- $database_id is either the ID generated from (name)_YYYY_MM_DD or a user-defined ID. ------- +The data manager will attempt to read the alias file to generate the +description, but this may fail. Otherwise, it will use the ID. -.. class:: infomark +.. class:: warningmark -**Notice:** This is a functional, but basic, tool for fetching preformatted blastdbs. +Galaxy and the data manager won't stop you from downloading the same +database over and over again, even if you use the same ID. +Multiple entries in the data table should go away upon restart and +since the IDs are the same, it would just overwrite the database you +already have. ------- diff --git a/data_managers/ncbi_blastdb/data_manager_conf.xml b/data_managers/ncbi_blastdb/data_manager_conf.xml index 8d78cd75..94e36b95 100644 --- a/data_managers/ncbi_blastdb/data_manager_conf.xml +++ b/data_managers/ncbi_blastdb/data_manager_conf.xml @@ -9,7 +9,33 @@ blastdb/${path} - ${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${nucleotide_alias_name} + ${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${database_alias_name} + abspath + + + + + + + + + + blastdb_p/${path} + + ${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb_p/${path}/${database_alias_name} + abspath + + + + + + + + + + blastdb_d/${path} + + ${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb_d/${path}/${database_alias_name} abspath diff --git a/data_managers/ncbi_blastdb/fetch_blast_db.py b/data_managers/ncbi_blastdb/fetch_blast_db.py index 3749bc63..92c143d3 100644 --- a/data_managers/ncbi_blastdb/fetch_blast_db.py +++ b/data_managers/ncbi_blastdb/fetch_blast_db.py @@ -1,42 +1,16 @@ #!/usr/bin/env python -#Dan Blankenberg -#Script that calls update_blastdb.pl to 
download preformatted databases +#Adapted from Dan Blankenberg's data_manager_example_blastdb_ncbi_update_blastdb +#Michael Li, Microbial Biodiversity Bioinformatics group, Agriculture and Agri-Food Canada, April 2014 +#Script that downloads preformatted databases from NCBI. import optparse import os import sys import subprocess -import hashlib - +import time +import tarfile +import ftplib +from ftplib import FTP from galaxy.util.json import from_json_string, to_json_string -DEFAULT_ALGORITHM = hashlib.sha512 -CHUNK_SIZE = 2**20 #1mb - -def get_dir_hash( directory, algorithm=None, followlinks=True, chunk_size=None ): - chunk_size = chunk_size or CHUNK_SIZE - algorithm = algorithm or DEFAULT_ALGORITHM - if isinstance( algorithm, basestring ): - hash = hashlib.new( algorithm ) - else: - hash = algorithm() - #we hash a directory by taking names of directories, files and their contents - for dirpath, dirnames, filenames in os.walk( directory, followlinks=followlinks ): - dirnames.sort() - filenames.sort() - for name in dirnames: - hash.update( os.path.relpath( os.path.join( dirpath, name ), directory ) ) - for name in filenames: - filename = os.path.join( dirpath, name ) - hash.update( os.path.relpath( filename, directory ) ) - fh = open( filename, 'rb' ) - while True: - data = fh.read( chunk_size ) - if not data: - break - hash.update( data ) - fh.close() - - return hash.hexdigest() def main(): #Parse Command Line @@ -45,44 +19,104 @@ def main(): parser.add_option( '-t', '--tool_data_table_name', dest='tool_data_table_name', action='store', type='string', default=None, help='tool_data_table_name' ) (options, args) = parser.parse_args() + #Take the JSON input file for parsing params = from_json_string( open( options.filename ).read() ) target_directory = params[ 'output_data' ][0]['extra_files_path'] os.mkdir( target_directory ) - - blastdb_name = params['param_dict']['blastdb_name'] #value + + #Fetch parameters from input JSON file + blastdb_name = 
params['param_dict']['db_type'].get( 'blastdb_name' ) + blastdb_type = params['param_dict']['db_type'].get( 'blastdb_type' ) data_description = params['param_dict']['advanced'].get( 'data_description', None ) data_id = params['param_dict']['advanced'].get( 'data_id', None ) - cmd_options = [ '--decompress' ] - - args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ] - proc = subprocess.Popen( args=args, shell=False, cwd=target_directory ) - return_code = proc.wait() - if return_code != 1: - print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code - sys.exit( 1 ) + #update_blastdb.pl doesn't download protein domains, so we use ftp + if blastdb_type == 'blastdb_d': + try: + archive_name = blastdb_name + '_LE.tar.gz' + tar_file = open( os.path.join( target_directory, archive_name ), "wb" ) + + #Connect via ftp and download + ftp = FTP('ftp.ncbi.nih.gov') + ftp.login() + ftp.cwd('pub/mmdb/cdd/little_endian') + ftp.retrbinary('RETR %s' % archive_name, tar_file.write) + tar_file.close() + + #Extract contents + tar_file = tarfile.open(os.path.join( target_directory, archive_name ), mode='r') + tar_file.extractall( target_directory ) + tar_file.close() + + #If the download fails, ftplib should generate an error in ftplib.all_errors + #Likewise, tarfile.ReadError should catch any errors when reading from the tar + #And other possible errors that can occur here... 
+ except IOError, e: + print >> sys.stderr, "Cannot create file: %s: %s" % ( archive_name, e ) + sys.exit( 1 ) + + except os.error, e: + print >> sys.stderr, "Error while joining %s and %s: %s" % ( target_directory, archive_name, e ) + sys.exit( 1 ) + + except ftplib.all_errors, e: + print >> sys.stderr, "Error while downloading protein domain database: %s" % ( e ) + sys.exit( 1 ) + + except tarfile.TarError, e: + print >> sys.stderr, "Error while opening/extracting the tar file: %s" % ( e ) + sys.exit( 1 ) + + else: + #Run update_blastdb.pl + cmd_options = [ '--decompress' ] + args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ] + proc = subprocess.Popen( args=args, shell=False, cwd=target_directory ) + return_code = proc.wait() + + #Check if download was successful (exit code 1) + if return_code != 1: + print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code + sys.exit( 1 ) + #Set id and description if not provided in the advanced settings if not data_id: - data_id = "%s_%s" % ( blastdb_name, get_dir_hash( target_directory ) ) + #Use download time to create a unique id + localtime = time.localtime() + timeString = time.strftime("%Y_%m_%d", localtime) + data_id = "%s_%s" % ( blastdb_name, timeString ) - if not data_description: + # Attempt to automatically set description from alias file + # Protein domain databases don't have an alias file + if not data_description and blastdb_type != 'blastdb_d': alias_date = None + alias_file = None try: - for line in open( os.path.join( target_directory, "%s.nal" % ( blastdb_name ) ) ): - if line.startswith( '# Alias file created ' ): - alias_date = line.split( '# Alias file created ', 1 )[1].strip() - if line.startswith( 'TITLE' ): - data_description = line.split( None, 1 )[1].strip() - break + if blastdb_type == 'blastdb': + alias_file = "%s.nal" % ( blastdb_name ) + if blastdb_type == 'blastdb_p': + alias_file = "%s.pal" % ( blastdb_name ) + if alias_file: + for line in open( os.path.join( target_directory, alias_file ) 
): + if line.startswith( '# Alias file created ' ): + alias_date = line.split( '# Alias file created ', 1 )[1].strip() + if line.startswith( '# Date created: ' ): + alias_date = line.split( '# Date created: ', 1)[1].strip() + if line.startswith( 'TITLE' ): + data_description = line.split( None, 1 )[1].strip() + break except Exception, e: print >> sys.stderr, "Error Parsing Alias file for TITLE and date: %s" % ( e ) + #If we manage to parse the pal or nal file, set description if alias_date and data_description: data_description = "%s (%s)" % ( data_description, alias_date ) + #If we could not parse the nal or pal file for some reason if not data_description: data_description = data_id - data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'nucleotide_alias_name': blastdb_name } + #Prepare output string to convert into JSON format + data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'database_alias_name': blastdb_name } data_manager_dict = { 'data_tables': { options.tool_data_table_name: [ data_table_entry ] } } #save info to json file diff --git a/data_managers/ncbi_blastdb/make_data_manager_blastdb.sh b/data_managers/ncbi_blastdb/make_data_manager_blastdb.sh new file mode 100755 index 00000000..7283307a --- /dev/null +++ b/data_managers/ncbi_blastdb/make_data_manager_blastdb.sh @@ -0,0 +1,45 @@ +#!/bin/sh +echo "This will create a tar-ball suitable to upload to the toolshed." + +if [ -f "data_managers/ncbi_blastdb/make_data_manager_blastdb.sh" ] +then +echo "Good, in the expected directory" +else +echo "ERROR. Run this from the GitHub repository root directory." +exit 1 +fi + +if [ -f "ncbi_blastdb.tar.gz" ] +then +echo "ERROR. File ncbi_blastdb.tar.gz already exists." 
+exit 1 +fi + +#Create tar file with core XML wrappers +if [ -f "ncbi_blastdb.tar" ] +then +rm ncbi_blastdb.tar +fi + +#Create tar file (-cf then -rf to add to it) +tar -cf ncbi_blastdb.tar test-data/est_out.json +tar -rf ncbi_blastdb.tar test-data/cog.out +tar -rf ncbi_blastdb.tar test-data/pataa.out +tar -rf ncbi_blastdb.tar test-data/patnt.out +tar -rf ncbi_blastdb.tar tool-data/blastdb.loc.sample +tar -rf ncbi_blastdb.tar tool-data/blastdb_p.loc.sample +tar -rf ncbi_blastdb.tar tool-data/blastdb_d.loc.sample +tar -rf ncbi_blastdb.tar tool-data/tool_data_table_conf.xml.sample +tar -rf ncbi_blastdb.tar data_managers/ncbi_blastdb/data_manager_conf.xml +tar -rf ncbi_blastdb.tar data_managers/ncbi_blastdb/README.rst +tar -rf ncbi_blastdb.tar data_managers/ncbi_blastdb/tool_dependencies.xml +tar -rf ncbi_blastdb.tar data_managers/ncbi_blastdb/blastdb.xml +tar -rf ncbi_blastdb.tar data_managers/ncbi_blastdb/fetch_blast_db.py + + +#Zip the tar file +gzip ncbi_blastdb.tar + +#Check the output +echo "Expect a tar-ball with 13 files, have:" +tar -tzf ncbi_blastdb.tar.gz | wc -l diff --git a/test-data/cog.out b/test-data/cog.out new file mode 100644 index 00000000..14d16f02 --- /dev/null +++ b/test-data/cog.out @@ -0,0 +1 @@ +\{\"data\_tables\"\:\ \{\"blastdb\_d\"\:\ \[\{\"path\"\:\ \"Cog\/Cog\_[0-9]{4}\_[0-1][0-9]\_[0-3][0-9]\"\,\ \"name\"\:\ \"Cog\_[0-9]{4}\_[0-1][0-9]\_[0-3][0-9]\"\,\ \"value\"\:\ \"Cog\_[0-9]{4}\_[0-1][0-9]\_[0-3][0-9]\"\,\ \"database\_alias\_name\"\:\ \"Cog\"\}\]\}\} diff --git a/test-data/pataa.out b/test-data/pataa.out new file mode 100644 index 00000000..2fe8686d --- /dev/null +++ b/test-data/pataa.out @@ -0,0 +1 @@ +\{\"data\_tables\"\:\ \{\"blastdb\_p\"\:\ \[\{\"path\"\:\ \"pataa\/pataa\_[0-9]{4}\_[0-1][0-9]\_[0-3][0-9]\"\,\ \"name\"\:\ \"Protein\ sequences\ derived\ from\ the\ Patent\ division\ of\ GenBank\ \([0-9][0-9]\/[0-9][0-9]\/[0-9]{4}\ [0-9][0-9]\:[0-9][0-9]\:[0-9][0-9]\)\"\,\ \"value\"\:\ 
\"pataa\_[0-9]{4}\_[0-1][0-9]\_[0-3][0-9]\"\,\ \"database\_alias\_name\"\:\ \"pataa\"\}\]\}\} diff --git a/test-data/patnt.out b/test-data/patnt.out new file mode 100644 index 00000000..49c88cd8 --- /dev/null +++ b/test-data/patnt.out @@ -0,0 +1 @@ +\{\"data\_tables\"\:\ \{\"blastdb\"\:\ \[\{\"path\"\:\ \"patnt\/patnt\_[0-9]{4}\_[0-1][0-9]\_[0-3][0-9]\"\,\ \"name\"\:\ \"Nucleotide\ sequences\ derived\ from\ the\ Patent\ division\ of\ GenBank\ \([0-9][0-9]\/[0-9][0-9]\/[0-9][0-9][0-9][0-9]\ [0-9][0-9]\:[0-9][0-9]\:[0-9][0-9]\)\"\,\ \"value\"\:\ \"patnt\_[0-9]{4}\_[0-1][0-9]\_[0-3][0-9]\"\,\ \"database\_alias\_name\"\:\ \"patnt\"\}\]\}\} diff --git a/tool-data/tool_data_table_conf.xml.sample b/tool-data/tool_data_table_conf.xml.sample index f69062ff..a7af4ff4 100644 --- a/tool-data/tool_data_table_conf.xml.sample +++ b/tool-data/tool_data_table_conf.xml.sample @@ -3,4 +3,12 @@ value, name, path + + value, name, path + +
+ + value, name, path + +
diff --git a/tools/ncbi_blast_plus/ncbi_macros.xml b/tools/ncbi_blast_plus/ncbi_macros.xml index e70849b1..8a391cfa 100644 --- a/tools/ncbi_blast_plus/ncbi_macros.xml +++ b/tools/ncbi_blast_plus/ncbi_macros.xml @@ -174,11 +174,7 @@ - - - - - + @@ -204,11 +200,7 @@ - - - - - + @@ -233,11 +225,7 @@ - - - - - + @@ -257,20 +245,12 @@ - - - - - + - - - - - +