diff --git a/get_descriptions.py b/get_descriptions.py new file mode 100644 index 0000000..0596cc9 --- /dev/null +++ b/get_descriptions.py @@ -0,0 +1,391 @@ +import sqlite3 +import subprocess +import os + +SQL_TO_SQLITE_AWK = r""" +# Authors: @esperlu, @artemyk, @gkuenning, @dumblob + +# FIXME detect empty input file and issue a warning + +function printerr( s ){ print s | "cat >&2" } + +BEGIN { + # if( ARGC != 2 ){ + # printerr( \ + # "USAGE:\n"\ + # " mysql2sqlite dump_mysql.sql > dump_sqlite3.sql\n" \ + # " OR\n" \ + # " mysql2sqlite dump_mysql.sql | sqlite3 sqlite.db\n" \ + # "\n" \ + # "NOTES:\n" \ + # " Dash in filename is not supported, because dash (-) means stdin." ) + # no_END = 1 + # exit 1 + # } + + # Find INT_MAX supported by both this AWK (usually an ISO C signed int) + # and SQlite. + # On non-8bit-based architectures, the additional bits are safely ignored. + + # 8bit (lower precision should not exist) + s="127" + # "63" + 0 avoids potential parser misbehavior + if( (s + 0) "" == s ){ INT_MAX_HALF = "63" + 0 } + # 16bit + s="32767" + if( (s + 0) "" == s ){ INT_MAX_HALF = "16383" + 0 } + # 32bit + s="2147483647" + if( (s + 0) "" == s ){ INT_MAX_HALF = "1073741823" + 0 } + # 64bit (as INTEGER in SQlite3) + s="9223372036854775807" + if( (s + 0) "" == s ){ INT_MAX_HALF = "4611686018427387904" + 0 } +# # 128bit +# s="170141183460469231731687303715884105728" +# if( (s + 0) "" == s ){ INT_MAX_HALF = "85070591730234615865843651857942052864" + 0 } +# # 256bit +# s="57896044618658097711785492504343953926634992332820282019728792003956564819968" +# if( (s + 0) "" == s ){ INT_MAX_HALF = "28948022309329048855892746252171976963317496166410141009864396001978282409984" + 0 } +# # 512bit +# s="6703903964971298549787012499102923063739682910296196688861780721860882015036773488400937149083451713845015929093243025426876941405973284973216824503042048" +# if( (s + 0) "" == s ){ INT_MAX_HALF = 
"3351951982485649274893506249551461531869841455148098344430890360930441007518386744200468574541725856922507964546621512713438470702986642486608412251521024" + 0 } +# # 1024bit +# s="89884656743115795386465259539451236680898848947115328636715040578866337902750481566354238661203768010560056939935696678829394884407208311246423715319737062188883946712432742638151109800623047059726541476042502884419075341171231440736956555270413618581675255342293149119973622969239858152417678164812112068608" +# if( (s + 0) "" == s ){ INT_MAX_HALF = "44942328371557897693232629769725618340449424473557664318357520289433168951375240783177119330601884005280028469967848339414697442203604155623211857659868531094441973356216371319075554900311523529863270738021251442209537670585615720368478277635206809290837627671146574559986811484619929076208839082406056034304" + 0 } +# # higher precision probably not needed + + FS=",$" + print "PRAGMA synchronous = OFF;" + print "PRAGMA journal_mode = MEMORY;" + print "BEGIN TRANSACTION;" +} + +# historically 3 spaces separate non-argument local variables +function bit_to_int( str_bit, powtwo, i, res, bit, overflow ){ + powtwo = 1 + overflow = 0 + # 011101 = 1*2^0 + 0*2^1 + 1*2^2 ... + for( i = length( str_bit ); i > 0; --i ){ + bit = substr( str_bit, i, 1 ) + if( overflow || ( bit == 1 && res > INT_MAX_HALF ) ){ + printerr( \ + NR ": WARN Bit field overflow, number truncated (LSBs saved, MSBs ignored)." ) + break + } + res = res + bit * powtwo + # no warning here as it might be the last iteration + if( powtwo > INT_MAX_HALF ){ overflow = 1; continue } + powtwo = powtwo * 2 + } + return res +} + +# CREATE TRIGGER statements have funny commenting. Remember we are in trigger. 
+/^\/\*.*(CREATE.*TRIGGER|create.*trigger)/ { + gsub( /^.*(TRIGGER|trigger)/, "CREATE TRIGGER" ) + print + inTrigger = 1 + next +} +# The end of CREATE TRIGGER has a stray comment terminator +/(END|end) \*\/;;/ { gsub( /\*\//, "" ); print; inTrigger = 0; next } +# The rest of triggers just get passed through +inTrigger != 0 { print; next } + +# CREATE VIEW looks like a TABLE in comments +/^\/\*.*(CREATE.*TABLE|create.*table)/ { + inView = 1 + next +} +# end of CREATE VIEW +/^(\).*(ENGINE|engine).*\*\/;)/ { + inView = 0 + next +} +# content of CREATE VIEW +inView != 0 { next } + +# skip comments +/^\/\*/ { next } + +# skip PARTITION statements +/^ *[(]?(PARTITION|partition) +[^ ]+/ { next } + +# print all INSERT lines +( /^ *\(/ && /\) *[,;] *$/ ) || /^(INSERT|insert|REPLACE|replace)/ { + prev = "" + + # first replace \\ by \_ that mysqldump never generates to deal with + # sequnces like \\n that should be translated into \n, not \. + # After we convert all escapes we replace \_ by backslashes. + gsub( /\\\\/, "\\_" ) + + # single quotes are escaped by another single quote + gsub( /\\'/, "''" ) + gsub( /\\n/, "\n" ) + gsub( /\\r/, "\r" ) + gsub( /\\"/, "\"" ) + gsub( /\\\032/, "\032" ) # substitute char + + gsub( /\\_/, "\\" ) + + # sqlite3 is limited to 16 significant digits of precision + while( match( $0, /0x[0-9a-fA-F]{17}/ ) ){ + hexIssue = 1 + sub( /0x[0-9a-fA-F]+/, substr( $0, RSTART, RLENGTH-1 ), $0 ) + } + if( hexIssue ){ + printerr( \ + NR ": WARN Hex number trimmed (length longer than 16 chars)." ) + hexIssue = 0 + } + print + next +} + +# CREATE DATABASE is not supported +/^(CREATE DATABASE|create database)/ { next } + +# print the CREATE line as is and capture the table name +/^(CREATE|create)/ { + if( $0 ~ /IF NOT EXISTS|if not exists/ || $0 ~ /TEMPORARY|temporary/ ){ + caseIssue = 1 + printerr( \ + NR ": WARN Potential case sensitivity issues with table/column naming\n" \ + " (see INFO at the end)." 
) + } + if( match( $0, /`[^`]+/ ) ){ + tableName = substr( $0, RSTART+1, RLENGTH-1 ) + } + aInc = 0 + prev = "" + firstInTable = 1 + print + next +} + +# Replace `FULLTEXT KEY` (probably other `XXXXX KEY`) +/^ (FULLTEXT KEY|fulltext key)/ { gsub( /[A-Za-z ]+(KEY|key)/, " KEY" ) } + +# Get rid of field lengths in KEY lines +/ (PRIMARY |primary )?(KEY|key)/ { gsub( /\([0-9]+\)/, "" ) } + +aInc == 1 && /PRIMARY KEY|primary key/ { next } + +# Replace COLLATE xxx_xxxx_xx statements with COLLATE BINARY +/ (COLLATE|collate) [a-z0-9_]*/ { gsub( /(COLLATE|collate) [a-z0-9_]*/, "COLLATE BINARY" ) } + +# Print all fields definition lines except the `KEY` lines. +/^ / && !/^( (KEY|key)|\);)/ { + if( match( $0, /[^"`]AUTO_INCREMENT|auto_increment[^"`]/) ){ + aInc = 1 + gsub( /AUTO_INCREMENT|auto_increment/, "PRIMARY KEY AUTOINCREMENT" ) + } + gsub( /(UNIQUE KEY|unique key) (`.*`|".*") /, "UNIQUE " ) + gsub( /(CHARACTER SET|character set) [^ ]+[ ,]/, "" ) + # FIXME + # CREATE TRIGGER [UpdateLastTime] + # AFTER UPDATE + # ON Package + # FOR EACH ROW + # BEGIN + # UPDATE Package SET LastUpdate = CURRENT_TIMESTAMP WHERE ActionId = old.ActionId; + # END + gsub( /(ON|on) (UPDATE|update) (CURRENT_TIMESTAMP|current_timestamp)(\(\))?/, "" ) + gsub( /(DEFAULT|default) (CURRENT_TIMESTAMP|current_timestamp)(\(\))?/, "DEFAULT current_timestamp") + gsub( /(COLLATE|collate) [^ ]+ /, "" ) + gsub( /(ENUM|enum)[^)]+\)/, "text " ) + gsub( /(SET|set)\([^)]+\)/, "text " ) + gsub( /UNSIGNED|unsigned/, "" ) + gsub( /_utf8mb3/, "" ) + gsub( /` [^ ]*(INT|int|BIT|bit)[^ ]*/, "` integer" ) + gsub( /" [^ ]*(INT|int|BIT|bit)[^ ]*/, "\" integer" ) + ere_bit_field = "[bB]'[10]+'" + if( match($0, ere_bit_field) ){ + sub( ere_bit_field, bit_to_int( substr( $0, RSTART +2, RLENGTH -2 -1 ) ) ) + } + + # remove USING BTREE and other suffixes for USING, for example: "UNIQUE KEY + # `hostname_domain` (`hostname`,`domain`) USING BTREE," + gsub( / USING [^, ]+/, "" ) + + # field comments are not supported + gsub( / 
(COMMENT|comment).+$/, "" ) + # Get commas off end of line + gsub( /,.?$/, "" ) + if( prev ){ + if( firstInTable ){ + print prev + firstInTable = 0 + } + else { + print "," prev + } + } + else { + # FIXME check if this is correct in all cases + if( match( $1, + /(CONSTRAINT|constraint) ["].*["] (FOREIGN KEY|foreign key)/ ) ){ + print "," + } + } + prev = $1 +} + +/ ENGINE| engine/ { + if( prev ){ + if( firstInTable ){ + print prev + firstInTable = 0 + } + else { + print "," prev + } + } + prev="" + print ");" + next +} +# `KEY` lines are extracted from the `CREATE` block and stored in array for later print +# in a separate `CREATE KEY` command. The index name is prefixed by the table name to +# avoid a sqlite error for duplicate index name. +/^( (KEY|key)|\);)/ { + if( prev ){ + if( firstInTable ){ + print prev + firstInTable = 0 + } + else { + print "," prev + } + } + prev = "" + if( $0 == ");" ){ + print + } + else { + if( match( $0, /`[^`]+/ ) ){ + indexName = substr( $0, RSTART+1, RLENGTH-1 ) + } + if( match( $0, /\([^()]+/ ) ){ + indexKey = substr( $0, RSTART+1, RLENGTH-1 ) + } + # idx_ prefix to avoid name clashes (they really happen!) + key[tableName] = key[tableName] "CREATE INDEX \"idx_" \ + tableName "_" indexName "\" ON \"" tableName "\" (" indexKey ");\n" + } +} + +END { + if( no_END ){ exit 1} + # print all KEY creation lines. + for( table in key ){ printf key[table] } + + print "END TRANSACTION;" + + if( caseIssue ){ + printerr( \ + "INFO Pure sqlite identifiers are case insensitive (even if quoted\n" \ + " or if ASCII) and doesnt cross-check TABLE and TEMPORARY TABLE\n" \ + " identifiers. 
def execute_command(command):
    """Run ``command`` through the shell and yield its stdout line by line.

    Every yielded line has its surrounding whitespace (including the trailing
    newline) stripped.  Once stdout is exhausted, whatever the command wrote
    to stderr is printed, followed by a notice if the exit status is non-zero.

    Args:
        command: Shell command line.  It is executed with ``shell=True``, so
            the caller is responsible for quoting its arguments.

    Yields:
        str: One stripped line of the command's standard output.
    """
    import tempfile

    # stderr goes to an anonymous temporary file rather than a pipe: nothing
    # reads stderr while stdout is being streamed, so a pipe would fill up and
    # block the child as soon as it had emitted enough warnings.
    with tempfile.TemporaryFile(mode="w+") as stderr_file:
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=stderr_file,
            universal_newlines=True,
            shell=True,
        ) as process:
            for output in process.stdout:
                yield output.strip()
        # Leaving the Popen context has waited for the child to exit, so the
        # file now holds its complete stderr output.
        stderr_file.seek(0)
        stderr = stderr_file.read()
        returncode = process.returncode

    if stderr:
        print(stderr.strip())
    if returncode:
        print(f"Command exited with status {returncode}")


def _execute_sql_lines(cursor, lines):
    """Execute the SQL statements spread over ``lines``; return the failure count.

    Lines are accumulated (joined with newlines, so multi-line text values
    keep their line breaks) until ``sqlite3.complete_statement`` confirms that
    the buffer ends a statement, which avoids cutting a statement at a ``;``
    that merely ends a line inside a string literal.  Statements that SQLite
    rejects are counted and the first few are reported; loading continues.
    """
    max_reported = 5
    pending = []
    failures = 0
    for line in lines:
        stripped = line.strip()
        # Comments and blank lines are only skipped between statements; inside
        # a statement they may be part of a multi-line string literal.
        if not pending and (not stripped or stripped.startswith('--')):
            continue
        pending.append(stripped)
        if not stripped.endswith(';'):
            continue
        statement = "\n".join(pending)
        if not sqlite3.complete_statement(statement):
            continue
        pending = []
        try:
            cursor.execute(statement)
        except (sqlite3.Error, sqlite3.Warning, ValueError) as error:
            # Best effort: one unsupported statement must not abort the load.
            failures += 1
            if failures <= max_reported:
                print(f"Skipped SQL statement ({error}): {statement[:80]!r}")
    if pending:
        failures += 1
        print("Skipped unterminated SQL statement at end of input.")
    return failures


def get_descriptions(data_folder, dump_filename='./BDB-mySQL_All_202406.dmp'):
    """Build a mapping from reactant-set id to assay description.

    The MySQL dump is converted to SQLite syntax by piping it through ``awk``
    with the ``SQL_TO_SQLITE_AWK`` program, loaded into an in-memory SQLite
    database, and then the ``ki_result`` and ``assay`` tables are combined on
    their ``(assayid, entryid)`` pair.

    Args:
        data_folder: Directory that contains the MySQL dump.
        dump_filename: Name of the dump file inside ``data_folder``.

    Returns:
        dict: ``str(reactant_set_id)`` mapped to the description of the
        matching ``assay`` row, or ``None`` when no row with a string
        description matches.

    Raises:
        sqlite3.Error: If the ``ki_result`` or ``assay`` table cannot be
            queried (for example because the dump could not be loaded).
    """
    import shlex

    print("Loading descriptions from MySQL dump....")
    sql_file_path = os.path.join(data_folder, dump_filename)

    # SQL_TO_SQLITE_AWK is already escaped for use inside single quotes; the
    # path is quoted as well so spaces or shell metacharacters in it are safe.
    command = f"awk '{SQL_TO_SQLITE_AWK}' {shlex.quote(sql_file_path)}"

    conn = sqlite3.connect(':memory:')
    try:
        cursor = conn.cursor()

        failures = _execute_sql_lines(cursor, execute_command(command))
        conn.commit()
        if failures:
            print(f"{failures} SQL statement(s) from the dump could not be executed.")

        cursor.execute("select reactant_set_id, assayid, entryid from ki_result;")
        assay_keys = {row[0]: (row[1], row[2]) for row in cursor}

        cursor.execute("select description, assayid, entryid from assay;")
        descriptions = {
            f"{row[1]}-{row[2]}": row[0]
            for row in cursor
            if isinstance(row[0], str)
        }

        final_mappings = {
            str(set_id): descriptions.get(f"{assayid}-{entryid}")
            for set_id, (assayid, entryid) in assay_keys.items()
        }
        cursor.close()
    finally:
        # Release the in-memory database even when loading or querying fails.
        conn.close()

    print("Done loading descriptions from MySQL dump.")
    return final_mappings
# if row_num >= 50000: # break # if row_num % 50000 == 0: # print(row_num) @@ -531,6 +535,8 @@ def load_data(data_folder): yield docs[doc_id] +# type: ignore +# import json # def main(): # from time import time