Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add assay descriptions #3

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
391 changes: 391 additions & 0 deletions get_descriptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,391 @@
import sqlite3
import subprocess
import os

SQL_TO_SQLITE_AWK = r"""
# Authors: @esperlu, @artemyk, @gkuenning, @dumblob

# FIXME detect empty input file and issue a warning

function printerr( s ){ print s | "cat >&2" }

BEGIN {
# if( ARGC != 2 ){
# printerr( \
# "USAGE:\n"\
# " mysql2sqlite dump_mysql.sql > dump_sqlite3.sql\n" \
# " OR\n" \
# " mysql2sqlite dump_mysql.sql | sqlite3 sqlite.db\n" \
# "\n" \
# "NOTES:\n" \
# " Dash in filename is not supported, because dash (-) means stdin." )
# no_END = 1
# exit 1
# }

# Find INT_MAX supported by both this AWK (usually an ISO C signed int)
# and SQlite.
# On non-8bit-based architectures, the additional bits are safely ignored.

# 8bit (lower precision should not exist)
s="127"
# "63" + 0 avoids potential parser misbehavior
if( (s + 0) "" == s ){ INT_MAX_HALF = "63" + 0 }
# 16bit
s="32767"
if( (s + 0) "" == s ){ INT_MAX_HALF = "16383" + 0 }
# 32bit
s="2147483647"
if( (s + 0) "" == s ){ INT_MAX_HALF = "1073741823" + 0 }
# 64bit (as INTEGER in SQlite3)
s="9223372036854775807"
if( (s + 0) "" == s ){ INT_MAX_HALF = "4611686018427387904" + 0 }
# # 128bit
# s="170141183460469231731687303715884105728"
# if( (s + 0) "" == s ){ INT_MAX_HALF = "85070591730234615865843651857942052864" + 0 }
# # 256bit
# s="57896044618658097711785492504343953926634992332820282019728792003956564819968"
# if( (s + 0) "" == s ){ INT_MAX_HALF = "28948022309329048855892746252171976963317496166410141009864396001978282409984" + 0 }
# # 512bit
# s="6703903964971298549787012499102923063739682910296196688861780721860882015036773488400937149083451713845015929093243025426876941405973284973216824503042048"
# if( (s + 0) "" == s ){ INT_MAX_HALF = "3351951982485649274893506249551461531869841455148098344430890360930441007518386744200468574541725856922507964546621512713438470702986642486608412251521024" + 0 }
# # 1024bit
# s="89884656743115795386465259539451236680898848947115328636715040578866337902750481566354238661203768010560056939935696678829394884407208311246423715319737062188883946712432742638151109800623047059726541476042502884419075341171231440736956555270413618581675255342293149119973622969239858152417678164812112068608"
# if( (s + 0) "" == s ){ INT_MAX_HALF = "44942328371557897693232629769725618340449424473557664318357520289433168951375240783177119330601884005280028469967848339414697442203604155623211857659868531094441973356216371319075554900311523529863270738021251442209537670585615720368478277635206809290837627671146574559986811484619929076208839082406056034304" + 0 }
# # higher precision probably not needed

FS=",$"
print "PRAGMA synchronous = OFF;"
print "PRAGMA journal_mode = MEMORY;"
print "BEGIN TRANSACTION;"
}

# historically 3 spaces separate non-argument local variables
function bit_to_int( str_bit, powtwo, i, res, bit, overflow ){
powtwo = 1
overflow = 0
# 011101 = 1*2^0 + 0*2^1 + 1*2^2 ...
for( i = length( str_bit ); i > 0; --i ){
bit = substr( str_bit, i, 1 )
if( overflow || ( bit == 1 && res > INT_MAX_HALF ) ){
printerr( \
NR ": WARN Bit field overflow, number truncated (LSBs saved, MSBs ignored)." )
break
}
res = res + bit * powtwo
# no warning here as it might be the last iteration
if( powtwo > INT_MAX_HALF ){ overflow = 1; continue }
powtwo = powtwo * 2
}
return res
}

# CREATE TRIGGER statements have funny commenting. Remember we are in trigger.
/^\/\*.*(CREATE.*TRIGGER|create.*trigger)/ {
gsub( /^.*(TRIGGER|trigger)/, "CREATE TRIGGER" )
print
inTrigger = 1
next
}
# The end of CREATE TRIGGER has a stray comment terminator
/(END|end) \*\/;;/ { gsub( /\*\//, "" ); print; inTrigger = 0; next }
# The rest of triggers just get passed through
inTrigger != 0 { print; next }

# CREATE VIEW looks like a TABLE in comments
/^\/\*.*(CREATE.*TABLE|create.*table)/ {
inView = 1
next
}
# end of CREATE VIEW
/^(\).*(ENGINE|engine).*\*\/;)/ {
inView = 0
next
}
# content of CREATE VIEW
inView != 0 { next }

# skip comments
/^\/\*/ { next }

# skip PARTITION statements
/^ *[(]?(PARTITION|partition) +[^ ]+/ { next }

# print all INSERT lines
( /^ *\(/ && /\) *[,;] *$/ ) || /^(INSERT|insert|REPLACE|replace)/ {
prev = ""

# first replace \\ by \_ that mysqldump never generates to deal with
# sequnces like \\n that should be translated into \n, not \<LF>.
# After we convert all escapes we replace \_ by backslashes.
gsub( /\\\\/, "\\_" )

# single quotes are escaped by another single quote
gsub( /\\'/, "''" )
gsub( /\\n/, "\n" )
gsub( /\\r/, "\r" )
gsub( /\\"/, "\"" )
gsub( /\\\032/, "\032" ) # substitute char

gsub( /\\_/, "\\" )

# sqlite3 is limited to 16 significant digits of precision
while( match( $0, /0x[0-9a-fA-F]{17}/ ) ){
hexIssue = 1
sub( /0x[0-9a-fA-F]+/, substr( $0, RSTART, RLENGTH-1 ), $0 )
}
if( hexIssue ){
printerr( \
NR ": WARN Hex number trimmed (length longer than 16 chars)." )
hexIssue = 0
}
print
next
}

# CREATE DATABASE is not supported
/^(CREATE DATABASE|create database)/ { next }

# print the CREATE line as is and capture the table name
/^(CREATE|create)/ {
if( $0 ~ /IF NOT EXISTS|if not exists/ || $0 ~ /TEMPORARY|temporary/ ){
caseIssue = 1
printerr( \
NR ": WARN Potential case sensitivity issues with table/column naming\n" \
" (see INFO at the end)." )
}
if( match( $0, /`[^`]+/ ) ){
tableName = substr( $0, RSTART+1, RLENGTH-1 )
}
aInc = 0
prev = ""
firstInTable = 1
print
next
}

# Replace `FULLTEXT KEY` (probably other `XXXXX KEY`)
/^ (FULLTEXT KEY|fulltext key)/ { gsub( /[A-Za-z ]+(KEY|key)/, " KEY" ) }

# Get rid of field lengths in KEY lines
/ (PRIMARY |primary )?(KEY|key)/ { gsub( /\([0-9]+\)/, "" ) }

aInc == 1 && /PRIMARY KEY|primary key/ { next }

# Replace COLLATE xxx_xxxx_xx statements with COLLATE BINARY
/ (COLLATE|collate) [a-z0-9_]*/ { gsub( /(COLLATE|collate) [a-z0-9_]*/, "COLLATE BINARY" ) }

# Print all fields definition lines except the `KEY` lines.
/^ / && !/^( (KEY|key)|\);)/ {
if( match( $0, /[^"`]AUTO_INCREMENT|auto_increment[^"`]/) ){
aInc = 1
gsub( /AUTO_INCREMENT|auto_increment/, "PRIMARY KEY AUTOINCREMENT" )
}
gsub( /(UNIQUE KEY|unique key) (`.*`|".*") /, "UNIQUE " )
gsub( /(CHARACTER SET|character set) [^ ]+[ ,]/, "" )
# FIXME
# CREATE TRIGGER [UpdateLastTime]
# AFTER UPDATE
# ON Package
# FOR EACH ROW
# BEGIN
# UPDATE Package SET LastUpdate = CURRENT_TIMESTAMP WHERE ActionId = old.ActionId;
# END
gsub( /(ON|on) (UPDATE|update) (CURRENT_TIMESTAMP|current_timestamp)(\(\))?/, "" )
gsub( /(DEFAULT|default) (CURRENT_TIMESTAMP|current_timestamp)(\(\))?/, "DEFAULT current_timestamp")
gsub( /(COLLATE|collate) [^ ]+ /, "" )
gsub( /(ENUM|enum)[^)]+\)/, "text " )
gsub( /(SET|set)\([^)]+\)/, "text " )
gsub( /UNSIGNED|unsigned/, "" )
gsub( /_utf8mb3/, "" )
gsub( /` [^ ]*(INT|int|BIT|bit)[^ ]*/, "` integer" )
gsub( /" [^ ]*(INT|int|BIT|bit)[^ ]*/, "\" integer" )
ere_bit_field = "[bB]'[10]+'"
if( match($0, ere_bit_field) ){
sub( ere_bit_field, bit_to_int( substr( $0, RSTART +2, RLENGTH -2 -1 ) ) )
}

# remove USING BTREE and other suffixes for USING, for example: "UNIQUE KEY
# `hostname_domain` (`hostname`,`domain`) USING BTREE,"
gsub( / USING [^, ]+/, "" )

# field comments are not supported
gsub( / (COMMENT|comment).+$/, "" )
# Get commas off end of line
gsub( /,.?$/, "" )
if( prev ){
if( firstInTable ){
print prev
firstInTable = 0
}
else {
print "," prev
}
}
else {
# FIXME check if this is correct in all cases
if( match( $1,
/(CONSTRAINT|constraint) ["].*["] (FOREIGN KEY|foreign key)/ ) ){
print ","
}
}
prev = $1
}

/ ENGINE| engine/ {
if( prev ){
if( firstInTable ){
print prev
firstInTable = 0
}
else {
print "," prev
}
}
prev=""
print ");"
next
}
# `KEY` lines are extracted from the `CREATE` block and stored in array for later print
# in a separate `CREATE KEY` command. The index name is prefixed by the table name to
# avoid a sqlite error for duplicate index name.
/^( (KEY|key)|\);)/ {
if( prev ){
if( firstInTable ){
print prev
firstInTable = 0
}
else {
print "," prev
}
}
prev = ""
if( $0 == ");" ){
print
}
else {
if( match( $0, /`[^`]+/ ) ){
indexName = substr( $0, RSTART+1, RLENGTH-1 )
}
if( match( $0, /\([^()]+/ ) ){
indexKey = substr( $0, RSTART+1, RLENGTH-1 )
}
# idx_ prefix to avoid name clashes (they really happen!)
key[tableName] = key[tableName] "CREATE INDEX \"idx_" \
tableName "_" indexName "\" ON \"" tableName "\" (" indexKey ");\n"
}
}

END {
if( no_END ){ exit 1}
# print all KEY creation lines.
for( table in key ){ printf key[table] }

print "END TRANSACTION;"

if( caseIssue ){
printerr( \
"INFO Pure sqlite identifiers are case insensitive (even if quoted\n" \
" or if ASCII) and doesnt cross-check TABLE and TEMPORARY TABLE\n" \
" identifiers. Thus expect errors like \"table T has no column named F\".")
}
}
""".replace("'", "'\\''")

def execute_command(command):
# Use subprocess.Popen to run the command
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
shell=True
)

# Read stdout line-by-line
while True:
output = process.stdout.readline()
if output == '' and process.poll() is not None:
break
if output:
yield output.strip()

# Capture and print any remaining stderr output
stderr = process.communicate()[1]
if stderr:
print(stderr.strip())


def get_descriptions(data_folder):
print("Loading descriptions from MySQL dump....")
sql_file_path = os.path.join(data_folder, './BDB-mySQL_All_202406.dmp')

# Connect to SQLite (or create the database if it doesn't exist)
conn = sqlite3.connect(':memory:') # You can change 'example.db' to your desired database name

# Create a cursor object
cursor = conn.cursor()

# Read and execute the SQL file line-by-line
sql_command = ""
for line in execute_command(f"awk '{SQL_TO_SQLITE_AWK}' {sql_file_path}"):
# Skip comments and empty lines
if line.strip().startswith('--') or not line.strip():
continue

# Accumulate SQL command
sql_command += line.strip()

# If the line ends with a semicolon, execute the command
if sql_command.endswith(';'):
try:
cursor.execute(sql_command)
sql_command = "" # Reset the command after execution
except Exception as e:
sql_command = "" # Reset the command even if there's an error

# Commit the changes
conn.commit()

# Create a cursor object to execute queries
cursor = conn.cursor()

# Execute the query
query = "select reactant_set_id, assayid, entryid from ki_result;"
cursor.execute(query)

# Fetch all the rows returned by the query
rows = cursor.fetchall()

mappings = {}

# Process the rows
for row in rows:
mappings[row[0]] = (row[1], row[2])

# Execute another query
query = "select description, assayid, entryid from assay;"
cursor.execute(query)

mappings2 = {}

rows = cursor.fetchall()

for row in rows:
if type(row[0]) is str:
mappings2[f"{row[1]}-{row[2]}"] = row[0]

final_mappings = {}
for (k, v) in mappings.items():
try:
final_mappings[str(k)] = mappings2[f"{v[0]}-{v[1]}"]
except KeyError:
final_mappings[str(k)] = None
# Close the cursor and connection
cursor.close()

# Close the connection
conn.close()

print("Done loading descriptions from MySQL dump.")
return final_mappings
Loading