Skip to content

Commit

Permalink
motif extraction with new schema
Browse files Browse the repository at this point in the history
  • Loading branch information
thrakar9 committed Dec 20, 2017
1 parent 197a264 commit bb5d547
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 79 deletions.
20 changes: 13 additions & 7 deletions CoMET.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def extract_motifs(x_data, conv_net, handle):
custom_fun = K.function([conv_net.input], [conv_scores])
# Start visualizations
motif_extraction(custom_fun, x_data, conv_layer.filters, conv_layer.kernel_size[0], handle, depth,
data_dir=FLAGS.data_dir, filetype=FLAGS.motifs_filetype)
filetype=FLAGS.motifs_filetype)


def save_experiment(conv_net):
Expand Down Expand Up @@ -117,11 +117,13 @@ def binary(x_data, y_data, handle):
validation_split=FLAGS.validation_split,
callbacks=callbacks)

fkey = save_experiment(conv_net)
model_key = save_experiment(conv_net)

# Extract the motifs from the convolutional layers
if FLAGS.motifs:
extract_motifs(x_data, conv_net, fkey)
dataset_key = FLAGS.infile.split('/')[-1].split('.')[0]
output_folder = os.path.join(FLAGS.data_dir, 'motifs', dataset_key, model_key)
extract_motifs(x_data, conv_net, output_folder)


def family(x_data, y_data, handle):
Expand Down Expand Up @@ -160,11 +162,13 @@ def family(x_data, y_data, handle):
validation_split=FLAGS.validation_split,
callbacks=callbacks)

fkey = save_experiment(conv_net)
model_key = save_experiment(conv_net)

# Extract the motifs from the convolutional layers
if FLAGS.motifs:
extract_motifs(x_data, conv_net, fkey)
dataset_key = FLAGS.infile.split('/')[-1].split('.')[0]
output_folder = os.path.join(FLAGS.data_dir, 'motifs', dataset_key, model_key)
extract_motifs(x_data, conv_net, output_folder)


def unsupervised(x_data, handle):
Expand Down Expand Up @@ -198,11 +202,13 @@ def unsupervised(x_data, handle):
validation_split=FLAGS.validation_split,
callbacks=callbacks)

fkey = save_experiment(conv_net)
model_key = save_experiment(conv_net)

# Extract the motifs from the convolutional layers
if FLAGS.motifs:
extract_motifs(x_data, conv_net, fkey)
dataset_key = FLAGS.infile.split('/')[-1].split('.')[0]
output_folder = os.path.join(FLAGS.data_dir, 'motifs', dataset_key, model_key)
extract_motifs(x_data, conv_net, output_folder)


def main():
Expand Down
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ This script works with any type of trained CoMET model, and produces the motif-e
```shell
python scripts/generate_embeddings.py --infile /path/to/dataset.tsv --model_file=/path/to/model.model [--output_file output_filename]
```
The output is saved at `embeddings/{model_ID}/dataset/{output_file}.npz`.
The output is saved at `embeddings/dataset/{model_ID}/{output_file}.npz`.

### Extract motifs from protein sequences
This script works with any type of trained CoMET model, and extracts sequence motifs from a set of input protein sequences, by looking at the receptive fields of the convolutional neurons.
Expand All @@ -74,16 +74,16 @@ python scripts/extract_motifs.py --infile /path/to/dataset.tsv --model_file=/pat

The generated output file structure is the following:

* {output_dir}/motifs/
* 1: The motifs of the first (closest to the input) convolutional layer.
* XX_YY.png: The motif extracted from neuron at position XX, from YY number of protein sequences.
* XX_YY.txt: The YY sequence patterns that activated the neuron XX in order to generate the motif.
* 2, 3, ...: The position of the next convolutional layers in the same structure as above.
* `{output_dir}/motifs/dataset/{model_ID}/`
* `1/`: The motifs of the first (closest to the input) convolutional layer.
* `XX_YY.png`: The motif extracted from neuron at position XX, from YY number of protein sequences.
* `XX_YY.txt`: The YY sequence patterns that activated the neuron XX in order to generate the motif.
* `2/`, `3/`, ...: The position of the next convolutional layers in the same structure as above.

### Search for homologous protein sequences
This script works with CoHST trained models, and scans a set of input protein sequences to identify sequence homologs to the protein dataset that was used as positive when training the model.

```shell
python scripts/search_for_homologs.py --infile /path/to/dataset --model_file=/path/to/model.model [--output_file output_filename]
```
The output is saved at `homologs/{model_ID}/dataset/{output_file}.npz`.
The output is saved at `homologs/dataset/{model_ID}/{output_file}.npz`.
164 changes: 101 additions & 63 deletions scripts/extract_motifs.py
Original file line number Diff line number Diff line change
@@ -1,64 +1,102 @@
#!/usr/bin/env python
# TODO: rewrite this
#
# import argparse
#
# import h5py
# import keras.backend as K
# from keras.models import model_from_json
#
# from evolutron.engine import Model as DeepTrainer
# from evolutron.extra_layers import custom_layers
# from evolutron.motifs import motif_extraction
# from evolutron.tools import Handle, load_dataset
#
#
# # Check if package is installed, else fallback to developer mode imports
#
#
# def main(filename, data_id):
# # First load model architecture
# hf = h5py.File(filename)
# model_config = hf.attrs['model_config'].decode('utf8')
# hf.close()
#
# net = DeepTrainer(model_from_json(model_config, custom_objects=custom_layers))
#
# # Then load model parameters
# net.load_all_param_values(filename)
#
# handle = Handle.from_filename(filename)
#
# if data_id == 'model':
# data_id = handle.dataset
#
# x_data, y_data = load_dataset(data_id, padded=True, max_aa=net.input._keras_shape[1])
#
# conv_layers = net.get_conv_layers()
#
# for depth, conv_layer in enumerate(conv_layers):
# conv_scores = conv_layer.output
# # Compile function that spits out the outputs of the correct convolutional layer
# boolean_mask = K.any(K.not_equal(net.input, 0.0), axis=-1, keepdims=True)
# conv_scores = conv_scores * K.cast(boolean_mask, K.floatx())
#
# custom_fun = K.function([net.input], [conv_scores])
# # Start visualizations
# motif_extraction(custom_fun, x_data, conv_layer.filters,
# conv_layer.kernel_size[0], handle, depth)
#
#
# if __name__ == '__main__':
# parser = argparse.ArgumentParser(description='Network visualization module.')
# parser.add_argument("model", help='Path to the file')
#
# parser.add_argument("-d", "--dataset", type=str, default='model',
# help='Dataset on which the motifs will be generated upon. Write "model" to infer' \
# 'automatically from model.')
#
# args = parser.parse_args()
#
# kwargs = {'filename': args.model,
# 'data_id': args.dataset}
#
# main(**kwargs)
# coding=utf-8
import math
import os
import sys

import keras.backend as K
import pandas as pd
from absl import flags
from keras.utils import Sequence

from evolutron.engine import load_model
from evolutron.extra_layers import custom_layers
from evolutron.motifs import motif_extraction
from evolutron.tools import preprocess_dataset

flags.DEFINE_string('infile', '', 'The input file. THe supported format currently is a TSV output from UniProt.', )
flags.DEFINE_string('model_file', '', 'The model file for the model to use for predictions.')
flags.DEFINE_string("output_dir", os.path.expandvars(os.curdir), 'The directory to store CoMET output data.')
flags.DEFINE_enum("motifs_filetype", 'txt+png', ['png', 'pdf', 'txt', 'txt+pdf', 'txt+png'],
'Choose between different file types to save the extracted motifs from CoMET.'
'A typical workflow for subsequent analysis would be to extract the motifs as text files (txt) and'
'then use the tool sites2meme to transform them to MEME format and submit for search in MAST.')

FLAGS = flags.FLAGS

try:
FLAGS(sys.argv)
except flags.Error as e:
print(e)
print(FLAGS)
sys.exit(1)


class UniProtSequence(Sequence):

def __init__(self, dataframe, batch_size, max_dim):
self.x = dataframe
self.batch_size = batch_size
self.max_dim = max_dim

def __len__(self):
return math.ceil(len(self.x) / self.batch_size)

def __getitem__(self, idx):
batch_x = self.x.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
return self.process_batch(batch_x)

def process_batch(self, batch):
return preprocess_dataset(batch.Sequence, padded=True, max_aa=self.max_dim, min_aa=self.max_dim)


def extract_motifs(model, proteins, handle):
# TODO: refactor motif extraction
# data_gen = UniProtSequence(proteins, 100, model.input_shape[1])

for depth, conv_layer in enumerate(model.get_conv_layers()):
conv_scores = conv_layer.output
# Compile function that spits out the outputs of the correct convolutional layer
boolean_mask = K.any(K.not_equal(model.input, 0.0), axis=-1, keepdims=True)
conv_scores = conv_scores * K.cast(boolean_mask, K.floatx())

# motif_extractor = Model([model.input], [conv_scores])
custom_fun = K.function([model.input], [conv_scores])

x_data = preprocess_dataset(proteins.Sequence, padded=True, max_aa=model.input_shape[1],
min_aa=model.input_shape[1])

# Start motif extraction
motif_extraction(custom_fun, x_data, conv_layer.filters, conv_layer.kernel_size[0], handle, depth,
filetype=FLAGS.motifs_filetype)


def main():
try:
model = load_model(FLAGS.model_file, custom_objects=custom_layers)
except Exception as e:
print('Unable to load model.')
print(e)
sys.exit(1)

model_key = FLAGS.model_file.split('/')[-1].split('.')[0]

try:
proteins = pd.read_csv(FLAGS.infile, sep='\t')
except Exception as e:
print('Unable to read input protein dataset.')
print(e)
sys.exit(1)

dataset_key = FLAGS.infile.split('/')[-1].split('.')[0]

output_folder = os.path.join(FLAGS.output_dir, 'motifs', dataset_key, model_key)

if not os.path.isdir(output_folder):
os.makedirs(output_folder)

extract_motifs(model, proteins, output_folder)


if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion scripts/generate_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def main():
if '.npz' not in FLAGS.output_file:
FLAGS.output_file += '.npz'

output_folder = os.path.join(FLAGS.data_dir, 'embeddings', model_key, dataset_key)
output_folder = os.path.join(FLAGS.data_dir, 'embeddings', dataset_key, model_key)
if not os.path.isdir(output_folder):
os.makedirs(output_folder)

Expand Down
2 changes: 1 addition & 1 deletion scripts/search_for_homologs.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def main():
if '.csv' not in FLAGS.output_file:
FLAGS.output_file += '.csv'

output_folder = os.path.join(FLAGS.data_dir, 'homologs', model_key, dataset_key)
output_folder = os.path.join(FLAGS.data_dir, 'homologs', dataset_key, model_key)

if not os.path.isdir(output_folder):
os.makedirs(output_folder)
Expand Down

0 comments on commit bb5d547

Please sign in to comment.