diff --git a/.gitignore b/.gitignore index e69de29..c0c219b 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,124 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/README.md.save b/README.md.save deleted file mode 100644 index f8fd21c..0000000 --- a/README.md.save +++ /dev/null @@ -1,3 +0,0 @@ - -#### Scaden -Single-cell assisted deconvolutional network diff --git a/build/lib/scaden/__init__.py b/build/lib/scaden/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/build/lib/scaden/model/__init__.py b/build/lib/scaden/model/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/build/lib/scaden/model/architectures.py b/build/lib/scaden/model/architectures.py deleted file mode 100644 index b67b8cd..0000000 --- a/build/lib/scaden/model/architectures.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -This file contains the three architectures used for the model ensemble in scaden -The architectures are stored in the dict 'architectures', each element of which is a tuple that consists -of the hidden_unit sizes (element 0) and the respective dropout rates (element 1) -""" - -architectures = {'m256': ([256, 128, 64, 32], [0, 0, 0, 0]), - 'm512': ([512, 256, 128, 64], [0, 0.3, 0.2, 0.1]), - 'm1024': ([1024, 512, 256, 128], [0, 0.6, 0.3, 0.1])} - diff --git a/build/lib/scaden/model/functions.py b/build/lib/scaden/model/functions.py deleted file mode 100644 index 400083a..0000000 --- a/build/lib/scaden/model/functions.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Functions used for the scaden model -""" -import collections -import scanpy.api as sc -import numpy as np -import tensorflow as tf -from sklearn import preprocessing as pp -import pandas as pd - - - - -def dummy_labels(m, labels): - """ - Create dummy labels 
needed for building the graph correctly - :param m: - :param labels: - :return: - """ - n_l = len(labels) - return np.zeros((m, n_l), dtype="float32") - -def sample_scaling(x, scaling_option): - """ - Apply scaling of data - :param x: - :param scaling_option: - :return: - """ - - if scaling_option == "log_min_max": - # Bring in log space - x = np.log2(x + 1) - - # Normalize data - mms = pp.MinMaxScaler(feature_range=(0, 1), copy=True) - - # it scales features so transpose is needed - x = mms.fit_transform(x.T).T - - return x - - - -def preprocess_h5ad_data(raw_input_path, processed_path, scaling_option="log_min_max", sig_genes=None): - """ - Preprocess raw input data for the model - :param raw_input_path: - :param scaling_option: - :param group_small: - :param signature_genes: - :return: - """ - print("Pre-processing raw data ...") - raw_input = sc.read_h5ad(raw_input_path) - - print("Subsetting genes ...") - # Select features go use - raw_input = raw_input[:, sig_genes] - - print("Scaling using " + str(scaling_option)) - # Scaling - raw_input.X = sample_scaling(raw_input.X, scaling_option) - - print("Writing to disk ...") - # Write processed data to disk - raw_input.write(processed_path) - print("Data pre-processing done.") - - -def get_signature_genes(input_path, sig_genes_complete, var_cutoff=0.1): - """ - Get overlap between signature genes and available genes - :param input_path: - :param sig_genes_complete: - :return: new sig_genes - """ - data = pd.read_table(input_path, index_col=0) - keep = data.var(axis=1) > var_cutoff - data = data.loc[keep] - available_genes = list(data.index) - new_sig_genes = list(set(available_genes).intersection(sig_genes_complete)) - n_sig_genes = len(new_sig_genes) - print(f"Found {n_sig_genes} common genes.") - return new_sig_genes - - - - diff --git a/build/lib/scaden/model/scaden.py b/build/lib/scaden/model/scaden.py deleted file mode 100644 index 21b2086..0000000 --- a/build/lib/scaden/model/scaden.py +++ /dev/null @@ -1,330 +0,0 @@ -""" -Cell Deconvolutional Network (scaden) class -""" -import os -import tensorflow as tf -import numpy as np -import pandas as pd -import scanpy.api as sc -import collections -from .functions import dummy_labels, sample_scaling -from tqdm import tqdm - -class Scaden(object): - """ - scaden class - """ - - def __init__(self, sess, model_dir, model_name, batch_size=128, learning_rate=0.0001, num_steps=1000): - self.sess=sess - self.model_dir=model_dir - self.batch_size=batch_size - self.model_name=model_name - self.beta1=0.9 - self.beta2=0.999 - self.learning_rate=learning_rate - self.data=None - self.n_classes=None - self.labels=None - self.x=None - self.y=None - self.num_steps=num_steps - self.scaling="log_min_max" - self.sig_genes=None - self.sample_names=None - self.hidden_units = [256, 128, 64, 32] - self.do_rates = [0, 0, 0, 0] - - - def model_fn(self, X, n_classes, reuse=False): - """ - Model function - :param params: - :param mode: - :return: - """ - activation = tf.nn.relu - with tf.variable_scope("scaden_model", reuse=reuse): - layer1 = tf.layers.dense(X, units=self.hidden_units[0], activation=activation , name="dense1") - do1 = tf.layers.dropout(layer1, rate=self.do_rates[0], training=self.training_mode) - layer2 = tf.layers.dense(do1, units=self.hidden_units[1], activation=activation , name="dense2") - do2 = tf.layers.dropout(layer2, rate=self.do_rates[1], training=self.training_mode) - layer3 = tf.layers.dense(do2, units=self.hidden_units[2], activation=activation , name="dense3") - do3 = 
tf.layers.dropout(layer3, rate=self.do_rates[2], training=self.training_mode) - layer4 = tf.layers.dense(do3, units=self.hidden_units[3], activation=activation , name="dense4") - do4 = tf.layers.dropout(layer4, rate=self.do_rates[3], training=self.training_mode) - logits = tf.layers.dense(do4, units=n_classes, activation=tf.nn.softmax, name="logits_layer") - - return logits - - def compute_loss(self, logits, targets): - """ - Compute L1 loss - :param logits: - :param targets: - :return: L1 loss - """ - loss = tf.reduce_mean(np.abs(logits - targets)) - return loss - - def compute_accuracy(self, logits, targets, pct_cut=0.05): - """ - Compute prediction accuracy - :param targets: - :param pct_cut: - :return: - """ - equality = tf.less_equal(np.abs(np.subtract(logits, targets)), pct_cut) - accuracy = tf.reduce_mean(tf.cast(equality, tf.float32)) - return accuracy - - def correlation_coefficient(self, logits, targets): - """ - Calculate the pearson correlation coefficient - :param logits: - :param targets: - :return: - """ - mx = tf.reduce_mean(logits) - my = tf.reduce_mean(targets) - xm, ym = logits-mx, targets-my - r_num = tf.reduce_sum(tf.multiply(xm, ym)) - r_den = tf.sqrt(tf.multiply(tf.reduce_sum(tf.square(xm)), tf.reduce_sum(tf.square(ym)))) - r = tf.divide(r_num, r_den) - r = tf.maximum(tf.minimum(r, 1.0), -1.0) - return r - - def visualization(self, logits, targets, classes): - """ - Create evaluation metrics - :param targets: - :param classes: - :return: - """ - # add evaluation metrics - rmse = tf.metrics.root_mean_squared_error(logits, targets)[1] - pcor = self.correlation_coefficient(logits, targets) - eval_metrics = {"rmse": rmse, "pcor": pcor} - - for i in range(logits.shape[1]): - eval_metrics["mre_" + str(classes[i])] = tf.metrics.mean_relative_error( - targets[:, i], - logits[:, i], - targets[:, i])[0] - eval_metrics["mae_" + str(classes[i])] = tf.metrics.mean_absolute_error( - targets[:, i], - logits[:, i], - targets[:, i])[0] - eval_metrics["pcor_" + str(classes[i])] = self.correlation_coefficient(targets[:, i],logits[:, i]) - - - eval_metrics["mre_total"] = tf.metrics.mean_relative_error(targets, - logits, - targets)[1] - - eval_metrics["mae_total"] = tf.metrics.mean_relative_error(targets, - logits, - targets)[1] - - eval_metrics["accuracy01"] = self.compute_accuracy(logits, targets, pct_cut=0.01) - eval_metrics["accuracy05"] = self.compute_accuracy(logits, targets, pct_cut=0.05) - eval_metrics["accuracy1"] = self.compute_accuracy(logits, targets, pct_cut=0.1) - - # Create summary scalars - for key, value in eval_metrics.items(): - tf.summary.scalar(key, value) - - tf.summary.scalar('loss', self.loss) - - merged_summary_op = tf.summary.merge_all() - - return merged_summary_op - - def load_h5ad_file(self, input_path, batch_size, datasets=[]): - """ - Load input data from a h5ad file and divide into training and test set - :param input_path: path to h5ad file - :param batch_size: batch size to use for training - :param datasets: a list of datasets to extract from the file - :return: Dataset object - """ - raw_input = sc.read_h5ad(input_path) - - # Subset dataset - if len(datasets) > 0: - all_ds = collections.Counter(raw_input.obs['ds']) - for ds in all_ds: - if ds not in datasets: - raw_input = raw_input[raw_input.obs['ds'] != ds].copy() - - - # Create training dataset - ratios = [raw_input.obs[ctype] for ctype in raw_input.uns['cell_types']] - self.x_data = raw_input.X.astype(np.float32) - self.y_data = np.array(ratios, dtype=np.float32).transpose() - # create placeholders 
- self.x_data_ph = tf.placeholder(self.x_data.dtype, self.x_data.shape, name="x_data_ph") - self.y_data_ph = tf.placeholder(self.y_data.dtype, self.y_data.shape, name="y_data_ph") - self.data = tf.data.Dataset.from_tensor_slices((self.x_data_ph, self.y_data_ph)) - self.data = self.data.shuffle(1000).repeat().batch(batch_size=batch_size) - - # Extract celltype and feature info - self.labels = raw_input.uns['cell_types'] - self.sig_genes = list(raw_input.var_names) - - def load_prediction_file(self, input_path, sig_genes, labels, scaling=None): - """ - Load a file to perform prediction on it - :param input_path: path to input file - :param sig_genes: the signature genes to use - :param scaling: which scaling to perform - :return: Dataset object - """ - # Load data - data = pd.read_table(input_path, sep="\t", index_col=0) - sample_names = list(data.columns) - data = data.loc[sig_genes] - self.x_data = data.T - self.x_data = self.x_data.astype(np.float32) - m = self.x_data.shape[0] - self.y_dummy = dummy_labels(m, labels) - # Scaling - if scaling: - self.x_data = sample_scaling(self.x_data, scaling_option=scaling) - - # Create Dataset object from placeholders - self.x_data_ph = tf.placeholder(self.x_data.dtype, self.x_data.shape, name="x_data_ph") - self.y_data_ph = tf.placeholder(self.y_dummy.dtype, self.y_dummy.shape, name="y_data_ph") - self.data = tf.data.Dataset.from_tensor_slices((self.x_data_ph, self.y_data_ph)) - self.data = self.data.batch(batch_size=m) - - return sample_names - - def build_model(self, input_path, train_datasets, mode="train"): - """ - Build the model graph - :param reuse: - :return: - """ - self.global_step = tf.Variable(0, name='global_step', trainable=False) - - # Load data - if mode=="train": - self.load_h5ad_file(input_path=input_path, batch_size=self.batch_size, datasets=train_datasets) - - if mode=="predict": - self.sample_names = self.load_prediction_file(input_path=input_path, sig_genes=self.sig_genes, - labels=self.labels, scaling=self.scaling) - - # Make iterator - iter = tf.data.Iterator.from_structure(self.data.output_types, self.data.output_shapes) - next_element = iter.get_next() - self.data_init_op = iter.make_initializer(self.data) - self.x, self.y = next_element - self.x = tf.cast(self.x, tf.float32) - - self.n_classes = len(self.labels) - - # Placeholder for training mode - self.training_mode = tf.placeholder_with_default(True, shape=()) - - # Model - self.logits = self.model_fn(X=self.x, n_classes=self.n_classes) - - - if mode == "train": - # Loss - self.loss = self.compute_loss(self.logits, self.y) - # Summary scalars - self.merged_summary_op = self.visualization(tf.cast(self.logits, tf.float32), targets=tf.cast(self.y, tf.float32), classes=self.labels) - learning_rate = self.learning_rate - # Optimizer - self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss, global_step=self.global_step) - - - def train(self, input_path, train_datasets): - """ - Train the model - :param num_steps: - :return: - """ - # Build model graph - self.build_model(input_path=input_path, train_datasets=train_datasets, mode="train") - - # Init variables - self.sess.run(tf.global_variables_initializer()) - self.sess.run(tf.local_variables_initializer()) - self.saver = tf.train.Saver() - model = os.path.join(self.model_dir, self.model_name) - self.writer = tf.summary.FileWriter(model, self.sess.graph) - self.eval_writer = tf.summary.FileWriter(os.path.join(self.model_dir, "eval"), self.sess.graph) - - # Initialize datasets - 
self.sess.run(self.data_init_op, feed_dict={self.x_data_ph: self.x_data, self.y_data_ph: self.y_data}) - - - # Load pre-trained weights if avaialble - self.load_weights(self.model_dir) - - # Training loop - pbar = tqdm(range(self.num_steps)) - for _ in pbar: - _, loss, summary = self.sess.run([self.optimizer, self.loss, self.merged_summary_op]) - self.writer.add_summary(summary, tf.train.global_step(self.sess, self.global_step)) - description = "Step: " + str(tf.train.global_step(self.sess, self.global_step)) + ", Loss: {:4.3f}".format( - loss) - pbar.set_description(desc=description) - - # Save the trained model - self.saver.save(self.sess, model, global_step=self.global_step) - # Save features and celltypes - pd.DataFrame(self.labels).to_csv(self.model_dir + "/celltypes.txt", sep="\t") - pd.DataFrame(self.sig_genes).to_csv(self.model_dir + "/genes.txt", sep="\t") - - - def predict(self, input_path, out_name="cdn_predictions.txt"): - """ - Perform prediction with a pre-trained model - :param out_dir: path to store results in - :param training_data: the dataset used for training - :return: - """ - # Load signature genes and celltype labels - sig_genes = pd.read_table(self.model_dir + "/genes.txt", index_col=0) - self.sig_genes = list(sig_genes['0']) - labels = pd.read_table(self.model_dir + "/celltypes.txt", index_col=0) - self.labels = list(labels['0']) - - # Build model graph - self.build_model(input_path=input_path, train_datasets=[], mode="predict") - - # Initialize variables - self.sess.run(tf.global_variables_initializer()) - self.sess.run(tf.local_variables_initializer()) - - self.saver = tf.train.Saver() - - model = os.path.join(self.model_dir, self.model_name) - self.writer = tf.summary.FileWriter(model, self.sess.graph) - - # Initialize datasets - self.sess.run(self.data_init_op, feed_dict={self.x_data_ph: self.x_data, self.y_data_ph: self.y_dummy}) - - # Load pre-trained weights if avaialble - self.load_weights(self.model_dir) - - predictions = self.sess.run([self.logits], feed_dict={self.training_mode: False}) - pred_df = pd.DataFrame(predictions[0], columns=self.labels, index=self.sample_names) - #pred_df.to_csv(out_name, sep="\t") - return pred_df - - def load_weights(self, model_dir): - """ - Load pre-trained weights if available - :param model_dir: - :return: - """ - ckpt = tf.train.get_checkpoint_state(model_dir) - if ckpt: - self.saver.restore(self.sess, ckpt.model_checkpoint_path) - print("Model parameters restored successfully") \ No newline at end of file diff --git a/build/lib/scaden/preprocessing/__init__.py b/build/lib/scaden/preprocessing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/build/lib/scaden/preprocessing/bulk_simulation.py b/build/lib/scaden/preprocessing/bulk_simulation.py deleted file mode 100644 index 887e96d..0000000 --- a/build/lib/scaden/preprocessing/bulk_simulation.py +++ /dev/null @@ -1,308 +0,0 @@ -######################################################################### -## Simulation of artificial bulk RNA-seq datasets from scRNA-seq data # -######################################################################## - -import pandas as pd -import numpy as np -import glob -import os -import argparse -import gc - -def create_fractions(no_celltypes): - """ - Create random fractions - :param no_celltypes: number of fractions to create - :return: list of random fracs of length no_cellttypes - """ - fracs = np.random.rand(no_celltypes) - fracs_sum = np.sum(fracs) - fracs = np.divide(fracs, fracs_sum) - return fracs - - -def 
create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=False): - """ - Generate artifical bulk subsample with random fractions of celltypes - If sparse is set to true, add random celltypes to the missing celltypes - :param cells: scRNA-seq expression matrix - :param labels: celltype labels of single cell data (same order) - :param sample_size: number of cells to use - :param celltypes: all celltypes available in all datasets - :param available_celltypes: celltypes available in currently processed dataset - :param sparse: whether to create a sparse datasets (missing celltypes) - :return: expression of artificial subsample along with celltype fractions - """ - - if sparse: - no_keep = np.random.randint(1, len(available_celltypes)) - keep = np.random.choice(list(range(len(available_celltypes))), size=no_keep, replace=False) - available_celltypes = [available_celltypes[i] for i in keep] - - no_avail_cts = len(available_celltypes) - - # Create fractions for available celltypes - fracs = create_fractions(no_celltypes=no_avail_cts) - samp_fracs = np.multiply(fracs, sample_size) - samp_fracs = list(map(int, samp_fracs)) - - # Make complete fracions - fracs_complete = [0] * len(celltypes) - for i,act in enumerate(available_celltypes): - idx = celltypes.index(act) - fracs_complete[idx] = fracs[i] - - artificial_samples = [] - for i in range(no_avail_cts): - ct = available_celltypes[i] - cells_sub = x.loc[np.array(y['Celltype'] == ct),:] - cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i]) - cells_sub = cells_sub.iloc[cells_fraction, :] - artificial_samples.append(cells_sub) - - df_samp = pd.concat(artificial_samples, axis=0) - df_samp = df_samp.sum(axis=0) - - return (df_samp, fracs_complete) - - -def create_subsample_dataset(x, y, sample_size, celltypes, no_samples): - """ - Generate many artifial bulk samples with known fractions - This function will create normal and sparse samples (no_samples) - :param cells: - :param labels: - :param sample_size: - :param no_celltypes: - :param no_samples: - :return: dataset and corresponding labels - """ - X = [] - Y = [] - - available_celltypes = list(set(y['Celltype'].tolist())) - - # Create normal samples - for i in range(no_samples): - sample, label = create_subsample(x, y, sample_size, celltypes, available_celltypes) - X.append(sample) - Y.append(label) - if i % 100 == 0: - print(i) - - # Create sparse samples - n_sparse = int(no_samples) - #n_sparse = 0 - for i in range(n_sparse): - sample, label = create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=True) - X.append(sample) - Y.append(label) - if i % 100 == 0: - print(i) - X = pd.concat(X, axis=1).T - Y = pd.DataFrame(Y, columns=celltypes) - - # Shuffle - #X, Y = shuffle_dataset(X, Y) - - return (X, Y) - -def filter_for_celltypes(x, y, celltypes): - """ - Filter data for cells belonging to specified celltypes - :param x: - :param y: - :param celltypes: - :return: - """ - cts = list(y['Celltype']) - keep = [elem in celltypes for elem in cts] - x = x.loc[keep, :] - y = y.loc[keep, :] - return (x, y) - - - -def shuffle_dataset(x, y): - """ - Shuffle dataset while keeping x and y in synch - :param x: - :param y: - :return: - """ - idx = np.random.permutation(x.index) - x_shuff = x.reindex(idx) - y_shuff = y.reindex(idx) - return (x_shuff, y_shuff) - -def filter_matrix_signature(mat, genes): - """ - Filter expression matrix using given genes - Accounts for the possibility that some genes might not be in the matrix, for these genes - a column 
with zeros is added to the matrix - :param mat: - :param genes: - :return: filtered matrix - """ - n_cells = mat.shape[0] - avail_genes = mat.columns - filt_genes = [g for g in genes if g in avail_genes] - missing_genes = [g for g in genes if g not in avail_genes] - mat = mat[filt_genes] - for mg in missing_genes: - mat[mg] = np.zeros(n_cells) - mat = mat[genes] - return mat - -def load_dataset(name, dir): - """ - Load a dataset given its name and the directory - :param name: name of the dataset - :param dir: directory containing the data - :param sig_genes: the signature genes for filtering - :return: X, Y - """ - print("Loading " + name + " dataset ...") - x = pd.read_table(dir + name + "_norm_counts_all.txt", index_col=0) - y = pd.read_table(dir + name + "_celltypes.txt") - return (x, y) - -def merge_unkown_celltypes(y, unknown_celltypes): - """ - Merge all unknown celltypes together - :param x: - :param y: - :param unknown_celltypes: - :return: - """ - celltypes = list(y['Celltype']) - new_celltypes = ["Unknown" if x in unknown_celltypes else x for x in celltypes] - y['Celltype'] = new_celltypes - return y - -def collect_celltypes(ys): - """ - Collect all available celltypes given all dataset labels - :param ys: list of dataset labels - :return: list of available celltypes - """ - ct_list = [list(set(y['Celltype'].tolist())) for y in ys] - celltypes = set() - for ct in ct_list: - celltypes = celltypes.union(set(ct)) - celltypes = list(celltypes) - return celltypes - -def get_common_genes(xs, type='intersection'): - """ - Get common genes for all matrices xs - Can either be the union or the intersection (default) of all genes - :param xs: cell x gene matrices - :return: list of common genes - """ - genes = [] - for x in xs: - genes.append(list(x.columns)) - - genes = [set(g) for g in genes] - com_genes = genes[0] - if type=='union': - for gi in range(1, len(genes)): - com_genes = com_genes.union(genes[gi]) - elif type=='intersection': - for gi in range(1, len(genes)): - com_genes = com_genes.intersection(genes[gi]) - - else: - exit('Wrong type selected to get common genes. Exiting.') - - if len(com_genes) == 0: - exit("No common genes found. 
Exiting.") - - return list(com_genes) - -def generate_signature(x, y): - """ - Generate signature of matrix using celltypes y - :param x: expression matrix - :param y: celltypes - :return: mean-expression per celltype - """ - - signature_matrix = [] - celltypes = list(set(y['Celltype'])) - for ct in celltypes: - ct_exp = x.loc[np.array(y['Celltype'] == ct), :] - ct_exp = ct_exp.mean(axis=0) - signature_matrix.append(ct_exp) - - signature_matrix = pd.concat(signature_matrix, axis=1) - signature_matrix.columns = celltypes - return signature_matrix - - -""" -Main Section -""" - -parser = argparse.ArgumentParser() -parser.add_argument("--cells", type=int, help="Number of cells to use for each sample.", default=100) -parser.add_argument("--samples", "-n", type=int, help="Total number of samples to create for each dataset.", default=8000) -parser.add_argument("--data", type=str, help="Directory containg the datsets", default="/home/kevin/deepcell_project/datasets/PBMCs/processed_data") -parser.add_argument("--out", type=str, help="Output directory", default="./") -parser.add_argument("--pattern", type=str, help="File pattern to use for getting the datasets (default: *_norm_counts_all.txt", default="*_norm_counts_all.txt") -parser.add_argument("--unknown", type=str, help="All cell types to group into unknown", nargs='+', default=['Unknown', 'Unkown', 'Neutrophil', 'Dendritic']) -args = parser.parse_args() - -# Parameters -sample_size = args.cells -num_samples = int(args.samples / 2) # divide by two so half is sparse and half is normal samples -data_path = args.data -out_dir = args.out -pattern = args.pattern -unknown_celltypes = args.unknown - - -# List available datasets -files = glob.glob(data_path + pattern) -files = [os.path.basename(x) for x in files] -datasets = [x.split("_")[0] for x in files] -print("Datasets: " + str(datasets)) - -# Load datasets -xs, ys = [], [] -for i, n in enumerate(datasets): - x, y = load_dataset(n, data_path) - xs.append(x) - ys.append(y) - -# Get common gene list -all_genes = get_common_genes(xs, type='intersection') -print("No. 
of common genes: " + str(len(all_genes))) -xs = [filter_matrix_signature(m, all_genes) for m in xs] - -# Merge unknown celltypes -print("Merging unknown cell types: " + str(unknown_celltypes)) -for i in range(len(ys)): - ys[i] = merge_unkown_celltypes(ys[i], unknown_celltypes) - -# Collect all available celltypes -celltypes = collect_celltypes(ys) -print("Available celltypes: " + str(celltypes)) -pd.DataFrame(celltypes).to_csv(out_dir + "celltypes.txt", sep="\t") - -# Create signature matrices (for use with Cibersort) -sig_mats = [] -for i in range(len(xs)): - sm = generate_signature(xs[i], ys[i]) - sig_mats.append(sm) - -# Create datasets -for i in range(len(xs)): - print("Subsampling " + datasets[i] + "...") - tmpx, tmpy = create_subsample_dataset(xs[i], ys[i], sample_size, celltypes, num_samples) - tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", sep="\t", index=False) - tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False) - gc.collect() - -print("Finished!") diff --git a/build/lib/scaden/preprocessing/create_h5ad_file.py b/build/lib/scaden/preprocessing/create_h5ad_file.py deleted file mode 100644 index 9f1d2c5..0000000 --- a/build/lib/scaden/preprocessing/create_h5ad_file.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Combine artificial bulk datasets and optionally other datasets into h5ad files -usable for scaden training - -When using additional datasets, they should be in similar format and best have the same output cell types. -""" - -import argparse -import anndata -import glob -import os -import pandas as pd -import numpy as np - -""" -Functions -""" - -def parse_data(x_path, y_path): - """ - Parse data and labels and divide them into training and testset - :param x_path: - :param y_path: - :return: training and test data and labels - """ - # Load the data - x = pd.read_table(x_path, sep="\t") - y = pd.read_table(y_path, sep="\t") - labels = list(y.columns) - - # Transform Y to numpy array and split in train and testset - yseries = [] - - for i in range(y.shape[0]): - yseries.append(list(y.iloc[i])) - y = np.array(yseries) - - return x, y, labels - -def load_celltypes(brain_training_dir): - - celltypes_path = os.path.join(brain_training_dir, "celltypes.txt") - celltypes = pd.read_csv(celltypes_path, sep="\t") - celltypes = list(celltypes.iloc[:, 1]) - return celltypes - -def sort_celltypes(ratios, labels, ref_labels): - """ - Bring ratios in correct order of cell types - :param ratios: - :param labels: - :param ref_labels: - :return: - """ - idx = [labels.index(x) for x in ref_labels] - ratios = ratios[:, idx] - return ratios - -""" -Main Section -""" - -# Argument parsing -parser = argparse.ArgumentParser() -parser.add_argument("--data", type=str, help="Directory containg the datsets", - default="/home/kevin/deepcell_project/datasets/PBMCs/training/pbmc_top100_3000_400/") -parser.add_argument("--out", type=str, help="Output path and name. Must end with h5ad", default="dataset.h5ad") -parser.add_argument("--pattern", type=str, help="Pattern to use for file collection (default: *_samples.txt", default="*_samples.txt") -parser.add_argument("--unknown", type=str, help="Cells to mark as unknown in the h5ad file. 
Can be used for gouping.", - default=["Unknown"]) -args = parser.parse_args() - -data_dir = args.data -out_path = args.out -pattern = args.pattern -unknown = args.unknown - -# List available datasets -files = glob.glob(data_dir + pattern) -files = [os.path.basename(x) for x in files] -datasets = [x.split("_")[0] for x in files] - -# get celltypes -celltypes = load_celltypes(data_dir) -print("Celltypes: " + str(celltypes)) - -adata = [] -me_dict = {} - -# Create adata datasets for each -for i, train_file in enumerate(datasets): - - rna_file = os.path.join(data_dir, train_file + "_samples.txt") - ratios_file = os.path.join(data_dir, train_file + "_labels.txt") - - x, y, labels = parse_data(rna_file, ratios_file) - # sort y - y = sort_celltypes(y, labels, celltypes) - test = [labels.index(x) for x in celltypes] - labels = [labels[i] for i in test] - - x = x.sort_index(axis=1) - ratios = pd.DataFrame(y, columns=celltypes) - ratios['ds'] = pd.Series(np.repeat(train_file, y.shape[0]), - index=ratios.index) - - - print("Processing " + str(train_file)) - adata.append(anndata.AnnData(X=x.as_matrix(), - obs=ratios, - var=pd.DataFrame(columns=[], index=list(x)))) -import gc -for i in range(1, len(adata)): - print("Concatenating " + str(i)) - adata[0] = adata[0].concatenate(adata[1]) - del adata[1] - gc.collect() - print(len(adata)) -adata = adata[0] - - -# add cell types and signature genes -adata.uns['cell_types'] = celltypes -adata.uns['unknown'] = unknown - -# save data -adata.write(out_path) diff --git a/build/lib/scaden/scaden_main.py b/build/lib/scaden/scaden_main.py deleted file mode 100644 index 16a2307..0000000 --- a/build/lib/scaden/scaden_main.py +++ /dev/null @@ -1,164 +0,0 @@ -""" -scaden Main functionality - -Contains code to -- process a training datasets -- train a model -- perform predictions - -""" - -# Imports -import tensorflow as tf -import scanpy.api as sc -from scaden.model.architectures import architectures -from scaden.model.scaden import Scaden -from scaden.model.functions import * - -""" -PARAMETERS -""" -# ==========================================# - -# Extract architectures -M256_HIDDEN_UNITS = architectures['m256'][0] -M512_HIDDEN_UNITS = architectures['m512'][0] -M1024_HIDDEN_UNITS = architectures['m1024'][0] -M256_DO_RATES = architectures['m256'][1] -M512_DO_RATES = architectures['m512'][1] -M1024_DO_RATES = architectures['m1024'][1] - - -# ==========================================# - - -def training(data_path, train_datasets, model_dir, batch_size, learning_rate, num_steps): - """ - Perform training of three a scaden model ensemble consisting of three different models - :param model_dir: - :param batch_size: - :param learning_rate: - :param num_steps: - :return: - """ - # Convert training datasets - if train_datasets == '': - train_datasets = [] - else: - train_datasets = train_datasets.split() - print("Training on: " + str(train_datasets)) - - - # M256 model training - print("Training M256 Model ...") - tf.reset_default_graph() - with tf.Session() as sess: - cdn256 = Scaden(sess=sess, - model_dir=model_dir+"/m256", - model_name='m256', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps) - cdn256.hidden_units = M256_HIDDEN_UNITS - cdn256.do_rates = M256_DO_RATES - cdn256.train(input_path=data_path, train_datasets=train_datasets) - - # Training of mid model - print("Training M512 Model ...") - tf.reset_default_graph() - with tf.Session() as sess: - cdn512 = Scaden(sess=sess, - model_dir=model_dir+"/m512", - model_name='m512', - 
batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps) - cdn512.hidden_units = M512_HIDDEN_UNITS - cdn512.do_rates = M512_DO_RATES - cdn512.train(input_path=data_path, train_datasets=train_datasets) - - # Training of large model - print("Training M1024 Model ...") - tf.reset_default_graph() - with tf.Session() as sess: - cdn1024 = Scaden(sess=sess, - model_dir=model_dir+"/m1024", - model_name='m1024', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps) - cdn1024.hidden_units = M1024_HIDDEN_UNITS - cdn1024.do_rates = M1024_DO_RATES - cdn1024.train(input_path=data_path, train_datasets=train_datasets) - - print("Training finished.") - - -def prediction(model_dir, data_path, out_name): - """ - Perform prediction using a trained scaden ensemble - :param model_dir: the directory containing the models - :param data_path: the path to the gene expression file - :param out_name: name of the output prediction file - :return: - """ - - # Small model predictions - tf.reset_default_graph() - with tf.Session() as sess: - cdn256 = Scaden(sess=sess, - model_dir=model_dir + "/m256", - model_name='m256') - cdn256.hidden_units = M256_HIDDEN_UNITS - cdn256.do_rates = M256_DO_RATES - - # Predict ratios - preds_256 = cdn256.predict(input_path=data_path, out_name='cdn_predictions_m256.txt') - - - # Mid model predictions - tf.reset_default_graph() - with tf.Session() as sess: - cdn512 = Scaden(sess=sess, - model_dir=model_dir+"/m512", - model_name='m512') - cdn512.hidden_units = M512_HIDDEN_UNITS - cdn512.do_rates = M512_DO_RATES - - # Predict ratios - preds_512 = cdn512.predict(input_path=data_path, out_name='cdn_predictions_m512.txt') - - # Large model predictions - tf.reset_default_graph() - with tf.Session() as sess: - cdn1024 = Scaden(sess=sess, - model_dir=model_dir+"/m1024", - model_name='m1024') - cdn1024.hidden_units = M1024_HIDDEN_UNITS - cdn1024.do_rates = M1024_DO_RATES - - # Predict ratios - preds_1024 = cdn1024.predict(input_path=data_path, out_name='cdn_predictions_m1024.txt') - - # Average predictions - preds = (preds_256 + preds_512 + preds_1024) / 3 - preds.to_csv(out_name, sep="\t") - - -def processing(data_path, training_data, processed_path): - """ - Process a training dataset to contain only the genes also available in the prediction data - :param data_path: path to prediction data - :param training_data: path to training data (h5ad file) - :param processed_path: name of processed file - :return: - """ - # Get the common genes (signature genes) - raw_input = sc.read_h5ad(training_data) - sig_genes_complete = list(raw_input.var_names) - sig_genes = get_signature_genes(input_path=data_path, sig_genes_complete=sig_genes_complete) - - # Pre-process data with new signature genes - preprocess_h5ad_data(raw_input_path=training_data, - processed_path=processed_path, - sig_genes=sig_genes) diff --git a/dist/scaden-0.9.0-py3-none-any.whl b/dist/scaden-0.9.0-py3-none-any.whl deleted file mode 100644 index 397e498..0000000 Binary files a/dist/scaden-0.9.0-py3-none-any.whl and /dev/null differ diff --git a/dist/scaden-0.9.0.tar.gz b/dist/scaden-0.9.0.tar.gz deleted file mode 100644 index 5db5912..0000000 Binary files a/dist/scaden-0.9.0.tar.gz and /dev/null differ diff --git a/scaden.egg-info/PKG-INFO b/scaden.egg-info/PKG-INFO deleted file mode 100644 index 487396d..0000000 --- a/scaden.egg-info/PKG-INFO +++ /dev/null @@ -1,39 +0,0 @@ -Metadata-Version: 2.1 -Name: scaden -Version: 0.9.0 -Summary: Cell type deconvolution using single cell data -Home-page: 
https://github.com/KevinMenden/scaden -Author: Kevin Menden -Author-email: kevin.menden@t-online.de -License: MIT License - -Copyright (c) 2019 Kevin Menden - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -Description: ![Scaden](docs/img/scaden_logo.png) - - ## Single-cell assisted deconvolutional network - - Scaden is a deep-learning based algorithm for cell type deconvolution of bulk RNA-seq samples. It was developed - at the DZNE Tübingen and the ZMNH in Hamburg. Please check out the [documentation](https://scaden.readthedocs.io) for further information. - A pre-print describing the method is available at Biorxiv: [Deep-learning based cell composition analysis from tissue expression profiles](https://www.biorxiv.org/content/10.1101/659227v1) - -Keywords: bioinformatics,deep learning,machine learning,single cell sequencing,deconvolution -Platform: UNKNOWN -Description-Content-Type: text/markdown diff --git a/scaden.egg-info/SOURCES.txt b/scaden.egg-info/SOURCES.txt deleted file mode 100644 index 9461653..0000000 --- a/scaden.egg-info/SOURCES.txt +++ /dev/null @@ -1,19 +0,0 @@ -LICENSE -MANIFEST.in -README.md -setup.py -scaden/__init__.py -scaden/scaden_main.py -scaden.egg-info/PKG-INFO -scaden.egg-info/SOURCES.txt -scaden.egg-info/dependency_links.txt -scaden.egg-info/requires.txt -scaden.egg-info/top_level.txt -scaden/model/__init__.py -scaden/model/architectures.py -scaden/model/functions.py -scaden/model/scaden.py -scaden/preprocessing/__init__.py -scaden/preprocessing/bulk_simulation.py -scaden/preprocessing/create_h5ad_file.py -scripts/scaden \ No newline at end of file diff --git a/scaden.egg-info/dependency_links.txt b/scaden.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/scaden.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/scaden.egg-info/requires.txt b/scaden.egg-info/requires.txt deleted file mode 100644 index 55b9b49..0000000 --- a/scaden.egg-info/requires.txt +++ /dev/null @@ -1,10 +0,0 @@ -pandas==0.21 -numpy==1.14.5 -scikit-learn -scipy -seaborn -tensorflow==1.10.0 -matplotlib -scanpy==1.2.2 -tqdm -click diff --git a/scaden.egg-info/top_level.txt b/scaden.egg-info/top_level.txt deleted file mode 100644 index 0475acd..0000000 --- a/scaden.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -scaden diff --git a/scaden/__main__.py b/scaden/__main__.py new file mode 100644 index 0000000..ba555b0 --- /dev/null +++ b/scaden/__main__.py @@ -0,0 +1,5 @@ +from .cli import main + +if __name__ == '__main__': + main() + diff --git 
a/scaden/__pycache__/__init__.cpython-36.pyc b/scaden/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 23d0d5d..0000000 Binary files a/scaden/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/scaden/__pycache__/scaden_main.cpython-36.pyc b/scaden/__pycache__/scaden_main.cpython-36.pyc deleted file mode 100644 index da6154d..0000000 Binary files a/scaden/__pycache__/scaden_main.cpython-36.pyc and /dev/null differ diff --git a/build/scripts-3.6/scaden b/scaden/cli.py old mode 100755 new mode 100644 similarity index 88% rename from build/scripts-3.6/scaden rename to scaden/cli.py index 4e1e74d..2851522 --- a/build/scripts-3.6/scaden +++ b/scaden/cli.py @@ -1,5 +1,3 @@ -#!python - """ cdn Command Line Interface author: Kevin Menden, DZNE Tübingen @@ -64,7 +62,7 @@ def train(data_path, train_datasets, model_dir, batch_size, learning_rate, steps """ -Predictin mode +Prediction mode """ @cli.command() @click.argument( @@ -80,7 +78,7 @@ def train(data_path, train_datasets, model_dir, batch_size, learning_rate, steps ) @click.option( '--outname', - default = "cdn_predictions.txt", + default = "scaden_predictions.txt", help = 'Name of predictions file.' ) def predict(data_path, model_dir, outname): @@ -118,8 +116,15 @@ def process(data_path, prediction_data, processed_path): training_data=data_path, processed_path=processed_path) +def main(): + text = """ + ____ _ + / ___| ___ __ _ __| | ___ _ __ + \___ \ / __/ _` |/ _` |/ _ \ '_ \ + ___) | (_| (_| | (_| | __/ | | | + |____/ \___\__,_|\__,_|\___|_| |_| -if __name__ == '__main__': - - + """ + click.echo(click.style(text, fg='blue')) cli() + diff --git a/scaden/model/__pycache__/__init__.cpython-36.pyc b/scaden/model/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 22dab72..0000000 Binary files a/scaden/model/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/scaden/model/__pycache__/architectures.cpython-36.pyc b/scaden/model/__pycache__/architectures.cpython-36.pyc deleted file mode 100644 index 005176c..0000000 Binary files a/scaden/model/__pycache__/architectures.cpython-36.pyc and /dev/null differ diff --git a/scaden/model/__pycache__/functions.cpython-36.pyc b/scaden/model/__pycache__/functions.cpython-36.pyc deleted file mode 100644 index 64e04f7..0000000 Binary files a/scaden/model/__pycache__/functions.cpython-36.pyc and /dev/null differ diff --git a/scaden/model/__pycache__/scaden.cpython-36.pyc b/scaden/model/__pycache__/scaden.cpython-36.pyc deleted file mode 100644 index 8692c3f..0000000 Binary files a/scaden/model/__pycache__/scaden.cpython-36.pyc and /dev/null differ diff --git a/scaden/scaden_main.py b/scaden/scaden_main.py index 9c4ce42..ac89aa7 100644 --- a/scaden/scaden_main.py +++ b/scaden/scaden_main.py @@ -62,6 +62,7 @@ def training(data_path, train_datasets, model_dir, batch_size, learning_rate, nu cdn256.hidden_units = M256_HIDDEN_UNITS cdn256.do_rates = M256_DO_RATES cdn256.train(input_path=data_path, train_datasets=train_datasets) + del cdn256 # Training of mid model print("Training M512 Model ...") @@ -76,6 +77,7 @@ def training(data_path, train_datasets, model_dir, batch_size, learning_rate, nu cdn512.hidden_units = M512_HIDDEN_UNITS cdn512.do_rates = M512_DO_RATES cdn512.train(input_path=data_path, train_datasets=train_datasets) + del cdn512 # Training of large model print("Training M1024 Model ...") @@ -90,6 +92,7 @@ def training(data_path, train_datasets, model_dir, batch_size, learning_rate, nu cdn1024.hidden_units = M1024_HIDDEN_UNITS 
cdn1024.do_rates = M1024_DO_RATES cdn1024.train(input_path=data_path, train_datasets=train_datasets) + del cdn1024 print("Training finished.") diff --git a/scripts/scaden b/scripts/scaden index 8d76e4e..54b3faa 100755 --- a/scripts/scaden +++ b/scripts/scaden @@ -7,127 +7,8 @@ author: Kevin Menden, DZNE Tübingen This is the main file for executing the cdn program. """ -# imports -import click -import scaden -from scaden.scaden_main import training, prediction, processing - -@click.group() -@click.version_option('1.0.0') -def cli(): - pass - - -""" -Training mode -""" -@cli.command() -@click.argument( - 'data_path', - type = click.Path(exists=True), - required = True, - metavar = '' -) -@click.option( - '--train_datasets', - default = '', - help = 'Datasets used for training. Uses all by default.' -) -@click.option( - '--model_dir', - default = "./", - help = 'Path to store the model in' -) -@click.option( - '--batch_size', - default = 128, - help = 'Batch size to use for training. Default: 128' -) -@click.option( - '--learning_rate', - default = 0.0001, - help = 'Learning rate used for training. Default: 0.0001' -) -@click.option( - '--steps', - default = 20000, - help = 'Number of training steps' -) -def train(data_path, train_datasets, model_dir, batch_size, learning_rate, steps): - """ Train a cdn model """ - training(data_path=data_path, - train_datasets=train_datasets, - model_dir=model_dir, - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=steps) - - -""" -Predictin mode -""" -@cli.command() -@click.argument( - 'data_path', - type = click.Path(exists=True), - required = True, - metavar = '' -) -@click.option( - '--model_dir', - default = "./", - help = 'Path to trained model' -) -@click.option( - '--outname', - default = "scaden_predictions.txt", - help = 'Name of predictions file.' -) -def predict(data_path, model_dir, outname): - """ cdn prediction using a trained model """ - prediction(model_dir=model_dir, - data_path=data_path, - out_name=outname) - - - -""" -Processing mode -""" -@cli.command() -@click.argument( - 'data_path', - type = click.Path(exists=True), - required = True, - metavar = '' -) -@click.argument( - 'prediction_data', - type = click.Path(exists=True), - required = True, - metavar = '' -) -@click.option( - '--processed_path', - default = "processed.h5ad", - help = 'Path of processed file. Must end with .h5ad' -) -def process(data_path, prediction_data, processed_path): - """ Process a dataset for training """ - processing(data_path=prediction_data, - training_data=data_path, - processed_path=processed_path) - +from scaden.cli import main if __name__ == '__main__': + main() - text = """ - ____ _ - / ___| ___ __ _ __| | ___ _ __ - \___ \ / __/ _` |/ _` |/ _ \ '_ \ - ___) | (_| (_| | (_| | __/ | | | - |____/ \___\__,_|\__,_|\___|_| |_| - - """ - click.echo(click.style(text, fg='blue')) - cli() diff --git a/setup.py b/setup.py index c6fb3de..c3109ce 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages -version = '0.9.1' +version = '0.9.2' with open("README.md", "r") as fh: long_description = fh.read()
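
The hunks above consolidate the command-line logic into scaden/cli.py, add a scaden/__main__.py shim so the same main() entry point backs both the scripts/scaden wrapper and `python -m scaden`, and free each ensemble model after training. For orientation, here is a minimal sketch of driving the underlying functions directly from Python, using the training/prediction/processing signatures shown in the scaden_main.py hunks; the file paths are hypothetical placeholders and the numeric values simply mirror the defaults exposed by the CLI options, so treat this as an illustration rather than the project's documented API.

    # Minimal sketch: calling the refactored Scaden functions directly instead of
    # going through the click CLI. Paths are hypothetical placeholders; numbers
    # mirror the CLI option defaults in scaden/cli.py.
    from scaden.scaden_main import training, prediction, processing

    # Restrict the training data to genes present in the bulk data to be deconvolved
    processing(data_path="bulk_data.txt",
               training_data="pbmc_training.h5ad",
               processed_path="processed.h5ad")

    # Train the m256/m512/m1024 ensemble on the processed training set
    training(data_path="processed.h5ad",
             train_datasets="",          # empty string means: use all datasets
             model_dir="./model",
             batch_size=128,
             learning_rate=0.0001,
             num_steps=20000)

    # Predict cell-type fractions; the three per-model predictions are averaged
    prediction(model_dir="./model",
               data_path="bulk_data.txt",
               out_name="scaden_predictions.txt")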