diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4e1ef42..fd627ff 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -9,23 +9,22 @@
on:
jobs:
  deploy:
-    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: '3.x'
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install setuptools wheel twine
-    - name: Build and publish
-      env:
-        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
-      run: |
-        python setup.py sdist bdist_wheel
-        twine upload dist/*
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.x"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel twine
+          pip install setuptools wheel twine
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload dist/*
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 9d3ee2f..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-# Don't use language: python; this gives us an unnecessary virtualenv
-language: c
-
-
-env:
-  - BUILD_TARGET=3.6
-
-install:
-  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-  - bash miniconda.sh -b -p $HOME/miniconda
-  - source "$HOME/miniconda/etc/profile.d/conda.sh"
-  - hash -r
-  - conda config --set always_yes yes --set changeps1 no
-  - conda update -q conda
-  - conda info -a
-  - export LANG=en_US.UTF-8
-  - export COVERAGE_DIR=":$HOME/htmlcov"
-  - printenv | sort
-  # Install conda-build and conda-verify
-  - conda install -q -c defaults -c conda-forge python=$BUILD_TARGET conda-build conda-verify codecov flake8
-
-script:
-  - source "$HOME/miniconda/etc/profile.d/conda.sh"
-  - >
-    conda build \
-    --override-channels -c defaults -c conda-forge -c r -c bioconda \
-    --python $BUILD_TARGET \
-    ./conda.recipe/
-  - >
-    conda create \
-    -n install-environment \
-    --override-channels -c defaults -c conda-forge -c r -c local -c bioconda \
-    python=$BUILD_TARGET \
-    scaden conda codecov coverage
-  - conda activate install-environment && scaden --help
-
-after_success:
-  - codecov
-
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 752aba2..a53be87 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,31 +1,44 @@
# Scaden Changelog
-## v0.9.5
+### Version 1.0.0
+
+* Rebuilt the Scaden model and training to use the TF2 Keras API instead of the old compatibility functions
+* added `scaden example` command, which generates example data for test-running Scaden and inspecting the expected file format
+* added more tests and checks to the input reading function in `scaden simulate`
+* fixed a bug in reading input data
+
+### Version 0.9.6
+
++ fixed Dockerfile (switched to pip installation)
++ added better error messages to `simulate` command
++ cleaned up dependencies
+
+### v0.9.5
* added `--seed` parameter to allow reproducible Scaden runs
* added `scaden simulate` command to perform bulk simulation and training file creation
* changed CLI calling
-## v0.9.4
+### v0.9.4
* fixed dependencies (added python>=3.6 requirement)
-## v0.9.3
+### v0.9.3
* upgrade to tf2
* cleaned up dependencies
-## v0.9.2
+### v0.9.2
* small code refactoring
* RAM usage improvement
-## v0.9.1
+### v0.9.1
* added automatic
removal of duplicate genes * changed name of prediction file -## v0.9.0 +### v0.9.0 Initial release of the Scaden deconvolution package. diff --git a/Dockerfile-dev b/Dockerfile-dev deleted file mode 100644 index 8ed6b79..0000000 --- a/Dockerfile-dev +++ /dev/null @@ -1,49 +0,0 @@ -# BASE_IMAGE is either "nvidia/cuda:10.1-runtime-ubuntu18.04" or "library/ubuntu:18.04" -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -# -# Install Miniconda in /opt/conda -# - -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -ENV PATH /opt/conda/bin:$PATH - -RUN apt-get update --fix-missing && \ - apt-get install -y wget bzip2 ca-certificates curl git && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.7.12.1-Linux-x86_64.sh -O ~/miniconda.sh && \ - /bin/bash ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda clean -tipsy && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate base" >> ~/.bashrc - -ENV LD_LIBRARY_PATH /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/extras/CUPTI/lib64:$LD_LIBRARY_PATH - -COPY . /src -WORKDIR /src - -ARG CPU_OR_GPU - -RUN conda update -n base -c defaults conda \ - && conda install conda-build conda-verify \ - && conda build --override-channels -c defaults -c conda-forge -c bioconda conda.recipe/ \ - && cd / \ - && rm -rf /src \ - && conda install --override-channels -c local -c defaults -c conda-forge -c bioconda scaden \ - && conda remove conda-build conda-verify \ - && conda clean -a - -# Needed for when the docker container uses a non-root user id -RUN mkdir /tmp/numba_cache & chmod 777 /tmp/numba_cache & NUMBA_CACHE_DIR=/tmp/numba_cache - -RUN if [ "x$CPU_OR_GPU" = "xgpu" ]; then \ - conda install tensorflow-gpu && \ - conda clean -a; \ - fi - -WORKDIR / diff --git a/README.md b/README.md index 63d9cce..cdda247 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ ![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) ![Install with Bioconda](https://img.shields.io/badge/Install%20with-conda-green) ![Downloads](https://static.pepy.tech/personalized-badge/scaden?period=total&units=international_system&left_color=blue&right_color=green&left_text=Downloads) +![Docker](https://github.com/kevinmenden/scaden/workflows/Docker/badge.svg) +![Scaden CI](https://github.com/kevinmenden/scaden/workflows/Scaden%20CI/badge.svg) ## Single-cell assisted deconvolutional network @@ -23,29 +25,24 @@ Scaden overview. a) Generation of artificial bulk samples with known cell type c of Scaden model ensemble on simulated training data. c) Scaden ensemble architecture. d) A trained Scaden model can be used to deconvolve complex bulk mixtures. -### 1. System requirements -Scaden was developed and tested on Linux (Ubuntu 16.04 and 18.04). It was not tested on Windows or Mac, but should -also be usable on these systems when installing with Pip or Bioconda. Scaden does not require any special -hardware (e.g. GPU), however we recommend to have at least 16 GB of memory. -Scaden requires Python 3. All package dependencies should be handled automatically when installing with pip or conda. -### 2. Installation guide +## Installation guide Scaden can be easily installed on a Linux system, and should also work on Mac. There are currently two options for installing Scaden, either using [Bioconda](https://bioconda.github.io/) or via [pip](https://pypi.org/). 
-## pip
+### pip
To install Scaden via pip, simply run the following command:
`pip install scaden`
-## Bioconda
+### Bioconda
You can also install Scaden via bioconda, using:
`conda install -c bioconda scaden`
-## GPU
+### GPU
If you want to make use of your GPU, you will have to additionally install `tensorflow-gpu`.
For pip:
@@ -56,7 +53,7 @@
For conda:
`conda install tensorflow-gpu`
-## Docker
+### Docker
If you don't want to install Scaden at all, but rather use a Docker container, we provide that as well. For every release, we provide two versions - one for CPU and one for GPU usage. To pull the CPU container, use this command:
@@ -74,38 +71,54 @@
Additionally, we now provide a web tool:
It contains pre-generated training datasets for several tissues, and all you need to do is to upload your expression data. Please note that this is still in preview.
-### 3. Demo
-We provide several curated [training datasets](https://scaden.readthedocs.io/en/latest/datasets/) for Scaden. For this demo,
-we will use the human PBMC training dataset, which consists of 4 different scRNA-seq datasets and 32,000 samples in total.
-You can download it here:
-[https://figshare.com/s/e59a03885ec4c4d8153f](https://figshare.com/s/e59a03885ec4c4d8153f).
+## Usage
+We provide detailed instructions on how to use Scaden at our [Documentation page](https://scaden.readthedocs.io/en/latest/usage/).
+
+A deconvolution workflow with Scaden consists of four major steps:
+* data simulation
+* data processing
+* training
+* prediction
+
+If training data is already available, you can start at the data processing step. Otherwise you will first have to process scRNA-seq datasets and perform data simulation to generate a training dataset. As an example workflow, you can use the `scaden example` command to generate example data and go through the whole pipeline.
+
+First, make an example data directory and generate the example data:
+```bash
+mkdir example_data
+scaden example --out example_data/
+```
+This generates the files "example_counts.txt", "example_celltypes.txt" and "example_bulk_data.txt" in the "example_data" directory. Next, you can generate training data:
-For this demo, you will also need to download some test samples to perform deconvolution on, along with their associated labels.
-You can download the data we used for the Scaden paper here:
-[https://figshare.com/articles/Publication_Figures/8234030](https://figshare.com/articles/Publication_Figures/8234030)
+```bash
+scaden simulate --data example_data/ -n 100 --pattern "*_counts.txt"
+```
-We'll perform deconvolution on simulated samples from the data6k dataset. You can find the samples and labels in 'paper_data/figures/figure2/data/data6k_500_*'
-once you have downloaded this data from the link mentioned above.
+This generates 100 samples of training data in your current working directory. The file you need for your next step is called "data.h5ad". Now you need to perform the preprocessing using the training data and the bulk data file:
-The first step is to perform preprocessing on the training data. This is done with the following command:
+```bash
+scaden process data.h5ad example_data/example_bulk_data.txt
+```
-`scaden process pbmc_data.h5ad paper_data/figures/figure2/data/data6k_500_samples.txt`
+As a result, you should now have a file called "processed.h5ad" in your directory. Now you can perform training.
The following command performs training for 5000 steps per model and saves the trained weights to the "model" directory, which will be created:
-This will generate a file called 'processed.h5ad', which we will use for training. The training data
-we have downloaded also contains samples from the data6k scRNA-seq dataset, so we have to exclude them from training
-to get a meaningfull test of Scaden's performance. The following command will train a Scaden ensemble for 5000 steps per model (recommended),
-and store it in 'scaden_model'. Data from the data6k dataset will be excluded from training. Depending on your machine, this can take about 10-20 minutes.
+```bash
+scaden train processed.h5ad --steps 5000 --model_dir model
+```
-`scaden train processed.h5ad --steps 5000 --model_dir scaden_model --train_datasets 'data8k donorA donorC'`
+Finally, you can use the trained model to perform prediction:
-Finally, we can perform deconvolution on the 500 simulates samples from the data6k dataset:
+```bash
+scaden predict --model_dir model example_data/example_bulk_data.txt
+```
-`scaden predict paper_data/figures/figure2/data/data6k_500_samples.txt --model_dir scaden_model`
+Now you should have a file called "scaden_predictions.txt" in your working directory, which contains your estimated cell compositions.
-This will create a file named 'cdn_predictions.txt' (will be renamed in future version to 'scaden_predictions.txt'), which contains
-the deconvolution results. You can now compare these predictions with the true values contained in
-'paper_data/figures/figure2/data/data6k_500_labels.txt'. This should give you the same results as we obtained in the Scaden paper
-(see Figure 2).
-### 4. Instructions for use
-For a general description on how to use Scaden, please check out our [usage documentation](https://scaden.readthedocs.io/en/latest/usage/).
+
+
+## System requirements
+Scaden was developed and tested on Linux (Ubuntu 16.04 and 18.04). It was not tested on Windows or Mac, but should
+also be usable on these systems when installing with pip or Bioconda. Scaden does not require any special
+hardware (e.g. a GPU); however, we recommend having at least 16 GB of memory.
+
+Scaden requires Python 3. All package dependencies should be handled automatically when installing with pip or conda.
\ No newline at end of file
diff --git a/docs/changelog.md b/docs/changelog.md
index 2f0e14a..28e7a4a 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,31 +1,51 @@
-# Changelog
+# Scaden Changelog
+
+### Version 1.0.0
+
+* Rebuilt the Scaden model and training to use the TF2 Keras API instead of the old compatibility functions
+* added `scaden example` command, which generates example data for test-running Scaden and inspecting the expected file format
+* added more tests and checks to the input reading function in `scaden simulate`
+* fixed a bug in reading input data
### Version 0.9.6
+
+
+ fixed Dockerfile (switched to pip installation)
+ added better error messages to `simulate` command
+ cleaned up dependencies
-### Version 0.9.5
-+ added `scaden simulate` command to perform bulk simulation and training file creation
-+ added `--seed` parameter to allow reproducible Scaden runs
+### v0.9.5
-### Version 0.9.4
-+ fixed dependencies (added python>=3.6 requirement)
+* added `--seed` parameter to allow reproducible Scaden runs
+* added `scaden simulate` command to perform bulk simulation and training file creation
+* changed CLI calling
-### Version 0.9.3
-+ upgrade to Tensorflow 2
-+ cleaned up dependencies
+### v0.9.4
+
+* fixed dependencies (added python>=3.6 requirement)
+
+### v0.9.3
+
+* upgrade to tf2
+* cleaned up dependencies
+
+### v0.9.2
+
+* small code refactoring
+* RAM usage improvement
+
+### v0.9.1
+
+* added automatic removal of duplicate genes
+* changed name of prediction file
+
+### v0.9.0
+
+Initial release of the Scaden deconvolution package.
-### Version 0.9.2
-+ RAM usage improvement
+Commands:
-### Version 0.9.1
-+ Added automatic removal of duplicate genes in Mixture file
-+ Changed name of final prediction file
-+ Added Scaden logo to main script
+* `scaden process`: Process a training dataset for training
+* `scaden train`: Train a Scaden model
+* `scaden predict`: Predict cell type compositions of a given sample
-### Version 0.9.0
-This is the initial release version of Scaden. While this version contains full functionality for pre-processing, training and prediction, it does not
-contain thorough error messages, plotting functionality and a solid helper function for generation training data. These are all features
-planned for the release of v.1.0.0.
-The core functionality of Scaden is, however, implemented and fully operational. Please check the [Usage](usage) section to learn how to use Scaden.
\ No newline at end of file
diff --git a/docs/usage.md b/docs/usage.md
index a64bfda..21b06b3 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,13 +1,18 @@
# Usage
-For a typical deconvolution with Scaden you will have to perform three steps:
+The Scaden deconvolution workflow consists of three mandatory steps:
-* pre-processing of training data
-* training of Scaden model
-* prediction
+* `scaden process`
+* `scaden train`
+* `scaden predict`
-This assumes that you already have a training dataset. If not, Scaden contains functionality to create a dataset from one or several scRNA-seq datasets.
-Please refer to the [data generation](#training-data-generation) section for instructions on how to create training datasets.
+In the `process` step, Scaden performs pre-processing on your training data, making sure it has the same genes as your prediction (bulk) data and performing some data transformations to make the data suitable for machine learning.
+
+In the `train` step, the Scaden ensemble model is trained.
+
+Finally, the trained model is used to generate predictions in the `predict` step.
+
+This assumes that you already have a training dataset. If not, Scaden contains functionality to create a dataset from one or several scRNA-seq datasets. For this, you can use the `scaden simulate` command after preparing some data. Have a look at the [scaden simulate](#scaden-simulate) section for instructions on how to use this command.
Note that we already provide datasets for certain tissues. All available datasets are listed in the [Datasets](datasets) section. We will update this section when new datasets are added.
@@ -19,7 +24,7 @@
If you don't want to install Scaden on your local machine, we now provide a webtool:
It contains pre-generated training datasets for several tissues, and all you need to do is to upload your expression data.
Please note that this is still in preview.
-## Pre-processing
+## scaden process
The first step is to pre-process your training data. For this you need your training data and the dataset you want to perform deconvolution on. In this step, Scaden will create a new file for training which only contains the intersection of genes between the training and the prediction data.
Furthermore, the training data will be log2-transformed and scaled to the range [0,1]. Use the following command for pre-processing:
@@ -28,43 +33,40 @@
scaden process <training data> <prediction data>
```
-## Training
+By default, the output file will be called "processed.h5ad". Have a look at `scaden process --help` for further options.
+
+## scaden train
Now that your data is set up, you can start training a Scaden ensemble model. Scaden consists of three deep neural network models. By default,
-each of them will be trained for 20,000 steps. You can train longer if you want, although we got good results with this number for datasets of
-around 30,000 samples. Use the following command to just train a model for 20,000 steps:
+each of them will be trained for 5,000 steps, which is the recommended number of training steps. You can train longer if you want, although we got good results with this number for datasets of
+around 30,000 samples. Use the following command to just train a model:
```console
scaden train <training data>
```
-This will save the model parameters in your working directory. If you want to create a specific directory for your trained models instead,
-and train for 30,00 steps, you can use this command:
+This will save the model parameters in your working directory. We recommend explicitly specifying a model directory, like so:
```console
-scaden train <training data> --model_dir <model dir> --steps 30000
+scaden train <training data> --model_dir <model dir>
```
-
-You can also adjust the batch size and the learning rate, although we recommend using the default values. If you want to adjust them anyway, use these flages:
-
+For more options, have a look at:
```console
---batch_size <batch size>
-
---learning_rate <learning rate>
+scaden train --help
```
-## Prediction
+## scaden predict
-Finally, after your model is trained, you can start the prediction. If you haven't specified any model directory and just trained a model
+Finally, after your model is trained, you can perform the prediction. If you haven't specified any model directory and just trained a model
in your current directory, you can use the following command to perform the deconvolution:
```console
scaden predict <prediction file>
```
-Scaden will then generate a file called 'cdn_predictions.txt' (this name will change in future releases) in your current directory.
If the models were saved elsewhere, +Scaden will then generate a file called 'scaden_predictions.txt' in your current directory. If the models were saved elsewhere, you have to tell Scaden where to look for them: ```console @@ -78,27 +80,24 @@ You can also change the path and name of the output predictions file using the ` --outname 0: @@ -162,22 +187,27 @@ def load_h5ad_file(self, input_path, batch_size, datasets=[]): if ds not in datasets: raw_input = raw_input[raw_input.obs['ds'] != ds].copy() - # Create training dataset - ratios = [raw_input.obs[ctype] for ctype in raw_input.uns['cell_types']] + ratios = [ + raw_input.obs[ctype] for ctype in raw_input.uns['cell_types'] + ] self.x_data = raw_input.X.astype(np.float32) self.y_data = np.array(ratios, dtype=np.float32).transpose() - # create placeholders - self.x_data_ph = tf.compat.v1.placeholder(self.x_data.dtype, self.x_data.shape, name="x_data_ph") - self.y_data_ph = tf.compat.v1.placeholder(self.y_data.dtype, self.y_data.shape, name="y_data_ph") - self.data = tf.data.Dataset.from_tensor_slices((self.x_data_ph, self.y_data_ph)) - self.data = self.data.shuffle(1000).repeat().batch(batch_size=batch_size) + self.data = tf.data.Dataset.from_tensor_slices( + (self.x_data, self.y_data)) + self.data = self.data.shuffle(1000).repeat().batch( + batch_size=batch_size) + self.data_iter = iter(self.data) # Extract celltype and feature info self.labels = raw_input.uns['cell_types'] self.sig_genes = list(raw_input.var_names) - def load_prediction_file(self, input_path, sig_genes, labels, scaling=None): + def load_prediction_file(self, + input_path, + sig_genes, + labels, + scaling=None): """ Load a file to perform prediction on it :param input_path: path to input file @@ -192,24 +222,19 @@ def load_prediction_file(self, input_path, sig_genes, labels, scaling=None): # check for duplicates data_index = list(data.index) if not (len(data_index) == len(set(data_index))): - print("Scaden Warning: Your mixture file conatins duplicate genes! The firs occuring gene will be used for every duplicate.") + print( + "Scaden Warning: Your mixture file conatins duplicate genes! The first occuring gene will be used for every duplicate." 
+ ) data = data.loc[~data.index.duplicated(keep='first')] data = data.loc[sig_genes] + data = data.T - self.x_data = data.T - self.x_data = self.x_data.astype(np.float32) - m = self.x_data.shape[0] - self.y_dummy = dummy_labels(m, labels) # Scaling if scaling: - self.x_data = sample_scaling(self.x_data, scaling_option=scaling) + data = sample_scaling(data, scaling_option=scaling) - # Create Dataset object from placeholders - self.x_data_ph = tf.compat.v1.placeholder(self.x_data.dtype, self.x_data.shape, name="x_data_ph") - self.y_data_ph = tf.compat.v1.placeholder(self.y_dummy.dtype, self.y_dummy.shape, name="y_data_ph") - self.data = tf.data.Dataset.from_tensor_slices((self.x_data_ph, self.y_data_ph)) - self.data = self.data.batch(batch_size=m) + self.data = data return sample_names @@ -221,39 +246,29 @@ def build_model(self, input_path, train_datasets, mode="train"): """ self.global_step = tf.Variable(0, name='global_step', trainable=False) - # Load data - if mode=="train": - self.load_h5ad_file(input_path=input_path, batch_size=self.batch_size, datasets=train_datasets) - - if mode=="predict": - self.sample_names = self.load_prediction_file(input_path=input_path, sig_genes=self.sig_genes, - labels=self.labels, scaling=self.scaling) - - # Make iterator - iter = tf.compat.v1.data.Iterator.from_structure(tf.compat.v1.data.get_output_types(self.data), tf.compat.v1.data.get_output_shapes(self.data)) - next_element = iter.get_next() - self.data_init_op = iter.make_initializer(self.data) - self.x, self.y = next_element - self.x = tf.cast(self.x, tf.float32) - - self.n_classes = len(self.labels) - - # Placeholder for training mode - self.training_mode = tf.compat.v1.placeholder_with_default(True, shape=()) - - # Model - self.logits = self.model_fn(X=self.x, n_classes=self.n_classes) - - + # Load training data if mode == "train": - # Loss - self.loss = self.compute_loss(self.logits, self.y) - # Summary scalars - self.merged_summary_op = self.visualization(tf.cast(self.logits, tf.float32), targets=tf.cast(self.y, tf.float32), classes=self.labels) - learning_rate = self.learning_rate - # Optimizer - self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss, global_step=self.global_step) + self.load_h5ad_file(input_path=input_path, + batch_size=self.batch_size, + datasets=train_datasets) + + # Load prediction data + if mode == "predict": + self.sample_names = self.load_prediction_file( + input_path=input_path, + sig_genes=self.sig_genes, + labels=self.labels, + scaling=self.scaling) + + # Build the model or load if available + self.n_classes = len(self.labels) + try: + self.model = tf.keras.models.load_model(self.model_dir, + compile=False) + logger.info("Loaded pre-trained model") + except: + self.model = self.scaden_model(n_classes=self.n_classes) def train(self, input_path, train_datasets): """ @@ -261,38 +276,44 @@ def train(self, input_path, train_datasets): :param num_steps: :return: """ + + # Define the optimizer + optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate) + # Build model graph - self.build_model(input_path=input_path, train_datasets=train_datasets, mode="train") + self.build_model(input_path=input_path, + train_datasets=train_datasets, + mode="train") - # Init variables - self.sess.run(tf.compat.v1.global_variables_initializer()) - self.sess.run(tf.compat.v1.local_variables_initializer()) - self.saver = tf.compat.v1.train.Saver() - model = os.path.join(self.model_dir, self.model_name) - self.writer = 
tf.compat.v1.summary.FileWriter(model, self.sess.graph) - self.eval_writer = tf.compat.v1.summary.FileWriter(os.path.join(self.model_dir, "eval"), self.sess.graph) + # Training loop + pbar = tqdm(range(self.num_steps)) + for step, _ in enumerate(pbar): - # Initialize datasets - self.sess.run(self.data_init_op, feed_dict={self.x_data_ph: self.x_data, self.y_data_ph: self.y_data}) + x, y = self.data_iter.get_next() + with tf.GradientTape() as tape: + self.logits = self.model(x, training=True) + loss = self.compute_loss(self.logits, y) - # Load pre-trained weights if avaialble - self.load_weights(self.model_dir) + grads = tape.gradient(loss, self.model.trainable_weights) - # Training loop - pbar = tqdm(range(self.num_steps)) - for _ in pbar: - _, loss, summary = self.sess.run([self.optimizer, self.loss, self.merged_summary_op]) - self.writer.add_summary(summary, tf.compat.v1.train.global_step(self.sess, self.global_step)) - description = "Step: " + str(tf.compat.v1.train.global_step(self.sess, self.global_step)) + ", Loss: {:4.3f}".format( - loss) - pbar.set_description(desc=description) + optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) + + desc = (f"Step: {step}, Loss: {loss:.4f}") + pbar.set_description(desc=desc) + + # Collect garbage after 100 steps - otherwise runs out of memory + if step % 100 == 0: + gc.collect() # Save the trained model - self.saver.save(self.sess, model, global_step=self.global_step) - # Save features and celltypes - pd.DataFrame(self.labels).to_csv(self.model_dir + "/celltypes.txt", sep="\t") - pd.DataFrame(self.sig_genes).to_csv(self.model_dir + "/genes.txt", sep="\t") + self.model.save(self.model_dir) + pd.DataFrame(self.labels).to_csv(os.path.join(self.model_dir, + "celltypes.txt"), + sep="\t") + pd.DataFrame(self.sig_genes).to_csv(os.path.join( + self.model_dir, "genes.txt"), + sep="\t") def predict(self, input_path, out_name="scaden_predictions.txt"): @@ -309,35 +330,13 @@ def predict(self, input_path, out_name="scaden_predictions.txt"): self.labels = list(labels['0']) # Build model graph - self.build_model(input_path=input_path, train_datasets=[], mode="predict") - - # Initialize variables - self.sess.run(tf.compat.v1.global_variables_initializer()) - self.sess.run(tf.compat.v1.local_variables_initializer()) - - self.saver = tf.compat.v1.train.Saver() - - model = os.path.join(self.model_dir, self.model_name) - self.writer = tf.compat.v1.summary.FileWriter(model, self.sess.graph) - - # Initialize datasets - self.sess.run(self.data_init_op, feed_dict={self.x_data_ph: self.x_data, self.y_data_ph: self.y_dummy}) - - # Load pre-trained weights if avaialble - self.load_weights(self.model_dir) - - predictions = self.sess.run([self.logits], feed_dict={self.training_mode: False}) - pred_df = pd.DataFrame(predictions[0], columns=self.labels, index=self.sample_names) - #pred_df.to_csv(out_name, sep="\t") - return pred_df - - def load_weights(self, model_dir): - """ - Load pre-trained weights if available - :param model_dir: - :return: - """ - ckpt = tf.train.get_checkpoint_state(model_dir) - if ckpt: - self.saver.restore(self.sess, ckpt.model_checkpoint_path) - print("Model parameters restored successfully") + self.build_model(input_path=input_path, + train_datasets=[], + mode="predict") + + predictions = self.model.predict(self.data) + + pred_df = pd.DataFrame(predictions, + columns=self.labels, + index=self.sample_names) + return pred_df \ No newline at end of file diff --git a/scaden/predict.py b/scaden/predict.py new file mode 100644 index 
0000000..c2d26dd
--- /dev/null
+++ b/scaden/predict.py
@@ -0,0 +1,73 @@
+"""
+scaden Main functionality
+
+Contains code to
+- process a training datasets
+- train a model
+- perform predictions
+
+"""
+
+# Imports
+import tensorflow as tf
+from anndata import read_h5ad
+from scaden.model.architectures import architectures
+from scaden.model.scaden import Scaden
+"""
+PARAMETERS
+"""
+# ==========================================#
+
+# Extract architectures
+M256_HIDDEN_UNITS = architectures['m256'][0]
+M512_HIDDEN_UNITS = architectures['m512'][0]
+M1024_HIDDEN_UNITS = architectures['m1024'][0]
+M256_DO_RATES = architectures['m256'][1]
+M512_DO_RATES = architectures['m512'][1]
+M1024_DO_RATES = architectures['m1024'][1]
+
+# ==========================================#
+
+
+def prediction(model_dir, data_path, out_name, seed=0):
+    """
+    Perform prediction using a trained scaden ensemble
+    :param model_dir: the directory containing the models
+    :param data_path: the path to the gene expression file
+    :param out_name: name of the output prediction file
+    :return:
+    """
+
+    # Small model predictions
+    cdn256 = Scaden(model_dir=model_dir + "/m256",
+                    model_name='m256',
+                    seed=seed,
+                    hidden_units=M256_HIDDEN_UNITS,
+                    do_rates=M256_DO_RATES)
+    # Predict ratios
+    preds_256 = cdn256.predict(input_path=data_path,
+                               out_name='scaden_predictions_m256.txt')
+
+    # Mid model predictions
+    cdn512 = Scaden(model_dir=model_dir + "/m512",
+                    model_name='m512',
+                    seed=seed,
+                    hidden_units=M512_HIDDEN_UNITS,
+                    do_rates=M512_DO_RATES)
+    # Predict ratios
+    preds_512 = cdn512.predict(input_path=data_path,
+                               out_name='scaden_predictions_m512.txt')
+
+    # Large model predictions
+    cdn1024 = Scaden(model_dir=model_dir + "/m1024",
+                     model_name='m1024',
+                     seed=seed,
+                     hidden_units=M1024_HIDDEN_UNITS,
+                     do_rates=M1024_DO_RATES)
+    # Predict ratios
+    preds_1024 = cdn1024.predict(input_path=data_path,
+                                 out_name='scaden_predictions_m1024.txt')
+
+    # Average predictions
+    preds = (preds_256 + preds_512 + preds_1024) / 3
+    preds.to_csv(out_name, sep="\t")
diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py
index 8cc495c..52ce3f7 100644
--- a/scaden/preprocessing/bulk_simulation.py
+++ b/scaden/preprocessing/bulk_simulation.py
@@ -9,9 +9,11 @@
import pandas as pd
import numpy as np
from tqdm import tqdm
+from pathlib import Path

logger = logging.getLogger(__name__)

+
def create_fractions(no_celltypes):
    """
    Create random fractions
@@ -24,7 +26,12 @@
    return fracs


-def create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=False):
+def create_subsample(x,
+                     y,
+                     sample_size,
+                     celltypes,
+                     available_celltypes,
+                     sparse=False):
    """
    Generate artifical bulk subsample with random fractions of celltypes
    If sparse is set to true, add random celltypes to the missing celltypes
@@ -39,9 +46,9 @@
    if sparse:
        no_keep = np.random.randint(1, len(available_celltypes))
-        keep = np.random.choice(
-            list(range(len(available_celltypes))), size=no_keep, replace=False
-        )
+        keep = np.random.choice(list(range(len(available_celltypes))),
+                                size=no_keep,
+                                replace=False)
        available_celltypes = [available_celltypes[i] for i in keep]
    no_avail_cts = len(available_celltypes)
@@ -61,7 +68,8 @@
    for i in range(no_avail_cts):
        ct = available_celltypes[i]
        cells_sub = x.loc[np.array(y["Celltype"] == ct), :]
-
cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i]) + cells_fraction = np.random.randint(0, cells_sub.shape[0], + samp_fracs[i]) cells_sub = cells_sub.iloc[cells_fraction, :] artificial_samples.append(cells_sub) @@ -91,9 +99,8 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples): pbar = tqdm(range(no_samples)) pbar.set_description(desc="Normal samples") for _ in pbar: - sample, label = create_subsample( - x, y, sample_size, celltypes, available_celltypes - ) + sample, label = create_subsample(x, y, sample_size, celltypes, + available_celltypes) X.append(sample) Y.append(label) @@ -102,9 +109,12 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples): pbar = tqdm(range(n_sparse)) pbar.set_description(desc="Sparse samples") for _ in pbar: - sample, label = create_subsample( - x, y, sample_size, celltypes, available_celltypes, sparse=True - ) + sample, label = create_subsample(x, + y, + sample_size, + celltypes, + available_celltypes, + sparse=True) X.append(sample) Y.append(label) X = pd.concat(X, axis=1).T @@ -160,21 +170,24 @@ def filter_matrix_signature(mat, genes): mat = mat[genes] return mat + def load_celltypes(path, name): """ Load the cell type information """ try: y = pd.read_table(path) # Check if has Celltype column if not 'Celltype' in y.columns: - logger.error(f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column.") + logger.error( + f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column." + ) sys.exit() except FileNotFoundError as e: - logger.error(f"No celltypes file found for {name}. It should be called {name}_celltypes.txt.") + logger.error( + f"No celltypes file found for {name}. It should be called {name}_celltypes.txt." + ) sys.exit(e) - - return y - + return y def load_dataset(name, dir, pattern): @@ -188,9 +201,37 @@ def load_dataset(name, dir, pattern): pattern = pattern.replace("*", "") print("Loading " + name + " dataset ...") - y = load_celltypes(dir + name + "_celltypes.txt", name) - x = pd.read_table(dir + name + pattern, index_col=0) - + # Try to load celltypes + try: + y = pd.read_table(os.path.join(dir, name + "_celltypes.txt")) + # Check if has Celltype column + print(y.columns) + if not 'Celltype' in y.columns: + logger.error( + f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column." + ) + sys.exit() + except FileNotFoundError as e: + logger.error( + f"No celltypes file found for {name}. It should be called {name}_celltypes.txt." + ) + sys.exit(e) + + # Try to load data file + try: + x = pd.read_table(os.path.join(dir, name + pattern), index_col=0) + except FileNotFoundError as e: + logger.error( + f"No counts file found for {name}. Was looking for file {name + pattern}" + ) + + # Check that celltypes and count file have same number of cells + if not y.shape[0] == x.shape[0]: + logger.error( + f"Different number of cells in {name}_celltypes and {name + pattern}! Make sure the data has been processed correctly." 
+ ) + sys.exit(1) + return (x, y) @@ -203,7 +244,9 @@ def merge_unkown_celltypes(y, unknown_celltypes): :return: """ celltypes = list(y["Celltype"]) - new_celltypes = ["Unknown" if x in unknown_celltypes else x for x in celltypes] + new_celltypes = [ + "Unknown" if x in unknown_celltypes else x for x in celltypes + ] y["Celltype"] = new_celltypes return y @@ -273,9 +316,8 @@ def generate_signature(x, y): return signature_matrix -def simulate_bulk( - sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes -): +def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern, + unknown_celltypes): """ Simulate artificial bulk samples from single cell datasets :param sample_size: number of cells per sample @@ -287,17 +329,20 @@ def simulate_bulk( """ num_samples = int( - num_samples / 2 - ) # divide by two so half is sparse and half is normal samples + num_samples / + 2) # divide by two so half is sparse and half is normal samples # List available datasets - files = glob.glob(data_path + pattern) + if not data_path.endswith("/"): + data_path += "/" + files = glob.glob(os.path.join(data_path, pattern)) files = [os.path.basename(x) for x in files] datasets = [x.split("_")[0] for x in files] if len(datasets) == 0: - logging.error("No datasets fround! Have you specified the pattern correctly?") - sys.exit() + logging.error( + "No datasets found! Have you specified the pattern correctly?") + sys.exit(1) print("Datasets: " + str(datasets)) @@ -326,11 +371,14 @@ def simulate_bulk( # Create datasets for i in range(len(xs)): print("Subsampling " + datasets[i] + "...") - tmpx, tmpy = create_subsample_dataset( - xs[i], ys[i], sample_size, celltypes, num_samples - ) - tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", sep="\t", index=False) - tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False) + tmpx, tmpy = create_subsample_dataset(xs[i], ys[i], sample_size, + celltypes, num_samples) + tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", + sep="\t", + index=False) + tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", + sep="\t", + index=False) gc.collect() print("Finished!") diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py index 6ce1656..86ae01e 100644 --- a/scaden/preprocessing/create_h5ad_file.py +++ b/scaden/preprocessing/create_h5ad_file.py @@ -15,6 +15,7 @@ logger = logging.getLogger(__name__) + def parse_data(x_path, y_path): """ Parse data and labels and divide them into training and testset @@ -94,18 +95,16 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"): x = x.sort_index(axis=1) ratios = pd.DataFrame(y, columns=celltypes) - ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), index=ratios.index) + ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), + index=ratios.index) print("Processing " + str(train_file)) x = pd.DataFrame(x) adata.append( - anndata.AnnData( - X=x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(x)) - ) - ) - - - + anndata.AnnData(X=x.to_numpy(), + obs=ratios, + var=pd.DataFrame(columns=[], index=list(x)))) + for i in range(1, len(adata)): print("Concatenating " + str(i)) adata[0] = adata[0].concatenate(adata[1]) diff --git a/scaden/scaden/processing.py b/scaden/process.py similarity index 100% rename from scaden/scaden/processing.py rename to scaden/process.py diff --git a/scaden/scaden/__init__.py b/scaden/scaden/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/scaden/scaden/prediction.py 
b/scaden/scaden/prediction.py deleted file mode 100644 index 8dc33e5..0000000 --- a/scaden/scaden/prediction.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -scaden Main functionality - -Contains code to -- process a training datasets -- train a model -- perform predictions - -""" - -# Imports -import tensorflow as tf -from anndata import read_h5ad -from scaden.model.architectures import architectures -from scaden.model.scaden import Scaden - -""" -PARAMETERS -""" -# ==========================================# - -# Extract architectures -M256_HIDDEN_UNITS = architectures['m256'][0] -M512_HIDDEN_UNITS = architectures['m512'][0] -M1024_HIDDEN_UNITS = architectures['m1024'][0] -M256_DO_RATES = architectures['m256'][1] -M512_DO_RATES = architectures['m512'][1] -M1024_DO_RATES = architectures['m1024'][1] - - -# ==========================================# - - -def prediction(model_dir, data_path, out_name, seed=0): - """ - Perform prediction using a trained scaden ensemble - :param model_dir: the directory containing the models - :param data_path: the path to the gene expression file - :param out_name: name of the output prediction file - :return: - """ - - # Small model predictions - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn256 = Scaden(sess=sess, - model_dir=model_dir + "/m256", - model_name='m256', - seed=seed) - cdn256.hidden_units = M256_HIDDEN_UNITS - cdn256.do_rates = M256_DO_RATES - - # Predict ratios - preds_256 = cdn256.predict(input_path=data_path, out_name='scaden_predictions_m256.txt') - - - # Mid model predictions - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn512 = Scaden(sess=sess, - model_dir=model_dir+"/m512", - model_name='m512', - seed=seed) - cdn512.hidden_units = M512_HIDDEN_UNITS - cdn512.do_rates = M512_DO_RATES - - # Predict ratios - preds_512 = cdn512.predict(input_path=data_path, out_name='scaden_predictions_m512.txt') - - # Large model predictions - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn1024 = Scaden(sess=sess, - model_dir=model_dir+"/m1024", - model_name='m1024', - seed=seed) - cdn1024.hidden_units = M1024_HIDDEN_UNITS - cdn1024.do_rates = M1024_DO_RATES - - # Predict ratios - preds_1024 = cdn1024.predict(input_path=data_path, out_name='scaden_predictions_m1024.txt') - - # Average predictions - preds = (preds_256 + preds_512 + preds_1024) / 3 - preds.to_csv(out_name, sep="\t") diff --git a/scaden/scaden/training.py b/scaden/scaden/training.py deleted file mode 100644 index 3d84157..0000000 --- a/scaden/scaden/training.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -scaden Main functionality - -Contains code to -- process a training datasets -- train a model -- perform predictions - -""" - -# Imports -import tensorflow as tf -from anndata import read_h5ad -from scaden.model.architectures import architectures -from scaden.model.scaden import Scaden - -""" -PARAMETERS -""" -# ==========================================# - -# Extract architectures -M256_HIDDEN_UNITS = architectures['m256'][0] -M512_HIDDEN_UNITS = architectures['m512'][0] -M1024_HIDDEN_UNITS = architectures['m1024'][0] -M256_DO_RATES = architectures['m256'][1] -M512_DO_RATES = architectures['m512'][1] -M1024_DO_RATES = architectures['m1024'][1] - -# ==========================================# - - -def training(data_path, train_datasets, model_dir, batch_size, learning_rate, num_steps, seed=0): - """ - Perform training of three a scaden model ensemble consisting of three different models - :param model_dir: - 
:param batch_size: - :param learning_rate: - :param num_steps: - :return: - """ - # Convert training datasets - if train_datasets == '': - train_datasets = [] - else: - train_datasets = train_datasets.split() - print(f"Training on: {train_datasets}") - - - # M256 model training - print("Training M256 Model ...") - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn256 = Scaden(sess=sess, - model_dir=model_dir+"/m256", - model_name='m256', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed) - cdn256.hidden_units = M256_HIDDEN_UNITS - cdn256.do_rates = M256_DO_RATES - cdn256.train(input_path=data_path, train_datasets=train_datasets) - del cdn256 - - # Training of mid model - print("Training M512 Model ...") - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn512 = Scaden(sess=sess, - model_dir=model_dir+"/m512", - model_name='m512', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed) - cdn512.hidden_units = M512_HIDDEN_UNITS - cdn512.do_rates = M512_DO_RATES - cdn512.train(input_path=data_path, train_datasets=train_datasets) - del cdn512 - - # Training of large model - print("Training M1024 Model ...") - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn1024 = Scaden(sess=sess, - model_dir=model_dir+"/m1024", - model_name='m1024', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed) - cdn1024.hidden_units = M1024_HIDDEN_UNITS - cdn1024.do_rates = M1024_DO_RATES - cdn1024.train(input_path=data_path, train_datasets=train_datasets) - del cdn1024 - - print("Training finished.") \ No newline at end of file diff --git a/scaden/preprocessing/simulate.py b/scaden/simulate.py similarity index 63% rename from scaden/preprocessing/simulate.py rename to scaden/simulate.py index 21c824a..617775f 100644 --- a/scaden/preprocessing/simulate.py +++ b/scaden/simulate.py @@ -6,23 +6,15 @@ and subsequenbt formatting in .h5ad file for training with Scaden """ -def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern, unknown_celltypes, out_prefix): + +def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern, + unknown_celltypes, out_prefix): # Perform the bulk simulation unknown_celltypes = list(unknown_celltypes) - simulate_bulk( - sample_size, - num_samples, - data_dir, - simulate_dir, - pattern, - unknown_celltypes - ) + simulate_bulk(sample_size, num_samples, data_dir, simulate_dir, pattern, + unknown_celltypes) # Create the h5ad training data file out_name = os.path.join(simulate_dir, out_prefix + ".h5ad") - create_h5ad_file( - simulate_dir, - out_name, - unknown_celltypes - ) + create_h5ad_file(simulate_dir, out_name, unknown_celltypes) diff --git a/scaden/train.py b/scaden/train.py new file mode 100644 index 0000000..1694b54 --- /dev/null +++ b/scaden/train.py @@ -0,0 +1,93 @@ +""" +scaden Main functionality + +Contains code to +- process a training datasets +- train a model +- perform predictions + +""" + +# Imports +import tensorflow as tf +from anndata import read_h5ad +from scaden.model.architectures import architectures +from scaden.model.scaden import Scaden +""" +PARAMETERS +""" +# ==========================================# + +# Extract architectures +M256_HIDDEN_UNITS = architectures['m256'][0] +M512_HIDDEN_UNITS = architectures['m512'][0] +M1024_HIDDEN_UNITS = architectures['m1024'][0] +M256_DO_RATES = architectures['m256'][1] +M512_DO_RATES = 
architectures['m512'][1]
+M1024_DO_RATES = architectures['m1024'][1]
+
+# ==========================================#
+
+
+def training(data_path,
+             train_datasets,
+             model_dir,
+             batch_size,
+             learning_rate,
+             num_steps,
+             seed=0):
+    """
+    Perform training of a Scaden model ensemble consisting of three different models
+    :param model_dir:
+    :param batch_size:
+    :param learning_rate:
+    :param num_steps:
+    :return:
+    """
+    # Convert training datasets
+    if train_datasets == '':
+        train_datasets = []
+    else:
+        train_datasets = train_datasets.split(',')
+    print(f"Training on: {train_datasets}")
+
+    # Training of M256 model
+    print("Training M256 Model ...")
+    cdn256 = Scaden(model_dir=model_dir + "/m256",
+                    model_name='m256',
+                    batch_size=batch_size,
+                    learning_rate=learning_rate,
+                    num_steps=num_steps,
+                    seed=seed,
+                    hidden_units=M256_HIDDEN_UNITS,
+                    do_rates=M256_DO_RATES)
+    cdn256.train(input_path=data_path, train_datasets=train_datasets)
+    del cdn256
+
+    # Training of M512 model
+    print("Training M512 Model ...")
+    cdn512 = Scaden(model_dir=model_dir + "/m512",
+                    model_name='m512',
+                    batch_size=batch_size,
+                    learning_rate=learning_rate,
+                    num_steps=num_steps,
+                    seed=seed,
+                    hidden_units=M512_HIDDEN_UNITS,
+                    do_rates=M512_DO_RATES)
+    cdn512.train(input_path=data_path, train_datasets=train_datasets)
+    del cdn512
+
+    # Training of M1024 model
+    print("Training M1024 Model ...")
+    cdn1024 = Scaden(model_dir=model_dir + "/m1024",
+                     model_name='m1024',
+                     batch_size=batch_size,
+                     learning_rate=learning_rate,
+                     num_steps=num_steps,
+                     seed=seed,
+                     hidden_units=M1024_HIDDEN_UNITS,
+                     do_rates=M1024_DO_RATES)
+    cdn1024.train(input_path=data_path, train_datasets=train_datasets)
+    del cdn1024
+
+    print("Training finished.")
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e6c6e4a..9ab0c45 100644
--- a/setup.py
+++ b/setup.py
@@ -2,8 +2,7 @@
from setuptools import setup, find_packages
-version = '0.9.6'
-
+version = '1.0.0'
with open("README.md", "r", encoding="UTF-8") as fh:
    long_description = fh.read()
@@ -11,29 +10,24 @@
with open('LICENSE', encoding="UTF-8") as f:
    license = f.read()
-setup(
-    name='scaden',
-    version=version,
-    description="Cell type deconvolution using single cell data",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    keywords=['bioinformatics', 'deep learning', 'machine learning', 'single cell sequencing', 'deconvolution'],
-    author='Kevin Menden',
-    author_email='kevin.menden@t-online.de',
-    url='https://github.com/KevinMenden/scaden',
-    license="MIT License",
-    entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
-    packages=find_packages(),
-    include_package_data=True,
-    python_requires='>3.6.0',
-    install_requires = [
-        'pandas',
-        'numpy',
-        'scikit-learn',
-        'tensorflow>=2.0',
-        'anndata',
-        'tqdm',
-        'click',
-        'h5py~=2.10.0'
-    ]
-)
+setup(name='scaden',
+      version=version,
+      description="Cell type deconvolution using single cell data",
+      long_description=long_description,
+      long_description_content_type="text/markdown",
+      keywords=[
+          'bioinformatics', 'deep learning', 'machine learning',
+          'single cell sequencing', 'deconvolution'
+      ],
+      author='Kevin Menden',
+      author_email='kevin.menden@t-online.de',
+      url='https://github.com/KevinMenden/scaden',
+      license="MIT License",
+      entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
+      packages=find_packages(),
+      include_package_data=True,
+      python_requires='>3.6.0',
+      install_requires=[
+          'pandas', 'numpy', 'scikit-learn', 'tensorflow>=2.0', 'anndata',
'tensorflow>=2.0', 'anndata', + 'tqdm', 'click', 'h5py~=2.10.0' + ])