diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4e1ef42..fd627ff 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -9,23 +9,22 @@
on:
jobs:
  deploy:
-    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: '3.x'
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install setuptools wheel twine
-    - name: Build and publish
-      env:
-        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
-      run: |
-        python setup.py sdist bdist_wheel
-        twine upload dist/*
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.x"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel twine
+          pip install setuptools wheel twine
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload dist/*
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 9d3ee2f..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-# Don't use language: python; this gives us an unnecessary virtualenv
-language: c
-
-
-env:
-  - BUILD_TARGET=3.6
-
-install:
-  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-  - bash miniconda.sh -b -p $HOME/miniconda
-  - source "$HOME/miniconda/etc/profile.d/conda.sh"
-  - hash -r
-  - conda config --set always_yes yes --set changeps1 no
-  - conda update -q conda
-  - conda info -a
-  - export LANG=en_US.UTF-8
-  - export COVERAGE_DIR=":$HOME/htmlcov"
-  - printenv | sort
-  # Install conda-build and conda-verify
-  - conda install -q -c defaults -c conda-forge python=$BUILD_TARGET conda-build conda-verify codecov flake8
-
-script:
-  - source "$HOME/miniconda/etc/profile.d/conda.sh"
-  - >
-    conda build \
-    --override-channels -c defaults -c conda-forge -c r -c bioconda \
-    --python $BUILD_TARGET \
-    ./conda.recipe/
-  - >
-    conda create \
-    -n install-environment \
-    --override-channels -c defaults -c conda-forge -c r -c local -c bioconda \
-    python=$BUILD_TARGET \
-    scaden conda codecov coverage
-  - conda activate install-environment && scaden --help
-
-after_success:
-  - codecov
-
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 752aba2..a53be87 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,31 +1,44 @@
# Scaden Changelog
-## v0.9.5
+### Version 1.0.0
+
+* Rebuilt the Scaden model and training to use the TF2 Keras API instead of the old compatibility functions
+* added `scaden example` command, which generates example data for test-running Scaden and inspecting the expected file format
+* added more tests and checks to the input reading function in `scaden simulate`
+* fixed a bug in reading input data
+
+### Version 0.9.6
+
++ fixed Dockerfile (switched to pip installation)
++ added better error messages to `simulate` command
++ cleaned up dependencies
+
+### v0.9.5
* added `--seed` parameter to allow reproducible Scaden runs
* added `scaden simulate` command to perform bulk simulation and training file creation
* changed CLI calling
-## v0.9.4
+### v0.9.4
* fixed dependencies (added python>=3.6 requirement)
-## v0.9.3
+### v0.9.3
* upgrade to tf2
* cleaned up dependencies
-## v0.9.2
+### v0.9.2
* small code refactoring
* RAM usage improvement
-## v0.9.1
+### v0.9.1
* added automatic
removal of duplicate genes * changed name of prediction file -## v0.9.0 +### v0.9.0 Initial release of the Scaden deconvolution package. diff --git a/Dockerfile-dev b/Dockerfile-dev deleted file mode 100644 index 8ed6b79..0000000 --- a/Dockerfile-dev +++ /dev/null @@ -1,49 +0,0 @@ -# BASE_IMAGE is either "nvidia/cuda:10.1-runtime-ubuntu18.04" or "library/ubuntu:18.04" -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -# -# Install Miniconda in /opt/conda -# - -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -ENV PATH /opt/conda/bin:$PATH - -RUN apt-get update --fix-missing && \ - apt-get install -y wget bzip2 ca-certificates curl git && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.7.12.1-Linux-x86_64.sh -O ~/miniconda.sh && \ - /bin/bash ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda clean -tipsy && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate base" >> ~/.bashrc - -ENV LD_LIBRARY_PATH /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/extras/CUPTI/lib64:$LD_LIBRARY_PATH - -COPY . /src -WORKDIR /src - -ARG CPU_OR_GPU - -RUN conda update -n base -c defaults conda \ - && conda install conda-build conda-verify \ - && conda build --override-channels -c defaults -c conda-forge -c bioconda conda.recipe/ \ - && cd / \ - && rm -rf /src \ - && conda install --override-channels -c local -c defaults -c conda-forge -c bioconda scaden \ - && conda remove conda-build conda-verify \ - && conda clean -a - -# Needed for when the docker container uses a non-root user id -RUN mkdir /tmp/numba_cache & chmod 777 /tmp/numba_cache & NUMBA_CACHE_DIR=/tmp/numba_cache - -RUN if [ "x$CPU_OR_GPU" = "xgpu" ]; then \ - conda install tensorflow-gpu && \ - conda clean -a; \ - fi - -WORKDIR / diff --git a/README.md b/README.md index 63d9cce..cdda247 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ ![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) ![Install with Bioconda](https://img.shields.io/badge/Install%20with-conda-green) ![Downloads](https://static.pepy.tech/personalized-badge/scaden?period=total&units=international_system&left_color=blue&right_color=green&left_text=Downloads) +![Docker](https://github.com/kevinmenden/scaden/workflows/Docker/badge.svg) +![Scaden CI](https://github.com/kevinmenden/scaden/workflows/Scaden%20CI/badge.svg) ## Single-cell assisted deconvolutional network @@ -23,29 +25,24 @@ Scaden overview. a) Generation of artificial bulk samples with known cell type c of Scaden model ensemble on simulated training data. c) Scaden ensemble architecture. d) A trained Scaden model can be used to deconvolve complex bulk mixtures. -### 1. System requirements -Scaden was developed and tested on Linux (Ubuntu 16.04 and 18.04). It was not tested on Windows or Mac, but should -also be usable on these systems when installing with Pip or Bioconda. Scaden does not require any special -hardware (e.g. GPU), however we recommend to have at least 16 GB of memory. -Scaden requires Python 3. All package dependencies should be handled automatically when installing with pip or conda. -### 2. Installation guide +## Installation guide Scaden can be easily installed on a Linux system, and should also work on Mac. There are currently two options for installing Scaden, either using [Bioconda](https://bioconda.github.io/) or via [pip](https://pypi.org/). 
-## pip
+### pip
To install Scaden via pip, simply run the following command:
`pip install scaden`
-## Bioconda
+### Bioconda
You can also install Scaden via bioconda, using:
`conda install -c bioconda scaden`
-## GPU
+### GPU
If you want to make use of your GPU, you will have to additionally install `tensorflow-gpu`.
For pip:
@@ -56,7 +53,7 @@
For conda:
`conda install tensorflow-gpu`
-## Docker
+### Docker
If you don't want to install Scaden at all, but rather use a Docker container, we provide that as well. For every release, we provide two versions - one for CPU and one for GPU usage. To pull the CPU container, use this command:
@@ -74,38 +71,54 @@
Additionally, we now provide a web tool:
It contains pre-generated training datasets for several tissues, and all you need to do is to upload your expression data. Please note that this is still in preview.
-### 3. Demo
-We provide several curated [training datasets](https://scaden.readthedocs.io/en/latest/datasets/) for Scaden. For this demo,
-we will use the human PBMC training dataset, which consists of 4 different scRNA-seq datasets and 32,000 samples in total.
-You can download it here:
-[https://figshare.com/s/e59a03885ec4c4d8153f](https://figshare.com/s/e59a03885ec4c4d8153f).
+## Usage
+We provide detailed instructions on how to use Scaden at our [Documentation page](https://scaden.readthedocs.io/en/latest/usage/).
+
+A deconvolution workflow with Scaden consists of four major steps:
+* data simulation
+* data processing
+* training
+* prediction
+
+If training data is already available, you can start at the data processing step. Otherwise you will first have to process scRNA-seq datasets and perform data simulation to generate a training dataset. As an example workflow, you can use the `scaden example` command to generate example data and go through the whole pipeline.
+
+First, make an example data directory and generate the example data:
+```bash
+mkdir example_data
+scaden example --out example_data/
+```
+This generates the files "example_counts.txt", "example_celltypes.txt" and "example_bulk_data.txt" in the "example_data" directory. Next, you can generate training data:
-For this demo, you will also need to download some test samples to perform deconvolution on, along with their associated labels.
-You can download the data we used for the Scaden paper here:
-[https://figshare.com/articles/Publication_Figures/8234030](https://figshare.com/articles/Publication_Figures/8234030)
+```bash
+scaden simulate --data example_data/ -n 100 --pattern "*_counts.txt"
+```
-We'll perform deconvolution on simulated samples from the data6k dataset. You can find the samples and labels in 'paper_data/figures/figure2/data/data6k_500_*'
-once you have downloaded this data from the link mentioned above.
+This generates 100 samples of training data in your current working directory. The file you need for your next step is called "data.h5ad". Now you need to perform the preprocessing using the training data and the bulk data file:
-The first step is to perform preprocessing on the training data. This is done with the following command:
+```bash
+scaden process data.h5ad example_data/example_bulk_data.txt
+```
-`scaden process pbmc_data.h5ad paper_data/figures/figure2/data/data6k_500_samples.txt`
+As a result, you should now have a file called "processed.h5ad" in your directory. Now you can perform training.
The following command performs training for 5000 steps per model and saves the trained weights to the "model" directory, which will be created:
-This will generate a file called 'processed.h5ad', which we will use for training. The training data
-we have downloaded also contains samples from the data6k scRNA-seq dataset, so we have to exclude them from training
-to get a meaningfull test of Scaden's performance. The following command will train a Scaden ensemble for 5000 steps per model (recommended),
-and store it in 'scaden_model'. Data from the data6k dataset will be excluded from training. Depending on your machine, this can take about 10-20 minutes.
+```bash
+scaden train processed.h5ad --steps 5000 --model_dir model
+```
-`scaden train processed.h5ad --steps 5000 --model_dir scaden_model --train_datasets 'data8k donorA donorC'`
+Finally, you can use the trained model to perform prediction:
-Finally, we can perform deconvolution on the 500 simulates samples from the data6k dataset:
+```bash
+scaden predict --model_dir model example_data/example_bulk_data.txt
+```
-`scaden predict paper_data/figures/figure2/data/data6k_500_samples.txt --model_dir scaden_model`
+Now you should have a file called "scaden_predictions.txt" in your working directory, which contains your estimated cell compositions.
-This will create a file named 'cdn_predictions.txt' (will be renamed in future version to 'scaden_predictions.txt'), which contains
-the deconvolution results. You can now compare these predictions with the true values contained in
-'paper_data/figures/figure2/data/data6k_500_labels.txt'. This should give you the same results as we obtained in the Scaden paper
-(see Figure 2).
-### 4. Instructions for use
-For a general description on how to use Scaden, please check out our [usage documentation](https://scaden.readthedocs.io/en/latest/usage/).
+
+
+## System requirements
+Scaden was developed and tested on Linux (Ubuntu 16.04 and 18.04). It was not tested on Windows or Mac, but should
+also be usable on these systems when installing with pip or Bioconda. Scaden does not require any special
+hardware (e.g. a GPU); however, we recommend having at least 16 GB of memory.
+
+Scaden requires Python 3. All package dependencies should be handled automatically when installing with pip or conda.
\ No newline at end of file
diff --git a/docs/changelog.md b/docs/changelog.md
index 2f0e14a..28e7a4a 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,31 +1,51 @@
-# Changelog
+# Scaden Changelog
+
+### Version 1.0.0
+
+* Rebuilt the Scaden model and training to use the TF2 Keras API instead of the old compatibility functions
+* added `scaden example` command, which generates example data for test-running Scaden and inspecting the expected file format
+* added more tests and checks to the input reading function in `scaden simulate`
+* fixed a bug in reading input data
### Version 0.9.6
+
+
+ fixed Dockerfile (switched to pip installation)
+ added better error messages to `simulate` command
+ cleaned up dependencies
-### Version 0.9.5
-+ added `scaden simulate` command to perform bulk simulation and training file creation
-+ added `--seed` parameter to allow reproducible Scaden runs
+### v0.9.5
-### Version 0.9.4
-+ fixed dependencies (added python>=3.6 requirement)
+* added `--seed` parameter to allow reproducible Scaden runs
+* added `scaden simulate` command to perform bulk simulation and training file creation
+* changed CLI calling
-### Version 0.9.3
-+ upgrade to Tensorflow 2
-+ cleaned up dependencies
+### v0.9.4
+
+* fixed dependencies (added python>=3.6 requirement)
+
+### v0.9.3
+
+* upgrade to tf2
+* cleaned up dependencies
+
+### v0.9.2
+
+* small code refactoring
+* RAM usage improvement
+
+### v0.9.1
+
+* added automatic removal of duplicate genes
+* changed name of prediction file
+
+### v0.9.0
+
+Initial release of the Scaden deconvolution package.
-### Version 0.9.2
-+ RAM usage improvement
+Commands:
-### Version 0.9.1
-+ Added automatic removal of duplicate genes in Mixture file
-+ Changed name of final prediction file
-+ Added Scaden logo to main script
+* `scaden process`: Process a training dataset for training
+* `scaden train`: Train a Scaden model
+* `scaden predict`: Predict cell type compositions of a given sample
-### Version 0.9.0
-This is the initial release version of Scaden. While this version contains full functionality for pre-processing, training and prediction, it does not
-contain thorough error messages, plotting functionality and a solid helper function for generation training data. These are all features
-planned for the release of v.1.0.0.
-The core functionality of Scaden is, however, implemented and fully operational. Please check the [Usage](usage) section to learn how to use Scaden.
\ No newline at end of file
diff --git a/docs/usage.md b/docs/usage.md
index a64bfda..21b06b3 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,13 +1,18 @@
# Usage
-For a typical deconvolution with Scaden you will have to perform three steps:
+The Scaden deconvolution workflow consists of three mandatory steps:
-* pre-processing of training data
-* training of Scaden model
-* prediction
+* `scaden process`
+* `scaden train`
+* `scaden predict`
-This assumes that you already have a training dataset. If not, Scaden contains functionality to create a dataset from one or several scRNA-seq datasets.
-Please refer to the [data generation](#training-data-generation) section for instructions on how to create training datasets.
+In the `process` step, Scaden performs pre-processing on your training data, making sure it has the same genes as your prediction (bulk) data and performing some data transformations to make the data suitable for machine learning.
+
+In the `train` step, the Scaden ensemble model is trained.
+
+Finally, the trained model is used to generate predictions in the `predict` step.
+
+This assumes that you already have a training dataset. If not, Scaden contains functionality to create a dataset from one or several scRNA-seq datasets. For this, you can use the `scaden simulate` command after preparing some data. Have a look at the [scaden simulate](#scaden-simulate) section for instructions on how to use this command.
Note that we already provide datasets for certain tissues. All available datasets are listed in the [Datasets](datasets) section. We will update this section when new datasets are added.
@@ -19,7 +24,7 @@
If you don't want to install Scaden on your local machine, we now provide a webtool:
It contains pre-generated training datasets for several tissues, and all you need to do is to upload your expression data.
Please note that this is still in preview.
-## Pre-processing
+## scaden process
The first step is to pre-process your training data. For this you need your training data and the dataset you want to perform deconvolution on. In this step, Scaden will create a new file for training which only contains the intersection of genes between the training and the prediction data.
Furthermore, the training data will be log2-transformed and scaled to the range [0,1]. Use the following command for pre-processing:
@@ -28,43 +33,40 @@
scaden process <training data> <prediction data>
```
-## Training
+By default, the output file will be called "processed.h5ad". Have a look at `scaden process --help` for further options.
+
+## scaden train
Now that your data is set up, you can start training a Scaden ensemble model. Scaden consists of three deep neural network models. By default,
-each of them will be trained for 20,000 steps. You can train longer if you want, although we got good results with this number for datasets of
-around 30,000 samples. Use the following command to just train a model for 20,000 steps:
+each of them will be trained for 5,000 steps, which is the recommended number of training steps. You can train longer if you want, although we got good results with this number for datasets of
+around 30,000 samples. Use the following command to just train a model:
```console
scaden train <training data>
```
-This will save the model parameters in your working directory. If you want to create a specific directory for your trained models instead,
-and train for 30,00 steps, you can use this command:
+This will save the model parameters in your working directory. We recommend explicitly specifying a model directory, like so:
```console
-scaden train <training data> --model_dir <model dir> --steps 30000
+scaden train <training data> --model_dir <model dir>
```
-
-You can also adjust the batch size and the learning rate, although we recommend using the default values. If you want to adjust them anyway, use these flages:
-
+For more options, have a look at:
```console
---batch_size <batch size>
-
---learning_rate <learning rate>
+scaden train --help
```
-## Prediction
+## scaden predict
-Finally, after your model is trained, you can start the prediction. If you haven't specified any model directory and just trained a model
+Finally, after your model is trained, you can perform the prediction. If you haven't specified any model directory and just trained a model
in your current directory, you can use the following command to perform the deconvolution:
```console
scaden predict <prediction file>
```
-Scaden will then generate a file called 'cdn_predictions.txt' (this name will change in future releases) in your current directory.
If the models were saved elsewhere, +Scaden will then generate a file called 'scaden_predictions.txt' in your current directory. If the models were saved elsewhere, you have to tell Scaden where to look for them: ```console @@ -78,27 +80,24 @@ You can also change the path and name of the output predictions file using the ` --outname 0: @@ -162,22 +187,27 @@ def load_h5ad_file(self, input_path, batch_size, datasets=[]): if ds not in datasets: raw_input = raw_input[raw_input.obs['ds'] != ds].copy() - # Create training dataset - ratios = [raw_input.obs[ctype] for ctype in raw_input.uns['cell_types']] + ratios = [ + raw_input.obs[ctype] for ctype in raw_input.uns['cell_types'] + ] self.x_data = raw_input.X.astype(np.float32) self.y_data = np.array(ratios, dtype=np.float32).transpose() - # create placeholders - self.x_data_ph = tf.compat.v1.placeholder(self.x_data.dtype, self.x_data.shape, name="x_data_ph") - self.y_data_ph = tf.compat.v1.placeholder(self.y_data.dtype, self.y_data.shape, name="y_data_ph") - self.data = tf.data.Dataset.from_tensor_slices((self.x_data_ph, self.y_data_ph)) - self.data = self.data.shuffle(1000).repeat().batch(batch_size=batch_size) + self.data = tf.data.Dataset.from_tensor_slices( + (self.x_data, self.y_data)) + self.data = self.data.shuffle(1000).repeat().batch( + batch_size=batch_size) + self.data_iter = iter(self.data) # Extract celltype and feature info self.labels = raw_input.uns['cell_types'] self.sig_genes = list(raw_input.var_names) - def load_prediction_file(self, input_path, sig_genes, labels, scaling=None): + def load_prediction_file(self, + input_path, + sig_genes, + labels, + scaling=None): """ Load a file to perform prediction on it :param input_path: path to input file @@ -192,24 +222,19 @@ def load_prediction_file(self, input_path, sig_genes, labels, scaling=None): # check for duplicates data_index = list(data.index) if not (len(data_index) == len(set(data_index))): - print("Scaden Warning: Your mixture file conatins duplicate genes! The firs occuring gene will be used for every duplicate.") + print( + "Scaden Warning: Your mixture file conatins duplicate genes! The first occuring gene will be used for every duplicate." 
+ ) data = data.loc[~data.index.duplicated(keep='first')] data = data.loc[sig_genes] + data = data.T - self.x_data = data.T - self.x_data = self.x_data.astype(np.float32) - m = self.x_data.shape[0] - self.y_dummy = dummy_labels(m, labels) # Scaling if scaling: - self.x_data = sample_scaling(self.x_data, scaling_option=scaling) + data = sample_scaling(data, scaling_option=scaling) - # Create Dataset object from placeholders - self.x_data_ph = tf.compat.v1.placeholder(self.x_data.dtype, self.x_data.shape, name="x_data_ph") - self.y_data_ph = tf.compat.v1.placeholder(self.y_dummy.dtype, self.y_dummy.shape, name="y_data_ph") - self.data = tf.data.Dataset.from_tensor_slices((self.x_data_ph, self.y_data_ph)) - self.data = self.data.batch(batch_size=m) + self.data = data return sample_names @@ -221,39 +246,29 @@ def build_model(self, input_path, train_datasets, mode="train"): """ self.global_step = tf.Variable(0, name='global_step', trainable=False) - # Load data - if mode=="train": - self.load_h5ad_file(input_path=input_path, batch_size=self.batch_size, datasets=train_datasets) - - if mode=="predict": - self.sample_names = self.load_prediction_file(input_path=input_path, sig_genes=self.sig_genes, - labels=self.labels, scaling=self.scaling) - - # Make iterator - iter = tf.compat.v1.data.Iterator.from_structure(tf.compat.v1.data.get_output_types(self.data), tf.compat.v1.data.get_output_shapes(self.data)) - next_element = iter.get_next() - self.data_init_op = iter.make_initializer(self.data) - self.x, self.y = next_element - self.x = tf.cast(self.x, tf.float32) - - self.n_classes = len(self.labels) - - # Placeholder for training mode - self.training_mode = tf.compat.v1.placeholder_with_default(True, shape=()) - - # Model - self.logits = self.model_fn(X=self.x, n_classes=self.n_classes) - - + # Load training data if mode == "train": - # Loss - self.loss = self.compute_loss(self.logits, self.y) - # Summary scalars - self.merged_summary_op = self.visualization(tf.cast(self.logits, tf.float32), targets=tf.cast(self.y, tf.float32), classes=self.labels) - learning_rate = self.learning_rate - # Optimizer - self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss, global_step=self.global_step) + self.load_h5ad_file(input_path=input_path, + batch_size=self.batch_size, + datasets=train_datasets) + + # Load prediction data + if mode == "predict": + self.sample_names = self.load_prediction_file( + input_path=input_path, + sig_genes=self.sig_genes, + labels=self.labels, + scaling=self.scaling) + + # Build the model or load if available + self.n_classes = len(self.labels) + try: + self.model = tf.keras.models.load_model(self.model_dir, + compile=False) + logger.info("Loaded pre-trained model") + except: + self.model = self.scaden_model(n_classes=self.n_classes) def train(self, input_path, train_datasets): """ @@ -261,38 +276,44 @@ def train(self, input_path, train_datasets): :param num_steps: :return: """ + + # Define the optimizer + optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate) + # Build model graph - self.build_model(input_path=input_path, train_datasets=train_datasets, mode="train") + self.build_model(input_path=input_path, + train_datasets=train_datasets, + mode="train") - # Init variables - self.sess.run(tf.compat.v1.global_variables_initializer()) - self.sess.run(tf.compat.v1.local_variables_initializer()) - self.saver = tf.compat.v1.train.Saver() - model = os.path.join(self.model_dir, self.model_name) - self.writer = 
tf.compat.v1.summary.FileWriter(model, self.sess.graph) - self.eval_writer = tf.compat.v1.summary.FileWriter(os.path.join(self.model_dir, "eval"), self.sess.graph) + # Training loop + pbar = tqdm(range(self.num_steps)) + for step, _ in enumerate(pbar): - # Initialize datasets - self.sess.run(self.data_init_op, feed_dict={self.x_data_ph: self.x_data, self.y_data_ph: self.y_data}) + x, y = self.data_iter.get_next() + with tf.GradientTape() as tape: + self.logits = self.model(x, training=True) + loss = self.compute_loss(self.logits, y) - # Load pre-trained weights if avaialble - self.load_weights(self.model_dir) + grads = tape.gradient(loss, self.model.trainable_weights) - # Training loop - pbar = tqdm(range(self.num_steps)) - for _ in pbar: - _, loss, summary = self.sess.run([self.optimizer, self.loss, self.merged_summary_op]) - self.writer.add_summary(summary, tf.compat.v1.train.global_step(self.sess, self.global_step)) - description = "Step: " + str(tf.compat.v1.train.global_step(self.sess, self.global_step)) + ", Loss: {:4.3f}".format( - loss) - pbar.set_description(desc=description) + optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) + + desc = (f"Step: {step}, Loss: {loss:.4f}") + pbar.set_description(desc=desc) + + # Collect garbage after 100 steps - otherwise runs out of memory + if step % 100 == 0: + gc.collect() # Save the trained model - self.saver.save(self.sess, model, global_step=self.global_step) - # Save features and celltypes - pd.DataFrame(self.labels).to_csv(self.model_dir + "/celltypes.txt", sep="\t") - pd.DataFrame(self.sig_genes).to_csv(self.model_dir + "/genes.txt", sep="\t") + self.model.save(self.model_dir) + pd.DataFrame(self.labels).to_csv(os.path.join(self.model_dir, + "celltypes.txt"), + sep="\t") + pd.DataFrame(self.sig_genes).to_csv(os.path.join( + self.model_dir, "genes.txt"), + sep="\t") def predict(self, input_path, out_name="scaden_predictions.txt"): @@ -309,35 +330,13 @@ def predict(self, input_path, out_name="scaden_predictions.txt"): self.labels = list(labels['0']) # Build model graph - self.build_model(input_path=input_path, train_datasets=[], mode="predict") - - # Initialize variables - self.sess.run(tf.compat.v1.global_variables_initializer()) - self.sess.run(tf.compat.v1.local_variables_initializer()) - - self.saver = tf.compat.v1.train.Saver() - - model = os.path.join(self.model_dir, self.model_name) - self.writer = tf.compat.v1.summary.FileWriter(model, self.sess.graph) - - # Initialize datasets - self.sess.run(self.data_init_op, feed_dict={self.x_data_ph: self.x_data, self.y_data_ph: self.y_dummy}) - - # Load pre-trained weights if avaialble - self.load_weights(self.model_dir) - - predictions = self.sess.run([self.logits], feed_dict={self.training_mode: False}) - pred_df = pd.DataFrame(predictions[0], columns=self.labels, index=self.sample_names) - #pred_df.to_csv(out_name, sep="\t") - return pred_df - - def load_weights(self, model_dir): - """ - Load pre-trained weights if available - :param model_dir: - :return: - """ - ckpt = tf.train.get_checkpoint_state(model_dir) - if ckpt: - self.saver.restore(self.sess, ckpt.model_checkpoint_path) - print("Model parameters restored successfully") + self.build_model(input_path=input_path, + train_datasets=[], + mode="predict") + + predictions = self.model.predict(self.data) + + pred_df = pd.DataFrame(predictions, + columns=self.labels, + index=self.sample_names) + return pred_df \ No newline at end of file diff --git a/scaden/predict.py b/scaden/predict.py new file mode 100644 index 
0000000..c2d26dd
--- /dev/null
+++ b/scaden/predict.py
@@ -0,0 +1,73 @@
+"""
+scaden Main functionality
+
+Contains code to
+- process a training datasets
+- train a model
+- perform predictions
+
+"""
+
+# Imports
+import tensorflow as tf
+from anndata import read_h5ad
+from scaden.model.architectures import architectures
+from scaden.model.scaden import Scaden
+"""
+PARAMETERS
+"""
+# ==========================================#
+
+# Extract architectures
+M256_HIDDEN_UNITS = architectures['m256'][0]
+M512_HIDDEN_UNITS = architectures['m512'][0]
+M1024_HIDDEN_UNITS = architectures['m1024'][0]
+M256_DO_RATES = architectures['m256'][1]
+M512_DO_RATES = architectures['m512'][1]
+M1024_DO_RATES = architectures['m1024'][1]
+
+# ==========================================#
+
+
+def prediction(model_dir, data_path, out_name, seed=0):
+    """
+    Perform prediction using a trained scaden ensemble
+    :param model_dir: the directory containing the models
+    :param data_path: the path to the gene expression file
+    :param out_name: name of the output prediction file
+    :return:
+    """
+
+    # Small model predictions
+    cdn256 = Scaden(model_dir=model_dir + "/m256",
+                    model_name='m256',
+                    seed=seed,
+                    hidden_units=M256_HIDDEN_UNITS,
+                    do_rates=M256_DO_RATES)
+    # Predict ratios
+    preds_256 = cdn256.predict(input_path=data_path,
+                               out_name='scaden_predictions_m256.txt')
+
+    # Mid model predictions
+    cdn512 = Scaden(model_dir=model_dir + "/m512",
+                    model_name='m512',
+                    seed=seed,
+                    hidden_units=M512_HIDDEN_UNITS,
+                    do_rates=M512_DO_RATES)
+    # Predict ratios
+    preds_512 = cdn512.predict(input_path=data_path,
+                               out_name='scaden_predictions_m512.txt')
+
+    # Large model predictions
+    cdn1024 = Scaden(model_dir=model_dir + "/m1024",
+                     model_name='m1024',
+                     seed=seed,
+                     hidden_units=M1024_HIDDEN_UNITS,
+                     do_rates=M1024_DO_RATES)
+    # Predict ratios
+    preds_1024 = cdn1024.predict(input_path=data_path,
+                                 out_name='scaden_predictions_m1024.txt')
+
+    # Average predictions
+    preds = (preds_256 + preds_512 + preds_1024) / 3
+    preds.to_csv(out_name, sep="\t")
diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py
index 8cc495c..52ce3f7 100644
--- a/scaden/preprocessing/bulk_simulation.py
+++ b/scaden/preprocessing/bulk_simulation.py
@@ -9,9 +9,11 @@
import pandas as pd
import numpy as np
from tqdm import tqdm
+from pathlib import Path

logger = logging.getLogger(__name__)

+
def create_fractions(no_celltypes):
    """
    Create random fractions
@@ -24,7 +26,12 @@
    return fracs


-def create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=False):
+def create_subsample(x,
+                     y,
+                     sample_size,
+                     celltypes,
+                     available_celltypes,
+                     sparse=False):
    """
    Generate artifical bulk subsample with random fractions of celltypes
    If sparse is set to true, add random celltypes to the missing celltypes
@@ -39,9 +46,9 @@
    if sparse:
        no_keep = np.random.randint(1, len(available_celltypes))
-        keep = np.random.choice(
-            list(range(len(available_celltypes))), size=no_keep, replace=False
-        )
+        keep = np.random.choice(list(range(len(available_celltypes))),
+                                size=no_keep,
+                                replace=False)
        available_celltypes = [available_celltypes[i] for i in keep]
    no_avail_cts = len(available_celltypes)
@@ -61,7 +68,8 @@
    for i in range(no_avail_cts):
        ct = available_celltypes[i]
        cells_sub = x.loc[np.array(y["Celltype"] == ct), :]
-
cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i]) + cells_fraction = np.random.randint(0, cells_sub.shape[0], + samp_fracs[i]) cells_sub = cells_sub.iloc[cells_fraction, :] artificial_samples.append(cells_sub) @@ -91,9 +99,8 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples): pbar = tqdm(range(no_samples)) pbar.set_description(desc="Normal samples") for _ in pbar: - sample, label = create_subsample( - x, y, sample_size, celltypes, available_celltypes - ) + sample, label = create_subsample(x, y, sample_size, celltypes, + available_celltypes) X.append(sample) Y.append(label) @@ -102,9 +109,12 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples): pbar = tqdm(range(n_sparse)) pbar.set_description(desc="Sparse samples") for _ in pbar: - sample, label = create_subsample( - x, y, sample_size, celltypes, available_celltypes, sparse=True - ) + sample, label = create_subsample(x, + y, + sample_size, + celltypes, + available_celltypes, + sparse=True) X.append(sample) Y.append(label) X = pd.concat(X, axis=1).T @@ -160,21 +170,24 @@ def filter_matrix_signature(mat, genes): mat = mat[genes] return mat + def load_celltypes(path, name): """ Load the cell type information """ try: y = pd.read_table(path) # Check if has Celltype column if not 'Celltype' in y.columns: - logger.error(f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column.") + logger.error( + f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column." + ) sys.exit() except FileNotFoundError as e: - logger.error(f"No celltypes file found for {name}. It should be called {name}_celltypes.txt.") + logger.error( + f"No celltypes file found for {name}. It should be called {name}_celltypes.txt." + ) sys.exit(e) - - return y - + return y def load_dataset(name, dir, pattern): @@ -188,9 +201,37 @@ def load_dataset(name, dir, pattern): pattern = pattern.replace("*", "") print("Loading " + name + " dataset ...") - y = load_celltypes(dir + name + "_celltypes.txt", name) - x = pd.read_table(dir + name + pattern, index_col=0) - + # Try to load celltypes + try: + y = pd.read_table(os.path.join(dir, name + "_celltypes.txt")) + # Check if has Celltype column + print(y.columns) + if not 'Celltype' in y.columns: + logger.error( + f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column." + ) + sys.exit() + except FileNotFoundError as e: + logger.error( + f"No celltypes file found for {name}. It should be called {name}_celltypes.txt." + ) + sys.exit(e) + + # Try to load data file + try: + x = pd.read_table(os.path.join(dir, name + pattern), index_col=0) + except FileNotFoundError as e: + logger.error( + f"No counts file found for {name}. Was looking for file {name + pattern}" + ) + + # Check that celltypes and count file have same number of cells + if not y.shape[0] == x.shape[0]: + logger.error( + f"Different number of cells in {name}_celltypes and {name + pattern}! Make sure the data has been processed correctly." 
+ ) + sys.exit(1) + return (x, y) @@ -203,7 +244,9 @@ def merge_unkown_celltypes(y, unknown_celltypes): :return: """ celltypes = list(y["Celltype"]) - new_celltypes = ["Unknown" if x in unknown_celltypes else x for x in celltypes] + new_celltypes = [ + "Unknown" if x in unknown_celltypes else x for x in celltypes + ] y["Celltype"] = new_celltypes return y @@ -273,9 +316,8 @@ def generate_signature(x, y): return signature_matrix -def simulate_bulk( - sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes -): +def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern, + unknown_celltypes): """ Simulate artificial bulk samples from single cell datasets :param sample_size: number of cells per sample @@ -287,17 +329,20 @@ def simulate_bulk( """ num_samples = int( - num_samples / 2 - ) # divide by two so half is sparse and half is normal samples + num_samples / + 2) # divide by two so half is sparse and half is normal samples # List available datasets - files = glob.glob(data_path + pattern) + if not data_path.endswith("/"): + data_path += "/" + files = glob.glob(os.path.join(data_path, pattern)) files = [os.path.basename(x) for x in files] datasets = [x.split("_")[0] for x in files] if len(datasets) == 0: - logging.error("No datasets fround! Have you specified the pattern correctly?") - sys.exit() + logging.error( + "No datasets found! Have you specified the pattern correctly?") + sys.exit(1) print("Datasets: " + str(datasets)) @@ -326,11 +371,14 @@ def simulate_bulk( # Create datasets for i in range(len(xs)): print("Subsampling " + datasets[i] + "...") - tmpx, tmpy = create_subsample_dataset( - xs[i], ys[i], sample_size, celltypes, num_samples - ) - tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", sep="\t", index=False) - tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False) + tmpx, tmpy = create_subsample_dataset(xs[i], ys[i], sample_size, + celltypes, num_samples) + tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", + sep="\t", + index=False) + tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", + sep="\t", + index=False) gc.collect() print("Finished!") diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py index 6ce1656..86ae01e 100644 --- a/scaden/preprocessing/create_h5ad_file.py +++ b/scaden/preprocessing/create_h5ad_file.py @@ -15,6 +15,7 @@ logger = logging.getLogger(__name__) + def parse_data(x_path, y_path): """ Parse data and labels and divide them into training and testset @@ -94,18 +95,16 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"): x = x.sort_index(axis=1) ratios = pd.DataFrame(y, columns=celltypes) - ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), index=ratios.index) + ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), + index=ratios.index) print("Processing " + str(train_file)) x = pd.DataFrame(x) adata.append( - anndata.AnnData( - X=x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(x)) - ) - ) - - - + anndata.AnnData(X=x.to_numpy(), + obs=ratios, + var=pd.DataFrame(columns=[], index=list(x)))) + for i in range(1, len(adata)): print("Concatenating " + str(i)) adata[0] = adata[0].concatenate(adata[1]) diff --git a/scaden/scaden/processing.py b/scaden/process.py similarity index 100% rename from scaden/scaden/processing.py rename to scaden/process.py diff --git a/scaden/scaden/__init__.py b/scaden/scaden/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/scaden/scaden/prediction.py 
b/scaden/scaden/prediction.py deleted file mode 100644 index 8dc33e5..0000000 --- a/scaden/scaden/prediction.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -scaden Main functionality - -Contains code to -- process a training datasets -- train a model -- perform predictions - -""" - -# Imports -import tensorflow as tf -from anndata import read_h5ad -from scaden.model.architectures import architectures -from scaden.model.scaden import Scaden - -""" -PARAMETERS -""" -# ==========================================# - -# Extract architectures -M256_HIDDEN_UNITS = architectures['m256'][0] -M512_HIDDEN_UNITS = architectures['m512'][0] -M1024_HIDDEN_UNITS = architectures['m1024'][0] -M256_DO_RATES = architectures['m256'][1] -M512_DO_RATES = architectures['m512'][1] -M1024_DO_RATES = architectures['m1024'][1] - - -# ==========================================# - - -def prediction(model_dir, data_path, out_name, seed=0): - """ - Perform prediction using a trained scaden ensemble - :param model_dir: the directory containing the models - :param data_path: the path to the gene expression file - :param out_name: name of the output prediction file - :return: - """ - - # Small model predictions - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn256 = Scaden(sess=sess, - model_dir=model_dir + "/m256", - model_name='m256', - seed=seed) - cdn256.hidden_units = M256_HIDDEN_UNITS - cdn256.do_rates = M256_DO_RATES - - # Predict ratios - preds_256 = cdn256.predict(input_path=data_path, out_name='scaden_predictions_m256.txt') - - - # Mid model predictions - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn512 = Scaden(sess=sess, - model_dir=model_dir+"/m512", - model_name='m512', - seed=seed) - cdn512.hidden_units = M512_HIDDEN_UNITS - cdn512.do_rates = M512_DO_RATES - - # Predict ratios - preds_512 = cdn512.predict(input_path=data_path, out_name='scaden_predictions_m512.txt') - - # Large model predictions - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn1024 = Scaden(sess=sess, - model_dir=model_dir+"/m1024", - model_name='m1024', - seed=seed) - cdn1024.hidden_units = M1024_HIDDEN_UNITS - cdn1024.do_rates = M1024_DO_RATES - - # Predict ratios - preds_1024 = cdn1024.predict(input_path=data_path, out_name='scaden_predictions_m1024.txt') - - # Average predictions - preds = (preds_256 + preds_512 + preds_1024) / 3 - preds.to_csv(out_name, sep="\t") diff --git a/scaden/scaden/training.py b/scaden/scaden/training.py deleted file mode 100644 index 3d84157..0000000 --- a/scaden/scaden/training.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -scaden Main functionality - -Contains code to -- process a training datasets -- train a model -- perform predictions - -""" - -# Imports -import tensorflow as tf -from anndata import read_h5ad -from scaden.model.architectures import architectures -from scaden.model.scaden import Scaden - -""" -PARAMETERS -""" -# ==========================================# - -# Extract architectures -M256_HIDDEN_UNITS = architectures['m256'][0] -M512_HIDDEN_UNITS = architectures['m512'][0] -M1024_HIDDEN_UNITS = architectures['m1024'][0] -M256_DO_RATES = architectures['m256'][1] -M512_DO_RATES = architectures['m512'][1] -M1024_DO_RATES = architectures['m1024'][1] - -# ==========================================# - - -def training(data_path, train_datasets, model_dir, batch_size, learning_rate, num_steps, seed=0): - """ - Perform training of three a scaden model ensemble consisting of three different models - :param model_dir: - 
:param batch_size: - :param learning_rate: - :param num_steps: - :return: - """ - # Convert training datasets - if train_datasets == '': - train_datasets = [] - else: - train_datasets = train_datasets.split() - print(f"Training on: {train_datasets}") - - - # M256 model training - print("Training M256 Model ...") - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn256 = Scaden(sess=sess, - model_dir=model_dir+"/m256", - model_name='m256', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed) - cdn256.hidden_units = M256_HIDDEN_UNITS - cdn256.do_rates = M256_DO_RATES - cdn256.train(input_path=data_path, train_datasets=train_datasets) - del cdn256 - - # Training of mid model - print("Training M512 Model ...") - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn512 = Scaden(sess=sess, - model_dir=model_dir+"/m512", - model_name='m512', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed) - cdn512.hidden_units = M512_HIDDEN_UNITS - cdn512.do_rates = M512_DO_RATES - cdn512.train(input_path=data_path, train_datasets=train_datasets) - del cdn512 - - # Training of large model - print("Training M1024 Model ...") - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - cdn1024 = Scaden(sess=sess, - model_dir=model_dir+"/m1024", - model_name='m1024', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed) - cdn1024.hidden_units = M1024_HIDDEN_UNITS - cdn1024.do_rates = M1024_DO_RATES - cdn1024.train(input_path=data_path, train_datasets=train_datasets) - del cdn1024 - - print("Training finished.") \ No newline at end of file diff --git a/scaden/preprocessing/simulate.py b/scaden/simulate.py similarity index 63% rename from scaden/preprocessing/simulate.py rename to scaden/simulate.py index 21c824a..617775f 100644 --- a/scaden/preprocessing/simulate.py +++ b/scaden/simulate.py @@ -6,23 +6,15 @@ and subsequenbt formatting in .h5ad file for training with Scaden """ -def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern, unknown_celltypes, out_prefix): + +def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern, + unknown_celltypes, out_prefix): # Perform the bulk simulation unknown_celltypes = list(unknown_celltypes) - simulate_bulk( - sample_size, - num_samples, - data_dir, - simulate_dir, - pattern, - unknown_celltypes - ) + simulate_bulk(sample_size, num_samples, data_dir, simulate_dir, pattern, + unknown_celltypes) # Create the h5ad training data file out_name = os.path.join(simulate_dir, out_prefix + ".h5ad") - create_h5ad_file( - simulate_dir, - out_name, - unknown_celltypes - ) + create_h5ad_file(simulate_dir, out_name, unknown_celltypes) diff --git a/scaden/train.py b/scaden/train.py new file mode 100644 index 0000000..1694b54 --- /dev/null +++ b/scaden/train.py @@ -0,0 +1,93 @@ +""" +scaden Main functionality + +Contains code to +- process a training datasets +- train a model +- perform predictions + +""" + +# Imports +import tensorflow as tf +from anndata import read_h5ad +from scaden.model.architectures import architectures +from scaden.model.scaden import Scaden +""" +PARAMETERS +""" +# ==========================================# + +# Extract architectures +M256_HIDDEN_UNITS = architectures['m256'][0] +M512_HIDDEN_UNITS = architectures['m512'][0] +M1024_HIDDEN_UNITS = architectures['m1024'][0] +M256_DO_RATES = architectures['m256'][1] +M512_DO_RATES = 
architectures['m512'][1]
+M1024_DO_RATES = architectures['m1024'][1]
+
+# ==========================================#
+
+
+def training(data_path,
+             train_datasets,
+             model_dir,
+             batch_size,
+             learning_rate,
+             num_steps,
+             seed=0):
+    """
+    Perform training of a Scaden model ensemble consisting of three different models
+    :param model_dir:
+    :param batch_size:
+    :param learning_rate:
+    :param num_steps:
+    :return:
+    """
+    # Convert training datasets
+    if train_datasets == '':
+        train_datasets = []
+    else:
+        train_datasets = train_datasets.split(',')
+    print(f"Training on: {train_datasets}")
+
+    # Training of M256 model
+    print("Training M256 Model ...")
+    cdn256 = Scaden(model_dir=model_dir + "/m256",
+                    model_name='m256',
+                    batch_size=batch_size,
+                    learning_rate=learning_rate,
+                    num_steps=num_steps,
+                    seed=seed,
+                    hidden_units=M256_HIDDEN_UNITS,
+                    do_rates=M256_DO_RATES)
+    cdn256.train(input_path=data_path, train_datasets=train_datasets)
+    del cdn256
+
+    # Training of M512 model
+    print("Training M512 Model ...")
+    cdn512 = Scaden(model_dir=model_dir + "/m512",
+                    model_name='m512',
+                    batch_size=batch_size,
+                    learning_rate=learning_rate,
+                    num_steps=num_steps,
+                    seed=seed,
+                    hidden_units=M512_HIDDEN_UNITS,
+                    do_rates=M512_DO_RATES)
+    cdn512.train(input_path=data_path, train_datasets=train_datasets)
+    del cdn512
+
+    # Training of M1024 model
+    print("Training M1024 Model ...")
+    cdn1024 = Scaden(model_dir=model_dir + "/m1024",
+                     model_name='m1024',
+                     batch_size=batch_size,
+                     learning_rate=learning_rate,
+                     num_steps=num_steps,
+                     seed=seed,
+                     hidden_units=M1024_HIDDEN_UNITS,
+                     do_rates=M1024_DO_RATES)
+    cdn1024.train(input_path=data_path, train_datasets=train_datasets)
+    del cdn1024
+
+    print("Training finished.")
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e6c6e4a..9ab0c45 100644
--- a/setup.py
+++ b/setup.py
@@ -2,8 +2,7 @@
from setuptools import setup, find_packages
-version = '0.9.6'
-
+version = '1.0.0'
with open("README.md", "r", encoding="UTF-8") as fh:
    long_description = fh.read()
@@ -11,29 +10,24 @@
with open('LICENSE', encoding="UTF-8") as f:
    license = f.read()
-setup(
-    name='scaden',
-    version=version,
-    description="Cell type deconvolution using single cell data",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    keywords=['bioinformatics', 'deep learning', 'machine learning', 'single cell sequencing', 'deconvolution'],
-    author='Kevin Menden',
-    author_email='kevin.menden@t-online.de',
-    url='https://github.com/KevinMenden/scaden',
-    license="MIT License",
-    entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
-    packages=find_packages(),
-    include_package_data=True,
-    python_requires='>3.6.0',
-    install_requires = [
-        'pandas',
-        'numpy',
-        'scikit-learn',
-        'tensorflow>=2.0',
-        'anndata',
-        'tqdm',
-        'click',
-        'h5py~=2.10.0'
-    ]
-)
+setup(name='scaden',
+      version=version,
+      description="Cell type deconvolution using single cell data",
+      long_description=long_description,
+      long_description_content_type="text/markdown",
+      keywords=[
+          'bioinformatics', 'deep learning', 'machine learning',
+          'single cell sequencing', 'deconvolution'
+      ],
+      author='Kevin Menden',
+      author_email='kevin.menden@t-online.de',
+      url='https://github.com/KevinMenden/scaden',
+      license="MIT License",
+      entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
+      packages=find_packages(),
+      include_package_data=True,
+      python_requires='>3.6.0',
+      install_requires=[
+          'pandas', 'numpy', 'scikit-learn', 'tensorflow>=2.0', 'anndata',
'tensorflow>=2.0', 'anndata', + 'tqdm', 'click', 'h5py~=2.10.0' + ])