From 3431b9e3c230fe543de1dfebaad59bed15e6ac73 Mon Sep 17 00:00:00 2001
From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com>
Date: Fri, 17 Jan 2025 14:53:30 +0100
Subject: [PATCH] added cluster.ipynb

---
 .../notebooks/data-driven/cluster.ipynb       | 259 ++++++++++++++++++
 1 file changed, 259 insertions(+)
 create mode 100644 docs/source/notebooks/data-driven/cluster.ipynb

diff --git a/docs/source/notebooks/data-driven/cluster.ipynb b/docs/source/notebooks/data-driven/cluster.ipynb
new file mode 100644
index 00000000..b5008658
--- /dev/null
+++ b/docs/source/notebooks/data-driven/cluster.ipynb
@@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using `f3dasm` on a High-Performance Cluster Computer\n",
+    "\n",
+    "Your `f3dasm` workflow can be seamlessly translated to a high-performance computing cluster.\n",
+    "The advantage is that you can parallelize the experiments across the nodes of the cluster.\n",
+    "This is especially useful when you have a large number of experiments to run.\n",
+    "\n",
+    "> This example has been tested on the following high-performance computing cluster systems:\n",
+    "> \n",
+    "> - The [hpc06 cluster of Delft University of Technology](https://hpcwiki.tudelft.nl/index.php/Main_Page), using the [TORQUE resource manager](https://en.wikipedia.org/wiki/TORQUE).\n",
+    "> - The [DelftBlue: TU Delft supercomputer](https://www.tudelft.nl/dhpc/system), using the [SLURM resource manager](https://slurm.schedmd.com/documentation.html).\n",
+    "> - The [OSCAR compute cluster from Brown University](https://docs.ccv.brown.edu/oscar/getting-started), using the [SLURM resource manager](https://slurm.schedmd.com/documentation.html).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from time import sleep\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "from f3dasm import HPC_JOBID, ExperimentData\n",
+    "from f3dasm.design import make_nd_continuous_domain"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## The Data-Driven Process\n",
+    "\n",
+    "We will create the following data-driven process:\n",
+    "\n",
+    "- Create a 20D continuous `Domain`.\n",
+    "- Sample from the domain using a Latin-hypercube sampler.\n",
+    "- With multiple nodes, evaluate the samples with a data generation function: the `\"Ackley\"` function from the built-in benchmark functions (see the reference sketch below).\n",
+    "\n",
\n", + " \"Block\"\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to ensure that the sampling is done only once, and that the data generation is performed in parallel. \n", + "Therefore, we can divide the different nodes into two categories:\n", + "\n", + "- The first node (`f3dasm.HPC_JOBID == 0`) will be the **master** node, responsible for creating the design-of-experiments and sampling (the `create_experimentdata` function).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def create_experimentdata():\n", + " \"\"\"Design of Experiment\"\"\"\n", + " # Create a domain object\n", + " domain = make_nd_continuous_domain(\n", + " bounds=np.tile([0.0, 1.0], (20, 1)), dimensionality=20)\n", + "\n", + " # Create the ExperimentData object\n", + " data = ExperimentData(domain=domain)\n", + "\n", + " # Sampling from the domain\n", + " data.sample(sampler='latin', n_samples=10)\n", + "\n", + " # Store the data to disk\n", + " data.store()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- All the other nodes (`f3dasm.HPC_JOBID > 0`) will be **process** nodes, which will retrieve the `ExperimentData` from disk and proceed directly to the data generation function.\n", + "\n", + "
\n", + " \"Block\"\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def worker_node():\n", + " # Extract the experimentdata from disk\n", + " data = ExperimentData.from_file(project_dir='.')\n", + "\n", + " \"\"\"Data Generation\"\"\"\n", + " # Use the data-generator to evaluate the initial samples\n", + " data.evaluate(data_generator='Ackley', mode='cluster')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The entrypoint of the script can now check the jobid of the current node and decide whether to create the experiment data or to run the data generation function:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == '__main__':\n", + " # Check the jobid of the current node\n", + " if HPC_JOBID is None:\n", + " # If the jobid is none, we are not running anything now\n", + " pass\n", + "\n", + " elif HPC_JOBID == 0:\n", + " create_experimentdata()\n", + " worker_node()\n", + " elif HPC_JOBID > 0:\n", + " # Asynchronize the jobs in order to omit racing conditions\n", + " sleep(HPC_JOBID)\n", + " worker_node()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running the Program\n", + "\n", + "You can run the workflow by submitting the bash script to the HPC queue. \n", + "Make sure you have [miniconda3](https://docs.anaconda.com/free/miniconda/index.html) installed on the cluster and that you have created a conda environment (in this example named `f3dasm_env`) with the necessary packages.\n", + "\n", + "### TORQUE Example\n", + "\n", + "---\n", + "\n", + "```bash\n", + "#!/bin/bash\n", + "# Torque directives (#PBS) must always be at the start of a job script!\n", + "#PBS -N ExampleScript\n", + "#PBS -q mse\n", + "#PBS -l nodes=1:ppn=12,walltime=12:00:00\n", + "\n", + "# Make sure I'm the only one that can read my output\n", + "umask 0077\n", + "\n", + "# The PBS_JOBID looks like 1234566[0].\n", + "# With the following line, we extract the PBS_ARRAYID, the part in the brackets []:\n", + "PBS_ARRAYID=$(echo \"${PBS_JOBID}\" | sed 's/\\[[^][]*\\]//g')\n", + "\n", + "module load use.own\n", + "module load miniconda3\n", + "cd $PBS_O_WORKDIR\n", + "\n", + "# Activate my conda environment:\n", + "source activate f3dasm_env\n", + "\n", + "# Limit the number of threads\n", + "OMP_NUM_THREADS=12\n", + "export OMP_NUM_THREADS=12\n", + "\n", + "# If the PBS_ARRAYID is not set, set it to None\n", + "if ! 
[ -n \"${PBS_ARRAYID+1}\" ]; then\n", + " PBS_ARRAYID=None\n", + "fi\n", + "\n", + "# Execute my Python program with the jobid flag\n", + "python main.py --jobid=${PBS_ARRAYID}\n", + "```\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SLURM Example\n", + "\n", + "---\n", + "\n", + "```bash\n", + "#!/bin/bash -l\n", + "\n", + "#SBATCH -J \"ExampleScript\" # Name of the job\n", + "#SBATCH --get-user-env # Set environment variables\n", + "\n", + "#SBATCH --partition=compute\n", + "#SBATCH --time=12:00:00\n", + "#SBATCH --nodes=1\n", + "#SBATCH --ntasks-per-node=12\n", + "#SBATCH --cpus-per-task=1\n", + "#SBATCH --mem=0\n", + "#SBATCH --account=research-eemcs-me\n", + "#SBATCH --array=0-2\n", + "\n", + "source activate f3dasm_env\n", + "\n", + "# Execute my Python program with the jobid flag\n", + "python3 main.py --jobid=${SLURM_ARRAY_TASK_ID}\n", + "```\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can run the workflow by submitting the bash script to the HPC queue. \n", + "The following command submits an array job with 3 jobs where `f3dasm.HPC_JOBID` takes values of 0, 1, and 2.\n", + "\n", + "### TORQUE Example\n", + "\n", + "```bash\n", + "qsub pbsjob.sh -t 0-2\n", + "```\n", + "\n", + "### SLURM Example\n", + "\n", + "```bash\n", + "sbatch --array 0-2 pbsjob.sh\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "f3dasm_env3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}