Misc DFP Documentation & fixes #368

Merged · 91 commits · Sep 29, 2022

Changes from 78 commits

Commits
2778f55
Fix typing hint
dagardner-nv Sep 19, 2022
b45832d
document DataFrameInputSchema
dagardner-nv Sep 19, 2022
9e2da5e
wip
dagardner-nv Sep 19, 2022
bb82168
Rename variables and update comments to be generalized
dagardner-nv Sep 19, 2022
5814bd4
Override the H4 style for the DFP guide preserving the camel casing o…
dagardner-nv Sep 19, 2022
5b6455a
Revert "Override the H4 style for the DFP guide preserving the camel …
dagardner-nv Sep 19, 2022
285cbdc
Fill in missing constructor arguments for DFPTraining and update doc …
dagardner-nv Sep 19, 2022
63e0376
Fill in sections from documentation in jupyter notebooks
dagardner-nv Sep 19, 2022
808970d
wip
dagardner-nv Sep 19, 2022
363b5c5
Script was renamed
dagardner-nv Sep 20, 2022
03ac69a
only_new_batches was removed from the stage
dagardner-nv Sep 20, 2022
645431e
only_new_batches was removed from the stage
dagardner-nv Sep 20, 2022
3af40e5
wip
dagardner-nv Sep 20, 2022
fe6a5d3
Rename morpheus_training service to morpheus_pipeline since it runs b…
dagardner-nv Sep 20, 2022
4ac3656
wip
dagardner-nv Sep 20, 2022
735382a
Explain --train_users=none
dagardner-nv Sep 20, 2022
809facb
Fix spelling of MLflow
dagardner-nv Sep 20, 2022
71f550c
file://mlruns is no longer the default
dagardner-nv Sep 20, 2022
9b17645
Document cli flags for dfp scripts
dagardner-nv Sep 20, 2022
0b8848a
wip
dagardner-nv Sep 21, 2022
b19ce47
wip
dagardner-nv Sep 21, 2022
f178dce
Add a Defining a New Data Source section
dagardner-nv Sep 21, 2022
2a9c487
wip
dagardner-nv Sep 21, 2022
c52dea3
Restrict mlflow versions prior to 1.29.0 per Pete
dagardner-nv Sep 21, 2022
92192c6
Fix casing of header names, move schema section
dagardner-nv Sep 21, 2022
8c4572c
Fix spelling mistakes
dagardner-nv Sep 21, 2022
b4dc986
Troubleshooting ci
dagardner-nv Sep 21, 2022
332b4d6
Fix spelling mistakes
dagardner-nv Sep 21, 2022
786ef95
Spelling fixes
dagardner-nv Sep 21, 2022
241c258
Use a better url for duo
dagardner-nv Sep 21, 2022
2bd5724
Work-around for 404 error
dagardner-nv Sep 20, 2022
a4b2739
wip
dagardner-nv Sep 21, 2022
fd9da32
Revert "Troubleshooting ci"
dagardner-nv Sep 21, 2022
8f03d1c
Use 'docker-compose' rather than 'docker compose', restructure the sy…
dagardner-nv Sep 21, 2022
0efd559
Applying recomendations from @efajardo-nv
dagardner-nv Sep 21, 2022
95d23c4
Add Azure & Duo source stages to cli registry, update readme with fix…
dagardner-nv Sep 22, 2022
6f02e08
Ensure the build is using the same docker tag as the morpheus release…
dagardner-nv Sep 22, 2022
884ac8b
Merge branch 'branch-22.09' into david_docs_345_p2
dagardner-nv Sep 22, 2022
b7bb8f9
MORPHEUS_CONTAINER_VERSION needs to be exported
dagardner-nv Sep 22, 2022
e0e8c3e
Formatting suggestions from @efajardo-nv
dagardner-nv Sep 22, 2022
440b215
Remove only_new_batches as it no longer exists
dagardner-nv Sep 22, 2022
2f88bb2
Set runtime
dagardner-nv Sep 22, 2022
69b5982
Update file pattern for duo files
dagardner-nv Sep 22, 2022
9edaaff
Revert "Update file pattern for duo files"
dagardner-nv Sep 22, 2022
1bff5f3
Fix handling of enum command line flags
dagardner-nv Sep 22, 2022
edadd4f
fixed typos in digital_fingerpringing/production readme
mpenn Sep 23, 2022
7ff2bb5
updated digital_fingerprinting/production Dockerfile to include Jupyt…
mpenn Sep 23, 2022
7d329ab
updated docker-compose.yml. changed runtime: nvidia to the device str…
mpenn Sep 26, 2022
68e7a0c
Helm chart info for prod DFP
pdmack Sep 26, 2022
dd9ced6
updated README with localhost url to mlflow ui and updated broken def…
mpenn Sep 26, 2022
d893f0e
actually updated the readme this time with local host url for mlflow
mpenn Sep 26, 2022
a6bc016
Merge branch 'branch-22.09' into david_docs_345_p2
dagardner-nv Sep 27, 2022
d67e0e2
Remove filecache arg as it raises an error when used with s3
dagardner-nv Sep 27, 2022
fca142a
Merge pull request #1 from pdmack/pdmack_dfp-helm
dagardner-nv Sep 27, 2022
015697a
Start & end date wip
dagardner-nv Sep 27, 2022
c0b8e18
Merge branch 'david_docs_345_p2' of github.com:dagardner-nv/Morpheus …
dagardner-nv Sep 27, 2022
ef1496d
Merge branch 'david_docs_345_p2' into david_docs_345_p2
dagardner-nv Sep 27, 2022
b23fa64
Merge pull request #2 from mpenn/david_docs_345_p2
dagardner-nv Sep 27, 2022
47bdd65
Merge branch 'david_docs_345_p2' of github.com:dagardner-nv/Morpheus …
dagardner-nv Sep 27, 2022
767430e
Apply time window filtering if defined.
dagardner-nv Sep 27, 2022
20fb75d
Ensure we always have a tz aware date window
dagardner-nv Sep 27, 2022
3ecc9e2
Handle case where no file match the date window
dagardner-nv Sep 27, 2022
57e7ec8
Add start_time flag to azure pipeline
dagardner-nv Sep 27, 2022
a2b2697
Update help string for duration
dagardner-nv Sep 27, 2022
15831b5
Update docs for date filtering
dagardner-nv Sep 27, 2022
242742a
Merge branch 'branch-22.09' into david_docs_345_p2
dagardner-nv Sep 27, 2022
18d6148
Merge branch 'branch-22.09' into david_docs_345_p2
dagardner-nv Sep 28, 2022
7d9c8d7
Merge branch 'branch-22.09' into david_docs_345_p2
dagardner-nv Sep 28, 2022
b87f4a6
Merge branch 'david_docs_345_p2' of github.com:dagardner-nv/Morpheus …
dagardner-nv Sep 28, 2022
58f4206
Enable verbose output from pytest
dagardner-nv Sep 28, 2022
08983e3
Show std out during tests
dagardner-nv Sep 28, 2022
8ef2824
starter dfp readme updates
efajardo-nv Sep 29, 2022
99e512c
Merge pull request #3 from efajardo-nv/starter-dfp-readme-updates
dagardner-nv Sep 29, 2022
e63f2a5
Pin numba version
dagardner-nv Sep 29, 2022
32f4d5e
Merge branch 'david_docs_345_p2' of github.com:dagardner-nv/Morpheus …
dagardner-nv Sep 29, 2022
014fa61
Revert "Show std out during tests"
dagardner-nv Sep 29, 2022
db6cc38
Revert "Enable verbose output from pytest"
dagardner-nv Sep 29, 2022
4e2d35f
Remove redundant call
dagardner-nv Sep 29, 2022
93b34c5
Update diagrams
dagardner-nv Sep 29, 2022
5f35de3
Update diagrams
dagardner-nv Sep 29, 2022
dca8850
Update regect for both colon and underscore time separators
dagardner-nv Sep 29, 2022
ecc55bd
Document the --start_time flag
dagardner-nv Sep 29, 2022
63d924d
Formatting fix
dagardner-nv Sep 29, 2022
a9d1a84
Merge branch 'branch-22.09' into david_docs_345_p2
dagardner-nv Sep 29, 2022
c6368cb
Use ver 2022.8.2 of s3fs per Eli
dagardner-nv Sep 29, 2022
a95cc81
Add instructions for downloading and running against the example data…
dagardner-nv Sep 29, 2022
8d11605
Fix indenting of code examples
dagardner-nv Sep 29, 2022
c8fc64e
Update paths in notebooks
dagardner-nv Sep 29, 2022
511bcc6
Merge branch 'branch-22.09' into david_docs_345_p2
dagardner-nv Sep 29, 2022
4c10784
Merge branch 'branch-22.09' into david_docs_345_p2
dagardner-nv Sep 29, 2022
4f674a8
Merge branch 'david_docs_345_p2' of github.com:dagardner-nv/Morpheus …
dagardner-nv Sep 29, 2022
1 change: 1 addition & 0 deletions docker/conda/environments/cuda11.5_dev.yml
@@ -61,6 +61,7 @@ dependencies:
- networkx=2.8
- ninja=1.10
- nodejs=17.4.0
- numba==0.55
- numpydoc=1.4
- pandas=1.3
- pip
16 changes: 13 additions & 3 deletions docs/source/_static/omni-style.css
@@ -137,6 +137,16 @@ h4
text-transform: uppercase;
}

h3 code
{
text-transform: none;
}

h4 code
{
text-transform: none;
}

/* Paragraph Formatting */

p
@@ -218,7 +228,7 @@ html.writer-html5 .rst-content table.docutils th>p

/* cell text */
html.writer-html5 .rst-content table.docutils td>p,
html.writer-html5 .rst-content table.docutils th>p
html.writer-html5 .rst-content table.docutils th>p
{
font-size: var(--body-font-size);
line-height: var(--body-line-height);
@@ -230,7 +240,7 @@ html.writer-html5 .rst-content table.docutils th>p
.rst-content table.field-list td p:first-child,
.wy-table th p:first-child,
.rst-content table.docutils th p:first-child,
.rst-content table.field-list th p:first-child
.rst-content table.field-list th p:first-child
{
margin-top: 0px;
}
@@ -241,7 +251,7 @@ html.writer-html5 .rst-content table.docutils th>p
.rst-content table.field-list td p:last-child,
.wy-table th p:last-child,
.rst-content table.docutils th p:last-child,
.rst-content table.field-list th p:last-child
.rst-content table.field-list th p:last-child
{
margin-bottom: 0px;
}
410 changes: 333 additions & 77 deletions docs/source/developer_guide/guides/5_digital_fingerprinting.md

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions examples/digital_fingerprinting/production/Dockerfile
@@ -58,8 +58,10 @@ FROM base as jupyter
RUN source activate morpheus \
&& mamba install -y -c conda-forge \
ipywidgets \
jupyterlab \
nb_conda_kernels
nb_conda_kernels \
&& pip install jupyter_contrib_nbextensions==0.5.1 \
&& jupyter contrib nbextension install --user \
&& pip install jupyterlab_nvdashboard==0.7.0

# Launch jupyter
CMD ["jupyter-lab", "--ip=0.0.0.0", "--no-browser", "--allow-root"]
127 changes: 122 additions & 5 deletions examples/digital_fingerprinting/production/README.md
@@ -1,17 +1,134 @@
# "Production" Digital Fingerprinting Pipeline

### Build the Morpheus container
This example is designed to show what a full-scale, production-ready DFP deployment in Morpheus would look like. It contains all of the necessary components (such as a model store) to allow multiple Morpheus pipelines to communicate at a scale that can handle the workload of an entire company.

This is necessary to get the latest changes needed for DFP
Key Differences:
* Multiple pipelines are specialized to perform either training or inference
* Requires setting up a model store to allow the training and inference pipelines to communicate
* Organized into a docker-compose deployment for easy startup
* Contains a Jupyter notebook service to ease development and debugging
* Can be deployed to Kubernetes using provided Helm charts
* Uses many customized stages to maximize performance.

## Build the Morpheus container
This is necessary to get the latest changes needed for DFP. From the root of the Morpheus repo:
```bash
./docker/build_container_release.sh
```

### Running locally via `docker-compose`

## Building and Running via `docker-compose`
### Build
```bash
cd examples/digital_fingerprinting/production
export MORPHEUS_CONTAINER_VERSION="$(git describe --tags --abbrev=0)-runtime"
docker-compose build
```

docker-compose up
### Running the services
#### Jupyter Server
From the `examples/digital_fingerprinting/production` directory, run:
```bash
docker-compose up jupyter
```

Once the build is complete and the service has started, you will be presented with a message that should look something like this:
```
jupyter | To access the server, open this file in a browser:
jupyter | file:///root/.local/share/jupyter/runtime/jpserver-7-open.html
jupyter | Or copy and paste one of these URLs:
jupyter | http://localhost:8888/lab?token=<token>
jupyter | or http://127.0.0.1:8888/lab?token=<token>
```

Copy and paste the URL into a web browser. There are four notebooks included with the DFP example:
* dfp_azure_training.ipynb - Training pipeline for Azure Active Directory data
* dfp_azure_inference.ipynb - Inference pipeline for Azure Active Directory data
* dfp_duo_training.ipynb - Training pipeline for Duo Authentication
* dfp_duo_inference.ipynb - Inference pipeline for Duo Authentication

> **Note:** The token in the URL is a one-time-use token; a new one is generated with each invocation.

#### Morpheus Pipeline
By default, the `morpheus_pipeline` service runs the training pipeline for Duo data. From the `examples/digital_fingerprinting/production` directory, run:
```bash
docker-compose up morpheus_pipeline
```

If you wish to run a different pipeline instead, from the `examples/digital_fingerprinting/production` directory run:
```bash
docker-compose run morpheus_pipeline bash
```

From the prompt within the `morpheus_pipeline` container, you can run either the `dfp_azure_pipeline.py` or the `dfp_duo_pipeline.py` pipeline script:
```bash
python dfp_azure_pipeline.py --help
python dfp_duo_pipeline.py --help
```

Both scripts are capable of running either a training or an inference pipeline for their respective data sources. The command-line options for both are the same:
| Flag | Type | Description |
| ---- | ---- | ----------- |
| `--train_users` | One of: `all`, `generic`, `individual`, `none` | Indicates whether to train per-user models or a single generic model for all users. Selecting `none` runs the inference pipeline. |
| `--skip_user` | TEXT | User IDs to skip. Mutually exclusive with `only_user` |
| `--only_user` | TEXT | Only users specified by this option will be included. Mutually exclusive with `skip_user` |
| `--duration` | TEXT | The duration to run starting from now [default: 60d] |
| `--cache_dir` | TEXT | The location to cache data such as S3 downloads and pre-processed data [env var: `DFP_CACHE_DIR`; default: `./.cache/dfp`] |
| `--log_level` | One of: `CRITICAL`, `FATAL`, `ERROR`, `WARN`, `WARNING`, `INFO`, `DEBUG` | Specify the logging level to use. [default: `WARNING`] |
| `--sample_rate_s` | INTEGER | Minimum time step, in milliseconds, between object logs. [env var: `DFP_SAMPLE_RATE_S`; default: 0] |
| `-f`, `--input_file` | TEXT | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=open_files#fsspec.open_files) for list of possible options. |
| `--tracking_uri` | TEXT | The MLflow tracking URI to connect to the tracking backend. [default: `http://localhost:5000`] |
| `--help` | | Show this message and exit. |
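
As a concrete illustration, a training run against locally cached Duo data might be invoked as follows; the input path and flag values here are illustrative, not defaults:
```bash
python dfp_duo_pipeline.py \
    --train_users generic \
    --duration 60d \
    --log_level INFO \
    --input_file "./data/duo_logs/*.json" \
    --tracking_uri http://localhost:5000
```

Running the same script with `--train_users none` would instead run the inference pipeline against the same inputs.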


#### Optional MLflow Service
Starting either the `morpheus_pipeline` or the `jupyter` service will start the `mlflow` service in the background. For debugging purposes, it can be helpful to view the logs of the running MLflow service.

From the `examples/digital_fingerprinting/production` directory, run:
```bash
docker-compose up mlflow
```

By default, an MLflow dashboard will be available at:
```bash
http://localhost:5000
```

## Kubernetes deployment

The Morpheus project also maintains Helm charts and container images for Kubernetes deployment of Morpheus and MLflow (both for serving and for the Triton plugin). These are located in the NVIDIA GPU Cloud (NGC) [public catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/morpheus/collections/morpheus_).

### MLflow Helm chart

MLflow for this production digital fingerprinting use case can be installed from NGC using the same instructions as the [MLflow Triton Plugin from the Morpheus Quick Start Guide](../../../docs/source/morpheus_quickstart_guide.md#install-morpheus-mlflow-triton-plugin). The chart and image can be used for both the Triton plugin and the MLflow server.

### Production DFP Helm chart

The deployment of the [Morpheus SDK Client](../../../docs/source/morpheus_quickstart_guide.md#install-morpheus-sdk-client) is also done in _almost_ the same way as specified in the Quick Start Guide; however, for this production DFP use case the command arguments are specified differently.

#### Notebooks

```
helm install --set ngc.apiKey="$API_KEY",sdk.args="cd /workspace/examples/digital_fingerprinting/production/morpheus && jupyter-lab --ip='*' --no-browser --allow-root --ServerApp.allow_origin='*'" <sdk-release-name> morpheus-sdk-client/
```

Make note of the Jupyter token by examining the logs of the SDK pod:
```
kubectl logs sdk-cli-<sdk-release-name>
```

You should see something similar to this:

```
Or copy and paste one of these URLs:
http://localhost:8888/lab?token=d16c904468fdf666c5030e18fb82f840e531178bf716e575
or http://127.0.0.1:8888/lab?token=d16c904468fdf666c5030e18fb82f840e531178bf716e575
```

Open your browser to the reachable address and NodePort exposed by the pod (default value of 30888), and use the generated token to log in to the notebook server.
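
If the NodePort is not reachable from your workstation, one workaround (assuming you have `kubectl` access to the cluster; the pod name follows the same pattern as in the log command above) is to forward the notebook port locally:

```
kubectl port-forward pod/sdk-cli-<sdk-release-name> 8888:8888
```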

#### Unattended

```
helm install --set ngc.apiKey="$API_KEY",sdk.args="cd /workspace/examples/digital_fingerprinting/production/morpheus && ./launch.sh --train_users=generic --duration=1d" <sdk-release-name> morpheus-sdk-client/
```

16 changes: 14 additions & 2 deletions examples/digital_fingerprinting/production/docker-compose.yml
@@ -41,6 +41,12 @@ services:
target: jupyter
args:
- MORPHEUS_CONTAINER_VERSION=${MORPHEUS_CONTAINER_VERSION:-v22.09.00-runtime}
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
image: dfp_morpheus_jupyter
container_name: jupyter
ports:
@@ -58,7 +64,7 @@
cap_add:
- sys_nice

morpheus_training:
morpheus_pipeline:
# restart: always
build:
context: ./
@@ -67,7 +73,13 @@
args:
- MORPHEUS_CONTAINER_VERSION=${MORPHEUS_CONTAINER_VERSION:-v22.09.00-runtime}
image: dfp_morpheus
container_name: morpheus_training
container_name: morpheus_pipeline
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
networks:
- frontend
- backend
@@ -24,7 +24,7 @@ RUN apt update && \
rm -rf /var/cache/apt/* /var/lib/apt/lists/*

# Install python packages
RUN pip install mlflow boto3 pymysql pyyaml
RUN pip install "mlflow<1.29.0" boto3 pymysql pyyaml

# We run on port 5000
EXPOSE 5000
@@ -14,6 +14,8 @@

import logging
import typing
from collections import namedtuple
from datetime import datetime

import fsspec
import pandas as pd
@@ -26,15 +28,25 @@

logger = logging.getLogger("morpheus.{}".format(__name__))

TimestampFileObj = namedtuple("TimestampFileObj", ["timestamp", "file_object"])


class DFPFileBatcherStage(SinglePortStage):

def __init__(self, c: Config, date_conversion_func, period="D", sampling_rate_s=0):
def __init__(self,
c: Config,
date_conversion_func,
period="D",
sampling_rate_s=0,
start_time: datetime = None,
end_time: datetime = None):
super().__init__(c)

self._date_conversion_func = date_conversion_func
self._sampling_rate_s = sampling_rate_s
self._period = period
self._start_time = start_time
self._end_time = end_time

@property
def name(self) -> str:
@@ -48,48 +60,70 @@ def accepted_types(self) -> typing.Tuple:

def on_data(self, file_objects: fsspec.core.OpenFiles):

file_object_list = file_objects
# Determine the date of the file, and apply the window filter if we have one
ts_and_files = []
for file_object in file_objects:
ts = self._date_conversion_func(file_object)

# Exclude any files outside the time window
if ((self._start_time is not None and ts < self._start_time)
or (self._end_time is not None and ts > self._end_time)):
continue

ts_and_files.append(TimestampFileObj(ts, file_object))

# sort the incoming data by date
ts_and_files.sort()

# Create a dataframe with the incoming metadata
if ((len(file_object_list) > 1) and (self._sampling_rate_s > 0)):
if ((len(ts_and_files) > 1) and (self._sampling_rate_s > 0)):
file_sampled_list = []

file_object_list.sort(key=lambda file_object: self._date_conversion_func(file_object))
ts_last = ts_and_files[0].timestamp

ts_last = self._date_conversion_func(file_object_list[0])
file_sampled_list.append(ts_and_files[0])

file_sampled_list.append(file_object_list[0])

for idx in range(1, len(file_object_list)):
ts = self._date_conversion_func(file_object_list[idx])
for idx in range(1, len(ts_and_files)):
ts = ts_and_files[idx].timestamp

if ((ts - ts_last).seconds >= self._sampling_rate_s):

file_sampled_list.append(file_object_list[idx])
file_sampled_list.append(ts_and_files[idx])
ts_last = ts
else:
file_object_list = file_sampled_list
ts_and_files = file_sampled_list

df = pd.DataFrame()

df["dfp_timestamp"] = [self._date_conversion_func(file_object) for file_object in file_object_list]
df["key"] = [file_object.full_name for file_object in file_object_list]
df["objects"] = file_object_list

# Now split by the batching settings
df_period = df["dfp_timestamp"].dt.to_period(self._period)
timestamps = []
full_names = []
file_objs = []
for (ts, file_object) in ts_and_files:
timestamps.append(ts)
full_names.append(file_object.full_name)
file_objs.append(file_object)

period_gb = df.groupby(df_period)
df["dfp_timestamp"] = timestamps
df["key"] = full_names
df["objects"] = file_objs

output_batches = []

n_groups = len(period_gb)
for group in period_gb.groups:
period_df = period_gb.get_group(group)
if len(df) > 0:
# Now split by the batching settings
df_period = df["dfp_timestamp"].dt.to_period(self._period)

period_gb = df.groupby(df_period)

n_groups = len(period_gb)
for group in period_gb.groups:
period_df = period_gb.get_group(group)

obj_list = fsspec.core.OpenFiles(period_df["objects"].to_list(), mode=file_objects.mode, fs=file_objects.fs)
obj_list = fsspec.core.OpenFiles(period_df["objects"].to_list(),
mode=file_objects.mode,
fs=file_objects.fs)

output_batches.append((obj_list, n_groups))
output_batches.append((obj_list, n_groups))

return output_batches

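To illustrate the new window arguments on `DFPFileBatcherStage`, here is a minimal usage sketch. The timestamp-parsing helper and its regex are assumptions for illustration (per the commit history, file names may use either colon or underscore time separators, and the stage expects a timezone-aware date window):

```python
import re
from datetime import datetime, timezone

# Illustrative helper: pull a timestamp such as "2022-08-01T12:00:00" or
# "2022-08-01T12_00_00" out of a file name and return it as a tz-aware datetime.
TS_RE = re.compile(r"(\d{4}-\d{2}-\d{2}T\d{2}[:_]\d{2}[:_]\d{2})")

def date_from_filename(file_object) -> datetime:
    match = TS_RE.search(file_object.full_name)
    ts = datetime.strptime(match.group(1).replace("_", ":"), "%Y-%m-%dT%H:%M:%S")
    return ts.replace(tzinfo=timezone.utc)

# Batch matched files by calendar day, keeping only files whose timestamps fall
# inside the window; on_data() skips anything outside it. `config` is assumed
# to be a morpheus.config.Config instance, as passed to any Morpheus stage.
stage = DFPFileBatcherStage(config,
                            date_conversion_func=date_from_filename,
                            period="D",
                            start_time=datetime(2022, 8, 1, tzinfo=timezone.utc),
                            end_time=datetime(2022, 8, 31, tzinfo=timezone.utc))
```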
@@ -242,7 +242,6 @@ def convert_to_dataframe(self, s3_object_batch: typing.Tuple[fsspec.core.OpenFil
return output_df
except Exception:
logger.exception("Error while converting S3 buckets to DF.")
self._get_or_create_dataframe_from_s3_batch(s3_object_batch)
raise

def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair:
@@ -69,7 +69,7 @@ def supports_cpp_node(self):

def _generate_frames_fsspec(self):

files: fsspec.core.OpenFiles = fsspec.open_files(self._filenames, filecache={'cache_storage': './.cache/s3tmp'})
files: fsspec.core.OpenFiles = fsspec.open_files(self._filenames)

if (len(files) == 0):
raise RuntimeError(f"No files matched input strings: '{self._filenames}'. "
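For context on the change above, a short sketch of how `fsspec.open_files` behaves without the removed `filecache` argument (the glob path is illustrative):

```python
import fsspec

# Globs and scheme prefixes (e.g. "s3://") are resolved by fsspec directly; the
# local-cache behavior previously requested via the filecache argument can still
# be requested in the URL itself, e.g. "filecache::s3://mybucket/*".
files = fsspec.open_files("./data/duo_logs/*.json")
print(f"Matched {len(files)} files")
for file_object in files:
    print(file_object.full_name)
```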