From e8ac81b01a6e6f5619e8878399bcef550df16551 Mon Sep 17 00:00:00 2001
From: myshevts
Date: Thu, 10 Mar 2022 14:32:51 +0300
Subject: [PATCH] Perf hints

---
 docs/OV_Runtime_UG/performance_hints.md | 85 ++++++++++++++++++++-----
 docs/snippets/ov_auto_batching.cpp      | 18 ++++--
 docs/snippets/ov_auto_batching.py       | 18 ++++--
 3 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/docs/OV_Runtime_UG/performance_hints.md b/docs/OV_Runtime_UG/performance_hints.md
index 4e86e5d2f1f9a8..00dd57c5b085fc 100644
--- a/docs/OV_Runtime_UG/performance_hints.md
+++ b/docs/OV_Runtime_UG/performance_hints.md
@@ -4,11 +4,52 @@ Each of the OpenVINO's [supported devices](supported_plugins/Supported_Devices.m
 Also, while the performance may be optimal for the specific combination of the device and the model that is inferred, the resulting configuration is not necessarily optimal for another device or model.
 The OpenVINO performance hints is the new way fo configuring the performance with the _portability_ in mind.
-Using the hints also does "reverse" the direction of the configuration in the right fashion: rather than map the application needs to the low-level performance settings, and potentially having associated application logic to configure each possible device separately, the idea is to express a target scenario with a single config key and let the *device* to configure itself in response.
+The hints also "reverse" the direction of the configuration: rather than mapping the application needs to the low-level performance settings, and keeping the associated application logic to configure each possible device separately, the idea is to express a target scenario with a single config key and let the *device* configure itself in response.
 As the hints are supported by every OpenVINO device, this is completely portable and future-proof solution.
-NameSTreams AUTO
+
+Previously, a certain level of automatic configuration came from the _default_ values of the parameters. For example, the number of CPU streams is deduced from the number of CPU cores when `ov::streams::AUTO` (`CPU_THROUGHPUT_AUTO` in the pre-OpenVINO 2.0 parlance) is set. However, the resulting number of streams does not account for the actual compute requirements of the model to be inferred.
+The hints, in contrast, respect the actual model, so the parameters for optimal throughput are calculated for each model individually (based on its compute versus memory-bandwidth requirements and the capabilities of the device).
 
 ## Performance Hints: Latency and Throughput
+As discussed in the [Optimization Guide](../optimization_guide/dldt_optimization_guide.md), there are several different metrics associated with inference speed.
+Throughput and latency are some of the most critical factors that influence the overall performance of an application.
+
+This is why, to ease the configuration of the device, OpenVINO offers two dedicated hints, namely `ov::hint::PerformanceMode::THROUGHPUT` and `ov::hint::PerformanceMode::LATENCY`.
+Every OpenVINO device supports these, which makes things portable and future-proof.
+The hints also allow a performance configuration that is fully compatible with the [automatic device selection](../auto_device_selection.md).
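+For illustration, a minimal C++ sketch (assuming an `ov::Core` instance named `core` and a `model` that is already read, as in the snippets referenced below) of requesting the latency hint might look like:
+```cpp
+// let the device configure itself for low latency; no low-level settings are needed
+auto compiled_model = core.compile_model(model, "GPU",
+    ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
+```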
+
+The `benchmark_app`, which exists in both [C++](../../samples/cpp/benchmark_app/README.md) and [Python](../../tools/benchmark_tool/README.md) versions, is the best way to evaluate the performance of the performance hints for a particular device:
+ - benchmark_app **-hint tput** -d 'device' -m 'path to your favorite model'
+ - benchmark_app **-hint latency** -d 'device' -m 'path to your favorite model'
+A special `ov::hint::PerformanceMode::UNDEFINED` acts the same as specifying no hint; please also see the last section of this document on conducting performance measurements with the `benchmark_app`.
+
+## Performance Hints: How It Works
+Internally, every device "translates" the value of the hint to the actual performance settings.
+For example, the `ov::hint::PerformanceMode::THROUGHPUT` selects the number of CPU or GPU streams.
+For the GPU, additionally, the optimal batch size is selected and [automatic batching](../OV_Runtime_UG/automatic_batching.md) is applied whenever possible.
+
+The resulting (device-specific) settings can be queried back from the instance of the `ov::CompiledModel`.
+Notice that the `benchmark_app` outputs the actual settings, for example:
+
+$benchmark_app -hint tput -d CPU -m 'path to your favorite model'
+
+...
+
+[Step 8/11] Setting optimal runtime parameters
+
+[ INFO ] Device: CPU
+
+[ INFO ] { PERFORMANCE_HINT , THROUGHPUT }
+
+...
+
+[ INFO ] { OPTIMAL_NUMBER_OF_INFER_REQUESTS , 4 }
+
+[ INFO ] { NUM_STREAMS , 4 }
+
+...
 
 ## Using the Performance Hints: Basic API
 In the example code-snippet below the `ov::hint::PerformanceMode::THROUGHPUT` is specified for the `ov::hint::performance_mode` property for the compile_model:
@@ -28,8 +69,24 @@ In the example code-snippet below the `ov::hint::PerformanceMode::THROUGHPUT` i
 @endsphinxdirective
 
-Seeing the results:
+## Additional (Optional) Hints from the App
+Let's take an example of an application that processes 4 video streams. The most future-proof way to communicate the limitation on the parallel slack is to equip the performance hint with the optional `ov::hint::num_requests` configuration key set to 4.
+As discussed previously, for the GPU this will limit the batch size and for the CPU the number of inference streams, so each device uses the `ov::hint::num_requests` while converting the hint to the actual device configuration options:
+@sphinxdirective
+
+.. tab:: C++
+
+    .. doxygensnippet:: docs/snippets/ov_auto_batching.cpp
+       :language: cpp
+       :fragment: [hint_num_requests]
+
+.. tab:: Python
+
+    .. doxygensnippet:: docs/snippets/ov_auto_batching.py
+       :language: python
+       :fragment: [hint_num_requests]
+
+@endsphinxdirective
 
 ## Optimal Number of Inference Requests
 Using the hints assumes that the application queries the `ov::optimal_number_of_infer_requests` to create and run the returned number of requests simultaneously:
@@ -49,31 +106,29 @@ Using the hints assumes that the application queries the `ov::optimal_number_of_
 @endsphinxdirective
 
-## (Optional) Additional Hints from the App
-Let's take an example of an application that processes 4 video streams. The most future-proof way to communicate the limitation of the parallel slack is to equip the performance hint with the optional `ov::hint::num_requests` configuration key set to 4.
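+For illustration, a minimal C++ fragment (an illustrative sketch rather than one of the referenced snippets, assuming a `compiled_model` obtained with the throughput hint as above) that creates and runs that many requests might look like:
+```cpp
+auto nireq = compiled_model.get_property(ov::optimal_number_of_infer_requests);
+std::vector<ov::InferRequest> requests;
+for (uint32_t i = 0; i < nireq; ++i)
+    requests.push_back(compiled_model.create_infer_request());
+// fill the inputs, then run all the requests in parallel
+for (auto& request : requests)
+    request.start_async();
+// wait for all the parallel requests to complete
+for (auto& request : requests)
+    request.wait();
+```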
-As discussed previosly, for the GPU this will limit the batch size, for the CPU - the number of inference streams, so each device uses the `ov::hint::num_requests` while converting the hint to the actual device configuration options:
+While an application is free to create more requests if needed (for example to support asynchronous inputs population), **it is very important to run at least the `ov::optimal_number_of_infer_requests` of the inference requests in parallel**, for efficiency (device utilization) reasons.
+
+## Combining the Hints and Individual Low-Level Settings
+While sacrificing the portability to some extent, it is possible to combine the hints with individual device-specific settings.
+For example, you can let the device prepare a configuration for the `ov::hint::PerformanceMode::THROUGHPUT` hint while overriding any specific value:
 @sphinxdirective
 
 .. tab:: C++
 
     .. doxygensnippet:: docs/snippets/ov_auto_batching.cpp
        :language: cpp
-       :fragment: [hint_num_requests]
+       :fragment: [hint_plus_low_level]
 
 .. tab:: Python
 
     .. doxygensnippet:: docs/snippets/ov_auto_batching.py
        :language: python
-       :fragment: hint_num_requests]
+       :fragment: [hint_plus_low_level]
 
-@endsphinxdirective
-
-Seeing the results:
-## Combining the Hints and Individual Low-Level Settings
-
-## Testing the Performance of THe Hints with the Benchmark_App
-The `benchmark_app`, that exists in both [C++](../../samples/cpp/benchmark_app/README.md) and [Python](../../tools/benchmark_tool/README.md) versions, is the best way to evaluate the performance of the performaqnce hints for a particular device:
+@endsphinxdirective
+
+## Testing the Performance of the Hints with the Benchmark_App
+The `benchmark_app`, which exists in both [C++](../../samples/cpp/benchmark_app/README.md) and [Python](../../tools/benchmark_tool/README.md) versions, is the best way to evaluate the performance of the performance hints for a particular device:
  - benchmark_app **-hint tput** -d 'device' -m 'path to your favorite model'
  - benchmark_app **-hint latency** -d 'device' -m 'path to your favorite model'
  - Disabling the hints to emulate the pre-hints era (highly recommended before playing the individual low-level settings like number of streams, threads, etc):
diff --git a/docs/snippets/ov_auto_batching.cpp b/docs/snippets/ov_auto_batching.cpp
index 1e943b3c8f5514..2ae7d2bef05a4a 100644
--- a/docs/snippets/ov_auto_batching.cpp
+++ b/docs/snippets/ov_auto_batching.cpp
@@ -14,8 +14,8 @@ int main() {
     {
         // disabling the automatic batching
         // leaving intact other configurations options that the device selects for the 'throughput' hint
-        auto compiled_model = core.compile_model(model, "GPU", {ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
-                                                                ov::hint::allow_auto_batching(false)});
+        auto compiled_model = core.compile_model(model, "GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                                ov::hint::allow_auto_batching(false));
     }
     //! [compile_model_no_auto_batching]
@@ -32,10 +32,18 @@ int main() {
     {
         // limiting the available parallel slack for the 'throughput' hint via the ov::hint::num_requests
         // so that certain parameters (like selected batch size) are automatically accommodated accordingly
-        auto compiled_model = core.compile_model(model, "GPU", {ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
-                                                                ov::hint::num_requests(4)});
+        auto compiled_model = core.compile_model(model, "GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                                ov::hint::num_requests(4));
     }
     //! [hint_num_requests]
-    return 0;
+    //! [hint_plus_low_level]
+    {
+        // high-level performance hints are compatible with low-level device-specific settings
+        auto compiled_model = core.compile_model(model, "CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                                ov::inference_num_threads(4));
+    }
+    //! [hint_plus_low_level]
+
+    return 0;
+}
\ No newline at end of file
diff --git a/docs/snippets/ov_auto_batching.py b/docs/snippets/ov_auto_batching.py
index 1e943b3c8f5514..2ae7d2bef05a4a 100644
--- a/docs/snippets/ov_auto_batching.py
+++ b/docs/snippets/ov_auto_batching.py
@@ -14,8 +14,8 @@
     {
         // disabling the automatic batching
         // leaving intact other configurations options that the device selects for the 'throughput' hint
-        auto compiled_model = core.compile_model(model, "GPU", {ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
-                                                                ov::hint::allow_auto_batching(false)});
+        auto compiled_model = core.compile_model(model, "GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                                ov::hint::allow_auto_batching(false));
     }
     //! [compile_model_no_auto_batching]
@@ -32,10 +32,18 @@
     {
         // limiting the available parallel slack for the 'throughput' hint via the ov::hint::num_requests
         // so that certain parameters (like selected batch size) are automatically accommodated accordingly
-        auto compiled_model = core.compile_model(model, "GPU", {ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
-                                                                ov::hint::num_requests(4)});
+        auto compiled_model = core.compile_model(model, "GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                                ov::hint::num_requests(4));
     }
     //! [hint_num_requests]
-    return 0;
+    //! [hint_plus_low_level]
+    {
+        // high-level performance hints are compatible with low-level device-specific settings
+        auto compiled_model = core.compile_model(model, "CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                                ov::inference_num_threads(4));
+    }
+    //! [hint_plus_low_level]
+
+    return 0;
+}
\ No newline at end of file
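For reference, outside the patch itself: a minimal, hypothetical C++ sketch of the behavior described in the "Performance Hints: How It Works" section, querying back the device-specific settings that the hint was translated into (the property names follow the `benchmark_app` output shown above; the model path is a placeholder):

```cpp
#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    ov::Core core;
    // "model.xml" is a placeholder path used purely for illustration
    auto model = core.read_model("model.xml");
    auto compiled_model = core.compile_model(
        model, "CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT));

    // the device translates the hint into concrete settings that can be queried back
    auto nireq = compiled_model.get_property(ov::optimal_number_of_infer_requests);
    auto nstreams = compiled_model.get_property(ov::num_streams);
    std::cout << "OPTIMAL_NUMBER_OF_INFER_REQUESTS: " << nireq << std::endl;
    std::cout << "NUM_STREAMS: " << nstreams.num << std::endl;
    return 0;
}
```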