better version of log probs computation

airboxlab · May 29, 2024 · 1b94d07 · 1b94d07
1 parent 4adf030
commit 1b94d07
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 9 deletions.
diff --git a/doc/source/overview/index.rst b/doc/source/overview/index.rst
@@ -113,7 +113,7 @@ Among other general considerations, there are two assumptions that must be satis
   (estimator is no longer unbiased).
 
   Note also that not all estimators require the behavior policy to cover all the actions of the evaluation policy, for instance
-  Direct Method (DM) fits a model of the Q function and uses it to estimate the value of the policy.
+  Direct Method (DM) estimators fit a model of the Q function and use it to estimate the value of the policy.
 
 - **Positivity**: the rewards must be non-negative to be able to compute a lower bound estimate of the target policy. In Hopes,
   you'll find a way to rescale the rewards to make them positive (using `MinMaxScaler`).

diff --git a/hopes/policy/utils.py b/hopes/policy/utils.py
@@ -15,15 +15,14 @@ def log_probs_for_deterministic_policy(
     :param epsilon: the small value to use for the probabilities of the other actions.
     """
     assert np.all(np.isin(actions, actions_bins)), "Some actions are not in the action bins."
+
+    # get index of each action in actions_bins
+    act_idx = np.searchsorted(actions_bins, actions)
+
+    # create the log probabilities
     unlikely_p = epsilon / len(actions_bins)
-    return np.log(
-        np.array(
-            [
-                [(1.0 - epsilon) + unlikely_p if a == action else unlikely_p for a in actions_bins]
-                for action in actions
-            ]
-        )
-    )
+    act_probs = np.where(np.eye(len(actions_bins)) == 0, unlikely_p, 1.0 - epsilon + unlikely_p)
+    return np.log([act_probs[a] for a in act_idx])
 
 
 def bin_actions(actions: np.ndarray, bins: np.ndarray) -> np.ndarray: