From 0c92e5d122127964ae42e447c18038f76bc7d2e6 Mon Sep 17 00:00:00 2001
From: Justin Salamon
Date: Tue, 12 Apr 2016 20:23:18 -0400
Subject: [PATCH] Implement offset_precision_recall_f1 and tests

---
 mir_eval/transcription.py   | 66 +++++++++++++++++++++++++++++++++++++
 tests/test_transcription.py | 22 +++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/mir_eval/transcription.py b/mir_eval/transcription.py
index cfeee2f1..8a204518 100644
--- a/mir_eval/transcription.py
+++ b/mir_eval/transcription.py
@@ -582,6 +582,72 @@ def onset_precision_recall_f1(ref_intervals, est_intervals,
     return onset_precision, onset_recall, onset_f_measure
 
 
+def offset_precision_recall_f1(ref_intervals, est_intervals, offset_ratio=0.2,
+                               offset_min_tolerance=0.05, strict=False):
+    """Compute the Precision, Recall and F-measure of note offsets: an
+    estimated offset is considered correct if it is within +-50ms (or 20% of
+    the ref note duration, whichever is greater) of a ref offset. Note that
+    this metric completely ignores note onsets and note pitch. This means an
+    estimated offset will be considered correct if it matches a reference
+    offset, even if the offsets come from notes with completely different
+    pitches (i.e. notes that would not match with `match_notes`).
+
+    Examples
+    --------
+    >>> ref_intervals, _ = mir_eval.io.load_valued_intervals(
+    ...     'reference.txt')
+    >>> est_intervals, _ = mir_eval.io.load_valued_intervals(
+    ...     'estimated.txt')
+    >>> (offset_precision,
+    ...  offset_recall,
+    ...  offset_f_measure) = mir_eval.transcription.offset_precision_recall_f1(
+    ...     ref_intervals, est_intervals)
+
+    Parameters
+    ----------
+    ref_intervals : np.ndarray, shape=(n,2)
+        Array of reference notes time intervals (onset and offset times)
+    est_intervals : np.ndarray, shape=(m,2)
+        Array of estimated notes time intervals (onset and offset times)
+    offset_ratio : float > 0 or None
+        The ratio of the reference note's duration used to define the
+        offset_tolerance. Default is 0.2 (20%), meaning the offset_tolerance
+        will equal the ref_duration * 0.2, or offset_min_tolerance (0.05 by
+        default, i.e. 50 ms), whichever is greater.
+    offset_min_tolerance : float > 0
+        The minimum tolerance for offset matching. See the offset_ratio
+        description for an explanation of how the offset tolerance is
+        determined.
+    strict : bool
+        If ``strict=False`` (the default), threshold checks for offset
+        matching are performed using ``<=`` (less than or equal). If
+        ``strict=True``, the threshold checks are performed using ``<``
+        (less than).
+
+    Returns
+    -------
+    precision : float
+        The computed precision score
+    recall : float
+        The computed recall score
+    f_measure : float
+        The computed F-measure score
+    """
+    validate_intervals(ref_intervals, est_intervals)
+    # When reference notes are empty, metrics are undefined, return 0's
+    if len(ref_intervals) == 0 or len(est_intervals) == 0:
+        return 0., 0., 0.
+
+    matching = match_offsets(ref_intervals, est_intervals,
+                             offset_ratio=offset_ratio,
+                             offset_min_tolerance=offset_min_tolerance,
+                             strict=strict)
+
+    offset_precision = float(len(matching))/len(est_intervals)
+    offset_recall = float(len(matching))/len(ref_intervals)
+    offset_f_measure = util.f_measure(offset_precision, offset_recall)
+    return offset_precision, offset_recall, offset_f_measure
+
+
 def evaluate(ref_intervals, ref_pitches, est_intervals, est_pitches,
              **kwargs):
     """Compute all metrics for the given reference and estimated annotations.
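A minimal usage sketch of the new function (not part of the patch; the interval values below are made up purely for illustration). With the default parameters, the offset tolerance for each reference note is max(offset_min_tolerance, offset_ratio * ref_duration):

import numpy as np
import mir_eval

# Two hypothetical reference notes and two estimated notes. Both reference
# notes last 1 s, so the offset tolerance is max(0.05, 0.2 * 1.0) = 0.2 s
# under the default parameters.
ref_intervals = np.array([[0.0, 1.0], [2.0, 3.0]])
est_intervals = np.array([[0.0, 1.04], [2.0, 3.5]])

precision, recall, f_measure = (
    mir_eval.transcription.offset_precision_recall_f1(ref_intervals,
                                                      est_intervals))

# The first estimated offset (1.04 s) lies within 0.2 s of the reference
# offset at 1.0 s and matches; the second (3.5 s) is 0.5 s away from the
# reference offset at 3.0 s and does not. One match out of two estimated
# and two reference notes gives precision = recall = f_measure = 0.5.
print(precision, recall, f_measure)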
diff --git a/tests/test_transcription.py b/tests/test_transcription.py
index 8ad9c8c0..2893e11b 100644
--- a/tests/test_transcription.py
+++ b/tests/test_transcription.py
@@ -42,6 +42,12 @@
     "Onset_F-measure": 0.8888888888888889,
 }
 
+OFFSET_SCORES = {
+    "Offset_Precision": 0.6,
+    "Offset_Recall": 0.75,
+    "Offset_F-measure": 0.6666666666666665,
+}
+
 
 def test_match_offsets():
 
@@ -164,6 +170,22 @@ def test_onset_precision_recall_f1():
     assert np.allclose(scores_exp, scores_gen, atol=A_TOL)
 
 
+def test_offset_precision_recall_f1():
+
+    # load test data
+    ref_int = REF[:, :2]
+    est_int = EST[:, :2]
+
+    precision, recall, f_measure = (
+        mir_eval.transcription.offset_precision_recall_f1(ref_int, est_int))
+
+    scores_gen = np.array([precision, recall, f_measure])
+    scores_exp = np.array([OFFSET_SCORES['Offset_Precision'],
+                           OFFSET_SCORES['Offset_Recall'],
+                           OFFSET_SCORES['Offset_F-measure']])
+    assert np.allclose(scores_exp, scores_gen, atol=A_TOL)
+
+
 def test_regression():
     # Regression tests
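As a quick sanity check on the new OFFSET_SCORES fixture (not part of the patch), the expected F-measure follows from the expected precision and recall via the harmonic mean used by util.f_measure:

# Expected fixture values: precision 0.6, recall 0.75.
precision, recall = 0.6, 0.75
f_measure = 2 * precision * recall / (precision + recall)
# 2 * 0.6 * 0.75 / 1.35 = 2/3, consistent with the fixture value
# 0.6666666666666665 to within floating-point rounding.
assert abs(f_measure - 0.6666666666666665) < 1e-12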