From d3b8df838ccf0adf1927aa734eef6c4cb75a9905 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Fri, 6 May 2016 13:31:34 -0400 Subject: [PATCH] Update dosctrings for completeness and language consistency --- mir_eval/transcription.py | 190 +++++++++++++++++++++----------------- 1 file changed, 104 insertions(+), 86 deletions(-) diff --git a/mir_eval/transcription.py b/mir_eval/transcription.py index a67e7f66..3c195ccb 100644 --- a/mir_eval/transcription.py +++ b/mir_eval/transcription.py @@ -12,31 +12,31 @@ There are many metrics for evaluating transcription algorithms. Here we limit ourselves to the most simple and commonly used: given two sets of notes, we -count how many estimate notes match the reference, and how many do not. Based -on these counts we compute the precision, recall, and f-measure of the estimate -given the reference. The default criteria for considering two notes to be a -match are adopted from the `MIREX Multiple fundamental frequency estimation and -tracking, Note Tracking subtask (task 2) `_: +count how many estimated notes match the reference, and how many do not. Based +on these counts we compute the precision, recall, f-measure and overlap ratio +of the estimate given the reference. The default criteria for considering two +notes to be a match are adopted from the `MIREX Multiple fundamental frequency +estimation and tracking, Note Tracking subtask (task 2) +`_: "This subtask is evaluated in two different ways. In the first setup , a -returned note is assumed correct if its onset is within +-50ms of a ref note -and its F0 is within +- quarter tone of the corresponding reference note, +returned note is assumed correct if its onset is within +-50ms of a reference +note and its F0 is within +- quarter tone of the corresponding reference note, ignoring the returned offset values. In the second setup, on top of the above requirements, a correct returned note is required to have an offset value -within 20% of the ref notes duration around the ref note's offset, or within -50ms whichever is larger." +within 20% of the reference note's duration around the reference note's +offset, or within 50ms whichever is larger." -In short, we compute precision, recall and f-measure, once without taking -offsets into account, and the second time with. +In short, we compute precision, recall, f-measure and overlap ratio, once +without taking offsets into account, and the second time with. For further details see Salamon, 2013 (page 186), and references therein: Salamon, J. (2013). Melody Extraction from Polyphonic Music Signals. Ph.D. thesis, Universitat Pompeu Fabra, Barcelona, Spain, 2013. - IMPORTANT NOTE: the evaluation code in ``mir_eval`` contains several important differences with respect to the code used in MIREX 2015 for the Note Tracking subtask on the Su dataset (henceforth "MIREX"): @@ -46,9 +46,9 @@ which can produce sub-optimal note matching. This will result in ``mir_eval``'s metrics being slightly higher compared to MIREX. 2. MIREX rounds down the onset and offset times of each note to 2 decimal - points using ``new_time = 0.01 * floor(time*100)``. ``mir_eval`` doesn't - modify the note onset and offset times. This will bring our metrics down a - notch compared to the MIREX results. + points using ``new_time = 0.01 * floor(time*100)``. ``mir_eval`` rounds down + the note onset and offset times to 4 decinal points. This will bring our + metrics down a notch compared to the MIREX results. 3. In the MIREX wiki, the criterion for matching offsets is that they must be within 0.2 * ref_duration **or 0.05 from each other, whichever is greater** (i.e. ``offset_dif <= max(0.2 * ref_duration, 0.05)``. The MIREX code @@ -66,9 +66,9 @@ ``<`` for matching onsets, offsets, and pitch values, whilst the others used ``<=`` for these checks. ``mir_eval`` provides both options: by default the latter (``<=``) is used, but you can set ``strict=True`` when calling -:func:`mir_eval.transcription.precision_recall_f1()` in which case ``<`` will -be used. The default value (``strict=False``) is the same as that used in -MIREX 2015 for the Note Tracking subtask on the Su dataset. +:func:`mir_eval.transcription.precision_recall_f1_overlap()` in which case +``<`` will be used. The default value (``strict=False``) is the same as that +used in MIREX 2015 for the Note Tracking subtask on the Su dataset. Conventions @@ -83,10 +83,24 @@ Metrics ------- -* :func:`mir_eval.transcription.precision_recall_f1`: The precision, - recall, and F-measure of the note transcription, where an estimated note is - considered correct if its pitch, onset and (optionally) offset are - sufficiently close to a reference note +* :func:`mir_eval.transcription.precision_recall_f1_overlap`: The precision, + recall, F-measure, and Average Overlap Ratio of the note transcription, + where an estimated note is considered correct if its pitch, onset and + (optionally) offset are sufficiently close to a reference note. + +* :func:`mir_eval.transcription.onset_precision_recall_f1`: The precision, + recall and F-measure of the note transcription, where an estimated note is + considered correct if its onset is sufficiently close to a reference note's + onset. That is, these metrics are computed taking only note onsets into + account, meaning two notes could be matched even if they have very different + pitch values. + +* :func:`mir_eval.transcription.offset_precision_recall_f1`: The precision, + recall and F-measure of the note transcription, where an estimated note is + considered correct if its offset is sufficiently close to a reference note's + offset. That is, these metrics are computed taking only note offsets into + account, meaning two notes could be matched even if they have very different + pitch values. ''' @@ -123,7 +137,7 @@ def validate(ref_intervals, ref_pitches, est_intervals, est_pitches): raise ValueError('Reference intervals and pitches have different ' 'lengths.') if not est_intervals.shape[0] == est_pitches.shape[0]: - raise ValueError('Estimate intervals and pitches have different ' + raise ValueError('Estimated intervals and pitches have different ' 'lengths.') # Make sure all pitch values are positive @@ -150,7 +164,7 @@ def validate_intervals(ref_intervals, est_intervals): if ref_intervals.size == 0: warnings.warn("Reference notes are empty.") if est_intervals.size == 0: - warnings.warn("Estimate notes are empty.") + warnings.warn("Estimated notes are empty.") # Validate intervals util.validate_intervals(ref_intervals) @@ -165,14 +179,15 @@ def match_note_offsets(ref_intervals, est_intervals, offset_ratio=0.2, Given two note sequences represented by ``ref_intervals`` and ``est_intervals`` (see :func:`mir_eval.io.load_valued_intervals`), we seek the largest set of correspondences ``(i, j)`` such that the offset of - ref note i has to be within ``offset_tolerance`` of the offset of est note - j, where ``offset_tolerance`` is equal to ``offset_ratio`` times the ref - note's duration, i.e. ``offset_ratio * ref_duration[i]`` where - ``ref_duration[i] = ref_intervals[i, 1] - ref_intervals[i, 0]``. If the - resulting ``offset_tolerance`` is less than ``offset_min_tolerance`` + reference note i has to be within ``offset_tolerance`` of the offset of + estimated note j, where ``offset_tolerance`` is equal to ``offset_ratio`` + times the reference note's duration, i.e. + ``offset_ratio * ref_duration[i]`` where ``ref_duration[i] = + ref_intervals[i, 1] - ref_intervals[i, 0]``. If the resulting + ``offset_tolerance`` is less than ``offset_min_tolerance`` (50 ms by default) then ``offset_min_tolerance`` is used instead. - Every ref note is matched against at most one est note. + Every reference note is matched against at most one estimated note. Note there are separate functions :func:`match_note_onsets` and :func:`match_notes` for matching notes based on onsets only or based on @@ -203,7 +218,7 @@ def match_note_offsets(ref_intervals, est_intervals, offset_ratio=0.2, ------- matching : list of tuples A list of matched reference and estimated notes. - ``matching[i] == (i, j)`` where reference note i matches estimate note + ``matching[i] == (i, j)`` where reference note i matches estimated note j. """ # set the comparison function @@ -231,7 +246,7 @@ def match_note_offsets(ref_intervals, est_intervals, offset_ratio=0.2, # Construct the graph input # Flip graph so that 'matching' is a list of tuples where the first item # in each tuple is the reference note index, and the second item is the - # estimate note index. + # estimated note index. G = {} for ref_i, est_i in zip(*hits): if est_i not in G: @@ -251,10 +266,11 @@ def match_note_onsets(ref_intervals, est_intervals, onset_tolerance=0.05, Given two note sequences represented by ``ref_intervals`` and ``est_intervals`` (see :func:`mir_eval.oi.load_valued_intervals`), we see - the largest set of correspondences ``(i,j)`` such that the onset of ref - note i is within ``onset_tolerance`` of the onset of est note j. + the largest set of correspondences ``(i,j)`` such that the onset of + reference note i is within ``onset_tolerance`` of the onset of estimated + note j. - Every ref note is matched against at most one est note. + Every reference note is matched against at most one estimated note. Note there are separate functions :func:`match_note_offsets` and :func:`match_notes` for matching notes based on offsets only or based on @@ -279,7 +295,7 @@ def match_note_onsets(ref_intervals, est_intervals, onset_tolerance=0.05, ------- matching : list of tuples A list of matched reference and estimated notes. - ``matching[i] == (i, j)`` where reference note i matches estimate note + ``matching[i] == (i, j)`` where reference note i matches estimated note j. """ # set the comparison function @@ -303,7 +319,7 @@ def match_note_onsets(ref_intervals, est_intervals, onset_tolerance=0.05, # Construct the graph input # Flip graph so that 'matching' is a list of tuples where the first item # in each tuple is the reference note index, and the second item is the - # estimate note index. + # estimated note index. G = {} for ref_i, est_i in zip(*hits): if est_i not in G: @@ -327,21 +343,21 @@ def match_notes(ref_intervals, ref_pitches, est_intervals, est_pitches, (see :func:`mir_eval.io.load_valued_intervals`), we seek the largest set of correspondences ``(i, j)`` such that: - 1. The onset of ref note i is within ``onset_tolerance`` of the onset of - est note j. - 2. The pitch of ref note i is within ``pitch_tolerance`` of the pitch of - est note j. - 3. If ``offset_ratio`` is not ``None``, the offset of ref note i has to be - within ``offset_tolerance`` of the offset of est note j, where - ``offset_tolerance`` is equal to ``offset_ratio`` times the ref note's - duration, i.e. ``offset_ratio * ref_duration[i]`` where - ``ref_duration[i] = ref_intervals[i, 1] - ref_intervals[i, 0]``. If the - resulting ``offset_tolerance`` is less than 0.05 (50 ms), 0.05 is used - instead. + 1. The onset of reference note i is within ``onset_tolerance`` of the + onset of estimated note j. + 2. The pitch of reference note i is within ``pitch_tolerance`` of the + pitch of estimated note j. + 3. If ``offset_ratio`` is not ``None``, the offset of reference note i has + to be within ``offset_tolerance`` of the offset of estimated note j, + where ``offset_tolerance`` is equal to ``offset_ratio`` times the + reference note's duration, i.e. ``offset_ratio * ref_duration[i]`` + where ``ref_duration[i] = ref_intervals[i, 1] - ref_intervals[i, 0]``. + If the resulting ``offset_tolerance`` is less than 0.05 (50 ms), 0.05 + is used instead. 4. If ``offset_ratio`` is ``None``, note offsets are ignored, and only criteria 1 and 2 are taken into consideration. - Every ref note is matched against at most one est note. + Every reference note is matched against at most one estimated note. This is useful for computing precision/recall metrics for note transcription. @@ -387,7 +403,7 @@ def match_notes(ref_intervals, ref_pitches, est_intervals, est_pitches, ------- matching : list of tuples A list of matched reference and estimated notes. - ``matching[i] == (i, j)`` where reference note i matches estimate note + ``matching[i] == (i, j)`` where reference note i matches estimated note j. """ # set the comparison function @@ -433,7 +449,7 @@ def match_notes(ref_intervals, ref_pitches, est_intervals, est_pitches, # Construct the graph input # Flip graph so that 'matching' is a list of tuples where the first item # in each tuple is the reference note index, and the second item is the - # estimate note index. + # estimated note index. G = {} for ref_i, est_i in zip(*hits): if est_i not in G: @@ -455,14 +471,14 @@ def precision_recall_f1_overlap(ref_intervals, ref_pitches, est_intervals, transcribed notes, and the Average Overlap Ratio for correctly transcribed notes (see :func:`average_overlap_ratio`). "Correctness" is determined based on note onset, pitch and (optionally) offset: an estimated note is - assumed correct if its onset is within +-50ms of a ref note and its pitch - (F0) is within +- quarter tone (50 cents) of the corresponding reference - note. If offset_ratio is None, note offsets are ignored in the comparison. - Otherwise, on top of the above requirements, a correct returned note is - required to have an offset value within 20% (by default, adjustable via the - offset_ratio parameter) of the ref note's duration around the ref note's - offset, or within offset_min_tolerance (50 ms by default), whichever is - larger. + assumed correct if its onset is within +-50ms of a reference note and its + pitch (F0) is within +- quarter tone (50 cents) of the corresponding + reference note. If offset_ratio is None, note offsets are ignored in the + comparison. Otherwise, on top of the above requirements, a correct + returned note is required to have an offset value within 20% (by default, + adjustable via the ``offset_ratio`` parameter) of the reference note's + duration around the reference note's offset, or within + ``offset_min_tolerance`` (50 ms by default), whichever is larger. Examples -------- @@ -472,13 +488,14 @@ def precision_recall_f1_overlap(ref_intervals, ref_pitches, est_intervals, ... 'estimated.txt') >>> (precision, ... recall, - ... f_measure) = mir_eval.transcription.precision_recall_f1( + ... f_measure) = mir_eval.transcription.precision_recall_f1_overlap( ... ref_intervals, ref_pitches, est_intervals, est_pitches) >>> (precision_no_offset, ... recall_no_offset, - ... f_measure_no_offset) = mir_eval.transcription.precision_recall_f1( - ... ref_intervals, ref_pitches, est_intervals, est_pitches, - ... offset_ratio=None) + ... f_measure_no_offset) = ( + ... mir_eval.transcription.precision_recall_f1_overlap( + ... ref_intervals, ref_pitches, est_intervals, est_pitches, + ... offset_ratio=None)) Parameters ---------- @@ -499,14 +516,14 @@ def precision_recall_f1_overlap(ref_intervals, ref_pitches, est_intervals, offset_ratio: float > 0 or None The ratio of the reference note's duration used to define the offset_tolerance. Default is 0.2 (20%), meaning the offset_tolerance - will equal the ref_duration * 0.2, or min_offset_tolerance (0.05 by + will equal the ref_duration * 0.2, or ``min_offset_tolerance`` (0.05 by default, i.e. 50 ms), whichever is greater. If ``offset_ratio`` is set to ``None``, offsets are ignored in the evaluation. offset_min_tolerance: float > 0 - The minimum tolerance for offset matching. See offset_ratio description - for an explanation of how the offset tolerance is determined. Note: - this parameter only influences the results if offset_ratio is not - ``None``. + The minimum tolerance for offset matching. See ``offset_ratio`` + description for an explanation of how the offset tolerance is + determined. Note: this parameter only influences the results if + ``offset_ratio`` is not ``None``. strict: bool If ``strict=False`` (the default), threshold checks for onset, offset, and pitch matching are performed using ``<=`` (less than or equal). If @@ -549,20 +566,20 @@ def precision_recall_f1_overlap(ref_intervals, ref_pitches, est_intervals, def average_overlap_ratio(ref_intervals, est_intervals, matching): - """Compute the Average Overlap Ratio between a reference and estimate - note transcription. Given a reference and corresponding estimate note, + """Compute the Average Overlap Ratio between a reference and estimated + note transcription. Given a reference and corresponding estimated note, their overlap ratio (OR) is defined as the ratio between the duration of the time segment in which the two notes overlap and the time segment spanned by the two notes combined (earliest onset to latest offset): - OR = ((min(ref_offset, est_offset) - max(ref_onset, est_onset)) / - (max(ref_offset, est_offset) - min(ref_onset, est_onset))). + >>> OR = ((min(ref_offset, est_offset) - max(ref_onset, est_onset)) / + ... (max(ref_offset, est_offset) - min(ref_onset, est_onset))) - The Average Overlap Ratio (AOR) is given the mean OR computed over all - matching reference and estimate notes. The metric goes from 0 (worst) to 1 + The Average Overlap Ratio (AOR) is given by the mean OR computed over all + matching reference and estimated notes. The metric goes from 0 (worst) to 1 (best). - Note: this function assumes the matching of reference and estimate notes + Note: this function assumes the matching of reference and estimated notes (see :func:`match_notes`) has already been performed and is provided by the `matching` parameter. Furthermore, it is highly recommended to validate the intervals (see :func:`validate_intervals`) before calling this function, @@ -577,7 +594,7 @@ def average_overlap_ratio(ref_intervals, est_intervals, matching): Array of estimated notes time intervals (onset and offset times) matching : list of tuples A list of matched reference and estimated notes. - ``matching[i] == (i, j)`` where reference note i matches estimate note + ``matching[i] == (i, j)`` where reference note i matches estimated note j. Returns @@ -603,9 +620,9 @@ def average_overlap_ratio(ref_intervals, est_intervals, matching): def onset_precision_recall_f1(ref_intervals, est_intervals, onset_tolerance=0.05, strict=False, beta=1.0): """Compute the Precision, Recall and F-measure of note onsets: an estimated - onset is considered correct if it is within +-50ms of a ref onset. Note - that this metric completely ignores note offset and note pitch. This means - an estimated onset will be considered correct if it matches a + onset is considered correct if it is within +-50ms of a reference onset. + Note that this metric completely ignores note offset and note pitch. This + means an estimated onset will be considered correct if it matches a reference onset, even if the onsets come from notes with completely different pitches (i.e. notes that would not match with `match_notes`). @@ -666,10 +683,10 @@ def offset_precision_recall_f1(ref_intervals, est_intervals, offset_ratio=0.2, beta=1.0): """Compute the Precision, Recall and F-measure of note offsets: an estimated offset is considered correct if it is within +-50ms (or 20% of - the ref note duration, which ever is greater) of a ref offset. Note + the ref note duration, which ever is greater) of a reference offset. Note that this metric completely ignores note onsets and note pitch. This means an estimated offset will be considered correct if it matches a - reference offset, even if the offsetes come from notes with completely + reference offset, even if the offsets come from notes with completely different pitches (i.e. notes that would not match with `match_notes`). @@ -693,11 +710,12 @@ def offset_precision_recall_f1(ref_intervals, est_intervals, offset_ratio=0.2, offset_ratio: float > 0 or None The ratio of the reference note's duration used to define the offset_tolerance. Default is 0.2 (20%), meaning the offset_tolerance - will equal the ref_duration * 0.2, or min_offset_tolerance (0.05 by + will equal the ref_duration * 0.2, or ``min_offset_tolerance`` (0.05 by default, i.e. 50 ms), whichever is greater. offset_min_tolerance: float > 0 - The minimum tolerance for offset matching. See offset_ratio description - for an explanation of how the offset tolerance is determined. + The minimum tolerance for offset matching. See ``offset_ratio`` + description for an explanation of how the offset tolerance is + determined. strict: bool If ``strict=False`` (the default), threshold checks for onset matching are performed using ``<=`` (less than or equal). If ``strict=True``,