-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathinter_rater_reliability.py
135 lines (108 loc) · 5.94 KB
/
inter_rater_reliability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
#
# These functions were used to evaluate the inter-rater reliability of the lesion segmentation in the study:
# Eden et al., "Spatial distribution of multiple sclerosis lesions in the cervical spinal cord", manuscript under revision (submitted to Brain, 2018).
#
import numpy as np
from scipy import ndimage
import spinalcordtoolbox.image as msct_image
def compute_lesion_wise_metrics(mask_rater, mask_consensus, threshold_overlap=0.5):
    '''
    Compute the lesion-wise sensitivity and precision of a rater's mask
    with respect to the consensus reading mask.

    Lesions are the connected components returned by scipy.ndimage.label, which
    treats every non-zero voxel as foreground. Sizes and overlaps are therefore
    measured as voxel counts (not intensity sums), so masks that are not
    strictly 0/1 are handled consistently with the labelling.

    Input:
        - mask_rater [numpy.array]: manual segmentation mask of one rater
        - mask_consensus [numpy.array]: consensus reading mask obtained considering all raters' masks
        - threshold_overlap [float]: value between 0. and 1.
            A consensus lesion is counted as detected (true positive) when the rater
            segmentation covers at least [threshold_overlap] of its voxels; otherwise
            it is counted as not detected (false negative).
            A rater lesion is counted as a false positive when it does not overlap
            any detected consensus lesion (this includes rater lesions that only
            touch insufficiently covered consensus lesions).

    Output:
        - sensitivity [float or None]: TP / (TP + FN); None when the consensus mask has no lesion.
        - precision [float or None]: TP / (TP + FP); None when there is neither
            a true positive nor a false positive.
    '''
    mask_rater = np.asarray(mask_rater)
    mask_consensus = np.asarray(mask_consensus)

    # label each individual lesion of both input masks
    labels_rater, nb_rater = ndimage.label(mask_rater)
    labels_consensus, nb_consensus = ndimage.label(mask_consensus)

    # foreground of the rater mask, using the same "non-zero" convention as ndimage.label
    rater_foreground = mask_rater != 0

    # matched_rater[k] is True once rater lesion k+1 overlaps a detected consensus lesion
    matched_rater = np.zeros(nb_rater, dtype=bool)

    tp, fn = 0, 0
    for consensus_label in range(1, nb_consensus + 1):
        region = labels_consensus == consensus_label
        # number of voxels of the current consensus lesion
        lesion_size = np.count_nonzero(region)
        # number of voxels of that lesion also segmented by the rater
        nb_overlap = np.count_nonzero(rater_foreground[region])

        if nb_overlap >= threshold_overlap * lesion_size:  # sufficient overlap
            tp += 1
            # every rater lesion touching this consensus lesion is not a false positive
            touching = np.unique(labels_rater[region])
            matched_rater[touching[touching != 0] - 1] = True
        else:  # insufficient overlap
            fn += 1

    # rater lesions never matched to a detected consensus lesion
    fp = nb_rater - int(np.count_nonzero(matched_rater))

    sensitivity, precision = None, None
    if tp + fn != 0:
        sensitivity = float(tp) / (tp + fn)
    if tp + fp != 0:
        precision = float(tp) / (tp + fp)

    return sensitivity, precision
def compute_dice_coefficient(mask_rater, mask_consensus):
    '''
    Compute the Dice coefficient between the mask of a rater
    and the consensus reading mask.

    Both masks are binarised first: every non-zero voxel counts as segmented.

    Input:
        - mask_rater [numpy.array]: manual segmentation mask of one rater
        - mask_consensus [numpy.array]: consensus reading mask obtained considering all raters' masks

    Output:
        - dice [float or None]: value between 0. and 1., defined as in "Measures of the Amount
            of Ecologic Association Between Species", Lee R. Dice (1945), i.e.
            2 * |rater AND consensus| / (|rater| + |consensus|).
            None when both masks are empty (the ratio is undefined).
    '''
    # builtin bool: the np.bool alias was removed from NumPy (1.24)
    mask_rater = np.asarray(mask_rater).astype(bool)
    mask_consensus = np.asarray(mask_consensus).astype(bool)

    intersection = np.count_nonzero(mask_rater & mask_consensus)
    total_size = np.count_nonzero(mask_rater) + np.count_nonzero(mask_consensus)

    if total_size == 0:
        return None

    return 2. * intersection / float(total_size)
def generate_consensus_reading(fname_raterMasks_lst, threshold=0.5, fname_out='./consensus_reading.nii.gz'):
    '''
    Generate a consensus reading mask from several segmentation masks of the same raw image
    and save it to [fname_out].
    In the generated mask, 1 indicates that this voxel is part of the consensus reading, 0 otherwise.

    Input:
        - fname_raterMasks_lst [list]: list of nifti binary mask filenames.
            They are expected to all have the same size,
            since they are all masks of the same raw image.
        - threshold [float]: value between 0. and 1.
            A voxel is labelled 1 in the output consensus mask
            if at least [threshold] of the raters segmented this voxel as lesion.
        - fname_out [string]: filename of the consensus reading mask generated.
    '''
    # load every rater mask once; the first one also serves as template for the output image
    im_raters = [msct_image.Image(fname_raterMask) for fname_raterMask in fname_raterMasks_lst]

    # number of agreeing raters needed to consider a voxel as part of the consensus reading mask.
    # Round up (not down) so that fewer than [threshold] of the raters is never sufficient,
    # e.g. 0.5 with 3 raters requires 2 votes. The inner rounding absorbs floating-point
    # noise such as 0.07 * 100 == 7.000000000000001.
    threshold_n = int(np.ceil(np.round(threshold * len(im_raters), 6)))

    # sum all the rater segmentations (per-voxel number of votes for binary masks)
    data_sum = np.sum(np.asarray([im_rater.data for im_rater in im_raters]), axis=0)

    # binarise the vote count, using the dtype requested for the output image
    data_thresh = (data_sum >= threshold_n).astype(np.uint8)

    # save the resulting array as an Image sharing the first rater mask's layout
    im_consensus = msct_image.zeros_like(im_raters[0], dtype=np.uint8)
    im_consensus.data = data_thresh
    im_consensus.save(fname_out)