-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataloader_NLST.py
388 lines (301 loc) · 14.1 KB
/
dataloader_NLST.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
from __future__ import print_function, division
import sys
import os, os.path
import torch
import numpy as np
import pydicom
import pandas as pd
import math
from torch.utils.data import Dataset
import skimage.io
import skimage.transform
import skimage.color
import skimage
local_mip_utils_path = '/Users/Yesh/Documents/BDRAD/chest_ct_projects/lung-nodule-tracker/utils'
if os.path.exists(local_mip_utils_path):
sys.path.insert(1, local_mip_utils_path)
else:
sys.path.insert(1, '/home/bdrad1/chest_ct_projects/lung-nodule-tracker/utils')
import Mip_Utility as mip_utils # utils for LUNA data
"""
The building block for NLST dataloader, it contains the annotations and the
corresponding patient CT.
The CTs for different patient and study year are loaded upon request and
the annotations are loaded only once.
- Returns a dict containing sboth the axial and coronal MIP projects for each kernel
General Schema for Return Dict: kernel: {view: {scan_with_mip_project, details}}
"""
class NLSTDataset(Dataset):
def __init__(self, fp_anns, data_dir, mip=25, nodule_cts_only=False, transform=None, all_kernels=True, voxel_transform=True,mask=True):
"""
fp_anns = filepath to 'sctabn.csv' annotations file, or pandas dataframe object itself
data_dir = directory to NLST folder containing pids & CT images
pids = Optional. Integer to specify first N pids OR array of specific PIDs you want to test
(e.g. pids=100 OR pids=[100002, 100004, 100005, ...])
mip = Apply Maximum Intensity Projections, units are in 1mm (i.e. mip=25 -> 25mm MIP applied)
nodule_cts_only = Use only CTs with a nodule.
no_voxel_transform = Apply 1mm voxel transformation to speed up preprocessing
mask = Apply lung mask to segment out lungs using voxel thresholding
"""
self.all_kernels = all_kernels
self.voxel_transform = voxel_transform
self.mask=mask
if isinstance(fp_anns, str):
self.anns = pd.read_csv(fp_anns)
elif isinstance(fp_anns, pd.DataFrame):
self.anns = fp_anns
else:
raise ValueError("fp_anns parameter must be the path to the csv (str) or a pandas dataframe object")
# TODO: After moving the code for removing examples which aren't present in the local dataset, this code may break
# double check this code when running analysis script
self.nodule_cts_only = nodule_cts_only
if nodule_cts_only:
self.anns = self.get_nodule_cts_only(self.anns)
# remove duplicates from annotations b/c we use this to loop through studies
self.anns = self.anns.drop_duplicates(subset=['pid', 'STUDY_YR'])
self.data_dir = data_dir
self.mip = mip
self.transform = transform
def __len__(self):
return len(self.anns)
def get_nodule_cts_only(self, ann):
return ann[(ann['sct_slice_num'] > 0) & (ann['sct_slice_num'] != 999)]
def convert_to_coronal(self, img_a):
# img_a is a numpy array of axial slices of shape (n_axial_slices, x, y)
# img_c is returned which is numpy array of coronal slices of shape (x, n_axial_slices, y), where x = n_coronal_slices
slices, x, y = img_a.shape
img_c = np.zeros([x, slices, y])
for i, s in enumerate(img_a):
img_c[:,i,:] = s
return img_c
def load_NLST_images(self, pid, study_yr, load_dcms=False):
"""
pid = patient id
study_yr = 0, 1, 2, ...
load_dcms = True if you want func to return pydicom objects along with img_arrays
Returns:
1: an array of numpy arrays containing the patient PID's scan in
study_yr
2: spcing for adjusting slice thickness in mip
"""
data_dir = self.data_dir
pid_dir = os.path.join(data_dir,str(pid))
if os.path.isdir(pid_dir) == False:
return {}
study_yr = 'T' + str(study_yr)
study_dir = os.path.join(pid_dir,study_yr)
"""
Return a map of all images with kenel name, so far
the function could potentially be returning different datastructures
but could optimize a bit later.
"""
mapping = {}
for kernel_foldername in self.listdir_nohidden(study_dir):
dcms = []
file_paths = [] # May just want to append kernel_dir instead
kernel = None
kernel_dir = os.path.join(study_dir, kernel_foldername)
# check if the folder contains scout images
if len([name for name in os.listdir(kernel_dir)]) > 10:
for filename in self.listdir_nohidden(kernel_dir):
dcm_path = os.path.join(kernel_dir, filename)
ds = pydicom.dcmread(dcm_path, force = True)
try:
kernel = ds.ConvolutionKernel
except Exception as e:
print('-'*30)
print('pid: {} study_yr: {}'.format(pid, study_yr))
print('Kernel foldername: {}'.format(kernel_foldername))
print(e)
kernel = ds.ConvolutionKernel
dcms.append(ds)
file_paths.append(dcm_path)
spacing = np.array([float(ds.SliceThickness), float(ds.PixelSpacing[0]), float(ds.PixelSpacing[1])])
# reorder dcms by patient position
dcms.sort(key = lambda dcm : -dcm.ImagePositionPatient[2]) # reorder dicoms along z (axial) axis
# add to return object with kernel as key
# - if multiple versions of the same kernel, return only the one with the smallest SliceThickness
# check if new kernel with same the kernel_name has a larger sliceThickness. If so, skip adding this kernel
if kernel.lower() in mapping.keys():
if spacing[0] > mapping[kernel.lower()]['spacing'][0]:
continue
mapping[kernel.lower()] = {}
mapping[kernel.lower()]['file_path'] = kernel_dir
mapping[kernel.lower()]['image_position_patient'] = dcms[0].ImagePositionPatient
mapping[kernel.lower()]['spacing'] = spacing
mapping[kernel.lower()]['img_array'] = np.array([dcm.pixel_array for dcm in dcms])
if load_dcms:
mapping[kernel.lower()]['dcms'] = dcms
return mapping
def __getitem__(self, index):
"""
Returns mapping = {
kernel_name1: {
img_array: all ct slices preprocessed,
slice_num_new: slice number of nodule after preprocessing,
slice_num_orig: slice number of nodule before preprocessing,
pid: patient ID,
study_yr: Study Year,
},
kernel_name2: {...},
...
}
2. adjusted slice
3. original slice number of nodule
"""
# select sample
mip = self.mip
ann = self.anns.iloc[index]
pid = ann['pid']
study_yr = ann['STUDY_YR']
print('-'*30)
print('Preprocessing: {}_{}'.format(pid, study_yr))
mapping = self.load_NLST_images(pid, study_yr)
output = {}
# inference on preferenced kernel, otherwise inference on all kernels
# - preference list in order from highest to lower preference
preferenced_kernels = ['standard', 'bone', 'b50f', 'b30f', 'd', 'c']
# get the kernel that matches highest on the preference list
kernels = next(([k] for k in preferenced_kernels if k in mapping.keys()), list(mapping.keys()))
for kernel in kernels:
item = mapping[kernel]
img_array = item['img_array']
spacing = item['spacing']
file_path = item['file_path']
# Resize slice (this step takes a long time, about 70 seconds)
if self.voxel_transform:
img_array, new_spacing = mip_utils.resize_image(img_array, spacing)
axial_conversion = (spacing/new_spacing)[0]
saggital_conversion = (spacing/new_spacing)[1]
coronal_conversion = (spacing/new_spacing)[2]
else:
axial_conversion = spacing[0]
saggital_conversion = spacing[1]
coronal_conversion = spacing[2]
# segment lungs
if self.mask:
img_array = mip_utils.segment_lung_from_ct_scan(img_array)
# apply transformations to both axial and coronal views
view_output = {}
views = ['axial', 'coronal']
for view in views:
# switch axis to coronal
if view == 'axial':
img_array_view = img_array
elif view == 'coronal':
img_array_view = self.convert_to_coronal(img_array)
# scale mip to appropriate value if not transforming to 1mm voxels (i.e. mip_utils.resize_image function)
if not self.voxel_transform:
if view == 'axial':
mip = int(mip/axial_conversion)
elif view == 'coronal':
mip = int(mip/coronal_conversion)
# Apply MIP - must be run after mip_utils.resize_image()!!!
img_array_view = mip_utils.createMIP(img_array_view, slices_num=mip)
# Convert to 3 channels
# img_array= np.stack((img_array,) * 3, axis=-1)
img_array_view = np.expand_dims(img_array_view, axis=-1)
# scale to 0-1
img_array_view = ((img_array_view - img_array_view.min()) * (1/(img_array_view.max() - img_array_view.min())))
# transforms
if not (self.transform is None):
# Transforms are applied to each slice
transformed_img_array = []
for img in img_array_view:
sample = {'img': img}
sample = self.transform(sample)
scale = sample['scale']
x = torch.unsqueeze(sample['img'], 0)
transformed_img_array.append(x)
transformed_img_array = torch.cat(transformed_img_array, 0)
img_array_view = transformed_img_array
view_output[view] = {
'scale' : scale,
'img_array': img_array_view,
'axial_conversion': axial_conversion,
'saggital_conversion': saggital_conversion,
'coronal_conversion': coronal_conversion,
'pid': pid,
'study_yr': study_yr,
'file_path': file_path}
# cleanup to reduce CPU RAM use
del img_array_view
del img_array
output[kernel] = view_output
return output
def listdir_nohidden(self, path):
for f in os.listdir(path):
if not f.startswith('.'):
yield f
def collater(data):
imgs = [s['img'] for s in data]
scales = [s['scale'] for s in data]
widths = [int(s.shape[0]) for s in imgs]
heights = [int(s.shape[1]) for s in imgs]
batch_size = len(imgs)
max_width = np.array(widths).max()
max_height = np.array(heights).max()
padded_imgs = torch.zeros(batch_size, max_width, max_height, 3)
for i in range(batch_size):
img = imgs[i]
padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img
padded_imgs = padded_imgs.permute(0, 3, 1, 2)
return {'img': padded_imgs, 'scale': scales}
class Resizer(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, sample, min_side=608, max_side=1024):
image = sample['img']
rows, cols, cns = image.shape
smallest_side = min(rows, cols)
# rescale the image so the smallest side is min_side
scale = min_side / smallest_side
# check if the largest side is now greater than max_side, which can happen
# when images have a large aspect ratio
largest_side = max(rows, cols)
if largest_side * scale > max_side:
scale = max_side / largest_side
# resize the image with the computed scale
image = skimage.transform.resize(image, (int(round(rows*scale)), int(round((cols*scale)))),
mode='constant',
anti_aliasing=None)
rows, cols, cns = image.shape
pad_w = 32 - rows%32
pad_h = 32 - cols%32
new_image = np.zeros((rows + pad_w, cols + pad_h, cns)).astype(np.float32)
new_image[:rows, :cols, :] = image.astype(np.float32)
return {'img': torch.from_numpy(new_image), 'scale': scale}
class Normalizer(object):
def __init__(self):
self.mean = np.array([[[0.485, 0.456, 0.406]]])
self.std = np.array([[[0.229, 0.224, 0.225]]])
def __call__(self, sample):
"""
Normalizes sample based on provided mean and std statistics
"""
image = sample['img']
num_channels = image.shape[2]
if num_channels == 1:
mean = self.mean[:, :, 0]
std = self.std[:, :, 0]
return {'img':((image.astype(np.float32)-mean)/std)}
class UnNormalizer(object):
def __init__(self, mean=None, std=None):
if mean == None:
self.mean = [0.485, 0.456, 0.406]
else:
self.mean = mean
if std == None:
self.std = [0.229, 0.224, 0.225]
else:
self.std = std
def __call__(self, tensor):
"""
Undoes normalization from Normalizer transformation
Args:
tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
Returns:
Tensor: Un-Normalized image.
"""
for t, m, s in zip(tensor, self.mean, self.std):
t.mul_(s).add_(m)
return tensor