Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to split data into train/test/validate sets #149

Merged
merged 23 commits into from
Nov 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,16 @@ Here is the full list of configuration parameters you can specify in a ``config.
``'segmentation'``
Output is an array of shape ``(256, 256)`` with values matching the class index label at that position. The classes are applied sequentially in the order they appear in ``config.json``, so where classes overlap, later classes overwrite the labels of earlier ones.

**seed**: int
Random generator seed. Optional, use to make results reproducible.

**split_vals**: list
Default: ``[0.8, 0.2]``
Fraction of the data to put in each category listed in ``split_names``. Must be a list of floats that sum to one, and its length must match the length of ``split_names``. For train, validate, and test data, a list like ``[0.7, 0.2, 0.1]`` is suggested.

**split_names**: list
Default: ``['train', 'test']``
List of names for each subset of the data. The length of this list must match the length of ``split_vals``.

**imagery_offset**: list of ints
An optional list of integers representing the number of pixels to offset imagery. For example ``[15, -5]`` will move the images 15 pixels right and 5 pixels up relative to the requested tile bounds.
53 changes: 38 additions & 15 deletions label_maker/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from label_maker.utils import is_tif


def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs):
def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_names=['train', 'test'],
split_vals=[0.8, .2], **kwargs):
"""Generate an .npz file containing arrays for training machine learning algorithms

Parameters
Expand All @@ -28,16 +29,26 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
ml_type: str
Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
seed: int
Random generator seed. Optional, use to make results reproducable.
train_size: float
Portion of the data to use in training, the remainder is used as test data (default 0.8)
Random generator seed. Optional, use to make results reproducible.
split_vals: list
Default: [0.8, 0.2]
Percentage of data to put in each category listed in split_names.
Must be floats and must sum to one.
split_names: list
Default: ['train', 'test']
List of names for each subset of the data.
**kwargs: dict
Other properties from CLI config passed as keywords to other utility functions
"""
# if a seed is given, use it
if seed:
np.random.seed(seed)

if len(split_names) != len(split_vals):
raise ValueError('`split_names` and `split_vals` must be the same length. Please update your config.')
if not np.isclose(sum(split_vals), 1):
raise ValueError('`split_vals` must sum to one. Please update your config.')

# open labels file, create tile array
labels_file = op.join(dest_folder, 'labels.npz')
labels = np.load(labels_file)
Expand All @@ -60,7 +71,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
# open the images and load those plus the labels into the final arrays
o = urlparse(imagery)
_, image_format = op.splitext(o.path)
if is_tif(imagery): # if a TIF is provided, use jpg as tile format
if is_tif(imagery): # if a TIF is provided, use jpg as tile format
image_format = '.jpg'
for tile in tiles:
image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format))
Expand All @@ -86,16 +97,28 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
elif ml_type == 'segmentation':
y_vals.append(labels[tile][..., np.newaxis]) # Add grayscale channel

# split into train and test
split_index = int(len(x_vals) * train_size)

# convert lists to numpy arrays
# Convert lists to numpy arrays
x_vals = np.array(x_vals, dtype=np.uint8)
y_vals = np.array(y_vals, dtype=np.uint8)

martham93 marked this conversation as resolved.
Show resolved Hide resolved
print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
np.savez(op.join(dest_folder, 'data.npz'),
x_train=x_vals[:split_index, ...],
y_train=y_vals[:split_index, ...],
x_test=x_vals[split_index:, ...],
y_test=y_vals[split_index:, ...])
# Get number of data samples per split from the float proportions
split_n_samps = [len(x_vals) * val for val in split_vals]

if np.any(split_n_samps == 0):
raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.')

# Convert into a cumulative sum to get indices
split_inds = np.cumsum(split_n_samps).astype(np.integer)

# Exclude last index as `np.split` handles splitting without that value
split_arrs_x = np.split(x_vals, split_inds[:-1])
split_arrs_y = np.split(y_vals, split_inds[:-1])

save_dict = {}

for si, split_name in enumerate(split_names):
save_dict[f'x_{split_name}'] = split_arrs_x[si]
save_dict[f'y_{split_name}'] = split_arrs_y[si]

np.savez(op.join(dest_folder, 'data.npz'), **save_dict)
print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
4 changes: 3 additions & 1 deletion label_maker/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,7 @@
'background_ratio': {'type': 'float'},
'ml_type': {'allowed': ['classification', 'object-detection', 'segmentation'], 'required': True},
'seed': {'type': 'integer'},
'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2}
'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2},
'split_vals': {'type': 'list', 'schema': {'type': 'float'}},
'split_names': {'type': 'list', 'schema': {'type': 'string'}}
}
23 changes: 23 additions & 0 deletions test/fixtures/integration/config_3way.integration.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{"country": "portugal",
"bounding_box": [
-9.4575,
38.8467,
-9.4510,
38.8513
],
"zoom": 17,
"classes": [
{ "name": "Water Tower", "filter": ["==", "man_made", "water_tower"] },
{ "name": "Building", "filter": ["has", "building"] },
{ "name": "Farmland", "filter": ["==", "landuse", "farmland"] },
{ "name": "Ruins", "filter": ["==", "historic", "ruins"] },
{ "name": "Parking", "filter": ["==", "amenity", "parking"] },
{ "name": "Roads", "filter": ["has", "highway"] }
],
"imagery": "https://api.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN",
"background_ratio": 1,
"ml_type": "classification",
"seed": 19,
"split_names": ["train", "test", "val"],
"split_vals": [0.7, 0.2, 0.1]
}
26 changes: 26 additions & 0 deletions test/integration/test_classification_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,24 @@

import numpy as np


class TestClassificationPackage(unittest.TestCase):
"""Tests for classification package creation"""

@classmethod
def setUpClass(cls):
makedirs('integration-cl')
copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz')
copytree('test/fixtures/integration/tiles', 'integration-cl/tiles')

makedirs('integration-cl-split')
copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl-split/labels.npz')
copytree('test/fixtures/integration/tiles', 'integration-cl-split/tiles')

@classmethod
def tearDownClass(cls):
rmtree('integration-cl')
rmtree('integration-cl-split')

def test_cli(self):
"""Verify data.npz produced by CLI"""
Expand Down Expand Up @@ -48,3 +55,22 @@ def test_cli(self):
[0, 0, 0, 0, 0, 0, 1]]
)
self.assertTrue(np.array_equal(data['y_test'], expected_y_test))

def test_cli_3way_split(self):
"""Verify data.npz produced by CLI when split into train/test/val"""

cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json'
cmd = cmd.split(' ')
subprocess.run(cmd, universal_newlines=True)

data = np.load('integration-cl-split/data.npz')

# validate our image data with shapes
self.assertEqual(data['x_train'].shape, (5, 256, 256, 3))
self.assertEqual(data['x_test'].shape, (2, 256, 256, 3))
self.assertEqual(data['x_val'].shape, (1, 256, 256, 3))

# validate label data with shapes
self.assertEqual(data['y_train'].shape, (5, 7))
self.assertEqual(data['y_test'].shape, (2, 7))
self.assertEqual(data['y_val'].shape, (1, 7))