developmentseed · martham93 · Nov 5, 2019 · Oct 24, 2019 · Oct 25, 2019 · Oct 25, 2019
diff --git a/docs/parameters.rst b/docs/parameters.rst
@@ -49,5 +49,16 @@ Here is the full list of configuration parameters you can specify in a ``config.
  	``'segmentation'``
  		Output is an array of shape ``(256, 256)`` with values matching the class index label at that position. The classes are applied sequentially according to ``config.json`` so latter classes will be written over earlier class labels if there is overlap.
 
+**seed**: int
+    Random generator seed. Optional, use to make results reproducible.
+
+**split_vals**: list
+    Default: ``[0.8, 0.2]``
+    Fraction of data to put in each category listed in ``split_names``. Must be a list of floats that sum to one and match the length of ``split_names``. For train, validate, and test data, a list like ``[0.7, 0.2, 0.1]`` is suggested.
+
+**split_names**: list
+    Default: ``['train', 'test']``
+    List of names for each subset of the data. Length of list must match length of ``split_vals``.
+
 **imagery_offset**:  list of ints
 	An optional list of integers representing the number of pixels to offset imagery. For example ``[15, -5]`` will move the images 15 pixels right and 5 pixels up relative to the requested tile bounds.
diff --git a/label_maker/package.py b/label_maker/package.py
@@ -9,7 +9,8 @@
 from label_maker.utils import is_tif
 
 
-def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs):
+def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_names=['train', 'test'],
+                      split_vals=[0.8, .2], **kwargs):
     """Generate an .npz file containing arrays for training machine learning algorithms
 
     Parameters
@@ -28,16 +29,26 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
     ml_type: str
         Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
     seed: int
-        Random generator seed. Optional, use to make results reproducable.
-    train_size: float
-        Portion of the data to use in training, the remainder is used as test data (default 0.8)
+        Random generator seed. Optional, use to make results reproducible.
+    split_vals: list
+        Default: [0.8, 0.2]
+        Fraction of data to put in each category listed in split_names.
+        Must be floats that sum to one, with one value per entry in split_names.
+    split_names: list
+        Default: ['train', 'test']
+        List of names for each subset of the data.
     **kwargs: dict
         Other properties from CLI config passed as keywords to other utility functions
     """
     # if a seed is given, use it
     if seed:
         np.random.seed(seed)
 
+    if len(split_names) != len(split_vals):
+        raise ValueError('`split_names` and `split_vals` must be the same length. Please update your config.')
+    if not np.isclose(sum(split_vals), 1):
+        raise ValueError('`split_vals` must sum to one. Please update your config.')
+
     # open labels file, create tile array
     labels_file = op.join(dest_folder, 'labels.npz')
     labels = np.load(labels_file)
@@ -60,7 +71,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
     # open the images and load those plus the labels into the final arrays
     o = urlparse(imagery)
     _, image_format = op.splitext(o.path)
-    if is_tif(imagery): # if a TIF is provided, use jpg as tile format
+    if is_tif(imagery):  # if a TIF is provided, use jpg as tile format
         image_format = '.jpg'
     for tile in tiles:
         image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format))
@@ -86,16 +97,28 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
         elif ml_type == 'segmentation':
             y_vals.append(labels[tile][..., np.newaxis])  # Add grayscale channel
 
-    # split into train and test
-    split_index = int(len(x_vals) * train_size)
-
-    # convert lists to numpy arrays
+    # Convert lists to numpy arrays
     x_vals = np.array(x_vals, dtype=np.uint8)
     y_vals = np.array(y_vals, dtype=np.uint8)
 
-    print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
-    np.savez(op.join(dest_folder, 'data.npz'),
-             x_train=x_vals[:split_index, ...],
-             y_train=y_vals[:split_index, ...],
-             x_test=x_vals[split_index:, ...],
-             y_test=y_vals[split_index:, ...])
+    # Get number of samples per split from the float proportions (as an array so the zero check below is elementwise)
+    split_n_samps = np.array([len(x_vals) * val for val in split_vals])
+
+    if np.any(split_n_samps == 0):
+        raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.')
+
+    # Convert into a cumulative sum and truncate to a concrete integer dtype to get split indices
+    split_inds = np.cumsum(split_n_samps).astype(int)
+
+    # Exclude last index as `np.split` handles splitting without that value
+    split_arrs_x = np.split(x_vals, split_inds[:-1])
+    split_arrs_y = np.split(y_vals, split_inds[:-1])
+
+    save_dict = {}
+
+    for si, split_name in enumerate(split_names):
+        save_dict[f'x_{split_name}'] = split_arrs_x[si]
+        save_dict[f'y_{split_name}'] = split_arrs_y[si]
+
+    np.savez(op.join(dest_folder, 'data.npz'), **save_dict)
+    print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
diff --git a/label_maker/validate.py b/label_maker/validate.py
@@ -30,5 +30,7 @@
     'background_ratio': {'type': 'float'},
     'ml_type': {'allowed': ['classification', 'object-detection', 'segmentation'], 'required': True},
     'seed': {'type': 'integer'},
-    'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2}
+    'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2},
+    'split_vals': {'type': 'list', 'schema': {'type': 'float'}},
+    'split_names': {'type': 'list', 'schema': {'type': 'string'}}
 }
diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json
@@ -0,0 +1,23 @@
+{"country": "portugal",
+  "bounding_box": [
+    -9.4575,
+    38.8467,
+    -9.4510,
+    38.8513
+  ],
+  "zoom": 17,
+  "classes": [
+    { "name": "Water Tower", "filter": ["==", "man_made", "water_tower"] },
+    { "name": "Building", "filter": ["has", "building"] },
+    { "name": "Farmland", "filter": ["==", "landuse", "farmland"] },
+    { "name": "Ruins", "filter": ["==", "historic", "ruins"] },
+    { "name": "Parking", "filter": ["==", "amenity", "parking"] },
+    { "name": "Roads", "filter": ["has", "highway"] }
+  ],
+  "imagery": "https://api.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN",
+  "background_ratio": 1,
+  "ml_type": "classification",
+  "seed": 19,
+  "split_names": ["train", "test", "val"],
+  "split_vals": [0.7, 0.2, 0.1]
+}
diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py
@@ -7,17 +7,24 @@
 
 import numpy as np
 
+
 class TestClassificationPackage(unittest.TestCase):
     """Tests for classification package creation"""
+
     @classmethod
     def setUpClass(cls):
         makedirs('integration-cl')
         copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz')
         copytree('test/fixtures/integration/tiles', 'integration-cl/tiles')
 
+        makedirs('integration-cl-split')
+        copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl-split/labels.npz')
+        copytree('test/fixtures/integration/tiles', 'integration-cl-split/tiles')
+
     @classmethod
     def tearDownClass(cls):
         rmtree('integration-cl')
+        rmtree('integration-cl-split')
 
     def test_cli(self):
         """Verify data.npz produced by CLI"""
@@ -48,3 +55,22 @@ def test_cli(self):
              [0, 0, 0, 0, 0, 0, 1]]
         )
         self.assertTrue(np.array_equal(data['y_test'], expected_y_test))
+
+    def test_cli_3way_split(self):
+        """Verify data.npz produced by CLI when split into train/test/val"""
+
+        cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json'
+        cmd = cmd.split(' ')
+        subprocess.run(cmd, universal_newlines=True)
+
+        data = np.load('integration-cl-split/data.npz')
+
+        # validate our image data with shapes
+        self.assertEqual(data['x_train'].shape, (5, 256, 256, 3))
+        self.assertEqual(data['x_test'].shape, (2, 256, 256, 3))
+        self.assertEqual(data['x_val'].shape, (1, 256, 256, 3))
+
+        # validate label data with shapes
+        self.assertEqual(data['y_train'].shape, (5, 7))
+        self.assertEqual(data['y_test'].shape, (2, 7))
+        self.assertEqual(data['y_val'].shape, (1, 7))