From e9c86aadf0ebe4c106d2b6c647f1cb33b709a137 Mon Sep 17 00:00:00 2001
From: Martha Morrissey <marthamorrissey@Marthas-MacBook-Pro-2.local>
Date: Thu, 24 Oct 2019 17:37:51 -0400
Subject: [PATCH 01/23] add new arguments and basic logic checks around them

---
 label_maker/package.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index 4d717c9..47e814c 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -9,7 +9,8 @@
 from label_maker.utils import is_tif
 
 
-def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs):
+def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_names=['train', 'test'],
+                      split_vals=[0.8, .2], **kwargs):
     """Generate an .npz file containing arrays for training machine learning algorithms
 
     Parameters
@@ -38,6 +39,12 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
     if seed:
         np.random.seed(seed)
 
+    assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." 
+    assert sum(split_vals) == 1, "split_vals must sum to one."
+
+
+
+
     # open labels file, create tile array
     labels_file = op.join(dest_folder, 'labels.npz')
     labels = np.load(labels_file)

From a2989a62d42251686397495e6a45999025ca8ca1 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Thu, 24 Oct 2019 22:31:28 -0400
Subject: [PATCH 02/23] initial try using np split

---
 label_maker/package.py | 54 +++++++++++++++++++++++++++++-------------
 1 file changed, 38 insertions(+), 16 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index 47e814c..722621d 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -29,9 +29,14 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     ml_type: str
         Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
     seed: int
-        Random generator seed. Optional, use to make results reproducable.
-    train_size: float
-        Portion of the data to use in training, the remainder is used as test data (default 0.8)
+        Random generator seed. Optional, use to make results reproducible.
+
+    split_vals: lst
+        Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one.
+
+    split_names: lst
+        List of names for each subset of the data.
+
     **kwargs: dict
         Other properties from CLI config passed as keywords to other utility functions
     """
@@ -39,12 +44,9 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     if seed:
         np.random.seed(seed)
 
-    assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." 
+    assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length."
     assert sum(split_vals) == 1, "split_vals must sum to one."
 
-
-
-
     # open labels file, create tile array
     labels_file = op.join(dest_folder, 'labels.npz')
     labels = np.load(labels_file)
@@ -67,7 +69,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     # open the images and load those plus the labels into the final arrays
     o = urlparse(imagery)
     _, image_format = op.splitext(o.path)
-    if is_tif(imagery): # if a TIF is provided, use jpg as tile format
+    if is_tif(imagery):  # if a TIF is provided, use jpg as tile format
         image_format = '.jpg'
     for tile in tiles:
         image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format))
@@ -93,16 +95,36 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
         elif ml_type == 'segmentation':
             y_vals.append(labels[tile][..., np.newaxis])  # Add grayscale channel
 
-    # split into train and test
-    split_index = int(len(x_vals) * train_size)
-
     # convert lists to numpy arrays
     x_vals = np.array(x_vals, dtype=np.uint8)
     y_vals = np.array(y_vals, dtype=np.uint8)
 
+    x_vals_split_lst = np.split(x_vals,
+                                [int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))])
+
+    if len(x_vals_split_lst[-1]) == 0:
+        x_vals_split_lst = x_vals_split_lst[:-1]
+
+    y_vals_split_lst = np.split(y_vals,
+                                [int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))])
+
+    if len(y_vals_split_lst[-1]) == 0:
+        y_vals_split_lst = y_vals_split_lst[:-1]
+
     print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
-    np.savez(op.join(dest_folder, 'data.npz'),
-             x_train=x_vals[:split_index, ...],
-             y_train=y_vals[:split_index, ...],
-             x_test=x_vals[split_index:, ...],
-             y_test=y_vals[split_index:, ...])
+
+    if len(split_vals == 2):
+        np.savez(op.join(dest_folder, 'data.npz'),
+                 x_train=x_vals_split_lst[0],
+                 y_train=y_vals_split_lst[0],
+                 x_test=x_vals_split_lst[1],
+                 y_test=y_vals_split_lst[1])
+
+    if len(split_vals == 3):
+        np.savez(op.join(dest_folder, 'data.npz'),
+                 x_train=x_vals_split_lst[0],
+                 y_train=y_vals_split_lst[1],
+                 x_test=x_vals_split_lst[1],
+                 y_test=y_vals_split_lst[1],
+                 x_val=x_vals_split_lst[2],
+                 y_val=y_vals_split_lst[2])

From 516575308e9aa056e3c6a987b7bcf64b661d695d Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Fri, 25 Oct 2019 08:13:06 -0400
Subject: [PATCH 03/23] clearer doc strings

---
 label_maker/package.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index 722621d..c42ccfe 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -35,7 +35,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
         Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one.
 
     split_names: lst
-        List of names for each subset of the data.
+        List of names for each subset of the data, either ['train', 'test'] or ['train', 'test', 'val']
 
     **kwargs: dict
         Other properties from CLI config passed as keywords to other utility functions
@@ -44,6 +44,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     if seed:
         np.random.seed(seed)
 
+    assert len(split_names) == 2 or len(split_names) == 3.
     assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length."
     assert sum(split_vals) == 1, "split_vals must sum to one."
 

From a1af80485c8a5ac62b031e710f1b7f57f2d8efd3 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Fri, 25 Oct 2019 10:30:43 -0400
Subject: [PATCH 04/23] fix logic

---
 label_maker/package.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index c42ccfe..0453252 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -114,14 +114,14 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
 
     print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
 
-    if len(split_vals == 2):
+    if len(split_vals) == 2:
         np.savez(op.join(dest_folder, 'data.npz'),
                  x_train=x_vals_split_lst[0],
                  y_train=y_vals_split_lst[0],
                  x_test=x_vals_split_lst[1],
                  y_test=y_vals_split_lst[1])
 
-    if len(split_vals == 3):
+    if len(split_vals) == 3:
         np.savez(op.join(dest_folder, 'data.npz'),
                  x_train=x_vals_split_lst[0],
                  y_train=y_vals_split_lst[1],

From 492fa92382d2a4766504e913cdc454c6d9ed1a16 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Fri, 25 Oct 2019 11:13:41 -0400
Subject: [PATCH 05/23] config json new checks

---
 label_maker/validate.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/label_maker/validate.py b/label_maker/validate.py
index 2a60425..f0da9d4 100644
--- a/label_maker/validate.py
+++ b/label_maker/validate.py
@@ -30,5 +30,7 @@
     'background_ratio': {'type': 'float'},
     'ml_type': {'allowed': ['classification', 'object-detection', 'segmentation'], 'required': True},
     'seed': {'type': 'integer'},
-    'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2}
+    'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2},
+    'split_vals': {'type': 'list', 'schema': {'type': 'float'}, 'minlength': 2, 'maxlength': 3},
+    'split_names': {'type': 'list', 'schema': {'type': 'string'}, 'minlength': 2, 'maxlength': 3}
 }

From 072e56cd50cb5667e6042d9547f2c0af4873a2b6 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Fri, 25 Oct 2019 11:14:26 -0400
Subject: [PATCH 06/23] clean up

---
 label_maker/package.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index 0453252..5d12862 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -46,7 +46,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
 
     assert len(split_names) == 2 or len(split_names) == 3.
     assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length."
-    assert sum(split_vals) == 1, "split_vals must sum to one."
+    assert np.isclose(sum(split_vals), 1), "split_vals must sum to one."
 
     # open labels file, create tile array
     labels_file = op.join(dest_folder, 'labels.npz')

From 9930f5b6d0701c5ee4a04eef52de2214a65d95b1 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Fri, 25 Oct 2019 14:55:30 -0400
Subject: [PATCH 07/23] new config json for tests

---
 .../integration/config_3way.integration.json  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 test/fixtures/integration/config_3way.integration.json

diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json
new file mode 100644
index 0000000..e6a144c
--- /dev/null
+++ b/test/fixtures/integration/config_3way.integration.json
@@ -0,0 +1,23 @@
+{"country": "portugal",
+  "bounding_box": [
+    -9.4575,
+    38.8467,
+    -9.4510,
+    38.8513
+  ],
+  "zoom": 17,
+  "classes": [
+    { "name": "Water Tower", "filter": ["==", "man_made", "water_tower"] },
+    { "name": "Building", "filter": ["has", "building"] },
+    { "name": "Farmland", "filter": ["==", "landuse", "farmland"] },
+    { "name": "Ruins", "filter": ["==", "historic", "ruins"] },
+    { "name": "Parking", "filter": ["==", "amenity", "parking"] },
+    { "name": "Roads", "filter": ["has", "highway"] }
+  ],
+  "imagery": "https://api.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN",
+  "background_ratio": 1,
+  "ml_type": "classification",
+  "seed": 19,
+  "split_names": ["train", "test", "val"],
+  "split_vals": [0.7, 0.2, 0.1]
+}

From d73858a0f1fd2e4e4b9d780fd85adc49fd3ed8e0 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Fri, 25 Oct 2019 15:05:45 -0400
Subject: [PATCH 08/23] update tests

---
 label_maker/package.py                        |  2 +-
 .../test_classification_package.py            | 35 ++++++++++++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index 5d12862..20e4725 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -124,7 +124,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     if len(split_vals) == 3:
         np.savez(op.join(dest_folder, 'data.npz'),
                  x_train=x_vals_split_lst[0],
-                 y_train=y_vals_split_lst[1],
+                 y_train=y_vals_split_lst[0],
                  x_test=x_vals_split_lst[1],
                  y_test=y_vals_split_lst[1],
                  x_val=x_vals_split_lst[2],
diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py
index db5dc3b..550f1b4 100644
--- a/test/integration/test_classification_package.py
+++ b/test/integration/test_classification_package.py
@@ -11,13 +11,25 @@ class TestClassificationPackage(unittest.TestCase):
     """Tests for classification package creation"""
     @classmethod
     def setUpClass(cls):
-        makedirs('integration-cl')
+        try:
+            makedirs('integration-cl')
+        except FileExistsError:
+            pass
         copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz')
         copytree('test/fixtures/integration/tiles', 'integration-cl/tiles')
 
+        try:
+            makedirs('integration-cl-split')
+
+        except FileExistsError:
+            pass
+        copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl-split/labels.npz')
+        copytree('test/fixtures/integration/tiles', 'integration-cl-split/tiles')
+
     @classmethod
     def tearDownClass(cls):
         rmtree('integration-cl')
+        rmtree('integration-cl-split')
 
     def test_cli(self):
         """Verify data.npz produced by CLI"""
@@ -48,3 +60,24 @@ def test_cli(self):
              [0, 0, 0, 0, 0, 0, 1]]
         )
         self.assertTrue(np.array_equal(data['y_test'], expected_y_test))
+
+    def test_cli_3way_split(self):
+        """Verify data.npz produced by CLI when split into train/test/val"""
+
+        cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json'
+        cmd = cmd.split(' ')
+        subprocess.run(cmd, universal_newlines=True)
+
+        data = np.load('integration-cl-split/data.npz')
+
+        # validate our image data with shapes
+        self.assertEqual(data['x_train'].shape, (5, 256, 256, 3))
+        self.assertEqual(data['x_test'].shape, (2, 256, 256, 3))
+        self.assertEqual(data['x_val'].shape, (1, 256, 256, 3))
+
+        # validate label data with shapes
+        self.assertEqual(data['y_train'].shape, (5, 7))
+        self.assertEqual(data['y_test'].shape, (2, 7))
+        self.assertEqual(data['y_val'].shape, (1, 7))
+
+

From dd749e1c9a28ff6a27e9d8a4b2a489878efb846b Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Fri, 25 Oct 2019 15:13:42 -0400
Subject: [PATCH 09/23] clean up test format

---
 test/integration/test_classification_package.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py
index 550f1b4..55e211d 100644
--- a/test/integration/test_classification_package.py
+++ b/test/integration/test_classification_package.py
@@ -11,18 +11,12 @@ class TestClassificationPackage(unittest.TestCase):
     """Tests for classification package creation"""
     @classmethod
     def setUpClass(cls):
-        try:
-            makedirs('integration-cl')
-        except FileExistsError:
-            pass
+
+        makedirs('integration-cl')
         copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz')
         copytree('test/fixtures/integration/tiles', 'integration-cl/tiles')
 
-        try:
-            makedirs('integration-cl-split')
-
-        except FileExistsError:
-            pass
+        makedirs('integration-cl-split')
         copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl-split/labels.npz')
         copytree('test/fixtures/integration/tiles', 'integration-cl-split/tiles')
 

From 8f155526e4f193d50738655ad1a8fdfa041c5ffe Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Mon, 28 Oct 2019 15:34:00 -0600
Subject: [PATCH 10/23] fix y labels

---
 label_maker/package.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index 20e4725..dc5c00b 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -107,7 +107,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
         x_vals_split_lst = x_vals_split_lst[:-1]
 
     y_vals_split_lst = np.split(y_vals,
-                                [int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))])
+                                [int(split_vals[0] * len(y_vals)), int((split_vals[0] + split_vals[1]) * len(y_vals))])
 
     if len(y_vals_split_lst[-1]) == 0:
         y_vals_split_lst = y_vals_split_lst[:-1]

From 559dfb6334a0600be5dde4b25544e9b0abad36c5 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 12:48:08 -0600
Subject: [PATCH 11/23] fix doc string format

---
 label_maker/package.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index dc5c00b..98ba3e5 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -30,13 +30,10 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
         Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
     seed: int
         Random generator seed. Optional, use to make results reproducible.
-
-    split_vals: lst
+    split_vals: list
         Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one.
-
-    split_names: lst
+    split_names: list
         List of names for each subset of the data, either ['train', 'test'] or ['train', 'test', 'val']
-
     **kwargs: dict
         Other properties from CLI config passed as keywords to other utility functions
     """

From 3eb7d32dbb8cb169d279df540fa3b713e09606aa Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 12:52:03 -0600
Subject: [PATCH 12/23] allow for user defined number of splits

---
 label_maker/package.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index 98ba3e5..ac6025e 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -31,9 +31,12 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     seed: int
         Random generator seed. Optional, use to make results reproducible.
     split_vals: list
-        Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one.
+        Default: [0.8, 0.2]
+        Percentage of data to put in each catagory listed in split_names.
+        Must be floats and must sum to one.
     split_names: list
-        List of names for each subset of the data, either ['train', 'test'] or ['train', 'test', 'val']
+        Default: ['train', 'test'] 
+        List of names for each subset of the data.
     **kwargs: dict
         Other properties from CLI config passed as keywords to other utility functions
     """
@@ -41,7 +44,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     if seed:
         np.random.seed(seed)
 
-    assert len(split_names) == 2 or len(split_names) == 3.
+    #assert len(split_names) == 2 or len(split_names) == 3.
     assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length."
     assert np.isclose(sum(split_vals), 1), "split_vals must sum to one."
 

From 397d31b23129a3b2550e5139b7d72576d6f57811 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 12:53:25 -0600
Subject: [PATCH 13/23] remove length check

---
 label_maker/validate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/label_maker/validate.py b/label_maker/validate.py
index f0da9d4..13c34fb 100644
--- a/label_maker/validate.py
+++ b/label_maker/validate.py
@@ -31,6 +31,6 @@
     'ml_type': {'allowed': ['classification', 'object-detection', 'segmentation'], 'required': True},
     'seed': {'type': 'integer'},
     'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2},
-    'split_vals': {'type': 'list', 'schema': {'type': 'float'}, 'minlength': 2, 'maxlength': 3},
-    'split_names': {'type': 'list', 'schema': {'type': 'string'}, 'minlength': 2, 'maxlength': 3}
+    'split_vals': {'type': 'list', 'schema': {'type': 'float'}},
+    'split_names': {'type': 'list', 'schema': {'type': 'string'}}
 }

From 7df341ba81bc1dc493570ea5645e2ac63d875178 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 12:54:31 -0600
Subject: [PATCH 14/23] remove unnecessary new lines

---
 test/integration/test_classification_package.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py
index 55e211d..bc286f0 100644
--- a/test/integration/test_classification_package.py
+++ b/test/integration/test_classification_package.py
@@ -72,6 +72,4 @@ def test_cli_3way_split(self):
         # validate label data with shapes
         self.assertEqual(data['y_train'].shape, (5, 7))
         self.assertEqual(data['y_test'].shape, (2, 7))
-        self.assertEqual(data['y_val'].shape, (1, 7))
-
-
+        self.assertEqual(data['y_val'].shape, (1, 7))
\ No newline at end of file

From 3fcfcb7b47bd4fbc7a46e52a766165f993b5d58c Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 13:07:40 -0600
Subject: [PATCH 15/23] update doc string

---
 docs/parameters.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/parameters.rst b/docs/parameters.rst
index 77d0879..4617c3a 100644
--- a/docs/parameters.rst
+++ b/docs/parameters.rst
@@ -49,5 +49,13 @@ Here is the full list of configuration parameters you can specify in a ``config.
  	``'segmentation'``
  		Output is an array of shape ``(256, 256)`` with values matching the class index label at that position. The classes are applied sequentially according to ``config.json`` so latter classes will be written over earlier class labels if there is overlap.
 
+**split_vals:** list
+    Default: `[0.8, 0.2]`
+    Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one.
+
+**split_names**: list
+    Default: `['train', 'test']`
+    List of names for each subset of the data.
+
 **imagery_offset**:  list of ints
 	An optional list of integers representing the number of pixels to offset imagery. For example ``[15, -5]`` will move the images 15 pixels right and 5 pixels up relative to the requested tile bounds.

From 36855d40e2fbff4ff2a56d3e3ebe5c78fd0d4141 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 13:16:53 -0600
Subject: [PATCH 16/23] exceptions instead of asserts

---
 label_maker/package.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index ac6025e..7b8dd22 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -35,7 +35,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
         Percentage of data to put in each catagory listed in split_names.
         Must be floats and must sum to one.
     split_names: list
-        Default: ['train', 'test'] 
+        Default: ['train', 'test']
         List of names for each subset of the data.
     **kwargs: dict
         Other properties from CLI config passed as keywords to other utility functions
@@ -44,9 +44,10 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     if seed:
         np.random.seed(seed)
 
-    #assert len(split_names) == 2 or len(split_names) == 3.
-    assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length."
-    assert np.isclose(sum(split_vals), 1), "split_vals must sum to one."
+    if len(split_names) != len(split_vals):
+        raise ValueError('`split_names` and `split_vals` must be the same length. Please update your config.')
+    if not np.isclose(sum(split_vals), 1):
+        raise ValueError('`split_vals` must sum to one. Please update your config.')
 
     # open labels file, create tile array
     labels_file = op.join(dest_folder, 'labels.npz')

From 584e6e09f57898f782cbe23325afcb195b47b3c1 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 13:45:09 -0600
Subject: [PATCH 17/23] add seed to docs

---
 docs/parameters.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/parameters.rst b/docs/parameters.rst
index 4617c3a..1d586d2 100644
--- a/docs/parameters.rst
+++ b/docs/parameters.rst
@@ -49,7 +49,10 @@ Here is the full list of configuration parameters you can specify in a ``config.
  	``'segmentation'``
  		Output is an array of shape ``(256, 256)`` with values matching the class index label at that position. The classes are applied sequentially according to ``config.json`` so latter classes will be written over earlier class labels if there is overlap.
 
-**split_vals:** list
+**seed**: int
+    Random generator seed. Optional, use to make results reproducible.
+
+**split_vals**: list
     Default: `[0.8, 0.2]`
     Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one.
 

From d47386ccd049275efa0580e3ccfb98585bc26e73 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 13:54:42 -0600
Subject: [PATCH 18/23] allow for more than 3 way split

---
 label_maker/package.py | 43 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index 7b8dd22..c9aeb50 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -97,36 +97,31 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
         elif ml_type == 'segmentation':
             y_vals.append(labels[tile][..., np.newaxis])  # Add grayscale channel
 
-    # convert lists to numpy arrays
+    # Convert lists to numpy arrays
     x_vals = np.array(x_vals, dtype=np.uint8)
+    print(x_vals.shape)
     y_vals = np.array(y_vals, dtype=np.uint8)
+    print(y_vals).shape
 
-    x_vals_split_lst = np.split(x_vals,
-                                [int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))])
+    # Get number of data samples per split from the float proportions
+    split_n_samps = np.rint([len(x_vals) * val for val in split_vals])
+    print(split_n_samps)
 
-    if len(x_vals_split_lst[-1]) == 0:
-        x_vals_split_lst = x_vals_split_lst[:-1]
+    if np.any(split_n_samps == 0):
+        raise ValueError
 
-    y_vals_split_lst = np.split(y_vals,
-                                [int(split_vals[0] * len(y_vals)), int((split_vals[0] + split_vals[1]) * len(y_vals))])
+    # Convert into a cumulative sum to get indices
+    split_inds = np.cumsum(split_n_samps).astype(np.integer)
 
-    if len(y_vals_split_lst[-1]) == 0:
-        y_vals_split_lst = y_vals_split_lst[:-1]
+    # Exclude last index as `np.split` handles splitting without that value
+    split_arrs_x = np.split(x_vals, split_inds[:-1])
+    split_arrs_y = np.split(y_vals, split_inds[:-1])
 
-    print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
+    save_dict = {}
 
-    if len(split_vals) == 2:
-        np.savez(op.join(dest_folder, 'data.npz'),
-                 x_train=x_vals_split_lst[0],
-                 y_train=y_vals_split_lst[0],
-                 x_test=x_vals_split_lst[1],
-                 y_test=y_vals_split_lst[1])
+    for si, split_name in enumerate(split_names):
+        save_dict[f'x_{split_name}'] = split_arrs_x[si]
+        save_dict[f'y_{split_name}'] = split_arrs_y[si]
 
-    if len(split_vals) == 3:
-        np.savez(op.join(dest_folder, 'data.npz'),
-                 x_train=x_vals_split_lst[0],
-                 y_train=y_vals_split_lst[0],
-                 x_test=x_vals_split_lst[1],
-                 y_test=y_vals_split_lst[1],
-                 x_val=x_vals_split_lst[2],
-                 y_val=y_vals_split_lst[2])
+    np.savez(op.join(dest_folder, 'data.npz'), **save_dict)
+    print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
\ No newline at end of file

From ce2d7bf92828af20c16c4fefa16c8eeadee25d1c Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 14:22:41 -0600
Subject: [PATCH 19/23] updates

---
 label_maker/package.py                                 | 6 ++----
 test/fixtures/integration/config_3way.integration.json | 2 +-
 test/integration/test_classification_package.py        | 8 +++++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index c9aeb50..ae5b7d5 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -99,16 +99,14 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
 
     # Convert lists to numpy arrays
     x_vals = np.array(x_vals, dtype=np.uint8)
-    print(x_vals.shape)
     y_vals = np.array(y_vals, dtype=np.uint8)
-    print(y_vals).shape
 
     # Get number of data samples per split from the float proportions
     split_n_samps = np.rint([len(x_vals) * val for val in split_vals])
-    print(split_n_samps)
+    #print(split_n_samps)
 
     if np.any(split_n_samps == 0):
-        raise ValueError
+        raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.')
 
     # Convert into a cumulative sum to get indices
     split_inds = np.cumsum(split_n_samps).astype(np.integer)
diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json
index e6a144c..a169202 100644
--- a/test/fixtures/integration/config_3way.integration.json
+++ b/test/fixtures/integration/config_3way.integration.json
@@ -19,5 +19,5 @@
   "ml_type": "classification",
   "seed": 19,
   "split_names": ["train", "test", "val"],
-  "split_vals": [0.7, 0.2, 0.1]
+  "split_vals": [0.6, 0.2, 0.2]
 }
diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py
index bc286f0..018395d 100644
--- a/test/integration/test_classification_package.py
+++ b/test/integration/test_classification_package.py
@@ -7,11 +7,12 @@
 
 import numpy as np
 
+
 class TestClassificationPackage(unittest.TestCase):
     """Tests for classification package creation"""
+
     @classmethod
     def setUpClass(cls):
-
         makedirs('integration-cl')
         copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz')
         copytree('test/fixtures/integration/tiles', 'integration-cl/tiles')
@@ -58,7 +59,8 @@ def test_cli(self):
     def test_cli_3way_split(self):
         """Verify data.npz produced by CLI when split into train/test/val"""
 
-        cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json'
+        cmd = 'label-maker package --dest integration-cl-split --config ' \
+              'test/fixtures/integration/config_3way.integration.json '
         cmd = cmd.split(' ')
         subprocess.run(cmd, universal_newlines=True)
 
@@ -72,4 +74,4 @@ def test_cli_3way_split(self):
         # validate label data with shapes
         self.assertEqual(data['y_train'].shape, (5, 7))
         self.assertEqual(data['y_test'].shape, (2, 7))
-        self.assertEqual(data['y_val'].shape, (1, 7))
\ No newline at end of file
+        self.assertEqual(data['y_val'].shape, (1, 7))

From dd0859cff76851b8b7a594fff4ea81f2ae31ff12 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 14:29:02 -0600
Subject: [PATCH 20/23] fix format

---
 test/integration/test_classification_package.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py
index 018395d..5da901b 100644
--- a/test/integration/test_classification_package.py
+++ b/test/integration/test_classification_package.py
@@ -59,8 +59,7 @@ def test_cli(self):
     def test_cli_3way_split(self):
         """Verify data.npz produced by CLI when split into train/test/val"""
 
-        cmd = 'label-maker package --dest integration-cl-split --config ' \
-              'test/fixtures/integration/config_3way.integration.json '
+        cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json'
         cmd = cmd.split(' ')
         subprocess.run(cmd, universal_newlines=True)
 
@@ -74,4 +73,4 @@ def test_cli_3way_split(self):
         # validate label data with shapes
         self.assertEqual(data['y_train'].shape, (5, 7))
         self.assertEqual(data['y_test'].shape, (2, 7))
-        self.assertEqual(data['y_val'].shape, (1, 7))
+        self.assertEqual(data['y_val'].shape, (1, 7))
\ No newline at end of file

From 60393ff9a5ee2f3ae8a5130e3f657ee989accc85 Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 14:30:34 -0600
Subject: [PATCH 21/23] fix formatting

---
 label_maker/package.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index ae5b7d5..ad5443d 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -103,7 +103,6 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
 
     # Get number of data samples per split from the float proportions
     split_n_samps = np.rint([len(x_vals) * val for val in split_vals])
-    #print(split_n_samps)
 
     if np.any(split_n_samps == 0):
         raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.')

From 37a9173f7fb03f4f7a5256c52166e1a7f53526dd Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Tue, 29 Oct 2019 14:58:04 -0600
Subject: [PATCH 22/23] remove rint and go back to .7, .2, .1 test train val
 split in testing

---
 label_maker/package.py                                 | 2 +-
 test/fixtures/integration/config_3way.integration.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/label_maker/package.py b/label_maker/package.py
index ad5443d..480b4a8 100644
--- a/label_maker/package.py
+++ b/label_maker/package.py
@@ -102,7 +102,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_
     y_vals = np.array(y_vals, dtype=np.uint8)
 
     # Get number of data samples per split from the float proportions
-    split_n_samps = np.rint([len(x_vals) * val for val in split_vals])
+    split_n_samps = [len(x_vals) * val for val in split_vals]
 
     if np.any(split_n_samps == 0):
         raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.')
diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json
index a169202..e6a144c 100644
--- a/test/fixtures/integration/config_3way.integration.json
+++ b/test/fixtures/integration/config_3way.integration.json
@@ -19,5 +19,5 @@
   "ml_type": "classification",
   "seed": 19,
   "split_names": ["train", "test", "val"],
-  "split_vals": [0.6, 0.2, 0.2]
+  "split_vals": [0.7, 0.2, 0.1]
 }

From 28e2c297a95d323c01350ec51b805847a583528e Mon Sep 17 00:00:00 2001
From: martham93 <marthamorrissey93@gmail.com>
Date: Wed, 30 Oct 2019 12:00:18 -0600
Subject: [PATCH 23/23] update docs for clarity

---
 docs/parameters.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/parameters.rst b/docs/parameters.rst
index 1d586d2..ea987cf 100644
--- a/docs/parameters.rst
+++ b/docs/parameters.rst
@@ -54,11 +54,11 @@ Here is the full list of configuration parameters you can specify in a ``config.
 
 **split_vals**: list
     Default: `[0.8, 0.2]`
-    Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one.
+    Percentage of data to put in each category listed in split_names. Must be a list of floats that sum to one and match the length of `split_names`. For train, validate, and test data, a list like `[0.7, 0.2, 0.1]` is suggested.
 
 **split_names**: list
     Default: `['train', 'test']`
-    List of names for each subset of the data.
+    List of names for each subset of the data. Length of list must match length of `split_vals`.
 
 **imagery_offset**:  list of ints
 	An optional list of integers representing the number of pixels to offset imagery. For example ``[15, -5]`` will move the images 15 pixels right and 5 pixels up relative to the requested tile bounds.