From e9c86aadf0ebe4c106d2b6c647f1cb33b709a137 Mon Sep 17 00:00:00 2001 From: Martha Morrissey Date: Thu, 24 Oct 2019 17:37:51 -0400 Subject: [PATCH 01/23] add new arguments and basic logic checks around them --- label_maker/package.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/label_maker/package.py b/label_maker/package.py index 4d717c9..47e814c 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -9,7 +9,8 @@ from label_maker.utils import is_tif -def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs): +def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_names=['train', 'test'], + split_vals=[0.8, .2], **kwargs): """Generate an .npz file containing arrays for training machine learning algorithms Parameters @@ -38,6 +39,12 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_ if seed: np.random.seed(seed) + assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." + assert sum(split_vals) == 1, "split_vals must sum to one." + + + + # open labels file, create tile array labels_file = op.join(dest_folder, 'labels.npz') labels = np.load(labels_file) From a2989a62d42251686397495e6a45999025ca8ca1 Mon Sep 17 00:00:00 2001 From: martham93 Date: Thu, 24 Oct 2019 22:31:28 -0400 Subject: [PATCH 02/23] initial try using np split --- label_maker/package.py | 54 +++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index 47e814c..722621d 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -29,9 +29,14 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ ml_type: str Defines the type of machine learning. One of "classification", "object-detection", or "segmentation" seed: int - Random generator seed. Optional, use to make results reproducable. 
- train_size: float - Portion of the data to use in training, the remainder is used as test data (default 0.8) + Random generator seed. Optional, use to make results reproducible. + + split_vals: lst + Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one. + + split_names: lst + List of names for each subset of the data. + **kwargs: dict Other properties from CLI config passed as keywords to other utility functions """ @@ -39,12 +44,9 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ if seed: np.random.seed(seed) - assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." + assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." assert sum(split_vals) == 1, "split_vals must sum to one." - - - # open labels file, create tile array labels_file = op.join(dest_folder, 'labels.npz') labels = np.load(labels_file) @@ -67,7 +69,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ # open the images and load those plus the labels into the final arrays o = urlparse(imagery) _, image_format = op.splitext(o.path) - if is_tif(imagery): # if a TIF is provided, use jpg as tile format + if is_tif(imagery): # if a TIF is provided, use jpg as tile format image_format = '.jpg' for tile in tiles: image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format)) @@ -93,16 +95,36 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ elif ml_type == 'segmentation': y_vals.append(labels[tile][..., np.newaxis]) # Add grayscale channel - # split into train and test - split_index = int(len(x_vals) * train_size) - # convert lists to numpy arrays x_vals = np.array(x_vals, dtype=np.uint8) y_vals = np.array(y_vals, dtype=np.uint8) + x_vals_split_lst = np.split(x_vals, + [int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))]) + + if 
len(x_vals_split_lst[-1]) == 0: + x_vals_split_lst = x_vals_split_lst[:-1] + + y_vals_split_lst = np.split(y_vals, + [int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))]) + + if len(y_vals_split_lst[-1]) == 0: + y_vals_split_lst = y_vals_split_lst[:-1] + print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz'))) - np.savez(op.join(dest_folder, 'data.npz'), - x_train=x_vals[:split_index, ...], - y_train=y_vals[:split_index, ...], - x_test=x_vals[split_index:, ...], - y_test=y_vals[split_index:, ...]) + + if len(split_vals == 2): + np.savez(op.join(dest_folder, 'data.npz'), + x_train=x_vals_split_lst[0], + y_train=y_vals_split_lst[0], + x_test=x_vals_split_lst[1], + y_test=y_vals_split_lst[1]) + + if len(split_vals == 3): + np.savez(op.join(dest_folder, 'data.npz'), + x_train=x_vals_split_lst[0], + y_train=y_vals_split_lst[1], + x_test=x_vals_split_lst[1], + y_test=y_vals_split_lst[1], + x_val=x_vals_split_lst[2], + y_val=y_vals_split_lst[2]) From 516575308e9aa056e3c6a987b7bcf64b661d695d Mon Sep 17 00:00:00 2001 From: martham93 Date: Fri, 25 Oct 2019 08:13:06 -0400 Subject: [PATCH 03/23] clearer doc strings --- label_maker/package.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/label_maker/package.py b/label_maker/package.py index 722621d..c42ccfe 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -35,7 +35,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one. split_names: lst - List of names for each subset of the data. 
+ List of names for each subset of the data, either ['train', 'test'] or ['train', 'test', 'val'] **kwargs: dict Other properties from CLI config passed as keywords to other utility functions @@ -44,6 +44,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ if seed: np.random.seed(seed) + assert len(split_names) == 2 or len(split_names) == 3. assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." assert sum(split_vals) == 1, "split_vals must sum to one." From a1af80485c8a5ac62b031e710f1b7f57f2d8efd3 Mon Sep 17 00:00:00 2001 From: martham93 Date: Fri, 25 Oct 2019 10:30:43 -0400 Subject: [PATCH 04/23] fix logic --- label_maker/package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index c42ccfe..0453252 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -114,14 +114,14 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz'))) - if len(split_vals == 2): + if len(split_vals) == 2: np.savez(op.join(dest_folder, 'data.npz'), x_train=x_vals_split_lst[0], y_train=y_vals_split_lst[0], x_test=x_vals_split_lst[1], y_test=y_vals_split_lst[1]) - if len(split_vals == 3): + if len(split_vals) == 3: np.savez(op.join(dest_folder, 'data.npz'), x_train=x_vals_split_lst[0], y_train=y_vals_split_lst[1], From 492fa92382d2a4766504e913cdc454c6d9ed1a16 Mon Sep 17 00:00:00 2001 From: martham93 Date: Fri, 25 Oct 2019 11:13:41 -0400 Subject: [PATCH 05/23] config json new checks --- label_maker/validate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/label_maker/validate.py b/label_maker/validate.py index 2a60425..f0da9d4 100644 --- a/label_maker/validate.py +++ b/label_maker/validate.py @@ -30,5 +30,7 @@ 'background_ratio': {'type': 'float'}, 'ml_type': {'allowed': ['classification', 'object-detection', 
'segmentation'], 'required': True}, 'seed': {'type': 'integer'}, - 'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2} + 'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2}, + 'split_vals': {'type': 'list', 'schema': {'type': 'float'}, 'minlength': 2, 'maxlength': 3}, + 'split_names': {'type': 'list', 'schema': {'type': 'string'}, 'minlength': 2, 'maxlength': 3} } From 072e56cd50cb5667e6042d9547f2c0af4873a2b6 Mon Sep 17 00:00:00 2001 From: martham93 Date: Fri, 25 Oct 2019 11:14:26 -0400 Subject: [PATCH 06/23] clean up --- label_maker/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_maker/package.py b/label_maker/package.py index 0453252..5d12862 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -46,7 +46,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ assert len(split_names) == 2 or len(split_names) == 3. assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." - assert sum(split_vals) == 1, "split_vals must sum to one." + assert np.isclose(sum(split_vals), 1), "split_vals must sum to one." 
# open labels file, create tile array labels_file = op.join(dest_folder, 'labels.npz') From 9930f5b6d0701c5ee4a04eef52de2214a65d95b1 Mon Sep 17 00:00:00 2001 From: martham93 Date: Fri, 25 Oct 2019 14:55:30 -0400 Subject: [PATCH 07/23] new config json for tests --- .../integration/config_3way.integration.json | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 test/fixtures/integration/config_3way.integration.json diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json new file mode 100644 index 0000000..e6a144c --- /dev/null +++ b/test/fixtures/integration/config_3way.integration.json @@ -0,0 +1,23 @@ +{"country": "portugal", + "bounding_box": [ + -9.4575, + 38.8467, + -9.4510, + 38.8513 + ], + "zoom": 17, + "classes": [ + { "name": "Water Tower", "filter": ["==", "man_made", "water_tower"] }, + { "name": "Building", "filter": ["has", "building"] }, + { "name": "Farmland", "filter": ["==", "landuse", "farmland"] }, + { "name": "Ruins", "filter": ["==", "historic", "ruins"] }, + { "name": "Parking", "filter": ["==", "amenity", "parking"] }, + { "name": "Roads", "filter": ["has", "highway"] } + ], + "imagery": "https://api.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN", + "background_ratio": 1, + "ml_type": "classification", + "seed": 19, + "split_names": ["train", "test", "val"], + "split_vals": [0.7, 0.2, 0.1] +} From d73858a0f1fd2e4e4b9d780fd85adc49fd3ed8e0 Mon Sep 17 00:00:00 2001 From: martham93 Date: Fri, 25 Oct 2019 15:05:45 -0400 Subject: [PATCH 08/23] update tests --- label_maker/package.py | 2 +- .../test_classification_package.py | 35 ++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index 5d12862..20e4725 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -124,7 +124,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, 
seed=False, split_ if len(split_vals) == 3: np.savez(op.join(dest_folder, 'data.npz'), x_train=x_vals_split_lst[0], - y_train=y_vals_split_lst[1], + y_train=y_vals_split_lst[0], x_test=x_vals_split_lst[1], y_test=y_vals_split_lst[1], x_val=x_vals_split_lst[2], diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py index db5dc3b..550f1b4 100644 --- a/test/integration/test_classification_package.py +++ b/test/integration/test_classification_package.py @@ -11,13 +11,25 @@ class TestClassificationPackage(unittest.TestCase): """Tests for classification package creation""" @classmethod def setUpClass(cls): - makedirs('integration-cl') + try: + makedirs('integration-cl') + except FileExistsError: + pass copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz') copytree('test/fixtures/integration/tiles', 'integration-cl/tiles') + try: + makedirs('integration-cl-split') + + except FileExistsError: + pass + copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl-split/labels.npz') + copytree('test/fixtures/integration/tiles', 'integration-cl-split/tiles') + @classmethod def tearDownClass(cls): rmtree('integration-cl') + rmtree('integration-cl-split') def test_cli(self): """Verify data.npz produced by CLI""" @@ -48,3 +60,24 @@ def test_cli(self): [0, 0, 0, 0, 0, 0, 1]] ) self.assertTrue(np.array_equal(data['y_test'], expected_y_test)) + + def test_cli_3way_split(self): + """Verify data.npz produced by CLI when split into train/test/val""" + + cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json' + cmd = cmd.split(' ') + subprocess.run(cmd, universal_newlines=True) + + data = np.load('integration-cl-split/data.npz') + + # validate our image data with shapes + self.assertEqual(data['x_train'].shape, (5, 256, 256, 3)) + self.assertEqual(data['x_test'].shape, (2, 256, 256, 3)) + self.assertEqual(data['x_val'].shape, (1, 256, 
256, 3)) + + # validate label data with shapes + self.assertEqual(data['y_train'].shape, (5, 7)) + self.assertEqual(data['y_test'].shape, (2, 7)) + self.assertEqual(data['y_val'].shape, (1, 7)) + + From dd749e1c9a28ff6a27e9d8a4b2a489878efb846b Mon Sep 17 00:00:00 2001 From: martham93 Date: Fri, 25 Oct 2019 15:13:42 -0400 Subject: [PATCH 09/23] clean up test format --- test/integration/test_classification_package.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py index 550f1b4..55e211d 100644 --- a/test/integration/test_classification_package.py +++ b/test/integration/test_classification_package.py @@ -11,18 +11,12 @@ class TestClassificationPackage(unittest.TestCase): """Tests for classification package creation""" @classmethod def setUpClass(cls): - try: - makedirs('integration-cl') - except FileExistsError: - pass + + makedirs('integration-cl') copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz') copytree('test/fixtures/integration/tiles', 'integration-cl/tiles') - try: - makedirs('integration-cl-split') - - except FileExistsError: - pass + makedirs('integration-cl-split') copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl-split/labels.npz') copytree('test/fixtures/integration/tiles', 'integration-cl-split/tiles') From 8f155526e4f193d50738655ad1a8fdfa041c5ffe Mon Sep 17 00:00:00 2001 From: martham93 Date: Mon, 28 Oct 2019 15:34:00 -0600 Subject: [PATCH 10/23] fix y labels --- label_maker/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/label_maker/package.py b/label_maker/package.py index 20e4725..dc5c00b 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -107,7 +107,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ x_vals_split_lst = x_vals_split_lst[:-1] y_vals_split_lst = np.split(y_vals, - [int(split_vals[0] 
* len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))]) + [int(split_vals[0] * len(y_vals)), int((split_vals[0] + split_vals[1]) * len(y_vals))]) if len(y_vals_split_lst[-1]) == 0: y_vals_split_lst = y_vals_split_lst[:-1] From 559dfb6334a0600be5dde4b25544e9b0abad36c5 Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 12:48:08 -0600 Subject: [PATCH 11/23] fix doc string format --- label_maker/package.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index dc5c00b..98ba3e5 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -30,13 +30,10 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ Defines the type of machine learning. One of "classification", "object-detection", or "segmentation" seed: int Random generator seed. Optional, use to make results reproducible. - - split_vals: lst + split_vals: list Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one. - - split_names: lst + split_names: list List of names for each subset of the data, either ['train', 'test'] or ['train', 'test', 'val'] - **kwargs: dict Other properties from CLI config passed as keywords to other utility functions """ From 3eb7d32dbb8cb169d279df540fa3b713e09606aa Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 12:52:03 -0600 Subject: [PATCH 12/23] allow for user defined number of splits --- label_maker/package.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index 98ba3e5..ac6025e 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -31,9 +31,12 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ seed: int Random generator seed. Optional, use to make results reproducible. split_vals: list - Percentage of data to put in each catagory listed in split_names. 
Must be floats and must sum to one. + Default: [0.8, 0.2] + Percentage of data to put in each catagory listed in split_names. + Must be floats and must sum to one. split_names: list - List of names for each subset of the data, either ['train', 'test'] or ['train', 'test', 'val'] + Default: ['train', 'test'] + List of names for each subset of the data. **kwargs: dict Other properties from CLI config passed as keywords to other utility functions """ @@ -41,7 +44,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ if seed: np.random.seed(seed) - assert len(split_names) == 2 or len(split_names) == 3. + #assert len(split_names) == 2 or len(split_names) == 3. assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." assert np.isclose(sum(split_vals), 1), "split_vals must sum to one." From 397d31b23129a3b2550e5139b7d72576d6f57811 Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 12:53:25 -0600 Subject: [PATCH 13/23] remove length check --- label_maker/validate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/label_maker/validate.py b/label_maker/validate.py index f0da9d4..13c34fb 100644 --- a/label_maker/validate.py +++ b/label_maker/validate.py @@ -31,6 +31,6 @@ 'ml_type': {'allowed': ['classification', 'object-detection', 'segmentation'], 'required': True}, 'seed': {'type': 'integer'}, 'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2}, - 'split_vals': {'type': 'list', 'schema': {'type': 'float'}, 'minlength': 2, 'maxlength': 3}, - 'split_names': {'type': 'list', 'schema': {'type': 'string'}, 'minlength': 2, 'maxlength': 3} + 'split_vals': {'type': 'list', 'schema': {'type': 'float'}}, + 'split_names': {'type': 'list', 'schema': {'type': 'string'}} } From 7df341ba81bc1dc493570ea5645e2ac63d875178 Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 12:54:31 -0600 Subject: [PATCH 14/23] remove unnecessary
new lines --- test/integration/test_classification_package.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py index 55e211d..bc286f0 100644 --- a/test/integration/test_classification_package.py +++ b/test/integration/test_classification_package.py @@ -72,6 +72,4 @@ def test_cli_3way_split(self): # validate label data with shapes self.assertEqual(data['y_train'].shape, (5, 7)) self.assertEqual(data['y_test'].shape, (2, 7)) - self.assertEqual(data['y_val'].shape, (1, 7)) - - + self.assertEqual(data['y_val'].shape, (1, 7)) \ No newline at end of file From 3fcfcb7b47bd4fbc7a46e52a766165f993b5d58c Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 13:07:40 -0600 Subject: [PATCH 15/23] update doc string --- docs/parameters.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/parameters.rst b/docs/parameters.rst index 77d0879..4617c3a 100644 --- a/docs/parameters.rst +++ b/docs/parameters.rst @@ -49,5 +49,13 @@ Here is the full list of configuration parameters you can specify in a ``config. ``'segmentation'`` Output is an array of shape ``(256, 256)`` with values matching the class index label at that position. The classes are applied sequentially according to ``config.json`` so latter classes will be written over earlier class labels if there is overlap. +**split_vals:** list + Default: `[0.8, 0.2]` + Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one. + +**split_names**: list + Default: `['train', 'test']` + List of names for each subset of the data. + **imagery_offset**: list of ints An optional list of integers representing the number of pixels to offset imagery. For example ``[15, -5]`` will move the images 15 pixels right and 5 pixels up relative to the requested tile bounds. 
From 36855d40e2fbff4ff2a56d3e3ebe5c78fd0d4141 Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 13:16:53 -0600 Subject: [PATCH 16/23] exceptions instead of asserts --- label_maker/package.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index ac6025e..7b8dd22 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -35,7 +35,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one. split_names: list - Default: ['train', 'test'] + Default: ['train', 'test'] List of names for each subset of the data. **kwargs: dict Other properties from CLI config passed as keywords to other utility functions @@ -44,9 +44,10 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ if seed: np.random.seed(seed) - #assert len(split_names) == 2 or len(split_names) == 3. - assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length." - assert np.isclose(sum(split_vals), 1), "split_vals must sum to one." + if len(split_names) != len(split_vals): + raise ValueError('`split_names` and `split_vals` must be the same length. Please update your config.') + if not np.isclose(sum(split_vals), 1): + raise ValueError('`split_vals` must sum to one. 
Please update your config.') # open labels file, create tile array labels_file = op.join(dest_folder, 'labels.npz') From 584e6e09f57898f782cbe23325afcb195b47b3c1 Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 13:45:09 -0600 Subject: [PATCH 17/23] add seed to docs --- docs/parameters.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/parameters.rst b/docs/parameters.rst index 4617c3a..1d586d2 100644 --- a/docs/parameters.rst +++ b/docs/parameters.rst @@ -49,7 +49,10 @@ Here is the full list of configuration parameters you can specify in a ``config. ``'segmentation'`` Output is an array of shape ``(256, 256)`` with values matching the class index label at that position. The classes are applied sequentially according to ``config.json`` so latter classes will be written over earlier class labels if there is overlap. -**split_vals:** list +**seed**: int + Random generator seed. Optional, use to make results reproducible. + +**split_vals**: list Default: `[0.8, 0.2]` Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one. 
From d47386ccd049275efa0580e3ccfb98585bc26e73 Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 13:54:42 -0600 Subject: [PATCH 18/23] allow for more than 3 way split --- label_maker/package.py | 43 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index 7b8dd22..c9aeb50 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -97,36 +97,31 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ elif ml_type == 'segmentation': y_vals.append(labels[tile][..., np.newaxis]) # Add grayscale channel - # convert lists to numpy arrays + # Convert lists to numpy arrays x_vals = np.array(x_vals, dtype=np.uint8) + print(x_vals.shape) y_vals = np.array(y_vals, dtype=np.uint8) + print(y_vals).shape - x_vals_split_lst = np.split(x_vals, - [int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))]) + # Get number of data samples per split from the float proportions + split_n_samps = np.rint([len(x_vals) * val for val in split_vals]) + print(split_n_samps) - if len(x_vals_split_lst[-1]) == 0: - x_vals_split_lst = x_vals_split_lst[:-1] + if np.any(split_n_samps == 0): + raise ValueError - y_vals_split_lst = np.split(y_vals, - [int(split_vals[0] * len(y_vals)), int((split_vals[0] + split_vals[1]) * len(y_vals))]) + # Convert into a cumulative sum to get indices + split_inds = np.cumsum(split_n_samps).astype(np.integer) - if len(y_vals_split_lst[-1]) == 0: - y_vals_split_lst = y_vals_split_lst[:-1] + # Exclude last index as `np.split` handles splitting without that value + split_arrs_x = np.split(x_vals, split_inds[:-1]) + split_arrs_y = np.split(y_vals, split_inds[:-1]) - print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz'))) + save_dict = {} - if len(split_vals) == 2: - np.savez(op.join(dest_folder, 'data.npz'), - x_train=x_vals_split_lst[0], - y_train=y_vals_split_lst[0], - 
x_test=x_vals_split_lst[1], - y_test=y_vals_split_lst[1]) + for si, split_name in enumerate(split_names): + save_dict[f'x_{split_name}'] = split_arrs_x[si] + save_dict[f'y_{split_name}'] = split_arrs_y[si] - if len(split_vals) == 3: - np.savez(op.join(dest_folder, 'data.npz'), - x_train=x_vals_split_lst[0], - y_train=y_vals_split_lst[0], - x_test=x_vals_split_lst[1], - y_test=y_vals_split_lst[1], - x_val=x_vals_split_lst[2], - y_val=y_vals_split_lst[2]) + np.savez(op.join(dest_folder, 'data.npz'), **save_dict) + print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz'))) \ No newline at end of file From ce2d7bf92828af20c16c4fefa16c8eeadee25d1c Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 14:22:41 -0600 Subject: [PATCH 19/23] updates --- label_maker/package.py | 6 ++---- test/fixtures/integration/config_3way.integration.json | 2 +- test/integration/test_classification_package.py | 8 +++++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index c9aeb50..ae5b7d5 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -99,16 +99,14 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ # Convert lists to numpy arrays x_vals = np.array(x_vals, dtype=np.uint8) - print(x_vals.shape) y_vals = np.array(y_vals, dtype=np.uint8) - print(y_vals).shape # Get number of data samples per split from the float proportions split_n_samps = np.rint([len(x_vals) * val for val in split_vals]) - print(split_n_samps) + #print(split_n_samps) if np.any(split_n_samps == 0): - raise ValueError + raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.') # Convert into a cumulative sum to get indices split_inds = np.cumsum(split_n_samps).astype(np.integer) diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json index e6a144c..a169202 
100644 --- a/test/fixtures/integration/config_3way.integration.json +++ b/test/fixtures/integration/config_3way.integration.json @@ -19,5 +19,5 @@ "ml_type": "classification", "seed": 19, "split_names": ["train", "test", "val"], - "split_vals": [0.7, 0.2, 0.1] + "split_vals": [0.6, 0.2, 0.2] } diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py index bc286f0..018395d 100644 --- a/test/integration/test_classification_package.py +++ b/test/integration/test_classification_package.py @@ -7,11 +7,12 @@ import numpy as np + class TestClassificationPackage(unittest.TestCase): """Tests for classification package creation""" + @classmethod def setUpClass(cls): - makedirs('integration-cl') copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz') copytree('test/fixtures/integration/tiles', 'integration-cl/tiles') @@ -58,7 +59,8 @@ def test_cli(self): def test_cli_3way_split(self): """Verify data.npz produced by CLI when split into train/test/val""" - cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json' + cmd = 'label-maker package --dest integration-cl-split --config ' \ + 'test/fixtures/integration/config_3way.integration.json ' cmd = cmd.split(' ') subprocess.run(cmd, universal_newlines=True) @@ -72,4 +74,4 @@ def test_cli_3way_split(self): # validate label data with shapes self.assertEqual(data['y_train'].shape, (5, 7)) self.assertEqual(data['y_test'].shape, (2, 7)) - self.assertEqual(data['y_val'].shape, (1, 7)) \ No newline at end of file + self.assertEqual(data['y_val'].shape, (1, 7)) From dd0859cff76851b8b7a594fff4ea81f2ae31ff12 Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 14:29:02 -0600 Subject: [PATCH 20/23] fix format --- test/integration/test_classification_package.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/integration/test_classification_package.py 
b/test/integration/test_classification_package.py index 018395d..5da901b 100644 --- a/test/integration/test_classification_package.py +++ b/test/integration/test_classification_package.py @@ -59,8 +59,7 @@ def test_cli(self): def test_cli_3way_split(self): """Verify data.npz produced by CLI when split into train/test/val""" - cmd = 'label-maker package --dest integration-cl-split --config ' \ - 'test/fixtures/integration/config_3way.integration.json ' + cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json' cmd = cmd.split(' ') subprocess.run(cmd, universal_newlines=True) @@ -74,4 +73,4 @@ def test_cli_3way_split(self): # validate label data with shapes self.assertEqual(data['y_train'].shape, (5, 7)) self.assertEqual(data['y_test'].shape, (2, 7)) - self.assertEqual(data['y_val'].shape, (1, 7)) + self.assertEqual(data['y_val'].shape, (1, 7)) \ No newline at end of file From 60393ff9a5ee2f3ae8a5130e3f657ee989accc85 Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 14:30:34 -0600 Subject: [PATCH 21/23] fix formatting --- label_maker/package.py | 1 - 1 file changed, 1 deletion(-) diff --git a/label_maker/package.py b/label_maker/package.py index ae5b7d5..ad5443d 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -103,7 +103,6 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ # Get number of data samples per split from the float proportions split_n_samps = np.rint([len(x_vals) * val for val in split_vals]) - #print(split_n_samps) if np.any(split_n_samps == 0): raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.') From 37a9173f7fb03f4f7a5256c52166e1a7f53526dd Mon Sep 17 00:00:00 2001 From: martham93 Date: Tue, 29 Oct 2019 14:58:04 -0600 Subject: [PATCH 22/23] remove rint and go back to .7, .2, .1 test train val split in testing --- label_maker/package.py | 2 +-
test/fixtures/integration/config_3way.integration.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/label_maker/package.py b/label_maker/package.py index ad5443d..480b4a8 100644 --- a/label_maker/package.py +++ b/label_maker/package.py @@ -102,7 +102,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_ y_vals = np.array(y_vals, dtype=np.uint8) # Get number of data samples per split from the float proportions - split_n_samps = np.rint([len(x_vals) * val for val in split_vals]) + split_n_samps = [len(x_vals) * val for val in split_vals] if np.any(split_n_samps == 0): raise ValueError('split must not generate zero samples per partition, change ratio of values in config file.') diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json index a169202..e6a144c 100644 --- a/test/fixtures/integration/config_3way.integration.json +++ b/test/fixtures/integration/config_3way.integration.json @@ -19,5 +19,5 @@ "ml_type": "classification", "seed": 19, "split_names": ["train", "test", "val"], - "split_vals": [0.6, 0.2, 0.2] + "split_vals": [0.7, 0.2, 0.1] } From 28e2c297a95d323c01350ec51b805847a583528e Mon Sep 17 00:00:00 2001 From: martham93 Date: Wed, 30 Oct 2019 12:00:18 -0600 Subject: [PATCH 23/23] update docs for clarity --- docs/parameters.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/parameters.rst b/docs/parameters.rst index 1d586d2..ea987cf 100644 --- a/docs/parameters.rst +++ b/docs/parameters.rst @@ -54,11 +54,11 @@ Here is the full list of configuration parameters you can specify in a ``config. **split_vals**: list Default: `[0.8, 0.2]` - Percentage of data to put in each catagory listed in split_names. Must be floats and must sum to one. + Percentage of data to put in each category listed in split_names. Must be a list of floats that sum to one and match the length of `split_names`.
For train, validate, and test data, a list like `[0.7, 0.2, 0.1]` is suggested. **split_names**: list Default: `['train', 'test']` - List of names for each subset of the data. + List of names for each subset of the data. Length of list must match length of `split_vals`. **imagery_offset**: list of ints An optional list of integers representing the number of pixels to offset imagery. For example ``[15, -5]`` will move the images 15 pixels right and 5 pixels up relative to the requested tile bounds.