From a6c5246dc813e311e1037556eef18ba5b9cda204 Mon Sep 17 00:00:00 2001 From: Peter Luitjens <43619525+busma13@users.noreply.github.com> Date: Fri, 17 Jan 2025 11:50:00 -0700 Subject: [PATCH] fix bug in sample_random when probability_to_keep is 0 (#989) This PR makes the following changes: - Fixes a bug in the `sample_random` processor when `probability_to_keep` is set to 0%. If the random number generator returns 0 then a record would be returned ( 0 <= 0 is true ). Setting the `random()` function's minimum value to 1 ensures that no records will ever be returned ( 1 <= 0 is false). - Update docs - Add test with large dataset and `probability_to_keep` set to 0% - bump standard asset from v1.3.1 to v1.3.2 ref: #988 --- asset/asset.json | 2 +- asset/package.json | 2 +- asset/src/sample_random/processor.ts | 2 +- docs/operations/sample_random.md | 5 +++-- package.json | 2 +- test/sample_random/processor-spec.ts | 8 ++++++++ 6 files changed, 15 insertions(+), 6 deletions(-) diff --git a/asset/asset.json b/asset/asset.json index 795fa361..5753e104 100644 --- a/asset/asset.json +++ b/asset/asset.json @@ -1,6 +1,6 @@ { "name": "standard", - "version": "1.3.1", + "version": "1.3.2", "description": "Teraslice standard processor asset bundle", "minimum_teraslice_version": "2.0.0" } diff --git a/asset/package.json b/asset/package.json index 2e8180f4..9dec6468 100644 --- a/asset/package.json +++ b/asset/package.json @@ -1,7 +1,7 @@ { "name": "standard", "displayName": "Asset", - "version": "1.3.1", + "version": "1.3.2", "private": true, "description": "Teraslice standard processor asset bundle", "repository": { diff --git a/asset/src/sample_random/processor.ts b/asset/src/sample_random/processor.ts index 6f112d8b..eb7384e8 100644 --- a/asset/src/sample_random/processor.ts +++ b/asset/src/sample_random/processor.ts @@ -6,7 +6,7 @@ export default class SampleRandom extends BatchProcessor { const outData: DataEntity[] = []; for (const doc of dataArray) { - if (random(0, 99) <= this.opConfig.probability_to_keep) { + if (random(1, 100) <= this.opConfig.probability_to_keep) { outData.push(doc); } } diff --git a/docs/operations/sample_random.md b/docs/operations/sample_random.md index b0c423b4..597c8ae7 100644 --- a/docs/operations/sample_random.md +++ b/docs/operations/sample_random.md @@ -1,6 +1,6 @@ # sample_random -given an array of JSON documents will return an array containing a subset of those input documents. It iterates through the array and generates a random number between 0 and 100 for each record, and if the number <= probability it is kept. Must be between 0 and 100, with 100 keeping all records and 0 rejecting all records. +given an array of JSON documents will return an array containing a subset of those input documents. It iterates through the array and generates a random number between 1 and 100 for each record, and if the number <= probability it is kept. Must be between 0 and 100, with 100 keeping all records and 0 rejecting all records. ## Usage @@ -29,6 +29,7 @@ Example of a job using the `sample_random` processor } ``` + Example of the data and the expected results ```javascript @@ -52,4 +53,4 @@ results === [ | Configuration | Description | Type | Notes | | ------------- | ------------------------------------------------------------- | ------ | ---------------------------- | | _op | Name of operation, it must reflect the exact name of the file | String | required | -| probability_to_keep | The probability of the record being kept. It iterates through the array and generates a random number between 0 and 100, and if the number <= probability it is kept. Must be between 0 and 100, with 100 keeping all records and 0 rejecting all records | required, defaults to 100 | +| probability_to_keep | The probability of the record being kept. It iterates through the array and generates a random number between 1 and 100, and if the number <= probability it is kept. Must be between 0 and 100, with 100 keeping all records and 0 rejecting all records | Number, defaults to 100 | required | diff --git a/package.json b/package.json index 45ee6427..a4bd60b2 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "standard-assets-bundle", "displayName": "Standard Assets Bundle", - "version": "1.3.1", + "version": "1.3.2", "private": true, "description": "Teraslice standard processor asset bundle", "type": "module", diff --git a/test/sample_random/processor-spec.ts b/test/sample_random/processor-spec.ts index b3b7adaa..e050eeb4 100644 --- a/test/sample_random/processor-spec.ts +++ b/test/sample_random/processor-spec.ts @@ -77,6 +77,14 @@ describe('sample_random', () => { expect(results.length).toBeLessThan(5400); expect(results.length).toBeGreaterThan(4600); }); + + it('with large datasets and 0%', async () => { + const data = makeData(10000); + harness = await makeTest({ probability_to_keep: 0 }); + const results = await harness.runSlice(data); + + expect(results.length).toEqual(0); + }); }); interface FakeData {