From 4e5fa4c6a74d9121de488cf9fdd5f1fde348f763 Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 11:07:08 +0100 Subject: [PATCH 01/13] ok --- CHANGELOG.md | 4 ++ src/differential_privacy/group_by.rs | 94 ++++++++++++++++++++++++++-- src/differential_privacy/mod.rs | 16 ++++- src/io/mod.rs | 6 +- src/relation/builder.rs | 18 +++++- src/relation/mod.rs | 5 ++ src/relation/sql.rs | 2 +- 7 files changed, 130 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ddd09e7..e2158c86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Fixed +- All the possible grouping keys must be output by `differential_privacy.group_by.join_with_grouping_values`[MR177](https://github.com/Qrlew/qrlew/pull/170) +- If no tau-thresholding, the budget is transferred to the aggregations [MR177](https://github.com/Qrlew/qrlew/pull/170) +- Adding values of the grouping keys must not modify the size of the table [MR177](https://github.com/Qrlew/qrlew/pull/170) ## [0.4.10] - 2023-11-09 ### Fixed diff --git a/src/differential_privacy/group_by.rs b/src/differential_privacy/group_by.rs index f2e7c428..05467e74 100644 --- a/src/differential_privacy/group_by.rs +++ b/src/differential_privacy/group_by.rs @@ -3,7 +3,7 @@ use crate::{ builder::{Ready, With, WithIterator}, differential_privacy::{private_query, DPRelation, PrivateQuery, Result}, expr::{aggregate, Expr}, - namer, + namer::{self, name_from_content}, protection::{PEPRelation, ProtectedEntity}, relation::{Join, Reduce, Relation, Variant as _}, }; @@ -166,12 +166,31 @@ impl Relation { .iter() .map(|f| f.name().to_string()) .collect::>(); + let left_names = left + .schema() + .iter() + .map(|f| f.name().to_string()) + .collect::>(); + let right_names = right + .schema() + .iter() + .map(|f| { + 
let name = f.name().to_string(); + if left_names.contains(&name) { + name_from_content("left_".to_string(), f) + } else { + name + } + }) + .collect::>(); let join_rel: Relation = Relation::join() + .size(right.size().clone()) .right(right) - .right_names(names.clone()) + .right_names(right_names.clone()) .left(left) - .inner() + .left_names(left_names.clone()) + .left_outer() .on_iter(on) .build(); @@ -185,7 +204,7 @@ mod tests { use crate::{ ast, builder::With, - data_type::{DataType, DataTyped, Variant}, + data_type::{DataType, DataTyped, Variant, Value, Integer}, display::Dot, expr::AggregateColumn, io::{postgresql, Database}, @@ -603,4 +622,71 @@ mod tests { let dp_query = ast::Query::from(&dp_relation); _ = database.query(&dp_query.to_string()).unwrap(); } + + #[test] + fn test_differentially_private_output_all_grouping_keys() { + // test the results contains all the keys asked by the user (i.e. in the WHERE ) + let mut database = postgresql::test_database(); + let relations = database.relations(); + let table = relations.get(&["large_user_table".into()]).unwrap().as_ref().clone(); + let input: Relation = Relation::map() + .name("map_relation") + .with(("income", expr!(income))) + .with(("city", expr!(city))) + .with(("age", expr!(age))) + .with(( + ProtectedEntity::protected_entity_id(), + expr!(id), + )) + .with(( + ProtectedEntity::protected_entity_weight(), + expr!(id), + )) + .filter( + Expr::in_list( + Expr::col("city"), + Expr::list(vec!["Paris".to_string(), "London".to_string()]), + ) + ) + .input(table.clone()) + .build(); + let reduce: Reduce = Relation::reduce() + .name("reduce_relation") + .with(("sum_income".to_string(), AggregateColumn::sum("income"))) + .group_by(expr!(city)) + .group_by(expr!(age)) + .input(input) + .build(); + let (dp_relation, _) = reduce + .differentially_private_group_by(1., 1e-2) + .unwrap() + .into(); + dp_relation.display_dot().unwrap(); + let query: &str = &ast::Query::from(&dp_relation).to_string(); + let results = 
database + .query(query) + .unwrap(); + let city_keys = results.iter() + .map(|row| row.to_vec().clone()[0].clone().to_string()) + .collect::>(); + println!("{:?}", city_keys); + assert_eq!(city_keys[0], "London".to_string()); + assert_eq!(city_keys[1], "Paris".to_string()); + + let input_relation_with_protected_group_by = reduce + .input() + .clone() + .join_with_grouping_values(dp_relation).unwrap(); + input_relation_with_protected_group_by.display_dot().unwrap(); + let query: &str = &ast::Query::from(&input_relation_with_protected_group_by).to_string(); + let results = database + .query(query) + .unwrap(); + let city_keys = results.iter() + .map(|row| row.to_vec().clone()[0].clone().to_string()) + .collect::>(); + assert_eq!(city_keys[0], "Paris".to_string()); + assert_eq!(city_keys.last().unwrap(), &"London".to_string()); + assert_eq!(input_relation_with_protected_group_by.size(), &Integer::from_interval(0, 100000)) + } } diff --git a/src/differential_privacy/mod.rs b/src/differential_privacy/mod.rs index b5a3b653..e5c5b592 100644 --- a/src/differential_privacy/mod.rs +++ b/src/differential_privacy/mod.rs @@ -145,6 +145,12 @@ impl Reduce { reduce }; + let (epsilon, delta) = if private_query.is_null() { + (epsilon + epsilon_tau_thresholding, delta + delta_tau_thresholding) + } else { + (epsilon, delta) + }; + // DP rewrite aggregates let (dp_relation, private_query_agg) = reduce_with_dp_group_by .differentially_private_aggregates(epsilon, delta)? @@ -179,7 +185,7 @@ mod tests { .deref() .clone(); let (epsilon, delta) = (1., 1e-3); - let (epsilon_tau_thresholding, delta_tau_thresholding) = (0.5, 2e-3); + let (epsilon_tau_thresholding, delta_tau_thresholding) = (1., 2e-3); // protect the inputs let protection = Protection::from(( @@ -216,7 +222,7 @@ mod tests { dp_relation.display_dot().unwrap(); assert_eq!( private_query, - PrivateQuery::gaussian_from_epsilon_delta_sensitivity(epsilon, delta, 50.) 
+ PrivateQuery::gaussian_from_epsilon_delta_sensitivity(epsilon + epsilon_tau_thresholding, delta + delta_tau_thresholding, 50.) ); assert!(dp_relation .data_type() @@ -286,7 +292,11 @@ mod tests { dp_relation.display_dot().unwrap(); assert_eq!( private_query, - PrivateQuery::gaussian_from_epsilon_delta_sensitivity(epsilon, delta, 50.) + PrivateQuery::gaussian_from_epsilon_delta_sensitivity( + epsilon + epsilon_tau_thresholding, + delta + delta_tau_thresholding, + 50. + ) ); assert!(dp_relation .data_type() diff --git a/src/io/mod.rs b/src/io/mod.rs index 781bc557..79520841 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -245,11 +245,7 @@ pub trait Database: Sized { )) .with(( "city", - DataType::text_values([ - "Paris".into(), - "New-York".into(), - "Rome".into(), - ]), + DataType::text(), )) .with(("income", DataType::float_interval(100.0, 200000.0))), ) diff --git a/src/relation/builder.rs b/src/relation/builder.rs index b5a2b2f4..3515e115 100644 --- a/src/relation/builder.rs +++ b/src/relation/builder.rs @@ -648,6 +648,7 @@ pub struct JoinBuilder { operator: Option, left: RequireLeftInput, right: RequireRightInput, + size: Option, } impl JoinBuilder { @@ -682,6 +683,11 @@ impl JoinBuilder>(mut self, size: I) -> Self { + self.size = Some(size.into()); + self + } + pub fn left_outer(mut self) -> Self { self.operator = Some(JoinOperator::LeftOuter(JoinConstraint::Natural)); self @@ -808,6 +814,7 @@ impl JoinBuilder JoinBuilder for JoinBuilder { let operator = self .operator .unwrap_or(JoinOperator::Inner(JoinConstraint::Natural)); - Ok(Join::new( + let join = Join::new( name, left_names, right_names, operator, self.left.0, self.right.0, - )) + ); + let join = if let Some(size) = self.size { + join.force_size(size) + } else { + join + }; + Ok(join) } } diff --git a/src/relation/mod.rs b/src/relation/mod.rs index 2800b999..e42bc1af 100644 --- a/src/relation/mod.rs +++ b/src/relation/mod.rs @@ -980,6 +980,11 @@ impl Join { Integer::from_interval(0, max) } + pub fn 
force_size(mut self, size: Integer) -> Self { + self.size = size; + self + } + /// Iterate over fields and input names pub fn field_inputs<'a>(&'a self) -> impl Iterator + 'a { let field_identifiers = self.schema().iter().map(|f| f.name().to_string()); diff --git a/src/relation/sql.rs b/src/relation/sql.rs index 89f1097d..d774dc87 100644 --- a/src/relation/sql.rs +++ b/src/relation/sql.rs @@ -724,7 +724,7 @@ mod tests { let query = ast::Query::from(&join); assert_eq!( query.to_string(), - "WITH my_values (my_values) AS (SELECT * FROM (VALUES (3), (4)) AS my_values (my_values)), join_zs1x (field_gu2a, field_b8x4) AS (SELECT * FROM my_values AS _LEFT_ CROSS JOIN table AS _RIGHT_) SELECT * FROM join_zs1x".to_string() + "WITH my_values (my_values) AS (SELECT * FROM (VALUES (3), (4)) AS my_values (my_values)), join_smp1 (field_gu2a, field_b8x4) AS (SELECT * FROM my_values AS _LEFT_ CROSS JOIN table AS _RIGHT_) SELECT * FROM join_smp1".to_string() ); } } From b8445e0b70615f00046b0a74b0086fdd7714f42f Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 11:41:20 +0100 Subject: [PATCH 02/13] fixed test --- src/differential_privacy/group_by.rs | 18 ++++++++++++++++-- src/io/mod.rs | 6 +++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/differential_privacy/group_by.rs b/src/differential_privacy/group_by.rs index 05467e74..d9a0b5a1 100644 --- a/src/differential_privacy/group_by.rs +++ b/src/differential_privacy/group_by.rs @@ -204,12 +204,12 @@ mod tests { use crate::{ ast, builder::With, - data_type::{DataType, DataTyped, Variant, Value, Integer}, + data_type::{DataType, DataTyped, Variant, Integer}, display::Dot, expr::AggregateColumn, io::{postgresql, Database}, protection::{ProtectedEntity, Protection, Strategy}, - relation::{Join, Schema}, + relation::{Join, Schema, Field}, }; use std::ops::Deref; @@ -629,6 +629,20 @@ mod tests { let mut database = postgresql::test_database(); let relations = database.relations(); let 
table = relations.get(&["large_user_table".into()]).unwrap().as_ref().clone(); + let new_schema: Schema = table.schema() + .iter() + .map(|f| if f.name() == "city" { + Field::from_name_data_type("city", DataType::text()) + } else { + f.clone() + }) + .collect(); + let table:Relation = Relation::table() + .path(["large_user_table"]) + .name("more_users") + .size(100000) + .schema(new_schema) + .build(); let input: Relation = Relation::map() .name("map_relation") .with(("income", expr!(income))) diff --git a/src/io/mod.rs b/src/io/mod.rs index 79520841..781bc557 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -245,7 +245,11 @@ pub trait Database: Sized { )) .with(( "city", - DataType::text(), + DataType::text_values([ + "Paris".into(), + "New-York".into(), + "Rome".into(), + ]), )) .with(("income", DataType::float_interval(100.0, 200000.0))), ) From 5f463e017591c0657f4a2a349bf40227dc81306c Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 11:48:49 +0100 Subject: [PATCH 03/13] ok --- src/differential_privacy/group_by.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/differential_privacy/group_by.rs b/src/differential_privacy/group_by.rs index d9a0b5a1..3beee39f 100644 --- a/src/differential_privacy/group_by.rs +++ b/src/differential_privacy/group_by.rs @@ -683,7 +683,6 @@ mod tests { let city_keys = results.iter() .map(|row| row.to_vec().clone()[0].clone().to_string()) .collect::>(); - println!("{:?}", city_keys); assert_eq!(city_keys[0], "London".to_string()); assert_eq!(city_keys[1], "Paris".to_string()); @@ -699,6 +698,7 @@ mod tests { let city_keys = results.iter() .map(|row| row.to_vec().clone()[0].clone().to_string()) .collect::>(); + println!("{:?}", city_keys); assert_eq!(city_keys[0], "Paris".to_string()); assert_eq!(city_keys.last().unwrap(), &"London".to_string()); assert_eq!(input_relation_with_protected_group_by.size(), &Integer::from_interval(0, 100000)) From eae94286062c095d35edfe5f173fa639a4889a31 
Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 12:34:39 +0100 Subject: [PATCH 04/13] fixed test --- src/differential_privacy/group_by.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/differential_privacy/group_by.rs b/src/differential_privacy/group_by.rs index 3beee39f..4a684530 100644 --- a/src/differential_privacy/group_by.rs +++ b/src/differential_privacy/group_by.rs @@ -211,7 +211,11 @@ mod tests { protection::{ProtectedEntity, Protection, Strategy}, relation::{Join, Schema, Field}, }; - use std::ops::Deref; + use std::{ + ops::Deref, + collections::HashSet + }; + #[test] fn test_tau_thresholding_values() { @@ -680,11 +684,11 @@ mod tests { let results = database .query(query) .unwrap(); - let city_keys = results.iter() + let city_keys: HashSet<_> = results.iter() .map(|row| row.to_vec().clone()[0].clone().to_string()) - .collect::>(); - assert_eq!(city_keys[0], "London".to_string()); - assert_eq!(city_keys[1], "Paris".to_string()); + .collect(); + let correct_keys: HashSet<_> = vec!["London".to_string(), "Paris".to_string()].into_iter().collect(); + assert_eq!(city_keys, correct_keys); let input_relation_with_protected_group_by = reduce .input() @@ -695,12 +699,11 @@ mod tests { let results = database .query(query) .unwrap(); - let city_keys = results.iter() + let city_keys: HashSet<_> = results.iter() .map(|row| row.to_vec().clone()[0].clone().to_string()) - .collect::>(); + .collect(); println!("{:?}", city_keys); - assert_eq!(city_keys[0], "Paris".to_string()); - assert_eq!(city_keys.last().unwrap(), &"London".to_string()); - assert_eq!(input_relation_with_protected_group_by.size(), &Integer::from_interval(0, 100000)) + assert_eq!(city_keys, correct_keys); + assert_eq!(input_relation_with_protected_group_by.size(), &Integer::from_interval(0, 100000)); } } From af1b363ec7cbf39bb36513b02af2415d59e30880 Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 
2023 13:03:22 +0100 Subject: [PATCH 05/13] ok --- src/relation/rewriting.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/relation/rewriting.rs b/src/relation/rewriting.rs index 3ed08dfb..ec815301 100644 --- a/src/relation/rewriting.rs +++ b/src/relation/rewriting.rs @@ -7,6 +7,7 @@ use crate::{ data_type::{self, DataType, DataTyped, Variant as _}, expr::{self, aggregate, Aggregate, Expr, Value}, io, namer, relation, + display::Dot, }; use std::{ collections::{BTreeMap, HashMap}, @@ -412,7 +413,7 @@ impl Relation { // TODO fix this // Join the two relations on the entity column let join: Relation = Relation::join() - .right_outer() + .inner() .on_eq(entities, entities) .left_names( self.fields() @@ -471,6 +472,8 @@ impl Relation { expr } }); + self.display_dot().unwrap(); + panic!(); let clipped_relation = self.scale( entities, value_clippings.keys().cloned().collect(), @@ -509,6 +512,7 @@ impl Relation { /// Add gaussian noise of a given standard deviation to the given columns, while keeping the column min and max pub fn add_clipped_gaussian_noise(self, name_sigmas: Vec<(&str, f64)>) -> Relation { let name_sigmas: HashMap<&str, f64> = name_sigmas.into_iter().collect(); + self.display_dot().unwrap(); Relation::map() // .with_iter(name_sigmas.into_iter().map(|(name, sigma)| (name, Expr::col(name).add_gaussian_noise(sigma)))) .with_iter(self.schema().iter().map(|f| { From 61800a32e73d175065362c0d6580a3ceca6d2dbe Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 13:54:34 +0100 Subject: [PATCH 06/13] ok --- src/differential_privacy/aggregates.rs | 11 +++++++++-- src/differential_privacy/mod.rs | 8 +++++++- src/protection/mod.rs | 4 ++++ src/protection/protected_entity.rs | 9 +++++++++ src/relation/rewriting.rs | 2 -- 5 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/differential_privacy/aggregates.rs b/src/differential_privacy/aggregates.rs index a66469ae..623b9d3a 100644 --- 
a/src/differential_privacy/aggregates.rs +++ b/src/differential_privacy/aggregates.rs @@ -7,6 +7,7 @@ use crate::{ protection::PEPRelation, relation::{field::Field, Map, Reduce, Relation, Variant as _}, DataType, Ready, + display::Dot, }; use std::{cmp, collections::HashMap, ops::Deref}; @@ -116,11 +117,17 @@ impl PEPRelation { let mut input_builder = Map::builder() .with(( self.protected_entity_id(), - Expr::col(self.protected_entity_id()), + Expr::coalesce( + Expr::cast_as_text(Expr::col(self.protected_entity_id())), + Expr::val(self.protected_entity_null_id().to_string()) + ), )) .with(( self.protected_entity_weight(), - Expr::col(self.protected_entity_weight()), + Expr::coalesce( + Expr::col(self.protected_entity_weight()), + Expr::val(0.) + ) )); let mut group_by_names = vec![]; diff --git a/src/differential_privacy/mod.rs b/src/differential_privacy/mod.rs index a209ca4d..5ff31715 100644 --- a/src/differential_privacy/mod.rs +++ b/src/differential_privacy/mod.rs @@ -162,6 +162,7 @@ impl Reduce { #[cfg(test)] mod tests { + use std::collections::HashSet; use super::*; use crate::{ ast, @@ -512,7 +513,12 @@ mod tests { let results = database .query(query) .unwrap(); - println!("{:?}", results); + println!("results = {:?}", results); + let city_keys: HashSet<_> = results.iter() + .map(|row| row.to_vec().clone()[0].clone().to_string()) + .collect(); + let correct_keys: HashSet<_> = vec!["London".to_string(), "Paris".to_string()].into_iter().collect(); + assert_eq!(city_keys, correct_keys); } #[test] diff --git a/src/protection/mod.rs b/src/protection/mod.rs index 018c80ab..7382ed61 100644 --- a/src/protection/mod.rs +++ b/src/protection/mod.rs @@ -69,6 +69,10 @@ impl PEPRelation { ProtectedEntity::protected_entity_id() } + pub fn protected_entity_null_id(&self) -> &str { + ProtectedEntity::protected_entity_null_id() + } + pub fn protected_entity_weight(&self) -> &str { ProtectedEntity::protected_entity_weight() } diff --git a/src/protection/protected_entity.rs 
b/src/protection/protected_entity.rs index 30c02800..7cacfa23 100644 --- a/src/protection/protected_entity.rs +++ b/src/protection/protected_entity.rs @@ -5,6 +5,7 @@ use std::{fmt::Display, hash::Hash, ops::Deref}; pub const PROTECTION_PREFIX: &str = "_PROTECTED_"; pub const PROTECTION_COLUMNS: usize = 2; pub const PROTECTED_ENTITY_ID: &str = "_PROTECTED_ENTITY_ID_"; +pub const PROTECTED_ENTITY_NULL_ID: &str = "_PROTECTED_ENTITY_NULL_"; pub const PROTECTED_ENTITY_WEIGHT: &str = "_PROTECTED_ENTITY_WEIGHT_"; // A few utility objects @@ -185,6 +186,10 @@ impl ProtectedEntityPath { pub fn protected_entity_id() -> &'static str { PROTECTED_ENTITY_ID } + + pub fn protected_entity_null_id() -> &'static str { + PROTECTED_ENTITY_NULL_ID + } } impl Display for ProtectedEntityPath { @@ -268,6 +273,10 @@ impl ProtectedEntity { ProtectedEntityPath::protected_entity_id() } + pub fn protected_entity_null_id() -> &'static str { + ProtectedEntityPath::protected_entity_null_id() + } + pub fn protected_entity_weight() -> &'static str { PROTECTED_ENTITY_WEIGHT } diff --git a/src/relation/rewriting.rs b/src/relation/rewriting.rs index ec815301..e21d952b 100644 --- a/src/relation/rewriting.rs +++ b/src/relation/rewriting.rs @@ -472,8 +472,6 @@ impl Relation { expr } }); - self.display_dot().unwrap(); - panic!(); let clipped_relation = self.scale( entities, value_clippings.keys().cloned().collect(), From 30e870cfc34c1e1a784f29cc8dcda73ca0a5fbce Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 14:02:35 +0100 Subject: [PATCH 07/13] ok --- CHANGELOG.md | 13 ++----------- src/relation/rewriting.rs | 13 +++++++------ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e0ae09d..a049b132 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,19 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] -<<<<<<< HEAD -### Fixed -<<<<<<< HEAD -- All the possible grouping keys must be output by `differential_privacy.group_by.join_with_grouping_values`[MR177](https://github.com/Qrlew/qrlew/pull/170) -- If no tau-thresholding, the budget is transferred to the aggregations [MR177](https://github.com/Qrlew/qrlew/pull/170) -- Adding values of the grouping keys must not modify the size of the table [MR177](https://github.com/Qrlew/qrlew/pull/170) -======= -- `DataType`` propagation in joins: if their is not INNEr or CROSS contraint, then the output `DataType`s must be optional [MR179](https://github.com/Qrlew/qrlew/pull/179) ->>>>>>> fix_datatype_in_joins -======= +### Fixed +- `DataType` propagation in joins: if their is not INNEr or CROSS contraint, then the output `DataType`s must be optional [MR179](https://github.com/Qrlew/qrlew/pull/179) ### Added - Implemented `Coalesce` [MR178](https://github.com/Qrlew/qrlew/pull/178) ->>>>>>> implement_coalesce ## [0.4.10] - 2023-11-09 ### Fixed diff --git a/src/relation/rewriting.rs b/src/relation/rewriting.rs index e21d952b..e121cb99 100644 --- a/src/relation/rewriting.rs +++ b/src/relation/rewriting.rs @@ -472,12 +472,13 @@ impl Relation { expr } }); - let clipped_relation = self.scale( - entities, - value_clippings.keys().cloned().collect(), - scaling_factors, - ); - clipped_relation.sums_by_group(groups, value_clippings.keys().cloned().collect()) + // let clipped_relation = self.scale( + // entities, + // value_clippings.keys().cloned().collect(), + // scaling_factors, + // ); + //clipped_relation.sums_by_group(groups, value_clippings.keys().cloned().collect()) + self } /// Clip sums in the first `Reduce`s found From 9049ea05f529c6e4a7f80dd9e81ce9ad9c530794 Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 14:14:22 +0100 Subject: [PATCH 08/13] ok --- CHANGELOG.md | 3 ++- src/differential_privacy/mod.rs | 35 ++++++++++++++++++++++++++++++++- src/expr/rewriting.rs | 2 +- 
src/relation/rewriting.rs | 22 ++++++++++----------- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a049b132..b7a1f53b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Fixed -- `DataType` propagation in joins: if their is not INNEr or CROSS contraint, then the output `DataType`s must be optional [MR179](https://github.com/Qrlew/qrlew/pull/179) +- `DataType`` propagation in joins: if their is not INNEr or CROSS contraint, then the output `DataType`s must be optional [MR179](https://github.com/Qrlew/qrlew/pull/179) +- Dp query should release all the possible values of the grouping keys. [MR180](https://github.com/Qrlew/qrlew/pull/180) ### Added - Implemented `Coalesce` [MR178](https://github.com/Qrlew/qrlew/pull/178) diff --git a/src/differential_privacy/mod.rs b/src/differential_privacy/mod.rs index 5ff31715..45f510be 100644 --- a/src/differential_privacy/mod.rs +++ b/src/differential_privacy/mod.rs @@ -172,7 +172,7 @@ mod tests { expr::{AggregateColumn, Expr}, io::{postgresql, Database}, protection::{Protection, Strategy}, - relation::{Map, Relation, Variant}, + relation::{Map, Relation, Variant, Schema, Field}, protection::ProtectedEntity, }; @@ -471,6 +471,20 @@ mod tests { let mut database = postgresql::test_database(); let relations = database.relations(); let table = relations.get(&["large_user_table".into()]).unwrap().as_ref().clone(); + let new_schema: Schema = table.schema() + .iter() + .map(|f| if f.name() == "city" { + Field::from_name_data_type("city", DataType::text()) + } else { + f.clone() + }) + .collect(); + let table:Relation = Relation::table() + .path(["large_user_table"]) + .name("more_users") + .size(100000) + .schema(new_schema) + .build(); let input: Relation = Relation::map() .name("map_relation") .with(("income", expr!(income))) @@ -527,6 +541,20 @@ mod tests { let mut database = 
postgresql::test_database(); let relations = database.relations(); let table = relations.get(&["large_user_table".into()]).unwrap().as_ref().clone(); + let new_schema: Schema = table.schema() + .iter() + .map(|f| if f.name() == "city" { + Field::from_name_data_type("city", DataType::text()) + } else { + f.clone() + }) + .collect(); + let table:Relation = Relation::table() + .path(["large_user_table"]) + .name("more_users") + .size(100000) + .schema(new_schema) + .build(); let input: Relation = Relation::map() .name("map_relation") .with(("income", expr!(income))) @@ -573,5 +601,10 @@ mod tests { .query(query) .unwrap(); println!("{:?}", results); + let city_keys: HashSet<_> = results.iter() + .map(|row| row.to_vec().clone()[0].clone().to_string()) + .collect(); + let correct_keys: HashSet<_> = vec!["London".to_string(), "Paris".to_string()].into_iter().collect(); + assert_eq!(city_keys, correct_keys); } } diff --git a/src/expr/rewriting.rs b/src/expr/rewriting.rs index d6f9509f..04e74493 100644 --- a/src/expr/rewriting.rs +++ b/src/expr/rewriting.rs @@ -21,7 +21,7 @@ impl Expr { /// Gaussian noise based on [Box Muller transform](https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform) pub fn add_gaussian_noise(self, sigma: f64) -> Self { Expr::plus( - Expr::coalesce(self, Expr::val(0.)), + self, Expr::multiply(Expr::val(sigma), Expr::gaussian_noise()), ) } diff --git a/src/relation/rewriting.rs b/src/relation/rewriting.rs index e121cb99..abee0eec 100644 --- a/src/relation/rewriting.rs +++ b/src/relation/rewriting.rs @@ -4,7 +4,7 @@ use super::{Join, Map, Reduce, Relation, Set, Table, Values, Variant as _}; use crate::{ builder::{Ready, With, WithIterator}, - data_type::{self, DataType, DataTyped, Variant as _}, + data_type::{self, DataType, DataTyped, Variant as _, function::Function}, expr::{self, aggregate, Aggregate, Expr, Value}, io, namer, relation, display::Dot, @@ -472,13 +472,12 @@ impl Relation { expr } }); - // let clipped_relation = self.scale( - // 
entities, - // value_clippings.keys().cloned().collect(), - // scaling_factors, - // ); - //clipped_relation.sums_by_group(groups, value_clippings.keys().cloned().collect()) - self + let clipped_relation = self.scale( + entities, + value_clippings.keys().cloned().collect(), + scaling_factors, + ); + clipped_relation.sums_by_group(groups, value_clippings.keys().cloned().collect()) } /// Clip sums in the first `Reduce`s found @@ -516,8 +515,9 @@ impl Relation { // .with_iter(name_sigmas.into_iter().map(|(name, sigma)| (name, Expr::col(name).add_gaussian_noise(sigma)))) .with_iter(self.schema().iter().map(|f| { if name_sigmas.contains_key(&f.name()) { - let float_data_type: data_type::Float = f - .data_type() + let x = Expr::coalesce(Expr::col(f.name()), Expr::val(0.)); + let float_data_type: data_type::Float = x.super_image(&f.data_type()) + .unwrap() .into_data_type(&DataType::float()) .unwrap() .try_into() @@ -528,7 +528,7 @@ impl Relation { Expr::val(*float_data_type.max().unwrap()), Expr::greatest( Expr::val(*float_data_type.min().unwrap()), - Expr::col(f.name()).add_gaussian_noise(name_sigmas[f.name()]), + x.add_gaussian_noise(name_sigmas[f.name()]), ), ), ) From 82ab70581343d1f1fd7cddaee36e4e527ac3fee2 Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 14:44:26 +0100 Subject: [PATCH 09/13] docstrings --- src/differential_privacy/group_by.rs | 3 +++ src/differential_privacy/mod.rs | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/differential_privacy/group_by.rs b/src/differential_privacy/group_by.rs index 4a684530..cf392281 100644 --- a/src/differential_privacy/group_by.rs +++ b/src/differential_privacy/group_by.rs @@ -146,6 +146,8 @@ impl Relation { Ok(relation_with_private_values.public_values()?) 
} + /// We join the `self` `Relation` with the `grouping_values Relation`; + /// We use a `LEFT OUTER` join for guaranteeing that all the possible grouping keys are released pub fn join_with_grouping_values(self, grouping_values: Relation) -> Result { let left = grouping_values; let right = self; @@ -185,6 +187,7 @@ impl Relation { .collect::>(); let join_rel: Relation = Relation::join() + // we force the size of the relation to be equal to the `self Relation` .size(right.size().clone()) .right(right) .right_names(right_names.clone()) diff --git a/src/differential_privacy/mod.rs b/src/differential_privacy/mod.rs index e5c5b592..ba1dcb05 100644 --- a/src/differential_privacy/mod.rs +++ b/src/differential_privacy/mod.rs @@ -145,6 +145,8 @@ impl Reduce { reduce }; + // if the (epsilon_tau_thresholding, delta_tau_thresholding) budget has + // not been spent, allocate it to the aggregations. let (epsilon, delta) = if private_query.is_null() { (epsilon + epsilon_tau_thresholding, delta + delta_tau_thresholding) } else { @@ -185,7 +187,7 @@ mod tests { .deref() .clone(); let (epsilon, delta) = (1., 1e-3); - let (epsilon_tau_thresholding, delta_tau_thresholding) = (1., 2e-3); + let (epsilon_tau_thresholding, delta_tau_thresholding) = (0.5, 2e-3); // protect the inputs let protection = Protection::from(( From 35513dc7bb1b5774a84f58b039ba1eb270c8e841 Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 15:00:00 +0100 Subject: [PATCH 10/13] changelog --- CHANGELOG.md | 6 +++--- src/differential_privacy/aggregates.rs | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c05b037..bfd94a9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,12 +8,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Fixed - Dp query should release all the possible values of the grouping keys. 
[MR180](https://github.com/Qrlew/qrlew/pull/180) -- All the possible grouping keys must be output by `differential_privacy.group_by.join_with_grouping_values`[MR177](https://github.com/Qrlew/qrlew/pull/170) -- If no tau-thresholding, the budget is transferred to the aggregations [MR177](https://github.com/Qrlew/qrlew/pull/170) -- Adding values of the grouping keys must not modify the size of the table [MR177](https://github.com/Qrlew/qrlew/pull/170) +- Adding values of the grouping keys must not modify the size of the table [MR180](https://github.com/Qrlew/qrlew/pull/180) - `DataType`` propagation in joins: if their is not INNEr or CROSS contraint, then the output `DataType`s must be optional [MR179](https://github.com/Qrlew/qrlew/pull/179) ### Added - Implemented `Coalesce` [MR178](https://github.com/Qrlew/qrlew/pull/178) +### Changed +- If no tau-thresholding, the budget is transferred to the aggregations [MR180](https://github.com/Qrlew/qrlew/pull/180) ## [0.4.10] - 2023-11-09 ### Fixed diff --git a/src/differential_privacy/aggregates.rs b/src/differential_privacy/aggregates.rs index 623b9d3a..f4e97662 100644 --- a/src/differential_privacy/aggregates.rs +++ b/src/differential_privacy/aggregates.rs @@ -7,7 +7,6 @@ use crate::{ protection::PEPRelation, relation::{field::Field, Map, Reduce, Relation, Variant as _}, DataType, Ready, - display::Dot, }; use std::{cmp, collections::HashMap, ops::Deref}; From e5d424745728c91008bf31d847bc2b21a327935d Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 15:26:06 +0100 Subject: [PATCH 11/13] remove force size in join --- src/differential_privacy/group_by.rs | 2 -- src/relation/builder.rs | 32 +++++++++------------------- src/relation/mod.rs | 5 ----- src/relation/rewriting.rs | 2 -- src/sql/expr.rs | 14 ++++++++++++ 5 files changed, 24 insertions(+), 31 deletions(-) diff --git a/src/differential_privacy/group_by.rs b/src/differential_privacy/group_by.rs index 40487d91..087c2a16 100644 --- 
a/src/differential_privacy/group_by.rs +++ b/src/differential_privacy/group_by.rs @@ -187,8 +187,6 @@ impl Relation { .collect::>(); let join_rel: Relation = Relation::join() - // we force the size of the relation to be equal to the `self Relation` - .size(right.size().clone()) .right(right) .right_names(right_names.clone()) .left(left) diff --git a/src/relation/builder.rs b/src/relation/builder.rs index 3515e115..099b0dd5 100644 --- a/src/relation/builder.rs +++ b/src/relation/builder.rs @@ -648,7 +648,6 @@ pub struct JoinBuilder { operator: Option, left: RequireLeftInput, right: RequireRightInput, - size: Option, } impl JoinBuilder { @@ -683,11 +682,6 @@ impl JoinBuilder>(mut self, size: I) -> Self { - self.size = Some(size.into()); - self - } - pub fn left_outer(mut self) -> Self { self.operator = Some(JoinOperator::LeftOuter(JoinConstraint::Natural)); self @@ -814,7 +808,6 @@ impl JoinBuilder JoinBuilder for JoinBuilder { let operator = self .operator .unwrap_or(JoinOperator::Inner(JoinConstraint::Natural)); - let join = Join::new( - name, - left_names, - right_names, - operator, - self.left.0, - self.right.0, - ); - let join = if let Some(size) = self.size { - join.force_size(size) - } else { - join - }; - Ok(join) + Ok( + Join::new( + name, + left_names, + right_names, + operator, + self.left.0, + self.right.0, + ) + ) } } diff --git a/src/relation/mod.rs b/src/relation/mod.rs index 899b9cdb..5cec3184 100644 --- a/src/relation/mod.rs +++ b/src/relation/mod.rs @@ -988,11 +988,6 @@ impl Join { Integer::from_interval(0, max) } - pub fn force_size(mut self, size: Integer) -> Self { - self.size = size; - self - } - /// Iterate over fields and input names pub fn field_inputs<'a>(&'a self) -> impl Iterator + 'a { let field_identifiers = self.schema().iter().map(|f| f.name().to_string()); diff --git a/src/relation/rewriting.rs b/src/relation/rewriting.rs index abee0eec..0b7a2f0b 100644 --- a/src/relation/rewriting.rs +++ b/src/relation/rewriting.rs @@ -7,7 +7,6 @@ 
use crate::{ data_type::{self, DataType, DataTyped, Variant as _, function::Function}, expr::{self, aggregate, Aggregate, Expr, Value}, io, namer, relation, - display::Dot, }; use std::{ collections::{BTreeMap, HashMap}, @@ -510,7 +509,6 @@ impl Relation { /// Add gaussian noise of a given standard deviation to the given columns, while keeping the column min and max pub fn add_clipped_gaussian_noise(self, name_sigmas: Vec<(&str, f64)>) -> Relation { let name_sigmas: HashMap<&str, f64> = name_sigmas.into_iter().collect(); - self.display_dot().unwrap(); Relation::map() // .with_iter(name_sigmas.into_iter().map(|(name, sigma)| (name, Expr::col(name).add_gaussian_noise(sigma)))) .with_iter(self.schema().iter().map(|f| { diff --git a/src/sql/expr.rs b/src/sql/expr.rs index bfcc0ff0..486b5baa 100644 --- a/src/sql/expr.rs +++ b/src/sql/expr.rs @@ -964,4 +964,18 @@ mod tests { assert_eq!(true_expr.to_string(), expr.to_string()); assert_eq!(expr.to_string(), String::from("(not (a in (3, 4, 5)))")); } + + #[test] + fn test_coalesce() { + let ast_expr: ast::Expr = parse_expr(" COALESCE(col1, col2, col3, 'default')").unwrap(); + println!("ast::expr = {ast_expr}"); + let expr = Expr::try_from(ast_expr.with(&Hierarchy::empty())).unwrap(); + println!("expr = {}", expr); + for (x, t) in ast_expr.iter_with(DisplayVisitor) { + println!("{x} ({t})"); + } + let true_expr = Expr::in_list(Expr::col("a"), Expr::list([3, 4, 5])); + assert_eq!(true_expr.to_string(), expr.to_string()); + assert_eq!(expr.to_string(), String::from("(a in (3, 4, 5))")); + } } From 9b05b72a79ed57791d541032ee38a3ff988fee75 Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 15:28:47 +0100 Subject: [PATCH 12/13] ok --- src/differential_privacy/group_by.rs | 1 - src/relation/sql.rs | 2 +- src/sql/expr.rs | 13 ------------- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/src/differential_privacy/group_by.rs b/src/differential_privacy/group_by.rs index 087c2a16..8d4ad22a 
100644 --- a/src/differential_privacy/group_by.rs +++ b/src/differential_privacy/group_by.rs @@ -705,7 +705,6 @@ mod tests { .collect(); println!("{:?}", city_keys); assert_eq!(city_keys, correct_keys); - assert_eq!(input_relation_with_protected_group_by.size(), &Integer::from_interval(0, 100000)); } } diff --git a/src/relation/sql.rs b/src/relation/sql.rs index d774dc87..89f1097d 100644 --- a/src/relation/sql.rs +++ b/src/relation/sql.rs @@ -724,7 +724,7 @@ mod tests { let query = ast::Query::from(&join); assert_eq!( query.to_string(), - "WITH my_values (my_values) AS (SELECT * FROM (VALUES (3), (4)) AS my_values (my_values)), join_smp1 (field_gu2a, field_b8x4) AS (SELECT * FROM my_values AS _LEFT_ CROSS JOIN table AS _RIGHT_) SELECT * FROM join_smp1".to_string() + "WITH my_values (my_values) AS (SELECT * FROM (VALUES (3), (4)) AS my_values (my_values)), join_zs1x (field_gu2a, field_b8x4) AS (SELECT * FROM my_values AS _LEFT_ CROSS JOIN table AS _RIGHT_) SELECT * FROM join_zs1x".to_string() ); } } diff --git a/src/sql/expr.rs b/src/sql/expr.rs index 486b5baa..2dcae020 100644 --- a/src/sql/expr.rs +++ b/src/sql/expr.rs @@ -965,17 +965,4 @@ mod tests { assert_eq!(expr.to_string(), String::from("(not (a in (3, 4, 5)))")); } - #[test] - fn test_coalesce() { - let ast_expr: ast::Expr = parse_expr(" COALESCE(col1, col2, col3, 'default')").unwrap(); - println!("ast::expr = {ast_expr}"); - let expr = Expr::try_from(ast_expr.with(&Hierarchy::empty())).unwrap(); - println!("expr = {}", expr); - for (x, t) in ast_expr.iter_with(DisplayVisitor) { - println!("{x} ({t})"); - } - let true_expr = Expr::in_list(Expr::col("a"), Expr::list([3, 4, 5])); - assert_eq!(true_expr.to_string(), expr.to_string()); - assert_eq!(expr.to_string(), String::from("(a in (3, 4, 5))")); - } } From 8ae9a49bd3267220f59cd00dd529bba7b97e9b32 Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 13 Nov 2023 15:31:37 +0100 Subject: [PATCH 13/13] changelog --- CHANGELOG.md | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bfd94a9b..1da21a43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Fixed - Dp query should release all the possible values of the grouping keys. [MR180](https://github.com/Qrlew/qrlew/pull/180) -- Adding values of the grouping keys must not modify the size of the table [MR180](https://github.com/Qrlew/qrlew/pull/180) -- `DataType`` propagation in joins: if their is not INNEr or CROSS contraint, then the output `DataType`s must be optional [MR179](https://github.com/Qrlew/qrlew/pull/179) +- `DataType` propagation in joins: if there is no INNER or CROSS constraint, then the output `DataType`s must be optional [MR179](https://github.com/Qrlew/qrlew/pull/179) ### Added - Implemented `Coalesce` [MR178](https://github.com/Qrlew/qrlew/pull/178) ### Changed