From 8cf15c3e321d88adce96c2405a213d36b9f6157b Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Fri, 1 Dec 2023 11:01:14 +0100 Subject: [PATCH 1/3] implemented var and std --- CHANGELOG.md | 1 + src/differential_privacy/aggregates.rs | 86 ++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19bbac07..d7d390cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ## Added +- implemented `STD` and `VAR`aggregations in the dp rewritting [#205](https://github.com/Qrlew/qrlew/issues/205) - implemented `CURRENT_DATE`, `CURRENT_TIME`, `CURRENT_TIMESTAMP` and `EXTRACT(datepart FROM col)` [#200](https://github.com/Qrlew/qrlew/issues/200) - implemented `DISTINCT` in aggregations [#197](https://github.com/Qrlew/qrlew/issues/197) - Implemented math functions: `PI`, `DEGREES`, `TAN`, `RANDOM`, `LOG10`, `LOG2`, `SQUARE` [#196](https://github.com/Qrlew/qrlew/issues/196) diff --git a/src/differential_privacy/aggregates.rs b/src/differential_privacy/aggregates.rs index 86becd7c..27a2f02f 100644 --- a/src/differential_privacy/aggregates.rs +++ b/src/differential_privacy/aggregates.rs @@ -150,6 +150,10 @@ impl PUPRelation { let one_col = format!("_ONE_{}", col_name); let sum_col = format!("_SUM_{}", col_name); let count_col = format!("_COUNT_{}", col_name); + let square_col = format!("_SQUARE_{}", col_name); + let one_square_col = format!("_ONE_{}", square_col); + let sum_square_col = format!("_SUM_{}", square_col); + let count_square_col = format!("_COUNT_{}", square_col); match aggregate.aggregate() { aggregate::Aggregate::First => { assert!(group_by_names.contains(&col_name.as_str())); @@ -179,8 +183,60 @@ impl PUPRelation { sums.push((sum_col.clone(), col_name)); output_b = output_b.with((name, Expr::col(sum_col))); } - aggregate::Aggregate::Std => todo!(), - aggregate::Aggregate::Var => todo!(), + aggregate::Aggregate::Std => { + input_b = input_b + .with((col_name.as_str(), Expr::col(col_name.as_str()))) + .with((square_col.as_str(), Expr::pow(Expr::col(col_name.as_str()), Expr::val(2)))) + .with((one_col.as_str(), Expr::val(1.))) + .with((one_square_col.as_str(), Expr::val(1.))); + sums.push((count_col.clone(), one_col)); + sums.push((sum_col.clone(), col_name)); + sums.push((count_square_col.clone(), one_square_col)); + sums.push((sum_square_col.clone(), square_col)); + output_b = output_b.with(( + name, + Expr::sqrt(Expr::greatest( + Expr::val(0.), + Expr::minus( + Expr::divide( + Expr::col(sum_square_col), + Expr::greatest(Expr::val(1.), Expr::col(count_square_col)), + ), + Expr::divide( + Expr::col(sum_col), + Expr::greatest(Expr::val(1.), Expr::col(count_col)), + ), + ) + )) + )) + } + aggregate::Aggregate::Var => { + input_b = input_b + .with((col_name.as_str(), Expr::col(col_name.as_str()))) + .with((square_col.as_str(), Expr::pow(Expr::col(col_name.as_str()), Expr::val(2)))) + .with((one_col.as_str(), Expr::val(1.))) + .with((one_square_col.as_str(), Expr::val(1.))); + sums.push((count_col.clone(), one_col)); + sums.push((sum_col.clone(), col_name)); + sums.push((count_square_col.clone(), one_square_col)); + sums.push((sum_square_col.clone(), square_col)); + output_b = output_b.with(( + name, + Expr::greatest( + Expr::val(0.), + Expr::minus( + Expr::divide( + Expr::col(sum_square_col), + Expr::greatest(Expr::val(1.), Expr::col(count_square_col)), + ), + Expr::divide( + Expr::col(sum_col), + Expr::greatest(Expr::val(1.), Expr::col(count_col)), + ), + ) + ) + )) + } _ => (), } (input_b, sums, output_b) @@ -531,6 +587,8 @@ mod tests { ("count_price".to_string(), AggregateColumn::count("price")), ("sum_price".to_string(), AggregateColumn::sum("price")), ("avg_price".to_string(), AggregateColumn::mean("price")), + ("var_price".to_string(), AggregateColumn::var("price")), + ("std_price".to_string(), AggregateColumn::std("price")), ], vec![], pup_table.deref().clone().into(), @@ -542,17 +600,19 @@ mod tests { .differentially_private_aggregates(epsilon, delta) .unwrap(); dp_relation.display_dot().unwrap(); - assert_eq!(dp_relation.schema().len(), 3); + assert_eq!(dp_relation.schema().len(), 5); + println!("data_type = {}", dp_relation.data_type()); assert!(dp_relation .data_type() .is_subset_of(&DataType::structured(vec![ ("count_price", DataType::float()), ("sum_price", DataType::float()), ("avg_price", DataType::float()), + ("var_price", DataType::float_min(0.)), + ("std_price", DataType::float_min(0.)), ]))); - let query: &str = &ast::Query::from(&relation).to_string(); - println!("{query}"); + println!("\n{query}"); _ = database.query(query).unwrap(); } @@ -590,6 +650,8 @@ mod tests { ("count_price".to_string(), AggregateColumn::count("price")), ("sum_price".to_string(), AggregateColumn::sum("price")), ("avg_price".to_string(), AggregateColumn::mean("price")), + ("var_price".to_string(), AggregateColumn::var("price")), + ("std_price".to_string(), AggregateColumn::std("price")), ], vec![expr!(item)], pup_table.deref().clone().into(), @@ -608,6 +670,8 @@ mod tests { ("count_price", DataType::float()), ("sum_price", DataType::float()), ("avg_price", DataType::float()), + ("var_price", DataType::float_min(0.)), + ("std_price", DataType::float_min(0.)), ]))); let query: &str = &ast::Query::from(&relation).to_string(); @@ -996,6 +1060,9 @@ mod tests { .with(("sum_distinct_a", AggregateColumn::sum_distinct("a"))) .with(("count_b", AggregateColumn::count("b"))) .with(("count_distinct_b", AggregateColumn::count_distinct("b"))) + .with(("avg_distinct_b", AggregateColumn::mean_distinct("b"))) + .with(("var_distinct_b", AggregateColumn::var_distinct("b"))) + .with(("std_distinct_b", AggregateColumn::std_distinct("b"))) .build(); let dp_relation = reduce.differentially_private_aggregates(epsilon.clone(), delta.clone()).unwrap(); //dp_relation.relation().display_dot().unwrap(); @@ -1006,6 +1073,9 @@ mod tests { ("sum_distinct_a", DataType::float_interval(-2000., 2000.)), ("count_b", DataType::float_interval(0., 1000.)), ("count_distinct_b", DataType::float_interval(0., 1000.)), + ("avg_distinct_b", DataType::float_interval(0., 10000.)), + ("var_distinct_b", DataType::float_interval(0., 100000.)), + ("std_distinct_b", DataType::float_interval(0., 316.22776601683796)), ]) ); @@ -1017,6 +1087,9 @@ mod tests { .with(("count_b", AggregateColumn::count("b"))) .with(("count_distinct_b", AggregateColumn::count_distinct("b"))) .with(("my_c", AggregateColumn::first("c"))) + .with(("avg_distinct_b", AggregateColumn::mean_distinct("b"))) + .with(("var_distinct_b", AggregateColumn::var_distinct("b"))) + .with(("std_distinct_b", AggregateColumn::std_distinct("b"))) .group_by(expr!(c)) .build(); let dp_relation = reduce.differentially_private_aggregates(epsilon.clone(), delta.clone()).unwrap(); @@ -1029,6 +1102,9 @@ mod tests { ("count_b", DataType::float_interval(0., 1000.)), ("count_distinct_b", DataType::float_interval(0., 1000.)), ("my_c", DataType::float_interval(10., 20.)), + ("avg_distinct_b", DataType::float_interval(0., 10000.)), + ("var_distinct_b", DataType::float_interval(0., 100000.)), + ("std_distinct_b", DataType::float_interval(0., 316.22776601683796)), ]) ); } From 45f9fd9b91a2d016604d3049253e3e57bdeab947 Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Mon, 4 Dec 2023 09:24:19 +0100 Subject: [PATCH 2/3] merge with main --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 121ae24c..71f5bdad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Changed - some cleaning in the translation of Expr -> ast::Expr [#204](https://github.com/Qrlew/qrlew/issues/204) ## Added -<<<<<<< HEAD - implemented `STD` and `VAR`aggregations in the dp rewritting [#205](https://github.com/Qrlew/qrlew/issues/205) -======= - implemented `BETWEEN`, `IS TRUE`, `IS FALSE`, `IS NULL`, `CHOOSE`, `LIKE` and `ILIKE` [#203](https://github.com/Qrlew/qrlew/issues/203) - implemented `DAYNAME`, `FROM_UNIXTIME`, `UNIX_TIMESTAMP`, `DATETIME_DIFF`, `QUARTER` and `DATE_FORMAT` [#202](https://github.com/Qrlew/qrlew/issues/202) ->>>>>>> cf16b15a1f56f156c683d13e6a40151026c0f3b2 - implemented `CURRENT_DATE`, `CURRENT_TIME`, `CURRENT_TIMESTAMP` and `EXTRACT(datepart FROM col)` [#200](https://github.com/Qrlew/qrlew/issues/200) - implemented `DISTINCT` in aggregations [#197](https://github.com/Qrlew/qrlew/issues/197) - Implemented math functions: `PI`, `DEGREES`, `TAN`, `RANDOM`, `LOG10`, `LOG2`, `SQUARE` [#196](https://github.com/Qrlew/qrlew/issues/196) From 68479d864e7e17a0cc5116a1fa6c96324bf76cfb Mon Sep 17 00:00:00 2001 From: victoria de sainte agathe Date: Thu, 7 Dec 2023 12:46:55 +0100 Subject: [PATCH 3/3] ok --- src/differential_privacy/aggregates.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/differential_privacy/aggregates.rs b/src/differential_privacy/aggregates.rs index a0da81f9..d08b4d8b 100644 --- a/src/differential_privacy/aggregates.rs +++ b/src/differential_privacy/aggregates.rs @@ -151,9 +151,7 @@ impl PUPRelation { let sum_col = format!("_SUM_{}", col_name); let count_col = format!("_COUNT_{}", col_name); let square_col = format!("_SQUARE_{}", col_name); - let one_square_col = format!("_ONE_{}", square_col); let sum_square_col = format!("_SUM_{}", square_col); - let count_square_col = format!("_COUNT_{}", square_col); match aggregate.aggregate() { aggregate::Aggregate::First => { assert!(group_by_names.contains(&col_name.as_str())); @@ -187,11 +185,9 @@ impl PUPRelation { input_b = input_b .with((col_name.as_str(), Expr::col(col_name.as_str()))) .with((square_col.as_str(), Expr::pow(Expr::col(col_name.as_str()), Expr::val(2)))) - .with((one_col.as_str(), Expr::val(1.))) - .with((one_square_col.as_str(), Expr::val(1.))); + .with((one_col.as_str(), Expr::val(1.))); sums.push((count_col.clone(), one_col)); sums.push((sum_col.clone(), col_name)); - sums.push((count_square_col.clone(), one_square_col)); sums.push((sum_square_col.clone(), square_col)); output_b = output_b.with(( name, @@ -200,7 +196,7 @@ impl PUPRelation { Expr::minus( Expr::divide( Expr::col(sum_square_col), - Expr::greatest(Expr::val(1.), Expr::col(count_square_col)), + Expr::greatest(Expr::val(1.), Expr::col(count_col.clone())), ), Expr::divide( Expr::col(sum_col), @@ -214,11 +210,9 @@ impl PUPRelation { input_b = input_b .with((col_name.as_str(), Expr::col(col_name.as_str()))) .with((square_col.as_str(), Expr::pow(Expr::col(col_name.as_str()), Expr::val(2)))) - .with((one_col.as_str(), Expr::val(1.))) - .with((one_square_col.as_str(), Expr::val(1.))); + .with((one_col.as_str(), Expr::val(1.))); sums.push((count_col.clone(), one_col)); sums.push((sum_col.clone(), col_name)); - sums.push((count_square_col.clone(), one_square_col)); sums.push((sum_square_col.clone(), square_col)); output_b = output_b.with(( name, @@ -227,7 +221,7 @@ impl PUPRelation { Expr::minus( Expr::divide( Expr::col(sum_square_col), - Expr::greatest(Expr::val(1.), Expr::col(count_square_col)), + Expr::greatest(Expr::val(1.), Expr::col(count_col.clone())), ), Expr::divide( Expr::col(sum_col),