diff --git a/CHANGELOG.md b/CHANGELOG.md index f02d6f3..fdae709 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ ## Staged +## v0.4.1-2 +- clean-up of bad data code +- adds bad data documentation +- adds util to calculate state estimation vs powerflow voltage difference per individual phase (instead of average of the three) + ## v0.4.0 - adds bad data functionalities and tests - adds util to create virtual measurements at zero injection buses diff --git a/Project.toml b/Project.toml index 2a805b8..ff36d6c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PowerModelsDistributionStateEstimation" uuid = "d0713e65-ce0c-4a8e-a1da-2ed737d217c5" authors = ["Marta Vanin", "Tom Van Acker"] -version = "0.4.1" +version = "0.4.2" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" diff --git a/docs/make.jl b/docs/make.jl index f0b6db5..dd094be 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -8,10 +8,11 @@ makedocs( pages = [ "Home" => "index.md", "Manual" => [ - "Getting Started" => "quickguide.md", - "Input Data Format" => "input_data_format.md", - "Measurements and Conversions" => "measurements.md", - "State Estimation Criteria" => "se_criteria.md", + "Getting Started" => "quickguide.md", + "Input Data Format" => "input_data_format.md", + "Measurements and Conversions" => "measurements.md", + "State Estimation Criteria" => "se_criteria.md", + "Bad Data Detection and Identification" => "bad_data.md", ], "Library " => [ "Power Flow Formulations" => "formulations.md", diff --git a/docs/src/bad_data.md b/docs/src/bad_data.md new file mode 100644 index 0000000..9616d65 --- /dev/null +++ b/docs/src/bad_data.md @@ -0,0 +1,106 @@ +# Bad Data Detection and Identification + +As of version 0.4.0, PMDSE has bad data detection and identification functionalities, namely: +- Chi-square test, +- Largest normalized residuals, +- Least absolute value (LAV) estimator residual analysis. + +The LAV is a robust estimator that presents bad data rejection properties. LAV residuals can be collected and sorted, and the measurements with higher +residuals are the ones of the bad data points. +The LAV residual analysis can be done with all previous versions of the package too, but is made easier in v0.4.0: in versions up to 0.4.0 the user +needs to pass `wlav` or `rwlav` as a state estimation criterion, and assign a unitary standard deviation for all weights or all measurements. Now it is sufficient to +pass `lav` as a state estimation criterion. + +All these three techniques are very standard techniques, and a thorough theoretical discussion can be found in the well-known book: "Power system state estimation - Theory and implementation" by A. Abur and A. G. Exposito. Furthermore, numerous papers also address in which circumstances the different techniques are more or less effective. + +Below, just a functional introduction. + +First of all, +- Bad data *detection* consists of answering the yes/no question: "is there bad data"? +- Bad data *identification* consists of locating which data points are bad (to subsequently correct them or delete them). + +All the presented techniques require the user to first run a state estimation algorithm, as they are based on the analysis of its residuals. + +## Chi-square Analysis + +Chi-squares ($\Chi^2$) analysis is a bad data *detection* method. If bad data are detected, these still need to be identified. + +The method is based on the following assumptions: if all measurement errors follow a Normal distribution, and there are no bad data, then the sum of the weighted squared residuals follows a Chi-square distributions with *m-n* degrees of freedom, where *m* is the number of measurements and *n* of the system variables. + +The function `exceeds_chi_squares_threshold` takes as input the solution of a state estimation calculation and the data dictionary. It calculates the degrees of freedom and the sum of the weighted square residuals (where the weights are the inverse of each measurement's variance). If the state estimation that was run was a `wls` estimation with no weight rescaler, this sum corresponds to the objective value. However, the function always calculates the sum, to allow the user to use Chi-square calculations in combination with measurement rescalers or other state estimation criteria. +```@docs +PowerModelsDistributionStateEstimation.exceeds_chi_squares_threshold(sol_dict::Dict, data::Dict; prob_false::Float64=0.05, suppress_display::Bool=false) +``` +The function returns a boolean that states whether bad data are suspected, the value of the sum of the residuals and the threshold value above which bad data are suspected. +The threshold value depends on the degrees of freedom and the detection confidence probability, that cab=n be set by the user. The default value of the latter is 0.05, as this is often the choice in the literature. + +## Largest Normalized Residuals + +Normalized residuals can be used for both bad data *detection* and *identification*. Let the residuals be $r_i = h_i(\mathbf{x}) - \mathbf{z}$, where $h$ are the measurement functions, $\mathbf{x}$ are the system variables and $\mathbf{z}$ is the measurement vector. This is often the standard notation, e.g., in the book by Abur and Exposito. +The normalized residuals $r^N_i$ are: + +```math +\begin{align} +&r_i^N = \frac{|r_i|}{\sqrt{\Omega_{ii}}} = \frac{|r_i|}{\sqrt{R_{ii}S_{ii}}} +\end{align} +``` +The largest $r^N$ is compared to a threshold, typically 3.0 in the literature. If its value exceeds the threshold, bad data are suspected, and the bad data point is identified as the measurement that corresponds to the largest $r^N$ itself. +This package contains different functions that allow to build the measurement matrix (H), the measurement error covariance matrix (R), the gain matrix (G), the hat matrix (K), the sensitivity matrix (S) and the residual covariance matrix ($\Omega$): +```@docs +PowerModelsDistributionStateEstimation.build_H_matrix(functions::Vector, state::Array)::Matrix{Float64} +``` +```@docs +build_G_matrix(H::Matrix, R::Matrix)::Matrix{Float64} +``` +```@docs +build_R_matrix(data::Dict)::Matrix{Float64} +``` +```@docs +build_omega_matrix(R::Matrix{Float64}, H::Matrix{Float64}, G::Matrix{Float64}) +``` +```@docs +build_omega_matrix(S::Matrix{Float64}, R::Matrix{Float64}) +``` +```@docs +build_S_matrix(K::Matrix{Float64}) +``` +```@docs +build_K_matrix(H::Matrix{Float64}, G::Matrix{Float64}, R::Matrix{Float64}) +``` +$\Omega$ can then be used in the function `normalized_residuals`, which calculates all $r_i$, returns the highest $r^N$ and indicates whether its value exceeds the threshold or not. +Again, the $r_i$ calculation is independent of the chosen state estimation criterion or weight rescaler. +```@docs +PowerModelsDistributionStateEstimation.normalized_residuals(data::Dict, se_sol::Dict, Ω::Matrix; t::Float64=3.0) +``` +Finally, a simplified version of the largest normalized residuals is available: `simple_normalized_residuals`, that instead of calculating the $\Omega$ matrix, calculates the normalized residuals as: +```math +\begin{align} +&r_i^N = \frac{|r_i|}{\sqrt{\Omega_{ii}}} = \frac{|r_i|}{\sqrt{R_{ii}^2}} +\end{align} +``` +```@docs +PowerModelsDistributionStateEstimation.simple_normalized_residuals(data::Dict, se_sol::Dict, R::Matrix) +``` + +## LAV Estimator Residual Analysis + +The LAV estimator is known to be inherently robust to bad data, as it is basically a linear regression. +Thus, it is sufficient to run it and then check its residuals as in the piece of code below. The residuals do not even need to be calculated, because in a `LAV` estimation, they are by default reported as `res` in the solution dictionary. As such, the user only needs to sort the residuals in descending orders, see what their magnitude is, and whether some residuals are much higher than the others. The latter, in general, points out the bad data. + +```julia + bad_data["se_settings"] = Dict{String,Any}("criterion" => "lav", + "rescaler" => 1) + + se_result_bd_lav = _PMDSE.solve_acp_red_mc_se(bad_data, solver) + residual_tuples = [(m, maximum(meas["res"])[1]) for (m, meas) in se_result_bd_lav["solution"]["meas"]] + sorted_tuples = sort(residual_tuples, by = last, rev = true) + measurement_index_of_largest_residual = first(sorted_tuples[1]) + magnitude_of_largest_residual = last(sorted_tuples[1]) + ratio12 = (last(sorted_tuples[1])/last(sorted_tuples[2])) # ratio between the first and the second largest residuals. +``` + +## Other Notes + +Virtually all bad data detection and identification methods from the literature are done "a-posteriori", i.e., after running a state estimation, by performing statistical considerations on the measurement residuals, or "a priori" doing some measurement data pre-processing (these are not discussed but could be, e.g., removing missing data or absurd measurements like negative or zero voltage). Thus, it is easy for the user to use this framework to just run the state estimation calculations and then add customized bad data handling methods that take as input the input measurement dictionary and/or the output solution dictionary. + +An example on how to use this package to perform bad data detection and identification can be found at this [link](https://github.com/MartaVanin/SE_framework_paper_results): see both its readme and the file `src/scripts/case_study_E.jl`. \ No newline at end of file diff --git a/src/PowerModelsDistributionStateEstimation.jl b/src/PowerModelsDistributionStateEstimation.jl index 759eda7..09259e5 100644 --- a/src/PowerModelsDistributionStateEstimation.jl +++ b/src/PowerModelsDistributionStateEstimation.jl @@ -18,7 +18,7 @@ import GaussianMixtures import InfrastructureModels import JuMP import LinearAlgebra -import LinearAlgebra: diag +import LinearAlgebra: diag, diagm, I import Logging, LoggingExtras import Optim import PowerModelsDistribution diff --git a/src/bad_data/chi_squares_test.jl b/src/bad_data/chi_squares_test.jl index 8c3d833..a82fdc7 100644 --- a/src/bad_data/chi_squares_test.jl +++ b/src/bad_data/chi_squares_test.jl @@ -1,5 +1,5 @@ """ - exceeds_chi_squares_threshold(sol_dict::Dict, data::Dict; prob_false::Float64=0.05) + exceeds_chi_squares_threshold(sol_dict::Dict, data::Dict; prob_false::Float64=0.05, suppress_display::Bool=false) Standard bad data detection method that consists of performing a Chi squares analysis on the objective value, i.e., the sum of the residuals. The outcome of the analysis depends on the degrees of freedom of the problem, which are calculated calling @@ -7,22 +7,32 @@ the `get_degrees_of_freedom` function (see below). This function returns: - a Boolean. If `true`, there are probably bad data points, if `false` there probably are not. -- the scaled objective of the optimization and the Χ² threshold value. +- the sum of the weighted squares of the residuals (which might be the optimization objective or not, depending on the settings) and the Χ² threshold value. Arguments: - `sol_dict`: solution dictionary, i.e., the default output dictionary of state estimation calculations, - `data`: data input of the state estimation calculations, used to calculate the degrees of freedom, -- `prob_false`: probability of errors allowed in the Chi squares test, -- `criterion`: `wlav`, `wls`, etc., -- `rescaler`: used rescaler if != 1 +- `prob_false`: probability of errors allowed in the Chi squares test, defaults to 0.05 +- `suppress_display`: if `false`, the function also displays the result of the analysis, otherwise this is suppressed. """ -function exceeds_chi_squares_threshold(sol_dict::Dict, data::Dict; prob_false::Float64=0.05, criterion::String="wls", rescaler::Int64=1) +function exceeds_chi_squares_threshold(sol_dict::Dict, data::Dict; prob_false::Float64=0.05, suppress_display::Bool=false) dof = get_degrees_of_freedom(data) chi2 = _DST.Chisq(dof) - rescale_and_adjust_objective!(sol_dict, rescaler, criterion) - sol_dict["objective_refactored"] >= _STT.quantile(chi2, 1-prob_false) ? display("Chi-square test indicates presence of bad data.") : display("No bad data detected by Chi-square test.") - return sol_dict["objective_refactored"] >= _STT.quantile(chi2, 1-prob_false), sol_dict["objective_refactored"], _STT.quantile(chi2, 1-prob_false) + norm_residual = [] + for (m, meas) in data["meas"] + h_x = sol_dict["solution"][string(meas["cmp"])][string(meas["cmp_id"])][string(meas["var"])] + z = _DST.mean.(meas["dst"]) + r = abs.(h_x-z) + σ = _DST.std.(meas["dst"]) + sol_dict["solution"]["meas"][m]["norm_res"] = [r[i]^2/σ[i]^2 for i in 1:length(meas["dst"])] + for i in 1:length(meas["dst"]) push!(norm_residual, r[i]^2/σ[i]^2) end + end + sol_dict["J"] = sum(norm_residual) + if !suppress_display + sol_dict["J"] >= _STT.quantile(chi2, 1-prob_false) ? display("Chi-square test indicates presence of bad data.") : display("No bad data detected by Chi-square test.") + end + return sol_dict["J"] >= _STT.quantile(chi2, 1-prob_false), sol_dict["J"], _STT.quantile(chi2, 1-prob_false) end """ get_degrees_of_freedom(data::Dict) @@ -40,7 +50,6 @@ included in the calculation of `n`, either. Zero-injection buses are defined as """ function get_degrees_of_freedom(data::Dict) - # TODO: for MV/LV add transformer buses to non-zero buses? ref_bus = [bus for (_,bus) in data["bus"] if bus["bus_type"] == 3] @assert length(ref_bus) == 1 "There is more than one reference bus, double-check model" @@ -57,18 +66,4 @@ function get_degrees_of_freedom(data::Dict) return m-n -end - -function rescale_and_adjust_objective!(sol_dict, rescaler, criterion) - @assert occursin("w", criterion) "This functionality is only applicable to `weighted` state estimation methods" - if occursin("lav", criterion) - for (_, meas) in sol_dict["solution"]["meas"] - meas["res"] = (meas["res"]*rescaler).^2 - end - else - for (_, meas) in sol_dict["solution"]["meas"] - meas["res"] = meas["res"]*rescaler^2 - end - end - sol_dict["objective_refactored"] = [sum(sum.(meas["res"] for (m, meas) in sol_dict["solution"]["meas"]))][1] -end +end \ No newline at end of file diff --git a/src/bad_data/largest_normalized_residuals.jl b/src/bad_data/largest_normalized_residuals.jl index 3a797da..8e063a4 100644 --- a/src/bad_data/largest_normalized_residuals.jl +++ b/src/bad_data/largest_normalized_residuals.jl @@ -1,32 +1,34 @@ """ - simple_normalized_residuals(data::Dict, se_sol::Dict, state_estimation_type::String; rescaler::Float64=1.0) + simple_normalized_residuals(data::Dict, se_sol::Dict, R::Matrix) It normalizes the residuals only based on their standard deviation, no sensitivity matrix involved. -Avoids all the matrix calculations but is relatively rudimental -""" -function simple_normalized_residuals(data::Dict, se_sol::Dict, state_estimation_type::String; rescaler::Float64=1.0) - if occursin(state_estimation_type, "wls") - p = 2 - elseif occursin(state_estimation_type, "wlav") - p = 1 - else - error("This method works only with (r)wls and (r)wlav state estimation") - end - for (m, meas) in se_sol["solution"]["meas"] - meas["norm_res"] = abs.(meas["res"])./_DST.std.(data["meas"][m]["dst"]).^p*rescaler - end +R^2 replaces Ω, where Ω = S ⋅ R +Avoids all the matrix calculations but is a "simplified" method +""" +function simple_normalized_residuals(data::Dict, se_sol::Dict, R::Matrix) + normalized_residuals(data, se_sol, R.^2) end - """ Adds the normalized residuals to the solution dictionary and returns the largest normalized residual, its index (i.e., the measurement it refers to), and whether it exceeds the given threshold `t` or not. """ -function normalized_residuals(se_sol::Dict, Ω::Matrix; t::Float64=3.0, resc::Float64=1.0) +function normalized_residuals(data::Dict, se_sol::Dict, Ω::Matrix; t::Float64=3.0) lnr = ("0", 0) - for (m, meas) in se_sol["solution"]["meas"] - meas["nr"] = [abs(meas["res"][i])*resc/sqrt(abs(Ω[parse(Int64,m)+i-1, parse(Int64,m)+i-1])) for i in 1:length(meas["res"])] - for i in 1:length(meas["res"]) - if meas["nr"][i] > last(lnr) lnr = ("$(parse(Int64,m)+i-1)", meas["nr"][i]) end + count = 1 + for (m, meas) in data["meas"] + if !haskey(se_sol["solution"][string(meas["cmp"])], string(meas["cmp_id"])) + r = [0.0, 0.0, 0.0] + se_sol["solution"]["meas"][m]["r"] = [0.0, 0.0, 0.0] + else + h_x = se_sol["solution"][string(meas["cmp"])][string(meas["cmp_id"])][string(meas["var"])] + z = _DST.mean.(meas["dst"]) + r = abs.(h_x-z) + se_sol["solution"]["meas"][m]["r"] = r end + se_sol["solution"]["meas"][m]["nr"] = [r[i]/sqrt(abs(Ω[count+i-1, count+i-1])) for i in 1:length(meas["dst"])] + for i in 1:length(meas["dst"]) + if se_sol["solution"]["meas"][m]["nr"][i] > last(lnr) lnr = (m, se_sol["solution"]["meas"][m]["nr"][i]) end + end + count+=length(meas["dst"]) end return lnr, last(lnr) > t end @@ -54,8 +56,19 @@ function build_R_matrix(data::Dict)::Matrix{Float64} end """ Returns the Residual Covariance Matrix Ω, where: -Ωᵢᵢ = Rᵢᵢ⋅Sᵢᵢ = R - H*G^(-1)*H^T -S = I - K # <- sensitivity matrix, no need to calculate it for bad data purposes -K = H⋅G⁻¹⋅Hᵀ⋅R⁻¹ # <- hat matrix, no need to calculate it for bad data purposes +Ω = R - H*G^(-1)*H^T +""" +build_omega_matrix(R::Matrix{Float64}, H::Matrix{Float64}, G::Matrix{Float64}) = R - H*inv(G)*transpose(H) +""" +Returns the Residual Covariance Matrix Ω, where: +Ωᵢᵢ = Rᵢᵢ⋅Sᵢᵢ +""" +build_omega_matrix(S::Matrix{Float64}, R::Matrix{Float64}) = diagm((sqrt.(abs.(diag(S).*diag(R))))) +""" +Returns the sensitivity matrix S starting from the hat matrix K +""" +build_S_matrix(K::Matrix{Float64}) = Matrix{Int64}(I, size(K)) - K +""" +Returns the hat matrix K """ -build_omega_matrix(R::Matrix{Float64}, H::Matrix{Float64}, G::Matrix{Float64}) = R - H*inv(G)*transpose(H) \ No newline at end of file +build_K_matrix(H::Matrix{Float64}, G::Matrix{Float64}, R::Matrix{Float64}) = H*inv(G)*transpose(H)*inv(R) \ No newline at end of file diff --git a/src/io/postprocessing.jl b/src/io/postprocessing.jl index 5577b8a..d02e3c0 100644 --- a/src/io/postprocessing.jl +++ b/src/io/postprocessing.jl @@ -51,3 +51,37 @@ function calculate_voltage_magnitude_error(se_result::Dict, pf_result::Dict) return delta, maximum(delta), _STT.mean(delta) end +""" + calculate_voltage_magnitude_error_perphase(se_sol::Dict, pf_sol::Dict) +Like _PMDSE.calculate_voltage_magnitude_error(se_sol::Dict, pf_sol::Dict) but the errors +are per phase instead of the average values over the three pahses. +""" +function calculate_voltage_magnitude_error_perphase(se_result::Dict, pf_result::Dict) + + (haskey(se_result, "solution") && haskey(pf_result, "solution")) || return false + pf_sol, se_sol = pf_result["solution"], se_result["solution"] + + # convert the voltage magnitude variable to polar space + if haskey(pf_sol["bus"]["1"], "Wr") _PMDSE.convert_lifted_to_polar!(pf_sol, "Wr") end + if haskey(se_sol["bus"]["1"], "Wr") _PMDSE.convert_lifted_to_polar!(se_sol, "Wr") end + if haskey(pf_sol["bus"]["1"], "vr") _PMDSE.convert_rectangular_to_polar!(pf_sol) end + if haskey(se_sol["bus"]["1"], "vr") _PMDSE.convert_rectangular_to_polar!(se_sol) end + if haskey(pf_sol["bus"]["1"], "w") _PMDSE.convert_lifted_to_polar!(pf_sol, "w") end + if haskey(se_sol["bus"]["1"], "w") _PMDSE.convert_lifted_to_polar!(se_sol, "w") end + + # determine the difference between the se and pf + delta_1 = [] + delta_2 = [] + delta_3 = [] + for (b,bus) in pf_sol["bus"] for cond in 1:length(bus["vm"]) + if cond == 1 + push!(delta_1, abs(bus["vm"][cond]-se_sol["bus"][b]["vm"][cond])) + elseif cond == 2 + push!(delta_2, abs(bus["vm"][cond]-se_sol["bus"][b]["vm"][cond])) + else + push!(delta_3, abs(bus["vm"][cond]-se_sol["bus"][b]["vm"][cond])) + end + end end + + return delta_1, delta_2, delta_3, maximum(delta_1), maximum(delta_2), maximum(delta_3), _STT.mean(delta_1), _STT.mean(delta_2), _STT.mean(delta_3) +end \ No newline at end of file diff --git a/test/bad_data.jl b/test/bad_data.jl index e246956..7a867dc 100644 --- a/test/bad_data.jl +++ b/test/bad_data.jl @@ -5,7 +5,7 @@ data = _PMD.parse_file(_PMDSE.get_enwl_dss_path(ntw, fdr)) if rm_transfo _PMDSE.rm_enwl_transformer!(data) end if rd_lines _PMDSE.reduce_enwl_lines_eng!(data) end - + data["settings"]["sbase_default"] = 100.0 # insert the load profiles _PMDSE.insert_profiles!(data, season, elm, pfs, t = time_step) @@ -28,29 +28,33 @@ # sigma_dict σ_dict = Dict("load" => Dict("load" => σ_p, "bus" => σ_v), - "gen" => Dict("gen" => σ_p, - "bus" => σ_v) + "gen" => Dict("gen" => σ_p/100, + "bus" => σ_v/100) ) # write measurements based on power flow _PMDSE.write_measurements!(_PMD.ACPUPowerModel, data, pf_result, msr_path, exclude = ["vi","vr"], σ = σ_dict) # read-in measurement data and set initial values - _PMDSE.add_measurements!(data, msr_path, actual_meas = false) - - rsc = 10 - crit = "rwls" + _PMDSE.add_measurements!(data, msr_path, actual_meas = true) - data["se_settings"] = Dict{String,Any}("criterion" => crit, "rescaler" => rsc) + data["se_settings"] = Dict{String,Any}("criterion" => "rwlav", "rescaler" => 1) se_result = _PMDSE.solve_acp_red_mc_se(data, ipopt_solver) @test _PMDSE.get_degrees_of_freedom(data) == 34 - chi_result = _PMDSE.exceeds_chi_squares_threshold(se_result, data; prob_false=0.05, criterion=crit, rescaler = rsc) - @test isapprox(se_result["objective"], 0.120618, atol=1e-2) - @test chi_result[1] == false - @test isapprox(chi_result[2], 12.06, atol = 1e-2) + chi_result = _PMDSE.exceeds_chi_squares_threshold(se_result, data) + @test chi_result[1] == false + @test isapprox(chi_result[2], 0.0, atol = 1e-8) @test isapprox(chi_result[3], 48.60, atol = 1e-2) + + _PMDSE.add_measurements!(data, msr_path, actual_meas = false) + se_result = _PMDSE.solve_acp_red_mc_se(data, ipopt_solver) + chi_result = _PMDSE.exceeds_chi_squares_threshold(se_result, data) + @test chi_result[1] == true #TODO: there's no real bad data, check whether there is a problem with write_meas + @test isapprox(chi_result[2], 963.32, atol = 1e-2) + @test isapprox(chi_result[3], 48.60, atol = 1e-2) + end @testset "h_functions" begin @@ -228,11 +232,11 @@ end #@test all(isapprox.(G, stored_G_matrix, atol=1)) @test all(isapprox.(Ω, stored_Ω_matrix, atol=1)) - id_val, exc = _PMDSE.normalized_residuals(se_result, Ω) + id_val, exc = _PMDSE.normalized_residuals(data, se_result, Ω) #@test !exc #@test id_val[1] == "3" #@test isapprox(id_val[2], 0.11035175, atol=1e-8) - _PMDSE.simple_normalized_residuals(data, se_result, "wls") - @test haskey(se_result["solution"]["meas"]["5"], "norm_res") + _PMDSE.simple_normalized_residuals(data, se_result, R) + @test haskey(se_result["solution"]["meas"]["5"], "nr") end \ No newline at end of file