-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsimpl_kgdf.R
108 lines (91 loc) · 4.77 KB
/
simpl_kgdf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
kGDF <- function(k=1, y, fm, data, distr = "normal", mod = "GLM", noise = 0.25, perf, pass, method="jackknife"){
# sample size n
n <- length(y)
#set noise
if(distr == "normal") noise <- noise*sd(y)
# dataframes for ye and yehat
ye <- yehat <- data.frame(rep(NA,n))
if(method == "jackknife"){ # makes sure that each data point is perturbed per pass
# loop to re-run Model until each data point has been perturbed
i <- 1 # subscription
for(t in 1:pass){
# check for NAs
check.NA <- rep(NA, n)
while (sum(is.na(check.NA)) > 0){
y2 <- y # overwrite y2 each loop with original y
to.fill <- which(is.na(check.NA)) # check which data point still needs to be perturbed
# randomise subset of size k to perturb during this loop
if ((length(to.fill) == 1) & (k == 1)){
# if there is only one NA to be disturbed and sample size = k = 1
y2[to.fill] <- y[to.fill]+rnorm(k, 0, noise) # perturb last remaining point
check.NA[to.fill] <- 1 # mark perturbed subset
}else{
if ((length(to.fill) >= k ) & (length(to.fill) > 1)){
to.use <- sample(to.fill, size= k, replace= F) # randomly select k data points
}
if ((length(to.fill) < k)){
# if there are fewer remaining NAs than data points to be disturbed
fill.up <- sample(1:n, size=k-length(to.fill), replace=F) # fill up sample of size k
to.use <- c(to.fill, fill.up)
}
if (distr == "normal") y2[to.use] <- y2[to.use]+rnorm(k, 0, noise)
if(distr == "binary") y2[to.use] <- !y2[to.use]
check.NA[to.use] <- 1 # mark perturbed subset
}
assign("y2", y2, envir = .GlobalEnv) # provide "y2" in the global environment
# accommodate model with perturbed data
if(mod == "GLM" | mod == "GAM" | mod == "ANN") fm.updated <- update(fm, y2 ~., data=data)
if(mod == "rF"){
if(fm$type == "regression") fm.updated <- update(fm, y2 ~., data=data)
if(fm$type == "classification") fm.updated <- update(fm, as.factor(y2) ~., data=data)
}
if(mod == "BRT") fm.updated <- update(fm, as.formula(y2~.), data=data)
# store intermediates
ye[,i] <- y2
if(mod == "rF"){
if(fm$type == "regression") yehat[,i] <- predict(fm.updated, type="response")
if(fm$type == "classification") yehat[,i] <- predict(fm.updated, type="prob")[, 2]
}
if(mod == "ANN") yehat[,i] <- predict(fm.updated, type="raw")
if(mod == "GLM" | mod == "GAM") yehat[,i] <- predict(fm.updated, type="response")
if(mod == "BRT") yehat[,i <- predict(fm.updated, type="response", n.trees=perf)]
i <- i+1 # move one column further in the data frames
# Remove y2 from global environment
rm(y2, envir= .GlobalEnv)
}
}
cat("i =",i-1)
}
if(method=="bootstrap"){ # meaning that the k perturbed data points are selected in every pass from the complete dataset, possibly leaving out a few
for(i in 1:pass){
# perturbation
y2 <- y # perturb copy of y
to.perturb <- sample(n,size=k,replace=FALSE)
if (distr == "normal") y2[to.perturb] <- y2[to.perturb]+rnorm(k, 0, noise)
if(distr == "binary") y2[to.perturb] <- !y2[to.perturb]
assign("y2", y2, envir = .GlobalEnv) # for model update needed, apparently
# accommodate model with perturbed data
if(mod == "GLM" | mod == "GAM" | mod == "ANN") fm.updated <- update(fm, y2 ~., data=data)
if(mod == "rF" & distr == "normal") update(fm, y2 ~., data=data)
if(mod == "rF" & distr == "binary") update(fm, as.factor(y2) ~., data=data)
if(mod == "BRT") fm.updated <- update(fm, as.formula(y2~.), data = data)
# store intermediates
ye[,i] <- y2
if(mod == "rF"){
if(fm$type == "regression") yehat[,i] <- predict(fm.updated, type="response")
if(fm$type == "classification") yehat[,i] <- predict(fm.updated, type="prob")[, 2]
}
if(mod == "ANN") yehat[,i] <- predict(fm.updated, type="raw")
if(mod == "GLM" | mod == "GAM") yehat[,i] <- predict(fm.updated, type="response")
if(mod == "BRT") yehat[,i <- predict(fm.updated, type="response", n.trees=perf)]
rm(y2, envir= .GlobalEnv)
}
}
# LR: Horizontal Method:
ye <- as.matrix(ye); yehat <- as.matrix(yehat)
#Fit a LR to yehat[i,] vs ye[,i]
fm.LR <- lapply(seq_along(ye[,1]), function(x)lm(yehat[x,]~ye[x,]))
#Calculate sum of slopes m[i] as GDF
GDF.hor <- sum(sapply(fm.LR, function(i)coefficients(i)[2]), na.rm=T)
return(GDF.hor)
}