-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathFunctionalDataAnalysisPart2_RStudioMeetUp_Code.R
638 lines (516 loc) · 28.7 KB
/
FunctionalDataAnalysisPart2_RStudioMeetUp_Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
#############################################################################
#############################################################################
# Code for RStudio Webinar: Intro FDA Part II #
# Functional PCA, Functional PCA Regression, & Functional Linear Model #
# Author: Matthew R. Malloure, Ph.D. #
# Associate Research Scientist, Dow Inc. #
#############################################################################
#############################################################################
##### Section 0: Loading Necessary Packages and Generate Example Plots #####
## Note: Code developed using R version 4.1.3
library(fda)
library(refund)
library(pls)
library(knitr)
# All Data and Plots are Provided with the fda Package
# Berkeley Growth Study Data, Canadian Weather Data, fda Handwriting Data, Pinch Force Data
par(mfrow=c(2,2))
matplot(growth$age,growth$hgtf,type='l',lwd=2,col='black',ylab='Height (in cm)',xlab='Age (Years)',
main='Berkeley Growth Data (Girls)',cex.lab=1.5,cex.main=2,cex.axis = 1.5)
matplot(CanadianWeather$monthlyTemp,type='l',col='black',lwd=2,xlab='Month',ylab='Temperature (Celsius)',
main='Canadian Weather Data (Monthly Average)',cex.lab=1.5,cex.main=2,cex.axis = 1.5)
plot(handwrit[, 1, 1], handwrit[, 1, 2], type="l",main='Cursive Handwriting Data',xlab='Rescaled Time (Milliseconds)',
ylab='Rescaled Time (Milliseconds)',cex.lab=1.5,cex.main=2,cex.axis = 1.5)
for (j in 2:20) lines(handwrit[, j, 1], handwrit[, j, 2])
matplot (pinchtime, pinchraw, type="l", lty=2, cex=2,col=1, lwd=1, xlab = "Seconds", ylab="Force (N)",
cex.lab=1.5,cex.main=2,cex.axis = 1.5,main='Pinch Force Data')
abline(h=2, lty=2)
# Example of Smoothing Data Generated from sin Function
set.seed(3918829)
#Generating Observed Functional Observation
x = seq(-pi,pi,length.out = 200)
y = sin(x) + rnorm(length(x),0,.1)
#Construct the Fourier Basis and Penalty Term
basis = create.fourier.basis(rangeval = range(x),nbasis=100)
Lcoef = c(0,pi^2,0)
harmaccelLfd = vec2Lfd(Lcoef,range(x))
#Plot the Functional Observation
par(mfrow=c(1,1))
plot(x,y,cex.lab=1.25,pch=1,cex=2,col='gray',cex.axis = 1.5,cex.lab = 1.5)
lines(x,sin(x),col='black',lwd=4)
#Select Moderate Smoothing Parameter, Smooth the Data, and Plot
lambda = .001
fdParobj =fdPar(basis,harmaccelLfd,lambda)
sin.fd = smooth.basis(x,y,fdParobj)$fd
lines(sin.fd,lwd=4,lty=2,col='blue')
#Select Small Smoothing Parameter, Smooth the Data, and Plot
lambda = .000000000001
fdParobj =fdPar(basis,harmaccelLfd,lambda)
sin.fd = smooth.basis(x,y,fdParobj)$fd
lines(sin.fd,lwd=4,lty=2,col='red')
#Select Large Smoothing Parameter, Smooth the Data, and Plot
lambda = .1
fdParobj =fdPar(basis,harmaccelLfd,lambda)
sin.fd = smooth.basis(x,y,fdParobj)$fd
lines(sin.fd,lwd=4,lty=2,col='green')
legend('topleft',legend=c('True Function','Under-Smoothed','Smoothed','Over-Smoothed'),
col=c('black','red','blue','green'),
lty=c(1,2,2,2),lwd=4,bty='n',cex=1.5)
##### Section 1: Simulation Data #####
# Randomly generate decay curves with unequally spaced & unequal number of measurements
set.seed(49183)
par(mfrow=c(1,1))
k = 41 #Total Number of Curves
control = 11 #Specify the 11th Curve as a 'Control' for Purpose of Example
coefs = runif(k,.05,.2) #Random Coefficients for Exponential Functions
full.domain = c(0,1,2,4,6,8,12,16,20,24) #Domain Specification (Hours)
# Generate, Store (in matrix Y), and Plot the 41 Curves
Y = matrix(0,nrow=k,ncol=length(full.domain))
Y[,1] = rep(100,k) #Require that All Curves Begin at 100% (no decay has occured)
#Generate Curve 1, Adding Random Normal Error: N(0,sd=3)
Y[1,2:ncol(Y)] = 100*exp(-coefs[1]*full.domain[2:ncol(Y)]) + rnorm(ncol(Y)-1,0,3)
num.to.remove = sample(2:5,1) #Determine How Many Domain Points To Exclude
#Randomly Determine Which of the Interior Points to Exclude (Keep 1st and Last)
keeps = c(T,sample(c(rep(T,8-num.to.remove),rep(F,num.to.remove))),T)
Y[1,!keeps] = NA #Set Selected as Missing
Y[1,][Y[1,] < 0] = 0 #Ensure Non-Negative Points
#Plot the 1st Curve
plot(full.domain[keeps],Y[1,keeps],type='l',col='gray',lwd=4,xlab='Time (Hours)',
ylab='Material Characteristic',cex.lab=1.5,cex.axis=1.5)
#Repeat the Above Steps for Curves 2 to 41
for (j in 2:k) {
Y[j,2:ncol(Y)] = 100*exp(-coefs[j]*full.domain[2:ncol(Y)]) + rnorm(ncol(Y)-1,0,3)
num.to.remove = sample(2:5,1)
keeps = c(T,sample(c(rep(T,8-num.to.remove),rep(F,num.to.remove))),T)
Y[j,!keeps] = NA
Y[j,][Y[j,] < 0] = 0
lines(full.domain[keeps],Y[j,keeps],type='l',col='gray',lwd=4)
}
#To Differentiate the Control from the Other 40 Curves, Color it Black and Add a Legend
lines(full.domain[!is.na(Y[control,])],Y[control,!is.na(Y[control,])],type='l',col='black',lwd=4)
legend('topright',legend=c('Exp. Materials','Standard'),lty=1,lwd=2,col=c('gray','black'),bty='n',cex=2)
##### Section 2: Curve Smoothing #####
# Convert the Discrete Curves to Smooth Curves Using Cubic B-Splines
basis = create.bspline.basis(c(0,24),norder=4,nbasis=20) #Construct Basis Functions
fdobj = fdPar(basis,Lfdobj=2,lambda=10) #Define Functional Parameter with Roughness Penalty
#Individually Smooth Each Curve, Evaluate at Equally Spaced Timepoints, Store in a Matrix
smoothed.curves = matrix(0,nrow=length(seq(0,24)),ncol=nrow(Y))
for (i in 1:nrow(Y)) {
temp.smooth = smooth.basis(argvals=full.domain[!is.na(Y[i,])],y=Y[i,!is.na(Y[i,])],fdobj)
smoothed.curves[,i] = eval.fd(seq(0,24),temp.smooth$fd)
}
#Generate a Single Functional Data Object For Use in FPCA
Overall = smooth.basis(seq(0,24),smoothed.curves,fdobj)
## Recreate the Overlay Plot of All Profiles, Now with Smoothed Versions
plot(Overall$argvals,Overall$y[,1],xlab='Time (Hours)',ylab='Smoothed Material Characteristic',
ylim=c(min(Overall$y),max(Overall$y)),lty=1,lwd=4,col='gray',type='l',cex.lab = 1.5,cex.main=2,cex.axis=1.5)
for (j in 2:ncol(Overall$y)) {
lines(Overall$argvals,Overall$y[,j],lty=1,lwd=4,col='gray')
}
lines(Overall$argvals,Overall$y[,control],lty=1,lwd=4,col='black')
legend('topright',legend=c('Exp. Materials','Standard'),lty=1,lwd=4,col=c('gray','black'),bty='n',cex=2)
## For Illustration, Compare the Discrete and Smoothed Profiles for the Control Sample
plot(Overall$argvals,Overall$y[,control],xlab='Time (Hours)',ylab='Smoothed Material Characteristic',
ylim=c(min(Overall$y),max(Overall$y)),lty=1,lwd=4,col='black',type='l',cex.lab=1.5,cex.axis=1.5)
lines(full.domain[!is.na(Y[control,])],Y[control,!is.na(Y[control,])],col='red',lty=2,lwd=4)
legend('topright',legend=c('Discrete Curve','Smooth Curve'),lty=c(2,1),lwd=4,col=c('red','black'),bty='n',cex=2)
##### Section 3: Perform Functional Principal Components Analysis #####
pc = pca.fd(Overall$fd,nharm=2) #Perform FPCA and Store Top 2 PC Functions
#Create the Interpretable Plot of PC Functions (FPCs) (Note Not Presenting Actual FPCs)
par(mfrow=c(1,2))
plot(pc)
pc$varprop #Display Proportion of Variance Explained
#In a 2x2 Grid Plot the Mean Function, Actual FPCs, and FPC Scores ([2,2] Cell Empty)
par(mfrow=c(2,2))
PC.Matrix = eval.fd(seq(0,24),pc$harmonics)
mean.function = as.vector(eval.fd(seq(0,24),pc$meanfd))
plot(seq(0,24),mean.function,lwd=3,ylab='Material Characteristic',xlab='Time (Hours)',
main='Mean Function',col='black',ylim=c(0,100),type='l',cex.lab=1.5,cex.axis=1.5,cex.main=2)
plot(seq(0,24),PC.Matrix[,1],lwd=3,ylab='',xlab='Time (Hours)',main='PC Functions',col='blue',
ylim=range(PC.Matrix),type='l',cex.lab=1.5,cex.axis=1.5,cex.main=2)
lines(seq(0,24),PC.Matrix[,2],lwd=3,col='red',lty=2)
legend('bottomright',legend=c('FPC 1','FPC 2'),col=c('blue','red'),lty=c(1,2),lwd=2,bty='n',cex=1.5)
abline(h=0,lwd=2,lty=3)
#Plot PC Scores in 2-Dimensions, Highlighting Control Sample
plot(pc$scores,pch=16,cex=2,col='gray',main='FPC Scores',xlab='FPC Score 1',ylab='FPC Score 2',
cex.lab=1.5,cex.axis=1.5,cex.main=2)
points(pc$scores[control,1],pc$scores[control,2],pch=16,col='black',cex=2)
legend('top',legend=c('Exp. Materials','Standard'),pch=16,col=c('gray','black'),bty='n',cex=1.5)
##### Section 4: Clustering FPC Scores #####
#Apply K-Means Clustering with K=4 to the PC Scores
fit = kmeans(pc$scores,4)
#Update Scores Plot and Smooth Curve Overlay with Cluster Colors
colors = c('red','blue','green','orange')
par(mfrow=c(1,3))
x = seq(0,24)
plot(pc$scores,col=colors[fit$cluster],pch=16,cex=3,ylab='FPC Score 2',xlab='FPC Score 1',
main='Clustering FPC Scores',cex.lab=1.5,cex.axis=1.5,cex.main=2)
points(pc$scores[control,1],pc$scores[control,2],col='black',pch=16,cex=3)
plot(x,smoothed.curves[,1],type='l',col=colors[fit$cluster],lwd=3,ylim=c(1,100),
ylab='Material Characteristic',xlab='Time (Hours)',main='',cex.lab=1.5,cex.axis=1.5,cex.main=2)
for (j in 2:k) {
lines(x,smoothed.curves[,j],type='l',col=colors[fit$cluster[j]],lwd=3,ylim=c(0,1))
}
lines(x,smoothed.curves[,control],type='l',col='black',lwd=3,ylim=c(0,1))
legend('topright',legend=c('Standard',paste('Cluster:',1:4)),col=c('black',colors),lty=1,bty='n',cex=2,lwd=3)
#Suppose We Wanted to Plot the Nearest j Observed Curves Based on Euclidean Distance in 2-D Space
Plot.Closest = function(j) {
distances = c()
for (i in 1:k) distances[i] = sqrt(sum((pc$scores[control,] - pc$scores[i,])^2))
closest = seq(1:k)[order(distances)[2:(j+1)]]
plot(full.domain[!is.na(Y[closest[1],])],Y[closest[1],!is.na(Y[closest[1],])],type='l',
col=colors[fit$cluster[closest[1]]],
lwd=3,ylim=c(0,100),xlab='Time (Hours)',ylab='Material Characteristic',
main=paste('Closest',j,'Additives'),cex.lab=1.5,cex.axis=1.5,cex.main=2)
for (l in 2:length(closest)) {
lines(full.domain[!is.na(Y[closest[l],])],Y[closest[l],!is.na(Y[closest[l],])],lwd=2,
col=colors[fit$cluster[closest[l]]])
}
lines(full.domain[!is.na(Y[control,])],Y[control,!is.na(Y[control,])],lwd=3,col='black')
}
Plot.Closest(3)
##### Section 5: Exploration of Impact of Various FPC Scores on Functional Data Shape #####
#Store the Mean Function and PC Functions (Karhunen-Loeve Expansion Building Blocks)
mean.function = as.vector(eval.fd(x,pc$meanfd))
pc.functions = eval.fd(x,pc$harmonics) #When Used in this Way, Empirical Basis Functions
#Randomly Select 100 Pairs of New Scores (Restricted to the Range of Observed Scores)
new.scores = cbind(runif(100,min(pc$scores[,1]),max(pc$scores[,1])),runif(100,min(pc$scores[,2]),max(pc$scores[,2])))
new.curves = matrix(0,nrow=100,ncol=25)
#For the First Set of Scores, Plot the following in a 2 x 2 Plot Matrix
#The 1st FPC (Blue Solid Line) with FPC1 * New Score Overlaid (Dashed Blue Line)
#The 2st FPC (Red Solid Line) with FPC2 * New Score Overlaid (Dashed Red Line)
#The Scores Plot of Original Data with Added Point for New Score Pair (Maroon)
#The Resulting New Curve Based on Karhunen-Loeve (Maroon) Overlaid with the Mean Function
par(mfrow=c(2,2))
plot(seq(0,24),pc.functions[,1],type='l',lwd=3,col='blue',ylab='Percentage Performance',
xlab='Time (Hours)',main='PC Function 1',ylim=c(-20,30),cex.lab=1.5,cex.axis=1.5,cex.main=2)
lines(seq(0,24),pc.functions[,1] * new.scores[1,1],col='blue',lwd=3,lty=2)
plot(seq(0,24),pc.functions[,2],type='l',lwd=3,col='red',ylab='Percentage Performance',
xlab='Time (Hours)',main='PC Function 2',ylim=c(-8,8),cex.lab=1.5,cex.axis=1.5,cex.main=2)
lines(seq(0,24),pc.functions[,2] * new.scores[1,2],col='red',lwd=3,lty=2)
plot(pc$scores,col='gray',pch=16,cex=3,ylab='FPC Score 2',xlab='FPC Score 1',main='PC Scores',
cex.lab=1.5,cex.axis=1.5,cex.main=2)
points(pc$scores[control,1],pc$scores[control,2],col='black',pch=16,cex=3)
points(new.scores[1,1],new.scores[1,2],col='maroon',cex=3,pch=13)
new.curves[1,] = mean.function + pc.functions[,1] * new.scores[1,1] + pc.functions[,2] * new.scores[1,2]
plot(x,mean.function,ylim=c(0,100),type='l',lwd=3,ylab='Percentage Performance',xlab='Time (Hours)',
main='New Decay Curves',cex.main=2,cex.lab=1.5,cex.axis=1.5)
lines(x,new.curves[1,],col='maroon',lwd=3)
Sys.sleep(1)
#For the Remaining New Score Pairs, Progressively Build on Above Plot.
#All Results from Past Iterations are Changed to Light Gray Color.
#Current Iteration in Full Colors (Blue, Red, and Maroon Respectively)
for (l in 2:100) {
plot(x,pc.functions[,1],type='l',lwd=3,col='blue',ylab='Percentage Performance',
xlab='Time (Hours)',main='PC Function 1',ylim=c(-20,30),cex.lab=1.5,cex.axis=1.5,cex.main=2)
for (h in 1:(l-1)){
lines(x,pc.functions[,1] * new.scores[h,1],col='gray',lwd=2,lty=3)
}
lines(x,pc.functions[,1] * new.scores[l,1],col='blue',lwd=3,lty=2)
plot(x,pc.functions[,2],type='l',lwd=3,col='red',ylab='Percentage Performance',
xlab='Time (Hours)',main='PC Function 2',ylim=c(-8,8),cex.lab=1.5,cex.axis=1.5,cex.main=2)
for (h in 1:(l-1)){
lines(x,pc.functions[,2] * new.scores[h,2],col='gray',lwd=2,lty=3)
}
lines(x,pc.functions[,2] * new.scores[l,2],col='red',lwd=3,lty=2)
plot(pc$scores,col='gray',pch=16,cex=3,ylab='FPC Score 2',xlab='FPC Score 1',main='PC Scores',
cex.lab=1.5,cex.axis=1.5,cex.main=2)
points(pc$scores[control,1],pc$scores[control,2],col='black',pch=16,cex=3)
points(new.scores[1:(l-1),1],new.scores[1:(l-1),2],col='gray',cex=3,pch=1)
points(new.scores[l,1],new.scores[l,2],col='maroon',cex=3,pch=13)
new.curves[l,] = mean.function + pc.functions[,1] * new.scores[l,1] + pc.functions[,2] * new.scores[l,2]
plot(x,mean.function,ylim=c(0,100),type='l',lwd=3,ylab='Percentage Performance',xlab='Time (Hours)',
main='New Decay Curves',cex.lab=1.5,cex.axis=1.5,cex.main=2)
for (h in 1:(l-1)){
lines(x,new.curves[h,],col='gray',lwd=2,lty=3)
}
lines(x,new.curves[l,],col='maroon',lwd=3)
lines(x,mean.function,lwd=2)
Sys.sleep(1)
}
##### Section 6: Extension of Example to Functional Regression - FPCA Regression #####
#Note the Functional Regression Model We Fit Here is y ~ \int x(t) * beta(t) dt
# y = Scalar Response
# x(t) = Observed Decay Curves Defined Over Domain T
# beta(t) = Slope Function Defined Over Domain T
# Domain T from 0 to 24 Hours
#To Generate Scalar Response, Numerical Integration is Required and this Composite.Trapezoid Function is Used
composite.trapezoid <- function(x,y) {
n = length(x)
h <- (x[n] - x[1]) / n
approx = (h/2) * (y[1] + y[n] + 2*sum(y[2:(n-1)]))
return(approx)
}
#Suppose the True Slope Function is f(x) = sqrt(x) / 500
par(mfrow=c(1,2))
slope.fun = function(y) sqrt(y) / 500
#Simulate Responses Using True Slope Function, True Exponential Decay Function, and Random Normal Error
set.seed(23472)
response = c()
for (i in 1:k) {
response[i] = composite.trapezoid(x,slope.fun(x) * 100*exp(-coefs[i]*x)) + rnorm(1,0,.5)
}
#Plot the True Slope Function and the Smoothed Profiles Colored By Response Value
par(mfrow=c(1,2))
plot(seq(0,24,.01),slope.fun(seq(0,24,.01)),type='l',lwd=2,main='Slope Function',ylab='',xlab='Time (Hours)',
cex.axis=1.5,cex.lab=1.5,cex.main=2)
#Define a Color Spectrum Based on Low (Purple) to High (Red) Response Value
pal = colorRampPalette(colors=c('purple','blue','green','yellow','orange','red'))
order.for.coloring = findInterval(response,sort(response))
plot(x,smoothed.curves[,1],type='l',col=pal(length(response))[order.for.coloring[1]],lwd=3,ylim=c(1,100),
ylab='Material Characteristic',xlab='Time (Hours)',main='Application Performance',cex.lab=1.5,cex.axis=1.5,cex.main=2)
for (j in 2:k) {
lines(x,smoothed.curves[,j],type='l',col=pal(length(response))[order.for.coloring[j]],lwd=3,ylim=c(0,1))
}
legend('topright',legend=round(sort(response[order.for.coloring %in% round(seq(1,41,length.out=6))]),3),col=pal(6),
lty=1,bty='n',cex=1.25,lwd=3)
#To Perform FPCA Regression, Predict the Response Using FPC Scores in a Linear Model
par(mfrow=c(1,1))
pca.lm = lm(response ~ pc$scores)
summary(pca.lm) #Print Linear Model Summary
mean(abs(pca.lm$fitted.values - response))
##### Section 7: Extension of Example to Functional Regression - Functional Linear Model #####
#Store the Smoothed Decay Profiles as Matrix (Note We Need to Transform it)
model.matrix = as.matrix(t(Overall$y))
#Construct the Functional Linear Model - Estimate beta(t) Using 15 Cubic Regression Splines
FRModel = pfr(response ~ lf(X=model.matrix,argvals = seq(0,24),bs='cr',k=5))
summary(FRModel)
#Show the Actual by Predicted Plot For Both FPCR and FLM
plot(response,pca.lm$fitted.values,pch=16,cex=1.5,xlab='Actual Performance Property',
ylab='Predicted Performance Property',main='Functional Regression',
cex.lab=1.5,cex.axis=1.5,cex.main=2,col='red')
abline(a=0,b=1,lwd=2,col='darkgray')
points(response,FRModel$fitted.values,pch=0,cex=1.5)
legend('topleft',legend=c('FPCR','FLM'),col=c('red','black'),pch=c(16,0),cex=2,bty='n')
#Compare the Estimated Slope Function To the True Slope Function
par(mfrow=c(1,1))
plot(FRModel,lwd=3,ylab='',main='Slope Function',xlab='Time (Hours)',cex.lab=1.5,cex.axis=1.5,cex.main=2)
lines(x,slope.fun(x),col='red',lwd=3)
mean(abs(FRModel$fitted.values - response))
##### Section 8: Comparison of Functional Regression Models #####
#Comparison of Slope Functions Produced With Different Random Seed
#Repeat FLM Model Fitting Using Newly Simulated Data for Each Random Seed
#Store the Slope Function Estimate and MAE to Record Model Performance
set.seed(12945)
seeds = round(runif(10,1,100000),0)
maes = c()
Slope.Functions = list()
for (l in 1:length(seeds)) {
set.seed(seeds[l])
response = c()
for (i in 1:k) {
response[i] = composite.trapezoid(x,slope.fun(x) * 100*exp(-coefs[i]*x)) + rnorm(1,0,.5)
}
model.matrix = as.matrix(t(Overall$y))
Sim.FRModel = pfr(response ~ lf(model.matrix,argvals = x,bs='cr',k=15))
Slope.Functions[[l]] = cbind(plot(Sim.FRModel)[[1]]$x,plot(Sim.FRModel)[[1]]$fit)
maes[l] = mean(abs(Sim.FRModel$fitted.values - response))
}
#Overlay the Slope Functions for the Initial Model and Simulated Random Models with the True Slope Function
par(mfrow=c(1,1))
plot(FRModel,lwd=3,main='Slope Functions',ylab='',xlab='Time (Hours)',
cex.lab=1.5,cex.axis=1.5,cex.main=2,ylim=c(-.01,.02))
for (l in 1:length(seeds)) {
lines(Slope.Functions[[l]],lwd=1,lty=1,col='gray')
}
lines(x,slope.fun(x),col='red',lwd=3)
legend('topleft',legend=c('Initial Model','Initial Model CI','True Slope','Simulated Results'),
col=c('black','black','red','gray'),lty=c(1,2,1,1),lwd=c(3,3,3,1),bty='n')
#Compare FPCR Models with Only FPC1 and FPC1 + FPC2
par(mfrow=c(1,1))
pca.lm1 = lm(response ~ pc$scores[,1])
summary(pca.lm) #Print Linear Model Summary
#Show the Actual By Predicted Plot
plot(response,pca.lm1$fitted.values,pch=16,cex=2,xlab='Actual Performance Property',
ylab='Predicted Performance Property',main='FPC Regression',
cex.lab=1.5,cex.axis=1.5,cex.main=2)
points(response,pca.lm$fitted.values,pch=16,col='orange',cex=2)
abline(a=0,b=1,lwd=2,col='darkgray')
mean(abs(pca.lm1$fitted.values - response))
legend('topleft',legend=c('FPC1 & FPC2','FPC1 Only'),col=c('black','orange'),pch=16,cex=2,bty='n')
##### Section 9: Alternative Modeling Approaches (Original Data, No Missing Values) #####
#Regenerate the Data, but This Time Without Missing Values to Create a Level Playing Field.
set.seed(49183)
par(mfrow=c(1,2))
k = 41 #Total Number of Curves
#control = 11 #Specify the 11th Curve as a 'Control' for Purpose of Example
coefs = runif(k,.05,.2) #Random Coefficients for Exponential Functions
full.domain = c(0,1,2,4,6,8,12,16,20,24) #Domain Specification (Hours)
Y = matrix(0,nrow=k,ncol=length(full.domain))
Y[,1] = rep(100,k) #Require that All Curves Begin at 100% (no decay has occured)
#Generate Curve 1, Adding Random Normal Error: N(0,sd=3)
Y[1,2:ncol(Y)] = 100*exp(-coefs[1]*full.domain[2:ncol(Y)]) + rnorm(ncol(Y)-1,0,3)
Y[1,][Y[1,] < 0] = 0 #Ensure Non-Negative Points
#Plot the 1st Curve
plot(full.domain,Y[1,],type='l',col='gray',lwd=3,xlab='Time (Hours)',
ylab='Material Characteristic',cex.lab=1.5,cex.axis=1.5)
#Repeat the Above Steps for Curves 2 to 41
for (j in 2:k) {
Y[j,2:ncol(Y)] = 100*exp(-coefs[j]*full.domain[2:ncol(Y)]) + rnorm(ncol(Y)-1,0,3)
Y[j,][Y[j,] < 0] = 0
lines(full.domain,Y[j,],type='l',col='gray',lwd=3)
}
#To Differentiate the Control from the Other 40 Curves, Color it Black and Add a Legend
lines(full.domain,Y[control,],type='l',col='black',lwd=3)
legend('topright',legend=c('Exp. Materials','Standard'),lty=1,lwd=3,col=c('gray','black'),bty='n',cex=1.25)
set.seed(23472)
response = c()
for (i in 1:k) {
response[i] = composite.trapezoid(x,slope.fun(x) * 100*exp(-coefs[i]*x)) + rnorm(1,0,.5)
}
#Compute a Single Numerical Derivative Based On Rise/Run on Raw Data
#Fit a Simple Linear Model and Store the MAE
MAE = RMSE = c()
dv = (Y[,ncol(Y)] - Y[,1]) / (full.domain[length(full.domain)] - full.domain[1])
one.dv.model = lm(response ~ dv)
MAE[1] = mean(abs(response - one.dv.model$fitted.values))
RMSE[1] = sqrt(mean((response - one.dv.model$fitted.values)^2))
#Bin the Domain Into 4, 6 Hour Windows and Compute the Rise/Run Numerical Derivatives
#Fit a Multiple Regression Model with First Order Terms
#Note the Presence of Collinearity Amongst the Derivatives
#Also, Derivatives are Computed on the Smoothed Data for Simplicity
binned.dv = data.frame(matrix(0,nrow=nrow(Y),ncol=3))
binned.dv[,1] = (Y[,5] - Y[,1]) / (full.domain[5] - full.domain[1])
binned.dv[,2] = (Y[,8] - Y[,5]) / (full.domain[8] - full.domain[5])
binned.dv[,3] = (Y[,length(full.domain)] - Y[,8]) / (full.domain[length(full.domain)] - full.domain[8])
colnames(binned.dv) = c('0-6 Hours','6-16 Hours','16-24 Hours')
binned.dv.model = lm(response ~ as.matrix(binned.dv))
MAE[2] = mean(abs(response - binned.dv.model$fitted.values))
RMSE[2] = sqrt(mean((response - binned.dv.model$fitted.values)^2))
#Compute the Area Under the Curve (Larger Value Implies Slower Decay) Using the Raw Data
#Fit a Quadratic Model to Predict Response
AUCs = c()
for (l in 1:nrow(Y)) AUCs[l] = composite.trapezoid(full.domain,Y[l,])
auc.model = lm(response ~ AUCs)
MAE[3] = mean(abs(response - auc.model$fitted.values))
RMSE[3] = sqrt(mean((response - auc.model$fitted.values)^2))
# Apply Both PC Regression and PLS, Selecting 2 Components Each Time
# Note, Both Methods Only Possible on the Smoothed Data
pcr.model = pcr(response ~ Y,validation = 'CV')
MAE[4] = mean(abs(response - pcr.model$fitted.values[,,2]))
RMSE[4] = sqrt(mean((response - pcr.model$fitted.values[,,2])^2))
#Utilize the Exponential Decay Function as a Parametric Basis Function
#Estimate the Unknown Parameter From the Raw Data
#Fit a Quadratic Model to Predict Response from Fit Parameter
fit.exp = function(x,y) {
MSE = function(alpha) {
return(sum((y - 100*exp(-alpha*x))^2))
}
return(optimize(MSE,interval=c(.01,2),maximum=F)$minimum)
}
fit.parms = c()
for (l in 1:nrow(Y)) fit.parms[l] = fit.exp(full.domain,Y[l,])
exp.parm.model = lm(response ~ fit.parms + I(fit.parms^2))
MAE[5] = mean(abs(response - exp.parm.model$fitted.values))
RMSE[5] = sqrt(mean((response - exp.parm.model$fitted.values)^2))
#Store the MAE and Relative Errors for the FPCR and FLM Models
# Convert the Discrete Curves to Smooth Curves Using Cubic B-Splines
basis = create.bspline.basis(c(0,24),norder=4,nbasis=20) #Construct Basis Functions
fdobj = fdPar(basis,Lfdobj=2,lambda=10) #Define Functional Parameter with Roughness Penalty
Overall = smooth.basis(full.domain,t(Y),fdobj)
smoothed.curves = eval.fd(seq(0,24),Overall$fd)
pal = colorRampPalette(colors=c('purple','blue','green','yellow','orange','red'))
order.for.coloring = findInterval(response,sort(response))
plot(seq(0,24),smoothed.curves[,1],type='l',col=pal(length(response))[order.for.coloring[1]],lwd=3,ylim=c(1,100),
ylab='Material Characteristic',xlab='Time (Hours)',main='Application Performance',cex.lab=1.5,cex.axis=1.5,cex.main=2)
for (j in 2:k) {
lines(seq(0,24),smoothed.curves[,j],type='l',col=pal(length(response))[order.for.coloring[j]],lwd=3,ylim=c(0,1))
}
legend('topright',legend=round(sort(response[order.for.coloring %in% round(seq(1,41,length.out=6))]),3),col=pal(6),
lty=1,bty='n',cex=1.25,lwd=3)
pc = pca.fd(Overall$fd,nharm=2) #Perform FPCA and Store Top 2 PC Functions
par(mfrow=c(1,1))
pca.lm = lm(response ~ pc$scores)
summary(pca.lm) #Print Linear Model Summary
MAE[6] = mean(abs(pca.lm$fitted.values - response))
RMSE[6] = sqrt(mean((response - pca.lm$fitted.values)^2))
model.matrix = as.matrix(t(Overall$y))
FRModel = pfr(response ~ lf(model.matrix,argvals = full.domain,bs='cr',k=5))
summary(FRModel)
MAE[7] = mean(abs(FRModel$fitted.values - response))
RMSE[7] = sqrt(mean((response - FRModel$fitted.values)^2))
MAE = round(MAE,3)
RMSE = round(RMSE,3)
Methods = c('One Derivative','Binned Derivatives','AUCs','PCR','Exp. Decay','FPCR','FLM')
kable(cbind(Methods,MAE,RMSE),align='c')
##### Section 10: Alternative Modeling Approaches (Extreme Example 10 Curves with 25 Measurements) #####
set.seed(49183)
par(mfrow=c(1,2))
k = 10
coefs = runif(k,.05,.2)
full.domain = seq(0,24)
Y = matrix(0,nrow=k,ncol=length(full.domain))
Y[,1] = rep(100,k)
Y[1,2:ncol(Y)] = 100*exp(-coefs[1]*full.domain[2:ncol(Y)]) + rnorm(ncol(Y)-1,0,3)
Y[1,][Y[1,] < 0] = 0
plot(full.domain,Y[1,],type='l',col='gray',lwd=3,xlab='Time (Hours)',
ylab='Material Characteristic',cex.lab=1.5,cex.axis=1.5)
for (j in 2:k) {
Y[j,2:ncol(Y)] = 100*exp(-coefs[j]*full.domain[2:ncol(Y)]) + rnorm(ncol(Y)-1,0,3)
Y[j,][Y[j,] < 0] = 0
lines(full.domain,Y[j,],type='l',col='gray',lwd=3)
}
set.seed(23472)
response = c()
for (i in 1:k) {
response[i] = composite.trapezoid(x,slope.fun(x) * 100*exp(-coefs[i]*x)) + rnorm(1,0,.5)
}
MAE = RMSE = c()
dv = (Y[,ncol(Y)] - Y[,1]) / (full.domain[length(full.domain)] - full.domain[1])
one.dv.model = lm(response ~ dv)
MAE[1] = mean(abs(response - one.dv.model$fitted.values))
RMSE[1] = sqrt(mean((response - one.dv.model$fitted.values)^2))
binned.dv = data.frame(matrix(0,nrow=nrow(Y),ncol=3))
binned.dv[,1] = (Y[,9] - Y[,1]) / (full.domain[9] - full.domain[1])
binned.dv[,2] = (Y[,17] - Y[,9]) / (full.domain[17] - full.domain[9])
binned.dv[,3] = (Y[,length(full.domain)] - Y[,17]) / (full.domain[length(full.domain)] - full.domain[17])
colnames(binned.dv) = c('0-6 Hours','6-16 Hours','16-24 Hours')
binned.dv.model = lm(response ~ as.matrix(binned.dv))
MAE[2] = mean(abs(response - binned.dv.model$fitted.values))
RMSE[2] = sqrt(mean((response - binned.dv.model$fitted.values)^2))
AUCs = c()
for (l in 1:nrow(Y)) AUCs[l] = composite.trapezoid(full.domain,Y[l,])
auc.model = lm(response ~ AUCs)
MAE[3] = mean(abs(response - auc.model$fitted.values))
RMSE[3] = sqrt(mean((response - auc.model$fitted.values)^2))
pcr.model = pcr(response ~ Y,validation = 'CV')
MAE[4] = mean(abs(response - pcr.model$fitted.values[,,2]))
RMSE[4] = sqrt(mean((response - pcr.model$fitted.values[,,2])^2))
fit.exp = function(x,y) {
MSE = function(alpha) {
return(sum((y - 100*exp(-alpha*x))^2))
}
return(optimize(MSE,interval=c(.01,2),maximum=F)$minimum)
}
fit.parms = c()
for (l in 1:nrow(Y)) fit.parms[l] = fit.exp(full.domain,Y[l,])
exp.parm.model = lm(response ~ fit.parms + I(fit.parms^2))
MAE[5] = mean(abs(response - exp.parm.model$fitted.values))
RMSE[5] = sqrt(mean((response - exp.parm.model$fitted.values)^2))
basis = create.bspline.basis(c(0,24),norder=4,nbasis=20)
fdobj = fdPar(basis,Lfdobj=2,lambda=10)
Overall = smooth.basis(full.domain,t(Y),fdobj)
pc = pca.fd(Overall$fd,nharm=2)
pca.lm = lm(response ~ pc$scores)
summary(pca.lm)
MAE[6] = mean(abs(pca.lm$fitted.values - response))
RMSE[6] = sqrt(mean((response - pca.lm$fitted.values)^2))
model.matrix = as.matrix(t(Overall$y))
FRModel = pfr(response ~ lf(model.matrix,argvals = full.domain,bs='cr',k=5))
summary(FRModel)
MAE[7] = mean(abs(FRModel$fitted.values - response))
RMSE[7] = sqrt(mean((response - FRModel$fitted.values)^2))
smoothed.curves = eval.fd(seq(0,24),Overall$fd)
pal = colorRampPalette(colors=c('purple','blue','green','yellow','orange','red'))
order.for.coloring = findInterval(response,sort(response))
plot(seq(0,24),smoothed.curves[,1],type='l',col=pal(length(response))[order.for.coloring[1]],lwd=3,ylim=c(1,100),
ylab='Material Characteristic',xlab='Time (Hours)',main='Application Performance',cex.lab=1.5,cex.axis=1.5,cex.main=2)
for (j in 2:k) {
lines(seq(0,24),smoothed.curves[,j],type='l',col=pal(length(response))[order.for.coloring[j]],lwd=3,ylim=c(0,1))
}
legend('topright',legend=round(sort(response[order.for.coloring %in% round(seq(1,k,length.out=6))]),3),col=pal(6),
lty=1,bty='n',cex=1.25,lwd=3)
MAE = round(MAE,3)
RMSE = round(RMSE,3)
Methods = c('One Derivative','Binned Derivatives','AUCs','PCR','Exp. Decay','FPCR','FLM')
kable(cbind(Methods,MAE,RMSE),align='c')