### A Pluto.jl notebook ###
# v0.19.22
using Markdown
using InteractiveUtils
# ╔═╡ 013c7dac-a8c6-4fff-8837-e4d6751f3fb6
begin
    using AbstractTrees, PlutoUI, Reexport
    include("../lib/PlutoLecturing/src/PlutoLecturing.jl")
    using .PlutoLecturing
end
# ╔═╡ 1ed6d587-b984-463f-9db3-220bdfa81ebe
using Plots
# ╔═╡ f020f39c-e756-4c09-8ab3-7d10e78ca43e
using Optim
# ╔═╡ 4136799e-c4ab-432b-9e8a-208b0eae060e
using ForwardDiff
# ╔═╡ 4b3de811-278b-4154-bce6-4d67dc19ff38
using Luxor
# ╔═╡ 6644b213-63ed-4814-b034-0b39caa04f23
md"""
# Announcements
1. How to avoid PR like: [https://github.com/GiggleLiu/ModernScientificComputing/pull/27](https://github.com/GiggleLiu/ModernScientificComputing/pull/27). Help desk event (Saturday night, hosted by Yusheng Zhao) will be announced in the `#coding-club` stream?
2. Why we should never add a big file into a Github repo?
2. The ChatGPT bot in [HKUST-GZ Zulip workspace](http://zulip.hkust-gz.edu.cn/), `@ChatGPT` (user stream: `#chatgpt-discussion`), credit Yijie Xu, Github repo: [https://github.com/yeahjack/chatgpt_zulip_bot](https://github.com/yeahjack/chatgpt_zulip_bot).
3. Have to cancel the next lecture, we will not have final exam!
"""
# ╔═╡ c85bdb41-ce56-48ce-a82c-6599ccbc446c
LocalResource("images/chatgpt.png")
# ╔═╡ 3dc062b0-30d0-4332-9047-0852671cb031
TableOfContents()
# ╔═╡ 7345a2c1-d4b7-470f-a936-b4cc6c177399
md"# Optimization"
# ╔═╡ 7c26b07f-6a1f-4d40-9152-b8738a1dad62
md"Reference: **Scientific Computing - Chapter 6**"
# ╔═╡ 6b4307ea-33c0-4a20-af35-d05262856528
md"""A general continous optimization problem has the following form
```math
\min_{\mathbf x}f(\mathbf x)~~~\text{ subject to certian constraints}
```
The constraints may be either equality or inequality constraints.
"""
# ╔═╡ 647a7101-ebdc-4717-a343-22edee8a7d92
md"""
# Gradient free optimization
"""
# ╔═╡ f784dc43-5058-4375-9040-f343c346cff8
md"""
Gradient-free optimizers are optimization algorithms that do not rely on the gradient of the objective function to find the optimal solution. Instead, they use other methods such as random search, genetic algorithms, or simulated annealing to explore the search space and find the optimal solution. These methods are particularly useful when the objective function is non-differentiable or when the gradient is difficult to compute. However, gradient-free optimizers can be $(PlutoLecturing.highlight("slower and less efficient than gradient-based methods")), especially when the search space is high-dimensional.
There are several popular gradient-free optimizers, including:
* **Genetic algorithms**: These are optimization algorithms inspired by the process of natural selection. They use a population of candidate solutions and apply genetic operators such as crossover and mutation to generate new solutions.
* **Simulated annealing**: This is a probabilistic optimization algorithm that uses a temperature parameter to control the probability of accepting a worse solution. It starts with a high temperature that allows for exploration of the search space and gradually decreases the temperature to converge to the optimal solution. A minimal sketch is given after this list.
* **Particle swarm optimization**: This is a population-based optimization algorithm that simulates the movement of particles in a search space. Each particle represents a candidate solution, and they move towards the best solution found so far.
* **Bayesian optimization**: This is a probabilistic optimization algorithm that uses a probabilistic model to approximate the objective function and guides the search towards promising regions of the search space.
* **Nelder-Mead algorithm**: This is a direct search method that does not require the computation of gradients of the objective function. Instead, it uses a simplex (a geometric figure that generalizes the concept of a triangle to higher dimensions) to iteratively explore the search space and improve the objective function value. The Nelder-Mead algorithm is particularly effective in optimizing nonlinear and non-smooth functions, and it is widely used in engineering, physics, and other fields.
"""
# ╔═╡ 1a959468-3add-4e34-a9fe-54ca67a12894
md"NOTE: [Optim.jl documentation](https://julianlsolvers.github.io/Optim.jl/stable/) contains more detailed introduction of gradient free, gradient based and hessian based optimizers."
# ╔═╡ 90472087-2580-496e-989e-e4fa58fd1e17
md"""
## The downhill simplex method
"""
# ╔═╡ 0c5dd6d1-20cd-4ec6-9017-80dbc705b75d
md"""
Here are the steps involved in the one-dimensional downhill simplex algorithm:
1. Initialize a one-dimensional simplex, evaluate the function at the end points $x_1$ and $x_2$, and assume $f(x_2) > f(x_1)$.
2. Evaluate the function at $x_c = 2x_1 - x_2$.
3. Select one of the following operations
    1. If $f(x_c)$ is smaller than $f(x_1)$, **flip** the simplex by updating $(x_1, x_2) \leftarrow (x_c, x_1)$.
    2. If $f(x_c)$ is larger than $f(x_1)$ but smaller than $f(x_2)$, then $x_2\leftarrow x_c$ and go to case 3.
    3. If $f(x_c)$ is larger than $f(x_2)$, then **shrink** the simplex: evaluate $f$ at $x_d = (x_1 + x_2)/2$; if it is larger than $f(x_1)$, then $x_2 \leftarrow x_d$, otherwise $(x_1, x_2) \leftarrow (x_d, x_1)$.
4. Repeat steps 2-3 until convergence.
"""
# ╔═╡ e6ab2552-6597-4bef-a9c1-e0cea14b0f67
function simplex1d(f, x1, x2; tol=1e-6)
    # initial simplex
    history = [[x1, x2]]
    f1, f2 = f(x1), f(x2)
    while abs(x2 - x1) > tol
        xc = 2x1 - x2
        fc = f(xc)
        if fc < f1 # flip
            x1, f1, x2, f2 = xc, fc, x1, f1
        else # shrink
            if fc < f2 # let the smaller one be x2.
                x2, f2 = xc, fc
            end
            xd = (x1 + x2) / 2
            fd = f(xd)
            if fd < f1 # update x1 and x2
                x1, f1, x2, f2 = xd, fd, x1, f1
            else
                x2, f2 = xd, fd
            end
        end
        push!(history, [x1, x2])
    end
    return x1, f1, history
end
# ╔═╡ d16748d2-e157-43d2-b6fe-69cbe4a5dd9c
simplex1d(x->x^2, -1.0, 6.0)
# ╔═╡ 66416807-bcb0-4881-90f8-1595756e4a6f
md"""
The Nelder-Mead method is well summarized in this [wiki page](https://en.wikipedia.org/wiki/Nelder%E2%80%93Mead_method).
Here is a Julia implementation:
"""
# ╔═╡ 43486a15-7582-4532-b2ac-73db4bb07f97
function simplex(f, x0; tol=1e-6, maxiter=1000)
    n = length(x0)
    x = zeros(n+1, n)
    fvals = zeros(n+1)
    x[1,:] = x0
    fvals[1] = f(x0)
    alpha = 1.0
    beta = 0.5
    gamma = 2.0
    for i in 1:n
        x[i+1,:] = x[1,:]
        x[i+1,i] += 1.0
        fvals[i+1] = f(x[i+1,:])
    end
    history = [x]
    for iter in 1:maxiter
        # Sort the vertices by function value
        order = sortperm(fvals)
        x = x[order,:]
        fvals = fvals[order]
        # Calculate the centroid of the n best vertices
        xbar = dropdims(sum(x[1:n,:], dims=1) ./ n, dims=1)
        # Reflection
        xr = xbar + alpha*(xbar - x[n+1,:])
        fr = f(xr)
        if fr < fvals[1]
            # Expansion
            xe = xbar + gamma*(xr - xbar)
            fe = f(xe)
            if fe < fr
                x[n+1,:] = xe
                fvals[n+1] = fe
            else
                x[n+1,:] = xr
                fvals[n+1] = fr
            end
        elseif fr < fvals[n]
            x[n+1,:] = xr
            fvals[n+1] = fr
        else
            # Contraction
            if fr < fvals[n+1]
                xc = xbar + beta*(x[n+1,:] - xbar)
                fc = f(xc)
                if fc < fr
                    x[n+1,:] = xc
                    fvals[n+1] = fc
                else
                    # Shrink
                    for i in 2:n+1
                        x[i,:] = x[1,:] + beta*(x[i,:] - x[1,:])
                        fvals[i] = f(x[i,:])
                    end
                end
            else
                # Shrink
                for i in 2:n+1
                    x[i,:] = x[1,:] + beta*(x[i,:] - x[1,:])
                    fvals[i] = f(x[i,:])
                end
            end
        end
        push!(history, x)
        # Check for convergence (compare each vertex with the best vertex, hence the 1×n slice)
        if maximum(abs.(x[2:end,:] .- x[1:1,:])) < tol && maximum(abs.(fvals[2:end] .- fvals[1])) < tol
            break
        end
    end
    # Return the best vertex and function value
    bestx = x[1,:]
    bestf = fvals[1]
    return (bestx, bestf, history)
end
# ╔═╡ 962f24ff-2adc-41e3-bc5c-2f78952950d2
md"""
The `simplex` function takes two positional arguments, the objective function `f` and the initial guess `x0`, plus keyword arguments for the tolerance `tol` and the maximum number of iterations `maxiter`.
The algorithm initializes a simplex (a high-dimensional triangle) with `n+1` vertices, where `n` is the number of dimensions of the problem. The vertices are initially set to `x0` and `x0 + h_i`, where `h_i` is a step of size 1 along the `i`th dimension. The function values at the vertices are also calculated.
The algorithm then iteratively performs **reflection**, **expansion**, **contraction**, and **shrink** operations on the simplex until convergence is achieved. The best vertex and function value are returned.
"""
# ╔═╡ 1024bec9-6741-4cf6-a214-9777c17d2348
md"""
We use the [Rosenbrock function](https://en.wikipedia.org/wiki/Rosenbrock_function) as the test function.
"""
# ╔═╡ f859faaa-b24e-4af2-8b72-85f2d753e87a
function rosenbrock(x)
    (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
end
# ╔═╡ c16945b6-397e-4cc2-9f74-9c0e6eb21a8c
let
    x = -2:0.01:2
    y = -2:0.01:2
    f = [rosenbrock((a, b)) for b in y, a in x]
    heatmap(x, y, log.(f); label="log(f)", xlabel="x₁", ylabel="x₂")
end
# ╔═╡ 81b628ce-91b1-4b63-aca1-96e10c49d7b9
function show_triangles(history)
    x = -2:0.02:2
    y = -2:0.02:2
    f = [rosenbrock((a, b)) for b in y, a in x]
    @gif for item in history
        plt = heatmap(x, y, log.(f); label="log(f)", xlabel="x₁", ylabel="x₂", xlim=(-2, 2), ylim=(-2, 2))
        plot!(plt, [item[:,1]..., item[1,1]], [item[:,2]..., item[1, 2]]; label="", color="white")
    end fps=5
end
# ╔═╡ 44f3252c-cb3b-458f-b3b9-a87ce9f6e0e2
let
    bestx, bestf, history = simplex(rosenbrock, [-1.2, -1.0]; tol=1e-3)
    @info "converged in $(length(history)) steps, with error $bestf"
    show_triangles(history)
end
# ╔═╡ c1c02449-50e7-4dc4-8807-a31b10624921
let
    # Set the initial guess
    x0 = [-1, -1.0]
    # Set the optimization options
    options = Optim.Options(iterations = 1000)
    # Optimize the Rosenbrock function using the simplex method
    result = optimize(rosenbrock, x0, NelderMead(), options)
    # Print the optimization result
    result
end
# ╔═╡ 50f91cd8-c74b-11ed-0b94-cdeb596bf99b
md"# Gradient based optimization"
# ╔═╡ 211b6c56-f16b-47fb-ba56-4534ac41be95
md"""
If $f: \mathbb{R}^n \rightarrow \mathbb{R}$ is differentiable, then the vector-valued function $\nabla f: \mathbb{R}^n \rightarrow \mathbb{R}^n$ defined by
```math
\nabla f(x) = \left(\begin{matrix}
\frac{\partial f(\mathbf{x})}{\partial x_1}\\
\frac{\partial f(\mathbf{x})}{\partial x_2}\\
\vdots\\
\frac{\partial f(\mathbf{x})}{\partial x_n}\\
\end{matrix}\right)
```
is called the gradient of $f$.
"""
# ╔═╡ eec10666-102c-44b6-a562-75ba78428d1e
md"""
Gradient descent is based on the observation that moving $\mathbf x$ slightly along the negative gradient direction always decreases $f$ to first order in the perturbation:
```math
f(\mathbf{x} - \epsilon \nabla f(\mathbf x)) \approx f(\mathbf x) - \epsilon \nabla f(\mathbf x)^T \nabla f(\mathbf x) = f(\mathbf x) - \epsilon \|\nabla f(\mathbf x)\|_2^2 < f(\mathbf{x})
```
"""
# ╔═╡ d59aaccf-1435-4c7b-998e-5dfb174990c3
md"""## Gradient descent
In each iteration, the update rule of the gradient descent method is
```math
\begin{align}
&\theta_{t+1} = \theta_t - \alpha g_t
\end{align}
```
where
* $\theta_t$ is the value of the variables at time step $t$.
* $g_t$ is the gradient at time $t$ along $\theta_t$, i.e. $\nabla_{\theta_t} f(\theta_t)$.
* $\alpha$ is the learning rate.
"""
# ╔═╡ 10283699-2ebc-4e61-89e8-5585a2bf054d
md"One can obtain the gradient with `ForwardDiff`."
# ╔═╡ 3e616d99-7d57-4926-b1d5-dae47dd040e9
ForwardDiff.gradient(rosenbrock, [1.0, 3.0])
# ╔═╡ 004a79b8-afbd-40b7-9c67-a8e864432179
function gradient_descent(f, x; niters::Int, learning_rate::Real)
    history = [x]
    for i = 1:niters
        g = ForwardDiff.gradient(f, x)
        x -= learning_rate * g
        push!(history, x)
    end
    return history
end
# ╔═╡ 599b0416-d1e7-4e92-8604-80bda896d88b
function show_history(history)
    x = -2:0.01:2
    y = -2:0.01:2
    f = [rosenbrock((a, b)) for b in y, a in x]
    plt = heatmap(x, y, log.(f); label="log(f)", xlabel="x₁", ylabel="x₂", xlim=(-2, 2), ylim=(-2, 2))
    plot!(plt, getindex.(history, 1), getindex.(history, 2); label="optimization", color="white")
end
# ╔═╡ 8d0dbc03-f927-4229-a8ce-6196cb62bde2
let
    x0 = [-1, -1.0]
    history = gradient_descent(rosenbrock, x0; niters=10000, learning_rate=0.002)
    @info rosenbrock(history[end])
    # plot
    show_history(history)
end
# ╔═╡ 95ac921c-20c6-432e-8a70-006804d6b0da
md"""
The problem with gradient descent: it is easily trapped in plateaus.
"""
# ╔═╡ 3212c3f2-f3c3-4403-9584-85386db6825c
md"""
## Gradient descent with momentum
We can add a "momentum" term to the weight update, which helps the optimization algorithm to move more quickly in the right direction and avoid getting stuck in local minima.
The intuition behind the momentum method can be understood by considering a ball rolling down a hill. Without momentum, the ball would roll down the hill and eventually come to a stop at the bottom. However, with momentum, the ball would continue to roll past the bottom of the hill and up the other side, before eventually coming to a stop at a higher point. This is because the momentum of the ball helps it to overcome small bumps and obstacles in its path and continue moving in the right direction.
In each iteration, the update rule of gradient descent method with momentum is
```math
\begin{align}
&v_{t+1} = \beta v_t - \alpha g_t\\
&\theta_{t+1} = \theta_t + v_{t+1}
\end{align}
```
where
* $g_t$ is the gradient at time $t$ along $\theta_t$, i.e. $\nabla_{\theta_t} f(\theta_t)$.
* $\alpha$ is the initial learning rate.
* $\beta$ is the parameter for the gradient accumulation.
"""
# ╔═╡ 50feee93-6a1f-4256-822f-88ceede1bbec
function gradient_descent_momentum(f, x; niters::Int, β::Real, learning_rate::Real)
    history = [x]
    v = zero(x)
    for i = 1:niters
        g = ForwardDiff.gradient(f, x)
        v = β .* v .- learning_rate .* g
        x += v
        push!(history, x)
    end
    return history
end
# ╔═╡ 859705f4-9b5f-4403-a9ef-d21e3cbd0b06
let
    x0 = [-1, -1.0]
    history = gradient_descent_momentum(rosenbrock, x0; niters=10000, learning_rate=0.002, β=0.5)
    @info rosenbrock(history[end])
    # plot
    show_history(history)
end
# ╔═╡ e2a17f45-cc79-485b-abf2-fbc81a66f908
md"""
The problem with the momentum-based method: it easily overshoots.
Moreover, it is not **scale-invariant**.
"""
# ╔═╡ 5b911473-57b8-46a4-bdec-6f1960c1e958
md"""
## AdaGrad
AdaGrad is an optimization algorithm used in machine learning for solving convex optimization problems. It is a gradient-based algorithm that adapts the learning rate for each parameter based on the historical gradient information. The main idea behind AdaGrad is to give more weight to the parameters that have a smaller gradient magnitude, which allows for a larger learning rate for those parameters.
"""
# ╔═╡ 29ed621d-0505-4f8c-88fc-642a3ddf3ae8
md"""
In each iteration, the update rule of AdaGrad is
```math
\begin{align}
&r_t = r_{t-1} + g_t^2\\
&\mathbf{\eta} = \frac{\alpha}{\sqrt{r_t + \epsilon}}\\
&\theta_{t+1} = \theta_t - \eta \odot g_t
\end{align}
```
where
* $\theta_t$ is the value of the variables at time $t$.
* $\alpha$ is the initial learning rate.
* $g_t$ is the gradient at time $t$ along $\theta_t$
* $r_t$ is the historical squared gradient sum, which is initialized to $0$.
* $\epsilon$ is a small positive number.
* $\odot$ is the element-wise multiplication.
"""
# ╔═╡ 50b8dcf3-9cec-4487-8a57-249c213caa21
function adagrad_optimize(f, x; niters, learning_rate, ϵ=1e-8)
    rt = zero(x)
    η = zero(x)
    history = [x]
    for step in 1:niters
        Δ = ForwardDiff.gradient(f, x)
        @. rt = rt + Δ^2
        @. η = learning_rate / sqrt(rt + ϵ)
        x = x .- Δ .* η
        push!(history, x)
    end
    return history
end
# ╔═╡ 09f542be-490a-42bc-81ac-07a318371ca3
let
    x0 = [-1, -1.0]
    history = adagrad_optimize(rosenbrock, x0; niters=10000, learning_rate=1.0)
    @info rosenbrock(history[end])
    # plot
    show_history(history)
end
# ╔═╡ a4b4bc11-efb2-46db-b256-ba9632fadd7b
md"## Adam
The Adam optimizer is a popular optimization algorithm used in deep learning for training neural networks. It stands for Adaptive Moment Estimation and is a variant of stochastic gradient descent (SGD) that is designed to be more efficient and effective in finding the optimal weights for the neural network.
The Adam optimizer maintains a running estimate of the first and second moments of the gradients of the weights with respect to the loss function. These estimates are used to adaptively adjust the learning rate for each weight parameter during training. The first moment estimate is the mean of the gradients, while the second moment estimate is the uncentered variance of the gradients.
The Adam optimizer combines the benefits of two other optimization algorithms: AdaGrad, which adapts the learning rate based on the historical gradient information, and RMSProp, which uses a moving average of the squared gradients to scale the learning rate.
The Adam optimizer has become a popular choice for training deep neural networks due to its fast convergence and good generalization performance. It is widely used in many deep learning frameworks, such as TensorFlow, PyTorch, and Keras.
"
# ╔═╡ 8b74adc9-9e82-46db-a87e-a72928aff03f
md"""
In each iteration, the update rule of Adam is
```math
\begin{align}
&v_t = \beta_1 v_{t-1} + (1-\beta_1) g_t\\
&s_t = \beta_2 s_{t-1} + (1-\beta_2) g_t^2\\
&\hat v_t = v_t / (1-\beta_1^t)\\
&\hat s_t = s_t / (1-\beta_2^t)\\
&\theta_{t+1} = \theta_t -\eta \frac{\hat v_t}{\sqrt{\hat s_t} + \epsilon}
\end{align}
```
where
* $\theta_t$ is the value of the variables at time $t$.
* $\eta$ is the initial learning rate.
* $g_t$ is the gradient at time $t$ along $\theta$.
* $v_t$ is the exponential average of gradients along $\theta$.
* $s_t$ is the exponential average of squares of gradients along $\theta$.
* $\beta_1, \beta_2$ are hyperparameters.
"""
# ╔═╡ 6dc932f9-d37b-4cc7-90d9-1ed0bae20c41
function adam_optimize(f, x; niters, learning_rate, β1=0.9, β2=0.999, ϵ=1e-8)
    mt = zero(x)
    vt = zero(x)
    βp1 = β1
    βp2 = β2
    history = [x]
    for step in 1:niters
        Δ = ForwardDiff.gradient(f, x)
        @. mt = β1 * mt + (1 - β1) * Δ
        @. vt = β2 * vt + (1 - β2) * Δ^2
        @. Δ = mt / (1 - βp1) / (√(vt / (1 - βp2)) + ϵ) * learning_rate
        βp1, βp2 = βp1 * β1, βp2 * β2
        x = x .- Δ
        push!(history, x)
    end
    return history
end
# ╔═╡ 8086d721-3360-4e66-b898-b3f1fb9e4392
let
    x0 = [-1, -1.0]
    history = adam_optimize(rosenbrock, x0; niters=10000, learning_rate=0.01)
    @info rosenbrock(history[end])
    # plot
    show_history(history)
end
# ╔═╡ ce076be7-b930-4d35-b594-1009c87213a6
md"""
## The Julia package `Optimisers.jl`
"""
# ╔═╡ 6bc59fbc-14ae-4e07-8390-3195182e17a5
import Optimisers
# ╔═╡ ec098a81-5032-4d43-85bf-a51b0a19e94e
PlutoLecturing.@xbind gradient_based_optimizer Select(["Descent", "Momentum", "Nesterov", "Rprop", "RMSProp", "Adam", "RAdam", "AdaMax", "OAdam", "AdaGrad", "AdaDelta", "AMSGrad", "NAdam", "AdamW", "AdaBelief"])
# ╔═╡ 6778a8ce-73a7-40e8-92bc-c9491daa9012
PlutoLecturing.@xbind learning_rate NumberField(0:1e-4:1.0, default=1e-4)
# ╔═╡ dcf4a168-1c11-45f2-b7e7-15d6fb4becea
md"The different optimizers are introduced in the [documentation page](https://fluxml.ai/Optimisers.jl/dev/api/)"
# ╔═╡ 77761a71-65ea-434b-a697-d86278d10abd
let
    x0 = [-1, -1.0]
    method = eval(:(Optimisers.$(Symbol(gradient_based_optimizer))(learning_rate)))
    state = Optimisers.setup(method, x0)
    history = [x0]
    for i = 1:10000
        grad = ForwardDiff.gradient(rosenbrock, x0)
        state, x0 = Optimisers.update(state, x0, grad)
        push!(history, x0)
    end
    @info rosenbrock(history[end])
    # plot
    show_history(history)
end
# ╔═╡ 6bee88e1-d641-4a28-9794-6031e721a79c
md"""[Optimisers.jl documentation](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisation-Rules) contains **stochastic** gradient based optimizers.
"""
# ╔═╡ cbdb2ad6-2a7d-4ccb-89f9-5ec836612477
md"""
# Hessian based optimizers
"""
# ╔═╡ 66cbb653-b06d-4b06-880b-f402fd9f1d53
md"## Newton's Method"
# ╔═╡ 5af90753-f9cc-437f-9b11-98618fdb67aa
md"""
Newton's method is an optimization algorithm used to find the roots of a function, which can also be used to find the minimum or maximum of a function. The method involves using the first and second derivatives of the function to approximate the function as a quadratic function and then finding the minimum or maximum of this quadratic function. The minimum or maximum of the quadratic function is then used as the next estimate for the minimum or maximum of the original function, and the process is repeated until convergence is achieved.
"""
# ╔═╡ e80a6b59-84d4-40e9-bf1b-08719016745e
md"""
```math
\begin{align}
& H_{k}p_{k}=-g_k\\
& x_{k+1}=x_{k}+p_k
\end{align}
```
where
* $g_k$ is the gradient of $f$ at $x_k$, and $H_k$ is the Hessian of $f$ at $x_k$.
"""
# ╔═╡ da2d74e5-d411-499b-b4c3-cb6dfefb8c8d
function newton_optimizer(f, x; tol=1e-5)
    k = 0
    history = [x]
    while k < 1000
        k += 1
        gk = ForwardDiff.gradient(f, x)
        hk = ForwardDiff.hessian(f, x)
        dx = -hk \ gk
        x += dx
        push!(history, x)
        sum(abs2, dx) < tol && break
    end
    return history
end
# ╔═╡ a4832798-6e20-4630-889d-388fe55272f7
let
    x0 = [-1, -1.0]
    history = newton_optimizer(rosenbrock, x0; tol=1e-5)
    @info "number of iterations = $(length(history)), got $(rosenbrock(history[end]))"
    # plot
    show_history(history)
end
# ╔═╡ 89fb89d4-227f-41ba-8e42-68a9c436b930
md"The drawback of Newton's method is, the Hessian is very expensive to compute!
While gradients can be computed with the automatic differentiation method with constant overhead. The Hessian requires $O(n)$ times more resources, where $n$ is the number of parameters."
# ╔═╡ 4ade7201-ae81-41ca-affb-fffcb99776fe
md"## The Broyden–Fletcher–Goldfarb–Shanno (BFGS) algorithm"
# ╔═╡ 3e406fc4-8bb1-4b11-ae9a-a33604061a8a
md"""
The BFGS method is a popular numerical optimization algorithm used to solve unconstrained optimization problems. It is an iterative method that seeks to find the minimum of a function by iteratively updating an estimate of the inverse Hessian matrix of the function.
The BFGS method belongs to a class of $(PlutoLecturing.highlight("quasi-Newton methods")), which means that it approximates the Hessian matrix of the function using only first-order derivative information. The BFGS method updates the inverse Hessian matrix at each iteration using information from the current and previous iterations. This allows it to converge quickly to the minimum of the function.
The BFGS method is widely used in many areas of science and engineering, including machine learning, finance, and physics. It is particularly well-suited to problems where the Hessian matrix is too large to compute directly, as it only requires first-order derivative information.
```math
\begin{align}
& B_{k}p_{k}=-g_k~~~~~~~~~~\text{// Newton-method-like update rule}\\
& \alpha_k = \mathop{\rm argmin}_\alpha ~f(x_k + \alpha p_k)~~~~~~~~~~\text{// using line search}\\
& s_k=\alpha_{k}p_k\\
& x_{k+1}=x_{k}+s_k\\
&y_k=g_{k+1}-g_k\\
&B_{k+1}=B_{k}+{\frac {y_{k}y_{k}^{\mathrm {T} }}{y_{k}^{\mathrm {T} }s_{k}}}-{\frac {B_{k}s_{k}s_{k}^{\mathrm {T} }B_{k}^{\mathrm {T} }}{s_{k}^{\mathrm {T} }B_{k}s_{k}}}
\end{align}
```
where
* $B_k$ is an approximation of the Hessian matrix, which is initialized to the identity.
* $g_k$ is the gradient at $x_k$.
We can check that the secant equation $B_{k+1}s_k = y_k$ is satisfied: using the symmetry of $B_k$, $B_{k+1}s_k = B_ks_k + y_k\frac{y_k^{T}s_k}{y_k^{T}s_k} - B_ks_k\frac{s_k^{T}B_ks_k}{s_k^{T}B_ks_k} = y_k$.
"""
# ╔═╡ d0166fdc-333b-495b-965e-1c77711ba469
let
    # Set the initial guess
    x0 = [-1.0, -1.0]
    # Set the optimization options
    options = Optim.Options(iterations = 1000, store_trace=true, extended_trace=true)
    # Optimize the Rosenbrock function using the BFGS method
    result = optimize(rosenbrock, x->ForwardDiff.gradient(rosenbrock, x), x0, BFGS(), options, inplace=false)
    # Print the optimization result
    @info result
    show_history([t.metadata["x"] for t in result.trace])
end
# ╔═╡ ab2ad24c-a019-4608-9344-23eb1025e108
md"""
# Mathematical optimization
"""
# ╔═╡ e6dca5dc-dbf9-45ba-970d-23bd9a75769d
md"## Convex optimization"
# ╔═╡ 5b433d39-ed60-4831-a139-b6663a284e7a
md"A set $S\subseteq \mathbb{R}^n$ is convex if it contains the line segment between any two of its points, i.e.,
```math
\{\alpha \mathbf{x} + (1-\alpha)\mathbf{y}: 0\leq \alpha \leq 1\} \subseteq S
```
for all $\mathbf{x}, \mathbf{y} \in S$.
"
# ╔═╡ 1bf01867-adbe-4d00-a189-c0f024e65475
let
    @drawsvg begin
        function segment(a, b)
            line(a, b, :stroke)
            circle(a, 5, :fill)
            circle(b, 5, :fill)
        end
        fontsize(20)
        c1 = Point(-180, -20)
        sethue("red")
        ellipse(c1, 130, 80, :fill)
        sethue("black")
        ellipse(c1, 130, 80, :stroke)
        segment(c1 - Point(50, 25), c1 + Point(50, -25))
        Luxor.text("convex set", c1 + Point(0, 90), halign=:center)
        c2 = Point(0, -20)
        f(t) = Point(4cos(t) + 2cos(3t), 4sin(t) + 3sin(3t+π/2))
        sethue("red")
        poly(10 .* f.(range(0, 2π, length=160)) .+ c2, action = :fill)
        sethue("black")
        poly(10 .* f.(range(0, 2π, length=160)) .+ c2, action = :stroke)
        a, b = Point(c2.x-30, c2.y+30), Point(c2.x+45, c2.y)
        segment(a, b)
        Luxor.text("nonconvex set", c2 + Point(0, 90), halign=:center)
    end 520 200
end
# ╔═╡ 2afe850b-cfd3-41de-b3e1-fafa44e9bbe2
md"""
A function $f: S \subseteq \mathbb{R}^n \rightarrow \mathbb{R}$ is convex on a convex set $S$ if its graph along any line segment in $S$ lies on or below the chord connecting the function values at the endpoints of the segment, i.e., if
```math
f(\alpha \mathbf{x} + (1-\alpha) \mathbf{y}) \leq \alpha f(\mathbf{x}) + (1-\alpha)f(\mathbf{y})
```
for all $\alpha \in [0, 1]$ and all $\mathbf{x}, \mathbf{y}\in S$.
"""
# ╔═╡ 8a93097f-00c1-45b0-92a1-092aeccc58a5
let
    @drawsvg begin
        function segment(a, b)
            line(a, b, :stroke)
            circle(a, 5, :fill)
            circle(b, 5, :fill)
        end
        fontsize(20)
        c1 = Point(-180, -20)
        xs = -1.6:0.01:1.6
        ys = 0.8 * (2.0xs .^ 2 .- xs .^ 4 .- 0.2*xs .+ 1)
        Luxor.poly(30 .* Point.(xs, ys) .+ Ref(c1), :stroke)
        Luxor.text("nonconvex", c1 + Point(0, 90), halign=:center)
        segment(c1+Point(-17, 40), c1+Point(18, 35))
        c2 = Point(0, -20)
        xs = [-1.8, -0.9, 0.0, 0.7, 1.8]
        ys = [-0.7, 1.3, 1.7, 1.2, -0.7]
        Luxor.poly(30 .* Point.(xs, ys) .+ Ref(c2), :stroke)
        Luxor.text("convex", c2 + Point(0, 90), halign=:center)
        segment(c2+Point(-17, 43), c2+Point(25, 30))
        fontsize(20)
        c3 = Point(180, -20)
        xs = -1.4:0.01:1.3
        ys = 0.8 * (- xs .^ 4 .- 0.2*xs .+ 2.2)
        Luxor.poly(30 .* Point.(xs, ys) .+ Ref(c3), :stroke)
        Luxor.text("strictly convex", c3 + Point(0, 90), halign=:center)
        segment(c3+Point(-27, 40), c3+Point(25, 35))
    end 520 200
end
# ╔═╡ ce27c364-9ae3-435a-8153-a1c898ae8984
md"Any local minimum of a convex function $f$ on a convex set $S\subseteq \mathbb{R}^n$ is a global minimum of $f$ on $S$."
# ╔═╡ afbc321c-c568-4b88-8dc1-9d87a4a0e7af
md"## Linear programming"
# ╔═╡ 7db726fc-b537-483a-8898-dd24b4f3aca2
md"""
Linear programs are problems that can be expressed in canonical form as
```math
{\begin{aligned}&{\text{Find a vector}}&&\mathbf {x} \\&{\text{that maximizes}}&&\mathbf {c} ^{T}\mathbf {x} \\&{\text{subject to}}&&A\mathbf {x} \leq \mathbf {b} \\&{\text{and}}&&\mathbf {x} \geq \mathbf {0} .\end{aligned}}
```
Here the components of $\mathbf x$ are the variables to be determined, $\mathbf c$ and $\mathbf b$ are given vectors (with $\mathbf {c} ^{T}$ indicating that the coefficients of $\mathbf c$ are used as a single-row matrix for the purpose of forming the matrix product), and $A$ is a given matrix.
"""
# ╔═╡ c76981ce-f1e5-4681-ad13-6188a962d962
md"""## Example
[https://jump.dev/JuMP.jl/stable/tutorials/linear/diet/](https://jump.dev/JuMP.jl/stable/tutorials/linear/diet/)
"""
# ╔═╡ 93b69c65-8df3-4782-8b39-20fb3879cbcc
md"""
[JuMP.jl documentation](https://jump.dev/JuMP.jl/stable/) also contains mathematical models such as **semidefinite programming** and **integer programming**.
"""
# ╔═╡ 2e17a5f6-a942-4688-bc23-25b1715d9886
md"""
# Assignments
1. Show the following graph $G=(V, E)$ has a unit-disk embedding.
```
V = 1, 2, ..., 10
E = [(1, 2), (1, 3),
(2, 3), (2, 4), (2, 5), (2, 6),
(3, 5), (3, 6), (3, 7),
(4, 5), (4, 8),
(5, 6), (5, 8), (5, 9),
(6, 7), (6, 8), (6, 9),
(7,9), (8, 9), (8, 10), (9, 10)]
```
So what is a unit-disk embedding of a graph? Ask ChatGPT the following question
```
What is a unit-disk embedding of a graph?
```
### Hint:
To solve this issue, you can utilize an optimizer. Here's how:
1. Begin by assigning each vertex a coordinate. You can represent the locations of all vertices as a $2 \times n$ matrix, denoted as $x$, where each column is the coordinate of a vertex in two-dimensional space.
2. Construct a loss function, denoted as $f(x)$, that returns a positive value as a penalty if any connected vertex pair $(v, w)$ has a distance ${\rm dist}(x_v, x_w) > 1$ (the unit distance), or if any disconnected vertex pair has a distance smaller than $1$. If all unit-disk constraints are satisfied, the function returns $0$.
3. Use an optimizer to minimize the loss function $f(x)$. If the loss can be reduced to $0$, then the corresponding $x$ represents a unit-disk embedding. If not, you may need to try multiple times to ensure that your optimizer does not trap you in a local minimum. A sketch of such a loss function is given after this cell.
"""
# ╔═╡ 47747e2d-52c7-47a5-aa06-1ce25f7dd7fe
md"## Golden section search";
# ╔═╡ da2f6865-ff3c-46bd-80d4-f171b0e30ce9
function golden_section_search(f, a, b; tol=1e-5)
    τ = (√5 - 1) / 2
    x1 = a + (1 - τ) * (b - a)
    x2 = a + τ * (b - a)
    f1, f2 = f(x1), f(x2)
    k = 0
    while b - a > tol
        k += 1
        if f1 > f2
            a = x1
            x1 = x2
            f1 = f2
            x2 = a + τ * (b - a)
            f2 = f(x2)
        else
            b = x2
            x2 = x1
            f2 = f1
            x1 = a + (1 - τ) * (b - a)
            f1 = f(x1)
        end
    end
    #@info "number of iterations = $k"
    return f1 < f2 ? (x1, f1) : (x2, f2)
end;
# ╔═╡ c75753d4-ceec-402e-a4bc-8b00b4934dbf
golden_section_search(x->(x-4)^2, -5, 5; tol=1e-5);
# ╔═╡ 00000000-0000-0000-0000-000000000001
PLUTO_PROJECT_TOML_CONTENTS = """
[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Luxor = "ae8d54c2-7ccd-5906-9d76-62fc9837b5bc"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
[compat]
AbstractTrees = "~0.4.4"
ForwardDiff = "~0.10.35"
Luxor = "~3.7.0"
Optim = "~1.7.4"
Optimisers = "~0.2.15"
Plots = "~1.38.8"
PlutoUI = "~0.7.50"
Reexport = "~1.2.2"
"""
# ╔═╡ 00000000-0000-0000-0000-000000000002
PLUTO_MANIFEST_TOML_CONTENTS = """
# This file is machine-generated - editing it directly is not advised
julia_version = "1.8.5"
manifest_format = "2.0"
project_hash = "7e4e83290afdd2498d991561fdef1b2e36984cec"
[[deps.AbstractPlutoDingetjes]]
deps = ["Pkg"]
git-tree-sha1 = "8eaf9f1b4921132a4cff3f36a1d9ba923b14a481"
uuid = "6e696c72-6542-2067-7265-42206c756150"
version = "1.1.4"
[[deps.AbstractTrees]]
git-tree-sha1 = "faa260e4cb5aba097a73fab382dd4b5819d8ec8c"
uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
version = "0.4.4"
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "cc37d689f599e8df4f464b2fa3870ff7db7492ef"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.6.1"
[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"
[[deps.ArrayInterface]]
deps = ["Adapt", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "SuiteSparse"]
git-tree-sha1 = "a89acc90c551067cd84119ff018619a1a76c6277"
uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
version = "7.2.1"
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[deps.BitFlags]]
git-tree-sha1 = "43b1a4a8f797c1cddadf60499a8a077d4af2cd2d"
uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35"
version = "0.1.7"
[[deps.Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[deps.Cairo]]
deps = ["Cairo_jll", "Colors", "Glib_jll", "Graphics", "Libdl", "Pango_jll"]
git-tree-sha1 = "d0b3f8b4ad16cb0a2988c6788646a5e6a17b6b1b"
uuid = "159f3aea-2a34-519c-b102-8c37f9878175"
version = "1.0.5"
[[deps.Cairo_jll]]
deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"]
git-tree-sha1 = "4b859a208b2397a7a623a03449e4636bdb17bcf2"
uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a"
version = "1.16.1+1"
[[deps.ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "c6d890a52d2c4d55d326439580c3b8d0875a77d9"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.15.7"
[[deps.ChangesOfVariables]]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
git-tree-sha1 = "485193efd2176b88e6622a39a246f8c5b600e74e"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.6"
[[deps.CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"]
git-tree-sha1 = "9c209fb7536406834aa938fb149964b985de6c83"
uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
version = "0.7.1"
[[deps.ColorSchemes]]
deps = ["ColorTypes", "ColorVectorSpace", "Colors", "FixedPointNumbers", "Random", "SnoopPrecompile"]
git-tree-sha1 = "aa3edc8f8dea6cbfa176ee12f7c2fc82f0608ed3"
uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
version = "3.20.0"
[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.4"
[[deps.ColorVectorSpace]]
deps = ["ColorTypes", "FixedPointNumbers", "LinearAlgebra", "SpecialFunctions", "Statistics", "TensorCore"]
git-tree-sha1 = "600cc5508d66b78aae350f7accdb58763ac18589"
uuid = "c3611d14-8923-5661-9e6a-0046d554d3a4"
version = "0.9.10"
[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.10"
[[deps.CommonSubexpressions]]
deps = ["MacroTools", "Test"]
git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7"
uuid = "bbf7d656-a473-5ed7-a52c-81e309532950"
version = "0.3.0"
[[deps.Compat]]
deps = ["Dates", "LinearAlgebra", "UUIDs"]
git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "4.6.1"
[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.1+0"
[[deps.ConstructionBase]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "89a9db8d28102b094992472d333674bd1a83ce2a"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.5.1"
[[deps.Contour]]
git-tree-sha1 = "d05d9e7b7aedff4e5b51a029dced05cfb6125781"
uuid = "d38c429a-6771-53c6-b99e-75d170b6e991"
version = "0.6.2"
[[deps.DataAPI]]
git-tree-sha1 = "e8119c1a33d267e16108be441a287a6981ba1630"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.14.0"
[[deps.DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "d1fff3a548102f48987a52a2e0d114fa97d730f0"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.13"
[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[deps.DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[deps.DiffResults]]
deps = ["StaticArraysCore"]
git-tree-sha1 = "782dd5f4561f5d267313f23853baaaa4c52ea621"
uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"