From 67105aecf114785efad7ec00a0ed1cf2c49e0edc Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Thu, 24 Dec 2020 07:29:40 -0600
Subject: [PATCH] Add some precompiles

Together with a couple of changes to LoopVectorization,
this shaves about one second off the initial `mygemmavx!` demo.

There may be more methods that could be added, but this is a start.
Overall, VectorizationBase is the only substantive source of
inference time in that demo.
---
 src/precompile.jl | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/precompile.jl b/src/precompile.jl
index 22c11ca0..30cb2d74 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -1,3 +1,22 @@
 function _precompile_()
     ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
+    for T in (Bool, Int, Float32, Float64)  # element types to cover
+        for A in (Vector, Matrix)  # array types, parameterized by T below
+            precompile(stridedpointer, (A{T},))  # signature: stridedpointer(::A{T})
+        end
+    end
+    function precompile_nt(@nospecialize(T))  # helper: request the vload_quote precompiles for one type T
+        for I ∈ (Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64)  # pair T with each of these integer types
+            precompile(vload_quote, (Type{T}, Type{I}, Symbol, Int, Int, Int, Int, Bool, Bool))
+        end
+        # precompile(vfmadd, (Vec{4, T}, Vec{4, T}, Vec{4, T}))  # doesn't "take" (too bad, this is expensive)
+    end
+    U = NativeTypes  # start from NativeTypes; the loop below unwraps it while it is a Union
+    while isa(U, Union)  # a Union exposes one member as `.a` and the rest as `.b`
+        T, U = U.a, U.b  # T: member handled this iteration, U: what remains
+        precompile_nt(T)
+    end
+    precompile_nt(U)  # U is no longer a Union here; handle this last type as well
+    precompile(_pick_vector_width, (Type, Vararg{Type,100}))  # NOTE(review): Vararg{Type,100} looks like a stand-in for "many Type arguments" -- confirm intent
+    precompile(>=, (Int, MM{4, 1, Int}))  # signature: >=(::Int, ::MM{4, 1, Int})
 end