From e0ba546e87ce92912dd9b4860a5a4fb864cda045 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@mit.edu>
Date: Fri, 14 Feb 2014 17:04:08 -0500
Subject: [PATCH] new DFT api

---
 NEWS.md                  |   6 +
 base/complex.jl          |   3 +
 base/deprecated.jl       |  14 +-
 base/dft.jl              | 195 ++++++++++
 base/dsp.jl              | 135 +------
 base/exports.jl          |   1 +
 base/fft/FFTW.jl         | 781 ++++++++++++++++++++++++++++++++++++++
 base/fft/dct.jl          | 103 +++++
 base/fftw.jl             | 793 ---------------------------------------
 base/pointer.jl          |   9 +
 base/sysimg.jl           |  15 +-
 base/unicode/utf8proc.jl |   5 +-
 doc/stdlib/math.rst      |  36 +-
 test/dsp.jl              |  30 +-
 test/fft.jl              | 405 ++++++++++++--------
 test/random.jl           |  12 +-
 16 files changed, 1416 insertions(+), 1127 deletions(-)
 create mode 100644 base/dft.jl
 create mode 100644 base/fft/FFTW.jl
 create mode 100644 base/fft/dct.jl
 delete mode 100644 base/fftw.jl

diff --git a/NEWS.md b/NEWS.md
index af23f70a0d3e2..8522712624faa 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -193,6 +193,11 @@ Library improvements
 
     * New `vecdot` function, analogous to `vecnorm`, for Euclidean inner products over any iterable container ([#11067]).
 
+  * `p = plan_fft(x)` and similar functions now return a `Base.DFT.Plan` object, rather
+    than an anonymous function.  Calling it via `p(x)` is deprecated in favor of
+    `p * x` or `p \ x` (for the inverse), and it can also be used with `A_mul_B!`
+    to employ pre-allocated output arrays ([#12087]).
+
   * Strings
 
     * NUL-terminated strings should now be passed to C via the new `Cstring` type, not `Ptr{UInt8}` or `Ptr{Cchar}`,
@@ -1519,3 +1524,4 @@ Too numerous to mention.
 [#11922]: https://github.com/JuliaLang/julia/issues/11922
 [#11985]: https://github.com/JuliaLang/julia/issues/11985
 [#12031]: https://github.com/JuliaLang/julia/issues/12031
+[#12087]: https://github.com/JuliaLang/julia/issues/12087
diff --git a/base/complex.jl b/base/complex.jl
index 127b51ddde620..339a40d7849d0 100644
--- a/base/complex.jl
+++ b/base/complex.jl
@@ -34,6 +34,9 @@ real(x::Real) = x
 imag(x::Real) = zero(x)
 reim(z) = (real(z), imag(z))
 
+real{T<:Real}(::Type{T}) = T
+real{T<:Real}(::Type{Complex{T}}) = T
+
 isreal(x::Real) = true
 isreal(z::Complex) = imag(z) == 0
 isimag(z::Number) = real(z) == 0
diff --git a/base/deprecated.jl b/base/deprecated.jl
index 90f4f92f0edf6..dad7aac7da3f9 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -474,13 +474,11 @@ export float32_isvalid, float64_isvalid
 @deprecate ($)(x::Char, y::Char)  Char(UInt32(x) $ UInt32(y))
 
 # 11241
-
 @deprecate is_valid_char(ch::Char)          isvalid(ch)
 @deprecate is_valid_ascii(str::ASCIIString) isvalid(str)
 @deprecate is_valid_utf8(str::UTF8String)   isvalid(str)
 @deprecate is_valid_utf16(str::UTF16String) isvalid(str)
 @deprecate is_valid_utf32(str::UTF32String) isvalid(str)
-
 @deprecate is_valid_char(ch)   isvalid(Char, ch)
 @deprecate is_valid_ascii(str) isvalid(ASCIIString, str)
 @deprecate is_valid_utf8(str)  isvalid(UTF8String, str)
@@ -488,9 +486,19 @@ export float32_isvalid, float64_isvalid
 @deprecate is_valid_utf32(str) isvalid(UTF32String, str)
 
 # 11379
-
 @deprecate utf32(c::Integer...)   UTF32String(UInt32[c...,0])
 
+# 12087
+@deprecate call(P::Base.DFT.Plan, A) P * A
+for f in (:plan_fft, :plan_ifft, :plan_bfft, :plan_fft!, :plan_ifft!, :plan_bfft!, :plan_rfft)
+    @eval @deprecate $f(A, dims, flags) $f(A, dims; flags=flags)
+    @eval @deprecate $f(A, dims, flags, tlim) $f(A, dims; flags=flags, timelimit=tlim)
+end
+for f in (:plan_brfft, :plan_irfft)
+    @eval @deprecate $f(A, d, dims, flags) $f(A, d, dims; flags=flags)
+    @eval @deprecate $f(A, d, dims, flags, tlim) $f(A, d, dims; flags=flags, timelimit=tlim)
+end
+
 # 10862
 
 function chol(A::AbstractMatrix, uplo::Symbol)
diff --git a/base/dft.jl b/base/dft.jl
new file mode 100644
index 0000000000000..5db80e1050b1b
--- /dev/null
+++ b/base/dft.jl
@@ -0,0 +1,195 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+module DFT
+
+# DFT plan where the inputs are an array of eltype T
+abstract Plan{T}
+
+import Base: show, summary, size, ndims, length, eltype,
+             *, A_mul_B!, inv, \, A_ldiv_B!
+
+eltype{T}(::Plan{T}) = T
+eltype{P<:Plan}(T::Type{P}) = T.parameters[1]
+
+# size(p) should return the size of the input array for p
+size(p::Plan, d) = size(p)[d]
+ndims(p::Plan) = length(size(p))
+length(p::Plan) = prod(size(p))::Int
+
+##############################################################################
+export fft, ifft, bfft, fft!, ifft!, bfft!,
+       plan_fft, plan_ifft, plan_bfft, plan_fft!, plan_ifft!, plan_bfft!,
+       rfft, irfft, brfft, plan_rfft, plan_irfft, plan_brfft
+
+complexfloat{T<:FloatingPoint}(x::AbstractArray{Complex{T}}) = x
+
+# return an Array, rather than similar(x), to avoid an extra copy for FFTW
+# (which only works on StridedArray types).
+complexfloat{T<:Complex}(x::AbstractArray{T}) = copy!(Array(typeof(float(one(T))), size(x)), x)
+complexfloat{T<:FloatingPoint}(x::AbstractArray{T}) = copy!(Array(typeof(complex(one(T))), size(x)), x)
+complexfloat{T<:Real}(x::AbstractArray{T}) = copy!(Array(typeof(complex(float(one(T)))), size(x)), x)
+
+# implementations only need to provide plan_X(x, region)
+# for X in (:fft, :bfft, ...):
+for f in (:fft, :bfft, :ifft, :fft!, :bfft!, :ifft!, :rfft)
+    pf = symbol(string("plan_", f))
+    @eval begin
+        $f(x::AbstractArray) = $pf(x) * x
+        $f(x::AbstractArray, region) = $pf(x, region) * x
+        $pf(x::AbstractArray; kws...) = $pf(x, 1:ndims(x); kws...)
+    end
+end
+
+# promote to a complex floating-point type (out-of-place only),
+# so implementations only need Complex{Float} methods
+for f in (:fft, :bfft, :ifft)
+    pf = symbol(string("plan_", f))
+    @eval begin
+        $f{T<:Real}(x::AbstractArray{T}, region=1:ndims(x)) = $f(complexfloat(x), region)
+        $pf{T<:Real}(x::AbstractArray{T}, region; kws...) = $pf(complexfloat(x), region; kws...)
+        $f{T<:Union(Integer,Rational)}(x::AbstractArray{Complex{T}}, region=1:ndims(x)) = $f(complexfloat(x), region)
+        $pf{T<:Union(Integer,Rational)}(x::AbstractArray{Complex{T}}, region; kws...) = $pf(complexfloat(x), region; kws...)
+    end
+end
+rfft{T<:Union(Integer,Rational)}(x::AbstractArray{T}, region=1:ndims(x)) = rfft(float(x), region)
+plan_rfft{T<:Union(Integer,Rational)}(x::AbstractArray{T}, region; kws...) = plan_rfft(float(x), region; kws...)
+
+# only require implementation to provide *(::Plan{T}, ::Array{T})
+*{T}(p::Plan{T}, x::AbstractArray) = p * copy!(Array(T, size(x)), x)
+
+# Implementations should also implement A_mul_B!(Y, plan, X) so as to support
+# pre-allocated output arrays.  We don't define * in terms of A_mul_B!
+# generically here, however, because of subtleties for in-place and rfft plans.
+
+##############################################################################
+# To support inv, \, and A_ldiv_B!(y, p, x), we require Plan subtypes
+# to have a pinv::Plan field, which caches the inverse plan, and which
+# should be initially undefined.  They should also implement
+# plan_inv(p) to construct the inverse of a plan p.
+
+# hack from @simonster (in #6193) to compute the return type of plan_inv
+# without actually calling it or even constructing the empty arrays.
+_pinv_type(p::Plan) = typeof([plan_inv(x) for x in typeof(p)[]])
+pinv_type(p::Plan) = eltype(_pinv_type(p))
+
+inv(p::Plan) =
+    isdefined(p, :pinv) ? p.pinv::pinv_type(p) : (p.pinv = plan_inv(p))
+\(p::Plan, x::AbstractArray) = inv(p) * x
+A_ldiv_B!(y::AbstractArray, p::Plan, x::AbstractArray) = A_mul_B!(y, inv(p), x)
+
+##############################################################################
+# implementations only need to provide the unnormalized backwards FFT,
+# similar to FFTW, and we do the scaling generically to get the ifft:
+
+type ScaledPlan{T,P,N} <: Plan{T}
+    p::P
+    scale::N # not T, to avoid unnecessary promotion to Complex
+    pinv::Plan
+    ScaledPlan(p, scale) = new(p, scale)
+end
+ScaledPlan{P<:Plan,N<:Number}(p::P, scale::N) = ScaledPlan{eltype(P),P,N}(p, scale)
+ScaledPlan(p::ScaledPlan, α::Number) = ScaledPlan(p.p, p.scale * α)
+
+size(p::ScaledPlan) = size(p.p)
+
+show(io::IO, p::ScaledPlan) = print(io, p.scale, " * ", p.p)
+summary(p::ScaledPlan) = string(p.scale, " * ", summary(p.p))
+
+*(p::ScaledPlan, x::AbstractArray) = scale!(p.p * x, p.scale)
+
+*(α::Number, p::Plan) = ScaledPlan(p, α)
+*(p::Plan, α::Number) = ScaledPlan(p, α)
+*(I::UniformScaling, p::ScaledPlan) = ScaledPlan(p, I.λ)
+*(p::ScaledPlan, I::UniformScaling) = ScaledPlan(p, I.λ)
+*(I::UniformScaling, p::Plan) = ScaledPlan(p, I.λ)
+*(p::Plan, I::UniformScaling) = ScaledPlan(p, I.λ)
+
+# Normalization for ifft, given unscaled bfft, is 1/prod(dimensions)
+normalization(T, sz, region) = one(T) / prod([sz...][[region...]])
+normalization(X, region) = normalization(real(eltype(X)), size(X), region)
+
+plan_ifft(x::AbstractArray, region; kws...) =
+    ScaledPlan(plan_bfft(x, region; kws...), normalization(x, region))
+plan_ifft!(x::AbstractArray, region; kws...) =
+    ScaledPlan(plan_bfft!(x, region; kws...), normalization(x, region))
+
+plan_inv(p::ScaledPlan) = ScaledPlan(plan_inv(p.p), inv(p.scale))
+
+A_mul_B!(y::AbstractArray, p::ScaledPlan, x::AbstractArray) =
+    scale!(p.scale, A_mul_B!(y, p.p, x))
+
+##############################################################################
+# Real-input DFTs are annoying because the output has a different size
+# than the input if we want to gain the full factor-of-two(ish) savings
+# For backward real-data transforms, we must specify the original length
+# of the first dimension, since there is no reliable way to detect this
+# from the data (we can't detect whether the dimension was originally even
+# or odd).
+
+for f in (:brfft, :irfft)
+    pf = symbol(string("plan_", f))
+    @eval begin
+        $f(x::AbstractArray, d::Integer) = $pf(x, d) * x
+        $f(x::AbstractArray, d::Integer, region) = $pf(x, d, region) * x
+        $pf(x::AbstractArray, d::Integer;kws...) = $pf(x, d, 1:ndims(x);kws...)
+    end
+end
+
+for f in (:brfft, :irfft)
+    @eval begin
+        $f{T<:Real}(x::AbstractArray{T}, d::Integer, region=1:ndims(x)) = $f(complexfloat(x), d, region)
+        $f{T<:Union(Integer,Rational)}(x::AbstractArray{Complex{T}}, d::Integer, region=1:ndims(x)) = $f(complexfloat(x), d, region)
+    end
+end
+
+function rfft_output_size(x::AbstractArray, region)
+    d1 = first(region)
+    osize = [size(x)...]
+    osize[d1] = osize[d1]>>1 + 1
+    return osize
+end
+
+function brfft_output_size(x::AbstractArray, d::Integer, region)
+    d1 = first(region)
+    osize = [size(x)...]
+    @assert osize[d1] == d>>1 + 1
+    osize[d1] = d
+    return osize
+end
+
+plan_irfft{T}(x::AbstractArray{Complex{T}}, d::Integer, region; kws...) =
+    ScaledPlan(plan_brfft(x, d, region; kws...),
+               normalization(T, brfft_output_size(x, d, region), region))
+
+##############################################################################
+
+export fftshift, ifftshift
+
+fftshift(x) = circshift(x, div([size(x)...],2))
+
+function fftshift(x,dim)
+    s = zeros(Int,ndims(x))
+    s[dim] = div(size(x,dim),2)
+    circshift(x, s)
+end
+
+ifftshift(x) = circshift(x, div([size(x)...],-2))
+
+function ifftshift(x,dim)
+    s = zeros(Int,ndims(x))
+    s[dim] = -div(size(x,dim),2)
+    circshift(x, s)
+end
+
+##############################################################################
+
+# FFTW module (may move to an external package at some point):
+if Base.USE_GPL_LIBS
+    include("fft/FFTW.jl")
+    importall .FFTW
+    export FFTW, dct, idct, dct!, idct!, plan_dct, plan_idct, plan_dct!, plan_idct!
+end
+
+##############################################################################
+
+end
diff --git a/base/dsp.jl b/base/dsp.jl
index 0f6e32013a30a..fdcb2aa1d6b3f 100644
--- a/base/dsp.jl
+++ b/base/dsp.jl
@@ -2,16 +2,9 @@
 
 module DSP
 
-importall Base.FFTW
-import Base.FFTW.normalization
 import Base.trailingsize
 
-export FFTW, filt, filt!, deconv, conv, conv2, xcorr, fftshift, ifftshift,
-       dct, idct, dct!, idct!, plan_dct, plan_idct, plan_dct!, plan_idct!,
-       # the rest are defined imported from FFTW:
-       fft, bfft, ifft, rfft, brfft, irfft,
-       plan_fft, plan_bfft, plan_ifft, plan_rfft, plan_brfft, plan_irfft,
-       fft!, bfft!, ifft!, plan_fft!, plan_bfft!, plan_ifft!
+export filt, filt!, deconv, conv, conv2, xcorr
 
 _zerosi(b,a,T) = zeros(promote_type(eltype(b), eltype(a), T), max(length(a), length(b))-1)
 
@@ -117,10 +110,10 @@ function conv{T<:Base.LinAlg.BlasFloat}(u::StridedVector{T}, v::StridedVector{T}
     vpad = [v; zeros(T, np2 - nv)]
     if T <: Real
         p = plan_rfft(upad)
-        y = irfft(p(upad).*p(vpad), np2)
+        y = irfft((p*upad).*(p*vpad), np2)
     else
         p = plan_fft!(upad)
-        y = ifft!(p(upad).*p(vpad))
+        y = ifft!((p*upad).*(p*vpad))
     end
     return y[1:n]
 end
@@ -149,7 +142,7 @@ function conv2{T}(A::StridedMatrix{T}, B::StridedMatrix{T})
     At[1:sa[1], 1:sa[2]] = A
     Bt[1:sb[1], 1:sb[2]] = B
     p = plan_fft(At)
-    C = ifft(p(At).*p(Bt))
+    C = ifft((p*At).*(p*Bt))
     if T <: Real
         return real(C)
     end
@@ -168,124 +161,4 @@ function xcorr(u, v)
     flipdim(conv(flipdim(u, 1), v), 1)
 end
 
-fftshift(x) = circshift(x, div([size(x)...],2))
-
-function fftshift(x,dim)
-    s = zeros(Int,ndims(x))
-    s[dim] = div(size(x,dim),2)
-    circshift(x, s)
-end
-
-ifftshift(x) = circshift(x, div([size(x)...],-2))
-
-function ifftshift(x,dim)
-    s = zeros(Int,ndims(x))
-    s[dim] = -div(size(x,dim),2)
-    circshift(x, s)
-end
-
-# Discrete cosine and sine transforms via FFTW's r2r transforms;
-# we follow the Matlab convention and adopt a unitary normalization here.
-# Unlike Matlab we compute the multidimensional transform by default,
-# similar to the Julia fft functions.
-
-fftwcopy{T<:fftwNumber}(X::StridedArray{T}) = copy(X)
-fftwcopy{T<:Real}(X::StridedArray{T}) = float(X)
-fftwcopy{T<:Complex}(X::StridedArray{T}) = map(Complex128,X)
-
-for (f, fr2r, Y, Tx) in ((:dct, :r2r, :Y, :Number),
-                         (:dct!, :r2r!, :X, :fftwNumber))
-    plan_f = symbol("plan_",f)
-    plan_fr2r = symbol("plan_",fr2r)
-    fi = symbol("i",f)
-    plan_fi = symbol("plan_",fi)
-    Ycopy = Y == :X ? 0 : :(Y = fftwcopy(X))
-    @eval begin
-        function $f{T<:$Tx}(X::StridedArray{T}, region)
-            $Y = $fr2r(X, REDFT10, region)
-            scale!($Y, sqrt(0.5^length(region) * normalization(X,region)))
-            sqrthalf = sqrt(0.5)
-            r = map(n -> 1:n, [size(X)...])
-            for d in region
-                r[d] = 1:1
-                $Y[r...] *= sqrthalf
-                r[d] = 1:size(X,d)
-            end
-            return $Y
-        end
-
-        function $plan_f{T<:$Tx}(X::StridedArray{T}, region,
-                                 flags::Unsigned, timelimit::Real)
-            p = $plan_fr2r(X, REDFT10, region, flags, timelimit)
-            sqrthalf = sqrt(0.5)
-            r = map(n -> 1:n, [size(X)...])
-            nrm = sqrt(0.5^length(region) * normalization(X,region))
-            return X::StridedArray{T} -> begin
-                $Y = p(X)
-                scale!($Y, nrm)
-                for d in region
-                    r[d] = 1:1
-                    $Y[r...] *= sqrthalf
-                    r[d] = 1:size(X,d)
-                end
-                return $Y
-            end
-        end
-
-        function $fi{T<:$Tx}(X::StridedArray{T}, region)
-            $Ycopy
-            scale!($Y, sqrt(0.5^length(region) * normalization(X, region)))
-            sqrt2 = sqrt(2)
-            r = map(n -> 1:n, [size(X)...])
-            for d in region
-                r[d] = 1:1
-                $Y[r...] *= sqrt2
-                r[d] = 1:size(X,d)
-            end
-            return r2r!($Y, REDFT01, region)
-        end
-
-        function $plan_fi{T<:$Tx}(X::StridedArray{T}, region,
-                                 flags::Unsigned, timelimit::Real)
-            p = $plan_fr2r(X, REDFT01, region, flags, timelimit)
-            sqrt2 = sqrt(2)
-            r = map(n -> 1:n, [size(X)...])
-            nrm = sqrt(0.5^length(region) * normalization(X,region))
-            return X::StridedArray{T} -> begin
-                $Ycopy
-                scale!($Y, nrm)
-                for d in region
-                    r[d] = 1:1
-                    $Y[r...] *= sqrt2
-                    r[d] = 1:size(X,d)
-                end
-                return p($Y)
-            end
-        end
-
-    end
-    for (g,plan_g) in ((f,plan_f), (fi, plan_fi))
-        @eval begin
-            $g{T<:$Tx}(X::StridedArray{T}) = $g(X, 1:ndims(X))
-
-            $plan_g(X, region, flags::Unsigned) =
-              $plan_g(X, region, flags, NO_TIMELIMIT)
-            $plan_g(X, region) =
-              $plan_g(X, region, ESTIMATE, NO_TIMELIMIT)
-            $plan_g{T<:$Tx}(X::StridedArray{T}) =
-              $plan_g(X, 1:ndims(X), ESTIMATE, NO_TIMELIMIT)
-        end
-    end
-end
-
-# DCT of scalar is just the identity:
-dct(x::Number, dims) = length(dims) == 0 || dims[1] == 1 ? x : throw(BoundsError())
-idct(x::Number, dims) = dct(x, dims)
-dct(x::Number) = x
-idct(x::Number) = x
-plan_dct(x::Number, dims, flags, tlim) = length(dims) == 0 || dims[1] == 1 ? y::Number -> y : throw(BoundsError())
-plan_idct(x::Number, dims, flags, tlim) = plan_dct(x, dims)
-plan_dct(x::Number) = y::Number -> y
-plan_idct(x::Number) = y::Number -> y
-
 end # module
diff --git a/base/exports.jl b/base/exports.jl
index 4dfe4b6d5088f..d1c8ad5e2b3d9 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -1314,6 +1314,7 @@ export
     pointer,
     pointer_from_objref,
     pointer_to_array,
+    pointer_to_string,
     reenable_sigint,
     unsafe_copy!,
     unsafe_load,
diff --git a/base/fft/FFTW.jl b/base/fft/FFTW.jl
new file mode 100644
index 0000000000000..539e15fea563c
--- /dev/null
+++ b/base/fft/FFTW.jl
@@ -0,0 +1,781 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+module FFTW
+
+import ..DFT: fft, bfft, ifft, rfft, brfft, irfft, plan_fft, plan_bfft, plan_ifft, plan_rfft, plan_brfft, plan_irfft, fft!, bfft!, ifft!, plan_fft!, plan_bfft!, plan_ifft!, Plan, rfft_output_size, brfft_output_size, plan_inv, normalization, ScaledPlan
+
+import Base: show, *, convert, unsafe_convert, size, strides, ndims, pointer, A_mul_B!
+
+export r2r, r2r!, plan_r2r, plan_r2r!
+
+export export_wisdom, import_wisdom, import_system_wisdom, forget_wisdom,
+       MEASURE, DESTROY_INPUT, UNALIGNED, CONSERVE_MEMORY, EXHAUSTIVE,
+       PRESERVE_INPUT, PATIENT, ESTIMATE, WISDOM_ONLY, NO_TIMELIMIT,
+       R2HC, HC2R, DHT, REDFT00, REDFT01, REDFT10, REDFT11,
+       RODFT00, RODFT01, RODFT10, RODFT11,
+       fftwNumber, fftwReal, fftwComplex, flops
+
+## FFT: Implement fft by calling fftw.
+
+const libfftw = Base.libfftw_name
+const libfftwf = Base.libfftwf_name
+
+const version = convert(VersionNumber, split(bytestring(cglobal((:fftw_version,Base.DFT.FFTW.libfftw), UInt8)), "-")[2])
+
+## Direction of FFT
+
+const FORWARD = -1
+const BACKWARD = 1
+
+## FFTW Flags from fftw3.h
+
+const MEASURE         = UInt32(0)
+const DESTROY_INPUT   = UInt32(1 << 0)
+const UNALIGNED       = UInt32(1 << 1)
+const CONSERVE_MEMORY = UInt32(1 << 2)
+const EXHAUSTIVE      = UInt32(1 << 3)   # NO_EXHAUSTIVE is default
+const PRESERVE_INPUT  = UInt32(1 << 4)   # cancels DESTROY_INPUT
+const PATIENT         = UInt32(1 << 5)   # IMPATIENT is default
+const ESTIMATE        = UInt32(1 << 6)
+const WISDOM_ONLY     = UInt32(1 << 21)
+const NO_SIMD = UInt32(1 << 17) # disable SIMD, useful for benchmarking
+
+## R2R transform kinds
+
+const R2HC    = 0
+const HC2R    = 1
+const DHT     = 2
+const REDFT00 = 3
+const REDFT01 = 4
+const REDFT10 = 5
+const REDFT11 = 6
+const RODFT00 = 7
+const RODFT01 = 8
+const RODFT10 = 9
+const RODFT11 = 10
+
+let k2s = Dict{Int,ASCIIString}(R2HC => "R2HC", HC2R => "HC2R", DHT => "DHT", REDFT00 => "REDFT00", REDFT01 => "REDFT01", REDFT10 => "REDFT10", REDFT11 => "REDFT11", RODFT00 => "RODFT00", RODFT01 => "RODFT01", RODFT10 => "RODFT10", RODFT11 => "RODFT11")
+    global kind2string
+    kind2string(k::Integer) = k2s[Int(k)]
+end
+
+# FFTW floating-point types:
+
+typealias fftwNumber Union(Float64,Float32,Complex128,Complex64)
+typealias fftwReal Union(Float64,Float32)
+typealias fftwComplex Union(Complex128,Complex64)
+typealias fftwDouble Union(Float64,Complex128)
+typealias fftwSingle Union(Float32,Complex64)
+typealias fftwTypeDouble Union(Type{Float64},Type{Complex128})
+typealias fftwTypeSingle Union(Type{Float32},Type{Complex64})
+
+# For ESTIMATE plans, FFTW allows one to pass NULL for the array pointer,
+# since it is not written to.  Hence, it is convenient to create an
+# array-like type that carries a size and a stride like a "real" array
+# but which is converted to C_NULL as a pointer.
+immutable FakeArray{T}
+    sz::Dims
+    st::Dims
+end
+size(a::FakeArray) = a.sz
+strides(a::FakeArray) = a.st
+ndims(a::FakeArray) = length(a.sz)
+unsafe_convert{T}(::Type{Ptr{T}}, a::FakeArray{T}) = convert(Ptr{T}, C_NULL)
+pointer{T}(a::FakeArray{T}) = convert(Ptr{T}, C_NULL)
+FakeArray(T, sz::Dims) = FakeArray{T}(sz, colmajorstrides(sz))
+FakeArray(T, sz::Int...) = FakeArray(T, sz)
+fakesimilar(flags, X, T) = flags & ESTIMATE != 0 ? FakeArray(T, size(X)) : Array(T, size(X))
+alignment_of(A::FakeArray) = Int32(0)
+typealias StridedArrayish{T} Union(StridedArray{T}, FakeArray{T})
+
+## Julia wrappers around FFTW functions
+
+# Wisdom
+
+# Import and export wisdom to/from a single file for all precisions,
+# which is more user-friendly than requiring the user to call a
+# separate routine depending on the fp precision of the plans.  This
+# requires a bit of trickness since we have to (a) use the libc file
+# I/O routines with fftw_export_wisdom_to_file/import_wisdom_from_file
+# (b) we need 256 bytes of space padding between the wisdoms to work
+# around FFTW's internal file i/o buffering [see the BUFSZ constant in
+# FFTW's api/import-wisdom-from-file.c file].
+
+function export_wisdom(fname::AbstractString)
+    f = ccall(:fopen, Ptr{Void}, (Cstring,Ptr{UInt8}), fname, "w")
+    systemerror("could not open wisdom file $fname for writing", f == C_NULL)
+    ccall((:fftw_export_wisdom_to_file,libfftw), Void, (Ptr{Void},), f)
+    ccall(:fputs, Int32, (Ptr{UInt8},Ptr{Void}), " "^256, f)
+    ccall((:fftwf_export_wisdom_to_file,libfftwf), Void, (Ptr{Void},), f)
+    ccall(:fclose, Void, (Ptr{Void},), f)
+end
+
+function import_wisdom(fname::AbstractString)
+    f = ccall(:fopen, Ptr{Void}, (Cstring,Ptr{UInt8}), fname, "r")
+    systemerror("could not open wisdom file $fname for reading", f == C_NULL)
+    if ccall((:fftw_import_wisdom_from_file,libfftw),Int32,(Ptr{Void},),f)==0||
+       ccall((:fftwf_import_wisdom_from_file,libfftwf),Int32,(Ptr{Void},),f)==0
+        error("failed to import wisdom from $fname")
+    end
+    ccall(:fclose, Void, (Ptr{Void},), f)
+end
+
+function import_system_wisdom()
+    if ccall((:fftw_import_system_wisdom,libfftw), Int32, ()) == 0 ||
+       ccall((:fftwf_import_system_wisdom,libfftwf), Int32, ()) == 0
+        error("failed to import system wisdom")
+    end
+end
+
+function forget_wisdom()
+    ccall((:fftw_forget_wisdom,libfftw), Void, ())
+    ccall((:fftwf_forget_wisdom,libfftwf), Void, ())
+end
+
+# Threads
+
+let initialized = false
+    global set_num_threads
+    function set_num_threads(nthreads::Integer)
+        if !initialized
+            # must re-initialize FFTW if any FFTW routines have been called
+            ccall((:fftw_cleanup,libfftw), Void, ())
+            ccall((:fftwf_cleanup,libfftwf), Void, ())
+            stat = ccall((:fftw_init_threads,libfftw), Int32, ())
+            statf = ccall((:fftwf_init_threads,libfftwf), Int32, ())
+            if stat == 0 || statf == 0
+                error("could not initialize FFTW threads")
+            end
+            initialized = true
+        end
+        ccall((:fftw_plan_with_nthreads,libfftw), Void, (Int32,), nthreads)
+        ccall((:fftwf_plan_with_nthreads,libfftwf), Void, (Int32,), nthreads)
+    end
+end
+
+# pointer type for fftw_plan (opaque pointer)
+
+immutable fftw_plan_struct end
+typealias PlanPtr Ptr{fftw_plan_struct}
+
+# Planner timelimits
+
+const NO_TIMELIMIT = -1.0 # from fftw3.h
+
+set_timelimit(precision::fftwTypeDouble,seconds) =
+    ccall((:fftw_set_timelimit,libfftw), Void, (Float64,), seconds)
+
+set_timelimit(precision::fftwTypeSingle,seconds) =
+    ccall((:fftwf_set_timelimit,libfftwf), Void, (Float64,), seconds)
+
+# Array alignment mod 16:
+#   FFTW plans may depend on the alignment of the array mod 16 bytes,
+#   i.e. the address mod 16 of the first element of the array, in order
+#   to exploit SIMD operations.  Julia arrays are, by default, aligned
+#   to 16-byte boundaries (address mod 16 == 0), but this may not be
+#   true for data imported from external C code, or for SubArrays.
+#   Use the undocumented routine fftw_alignment_of to determine the
+#   alignment of a given pointer modulo whatever FFTW needs; this
+#   function will be documented in FFTW 3.3.4.
+
+
+if Base.libfftw_name == "libmkl_rt"
+    alignment_of{T<:fftwDouble}(A::StridedArray{T}) =
+        convert(Int32, convert(Int64, pointer(A)) % 16)
+    alignment_of{T<:fftwSingle}(A::StridedArray{T}) =
+        convert(Int32, convert(Int64, pointer(A)) % 16)
+else
+    alignment_of{T<:fftwDouble}(A::StridedArray{T}) =
+        ccall((:fftw_alignment_of, libfftw), Int32, (Ptr{T},), A)
+    alignment_of{T<:fftwSingle}(A::StridedArray{T}) =
+        ccall((:fftwf_alignment_of, libfftwf), Int32, (Ptr{T},), A)
+end
+
+# FFTWPlan (low-level)
+
+# low-level storage of the FFTW plan, along with the information
+# needed to determine whether it is applicable.   We need to put
+# this into a type to support a finalizer on the fftw_plan.
+# K is FORWARD/BACKWARD for forward/backward or r2c/c2r plans, respectively.
+# For r2r plans, K is a tuple of the transform kinds along each dimension.
+abstract FFTWPlan{T<:fftwNumber,K,inplace} <: Plan{T}
+for P in (:cFFTWPlan, :rFFTWPlan, :r2rFFTWPlan) # complex, r2c/c2r, and r2r
+    @eval begin
+        type $P{T<:fftwNumber,K,inplace} <: FFTWPlan{T,K,inplace}
+            plan::PlanPtr
+            sz::Dims # size of array on which plan operates (Int tuple)
+            osz::Dims # size of output array (Int tuple)
+            istride::Dims # strides of input
+            ostride::Dims # strides of output
+            ialign::Int32 # alignment mod 16 of input
+            oalign::Int32 # alignment mod 16 of input
+            flags::UInt32 # planner flags
+            region::Any # region (iterable) of dims that are transormed
+            pinv::ScaledPlan
+            function $P(plan::PlanPtr, sz::Dims, osz::Dims,
+                        istride::Dims, ostride::Dims,
+                        ialign::Int32, oalign::Int32, flags::Integer, R)
+                p = new(plan,sz,osz,istride,ostride,ialign,oalign,flags,R)
+                finalizer(p, destroy_plan)
+                return p
+            end
+        end
+        $P{T<:fftwNumber}(plan::PlanPtr, flags::Integer, R::Any, X::StridedArrayish{T}, Y::StridedArrayish, K) = $P{T,K,pointer(X) == pointer(Y)}(plan, size(X), size(Y), strides(X), strides(Y), alignment_of(X), alignment_of(Y), flags, R)
+    end
+end
+
+size(p::FFTWPlan) = p.sz
+
+unsafe_convert(::Type{PlanPtr}, p::FFTWPlan) = p.plan
+
+destroy_plan{T<:fftwDouble}(plan::FFTWPlan{T}) =
+    ccall((:fftw_destroy_plan,libfftw), Void, (PlanPtr,), plan)
+
+destroy_plan{T<:fftwSingle}(plan::FFTWPlan{T}) =
+    ccall((:fftwf_destroy_plan,libfftwf), Void, (PlanPtr,), plan)
+
+cost{T<:fftwDouble}(plan::FFTWPlan{T}) =
+    ccall((:fftw_cost,libfftw), Float64, (PlanPtr,), plan)
+cost{T<:fftwSingle}(plan::FFTWPlan{T}) =
+    ccall((:fftwf_cost,libfftwf), Float64, (PlanPtr,), plan)
+
+let add=[0.0], mul=[0.0], fma=[0.0]
+    global arithmetic_ops
+    function arithmetic_ops{T<:fftwDouble}(plan::FFTWPlan{T})
+        ccall((:fftw_flops,libfftw), Void,
+              (PlanPtr,Ptr{Float64},Ptr{Float64},Ptr{Float64}),
+              plan, add, mul, fma)
+        (int64(add[1]), int64(mul[1]), int64(fma[1]))
+    end
+    function arithmetic_ops{T<:fftwSingle}(plan::FFTWPlan{T})
+        ccall((:fftwf_flops,libfftwf), Void,
+              (PlanPtr,Ptr{Float64},Ptr{Float64},Ptr{Float64}),
+              plan, add, mul, fma)
+        (int64(add[1]), int64(mul[1]), int64(fma[1]))
+    end
+end
+flops(plan::FFTWPlan) = let ops=arithmetic_ops(plan)
+    ops[1] + ops[2] + 2*ops[3] # add + mul + 2*fma
+end
+
+# Pretty-printing plans
+
+function showfftdims(io, sz::Dims, istride::Dims, T)
+    if isempty(sz)
+        print(io, "0-dimensional")
+    elseif length(sz) == 1
+        print(io, sz[1], "-element")
+    else
+        print(io, join(sz, "x"))
+    end
+    if istride == colmajorstrides(sz)
+        print(io, " array of ", T)
+    else
+        print(io, " $istride-strided array of ", T)
+    end
+end
+
+# The sprint_plan function was released in FFTW 3.3.4
+sprint_plan_{T<:fftwDouble}(plan::FFTWPlan{T}) =
+    ccall((:fftw_sprint_plan,libfftw), Ptr{UInt8}, (PlanPtr,), plan)
+sprint_plan_{T<:fftwSingle}(plan::FFTWPlan{T}) =
+    ccall((:fftwf_sprint_plan,libfftwf), Ptr{UInt8}, (PlanPtr,), plan)
+function sprint_plan(plan::FFTWPlan)
+    pointer_to_string(sprint_plan_(plan), true)
+end
+
+function show{T,K,inplace}(io::IO, p::cFFTWPlan{T,K,inplace})
+    print(io, inplace ? "FFTW in-place " : "FFTW ",
+          K < 0 ? "forward" : "backward", " plan for ")
+    showfftdims(io, p.sz, p.istride, T)
+    version >= v"3.3.4" && print(io, "\n", sprint_plan(p))
+end
+
+function show{T,K,inplace}(io::IO, p::rFFTWPlan{T,K,inplace})
+    print(io, inplace ? "FFTW in-place " : "FFTW ",
+          K < 0 ? "real-to-complex" : "complex-to-real",
+          " plan for ")
+    showfftdims(io, p.sz, p.istride, T)
+    version >= v"3.3.4" && print(io, "\n", sprint_plan(p))
+end
+
+function show{T,K,inplace}(io::IO, p::r2rFFTWPlan{T,K,inplace})
+    print(io, inplace ? "FFTW in-place r2r " : "FFTW r2r ")
+    if isempty(K)
+        print(io, "0-dimensional")
+    elseif K == ntuple(i -> K[1], length(K))
+        print(io, kind2string(K[1]))
+        if length(K) > 1
+            print(io, "^", length(K))
+        end
+    else
+        print(io, join(map(kind2string, K), "x"))
+    end
+    print(io, " plan for ")
+    showfftdims(io, p.sz, p.istride, T)
+    version >= v"3.3.4" && print(io, "\n", sprint_plan(p))
+end
+
+# Check whether a FFTWPlan is applicable to a given input array, and
+# throw an informative error if not:
+function assert_applicable{T}(p::FFTWPlan{T}, X::StridedArray{T})
+    if size(X) != p.sz
+        throw(ArgumentError("FFTW plan applied to wrong-size array"))
+    elseif strides(X) != p.istride
+        throw(ArgumentError("FFTW plan applied to wrong-strides array"))
+    elseif alignment_of(X) != p.ialign || p.flags & UNALIGNED != 0
+        throw(ArgumentError("FFTW plan applied to array with wrong memory alignment"))
+    end
+end
+
+function assert_applicable{T,K,inplace}(p::FFTWPlan{T,K,inplace}, X::StridedArray{T}, Y::StridedArray)
+    assert_applicable(p, X)
+    if size(Y) != p.osz
+        throw(ArgumentError("FFTW plan applied to wrong-size output"))
+    elseif strides(Y) != p.ostride
+        throw(ArgumentError("FFTW plan applied to wrong-strides output"))
+    elseif alignment_of(Y) != p.oalign || p.flags & UNALIGNED != 0
+        throw(ArgumentError("FFTW plan applied to output with wrong memory alignment"))
+    elseif inplace != (pointer(X) == pointer(Y))
+        throw(ArgumentError(string("FFTW ",
+                                   inplace ? "in-place" : "out-of-place",
+                                   " plan applied to ",
+                                   inplace ? "out-of-place" : "in-place",
+                                   " data")))
+    end
+end
+
+# strides for a column-major (Julia-style) array of size == sz
+colmajorstrides(sz) = isempty(sz) ? () : tuple(1,cumprod(Int[sz[1:end-1]...])...)
+
+# Execute
+
+unsafe_execute!{T<:fftwDouble}(plan::FFTWPlan{T}) =
+    ccall((:fftw_execute,libfftw), Void, (PlanPtr,), plan)
+
+unsafe_execute!{T<:fftwSingle}(plan::FFTWPlan{T}) =
+    ccall((:fftwf_execute,libfftwf), Void, (PlanPtr,), plan)
+
+unsafe_execute!{T<:fftwDouble}(plan::cFFTWPlan{T},
+                               X::StridedArray{T}, Y::StridedArray{T}) =
+    ccall((:fftw_execute_dft,libfftw), Void,
+          (PlanPtr,Ptr{T},Ptr{T}), plan, X, Y)
+
+unsafe_execute!{T<:fftwSingle}(plan::cFFTWPlan{T},
+                               X::StridedArray{T}, Y::StridedArray{T}) =
+    ccall((:fftwf_execute_dft,libfftwf), Void,
+          (PlanPtr,Ptr{T},Ptr{T}), plan, X, Y)
+
+unsafe_execute!(plan::rFFTWPlan{Float64,FORWARD},
+                X::StridedArray{Float64}, Y::StridedArray{Complex128}) =
+    ccall((:fftw_execute_dft_r2c,libfftw), Void,
+          (PlanPtr,Ptr{Float64},Ptr{Complex128}), plan, X, Y)
+
+unsafe_execute!(plan::rFFTWPlan{Float32,FORWARD},
+                X::StridedArray{Float32}, Y::StridedArray{Complex64}) =
+    ccall((:fftwf_execute_dft_r2c,libfftwf), Void,
+          (PlanPtr,Ptr{Float32},Ptr{Complex64}), plan, X, Y)
+
+unsafe_execute!(plan::rFFTWPlan{Complex128,BACKWARD},
+                X::StridedArray{Complex128}, Y::StridedArray{Float64}) =
+    ccall((:fftw_execute_dft_c2r,libfftw), Void,
+          (PlanPtr,Ptr{Complex128},Ptr{Float64}), plan, X, Y)
+
+unsafe_execute!(plan::rFFTWPlan{Complex64,BACKWARD},
+                X::StridedArray{Complex64}, Y::StridedArray{Float32}) =
+    ccall((:fftwf_execute_dft_c2r,libfftwf), Void,
+          (PlanPtr,Ptr{Complex64},Ptr{Float32}), plan, X, Y)
+
+unsafe_execute!{T<:fftwDouble}(plan::r2rFFTWPlan{T},
+                               X::StridedArray{T}, Y::StridedArray{T}) =
+    ccall((:fftw_execute_r2r,libfftw), Void,
+          (PlanPtr,Ptr{T},Ptr{T}), plan, X, Y)
+
+unsafe_execute!{T<:fftwSingle}(plan::r2rFFTWPlan{T},
+                               X::StridedArray{T}, Y::StridedArray{T}) =
+    ccall((:fftwf_execute_r2r,libfftwf), Void,
+          (PlanPtr,Ptr{T},Ptr{T}), plan, X, Y)
+
+# NOTE ON GC (garbage collection):
+#    The FFTWPlan has a finalizer so that gc will destroy the plan,
+#    which is necessary for gc to work with plan_fft.  However,
+#    even when we are creating a single-use FFTWPlan [e.g. for fftn(x)],
+#    we intentionally do NOT call destroy_plan explicitly, and instead
+#    wait for garbage collection.  The reason is that, in the common
+#    case where the user calls fft(x) a second time soon afterwards,
+#    if destroy_plan has not yet been called then FFTW will internally
+#    re-use the table of trigonometric constants from the first plan.
+
+# Compute dims and howmany for FFTW guru planner
+function dims_howmany(X::StridedArrayish, Y::StridedArrayish,
+                      sz::Array{Int,1}, region)
+    reg = [region...]
+    if length(unique(reg)) < length(reg)
+        throw(ArgumentError("each dimension can be transformed at most once"))
+    end
+    ist = [strides(X)...]
+    ost = [strides(Y)...]
+    dims = [sz[reg] ist[reg] ost[reg]]'
+    oreg = [1:ndims(X);]
+    oreg[reg] = 0
+    oreg = filter(d -> d > 0, oreg)
+    howmany = [sz[oreg] ist[oreg] ost[oreg]]'
+    return (dims, howmany)
+end
+
+# check & convert kinds into int32 array with same length as region
+function fix_kinds(region, kinds)
+    if length(kinds) != length(region)
+        if length(kinds) > length(region)
+            throw(ArgumentError("too many transform kinds"))
+        else
+            if length(kinds) == 0
+                throw(ArgumentError("must supply a transform kind"))
+            end
+            k = Array(Int32, length(region))
+            k[1:length(kinds)] = [kinds...]
+            k[length(kinds)+1:end] = kinds[end]
+            kinds = k
+        end
+    else
+        kinds = Int32[kinds...]
+    end
+    for i = 1:length(kinds)
+        if kinds[i] < 0 || kinds[i] > 10
+            throw(ArgumentError("invalid transform kind"))
+        end
+    end
+    return kinds
+end
+
+# low-level FFTWPlan creation (for internal use in FFTW module)
+
+for (Tr,Tc,fftw,lib) in ((:Float64,:Complex128,"fftw",libfftw),
+                         (:Float32,:Complex64,"fftwf",libfftwf))
+
+    @eval function cFFTWPlan(X::StridedArrayish{$Tc}, Y::StridedArrayish{$Tc},
+                             region, direction::Integer,
+                             flags::Integer, timelimit::Real)
+        set_timelimit($Tr, timelimit)
+        R = copy(region)
+        dims, howmany = dims_howmany(X, Y, [size(X)...], R)
+        plan = ccall(($(string(fftw,"_plan_guru64_dft")),$lib),
+                     PlanPtr,
+                     (Int32, Ptr{Int}, Int32, Ptr{Int},
+                      Ptr{$Tc}, Ptr{$Tc}, Int32, UInt32),
+                     size(dims,2), dims, size(howmany,2), howmany,
+                     X, Y, direction, flags)
+        set_timelimit($Tr, NO_TIMELIMIT)
+        if plan == C_NULL
+            error("FFTW could not create plan") # shouldn't normally happen
+        end
+        return cFFTWPlan(plan, flags, R, X, Y,
+                         direction < 0 ? FORWARD : BACKWARD)
+    end
+
+    @eval function rFFTWPlan(X::StridedArrayish{$Tr}, Y::StridedArrayish{$Tc},
+                             region, flags::Integer, timelimit::Real)
+        R = copy(region)
+        region = circshift([region...],-1) # FFTW halves last dim
+        set_timelimit($Tr, timelimit)
+        dims, howmany = dims_howmany(X, Y, [size(X)...], region)
+        plan = ccall(($(string(fftw,"_plan_guru64_dft_r2c")),$lib),
+                     PlanPtr,
+                     (Int32, Ptr{Int}, Int32, Ptr{Int},
+                      Ptr{$Tr}, Ptr{$Tc}, UInt32),
+                     size(dims,2), dims, size(howmany,2), howmany,
+                     X, Y, flags)
+        set_timelimit($Tr, NO_TIMELIMIT)
+        if plan == C_NULL
+            error("FFTW could not create plan") # shouldn't normally happen
+        end
+        return rFFTWPlan(plan, flags, R, X, Y, FORWARD)
+    end
+
+    @eval function rFFTWPlan(X::StridedArrayish{$Tc}, Y::StridedArrayish{$Tr},
+                             region, flags::Integer, timelimit::Real)
+        R = copy(region)
+        region = circshift([region...],-1) # FFTW halves last dim
+        set_timelimit($Tr, timelimit)
+        dims, howmany = dims_howmany(X, Y, [size(Y)...], region)
+        plan = ccall(($(string(fftw,"_plan_guru64_dft_c2r")),$lib),
+                     PlanPtr,
+                     (Int32, Ptr{Int}, Int32, Ptr{Int},
+                      Ptr{$Tc}, Ptr{$Tr}, UInt32),
+                     size(dims,2), dims, size(howmany,2), howmany,
+                     X, Y, flags)
+        set_timelimit($Tr, NO_TIMELIMIT)
+        if plan == C_NULL
+            error("FFTW could not create plan") # shouldn't normally happen
+        end
+        return rFFTWPlan(plan, flags, R, X, Y, BACKWARD)
+    end
+
+    @eval function r2rFFTWPlan(X::StridedArrayish{$Tr}, Y::StridedArrayish{$Tr},
+                               region, kinds,
+                               flags::Integer, timelimit::Real)
+        R = copy(region)
+        knd = fix_kinds(region, kinds)
+        set_timelimit($Tr, timelimit)
+        dims, howmany = dims_howmany(X, Y, [size(X)...], region)
+        plan = ccall(($(string(fftw,"_plan_guru64_r2r")),$lib),
+                     PlanPtr,
+                     (Int32, Ptr{Int}, Int32, Ptr{Int},
+                      Ptr{$Tr}, Ptr{$Tr}, Ptr{Int32}, UInt32),
+                     size(dims,2), dims, size(howmany,2), howmany,
+                     X, Y, knd, flags)
+        set_timelimit($Tr, NO_TIMELIMIT)
+        if plan == C_NULL
+            error("FFTW could not create plan") # shouldn't normally happen
+        end
+        return r2rFFTWPlan(plan, flags, R, X, Y, tuple(map(Int,knd)...))
+    end
+
+    # support r2r transforms of complex = transforms of real & imag parts
+    @eval function r2rFFTWPlan(X::StridedArrayish{$Tc}, Y::StridedArrayish{$Tc},
+                               region, kinds,
+                               flags::Integer, timelimit::Real)
+        R = copy(region)
+        knd = fix_kinds(region, kinds)
+        set_timelimit($Tr, timelimit)
+        dims, howmany = dims_howmany(X, Y, [size(X)...], region)
+        dims[2:3, 1:size(dims,2)] *= 2
+        howmany[2:3, 1:size(howmany,2)] *= 2
+        howmany = [howmany [2,1,1]] # append loop over real/imag parts
+        plan = ccall(($(string(fftw,"_plan_guru64_r2r")),$lib),
+                     PlanPtr,
+                     (Int32, Ptr{Int}, Int32, Ptr{Int},
+                      Ptr{$Tc}, Ptr{$Tc}, Ptr{Int32}, UInt32),
+                     size(dims,2), dims, size(howmany,2), howmany,
+                     X, Y, knd, flags)
+        set_timelimit($Tr, NO_TIMELIMIT)
+        if plan == C_NULL
+            error("FFTW could not create plan") # shouldn't normally happen
+        end
+        return r2rFFTWPlan(plan, flags, R, X, Y, tuple(map(Int,knd)...))
+    end
+
+end
+
+# Convert arrays of numeric types to FFTW-supported packed complex-float types
+# (FIXME: is there a way to use the Julia promotion rules more cleverly here?)
+fftwcomplex{T<:fftwComplex}(X::StridedArray{T}) = X
+fftwcomplex{T<:fftwReal}(X::AbstractArray{T}) =
+    copy!(Array(typeof(complex(one(T))), size(X)), X)
+fftwcomplex{T<:Real}(X::AbstractArray{T}) = copy!(Array(Complex128, size(X)),X)
+fftwcomplex{T<:Complex}(X::AbstractArray{T}) =
+    copy!(Array(Complex128, size(X)), X)
+fftwfloat{T<:fftwReal}(X::StridedArray{T}) = X
+fftwfloat{T<:Real}(X::AbstractArray{T}) = copy!(Array(Float64, size(X)), X)
+fftwfloat{T<:Complex}(X::AbstractArray{T}) = fftwcomplex(X)
+
+for (f,direction) in ((:fft,FORWARD), (:bfft,BACKWARD))
+    plan_f = symbol("plan_",f)
+    plan_f! = symbol("plan_",f,"!")
+    idirection = -direction
+    @eval begin
+        function $plan_f{T<:fftwComplex}(X::StridedArray{T}, region;
+                                         flags::Integer=ESTIMATE,
+                                         timelimit::Real=NO_TIMELIMIT)
+            cFFTWPlan(X, fakesimilar(flags, X, T), region,
+                      $direction, flags,timelimit)::cFFTWPlan{T,$direction,false}
+        end
+
+        function $plan_f!{T<:fftwComplex}(X::StridedArray{T}, region;
+                                          flags::Integer=ESTIMATE,
+                                          timelimit::Real=NO_TIMELIMIT)
+            cFFTWPlan(X, X, region, $direction, flags, timelimit)::cFFTWPlan{T,$direction,true}
+        end
+        $plan_f{T<:fftwComplex}(X::StridedArray{T}; kws...) =
+            $plan_f(X, 1:ndims(X); kws...)
+        $plan_f!{T<:fftwComplex}(X::StridedArray{T}; kws...) =
+            $plan_f!(X, 1:ndims(X); kws...)
+
+        function plan_inv{T<:fftwComplex,inplace}(p::cFFTWPlan{T,$direction,inplace})
+            X = Array(T, p.sz)
+            Y = inplace ? X : fakesimilar(p.flags, X, T)
+            ScaledPlan(cFFTWPlan(X, Y, p.region, $idirection,
+                                 p.flags, NO_TIMELIMIT)::cFFTWPlan{T,$idirection,inplace},
+                       normalization(X, p.region))
+        end
+    end
+end
+
+function A_mul_B!{T}(y::StridedArray{T}, p::cFFTWPlan{T}, x::StridedArray{T})
+    assert_applicable(p, x, y)
+    unsafe_execute!(p, x, y)
+    return y
+end
+
+function *{T,K,N}(p::cFFTWPlan{T,K,false}, x::StridedArray{T,N})
+    assert_applicable(p, x)
+    y = Array(T, p.osz)::Array{T,N}
+    unsafe_execute!(p, x, y)
+    return y
+end
+
+function *{T,K}(p::cFFTWPlan{T,K,true}, x::StridedArray{T})
+    assert_applicable(p, x)
+    unsafe_execute!(p, x, x)
+    return x
+end
+
+# rfft/brfft and planned variants.  No in-place version for now.
+
+for (Tr,Tc) in ((:Float32,:Complex64),(:Float64,:Complex128))
+    # Note: use $FORWARD and $BACKWARD below because of issue #9775
+    @eval begin
+        function plan_rfft(X::StridedArray{$Tr}, region;
+                           flags::Integer=ESTIMATE, timelimit::Real=NO_TIMELIMIT)
+            osize = rfft_output_size(X, region)
+            Y = flags&ESTIMATE != 0 ? FakeArray($Tc,osize...) : Array($Tc,osize...)
+            rFFTWPlan(X, Y, region, flags, timelimit)::rFFTWPlan{$Tr,$FORWARD,false}
+        end
+
+        function plan_brfft(X::StridedArray{$Tc}, d::Integer, region;
+                            flags::Integer=ESTIMATE, timelimit::Real=NO_TIMELIMIT)
+            osize = brfft_output_size(X, d, region)
+            Y = flags&ESTIMATE != 0 ? FakeArray($Tr,osize...) : Array($Tr,osize...)
+
+            # FFTW currently doesn't support PRESERVE_INPUT for
+            # multidimensional out-of-place c2r transforms, so
+            # we have to handle 1d and >1d cases separately with a copy.  Ugh.
+            if length(region) <= 1
+                rFFTWPlan(X, Y, region, flags | PRESERVE_INPUT, timelimit)
+            else
+                Xc = copy(X)
+                rFFTWPlan(X, Y, region, flags, timelimit)
+            end::rFFTWPlan{$Tc,$BACKWARD,false}
+        end
+
+        plan_rfft(X::StridedArray{$Tr};kws...)=plan_rfft(X,1:ndims(X);kws...)
+        plan_brfft(X::StridedArray{$Tr};kws...)=plan_brfft(X,1:ndims(X);kws...)
+
+        function plan_inv(p::rFFTWPlan{$Tr,$FORWARD,false})
+            X = Array($Tr, p.sz)
+            Y = p.flags&ESTIMATE != 0 ? FakeArray($Tc,p.osz) : Array($Tc,p.osz)
+            ScaledPlan(rFFTWPlan(Y, X, p.region,
+                                 length(p.region)<=1 ? p.flags | PRESERVE_INPUT
+                                 : p.flags, NO_TIMELIMIT)::rFFTWPlan{$Tc,$BACKWARD,false},
+                       normalization(X, p.region))
+        end
+
+        function plan_inv(p::rFFTWPlan{$Tc,$BACKWARD,false})
+            X = Array($Tc, p.sz)
+            Y = p.flags&ESTIMATE != 0 ? FakeArray($Tr,p.osz) : Array($Tr,p.osz)
+            ScaledPlan(rFFTWPlan(Y, X, p.region, p.flags, NO_TIMELIMIT)::rFFTWPlan{$Tr,$FORWARD,false},
+                       normalization(Y, p.region))
+        end
+
+        function A_mul_B!(y::StridedArray{$Tc}, p::rFFTWPlan{$Tr,$FORWARD}, x::StridedArray{$Tr})
+            assert_applicable(p, x, y)
+            unsafe_execute!(p, x, y)
+            return y
+        end
+        function A_mul_B!(y::StridedArray{$Tr}, p::rFFTWPlan{$Tc,$BACKWARD}, x::StridedArray{$Tc})
+            assert_applicable(p, x, y)
+            unsafe_execute!(p, x, y) # note: may overwrite x as well as y!
+            return y
+        end
+
+        function *{N}(p::rFFTWPlan{$Tr,$FORWARD,false}, x::StridedArray{$Tr,N})
+            assert_applicable(p, x)
+            y = Array($Tc, p.osz)::Array{$Tc,N}
+            unsafe_execute!(p, x, y)
+            return y
+        end
+
+        function *{N}(p::rFFTWPlan{$Tc,$BACKWARD,false}, x::StridedArray{$Tc,N})
+            if p.flags & PRESERVE_INPUT != 0
+                assert_applicable(p, x)
+                y = Array($Tr, p.osz)::Array{$Tr,N}
+                unsafe_execute!(p, x, y)
+            else # need to make a copy to avoid overwriting x
+                xc = copy(x)
+                assert_applicable(p, xc)
+                y = Array($Tr, p.osz)::Array{$Tr,N}
+                unsafe_execute!(p, xc, y)
+            end
+            return y
+        end
+    end
+end
+
+# FFTW r2r transforms (low-level interface)
+
+for f in (:r2r, :r2r!)
+    pf = symbol("plan_", f)
+    @eval begin
+        $f{T<:fftwNumber}(x::AbstractArray{T}, kinds) = $pf(x, kinds) * x
+        $f{T<:fftwNumber}(x::AbstractArray{T}, kinds, region) = $pf(x, kinds, region) * x
+        $pf(x::AbstractArray, kinds; kws...) = $pf(x, kinds, 1:ndims(x); kws...)
+        $f{T<:Real}(x::AbstractArray{T}, kinds, region=1:ndims(x)) = $f(fftwfloat(x), kinds, region)
+        $pf{T<:Real}(x::AbstractArray{T}, kinds, region; kws...) = $pf(fftwfloat(x), kinds, region; kws...)
+        $f{T<:Complex}(x::AbstractArray{T}, kinds, region=1:ndims(x)) = $f(fftwcomplex(x), kinds, region)
+        $pf{T<:Complex}(x::AbstractArray{T}, kinds, region; kws...) = $pf(fftwcomplex(x), kinds, region; kws...)
+    end
+end
+
+function plan_r2r{T<:fftwNumber}(X::StridedArray{T}, kinds, region;
+                                 flags::Integer=ESTIMATE,
+                                 timelimit::Real=NO_TIMELIMIT)
+    r2rFFTWPlan(X, fakesimilar(flags, X, T), region, kinds, flags, timelimit)
+end
+
+function plan_r2r!{T<:fftwNumber}(X::StridedArray{T}, kinds, region;
+                                  flags::Integer=ESTIMATE,
+                                  timelimit::Real=NO_TIMELIMIT)
+    r2rFFTWPlan(X, X, region, kinds, flags, timelimit)
+end
+
+# mapping from r2r kind to the corresponding inverse transform
+const inv_kind = Dict{Int,Int}(R2HC => HC2R, HC2R => R2HC, DHT => DHT,
+                               REDFT00 => REDFT00,
+                               REDFT01 => REDFT10, REDFT10 => REDFT01,
+                               REDFT11 => REDFT11,
+                               RODFT00 => RODFT00,
+                               RODFT01 => RODFT10, RODFT10 => RODFT01,
+                               RODFT11 => RODFT11)
+
+# r2r inverses are normalized to 1/N, where N is a "logical" size
+# the transform with length n and kind k:
+function logical_size(n::Integer, k::Integer)
+    k <= DHT && return n
+    k == REDFT00 && return 2(n-1)
+    k == RODFT00 && return 2(n+1)
+    return 2n
+end
+
+function plan_inv{T<:fftwNumber,K,inplace}(p::r2rFFTWPlan{T,K,inplace})
+    X = Array(T, p.sz)
+    iK = fix_kinds(p.region, [inv_kind[k] for k in K])
+    Y = inplace ? X : fakesimilar(p.flags, X, T)
+    ScaledPlan(r2rFFTWPlan(X, Y, p.region, iK,
+                           p.flags, NO_TIMELIMIT),
+               normalization(real(T),
+                             map(logical_size, [p.sz...][[p.region...]], iK),
+                             1:length(iK)))
+end
+
+function A_mul_B!{T}(y::StridedArray{T}, p::r2rFFTWPlan{T}, x::StridedArray{T})
+    assert_applicable(p, x, y)
+    unsafe_execute!(p, x, y)
+    return y
+end
+
+function *{T,K,N}(p::r2rFFTWPlan{T,K,false}, x::StridedArray{T,N})
+    assert_applicable(p, x)
+    y = Array(T, p.osz)::Array{T,N}
+    unsafe_execute!(p, x, y)
+    return y
+end
+
+function *{T,K}(p::r2rFFTWPlan{T,K,true}, x::StridedArray{T})
+    assert_applicable(p, x)
+    unsafe_execute!(p, x, x)
+    return x
+end
+
+include("dct.jl")
+
+end # module
diff --git a/base/fft/dct.jl b/base/fft/dct.jl
new file mode 100644
index 0000000000000..53c8f6bb65ee7
--- /dev/null
+++ b/base/fft/dct.jl
@@ -0,0 +1,103 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# (This is part of the FFTW module.)
+
+export dct, idct, dct!, idct!, plan_dct, plan_idct, plan_dct!, plan_idct!
+
+# Discrete cosine transforms (type II/III) via FFTW's r2r transforms;
+# we follow the Matlab convention and adopt a unitary normalization here.
+# Unlike Matlab we compute the multidimensional transform by default,
+# similar to the Julia fft functions.
+
+type DCTPlan{T<:fftwNumber,K,inplace} <: Plan{T}
+    plan::r2rFFTWPlan{T}
+    r::Array{UnitRange{Int}} # array of indices for rescaling
+    nrm::Float64 # normalization factor
+    region::Dims # dimensions being transformed
+    pinv::DCTPlan{T}
+    DCTPlan(plan,r,nrm,region) = new(plan,r,nrm,region)
+end
+
+size(p::DCTPlan) = size(p.plan)
+
+function show{T,K,inplace}(io::IO, p::DCTPlan{T,K,inplace})
+    print(io, inplace ? "FFTW in-place " : "FFTW ",
+          K == REDFT10 ? "DCT (DCT-II)" : "IDCT (DCT-III)", " plan for ")
+    showfftdims(io, p.plan.sz, p.plan.istride, eltype(p))
+end
+
+for (pf, pfr, K, inplace) in ((:plan_dct, :plan_r2r, REDFT10, false),
+                              (:plan_dct!, :plan_r2r!, REDFT10, true),
+                              (:plan_idct, :plan_r2r, REDFT01, false),
+                              (:plan_idct!, :plan_r2r!, REDFT01, true))
+    @eval function $pf{T<:fftwNumber}(X::StridedArray{T}, region; kws...)
+        r = [1:n for n in size(X)]
+        nrm = sqrt(0.5^length(region) * normalization(X,region))
+        DCTPlan{T,$K,$inplace}($pfr(X, $K, region; kws...), r, nrm,
+                               ntuple(i -> Int(region[i]), length(region)))
+    end
+end
+
+function plan_inv{T,K,inplace}(p::DCTPlan{T,K,inplace})
+    X = Array(T, p.plan.sz)
+    iK = inv_kind[K]
+    DCTPlan{T,iK,inplace}(inplace ?
+                          plan_r2r!(X, iK, p.region, flags=p.plan.flags) :
+                          plan_r2r(X, iK, p.region, flags=p.plan.flags),
+                          p.r, p.nrm, p.region)
+end
+
+for f in (:dct, :dct!, :idct, :idct!)
+    pf = symbol(string("plan_", f))
+    @eval begin
+        $f{T<:fftwNumber}(x::AbstractArray{T}) = $pf(x) * x
+        $f{T<:fftwNumber}(x::AbstractArray{T}, region) = $pf(x, region) * x
+        $pf(x::AbstractArray; kws...) = $pf(x, 1:ndims(x); kws...)
+        $f{T<:Real}(x::AbstractArray{T}, region=1:ndims(x)) = $f(fftwfloat(x), region)
+        $pf{T<:Real}(x::AbstractArray{T}, region; kws...) = $pf(fftwfloat(x), region; kws...)
+        $pf{T<:Complex}(x::AbstractArray{T}, region; kws...) = $pf(fftwcomplex(x), region; kws...)
+    end
+end
+
+const sqrthalf = sqrt(0.5)
+const sqrt2 = sqrt(2.0)
+const onerange = 1:1
+
+function A_mul_B!{T}(y::StridedArray{T}, p::DCTPlan{T,REDFT10},
+                     x::StridedArray{T})
+    assert_applicable(p.plan, x, y)
+    unsafe_execute!(p.plan, x, y)
+    scale!(y, p.nrm)
+    r = p.r
+    for d in p.region
+        oldr = r[d]
+        r[d] = onerange
+        y[r...] *= sqrthalf
+        r[d] = oldr
+    end
+    return y
+end
+
+# note: idct changes input data
+function A_mul_B!{T}(y::StridedArray{T}, p::DCTPlan{T,REDFT01},
+                     x::StridedArray{T})
+    assert_applicable(p.plan, x, y)
+    scale!(x, p.nrm)
+    r = p.r
+    for d in p.region
+        oldr = r[d]
+        r[d] = onerange
+        x[r...] *= sqrt2
+        r[d] = oldr
+    end
+    unsafe_execute!(p.plan, x, y)
+    return y
+end
+
+*{T}(p::DCTPlan{T,REDFT10,false}, x::StridedArray{T}) =
+    A_mul_B!(Array(T, p.plan.osz), p, x)
+
+*{T}(p::DCTPlan{T,REDFT01,false}, x::StridedArray{T}) =
+    A_mul_B!(Array(T, p.plan.osz), p, copy(x)) # need copy to preserve input
+
+*{T,K}(p::DCTPlan{T,K,true}, x::StridedArray{T}) = A_mul_B!(x, p, x)
diff --git a/base/fftw.jl b/base/fftw.jl
deleted file mode 100644
index c5ccaa436100d..0000000000000
--- a/base/fftw.jl
+++ /dev/null
@@ -1,793 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-module FFTW
-
-export fft, bfft, ifft, rfft, brfft, irfft,
-       plan_fft, plan_bfft, plan_ifft, plan_rfft, plan_brfft, plan_irfft,
-       fft!, bfft!, ifft!, plan_fft!, plan_bfft!, plan_ifft!,
-       r2r, r2r!, plan_r2r, plan_r2r!,
-       export_wisdom, import_wisdom, import_system_wisdom, forget_wisdom,
-       MEASURE, DESTROY_INPUT, UNALIGNED, CONSERVE_MEMORY, EXHAUSTIVE,
-       PRESERVE_INPUT, PATIENT, ESTIMATE, WISDOM_ONLY, NO_TIMELIMIT,
-       R2HC, HC2R, DHT, REDFT00, REDFT01, REDFT10, REDFT11,
-       RODFT00, RODFT01, RODFT10, RODFT11,
-       fftwNumber, fftwReal, fftwComplex
-
-## FFT: Implement fft by calling fftw.
-
-const libfftw = Base.libfftw_name
-const libfftwf = Base.libfftwf_name
-
-## Direction of FFT
-
-const FORWARD = Int32(-1)
-const BACKWARD = Int32(1)
-
-## FFTW Flags from fftw3.h
-
-const MEASURE         = UInt32(0)
-const DESTROY_INPUT   = UInt32(1 << 0)
-const UNALIGNED       = UInt32(1 << 1)
-const CONSERVE_MEMORY = UInt32(1 << 2)
-const EXHAUSTIVE      = UInt32(1 << 3)   # NO_EXHAUSTIVE is default
-const PRESERVE_INPUT  = UInt32(1 << 4)   # cancels DESTROY_INPUT
-const PATIENT         = UInt32(1 << 5)   # IMPATIENT is default
-const ESTIMATE        = UInt32(1 << 6)
-const WISDOM_ONLY     = UInt32(1 << 21)
-
-## R2R transform kinds
-
-const R2HC    = Int32(0)
-const HC2R    = Int32(1)
-const DHT     = Int32(2)
-const REDFT00 = Int32(3)
-const REDFT01 = Int32(4)
-const REDFT10 = Int32(5)
-const REDFT11 = Int32(6)
-const RODFT00 = Int32(7)
-const RODFT01 = Int32(8)
-const RODFT10 = Int32(9)
-const RODFT11 = Int32(10)
-
-# FFTW floating-point types:
-
-typealias fftwNumber Union{Float64,Float32,Complex128,Complex64}
-typealias fftwReal Union{Float64,Float32}
-typealias fftwComplex Union{Complex128,Complex64}
-typealias fftwDouble Union{Float64,Complex128}
-typealias fftwSingle Union{Float32,Complex64}
-typealias fftwTypeDouble Union{Type{Float64},Type{Complex128}}
-typealias fftwTypeSingle Union{Type{Float32},Type{Complex64}}
-
-## Julia wrappers around FFTW functions
-
-# Wisdom
-
-# Import and export wisdom to/from a single file for all precisions,
-# which is more user-friendly than requiring the user to call a
-# separate routine depending on the fp precision of the plans.  This
-# requires a bit of trickness since we have to (a) use the libc file
-# I/O routines with fftw_export_wisdom_to_file/import_wisdom_from_file
-# (b) we need 256 bytes of space padding between the wisdoms to work
-# around FFTW's internal file i/o buffering [see the BUFSZ constant in
-# FFTW's api/import-wisdom-from-file.c file].
-
-function export_wisdom(fname::AbstractString)
-    f = ccall(:fopen, Ptr{Void}, (Cstring,Ptr{UInt8}), fname, "w")
-    systemerror("could not open wisdom file $fname for writing", f == C_NULL)
-    ccall((:fftw_export_wisdom_to_file,libfftw), Void, (Ptr{Void},), f)
-    ccall(:fputs, Int32, (Ptr{UInt8},Ptr{Void}), " "^256, f)
-    ccall((:fftwf_export_wisdom_to_file,libfftwf), Void, (Ptr{Void},), f)
-    ccall(:fclose, Void, (Ptr{Void},), f)
-end
-
-function import_wisdom(fname::AbstractString)
-    f = ccall(:fopen, Ptr{Void}, (Cstring,Ptr{UInt8}), fname, "r")
-    systemerror("could not open wisdom file $fname for reading", f == C_NULL)
-    if ccall((:fftw_import_wisdom_from_file,libfftw),Int32,(Ptr{Void},),f)==0||
-       ccall((:fftwf_import_wisdom_from_file,libfftwf),Int32,(Ptr{Void},),f)==0
-        error("failed to import wisdom from $fname")
-    end
-    ccall(:fclose, Void, (Ptr{Void},), f)
-end
-
-function import_system_wisdom()
-    if ccall((:fftw_import_system_wisdom,libfftw), Int32, ()) == 0 ||
-       ccall((:fftwf_import_system_wisdom,libfftwf), Int32, ()) == 0
-        error("failed to import system wisdom")
-    end
-end
-
-function forget_wisdom()
-    ccall((:fftw_forget_wisdom,libfftw), Void, ())
-    ccall((:fftwf_forget_wisdom,libfftwf), Void, ())
-end
-
-# Threads
-
-let initialized = false
-    global set_num_threads
-    function set_num_threads(nthreads::Integer)
-        if !initialized
-            # must re-initialize FFTW if any FFTW routines have been called
-            ccall((:fftw_cleanup,libfftw), Void, ())
-            ccall((:fftwf_cleanup,libfftwf), Void, ())
-            stat = ccall((:fftw_init_threads,libfftw), Int32, ())
-            statf = ccall((:fftwf_init_threads,libfftwf), Int32, ())
-            if stat == 0 || statf == 0
-                error("could not initialize FFTW threads")
-            end
-            initialized = true
-        end
-        ccall((:fftw_plan_with_nthreads,libfftw), Void, (Int32,), nthreads)
-        ccall((:fftwf_plan_with_nthreads,libfftwf), Void, (Int32,), nthreads)
-    end
-end
-
-# Execute
-
-execute(precision::fftwTypeDouble, plan) =
-    ccall((:fftw_execute,libfftw), Void, (Ptr{Void},), plan)
-
-execute(precision::fftwTypeSingle, plan) =
-    ccall((:fftwf_execute,libfftwf), Void, (Ptr{Void},), plan)
-
-execute(plan, X::StridedArray{Complex128}, Y::StridedArray{Complex128}) =
-    ccall((:fftw_execute_dft,libfftw), Void,
-          (Ptr{Void},Ptr{Complex128},Ptr{Complex128}), plan, X, Y)
-
-execute(plan, X::StridedArray{Complex64}, Y::StridedArray{Complex64}) =
-    ccall((:fftwf_execute_dft,libfftwf), Void,
-          (Ptr{Void},Ptr{Complex64},Ptr{Complex64}), plan, X, Y)
-
-execute(plan, X::StridedArray{Float64}, Y::StridedArray{Complex128}) =
-    ccall((:fftw_execute_dft_r2c,libfftw), Void,
-          (Ptr{Void},Ptr{Float64},Ptr{Complex128}), plan, X, Y)
-
-execute(plan, X::StridedArray{Float32}, Y::StridedArray{Complex64}) =
-    ccall((:fftwf_execute_dft_r2c,libfftwf), Void,
-          (Ptr{Void},Ptr{Float32},Ptr{Complex64}), plan, X, Y)
-
-execute(plan, X::StridedArray{Complex128}, Y::StridedArray{Float64}) =
-    ccall((:fftw_execute_dft_c2r,libfftw), Void,
-          (Ptr{Void},Ptr{Complex128},Ptr{Float64}), plan, X, Y)
-
-execute(plan, X::StridedArray{Complex64}, Y::StridedArray{Float32}) =
-    ccall((:fftwf_execute_dft_c2r,libfftwf), Void,
-          (Ptr{Void},Ptr{Complex64},Ptr{Float32}), plan, X, Y)
-
-execute_r2r{T<:fftwDouble}(plan, X::StridedArray{T}, Y::StridedArray{T}) =
-    ccall((:fftw_execute_r2r,libfftw), Void,
-          (Ptr{Void},Ptr{T},Ptr{T}), plan, X, Y)
-
-execute_r2r{T<:fftwSingle}(plan, X::StridedArray{T}, Y::StridedArray{T}) =
-    ccall((:fftwf_execute_r2r,libfftwf), Void,
-          (Ptr{Void},Ptr{T},Ptr{T}), plan, X, Y)
-
-execute{T<:fftwReal}(plan, X::StridedArray{T}, Y::StridedArray{T}) =
-    execute_r2r(plan, X, Y)
-
-# Destroy plan
-
-destroy_plan(precision::fftwTypeDouble, plan) =
-    ccall((:fftw_destroy_plan,libfftw), Void, (Ptr{Void},), plan)
-
-destroy_plan(precision::fftwTypeSingle, plan) =
-    ccall((:fftwf_destroy_plan,libfftwf), Void, (Ptr{Void},), plan)
-
-# Planner timelimits
-
-const NO_TIMELIMIT = -1.0 # from fftw3.h
-
-set_timelimit(precision::fftwTypeDouble,seconds) =
-    ccall((:fftw_set_timelimit,libfftw), Void, (Float64,), seconds)
-
-set_timelimit(precision::fftwTypeSingle,seconds) =
-    ccall((:fftwf_set_timelimit,libfftwf), Void, (Float64,), seconds)
-
-# Array alignment mod 16:
-#   FFTW plans may depend on the alignment of the array mod 16 bytes,
-#   i.e. the address mod 16 of the first element of the array, in order
-#   to exploit SIMD operations.  Julia arrays are, by default, aligned
-#   to 16-byte boundaries (address mod 16 == 0), but this may not be
-#   true for data imported from external C code, or for SubArrays.
-#   Use the undocumented routine fftw_alignment_of to determine the
-#   alignment of a given pointer modulo whatever FFTW needs.
-
-if Base.libfftw_name == "libmkl_rt"
-
-    alignment_of{T<:fftwDouble}(A::StridedArray{T}) =
-        convert(Int32, convert(Int64, pointer(A)) % 16)
-
-    alignment_of{T<:fftwSingle}(A::StridedArray{T}) =
-        convert(Int32, convert(Int64, pointer(A)) % 16)
-
-else
-
-    alignment_of{T<:fftwDouble}(A::StridedArray{T}) =
-        ccall((:fftw_alignment_of, libfftw), Int32, (Ptr{Void},), A)
-
-    alignment_of{T<:fftwSingle}(A::StridedArray{T}) =
-        ccall((:fftwf_alignment_of, libfftwf), Int32, (Ptr{Void},), A)
-
-end
-
-# Plan (low-level)
-
-# low-level storage of the FFTW plan, along with the information
-# needed to determine whether it is applicable.   We need to put
-# this into a type to support a finalizer on the fftw_plan.
-type Plan{T<:fftwNumber}
-    plan::Ptr{Void}
-    sz::Dims # size of array on which plan operates (Int tuple)
-    istride::Dims # strides of input
-    ialign::Int32 # alignment mod 16 of input
-    function Plan(plan::Ptr{Void}, sz::Dims, istride::Dims, ialign::Int32)
-        p = new(plan,sz,istride,ialign)
-        finalizer(p, p -> destroy_plan(T, p.plan))
-        return p
-    end
-end
-Plan{T<:fftwNumber}(plan::Ptr{Void}, X::StridedArray{T}) = Plan{T}(plan, size(X), strides(X), alignment_of(X))
-
-# Check whether a Plan is applicable to a given input array, and
-# throw an informative error if not:
-function assert_applicable{T<:fftwNumber}(p::Plan{T}, X::StridedArray{T})
-    if size(X) != p.sz
-        throw(ArgumentError("FFTW plan applied to wrong-size array"))
-    elseif strides(X) != p.istride
-        throw(ArgumentError("FFTW plan applied to wrong-strides array"))
-    elseif alignment_of(X) != p.ialign
-        throw(ArgumentError("FFTW plan applied to array with wrong memory alignment"))
-    end
-end
-
-# NOTE ON GC (garbage collection):
-#    The Plan has a finalizer so that gc will destroy the plan,
-#    which is necessary for gc to work with plan_fft.  However,
-#    even when we are creating a single-use Plan [e.g. for fftn(x)],
-#    we intentionally do NOT call destroy_plan explicitly, and instead
-#    wait for garbage collection.  The reason is that, in the common
-#    case where the user calls fft(x) a second time soon afterwards,
-#    if destroy_plan has not yet been called then FFTW will internally
-#    re-use the table of trigonometric constants from the first plan.
-
-# Compute dims and howmany for FFTW guru planner
-function dims_howmany(X::StridedArray, Y::StridedArray,
-                      sz::Array{Int,1}, region)
-    reg = [region...]
-    if length(unique(reg)) < length(reg)
-        throw(ArgumentError("each dimension can be transformed at most once"))
-    end
-    ist = [strides(X)...]
-    ost = [strides(Y)...]
-    dims = [sz[reg] ist[reg] ost[reg]]'
-    oreg = [1:ndims(X);]
-    oreg[reg] = 0
-    oreg = filter(d -> d > 0, oreg)
-    howmany = [sz[oreg] ist[oreg] ost[oreg]]'
-    return (dims, howmany)
-end
-
-# check & convert kinds into int32 array with same length as region
-function fix_kinds(region, kinds)
-    if length(kinds) != length(region)
-        if length(kinds) > length(region)
-            throw(ArgumentError("too many transform kinds"))
-        else
-            if length(kinds) == 0
-                throw(ArgumentError("must supply a transform kind"))
-            end
-            k = Array(Int32, length(region))
-            k[1:length(kinds)] = [kinds...]
-            k[length(kinds)+1:end] = kinds[end]
-            kinds = k
-        end
-    else
-        kinds = Int32[kinds...]
-    end
-    for i = 1:length(kinds)
-        if kinds[i] < 0 || kinds[i] > 10
-            throw(ArgumentError("invalid transform kind"))
-        end
-    end
-    return kinds
-end
-
-# low-level Plan creation (for internal use in FFTW module)
-
-for (Tr,Tc,fftw,lib) in ((:Float64,:Complex128,"fftw",libfftw),
-                         (:Float32,:Complex64,"fftwf",libfftwf))
-
-    @eval function Plan(X::StridedArray{$Tc}, Y::StridedArray{$Tc},
-                        region, direction::Integer,
-                        flags::Unsigned, timelimit::Real)
-        set_timelimit($Tr, timelimit)
-        dims, howmany = dims_howmany(X, Y, [size(X)...], region)
-        plan = ccall(($(string(fftw,"_plan_guru64_dft")),$lib),
-                     Ptr{Void},
-                     (Int32, Ptr{Int}, Int32, Ptr{Int},
-                      Ptr{$Tc}, Ptr{$Tc}, Int32, UInt32),
-                     size(dims,2), dims, size(howmany,2), howmany,
-                     X, Y, direction, flags)
-        set_timelimit($Tr, NO_TIMELIMIT)
-        if plan == C_NULL
-            error("FFTW could not create plan") # shouldn't normally happen
-        end
-        return Plan(plan, X)
-    end
-
-    @eval function Plan(X::StridedArray{$Tr}, Y::StridedArray{$Tc},
-                        region, flags::Unsigned, timelimit::Real)
-        region = circshift([region...],-1) # FFTW halves last dim
-        set_timelimit($Tr, timelimit)
-        dims, howmany = dims_howmany(X, Y, [size(X)...], region)
-        plan = ccall(($(string(fftw,"_plan_guru64_dft_r2c")),$lib),
-                     Ptr{Void},
-                     (Int32, Ptr{Int}, Int32, Ptr{Int},
-                      Ptr{$Tr}, Ptr{$Tc}, UInt32),
-                     size(dims,2), dims, size(howmany,2), howmany,
-                     X, Y, flags)
-        set_timelimit($Tr, NO_TIMELIMIT)
-        if plan == C_NULL
-            error("FFTW could not create plan") # shouldn't normally happen
-        end
-        return Plan(plan, X)
-    end
-
-    @eval function Plan(X::StridedArray{$Tc}, Y::StridedArray{$Tr},
-                        region, flags::Unsigned, timelimit::Real)
-        region = circshift([region...],-1) # FFTW halves last dim
-        set_timelimit($Tr, timelimit)
-        dims, howmany = dims_howmany(X, Y, [size(Y)...], region)
-        plan = ccall(($(string(fftw,"_plan_guru64_dft_c2r")),$lib),
-                     Ptr{Void},
-                     (Int32, Ptr{Int}, Int32, Ptr{Int},
-                      Ptr{$Tc}, Ptr{$Tr}, UInt32),
-                     size(dims,2), dims, size(howmany,2), howmany,
-                     X, Y, flags)
-        set_timelimit($Tr, NO_TIMELIMIT)
-        if plan == C_NULL
-            error("FFTW could not create plan") # shouldn't normally happen
-        end
-        return Plan(plan, X)
-    end
-
-    @eval function Plan_r2r(X::StridedArray{$Tr}, Y::StridedArray{$Tr},
-                            region, kinds,
-                            flags::Unsigned, timelimit::Real)
-        kinds = fix_kinds(region, kinds)
-        set_timelimit($Tr, timelimit)
-        dims, howmany = dims_howmany(X, Y, [size(X)...], region)
-        plan = ccall(($(string(fftw,"_plan_guru64_r2r")),$lib),
-                     Ptr{Void},
-                     (Int32, Ptr{Int}, Int32, Ptr{Int},
-                      Ptr{$Tr}, Ptr{$Tr}, Ptr{Int32}, UInt32),
-                     size(dims,2), dims, size(howmany,2), howmany,
-                     X, Y, kinds, flags)
-        set_timelimit($Tr, NO_TIMELIMIT)
-        if plan == C_NULL
-            error("FFTW could not create plan") # shouldn't normally happen
-        end
-        return Plan(plan, X)
-    end
-
-    # support r2r transforms of complex = transforms of real & imag parts
-    @eval function Plan_r2r(X::StridedArray{$Tc}, Y::StridedArray{$Tc},
-                            region, kinds,
-                            flags::Unsigned, timelimit::Real)
-        kinds = fix_kinds(region, kinds)
-        set_timelimit($Tr, timelimit)
-        dims, howmany = dims_howmany(X, Y, [size(X)...], region)
-        dims[2:3, 1:size(dims,2)] *= 2
-        howmany[2:3, 1:size(howmany,2)] *= 2
-        howmany = [howmany [2,1,1]] # append loop over real/imag parts
-        plan = ccall(($(string(fftw,"_plan_guru64_r2r")),$lib),
-                     Ptr{Void},
-                     (Int32, Ptr{Int}, Int32, Ptr{Int},
-                      Ptr{$Tc}, Ptr{$Tc}, Ptr{Int32}, UInt32),
-                     size(dims,2), dims, size(howmany,2), howmany,
-                     X, Y, kinds, flags)
-        set_timelimit($Tr, NO_TIMELIMIT)
-        if plan == C_NULL
-            error("FFTW could not create plan") # shouldn't normally happen
-        end
-        return Plan(plan, X)
-    end
-
-end
-
-# Convert arrays of numeric types to FFTW-supported packed complex-float types
-# (FIXME: is there a way to use the Julia promotion rules more cleverly here?)
-complexfloat{T<:fftwComplex}(X::StridedArray{T}) = X
-complexfloat{T<:fftwReal}(X::StridedArray{T}) = complex(X)
-complexfloat{T<:Real}(X::StridedArray{T}) = map(Complex128,X)
-complexfloat{T<:Complex}(X::StridedArray{T}) = map(Complex128,X)
-
-# In the Julia interface, a "plan" is just a function that executes
-# an efficient FFT of fixed size/strides/alignment.  For each FFT function
-# (fft, bfft, ifft, rfft, ...), we have at least two interfaces:
-#
-#   fft(x [, region]) - FFT of x, creating and destroying a plan,
-#                       optionally acting only on a subset of the dimensions
-#   p = plan_fft(x, [, region [, flags [, timelimit]]])
-#       -- returns a function p(x) that performs efficient FFTs
-#          of a given size (on given dimensions) with variants
-#          to specify the FFTW planner flags (default is ESTIMATE) and
-#          timelimit (default: NO_TIMELIMIT).
-#
-# along with in-place variants fft! and plan_fft! if feasible.
-
-for (f,direction) in ((:fft,:FORWARD), (:bfft,:BACKWARD))
-    f! = symbol(f,"!")
-    plan_f = symbol("plan_",f)
-    plan_f! = symbol("plan_",f,"!")
-    @eval begin
-        function $f{T<:fftwComplex}(X::StridedArray{T}, region)
-            Y = similar(X, T)
-            p = Plan(X, Y, region, $direction, ESTIMATE, NO_TIMELIMIT)
-            execute(T, p.plan)
-            # do not destroy_plan ... see above note on gc
-            return Y
-        end
-
-        # in-place version
-        function $f!{T<:fftwComplex}(X::StridedArray{T},region)
-            p = Plan(X, X, region, $direction, ESTIMATE, NO_TIMELIMIT)
-            execute(T, p.plan)
-            # do not destroy_plan ... see above note on gc
-            return X
-        end
-
-        function $f{T<:Number}(X::StridedArray{T}, region)
-            Y = complexfloat(X) # in-place transform
-            return $f!(Y, region)
-        end
-
-        function $plan_f{T<:fftwComplex}(X::StridedArray{T},
-                                                         region,
-                                                         flags::Unsigned,
-                                                         tlim::Real)
-            Y = similar(X, T)
-            p = Plan(X, Y, region, $direction, flags, tlim)
-            return Z::StridedArray{T} -> begin
-                assert_applicable(p, Z)
-                W = similar(Z, T)
-                execute(p.plan, Z, W)
-                return W
-            end
-        end
-
-        function $plan_f{T<:Number}(X::StridedArray{T}, region,
-                                    flags::Unsigned, tlim::Real)
-            Y = complexfloat(X) # in-place transform
-            p = Plan(Y, Y, region, $direction, flags, tlim)
-            return Z::StridedArray{T} -> begin
-                W = complexfloat(Z) # in-place transform
-                assert_applicable(p, W)
-                execute(p.plan, W, W)
-                return W
-            end
-        end
-
-        function $plan_f!{T<:fftwComplex}(X::StridedArray{T},
-                                                          region,
-                                                          flags::Unsigned,
-                                                          tlim::Real)
-            p = Plan(X, X, region, $direction, flags, tlim)
-            return Z::StridedArray{T} -> begin
-                assert_applicable(p, Z)
-                execute(p.plan, Z, Z)
-                return Z
-            end
-        end
-
-        $f(X::StridedArray) = $f(X, 1:ndims(X))
-        $f!(X::StridedArray) = $f!(X, 1:ndims(X))
-    end
-    for pf in (plan_f, plan_f!)
-        @eval begin
-            $pf{T<:Number}(X::StridedArray{T}, region, flags::Unsigned) =
-                $pf(X, region, flags, NO_TIMELIMIT)
-            $pf{T<:Number}(X::StridedArray{T}, region) =
-                $pf(X, region, ESTIMATE, NO_TIMELIMIT)
-            $pf{T<:Number}(X::StridedArray{T}) =
-                $pf(X, 1:ndims(X), ESTIMATE, NO_TIMELIMIT)
-        end
-    end
-end
-
-# Normalization for ifft
-
-normalization(X::StridedArray, region) = 1 / prod([size(X)...][[region...]])
-normalization(X::StridedArray) = 1 / length(X)
-
-# Normalized ifft inverse transforms:
-
-for (f,fb) in ((:ifft,:bfft), (:ifft!,:bfft!))
-    pf = symbol("plan_", f)
-    pfb = symbol("plan_", fb)
-    @eval begin
-        $f(X, region) = scale!($fb(X, region), normalization(X, region))
-        $f(X) = scale!($fb(X), normalization(X))
-
-        function $pf(X, region, flags, tlim)
-            nrm = normalization(X, region)
-            p = $pfb(X, region, flags, tlim)
-            return Z -> scale!(p(Z), nrm)
-        end
-        $pf(X, region, flags) = $pf(X, region, flags, NO_TIMELIMIT)
-        $pf(X, region) = $pf(X, region, ESTIMATE, NO_TIMELIMIT)
-        function $pf(X)
-            nrm = normalization(X)
-            p = $pfb(X)
-            return Z -> scale!(p(Z), nrm)
-        end
-    end
-end
-
-# rfft/brfft and planned variants.  No in-place version for now.
-
-for (Tr,Tc) in ((:Float32,:Complex64),(:Float64,:Complex128))
-    @eval begin
-        function rfft(X::StridedArray{$Tr}, region)
-            d1 = region[1]
-            osize = [size(X)...]
-            osize[d1] = osize[d1]>>1 + 1
-            Y = Array($Tc, osize...)
-            p = Plan(X, Y, region, ESTIMATE, NO_TIMELIMIT)
-            execute($Tr, p.plan)
-            # do not destroy_plan ... see above note on gc
-            return Y
-        end
-
-        function rfft{T<:Real}(X::StridedArray{T}, region)
-            Xr = float(X)
-            return rfft(Xr, region)
-        end
-
-        function plan_rfft(X::StridedArray{$Tr}, region,
-                           flags::Unsigned, tlim::Real)
-            d1 = region[1]
-            osize = [size(X)...]
-            osize[d1] = osize[d1]>>1 + 1
-            Y = Array($Tc, osize...)
-            p = Plan(X, Y, region, flags, tlim)
-            return Z::StridedArray{$Tr} -> begin
-                assert_applicable(p, Z)
-                W = Array($Tc, osize...)
-                execute(p.plan, Z, W)
-                return W
-            end
-        end
-
-        # FFTW currently doesn't support PRESERVE_INPUT for
-        # multidimensional out-of-place c2r transforms, so
-        # we have to handle 1d and >1d cases separately.  Ugh.
-
-        function brfft(X::StridedArray{$Tc}, d::Integer, region::Integer)
-            osize = [size(X)...]
-            @assert osize[region] == d>>1 + 1
-            osize[region] = d
-            Y = Array($Tr, osize...)
-            p = Plan(X, Y, region, ESTIMATE | PRESERVE_INPUT, NO_TIMELIMIT)
-            execute($Tr, p.plan)
-            # do not destroy_plan ... see above note on gc
-            return Y
-        end
-
-        # variant that destroys input X
-        function brfftd(X::StridedArray{$Tc}, d::Integer, region)
-            d1 = region[1]
-            osize = [size(X)...]
-            @assert osize[d1] == d>>1 + 1
-            osize[d1] = d
-            Y = Array($Tr, osize...)
-            p = Plan(X, Y, region, ESTIMATE, NO_TIMELIMIT)
-            execute($Tr, p.plan)
-            # do not destroy_plan ... see above note on gc
-            return Y
-        end
-
-        function brfft(X::StridedArray{$Tc}, d::Integer, region)
-            if length(region) == 1
-                return brfft(X, d, convert(Int, region[1]))
-            end
-            X = copy(X) # TODO: work in-place instead?
-            return brfftd(X, d, region)
-        end
-
-        function brfft{T<:Number}(X::StridedArray{T}, d::Integer, region)
-            Xc = complexfloat(X)
-            return brfftd(Xc, d, region)
-        end
-
-        function plan_brfft(X::StridedArray{$Tc}, d::Integer, region::Integer,
-                            flags::Unsigned, tlim::Real)
-            osize = [size(X)...]
-            @assert osize[region] == d>>1 + 1
-            osize[region] = d
-            Y = Array($Tr, osize...)
-            p = Plan(X, Y, region, flags | PRESERVE_INPUT, tlim)
-            return Z::StridedArray{$Tc} -> begin
-                assert_applicable(p, Z)
-                W = Array($Tr, osize...)
-                execute(p.plan, Z, W)
-                return W
-            end
-        end
-
-        function plan_brfft(X::StridedArray{$Tc}, d::Integer, region,
-                            flags::Unsigned, tlim::Real)
-            if length(region) == 1
-                return plan_brfft(X, d, convert(Int, region[1]), flags, tlim)
-            end
-            d1 = region[1]
-            osize = [size(X)...]
-            @assert osize[d1] == d>>1 + 1
-            osize[d1] = d
-            Y = Array($Tr, osize...)
-            X = copy(X)
-            p = Plan(X, Y, region, flags, tlim)
-            return Z::StridedArray{$Tc} -> begin
-                assert_applicable(p, Z)
-                Z = copy(Z)
-                W = Array($Tr, osize...)
-                execute(p.plan, Z, W)
-                return W
-            end
-        end
-
-        rfft(X::StridedArray) = rfft(X, 1:ndims(X))
-        brfft(X::StridedArray,d) = brfft(X, d, 1:ndims(X))
-
-        plan_rfft(X::StridedArray, region, flags) =
-            plan_rfft(X, region, flags, NO_TIMELIMIT)
-        plan_rfft(X::StridedArray, region) =
-            plan_rfft(X, region, ESTIMATE, NO_TIMELIMIT)
-        plan_rfft(X::StridedArray) =
-            plan_rfft(X, 1:ndims(X), ESTIMATE, NO_TIMELIMIT)
-
-        plan_brfft(X::StridedArray, d,  region, flags) =
-            plan_brfft(X, d, region, flags, NO_TIMELIMIT)
-        plan_brfft(X::StridedArray, d,  region) =
-            plan_brfft(X, d, region, ESTIMATE, NO_TIMELIMIT)
-        plan_brfft(X::StridedArray, d) =
-            plan_brfft(X, d, 1:ndims(X), ESTIMATE, NO_TIMELIMIT)
-    end
-end
-
-# Normalized rfft inverse transforms:
-
-function irfft(X, d, region)
-    Y = brfft(X, d, region)
-    return scale!(Y, normalization(Y, region))
-end
-
-function irfft(X, d)
-    Y = brfft(X, d)
-    return scale!(Y, normalization(Y))
-end
-
-function plan_irfft(X::StridedArray, d::Integer, region, flags, tlim)
-    p = plan_brfft(X, d, region, flags, tlim)
-    d1 = region[1]
-    osize = [size(X)...]
-    osize[d1] = d
-    nrm = 1 / prod(osize[[region...]])
-    return Z -> scale!(p(Z), nrm)
-end
-
-plan_irfft(X, d, region, flags) = plan_irfft(X, d, region, flags, NO_TIMELIMIT)
-plan_irfft(X, d, region) = plan_irfft(X, d, region, ESTIMATE, NO_TIMELIMIT)
-plan_irfft(X, d) = plan_irfft(X, d, 1:ndims(X), ESTIMATE, NO_TIMELIMIT)
-
-# A DFT is unambiguously defined as just the identity operation for scalars
-fft(x::Number) = x
-ifft(x::Number) = x
-bfft(x::Number) = x
-rfft(x::Real) = x
-irfft(x::Number, d::Integer) = d == 1 ? real(x) : throw(BoundsError())
-brfft(x::Number, d::Integer) = d == 1 ? real(x) : throw(BoundsError())
-fft(x::Number, dims) = length(dims) == 0 || dims[1] == 1 ? x : throw(BoundsError())
-ifft(x::Number, dims) = length(dims) == 0 || dims[1] == 1 ? x : throw(BoundsError())
-bfft(x::Number, dims) = length(dims) == 0 || dims[1] == 1 ? x : throw(BoundsError())
-fft(x::Number, dims) = length(dims) == 0 || dims[1] == 1 ? x : throw(BoundsError())
-rfft(x::Real, dims) = dims[1] == 1 ? x : throw(BoundsError())
-irfft(x::Number, d::Integer, dims) = d == 1 && dims[1] == 1 ? real(x) : throw(BoundsError())
-brfft(x::Number, d::Integer, dims) = d == 1 && dims[1] == 1 ? real(x) : throw(BoundsError())
-
-plan_fft(x::Number) = x -> x
-plan_ifft(x::Number) = x -> x
-plan_bfft(x::Number) = x -> x
-plan_rfft(x::Real) = x -> x
-plan_irfft(x::Number, d::Integer) = (irfft(x,d); x -> real(x))
-plan_brfft(x::Number, d::Integer) = (brfft(x,d); x -> real(x))
-
-plan_fft(x::Number, dims) = (fft(x,dims); x -> x)
-plan_ifft(x::Number, dims) = (ifft(x,dims); x -> x)
-plan_bfft(x::Number, dims) = (bfft(x,dims); x -> x)
-plan_rfft(x::Real, dims) = (rfft(x,dims); x -> x)
-plan_irfft(x::Number, d::Integer, dims) = (irfft(x,d,dims); x -> real(x))
-plan_brfft(x::Number, d::Integer, dims) = (brfft(x,d,dims); x -> real(x))
-
-plan_fft(x::Number, dims, flags) = plan_fft(x, dims)
-plan_ifft(x::Number, dims, flags) = plan_ifft(x, dims)
-plan_bfft(x::Number, dims, flags) = plan_bfft(x, dims)
-plan_rfft(x::Real, dims, flags) = plan_rfft(x, dims)
-plan_irfft(x::Number, d::Integer, dims, flags) = plan_irfft(x, d, dims)
-plan_brfft(x::Number, d::Integer, dims, flags) = plan_brfft(x, d, dims)
-
-plan_fft(x::Number, dims, flags, tlim) = plan_fft(x, dims)
-plan_ifft(x::Number, dims, flags, tlim) = plan_ifft(x, dims)
-plan_bfft(x::Number, dims, flags, tlim) = plan_bfft(x, dims)
-plan_rfft(x::Real, dims, flags, tlim) = plan_rfft(x, dims)
-plan_irfft(x::Number, d::Integer, dims, flags, tlim) = plan_irfft(x, d, dims)
-plan_brfft(x::Number, d::Integer, dims, flags, tlim) = plan_brfft(x, d, dims)
-
-# FFTW r2r transforms (low-level interface)
-
-function r2r{T<:fftwNumber}(X::StridedArray{T}, kinds, region)
-    Y = similar(X, T)
-    p = Plan_r2r(X, Y, region, kinds, ESTIMATE, NO_TIMELIMIT)
-    execute(T, p.plan)
-    # do not destroy_plan ... see above note on gc
-    return Y
-end
-
-function r2r!{T<:fftwNumber}(X::StridedArray{T}, kinds, region)
-    p = Plan_r2r(X, X, region, kinds, ESTIMATE, NO_TIMELIMIT)
-    execute(T, p.plan)
-    # do not destroy_plan ... see above note on gc
-    return X
-end
-
-r2r{T<:Real}(X::StridedArray{T}, kinds, region) = r2r!(float(X), kinds, region)
-r2r{T<:Complex}(X::StridedArray{T}, kinds, region) = r2r!(complexfloat(X), kinds, region)
-
-for f in (:r2r, :r2r!)
-    @eval $f(X, kinds) = $f(X, kinds, 1:ndims(X))
-end
-
-function plan_r2r{T<:fftwNumber}(
-    X::StridedArray{T}, kinds, region, flags::Unsigned, tlim::Real)
-    Y = similar(X, T)
-    p = Plan_r2r(X, Y, region, kinds, flags, tlim)
-    return Z::StridedArray{T} -> begin
-        assert_applicable(p, Z)
-        W = similar(Z, T)
-        execute_r2r(p.plan, Z, W)
-        return W
-    end
-end
-
-function plan_r2r{T<:Number}(X::StridedArray{T}, kinds, region,
-                             flags::Unsigned, tlim::Real)
-    Y = T<:Complex ? complexfloat(X) : float(X)
-    p = Plan_r2r(Y, Y, region, kinds, flags, tlim)
-    return Z::StridedArray{T} -> begin
-        W = T<:Complex ? complexfloat(Z) : float(Z)
-        execute_r2r(p.plan, W, W)
-        return W
-    end
-end
-
-function plan_r2r!{T<:fftwNumber}(
-    X::StridedArray{T}, kinds,region,flags::Unsigned, tlim::Real)
-    p = Plan_r2r(X, X, region, kinds, flags, tlim)
-    return Z::StridedArray{T} -> begin
-        assert_applicable(p, Z)
-        execute_r2r(p.plan, Z, Z)
-        return Z
-    end
-end
-
-for f in (:plan_r2r, :plan_r2r!)
-    @eval begin
-        $f(X, kinds, region, flags) = $f(X, kinds, region, flags, NO_TIMELIMIT)
-        $f(X, kinds, region) = $f(X, kinds, region, ESTIMATE, NO_TIMELIMIT)
-        $f(X, kinds) = $f(X, kinds, 1:ndims(X), ESTIMATE, NO_TIMELIMIT)
-    end
-end
-
-end # module
diff --git a/base/pointer.jl b/base/pointer.jl
index a026b8f09a30a..1956ef2263b66 100644
--- a/base/pointer.jl
+++ b/base/pointer.jl
@@ -50,6 +50,15 @@ unsafe_store!(p::Ptr{Any}, x::ANY, i::Integer) = pointerset(p, x, Int(i))
 unsafe_store!{T}(p::Ptr{T}, x, i::Integer) = pointerset(p, convert(T,x), Int(i))
 unsafe_store!{T}(p::Ptr{T}, x) = pointerset(p, convert(T,x), 1)
 
+# unsafe pointer to string conversions (don't make a copy, unlike bytestring)
+function pointer_to_string(p::Ptr{UInt8}, len::Integer, own::Bool=false)
+    a = ccall(:jl_ptr_to_array_1d, Vector{UInt8},
+              (Any, Ptr{UInt8}, Csize_t, Cint), Vector{UInt8}, p, len, own)
+    ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
+end
+pointer_to_string(p::Ptr{UInt8}, own::Bool=false) =
+    pointer_to_string(p, ccall(:strlen, Csize_t, (Ptr{UInt8},), p), own)
+
 # convert a raw Ptr to an object reference, and vice-versa
 unsafe_pointer_to_objref(x::Ptr) = ccall(:jl_value_ptr, Any, (Ptr{Void},), x)
 pointer_from_objref(x::ANY) = ccall(:jl_value_ptr, Ptr{Void}, (Any,), x)
diff --git a/base/sysimg.jl b/base/sysimg.jl
index afe363fab0248..2be7151477f74 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -263,20 +263,19 @@ include("statistics.jl")
 include("sparse.jl")
 importall .SparseMatrix
 
+# irrational mathematical constants
+include("irrationals.jl")
+
 # signal processing
-if USE_GPL_LIBS
-    include("fftw.jl")
-    include("dsp.jl")
-    importall .DSP
-end
+include("dft.jl")
+importall .DFT
+include("dsp.jl")
+importall .DSP
 
 # system information
 include("sysinfo.jl")
 import .Sys.CPU_CORES
 
-# irrational mathematical constants
-include("irrationals.jl")
-
 # Numerical integration
 include("quadgk.jl")
 importall .QuadGK
diff --git a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl
index 9b299136626ca..355ba24f2a23e 100644
--- a/base/unicode/utf8proc.jl
+++ b/base/unicode/utf8proc.jl
@@ -75,10 +75,7 @@ function utf8proc_map(s::ByteString, flags::Integer)
                    s, sizeof(s), p, flags)
     result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{UInt8},
                                          (Cssize_t,), result)))
-    a = ccall(:jl_ptr_to_array_1d, Vector{UInt8},
-              (Any, Ptr{UInt8}, Csize_t, Cint),
-              Vector{UInt8}, p[], result, true)
-    ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
+    pointer_to_string(p[], result, true)::ByteString
 end
 
 utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(bytestring(s), flags)
diff --git a/doc/stdlib/math.rst b/doc/stdlib/math.rst
index 804712834d291..01f4b7a8b0e6b 100644
--- a/doc/stdlib/math.rst
+++ b/doc/stdlib/math.rst
@@ -1352,7 +1352,7 @@ Statistics
 Signal Processing
 -----------------
 
-Fast Fourier transform (FFT) functions in Julia are largely
+Fast Fourier transform (FFT) functions in Julia are
 implemented by calling functions from `FFTW
 <http://www.fftw.org>`_. By default, Julia does not use multi-threaded
 FFTW. Higher performance may be obtained by experimenting with
@@ -1423,12 +1423,24 @@ multi-threading. Use `FFTW.set_num_threads(np)` to use `np` threads.
 
    Same as :func:`bfft`, but operates in-place on ``A``.
 
-.. function:: plan_fft(A [, dims [, flags [, timelimit]]])
+.. function:: plan_fft(A [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Pre-plan an optimized FFT along given dimensions (``dims``) of arrays
    matching the shape and type of ``A``.  (The first two arguments have
-   the same meaning as for :func:`fft`.)  Returns a function ``plan(A)``
-   that computes ``fft(A, dims)`` quickly.
+   the same meaning as for :func:`fft`.)  Returns an object ``P`` which
+   represents the linear operator computed by the FFT, and which contains
+   all of the information needed to compute ``fft(A, dims)`` quickly.
+
+   To apply ``P`` to an array ``A``, use ``P * A``; in general, the
+   syntax for applying plans is much like that of matrices.  (A plan
+   can only be applied to arrays of the same size as the ``A`` for
+   which the plan was created.)  You can also apply a plan with a
+   preallocated output array ``Â`` by calling ``A_mul_B!(Â, plan,
+   A)``.  You can compute the inverse-transform plan by ``inv(P)`` and
+   apply the inverse plan with ``P \ Â`` (the inverse plan is cached
+   and reused for subsequent calls to ``inv`` or ``\``), and apply the
+   inverse plan to a pre-allocated output array ``A`` with
+   ``A_ldiv_B!(A, P, Â)``.
 
    The ``flags`` argument is a bitwise-or of FFTW planner flags, defaulting
    to ``FFTW.ESTIMATE``.  e.g. passing ``FFTW.MEASURE`` or ``FFTW.PATIENT``
@@ -1445,25 +1457,25 @@ multi-threading. Use `FFTW.set_num_threads(np)` to use `np` threads.
    are similar but produce plans that perform the equivalent of
    the inverse transforms :func:`ifft` and so on.
 
-.. function:: plan_ifft(A [, dims [, flags [, timelimit]]])
+.. function:: plan_ifft(A [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Same as :func:`plan_fft`, but produces a plan that performs inverse transforms
    :func:`ifft`.
 
-.. function:: plan_bfft(A [, dims [, flags [, timelimit]]])
+.. function:: plan_bfft(A [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Same as :func:`plan_fft`, but produces a plan that performs an unnormalized
    backwards transform :func:`bfft`.
 
-.. function:: plan_fft!(A [, dims [, flags [, timelimit]]])
+.. function:: plan_fft!(A [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Same as :func:`plan_fft`, but operates in-place on ``A``.
 
-.. function:: plan_ifft!(A [, dims [, flags [, timelimit]]])
+.. function:: plan_ifft!(A [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Same as :func:`plan_ifft`, but operates in-place on ``A``.
 
-.. function:: plan_bfft!(A [, dims [, flags [, timelimit]]])
+.. function:: plan_bfft!(A [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Same as :func:`plan_bfft`, but operates in-place on ``A``.
 
@@ -1500,21 +1512,21 @@ multi-threading. Use `FFTW.set_num_threads(np)` to use `np` threads.
    of the sizes of the transformed dimensions (of the real output array)
    in order to obtain the inverse transform.
 
-.. function:: plan_rfft(A [, dims [, flags [, timelimit]]])
+.. function:: plan_rfft(A [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Pre-plan an optimized real-input FFT, similar to :func:`plan_fft`
    except for :func:`rfft` instead of :func:`fft`.  The first two
    arguments, and the size of the transformed result, are the same as
    for :func:`rfft`.
 
-.. function:: plan_brfft(A, d [, dims [, flags [, timelimit]]])
+.. function:: plan_brfft(A, d [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Pre-plan an optimized real-input unnormalized transform, similar to
    :func:`plan_rfft` except for :func:`brfft` instead of :func:`rfft`.
    The first two arguments and the size of the transformed result, are
    the same as for :func:`brfft`.
 
-.. function:: plan_irfft(A, d [, dims [, flags [, timelimit]]])
+.. function:: plan_irfft(A, d [, dims]; flags=FFTW.ESTIMATE;  timelimit=Inf)
 
    Pre-plan an optimized inverse real-input FFT, similar to :func:`plan_rfft`
    except for :func:`irfft` and :func:`brfft`, respectively.  The first
diff --git a/test/dsp.jl b/test/dsp.jl
index 173c3442c3f48..f1ecff3659761 100644
--- a/test/dsp.jl
+++ b/test/dsp.jl
@@ -71,24 +71,24 @@ if Base.fftw_vendor() != :mkl
     Xidct_2 = idct(true_Xdct_2,2)
     Xidct!_2 = copy(true_Xdct_2); idct!(Xidct!_2,2)
 
-    pXdct = plan_dct(X)(X)
-    pXdct! = float(X); plan_dct!(pXdct!)(pXdct!)
-    pXdct_1 = plan_dct(X,1)(X)
-    pXdct!_1 = float(X); plan_dct!(pXdct!_1,1)(pXdct!_1)
-    pXdct_2 = plan_dct(X,2)(X)
-    pXdct!_2 = float(X); plan_dct!(pXdct!_2,2)(pXdct!_2)
-
-    pXidct = plan_idct(true_Xdct)(true_Xdct)
-    pXidct! = copy(true_Xdct); plan_idct!(pXidct!)(pXidct!)
-    pXidct_1 = plan_idct(true_Xdct_1,1)(true_Xdct_1)
-    pXidct!_1 = copy(true_Xdct_1); plan_idct!(pXidct!_1,1)(pXidct!_1)
-    pXidct_2 = plan_idct(true_Xdct_2,2)(true_Xdct_2)
-    pXidct!_2 = copy(true_Xdct_2); plan_idct!(pXidct!_2,2)(pXidct!_2)
+    pXdct = plan_dct(X)*(X)
+    pXdct! = float(X); plan_dct!(pXdct!)*(pXdct!)
+    pXdct_1 = plan_dct(X,1)*(X)
+    pXdct!_1 = float(X); plan_dct!(pXdct!_1,1)*(pXdct!_1)
+    pXdct_2 = plan_dct(X,2)*(X)
+    pXdct!_2 = float(X); plan_dct!(pXdct!_2,2)*(pXdct!_2)
+
+    pXidct = plan_idct(true_Xdct)*(true_Xdct)
+    pXidct! = copy(true_Xdct); plan_idct!(pXidct!)*(pXidct!)
+    pXidct_1 = plan_idct(true_Xdct_1,1)*(true_Xdct_1)
+    pXidct!_1 = copy(true_Xdct_1); plan_idct!(pXidct!_1,1)*(pXidct!_1)
+    pXidct_2 = plan_idct(true_Xdct_2,2)*(true_Xdct_2)
+    pXidct!_2 = copy(true_Xdct_2); plan_idct!(pXidct!_2,2)*(pXidct!_2)
 
     sXdct = dct(sX)
-    psXdct = plan_dct(sX)(sX)
+    psXdct = plan_dct(sX)*(sX)
     sYdct! = copy(Y); sXdct! = slice(sYdct!,3:5,9:12); dct!(sXdct!)
-    psYdct! = copy(Y); psXdct! = slice(psYdct!,3:5,9:12); plan_dct!(psXdct!)(psXdct!)
+    psYdct! = copy(Y); psXdct! = slice(psYdct!,3:5,9:12); plan_dct!(psXdct!)*(psXdct!)
 
     for i = 1:length(X)
         @test_approx_eq Xdct[i] true_Xdct[i]
diff --git a/test/fft.jl b/test/fft.jl
index e401daf9f7326..acff80383f8b4 100644
--- a/test/fft.jl
+++ b/test/fft.jl
@@ -9,43 +9,6 @@ m4 = [16.    2     3    13;
     9     7     6    12;
     4    14    15     1]
 
-b = rand(17,14)
-b[3:6,9:12] = m4
-sm4 = slice(b,3:6,9:12)
-
-pm4 = plan_fft(m4,1)
-
-fft_m4 = fft(m4,1)
-fftd2_m4 = fft(m4,2)
-ifft_fft_m4 = ifft(fft(m4,1),1)
-fftn_m4 = fft(m4)
-ifftn_fftn_m4 = ifft(fft(m4))
-
-fft!_m4 = complex(m4); fft!(fft!_m4,1)
-fft!d2_m4 = complex(m4); fft!(fft!d2_m4,2)
-ifft!_fft_m4 = fft(m4,1); ifft!(ifft!_fft_m4,1)
-fft!n_m4 = complex(m4); fft!(fft!n_m4)
-ifft!n_fftn_m4 = fft(m4); ifft!(ifft!n_fftn_m4)
-
-pfft_m4 = plan_fft(m4,1)(m4)
-pfftd2_m4 = plan_fft(m4,2)(m4)
-pifft_fft_m4 = plan_ifft(fft_m4,1)(fft_m4)
-pfftn_m4 = plan_fft(m4)(m4)
-pifftn_fftn_m4 = plan_ifft(fftn_m4)(fftn_m4)
-
-pfft!_m4 = complex(m4); plan_fft!(pfft!_m4,1)(pfft!_m4)
-pfft!d2_m4 = complex(m4); plan_fft!(pfft!d2_m4,2)(pfft!d2_m4)
-pifft!_fft_m4 = fft(m4,1); plan_ifft!(pifft!_fft_m4,1)(pifft!_fft_m4)
-pfft!n_m4 = complex(m4); plan_fft!(pfft!n_m4)(pfft!n_m4)
-pifft!n_fftn_m4 = fft(m4); plan_ifft!(pifft!n_fftn_m4)(pifft!n_fftn_m4)
-
-sfftn_m4 = fft(sm4)
-psfftn_m4 = plan_fft(sm4)(sm4)
-sfft!n_b = map(Complex128,b)
-sfft!n_m4 = slice(sfft!n_b,3:6,9:12); fft!(sfft!n_m4)
-psfft!n_b = map(Complex128,b)
-psfft!n_m4 = slice(psfft!n_b,3:6,9:12); plan_fft!(psfft!n_m4)(psfft!n_m4)
-
 true_fft_m4 = [
     34.            34.            34.            34.;
      7. - 1.im  -5. + 3.im  -3. + 5.im   1. - 7.im;
@@ -64,153 +27,285 @@ true_fftd2_m4 = [
    34.    3 +  5im   -4    3 -  5im ;
    34.  -11 - 13im    4  -11 + 13im ]
 
-for i = 1:length(m4)
-    @test_approx_eq fft_m4[i] true_fft_m4[i]
-    @test_approx_eq fftd2_m4[i] true_fftd2_m4[i]
-    @test_approx_eq ifft_fft_m4[i] m4[i]
-    @test_approx_eq fftn_m4[i] true_fftn_m4[i]
-    @test_approx_eq ifftn_fftn_m4[i] m4[i]
-
-    @test_approx_eq fft!_m4[i] true_fft_m4[i]
-    @test_approx_eq fft!d2_m4[i] true_fftd2_m4[i]
-    @test_approx_eq ifft!_fft_m4[i] m4[i]
-    @test_approx_eq fft!n_m4[i] true_fftn_m4[i]
-    @test_approx_eq ifft!n_fftn_m4[i] m4[i]
-
-    @test_approx_eq pfft_m4[i] true_fft_m4[i]
-    @test_approx_eq pfftd2_m4[i] true_fftd2_m4[i]
-    @test_approx_eq pifft_fft_m4[i] m4[i]
-    @test_approx_eq pfftn_m4[i] true_fftn_m4[i]
-    @test_approx_eq pifftn_fftn_m4[i] m4[i]
-
-    @test_approx_eq pfft!_m4[i] true_fft_m4[i]
-    @test_approx_eq pfft!d2_m4[i] true_fftd2_m4[i]
-    @test_approx_eq pifft!_fft_m4[i] m4[i]
-    @test_approx_eq pfft!n_m4[i] true_fftn_m4[i]
-    @test_approx_eq pifft!n_fftn_m4[i] m4[i]
-
-    @test_approx_eq sfftn_m4[i] true_fftn_m4[i]
-    @test_approx_eq sfft!n_m4[i] true_fftn_m4[i]
-    @test_approx_eq psfftn_m4[i] true_fftn_m4[i]
-    @test_approx_eq psfft!n_m4[i] true_fftn_m4[i]
+b = rand(17,14)
+b[3:6,9:12] = m4
+sm4 = slice(b,3:6,9:12)
+
+m3d = map(Float32,reshape(1:5*3*2, 5, 3, 2))
+true_fftd3_m3d = Array(Float32, 5, 3, 2)
+true_fftd3_m3d[:,:,1] = 17:2:45
+true_fftd3_m3d[:,:,2] = -15
+
+# use invoke to force usage of CTPlan versions even if FFTW is present
+for A in (Array,SubArray)
+    for f in (:fft,:ifft,:plan_fft,:plan_ifft)
+        f_ = symbol(string(f, "_"))
+        @eval begin
+            $f_{T,N}(x::$A{T,N}) = invoke($f, Tuple{AbstractArray{T,N}}, x)
+            $f_{T,N,R}(x::$A{T,N},r::R) = invoke($f,Tuple{AbstractArray{T,N},R},x,r)
+        end
+    end
 end
 
-ifft!(sfft!n_m4)
-plan_ifft!(psfft!n_m4)(psfft!n_m4)
-@test norm(sfft!n_m4 - m4) < 1e-8
-@test norm(psfft!n_m4 - m4) < 1e-8
+for (f,fi,pf,pfi) in ((fft,ifft,plan_fft,plan_ifft),
+                      (fft_,ifft_,plan_fft_,plan_ifft_))
+    pm4 = pf(m4,1)
+
+    fft_m4 = f(m4,1)
+    fftd2_m4 = f(m4,2)
+    ifft_fft_m4 = fi(f(m4,1),1)
+    fftn_m4 = f(m4)
+    ifftn_fftn_m4 = fi(f(m4))
+
+    fft!_m4 = complex(m4); fft!(fft!_m4,1)
+    fft!d2_m4 = complex(m4); fft!(fft!d2_m4,2)
+    ifft!_fft_m4 = f(m4,1); ifft!(ifft!_fft_m4,1)
+    fft!n_m4 = complex(m4); fft!(fft!n_m4)
+    ifft!n_fftn_m4 = f(m4); ifft!(ifft!n_fftn_m4)
+
+    pfft_m4 = pf(m4,1)*m4
+    pfftd2_m4 = pf(m4,2)*m4
+    pifft_fft_m4 = pfi(fft_m4,1)*fft_m4
+    pfftn_m4 = pf(m4)*m4
+    pifftn_fftn_m4 = pfi(fftn_m4)*fftn_m4
+
+    pfft!_m4 = complex(m4); plan_fft!(pfft!_m4,1)*pfft!_m4
+    pfft!d2_m4 = complex(m4); plan_fft!(pfft!d2_m4,2)*pfft!d2_m4
+    pifft!_fft_m4 = f(m4,1); plan_ifft!(pifft!_fft_m4,1)*pifft!_fft_m4
+    pfft!n_m4 = complex(m4); plan_fft!(pfft!n_m4)*pfft!n_m4
+    pifft!n_fftn_m4 = f(m4); plan_ifft!(pifft!n_fftn_m4)*pifft!n_fftn_m4
+
+    sfftn_m4 = f(sm4)
+    psfftn_m4 = pf(sm4)*sm4
+    sfft!n_b = map(Complex128,b)
+    sfft!n_m4 = slice(sfft!n_b,3:6,9:12); fft!(sfft!n_m4)
+    psfft!n_b = map(Complex128,b)
+    psfft!n_m4 = slice(psfft!n_b,3:6,9:12); plan_fft!(psfft!n_m4)*psfft!n_m4
+
+    for i = 1:length(m4)
+        @test_approx_eq fft_m4[i] true_fft_m4[i]
+        @test_approx_eq fftd2_m4[i] true_fftd2_m4[i]
+        @test_approx_eq ifft_fft_m4[i] m4[i]
+        @test_approx_eq fftn_m4[i] true_fftn_m4[i]
+        @test_approx_eq ifftn_fftn_m4[i] m4[i]
+
+        @test_approx_eq fft!_m4[i] true_fft_m4[i]
+        @test_approx_eq fft!d2_m4[i] true_fftd2_m4[i]
+        @test_approx_eq ifft!_fft_m4[i] m4[i]
+        @test_approx_eq fft!n_m4[i] true_fftn_m4[i]
+        @test_approx_eq ifft!n_fftn_m4[i] m4[i]
+
+        @test_approx_eq pfft_m4[i] true_fft_m4[i]
+        @test_approx_eq pfftd2_m4[i] true_fftd2_m4[i]
+        @test_approx_eq pifft_fft_m4[i] m4[i]
+        @test_approx_eq pfftn_m4[i] true_fftn_m4[i]
+        @test_approx_eq pifftn_fftn_m4[i] m4[i]
+
+        @test_approx_eq pfft!_m4[i] true_fft_m4[i]
+        @test_approx_eq pfft!d2_m4[i] true_fftd2_m4[i]
+        @test_approx_eq pifft!_fft_m4[i] m4[i]
+        @test_approx_eq pfft!n_m4[i] true_fftn_m4[i]
+        @test_approx_eq pifft!n_fftn_m4[i] m4[i]
+
+        @test_approx_eq sfftn_m4[i] true_fftn_m4[i]
+        @test_approx_eq sfft!n_m4[i] true_fftn_m4[i]
+        @test_approx_eq psfftn_m4[i] true_fftn_m4[i]
+        @test_approx_eq psfft!n_m4[i] true_fftn_m4[i]
+    end
 
-# The following capabilities are FFTW only.
-# They are not available in MKL, and hence do not test them.
-if Base.fftw_vendor() != :mkl
+    ifft!(sfft!n_m4)
+    plan_ifft!(psfft!n_m4)*psfft!n_m4
+    @test norm(sfft!n_m4 - m4) < 1e-8
+    @test norm(psfft!n_m4 - m4) < 1e-8
 
-    m3d = map(Float32,reshape(1:5*3*2, 5, 3, 2))
-    ifft3_fft3_m3d = ifft(fft(m3d))
+    # The following capabilities are FFTW only.
+    # They are not available in MKL, and hence do not test them.
+    if Base.fftw_vendor() != :mkl
 
-    fftd3_m3d = fft(m3d,3)
-    ifftd3_fftd3_m3d = ifft(fftd3_m3d,3)
+        ifft3_fft3_m3d = fi(f(m3d))
 
-    fft!d3_m3d = complex(m3d); fft!(fft!d3_m3d,3)
-    ifft!d3_fftd3_m3d = copy(fft!d3_m3d); ifft!(ifft!d3_fftd3_m3d,3)
+        fftd3_m3d = f(m3d,3)
+        ifftd3_fftd3_m3d = fi(fftd3_m3d,3)
 
-    pfftd3_m3d = plan_fft(m3d,3)(m3d)
-    pifftd3_fftd3_m3d = plan_ifft(fftd3_m3d,3)(fftd3_m3d)
+        fft!d3_m3d = complex(m3d); fft!(fft!d3_m3d,3)
+        ifft!d3_fftd3_m3d = copy(fft!d3_m3d); ifft!(ifft!d3_fftd3_m3d,3)
 
-    pfft!d3_m3d = complex(m3d); plan_fft!(pfft!d3_m3d,3)(pfft!d3_m3d)
-    pifft!d3_fftd3_m3d = copy(fft!d3_m3d); plan_ifft!(pifft!d3_fftd3_m3d,3)(pifft!d3_fftd3_m3d)
+        pfftd3_m3d = pf(m3d,3)*m3d
+        pifftd3_fftd3_m3d = pfi(fftd3_m3d,3)*fftd3_m3d
 
-    @test isa(fftd3_m3d, Array{Complex64,3})
-    @test isa(ifftd3_fftd3_m3d, Array{Complex64,3})
-    @test isa(fft!d3_m3d, Array{Complex64,3})
-    @test isa(ifft!d3_fftd3_m3d, Array{Complex64,3})
-    @test isa(pfftd3_m3d, Array{Complex64,3})
-    @test isa(pifftd3_fftd3_m3d, Array{Complex64,3})
-    @test isa(pfft!d3_m3d, Array{Complex64,3})
-    @test isa(pifft!d3_fftd3_m3d, Array{Complex64,3})
+        pfft!d3_m3d = complex(m3d); plan_fft!(pfft!d3_m3d,3)*pfft!d3_m3d
+        pifft!d3_fftd3_m3d = copy(fft!d3_m3d); plan_ifft!(pifft!d3_fftd3_m3d,3)*pifft!d3_fftd3_m3d
 
-    true_fftd3_m3d = Array(Float32, 5, 3, 2)
-    true_fftd3_m3d[:,:,1] = 17:2:45
-    true_fftd3_m3d[:,:,2] = -15
+        @test isa(fftd3_m3d, Array{Complex64,3})
+        @test isa(ifftd3_fftd3_m3d, Array{Complex64,3})
+        @test isa(fft!d3_m3d, Array{Complex64,3})
+        @test isa(ifft!d3_fftd3_m3d, Array{Complex64,3})
+        @test isa(pfftd3_m3d, Array{Complex64,3})
+        @test isa(pifftd3_fftd3_m3d, Array{Complex64,3})
+        @test isa(pfft!d3_m3d, Array{Complex64,3})
+        @test isa(pifft!d3_fftd3_m3d, Array{Complex64,3})
 
-    for i = 1:length(m3d)
-        @test_approx_eq fftd3_m3d[i] true_fftd3_m3d[i]
-        @test_approx_eq ifftd3_fftd3_m3d[i] m3d[i]
-        @test_approx_eq ifft3_fft3_m3d[i] m3d[i]
+        for i = 1:length(m3d)
+            @test_approx_eq fftd3_m3d[i] true_fftd3_m3d[i]
+            @test_approx_eq ifftd3_fftd3_m3d[i] m3d[i]
+            @test_approx_eq ifft3_fft3_m3d[i] m3d[i]
 
-        @test_approx_eq fft!d3_m3d[i] true_fftd3_m3d[i]
-        @test_approx_eq ifft!d3_fftd3_m3d[i] m3d[i]
+            @test_approx_eq fft!d3_m3d[i] true_fftd3_m3d[i]
+            @test_approx_eq ifft!d3_fftd3_m3d[i] m3d[i]
 
-        @test_approx_eq pfftd3_m3d[i] true_fftd3_m3d[i]
-        @test_approx_eq pifftd3_fftd3_m3d[i] m3d[i]
-        @test_approx_eq pfft!d3_m3d[i] true_fftd3_m3d[i]
+            @test_approx_eq pfftd3_m3d[i] true_fftd3_m3d[i]
+            @test_approx_eq pifftd3_fftd3_m3d[i] m3d[i]
+            @test_approx_eq pfft!d3_m3d[i] true_fftd3_m3d[i]
         @test_approx_eq pifft!d3_fftd3_m3d[i] m3d[i]
-    end
+        end
 
-end  # if fftw_vendor() != :mkl ...
+    end  # if fftw_vendor() != :mkl ...
 
-# rfft/rfftn
+    # rfft/rfftn
 
-rfft_m4 = rfft(m4,1)
-rfftd2_m4 = rfft(m4,2)
-rfftn_m4 = rfft(m4)
+    rfft_m4 = rfft(m4,1)
+    rfftd2_m4 = rfft(m4,2)
+    rfftn_m4 = rfft(m4)
 
-prfft_m4 = plan_rfft(m4,1)(m4)
-prfftd2_m4 = plan_rfft(m4,2)(m4)
-prfftn_m4 = plan_rfft(m4)(m4)
+    prfft_m4 = plan_rfft(m4,1)*m4
+    prfftd2_m4 = plan_rfft(m4,2)*m4
+    prfftn_m4 = plan_rfft(m4)*m4
 
-srfftn_m4 = rfft(sm4)
-psrfftn_m4 = plan_rfft(sm4)(sm4)
+    srfftn_m4 = rfft(sm4)
+    psrfftn_m4 = plan_rfft(sm4)*sm4
 
-for i = 1:3, j = 1:4
-    @test_approx_eq rfft_m4[i,j] true_fft_m4[i,j]
-    @test_approx_eq rfftd2_m4[j,i] true_fftd2_m4[j,i]
-    @test_approx_eq rfftn_m4[i,j] true_fftn_m4[i,j]
+    for i = 1:3, j = 1:4
+        @test_approx_eq rfft_m4[i,j] true_fft_m4[i,j]
+        @test_approx_eq rfftd2_m4[j,i] true_fftd2_m4[j,i]
+        @test_approx_eq rfftn_m4[i,j] true_fftn_m4[i,j]
 
-    @test_approx_eq prfft_m4[i,j] true_fft_m4[i,j]
-    @test_approx_eq prfftd2_m4[j,i] true_fftd2_m4[j,i]
-    @test_approx_eq prfftn_m4[i,j] true_fftn_m4[i,j]
+        @test_approx_eq prfft_m4[i,j] true_fft_m4[i,j]
+        @test_approx_eq prfftd2_m4[j,i] true_fftd2_m4[j,i]
+        @test_approx_eq prfftn_m4[i,j] true_fftn_m4[i,j]
 
-    @test_approx_eq srfftn_m4[i,j] true_fftn_m4[i,j]
-    @test_approx_eq psrfftn_m4[i,j] true_fftn_m4[i,j]
-end
+        @test_approx_eq srfftn_m4[i,j] true_fftn_m4[i,j]
+        @test_approx_eq psrfftn_m4[i,j] true_fftn_m4[i,j]
+    end
+
+    irfft_rfft_m4 = irfft(rfft_m4,size(m4,1),1)
+    irfft_rfftd2_m4 = irfft(rfftd2_m4,size(m4,2),2)
+    irfftn_rfftn_m4 = irfft(rfftn_m4,size(m4,1))
 
-irfft_rfft_m4 = irfft(rfft_m4,size(m4,1),1)
-irfft_rfftd2_m4 = irfft(rfftd2_m4,size(m4,2),2)
-irfftn_rfftn_m4 = irfft(rfftn_m4,size(m4,1))
+    pirfft_rfft_m4 = plan_irfft(rfft_m4,size(m4,1),1)*rfft_m4
+    pirfft_rfftd2_m4 = plan_irfft(rfftd2_m4,size(m4,2),2)*rfftd2_m4
+    pirfftn_rfftn_m4 = plan_irfft(rfftn_m4,size(m4,1))*rfftn_m4
 
-pirfft_rfft_m4 = plan_irfft(rfft_m4,size(m4,1),1)(rfft_m4)
-pirfft_rfftd2_m4 = plan_irfft(rfftd2_m4,size(m4,2),2)(rfftd2_m4)
-pirfftn_rfftn_m4 = plan_irfft(rfftn_m4,size(m4,1))(rfftn_m4)
+    for i = 1:length(m4)
+        @test_approx_eq irfft_rfft_m4[i] m4[i]
+        @test_approx_eq irfft_rfftd2_m4[i] m4[i]
+        @test_approx_eq irfftn_rfftn_m4[i] m4[i]
 
-for i = 1:length(m4)
-    @test_approx_eq irfft_rfft_m4[i] m4[i]
-    @test_approx_eq irfft_rfftd2_m4[i] m4[i]
-    @test_approx_eq irfftn_rfftn_m4[i] m4[i]
+        @test_approx_eq pirfft_rfft_m4[i] m4[i]
+        @test_approx_eq pirfft_rfftd2_m4[i] m4[i]
+        @test_approx_eq pirfftn_rfftn_m4[i] m4[i]
+    end
 
-    @test_approx_eq pirfft_rfft_m4[i] m4[i]
-    @test_approx_eq pirfft_rfftd2_m4[i] m4[i]
-    @test_approx_eq pirfftn_rfftn_m4[i] m4[i]
+    if Base.fftw_vendor() != :mkl
+
+        rfftn_m3d = rfft(m3d)
+        rfftd3_m3d = rfft(m3d,3)
+        @test size(rfftd3_m3d) == size(fftd3_m3d)
+        irfft_rfftd3_m3d = irfft(rfftd3_m3d,size(m3d,3),3)
+        irfftn_rfftn_m3d = irfft(rfftn_m3d,size(m3d,1))
+        for i = 1:length(m3d)
+            @test_approx_eq rfftd3_m3d[i] true_fftd3_m3d[i]
+            @test_approx_eq irfft_rfftd3_m3d[i] m3d[i]
+            @test_approx_eq irfftn_rfftn_m3d[i] m3d[i]
+        end
+
+        fftn_m3d = fft(m3d)
+        @test size(fftn_m3d) == (5,3,2)
+        rfftn_m3d = rfft(m3d)
+        @test size(rfftn_m3d) == (3,3,2)
+        for i = 1:3, j = 1:3, k = 1:2
+            @test_approx_eq rfftn_m3d[i,j,k] fftn_m3d[i,j,k]
+        end
+
+    end # !mkl
 end
 
-if Base.fftw_vendor() != :mkl
-
-    rfftn_m3d = rfft(m3d)
-    rfftd3_m3d = rfft(m3d,3)
-    @test size(rfftd3_m3d) == size(fftd3_m3d)
-    irfft_rfftd3_m3d = irfft(rfftd3_m3d,size(m3d,3),3)
-    irfftn_rfftn_m3d = irfft(rfftn_m3d,size(m3d,1))
-    for i = 1:length(m3d)
-        @test_approx_eq rfftd3_m3d[i] true_fftd3_m3d[i]
-        @test_approx_eq irfft_rfftd3_m3d[i] m3d[i]
-        @test_approx_eq irfftn_rfftn_m3d[i] m3d[i]
+# FFT self-test algorithm (for unscaled 1d forward FFTs):
+#   Funda Ergün, "Testing multivariate linear functions: Overcoming
+#   the generator bottleneck," Proc. 27th ACM Symposium on the Theory
+#   of Computing, pp. 407-416 (1995).
+# Check linearity, impulse-response, and time-shift properties.
+function fft_test{T<:Complex}(p::Base.DFT.Plan{T}, ntrials=4,
+                              tol=1e5 * eps(real(T)))
+    ndims(p) == 1 || throw(ArgumentError("not a 1d FFT"))
+    n = length(p)
+    twopi_i = (-2 * convert(real(T), π)/n * (0:n-1)) * im
+    for trial = 1:ntrials
+        # linearity:
+        x = rand(T, n)
+        y = rand(T, n)
+        α = rand(T)
+        β = rand(T)
+        X = p * (α*x + β*y)
+        err = norm(α * (p*x) + β * (p*y) - X, Inf) / norm(X, Inf)
+        err <= tol || error("linearity error $err in $p")
+
+        # impulse-response:
+        z = zeros(T, n)
+        i = rand(0:n-1)
+        z[i+1] = 1
+        X = exp(twopi_i*i)
+        err = norm(p*z - X, Inf) / norm(X, Inf)
+        err <= tol || error("impulse-response error $err in $p")
+
+        # time-shift:
+        if n > 1
+            s = rand(1:n-1)
+            X = (p*x).*exp(twopi_i*s)
+            err = norm(p*circshift(x,s) - X, Inf) / norm(X, Inf)
+            err <= tol || error("time-shift error $err in $p")
+        end
     end
+end
 
-    fftn_m3d = fft(m3d)
-    @test size(fftn_m3d) == (5,3,2)
-    rfftn_m3d = rfft(m3d)
-    @test size(rfftn_m3d) == (3,3,2)
-    for i = 1:3, j = 1:3, k = 1:2
-        @test_approx_eq rfftn_m3d[i,j,k] fftn_m3d[i,j,k]
+for T in (Complex64, Complex128)
+    for n in [1:100; 121; 143; 1000; 1024; 1031; 2000; 2048]
+        x = zeros(T, n)
+        fft_test(plan_fft(x))
+        fft_test(plan_fft_(x))
     end
+end
 
-end # if fftw_vendor() != :mkl ...
+# test inversion, scaling, and pre-allocated variants
+for T in (Complex64, Complex128)
+    for x in (T[1:100;], reshape(T[1:200;], 20,10))
+        y = similar(x)
+        for planner in (plan_fft, plan_fft_, plan_ifft, plan_ifft_)
+            p = planner(x)
+            pi = inv(p)
+            p3 = 3*p
+            p3i = inv(p3)
+            @test eltype(p) == eltype(pi) == eltype(p3) == eltype(p3i) == T
+            @test vecnorm(x - p3i * (p * 3x)) < eps(real(T)) * 10000
+            @test vecnorm(3x - pi * (p3 * x)) < eps(real(T)) * 10000
+            A_mul_B!(y, p, x)
+            @test y == p * x
+            A_ldiv_B!(y, p, x)
+            @test y == p \ x
+        end
+    end
+end
+
+# issue #9772
+for x in (randn(10),randn(10,12))
+    z = complex(x)
+    y = rfft(x)
+    @inferred rfft(x)
+    @inferred brfft(x,18)
+    @inferred brfft(y,10)
+    for f in (fft,plan_fft,bfft,plan_bfft,fft_)
+        @inferred f(x)
+        @inferred f(z)
+    end
+    # note: inference doesn't work for plan_fft_ since the
+    #       algorithm steps are included in the CTPlan type
+end
diff --git a/test/random.jl b/test/random.jl
index 19e9c90858d29..7142e5bf86c65 100644
--- a/test/random.jl
+++ b/test/random.jl
@@ -115,13 +115,13 @@ emantissa           = Int64(2)^52
 ziggurat_exp_r      = parse(BigFloat,"7.69711747013104971404462804811408952334296818528283253278834867283241051210533")
 exp_section_area    = (ziggurat_exp_r + 1)*exp(-ziggurat_exp_r)
 
-const ki = Array(UInt64, ziggurat_table_size)
-const wi = Array(Float64, ziggurat_table_size)
-const fi = Array(Float64, ziggurat_table_size)
+ki = Array(UInt64, ziggurat_table_size)
+wi = Array(Float64, ziggurat_table_size)
+fi = Array(Float64, ziggurat_table_size)
 # Tables for exponential variates
-const ke = Array(UInt64, ziggurat_table_size)
-const we = Array(Float64, ziggurat_table_size)
-const fe = Array(Float64, ziggurat_table_size)
+ke = Array(UInt64, ziggurat_table_size)
+we = Array(Float64, ziggurat_table_size)
+fe = Array(Float64, ziggurat_table_size)
 function randmtzig_fill_ziggurat_tables() # Operates on the global arrays
     wib = big(wi)
     fib = big(fi)