http://en.wikipedia.org/wiki/In-place_matrix_transposition
Best!
FFTW supports in-place transposition (of any datatype whose size is an integer multiplier of 4 bytes) for square and non-square matrices, but I don't think this is exposed right now.
If you look in base/fftw.jl, it would be easy to make a variant of the transpose(X) function that operates in-place. Just rename it to transpose!(X), set P=X in the body, and remove the PRESERVE_INPUT flag.
Note also that you should call "return reshape(P, n2, n1)" instead of "return P". This looks like a bug in the transpose routine even for the out-of-place case, at first glance.
function transpose{T<:Union(Float64,Float32,Complex128,Complex64)}(A::Matrix{T})
if length(A) > 50000
return FFTW.transpose(reshape(A, size(A, 2), size(A, 1)))
else
return [ A[j,i] for i=1:size(A,2), j=1:size(A,1) ]
end
end
transpose(x::StridedVector) = [ x[j] for i=1, j=1:size(x,1) ]
transpose(x::StridedMatrix) = [ x[j,i] for i=1:size(x,2), j=1:size(x,1) ]
julia> import Base.transpose
julia> for (libname, fname, elty) in ((:libfftw ,"fftw_plan_guru64_r2r",:Float64),
(:libfftwf,"fftwf_plan_guru64_r2r",:Float32))
@eval begin
function transpose(X::Ptr{$elty}, Y::Ptr{$elty},
n1::Integer, n2::Integer, n::Int,
s1::Integer, s2::Integer)
plan = ccall(($fname,$libname), Ptr{Void},
(Int32, Ptr{Int}, Int32, Ptr{Int}, Ptr{$elty}, Ptr{$elty}, Ptr{Int32}, Uint32),
0, C_NULL, 3, Int[n1,s1*n,n2*n, n2,s2*n,n, n,1,1],
X, Y, C_NULL, ESTIMATE | PRESERVE_INPUT)
if plan == C_NULL
error("FFTW could not create plan") # shouldn't normally happen
end
execute($elty, plan)
destroy_plan($elty, plan)
return Y
end
end
end
julia> function transpose{T}(A::StridedMatrix{T})
(n1, n2) = size(A)
# sizeof(T) does not work because A elements may be jl_value_t*
# FIXME: there should be a way to get elsize as a compile-time constant
elsize = div(sizeof(A), n1*n2)
if n1*n2 > 50000 && rem(elsize, 4) == 0
(s1, s2) = strides(A)
P = Array(T, n2, n1)
if rem(elsize, 8) == 0
FFTW.transpose(convert(Ptr{Float64}, A),
convert(Ptr{Float64}, P),
n1, n2, div(elsize, 8), s1, s2)
else
FFTW.transpose(convert(Ptr{Float32}, A),
convert(Ptr{Float32}, P),
n1, n2, div(elsize, 4), s1, s2)
end
return P
else
return [ A[j,i] for i=1:n2, j=1:n1 ]
end
end
Can be out-of-place a little more slower on your branch than in master?
I did in my master: