Hi All, I am new to Julia and really struggling to understand what is making my code which convolutes a signal with a Gaussian or Lorentzian so slow. I have the equivilent MATLAB code that runs in 4 seconds when vectorised properly. In Julia, it takes:
368.047741 seconds (19.37 G allocations: 288.664 GiB, 14.30% gc time, 0.03% compilation time)
This is regardless of me using a triple nested for loop to calculate the sum, a double nested loop and the inbuilt sum() function, or replacing the nested loops with a multiplication of the row of every signal with the circulant kernel matrix - either using * or mul!() are both slow. It is probably worth mentioning that the signal array is size (3801, 670) after padding the orginal signal and the kernel is size (3801, 3801).
The code is as follows, please feel free to tear into it…
using MAT
using SpecialMatrices
wd = "/path/to/files"
inputfile = matopen(wd*"/AllTraj_Signal.mat")
signal = read(inputfile, "pdW")
close(inputfile)
replace!(signal, NaN=>0)
signal = signal[2:671, 1:2001]
signal[:, 1] = signal[:, 2]
inputfile = matopen(wd*"/Vectors.mat")
qAng = read(inputfile, "qAng")
tvec = read(inputfile, "tt")
struct Gaussian
    Fx:: Array{Float64}
    sigma:: Float64
    height:: Float64
    fwhm:: Float64
    tconv:: StepRangeLen{Float64}
    ntc:: Int64
    duration:: Float64
    dt:: Float64
    tmin :: Float64
    tmax:: Float64
    nt:: Int64
    function Gaussian(fwhm:: Float64, x:: StepRangeLen{Float64}, x0:: Float64, tmin:: Float64, tmax::Float64, nt:: Int64)
        duration = fwhm*3
        sigma = fwhm/2sqrt(2log(2))
        height = 1/sigma*sqrt(2pi)
        Fx = height .* exp.( (-1 .* (x .- x0).^2)./ (2*sigma^2))
        Fx = Circulant(Fx)
        tconv = x; ntc = length(x)
        dt = tconv[2]-tconv[1]
        nt = nt
        new(Fx, sigma, height, fwhm, tconv, ntc, duration, dt, tmin, tmax, nt)
    end
end
struct Lorentzian
    Fx:: Array{Float64}
    gamma:: Float64
    fwhm:: Float64
    tconv:: StepRangeLen{Float64}
    ntc:: Int64
    duration:: Float64
    dt:: Float64
    tmin :: Float64
    tmax:: Float64
    nt:: Int64
    function Lorentzian(fwhm:: Float64, x:: StepRangeLen{Float64}, x0:: Float64, tmin::Float64, tmax:: Float64, nt:: Int64)
        gamma = fwhm/2
        duration = fwhm*3
        Fx = 1 ./ ((pi * gamma) .* (1 .+ ((x .- x0)./ gamma).^2 ))
        Fx = Circulant(Fx)
        tconv = x; ntc = length(x)
        dt = tconv[2]-tconv[1]
        nt = nt
        new(Fx, gamma, fwhm, tconv, ntc, duration, dt, tmin, tmax, nt)
    end
end
function generate_kernel(fwhm:: Float64, time:: StepRangeLen{Float64}, Flag:: Int)
    dt:: Float64 = sum(diff(time, dims=1))/(length(time)-1)
    nt:: Int64 = length(time)
    duration:: Float64 = fwhm*3 # must pad three time FHWM to deal with edges
    if dt != diff(time, dims=1)[1] ; error("Non-linear time range."); end
    tmin:: Float64 = minimum(time); tmax:: Float64 = maximum(time)
    tconv_min:: Float64 = tmin - duration ; tconv_max:: Float64 = tmax + duration
    tconv = tconv_min:dt:tconv_max # extended convoluted time vector
    if Flag == 0
        K = Gaussian(fwhm, tconv, tconv[1], tmin, tmax, nt)
    elseif Flag == 1
        K = Lorentzian(fwhm, tconv, tconv[1], tmin, tmax, nt)
    else
        error("Only Gaussian and Lorentzian functions currently available.")
    end
    return K
    end
function convolve(S:: Array{Float64}, K:: T) where {T<:Union{Gaussian, Lorentzian}}
    nr:: Int64, nc:: Int64 = size(S)
    if nr != K.nt && nc != K.nt; error("Inconsistent dimensions."); end
    if nc != K.nt; S = copy(transpose(S)); nr, nc = nc, nr; end # column always time axis
    padend:: Int64 = (K.duration/K.dt) # index at which 0 padding ends and signal starts
    padstart:: Int64 = (padend + K.tmax / K.dt) # index where signal ends and padding starts
    println("Convolving signal with a $(typeof(K)) function with FWHM: $(K.fwhm) fs.")
    sC = zeros(Float64, nr, K.ntc)
    sC[1:nr, 1:padend] .= S[1:nr, 1] # extended and pad signal
    sC[1:nr, padend:padstart] .= S
    sC[1:nr, padstart:end] .= S[1:nr, end]
    conv = zeros(Float64, nr, K.ntc)
    for t in 1:K.ntc  # K.ntc = 3801
        for q in 1:nr  # nr = 670
            #conv[q, t] = sum(collect(sC[q, 1:K.ntc]) .* collect(K.Fx[t, :]))*K.dt  # usiing sum function method
            for j in 1:K.ntc
                conv[q, t] += sC[q, j] * K.Fx[t, j] * K.dt  # convolute with kernel
            end
        end
    end
    return conv
end
fwhm = 100.0
tvec = 0:0.5:1000
kernel = generate_kernel(fwhm, tvec, 0)
conv = convolve(signal, kernel)
I have tried to define all my types and split up the generation of the kernel into two types and keep the actual convolution seperate from this, but it has not helped speed wise. Any tips or insights would be greatly appreciated! 
