Hi All, I am new to Julia and really struggling to understand what is making my code which convolutes a signal with a Gaussian or Lorentzian so slow. I have the equivilent MATLAB code that runs in 4 seconds when vectorised properly. In Julia, it takes:

```
368.047741 seconds (19.37 G allocations: 288.664 GiB, 14.30% gc time, 0.03% compilation time)
```

This is regardless of me using a triple nested for loop to calculate the sum, a double nested loop and the inbuilt `sum()`

function, or replacing the nested loops with a multiplication of the row of every signal with the circulant kernel matrix - either using `*`

or `mul!()`

are both slow. It is probably worth mentioning that the signal array is size `(3801, 670)`

after padding the orginal signal and the kernel is size `(3801, 3801)`

.

The code is as follows, please feel free to tear into itâ€¦

```
using MAT
using SpecialMatrices
wd = "/path/to/files"
inputfile = matopen(wd*"/AllTraj_Signal.mat")
signal = read(inputfile, "pdW")
close(inputfile)
replace!(signal, NaN=>0)
signal = signal[2:671, 1:2001]
signal[:, 1] = signal[:, 2]
inputfile = matopen(wd*"/Vectors.mat")
qAng = read(inputfile, "qAng")
tvec = read(inputfile, "tt")
struct Gaussian
Fx:: Array{Float64}
sigma:: Float64
height:: Float64
fwhm:: Float64
tconv:: StepRangeLen{Float64}
ntc:: Int64
duration:: Float64
dt:: Float64
tmin :: Float64
tmax:: Float64
nt:: Int64
function Gaussian(fwhm:: Float64, x:: StepRangeLen{Float64}, x0:: Float64, tmin:: Float64, tmax::Float64, nt:: Int64)
duration = fwhm*3
sigma = fwhm/2sqrt(2log(2))
height = 1/sigma*sqrt(2pi)
Fx = height .* exp.( (-1 .* (x .- x0).^2)./ (2*sigma^2))
Fx = Circulant(Fx)
tconv = x; ntc = length(x)
dt = tconv[2]-tconv[1]
nt = nt
new(Fx, sigma, height, fwhm, tconv, ntc, duration, dt, tmin, tmax, nt)
end
end
struct Lorentzian
Fx:: Array{Float64}
gamma:: Float64
fwhm:: Float64
tconv:: StepRangeLen{Float64}
ntc:: Int64
duration:: Float64
dt:: Float64
tmin :: Float64
tmax:: Float64
nt:: Int64
function Lorentzian(fwhm:: Float64, x:: StepRangeLen{Float64}, x0:: Float64, tmin::Float64, tmax:: Float64, nt:: Int64)
gamma = fwhm/2
duration = fwhm*3
Fx = 1 ./ ((pi * gamma) .* (1 .+ ((x .- x0)./ gamma).^2 ))
Fx = Circulant(Fx)
tconv = x; ntc = length(x)
dt = tconv[2]-tconv[1]
nt = nt
new(Fx, gamma, fwhm, tconv, ntc, duration, dt, tmin, tmax, nt)
end
end
function generate_kernel(fwhm:: Float64, time:: StepRangeLen{Float64}, Flag:: Int)
dt:: Float64 = sum(diff(time, dims=1))/(length(time)-1)
nt:: Int64 = length(time)
duration:: Float64 = fwhm*3 # must pad three time FHWM to deal with edges
if dt != diff(time, dims=1)[1] ; error("Non-linear time range."); end
tmin:: Float64 = minimum(time); tmax:: Float64 = maximum(time)
tconv_min:: Float64 = tmin - duration ; tconv_max:: Float64 = tmax + duration
tconv = tconv_min:dt:tconv_max # extended convoluted time vector
if Flag == 0
K = Gaussian(fwhm, tconv, tconv[1], tmin, tmax, nt)
elseif Flag == 1
K = Lorentzian(fwhm, tconv, tconv[1], tmin, tmax, nt)
else
error("Only Gaussian and Lorentzian functions currently available.")
end
return K
end
function convolve(S:: Array{Float64}, K:: T) where {T<:Union{Gaussian, Lorentzian}}
nr:: Int64, nc:: Int64 = size(S)
if nr != K.nt && nc != K.nt; error("Inconsistent dimensions."); end
if nc != K.nt; S = copy(transpose(S)); nr, nc = nc, nr; end # column always time axis
padend:: Int64 = (K.duration/K.dt) # index at which 0 padding ends and signal starts
padstart:: Int64 = (padend + K.tmax / K.dt) # index where signal ends and padding starts
println("Convolving signal with a $(typeof(K)) function with FWHM: $(K.fwhm) fs.")
sC = zeros(Float64, nr, K.ntc)
sC[1:nr, 1:padend] .= S[1:nr, 1] # extended and pad signal
sC[1:nr, padend:padstart] .= S
sC[1:nr, padstart:end] .= S[1:nr, end]
conv = zeros(Float64, nr, K.ntc)
for t in 1:K.ntc # K.ntc = 3801
for q in 1:nr # nr = 670
#conv[q, t] = sum(collect(sC[q, 1:K.ntc]) .* collect(K.Fx[t, :]))*K.dt # usiing sum function method
for j in 1:K.ntc
conv[q, t] += sC[q, j] * K.Fx[t, j] * K.dt # convolute with kernel
end
end
end
return conv
end
fwhm = 100.0
tvec = 0:0.5:1000
kernel = generate_kernel(fwhm, tvec, 0)
conv = convolve(signal, kernel)
```

I have tried to define all my types and split up the generation of the kernel into two types and keep the actual convolution seperate from this, but it has not helped speed wise. Any tips or insights would be greatly appreciated!