Optimizing performance

I cloned your repo and changed the functions :

@views function iterateOPs!(::IterateOPs_basic, wf::WindFarm, sim::Sim, floris::Floris, floridyn::FloriDyn)
    # Save turbine OPs
    tmpOPStates = copy(wf.States_OP[wf.StartI, :])
    tmpTStates  = copy(wf.States_T[wf.StartI, :])
    tmpWFSTates = copy(wf.States_WF[wf.StartI, :])

    # Shift states
    # Downwind step
    step_dw = sim.time_step .* sim.dyn.advection .* wf.States_WF[:, 1] 
     wf.States_OP[:, 4] .+= step_dw

    # Crosswind step
    deflection = centerline(wf.States_OP, wf.States_T, wf.States_WF, floris, wf.D[1])
    step_cw = deflection .- wf.States_OP[:, 5:6]
     wf.States_OP[:, 5:6] .= deflection

    # World coordinate system adjustment
    @inbounds @simd for i in axes(wf.States_OP,1)
      phiwi = angSOWFA2world(wf.States_WF[i, 2])  # Convert wind direction to world coordinates
      cphiwi = cos(phiwi)
      sphiwi = sin(phiwi)
      ai = step_cw[i, 1]
      sdwi = step_dw[i]
      wf.States_OP[i, 1] += cphiwi * sdwi - sphiwi * ai
      wf.States_OP[i, 2] += sphiwi * sdwi + cphiwi * ai
      wf.States_OP[i, 3] += step_cw[i, 2]
    end

    # Circshift & init first OPs
    # OPs
    wf.States_OP = circshift(wf.States_OP, (1, 0))
     wf.States_OP[wf.StartI, :] .= tmpOPStates

    # Turbines
    wf.States_T = circshift(wf.States_T, (1, 0))
     wf.States_T[wf.StartI, :] .= tmpTStates

    # Wind Farm
    wf.States_WF = circshift(wf.States_WF, (1, 0))
     wf.States_WF[wf.StartI, :] .= tmpWFSTates

    # Check if OPs are in order
    buf = zeros(Int,size(wf.States_OP, 1))
    for iT in 1:wf.nT
        inds = wf.StartI[iT]:(wf.StartI[iT] + wf.nOP - 1)
        indOP = buf[inds]
        sortperm!(indOP,wf.States_OP[inds, 4])
        if ! issorted(indOP)  # check if already sorted
           wf.States_OP[inds, :] .= wf.States_OP[inds[indOP], :]
           wf.States_T[inds, :]  .= wf.States_T[inds[indOP], :]
           wf.States_WF[inds, :] .= wf.States_WF[inds[indOP], :]
        end

    end
    return nothing
end

and

function centerline(states_op, states_t, states_wf, floris, d_rotor)
    N          = size(states_op, 1)                     # number of rows
    Δ          = Matrix{ComplexF64}(undef, N, 2)        # output
    is8      = 1 / sqrt(8)
    s2         = sqrt(2)
    α, β       = floris.alpha, floris.beta
    k_a, k_b   = floris.k_a, floris.k_b

    for i ∈ 1:N          # ← one tight SIMD loop
        ############# 1. unpack state rows ####################################
        a  = states_t[i, 1]
        b  = states_t[i, 2]
        c  = states_t[i, 3]
        d  = states_wf[i, 3]
        OP = states_op[i, 4]

        ############# 2. basic derived quantities #############################
        C_T     = calcCt(a, b)
        yaw     = -deg2rad(b)
        cosyaw  = cos(yaw)
        I       = sqrt(c*c + d*d)

        ############# 3. core length x₀ #######################################
        x₀ = (cosyaw * (1 + sqrt(1 - C_T)) /
             (s2 * (α * I + β * (1 - sqrt(1 - C_T))))) * d_rotor

        ############# 4. wake expansion (σ_y / σ_z) ###########################
        k_y     = k_a * I + k_b
        k_z     = k_y                    # identical expression

        diff    = OP - x₀
        f1      = max(diff, 0.0)
        f2      = min(OP / x₀, 1.0)

        σ_y = f1 * k_y + f2 * cosyaw * d_rotor * is8
        σ_z = f1 * k_z + f2 * d_rotor    * is8

        ############# 5. deflection angles ####################################
        Θ = 0.3 * yaw / cosyaw * (1 - sqrt(1 - C_T * cosyaw))

        ############# 6. near‑field / far‑field pieces #########################
        Δ_nfw = Θ * min(OP, x₀)

        Δ_fw₁ = Θ / 14.7 * sqrt(cosyaw / (k_y * k_z * C_T)) *
                (2.9 + 1.3 * sqrt(1 - C_T) - C_T)

        num  = (1.6 + sqrt(C_T)) *
               (1.6 * sqrt((8 * σ_y * σ_z) / (d_rotor^2 * cosyaw)) - sqrt(C_T))
        den  = (1.6 - sqrt(C_T)) *
               (1.6 * sqrt((8 * σ_y * σ_z) / (d_rotor^2 * cosyaw)) + sqrt(C_T))

        Δ_fw₂ = log(complex(num / den, 0.0))   # keep domain identical

        factor = sign(OP - x₀) / 2 + 0.5        # 0, 0.5, or 1

        Δy = Δ_nfw + factor * Δ_fw₁ * Δ_fw₂ * d_rotor

        ############# 7. store result ##########################################
        Δ[i, 1] = Δy
        Δ[i, 2] = 0        # Δz was always zero in the original
    end

    return Δ
end

down to 27 allocs and around 1.5x performance.
what takes most of the time is the log in the centerline and the 3 circshift which I’m sure we could make so much better doing all 3 at the same time

6 Likes

Very nice! The overall performance of runFLORIdyn() is now 3.4x Matlab (before: 2.4x).

You helped me to optimize the two most critical functions, and that is nice. For now, I am happy!

3 Likes

no problem, I think smn mentioned it but in julia arrays are column major so putting the big 1800 dimention as last and always fix this one first ( or loop first on this one) is perf critical, I understand that it may change ton of things in the code but its really something to consider.

1 Like

Well, if you have the same access pattern in 50 other functions then fixing that would give you performance improvements there as well. You want to structure your data so that you access it in memory in a cache friendly way. If you keep doing [..., :] it suggests you have laid it out incorrectly and will hit large performance penalties as your data become big.

4 Likes

I mean, I did not write this code, I just translated it from Matlab. And I will not make many changes before all of the code is translated.

But I got a nice analysis: FLORIDyn.jl/docs/src/analysis.md at main · ufechner7/FLORIDyn.jl · GitHub

2 Likes

Small update: I made three functions allocation free:

  • iterateOPs!
  • findTurbineGroups
  • centerline!

with help of Claude Sonnet 4 and Gemini 2.5 pro. Sometimes, one of these LLMs worked, sometimes the other. But in the end, the result is very good:

The performance is now 4.8x the Matlab speed, the memory allocations for the full simulation went down from 617 MB to 168 MB.

Thanks for all your suggestions!

3 Likes

Isn’t MatLab column major as well, so that suggests perhaps some room for improvement in the original code as well? (Depending on your familiarity with the MatLab code, you could try the change there to verify that results aren’t affected)

2 Likes

Column major or not is not the key point here. The biggest gains were achieved by:

  • using a struct instead of a Matlab cell array (which acts more like a dict)
  • adding unit tests (easy in Julia, possible in Matlab); that was key for any further refactoring
  • using @views, using issorted()
  • pre-allocating arrays before entering the simulation loop

Of course, optimizing the Matlab code would probably also be possible (to some degree), but that is not our task.

1 Like

I agree that it depends on the situation and the relative computational efforts of course. For me, I’ve seen 30-40% speed ups in computational bottlenecks for my research projects just by switching to a layout consistent with column majority, so I wanted to recommend it just in case

1 Like

To be fair, you cannot really say that before you have done it, or? It’s great the code is faster, maybe it is fast enough for you now, but this is such a basic important thing that if you ask for optimization for your code it would be criminal for us not to emphasize it over and over.

Said jokingly, it’s like you are asking for help keeping your house warm and we point out the big hole in the wall while you are saying that it’s a bit warmer now because we isolated the windows.

But if you are happy you are happy so that’s all good then.

3 Likes

What is now the major remaining bottlenecks? Lines like these:

seem potentially slow, probably calculating angles with acos, then taking cos/sin later again. Is there perhaps some way to do coordinate transforms using matrix products with SMatrix, or something?

Not sure I think most of it is log(::Complex) in the loop in centerline and the 3 circ shift from prof view that’s what I saw, a lot of the rest is get index and iterate because didn’t inbounds things everywhere yet and because its on non-aligned views

Take a look at PreallocationTools.jl it will be much nicer that a big buffer struct in the code. Especially LazyBufferCache

you can

  • use column major as @kristoffer.carlsson suggested (this is to me the best suggestion so far with no affense to other’s)
  • use loopvectorization
  • use kerbelabstraction (for gpu which would have hinted you to use column major)
  • check type stability and use float32

in the end, this is is gona blow like a breeze ( I could not refrain from this last one)

1 Like