Optimizing performance

yolhan_mannes · July 25, 2025, 11:27pm

I cloned your repo and changed the functions :

@views function iterateOPs!(::IterateOPs_basic, wf::WindFarm, sim::Sim, floris::Floris, floridyn::FloriDyn)
    # Save turbine OPs
    tmpOPStates = copy(wf.States_OP[wf.StartI, :])
    tmpTStates  = copy(wf.States_T[wf.StartI, :])
    tmpWFSTates = copy(wf.States_WF[wf.StartI, :])

    # Shift states
    # Downwind step
    step_dw = sim.time_step .* sim.dyn.advection .* wf.States_WF[:, 1] 
     wf.States_OP[:, 4] .+= step_dw

    # Crosswind step
    deflection = centerline(wf.States_OP, wf.States_T, wf.States_WF, floris, wf.D[1])
    step_cw = deflection .- wf.States_OP[:, 5:6]
     wf.States_OP[:, 5:6] .= deflection

    # World coordinate system adjustment
    @inbounds @simd for i in axes(wf.States_OP,1)
      phiwi = angSOWFA2world(wf.States_WF[i, 2])  # Convert wind direction to world coordinates
      cphiwi = cos(phiwi)
      sphiwi = sin(phiwi)
      ai = step_cw[i, 1]
      sdwi = step_dw[i]
      wf.States_OP[i, 1] += cphiwi * sdwi - sphiwi * ai
      wf.States_OP[i, 2] += sphiwi * sdwi + cphiwi * ai
      wf.States_OP[i, 3] += step_cw[i, 2]
    end

    # Circshift & init first OPs
    # OPs
    wf.States_OP = circshift(wf.States_OP, (1, 0))
     wf.States_OP[wf.StartI, :] .= tmpOPStates

    # Turbines
    wf.States_T = circshift(wf.States_T, (1, 0))
     wf.States_T[wf.StartI, :] .= tmpTStates

    # Wind Farm
    wf.States_WF = circshift(wf.States_WF, (1, 0))
     wf.States_WF[wf.StartI, :] .= tmpWFSTates

    # Check if OPs are in order
    buf = zeros(Int,size(wf.States_OP, 1))
    for iT in 1:wf.nT
        inds = wf.StartI[iT]:(wf.StartI[iT] + wf.nOP - 1)
        indOP = buf[inds]
        sortperm!(indOP,wf.States_OP[inds, 4])
        if ! issorted(indOP)  # check if already sorted
           wf.States_OP[inds, :] .= wf.States_OP[inds[indOP], :]
           wf.States_T[inds, :]  .= wf.States_T[inds[indOP], :]
           wf.States_WF[inds, :] .= wf.States_WF[inds[indOP], :]
        end

    end
    return nothing
end

and

function centerline(states_op, states_t, states_wf, floris, d_rotor)
    N          = size(states_op, 1)                     # number of rows
    Δ          = Matrix{ComplexF64}(undef, N, 2)        # output
    is8      = 1 / sqrt(8)
    s2         = sqrt(2)
    α, β       = floris.alpha, floris.beta
    k_a, k_b   = floris.k_a, floris.k_b

    for i ∈ 1:N          # ← one tight SIMD loop
        ############# 1. unpack state rows ####################################
        a  = states_t[i, 1]
        b  = states_t[i, 2]
        c  = states_t[i, 3]
        d  = states_wf[i, 3]
        OP = states_op[i, 4]

        ############# 2. basic derived quantities #############################
        C_T     = calcCt(a, b)
        yaw     = -deg2rad(b)
        cosyaw  = cos(yaw)
        I       = sqrt(c*c + d*d)

        ############# 3. core length x₀ #######################################
        x₀ = (cosyaw * (1 + sqrt(1 - C_T)) /
             (s2 * (α * I + β * (1 - sqrt(1 - C_T))))) * d_rotor

        ############# 4. wake expansion (σ_y / σ_z) ###########################
        k_y     = k_a * I + k_b
        k_z     = k_y                    # identical expression

        diff    = OP - x₀
        f1      = max(diff, 0.0)
        f2      = min(OP / x₀, 1.0)

        σ_y = f1 * k_y + f2 * cosyaw * d_rotor * is8
        σ_z = f1 * k_z + f2 * d_rotor    * is8

        ############# 5. deflection angles ####################################
        Θ = 0.3 * yaw / cosyaw * (1 - sqrt(1 - C_T * cosyaw))

        ############# 6. near‑field / far‑field pieces #########################
        Δ_nfw = Θ * min(OP, x₀)

        Δ_fw₁ = Θ / 14.7 * sqrt(cosyaw / (k_y * k_z * C_T)) *
                (2.9 + 1.3 * sqrt(1 - C_T) - C_T)

        num  = (1.6 + sqrt(C_T)) *
               (1.6 * sqrt((8 * σ_y * σ_z) / (d_rotor^2 * cosyaw)) - sqrt(C_T))
        den  = (1.6 - sqrt(C_T)) *
               (1.6 * sqrt((8 * σ_y * σ_z) / (d_rotor^2 * cosyaw)) + sqrt(C_T))

        Δ_fw₂ = log(complex(num / den, 0.0))   # keep domain identical

        factor = sign(OP - x₀) / 2 + 0.5        # 0, 0.5, or 1

        Δy = Δ_nfw + factor * Δ_fw₁ * Δ_fw₂ * d_rotor

        ############# 7. store result ##########################################
        Δ[i, 1] = Δy
        Δ[i, 2] = 0        # Δz was always zero in the original
    end

    return Δ
end

down to 27 allocs and around 1.5x performance.
what takes most of the time is the log in the centerline and the 3 circshift which I’m sure we could make so much better doing all 3 at the same time

ufechner7 · July 25, 2025, 11:47pm

Very nice! The overall performance of runFLORIdyn() is now 3.4x Matlab (before: 2.4x).

You helped me to optimize the two most critical functions, and that is nice. For now, I am happy!

yolhan_mannes · July 25, 2025, 11:51pm

no problem, I think smn mentioned it but in julia arrays are column major so putting the big 1800 dimention as last and always fix this one first ( or loop first on this one) is perf critical, I understand that it may change ton of things in the code but its really something to consider.

kristoffer.carlsson · July 26, 2025, 12:21am

Well, if you have the same access pattern in 50 other functions then fixing that would give you performance improvements there as well. You want to structure your data so that you access it in memory in a cache friendly way. If you keep doing [..., :] it suggests you have laid it out incorrectly and will hit large performance penalties as your data become big.

ufechner7 · July 26, 2025, 5:13am

I mean, I did not write this code, I just translated it from Matlab. And I will not make many changes before all of the code is translated.

But I got a nice analysis: FLORIDyn.jl/docs/src/analysis.md at main · ufechner7/FLORIDyn.jl · GitHub

ufechner7 · July 26, 2025, 11:54am

Small update: I made three functions allocation free:

iterateOPs!
findTurbineGroups
centerline!

with help of Claude Sonnet 4 and Gemini 2.5 pro. Sometimes, one of these LLMs worked, sometimes the other. But in the end, the result is very good:

The performance is now 4.8x the Matlab speed, the memory allocations for the full simulation went down from 617 MB to 168 MB.

Thanks for all your suggestions!

JADekker · July 26, 2025, 12:31pm

Isn’t MatLab column major as well, so that suggests perhaps some room for improvement in the original code as well? (Depending on your familiarity with the MatLab code, you could try the change there to verify that results aren’t affected)

ufechner7 · July 26, 2025, 12:57pm

Column major or not is not the key point here. The biggest gains were achieved by:

using a struct instead of a Matlab cell array (which acts more like a dict)
adding unit tests (easy in Julia, possible in Matlab); that was key for any further refactoring
using @views, using issorted()
pre-allocating arrays before entering the simulation loop

Of course, optimizing the Matlab code would probably also be possible (to some degree), but that is not our task.

JADekker · July 26, 2025, 2:31pm

I agree that it depends on the situation and the relative computational efforts of course. For me, I’ve seen 30-40% speed ups in computational bottlenecks for my research projects just by switching to a layout consistent with column majority, so I wanted to recommend it just in case

kristoffer.carlsson · July 26, 2025, 3:51pm

To be fair, you cannot really say that before you have done it, or? It’s great the code is faster, maybe it is fast enough for you now, but this is such a basic important thing that if you ask for optimization for your code it would be criminal for us not to emphasize it over and over.

Said jokingly, it’s like you are asking for help keeping your house warm and we point out the big hole in the wall while you are saying that it’s a bit warmer now because we isolated the windows.

But if you are happy you are happy so that’s all good then.

DNF · July 26, 2025, 4:57pm

What is now the major remaining bottlenecks? Lines like these:

yolhan_mannes:

phiwi = angSOWFA2world(wf.States_WF[i, 2])  # Convert wind direction to world coordinates
      cphiwi = cos(phiwi)
      sphiwi = sin(phiwi)
      ai = step_cw[i, 1]
      sdwi = step_dw[i]
      wf.States_OP[i, 1] += cphiwi * sdwi - sphiwi * ai
      wf.States_OP[i, 2] += sphiwi * sdwi + cphiwi * ai
      wf.States_OP[i, 3] += step_cw[i, 2]

seem potentially slow, probably calculating angles with acos, then taking cos/sin later again. Is there perhaps some way to do coordinate transforms using matrix products with SMatrix, or something?

yolhan_mannes · July 26, 2025, 4:58pm

Not sure I think most of it is log(::Complex) in the loop in centerline and the 3 circ shift from prof view that’s what I saw, a lot of the rest is get index and iterate because didn’t inbounds things everywhere yet and because its on non-aligned views

yolhan_mannes · July 26, 2025, 5:08pm

Take a look at PreallocationTools.jl it will be much nicer that a big buffer struct in the code. Especially LazyBufferCache

rveltz · July 26, 2025, 5:54pm

you can

use column major as @kristoffer.carlsson suggested (this is to me the best suggestion so far with no affense to other’s)
use loopvectorization
use kerbelabstraction (for gpu which would have hinted you to use column major)
check type stability and use float32

in the end, this is is gona blow like a breeze ( I could not refrain from this last one)

Topic		Replies	Views
Trying to understand how to write good code (example) New to Julia	34	3004	February 16, 2017
Creating the same matrix in Julia as in Matlab takes longer time New to Julia	28	2186	June 28, 2017
Understanding Julia performance in simple finite difference code New to Julia	38	4443	June 7, 2018
Poor performance due to memory allocations? Performance memory-allocation	17	2850	January 15, 2019
My julia code is somehow much slower than the matlab code New to Julia question , performance , matlab	55	4345	December 30, 2022

Optimizing performance

Related topics