Optimizing performance

I cloned your repo and changed the functions :

@views function iterateOPs!(::IterateOPs_basic, wf::WindFarm, sim::Sim, floris::Floris, floridyn::FloriDyn)
    # Save turbine OPs
    tmpOPStates = copy(wf.States_OP[wf.StartI, :])
    tmpTStates  = copy(wf.States_T[wf.StartI, :])
    tmpWFSTates = copy(wf.States_WF[wf.StartI, :])

    # Shift states
    # Downwind step
    step_dw = sim.time_step .* sim.dyn.advection .* wf.States_WF[:, 1] 
     wf.States_OP[:, 4] .+= step_dw

    # Crosswind step
    deflection = centerline(wf.States_OP, wf.States_T, wf.States_WF, floris, wf.D[1])
    step_cw = deflection .- wf.States_OP[:, 5:6]
     wf.States_OP[:, 5:6] .= deflection

    # World coordinate system adjustment
    @inbounds @simd for i in axes(wf.States_OP,1)
      phiwi = angSOWFA2world(wf.States_WF[i, 2])  # Convert wind direction to world coordinates
      cphiwi = cos(phiwi)
      sphiwi = sin(phiwi)
      ai = step_cw[i, 1]
      sdwi = step_dw[i]
      wf.States_OP[i, 1] += cphiwi * sdwi - sphiwi * ai
      wf.States_OP[i, 2] += sphiwi * sdwi + cphiwi * ai
      wf.States_OP[i, 3] += step_cw[i, 2]
    end

    # Circshift & init first OPs
    # OPs
    wf.States_OP = circshift(wf.States_OP, (1, 0))
     wf.States_OP[wf.StartI, :] .= tmpOPStates

    # Turbines
    wf.States_T = circshift(wf.States_T, (1, 0))
     wf.States_T[wf.StartI, :] .= tmpTStates

    # Wind Farm
    wf.States_WF = circshift(wf.States_WF, (1, 0))
     wf.States_WF[wf.StartI, :] .= tmpWFSTates

    # Check if OPs are in order
    buf = zeros(Int,size(wf.States_OP, 1))
    for iT in 1:wf.nT
        inds = wf.StartI[iT]:(wf.StartI[iT] + wf.nOP - 1)
        indOP = buf[inds]
        sortperm!(indOP,wf.States_OP[inds, 4])
        if ! issorted(indOP)  # check if already sorted
           wf.States_OP[inds, :] .= wf.States_OP[inds[indOP], :]
           wf.States_T[inds, :]  .= wf.States_T[inds[indOP], :]
           wf.States_WF[inds, :] .= wf.States_WF[inds[indOP], :]
        end

    end
    return nothing
end

and

function centerline(states_op, states_t, states_wf, floris, d_rotor)
    N          = size(states_op, 1)                     # number of rows
    Δ          = Matrix{ComplexF64}(undef, N, 2)        # output
    is8      = 1 / sqrt(8)
    s2         = sqrt(2)
    α, β       = floris.alpha, floris.beta
    k_a, k_b   = floris.k_a, floris.k_b

    for i ∈ 1:N          # ← one tight SIMD loop
        ############# 1. unpack state rows ####################################
        a  = states_t[i, 1]
        b  = states_t[i, 2]
        c  = states_t[i, 3]
        d  = states_wf[i, 3]
        OP = states_op[i, 4]

        ############# 2. basic derived quantities #############################
        C_T     = calcCt(a, b)
        yaw     = -deg2rad(b)
        cosyaw  = cos(yaw)
        I       = sqrt(c*c + d*d)

        ############# 3. core length x₀ #######################################
        x₀ = (cosyaw * (1 + sqrt(1 - C_T)) /
             (s2 * (α * I + β * (1 - sqrt(1 - C_T))))) * d_rotor

        ############# 4. wake expansion (σ_y / σ_z) ###########################
        k_y     = k_a * I + k_b
        k_z     = k_y                    # identical expression

        diff    = OP - x₀
        f1      = max(diff, 0.0)
        f2      = min(OP / x₀, 1.0)

        σ_y = f1 * k_y + f2 * cosyaw * d_rotor * is8
        σ_z = f1 * k_z + f2 * d_rotor    * is8

        ############# 5. deflection angles ####################################
        Θ = 0.3 * yaw / cosyaw * (1 - sqrt(1 - C_T * cosyaw))

        ############# 6. near‑field / far‑field pieces #########################
        Δ_nfw = Θ * min(OP, x₀)

        Δ_fw₁ = Θ / 14.7 * sqrt(cosyaw / (k_y * k_z * C_T)) *
                (2.9 + 1.3 * sqrt(1 - C_T) - C_T)

        num  = (1.6 + sqrt(C_T)) *
               (1.6 * sqrt((8 * σ_y * σ_z) / (d_rotor^2 * cosyaw)) - sqrt(C_T))
        den  = (1.6 - sqrt(C_T)) *
               (1.6 * sqrt((8 * σ_y * σ_z) / (d_rotor^2 * cosyaw)) + sqrt(C_T))

        Δ_fw₂ = log(complex(num / den, 0.0))   # keep domain identical

        factor = sign(OP - x₀) / 2 + 0.5        # 0, 0.5, or 1

        Δy = Δ_nfw + factor * Δ_fw₁ * Δ_fw₂ * d_rotor

        ############# 7. store result ##########################################
        Δ[i, 1] = Δy
        Δ[i, 2] = 0        # Δz was always zero in the original
    end

    return Δ
end

down to 27 allocs and around 1.5x performance.
what takes most of the time is the log in the centerline and the 3 circshift which I’m sure we could make so much better doing all 3 at the same time

6 Likes