FYI
After watching this presentation (and some referenced links) on using CPU resources more efficiently, I was able to increase performance just by changing the sequence of a few lines of code.
The goal is to keep as many of the CPU's execution units (EUs) busy simultaneously as possible. To do that, you order sequential lines of code so that adjacent lines can be executed independently of one another.
Here’s one snippet from the Crystal code.
Original
def nextp_init(rhi, kmin, modpg, primes, resinvrs)
  nextp = Slice(UInt64).new(primes.size*2)
  r_hi, r_lo = rhi, rhi - 2
  primes.each_with_index do |prime, j|
    k = (prime - 2) // modpg
    r = (prime - 2) % modpg + 2
    r_inv = resinvrs[r].to_u64
    ri = (r_inv * r_lo - 2) % modpg + 2
    ki = k * (prime + ri) + (r * ri - 2) // modpg
    ki < kmin ? (ki = (kmin - ki) % prime; ki = prime - ki if ki > 0) : (ki -= kmin)
    nextp[j << 1] = ki.to_u64
    ri = (r_inv * r_hi - 2) % modpg + 2
    ki = k * (prime + ri) + (r * ri - 2) // modpg
    ki < kmin ? (ki = (kmin - ki) % prime; ki = prime - ki if ki > 0) : (ki -= kmin)
    nextp[j << 1 | 1] = ki.to_u64
  end
  nextp
end
More Efficient/Faster
def nextp_init(rhi, kmin, modpg, primes, resinvrs)
  nextp = Slice(UInt64).new(primes.size*2)
  r_hi, r_lo = rhi, rhi - 2
  primes.each_with_index do |prime, j|
    k = (prime - 2) // modpg
    r = (prime - 2) % modpg + 2
    r_inv = resinvrs[r].to_u64
    rl = (r_inv * r_lo - 2) % modpg + 2
    rh = (r_inv * r_hi - 2) % modpg + 2
    kl = k * (prime + rl) + (r * rl - 2) // modpg
    kh = k * (prime + rh) + (r * rh - 2) // modpg
    kl < kmin ? (kl = (kmin - kl) % prime; kl = prime - kl if kl > 0) : (kl -= kmin)
    kh < kmin ? (kh = (kmin - kh) % prime; kh = prime - kh if kh > 0) : (kh -= kmin)
    nextp[j << 1] = kl.to_u64
    nextp[j << 1 | 1] = kh.to_u64
  end
  nextp
end
Changing the code sequence so that the next line of code isn’t dependent on a result from the previous one allows two EUs to run those instructions simultaneously. This one change in this one routine appreciably increased total runtime speed.
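To separate the idea from the sieve details, here’s a minimal hypothetical sketch (not from the program above) of the same reordering. In the first version each pair of lines forms a dependency chain and one accumulator ties every line to the one before it; in the second, the independent first steps of the two chains sit next to each other and two accumulators keep the chains separate, so different execution units can work on them at the same time. All names and constants here are made up.

```crystal
# Hypothetical illustration only -- these functions are not part of the sieve program.

# Dependent ordering: b waits on a, d waits on c, and a single
# accumulator chains every line to the one before it.
def combine_serial(xs : Slice(UInt64), ys : Slice(UInt64)) : UInt64
  acc = 0_u64
  xs.size.times do |i|
    a = xs[i] * 3_u64
    b = a + 7_u64         # waits on a
    acc += b
    c = ys[i] * 5_u64
    d = c + 11_u64        # waits on c
    acc += d
  end
  acc
end

# Reordered: the independent first steps of the two chains are adjacent,
# and two accumulators keep the chains independent from start to finish.
def combine_interleaved(xs : Slice(UInt64), ys : Slice(UInt64)) : UInt64
  acc0 = 0_u64
  acc1 = 0_u64
  xs.size.times do |i|
    a = xs[i] * 3_u64
    c = ys[i] * 5_u64     # independent of a
    b = a + 7_u64
    d = c + 11_u64        # independent of b
    acc0 += b
    acc1 += d
  end
  acc0 + acc1
end
```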
Here’s another example snippet from the same program.
Original
primes.each_with_index do |prime, j|
  k = nextp.to_unsafe[j << 1]
  while k < kn
    seg[k >> s] |= 1_u64 << (k & bmask)
    k += prime end
  nextp.to_unsafe[j << 1] = k - kn
  k = nextp.to_unsafe[j << 1 | 1]
  while k < kn
    seg[k >> s] |= 1_u64 << (k & bmask)
    k += prime end
  nextp.to_unsafe[j << 1 | 1] = k - kn
end
More Efficient/Faster
primes.each_with_index do |prime, j|
  kl = nextp.to_unsafe[j << 1]
  kh = nextp.to_unsafe[j << 1 | 1]
  while kl < kn
    seg[kl >> s] |= 1_u64 << (kl & bmask)
    kl += prime end
  while kh < kn
    seg[kh >> s] |= 1_u64 << (kh & bmask)
    kh += prime end
  nextp.to_unsafe[j << 1] = kl - kn
  nextp.to_unsafe[j << 1 | 1] = kh - kn
end
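Here the reordered version loads both starting offsets first and then runs the two while loops over independent counters (kl and kh), so neither loop has to wait on the other’s results. If you want to check the effect on your own machine, Crystal’s built-in Benchmark module can compare the two orderings; the sketch below uses a made-up stand-in workload (one accumulator versus two independent ones) rather than the actual sieve routines, so the labels, data, and sizes are purely illustrative.

```crystal
require "benchmark"

# Illustrative measurement sketch -- the workload is a stand-in, not the sieve code.
data = Slice(UInt64).new(1 << 20) { |i| i.to_u64 }

Benchmark.ips do |x|
  x.report("one dependent chain") do
    total = 0_u64
    data.each { |v| total += v }   # every add waits on the previous one
    total
  end

  x.report("two independent chains") do
    t0 = 0_u64
    t1 = 0_u64
    i = 0
    while i < data.size            # size is even here, so the pairs cover everything
      t0 += data[i]
      t1 += data[i + 1]            # independent of the t0 chain
      i += 2
    end
    t0 + t1
  end
end
```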
I’ve also seen the gcc-compiled versions for C++ and Nim show a bigger speed improvement (execution-time reduction) than the llvm-compiled ones, except for Rust, which also sees a significant improvement.
I don’t know how applicable this is to Julia, but I’m in the process of updating all the language versions of my code to get these FREE speedups by just writing the code better.