Hello,
I am trying to port code from a Rust codebase (not my own code) to Julia. Even though the code is nearly identical to the Rust one, it appears to run ~10x slower. Code is not clean at all – I am just doing a “literal” translation from the original code. It produces the exact same results, and follows the same sequence of operations (goes through the branches the same number of times, and so on). Are there blatantly obvious reasons why this code is so much slower than its Rust equivalent?
The code is below:
function compute_similar_smers(smer::UInt64, seed::Seed, hit_threshold::Int64)::Set{UInt64}
encoded::UInt64 = SPRINT.Utils.encode_smer(smer)
similar_smers::Set{UInt64} = Set()
smer_score::Int16 = 0
seed_current_digit::UInt64 = 0
smer_current_digit::UInt64 = 0
# Compute score of smer with itself (baseline)
for i in 1:length(seed)
seed_current_digit = seed.value & (31 << (i * 5))
if seed_current_digit != 0
smer_current_digit = encoded & (31 << (i * 5));
smer_current_digit = smer_current_digit >> (i * 5);
smer_score += PAM120_SPRINT[smer_current_digit,smer_current_digit]
end
end
change_smer_digit(encoded, UInt64(0), seed, hit_threshold - smer_score, similar_smers)
similar_smers
end
function change_smer_digit(
smer::UInt64,
digit_position::UInt64,
seed::Seed,
score_needed::Int64,
similar_smers::Set{UInt64}
)
if digit_position == length(seed)
return
end
current_seed_digit::UInt64 = seed.value & (31 << (digit_position * 5))
smer_current_digit::UInt64 = 0
y::UInt64 = 0
if current_seed_digit == 0
change_smer_digit(smer, digit_position + 1, seed, score_needed, similar_smers)
elseif digit_position == length(seed) - 1
smer_current_digit = (smer & (31 << (digit_position * 5))) >> (digit_position * 5)
for i in 0:19
if PAM120_SPRINT[smer_current_digit,PAM120_ORDERED_SPRINT[smer_current_digit,i]] >= score_needed
y = smer & (~(31 << (digit_position * 5)))
y = y | (PAM120_ORDERED_SPRINT[smer_current_digit,i] << (5 * digit_position))
push!(similar_smers, SPRINT.Utils.decode_smer(y, seed))
else
break
end
end
else
smer_current_digit = (smer & (31 << (digit_position * 5))) >> (digit_position * 5)
smer_current_digit_plus1::UInt64 = get_smer_dig_plus1(smer, digit_position, seed)
new_score_needed::Int64 = 0
for i in 0:19
if PAM120_SPRINT[smer_current_digit,PAM120_ORDERED_SPRINT[smer_current_digit,i]] >= score_needed
y = smer & (~(31 << (digit_position * 5)))
y = y | (PAM120_ORDERED_SPRINT[smer_current_digit,i] << (5 * digit_position))
push!(similar_smers, SPRINT.Utils.decode_smer(y, seed))
new_score_needed = score_needed + PAM120_SPRINT[smer_current_digit_plus1,smer_current_digit_plus1] - PAM120_SPRINT[smer_current_digit,PAM120_ORDERED_SPRINT[smer_current_digit,i]]
change_smer_digit(y, digit_position + 1, seed, new_score_needed, similar_smers)
else
break
end
end
end
end
function get_smer_dig_plus1(smer::UInt64, digit_position::UInt64, seed::Seed)::UInt64
seed_current_digit::UInt64 = seed.value & (31 << ((digit_position + 1) * 5))
if seed_current_digit != 0
return (smer & (31 << ((digit_position + 1) * 5))) >> ((digit_position + 1) * 5)
else
return get_smer_dig_plus1(smer, digit_position + 1, seed)
end
end
The PAM120_SPRINT and PAM120_ORDERED_SPRINT are global variables, but I was told that by declaring them as const, Julia would be able to optimize for that. Could it have anything to do with memory allocation (for my input, it runs in ~0.019s with ~233k allocations on second call to exclude compilation time, which seems like a lot, given that my function change_smer_digits
is called ~4000 times including recursive calls)?
Cheers,
Francois