Hi performance people! I am implementing, in Julia, the remainder routines for a constant modulus from the paper by Lemire et al., based on their C examples.

Below is the original C implementation followed by my Julia translation.

C code from Lemire et al (2019)

```
uint32_t d = ...;// your divisor > 0
/* c = floor(2^64 / d) + 1, i.e. the ceiling of 2^64 / d for d > 1: the
 * precomputed "magic" reciprocal. 0xFFFFFFFFFFFFFFFF is 2^64 - 1. */
uint64_t c = UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1;
// fastmod computes (n mod d) given precomputed c
uint32_t fastmod(uint32_t n ) {
/* (c * n) mod 2^64 — the 64-bit wraparound is intentional; lowbits holds
 * the fractional part of n/d scaled by 2^64. */
uint64_t lowbits = c * n;
/* Multiply the scaled fraction by d and keep the high 64 bits,
 * which recovers n mod d exactly (Lemire et al., 2019). */
return ((__uint128_t)lowbits * d) >> 64;
}
```

Julia code

```
"""
    Mod(d)

Precompute the constant `c = div(typemax(UInt64), d) + 1` ("magic" reciprocal
of Lemire et al., 2019) for a fixed positive modulus `d`, so that
`fastmod(n, m)` can compute `n % d` with two multiplications instead of a
hardware division.

# Arguments
- `d`: the modulus; any `Integer` that fits in a `UInt32` and is `> 0`.

# Throws
- `ArgumentError` if `d == 0`.
- `InexactError` if `d` is negative or exceeds `typemax(UInt32)`.
"""
struct Mod
    modulo::UInt32      # the divisor d
    modulo_inv::UInt64  # ceil(2^64 / d), the precomputed magic constant c
    function Mod(d::UInt32)
        # d == 0 would otherwise surface as an opaque DivideError from `div`.
        d > 0 || throw(ArgumentError("modulus d must be positive, got $d"))
        cmod = div(typemax(UInt64), d) + one(UInt64)
        return new(d, cmod)
    end
    # Accept any integer type (generalizes the original Int-only method);
    # `convert` rejects negative or too-large values with an InexactError.
    function Mod(d::Integer)
        return Mod(convert(UInt32, d))
    end
end
"""
    fastmod(n::UInt32, m::Mod) -> UInt32

Compute `n % m.modulo` using the precomputed magic constant in `m`
(Lemire et al., 2019), avoiding a hardware division.
"""
@inline function fastmod(n::UInt32, m::Mod)::UInt32
    # (c * n) mod 2^64 — Julia's UInt64 multiply wraps, matching the C code.
    lowbits = m.modulo_inv * n
    # widemul produces the full 128-bit product; the high 64 bits are n mod d.
    # `% UInt32` is a truncating rem-cast (the faithful translation of the C
    # cast) and is always lossless here because the result is < d <= 2^32 - 1,
    # whereas `convert` would emit a range check.
    return (widemul(lowbits, m.modulo) >> 64) % UInt32
end
```

I have refactored it to use a `struct`

to store the precomputation (d and c in the C code).

Are there any optimizations I can add to the Julia version? Is `convert`

the best translation for the typecasting in the C code? I haven’t done much micro-optimization so I am not sure how to optimize the `code_native`

output or anything.

The repository for my package is here, but it’s basically the above plus tests at this point. It’s usually faster than Julia’s `mod`

( for positive 32 bit moduli), if you’re going to use a given modulus at least twice, but of course that depends.

Benchmarking code and results

```
# Baseline benchmark kernel: for every modulus in `b`, overwrite `c` with the
# elementwise remainders of `a` computed via Base.mod. After the call,
# `c[i] == mod(a[i], last(b))`.
function bench_mod(a, b, c)
    for divisor in b
        map!(x -> mod(x, divisor), c, a)
    end
    return nothing
end
# Fastmod benchmark kernel: identical loop structure to `bench_mod`, except the
# per-modulus magic constant is precomputed once (via `Mod`) before the inner
# sweep over `a`. After the call, `c[i] == fastmod(a[i], Mod(last(b)))`.
function bench_fastmod(a, b, c)
    for divisor in b
        precomputed = Mod(divisor)
        map!(x -> fastmod(x, precomputed), c, a)
    end
    return nothing
end
"""
    bench()

Benchmark `fastmod` (with precomputed `Mod`) against `Base.mod` over 100_000
random `UInt32` values and 100 random moduli, displaying both
`BenchmarkTools.Trial` results.
"""
function bench()
    a = convert(Vector{UInt32}, sample(1:typemax(UInt32), 100_000))
    b = convert(Vector{UInt32}, sample(1:typemax(UInt32), 100))
    c1 = similar(a)
    c2 = similar(a)
    # BUG FIX: the results must NOT be stored in variables named
    # `bench_fastmod`/`bench_mod`. Assigning to those names makes them local to
    # this function for its entire body, so the un-interpolated function names
    # inside `@benchmark bench_fastmod($a, $b, $c2)` would resolve to the
    # still-undefined locals and raise an UndefVarError.
    trial_fastmod = @benchmark bench_fastmod($a, $b, $c2)
    trial_mod = @benchmark bench_mod($a, $b, $c1)
    display(trial_fastmod)
    display(trial_mod)
    return nothing
end
```

For these sizes of `a`

and `b`

, on my Ryzen CPU, it is about twice as quick.

```
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 4.892 ms (0.00% GC)
median time: 4.952 ms (0.00% GC)
mean time: 4.953 ms (0.00% GC)
maximum time: 5.046 ms (0.00% GC)
--------------
samples: 1010
evals/sample: 1
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 12.851 ms (0.00% GC)
median time: 12.982 ms (0.00% GC)
mean time: 12.973 ms (0.00% GC)
maximum time: 13.201 ms (0.00% GC)
--------------
samples: 386
evals/sample: 1
```

thanks!