I don’t have much experience with this, so I was wondering: why is the last argument an Int32? Looking at the documentation for PCLMULQDQ — Carry-Less Multiplication Quadword, I would have expected an 8-bit immediate (i.e. a `UInt8`), and that does seem to work as well:
# Two `Int64` lanes wrapped in `VecElement`, used as both the argument and
# return type of the intrinsic call below.
const m128 = NTuple{2,VecElement{Int64}}

"""
    carrylessmul(a::m128, b::m128)

Call the LLVM intrinsic `llvm.x86.pclmulqdq` on `a` and `b` and return the
resulting `m128`.

The third intrinsic operand is declared as `UInt8` and is fixed to zero here.
Per the PCLMULQDQ instruction reference, an immediate of zero selects the low
quadword of each operand.
"""
carrylessmul(a::m128, b::m128) =
    ccall("llvm.x86.pclmulqdq", llvmcall, m128, (m128, m128, UInt8), a, b, 0x00)
julia> @code_native carrylessmul(m128((1, 2)), m128((3, 4)))
.section __TEXT,__text,regular,pure_instructions
; ┌ @ REPL[8]:2 within `carrylessmul'
vpclmulqdq $0, %xmm1, %xmm0, %xmm0
retl
nopw (%eax,%eax)
; └