How to get vectorized code for single-field structs?

I have noticed that structs with a single field do not vectorize the same way as the underlying type. Here is an example:

# Immutable wrapper struct with a single `Int` field `x`; used below to compare
# the generated code for tuples of `A` against tuples of plain `Int`.
struct A
    x::Int
end

# Build an N-tuple whose j-th element is `s[j]` when `j <= i` and `t[j]` otherwise.
# `s` and `t` must be tuples of the same length N and element type T.
function f(s::NTuple{N,T}, t::NTuple{N,T}, i) where {N,T}
    # Val(N) passes the tuple length to `ntuple` in the type domain.
    ntuple(j -> ifelse(j <= i, s[j], t[j]), Val(N))
end

Then f is vectorized for Int (output shortened below):

julia> t = ntuple(Int, 2); @code_native f(t, t, 1)
; Function Signature: f(Tuple{Int64, Int64}, Tuple{Int64, Int64}, Int64)
; ┌ @ REPL[1]:1 within `f`
# %bb.0:                                # %top
	#DEBUG_VALUE: f:s <- [DW_OP_deref] [$rsi+0]
	#DEBUG_VALUE: f:t <- [DW_OP_deref] [$rdx+0]
; │ @ REPL[1] within `f`
	#DEBUG_VALUE: f:i <- $rcx
	push	rbp
	mov	rbp, rsp
	mov	rax, rdi
; │ @ REPL[1]:2 within `f`
; │┌ @ ntuple.jl:49 within `ntuple`
; ││┌ @ REPL[1]:2 within `#7`
; │││┌ @ int.jl:514 within `<=`
	vmovq	xmm0, rcx
	vpbroadcastq	xmm0, xmm0
	movabs	rcx, offset .LCPI0_0
	vmovdqa	xmm1, xmmword ptr [rcx]
	vpcmpgtq	xmm0, xmm1, xmm0
; │││└
; │││┌ @ essentials.jl:796 within `ifelse`
	vmovupd	xmm1, xmmword ptr [rsi]
	vblendvpd	xmm0, xmm1, xmmword ptr [rdx], xmm0
; ││└└
	vmovupd	xmmword ptr [rdi], xmm0
	pop	rbp
	ret

but not for A:

julia> t = ntuple(A, 2); @code_native f(t, t, 1)
; Function Signature: f(Tuple{Main.A, Main.A}, Tuple{Main.A, Main.A}, Int64)
; ┌ @ REPL[1]:1 within `f`
# %bb.0:                                # %top
	#DEBUG_VALUE: f:s <- [DW_OP_deref] [$rsi+0]
	#DEBUG_VALUE: f:t <- [DW_OP_deref] [$rdx+0]
; │ @ REPL[1] within `f`
	#DEBUG_VALUE: f:i <- $rcx
	push	rbp
	mov	rbp, rsp
	mov	rax, rdi
; │ @ REPL[1]:2 within `f`
; │┌ @ ntuple.jl:49 within `ntuple`
; ││┌ @ REPL[1]:2 within `#7`
; │││┌ @ int.jl:514 within `<=`
	test	rcx, rcx
; │││└
; │││┌ @ essentials.jl:796 within `ifelse`
	jg	.LBB0_1
# %bb.2:                                # %top
	mov	rdi, qword ptr [rdx]
; │││└
; │││┌ @ int.jl:514 within `<=`
	cmp	rcx, 2
; │││└
; │││┌ @ essentials.jl:796 within `ifelse`
	jl	.LBB0_5
.LBB0_4:
	mov	rcx, qword ptr [rsi + 8]
	jmp	.LBB0_6
.LBB0_1:
	mov	rdi, qword ptr [rsi]
; │││└
; │││┌ @ int.jl:514 within `<=`
	cmp	rcx, 2
; │││└
; │││┌ @ essentials.jl:796 within `ifelse`
	jge	.LBB0_4
.LBB0_5:                                # %top
	mov	rcx, qword ptr [rdx + 8]
.LBB0_6:                                # %top
; ││└└
	mov	qword ptr [rax], rdi
	mov	qword ptr [rax + 8], rcx
	pop	rbp
	ret

In the second case, @code_llvm shows instructions of the form

     %.fca.0.extract8 = extractvalue [1 x i64] %1, 0

to retrieve the single field of the struct. Maybe that’s the problem. A way to get around this would be to unwrap and rewrap the struct manually, as in

# Variant of the element-wise selection restricted to tuples of `A`: for each
# position j it reads the `x` field of `s[j]` and `t[j]`, selects between the two
# Int values with `ifelse(j <= i, ...)`, and wraps the chosen value in a new `A`.
function g(s::NTuple{N,A}, t::NTuple{N,A}, i) where N
    ntuple(j -> A(ifelse(j <= i, s[j].x, t[j].x)), Val(N))
end

Then it’s again vectorized.

Ideally, the same function would give vectorized code both for Int and A. Is there a way to achieve this?

EDIT: This seems to be related to ifelse. If one defines

# Addition for `A`: add the `x` fields of both operands and wrap the sum in a new `A`.
Base.:+(a::A, b::A) = A(a.x+b.x)

then map(+, t, t) is vectorized also for tuples with A elements.

1 Like

Use Julia v1.13

5 Likes