Performance increased after adding an if?

Hi everyone,

I was playing with a simple case of option pricing when I discovered this strange behavior.
After adding an extra conditional statement, the function actually became faster.

This is somewhat counterintuitive, and I'm hoping someone can help me understand what's happening under the hood.

Thanks

function euroOption(numSims, S, K, r, σ, T, CorP)
    S_adj = S * exp((r - σ^2.0/2.0)*T)   # deterministic part of the risk-neutral terminal price
    psum = 0.0

    if CorP == 1        # call option: payoff max(St - K, 0)
        for i in 1:numSims
            St = S_adj * exp(σ*sqrt(T)*randn())
            prem = St-K
            psum += ifelse(prem > 0.0, prem, 0.0)
        end
    elseif CorP == 0    # put option: payoff max(K - St, 0)
        for i in 1:numSims
            St = S_adj * exp(σ*sqrt(T)*randn())
            prem = K-St
            psum += ifelse(prem > 0.0, prem, 0.0)
        end
    else
        println("invalid option type")
        return 0.0
    end

    price = psum / numSims * exp(-r*T)
    return price
end

function euroOption1(numSims, S, K, r, σ, T)
    S_adj = S * exp((r - σ^2.0/2.0)*T)
    psum = 0.0

    for i in 1:numSims
        St = S_adj * exp(σ*sqrt(T)*randn())
        prem = St-K
        psum += ifelse(prem > 0.0, prem, 0.0)
    end

    price = psum / numSims * exp(-r*T)
    return price
end

julia> @time euroOption(10^8, 100.0, 110.0, 0.05, 0.2, 1.0, 1)
  1.718544 seconds (35.06 k allocations: 1.771 MiB)
6.041585176794043

julia> @time euroOption(10^8, 100.0, 110.0, 0.05, 0.2, 1.0, 1)
  1.813658 seconds (6 allocations: 192 bytes)
6.039173673272666

julia> @time euroOption1(10^8, 100.0, 110.0, 0.05, 0.2, 1.0)
  2.046597 seconds (25.51 k allocations: 1.317 MiB)
6.039447950343969

julia> @time euroOption1(10^8, 100.0, 110.0, 0.05, 0.2, 1.0)
  2.043858 seconds (6 allocations: 192 bytes)
6.039694854595267

julia> function euroOption1(numSims, S, K, r, σ, T)
           S_adj = S * exp((r - σ^2.0/2.0)*T)
           psum = 0.0
       
           for i in 1:numSims
               St = S_adj * exp(σ*sqrt(T)*randn())
               prem = St-K
               psum += ifelse(prem > 0.0, prem, 0.0)
           end
       
           price = psum / numSims * exp(-r*T)
           return price
       end
euroOption1 (generic function with 1 method)

julia> function euroOption2(numSims, S, K, r, σ, T)
           S_adj = S * exp((r - σ^2.0/2.0)*T)
           psum = 0.0
       
           for i in 1:numSims
               St = S_adj * exp(σ*sqrt(T)*randn())
               prem = St-K
               psum += max(prem, 0.0)
           end
       
           price = psum / numSims * exp(-r*T)
           return price
       end
euroOption2 (generic function with 1 method)

julia> using BenchmarkTools

julia> @btime euroOption(10^8, 100.0, 110.0, 0.05, 0.2, 1.0, 1)
  944.977 ms (0 allocations: 0 bytes)
6.040284868724525

julia> @btime euroOption1(10^8, 100.0, 110.0, 0.05, 0.2, 1.0)
  924.992 ms (0 allocations: 0 bytes)
6.0417072045220435

julia> function euroOption2(numSims, S, K, r, σ, T)
           S_adj = S * exp((r - σ^2.0/2.0)*T)
           psum = 0.0
       
           for i in 1:numSims
               St = S_adj * exp(σ*sqrt(T)*randn())
               prem = St-K
               @fastmath psum += max(prem, 0.0)
           end
       
           price = psum / numSims * exp(-r*T)
           return price
       end
euroOption2 (generic function with 1 method)

julia> @btime euroOption2(10^8, 100.0, 110.0, 0.05, 0.2, 1.0)
  937.834 ms (0 allocations: 0 bytes)
6.039914675249353

Thanks.

I switched to @btime and the results were more intuitive. However, my @time results were quite stable, so the effect doesn't seem to be explained just by the difference between the two macros.

I might as well just use @btime anyway. And thanks for mentioning @fastmath; I didn't know about it.

I don’t see that pattern with just @time:

julia> @time euroOption2(10^8, 100.0, 110.0, 0.05, 0.2, 1.0)
  0.955844 seconds (47 allocations: 2.750 KiB)
6.040339462548814

julia> @time euroOption2(10^8, 100.0, 110.0, 0.05, 0.2, 1.0)
  0.959690 seconds (6 allocations: 192 bytes)
6.039576104119825

julia> @time euroOption1(10^8, 100.0, 110.0, 0.05, 0.2, 1.0)
  0.948820 seconds (7 allocations: 272 bytes)
6.041294751811773

julia> @time euroOption1(10^8, 100.0, 110.0, 0.05, 0.2, 1.0)
  0.959119 seconds (6 allocations: 192 bytes)
6.043311435079458

julia> @time euroOption(10^8, 100.0, 110.0, 0.05, 0.2, 1.0, 1)
  0.982110 seconds (7 allocations: 272 bytes)
6.039844280425383

julia> @time euroOption(10^8, 100.0, 110.0, 0.05, 0.2, 1.0, 1)
  0.963529 seconds (6 allocations: 192 bytes)
6.03903922130195

As for @fastmath, it can help in some cases. For example, if you have a + b * c, it can turn that into a single fma instruction instead of a separate multiply and add.
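A quick way to check that on your own machine is to compare the generated code for a plain and a @fastmath version (the helper names here are made up, and the exact instructions depend on your CPU and Julia/LLVM version):

fmuladd_plain(a, b, c) = a + b*c
fmuladd_fast(a, b, c) = @fastmath a + b*c

@code_native fmuladd_plain(1.0, 2.0, 3.0)  # typically a separate vmulsd and vaddsd
@code_native fmuladd_fast(1.0, 2.0, 3.0)   # typically a single vfmadd on FMA-capable CPUs

Base's muladd is another way to allow that particular contraction without @fastmath.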
It also helps a lot in other cases where there are CPU instructions doing the thing you want, like square root or max:

julia> fmax(a, b) = @fastmath max(a,b)
fmax (generic function with 1 method)

julia> fsqrt(x) = @fastmath sqrt(x)
fsqrt (generic function with 1 method)

julia> @code_native max(2.3, 3.4)
	.text
; Function max {
; Location: math.jl:575
	vcmpordsd	%xmm0, %xmm0, %xmm2
	vcmpordsd	%xmm1, %xmm1, %xmm3
; Function signbit; {
; Location: floatfuncs.jl:15
	vmovq	%xmm1, %rcx
	vmovq	%xmm0, %rax
; Function signbit; {
; Location: int.jl:93
; Function <; {
; Location: int.jl:49
	testq	%rcx, %rcx
;}}}
	vblendvpd	%xmm2, %xmm1, %xmm0, %xmm2
	vblendvpd	%xmm3, %xmm0, %xmm1, %xmm3
	vmovapd	%xmm2, %xmm4
	js	L58
; Function signbit; {
; Location: floatfuncs.jl:15
; Function signbit; {
; Location: int.jl:93
; Function <; {
; Location: int.jl:49
	testq	%rax, %rax
;}}}
	jns	L67
L46:
	vcmpltsd	%xmm1, %xmm0, %xmm0
	vblendvpd	%xmm0, %xmm2, %xmm4, %xmm0
	retq
L58:
	vmovapd	%xmm3, %xmm4
; Function signbit; {
; Location: floatfuncs.jl:15
; Function signbit; {
; Location: int.jl:93
; Function <; {
; Location: int.jl:49
	testq	%rax, %rax
;}}}
	js	L46
L67:
	vmovapd	%xmm3, %xmm4
	vcmpltsd	%xmm1, %xmm0, %xmm0
	vblendvpd	%xmm0, %xmm2, %xmm4, %xmm0
	retq
	nopw	%cs:(%rax,%rax)
;}

julia> @code_native fmax(2.3, 3.4)
	.text
; Function fmax {
; Location: REPL[29]:1
; Function max_fast; {
; Location: REPL[29]:1
	vmaxsd	%xmm0, %xmm1, %xmm0
;}
	retq
	nopw	%cs:(%rax,%rax)
;}

julia> @code_native sqrt(2.3)
	.text
; Function sqrt {
; Location: math.jl:492
; Function <; {
; Location: math.jl:492
	pushq	%rax
	vxorps	%xmm1, %xmm1, %xmm1
	vucomisd	%xmm0, %xmm1
;}
	ja	L17
; Location: math.jl:493
	vsqrtsd	%xmm0, %xmm0, %xmm0
	popq	%rax
	retq
; Location: math.jl:492
L17:
	movabsq	$throw_complex_domainerror, %rax
	movabsq	$139898210241920, %rdi  # imm = 0x7F3C97204D80
	callq	*%rax
	ud2
	nopl	(%rax)
;}

julia> @code_native fsqrt(2.3)
	.text
; Function fsqrt {
; Location: REPL[30]:1
; Function sqrt_fast; {
; Location: REPL[30]:1
	vsqrtsd	%xmm0, %xmm0, %xmm0
;}
	retq
	nopw	%cs:(%rax,%rax)
;}

In those cases, I think it could also help vectorize a for loop.
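For the loop in this thread, a sketch of what that could look like (euroOption3 is just a hypothetical variant, not something from above; the randn() call inside the loop will probably limit how much of it actually vectorizes, so the main thing the annotation buys here is permission to reassociate the psum accumulation and to use the plain max instruction):

function euroOption3(numSims, S, K, r, σ, T)
    S_adj = S * exp((r - σ^2.0/2.0)*T)
    psum = 0.0

    # annotating the whole loop lets LLVM reorder the floating-point sum
    # and lower max to a single vmaxsd, as in the fmax example above
    @fastmath for i in 1:numSims
        St = S_adj * exp(σ*sqrt(T)*randn())
        psum += max(St - K, 0.0)
    end

    price = psum / numSims * exp(-r*T)
    return price
end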

Thank you very much, that was thorough.