Julia is surprisingly slow for some simple iteration

I am running the following code:

N=6144 # some not-so-small number
B=rand(N,N)
y=rand(N)
By=zeros(N)

@time for i=1:N
  By[i]=0.0
  for j=1:N
    By[i] += B[i,j]*y[i]
  end
end

On my laptop, the loop above takes about 30 seconds.

However, the equivalent code in C takes only about 0.1 seconds.

This is surprising, since I expected Julia to be almost as fast as C.
Is there anything wrong with my code?

See the first performance tip: Performance Tips · The Julia Language

Your code has variables at top-level (global) scope, and accessing non-constant globals is very slow because their types can change at any time. If you put your code into a function, it will be much faster.
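For example, here is a minimal sketch of that fix; matvec_rows! is just an illustrative name, and the loop body is unchanged:

function matvec_rows!(By, B, y)
    N = length(y)
    for i in 1:N
        By[i] = 0.0
        for j in 1:N
            By[i] += B[i, j] * y[i]
        end
    end
    return By
end

@time matvec_rows!(By, B, y)   # much faster than the same loop at global scope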


A second point is the order of nested loops. Julia stores arrays in column-major order, i.e. the innermost loop should iterate over the first index.

See Performance Tips · The Julia Language
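Concretely, here is a sketch of your loop with the two loops swapped so that the first index i is innermost; By now has to be zeroed up front, but the result is the same:

fill!(By, 0.0)          # zero the accumulator once, up front
for j in 1:N            # second index outermost
    for i in 1:N        # first index innermost: B[i, j] and B[i+1, j] are adjacent in memory
        By[i] += B[i, j] * y[i]
    end
end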

Well, in fact this code is indeed in a function.

"""
A nonlinear equation solver ported from my previous C code.
"""
module NLsolve_Broyden
	using Printf

	export NLEG_Broyden

	const ROLLBACK=1000000
	const RESCALE =100

	function NLEG_Broyden(func!,x; eps=1e-7, iter_Max=1000, prierr=true)
		T = eltype(x)
		Dim = length(x)
		N=Dim;

		norm::T =0.0;
		norm0::T=1.0e10;

		rollback::T=ROLLBACK;
		rescale::T=1.0;

		N_CONSTRUCT_QUASI_JACOB=5;
		TINY_SCALE::T=0.01;

		x0=zeros(T,N)
		f0=zeros(T,N)
		f1=zeros(T,N)
		s=zeros(T,N)
		y=zeros(T,N)
		sB=zeros(T,N)
		By=zeros(T,N)
		B=zeros(T,N,N)

		for i=1:N 
			for j=1:N 
				B[i,j]=0.0;
			end
			B[i,i]=1.0; 
			x0[i]=x[i];
		end

		func!(f0, x0);

		kk=1
		for k=1:iter_Max
			kk=k

			if k<N_CONSTRUCT_QUASI_JACOB
				rescale=TINY_SCALE;
			else
				rescale=1.0;
			end

			rescale_times=0;

	@label redo

			for i=1:N
				s[i]=0.0;
				if (k<N_CONSTRUCT_QUASI_JACOB+1)
					s[i]=-f0[i];
				else
					for j=1:N 
						s[i]=s[i]-B[i,j]*f0[j];
					end
				end
				x[i]=x0[i]+rescale*s[i];
			end

			func!(f1,x);

	
			norm=0.0
			for i=1:N 
				norm+=f1[i]^2
			end
			norm=sqrt(norm);


			if prierr
				@printf("%4d   Error=%e\n",k,norm);
				flush(stdout)
			end

			if  norm < eps  
				@goto endd;
			end

			if ((k>N_CONSTRUCT_QUASI_JACOB+1)&&(norm>rollback*norm0))
				rescale*=RESCALE;
				rescale_times+=1;

				if prierr
					@printf("it seems to diverge, now rollback: rescale=%f\n",rescale);
				end

				if (rescale_times>=5)
					@printf("could not converge to the decline direction since relax-factor becomes too small...\n");
					@goto endd;
				end

				@goto redo;
			end
			norm0=norm;

			for i=1:N 
				y[i]=(f1[i]-f0[i])/rescale;
			end

			# This is where the code is running very slow:
			for i=1:N 
				By[i]=0.0
				sB[i]=0.0
				for j=1:N 
					By[i]=By[i]+B[i,j]*y[j]
					sB[i]=sB[i]+s[j]*B[j,i]
				end
			end

			sBy = T(0.0);
			for i=1:N 
				sBy+=s[i]*By[i]
			end

			

			if sqrt(abs(sBy)) < 0.0001*eps
				@printf("pkTBkqk is too small absolutely in func NLEG_Broyden!\n");
			end

			for i=1:N 
				for j=1:N 
					B[i,j]=B[i,j]+(s[i]-By[i])*sB[j]/sBy;
				end
				x0[i]=x[i];
				f0[i]=f1[i];
			end

			

			if (k==iter_Max)
				@printf("iter_Max exceeded in NLEG_Broyden!\n");
			end
		end
	@label endd
		return (iter=kk, zero=x, success=(kk<iter_Max));
	end

end

This is code I ported from my C version,
but it is much slower than the C code.
It took me a lot of time to discover that it is this iteration that makes the code so slow.

But I don’t understand why.

It’s possible there is some type instability happening. Here is your original code in a function:

function foo!(B, y, By)
    N = length(y)
    for i in 1:N
        By[i] = 0.0
        for j in 1:N
            By[i] += B[i, j] * y[i]
        end
    end
end

And I get:

julia> @time foo!(B, y, By)
  0.956673 seconds

Now let’s make it better. First, I simply change B[i, j] to B[j, i], so that the inner loop walks down a column and matches Julia’s column-major storage. A sketch of the adjusted function (otherwise identical to foo! above):
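function foo!(B, y, By)
    N = length(y)
    for i in 1:N
        By[i] = 0.0
        for j in 1:N
            By[i] += B[j, i] * y[i]   # first index now varies in the inner loop
        end
    end
end

Then I have: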

julia> @time foo!(B, y, By)
  0.031369 seconds

Now let me rewrite it a little:

  • Instead of accessing y[i] and By[i] in the inner loop, I pull them out to the outer loop so only B needs to be accessed in the inner loop
  • You don’t need to multiply by y[i] on every inner iteration: since y[i] doesn’t depend on j, multiplying the finished sum by it once gives the same result
  • I use @inbounds @simd to enable SIMD and disable bounds checks. This requires me to be a little more strict with the types it accepts, and to have some checks at the top of the code.

The code is now

function foo2!(B::Matrix, y::Vector, By::Vector)
    N = length(y)
    size(B) == (N, N) || error("B must be N×N")
    @inbounds for i in eachindex(y, By)
        Byi = 0.0
        @simd for j in 1:N
            Byi += B[j, i]
        end
        By[i] = Byi * y[i]
    end
end

And the timing:

julia> @time foo2!(B, y, By)
  0.014844 seconds

Of course, you could also just write it simpler, like this:

function foo3!(B, y, By)
    columns = eachcol(B) 
    for i in eachindex(columns, y, By)
        By[i] = sum(columns[i]; init=zero(eltype(B))) * y[i]
    end
end

However, this will be somewhat slower due to the views and the pairwise summation strategy:

julia> @time foo3!(B, y, By)
  0.029927 seconds

Are you running the code from the console like this?

julia myscript.jl

If so, you will pay a significant startup and compilation cost on every run. It is usually better to start a single REPL session and run all your code there interactively.
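For example:

julia> include("myscript.jl")   # first run: pays startup and compilation cost

julia> include("myscript.jl")   # later runs in the same session are much faster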


Thank you for your detailed explanation!

It turns out that although the iteration was already inside a function,
I should further wrap it into smaller subfunctions.

And your advice improved the performance even more!

This again hints at type instability in your long function, by the way. When you wrap parts into subfunctions, you essentially create function barriers: even though the large function is type-unstable, each small subfunction can be type-stable (at the cost of a single runtime dispatch at the call site) and thus be compiled efficiently.
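Here is a minimal sketch of the pattern (the names and the artificial instability are illustrative, not from your solver):

function inner_kernel!(By, B, y)
    # Compiled and specialized for whatever concrete types it is called with
    for i in axes(B, 2)
        acc = zero(eltype(By))
        for j in axes(B, 1)
            acc += B[j, i] * y[j]
        end
        By[i] = acc
    end
    return By
end

function outer(n)
    # Type-unstable: the compiler cannot predict B's element type here
    B = rand(Bool) ? rand(n, n) : rand(Float32, n, n)
    y  = rand(eltype(B), n)
    By = zeros(eltype(B), n)
    # One dynamic dispatch at this call site; behind the barrier,
    # inner_kernel! runs fully specialized and fast
    return inner_kernel!(By, B, y)
end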