Fortran calling Julia. Julia 10x slower than Fortran
Dear Julia community,
I am a beginner programmer and I enjoyed learning and coding in Julia a lot.
I have been working on a Julia interface for Finite Element Analysis Program (FEAP), that is in Fortran, and we want to use Julia subroutines for material models. The end goal is to use the program Fortran + Julia in a HPC and the Julia functions will be called millions of times that is why performance is crucial.
I made a minimal working example of calling Julia from Fortran, by making some adaptations to the chapter Embedding Julia · The Julia Language. The Fortran program starts Julia and load the Julia Cfunctions, after that it calls cwrapper_foo1
(Julia Cfunction), then calls the equivalent Fortran subroutine fortran_foo1
that does the same operations, both functions are called n = 10^7 times.
Results by reading the system time and calculating the differences:
- Julia run: ~9.0 seconds
- Fortran run: ~0.5 seconds
(using valgrind
I get a similar ratio)
When benchmarking just the Julia code with BenchmarkTools.jl
, I get that around half of the time is spent in unsafe_load
and unsafe_store!
.
So my question is:
- Is there a better way to transfer or share the data between Fortran-Julia to get better performance?
- Does
jl_init()
andjl_atexit_hook(0)
be call once or every time the Julia Cfunction is called? - Is it possible to remove the
cwrapper
and call the Julia Cfunction directly from Fortran? - When Profiling, what
valgrind
shows is not understandable after calling a Julia function. What would be a proper way to profile Fortran/C + Julia?
Julia Cfunctions. File: cfunctions.jl
function foo1(
A::Ptr{Cdouble}, B::Ptr{Cdouble}, C::Ptr{Cdouble}, # input pointers
X::Ptr{Cdouble}, Y::Ptr{Cdouble}, # output pointers
N::Ptr{Cint}
)
# Loading parameters
# *************************************************************************************
n = unsafe_load(N, 1)
a1 = unsafe_load(A, 1)
a2 = unsafe_load(A, 2)
b1 = Array{Float64, 1}(undef, 6)
b2 = Array{Float64, 1}(undef, 6)
for i = 1:6
b1[i] = unsafe_load(B, i)
b2[i] = unsafe_load(B, i+6)
end # i
c1 = Array{Float64, 2}(undef, 3, 3)
c2 = Array{Float64, 2}(undef, 3, 3)
c3 = Array{Float64, 2}(undef, 3, 3)
c4 = Array{Float64, 2}(undef, 3, 3)
k = 0
for i = 1:3, j = 1:3
k = k + 1
c1[i,j] = unsafe_load(C, k + 9*0)
c2[i,j] = unsafe_load(C, k + 9*1)
c3[i,j] = unsafe_load(C, k + 9*2)
c4[i,j] = unsafe_load(C, k + 9*3)
end # i, j
# Calculations
# *************************************************************************************
E = a1
ν = a2
λ = E * ν / ((1 + ν) * (1 - 2 * ν))
μ = E / (2 * (1 + ν))
y = zeros(Float64, 6, 6)
for i = 1:3
y[i,i] = 2 * μ + λ
y[i+3,i+3] = μ
end # i
y[1,2] = λ
y[1,3] = λ
y[2,1] = λ
y[2,3] = λ
y[3,1] = λ
y[3,2] = λ
x = zeros(Float64, 6)
x[1] = (2 * μ + λ) * b1[1] + λ * b1[2] + λ * b1[3]
x[2] = (2 * μ + λ) * b1[2] + λ * b1[1] + λ * b1[3]
x[3] = (2 * μ + λ) * b1[3] + λ * b1[2] + λ * b1[1]
x[4] = 2 * μ * b1[4]
x[5] = 2 * μ * b1[5]
x[6] = 2 * μ * b1[6]
# Storing parameters
# *************************************************************************************
for i = 1:6
unsafe_store!(X, x[i], i)
end # i
k = 0
for i = 1:6, j = 1:6
k = k + 1
unsafe_store!(Y, y[i,j], k)
end # i, j
return nothing
end # foo1
const julia_foo1 = @cfunction(
foo1,
Cvoid,
(
Ptr{Cdouble}, Ptr{Cdouble}, Ptr{Cdouble},
Ptr{Cdouble}, Ptr{Cdouble},
Ptr{Cint},
)
)
C intermediate layer. File: c_layer.c
#include <julia.h>
JULIA_DEFINE_FAST_TLS()
#include <stdio.h>
#include <stdint.h>
void *get_cfunction_pointer(const char *name);
void (*c_foo1_)(double *a, double **b, double ***c, double *x, double **y, int *n);
int julia_initialize_(){
jl_init();
printf("Start Julia\n");
return 0;
}
int julia_exit_(){
jl_atexit_hook(0);
printf("Exit Julia\n");
return 0;
}
int julia_load_cfunctions_(){
printf("Loading Julia @cfunctions\n");
/* import the file cfunctions.jl */
jl_eval_string("Base.include(Main, \"cfunctions.jl\")");
/* convert Julia function to C function with @cfuntion */
c_foo1_ = get_cfunction_pointer("julia_foo1");
printf("Finish loading Julia @cfunction\n");
return 0;
}
void *get_cfunction_pointer(const char *name){
void *p = 0;
jl_value_t *boxed_pointer = jl_get_global(jl_main_module, jl_symbol(name));
p = jl_unbox_voidpointer(boxed_pointer);
return p;
}
int cwrapper_foo1_(double *a, double **b, double ***c, double *x, double **y, int *n){
c_foo1_(a, b, c, x, y, n);
return 0;
}
Fortran program. File: main.f
program main
implicit none
real (kind=8) :: a(2), b(6,2), c(3,3,4) ! input
real (kind=8) :: x(6), y(6,6), x1(6), y1(6,6) ! output
integer :: n
integer :: i,j,k
integer :: t1, t2, rate
real (kind=8) :: time
! Initialize parameters
n = 10**7
a = 0.0
b = 0.0
c = 0.0
x = 0.0
y = 0.0
x1 = 0.0
y1 = 0.0
a(1) = 1000
a(2) = 0.30
b(1,1) = 0.02
b(2,1) = -0.006
b(3,1) = -0.006
! Start Julia and load Julia Cfunctions
call julia_initialize()
call julia_load_cfunctions()
! Compile Julia function
call cwrapper_foo1(a, b, c, x, y, n)
write(*,*) 'n = ', n
! Julia loop
! *************************************************************
call system_clock(t1)
do i=1,n
! more Fortran code... lots lots of operations!!!
call cwrapper_foo1(a, b, c, x, y, n)
!call c_foo1(a, b, c, x, y, n) ! does not work atm
! more Fortran code...
end do ! i
call system_clock(t2, rate)
time = (t2 - t1) / float(rate)
write(*,*) 'julia run: ', time, 'seconds'
! Fortran loop
! *************************************************************
call system_clock(t1)
do i=1,n
! more Fortran code... many operations!!
call fortran_foo1(a, b, c, x1, y1, n)
! more Fortran code...
end do ! i
call system_clock(t2, rate)
time = (t2 - t1) / float(rate)
write(*,*) 'fortran run: ', time, 'seconds'
!write(*,*) x, y
!write(*,*) x1, y1
call julia_exit()
end program main
subroutine fortran_foo1(a, b, c, x, y, n)
implicit none
real (kind=8) :: a(2), b(6,2), c(3,3,4) ! input
real (kind=8) :: x(6), y(6,6) ! output
integer :: n
integer :: i,j,k
real (kind=8) :: E, nu, lambda, mu
E = a(1)
nu = a(2)
lambda = E*nu / ((1 + nu)*(1 - 2*nu))
mu = E / (2*(1 + nu))
do i=1,3
y(i,i) = 2*mu + lambda
y(i+3,i+3) = mu
end do ! i
y(1,2) = lambda
y(1,3) = lambda
y(2,1) = lambda
y(2,3) = lambda
y(3,1) = lambda
y(3,2) = lambda
x(1) = (2*mu + lambda)*b(1,1) + lambda*b(2,1) + lambda*b(3,1)
x(2) = (2*mu + lambda)*b(2,1) + lambda*b(1,1) + lambda*b(3,1)
x(3) = (2*mu + lambda)*b(3,1) + lambda*b(2,1) + lambda*b(1,1)
x(4) = 2*mu*b(4,1)
x(5) = 2*mu*b(5,1)
x(6) = 2*mu*b(6,1)
end subroutine fortran_foo1
makefile
FF=gfortran
CC=gcc
JL_SHARE = $(shell julia -e 'print(joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia"))')
CFLAGS += $(shell $(JL_SHARE)/julia-config.jl --cflags)
CXXFLAGS += $(shell $(JL_SHARE)/julia-config.jl --cflags)
LDFLAGS += $(shell $(JL_SHARE)/julia-config.jl --ldflags)
LDLIBS += $(shell $(JL_SHARE)/julia-config.jl --ldlibs)
all: main
main: main.o c_layer.o
$(FF) -o $@ $^ $(LDFLAGS) $(LDLIBS)
%.o: %.f
$(FF) -c $< -o $@ -g
%.o: %.c
$(CC) -c $(CFLAGS) $< -o $@ -g
.PHONY: main clean
clean:
rm -f main *.o
Julia Version 1.6.0
Commit f9720dc2eb (2021-03-24 12:55 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i5-6300HQ CPU @ 2.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-11.0.1 (ORCJIT, skylake)
Any other recommendation is welcome
Thank you very much in advance for the help!