Is it possible to force the use of Int32 instead of Int64?

I’d discourage that. The 32-bit builds of Julia can use only 8 floating point registers,
whereas x86_64 systems all have 16 floating point registers (or 32 with AVX-512).

This will make a lot of code run slower: register spills become far more frequent and severe, and the smaller blocking parameters mean that far more loads/stores are needed in general.

EDIT:
Here is an example of how extreme this can be, using the official 64-bit Julia 1.4.1 binary:

julia> using LoopVectorization, BenchmarkTools, LinearAlgebra

julia> BLAS.set_num_threads(1)

julia> M = K = N = 120;

julia> A = rand(M,K); B = rand(K,N); C = rand(M,N); C2 = A * B;

julia> function AmulB!(C, A, B)
           @avx for n ∈ axes(C,2), m ∈ axes(C,1)
               Cmn = zero(eltype(C))
               for k ∈ axes(B,1)
                   Cmn += A[m,k] * B[k,n]
               end
               C[m,n] = Cmn
           end
       end
AmulB! (generic function with 1 method)

julia> @benchmark AmulB!($C,$A,$B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     34.795 μs (0.00% GC)
  median time:      34.931 μs (0.00% GC)
  mean time:        35.010 μs (0.00% GC)
  maximum time:     90.112 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark mul!($C2,$A,$B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     81.226 μs (0.00% GC)
  median time:      81.454 μs (0.00% GC)
  mean time:        81.670 μs (0.00% GC)
  maximum time:     127.942 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> C ≈ C2
true

julia> versioninfo()
Julia Version 1.4.1
Commit 381693d3df* (2020-04-14 17:20 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Core(TM) i9-7900X CPU @ 3.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-8.0.1 (ORCJIT, skylake)

The official 32-bit version, on the same machine:

julia> using LoopVectorization, BenchmarkTools, LinearAlgebra

julia> BLAS.set_num_threads(1)

julia> M = K = N = 120;

julia> A = rand(M,K); B = rand(K,N); C = rand(M,N); C2 = A * B;

julia> function AmulB!(C, A, B)
           @avx for n ∈ axes(C,2), m ∈ axes(C,1)
               Cmn = zero(eltype(C))
               for k ∈ axes(B,1)
                   Cmn += A[m,k] * B[k,n]
               end
               C[m,n] = Cmn
           end
       end
AmulB! (generic function with 1 method)

julia> @benchmark AmulB!($C,$A,$B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     72.075 μs (0.00% GC)
  median time:      72.274 μs (0.00% GC)
  mean time:        72.468 μs (0.00% GC)
  maximum time:     125.262 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark mul!($C2,$A,$B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     235.576 μs (0.00% GC)
  median time:      235.826 μs (0.00% GC)
  mean time:        236.387 μs (0.00% GC)
  maximum time:     280.883 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> C ≈ C2
true

julia> versioninfo()
Julia Version 1.4.1
Commit 381693d3df* (2020-04-14 17:20 UTC)
Platform Info:
  OS: Linux (i686-pc-linux-gnu)
  CPU: Intel(R) Core(TM) i9-7900X CPU @ 3.30GHz
  WORD_SIZE: 32
  LIBM: libopenlibm
  LLVM: libLLVM-8.0.1 (ORCJIT, skylake)
2 Likes