I’m making a project for my university i have many problems with julia but this i can’t understand. The problem is local_d = @ private(1) i try also to write local_d = 1 but same error, if i write @ private local_d = 1
PS. the space between @ keyword is because the site recognize as a mention
— Stactrace local_d = @ private(1)
ERROR: LoadError: UndefVarError: local_d
not defined in Main
Suggestion: check for spelling errors or missing imports.
Stacktrace:
[1] cpu_scan_local_kernel!
@ ~/.julia/packages/KernelAbstractions/lGrz7/src/KernelAbstractions.jl:307 [inlined]
[2] cpu_scan_local_kernel!(ctx::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.NoDynamicCheck, CartesianIndex{1}, Nothing, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, Nothing, Nothing}}, flags::Vector{Int8}, scan_out::Vector{Int32}, N::Int64)
@ Main ./none:0
[3] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, typeof(cpu_scan_local_kernel!)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, Nothing, Nothing}, args::Tuple{Vector{Int8}, Vector{Int32}, Int64}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/lGrz7/src/cpu.jl:145
[4] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, typeof(cpu_scan_local_kernel!)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, Nothing, Nothing}, args::Tuple{Vector{Int8}, Vector{Int32}, Int64}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck, static_threads::Bool)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/lGrz7/src/cpu.jl:112
[5] (::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, typeof(cpu_scan_local_kernel!)})(::Vector{Int8}, ::Vararg{Any}; ndrange::Nothing, workgroupsize::Nothing)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/lGrz7/src/cpu.jl:46
[6] (::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, typeof(cpu_scan_local_kernel!)})(::Vector{Int8}, ::Vararg{Any})
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/lGrz7/src/cpu.jl:39
[7] gpu_findall_positive!(input::Vector{Int8})
@ Main ~/Documenti/julia/findall.jl:198
[8] main()
@ Main ~/Documenti/julia/findall.jl:235
[9] top-level scope
@ ~/Documenti/julia/findall.jl:241
— Stactrace local_d = 1
ERROR: LoadError: MethodError: no method matching setindex!(::NTuple{256, Int64}, ::Int64, ::Int64)
The function setindex!
exists, but no method is defined for this combination of argument types.
Stacktrace:
[1] macro expansion
@ ~/Documenti/julia/findall.jl:51 [inlined]
[2] cpu_scan_local_kernel!
@ ~/.julia/packages/KernelAbstractions/lGrz7/src/KernelAbstractions.jl:306 [inlined]
[3] cpu_scan_local_kernel!(ctx::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.NoDynamicCheck, CartesianIndex{1}, Nothing, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, Nothing, Nothing}}, flags::Vector{Int8}, scan_out::Vector{Int32}, N::Int64)
@ Main ./none:0
[4] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, typeof(cpu_scan_local_kernel!)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, Nothing, Nothing}, args::Tuple{Vector{Int8}, Vector{Int32}, Int64}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/lGrz7/src/cpu.jl:145
[5] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, typeof(cpu_scan_local_kernel!)}, ndrange::Nothing, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.StaticSize{(1,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, Nothing, Nothing}, args::Tuple{Vector{Int8}, Vector{Int32}, Int64}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck, static_threads::Bool)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/lGrz7/src/cpu.jl:112
[6] (::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, typeof(cpu_scan_local_kernel!)})(::Vector{Int8}, ::Vararg{Any}; ndrange::Nothing, workgroupsize::Nothing)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/lGrz7/src/cpu.jl:46
[7] (::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.StaticSize{(256,)}, typeof(cpu_scan_local_kernel!)})(::Vector{Int8}, ::Vararg{Any})
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/lGrz7/src/cpu.jl:39
[8] gpu_findall_positive!(input::Vector{Int8})
@ Main ~/Documenti/julia/findall.jl:199
[9] main()
@ Main ~/Documenti/julia/findall.jl:236
[10] top-level scope
@ ~/Documenti/julia/findall.jl:242
in expression starting at /home/dark/Documenti/julia/findall.jl:242
CODE:
#ENV[“JULIA_DEBUG”] = “AMDGPU”
using KernelAbstractions
using AMDGPU
using Cthulhu
if false
const KADevice = ROCBackend()
else
const KADevice = CPU()
end
const GROUP_SIZE = 256
-------------------------------
Kernel 1: Costruisce flags binari
@kernel function mark_positive_kernel!(input, flags, N)
i = @index(Global)
if i <= N
flags[i] = input[i] > 0 ? 1 : 0
#@print("Thread ", i, ": input = ", input[i], ", flag = ", flags[i], "\n")
end
end
-------------------------------
Kernel 2: Scan parziale in blocco
@ kernel function scan_local_kernel!(flags, scan_out, N)
i = @index(Global)
lidx = @index(Local)
@private group_sz = @groupsize()[1]
#group_sz = group_sz[1]
temp = @localmem(Int8, GROUP_SIZE) # o group_sz
if i <= N
temp[lidx] = flags[i]
else
temp[lidx] = 0
end
@synchronize()
local_d = @private(1)
@print("DEBUG: d:", typeof(local_d), "\n")
while local_d < group_sz
@synchronize()
if lidx > local_d
temp[lidx] += temp[lidx - local_d]
end
local_d *= 2
@print("temp = ", typeof(temp), "\n")
end
@synchronize()
if i <= N
scan_out[i] = temp[lidx]
end
@print("FInito")
end
-------------------------------
Kernel 3: Estrai somma per blocco
@ kernel function extract_block_sums!(scan_out, block_sums, N)
gidx = @index(Group)
lidx = @index(Local)
group_sz = @groupsize()
group_sz = group_sz[1]
last_thread = (gidx - 1) * group_sz + group_sz
if lidx == group_sz
block_sums[gidx] = (last_thread <= N) ? scan_out[last_thread] : scan_out[N]
end
end
-------------------------------
Kernel 4: Scan degli offset
@ kernel function scan_block_sums_kernel!(block_sums, block_offsets, num_blocks)
lidx = @index(Local)
i = lidx
group_sz = @groupsize()
group_sz = group_sz[1]
temp = @localmem(Int32, group_sz)
if i <= num_blocks
temp[lidx] = block_sums[i]
else
temp[lidx] = 0
end
@synchronize()
d = 1
while d < group_sz
@synchronize()
if lidx > d
temp[lidx] += temp[lidx - d]
end
d *= 2
end
@synchronize()
if i <= num_blocks
block_offsets[i] = (i == 1) ? 0 : temp[i - 1]
end
end
-------------------------------
Kernel 5: Aggiunge offset per blocco
@ kernel function apply_block_offsets!(scan_out, block_offsets, N)
i = @index(Global)
group_sz = @groupsize()
group_sz = group_sz[1]
gidx = (i - 1) ÷ group_sz + 1
if i <= N
scan_out[i] += block_offsets[gidx]
end
end
-------------------------------
Kernel 6: Costruisce output finale
@ kernel function findall_gpu(flags, scan_out, output_indices, N)
i = @index(Global)
if i <= N && flags[i] == 1
out_idx = scan_out[i]
output_indices[out_idx] = i
end
end
function gpu_findall_positive!(input)
N = length(input)
num_blocks = cld(N, GROUP_SIZE)
flags = KernelAbstractions.zeros(KADevice, Int8, N)
scan_out = KernelAbstractions.zeros(KADevice, Int32, N)
block_sums = KernelAbstractions.zeros(KADevice, Int32, num_blocks)
block_offsets = KernelAbstractions.zeros(KADevice, Int32, num_blocks)
# 1. Costruisci flags
global_size = cld(N, GROUP_SIZE) * GROUP_SIZE
mark_positive_kernel!(KADevice, global_size, GROUP_SIZE)(input, flags, N)
KernelAbstractions.synchronize(KADevice)
# 2. Scan per blocco
scan_local_kernel!(KADevice, global_size, GROUP_SIZE)(flags, scan_out, N)
KernelAbstractions.synchronize(KADevice)
# 3. Estrai somma per blocco
global_size_blocks = num_blocks * GROUP_SIZE
extract_block_sums!(KADevice, global_size_blocks, GROUP_SIZE)(scan_out, block_sums, N)
# 4. Scan dei block_sums
global_size_sums = cld(num_blocks, GROUP_SIZE) * GROUP_SIZE
scan_block_sums_kernel!(KADevice, global_size_sums, GROUP_SIZE)(block_sums, block_offsets, num_blocks)
# 5. Aggiungi offset globale
apply_block_offsets!(KADevice, global_size, GROUP_SIZE)(scan_out, block_offsets, N)
# 6. Alloca e riempi output
total_found = KernelAbstractions.zeros(KADevice, Int32, 1)
KernelAbstractions.copyto!(KADevice, total_found, scan_out[N:N]) # scan_out[end:end]
wait(total_found)
count = Int32(Array(total_found)[1])
output_indices = KernelAbstractions.zeros(Int32, count)
findall_gpu(KADevice, global_size, GROUP_SIZE)(flags, scan_out, output_indices, N)
return output_indices
end
function main()
print("a")
vals = Array(Int8[-1, 0, 1, 2, -2, 1, 0, 2])
has_burning_neibs = KADevice isa CPU ? vals : ROCArray(vals)
print("b")
result = gpu_findall_positive!(has_burning_neibs)
KernelAbstractions.synchronize(KADevice)
print("c")
print(Array(result)[1])
end
main()
Sorry for the bad formatting and my english…