Ref.: ipynb file
1. Original version with smaller org_sized = rand(Float32, (401, 401)) * 60
height, width = 187, 746;
#org_sized = rand(Float32, (2001, 2001)) * 60;
org_sized = rand(Float32, (401, 401)) * 60;
shadow_time_hrs = zeros(Float32, size(org_sized));
height_mat = rand(Float32, (height, width)) * 100; # originally values getting larger from (0, width//2) to the outside with the distance squared
angle_mat = round.(Int32, 2 .* atand.(0:height-1, (0:width-1)' .-(width/2-1))) .+ 1;
enlarged_size = zeros(eltype(org_sized), size(org_sized, 1) + height, size(org_sized, 2) + width);
enlarged_size[1:size(org_sized, 1), range(Int(width/2), length=size(org_sized, 2))] = org_sized;
weights = rand(Float32, 361)/ 10 .+ 0.01; # weights originally larger in the middle
function computeSumHours(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x)
height, width = size(height_mat)
short_elevations = @view enlarged_size[y:y+height, x:x+width]
shadowed_segments = zeros(eltype(weights), 361)
for x2 in 1:width
for y2 in 1:height
overshadowed = (short_elevations[y2, x2] - org_sized[y, x]) > height_mat[y2, x2]
if overshadowed
angle = angle_mat[y2, x2]
if shadowed_segments[angle] == 0.0
shadowed_segments[angle] = weights[angle]
end
end
end
end
return sum(shadowed_segments)
end
function computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights)
for x in 1:size(org_sized, 2) - 1
for y in 1:size(org_sized, 1) - 1
shadow_time_hrs[x, y] = computeSumHours(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x)
end
end
return shadow_time_hrs
end
@time result = computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
@time result = computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
@time result = computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
r0 = result;
Timings:
28.222513 seconds (358.24 k allocations: 480.214 MiB, 0.23% gc time, 0.22% compilation time)
28.034453 seconds (160.00 k allocations: 468.750 MiB, 0.18% gc time)
27.827226 seconds (160.00 k allocations: 468.750 MiB, 0.17% gc time)
2. Pre-allocated version with weights = Float32.(weights)
(The eltype of the original weights
is Float64
.)
weights = Float32.(weights)
function computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x, shadowed_segments)
height, width = size(height_mat)
short_elevations = @view enlarged_size[y:y+height, x:x+width]
shadowed_segments .= 0
@inbounds for x2 in 1:width
for y2 in 1:height
overshadowed = (short_elevations[y2, x2] - org_sized[y, x]) > height_mat[y2, x2]
if overshadowed
angle = angle_mat[y2, x2]
if shadowed_segments[angle] == 0.0
shadowed_segments[angle] = weights[angle]
end
end
end
end
return sum(shadowed_segments)
end
function computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights)
shadowed_segments = zeros(eltype(weights), 361)
for x in 1:size(org_sized, 2) - 1
for y in 1:size(org_sized, 1) - 1
shadow_time_hrs[x, y] = computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x, shadowed_segments)
end
end
return shadow_time_hrs
end
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
r1 = result;
Timings (~30% faster):
20.481910 seconds (207.30 k allocations: 12.225 MiB, 0.29% compilation time)
20.319880 seconds (1 allocation: 1.594 KiB)
20.339460 seconds (1 allocation: 1.594 KiB)
3. Multi-thread version
weights = Float32.(weights)
function computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x, shadowed_segments)
height, width = size(height_mat)
short_elevations = @view enlarged_size[y:y+height, x:x+width]
shadowed_segments .= 0
Threads.@threads for x2 in 1:width
for y2 in 1:height
@inbounds overshadowed = (short_elevations[y2, x2] - org_sized[y, x]) > height_mat[y2, x2]
@inbounds if overshadowed
angle = angle_mat[y2, x2]
if shadowed_segments[angle] == 0.0
shadowed_segments[angle] = weights[angle]
end
end
end
end
return sum(shadowed_segments)
end
function computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights)
shadowed_segments = zeros(eltype(weights), 361)
for x in 1:size(org_sized, 2) - 1
for y in 1:size(org_sized, 1) - 1
shadow_time_hrs[x, y] = computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x, shadowed_segments)
end
end
return shadow_time_hrs
end
@show Threads.nthreads()
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights);
r2 = result;
Timings (~ 5 times faster):
Threads.nthreads() = 12
5.817959 seconds (10.12 M allocations: 997.270 MiB, 3.60% gc time, 1.31% compilation time)
5.754861 seconds (9.95 M allocations: 987.269 MiB, 3.29% gc time)
5.781787 seconds (9.95 M allocations: 987.292 MiB, 3.26% gc time)
In this case, we can easily parallelize the code by essentially just adding the Threads.@threads macro!
4. Consistency
r0 ≈ r1 ≈ r2
true