Of course calling a macro to do some work is simpler than writing the code that does the work. The same is true if you call a function to do the work. Here is, once more, the sentence just before the part you quoted:
There is a fundamental increase in complexity when you replace “code that does x” with “macro code that takes code and rewrites it into code that does x”.
I’m talking from the perspective of the developer who must decide between writing a macro or not. Or the user who wants to understand what the code is doing, maybe to fix a bug.
So the right comparison is between
reshape(permutedims(stack((.^).(list, 2)), (2, 1, 3)), :, length(list))
# or
# Preallocate one column per element of `list`; each column is the flattened,
# (y fastest) permuted, squared element.  `similar(::Type, dims...)` builds an
# array of the elements' own array type.
out = similar(eltype(list), prod(size(list[1])), length(list))
for i in eachindex(list)
    elem = list[i]
    for x in axes(elem, 1)
        for y in axes(elem, 2)
            # Combined index y⊗x: y runs fastest, so the stride for x is the
            # number of y values, i.e. size(elem, 2).  (Using size(elem, 1)
            # here is only correct when elem is square.)
            out[y + size(elem, 2)*(x - 1), i] = elem[x, y]^2
        end
    end
end
and the macro implementation (as shown by `@less @cast colwise[y⊗x,i] := (list[i][x,y])^2`):
# Entry point for `@cast` (quoted from TensorCast.jl; indentation lost in the paste).
# All real work happens in `_macro`; this wrapper only packages up context.
macro cast(exs...)
# Record the calling module, source location, and the re-printed call text,
# so that later error messages can show the user's original expression.
call = CallInfo(__module__, __source__, TensorCast.unparse("@cast", exs...))
_macro(exs...; call=call)
end
# Core worker behind `@cast` (quoted from TensorCast.jl; indentation lost in the
# paste).  Takes up to three expressions from the macro call plus a `call`
# context, chooses the right parsing path, walks the RHS in several passes, and
# assembles the final quoted code to return from the macro.
function _macro(exone, extwo=nothing, exthree=nothing; call::CallInfo=CallInfo(), dict=Dict())
# Named collections into which the parse/build steps below push generated pieces
# (assertions, setup code, main statements); stitched together at the end.
store = (dict=dict, assert=[], mustassert=[], seen=[], need=[], top=[], main=[])
# TODO use OrderedDict() for main? To allow duplicate removal
if Meta.isexpr(exone, :macrocall)
# New style @cast @avx A[i] := B[i]
# Only this fixed whitelist of inner macros is accepted; anything else errors.
string(exone.args[1]) in ("@lazy", "@strided", "@avx", "@avxt", "@turbo", "@tturbo") || throw(MacroError(
"the macro $(exone.args[1]) isn't one of the ones this understands", call))
# Remember which inner macro was requested (name minus the '@') as a flag.
push!(call.flags, Symbol(string(exone.args[1])[2:end]), :premacro)
# args[3:end] skips the inner macro's name and LineNumberNode; recurse on the payload.
return _macro(exone.args[3:end]...; call=call, dict=dict)
end
if (:reduce in call.flags) || (:matmul in call.flags)
# First the options:
optionparse(exthree, store, call)
# Then the LHS, to get canonical list of indices:
canon, parsed = reduceparse(exone, extwo, store, call)
elseif containsindexing(extwo) # @cast A[i,j] := softmax(j) B[i,j]
push!(call.flags, :dimcast)
optionparse(exthree, store, call)
canon, parsed = reduceparse(exone, extwo, store, call)
else
# Simple @cast case:
isnothing(exthree) || throw(MacroError("too many expressions for @cast: $exthree", call))
optionparse(extwo, store, call)
canon, parsed = castparse(exone, store, call)
end
# First pass over RHS just to read sizes, prewalk sees A[i][j] before A[i]
MacroTools.prewalk(x -> rightsizes(x, store, call), parsed.right)
# To look for recursion, we need another prewalk. To find naked indices, this one stops early:
right2 = recursemacro(parsed.right, canon, store, call)
# Third pass to standardise & then glue, postwalk sees A[i] before A[i][j]
right3 = MacroTools.postwalk(x -> standardglue(x, canon, store, call), right2)
right3 = checkallseen(right3, canon, store, call)
if !(:matmul in call.flags)
# Then finally broadcasting if necc (or just permutedims etc. if not):
right4 = targetcast(right3, canon, store, call)
else
# or, for matmul, can I change just this? outputinplace is also going to need changing.
right4 = matmultarget(right3, canon, parsed, store, call)
end
# Return to LHS, build up what it requested:
if :inplace in call.flags
rightlist = inplaceoutput(right4, canon, parsed, store, call)
else
right5 = newoutput(right4, canon, parsed, store, call)
rightlist = [:( $(parsed.name) = $right5 )]
end
# Sew all these pieces into output:
outex = quote end
append!(outex.args, store.top)
append!(outex.args, findsizes(store, call)) # this must be run after newoutput() etc.
append!(outex.args, store.main)
append!(outex.args, rightlist)
if :recurse in call.flags
# NOTE(review): in the recursive case this hands back the raw pieces (name,
# outer indices, scalar flag, statements) for an enclosing call to splice in,
# rather than finished escaped code.
return (name=parsed.name, ind=parsed.outer, scalar=(:scalar in call.flags), steps=outex.args)
else
return esc(outex) # could use MacroTools.unblock()?
end
end
which generates the following (as shown by `@macroexpand`):
# Output of @macroexpand (quoted verbatim; comments added for this discussion):
quote
#= /home/j/.julia/packages/TensorCast/mQB8h/src/macro.jl:209 =#
# Input check, elided when bounds checking is turned off:
if $(Expr(:boundscheck))
list isa Tuple || (Base.ndims(list) == 1 || Base.throw(ArgumentError("expected a vector or tuple list[i]")))
end
# Axes of the outer container and of its first element (presumably uniform
# across elements — not re-checked here):
local (ax_i, ax_x, ax_y) = (Base.axes(list, 1), Base.axes(first(list), 1), Base.axes(first(list), 2))
# Stack the elements into a 3-d array and permute its dims to (2, 1, 3):
local var"##nobroadcast#294" = TensorCast.transmute(TensorCast.lazystack(list), Base.Val((2, 1, 3)))
# Broadcast the square, then fuse the first two axes into one column per i:
colwise = Base.reshape((^).(var"##nobroadcast#294", 2), (TensorCast.star(ax_y, ax_x), ax_i))
end
I think it’s a good example of macro pros and cons. In this case:
Pros:
- Super easy to use and powerful
- A single implementation covers a gazillion cases
Cons:
- Monumental increase in complexity. To verify the implementation one must understand the huge macro implementation and the regular code it generates.