`norm` function error on CuDeviceArray

The following code triggers an LLVM error when calling `norm` on a field of a custom struct inside a kernel:

```julia
using CUDA
using Adapt
using LinearAlgebra

struct TestType{T<:AbstractArray}
    a::T
    b::T
    c::Float32
end

# Teach Adapt.jl to convert TestType's array fields when moving to the GPU
Adapt.adapt_structure(to, x::TestType) = TestType(adapt(to, x.a), adapt(to, x.b), x.c)

# cudaconvert is normally applied by @cuda itself; calling it manually here
# makes `data` hold CuDeviceArrays
data = cudaconvert(cu(TestType([1., 2., 3.], [4., 5., 6., 7., 8.], 7.0f0)))

function kernel(data)
    @cuprintln(norm(data.a))
    return
end

@cuda kernel(data)
synchronize()
```

```
ERROR: LLVM error: Cannot select: 0x5fe96dc8: f64 = fpow 0x5fe907e8, 0x5fe98f88, math.jl:886 @[ promotion.jl:343 @[ C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.5\LinearAlgebra\src\generic.jl:514 ] ]
  0x5fe907e8: f64 = fp_extend 0x5fe96f68, float.jl:255 @[ number.jl:7 @[ promotion.jl:259 @[ promotion.jl:282 @[ promotion.jl:343 @[ C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.5\LinearAlgebra\src\generic.jl:514 ] ] ] ] ]
    0x5fe96f68: f32,ch = CopyFromReg 0x6077cc78, Register:f32 %28, float.jl:255 @[ number.jl:7 @[ promotion.jl:259 @[ promotion.jl:282 @[ promotion.jl:343 @[ C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.5\LinearAlgebra\src\generic.jl:514 ] ] ] ] ]
      0x5fe978c0: f32 = Register %28
  0x5fe98f88: f64 = sint_to_fp 0x5fe97990, float.jl:60 @[ number.jl:7 @[ C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.5\LinearAlgebra\src\generic.jl:513 ] ]
    0x5fe97990: i64,ch = CopyFromReg 0x6077cc78, Register:i64 %42, float.jl:60 @[ number.jl:7 @[ C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.5\LinearAlgebra\src\generic.jl:513 ] ]
      0x5fe905e0: i64 = Register %42
In function: julia_generic_normp_4183
Stacktrace:
 [1] handle_error(::Cstring) at C:\Users\skyair\.julia\packages\LLVM\dUfQc\src\core\context.jl:105
 [2] macro expansion at C:\Users\skyair\.julia\packages\LLVM\dUfQc\src\util.jl:114 [inlined]
 [3] LLVMTargetMachineEmitToMemoryBuffer(::LLVM.TargetMachine, ::LLVM.Module, ::LLVM.API.LLVMCodeGenFileType, ::Base.RefValue{Cstring}, ::Base.RefValue{Ptr{LLVM.API.LLVMOpaqueMemoryBuffer}}) at C:\Users\skyair\.julia\packages\LLVM\dUfQc\lib\libLLVM_h.jl:3512
 [4] emit(::LLVM.TargetMachine, ::LLVM.Module, ::LLVM.API.LLVMCodeGenFileType) at C:\Users\skyair\.julia\packages\LLVM\dUfQc\src\targetmachine.jl:44
 [5] mcgen(::GPUCompiler.CompilerJob, ::LLVM.Module, ::LLVM.Function, ::LLVM.API.LLVMCodeGenFileType) at C:\Users\skyair\.julia\packages\GPUCompiler\uTpNx\src\mcgen.jl:74
 [6] macro expansion at C:\Users\skyair\.julia\packages\TimerOutputs\ZmKD7\src\TimerOutput.jl:206 [inlined]
 [7] macro expansion at C:\Users\skyair\.julia\packages\GPUCompiler\uTpNx\src\driver.jl:252 [inlined]
 [8] macro expansion at C:\Users\skyair\.julia\packages\TimerOutputs\ZmKD7\src\TimerOutput.jl:206 [inlined]
 [9] codegen(::Symbol, ::GPUCompiler.CompilerJob; libraries::Bool, deferred_codegen::Bool, optimize::Bool, strip::Bool, validate::Bool, only_entry::Bool) at C:\Users\skyair\.julia\packages\GPUCompiler\uTpNx\src\driver.jl:248
 [10] compile(::Symbol, ::GPUCompiler.CompilerJob; libraries::Bool, deferred_codegen::Bool, optimize::Bool, strip::Bool, validate::Bool, only_entry::Bool) at C:\Users\skyair\.julia\packages\GPUCompiler\uTpNx\src\driver.jl:39
 [11] compile at C:\Users\skyair\.julia\packages\GPUCompiler\uTpNx\src\driver.jl:35 [inlined]
 [12] cufunction_compile(::GPUCompiler.FunctionSpec; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at C:\Users\skyair\.julia\packages\CUDA\BIYoG\src\compiler\execution.jl:310
 [13] cufunction_compile(::GPUCompiler.FunctionSpec) at C:\Users\skyair\.julia\packages\CUDA\BIYoG\src\compiler\execution.jl:305
 [14] check_cache(::Dict{UInt64,Any}, ::Any, ::Any, ::GPUCompiler.FunctionSpec{typeof(kernel),Tuple{TestType{CuDeviceArray{Float32,1,1}}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at C:\Users\skyair\.julia\packages\GPUCompiler\uTpNx\src\cache.jl:40
 [15] kernel at .\REPL[21]:2 [inlined]
 [16] cached_compilation at C:\Users\skyair\.julia\packages\GPUCompiler\uTpNx\src\cache.jl:65 [inlined]
 [17] cufunction(::typeof(kernel), ::Type{Tuple{TestType{CuDeviceArray{Float32,1,1}}}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at C:\Users\skyair\.julia\packages\CUDA\BIYoG\src\compiler\execution.jl:297
 [18] cufunction(::typeof(kernel), ::Type{Tuple{TestType{CuDeviceArray{Float32,1,1}}}}) at C:\Users\skyair\.julia\packages\CUDA\BIYoG\src\compiler\execution.jl:294
 [19] top-level scope at C:\Users\skyair\.julia\packages\CUDA\BIYoG\src\compiler\execution.jl:109
```

Is there any trick to solve or avoid this error?

This is a known issue: Julia codegen assumes forms of `fpow` that are not necessarily provided by the back-end. Once we have contextual dispatch we'll be able to selectively override these functions for the GPU back-end, but for now you need to duplicate code to prevent the bad invocations.
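
For example, here is a minimal sketch of that duplication, assuming the 2-norm is what you need: a hand-written `gpu_norm2` (a hypothetical helper for this example, not a CUDA.jl or LinearAlgebra API) that accumulates in the array's own element type and never reaches the `^`/fpow path in `LinearAlgebra.generic_normp` seen in the stacktrace. It reuses `data` from the snippet above:

```julia
using CUDA

# Hand-written 2-norm: stays in the element type (Float32 here) and uses
# sqrt instead of the `^` calls in LinearAlgebra's generic fallback.
function gpu_norm2(x)
    s = zero(eltype(x))
    @inbounds for i in eachindex(x)
        s += x[i] * x[i]   # sum of squares, no Float64 promotion
    end
    return sqrt(s)         # lowers to a sqrt intrinsic the PTX back-end can select
end

function kernel(data)
    @cuprintln(gpu_norm2(data.a))
    return
end

@cuda kernel(data)
synchronize()
```

The same approach works for any generic fallback that promotes to Float64 or calls `^`: copy just the method you need and keep the arithmetic in types the back-end supports.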