Here are my benchmarks, with two runs for each AD backend:
Turing.setadbackend(:forwarddiff)
Iterations = 1001:1:2000
Number of chains = 4
Samples per chain = 1000
Wall duration = 102.14 seconds
Compute duration = 361.99 seconds
Iterations = 1001:1:2000
Number of chains = 4
Samples per chain = 1000
Wall duration = 95.9 seconds
Compute duration = 333.32 seconds
Turing.setadbackend(:tracker)
Iterations = 1001:1:2000
Number of chains = 4
Samples per chain = 1000
Wall duration = 99.84 seconds
Compute duration = 331.42 seconds
Iterations = 1001:1:2000
Number of chains = 4
Samples per chain = 1000
Wall duration = 86.06 seconds
Compute duration = 298.54 seconds
Turing.setadbackend(:zygote)
# No result yet: I gave up after a couple of minutes and will let it run over the weekend.
Turing.setadbackend(:reversediff)
Turing.setrdcache(true)
Iterations = 1001:1:2000
Number of chains = 4
Samples per chain = 1000
Wall duration = 26.13 seconds
Compute duration = 101.67 seconds
Iterations = 1001:1:2000
Number of chains = 4
Samples per chain = 1000
Wall duration = 5.85 seconds
Compute duration = 20.05 seconds