Optimize Vector128<long> multiplication for arm64 #104177
Conversation
@EgorBot -arm64 -profiler

```cs
using System.IO.Hashing;
using BenchmarkDotNet.Attributes;

public class Bench
{
    static readonly byte[] Data = new byte[1000000];

    [Benchmark]
    public byte[] BenchXxHash128()
    {
        XxHash128 hash = new();
        hash.Append(Data);
        return hash.GetHashAndReset();
    }
}
```
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
```cpp
case TYP_LONG:
case TYP_ULONG:
{
    assert(simdSize == 16);

    // Make op1 and op2 multi-use:
    GenTree* op1Dup = fgMakeMultiUse(&op1);
    GenTree* op2Dup = fgMakeMultiUse(&op2);

    // long left0 = op1.GetElement(0)
    // long left1 = op1.GetElement(1)
    GenTree* left0 = gtNewSimdGetElementNode(TYP_LONG, op1, gtNewIconNode(0), simdBaseJitType, 16);
    GenTree* left1 = gtNewSimdGetElementNode(TYP_LONG, op1Dup, gtNewIconNode(1), simdBaseJitType, 16);

    // long right0 = op2.GetElement(0)
    // long right1 = op2.GetElement(1)
    GenTree* right0 = gtNewSimdGetElementNode(TYP_LONG, op2, gtNewIconNode(0), simdBaseJitType, 16);
    GenTree* right1 = gtNewSimdGetElementNode(TYP_LONG, op2Dup, gtNewIconNode(1), simdBaseJitType, 16);

    // Vector128<long> vec = Vector128.Create(left0 * right0, left1 * right1)
    op1 = gtNewOperNode(GT_MUL, TYP_LONG, left0, right0);
    op2 = gtNewOperNode(GT_MUL, TYP_LONG, left1, right1);
    GenTree* vec = gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16);
    return gtNewSimdHWIntrinsicNode(TYP_SIMD16, vec, gtNewIconNode(1), op2, NI_AdvSimd_Insert,
                                    simdBaseJitType, 16);
}
```
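At the managed level, the new expansion is roughly the following decomposition (a sketch that mirrors the comments in the diff; `MultiplyViaScalars` is a hypothetical name for illustration, not actual JIT output):

```cs
using System.Runtime.Intrinsics;

// Sketch: extract both 64-bit lanes, multiply them as scalars,
// and reassemble the results into a vector.
static Vector128<long> MultiplyViaScalars(Vector128<long> op1, Vector128<long> op2)
{
    long left0  = op1.GetElement(0);
    long left1  = op1.GetElement(1);
    long right0 = op2.GetElement(0);
    long right1 = op2.GetElement(1);

    return Vector128.Create(left0 * right0, left1 * right1);
}
```

AdvSimd has no 64-bit element integer multiply, so the operation has to be synthesized either from scalar multiplies (as here) or from 32-bit partial products (as discussed further below).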
Is this just avoiding the cost of inlining, unrolling, and simplifying the work the JIT would have to do?
Benchmark results on Arm64
Flame graphs: Main vs PR 🔥
I wanted to ask: is there a reason LLVM's codegen variant did not work? On some cores, …
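For context on that variant (my summary, hedged): NEON lacks a 64-bit element `mul`, so an all-SIMD lowering has to assemble each 64-bit product from 32-bit partial products. A minimal scalar sketch of the identity such a lowering relies on (`MulViaHalves` is a hypothetical illustration, not LLVM's actual instruction sequence):

```cs
// Each 64-bit product is built from 32-bit halves; the high*high term
// vanishes modulo 2^64.
static ulong MulViaHalves(ulong a, ulong b)
{
    ulong aLo = (uint)a, aHi = a >> 32;
    ulong bLo = (uint)b, bHi = b >> 32;
    return aLo * bLo + ((aLo * bHi + aHi * bLo) << 32);
}
```

Vectorized, that costs several 32-bit multiplies, shuffles, and widening adds per register, which is presumably the trade-off being weighed against the scalar-`mul` route taken here.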
src/coreclr/jit/gentree.cpp (Outdated)
nit: Use `gtNewSimdWithElementNode(type, vec, gtNewIconNode(1), op2, simdBaseJitType, simdSize)`, which ensures all the optimal handling takes place.
```cpp
op1 = gtNewBitCastNode(TYP_LONG, op1);
op2 = gtNewBitCastNode(TYP_LONG, op2);
```
Why bitcast instead of `ToScalar`? If this is generating better code, it seems like a pretty "core" scenario we're not handling from the `ToScalar` path.
@tannergooding because op2 can be either an 8-byte `TYP_SIMD8` or an 8-byte scalar (`TYP_LONG`), so the bitcast allowed me to simplify the handling. In my initial version I forgot that this path is used for both `MUL(vector, vector)` and `MUL(vector, scalar)` (where the scalar is broadcast).
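To illustrate the two shapes this path sees (a hedged sketch using public Vector128 surface area):

```cs
using System.Runtime.Intrinsics;

Vector128<long> a = Vector128.Create(3L, 4L);
Vector128<long> b = Vector128.Create(5L, 6L);

Vector128<long> vv = a * b;  // MUL(vector, vector)
Vector128<long> vs = a * 7L; // MUL(vector, scalar): the scalar operand is broadcast
```

In the second form, op2 may reach this code as an 8-byte scalar (`TYP_LONG`) rather than a `TYP_SIMD8`, which is why a type-agnostic bitcast was more convenient than `ToScalar`.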
Follow-up to #103555 for arm64.