Conversation
Pull request overview
This PR updates the System.IO.Hashing Adler-32 implementation to use hardware intrinsics for vectorized processing (where available) and extends the test suite to validate correctness across a wider set of input shapes.
Changes:
- Add SIMD-accelerated Adler-32 update paths for Arm (AdvSimd) and x86 (Vector128 / AVX2 / AVX-512 BW), with scalar fallback.
- Refactor the scalar update to share the ModBase/NMax constants and introduce a scalar tail helper for the vector paths.
- Add new test coverage for many input lengths, max-byte data, and incremental append chunking, validated against a reference Adler-32 implementation (a sketch of that reference follows below).
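For context, Adler-32 keeps two running sums modulo 65521: s1, the sum of all bytes seeded with 1, and s2, the sum of every intermediate s1. The sketch below restates that definition in the per-byte-reduction style the tests' reference implementation is described as using; the method name is illustrative and not part of the PR.

static uint Adler32Reference(ReadOnlySpan<byte> data)
{
    const uint ModBase = 65521; // largest prime below 2^16
    uint s1 = 1;                // running byte sum, seeded with 1
    uint s2 = 0;                // sum of every intermediate s1

    foreach (byte b in data)
    {
        s1 = (s1 + b) % ModBase;
        s2 = (s2 + s1) % ModBase;
    }

    return (s2 << 16) | s1;     // s2 in the high 16 bits, s1 in the low 16 bits
}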
Reviewed changes
Copilot reviewed 2 out of 2 changed files in this pull request and generated 1 comment.
| File | Description |
|---|---|
| src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs | Introduces vectorized Adler-32 update implementations (Arm/x86) with feature detection and scalar fallback. |
| src/libraries/System.IO.Hashing/tests/Adler32Tests.cs | Adds reference-based tests targeting multiple length boundaries, overflow-stressing inputs, and incremental append behavior. |
Tagging subscribers to this area: @dotnet/area-system-io-hashing, @bartonjs, @vcsjones |
Force-pushed from 81edcb0 to 04d2d15
🤖 Copilot Code Review — PR #124409

Holistic Assessment

Motivation: The Adler-32 implementation added in #123601 was scalar-only. Adler-32 is used in zlib/deflate, so vectorization is a well-motivated performance improvement for a hot-path algorithm. The PR is justified.

Approach: The implementation follows the well-known approach from zlib/chromium of decomposing the Adler-32 s2 weighted sum into position-weighted sums computed via SIMD multiply-add intrinsics, with a prefix-sum accumulator tracking inter-block s1 contributions. Four vectorized paths are provided (AdvSimd, SSE2/SSSE3, AVX2, AVX-512BW) plus a widening fallback for Vector128 without SSSE3. This matches the pattern used by other vectorized hash implementations in this library (Crc32.Vectorized.cs, Crc64.Vectorized.cs, XxHashShared.cs).

Summary: ✅ LGTM. After extensive mathematical verification of all four vectorized paths (tracing the s1/s2 accumulation through multi-block examples), the code is correct. The test coverage is thorough with a reference implementation, and the patterns are consistent with the rest of the codebase. One minor suggestion below. No blocking issues found.

Detailed Findings

✅ Vectorized s2 Accumulation — Verified correct across all paths
I traced through the prefix-sum logic (the vps/vs3 accumulators) across multi-block examples for each path.
✅ Vector512 Weight Correction — Correct
The lower-half SumAbsoluteDifferences shifted left by 5 compensates for the weights vector repeating 32..1 instead of running 64..33 over the first 32 bytes of each 64-byte block.

✅ Vector128 Non-SSSE3 Fallback — Correct and overflow-safe
The widening fallback sums widened ushort/int intermediates before accumulating into uint lanes, so nothing can overflow within an NMax-bounded chunk.

✅ Endianness Guard — Appropriate
The vectorized paths are only taken when BitConverter.IsLittleEndian, consistent with the other vectorized hashes in this library.

✅ AVX2 Lane Independence — Verified
✅ Test Coverage — Thorough
Tests cover: (1) various lengths hitting all vector width boundaries and tail transitions, (2) all-0xFF stress testing for accumulator overflow safety, (3) incremental append with varying chunk sizes, all validated against a clean reference implementation that applies modular reduction per byte. Good use of

💡 Missing benchmark data
The PR description doesn't include benchmark numbers. While the algorithmic improvement is well-established from zlib implementations, having BenchmarkDotNet results would help quantify the speedup for the .NET implementation specifically. This is a nice-to-have for the PR description, not a blocker — the approach is proven.
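To make the accumulation described above concrete, the standard block decomposition (zlib-style algebra, not text from the PR) for a block of $n$ bytes $b_0, \dots, b_{n-1}$ starting from state $(s_1, s_2)$ is:

$$
s_1' = s_1 + \sum_{i=0}^{n-1} b_i,
\qquad
s_2' = s_2 + n \cdot s_1 + \sum_{i=0}^{n-1} (n - i)\, b_i
\pmod{65521}.
$$

The weighted sum is what the multiply-add intrinsics compute with the 32..1 weight vectors (n = 32, or n = 64 plus a correction in the AVX-512 path), and the n·s1 term, carried across consecutive blocks, is what the vps/vs3 prefix-sum accumulators track before the final << 5 or << 6 scaling.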
@EgorBot -amd -intel -arm
On my machine with AVX2... Before:
After:
using System.IO.Hashing;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
BenchmarkSwitcher.FromAssembly(typeof(Bench).Assembly).Run(args);
[MemoryDiagnoser]
public class Bench
{
private byte[] _bytes;
[Params(1, 10, 1000, 10_000)]
public int Size { get; set; }
[GlobalSetup]
public void Setup()
{
_bytes = new byte[Size];
for (int i = 0; i < Size; i++)
{
_bytes[i] = (byte)('a' + (i % 26));
}
}
[Benchmark]
public uint Adler() => Adler32.HashToUInt32(_bytes);
}
if (BitConverter.IsLittleEndian &&
    Vector128.IsHardwareAccelerated &&
    source.Length >= Vector128<byte>.Count * 2)
Could this be put in a CanBeVectorized helper just like in Crc32.Vectorized.cs (and maybe put the vectorized code in Adler32.Vectorized.cs)?
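For illustration, a helper in the spirit of Crc32.Vectorized.cs could look roughly like the sketch below; the name CanBeVectorized and the exact threshold are assumptions based on that file's pattern, not code from this PR.

// Hypothetical helper mirroring the Crc32.Vectorized.cs pattern; not part of this PR.
private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
    BitConverter.IsLittleEndian &&
    Vector128.IsHardwareAccelerated &&
    source.Length >= Vector128<byte>.Count * 2;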
[MethodImpl(MethodImplOptions.NoInlining)]
private static uint UpdateVector128(uint adler, ReadOnlySpan<byte> source)
{
    Debug.Assert(source.Length >= Vector128<byte>.Count * 2);

    const int BlockSize = 32; // two Vector128<byte> loads

    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    ref byte sourceRef = ref MemoryMarshal.GetReference(source);
    int length = source.Length;

    Vector128<sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    Vector128<sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);

    do
    {
        int n = Math.Min(length, NMax);
        int blocks = n / BlockSize;
        n = blocks * BlockSize;
        length -= n;

        Vector128<uint> vs1 = Vector128<uint>.Zero;
        Vector128<uint> vs2 = Vector128.CreateScalar(s2);
        Vector128<uint> vps = Vector128.CreateScalar(s1 * (uint)blocks);
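
        // vs2 carries the running s2; vps is seeded with s1 * blocks and grows by vs1 (the byte
        // prefix sum) once per block, so the final "vps << 5" adds the 32 * s1 contribution that
        // each 32-byte block owes to s2.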
        do
        {
            Vector128<byte> bytes1 = Vector128.LoadUnsafe(ref sourceRef);
            Vector128<byte> bytes2 = Vector128.LoadUnsafe(ref sourceRef, 16);
            sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize);

            vps += vs1;

            if (Ssse3.IsSupported)
            {
                vs1 += Sse2.SumAbsoluteDifferences(bytes1, Vector128<byte>.Zero).AsUInt32();
                vs1 += Sse2.SumAbsoluteDifferences(bytes2, Vector128<byte>.Zero).AsUInt32();

                vs2 += Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(bytes1, tap1), Vector128<short>.One).AsUInt32();
                vs2 += Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(bytes2, tap2), Vector128<short>.One).AsUInt32();
            }
            else
            {
                (Vector128<ushort> lo1, Vector128<ushort> hi1) = Vector128.Widen(bytes1);
                (Vector128<ushort> lo2, Vector128<ushort> hi2) = Vector128.Widen(bytes2);
                (Vector128<uint> sumLo, Vector128<uint> sumHi) = Vector128.Widen(lo1 + hi1 + lo2 + hi2);
                vs1 += sumLo + sumHi;
                vs2 += WeightedSumWidening128(bytes1, tap1) + WeightedSumWidening128(bytes2, tap2);

                [MethodImpl(MethodImplOptions.AggressiveInlining)]
                static Vector128<uint> WeightedSumWidening128(Vector128<byte> data, Vector128<sbyte> weights)
                {
                    (Vector128<ushort> dLo, Vector128<ushort> dHi) = Vector128.Widen(data);
                    (Vector128<short> wLo, Vector128<short> wHi) = Vector128.Widen(weights);

                    (Vector128<int> pLo1, Vector128<int> pHi1) = Vector128.Widen(dLo.AsInt16() * wLo);
                    (Vector128<int> pLo2, Vector128<int> pHi2) = Vector128.Widen(dHi.AsInt16() * wHi);

                    return (pLo1 + pHi1 + pLo2 + pHi2).AsUInt32();
                }
            }
        }
        while (--blocks > 0);

        vs2 += vps << 5;

        s1 += Vector128.Sum(vs1);
        s2 = Vector128.Sum(vs2);

        s1 %= ModBase;
        s2 %= ModBase;
    }
    while (length >= BlockSize);

    if (length > 0)
    {
        UpdateScalarTail(ref sourceRef, length, ref s1, ref s2);
    }

    return (s2 << 16) | s1;
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static uint UpdateVector256(uint adler, ReadOnlySpan<byte> source)
{
    Debug.Assert(source.Length >= Vector256<byte>.Count);

    const int BlockSize = 32;

    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    ref byte sourceRef = ref MemoryMarshal.GetReference(source);
    int length = source.Length;

    Vector256<sbyte> weights = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);

    do
    {
        int n = Math.Min(length, NMax);
        int blocks = n / BlockSize;
        n = blocks * BlockSize;
        length -= n;

        Vector256<uint> vs1 = Vector256.CreateScalar(s1);
        Vector256<uint> vs2 = Vector256.CreateScalar(s2);
        Vector256<uint> vs3 = Vector256<uint>.Zero;

        do
        {
            Vector256<byte> data = Vector256.LoadUnsafe(ref sourceRef);
            sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize);

            Vector256<uint> vs1_0 = vs1;
            vs1 += Avx2.SumAbsoluteDifferences(data, Vector256<byte>.Zero).AsUInt32();
            vs3 += vs1_0;
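
            // vs3 accumulates the pre-update vs1 (the initial s1 plus the byte prefix sums) and is
            // scaled by the block size via "vs3 <<= 5" after the loop, forming the inter-block s1
            // contribution to s2.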
            Vector256<short> mad = Avx2.MultiplyAddAdjacent(data, weights);
            vs2 += Avx2.MultiplyAddAdjacent(mad, Vector256<short>.One).AsUInt32();
        }
        while (--blocks > 0);

        vs3 <<= 5;
        vs2 += vs3;

        s1 = (uint)Vector256.Sum(vs1.AsUInt64()); // SumAbsoluteDifferences stores the results in the even lanes
        s2 = Vector256.Sum(vs2);

        s1 %= ModBase;
        s2 %= ModBase;
    }
    while (length >= BlockSize);

    if (length > 0)
    {
        UpdateScalarTail(ref sourceRef, length, ref s1, ref s2);
    }

    return (s2 << 16) | s1;
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static uint UpdateVector512(uint adler, ReadOnlySpan<byte> source)
{
    Debug.Assert(source.Length >= Vector512<byte>.Count);

    const int BlockSize = 64;

    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    ref byte sourceRef = ref MemoryMarshal.GetReference(source);
    int length = source.Length;

    Vector512<sbyte> weights = Vector512.Create(
        32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
        32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);

    do
    {
        int n = Math.Min(length, NMax);
        int blocks = n / BlockSize;
        n = blocks * BlockSize;
        length -= n;

        Vector512<uint> vs1 = Vector512.CreateScalar(s1);
        Vector512<uint> vs2 = Vector512.CreateScalar(s2);
        Vector512<uint> vs3 = Vector512<uint>.Zero;

        do
        {
            Vector512<byte> data = Vector512.LoadUnsafe(ref sourceRef);
            sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize);

            Vector512<uint> vs1_0 = vs1;
            vs1 += Avx512BW.SumAbsoluteDifferences(data, Vector512<byte>.Zero).AsUInt32();
            vs3 += vs1_0;
            vs2 += Avx512BW.MultiplyAddAdjacent(Avx512BW.MultiplyAddAdjacent(data, weights), Vector512<short>.One).AsUInt32();
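
            // The weights vector repeats 32..1 in both 256-bit halves, so the first 32 bytes of
            // each 64-byte block are under-weighted by 32; adding the SAD of the lower half
            // shifted left by 5 restores the missing 32 * byte contribution.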
            Vector256<uint> sumLo = Avx2.SumAbsoluteDifferences(data.GetLower(), Vector256<byte>.Zero).AsUInt32();
            vs2 += Vector512.Create(sumLo << 5, Vector256<uint>.Zero);
        }
        while (--blocks > 0);

        vs3 <<= 6;
        vs2 += vs3;

        s1 = (uint)Vector512.Sum(vs1.AsUInt64());
        s2 = Vector512.Sum(vs2);

        s1 %= ModBase;
        s2 %= ModBase;
    }
    while (length >= BlockSize);

    if (length >= Vector256<byte>.Count)
    {
        return UpdateVector256((s2 << 16) | s1, MemoryMarshal.CreateReadOnlySpan(ref sourceRef, length));
    }

    if (length > 0)
    {
        UpdateScalarTail(ref sourceRef, length, ref s1, ref s2);
    }

    return (s2 << 16) | s1;
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static uint UpdateArm128(uint adler, ReadOnlySpan<byte> source)
{
    Debug.Assert(source.Length >= Vector128<byte>.Count * 2);

    const int BlockSize = 32; // two Vector128<byte> loads

    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    ref byte sourceRef = ref MemoryMarshal.GetReference(source);
    int length = source.Length;

    do
    {
        int n = Math.Min(length, NMax);
        int blocks = n / BlockSize;
        n = blocks * BlockSize;
        length -= n;

        Vector128<uint> vs1 = Vector128<uint>.Zero;
        Vector128<uint> vps = Vector128.CreateScalar(s1 * (uint)blocks);

        Vector128<ushort> vColumnSum1 = Vector128<ushort>.Zero;
        Vector128<ushort> vColumnSum2 = Vector128<ushort>.Zero;
        Vector128<ushort> vColumnSum3 = Vector128<ushort>.Zero;
        Vector128<ushort> vColumnSum4 = Vector128<ushort>.Zero;
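
        // Per-column byte sums are kept as ushorts for the whole outer iteration (NMax bounds the
        // number of blocks, so they cannot overflow). Since the weighted sum over all blocks equals
        // the per-column weight times the column sum, the 32..1 weights are applied only once,
        // after the inner loop.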
        do
        {
            Vector128<byte> bytes1 = Vector128.LoadUnsafe(ref sourceRef);
            Vector128<byte> bytes2 = Vector128.LoadUnsafe(ref sourceRef, 16);
            sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize);

            vps += vs1;

            vs1 = AdvSimd.AddPairwiseWideningAndAdd(
                vs1,
                AdvSimd.AddPairwiseWideningAndAdd(
                    AdvSimd.AddPairwiseWidening(bytes1),
                    bytes2));

            vColumnSum1 = AdvSimd.AddWideningLower(vColumnSum1, bytes1.GetLower());
            vColumnSum2 = AdvSimd.AddWideningLower(vColumnSum2, bytes1.GetUpper());
            vColumnSum3 = AdvSimd.AddWideningLower(vColumnSum3, bytes2.GetLower());
            vColumnSum4 = AdvSimd.AddWideningLower(vColumnSum4, bytes2.GetUpper());
        }
        while (--blocks > 0);

        Vector128<uint> vs2 = vps << 5;
        vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum1.GetLower(), Vector64.Create((ushort)32, 31, 30, 29));
        vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum1.GetUpper(), Vector64.Create((ushort)28, 27, 26, 25));
        vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum2.GetLower(), Vector64.Create((ushort)24, 23, 22, 21));
        vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum2.GetUpper(), Vector64.Create((ushort)20, 19, 18, 17));
        vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum3.GetLower(), Vector64.Create((ushort)16, 15, 14, 13));
        vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum3.GetUpper(), Vector64.Create((ushort)12, 11, 10, 9));
        vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum4.GetLower(), Vector64.Create((ushort)8, 7, 6, 5));
        vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum4.GetUpper(), Vector64.Create((ushort)4, 3, 2, 1));

        s1 += Vector128.Sum(vs1);
        s2 += Vector128.Sum(vs2);

        s1 %= ModBase;
        s2 %= ModBase;
    }
    while (length >= BlockSize);

    if (length > 0)
    {
        UpdateScalarTail(ref sourceRef, length, ref s1, ref s2);
    }

    return (s2 << 16) | s1;
}
private static void UpdateScalarTail(ref byte sourceRef, int length, ref uint s1, ref uint s2)
{
    Debug.Assert(length is > 0 and < NMax);
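
    // NMax bounds how many bytes can be accumulated before s2 could overflow a uint,
    // so a single modular reduction after the loop is sufficient.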
    foreach (byte b in MemoryMarshal.CreateReadOnlySpan(ref sourceRef, length))
    {
        s1 += b;
        s2 += s1;
    }

    s1 %= ModBase;
    s2 %= ModBase;
}
I think all of this could be put in Adler32.Vectorized.cs, since the csproj can separate the .NET-specific code from, say, the .NET Standard target(s).
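A rough sketch of how that split could look; the file layout, member names, and simplified class shape here are assumptions for illustration, not the PR's actual code.

// Adler32.cs -- scalar core, compiled for every target (including .NET Standard).
public partial class Adler32
{
    private const uint ModBase = 65521;
    private const int NMax = 5552;

    private static uint UpdateScalar(uint adler, ReadOnlySpan<byte> source)
    {
        // ... scalar loop elided; unchanged from the current Adler32.cs ...
        return adler;
    }
}

// Adler32.Vectorized.cs -- intrinsics-based paths, included in the compilation only for
// targets that support them (for example via a Condition on the <Compile> item).
public partial class Adler32
{
    private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
        BitConverter.IsLittleEndian &&
        Vector128.IsHardwareAccelerated &&
        source.Length >= Vector128<byte>.Count * 2;

    private static uint UpdateVectorized(uint adler, ReadOnlySpan<byte> source)
    {
        // ... dispatch to UpdateVector128 / UpdateVector256 / UpdateVector512 / UpdateArm128 elided ...
        return adler;
    }
}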