From 2253c22fa050fe9b9acaf72278945563438ab250 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Thu, 26 Jun 2025 16:42:21 +0800 Subject: [PATCH] Add SIMD impls of `lines_fwd` and `lines_bwd` for LoongArch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmark results on LA664: - LASX ``` simd/lines_fwd/1 time: [2.4046 ns 2.4076 ns 2.4118 ns] thrpt: [395.42 MiB/s 396.10 MiB/s 396.60 MiB/s] change: time: [+20.046% +20.332% +20.762%] (p = 0.00 < 0.05) thrpt: [−17.193% −16.896% −16.699%] Performance has regressed. simd/lines_fwd/8 time: [8.4050 ns 8.4114 ns 8.4192 ns] thrpt: [906.19 MiB/s 907.03 MiB/s 907.72 MiB/s] change: time: [+4.9243% +5.0308% +5.1538%] (p = 0.00 < 0.05) thrpt: [−4.9012% −4.7898% −4.6932%] Performance has regressed. simd/lines_fwd/128 time: [35.622 ns 35.650 ns 35.685 ns] thrpt: [3.3406 GiB/s 3.3439 GiB/s 3.3465 GiB/s] change: time: [−66.111% −65.957% −65.864%] (p = 0.00 < 0.05) thrpt: [+192.94% +193.74% +195.08%] Performance has improved. simd/lines_fwd/1024 time: [53.349 ns 53.400 ns 53.457 ns] thrpt: [17.840 GiB/s 17.859 GiB/s 17.876 GiB/s] change: time: [−93.548% −93.540% −93.533%] (p = 0.00 < 0.05) thrpt: [+1446.4% +1448.1% +1449.8%] Performance has improved. simd/lines_fwd/131072 time: [3.0780 µs 3.0815 µs 3.0866 µs] thrpt: [39.549 GiB/s 39.613 GiB/s 39.659 GiB/s] change: time: [−97.069% −97.065% −97.060%] (p = 0.00 < 0.05) thrpt: [+3301.2% +3307.0% +3311.8%] Performance has improved. simd/lines_fwd/134217728 time: [4.5887 ms 4.5919 ms 4.5958 ms] thrpt: [27.199 GiB/s 27.222 GiB/s 27.241 GiB/s] change: time: [−95.733% −95.729% −95.725%] (p = 0.00 < 0.05) thrpt: [+2239.3% +2241.5% +2243.5%] Performance has improved. ``` - LSX ``` simd/lines_fwd/1 time: [6.4032 ns 6.4068 ns 6.4116 ns] thrpt: [148.74 MiB/s 148.85 MiB/s 148.94 MiB/s] change: time: [+219.68% +219.98% +220.24%] (p = 0.00 < 0.05) thrpt: [−68.773% −68.748% −68.719%] Performance has regressed. simd/lines_fwd/8 time: [12.406 ns 12.413 ns 12.422 ns] thrpt: [614.20 MiB/s 614.63 MiB/s 614.96 MiB/s] change: time: [+54.884% +55.133% +55.502%] (p = 0.00 < 0.05) thrpt: [−35.692% −35.539% −35.436%] Performance has regressed. simd/lines_fwd/128 time: [24.412 ns 24.427 ns 24.448 ns] thrpt: [4.8761 GiB/s 4.8801 GiB/s 4.8832 GiB/s] change: time: [−76.775% −76.669% −76.607%] (p = 0.00 < 0.05) thrpt: [+327.48% +328.62% +330.58%] Performance has improved. simd/lines_fwd/1024 time: [49.467 ns 49.530 ns 49.599 ns] thrpt: [19.228 GiB/s 19.255 GiB/s 19.279 GiB/s] change: time: [−94.014% −94.006% −93.998%] (p = 0.00 < 0.05) thrpt: [+1566.2% +1568.3% +1570.4%] Performance has improved. simd/lines_fwd/131072 time: [4.5825 µs 4.5858 µs 4.5900 µs] thrpt: [26.595 GiB/s 26.619 GiB/s 26.638 GiB/s] change: time: [−95.639% −95.632% −95.624%] (p = 0.00 < 0.05) thrpt: [+2185.1% +2189.3% +2192.9%] Performance has improved. simd/lines_fwd/134217728 time: [5.4066 ms 5.4103 ms 5.4151 ms] thrpt: [23.084 GiB/s 23.104 GiB/s 23.120 GiB/s] change: time: [−94.972% −94.968% −94.963%] (p = 0.00 < 0.05) thrpt: [+1885.3% +1887.3% +1889.0%] Performance has improved. ``` --- src/lib.rs | 5 ++ src/simd/lines_bwd.rs | 176 +++++++++++++++++++++++++++++++++++++++++- src/simd/lines_fwd.rs | 172 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 349 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d6e64d5e73fe..6c5f94fa6827 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,11 @@ maybe_uninit_slice, maybe_uninit_uninit_array_transpose )] +#![cfg_attr( + target_arch = "loongarch64", + feature(stdarch_loongarch, stdarch_loongarch_feature_detection, loongarch_target_feature), + allow(clippy::incompatible_msrv) +)] #![allow(clippy::missing_transmute_annotations, clippy::new_without_default, stable_features)] #[macro_use] diff --git a/src/simd/lines_bwd.rs b/src/simd/lines_bwd.rs index dbe59add9e43..5debb0a55483 100644 --- a/src/simd/lines_bwd.rs +++ b/src/simd/lines_bwd.rs @@ -34,7 +34,7 @@ unsafe fn lines_bwd_raw( line: CoordType, line_stop: CoordType, ) -> (*const u8, CoordType) { - #[cfg(target_arch = "x86_64")] + #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] return unsafe { LINES_BWD_DISPATCH(beg, end, line, line_stop) }; #[cfg(target_arch = "aarch64")] @@ -65,7 +65,7 @@ unsafe fn lines_bwd_fallback( } } -#[cfg(target_arch = "x86_64")] +#[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] static mut LINES_BWD_DISPATCH: unsafe fn( beg: *const u8, end: *const u8, @@ -162,6 +162,178 @@ unsafe fn lines_bwd_avx2( } } +#[cfg(target_arch = "loongarch64")] +unsafe fn lines_bwd_dispatch( + beg: *const u8, + end: *const u8, + line: CoordType, + line_stop: CoordType, +) -> (*const u8, CoordType) { + use std::arch::is_loongarch_feature_detected; + + let func = if is_loongarch_feature_detected!("lasx") { + lines_bwd_lasx + } else if is_loongarch_feature_detected!("lsx") { + lines_bwd_lsx + } else { + lines_bwd_fallback + }; + unsafe { LINES_BWD_DISPATCH = func }; + unsafe { func(beg, end, line, line_stop) } +} + +#[cfg(target_arch = "loongarch64")] +#[target_feature(enable = "lasx")] +unsafe fn lines_bwd_lasx( + beg: *const u8, + mut end: *const u8, + mut line: CoordType, + line_stop: CoordType, +) -> (*const u8, CoordType) { + unsafe { + use std::arch::loongarch64::*; + use std::mem::transmute as T; + + #[inline(always)] + unsafe fn horizontal_sum(sum: v32i8) -> u32 { + unsafe { + let sum = lasx_xvhaddw_h_b(sum, sum); + let sum = lasx_xvhaddw_w_h(sum, sum); + let sum = lasx_xvhaddw_d_w(sum, sum); + let sum = lasx_xvhaddw_q_d(sum, sum); + let tmp = lasx_xvpermi_q::<1>(T(sum), T(sum)); + let sum = lasx_xvadd_w(T(sum), T(tmp)); + lasx_xvpickve2gr_wu::<0>(sum) + } + } + + let lf = lasx_xvrepli_b(b'\n' as i32); + let line_stop = line_stop.min(line); + let off = end.addr() & 31; + if off != 0 && off < end.offset_from_unsigned(beg) { + (end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop); + } + + while end.offset_from_unsigned(beg) >= 128 { + let chunk_start = end.sub(128); + + let v1 = lasx_xvld::<0>(chunk_start as *const _); + let v2 = lasx_xvld::<32>(chunk_start as *const _); + let v3 = lasx_xvld::<64>(chunk_start as *const _); + let v4 = lasx_xvld::<96>(chunk_start as *const _); + + let mut sum = lasx_xvrepli_b(0); + sum = lasx_xvsub_b(sum, lasx_xvseq_b(v1, lf)); + sum = lasx_xvsub_b(sum, lasx_xvseq_b(v2, lf)); + sum = lasx_xvsub_b(sum, lasx_xvseq_b(v3, lf)); + sum = lasx_xvsub_b(sum, lasx_xvseq_b(v4, lf)); + let sum = horizontal_sum(sum); + + let line_next = line - sum as CoordType; + if line_next <= line_stop { + break; + } + + end = chunk_start; + line = line_next; + } + + while end.offset_from_unsigned(beg) >= 32 { + let chunk_start = end.sub(32); + let v = lasx_xvld::<0>(chunk_start as *const _); + let c = lasx_xvseq_b(v, lf); + + let ones = lasx_xvand_v(T(c), T(lasx_xvrepli_b(1))); + let sum = horizontal_sum(T(ones)); + + let line_next = line - sum as CoordType; + if line_next <= line_stop { + break; + } + + end = chunk_start; + line = line_next; + } + + lines_bwd_fallback(beg, end, line, line_stop) + } +} + +#[cfg(target_arch = "loongarch64")] +#[target_feature(enable = "lsx")] +unsafe fn lines_bwd_lsx( + beg: *const u8, + mut end: *const u8, + mut line: CoordType, + line_stop: CoordType, +) -> (*const u8, CoordType) { + unsafe { + use std::arch::loongarch64::*; + use std::mem::transmute as T; + + #[inline(always)] + unsafe fn horizontal_sum(sum: v16i8) -> u32 { + unsafe { + let sum = lsx_vhaddw_h_b(sum, sum); + let sum = lsx_vhaddw_w_h(sum, sum); + let sum = lsx_vhaddw_d_w(sum, sum); + let sum = lsx_vhaddw_q_d(sum, sum); + lsx_vpickve2gr_wu::<0>(T(sum)) + } + } + + let lf = lsx_vrepli_b(b'\n' as i32); + let line_stop = line_stop.min(line); + let off = end.addr() & 15; + if off != 0 && off < end.offset_from_unsigned(beg) { + (end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop); + } + + while end.offset_from_unsigned(beg) >= 64 { + let chunk_start = end.sub(64); + + let v1 = lsx_vld::<0>(chunk_start as *const _); + let v2 = lsx_vld::<16>(chunk_start as *const _); + let v3 = lsx_vld::<32>(chunk_start as *const _); + let v4 = lsx_vld::<48>(chunk_start as *const _); + + let mut sum = lsx_vrepli_b(0); + sum = lsx_vsub_b(sum, lsx_vseq_b(v1, lf)); + sum = lsx_vsub_b(sum, lsx_vseq_b(v2, lf)); + sum = lsx_vsub_b(sum, lsx_vseq_b(v3, lf)); + sum = lsx_vsub_b(sum, lsx_vseq_b(v4, lf)); + let sum = horizontal_sum(sum); + + let line_next = line - sum as CoordType; + if line_next <= line_stop { + break; + } + + end = chunk_start; + line = line_next; + } + + while end.offset_from_unsigned(beg) >= 16 { + let chunk_start = end.sub(16); + let v = lsx_vld::<0>(chunk_start as *const _); + let c = lsx_vseq_b(v, lf); + + let ones = lsx_vand_v(T(c), T(lsx_vrepli_b(1))); + let sum = horizontal_sum(T(ones)); + + let line_next = line - sum as CoordType; + if line_next <= line_stop { + break; + } + + end = chunk_start; + line = line_next; + } + + lines_bwd_fallback(beg, end, line, line_stop) + } +} + #[cfg(target_arch = "aarch64")] unsafe fn lines_bwd_neon( beg: *const u8, diff --git a/src/simd/lines_fwd.rs b/src/simd/lines_fwd.rs index e2d11f1c23bb..06e60cafd356 100644 --- a/src/simd/lines_fwd.rs +++ b/src/simd/lines_fwd.rs @@ -32,7 +32,7 @@ unsafe fn lines_fwd_raw( line: CoordType, line_stop: CoordType, ) -> (*const u8, CoordType) { - #[cfg(target_arch = "x86_64")] + #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] return unsafe { LINES_FWD_DISPATCH(beg, end, line, line_stop) }; #[cfg(target_arch = "aarch64")] @@ -65,7 +65,7 @@ unsafe fn lines_fwd_fallback( } } -#[cfg(target_arch = "x86_64")] +#[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] static mut LINES_FWD_DISPATCH: unsafe fn( beg: *const u8, end: *const u8, @@ -168,6 +168,174 @@ unsafe fn lines_fwd_avx2( } } +#[cfg(target_arch = "loongarch64")] +unsafe fn lines_fwd_dispatch( + beg: *const u8, + end: *const u8, + line: CoordType, + line_stop: CoordType, +) -> (*const u8, CoordType) { + use std::arch::is_loongarch_feature_detected; + + let func = if is_loongarch_feature_detected!("lasx") { + lines_fwd_lasx + } else if is_loongarch_feature_detected!("lsx") { + lines_fwd_lsx + } else { + lines_fwd_fallback + }; + unsafe { LINES_FWD_DISPATCH = func }; + unsafe { func(beg, end, line, line_stop) } +} + +#[cfg(target_arch = "loongarch64")] +#[target_feature(enable = "lasx")] +unsafe fn lines_fwd_lasx( + mut beg: *const u8, + end: *const u8, + mut line: CoordType, + line_stop: CoordType, +) -> (*const u8, CoordType) { + unsafe { + use std::arch::loongarch64::*; + use std::mem::transmute as T; + + #[inline(always)] + unsafe fn horizontal_sum(sum: v32i8) -> u32 { + unsafe { + let sum = lasx_xvhaddw_h_b(sum, sum); + let sum = lasx_xvhaddw_w_h(sum, sum); + let sum = lasx_xvhaddw_d_w(sum, sum); + let sum = lasx_xvhaddw_q_d(sum, sum); + let tmp = lasx_xvpermi_q::<1>(T(sum), T(sum)); + let sum = lasx_xvadd_w(T(sum), T(tmp)); + lasx_xvpickve2gr_wu::<0>(sum) + } + } + + let lf = lasx_xvrepli_b(b'\n' as i32); + let off = beg.align_offset(32); + if off != 0 && off < end.offset_from_unsigned(beg) { + (beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop); + } + + if line < line_stop { + while end.offset_from_unsigned(beg) >= 128 { + let v1 = lasx_xvld::<0>(beg as *const _); + let v2 = lasx_xvld::<32>(beg as *const _); + let v3 = lasx_xvld::<64>(beg as *const _); + let v4 = lasx_xvld::<96>(beg as *const _); + + let mut sum = lasx_xvrepli_b(0); + sum = lasx_xvsub_b(sum, lasx_xvseq_b(v1, lf)); + sum = lasx_xvsub_b(sum, lasx_xvseq_b(v2, lf)); + sum = lasx_xvsub_b(sum, lasx_xvseq_b(v3, lf)); + sum = lasx_xvsub_b(sum, lasx_xvseq_b(v4, lf)); + let sum = horizontal_sum(sum); + + let line_next = line + sum as CoordType; + if line_next >= line_stop { + break; + } + + beg = beg.add(128); + line = line_next; + } + + while end.offset_from_unsigned(beg) >= 32 { + let v = lasx_xvld::<0>(beg as *const _); + let c = lasx_xvseq_b(v, lf); + + let ones = lasx_xvand_v(T(c), T(lasx_xvrepli_b(1))); + let sum = horizontal_sum(T(ones)); + + let line_next = line + sum as CoordType; + if line_next >= line_stop { + break; + } + + beg = beg.add(32); + line = line_next; + } + } + + lines_fwd_fallback(beg, end, line, line_stop) + } +} + +#[cfg(target_arch = "loongarch64")] +#[target_feature(enable = "lsx")] +unsafe fn lines_fwd_lsx( + mut beg: *const u8, + end: *const u8, + mut line: CoordType, + line_stop: CoordType, +) -> (*const u8, CoordType) { + unsafe { + use std::arch::loongarch64::*; + use std::mem::transmute as T; + + #[inline(always)] + unsafe fn horizontal_sum(sum: v16i8) -> u32 { + unsafe { + let sum = lsx_vhaddw_h_b(sum, sum); + let sum = lsx_vhaddw_w_h(sum, sum); + let sum = lsx_vhaddw_d_w(sum, sum); + let sum = lsx_vhaddw_q_d(sum, sum); + lsx_vpickve2gr_wu::<0>(T(sum)) + } + } + + let lf = lsx_vrepli_b(b'\n' as i32); + let off = beg.align_offset(16); + if off != 0 && off < end.offset_from_unsigned(beg) { + (beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop); + } + + if line < line_stop { + while end.offset_from_unsigned(beg) >= 64 { + let v1 = lsx_vld::<0>(beg as *const _); + let v2 = lsx_vld::<16>(beg as *const _); + let v3 = lsx_vld::<32>(beg as *const _); + let v4 = lsx_vld::<48>(beg as *const _); + + let mut sum = lsx_vrepli_b(0); + sum = lsx_vsub_b(sum, lsx_vseq_b(v1, lf)); + sum = lsx_vsub_b(sum, lsx_vseq_b(v2, lf)); + sum = lsx_vsub_b(sum, lsx_vseq_b(v3, lf)); + sum = lsx_vsub_b(sum, lsx_vseq_b(v4, lf)); + let sum = horizontal_sum(sum); + + let line_next = line + sum as CoordType; + if line_next >= line_stop { + break; + } + + beg = beg.add(64); + line = line_next; + } + + while end.offset_from_unsigned(beg) >= 16 { + let v = lsx_vld::<0>(beg as *const _); + let c = lsx_vseq_b(v, lf); + + let ones = lsx_vand_v(T(c), T(lsx_vrepli_b(1))); + let sum = horizontal_sum(T(ones)); + + let line_next = line + sum as CoordType; + if line_next >= line_stop { + break; + } + + beg = beg.add(16); + line = line_next; + } + } + + lines_fwd_fallback(beg, end, line, line_stop) + } +} + #[cfg(target_arch = "aarch64")] unsafe fn lines_fwd_neon( mut beg: *const u8,