From 2253c22fa050fe9b9acaf72278945563438ab250 Mon Sep 17 00:00:00 2001
From: WANG Rui <wangrui@loongson.cn>
Date: Thu, 26 Jun 2025 16:42:21 +0800
Subject: [PATCH] Add SIMD impls of `lines_fwd` and `lines_bwd` for LoongArch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Benchmark results on LA664:

- LASX

```
simd/lines_fwd/1        time:   [2.4046 ns 2.4076 ns 2.4118 ns]
                        thrpt:  [395.42 MiB/s 396.10 MiB/s 396.60 MiB/s]
                 change:
                        time:   [+20.046% +20.332% +20.762%] (p = 0.00 < 0.05)
                        thrpt:  [−17.193% −16.896% −16.699%]
                        Performance has regressed.

simd/lines_fwd/8        time:   [8.4050 ns 8.4114 ns 8.4192 ns]
                        thrpt:  [906.19 MiB/s 907.03 MiB/s 907.72 MiB/s]
                 change:
                        time:   [+4.9243% +5.0308% +5.1538%] (p = 0.00 < 0.05)
                        thrpt:  [−4.9012% −4.7898% −4.6932%]
                        Performance has regressed.

simd/lines_fwd/128      time:   [35.622 ns 35.650 ns 35.685 ns]
                        thrpt:  [3.3406 GiB/s 3.3439 GiB/s 3.3465 GiB/s]
                 change:
                        time:   [−66.111% −65.957% −65.864%] (p = 0.00 < 0.05)
                        thrpt:  [+192.94% +193.74% +195.08%]
                        Performance has improved.

simd/lines_fwd/1024     time:   [53.349 ns 53.400 ns 53.457 ns]
                        thrpt:  [17.840 GiB/s 17.859 GiB/s 17.876 GiB/s]
                 change:
                        time:   [−93.548% −93.540% −93.533%] (p = 0.00 < 0.05)
                        thrpt:  [+1446.4% +1448.1% +1449.8%]
                        Performance has improved.

simd/lines_fwd/131072   time:   [3.0780 µs 3.0815 µs 3.0866 µs]
                        thrpt:  [39.549 GiB/s 39.613 GiB/s 39.659 GiB/s]
                 change:
                        time:   [−97.069% −97.065% −97.060%] (p = 0.00 < 0.05)
                        thrpt:  [+3301.2% +3307.0% +3311.8%]
                        Performance has improved.

simd/lines_fwd/134217728
                        time:   [4.5887 ms 4.5919 ms 4.5958 ms]
                        thrpt:  [27.199 GiB/s 27.222 GiB/s 27.241 GiB/s]
                 change:
                        time:   [−95.733% −95.729% −95.725%] (p = 0.00 < 0.05)
                        thrpt:  [+2239.3% +2241.5% +2243.5%]
                        Performance has improved.
```

- LSX

```
simd/lines_fwd/1        time:   [6.4032 ns 6.4068 ns 6.4116 ns]
                        thrpt:  [148.74 MiB/s 148.85 MiB/s 148.94 MiB/s]
                 change:
                        time:   [+219.68% +219.98% +220.24%] (p = 0.00 < 0.05)
                        thrpt:  [−68.773% −68.748% −68.719%]
                        Performance has regressed.

simd/lines_fwd/8        time:   [12.406 ns 12.413 ns 12.422 ns]
                        thrpt:  [614.20 MiB/s 614.63 MiB/s 614.96 MiB/s]
                 change:
                        time:   [+54.884% +55.133% +55.502%] (p = 0.00 < 0.05)
                        thrpt:  [−35.692% −35.539% −35.436%]
                        Performance has regressed.

simd/lines_fwd/128      time:   [24.412 ns 24.427 ns 24.448 ns]
                        thrpt:  [4.8761 GiB/s 4.8801 GiB/s 4.8832 GiB/s]
                 change:
                        time:   [−76.775% −76.669% −76.607%] (p = 0.00 < 0.05)
                        thrpt:  [+327.48% +328.62% +330.58%]
                        Performance has improved.

simd/lines_fwd/1024     time:   [49.467 ns 49.530 ns 49.599 ns]
                        thrpt:  [19.228 GiB/s 19.255 GiB/s 19.279 GiB/s]
                 change:
                        time:   [−94.014% −94.006% −93.998%] (p = 0.00 < 0.05)
                        thrpt:  [+1566.2% +1568.3% +1570.4%]
                        Performance has improved.

simd/lines_fwd/131072   time:   [4.5825 µs 4.5858 µs 4.5900 µs]
                        thrpt:  [26.595 GiB/s 26.619 GiB/s 26.638 GiB/s]
                 change:
                        time:   [−95.639% −95.632% −95.624%] (p = 0.00 < 0.05)
                        thrpt:  [+2185.1% +2189.3% +2192.9%]
                        Performance has improved.

simd/lines_fwd/134217728
                        time:   [5.4066 ms 5.4103 ms 5.4151 ms]
                        thrpt:  [23.084 GiB/s 23.104 GiB/s 23.120 GiB/s]
                 change:
                        time:   [−94.972% −94.968% −94.963%] (p = 0.00 < 0.05)
                        thrpt:  [+1885.3% +1887.3% +1889.0%]
                        Performance has improved.
```
---
 src/lib.rs            |   5 ++
 src/simd/lines_bwd.rs | 176 +++++++++++++++++++++++++++++++++++++++++-
 src/simd/lines_fwd.rs | 172 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 349 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index d6e64d5e73fe..6c5f94fa6827 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,11 @@
     maybe_uninit_slice,
     maybe_uninit_uninit_array_transpose
 )]
+#![cfg_attr(
+    target_arch = "loongarch64",
+    feature(stdarch_loongarch, stdarch_loongarch_feature_detection, loongarch_target_feature),
+    allow(clippy::incompatible_msrv)
+)]
 #![allow(clippy::missing_transmute_annotations, clippy::new_without_default, stable_features)]
 
 #[macro_use]
diff --git a/src/simd/lines_bwd.rs b/src/simd/lines_bwd.rs
index dbe59add9e43..5debb0a55483 100644
--- a/src/simd/lines_bwd.rs
+++ b/src/simd/lines_bwd.rs
@@ -34,7 +34,7 @@ unsafe fn lines_bwd_raw(
     line: CoordType,
     line_stop: CoordType,
 ) -> (*const u8, CoordType) {
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))]
     return unsafe { LINES_BWD_DISPATCH(beg, end, line, line_stop) };
 
     #[cfg(target_arch = "aarch64")]
@@ -65,7 +65,7 @@ unsafe fn lines_bwd_fallback(
     }
 }
 
-#[cfg(target_arch = "x86_64")]
+#[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))]
 static mut LINES_BWD_DISPATCH: unsafe fn(
     beg: *const u8,
     end: *const u8,
@@ -162,6 +162,178 @@ unsafe fn lines_bwd_avx2(
     }
 }
 
+#[cfg(target_arch = "loongarch64")] // Runtime selection of the `lines_bwd` implementation.
+unsafe fn lines_bwd_dispatch(
+    beg: *const u8,
+    end: *const u8,
+    line: CoordType,
+    line_stop: CoordType,
+) -> (*const u8, CoordType) {
+    use std::arch::is_loongarch_feature_detected;
+
+    let func = if is_loongarch_feature_detected!("lasx") {
+        lines_bwd_lasx // preferred: works on 32-byte vectors
+    } else if is_loongarch_feature_detected!("lsx") {
+        lines_bwd_lsx // next best: works on 16-byte vectors
+    } else {
+        lines_bwd_fallback // neither vector extension was detected
+    };
+    unsafe { LINES_BWD_DISPATCH = func }; // cache the pick so later calls skip detection
+    unsafe { func(beg, end, line, line_stop) } // arguments are forwarded unchanged
+}
+
+#[cfg(target_arch = "loongarch64")] // LASX variant of `lines_bwd`, built on 32-byte vectors.
+#[target_feature(enable = "lasx")] // only sound to call once LASX support has been detected
+unsafe fn lines_bwd_lasx(
+    beg: *const u8, // lower bound of the range; never modified here
+    mut end: *const u8, // cursor, moved toward `beg` as chunks are consumed
+    mut line: CoordType, // running line number, reduced by the b'\n' count of each chunk
+    line_stop: CoordType, // vector stepping halts before `line` would reach this value or less
+) -> (*const u8, CoordType) {
+    unsafe {
+        use std::arch::loongarch64::*;
+        use std::mem::transmute as T;
+
+        #[inline(always)] // helper: collapses the 32 byte lanes into a single total
+        unsafe fn horizontal_sum(sum: v32i8) -> u32 {
+            unsafe {
+                let sum = lasx_xvhaddw_h_b(sum, sum); // widening pairwise adds: bytes -> 16-bit
+                let sum = lasx_xvhaddw_w_h(sum, sum); // 16-bit -> 32-bit
+                let sum = lasx_xvhaddw_d_w(sum, sum); // 32-bit -> 64-bit
+                let sum = lasx_xvhaddw_q_d(sum, sum); // 64-bit -> one total per 128-bit half
+                let tmp = lasx_xvpermi_q::<1>(T(sum), T(sum)); // copy with the two halves swapped
+                let sum = lasx_xvadd_w(T(sum), T(tmp)); // word 0 = low half + high half
+                lasx_xvpickve2gr_wu::<0>(sum) // extract word 0
+            }
+        }
+
+        let lf = lasx_xvrepli_b(b'\n' as i32); // b'\n' replicated into every byte lane
+        let line_stop = line_stop.min(line); // clamp the stop line to at most the current line
+        let off = end.addr() & 31; // distance from `end` down to the previous 32-byte boundary
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
+        }
+
+        while end.offset_from_unsigned(beg) >= 128 {
+            let chunk_start = end.sub(128); // the 128 bytes below `end`, read as four vectors
+
+            let v1 = lasx_xvld::<0>(chunk_start as *const _);
+            let v2 = lasx_xvld::<32>(chunk_start as *const _);
+            let v3 = lasx_xvld::<64>(chunk_start as *const _);
+            let v4 = lasx_xvld::<96>(chunk_start as *const _);
+
+            let mut sum = lasx_xvrepli_b(0); // per-lane counters (max 4 each)
+            sum = lasx_xvsub_b(sum, lasx_xvseq_b(v1, lf)); // equal lanes are -1: sub adds one
+            sum = lasx_xvsub_b(sum, lasx_xvseq_b(v2, lf));
+            sum = lasx_xvsub_b(sum, lasx_xvseq_b(v3, lf));
+            sum = lasx_xvsub_b(sum, lasx_xvseq_b(v4, lf));
+            let sum = horizontal_sum(sum); // b'\n' count for the whole chunk
+
+            let line_next = line - sum as CoordType;
+            if line_next <= line_stop {
+                break; // stop line falls inside this chunk; leave it unconsumed
+            }
+
+            end = chunk_start;
+            line = line_next;
+        }
+
+        while end.offset_from_unsigned(beg) >= 32 {
+            let chunk_start = end.sub(32); // same stepping, one vector at a time
+            let v = lasx_xvld::<0>(chunk_start as *const _);
+            let c = lasx_xvseq_b(v, lf);
+
+            let ones = lasx_xvand_v(T(c), T(lasx_xvrepli_b(1))); // match mask -> 1 per match
+            let sum = horizontal_sum(T(ones));
+
+            let line_next = line - sum as CoordType;
+            if line_next <= line_stop {
+                break; // stop line falls inside this vector; leave it unconsumed
+            }
+
+            end = chunk_start;
+            line = line_next;
+        }
+
+        lines_bwd_fallback(beg, end, line, line_stop) // the fallback routine finishes the rest
+    }
+}
+
+#[cfg(target_arch = "loongarch64")] // LSX variant of `lines_bwd`, built on 16-byte vectors.
+#[target_feature(enable = "lsx")] // only sound to call once LSX support has been detected
+unsafe fn lines_bwd_lsx(
+    beg: *const u8, // lower bound of the range; never modified here
+    mut end: *const u8, // cursor, moved toward `beg` as chunks are consumed
+    mut line: CoordType, // running line number, reduced by the b'\n' count of each chunk
+    line_stop: CoordType, // vector stepping halts before `line` would reach this value or less
+) -> (*const u8, CoordType) {
+    unsafe {
+        use std::arch::loongarch64::*;
+        use std::mem::transmute as T;
+
+        #[inline(always)] // helper: collapses the 16 byte lanes into a single total
+        unsafe fn horizontal_sum(sum: v16i8) -> u32 {
+            unsafe {
+                let sum = lsx_vhaddw_h_b(sum, sum); // widening pairwise adds: bytes -> 16-bit
+                let sum = lsx_vhaddw_w_h(sum, sum); // 16-bit -> 32-bit
+                let sum = lsx_vhaddw_d_w(sum, sum); // 32-bit -> 64-bit
+                let sum = lsx_vhaddw_q_d(sum, sum); // 64-bit -> one 128-bit total
+                lsx_vpickve2gr_wu::<0>(T(sum)) // low 32 bits of that total
+            }
+        }
+
+        let lf = lsx_vrepli_b(b'\n' as i32); // b'\n' replicated into every byte lane
+        let line_stop = line_stop.min(line); // clamp the stop line to at most the current line
+        let off = end.addr() & 15; // distance from `end` down to the previous 16-byte boundary
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
+        }
+
+        while end.offset_from_unsigned(beg) >= 64 {
+            let chunk_start = end.sub(64); // the 64 bytes below `end`, read as four vectors
+
+            let v1 = lsx_vld::<0>(chunk_start as *const _);
+            let v2 = lsx_vld::<16>(chunk_start as *const _);
+            let v3 = lsx_vld::<32>(chunk_start as *const _);
+            let v4 = lsx_vld::<48>(chunk_start as *const _);
+
+            let mut sum = lsx_vrepli_b(0); // per-lane counters (max 4 each)
+            sum = lsx_vsub_b(sum, lsx_vseq_b(v1, lf)); // equal lanes are -1: sub adds one
+            sum = lsx_vsub_b(sum, lsx_vseq_b(v2, lf));
+            sum = lsx_vsub_b(sum, lsx_vseq_b(v3, lf));
+            sum = lsx_vsub_b(sum, lsx_vseq_b(v4, lf));
+            let sum = horizontal_sum(sum); // b'\n' count for the whole chunk
+
+            let line_next = line - sum as CoordType;
+            if line_next <= line_stop {
+                break; // stop line falls inside this chunk; leave it unconsumed
+            }
+
+            end = chunk_start;
+            line = line_next;
+        }
+
+        while end.offset_from_unsigned(beg) >= 16 {
+            let chunk_start = end.sub(16); // same stepping, one vector at a time
+            let v = lsx_vld::<0>(chunk_start as *const _);
+            let c = lsx_vseq_b(v, lf);
+
+            let ones = lsx_vand_v(T(c), T(lsx_vrepli_b(1))); // match mask -> 1 per match
+            let sum = horizontal_sum(T(ones));
+
+            let line_next = line - sum as CoordType;
+            if line_next <= line_stop {
+                break; // stop line falls inside this vector; leave it unconsumed
+            }
+
+            end = chunk_start;
+            line = line_next;
+        }
+
+        lines_bwd_fallback(beg, end, line, line_stop) // the fallback routine finishes the rest
+    }
+}
+
 #[cfg(target_arch = "aarch64")]
 unsafe fn lines_bwd_neon(
     beg: *const u8,
diff --git a/src/simd/lines_fwd.rs b/src/simd/lines_fwd.rs
index e2d11f1c23bb..06e60cafd356 100644
--- a/src/simd/lines_fwd.rs
+++ b/src/simd/lines_fwd.rs
@@ -32,7 +32,7 @@ unsafe fn lines_fwd_raw(
     line: CoordType,
     line_stop: CoordType,
 ) -> (*const u8, CoordType) {
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))]
     return unsafe { LINES_FWD_DISPATCH(beg, end, line, line_stop) };
 
     #[cfg(target_arch = "aarch64")]
@@ -65,7 +65,7 @@ unsafe fn lines_fwd_fallback(
     }
 }
 
-#[cfg(target_arch = "x86_64")]
+#[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))]
 static mut LINES_FWD_DISPATCH: unsafe fn(
     beg: *const u8,
     end: *const u8,
@@ -168,6 +168,174 @@ unsafe fn lines_fwd_avx2(
     }
 }
 
+#[cfg(target_arch = "loongarch64")] // Runtime selection of the `lines_fwd` implementation.
+unsafe fn lines_fwd_dispatch(
+    beg: *const u8,
+    end: *const u8,
+    line: CoordType,
+    line_stop: CoordType,
+) -> (*const u8, CoordType) {
+    use std::arch::is_loongarch_feature_detected;
+
+    let func = if is_loongarch_feature_detected!("lasx") {
+        lines_fwd_lasx // preferred: works on 32-byte vectors
+    } else if is_loongarch_feature_detected!("lsx") {
+        lines_fwd_lsx // next best: works on 16-byte vectors
+    } else {
+        lines_fwd_fallback // neither vector extension was detected
+    };
+    unsafe { LINES_FWD_DISPATCH = func }; // cache the pick so later calls skip detection
+    unsafe { func(beg, end, line, line_stop) } // arguments are forwarded unchanged
+}
+
+#[cfg(target_arch = "loongarch64")] // LASX variant of `lines_fwd`, built on 32-byte vectors.
+#[target_feature(enable = "lasx")] // only sound to call once LASX support has been detected
+unsafe fn lines_fwd_lasx(
+    mut beg: *const u8, // cursor, moved toward `end` as chunks are consumed
+    end: *const u8, // upper bound of the range; never modified here
+    mut line: CoordType, // running line number, raised by the b'\n' count of each chunk
+    line_stop: CoordType, // vector loops run only while `line` stays below this value
+) -> (*const u8, CoordType) {
+    unsafe {
+        use std::arch::loongarch64::*;
+        use std::mem::transmute as T;
+
+        #[inline(always)] // helper: collapses the 32 byte lanes into a single total
+        unsafe fn horizontal_sum(sum: v32i8) -> u32 {
+            unsafe {
+                let sum = lasx_xvhaddw_h_b(sum, sum); // widening pairwise adds: bytes -> 16-bit
+                let sum = lasx_xvhaddw_w_h(sum, sum); // 16-bit -> 32-bit
+                let sum = lasx_xvhaddw_d_w(sum, sum); // 32-bit -> 64-bit
+                let sum = lasx_xvhaddw_q_d(sum, sum); // 64-bit -> one total per 128-bit half
+                let tmp = lasx_xvpermi_q::<1>(T(sum), T(sum)); // copy with the two halves swapped
+                let sum = lasx_xvadd_w(T(sum), T(tmp)); // word 0 = low half + high half
+                lasx_xvpickve2gr_wu::<0>(sum) // extract word 0
+            }
+        }
+
+        let lf = lasx_xvrepli_b(b'\n' as i32); // b'\n' replicated into every byte lane
+        let off = beg.align_offset(32); // bytes from `beg` up to the next 32-byte boundary
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
+        }
+
+        if line < line_stop {
+            while end.offset_from_unsigned(beg) >= 128 {
+                let v1 = lasx_xvld::<0>(beg as *const _); // 128 bytes read as four vectors
+                let v2 = lasx_xvld::<32>(beg as *const _);
+                let v3 = lasx_xvld::<64>(beg as *const _);
+                let v4 = lasx_xvld::<96>(beg as *const _);
+
+                let mut sum = lasx_xvrepli_b(0); // per-lane counters (max 4 each)
+                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v1, lf)); // equal lanes are -1: sub adds one
+                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v2, lf));
+                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v3, lf));
+                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v4, lf));
+                let sum = horizontal_sum(sum); // b'\n' count for the whole chunk
+
+                let line_next = line + sum as CoordType;
+                if line_next >= line_stop {
+                    break; // stop line is reached inside this chunk; leave it unconsumed
+                }
+
+                beg = beg.add(128);
+                line = line_next;
+            }
+
+            while end.offset_from_unsigned(beg) >= 32 {
+                let v = lasx_xvld::<0>(beg as *const _); // same stepping, one vector at a time
+                let c = lasx_xvseq_b(v, lf);
+
+                let ones = lasx_xvand_v(T(c), T(lasx_xvrepli_b(1))); // match mask -> 1 per match
+                let sum = horizontal_sum(T(ones));
+
+                let line_next = line + sum as CoordType;
+                if line_next >= line_stop {
+                    break; // stop line is reached inside this vector; leave it unconsumed
+                }
+
+                beg = beg.add(32);
+                line = line_next;
+            }
+        }
+
+        lines_fwd_fallback(beg, end, line, line_stop) // the fallback routine finishes the rest
+    }
+}
+
+#[cfg(target_arch = "loongarch64")] // LSX variant of `lines_fwd`, built on 16-byte vectors.
+#[target_feature(enable = "lsx")] // only sound to call once LSX support has been detected
+unsafe fn lines_fwd_lsx(
+    mut beg: *const u8, // cursor, moved toward `end` as chunks are consumed
+    end: *const u8, // upper bound of the range; never modified here
+    mut line: CoordType, // running line number, raised by the b'\n' count of each chunk
+    line_stop: CoordType, // vector loops run only while `line` stays below this value
+) -> (*const u8, CoordType) {
+    unsafe {
+        use std::arch::loongarch64::*;
+        use std::mem::transmute as T;
+
+        #[inline(always)] // helper: collapses the 16 byte lanes into a single total
+        unsafe fn horizontal_sum(sum: v16i8) -> u32 {
+            unsafe {
+                let sum = lsx_vhaddw_h_b(sum, sum); // widening pairwise adds: bytes -> 16-bit
+                let sum = lsx_vhaddw_w_h(sum, sum); // 16-bit -> 32-bit
+                let sum = lsx_vhaddw_d_w(sum, sum); // 32-bit -> 64-bit
+                let sum = lsx_vhaddw_q_d(sum, sum); // 64-bit -> one 128-bit total
+                lsx_vpickve2gr_wu::<0>(T(sum)) // low 32 bits of that total
+            }
+        }
+
+        let lf = lsx_vrepli_b(b'\n' as i32); // b'\n' replicated into every byte lane
+        let off = beg.align_offset(16); // bytes from `beg` up to the next 16-byte boundary
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
+        }
+
+        if line < line_stop {
+            while end.offset_from_unsigned(beg) >= 64 {
+                let v1 = lsx_vld::<0>(beg as *const _); // 64 bytes read as four vectors
+                let v2 = lsx_vld::<16>(beg as *const _);
+                let v3 = lsx_vld::<32>(beg as *const _);
+                let v4 = lsx_vld::<48>(beg as *const _);
+
+                let mut sum = lsx_vrepli_b(0); // per-lane counters (max 4 each)
+                sum = lsx_vsub_b(sum, lsx_vseq_b(v1, lf)); // equal lanes are -1: sub adds one
+                sum = lsx_vsub_b(sum, lsx_vseq_b(v2, lf));
+                sum = lsx_vsub_b(sum, lsx_vseq_b(v3, lf));
+                sum = lsx_vsub_b(sum, lsx_vseq_b(v4, lf));
+                let sum = horizontal_sum(sum); // b'\n' count for the whole chunk
+
+                let line_next = line + sum as CoordType;
+                if line_next >= line_stop {
+                    break; // stop line is reached inside this chunk; leave it unconsumed
+                }
+
+                beg = beg.add(64);
+                line = line_next;
+            }
+
+            while end.offset_from_unsigned(beg) >= 16 {
+                let v = lsx_vld::<0>(beg as *const _); // same stepping, one vector at a time
+                let c = lsx_vseq_b(v, lf);
+
+                let ones = lsx_vand_v(T(c), T(lsx_vrepli_b(1))); // match mask -> 1 per match
+                let sum = horizontal_sum(T(ones));
+
+                let line_next = line + sum as CoordType;
+                if line_next >= line_stop {
+                    break; // stop line is reached inside this vector; leave it unconsumed
+                }
+
+                beg = beg.add(16);
+                line = line_next;
+            }
+        }
+
+        lines_fwd_fallback(beg, end, line, line_stop) // the fallback routine finishes the rest
+    }
+}
+
 #[cfg(target_arch = "aarch64")]
 unsafe fn lines_fwd_neon(
     mut beg: *const u8,