microsoft · lhecker · Jul 2, 2025 · Jun 26, 2025 · lhecker · Jul 2, 2025
diff --git a/src/lib.rs b/src/lib.rs
@@ -11,6 +11,11 @@
     maybe_uninit_slice,
     maybe_uninit_uninit_array_transpose
 )]
+#![cfg_attr(
+    target_arch = "loongarch64",
+    feature(stdarch_loongarch, stdarch_loongarch_feature_detection, loongarch_target_feature),
+    allow(clippy::incompatible_msrv)
+)]
 #![allow(clippy::missing_transmute_annotations, clippy::new_without_default, stable_features)]
 
 #[macro_use]

diff --git a/src/simd/lines_bwd.rs b/src/simd/lines_bwd.rs
@@ -34,7 +34,7 @@ unsafe fn lines_bwd_raw(
     line: CoordType,
     line_stop: CoordType,
 ) -> (*const u8, CoordType) {
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))]
     return unsafe { LINES_BWD_DISPATCH(beg, end, line, line_stop) };
 
     #[cfg(target_arch = "aarch64")]
@@ -65,7 +65,7 @@ unsafe fn lines_bwd_fallback(
     }
 }
 
-#[cfg(target_arch = "x86_64")]
+#[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))]
 static mut LINES_BWD_DISPATCH: unsafe fn(
     beg: *const u8,
     end: *const u8,
@@ -162,6 +162,178 @@ unsafe fn lines_bwd_avx2(
     }
 }
 
+#[cfg(target_arch = "loongarch64")]
+unsafe fn lines_bwd_dispatch(
+    beg: *const u8,
+    end: *const u8,
+    line: CoordType,
+    line_stop: CoordType,
+) -> (*const u8, CoordType) {
+    use std::arch::is_loongarch_feature_detected;
+
+    let func = if is_loongarch_feature_detected!("lasx") {
+        lines_bwd_lasx
+    } else if is_loongarch_feature_detected!("lsx") {
+        lines_bwd_lsx
+    } else {
+        lines_bwd_fallback
+    };
+    unsafe { LINES_BWD_DISPATCH = func };
+    unsafe { func(beg, end, line, line_stop) }
+}
+
+#[cfg(target_arch = "loongarch64")]
+#[target_feature(enable = "lasx")]
+unsafe fn lines_bwd_lasx(
+    beg: *const u8,
+    mut end: *const u8,
+    mut line: CoordType,
+    line_stop: CoordType,
+) -> (*const u8, CoordType) {
+    unsafe {
+        use std::arch::loongarch64::*;
+        use std::mem::transmute as T;
+
+        #[inline(always)]
+        unsafe fn horizontal_sum(sum: v32i8) -> u32 {
+            unsafe {
+                let sum = lasx_xvhaddw_h_b(sum, sum);
+                let sum = lasx_xvhaddw_w_h(sum, sum);
+                let sum = lasx_xvhaddw_d_w(sum, sum);
+                let sum = lasx_xvhaddw_q_d(sum, sum);
+                let tmp = lasx_xvpermi_q::<1>(T(sum), T(sum));
+                let sum = lasx_xvadd_w(T(sum), T(tmp));
+                lasx_xvpickve2gr_wu::<0>(sum)
+            }
+        }
+
+        let lf = lasx_xvrepli_b(b'\n' as i32);
+        let line_stop = line_stop.min(line);
+        let off = end.addr() & 31;
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
+        }
+
+        while end.offset_from_unsigned(beg) >= 128 {
+            let chunk_start = end.sub(128);
+
+            let v1 = lasx_xvld::<0>(chunk_start as *const _);
+            let v2 = lasx_xvld::<32>(chunk_start as *const _);
+            let v3 = lasx_xvld::<64>(chunk_start as *const _);
+            let v4 = lasx_xvld::<96>(chunk_start as *const _);
+
+            let mut sum = lasx_xvrepli_b(0);
+            sum = lasx_xvsub_b(sum, lasx_xvseq_b(v1, lf));
+            sum = lasx_xvsub_b(sum, lasx_xvseq_b(v2, lf));
+            sum = lasx_xvsub_b(sum, lasx_xvseq_b(v3, lf));
+            sum = lasx_xvsub_b(sum, lasx_xvseq_b(v4, lf));
+            let sum = horizontal_sum(sum);
+
+            let line_next = line - sum as CoordType;
+            if line_next <= line_stop {
+                break;
+            }
+
+            end = chunk_start;
+            line = line_next;
+        }
+
+        while end.offset_from_unsigned(beg) >= 32 {
+            let chunk_start = end.sub(32);
+            let v = lasx_xvld::<0>(chunk_start as *const _);
+            let c = lasx_xvseq_b(v, lf);
+
+            let ones = lasx_xvand_v(T(c), T(lasx_xvrepli_b(1)));
+            let sum = horizontal_sum(T(ones));
+
+            let line_next = line - sum as CoordType;
+            if line_next <= line_stop {
+                break;
+            }
+
+            end = chunk_start;
+            line = line_next;
+        }
+
+        lines_bwd_fallback(beg, end, line, line_stop)
+    }
+}
+
+#[cfg(target_arch = "loongarch64")]
+#[target_feature(enable = "lsx")]
+unsafe fn lines_bwd_lsx(
+    beg: *const u8,
+    mut end: *const u8,
+    mut line: CoordType,
+    line_stop: CoordType,
+) -> (*const u8, CoordType) {
+    unsafe {
+        use std::arch::loongarch64::*;
+        use std::mem::transmute as T;
+
+        #[inline(always)]
+        unsafe fn horizontal_sum(sum: v16i8) -> u32 {
+            unsafe {
+                let sum = lsx_vhaddw_h_b(sum, sum);
+                let sum = lsx_vhaddw_w_h(sum, sum);
+                let sum = lsx_vhaddw_d_w(sum, sum);
+                let sum = lsx_vhaddw_q_d(sum, sum);
+                lsx_vpickve2gr_wu::<0>(T(sum))
+            }
+        }
+
+        let lf = lsx_vrepli_b(b'\n' as i32);
+        let line_stop = line_stop.min(line);
+        let off = end.addr() & 15;
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
+        }
+
+        while end.offset_from_unsigned(beg) >= 64 {
+            let chunk_start = end.sub(64);
+
+            let v1 = lsx_vld::<0>(chunk_start as *const _);
+            let v2 = lsx_vld::<16>(chunk_start as *const _);
+            let v3 = lsx_vld::<32>(chunk_start as *const _);
+            let v4 = lsx_vld::<48>(chunk_start as *const _);
+
+            let mut sum = lsx_vrepli_b(0);
+            sum = lsx_vsub_b(sum, lsx_vseq_b(v1, lf));
+            sum = lsx_vsub_b(sum, lsx_vseq_b(v2, lf));
+            sum = lsx_vsub_b(sum, lsx_vseq_b(v3, lf));
+            sum = lsx_vsub_b(sum, lsx_vseq_b(v4, lf));
+            let sum = horizontal_sum(sum);
+
+            let line_next = line - sum as CoordType;
+            if line_next <= line_stop {
+                break;
+            }
+
+            end = chunk_start;
+            line = line_next;
+        }
+
+        while end.offset_from_unsigned(beg) >= 16 {
+            let chunk_start = end.sub(16);
+            let v = lsx_vld::<0>(chunk_start as *const _);
+            let c = lsx_vseq_b(v, lf);
+
+            let ones = lsx_vand_v(T(c), T(lsx_vrepli_b(1)));
+            let sum = horizontal_sum(T(ones));
+
+            let line_next = line - sum as CoordType;
+            if line_next <= line_stop {
+                break;
+            }
+
+            end = chunk_start;
+            line = line_next;
+        }
+
+        lines_bwd_fallback(beg, end, line, line_stop)
+    }
+}
+
 #[cfg(target_arch = "aarch64")]
 unsafe fn lines_bwd_neon(
     beg: *const u8,