Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
maybe_uninit_slice,
maybe_uninit_uninit_array_transpose
)]
#![cfg_attr(
target_arch = "loongarch64",
feature(stdarch_loongarch, stdarch_loongarch_feature_detection, loongarch_target_feature),
allow(clippy::incompatible_msrv)
)]
#![allow(clippy::missing_transmute_annotations, clippy::new_without_default, stable_features)]

#[macro_use]
Expand Down
176 changes: 174 additions & 2 deletions src/simd/lines_bwd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ unsafe fn lines_bwd_raw(
line: CoordType,
line_stop: CoordType,
) -> (*const u8, CoordType) {
#[cfg(target_arch = "x86_64")]
#[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))]
return unsafe { LINES_BWD_DISPATCH(beg, end, line, line_stop) };

#[cfg(target_arch = "aarch64")]
Expand Down Expand Up @@ -65,7 +65,7 @@ unsafe fn lines_bwd_fallback(
}
}

#[cfg(target_arch = "x86_64")]
#[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))]
static mut LINES_BWD_DISPATCH: unsafe fn(
beg: *const u8,
end: *const u8,
Expand Down Expand Up @@ -162,6 +162,178 @@ unsafe fn lines_bwd_avx2(
}
}

#[cfg(target_arch = "loongarch64")]
unsafe fn lines_bwd_dispatch(
beg: *const u8,
end: *const u8,
line: CoordType,
line_stop: CoordType,
) -> (*const u8, CoordType) {
use std::arch::is_loongarch_feature_detected;

let func = if is_loongarch_feature_detected!("lasx") {
lines_bwd_lasx
} else if is_loongarch_feature_detected!("lsx") {
lines_bwd_lsx
Comment on lines +174 to +177
Copy link
Member

@lhecker lhecker Jul 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there value in supporting both? AVX2 is supported by most x64 devices, so that's the only thing we support. I'd prefer if we could do the same with LSX and only support LASX. This would reduce our long-term maintenance burden. But I don't know how widespread LASX support is...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LASX is not always available alongside LSX. For example, both LSX and LASX are present on the LA464 and LA664 cores, but the LA364E only supports LSX and lacks LASX. I'm happy to help maintain and keep this part in good shape.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, in that case... 😄

} else {
lines_bwd_fallback
};
unsafe { LINES_BWD_DISPATCH = func };
unsafe { func(beg, end, line, line_stop) }
}

#[cfg(target_arch = "loongarch64")]
#[target_feature(enable = "lasx")]
unsafe fn lines_bwd_lasx(
beg: *const u8,
mut end: *const u8,
mut line: CoordType,
line_stop: CoordType,
) -> (*const u8, CoordType) {
unsafe {
use std::arch::loongarch64::*;
use std::mem::transmute as T;

#[inline(always)]
unsafe fn horizontal_sum(sum: v32i8) -> u32 {
unsafe {
let sum = lasx_xvhaddw_h_b(sum, sum);
let sum = lasx_xvhaddw_w_h(sum, sum);
let sum = lasx_xvhaddw_d_w(sum, sum);
let sum = lasx_xvhaddw_q_d(sum, sum);
let tmp = lasx_xvpermi_q::<1>(T(sum), T(sum));
let sum = lasx_xvadd_w(T(sum), T(tmp));
lasx_xvpickve2gr_wu::<0>(sum)
}
}

let lf = lasx_xvrepli_b(b'\n' as i32);
let line_stop = line_stop.min(line);
let off = end.addr() & 31;
if off != 0 && off < end.offset_from_unsigned(beg) {
(end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
}

while end.offset_from_unsigned(beg) >= 128 {
let chunk_start = end.sub(128);

let v1 = lasx_xvld::<0>(chunk_start as *const _);
let v2 = lasx_xvld::<32>(chunk_start as *const _);
let v3 = lasx_xvld::<64>(chunk_start as *const _);
let v4 = lasx_xvld::<96>(chunk_start as *const _);

let mut sum = lasx_xvrepli_b(0);
sum = lasx_xvsub_b(sum, lasx_xvseq_b(v1, lf));
sum = lasx_xvsub_b(sum, lasx_xvseq_b(v2, lf));
sum = lasx_xvsub_b(sum, lasx_xvseq_b(v3, lf));
sum = lasx_xvsub_b(sum, lasx_xvseq_b(v4, lf));
let sum = horizontal_sum(sum);

let line_next = line - sum as CoordType;
if line_next <= line_stop {
break;
}

end = chunk_start;
line = line_next;
}

while end.offset_from_unsigned(beg) >= 32 {
let chunk_start = end.sub(32);
let v = lasx_xvld::<0>(chunk_start as *const _);
let c = lasx_xvseq_b(v, lf);

let ones = lasx_xvand_v(T(c), T(lasx_xvrepli_b(1)));
let sum = horizontal_sum(T(ones));

let line_next = line - sum as CoordType;
if line_next <= line_stop {
break;
}

end = chunk_start;
line = line_next;
}

lines_bwd_fallback(beg, end, line, line_stop)
}
}

#[cfg(target_arch = "loongarch64")]
#[target_feature(enable = "lsx")]
unsafe fn lines_bwd_lsx(
beg: *const u8,
mut end: *const u8,
mut line: CoordType,
line_stop: CoordType,
) -> (*const u8, CoordType) {
unsafe {
use std::arch::loongarch64::*;
use std::mem::transmute as T;

#[inline(always)]
unsafe fn horizontal_sum(sum: v16i8) -> u32 {
unsafe {
let sum = lsx_vhaddw_h_b(sum, sum);
let sum = lsx_vhaddw_w_h(sum, sum);
let sum = lsx_vhaddw_d_w(sum, sum);
let sum = lsx_vhaddw_q_d(sum, sum);
lsx_vpickve2gr_wu::<0>(T(sum))
}
}

let lf = lsx_vrepli_b(b'\n' as i32);
let line_stop = line_stop.min(line);
let off = end.addr() & 15;
if off != 0 && off < end.offset_from_unsigned(beg) {
(end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
}

while end.offset_from_unsigned(beg) >= 64 {
let chunk_start = end.sub(64);

let v1 = lsx_vld::<0>(chunk_start as *const _);
let v2 = lsx_vld::<16>(chunk_start as *const _);
let v3 = lsx_vld::<32>(chunk_start as *const _);
let v4 = lsx_vld::<48>(chunk_start as *const _);

let mut sum = lsx_vrepli_b(0);
sum = lsx_vsub_b(sum, lsx_vseq_b(v1, lf));
sum = lsx_vsub_b(sum, lsx_vseq_b(v2, lf));
sum = lsx_vsub_b(sum, lsx_vseq_b(v3, lf));
sum = lsx_vsub_b(sum, lsx_vseq_b(v4, lf));
let sum = horizontal_sum(sum);

let line_next = line - sum as CoordType;
if line_next <= line_stop {
break;
}

end = chunk_start;
line = line_next;
}

while end.offset_from_unsigned(beg) >= 16 {
let chunk_start = end.sub(16);
let v = lsx_vld::<0>(chunk_start as *const _);
let c = lsx_vseq_b(v, lf);

let ones = lsx_vand_v(T(c), T(lsx_vrepli_b(1)));
let sum = horizontal_sum(T(ones));

let line_next = line - sum as CoordType;
if line_next <= line_stop {
break;
}

end = chunk_start;
line = line_next;
}

lines_bwd_fallback(beg, end, line, line_stop)
}
}

#[cfg(target_arch = "aarch64")]
unsafe fn lines_bwd_neon(
beg: *const u8,
Expand Down
Loading