With great interest I read the following blog post:
https://lpython.org/blog/2023/07/lpython-novel-fast-retargetable-python-compiler/
But to my surprise I discovered the C++ code in the benchmark of dijkstra_shortest_path is pessimized rather than optimized.
The usage of maps makes this Code much, much slower as it could.
On my system (some ancient i7 laptop), the following is more than 40 times faster (see below) than the original code.
#include <iostream>
#include <vector>
int32_t dijkstra_shortest_path(int32_t n, int32_t source) {
int32_t i, j, v, u, mindist, alt, dummy, uidx;
std::vector<int32_t> dist, prev;
std::vector<bool> visited;
std::vector<int32_t> Q, graph(n*n);
Q.reserve(n);
dist.reserve(n);
prev.reserve(n);
for(i = 0; i < n; i++) {
for(j = 0; j < n; j++) {
graph[n * i + j] = std::abs(i - j);
}
}
for(v = 0; v < n; v++) {
dist.push_back(2147483647);
prev.push_back(-1);
Q.push_back(v);
visited.push_back(false);
}
dist[source] = 0;
while(Q.size() > 0) {
u = -1;
mindist = 2147483647;
for(i = 0; i < Q.size(); i++) {
if( mindist > dist[Q[i]] ) {
mindist = dist[Q[i]];
u = Q[i];
uidx = i;
}
}
Q.erase(Q.begin() + uidx);
visited[u] = true;
for(v = 0; v < n; v++) {
if( v != u and not visited[v] ) {
alt = dist[u] + graph[n * u + v];
if( alt < dist[v] ) {
dist[v] = alt;
prev[v] = u;
}
}
}
}
int32_t dist_sum = 0;
for(i = 0; i < n; i++) {
dist_sum += dist[i];
}
return dist_sum;
}
int main() {
int32_t n = 4000;
int32_t dist_sum = dijkstra_shortest_path(n, 0);
std::cout<<dist_sum<<std::endl;
return 0;
}
For convenience, this is the diff:
--- a.cpp 2023-07-30 10:59:17.482487750 +0200
+++ f.cpp 2023-07-30 14:57:58.543699027 +0200
@@ -1,13 +1,15 @@
#include <iostream>
-#include <unordered_map>
#include <vector>
int32_t dijkstra_shortest_path(int32_t n, int32_t source) {
int32_t i, j, v, u, mindist, alt, dummy, uidx;
- std::unordered_map<int32_t, int32_t> graph, dist, prev;
- std::unordered_map<int32_t, bool> visited;
- std::vector<int32_t> Q;
-
+ std::vector<int32_t> dist, prev;
+ std::vector<bool> visited;
+ std::vector<int32_t> Q, graph(n*n);
+ Q.reserve(n);
+ dist.reserve(n);
+ prev.reserve(n);
+
for(i = 0; i < n; i++) {
for(j = 0; j < n; j++) {
graph[n * i + j] = std::abs(i - j);
@@ -15,10 +17,10 @@
}
for(v = 0; v < n; v++) {
- dist[v] = 2147483647;
- prev[v] = -1;
+ dist.push_back(2147483647);
+ prev.push_back(-1);
Q.push_back(v);
- visited[v] = false;
+ visited.push_back(false);
}
dist[source] = 0;
Furthermore, I implemented a rust version for comparison.
fn dijkstra_shortest_path(n: i32, source: i32) -> i32 {
let mut dist = Vec::with_capacity(n as usize);
let mut prev = Vec::with_capacity(n as usize);
let mut visited = Vec::with_capacity(n as usize);
let mut q = Vec::with_capacity((n * n) as usize);
let mut graph = vec![0; (n * n) as usize];
for i in 0..n {
for j in 0..n {
graph[(n * i + j) as usize] = (i - j).abs();
}
}
for v in 0..n {
dist.push(2147483647);
prev.push(-1);
q.push(v);
visited.push(false);
}
dist[source as usize] = 0;
let mut u;
let mut uidx = 0;
let mut mindist;
let mut alt;
while q.len() > 0 {
u = -1;
mindist = 2147483647;
for i in 0..q.len() {
let d = dist[q[i] as usize];
if mindist > d {
mindist = d;
u = q[i];
uidx = i;
}
}
q.remove(uidx);
visited[u as usize] = true;
for v in 0..n {
if v != u && !visited[v as usize] {
alt = dist[u as usize] + graph[(n * u + v) as usize];
if alt < dist[v as usize] {
dist[v as usize] = alt;
prev[v as usize] = u;
}
}
}
}
dist.iter().sum()
}
fn main() {
let n = 4000;
let dist_sum = dijkstra_shortest_path(n, 0);
println!("{}\n", dist_sum);
}
Benchmark
$ clang++ -std=c++20 -ffast-math -O3 -march=native old.cpp -o old
$ clang++ -std=c++20 -ffast-math -O3 -march=native better.cpp -o better
$ rustc main.rs -C opt-level=3 -o better-rust
$ hyperfine --warmup=20 --min-runs=10 ./old ./better ./better-rust
Benchmark 1: ./old
Time (mean ± σ): 1.690 s ± 0.112 s [User: 1.380 s, System: 0.299 s]
Range (min … max): 1.407 s … 1.790 s 10 runs
Benchmark 2: ./better
Time (mean ± σ): 40.3 ms ± 2.7 ms [User: 31.5 ms, System: 8.2 ms]
Range (min … max): 34.4 ms … 45.8 ms 71 runs
Benchmark 3: ./better-rust
Time (mean ± σ): 40.4 ms ± 1.7 ms [User: 32.8 ms, System: 7.5 ms]
Range (min … max): 37.7 ms … 44.2 ms 68 runs
Summary
./better ran
1.00 ± 0.08 times faster than ./better-rust
41.93 ± 3.97 times faster than ./old
(compilers used: rustc 1.71.0-nightly (2f2c438dc 2023-05-08) and clang version 15.0.7 on ArchLinux)
With great interest I read the following blog post:
https://lpython.org/blog/2023/07/lpython-novel-fast-retargetable-python-compiler/
But to my surprise I discovered the C++ code in the benchmark of
dijkstra_shortest_pathis pessimized rather than optimized.The usage of maps makes this Code much, much slower as it could.
On my system (some ancient i7 laptop), the following is more than 40 times faster (see below) than the original code.
For convenience, this is the diff:
Furthermore, I implemented a rust version for comparison.
Benchmark
(compilers used:
rustc 1.71.0-nightly (2f2c438dc 2023-05-08)andclang version 15.0.7on ArchLinux)