Skip to content

Commit caabf5c

Browse files
authored
Merge pull request #158 from finnbear/wide_simd
Replace manual SIMD with `wide`
2 parents 8eb206b + ea9ae92 commit caabf5c

File tree

7 files changed

+279
-339
lines changed

7 files changed

+279
-339
lines changed

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](http://keepachangelog.com/)
66
and this project adheres to [Semantic Versioning](http://semver.org/).
77

8-
## 0.11.0 - 2025-??-??
8+
## 0.12.0 - 2025-??-??
9+
- Replace hand-written x86_64 SIMD with safe and portable [`wide`](https://crates.io/crates/wide) SIMD. [#158](https://github.com/svenstaro/bvh/pull/158) (thanks @finnbear)
10+
11+
## 0.11.0 - 2025-02-18
912
- **Breaking change:** BVH traversal now accepts a `Query: IntersectsAabb` rather than a `Ray`,
1013
allowing points, AABB's, and circles/spheres to be tested, too. Most use-cases involving `Ray`
1114
will continue to compile as-is. If you previously wrote `BvhTraverseIterator<T, D, S>`, you'll

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ serde = { optional = true, version = "1", features = ["derive"] }
2222
num = "0.4.3"
2323
nalgebra = { version = "0.33.0", features = ["default", "serde-serialize"] }
2424
rayon = { optional = true, version = "1.8.1" }
25+
wide = "0.7.32"
2526

2627
[dev-dependencies]
2728
proptest = "1.0"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ let hit_sphere_aabbs = bvh.traverse(&ray, &spheres);
7575

7676
## Explicit SIMD
7777

78-
This crate features some manually written SIMD instructions, currently only for the `x86_64` architecture.
78+
This crate features some SIMD operations. See [`wide`](https://crates.io/crates/wide) documentation for supported architectures.
7979
While nalgebra provides us with generic SIMD optimization (and it does a great job for the most part) -
8080
some important functions, such as ray-aabb-intersection have been optimized by hand.
8181

src/ray/intersect_default.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ impl<T: BHValue, const D: usize> RayIntersection<T, D> for Ray<T, D> {
3636
}
3737
}
3838

39-
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
39+
#[cfg(feature = "simd")]
4040
impl<T: BHValue, const D: usize> RayIntersection<T, D> for Ray<T, D> {
4141
default fn ray_intersects_aabb(&self, aabb: &Aabb<T, D>) -> bool {
4242
let lbr = (aabb[0].coords - self.origin.coords).component_mul(&self.inv_direction);

src/ray/intersect_simd.rs

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
//! This file contains overrides for SIMD implementations of [`RayIntersection`]
2+
//! for any architectures supported by the `wide` crate.
3+
4+
use nalgebra::SVector;
5+
use wide::*;
6+
7+
use crate::{
8+
aabb::Aabb,
9+
utils::{fast_max, fast_min, has_nan},
10+
};
11+
12+
use super::{intersect_default::RayIntersection, Ray};
13+
14+
/// Conversion of a value into a SIMD register type.
///
/// Implemented in this file for `nalgebra` `SVector`s so that their components
/// can be packed into the matching `wide` vector.
trait ToRegisterType {
    /// SIMD register type produced by [`ToRegisterType::to_register`].
    type Register;

    /// Packs `self` into a [`Self::Register`].
    fn to_register(&self) -> Self::Register;
}
19+
20+
impl ToRegisterType for SVector<f32, 2> {
21+
type Register = f32x4;
22+
23+
#[inline(always)]
24+
fn to_register(&self) -> Self::Register {
25+
f32x4::new([self.y, self.y, self.y, self.x])
26+
}
27+
}
28+
29+
impl ToRegisterType for SVector<f32, 3> {
30+
type Register = f32x4;
31+
32+
#[inline(always)]
33+
fn to_register(&self) -> Self::Register {
34+
f32x4::new([self.z, self.z, self.y, self.x])
35+
}
36+
}
37+
38+
impl ToRegisterType for SVector<f32, 4> {
39+
type Register = f32x4;
40+
41+
#[inline(always)]
42+
fn to_register(&self) -> Self::Register {
43+
f32x4::new([self.w, self.z, self.y, self.x])
44+
}
45+
}
46+
47+
/// Compute the horizontal maximum of the SIMD vector
48+
#[inline(always)]
49+
fn max_elem_f32x4(v: f32x4) -> f32 {
50+
let a = v.to_array();
51+
fast_max(fast_max(a[0], a[1]), fast_max(a[2], a[3]))
52+
}
53+
54+
/// Compute the horizontal minimum of the SIMD vector
55+
#[inline(always)]
56+
fn min_elem_f32x4(v: f32x4) -> f32 {
57+
let a = v.to_array();
58+
fast_min(fast_min(a[0], a[1]), fast_min(a[2], a[3]))
59+
}
60+
61+
#[inline(always)]
62+
fn has_nan_f32x4(v: f32x4) -> bool {
63+
has_nan(&v.to_array())
64+
}
65+
66+
#[inline(always)]
67+
fn has_nan_f64x2(v: f64x2) -> bool {
68+
has_nan(&v.to_array())
69+
}
70+
71+
#[inline(always)]
72+
fn has_nan_f64x4(v: f64x4) -> bool {
73+
has_nan(&v.to_array())
74+
}
75+
76+
#[inline(always)]
77+
fn ray_intersects_aabb_f32x4(
78+
ray_origin: f32x4,
79+
ray_inv_dir: f32x4,
80+
aabb_0: f32x4,
81+
aabb_1: f32x4,
82+
) -> bool {
83+
let v1 = (aabb_0 - ray_origin) * ray_inv_dir;
84+
let v2 = (aabb_1 - ray_origin) * ray_inv_dir;
85+
86+
if has_nan_f32x4(v1) | has_nan_f32x4(v2) {
87+
return false;
88+
}
89+
90+
let inf = v1.fast_min(v2);
91+
let sup = v1.fast_max(v2);
92+
93+
let tmin = max_elem_f32x4(inf);
94+
let tmax = min_elem_f32x4(sup);
95+
96+
tmax >= fast_max(tmin, 0.0)
97+
}
98+
99+
impl RayIntersection<f32, 2> for Ray<f32, 2> {
100+
#[inline(always)]
101+
fn ray_intersects_aabb(&self, aabb: &Aabb<f32, 2>) -> bool {
102+
let ro = self.origin.coords.to_register();
103+
let ri = self.inv_direction.to_register();
104+
let aabb_0 = aabb[0].coords.to_register();
105+
let aabb_1 = aabb[1].coords.to_register();
106+
107+
ray_intersects_aabb_f32x4(ro, ri, aabb_0, aabb_1)
108+
}
109+
}
110+
111+
impl RayIntersection<f32, 3> for Ray<f32, 3> {
112+
#[inline(always)]
113+
fn ray_intersects_aabb(&self, aabb: &Aabb<f32, 3>) -> bool {
114+
let ro = self.origin.coords.to_register();
115+
let ri = self.inv_direction.to_register();
116+
let aabb_0 = aabb[0].coords.to_register();
117+
let aabb_1 = aabb[1].coords.to_register();
118+
119+
ray_intersects_aabb_f32x4(ro, ri, aabb_0, aabb_1)
120+
}
121+
}
122+
123+
impl RayIntersection<f32, 4> for Ray<f32, 4> {
124+
#[inline(always)]
125+
fn ray_intersects_aabb(&self, aabb: &Aabb<f32, 4>) -> bool {
126+
let ro = self.origin.coords.to_register();
127+
let ri = self.inv_direction.to_register();
128+
let aabb_0 = aabb[0].coords.to_register();
129+
let aabb_1 = aabb[1].coords.to_register();
130+
131+
ray_intersects_aabb_f32x4(ro, ri, aabb_0, aabb_1)
132+
}
133+
}
134+
135+
impl ToRegisterType for SVector<f64, 2> {
136+
type Register = f64x2;
137+
138+
#[inline(always)]
139+
fn to_register(&self) -> Self::Register {
140+
f64x2::new([self.y, self.x])
141+
}
142+
}
143+
144+
/// Compute the horizontal maximum of the SIMD vector
145+
#[inline(always)]
146+
fn max_elem_f64x2(v: f64x2) -> f64 {
147+
let a = v.to_array();
148+
fast_max(a[0], a[1])
149+
}
150+
151+
/// Compute the horizontal minimum of the SIMD vector
152+
#[inline(always)]
153+
fn min_elem_f64x2(v: f64x2) -> f64 {
154+
let a = v.to_array();
155+
fast_min(a[0], a[1])
156+
}
157+
158+
#[inline(always)]
159+
fn ray_intersects_aabb_f64x2(
160+
ray_origin: f64x2,
161+
ray_inv_dir: f64x2,
162+
aabb_0: f64x2,
163+
aabb_1: f64x2,
164+
) -> bool {
165+
let v1 = (aabb_0 - ray_origin) * ray_inv_dir;
166+
let v2 = (aabb_1 - ray_origin) * ray_inv_dir;
167+
168+
if has_nan_f64x2(v1) | has_nan_f64x2(v2) {
169+
return false;
170+
}
171+
172+
let inf = v1.min(v2);
173+
let sup = v1.max(v2);
174+
175+
let tmin = max_elem_f64x2(inf);
176+
let tmax = min_elem_f64x2(sup);
177+
178+
tmax >= fast_max(tmin, 0.0)
179+
}
180+
181+
impl RayIntersection<f64, 2> for Ray<f64, 2> {
182+
#[inline(always)]
183+
fn ray_intersects_aabb(&self, aabb: &Aabb<f64, 2>) -> bool {
184+
let ro = self.origin.coords.to_register();
185+
let ri = self.inv_direction.to_register();
186+
let aabb_0 = aabb[0].coords.to_register();
187+
let aabb_1 = aabb[1].coords.to_register();
188+
189+
ray_intersects_aabb_f64x2(ro, ri, aabb_0, aabb_1)
190+
}
191+
}
192+
193+
impl ToRegisterType for SVector<f64, 3> {
194+
type Register = f64x4;
195+
196+
#[inline(always)]
197+
fn to_register(&self) -> Self::Register {
198+
f64x4::new([self.z, self.z, self.y, self.x])
199+
}
200+
}
201+
202+
impl ToRegisterType for SVector<f64, 4> {
203+
type Register = f64x4;
204+
205+
#[inline(always)]
206+
fn to_register(&self) -> Self::Register {
207+
f64x4::new([self.w, self.z, self.y, self.x])
208+
}
209+
}
210+
211+
/// Compute the horizontal maximum of the SIMD vector
212+
#[inline(always)]
213+
fn max_elem_f64x4(v: f64x4) -> f64 {
214+
let a = v.to_array();
215+
fast_max(fast_max(a[0], a[1]), fast_max(a[2], a[3]))
216+
}
217+
218+
/// Compute the horizontal minimum of the SIMD vector
219+
#[inline(always)]
220+
fn min_elem_f64x4(v: f64x4) -> f64 {
221+
let a = v.to_array();
222+
fast_min(fast_min(a[0], a[1]), fast_min(a[2], a[3]))
223+
}
224+
225+
#[inline(always)]
226+
fn ray_intersects_aabb_f64x4(
227+
ray_origin: f64x4,
228+
ray_inv_dir: f64x4,
229+
aabb_0: f64x4,
230+
aabb_1: f64x4,
231+
) -> bool {
232+
let v1 = (aabb_0 - ray_origin) * ray_inv_dir;
233+
let v2 = (aabb_1 - ray_origin) * ray_inv_dir;
234+
235+
if has_nan_f64x4(v1) | has_nan_f64x4(v2) {
236+
return false;
237+
}
238+
239+
let inf = v1.min(v2);
240+
let sup = v1.max(v2);
241+
242+
let tmin = max_elem_f64x4(inf);
243+
let tmax = min_elem_f64x4(sup);
244+
245+
tmax >= fast_max(tmin, 0.0)
246+
}
247+
248+
impl RayIntersection<f64, 3> for Ray<f64, 3> {
249+
#[inline(always)]
250+
fn ray_intersects_aabb(&self, aabb: &Aabb<f64, 3>) -> bool {
251+
let ro = self.origin.coords.to_register();
252+
let ri = self.inv_direction.to_register();
253+
let aabb_0 = aabb[0].coords.to_register();
254+
let aabb_1 = aabb[1].coords.to_register();
255+
256+
ray_intersects_aabb_f64x4(ro, ri, aabb_0, aabb_1)
257+
}
258+
}
259+
260+
impl RayIntersection<f64, 4> for Ray<f64, 4> {
261+
#[inline(always)]
262+
fn ray_intersects_aabb(&self, aabb: &Aabb<f64, 4>) -> bool {
263+
let ro = self.origin.coords.to_register();
264+
let ri = self.inv_direction.to_register();
265+
let aabb_0 = aabb[0].coords.to_register();
266+
let aabb_1 = aabb[1].coords.to_register();
267+
268+
ray_intersects_aabb_f64x4(ro, ri, aabb_0, aabb_1)
269+
}
270+
}

0 commit comments

Comments
 (0)