Merge pull request #158 from finnbear/wide_simd

svenstaro · web-flow · commit caabf5cbe9a4 · 2025-06-01T17:56:14.000+02:00
Replace manual SIMD with `wide`
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
-## 0.11.0 - 2025-??-??
+## 0.12.0 - 2025-??-??
+- Replace hand-written x86_64 SIMD with safe and portable [`wide`](https://crates.io/crates/wide) SIMD. [#158](https://github.com/svenstaro/bvh/pull/158) (thanks @finnbear)
+
+## 0.11.0 - 2025-02-18
 - **Breaking change:** BVH traversal now accepts a `Query: IntersectsAabb` rather than a `Ray`,
   allowing points, AABB's, and circles/spheres to be tested, too. Most use-cases involving `Ray` 
   will continue to compile as-is. If you previously wrote `BvhTraverseIterator<T, D, S>`, you'll
diff --git a/Cargo.toml b/Cargo.toml
@@ -22,6 +22,7 @@ serde = { optional = true, version = "1", features = ["derive"] }
 num = "0.4.3"
 nalgebra = { version = "0.33.0", features = ["default", "serde-serialize"] }
 rayon = { optional = true, version = "1.8.1" }
+wide = "0.7.32"
 
 [dev-dependencies]
 proptest = "1.0"
diff --git a/README.md b/README.md
@@ -75,7 +75,7 @@ let hit_sphere_aabbs = bvh.traverse(&ray, &spheres);
 
 ## Explicit SIMD
 
-This crate features some manually written SIMD instructions, currently only for the `x86_64` architecture.
+This crate features some SIMD operations. See [`wide`](https://crates.io/crates/wide) documentation for supported architectures.
 While nalgebra provides us with generic SIMD optimization (and it does a great job for the most part),
 some important functions, such as ray-AABB intersection, have been optimized by hand.
 
diff --git a/src/ray/intersect_default.rs b/src/ray/intersect_default.rs
@@ -36,7 +36,7 @@ impl<T: BHValue, const D: usize> RayIntersection<T, D> for Ray<T, D> {
     }
 }
 
-#[cfg(all(feature = "simd", target_arch = "x86_64"))]
+#[cfg(feature = "simd")]
 impl<T: BHValue, const D: usize> RayIntersection<T, D> for Ray<T, D> {
     default fn ray_intersects_aabb(&self, aabb: &Aabb<T, D>) -> bool {
         let lbr = (aabb[0].coords - self.origin.coords).component_mul(&self.inv_direction);
diff --git a/src/ray/intersect_simd.rs b/src/ray/intersect_simd.rs
@@ -0,0 +1,270 @@
+//! This file contains overrides for SIMD implementations of [`RayIntersection`]
+//! for any architectures supported by the `wide` crate.
+
+use nalgebra::SVector;
+use wide::*;
+
+use crate::{
+    aabb::Aabb,
+    utils::{fast_max, fast_min, has_nan},
+};
+
+use super::{intersect_default::RayIntersection, Ray};
+
+/// Conversion from a small nalgebra vector into a `wide` SIMD value.
+///
+/// Implemented in this file for `SVector<f32, D>` and `SVector<f64, D>` with
+/// `D` from 2 to 4, so the ray/AABB tests can process all components at once.
+trait ToRegisterType {
+    /// The SIMD type produced by [`ToRegisterType::to_register`].
+    type Register;
+
+    /// Packs the components of `self` into a [`ToRegisterType::Register`].
+    fn to_register(&self) -> Self::Register;
+}
+
+impl ToRegisterType for SVector<f32, 2> {
+    type Register = f32x4;
+
+    /// Packs `(x, y)` into four lanes as `[y, y, y, x]`.
+    ///
+    /// The two spare lanes repeat `y`, so they contribute no new values to
+    /// the lane-wise reductions later performed on the register.
+    #[inline(always)]
+    fn to_register(&self) -> Self::Register {
+        let (x, y) = (self.x, self.y);
+        f32x4::new([y, y, y, x])
+    }
+}
+
+impl ToRegisterType for SVector<f32, 3> {
+    type Register = f32x4;
+
+    /// Packs `(x, y, z)` into four lanes as `[z, z, y, x]`.
+    ///
+    /// The spare lane repeats `z`, so it contributes no new value to the
+    /// lane-wise reductions later performed on the register.
+    #[inline(always)]
+    fn to_register(&self) -> Self::Register {
+        let (x, y, z) = (self.x, self.y, self.z);
+        f32x4::new([z, z, y, x])
+    }
+}
+
+impl ToRegisterType for SVector<f32, 4> {
+    type Register = f32x4;
+
+    /// Packs `(x, y, z, w)` into four lanes in reverse order: `[w, z, y, x]`.
+    #[inline(always)]
+    fn to_register(&self) -> Self::Register {
+        let (x, y, z, w) = (self.x, self.y, self.z, self.w);
+        f32x4::new([w, z, y, x])
+    }
+}
+
+/// Horizontal maximum: reduces the four lanes of `v` to a single value by
+/// applying [`fast_max`] pairwise.
+#[inline(always)]
+fn max_elem_f32x4(v: f32x4) -> f32 {
+    let [l0, l1, l2, l3] = v.to_array();
+    let first_pair = fast_max(l0, l1);
+    let second_pair = fast_max(l2, l3);
+    fast_max(first_pair, second_pair)
+}
+
+/// Horizontal minimum: reduces the four lanes of `v` to a single value by
+/// applying [`fast_min`] pairwise.
+#[inline(always)]
+fn min_elem_f32x4(v: f32x4) -> f32 {
+    let [l0, l1, l2, l3] = v.to_array();
+    let first_pair = fast_min(l0, l1);
+    let second_pair = fast_min(l2, l3);
+    fast_min(first_pair, second_pair)
+}
+
+/// Checks the four lanes of `v` for NaN by delegating to [`has_nan`].
+#[inline(always)]
+fn has_nan_f32x4(v: f32x4) -> bool {
+    let lanes = v.to_array();
+    has_nan(&lanes)
+}
+
+/// Checks the two lanes of `v` for NaN by delegating to [`has_nan`].
+#[inline(always)]
+fn has_nan_f64x2(v: f64x2) -> bool {
+    let lanes = v.to_array();
+    has_nan(&lanes)
+}
+
+/// Checks the four lanes of `v` for NaN by delegating to [`has_nan`].
+#[inline(always)]
+fn has_nan_f64x4(v: f64x4) -> bool {
+    let lanes = v.to_array();
+    has_nan(&lanes)
+}
+
+/// Ray/AABB intersection test on packed `f32` lanes.
+///
+/// For every lane, computes the ray parameters at which the ray reaches the
+/// two AABB corners (`aabb_0`, `aabb_1`), using the ray's origin and its
+/// precomputed inverse direction. Returns `false` if any of those parameters
+/// is NaN. Otherwise the ray hits when the smallest per-lane upper bound is
+/// at least as large as the largest per-lane lower bound, clamped below at
+/// `0.0`.
+#[inline(always)]
+fn ray_intersects_aabb_f32x4(
+    ray_origin: f32x4,
+    ray_inv_dir: f32x4,
+    aabb_0: f32x4,
+    aabb_1: f32x4,
+) -> bool {
+    let t_first = (aabb_0 - ray_origin) * ray_inv_dir;
+    let t_second = (aabb_1 - ray_origin) * ray_inv_dir;
+
+    // `|` rather than `||`: both NaN checks are always evaluated.
+    let any_nan = has_nan_f32x4(t_first) | has_nan_f32x4(t_second);
+    if any_nan {
+        return false;
+    }
+
+    // Per-lane lower and upper bounds of the parameter interval.
+    let lower = t_first.fast_min(t_second);
+    let upper = t_first.fast_max(t_second);
+
+    let t_enter = max_elem_f32x4(lower);
+    let t_exit = min_elem_f32x4(upper);
+
+    let clamped_enter = fast_max(t_enter, 0.0);
+    t_exit >= clamped_enter
+}
+
+impl RayIntersection<f32, 2> for Ray<f32, 2> {
+    /// SIMD override: packs the ray origin, inverse direction and both AABB
+    /// corners into `f32x4` registers and runs the packed intersection test.
+    #[inline(always)]
+    fn ray_intersects_aabb(&self, aabb: &Aabb<f32, 2>) -> bool {
+        ray_intersects_aabb_f32x4(
+            self.origin.coords.to_register(),
+            self.inv_direction.to_register(),
+            aabb[0].coords.to_register(),
+            aabb[1].coords.to_register(),
+        )
+    }
+}
+
+impl RayIntersection<f32, 3> for Ray<f32, 3> {
+    /// SIMD override: packs the ray origin, inverse direction and both AABB
+    /// corners into `f32x4` registers and runs the packed intersection test.
+    #[inline(always)]
+    fn ray_intersects_aabb(&self, aabb: &Aabb<f32, 3>) -> bool {
+        ray_intersects_aabb_f32x4(
+            self.origin.coords.to_register(),
+            self.inv_direction.to_register(),
+            aabb[0].coords.to_register(),
+            aabb[1].coords.to_register(),
+        )
+    }
+}
+
+impl RayIntersection<f32, 4> for Ray<f32, 4> {
+    /// SIMD override: packs the ray origin, inverse direction and both AABB
+    /// corners into `f32x4` registers and runs the packed intersection test.
+    #[inline(always)]
+    fn ray_intersects_aabb(&self, aabb: &Aabb<f32, 4>) -> bool {
+        ray_intersects_aabb_f32x4(
+            self.origin.coords.to_register(),
+            self.inv_direction.to_register(),
+            aabb[0].coords.to_register(),
+            aabb[1].coords.to_register(),
+        )
+    }
+}
+
+impl ToRegisterType for SVector<f64, 2> {
+    type Register = f64x2;
+
+    /// Packs `(x, y)` into two lanes in reverse order: `[y, x]`.
+    #[inline(always)]
+    fn to_register(&self) -> Self::Register {
+        let (x, y) = (self.x, self.y);
+        f64x2::new([y, x])
+    }
+}
+
+/// Horizontal maximum: reduces the two lanes of `v` to a single value with
+/// [`fast_max`].
+#[inline(always)]
+fn max_elem_f64x2(v: f64x2) -> f64 {
+    let [l0, l1] = v.to_array();
+    fast_max(l0, l1)
+}
+
+/// Horizontal minimum: reduces the two lanes of `v` to a single value with
+/// [`fast_min`].
+#[inline(always)]
+fn min_elem_f64x2(v: f64x2) -> f64 {
+    let [l0, l1] = v.to_array();
+    fast_min(l0, l1)
+}
+
+/// Ray/AABB intersection test on two packed `f64` lanes.
+///
+/// For every lane, computes the ray parameters at which the ray reaches the
+/// two AABB corners (`aabb_0`, `aabb_1`), using the ray's origin and its
+/// precomputed inverse direction. Returns `false` if any of those parameters
+/// is NaN. Otherwise the ray hits when the smallest per-lane upper bound is
+/// at least as large as the largest per-lane lower bound, clamped below at
+/// `0.0`.
+#[inline(always)]
+fn ray_intersects_aabb_f64x2(
+    ray_origin: f64x2,
+    ray_inv_dir: f64x2,
+    aabb_0: f64x2,
+    aabb_1: f64x2,
+) -> bool {
+    let t_first = (aabb_0 - ray_origin) * ray_inv_dir;
+    let t_second = (aabb_1 - ray_origin) * ray_inv_dir;
+
+    // `|` rather than `||`: both NaN checks are always evaluated.
+    let any_nan = has_nan_f64x2(t_first) | has_nan_f64x2(t_second);
+    if any_nan {
+        return false;
+    }
+
+    // Per-lane lower and upper bounds of the parameter interval.
+    let lower = t_first.min(t_second);
+    let upper = t_first.max(t_second);
+
+    let t_enter = max_elem_f64x2(lower);
+    let t_exit = min_elem_f64x2(upper);
+
+    let clamped_enter = fast_max(t_enter, 0.0);
+    t_exit >= clamped_enter
+}
+
+impl RayIntersection<f64, 2> for Ray<f64, 2> {
+    /// SIMD override: packs the ray origin, inverse direction and both AABB
+    /// corners into `f64x2` registers and runs the packed intersection test.
+    #[inline(always)]
+    fn ray_intersects_aabb(&self, aabb: &Aabb<f64, 2>) -> bool {
+        ray_intersects_aabb_f64x2(
+            self.origin.coords.to_register(),
+            self.inv_direction.to_register(),
+            aabb[0].coords.to_register(),
+            aabb[1].coords.to_register(),
+        )
+    }
+}
+
+impl ToRegisterType for SVector<f64, 3> {
+    type Register = f64x4;
+
+    /// Packs `(x, y, z)` into four lanes as `[z, z, y, x]`.
+    ///
+    /// The spare lane repeats `z`, so it contributes no new value to the
+    /// lane-wise reductions later performed on the register.
+    #[inline(always)]
+    fn to_register(&self) -> Self::Register {
+        let (x, y, z) = (self.x, self.y, self.z);
+        f64x4::new([z, z, y, x])
+    }
+}
+
+impl ToRegisterType for SVector<f64, 4> {
+    type Register = f64x4;
+
+    /// Packs `(x, y, z, w)` into four lanes in reverse order: `[w, z, y, x]`.
+    #[inline(always)]
+    fn to_register(&self) -> Self::Register {
+        let (x, y, z, w) = (self.x, self.y, self.z, self.w);
+        f64x4::new([w, z, y, x])
+    }
+}
+
+/// Horizontal maximum: reduces the four lanes of `v` to a single value by
+/// applying [`fast_max`] pairwise.
+#[inline(always)]
+fn max_elem_f64x4(v: f64x4) -> f64 {
+    let [l0, l1, l2, l3] = v.to_array();
+    let first_pair = fast_max(l0, l1);
+    let second_pair = fast_max(l2, l3);
+    fast_max(first_pair, second_pair)
+}
+
+/// Horizontal minimum: reduces the four lanes of `v` to a single value by
+/// applying [`fast_min`] pairwise.
+#[inline(always)]
+fn min_elem_f64x4(v: f64x4) -> f64 {
+    let [l0, l1, l2, l3] = v.to_array();
+    let first_pair = fast_min(l0, l1);
+    let second_pair = fast_min(l2, l3);
+    fast_min(first_pair, second_pair)
+}
+
+/// Ray/AABB intersection test on four packed `f64` lanes.
+///
+/// For every lane, computes the ray parameters at which the ray reaches the
+/// two AABB corners (`aabb_0`, `aabb_1`), using the ray's origin and its
+/// precomputed inverse direction. Returns `false` if any of those parameters
+/// is NaN. Otherwise the ray hits when the smallest per-lane upper bound is
+/// at least as large as the largest per-lane lower bound, clamped below at
+/// `0.0`.
+#[inline(always)]
+fn ray_intersects_aabb_f64x4(
+    ray_origin: f64x4,
+    ray_inv_dir: f64x4,
+    aabb_0: f64x4,
+    aabb_1: f64x4,
+) -> bool {
+    let t_first = (aabb_0 - ray_origin) * ray_inv_dir;
+    let t_second = (aabb_1 - ray_origin) * ray_inv_dir;
+
+    // `|` rather than `||`: both NaN checks are always evaluated.
+    let any_nan = has_nan_f64x4(t_first) | has_nan_f64x4(t_second);
+    if any_nan {
+        return false;
+    }
+
+    // Per-lane lower and upper bounds of the parameter interval.
+    let lower = t_first.min(t_second);
+    let upper = t_first.max(t_second);
+
+    let t_enter = max_elem_f64x4(lower);
+    let t_exit = min_elem_f64x4(upper);
+
+    let clamped_enter = fast_max(t_enter, 0.0);
+    t_exit >= clamped_enter
+}
+
+impl RayIntersection<f64, 3> for Ray<f64, 3> {
+    /// SIMD override: packs the ray origin, inverse direction and both AABB
+    /// corners into `f64x4` registers and runs the packed intersection test.
+    #[inline(always)]
+    fn ray_intersects_aabb(&self, aabb: &Aabb<f64, 3>) -> bool {
+        ray_intersects_aabb_f64x4(
+            self.origin.coords.to_register(),
+            self.inv_direction.to_register(),
+            aabb[0].coords.to_register(),
+            aabb[1].coords.to_register(),
+        )
+    }
+}
+
+impl RayIntersection<f64, 4> for Ray<f64, 4> {
+    /// SIMD override: packs the ray origin, inverse direction and both AABB
+    /// corners into `f64x4` registers and runs the packed intersection test.
+    #[inline(always)]
+    fn ray_intersects_aabb(&self, aabb: &Aabb<f64, 4>) -> bool {
+        ray_intersects_aabb_f64x4(
+            self.origin.coords.to_register(),
+            self.inv_direction.to_register(),
+            aabb[0].coords.to_register(),
+            aabb[1].coords.to_register(),
+        )
+    }
+}
diff --git a/src/ray/intersect_x86_64.rs b/src/ray/intersect_x86_64.rs
diff --git a/src/ray/mod.rs b/src/ray/mod.rs

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ impl<T: BHValue, const D: usize> RayIntersection<T, D> for Ray<T, D> {`
`36`	`36`	`}`
`37`	`37`	`}`
`38`	`38`
`39`		`-#[cfg(all(feature = "simd", target_arch = "x86_64"))]`
	`39`	`+#[cfg(feature = "simd")]`
`40`	`40`	`impl<T: BHValue, const D: usize> RayIntersection<T, D> for Ray<T, D> {`
`41`	`41`	`default fn ray_intersects_aabb(&self, aabb: &Aabb<T, D>) -> bool {`
`42`	`42`	`let lbr = (aabb[0].coords - self.origin.coords).component_mul(&self.inv_direction);`