diff --git a/Cargo.toml b/Cargo.toml index b8b169d..8553c6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,7 @@ multiple_inherent_impl = "allow" upper_case_acronyms = "allow" struct_field_names = "allow" self_named_module_files = "allow" +mod_module_files = "allow" # I just personally prefer the `match` syntax for if-let matching. option_if_let_else = "allow" diff --git a/crates/total-viewsheds/Cargo.toml b/crates/total-viewsheds/Cargo.toml index d329087..599edad 100644 --- a/crates/total-viewsheds/Cargo.toml +++ b/crates/total-viewsheds/Cargo.toml @@ -2,7 +2,8 @@ name = "total-viewsheds" version = "0.1.0" edition = "2021" -rust-version = "1.88" +rust-version = "1.89" + [dependencies] bytemuck = "1.22.0" @@ -29,5 +30,3 @@ googletest = "0.14.2" [lints] workspace = true - - diff --git a/crates/total-viewsheds/src/compute.rs b/crates/total-viewsheds/src/compute.rs index accbaf7..d5ac19a 100644 --- a/crates/total-viewsheds/src/compute.rs +++ b/crates/total-viewsheds/src/compute.rs @@ -1,6 +1,12 @@ //! The main entrypoint for running computations. +use crate::los_pack::LineOfSightPacked; use color_eyre::Result; +use rayon::iter::IntoParallelIterator as _; +use rayon::iter::ParallelIterator as _; +use rayon::ThreadPoolBuilder; +use std::sync::Mutex; +use std::time::Instant; /// The number of angles we rotate through. The other half are done via "backwards" lines of sight. pub const SECTOR_STEPS: u16 = 180; @@ -27,10 +33,6 @@ pub struct Compute<'compute> { pub longest_lines: Vec, } -/// `NUM_CORES` is the physical number of cores on a machine. Currently hardcoded to 8 -/// as that is what an i9900k has, and is a common configuration. -/// TODO find a good syscall for this -const NUM_CORES: usize = 8; /// Configuration for computing. pub struct ComputeConfig { /// The height of the observer that views viewsheds. @@ -94,10 +96,6 @@ impl<'compute> Compute<'compute> { // We only need the "chocolate box" section of rotations to do visibility calculations. let rotations_size = kernel::chocolate_box::size(dem.width, dem.tvs_width); - #[expect( - clippy::if_then_some_else_none, - reason = "The `?` is hard to use in the closure" - )] let vulkan = if matches!(config.backend, crate::config::Backend::Vulkan) { let elevations = dem.elevations.clone(); dem.elevations = Vec::new(); // Free up some RAM. @@ -259,32 +257,69 @@ impl<'compute> Compute<'compute> { /// `run_parallel` runs the CPU kernel in parallel fn run_parallel(&mut self) -> Result<()> { - #[expect( - clippy::as_conversions, - clippy::cast_possible_truncation, - reason = "elevations start out as i16s, and i16 -> f32 -> i16 is lossless" - )] - let elevations = self - .dem - .elevations - .iter() - .map(|&x| x as i16) - .collect::>(); - - #[expect(clippy::as_conversions, reason = "u32 -> usize is valid")] - // TODO: third param is ring data which needs to be saved - let (surfaces, _longest, _) = crate::cpu::multithreaded_kernel( - &elevations, - self.dem.max_los_as_points as usize, - 360, - NUM_CORES, - false, - ); + let max_los = usize::try_from(self.dem.max_los_as_points)?; + let mut surfaces = vec![0.0f32; max_los * max_los]; + let mut longest = vec![(0u16, 0.0f32); max_los * max_los]; + + let pool = ThreadPoolBuilder::new().num_threads(8).build()?; - self.add_sector_surfaces_to_running_total(&surfaces); + { + let angle_mu = &Mutex::new(&mut surfaces); + let longest_mu = &Mutex::new(&mut longest); + + let elevations = &self.dem.elevations; + + pool.install(move || { + (0u16..360u16) + .into_par_iter() + .map(|angle| { + let start = Instant::now(); + tracing::info!("starting angle: {angle}"); + let (heatmap, long, _) = + crate::cpu::kernel(elevations, max_los, f32::from(angle), false); + tracing::info!("finished angle in {:?}", start.elapsed()); + (angle, heatmap, long) + }) + .for_each(|(angle, heatmap, long)| { + #[expect(clippy::expect_used, reason = "a poisoned mutex should crash")] + angle_mu + .lock() + .expect("mutex poisoned") + .iter_mut() + .zip(heatmap) + .for_each(|(to, from)| { + *to += from; + }); + + #[expect(clippy::expect_used, reason = "a poisoned mutex should crash")] + longest_mu + .lock() + .expect("mutex poisoned") + .iter_mut() + .zip(long) + .for_each(|(to, from)| { + if from > to.1 { + *to = (angle, from); + } + }); + }); + }); + }; - // TODO: Pack longest lines - // self.longest_lines = longest; + self.total_surfaces = surfaces; + let packed: Result> = longest + .iter() + .map(|&(angle, distance): &(u16, f32)| { + #[expect( + clippy::as_conversions, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + reason = "distances always fit in u32" + )] + LineOfSightPacked::new(distance as u32, angle) + }) + .collect(); + self.longest_lines = packed?; self.render_total_surfaces()?; self.render_longest_lines()?; diff --git a/crates/total-viewsheds/src/cpu.rs b/crates/total-viewsheds/src/cpu.rs deleted file mode 100644 index 6c42808..0000000 --- a/crates/total-viewsheds/src/cpu.rs +++ /dev/null @@ -1,987 +0,0 @@ -//! `cpu` is a CPU version of the total viewshed calculation - -use itertools::izip; -use rayon::iter::ParallelIterator as _; - -#[cfg(target_feature = "avx512f")] -use std::arch::x86_64::{ - __m512, _mm256_alignr_epi32, _mm256_mask_alignr_epi32, _mm512_alignr_epi32, - _mm512_castps_si512, _mm512_castsi512_ps, _mm512_cmple_ps_mask, _mm512_max_ps, -}; - -use rayon::prelude::IntoParallelIterator as _; -use rayon::ThreadPoolBuilder; -#[cfg(all( - target_feature = "avx2", - target_feature = "avx", - target_feature = "sse", - target_feature = "sse2" -))] -use std::arch::x86_64::{ - _mm256_blend_ps, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_cmp_ps, _mm256_max_ps, - _mm256_slli_si256, _CMP_LE_OS, -}; -use std::iter::zip; -use std::simd::prelude::*; -use std::simd::{LaneCount, Mask, SupportedLaneCount}; -use std::sync::Mutex; -use std::time::Instant; -use std::{array, f32, mem, slice}; - -#[cfg(all(target_feature = "sse", target_feature = "sse2"))] -use std::arch::x86_64::{_mm_castps_si128, _mm_cmpge_ps, _mm_max_ps}; - -/// `EARTH_RADIUS_SQUARED` is the earth's radius squared in meters -const EARTH_RADIUS_SQUARED: f32 = 12_742_000.0; - -/// `TAN_ONE_RAD` helps normalize the fact that inner points are sampled more often -/// see the TVS paper for reasoning. -const TAN_ONE_RAD: f32 = 0.017_453_3; - -/// `Vectorized` is an empty struct to allow for specializations of the total viewhshed algorithm -/// TODO: maybe we can just use a generic struct? -struct Vectorized; - -/// `Viewshed` holds all the platform and vector-width specific methods the CPU kernel -/// needs to operate. -trait Viewshed -where - LaneCount: SupportedLaneCount, -{ - /// `gte` takes in a vector of angles and its prefix maximum returns a mask of - /// i32s which are either -1 or 0 in each lane. This way it can be used to "select" - /// which lanes of the target vector to use for further calculations - fn gte(&self, angle: Simd, prefix: Simd) -> Mask; - - /// `max` returns the lane-wise maximum of both vectors. It exists to help platform-specific - /// and potentially "unsafe" (in floating point terms) and speedier implementations - fn max(&self, lhs: Simd, rhs: Simd) -> Simd; - - /// `prefix_max` calculates a prefix maximum given all of the `angles` and stores - /// it in `prefix_max` - fn prefix_max( - &self, - angles: &[Simd], - prefix_max: &mut [Simd], - acc: Simd, - ) -> Simd; -} - -impl Viewshed<4> for Vectorized { - #[inline] - #[cfg(all(target_feature = "sse", target_feature = "sse2"))] - fn gte(&self, angle: f32x4, prefix: f32x4) -> Mask { - // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input - // thus allowing this to be non IEEE754 compliant - unsafe { - let mask = _mm_castps_si128(_mm_cmpge_ps(angle.into(), prefix.into())); - Mask::::from_int_unchecked(mask.into()) - } - } - - #[inline] - #[cfg(not(all(target_feature = "sse", target_feature = "sse2")))] - fn gte(&self, lhs: f32x4, rhs: f32x4) -> Mask { - lhs.simd_ge(rhs) - } - - #[inline] - #[cfg(all(target_feature = "sse", target_feature = "sse2"))] - fn max(&self, lhs: f32x4, rhs: f32x4) -> Simd { - // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input - // thus allowing this to be non IEEE754 compliant - unsafe { _mm_max_ps(lhs.into(), rhs.into()).into() } - } - - #[inline] - #[cfg(not(all(target_feature = "sse", target_feature = "sse2")))] - fn max(&self, lhs: f32x4, rhs: f32x4) -> Simd { - lhs.simd_max(r) - } - - #[inline] - fn prefix_max(&self, angles: &[f32x4], prefix_max: &mut [f32x4], acc: f32x4) -> f32x4 { - for (prefix, &angle) in zip(prefix_max.iter_mut(), angles.iter()) { - let mut v_prefix_max = { - let shifted = angle.shift_elements_right::<1>(-2000.0f32); - self.max(angle, shifted) - }; - - v_prefix_max = { - let shifted = v_prefix_max.shift_elements_right::<2>(-2000.0f32); - self.max(v_prefix_max, shifted) - }; - - *prefix = v_prefix_max; - } - - let mut local_acc = acc; - - // accumulate the prefix maxes for blocks, re-computing all prefix maxes - // to include the accumulated value - for prefix in prefix_max { - let cur_prefix: f32x4 = *prefix; - let cur_max: f32x4 = Simd::splat(cur_prefix[3]); - - *prefix = self.max(local_acc, cur_prefix); - local_acc = self.max(local_acc, cur_max); - } - - local_acc - } -} - -#[cfg(all(target_feature = "avx2", target_feature = "avx"))] -impl Viewshed<8> for Vectorized { - #[inline] - fn gte(&self, angle: f32x8, prefix: f32x8) -> Mask { - // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input - // thus allowing this to be non IEEE754 compliant - unsafe { - let mask = - _mm256_castps_si256(_mm256_cmp_ps::<_CMP_LE_OS>(prefix.into(), angle.into())); - Mask::::from_int_unchecked(mask.into()) - } - } - - #[inline] - fn max(&self, lhs: f32x8, rhs: f32x8) -> Simd { - // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input - // thus allowing this to be non IEEE754 compliant - unsafe { _mm256_max_ps(lhs.into(), rhs.into()).into() } - } - - #[inline] - fn prefix_max(&self, angles: &[f32x8], prefix_max: &mut [f32x8], acc: f32x8) -> f32x8 { - // Calculate the 4-wide block prefix max two at a time - for (prefix, &angle) in zip(prefix_max.iter_mut(), angles.iter()) { - // safety: all mm256 operations are avx2, and Viewshed<8> has feature guards for both - let mut v_prefix_max = unsafe { - let shifted = _mm256_slli_si256::<4>(_mm256_castps_si256(angle.into())); - let blended = _mm256_blend_ps::<0b1000_1000>( - _mm256_castsi256_ps(shifted), - Simd::splat(-2000.0f32).into(), - ); - self.max(angle, blended.into()) - }; - - // safety: all mm256 operations are avx2, and Viewshed<8> has feature guards for both - v_prefix_max = unsafe { - let shifted = _mm256_slli_si256::<8>(_mm256_castps_si256(v_prefix_max.into())); - let blended = _mm256_blend_ps::<0b1100_1100>( - _mm256_castsi256_ps(shifted), - Simd::splat(-2000.0f32).into(), - ); - - self.max(v_prefix_max, blended.into()) - }; - - *prefix = v_prefix_max; - } - - let mut local_acc = f32x4::splat(acc[3]); - - // safety: because f32x8s are aligned to exactly sizeof(f32x4) * 2 - // this is well aligned, so the cast is valid - // - // This is SUPER MEGA UBER VERY sketchy, and shouldn't be copied - // unless you _really_, _truly_ understand what the compiler will do - let single_wide_prefx: &mut [f32x4] = unsafe { - let ptr = prefix_max.as_mut_ptr(); - slice::from_raw_parts_mut(ptr.cast::(), prefix_max.len() * 2) - }; - - // accumulate the prefix maxes for blocks, re-computing all prefix maxes - // to include the accumulated value - for prefix in single_wide_prefx { - let cur_prefix: f32x4 = *prefix; - let cur_max: f32x4 = Simd::splat(cur_prefix[3]); - - *prefix = self.max(local_acc, cur_prefix); - local_acc = self.max(local_acc, cur_max); - } - - f32x8::splat(local_acc[3]) - } -} - -#[cfg(target_feature = "avx512f")] -fn _mm512_slli_si512(elem: __m512) -> __m512 -where - [(); { (16 - K) as i32 } as usize]:, -{ - unsafe { - let zero = f32x16::splat(-2000.0f32); - _mm512_castsi512_ps(_mm512_alignr_epi32::<{ (16 - K) as i32 }>( - _mm512_castps_si512(elem), - _mm512_castps_si512(zero.into()), - )) - } -} - -#[cfg(target_feature = "avx512f")] -impl Viewshed<16> for Vectorized { - #[inline] - fn gte(&self, angle: f32x16, prefix: f32x16) -> Mask { - // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input - // thus allowing this to be non IEEE754 compliant - unsafe { - let mask = _mm512_cmple_ps_mask(prefix.into(), angle.into()); - Mask::::from_bitmask(mask.into()) - } - } - - #[inline] - fn max(&self, lhs: f32x16, rhs: f32x16) -> f32x16 { - // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input - // thus allowing this to be non IEEE754 compliant - unsafe { _mm512_max_ps(lhs.into(), rhs.into()).into() } - } - - #[inline] - fn prefix_max(&self, angles: &[f32x16], prefix_max: &mut [f32x16], acc: f32x16) -> f32x16 { - // Calculate the 4-wide block prefix max two at a time - for (prefix, &angle) in zip(prefix_max.iter_mut(), angles.iter()) { - unsafe { - let mut v_prefix_max = - _mm512_max_ps(angle.into(), _mm512_slli_si512::<1>(angle.into()).into()); - v_prefix_max = _mm512_max_ps( - v_prefix_max.into(), - _mm512_slli_si512::<2>(v_prefix_max).into(), - ); - v_prefix_max = _mm512_max_ps( - v_prefix_max.into(), - _mm512_slli_si512::<4>(v_prefix_max).into(), - ); - v_prefix_max = _mm512_max_ps( - v_prefix_max.into(), - _mm512_slli_si512::<8>(v_prefix_max).into(), - ); - *prefix = v_prefix_max.into(); - } - } - - let mut local_acc = f32x16::splat(acc[0]); - - // accumulate the prefix maxes for blocks, re-computing all prefix maxes - // to include the accumulated value - for prefix in prefix_max { - let cur_prefix: f32x16 = *prefix; - let cur_max: f32x16 = Simd::splat(cur_prefix[15]); - - *prefix = self.max(local_acc, cur_prefix); - local_acc = self.max(local_acc, cur_max); - } - - f32x16::splat(local_acc[0]) - } -} - -#[inline] -/// `load_elevations` converts an array of i16 elevations into a height adjusted -fn load_elevations(elev_arr: [i16; N], pov_height: f32) -> Simd -where - LaneCount: SupportedLaneCount, -{ - let elevs = Simd::::from_array(elev_arr); - let float_elevs: Simd = elevs.cast(); - float_elevs - Simd::splat(pov_height) -} - -/// `IndexSIMD` holds the `dem_ids`/"indexes" of the current line of sight, -/// along with where they will be written out to -struct IndexSIMD<'idx, const N: usize> { - /// `indexes_in` holds a slice of a SIMD-size-wide array of `dem_ids`/"indexes" - indexes_in: &'idx [[i32; N]], - /// `indexes_out` is the buffer of a SIMD-size-wide where the visible `dem_ids`/"indexes" will be written out to - indexes_out: &'idx mut [[i32; N]], -} - -#[inline] -/// `line_of_sight` calculates a single line of sight for a given pov, which is passed in via `pov_height` -fn line_of_sight( - vs: &VS, - elevations: &[[i16; N]], - distances: &[Simd], - adjustments: &[Simd], - prefix_in: Simd, - indexes: Option>, - pov_height: f32, -) -> ([Simd; UNROLL], [Simd; UNROLL], Simd) -where - LaneCount: SupportedLaneCount, - VS: Viewshed, -{ - let mut sum_buf: [Simd; UNROLL] = [Simd::splat(0.0); UNROLL]; - let mut angle_buf: [Simd; UNROLL] = [Simd::splat(0.0); UNROLL]; - let mut longest_line_buf: [Simd; UNROLL] = [Simd::splat(0.0); UNROLL]; - let mut prefix_buf: [Simd; UNROLL] = [Simd::splat(0.0); UNROLL]; - - izip!( - angle_buf.iter_mut(), - elevations - .iter() - .map(|elev| load_elevations(*elev, pov_height)), - distances, - adjustments - ) - .for_each(|(angle, elev, dist, adjust)| { - *angle = elev / dist - adjust; - }); - - #[expect( - clippy::float_cmp, - reason = "-2000.0f32 is a sentinel value for the first time this accumlative function is run" - )] - if prefix_in[0] == -2000.0f32 { - angle_buf[0][0] = -2000.1f32; - } - - let prefix_out = vs.prefix_max(&angle_buf, &mut prefix_buf, prefix_in); - - let index_iter = indexes.map(|index_simd| { - index_simd - .indexes_in - .iter() - .map(|ind| Simd::::from_array(*ind)) - .zip(index_simd.indexes_out.iter_mut()) - }); - - izip!( - &mut sum_buf, - &mut longest_line_buf, - angle_buf.iter(), - prefix_buf.iter(), - distances.iter(), - OptionIter::new(index_iter) - ) - .for_each(|(next_sum, longest_line, &angle, &pref, &dists, inds)| { - let mask = vs.gte(angle, pref); - - if let Some((inds_in, inds_out)) = inds { - inds_in.store_select(inds_out, mask); - } - - let selected_distances = mask.select(dists, Simd::splat(0.0)); - *longest_line = vs.max(*longest_line, selected_distances); - - let selected_tans = mask.select(Simd::splat(TAN_ONE_RAD), Simd::splat(0.0)); - *next_sum = selected_distances * selected_tans; - }); - - (sum_buf, longest_line_buf, prefix_out) -} - -/// `dem_to_pov` -#[expect( - clippy::as_conversions, - clippy::cast_possible_truncation, - clippy::cast_possible_wrap, - reason = "so long as max_los < 2^24, the following as conversions are entirely safe" -)] -#[expect( - clippy::integer_division, - reason = "i32 is constructed from (i32, i32) converting back should succeed" -)] -const fn dem_to_pov(dem_id: i32, width: usize, max_los: usize) -> i32 { - let dem_x = (dem_id / width as i32) - max_los as i32; - let dem_y = (dem_id % width as i32) - max_los as i32; - - dem_x * (max_los as i32) + dem_y -} - -/// `Indexes` holds the `dem_id`/"indexes" and the output buffer to store them in -struct Indexes<'index> { - /// `indexes_in` holds the full line of (2 or 3*`max_los`) `dem_ids`/"indexes" - indexes_in: &'index [i32], - /// `indexes_out` holds `max_los` indexes of `max_los` length. Zeroes are used if - indexes_out: &'index mut [i32], -} - -/// `OptionIter` holds the state for an optional inner iterator. -/// If passed `None`, it will repeat `None` forever. This comes in handy -/// when working with `izip!` -struct OptionIter -where - Iter: Iterator, -{ - /// `iter` holds an optional iterator state which will - /// call `next()` - iter: Option, -} - -impl OptionIter -where - Iter: Iterator, -{ - /// `new` creates a new iter from an Option of the Iter - const fn new(iter: Option) -> Self { - Self { iter } - } -} - -impl Iterator for OptionIter -where - Iter: Iterator, -{ - type Item = Option; - - #[inline] - fn next(&mut self) -> Option { - if let Some(ref mut iter) = &mut self.iter { - iter.next().map(Some) - } else { - Some(None) - } - } -} - -/// `UnrolledAngles` holds the curved earth adjustments -struct UnrolledAngles<'angle, const WIDTH: usize, const UNROLL: usize> -where - LaneCount: SupportedLaneCount, -{ - /// `adjustments` holds a slice of UNROLL sized slices used during loop unrolling, and then the "rest" portion - adjustments: ( - &'angle [[Simd; UNROLL]], - &'angle [Simd], - ), - /// `distances` holds a slice of UNROLL sized slices used during loop unrolling, and then the "rest" portion - distances: ( - &'angle [[Simd; UNROLL]], - &'angle [Simd], - ), -} - -/// `viewshed` computes the viewshed for a single pov, using its `elevation`, and `max_los` -/// and stores the results in `heatmap` and `longest_line` using the `dem_id` -#[inline] -#[expect( - clippy::too_many_arguments, - clippy::too_many_lines, - reason = "it is what it is for now" -)] -fn viewshed( - vs: &VS, - pov_idx: usize, - elevation: i16, - dem_id: i32, - max_los: usize, - heatmap: &mut [f32], - longest_line: &mut [f32], - line: &[i16], - index_data: Option, - unrolled_angles: &UnrolledAngles, -) where - LaneCount: SupportedLaneCount, - VS: Viewshed, -{ - let result_tvs_id = dem_to_pov(dem_id, 3 * max_los, max_los); - - // if the line of sight is not within our computable points, do not consider it - #[expect( - clippy::as_conversions, - clippy::cast_possible_wrap, - clippy::cast_possible_truncation, - reason = "max_los^2 < 2^31" - )] - if result_tvs_id < 0i32 || result_tvs_id >= (max_los * max_los) as i32 { - return; - } - - let pov_height = f32::from(elevation); - - // safety: max_los % WIDTH == 0, so [pov_idx..pov_idx+max_los) will also be WIDTH wide - let (elevations, _): (&[[i16; WIDTH]], _) = - unsafe { line.get_unchecked(pov_idx..pov_idx + max_los) }.as_chunks::(); - - let (iter, rest) = index_data.map_or_else( - || (None, None), - |data| { - // safety: pov_idx should be between [0, max_los) and len(indexes_in)==2*max_los - // thus, for any pov_idx pov_idx..pov_idx+max_los is inbounds - let (indexes, _): (&[[i32; WIDTH]], _) = - unsafe { data.indexes_in.get_unchecked(pov_idx..pov_idx + max_los) } - .as_chunks::(); - - let (indexes_out, _) = data.indexes_out.as_chunks_mut::(); - - let (chunked_indexes, rest_indexes) = indexes.as_chunks::(); - let (chunked_indexes_out, rest_indexes_out) = indexes_out.as_chunks_mut::(); - - ( - Some( - zip(chunked_indexes.iter(), chunked_indexes_out.iter_mut()).map( - |(inds_in, inds_out)| IndexSIMD { - indexes_in: inds_in, - indexes_out: inds_out, - }, - ), - ), - Some(IndexSIMD { - indexes_in: rest_indexes, - indexes_out: rest_indexes_out, - }), - ) - }, - ); - - let (chunked_elevs, rest_elevs) = elevations.as_chunks::(); - - let (chunked_distances, rest_distances) = unrolled_angles.distances; - let (chunked_adjustments, rest_adjustments) = unrolled_angles.adjustments; - - let (local_sums, local_longest, prefix) = izip!( - chunked_elevs, - chunked_distances, - chunked_adjustments, - OptionIter::new(iter), - ) - .fold( - ( - [Simd::splat(0.0); UNROLL], - [Simd::splat(0.0); UNROLL], - Simd::splat(-2000.0), - ), - |(sum, longest, prefix), (elevs, dists, adjusts, inds)| { - let (next_sum, next_longest, acc) = line_of_sight::( - vs, // elevs: &[[i16; N]], - elevs, // distances: &[Simd], - dists, // adjustments: &[Simd], - adjusts, // prefix_in: Simd, - prefix, inds, pov_height, // pov_height: f32, - ); - - let mut copied_sum = sum; - zip(copied_sum.iter_mut(), next_sum).for_each(|(old, new)| { - *old += new; - }); - - let mut copied_longest_line = longest; - zip(copied_longest_line.iter_mut(), next_longest).for_each(|(old, new)| { - *old = old.simd_max(new); - }); - - (copied_sum, copied_longest_line, acc) - }, - ); - - let mut sum = local_sums - .iter() - .fold(0.0f32, |acc, partial| acc + partial.reduce_sum()); - - let mut longest = local_longest - .iter() - .fold(0.0f32, |acc, new| acc.max(new.reduce_max())); - - let (sum_buf, longest_buf, _) = line_of_sight::( - vs, - rest_elevs, - rest_distances, - rest_adjustments, - prefix, - rest, - pov_height, - ); - - sum += sum_buf - .iter() - .fold(0.0f32, |acc, partial| acc + partial.reduce_sum()); - - longest = longest_buf - .iter() - .fold(longest, |acc, new| acc.max(new.reduce_max())); - - #[expect( - clippy::as_conversions, - clippy::cast_sign_loss, - reason = "result_idx should be in [0, 2^31]" - )] - // safety: it is guaranteed by the rotation kernel that if the index is - // greater than zero that it is in-bounds. This saves ~10% of bounds checks - unsafe { - *heatmap.get_unchecked_mut(result_tvs_id as usize) += sum; - - let old_longest: *mut f32 = longest_line.get_unchecked_mut(result_tvs_id as usize); - *old_longest = (*old_longest).max(longest); - } -} - -/// `precalculate_distances` precalculates earth curvature adjustments and -/// the distance from a particular point (which is just linear) -fn precalculate_distances( - max_los: usize, -) -> (Vec>, Vec>) -where - LaneCount: SupportedLaneCount, -{ - (0..max_los) - .step_by(WIDTH) - .map(|offset| { - #[expect( - clippy::as_conversions, - clippy::cast_possible_wrap, - clippy::cast_possible_truncation, - reason = "WIDTH < 2^31" - )] - let distance_arr: [i32; WIDTH] = array::from_fn(|i| i as i32); - let distances = Simd::from_array(distance_arr); - - #[expect( - clippy::as_conversions, - clippy::cast_possible_wrap, - clippy::cast_possible_truncation, - reason = "WIDTH < 2^31" - )] - let normalized = (distances + Simd::splat(offset as i32)) * Simd::splat(100i32); - - let floats: Simd = normalized.cast(); - - (floats, floats / Simd::splat(EARTH_RADIUS_SQUARED)) - }) - .unzip() -} - -/// `total_viewshed` computes a total viewshed heatmap for a given elevation map, -/// and corresponding indexes to store the rotated data -#[inline] -fn total_viewshed>( - vs: &V, - elevation_map: &[i16], - indexes: &[i32], - max_los: usize, - output_sector_data: bool, -) -> ViewshedAngle -where - LaneCount: SupportedLaneCount, -{ - assert_eq!( - elevation_map.len(), - 2 * max_los * max_los, - "elevations should be 2 * max_los wide, and max_los tall" - ); - - assert_eq!( - indexes.len(), - 2 * max_los * max_los, - "indexes should be 2 * max_los wide, and max_los tall" - ); - - assert_eq!( - max_los % WIDTH, - 0, - "to help the vectorizer, max_los must be a multiple of {WIDTH}" - ); - - let mut sector_data_buf = vec![ - 0i32; - if output_sector_data { - max_los * max_los * max_los - } else { - 0 - } - ]; - let mut heatmap = vec![0.0f32; max_los * max_los]; - let mut longest_line = vec![0.0f32; max_los * max_los]; - let mut sector_data: Option<&mut Vec> = output_sector_data.then_some(&mut sector_data_buf); - - let width = 2 * max_los; - - // precalculate all distances and their spherical earth "adjustments". - // This saves ~33% of effort inside our hot loop - let (distances, adjustments) = precalculate_distances::(max_los); - - let unrolled_angle = UnrolledAngles { - distances: distances.as_chunks::(), - adjustments: adjustments.as_chunks::(), - }; - - for (line, line_indexes, sector_chunk) in izip!( - elevation_map.chunks_exact(width), - indexes.chunks_exact(width), - OptionIter::new( - sector_data - .as_mut() - .map(|sd| sd.chunks_exact_mut(max_los * max_los)) - ), - ) { - for (pov, (&pov_height, &result_dem_id, line_bitmap)) in izip!( - line.iter().take(max_los), - line_indexes.iter().take(max_los), - OptionIter::new(sector_chunk.map(|chunk| chunk.chunks_exact_mut(max_los))) - ) - .enumerate() - { - viewshed( - vs, - pov, - pov_height, - result_dem_id, - max_los, - &mut heatmap, - &mut longest_line, - line, - line_bitmap.map(|bitmap| Indexes { - indexes_in: line_indexes, - indexes_out: bitmap, - }), - &unrolled_angle, - ); - } - } - - ViewshedAngle { - heatmap, - longest_line, - sector_data: sector_data.cloned(), - } -} - -/// `generate_rotation` generates a rotation "map" for a given elevation list -/// Adapted from [this stack overflow answer](https://stackoverflow.com/a/71901621) -#[expect( - clippy::as_conversions, - clippy::cast_possible_truncation, - clippy::cast_possible_wrap, - clippy::cast_precision_loss, - reason = "so long as max_los^2 < 2^24, the following `as` conversions are entirely safe" -)] -fn generate_rotation(elevs: &[i16], angle: f64, max_los: usize) -> (Vec, Vec) { - let width = (max_los * 3) as isize; - - #[expect(clippy::integer_division, reason = "we don't need precision here")] - { - assert_eq!( - elevs.len() as isize % width, - 0, - "Elevations array must be square {}%{width} != 0", - elevs.len(), - ); - let elevations_div_width = elevs.len() as isize / width; - assert_eq!( - elevations_div_width, - width, - "Elevations array must be square {}/{width} (={elevations_div_width}) != {width}", - elevs.len() as isize - ); - }; - - let (sin, cos) = (f64::sin(angle.to_radians()), f64::cos(angle.to_radians())); - - #[expect(clippy::integer_division, reason = "we don't need precision here")] - let (x_center, y_center) = (width / 2, width / 2); - - let mut rotation: Vec = Vec::with_capacity(2 * max_los * max_los); - - for x in (max_los as isize)..(max_los as isize) * 2 { - let x_sin = (x - x_center) as f64 * sin; - let x_cos = (x - x_center) as f64 * cos; - for y in (max_los as isize)..width { - let y_sin = (y - y_center) as f64 * sin; - let y_cos = (y - y_center) as f64 * cos; - - let x_rot = (x_cos - y_sin).round() as isize + y_center; - let y_rot = (y_cos + x_sin).round() as isize + x_center; - - let new_idx = x_rot.clamp(0, width - 1) * width + y_rot.clamp(0, width - 1); - - rotation.push(new_idx as i32); - } - } - - debug_assert_eq!( - rotation.len() as isize, - max_los as isize * (2 * max_los as isize), - "the rotation should be 2 * max_los wide, max_los tall" - ); - - // map the indexes to their elevations - let elevations = rotation - .iter() - .map(|&idx| { - if idx < 0i32 { - i16::MIN - } else { - #[expect( - clippy::as_conversions, - reason = "elevations start out as i16s, and i16 -> f32 -> i16 is lossless" - )] - #[expect(clippy::cast_sign_loss, reason = "idx < 2^31, idx >= 0")] - // safety: idx is clamped so a get will always be in-bounds - *unsafe { elevs.get_unchecked(idx as usize) } - } - }) - .collect::>(); - - (rotation, elevations) -} - -/// `ViewshedAngle` holds the cumulative result of a single angle in the `total_viewshed` algorithm -#[derive(Debug)] -struct ViewshedAngle { - /// `heatmap` contains the longest line of sight heatmap for rendering - heatmap: Vec, - /// `longest_line` contains the longest distance for a particular point - longest_line: Vec, - /// `sector_data` holds the visibility calculations for each point in row-major order. - /// elements `[0..max_los)` are for the first point, `[max_los, 2*max_los)` for the second - /// point, and so on. - /// - /// This gets absolutely massive, so we only allocate this if we know for certain we will be using it - sector_data: Option>, -} - -impl ViewshedAngle { - /// `new` creates a new buffer for viewshed results of `heatmap`, - /// `longest_line`, and `sector_data` - fn new(max_los: usize, sector_data: bool) -> Self { - Self { - heatmap: vec![0.0f32; max_los * max_los], - longest_line: vec![0.0f32; max_los * max_los], - sector_data: sector_data.then(|| Vec::with_capacity(max_los * max_los * max_los)), - } - } - - /// `acc` accumulates a single `ViewshedAngle` into another - fn acc(&mut self, other: &Self) { - zip(&mut self.heatmap, &other.heatmap).for_each(|(to, from)| { - *to += *from; - }); - - zip(&mut self.longest_line, &other.longest_line).for_each(|(to, from)| { - *to = (*to).max(*from); - }); - - if let Some(ref mut sector_data) = &mut self.sector_data { - if let Some(other_sector_data) = &other.sector_data { - sector_data.extend_from_slice(other_sector_data); - } - } - } -} - -/// `kernel` is a CPU-based total viewshed kernel. It makes use of image rotation tof -/// optimize the cache locality of all lookups for a total viewshed calculation -fn kernel(elevations: &[i16], max_los_points: usize, angle: usize) -> ViewshedAngle { - assert!(angle < 360, "angle must be [0, 360)"); - let mut start = Instant::now(); - - #[expect( - clippy::as_conversions, - clippy::cast_precision_loss, - reason = "angle is [0,360), not more than 2^54" - )] - let (indexes, rotated_elevations) = generate_rotation(elevations, angle as f64, max_los_points); - - tracing::info!( - "rotated {:?} in {:?}, calculating viewshed", - angle, - start.elapsed() - ); - - start = Instant::now(); - - let vectorized = Vectorized {}; - - #[cfg(target_feature = "avx512f")] - { - let result = total_viewshed::<16, 8, Vectorized>( - &vectorized, - &rotated_elevations, - &indexes, - max_los_points, - false, - ); - tracing::info!("kernel for {} run in: {:?}", angle, start.elapsed()); - return result; - }; - - #[cfg(all(target_feature = "avx2", target_feature = "avx"))] - { - let result = total_viewshed::<8, 8, Vectorized>( - &vectorized, - &rotated_elevations, - &indexes, - max_los_points, - false, - ); - tracing::info!("kernel for {} run in: {:?}", angle, start.elapsed()); - return result; - }; - - #[expect( - unreachable_code, - unused_variables, - reason = "conditionally compiled out" - )] - let result = total_viewshed::<4, 8, Vectorized>( - &vectorized, - &rotated_elevations, - &indexes, - max_los_points, - false, - ); - tracing::info!("kernel for {} run in: {:?}", angle, start.elapsed()); - result -} - -/// `multithreaded_kernel` parallelizes CPU kernel calculations for a `core_count` and calculates -/// `num_angles` different angles -pub fn multithreaded_kernel( - elevations_original: &[i16], - max_los_points_original: usize, - num_angles: usize, - core_count: usize, - output_sector_data: bool, -) -> (Vec, Vec, Option>) { - let max_los_points = max_los_points_original.div_ceil(4) * 4; - let dem_width = max_los_points * 3; - let mut elevations_vec = elevations_original.to_vec(); - elevations_vec.resize(dem_width.pow(2), 0); - let elevations = &elevations_vec; - - if max_los_points != max_los_points_original { - tracing::warn!("LoS: {max_los_points_original} to {max_los_points}"); - } - if elevations.len() != elevations_original.len() { - tracing::warn!( - "Elevations array length resized: {} to {}", - elevations_original.len(), - elevations_vec.len() - ); - } - - #[expect( - clippy::expect_used, - reason = "threadpool must be created for program to run" - )] - let pool = ThreadPoolBuilder::new() - .num_threads(core_count) - .build() - .expect("couldn't build threadpool"); - - let mut final_angle = ViewshedAngle::new(max_los_points, output_sector_data); - let angle_mu = &Mutex::new(&mut final_angle); - - pool.install(move || { - (0..num_angles) - .into_par_iter() - .map(|angle| kernel(elevations, max_los_points, angle)) - .for_each(|vs| { - #[expect(clippy::expect_used, reason = "a poisoned mutex should crash")] - let mut angle_guard = angle_mu.lock().expect("mutex poisoned"); - - angle_guard.acc(&vs); - }); - }); - - let (heatmap, longest_line, sector) = ( - mem::take(&mut final_angle.heatmap), - mem::take(&mut final_angle.longest_line), - final_angle - .sector_data - .map(|mut sector_data| mem::take(&mut sector_data)), - ); - - (heatmap, longest_line, sector) -} diff --git a/crates/total-viewsheds/src/cpu/kernel.rs b/crates/total-viewsheds/src/cpu/kernel.rs new file mode 100644 index 0000000..97efe20 --- /dev/null +++ b/crates/total-viewsheds/src/cpu/kernel.rs @@ -0,0 +1,221 @@ +use crate::cpu::los::{LineOfSight as _, UnrolledLOS}; +use crate::cpu::vector::{VectorLos, DEFAULT_VECTOR_LENGTH}; +use itertools::izip; + +/// `fill_in_elevations` will fill in "blank" elevations from NASA data with the last seen elevation +/// in the line of sight +fn fill_in_elevations(elevs: &[i16], max_los: usize) -> Vec { + elevs + .chunks_exact(2 * max_los) + .flat_map(|line| { + line.iter() + .scan(0, |last_seen, &elevation| match elevation { + i16::MIN => Some(*last_seen), + _ => { + *last_seen = elevation; + Some(elevation) + } + }) + }) + .collect::>() +} + +/// `generate_rotation` generates a rotation "map" for a given elevation list +/// Adapted from [this stack overflow answer](https://stackoverflow.com/a/71901621) +#[expect( + clippy::as_conversions, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_precision_loss, + reason = "so long as max_los^2 < 2^24, the following `as` conversions are entirely safe" +)] +fn generate_rotation(elevs: &[i16], angle: f64, max_los: usize) -> (Vec, Vec) { + let width = (max_los * 3) as isize; + + #[expect(clippy::integer_division, reason = "we don't need precision here")] + { + assert_eq!( + elevs.len() as isize % width, + 0, + "Elevations array must be square {}%{width} != 0", + elevs.len(), + ); + let elevations_div_width = elevs.len() as isize / width; + assert_eq!( + elevations_div_width, + width, + "Elevations array must be square {}/{width} (={elevations_div_width}) != {width}", + elevs.len() as isize + ); + }; + + let (sin, cos) = (f64::sin(angle.to_radians()), f64::cos(angle.to_radians())); + + #[expect(clippy::integer_division, reason = "we don't need precision here")] + let (x_center, y_center) = (width / 2, width / 2); + + let mut rotation: Vec = Vec::with_capacity(2 * max_los * max_los); + + for x in (max_los as isize)..(max_los as isize) * 2 { + let x_sin = (x - x_center) as f64 * sin; + let x_cos = (x - x_center) as f64 * cos; + for y in (max_los as isize)..width { + let y_sin = (y - y_center) as f64 * sin; + let y_cos = (y - y_center) as f64 * cos; + + let x_rot = (x_cos - y_sin).round() as isize + y_center; + let y_rot = (y_cos + x_sin).round() as isize + x_center; + + let new_idx = x_rot.clamp(0, width - 1) * width + y_rot.clamp(0, width - 1); + + rotation.push(new_idx as i32); + } + } + + debug_assert_eq!( + rotation.len() as isize, + max_los as isize * (2 * max_los as isize), + "the rotation should be 2 * max_los wide, max_los tall" + ); + + // map the indexes to their elevations + let elevations = rotation + .iter() + .map(|&idx| { + if idx < 0i32 { + i16::MIN + } else { + #[expect( + clippy::as_conversions, + reason = "elevations start out as i16s, and i16 -> f32 -> i16 is lossless" + )] + #[expect(clippy::cast_sign_loss, reason = "idx < 2^31, idx >= 0")] + // safety: idx is clamped so a get will always be in-bounds + *unsafe { elevs.get_unchecked(idx as usize) } + } + }) + .collect::>(); + + (rotation, fill_in_elevations(&elevations, max_los)) +} + +#[expect( + clippy::as_conversions, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + reason = "so long as max_los < 2^24, the following as conversions are entirely safe" +)] +#[expect( + clippy::integer_division, + reason = "i32 is constructed from (i32, i32) converting back should succeed" +)] +/// `dem_to_pov` turns the `dem_id` to the `pov_id` so that the result can be stored in a heatmap +const fn dem_to_pov(dem_id: i32, width: usize, max_los: usize) -> i32 { + let dem_x = (dem_id / width as i32) - max_los as i32; + let dem_y = (dem_id % width as i32) - max_los as i32; + + let radius = max_los as i32 / 2i32; + let circ_x = dem_x - radius; + let circ_y = dem_y - radius; + + let dist = (circ_x.pow(2) + circ_y.pow(2)).isqrt(); + if dist < radius { + dem_x * (max_los as i32) + dem_y + } else { + -1 + } +} + +/// `kernel` will calculate the longest line of sight heatmap for a given angle and elevation map +/// assuming that the maximum line of sight is `max_los` +#[expect( + clippy::inline_always, + reason = "I am become Death, destroyer of compilers" +)] // the real reason is that I need output_sector_data to be constant propagated +#[inline(always)] +pub fn kernel( + elevation_map: &[i16], + max_los: usize, + angle: f32, + output_sector_data: bool, +) -> (Vec, Vec, Vec>) { + let mut heatmap = vec![0.0f32; max_los * max_los]; + let mut longest = vec![0.0f32; max_los * max_los]; + + let mut sector_data: Vec> = vec![ + vec![]; + if output_sector_data { + max_los * max_los + } else { + 0 + } + ]; + + let (indexes, rotated_elevations) = generate_rotation(elevation_map, f64::from(angle), max_los); + + assert_eq!( + rotated_elevations.len(), + 2 * max_los * max_los, + "elevations should be 2 * max_los wide, and max_los tall" + ); + + let width = 2 * max_los; + + let mut vs = UnrolledLOS::<64>::new(max_los); + for (line, line_indexes) in izip!( + rotated_elevations.chunks_exact(width), + indexes.chunks_exact(width), + ) { + for (pov, (&pov_height, &result_dem_id)) in + izip!(line.iter().take(max_los), line_indexes.iter().take(max_los)).enumerate() + { + let result_tvs_id = dem_to_pov(result_dem_id, 3 * max_los, max_los); + + // if the line of sight is not within our computable points, do not consider it + #[expect( + clippy::as_conversions, + clippy::cast_possible_wrap, + clippy::cast_possible_truncation, + reason = "max_los^2 < 2^31" + )] + if result_tvs_id < 0i32 || result_tvs_id >= (max_los * max_los) as i32 { + continue; + } + + let neighbor = pov + 1; + + #[expect( + clippy::indexing_slicing, + reason = "if slicing is out of bounds, it should panic" + )] + let (pixel, long, sector) = vs.line_of_sight::>( + f32::from(pov_height), + &line[neighbor..neighbor + max_los], + output_sector_data, + ); + + #[expect( + clippy::as_conversions, + clippy::cast_sign_loss, + clippy::indexing_slicing, + reason = "max_los^2 < 2^31" + )] + { + // safety: result_tvs_id is guaranteed to be within [0..max_los^2) + unsafe { + *heatmap.get_unchecked_mut(result_tvs_id as usize) = pixel; + }; + // safety: result_tvs_id is guaranteed to be within [0..max_los^2) + unsafe { + *longest.get_unchecked_mut(result_tvs_id as usize) = long; + }; + + if output_sector_data { + sector_data[result_tvs_id as usize] = sector; + } + } + } + } + + (heatmap, longest, sector_data) +} diff --git a/crates/total-viewsheds/src/cpu/los.rs b/crates/total-viewsheds/src/cpu/los.rs new file mode 100644 index 0000000..b882748 --- /dev/null +++ b/crates/total-viewsheds/src/cpu/los.rs @@ -0,0 +1,282 @@ +use itertools::izip; + +/// `LineOfSight` abstracts the implementation of line of sight calculations to +/// any "carry through" that can be materialized into a (f32, f32). +pub trait LineOfSight> { + /// `line_of_sight` calculates a line of sight for the given `pov_height` + /// and outputs a triple of the surface area, longest line of sight in meters + /// and a vector of bools of which + fn line_of_sight( + &mut self, + pov_height: f32, + line: &[i16], + output_sector: bool, + ) -> (f32, f32, Vec) + where + LOS: Angle + PrefixMax + Accumulate; +} + +/// `Angle` abstracts the angle calculation between a pov and all the elevation data within +/// its band of sight +pub trait Angle { + /// `calculate_angles` calculates the angle from the `pov_height` to a given elevation + fn calculate_angles( + pov_height: f32, + elevations: &[i16], + distances: &[f32], + adjustments: &[f32], + angles_out: &mut [f32], + ); +} + +/// `Accumulate` accumulates the surface area visible and longest line of sight +/// in a pair of (f32, f32). `Accumulate` doesn't care about the implementation details +/// of accumulation so long as the `Output` type can be materialized to (f32, f32) +pub trait Accumulate> { + /// `accumulate` accumulates the surface area using the distances by comparing + /// whether a point at a distance is visible (angle > prefix) + /// If `output_sector` is true, it should output a bitmap of which distances are visible + /// at their respective locations + fn accumulate( + init: Output, + angles: &[f32], + prefix: &[f32], + distances: &[f32], + bitmap: &mut Vec, + output_sector: bool, + ) -> Output; +} + +/// `PrefixMax` calculates the prefix maximum of the given angles +pub trait PrefixMax { + /// `prefix_max` calculates the prefix max of the + fn prefix_max(highest: f32, angles_in: &[f32], angles_out: &mut [f32]); +} + +/// `EARTH_RADIUS_SQUARED` is the radius of the earth in meters +const EARTH_RADIUS_SQUARED: f32 = 12_742_000.0; + +/// `generate_distances` generates the distance from +#[expect( + clippy::as_conversions, + clippy::cast_precision_loss, + reason = "max_los is < 2^24" +)] +fn generate_distances(max_los: usize) -> (Vec, Vec) { + (1..=max_los) + .map(|step| { + ( + (step * 100) as f32, + (step * 100) as f32 / EARTH_RADIUS_SQUARED, + ) + }) + .unzip() +} + +/// `StraightLine` contains the distances and round-earth adjustments needed to compute +/// the longest line of sight. +pub struct StraightLine { + /// `max_los` is the longest possible line of sight in 100ms + max_los: usize, + /// `angles` is a buffer of size `max_los` to store all angle calculations in + angles: Vec, + /// `prefix_max` is a buffer of size `max_los` to store the inclusive prefix max calculation in + prefix_max: Vec, + /// `distances` holds the distances in meters for every step between 100m and + distances: Vec, + /// `adjustments` holds earth curvature adjustments in meters for every step between 100m and + adjustments: Vec, +} + +impl StraightLine { + /// new constructs a new `StraightLine` given the maximum line of sight in `max_los` + #[expect(unused, reason = "this is generally only for testing/benchmarking")] + pub fn new(max_los: usize) -> Self { + let (distances, adjustments) = generate_distances(max_los); + Self { + max_los, + angles: vec![0.0f32; max_los + 1], + prefix_max: vec![0.0f32; max_los], + distances, + adjustments, + } + } +} + +impl LineOfSight<(f32, f32)> for StraightLine { + #[expect( + clippy::indexing_slicing, + reason = "all indexing and slices are guaranteed by construction of a StraightLine" + )] + fn line_of_sight( + &mut self, + pov_height: f32, + line: &[i16], + output_sector: bool, + ) -> (f32, f32, Vec) + where + LOS: PrefixMax + Angle + Accumulate<(f32, f32)>, + { + let mut output: Vec = vec![]; + + self.angles[0] = -2000.0; + + LOS::calculate_angles( + pov_height, + line, + &self.distances, + &self.adjustments, + &mut self.angles[1..], + ); + + LOS::prefix_max( + -2000.0f32, + &self.angles[..self.max_los], + &mut self.prefix_max, + ); + + let (heatmap, longest) = LOS::accumulate( + (0.0, 0.0), + &self.angles[1..], + &self.prefix_max, + &self.distances, + &mut output, + output_sector, + ); + + (heatmap, longest, output) + } +} + +/// Unroll holds an unrolled heatmap and unrolled longest line of sight calculation +/// Since in Line of Sight-land max/addition are commutative, then Unroll will be materialized +/// into (f32, f32) +pub struct Unroll { + /// `heatmap` contains the summation of visible surface areas which will be reduced to a single + /// surface area at the end + pub heatmap: [f32; UNROLL], + /// `longest` contains many long lines of sight which will be reduced to a single + /// line of sight at the end + pub longest: [f32; UNROLL], +} + +/// `UnrolledLOS` implements an Unrolled `LineOfSight` calculation +pub struct UnrolledLOS +where + [(); UNROLL + 1]:, +{ + /// `distances` holds `max_los` distances + distances: Vec, + /// `adjustments` holds `max_los` earth curvature adjustments + adjustments: Vec, +} + +impl From> for (f32, f32) { + default fn from(val: Unroll) -> Self { + let heatmap = val.heatmap.iter().sum(); + let longest = val.longest.iter().fold(0.0f32, |acc, &elem| acc.max(elem)); + (heatmap, longest) + } +} + +impl UnrolledLOS +where + [(); UNROLL + 1]:, +{ + /// `new` initializes a new `UnrolledLOS`, and precalculates all the distances + /// and earth curvature adjustments + pub fn new(max_los: usize) -> Self { + let (distances, adjustments) = generate_distances(max_los); + + Self { + distances, + adjustments, + } + } +} + +impl LineOfSight> for UnrolledLOS +where + [(); UNROLL + 1]:, +{ + #[inline] + #[expect( + clippy::indexing_slicing, + reason = "all indexing and slices are guaranteed by construction of a UnrolledLOS" + )] + fn line_of_sight( + &mut self, + pov_height: f32, + line: &[i16], + output_sector: bool, + ) -> (f32, f32, Vec) + where + LOS: Angle + PrefixMax + Accumulate>, + { + let mut angles = [0.0f32; UNROLL + 1]; + let mut prefix_max = [0.0f32; UNROLL]; + + prefix_max[UNROLL - 1] = -2000.0; + angles[0] = -2000.0; + + let mut output: Vec = vec![]; + + let (chunked_line, rest_line) = line.as_chunks::<{ UNROLL }>(); + + let (chunked_distances, rest_distances) = self.distances.as_chunks::<{ UNROLL }>(); + + let (chunked_adjustments, rest_adjustments) = self.adjustments.as_chunks::<{ UNROLL }>(); + + let los = izip!(chunked_line, chunked_distances, chunked_adjustments).fold( + Unroll:: { + longest: [0.0; UNROLL], + heatmap: [0.0; UNROLL], + }, + |acc, (unroll_line, distances, adjusts)| { + LOS::calculate_angles( + pov_height, + unroll_line, + distances, + adjusts, + &mut angles[1..], + ); + + LOS::prefix_max(prefix_max[UNROLL - 1], &angles[..UNROLL], &mut prefix_max); + + let new_acc = LOS::accumulate( + acc, + &angles[1..], + &prefix_max, + distances, + &mut output, + output_sector, + ); + + angles[0] = angles[UNROLL]; + new_acc + }, + ); + + LOS::calculate_angles( + pov_height, + rest_line, + rest_distances, + rest_adjustments, + &mut angles[1..], + ); + + LOS::prefix_max(prefix_max[UNROLL - 1], &angles[..UNROLL], &mut prefix_max); + + let new_acc = LOS::accumulate( + los, + &angles[1..], + &prefix_max, + rest_distances, + &mut output, + output_sector, + ); + + let (heatmap, longest) = new_acc.into(); + (heatmap, longest, output) + } +} diff --git a/crates/total-viewsheds/src/cpu/mod.rs b/crates/total-viewsheds/src/cpu/mod.rs new file mode 100644 index 0000000..e49a6f1 --- /dev/null +++ b/crates/total-viewsheds/src/cpu/mod.rs @@ -0,0 +1,9 @@ +/// los contains all the traits necessary for implementing a line of sight algorithm +mod los; + +/// vector contains vectorized implementations of the line of sight traits +mod vector; + +/// kernel is the exported kernel module +mod kernel; +pub use kernel::kernel; diff --git a/crates/total-viewsheds/src/cpu/vector.rs b/crates/total-viewsheds/src/cpu/vector.rs new file mode 100644 index 0000000..cd50b05 --- /dev/null +++ b/crates/total-viewsheds/src/cpu/vector.rs @@ -0,0 +1,557 @@ +use itertools::izip; +use std::iter::zip; +use std::simd::cmp::SimdPartialOrd as _; +use std::simd::num::SimdFloat as _; +use std::simd::prelude::SimdInt as _; +use std::simd::{f32x4, f32x8, LaneCount, Mask, Simd, SupportedLaneCount}; + +/// `TAN_ONE_RAD` is used in normalizing the +const TAN_ONE_RAD: f32 = 0.017_453_3; + +use crate::cpu::los::{Accumulate, Angle, PrefixMax, Unroll}; + +/// `VectorMax` performs an element-wise SIMD max of floats, allowing for architecture +/// specific implementations +trait VectorMax +where + LaneCount: SupportedLaneCount, +{ + /// `max` computes an element-wise maximum from lhs and rhs, assuming neither contain NaNs + /// or -0.0 + fn max(lhs: Simd, rhs: Simd) -> Simd; +} + +/// `VectorGreater` performs a SIMD greater than of floats, allowing for architecture +/// specific implementations +trait VectorGreater +where + LaneCount: SupportedLaneCount, +{ + /// gt computes an element-wise maximum from lhs and rhs, assuming neither contain NaNs + /// or -0.0 + fn gt(lhs: Simd, rhs: Simd) -> Mask; +} + +/// `VectorLos` is an implementation of the internals of `PrefixMax`, `Angle`, and `Accumulate` +/// for Portable SIMD +pub struct VectorLos +where + LaneCount: SupportedLaneCount; + +impl VectorMax for VectorLos +where + LaneCount: SupportedLaneCount, +{ + #[inline] + default fn max(lhs: Simd, rhs: Simd) -> Simd { + lhs.simd_max(rhs) + } +} + +impl VectorGreater for VectorLos +where + LaneCount: SupportedLaneCount, +{ + #[inline] + default fn gt(lhs: Simd, rhs: Simd) -> Mask { + lhs.simd_gt(rhs) + } +} + +#[cfg(target_feature = "sse")] +impl VectorMax<4> for VectorLos<4> { + #[inline] + fn max(lhs: f32x4, rhs: f32x4) -> f32x4 { + use std::arch::x86_64::_mm_max_ps; + + // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input + // thus allowing this to be non IEEE754 compliant + unsafe { _mm_max_ps(lhs.into(), rhs.into()) }.into() + } +} + +#[cfg(target_feature = "avx")] +impl VectorGreater<4> for VectorLos<4> { + fn gt(lhs: f32x4, rhs: f32x4) -> Mask { + use std::arch::x86_64::{_mm_castps_si128, _mm_cmp_ps, _CMP_GT_OS}; + + // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input + // thus allowing this to be non IEEE754 compliant + unsafe { + let mask = _mm_castps_si128(_mm_cmp_ps::<_CMP_GT_OS>(lhs.into(), rhs.into())); + Mask::::from_int_unchecked(mask.into()) + } + } +} + +impl PrefixMax for VectorLos<4> { + #[inline] + fn prefix_max(highest: f32, angles_in: &[f32], angles_out: &mut [f32]) { + let (vector_angles, _) = angles_in.as_chunks::<4>(); + let (vector_prefix, _) = angles_out.as_chunks_mut::<4>(); + + for (prefix, &angle) in zip(vector_prefix.iter_mut(), vector_angles.iter()) { + let start = Simd::from(angle); + + let mut v_prefix_max = { + let shifted = start.shift_elements_right::<1>(-2000.0f32); + Self::max(start, shifted) + }; + + v_prefix_max = { + let shifted = v_prefix_max.shift_elements_right::<2>(-2000.0f32); + Self::max(v_prefix_max, shifted) + }; + + v_prefix_max.copy_to_slice(prefix); + } + + let mut local_acc = Simd::splat(highest); + + // accumulate the prefix maxes for blocks, re-computing all prefix maxes + // to include the accumulated value + for prefix in vector_prefix { + let cur_prefix: f32x4 = Simd::from(*prefix); + let cur_max: f32x4 = Simd::splat(cur_prefix[3]); + + Self::max(local_acc, cur_prefix).copy_to_slice(prefix); + local_acc = Self::max(local_acc, cur_max); + } + } +} + +#[cfg(target_feature = "avx")] +impl VectorMax<8> for VectorLos<8> { + #[inline] + fn max(lhs: f32x8, rhs: f32x8) -> f32x8 { + use std::arch::x86_64::_mm256_max_ps; + // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input + // thus allowing this to be non IEEE754 compliant + unsafe { _mm256_max_ps(lhs.into(), rhs.into()).into() } + } +} + +#[cfg(all( + target_feature = "sse", + target_feature = "avx", + target_feature = "avx2" +))] +impl PrefixMax for VectorLos<8> { + #[inline] + fn prefix_max(highest: f32, angles_in: &[f32], angles_out: &mut [f32]) { + use std::arch::x86_64::{ + _mm256_blend_ps, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_slli_si256, + _mm_max_ps, + }; + + debug_assert!( + angles_in.len().is_multiple_of(8) && angles_in.len() == angles_out.len(), + "inconsistent lengths, buffer must be multiple of vector length" + ); + { + let (vector_angles, _) = angles_in.as_chunks::<8>(); + let (vector_prefix, _) = angles_out.as_chunks_mut::<8>(); + + izip!(vector_prefix.iter_mut(), vector_angles.iter()).for_each(|(prefix, &angle)| { + let start = Simd::from_array(angle); + // safety: PrefixMax for VectorLos<8> is guarded by a cfg block for all SIMD instructions + let mut v_prefix_max = unsafe { + let shifted = _mm256_slli_si256::<4>(_mm256_castps_si256(start.into())); + let blended = _mm256_blend_ps::<0b0001_0001>( + _mm256_castsi256_ps(shifted), + Simd::splat(-2000.0f32).into(), + ); + Self::max(start, blended.into()) + }; + + // safety: PrefixMax for VectorLos<8> is guarded by a cfg block for all SIMD instructions + v_prefix_max = unsafe { + let shifted = _mm256_slli_si256::<8>(_mm256_castps_si256(v_prefix_max.into())); + let blended = _mm256_blend_ps::<0b0011_0011>( + _mm256_castsi256_ps(shifted), + Simd::splat(-2000.0f32).into(), + ); + + Self::max(v_prefix_max, blended.into()) + }; + + v_prefix_max.copy_to_slice(prefix); + }); + }; + + { + let mut acc: f32x4 = Simd::splat(highest); + let (vector_prefix, _) = angles_out.as_chunks_mut::<4>(); + for prefix in vector_prefix.iter_mut() { + // safety: PrefixMax for VectorLos<8> is guarded by a cfg block for all SIMD instructions + let new_max: f32x4 = + unsafe { _mm_max_ps(Simd::from_array(*prefix).into(), acc.into()) }.into(); + + let cur_max = Simd::splat(prefix[3]); + + // safety: PrefixMax for VectorLos<8> is guarded by a cfg block for all SIMD instructions + acc = unsafe { _mm_max_ps(acc.into(), cur_max.into()).into() }; + + new_max.copy_to_slice(prefix); + } + } + } +} + +#[cfg(target_feature = "avx")] +impl VectorGreater<8> for VectorLos<8> { + #[inline] + fn gt(lhs: f32x8, rhs: f32x8) -> Mask { + use std::arch::x86_64::{_mm256_castps_si256, _mm256_cmp_ps, _CMP_GT_OS}; + + // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input + // thus allowing this to be non IEEE754 compliant + unsafe { + let mask = _mm256_castps_si256(_mm256_cmp_ps::<_CMP_GT_OS>(lhs.into(), rhs.into())); + Mask::::from_int_unchecked(mask.into()) + } + } +} + +#[cfg(target_feature = "avx512f")] +impl VectorMax<16> for VectorLos<16> { + #[inline] + fn max(lhs: Simd, rhs: Simd) -> Simd { + use std::arch::x86_64::_mm512_max_ps; + // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input + // thus allowing this to be non IEEE754 compliant + unsafe { _mm512_max_ps(lhs.into(), rhs.into()).into() } + } +} + +#[cfg(target_feature = "avx512f")] +impl VectorGreater<16> for VectorLos<16> { + #[inline] + fn gt(lhs: Simd, rhs: Simd) -> Mask { + use std::arch::x86_64::_mm512_cmple_ps_mask; + // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input + // thus allowing this to be non IEEE754 compliant + unsafe { + let mask = _mm512_cmple_ps_mask(lhs.into(), rhs.into()); + Mask::::from_bitmask(mask.into()) + } + } +} + +#[cfg(target_feature = "avx512f")] +impl PrefixMax for VectorLos<16> { + #[inline] + fn prefix_max(highest: f32, angles_in: &[f32], angles_out: &mut [f32]) { + use std::arch::x86_64::{ + __m512, _mm512_alignr_epi32, _mm512_castps_si512, _mm512_castsi512_ps, _mm512_max_ps, + }; + use std::simd::f32x16; + + #[cfg(target_feature = "avx512f")] + #[expect( + clippy::cast_sign_loss, + clippy::as_conversions, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + reason = "K is always in bounds" + )] + fn mm512_slli_si512(elem: __m512) -> __m512 + where + [(); { (16 - K) as i32 } as usize]:, + { + // safety: all mm512 intrinsics are guarded by avx512f + unsafe { + let zero = f32x16::splat(-2000.0f32); + _mm512_castsi512_ps(_mm512_alignr_epi32::<{ (16 - K) as i32 }>( + _mm512_castps_si512(elem), + _mm512_castps_si512(zero.into()), + )) + } + } + + let (vector_prefix, _) = angles_out.as_chunks_mut::<16>(); + let (vector_angle, _) = angles_in.as_chunks::<16>(); + + for (prefix, &angle) in zip(vector_prefix.iter_mut(), vector_angle.iter()) { + let simd_angle = Simd::from_array(angle); + // safety: all the following operations are guarded by the avx512f build falg + unsafe { + let mut v_prefix_max = + _mm512_max_ps(simd_angle.into(), mm512_slli_si512::<1>(simd_angle.into())); + + v_prefix_max = _mm512_max_ps(v_prefix_max, mm512_slli_si512::<2>(v_prefix_max)); + + v_prefix_max = _mm512_max_ps(v_prefix_max, mm512_slli_si512::<4>(v_prefix_max)); + + v_prefix_max = _mm512_max_ps(v_prefix_max, mm512_slli_si512::<8>(v_prefix_max)); + + let simd_prefix_max: f32x16 = v_prefix_max.into(); + simd_prefix_max.copy_to_slice(prefix); + } + } + + let mut local_acc = f32x16::splat(highest); + + // accumulate the prefix maxes for blocks, re-computing all prefix maxes + // to include the accumulated value + for prefix in vector_prefix.iter_mut() { + let cur_prefix: f32x16 = Simd::from_array(*prefix); + let cur_max: f32x16 = Simd::splat(cur_prefix[15]); + + Self::max(local_acc, cur_prefix).copy_to_slice(prefix); + local_acc = Self::max(local_acc, cur_max); + } + } +} + +impl Angle for VectorLos +where + LaneCount: SupportedLaneCount, +{ + #[inline] + default fn calculate_angles( + pov_height: f32, + elevations: &[i16], + distances: &[f32], + adjustments: &[f32], + angles_out: &mut [f32], + ) { + debug_assert!( + elevations.len().is_multiple_of(WIDTH), + "expected elevations to be a multiple of {WIDTH}", + ); + + debug_assert!( + distances.len().is_multiple_of(WIDTH), + "expected distances to be a multiple of {WIDTH}", + ); + + debug_assert!( + adjustments.len().is_multiple_of(WIDTH), + "expected adjustments to be a multiple of {WIDTH}", + ); + + debug_assert!( + angles_out.len().is_multiple_of(WIDTH), + "expected angles buf to be a multiple of {WIDTH}", + ); + + let (vector_angles, _) = angles_out.as_chunks_mut::<{ WIDTH }>(); + let (vector_elevations, _) = elevations.as_chunks::<{ WIDTH }>(); + let (vector_adjustments, _) = adjustments.as_chunks::<{ WIDTH }>(); + let (vector_distances, _) = distances.as_chunks::<{ WIDTH }>(); + + for (angle, &elevation, &distance, &adjustment) in izip!( + vector_angles.iter_mut(), + vector_elevations.iter(), + vector_distances.iter(), + vector_adjustments.iter() + ) { + let float_elevation: Simd = Simd::from(elevation).cast(); + + let adjusted = float_elevation - Simd::splat(pov_height); + + let res = (adjusted / Simd::from_array(distance)) - Simd::from_array(adjustment); + + res.copy_to_slice(angle); + } + } +} + +impl Accumulate<(f32, f32)> for VectorLos +where + LaneCount: SupportedLaneCount, +{ + #[inline] + fn accumulate( + init: (f32, f32), + angles: &[f32], + prefix: &[f32], + distances: &[f32], + bitmap: &mut Vec, + output_sector: bool, + ) -> (f32, f32) { + debug_assert!(angles.len().is_multiple_of(WIDTH), ""); + debug_assert!(prefix.len().is_multiple_of(WIDTH), ""); + debug_assert!(distances.len().is_multiple_of(WIDTH), ""); + + let (vector_angles, _) = angles.as_chunks::<{ WIDTH }>(); + let (vector_prefix, _) = prefix.as_chunks::<{ WIDTH }>(); + let (vector_dists, _) = distances.as_chunks::<{ WIDTH }>(); + + izip!(vector_angles, vector_prefix, vector_dists,).fold( + init, + |acc, (&angle_arr, &prefix_arr, &distances_arr)| { + let mask = Self::gt(Simd::from_array(angle_arr), Simd::from_array(prefix_arr)); + if output_sector { + bitmap.extend(mask.to_array()); + } + + if !mask.any() { + return acc; + } + + let dist = mask.select(Simd::from_array(distances_arr), Simd::splat(0.0f32)); + ( + acc.0 + (dist * Simd::::splat(TAN_ONE_RAD)).reduce_sum(), + acc.1.max(dist.reduce_max()), + ) + }, + ) + } +} + +/// `GenericExpr` lets a const generic expression be evaluated in its +/// `CONDITION` parameter for traits that need to evaluate constant expressions +/// as part of their trait bounds +struct GenericExpr; + +/// `IsTrue `is a "marker" trait for trait bounds for when a const generic expr +/// evaluates to true, and is only implemented for `GenericExpr<{true}>` +trait IsTrue {} +impl IsTrue for GenericExpr {} + +impl Accumulate> for VectorLos +where + LaneCount: SupportedLaneCount, + GenericExpr<{ SIZE.is_multiple_of(WIDTH) }>: IsTrue, +{ + #[inline] + fn accumulate( + mut init: Unroll, + angles: &[f32], + prefix: &[f32], + distances: &[f32], + bitmap: &mut Vec, + output_sector: bool, + ) -> Unroll { + debug_assert!( + angles.len().is_multiple_of(WIDTH), + "distance unroll should be multiple of width" + ); + debug_assert!( + prefix.len().is_multiple_of(WIDTH), + "distance unroll should be multiple of width" + ); + debug_assert!( + distances.len().is_multiple_of(WIDTH), + "distance unroll should be multiple of width" + ); + debug_assert!(angles.len() <= SIZE, "angles must be less than unroll size"); + + let (vector_sum, _) = init.heatmap.as_chunks_mut::<{ WIDTH }>(); + let (vector_longest, _) = init.longest.as_chunks_mut::<{ WIDTH }>(); + + let (vector_angles, _) = angles.as_chunks::<{ WIDTH }>(); + let (vector_prefix, _) = prefix.as_chunks::<{ WIDTH }>(); + let (vector_dists, _) = distances.as_chunks::<{ WIDTH }>(); + + izip!( + vector_sum, + vector_longest, + vector_angles, + vector_prefix, + vector_dists, + ) + .for_each( + |(sum_arr, longest_arr, &angle_arr, &prefix_arr, &distances_arr)| { + let mask = Self::gt(Simd::from_array(angle_arr), Simd::from_array(prefix_arr)); + + if output_sector { + bitmap.extend(mask.to_array()); + } + + if !mask.any() { + return; + } + + let dist = mask.select(Simd::from_array(distances_arr), Simd::splat(0.0f32)); + + Self::max(Simd::from_array(*longest_arr), dist).copy_to_slice(longest_arr); + + let acc = Simd::from(*sum_arr) + dist; + + acc.copy_to_slice(sum_arr); + }, + ); + + init + } +} + +/// `DEFAULT_VECTOR_LENGTH` determines the CPU Kernel's default vector length based off +/// the architecture that the binary is built for +pub const DEFAULT_VECTOR_LENGTH: usize = const { + if cfg!(target_feature = "avx512f") { + 16 + } else if cfg!(target_feature = "sse") && cfg!(target_feature = "sse2") { + 8 + } else { + 4 + } +}; + +impl From> for (f32, f32) +where + GenericExpr<{ UNROLL.is_multiple_of(DEFAULT_VECTOR_LENGTH) }>: IsTrue, +{ + fn from(val: Unroll) -> Self { + let (heatmap, _) = val.heatmap.as_chunks::(); + let (longest, _) = val.longest.as_chunks::(); + + let heat = heatmap + .iter() + .fold(Simd::splat(0.0f32), |acc, &heat| { + acc + Simd::from_array(heat) + }) + .reduce_sum(); + + let long = longest + .iter() + .fold(Simd::splat(0.0f32), |acc, &long| { + VectorLos::::max(acc, Simd::from_array(long)) + }) + .reduce_max(); + + (heat, long) + } +} + +#[cfg(test)] +mod test { + use crate::cpu::los::{LineOfSight as _, UnrolledLOS}; + use crate::cpu::vector::VectorLos; + + #[test] + fn line_of_sight_four() { + let mut vs = UnrolledLOS::<64>::new(16); + let (visibility, longest, sector) = vs.line_of_sight::>( + 0.0f32, + &[ + 1000, 4000, 9000, 12000, 3000, 30000, 3000, 3000, 1000, 4000, 9000, 12000, 3000, + 30000, 3000, 3000, + ], + true, + ); + println!("{:?} {:?} {:?}", visibility, longest, sector); + } + + #[test] + #[cfg(all( + target_feature = "sse", + target_feature = "avx", + target_feature = "avx2" + ))] + fn line_of_sight_eight() { + let mut vs = UnrolledLOS::<64>::new(16); + let (visibility, longest, sector) = vs.line_of_sight::>( + 0.0f32, + &[ + 1000, 4000, 9000, 12000, 3000, 30000, 3000, 3000, 1000, 4000, 9000, 12000, 3000, + 30000, 3000, 3000, + ], + true, + ); + println!("{:?} {:?} {:?}", visibility, longest, sector); + } +} diff --git a/crates/total-viewsheds/src/main.rs b/crates/total-viewsheds/src/main.rs index c55cd38..68803dc 100644 --- a/crates/total-viewsheds/src/main.rs +++ b/crates/total-viewsheds/src/main.rs @@ -1,5 +1,6 @@ //! Total Viewshed Calculator #![feature(portable_simd)] +#![feature(specialization)] #![expect( incomplete_features, reason = "our usage isn't crazy and unlikely to break" @@ -47,7 +48,10 @@ mod output { pub mod ring_data; pub mod viewshed; } + +/// cpu implements a CPU kernel for the longest line of sight mod cpu; + mod projection; fn main() -> Result<()> { diff --git a/scripts/apt_init.sh b/scripts/apt_init.sh index a4a64a6..ad5ed7f 100644 --- a/scripts/apt_init.sh +++ b/scripts/apt_init.sh @@ -2,6 +2,7 @@ sudo apt install -y git clang curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +source ~/.bashrc git clone https://github.com/AllTheLines/CacheTVS -cd CacheTVS && git checkout cpu-clean +cd CacheTVS && git checkout rberger/cpu-clean