diff --git a/Cargo.toml b/Cargo.toml
index b8b169d..8553c6d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -61,6 +61,7 @@ multiple_inherent_impl = "allow"
 upper_case_acronyms = "allow"
 struct_field_names = "allow"
 self_named_module_files = "allow"
+mod_module_files = "allow"
 
 # I just personally prefer the `match` syntax for if-let matching.
 option_if_let_else = "allow"
diff --git a/crates/total-viewsheds/Cargo.toml b/crates/total-viewsheds/Cargo.toml
index d329087..599edad 100644
--- a/crates/total-viewsheds/Cargo.toml
+++ b/crates/total-viewsheds/Cargo.toml
@@ -2,7 +2,8 @@
 name = "total-viewsheds"
 version = "0.1.0"
 edition = "2021"
-rust-version = "1.88"
+rust-version = "1.89"
+
 
 [dependencies]
 bytemuck = "1.22.0"
@@ -29,5 +30,3 @@ googletest = "0.14.2"
 
 [lints]
 workspace = true
-
-
diff --git a/crates/total-viewsheds/src/compute.rs b/crates/total-viewsheds/src/compute.rs
index accbaf7..d5ac19a 100644
--- a/crates/total-viewsheds/src/compute.rs
+++ b/crates/total-viewsheds/src/compute.rs
@@ -1,6 +1,12 @@
 //! The main entrypoint for running computations.
 
+use crate::los_pack::LineOfSightPacked;
 use color_eyre::Result;
+use rayon::iter::IntoParallelIterator as _;
+use rayon::iter::ParallelIterator as _;
+use rayon::ThreadPoolBuilder;
+use std::sync::Mutex;
+use std::time::Instant;
 
 /// The number of angles we rotate through. The other half are done via "backwards" lines of sight.
 pub const SECTOR_STEPS: u16 = 180;
@@ -27,10 +33,6 @@ pub struct Compute<'compute> {
     pub longest_lines: Vec<crate::los_pack::LineOfSightPacked>,
 }
 
-/// `NUM_CORES` is the physical number of cores on a machine. Currently hardcoded to 8
-/// as that is what an i9900k has, and is a common configuration.
-/// TODO find a good syscall for this
-const NUM_CORES: usize = 8;
 /// Configuration for computing.
 pub struct ComputeConfig {
     /// The height of the observer that views viewsheds.
@@ -94,10 +96,6 @@ impl<'compute> Compute<'compute> {
         // We only need the "chocolate box" section of rotations to do visibility calculations.
         let rotations_size = kernel::chocolate_box::size(dem.width, dem.tvs_width);
 
-        #[expect(
-            clippy::if_then_some_else_none,
-            reason = "The `?` is hard to use in the closure"
-        )]
         let vulkan = if matches!(config.backend, crate::config::Backend::Vulkan) {
             let elevations = dem.elevations.clone();
             dem.elevations = Vec::new(); // Free up some RAM.
@@ -259,32 +257,69 @@ impl<'compute> Compute<'compute> {
 
     /// `run_parallel` runs the CPU kernel in parallel
     fn run_parallel(&mut self) -> Result<()> {
-        #[expect(
-            clippy::as_conversions,
-            clippy::cast_possible_truncation,
-            reason = "elevations start out as i16s, and i16 -> f32 -> i16 is lossless"
-        )]
-        let elevations = self
-            .dem
-            .elevations
-            .iter()
-            .map(|&x| x as i16)
-            .collect::<Vec<i16>>();
-
-        #[expect(clippy::as_conversions, reason = "u32 -> usize is valid")]
-        // TODO: third param is ring data which needs to be saved
-        let (surfaces, _longest, _) = crate::cpu::multithreaded_kernel(
-            &elevations,
-            self.dem.max_los_as_points as usize,
-            360,
-            NUM_CORES,
-            false,
-        );
+        let max_los = usize::try_from(self.dem.max_los_as_points)?;
+        let mut surfaces = vec![0.0f32; max_los * max_los];
+        let mut longest = vec![(0u16, 0.0f32); max_los * max_los];
+
+        let pool = ThreadPoolBuilder::new().num_threads(8).build()?;
 
-        self.add_sector_surfaces_to_running_total(&surfaces);
+        {
+            let angle_mu = &Mutex::new(&mut surfaces);
+            let longest_mu = &Mutex::new(&mut longest);
+
+            let elevations = &self.dem.elevations;
+
+            pool.install(move || {
+                (0u16..360u16)
+                    .into_par_iter()
+                    .map(|angle| {
+                        let start = Instant::now();
+                        tracing::info!("starting angle: {angle}");
+                        let (heatmap, long, _) =
+                            crate::cpu::kernel(elevations, max_los, f32::from(angle), false);
+                        tracing::info!("finished angle in {:?}", start.elapsed());
+                        (angle, heatmap, long)
+                    })
+                    .for_each(|(angle, heatmap, long)| {
+                        #[expect(clippy::expect_used, reason = "a poisoned mutex should crash")]
+                        angle_mu
+                            .lock()
+                            .expect("mutex poisoned")
+                            .iter_mut()
+                            .zip(heatmap)
+                            .for_each(|(to, from)| {
+                                *to += from;
+                            });
+
+                        #[expect(clippy::expect_used, reason = "a poisoned mutex should crash")]
+                        longest_mu
+                            .lock()
+                            .expect("mutex poisoned")
+                            .iter_mut()
+                            .zip(long)
+                            .for_each(|(to, from)| {
+                                if from > to.1 {
+                                    *to = (angle, from);
+                                }
+                            });
+                    });
+            });
+        };
 
-        // TODO: Pack longest lines
-        // self.longest_lines = longest;
+        self.total_surfaces = surfaces;
+        let packed: Result<Vec<LineOfSightPacked>> = longest
+            .iter()
+            .map(|&(angle, distance): &(u16, f32)| {
+                #[expect(
+                    clippy::as_conversions,
+                    clippy::cast_possible_truncation,
+                    clippy::cast_sign_loss,
+                    reason = "distances always fit in u32"
+                )]
+                LineOfSightPacked::new(distance as u32, angle)
+            })
+            .collect();
+        self.longest_lines = packed?;
 
         self.render_total_surfaces()?;
         self.render_longest_lines()?;
diff --git a/crates/total-viewsheds/src/cpu.rs b/crates/total-viewsheds/src/cpu.rs
deleted file mode 100644
index 6c42808..0000000
--- a/crates/total-viewsheds/src/cpu.rs
+++ /dev/null
@@ -1,987 +0,0 @@
-//! `cpu` is a CPU version of the total viewshed calculation
-
-use itertools::izip;
-use rayon::iter::ParallelIterator as _;
-
-#[cfg(target_feature = "avx512f")]
-use std::arch::x86_64::{
-    __m512, _mm256_alignr_epi32, _mm256_mask_alignr_epi32, _mm512_alignr_epi32,
-    _mm512_castps_si512, _mm512_castsi512_ps, _mm512_cmple_ps_mask, _mm512_max_ps,
-};
-
-use rayon::prelude::IntoParallelIterator as _;
-use rayon::ThreadPoolBuilder;
-#[cfg(all(
-    target_feature = "avx2",
-    target_feature = "avx",
-    target_feature = "sse",
-    target_feature = "sse2"
-))]
-use std::arch::x86_64::{
-    _mm256_blend_ps, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_cmp_ps, _mm256_max_ps,
-    _mm256_slli_si256, _CMP_LE_OS,
-};
-use std::iter::zip;
-use std::simd::prelude::*;
-use std::simd::{LaneCount, Mask, SupportedLaneCount};
-use std::sync::Mutex;
-use std::time::Instant;
-use std::{array, f32, mem, slice};
-
-#[cfg(all(target_feature = "sse", target_feature = "sse2"))]
-use std::arch::x86_64::{_mm_castps_si128, _mm_cmpge_ps, _mm_max_ps};
-
-/// `EARTH_RADIUS_SQUARED` is the earth's radius squared in meters
-const EARTH_RADIUS_SQUARED: f32 = 12_742_000.0;
-
-/// `TAN_ONE_RAD` helps normalize the fact that inner points are sampled more often
-/// see the TVS paper for reasoning.
-const TAN_ONE_RAD: f32 = 0.017_453_3;
-
-/// `Vectorized` is an empty struct to allow for specializations of the total viewhshed algorithm
-/// TODO: maybe we can just use a generic struct?
-struct Vectorized;
-
-/// `Viewshed` holds all the platform and vector-width specific methods the CPU kernel
-/// needs to operate.
-trait Viewshed<const WIDTH: usize>
-where
-    LaneCount<WIDTH>: SupportedLaneCount,
-{
-    /// `gte` takes in a vector of angles and its prefix maximum returns a mask of
-    /// i32s which are either -1 or 0 in each lane. This way it can be used to "select"
-    /// which lanes of the target vector to use for further calculations
-    fn gte(&self, angle: Simd<f32, WIDTH>, prefix: Simd<f32, WIDTH>) -> Mask<i32, WIDTH>;
-
-    /// `max` returns the lane-wise maximum of both vectors. It exists to help platform-specific
-    /// and potentially "unsafe" (in floating point terms) and speedier implementations
-    fn max(&self, lhs: Simd<f32, WIDTH>, rhs: Simd<f32, WIDTH>) -> Simd<f32, WIDTH>;
-
-    /// `prefix_max` calculates a prefix maximum given all of the `angles` and stores
-    /// it in `prefix_max`
-    fn prefix_max(
-        &self,
-        angles: &[Simd<f32, WIDTH>],
-        prefix_max: &mut [Simd<f32, WIDTH>],
-        acc: Simd<f32, WIDTH>,
-    ) -> Simd<f32, WIDTH>;
-}
-
-impl Viewshed<4> for Vectorized {
-    #[inline]
-    #[cfg(all(target_feature = "sse", target_feature = "sse2"))]
-    fn gte(&self, angle: f32x4, prefix: f32x4) -> Mask<i32, 4> {
-        // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input
-        // thus allowing this to be non IEEE754 compliant
-        unsafe {
-            let mask = _mm_castps_si128(_mm_cmpge_ps(angle.into(), prefix.into()));
-            Mask::<i32, 4>::from_int_unchecked(mask.into())
-        }
-    }
-
-    #[inline]
-    #[cfg(not(all(target_feature = "sse", target_feature = "sse2")))]
-    fn gte(&self, lhs: f32x4, rhs: f32x4) -> Mask<i32, 4> {
-        lhs.simd_ge(rhs)
-    }
-
-    #[inline]
-    #[cfg(all(target_feature = "sse", target_feature = "sse2"))]
-    fn max(&self, lhs: f32x4, rhs: f32x4) -> Simd<f32, 4> {
-        // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input
-        // thus allowing this to be non IEEE754 compliant
-        unsafe { _mm_max_ps(lhs.into(), rhs.into()).into() }
-    }
-
-    #[inline]
-    #[cfg(not(all(target_feature = "sse", target_feature = "sse2")))]
-    fn max(&self, lhs: f32x4, rhs: f32x4) -> Simd<f32, 4> {
-        lhs.simd_max(r)
-    }
-
-    #[inline]
-    fn prefix_max(&self, angles: &[f32x4], prefix_max: &mut [f32x4], acc: f32x4) -> f32x4 {
-        for (prefix, &angle) in zip(prefix_max.iter_mut(), angles.iter()) {
-            let mut v_prefix_max = {
-                let shifted = angle.shift_elements_right::<1>(-2000.0f32);
-                self.max(angle, shifted)
-            };
-
-            v_prefix_max = {
-                let shifted = v_prefix_max.shift_elements_right::<2>(-2000.0f32);
-                self.max(v_prefix_max, shifted)
-            };
-
-            *prefix = v_prefix_max;
-        }
-
-        let mut local_acc = acc;
-
-        // accumulate the prefix maxes for blocks, re-computing all prefix maxes
-        // to include the accumulated value
-        for prefix in prefix_max {
-            let cur_prefix: f32x4 = *prefix;
-            let cur_max: f32x4 = Simd::splat(cur_prefix[3]);
-
-            *prefix = self.max(local_acc, cur_prefix);
-            local_acc = self.max(local_acc, cur_max);
-        }
-
-        local_acc
-    }
-}
-
-#[cfg(all(target_feature = "avx2", target_feature = "avx"))]
-impl Viewshed<8> for Vectorized {
-    #[inline]
-    fn gte(&self, angle: f32x8, prefix: f32x8) -> Mask<i32, 8> {
-        // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input
-        // thus allowing this to be non IEEE754 compliant
-        unsafe {
-            let mask =
-                _mm256_castps_si256(_mm256_cmp_ps::<_CMP_LE_OS>(prefix.into(), angle.into()));
-            Mask::<i32, 8>::from_int_unchecked(mask.into())
-        }
-    }
-
-    #[inline]
-    fn max(&self, lhs: f32x8, rhs: f32x8) -> Simd<f32, 8> {
-        // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input
-        // thus allowing this to be non IEEE754 compliant
-        unsafe { _mm256_max_ps(lhs.into(), rhs.into()).into() }
-    }
-
-    #[inline]
-    fn prefix_max(&self, angles: &[f32x8], prefix_max: &mut [f32x8], acc: f32x8) -> f32x8 {
-        // Calculate the 4-wide block prefix max two at a time
-        for (prefix, &angle) in zip(prefix_max.iter_mut(), angles.iter()) {
-            // safety: all mm256 operations are avx2, and Viewshed<8> has feature guards for both
-            let mut v_prefix_max = unsafe {
-                let shifted = _mm256_slli_si256::<4>(_mm256_castps_si256(angle.into()));
-                let blended = _mm256_blend_ps::<0b1000_1000>(
-                    _mm256_castsi256_ps(shifted),
-                    Simd::splat(-2000.0f32).into(),
-                );
-                self.max(angle, blended.into())
-            };
-
-            // safety: all mm256 operations are avx2, and Viewshed<8> has feature guards for both
-            v_prefix_max = unsafe {
-                let shifted = _mm256_slli_si256::<8>(_mm256_castps_si256(v_prefix_max.into()));
-                let blended = _mm256_blend_ps::<0b1100_1100>(
-                    _mm256_castsi256_ps(shifted),
-                    Simd::splat(-2000.0f32).into(),
-                );
-
-                self.max(v_prefix_max, blended.into())
-            };
-
-            *prefix = v_prefix_max;
-        }
-
-        let mut local_acc = f32x4::splat(acc[3]);
-
-        // safety: because f32x8s are aligned to exactly sizeof(f32x4) * 2
-        // this is well aligned, so the cast is valid
-        //
-        // This is SUPER MEGA UBER VERY sketchy, and shouldn't be copied
-        // unless you _really_, _truly_ understand what the compiler will do
-        let single_wide_prefx: &mut [f32x4] = unsafe {
-            let ptr = prefix_max.as_mut_ptr();
-            slice::from_raw_parts_mut(ptr.cast::<f32x4>(), prefix_max.len() * 2)
-        };
-
-        // accumulate the prefix maxes for blocks, re-computing all prefix maxes
-        // to include the accumulated value
-        for prefix in single_wide_prefx {
-            let cur_prefix: f32x4 = *prefix;
-            let cur_max: f32x4 = Simd::splat(cur_prefix[3]);
-
-            *prefix = self.max(local_acc, cur_prefix);
-            local_acc = self.max(local_acc, cur_max);
-        }
-
-        f32x8::splat(local_acc[3])
-    }
-}
-
-#[cfg(target_feature = "avx512f")]
-fn _mm512_slli_si512<const K: usize>(elem: __m512) -> __m512
-where
-    [(); { (16 - K) as i32 } as usize]:,
-{
-    unsafe {
-        let zero = f32x16::splat(-2000.0f32);
-        _mm512_castsi512_ps(_mm512_alignr_epi32::<{ (16 - K) as i32 }>(
-            _mm512_castps_si512(elem),
-            _mm512_castps_si512(zero.into()),
-        ))
-    }
-}
-
-#[cfg(target_feature = "avx512f")]
-impl Viewshed<16> for Vectorized {
-    #[inline]
-    fn gte(&self, angle: f32x16, prefix: f32x16) -> Mask<i32, 16> {
-        // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input
-        // thus allowing this to be non IEEE754 compliant
-        unsafe {
-            let mask = _mm512_cmple_ps_mask(prefix.into(), angle.into());
-            Mask::<i32, 16>::from_bitmask(mask.into())
-        }
-    }
-
-    #[inline]
-    fn max(&self, lhs: f32x16, rhs: f32x16) -> f32x16 {
-        // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input
-        // thus allowing this to be non IEEE754 compliant
-        unsafe { _mm512_max_ps(lhs.into(), rhs.into()).into() }
-    }
-
-    #[inline]
-    fn prefix_max(&self, angles: &[f32x16], prefix_max: &mut [f32x16], acc: f32x16) -> f32x16 {
-        // Calculate the 4-wide block prefix max two at a time
-        for (prefix, &angle) in zip(prefix_max.iter_mut(), angles.iter()) {
-            unsafe {
-                let mut v_prefix_max =
-                    _mm512_max_ps(angle.into(), _mm512_slli_si512::<1>(angle.into()).into());
-                v_prefix_max = _mm512_max_ps(
-                    v_prefix_max.into(),
-                    _mm512_slli_si512::<2>(v_prefix_max).into(),
-                );
-                v_prefix_max = _mm512_max_ps(
-                    v_prefix_max.into(),
-                    _mm512_slli_si512::<4>(v_prefix_max).into(),
-                );
-                v_prefix_max = _mm512_max_ps(
-                    v_prefix_max.into(),
-                    _mm512_slli_si512::<8>(v_prefix_max).into(),
-                );
-                *prefix = v_prefix_max.into();
-            }
-        }
-
-        let mut local_acc = f32x16::splat(acc[0]);
-
-        // accumulate the prefix maxes for blocks, re-computing all prefix maxes
-        // to include the accumulated value
-        for prefix in prefix_max {
-            let cur_prefix: f32x16 = *prefix;
-            let cur_max: f32x16 = Simd::splat(cur_prefix[15]);
-
-            *prefix = self.max(local_acc, cur_prefix);
-            local_acc = self.max(local_acc, cur_max);
-        }
-
-        f32x16::splat(local_acc[0])
-    }
-}
-
-#[inline]
-/// `load_elevations` converts an array of i16 elevations into a height adjusted
-fn load_elevations<const N: usize>(elev_arr: [i16; N], pov_height: f32) -> Simd<f32, N>
-where
-    LaneCount<N>: SupportedLaneCount,
-{
-    let elevs = Simd::<i16, N>::from_array(elev_arr);
-    let float_elevs: Simd<f32, N> = elevs.cast();
-    float_elevs - Simd::splat(pov_height)
-}
-
-/// `IndexSIMD` holds the `dem_ids`/"indexes" of the current line of sight,
-/// along with where they will be written out to
-struct IndexSIMD<'idx, const N: usize> {
-    /// `indexes_in` holds a slice of a SIMD-size-wide array of `dem_ids`/"indexes"
-    indexes_in: &'idx [[i32; N]],
-    /// `indexes_out` is the buffer of a SIMD-size-wide where the visible `dem_ids`/"indexes" will be written out to
-    indexes_out: &'idx mut [[i32; N]],
-}
-
-#[inline]
-/// `line_of_sight` calculates a single line of sight for a given pov, which is passed in via `pov_height`
-fn line_of_sight<const N: usize, const UNROLL: usize, VS>(
-    vs: &VS,
-    elevations: &[[i16; N]],
-    distances: &[Simd<f32, N>],
-    adjustments: &[Simd<f32, N>],
-    prefix_in: Simd<f32, N>,
-    indexes: Option<IndexSIMD<N>>,
-    pov_height: f32,
-) -> ([Simd<f32, N>; UNROLL], [Simd<f32, N>; UNROLL], Simd<f32, N>)
-where
-    LaneCount<N>: SupportedLaneCount,
-    VS: Viewshed<N>,
-{
-    let mut sum_buf: [Simd<f32, N>; UNROLL] = [Simd::splat(0.0); UNROLL];
-    let mut angle_buf: [Simd<f32, N>; UNROLL] = [Simd::splat(0.0); UNROLL];
-    let mut longest_line_buf: [Simd<f32, N>; UNROLL] = [Simd::splat(0.0); UNROLL];
-    let mut prefix_buf: [Simd<f32, N>; UNROLL] = [Simd::splat(0.0); UNROLL];
-
-    izip!(
-        angle_buf.iter_mut(),
-        elevations
-            .iter()
-            .map(|elev| load_elevations(*elev, pov_height)),
-        distances,
-        adjustments
-    )
-    .for_each(|(angle, elev, dist, adjust)| {
-        *angle = elev / dist - adjust;
-    });
-
-    #[expect(
-        clippy::float_cmp,
-        reason = "-2000.0f32 is a sentinel value for the first time this accumlative function is run"
-    )]
-    if prefix_in[0] == -2000.0f32 {
-        angle_buf[0][0] = -2000.1f32;
-    }
-
-    let prefix_out = vs.prefix_max(&angle_buf, &mut prefix_buf, prefix_in);
-
-    let index_iter = indexes.map(|index_simd| {
-        index_simd
-            .indexes_in
-            .iter()
-            .map(|ind| Simd::<i32, N>::from_array(*ind))
-            .zip(index_simd.indexes_out.iter_mut())
-    });
-
-    izip!(
-        &mut sum_buf,
-        &mut longest_line_buf,
-        angle_buf.iter(),
-        prefix_buf.iter(),
-        distances.iter(),
-        OptionIter::new(index_iter)
-    )
-    .for_each(|(next_sum, longest_line, &angle, &pref, &dists, inds)| {
-        let mask = vs.gte(angle, pref);
-
-        if let Some((inds_in, inds_out)) = inds {
-            inds_in.store_select(inds_out, mask);
-        }
-
-        let selected_distances = mask.select(dists, Simd::splat(0.0));
-        *longest_line = vs.max(*longest_line, selected_distances);
-
-        let selected_tans = mask.select(Simd::splat(TAN_ONE_RAD), Simd::splat(0.0));
-        *next_sum = selected_distances * selected_tans;
-    });
-
-    (sum_buf, longest_line_buf, prefix_out)
-}
-
-/// `dem_to_pov`
-#[expect(
-    clippy::as_conversions,
-    clippy::cast_possible_truncation,
-    clippy::cast_possible_wrap,
-    reason = "so long as max_los < 2^24, the following as conversions are entirely safe"
-)]
-#[expect(
-    clippy::integer_division,
-    reason = "i32 is constructed from (i32, i32) converting back should succeed"
-)]
-const fn dem_to_pov(dem_id: i32, width: usize, max_los: usize) -> i32 {
-    let dem_x = (dem_id / width as i32) - max_los as i32;
-    let dem_y = (dem_id % width as i32) - max_los as i32;
-
-    dem_x * (max_los as i32) + dem_y
-}
-
-/// `Indexes` holds the `dem_id`/"indexes" and the output buffer to store them in
-struct Indexes<'index> {
-    /// `indexes_in` holds the full line of (2 or 3*`max_los`) `dem_ids`/"indexes"
-    indexes_in: &'index [i32],
-    /// `indexes_out` holds `max_los` indexes of `max_los` length. Zeroes are used if
-    indexes_out: &'index mut [i32],
-}
-
-/// `OptionIter` holds the state for an optional inner iterator.
-/// If passed `None`, it will repeat `None` forever. This comes in handy
-/// when working with `izip!`
-struct OptionIter<T, Iter>
-where
-    Iter: Iterator<Item = T>,
-{
-    /// `iter` holds an optional iterator state which will
-    /// call `next()`
-    iter: Option<Iter>,
-}
-
-impl<T, Iter> OptionIter<T, Iter>
-where
-    Iter: Iterator<Item = T>,
-{
-    /// `new` creates a new iter from an Option of the Iter
-    const fn new(iter: Option<Iter>) -> Self {
-        Self { iter }
-    }
-}
-
-impl<T, Iter> Iterator for OptionIter<T, Iter>
-where
-    Iter: Iterator<Item = T>,
-{
-    type Item = Option<T>;
-
-    #[inline]
-    fn next(&mut self) -> Option<Self::Item> {
-        if let Some(ref mut iter) = &mut self.iter {
-            iter.next().map(Some)
-        } else {
-            Some(None)
-        }
-    }
-}
-
-/// `UnrolledAngles` holds the curved earth adjustments
-struct UnrolledAngles<'angle, const WIDTH: usize, const UNROLL: usize>
-where
-    LaneCount<WIDTH>: SupportedLaneCount,
-{
-    /// `adjustments` holds a slice of UNROLL sized slices used during loop unrolling, and then the "rest" portion
-    adjustments: (
-        &'angle [[Simd<f32, WIDTH>; UNROLL]],
-        &'angle [Simd<f32, WIDTH>],
-    ),
-    /// `distances` holds a slice of UNROLL sized slices used during loop unrolling, and then the "rest" portion
-    distances: (
-        &'angle [[Simd<f32, WIDTH>; UNROLL]],
-        &'angle [Simd<f32, WIDTH>],
-    ),
-}
-
-/// `viewshed` computes the viewshed for a single pov, using its `elevation`, and `max_los`
-/// and stores the results in `heatmap` and `longest_line` using the `dem_id`
-#[inline]
-#[expect(
-    clippy::too_many_arguments,
-    clippy::too_many_lines,
-    reason = "it is what it is for now"
-)]
-fn viewshed<const WIDTH: usize, const UNROLL: usize, VS>(
-    vs: &VS,
-    pov_idx: usize,
-    elevation: i16,
-    dem_id: i32,
-    max_los: usize,
-    heatmap: &mut [f32],
-    longest_line: &mut [f32],
-    line: &[i16],
-    index_data: Option<Indexes>,
-    unrolled_angles: &UnrolledAngles<WIDTH, UNROLL>,
-) where
-    LaneCount<WIDTH>: SupportedLaneCount,
-    VS: Viewshed<WIDTH>,
-{
-    let result_tvs_id = dem_to_pov(dem_id, 3 * max_los, max_los);
-
-    // if the line of sight is not within our computable points, do not consider it
-    #[expect(
-        clippy::as_conversions,
-        clippy::cast_possible_wrap,
-        clippy::cast_possible_truncation,
-        reason = "max_los^2 < 2^31"
-    )]
-    if result_tvs_id < 0i32 || result_tvs_id >= (max_los * max_los) as i32 {
-        return;
-    }
-
-    let pov_height = f32::from(elevation);
-
-    // safety: max_los % WIDTH == 0, so [pov_idx..pov_idx+max_los) will also be WIDTH wide
-    let (elevations, _): (&[[i16; WIDTH]], _) =
-        unsafe { line.get_unchecked(pov_idx..pov_idx + max_los) }.as_chunks::<WIDTH>();
-
-    let (iter, rest) = index_data.map_or_else(
-        || (None, None),
-        |data| {
-            // safety: pov_idx should be between [0, max_los) and len(indexes_in)==2*max_los
-            // thus, for any pov_idx pov_idx..pov_idx+max_los is inbounds
-            let (indexes, _): (&[[i32; WIDTH]], _) =
-                unsafe { data.indexes_in.get_unchecked(pov_idx..pov_idx + max_los) }
-                    .as_chunks::<WIDTH>();
-
-            let (indexes_out, _) = data.indexes_out.as_chunks_mut::<WIDTH>();
-
-            let (chunked_indexes, rest_indexes) = indexes.as_chunks::<UNROLL>();
-            let (chunked_indexes_out, rest_indexes_out) = indexes_out.as_chunks_mut::<UNROLL>();
-
-            (
-                Some(
-                    zip(chunked_indexes.iter(), chunked_indexes_out.iter_mut()).map(
-                        |(inds_in, inds_out)| IndexSIMD {
-                            indexes_in: inds_in,
-                            indexes_out: inds_out,
-                        },
-                    ),
-                ),
-                Some(IndexSIMD {
-                    indexes_in: rest_indexes,
-                    indexes_out: rest_indexes_out,
-                }),
-            )
-        },
-    );
-
-    let (chunked_elevs, rest_elevs) = elevations.as_chunks::<UNROLL>();
-
-    let (chunked_distances, rest_distances) = unrolled_angles.distances;
-    let (chunked_adjustments, rest_adjustments) = unrolled_angles.adjustments;
-
-    let (local_sums, local_longest, prefix) = izip!(
-        chunked_elevs,
-        chunked_distances,
-        chunked_adjustments,
-        OptionIter::new(iter),
-    )
-    .fold(
-        (
-            [Simd::splat(0.0); UNROLL],
-            [Simd::splat(0.0); UNROLL],
-            Simd::splat(-2000.0),
-        ),
-        |(sum, longest, prefix), (elevs, dists, adjusts, inds)| {
-            let (next_sum, next_longest, acc) = line_of_sight::<WIDTH, UNROLL, VS>(
-                vs,      // elevs: &[[i16; N]],
-                elevs,   // distances: &[Simd<f32, N>],
-                dists,   // adjustments: &[Simd<f32, N>],
-                adjusts, // prefix_in: Simd<f32, N>,
-                prefix, inds, pov_height, // pov_height: f32,
-            );
-
-            let mut copied_sum = sum;
-            zip(copied_sum.iter_mut(), next_sum).for_each(|(old, new)| {
-                *old += new;
-            });
-
-            let mut copied_longest_line = longest;
-            zip(copied_longest_line.iter_mut(), next_longest).for_each(|(old, new)| {
-                *old = old.simd_max(new);
-            });
-
-            (copied_sum, copied_longest_line, acc)
-        },
-    );
-
-    let mut sum = local_sums
-        .iter()
-        .fold(0.0f32, |acc, partial| acc + partial.reduce_sum());
-
-    let mut longest = local_longest
-        .iter()
-        .fold(0.0f32, |acc, new| acc.max(new.reduce_max()));
-
-    let (sum_buf, longest_buf, _) = line_of_sight::<WIDTH, UNROLL, VS>(
-        vs,
-        rest_elevs,
-        rest_distances,
-        rest_adjustments,
-        prefix,
-        rest,
-        pov_height,
-    );
-
-    sum += sum_buf
-        .iter()
-        .fold(0.0f32, |acc, partial| acc + partial.reduce_sum());
-
-    longest = longest_buf
-        .iter()
-        .fold(longest, |acc, new| acc.max(new.reduce_max()));
-
-    #[expect(
-        clippy::as_conversions,
-        clippy::cast_sign_loss,
-        reason = "result_idx should be in [0, 2^31]"
-    )]
-    // safety: it is guaranteed by the rotation kernel that if the index is
-    // greater than zero that it is in-bounds. This saves ~10% of bounds checks
-    unsafe {
-        *heatmap.get_unchecked_mut(result_tvs_id as usize) += sum;
-
-        let old_longest: *mut f32 = longest_line.get_unchecked_mut(result_tvs_id as usize);
-        *old_longest = (*old_longest).max(longest);
-    }
-}
-
-/// `precalculate_distances` precalculates earth curvature adjustments and
-/// the distance from a particular point (which is just linear)
-fn precalculate_distances<const WIDTH: usize>(
-    max_los: usize,
-) -> (Vec<Simd<f32, WIDTH>>, Vec<Simd<f32, WIDTH>>)
-where
-    LaneCount<WIDTH>: SupportedLaneCount,
-{
-    (0..max_los)
-        .step_by(WIDTH)
-        .map(|offset| {
-            #[expect(
-                clippy::as_conversions,
-                clippy::cast_possible_wrap,
-                clippy::cast_possible_truncation,
-                reason = "WIDTH < 2^31"
-            )]
-            let distance_arr: [i32; WIDTH] = array::from_fn(|i| i as i32);
-            let distances = Simd::from_array(distance_arr);
-
-            #[expect(
-                clippy::as_conversions,
-                clippy::cast_possible_wrap,
-                clippy::cast_possible_truncation,
-                reason = "WIDTH < 2^31"
-            )]
-            let normalized = (distances + Simd::splat(offset as i32)) * Simd::splat(100i32);
-
-            let floats: Simd<f32, WIDTH> = normalized.cast();
-
-            (floats, floats / Simd::splat(EARTH_RADIUS_SQUARED))
-        })
-        .unzip()
-}
-
-/// `total_viewshed` computes a total viewshed heatmap for a given elevation map,
-/// and corresponding indexes to store the rotated data
-#[inline]
-fn total_viewshed<const WIDTH: usize, const UNROLL: usize, V: Viewshed<WIDTH>>(
-    vs: &V,
-    elevation_map: &[i16],
-    indexes: &[i32],
-    max_los: usize,
-    output_sector_data: bool,
-) -> ViewshedAngle
-where
-    LaneCount<WIDTH>: SupportedLaneCount,
-{
-    assert_eq!(
-        elevation_map.len(),
-        2 * max_los * max_los,
-        "elevations should be 2 * max_los wide, and max_los tall"
-    );
-
-    assert_eq!(
-        indexes.len(),
-        2 * max_los * max_los,
-        "indexes should be 2 * max_los wide, and max_los tall"
-    );
-
-    assert_eq!(
-        max_los % WIDTH,
-        0,
-        "to help the vectorizer, max_los must be a multiple of {WIDTH}"
-    );
-
-    let mut sector_data_buf = vec![
-        0i32;
-        if output_sector_data {
-            max_los * max_los * max_los
-        } else {
-            0
-        }
-    ];
-    let mut heatmap = vec![0.0f32; max_los * max_los];
-    let mut longest_line = vec![0.0f32; max_los * max_los];
-    let mut sector_data: Option<&mut Vec<i32>> = output_sector_data.then_some(&mut sector_data_buf);
-
-    let width = 2 * max_los;
-
-    // precalculate all distances and their spherical earth "adjustments".
-    // This saves ~33% of effort inside our hot loop
-    let (distances, adjustments) = precalculate_distances::<WIDTH>(max_los);
-
-    let unrolled_angle = UnrolledAngles {
-        distances: distances.as_chunks::<UNROLL>(),
-        adjustments: adjustments.as_chunks::<UNROLL>(),
-    };
-
-    for (line, line_indexes, sector_chunk) in izip!(
-        elevation_map.chunks_exact(width),
-        indexes.chunks_exact(width),
-        OptionIter::new(
-            sector_data
-                .as_mut()
-                .map(|sd| sd.chunks_exact_mut(max_los * max_los))
-        ),
-    ) {
-        for (pov, (&pov_height, &result_dem_id, line_bitmap)) in izip!(
-            line.iter().take(max_los),
-            line_indexes.iter().take(max_los),
-            OptionIter::new(sector_chunk.map(|chunk| chunk.chunks_exact_mut(max_los)))
-        )
-        .enumerate()
-        {
-            viewshed(
-                vs,
-                pov,
-                pov_height,
-                result_dem_id,
-                max_los,
-                &mut heatmap,
-                &mut longest_line,
-                line,
-                line_bitmap.map(|bitmap| Indexes {
-                    indexes_in: line_indexes,
-                    indexes_out: bitmap,
-                }),
-                &unrolled_angle,
-            );
-        }
-    }
-
-    ViewshedAngle {
-        heatmap,
-        longest_line,
-        sector_data: sector_data.cloned(),
-    }
-}
-
-/// `generate_rotation` generates a rotation "map" for a given elevation list
-/// Adapted from [this stack overflow answer](https://stackoverflow.com/a/71901621)
-#[expect(
-    clippy::as_conversions,
-    clippy::cast_possible_truncation,
-    clippy::cast_possible_wrap,
-    clippy::cast_precision_loss,
-    reason = "so long as max_los^2 < 2^24, the following `as` conversions are entirely safe"
-)]
-fn generate_rotation(elevs: &[i16], angle: f64, max_los: usize) -> (Vec<i32>, Vec<i16>) {
-    let width = (max_los * 3) as isize;
-
-    #[expect(clippy::integer_division, reason = "we don't need precision here")]
-    {
-        assert_eq!(
-            elevs.len() as isize % width,
-            0,
-            "Elevations array must be square {}%{width} != 0",
-            elevs.len(),
-        );
-        let elevations_div_width = elevs.len() as isize / width;
-        assert_eq!(
-            elevations_div_width,
-            width,
-            "Elevations array must be square {}/{width} (={elevations_div_width}) != {width}",
-            elevs.len() as isize
-        );
-    };
-
-    let (sin, cos) = (f64::sin(angle.to_radians()), f64::cos(angle.to_radians()));
-
-    #[expect(clippy::integer_division, reason = "we don't need precision here")]
-    let (x_center, y_center) = (width / 2, width / 2);
-
-    let mut rotation: Vec<i32> = Vec::with_capacity(2 * max_los * max_los);
-
-    for x in (max_los as isize)..(max_los as isize) * 2 {
-        let x_sin = (x - x_center) as f64 * sin;
-        let x_cos = (x - x_center) as f64 * cos;
-        for y in (max_los as isize)..width {
-            let y_sin = (y - y_center) as f64 * sin;
-            let y_cos = (y - y_center) as f64 * cos;
-
-            let x_rot = (x_cos - y_sin).round() as isize + y_center;
-            let y_rot = (y_cos + x_sin).round() as isize + x_center;
-
-            let new_idx = x_rot.clamp(0, width - 1) * width + y_rot.clamp(0, width - 1);
-
-            rotation.push(new_idx as i32);
-        }
-    }
-
-    debug_assert_eq!(
-        rotation.len() as isize,
-        max_los as isize * (2 * max_los as isize),
-        "the rotation should be 2 * max_los wide, max_los tall"
-    );
-
-    // map the indexes to their elevations
-    let elevations = rotation
-        .iter()
-        .map(|&idx| {
-            if idx < 0i32 {
-                i16::MIN
-            } else {
-                #[expect(
-                    clippy::as_conversions,
-                    reason = "elevations start out as i16s, and i16 -> f32 -> i16 is lossless"
-                )]
-                #[expect(clippy::cast_sign_loss, reason = "idx < 2^31, idx >= 0")]
-                // safety: idx is clamped so a get will always be in-bounds
-                *unsafe { elevs.get_unchecked(idx as usize) }
-            }
-        })
-        .collect::<Vec<i16>>();
-
-    (rotation, elevations)
-}
-
-/// `ViewshedAngle` holds the cumulative result of a single angle in the `total_viewshed` algorithm
-#[derive(Debug)]
-struct ViewshedAngle {
-    /// `heatmap` contains the longest line of sight heatmap for rendering
-    heatmap: Vec<f32>,
-    /// `longest_line` contains the longest distance for a particular point
-    longest_line: Vec<f32>,
-    /// `sector_data` holds the visibility calculations for each point in row-major order.
-    /// elements `[0..max_los)` are for the first point, `[max_los, 2*max_los)` for the second
-    /// point, and so on.
-    ///
-    /// This gets absolutely massive, so we only allocate this if we know for certain we will be using it
-    sector_data: Option<Vec<i32>>,
-}
-
-impl ViewshedAngle {
-    /// `new` creates a new buffer for viewshed results of `heatmap`,
-    /// `longest_line`, and `sector_data`
-    fn new(max_los: usize, sector_data: bool) -> Self {
-        Self {
-            heatmap: vec![0.0f32; max_los * max_los],
-            longest_line: vec![0.0f32; max_los * max_los],
-            sector_data: sector_data.then(|| Vec::with_capacity(max_los * max_los * max_los)),
-        }
-    }
-
-    /// `acc` accumulates a single `ViewshedAngle` into another
-    fn acc(&mut self, other: &Self) {
-        zip(&mut self.heatmap, &other.heatmap).for_each(|(to, from)| {
-            *to += *from;
-        });
-
-        zip(&mut self.longest_line, &other.longest_line).for_each(|(to, from)| {
-            *to = (*to).max(*from);
-        });
-
-        if let Some(ref mut sector_data) = &mut self.sector_data {
-            if let Some(other_sector_data) = &other.sector_data {
-                sector_data.extend_from_slice(other_sector_data);
-            }
-        }
-    }
-}
-
-/// `kernel` is a CPU-based total viewshed kernel. It makes use of image rotation tof
-/// optimize the cache locality of all lookups for a total viewshed calculation
-fn kernel(elevations: &[i16], max_los_points: usize, angle: usize) -> ViewshedAngle {
-    assert!(angle < 360, "angle must be [0, 360)");
-    let mut start = Instant::now();
-
-    #[expect(
-        clippy::as_conversions,
-        clippy::cast_precision_loss,
-        reason = "angle is [0,360), not more than 2^54"
-    )]
-    let (indexes, rotated_elevations) = generate_rotation(elevations, angle as f64, max_los_points);
-
-    tracing::info!(
-        "rotated {:?} in {:?}, calculating viewshed",
-        angle,
-        start.elapsed()
-    );
-
-    start = Instant::now();
-
-    let vectorized = Vectorized {};
-
-    #[cfg(target_feature = "avx512f")]
-    {
-        let result = total_viewshed::<16, 8, Vectorized>(
-            &vectorized,
-            &rotated_elevations,
-            &indexes,
-            max_los_points,
-            false,
-        );
-        tracing::info!("kernel for {} run in: {:?}", angle, start.elapsed());
-        return result;
-    };
-
-    #[cfg(all(target_feature = "avx2", target_feature = "avx"))]
-    {
-        let result = total_viewshed::<8, 8, Vectorized>(
-            &vectorized,
-            &rotated_elevations,
-            &indexes,
-            max_los_points,
-            false,
-        );
-        tracing::info!("kernel for {} run in: {:?}", angle, start.elapsed());
-        return result;
-    };
-
-    #[expect(
-        unreachable_code,
-        unused_variables,
-        reason = "conditionally compiled out"
-    )]
-    let result = total_viewshed::<4, 8, Vectorized>(
-        &vectorized,
-        &rotated_elevations,
-        &indexes,
-        max_los_points,
-        false,
-    );
-    tracing::info!("kernel for {} run in: {:?}", angle, start.elapsed());
-    result
-}
-
-/// `multithreaded_kernel` parallelizes CPU kernel calculations for a `core_count` and calculates
-/// `num_angles` different angles
-pub fn multithreaded_kernel(
-    elevations_original: &[i16],
-    max_los_points_original: usize,
-    num_angles: usize,
-    core_count: usize,
-    output_sector_data: bool,
-) -> (Vec<f32>, Vec<f32>, Option<Vec<i32>>) {
-    let max_los_points = max_los_points_original.div_ceil(4) * 4;
-    let dem_width = max_los_points * 3;
-    let mut elevations_vec = elevations_original.to_vec();
-    elevations_vec.resize(dem_width.pow(2), 0);
-    let elevations = &elevations_vec;
-
-    if max_los_points != max_los_points_original {
-        tracing::warn!("LoS: {max_los_points_original} to {max_los_points}");
-    }
-    if elevations.len() != elevations_original.len() {
-        tracing::warn!(
-            "Elevations array length resized: {} to {}",
-            elevations_original.len(),
-            elevations_vec.len()
-        );
-    }
-
-    #[expect(
-        clippy::expect_used,
-        reason = "threadpool must be created for program to run"
-    )]
-    let pool = ThreadPoolBuilder::new()
-        .num_threads(core_count)
-        .build()
-        .expect("couldn't build threadpool");
-
-    let mut final_angle = ViewshedAngle::new(max_los_points, output_sector_data);
-    let angle_mu = &Mutex::new(&mut final_angle);
-
-    pool.install(move || {
-        (0..num_angles)
-            .into_par_iter()
-            .map(|angle| kernel(elevations, max_los_points, angle))
-            .for_each(|vs| {
-                #[expect(clippy::expect_used, reason = "a poisoned mutex should crash")]
-                let mut angle_guard = angle_mu.lock().expect("mutex poisoned");
-
-                angle_guard.acc(&vs);
-            });
-    });
-
-    let (heatmap, longest_line, sector) = (
-        mem::take(&mut final_angle.heatmap),
-        mem::take(&mut final_angle.longest_line),
-        final_angle
-            .sector_data
-            .map(|mut sector_data| mem::take(&mut sector_data)),
-    );
-
-    (heatmap, longest_line, sector)
-}
diff --git a/crates/total-viewsheds/src/cpu/kernel.rs b/crates/total-viewsheds/src/cpu/kernel.rs
new file mode 100644
index 0000000..97efe20
--- /dev/null
+++ b/crates/total-viewsheds/src/cpu/kernel.rs
@@ -0,0 +1,221 @@
+use crate::cpu::los::{LineOfSight as _, UnrolledLOS};
+use crate::cpu::vector::{VectorLos, DEFAULT_VECTOR_LENGTH};
+use itertools::izip;
+
+/// `fill_in_elevations` will fill in "blank" elevations from NASA data with the last seen elevation
+/// in the line of sight
+fn fill_in_elevations(elevs: &[i16], max_los: usize) -> Vec<i16> {
+    elevs
+        .chunks_exact(2 * max_los)
+        .flat_map(|line| {
+            line.iter()
+                .scan(0, |last_seen, &elevation| match elevation {
+                    i16::MIN => Some(*last_seen),
+                    _ => {
+                        *last_seen = elevation;
+                        Some(elevation)
+                    }
+                })
+        })
+        .collect::<Vec<i16>>()
+}
+
+/// `generate_rotation` generates a rotation "map" for a given elevation list
+/// Adapted from [this stack overflow answer](https://stackoverflow.com/a/71901621)
+#[expect(
+    clippy::as_conversions,
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_precision_loss,
+    reason = "so long as max_los^2 < 2^24, the following `as` conversions are entirely safe"
+)]
+fn generate_rotation(elevs: &[i16], angle: f64, max_los: usize) -> (Vec<i32>, Vec<i16>) {
+    let width = (max_los * 3) as isize;
+
+    #[expect(clippy::integer_division, reason = "we don't need precision here")]
+    {
+        assert_eq!(
+            elevs.len() as isize % width,
+            0,
+            "Elevations array must be square {}%{width} != 0",
+            elevs.len(),
+        );
+        let elevations_div_width = elevs.len() as isize / width;
+        assert_eq!(
+            elevations_div_width,
+            width,
+            "Elevations array must be square {}/{width} (={elevations_div_width}) != {width}",
+            elevs.len() as isize
+        );
+    };
+
+    let (sin, cos) = (f64::sin(angle.to_radians()), f64::cos(angle.to_radians()));
+
+    #[expect(clippy::integer_division, reason = "we don't need precision here")]
+    let (x_center, y_center) = (width / 2, width / 2);
+
+    let mut rotation: Vec<i32> = Vec::with_capacity(2 * max_los * max_los);
+
+    for x in (max_los as isize)..(max_los as isize) * 2 {
+        let x_sin = (x - x_center) as f64 * sin;
+        let x_cos = (x - x_center) as f64 * cos;
+        for y in (max_los as isize)..width {
+            let y_sin = (y - y_center) as f64 * sin;
+            let y_cos = (y - y_center) as f64 * cos;
+
+            let x_rot = (x_cos - y_sin).round() as isize + y_center;
+            let y_rot = (y_cos + x_sin).round() as isize + x_center;
+
+            let new_idx = x_rot.clamp(0, width - 1) * width + y_rot.clamp(0, width - 1);
+
+            rotation.push(new_idx as i32);
+        }
+    }
+
+    debug_assert_eq!(
+        rotation.len() as isize,
+        max_los as isize * (2 * max_los as isize),
+        "the rotation should be 2 * max_los wide, max_los tall"
+    );
+
+    // map the indexes to their elevations
+    let elevations = rotation
+        .iter()
+        .map(|&idx| {
+            if idx < 0i32 {
+                i16::MIN
+            } else {
+                #[expect(
+                    clippy::as_conversions,
+                    reason = "elevations start out as i16s, and i16 -> f32 -> i16 is lossless"
+                )]
+                #[expect(clippy::cast_sign_loss, reason = "idx < 2^31, idx >= 0")]
+                // safety: idx is clamped so a get will always be in-bounds
+                *unsafe { elevs.get_unchecked(idx as usize) }
+            }
+        })
+        .collect::<Vec<i16>>();
+
+    (rotation, fill_in_elevations(&elevations, max_los))
+}
+
+#[expect(
+    clippy::as_conversions,
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    reason = "so long as max_los < 2^24, the following as conversions are entirely safe"
+)]
+#[expect(
+    clippy::integer_division,
+    reason = "i32 is constructed from (i32, i32) converting back should succeed"
+)]
+/// `dem_to_pov` turns the `dem_id` to the `pov_id` so that the result can be stored in a heatmap
+const fn dem_to_pov(dem_id: i32, width: usize, max_los: usize) -> i32 {
+    let dem_x = (dem_id / width as i32) - max_los as i32;
+    let dem_y = (dem_id % width as i32) - max_los as i32;
+
+    let radius = max_los as i32 / 2i32;
+    let circ_x = dem_x - radius;
+    let circ_y = dem_y - radius;
+
+    let dist = (circ_x.pow(2) + circ_y.pow(2)).isqrt();
+    if dist < radius {
+        dem_x * (max_los as i32) + dem_y
+    } else {
+        -1
+    }
+}
+
+/// `kernel` will calculate the longest line of sight heatmap for a given angle and elevation map
+/// assuming that the maximum line of sight is `max_los`
+#[expect(
+    clippy::inline_always,
+    reason = "I am become Death, destroyer of compilers"
+)] // the real reason is that I need output_sector_data to be constant propagated
+#[inline(always)]
+pub fn kernel(
+    elevation_map: &[i16],
+    max_los: usize,
+    angle: f32,
+    output_sector_data: bool,
+) -> (Vec<f32>, Vec<f32>, Vec<Vec<bool>>) {
+    let mut heatmap = vec![0.0f32; max_los * max_los];
+    let mut longest = vec![0.0f32; max_los * max_los];
+
+    let mut sector_data: Vec<Vec<bool>> = vec![
+        vec![];
+        if output_sector_data {
+            max_los * max_los
+        } else {
+            0
+        }
+    ];
+
+    let (indexes, rotated_elevations) = generate_rotation(elevation_map, f64::from(angle), max_los);
+
+    assert_eq!(
+        rotated_elevations.len(),
+        2 * max_los * max_los,
+        "elevations should be 2 * max_los wide, and max_los tall"
+    );
+
+    let width = 2 * max_los;
+
+    let mut vs = UnrolledLOS::<64>::new(max_los);
+    for (line, line_indexes) in izip!(
+        rotated_elevations.chunks_exact(width),
+        indexes.chunks_exact(width),
+    ) {
+        for (pov, (&pov_height, &result_dem_id)) in
+            izip!(line.iter().take(max_los), line_indexes.iter().take(max_los)).enumerate()
+        {
+            let result_tvs_id = dem_to_pov(result_dem_id, 3 * max_los, max_los);
+
+            // if the line of sight is not within our computable points, do not consider it
+            #[expect(
+                clippy::as_conversions,
+                clippy::cast_possible_wrap,
+                clippy::cast_possible_truncation,
+                reason = "max_los^2 < 2^31"
+            )]
+            if result_tvs_id < 0i32 || result_tvs_id >= (max_los * max_los) as i32 {
+                continue;
+            }
+
+            let neighbor = pov + 1;
+
+            #[expect(
+                clippy::indexing_slicing,
+                reason = "if slicing is out of bounds, it should panic"
+            )]
+            let (pixel, long, sector) = vs.line_of_sight::<VectorLos<{ DEFAULT_VECTOR_LENGTH }>>(
+                f32::from(pov_height),
+                &line[neighbor..neighbor + max_los],
+                output_sector_data,
+            );
+
+            #[expect(
+                clippy::as_conversions,
+                clippy::cast_sign_loss,
+                clippy::indexing_slicing,
+                reason = "max_los^2 < 2^31"
+            )]
+            {
+                // safety: result_tvs_id is guaranteed to be within [0..max_los^2)
+                unsafe {
+                    *heatmap.get_unchecked_mut(result_tvs_id as usize) = pixel;
+                };
+                // safety: result_tvs_id is guaranteed to be within [0..max_los^2)
+                unsafe {
+                    *longest.get_unchecked_mut(result_tvs_id as usize) = long;
+                };
+
+                if output_sector_data {
+                    sector_data[result_tvs_id as usize] = sector;
+                }
+            }
+        }
+    }
+
+    (heatmap, longest, sector_data)
+}
diff --git a/crates/total-viewsheds/src/cpu/los.rs b/crates/total-viewsheds/src/cpu/los.rs
new file mode 100644
index 0000000..b882748
--- /dev/null
+++ b/crates/total-viewsheds/src/cpu/los.rs
@@ -0,0 +1,282 @@
+use itertools::izip;
+
+/// `LineOfSight` abstracts the implementation of line of sight calculations to
+/// any "carry through" that can be materialized into a (f32, f32).
+pub trait LineOfSight<Output: Into<(f32, f32)>> {
+    /// `line_of_sight` calculates a line of sight for the given `pov_height`
+    /// and outputs a triple of the surface area, longest line of sight in meters
+    /// and a vector of bools of which
+    fn line_of_sight<LOS>(
+        &mut self,
+        pov_height: f32,
+        line: &[i16],
+        output_sector: bool,
+    ) -> (f32, f32, Vec<bool>)
+    where
+        LOS: Angle + PrefixMax + Accumulate<Output>;
+}
+
+/// `Angle` abstracts the angle calculation between a pov and all the elevation data within
+/// its band of sight
+pub trait Angle {
+    /// `calculate_angles` calculates the angle from the `pov_height` to a given elevation
+    fn calculate_angles(
+        pov_height: f32,
+        elevations: &[i16],
+        distances: &[f32],
+        adjustments: &[f32],
+        angles_out: &mut [f32],
+    );
+}
+
+/// `Accumulate` accumulates the surface area visible and longest line of sight
+/// in a pair of (f32, f32). `Accumulate` doesn't care about the implementation details
+/// of accumulation so long as the `Output` type can be materialized to (f32, f32)
+pub trait Accumulate<Output: Into<(f32, f32)>> {
+    /// `accumulate` accumulates the surface area using the distances by comparing
+    /// whether a point at a distance is visible (angle > prefix)
+    /// If `output_sector` is true, it should output a bitmap of which distances are visible
+    /// at their respective locations
+    fn accumulate(
+        init: Output,
+        angles: &[f32],
+        prefix: &[f32],
+        distances: &[f32],
+        bitmap: &mut Vec<bool>,
+        output_sector: bool,
+    ) -> Output;
+}
+
+/// `PrefixMax` calculates the prefix maximum of the given angles
+pub trait PrefixMax {
+    /// `prefix_max` calculates the prefix max of the
+    fn prefix_max(highest: f32, angles_in: &[f32], angles_out: &mut [f32]);
+}
+
+/// `EARTH_RADIUS_SQUARED` is the radius of the earth in meters
+const EARTH_RADIUS_SQUARED: f32 = 12_742_000.0;
+
+/// `generate_distances` generates the distance from
+#[expect(
+    clippy::as_conversions,
+    clippy::cast_precision_loss,
+    reason = "max_los is < 2^24"
+)]
+fn generate_distances(max_los: usize) -> (Vec<f32>, Vec<f32>) {
+    (1..=max_los)
+        .map(|step| {
+            (
+                (step * 100) as f32,
+                (step * 100) as f32 / EARTH_RADIUS_SQUARED,
+            )
+        })
+        .unzip()
+}
+
+/// `StraightLine` contains the distances and round-earth adjustments needed to compute
+/// the longest line of sight.
+pub struct StraightLine {
+    /// `max_los` is the longest possible line of sight in 100ms
+    max_los: usize,
+    /// `angles` is a buffer of size `max_los` to store all angle calculations in
+    angles: Vec<f32>,
+    /// `prefix_max` is a buffer of size `max_los` to store the inclusive prefix max calculation in
+    prefix_max: Vec<f32>,
+    /// `distances` holds the distances in meters for every step between 100m and
+    distances: Vec<f32>,
+    /// `adjustments` holds earth curvature adjustments in meters for every step between 100m and
+    adjustments: Vec<f32>,
+}
+
+impl StraightLine {
+    /// new constructs a new `StraightLine` given the maximum line of sight in `max_los`
+    #[expect(unused, reason = "this is generally only for testing/benchmarking")]
+    pub fn new(max_los: usize) -> Self {
+        let (distances, adjustments) = generate_distances(max_los);
+        Self {
+            max_los,
+            angles: vec![0.0f32; max_los + 1],
+            prefix_max: vec![0.0f32; max_los],
+            distances,
+            adjustments,
+        }
+    }
+}
+
+impl LineOfSight<(f32, f32)> for StraightLine {
+    #[expect(
+        clippy::indexing_slicing,
+        reason = "all indexing and slices are guaranteed by construction of a StraightLine"
+    )]
+    fn line_of_sight<LOS>(
+        &mut self,
+        pov_height: f32,
+        line: &[i16],
+        output_sector: bool,
+    ) -> (f32, f32, Vec<bool>)
+    where
+        LOS: PrefixMax + Angle + Accumulate<(f32, f32)>,
+    {
+        let mut output: Vec<bool> = vec![];
+
+        self.angles[0] = -2000.0;
+
+        LOS::calculate_angles(
+            pov_height,
+            line,
+            &self.distances,
+            &self.adjustments,
+            &mut self.angles[1..],
+        );
+
+        LOS::prefix_max(
+            -2000.0f32,
+            &self.angles[..self.max_los],
+            &mut self.prefix_max,
+        );
+
+        let (heatmap, longest) = LOS::accumulate(
+            (0.0, 0.0),
+            &self.angles[1..],
+            &self.prefix_max,
+            &self.distances,
+            &mut output,
+            output_sector,
+        );
+
+        (heatmap, longest, output)
+    }
+}
+
+/// Unroll holds an unrolled heatmap and unrolled longest line of sight calculation
+/// Since in Line of Sight-land max/addition are commutative, then Unroll will be materialized
+/// into (f32, f32)
+pub struct Unroll<const UNROLL: usize> {
+    /// `heatmap` contains the summation of visible surface areas which will be reduced to a single
+    /// surface area at the end
+    pub heatmap: [f32; UNROLL],
+    /// `longest` contains many long lines of sight which will be reduced to a single
+    /// line of sight at the end
+    pub longest: [f32; UNROLL],
+}
+
+/// `UnrolledLOS` implements an Unrolled `LineOfSight` calculation
+pub struct UnrolledLOS<const UNROLL: usize>
+where
+    [(); UNROLL + 1]:,
+{
+    /// `distances` holds `max_los` distances
+    distances: Vec<f32>,
+    /// `adjustments` holds `max_los` earth curvature adjustments
+    adjustments: Vec<f32>,
+}
+
+impl<const SIZE: usize> From<Unroll<SIZE>> for (f32, f32) {
+    default fn from(val: Unroll<SIZE>) -> Self {
+        let heatmap = val.heatmap.iter().sum();
+        let longest = val.longest.iter().fold(0.0f32, |acc, &elem| acc.max(elem));
+        (heatmap, longest)
+    }
+}
+
+impl<const UNROLL: usize> UnrolledLOS<UNROLL>
+where
+    [(); UNROLL + 1]:,
+{
+    /// `new` initializes a new `UnrolledLOS`, and precalculates all the distances
+    /// and earth curvature adjustments
+    pub fn new(max_los: usize) -> Self {
+        let (distances, adjustments) = generate_distances(max_los);
+
+        Self {
+            distances,
+            adjustments,
+        }
+    }
+}
+
+impl<const UNROLL: usize> LineOfSight<Unroll<UNROLL>> for UnrolledLOS<UNROLL>
+where
+    [(); UNROLL + 1]:,
+{
+    #[inline]
+    #[expect(
+        clippy::indexing_slicing,
+        reason = "all indexing and slices are guaranteed by construction of a UnrolledLOS"
+    )]
+    fn line_of_sight<LOS>(
+        &mut self,
+        pov_height: f32,
+        line: &[i16],
+        output_sector: bool,
+    ) -> (f32, f32, Vec<bool>)
+    where
+        LOS: Angle + PrefixMax + Accumulate<Unroll<UNROLL>>,
+    {
+        let mut angles = [0.0f32; UNROLL + 1];
+        let mut prefix_max = [0.0f32; UNROLL];
+
+        prefix_max[UNROLL - 1] = -2000.0;
+        angles[0] = -2000.0;
+
+        let mut output: Vec<bool> = vec![];
+
+        let (chunked_line, rest_line) = line.as_chunks::<{ UNROLL }>();
+
+        let (chunked_distances, rest_distances) = self.distances.as_chunks::<{ UNROLL }>();
+
+        let (chunked_adjustments, rest_adjustments) = self.adjustments.as_chunks::<{ UNROLL }>();
+
+        let los = izip!(chunked_line, chunked_distances, chunked_adjustments).fold(
+            Unroll::<UNROLL> {
+                longest: [0.0; UNROLL],
+                heatmap: [0.0; UNROLL],
+            },
+            |acc, (unroll_line, distances, adjusts)| {
+                LOS::calculate_angles(
+                    pov_height,
+                    unroll_line,
+                    distances,
+                    adjusts,
+                    &mut angles[1..],
+                );
+
+                LOS::prefix_max(prefix_max[UNROLL - 1], &angles[..UNROLL], &mut prefix_max);
+
+                let new_acc = LOS::accumulate(
+                    acc,
+                    &angles[1..],
+                    &prefix_max,
+                    distances,
+                    &mut output,
+                    output_sector,
+                );
+
+                angles[0] = angles[UNROLL];
+                new_acc
+            },
+        );
+
+        LOS::calculate_angles(
+            pov_height,
+            rest_line,
+            rest_distances,
+            rest_adjustments,
+            &mut angles[1..],
+        );
+
+        LOS::prefix_max(prefix_max[UNROLL - 1], &angles[..UNROLL], &mut prefix_max);
+
+        let new_acc = LOS::accumulate(
+            los,
+            &angles[1..],
+            &prefix_max,
+            rest_distances,
+            &mut output,
+            output_sector,
+        );
+
+        let (heatmap, longest) = new_acc.into();
+        (heatmap, longest, output)
+    }
+}
diff --git a/crates/total-viewsheds/src/cpu/mod.rs b/crates/total-viewsheds/src/cpu/mod.rs
new file mode 100644
index 0000000..e49a6f1
--- /dev/null
+++ b/crates/total-viewsheds/src/cpu/mod.rs
@@ -0,0 +1,9 @@
+/// los contains all the traits necessary for implementing a line of sight algorithm
+mod los;
+
+/// vector contains vectorized implementations of the line of sight traits
+mod vector;
+
+/// kernel is the exported kernel module
+mod kernel;
+pub use kernel::kernel;
diff --git a/crates/total-viewsheds/src/cpu/vector.rs b/crates/total-viewsheds/src/cpu/vector.rs
new file mode 100644
index 0000000..cd50b05
--- /dev/null
+++ b/crates/total-viewsheds/src/cpu/vector.rs
@@ -0,0 +1,557 @@
+use itertools::izip;
+use std::iter::zip;
+use std::simd::cmp::SimdPartialOrd as _;
+use std::simd::num::SimdFloat as _;
+use std::simd::prelude::SimdInt as _;
+use std::simd::{f32x4, f32x8, LaneCount, Mask, Simd, SupportedLaneCount};
+
+/// `TAN_ONE_RAD` is used in normalizing the
+const TAN_ONE_RAD: f32 = 0.017_453_3;
+
+use crate::cpu::los::{Accumulate, Angle, PrefixMax, Unroll};
+
+/// `VectorMax` performs an element-wise SIMD max of floats, allowing for architecture
+/// specific implementations
+trait VectorMax<const WIDTH: usize>
+where
+    LaneCount<WIDTH>: SupportedLaneCount,
+{
+    /// `max` computes an element-wise maximum from lhs and rhs, assuming neither contain NaNs
+    /// or -0.0
+    fn max(lhs: Simd<f32, WIDTH>, rhs: Simd<f32, WIDTH>) -> Simd<f32, WIDTH>;
+}
+
+/// `VectorGreater` performs a SIMD greater than of floats, allowing for architecture
+/// specific implementations
+trait VectorGreater<const WIDTH: usize>
+where
+    LaneCount<WIDTH>: SupportedLaneCount,
+{
+    /// gt computes an element-wise maximum from lhs and rhs, assuming neither contain NaNs
+    /// or -0.0
+    fn gt(lhs: Simd<f32, WIDTH>, rhs: Simd<f32, WIDTH>) -> Mask<i32, WIDTH>;
+}
+
+/// `VectorLos` is an implementation of the internals of `PrefixMax`, `Angle`,  and `Accumulate`
+/// for Portable SIMD
+pub struct VectorLos<const WIDTH: usize>
+where
+    LaneCount<WIDTH>: SupportedLaneCount;
+
+impl<const WIDTH: usize> VectorMax<WIDTH> for VectorLos<WIDTH>
+where
+    LaneCount<WIDTH>: SupportedLaneCount,
+{
+    #[inline]
+    default fn max(lhs: Simd<f32, WIDTH>, rhs: Simd<f32, WIDTH>) -> Simd<f32, WIDTH> {
+        lhs.simd_max(rhs)
+    }
+}
+
+impl<const WIDTH: usize> VectorGreater<WIDTH> for VectorLos<WIDTH>
+where
+    LaneCount<WIDTH>: SupportedLaneCount,
+{
+    #[inline]
+    default fn gt(lhs: Simd<f32, WIDTH>, rhs: Simd<f32, WIDTH>) -> Mask<i32, WIDTH> {
+        lhs.simd_gt(rhs)
+    }
+}
+
+#[cfg(target_feature = "sse")]
+impl VectorMax<4> for VectorLos<4> {
+    #[inline]
+    fn max(lhs: f32x4, rhs: f32x4) -> f32x4 {
+        use std::arch::x86_64::_mm_max_ps;
+
+        // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input
+        // thus allowing this to be non IEEE754 compliant
+        unsafe { _mm_max_ps(lhs.into(), rhs.into()) }.into()
+    }
+}
+
+#[cfg(target_feature = "avx")]
+impl VectorGreater<4> for VectorLos<4> {
+    fn gt(lhs: f32x4, rhs: f32x4) -> Mask<i32, 4> {
+        use std::arch::x86_64::{_mm_castps_si128, _mm_cmp_ps, _CMP_GT_OS};
+
+        // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input
+        // thus allowing this to be non IEEE754 compliant
+        unsafe {
+            let mask = _mm_castps_si128(_mm_cmp_ps::<_CMP_GT_OS>(lhs.into(), rhs.into()));
+            Mask::<i32, 4>::from_int_unchecked(mask.into())
+        }
+    }
+}
+
+impl PrefixMax for VectorLos<4> {
+    #[inline]
+    fn prefix_max(highest: f32, angles_in: &[f32], angles_out: &mut [f32]) {
+        let (vector_angles, _) = angles_in.as_chunks::<4>();
+        let (vector_prefix, _) = angles_out.as_chunks_mut::<4>();
+
+        for (prefix, &angle) in zip(vector_prefix.iter_mut(), vector_angles.iter()) {
+            let start = Simd::from(angle);
+
+            let mut v_prefix_max = {
+                let shifted = start.shift_elements_right::<1>(-2000.0f32);
+                Self::max(start, shifted)
+            };
+
+            v_prefix_max = {
+                let shifted = v_prefix_max.shift_elements_right::<2>(-2000.0f32);
+                Self::max(v_prefix_max, shifted)
+            };
+
+            v_prefix_max.copy_to_slice(prefix);
+        }
+
+        let mut local_acc = Simd::splat(highest);
+
+        // accumulate the prefix maxes for blocks, re-computing all prefix maxes
+        // to include the accumulated value
+        for prefix in vector_prefix {
+            let cur_prefix: f32x4 = Simd::from(*prefix);
+            let cur_max: f32x4 = Simd::splat(cur_prefix[3]);
+
+            Self::max(local_acc, cur_prefix).copy_to_slice(prefix);
+            local_acc = Self::max(local_acc, cur_max);
+        }
+    }
+}
+
+#[cfg(target_feature = "avx")]
+impl VectorMax<8> for VectorLos<8> {
+    #[inline]
+    fn max(lhs: f32x8, rhs: f32x8) -> f32x8 {
+        use std::arch::x86_64::_mm256_max_ps;
+        // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input
+        // thus allowing this to be non IEEE754 compliant
+        unsafe { _mm256_max_ps(lhs.into(), rhs.into()).into() }
+    }
+}
+
+#[cfg(all(
+    target_feature = "sse",
+    target_feature = "avx",
+    target_feature = "avx2"
+))]
+impl PrefixMax for VectorLos<8> {
+    #[inline]
+    fn prefix_max(highest: f32, angles_in: &[f32], angles_out: &mut [f32]) {
+        use std::arch::x86_64::{
+            _mm256_blend_ps, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_slli_si256,
+            _mm_max_ps,
+        };
+
+        debug_assert!(
+            angles_in.len().is_multiple_of(8) && angles_in.len() == angles_out.len(),
+            "inconsistent lengths, buffer must be multiple of vector length"
+        );
+        {
+            let (vector_angles, _) = angles_in.as_chunks::<8>();
+            let (vector_prefix, _) = angles_out.as_chunks_mut::<8>();
+
+            izip!(vector_prefix.iter_mut(), vector_angles.iter()).for_each(|(prefix, &angle)| {
+                let start = Simd::from_array(angle);
+                // safety: PrefixMax for VectorLos<8> is guarded by a cfg block for all SIMD instructions
+                let mut v_prefix_max = unsafe {
+                    let shifted = _mm256_slli_si256::<4>(_mm256_castps_si256(start.into()));
+                    let blended = _mm256_blend_ps::<0b0001_0001>(
+                        _mm256_castsi256_ps(shifted),
+                        Simd::splat(-2000.0f32).into(),
+                    );
+                    Self::max(start, blended.into())
+                };
+
+                // safety: PrefixMax for VectorLos<8> is guarded by a cfg block for all SIMD instructions
+                v_prefix_max = unsafe {
+                    let shifted = _mm256_slli_si256::<8>(_mm256_castps_si256(v_prefix_max.into()));
+                    let blended = _mm256_blend_ps::<0b0011_0011>(
+                        _mm256_castsi256_ps(shifted),
+                        Simd::splat(-2000.0f32).into(),
+                    );
+
+                    Self::max(v_prefix_max, blended.into())
+                };
+
+                v_prefix_max.copy_to_slice(prefix);
+            });
+        };
+
+        {
+            let mut acc: f32x4 = Simd::splat(highest);
+            let (vector_prefix, _) = angles_out.as_chunks_mut::<4>();
+            for prefix in vector_prefix.iter_mut() {
+                // safety: PrefixMax for VectorLos<8> is guarded by a cfg block for all SIMD instructions
+                let new_max: f32x4 =
+                    unsafe { _mm_max_ps(Simd::from_array(*prefix).into(), acc.into()) }.into();
+
+                let cur_max = Simd::splat(prefix[3]);
+
+                // safety: PrefixMax for VectorLos<8> is guarded by a cfg block for all SIMD instructions
+                acc = unsafe { _mm_max_ps(acc.into(), cur_max.into()).into() };
+
+                new_max.copy_to_slice(prefix);
+            }
+        }
+    }
+}
+
+#[cfg(target_feature = "avx")]
+impl VectorGreater<8> for VectorLos<8> {
+    #[inline]
+    fn gt(lhs: f32x8, rhs: f32x8) -> Mask<i32, 8> {
+        use std::arch::x86_64::{_mm256_castps_si256, _mm256_cmp_ps, _CMP_GT_OS};
+
+        // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input
+        // thus allowing this to be non IEEE754 compliant
+        unsafe {
+            let mask = _mm256_castps_si256(_mm256_cmp_ps::<_CMP_GT_OS>(lhs.into(), rhs.into()));
+            Mask::<i32, 8>::from_int_unchecked(mask.into())
+        }
+    }
+}
+
+#[cfg(target_feature = "avx512f")]
+impl VectorMax<16> for VectorLos<16> {
+    #[inline]
+    fn max(lhs: Simd<f32, 16>, rhs: Simd<f32, 16>) -> Simd<f32, 16> {
+        use std::arch::x86_64::_mm512_max_ps;
+        // safety: the caller of Viewshed<4> guarantees that -0.0 or NaN are not in the input
+        // thus allowing this to be non IEEE754 compliant
+        unsafe { _mm512_max_ps(lhs.into(), rhs.into()).into() }
+    }
+}
+
+#[cfg(target_feature = "avx512f")]
+impl VectorGreater<16> for VectorLos<16> {
+    #[inline]
+    fn gt(lhs: Simd<f32, 16>, rhs: Simd<f32, 16>) -> Mask<i32, 16> {
+        use std::arch::x86_64::_mm512_cmple_ps_mask;
+        // safety: the caller of Viewshed<8> guarantees that -0.0 or NaN are not in the input
+        // thus allowing this to be non IEEE754 compliant
+        unsafe {
+            let mask = _mm512_cmple_ps_mask(lhs.into(), rhs.into());
+            Mask::<i32, 16>::from_bitmask(mask.into())
+        }
+    }
+}
+
+#[cfg(target_feature = "avx512f")]
+impl PrefixMax for VectorLos<16> {
+    #[inline]
+    fn prefix_max(highest: f32, angles_in: &[f32], angles_out: &mut [f32]) {
+        use std::arch::x86_64::{
+            __m512, _mm512_alignr_epi32, _mm512_castps_si512, _mm512_castsi512_ps, _mm512_max_ps,
+        };
+        use std::simd::f32x16;
+
+        #[cfg(target_feature = "avx512f")]
+        #[expect(
+            clippy::cast_sign_loss,
+            clippy::as_conversions,
+            clippy::cast_possible_truncation,
+            clippy::cast_possible_wrap,
+            reason = "K is always in bounds"
+        )]
+        fn mm512_slli_si512<const K: usize>(elem: __m512) -> __m512
+        where
+            [(); { (16 - K) as i32 } as usize]:,
+        {
+            // safety: all mm512 intrinsics are guarded by avx512f
+            unsafe {
+                let zero = f32x16::splat(-2000.0f32);
+                _mm512_castsi512_ps(_mm512_alignr_epi32::<{ (16 - K) as i32 }>(
+                    _mm512_castps_si512(elem),
+                    _mm512_castps_si512(zero.into()),
+                ))
+            }
+        }
+
+        let (vector_prefix, _) = angles_out.as_chunks_mut::<16>();
+        let (vector_angle, _) = angles_in.as_chunks::<16>();
+
+        for (prefix, &angle) in zip(vector_prefix.iter_mut(), vector_angle.iter()) {
+            let simd_angle = Simd::from_array(angle);
+            // safety: all the following operations are guarded by the avx512f build falg
+            unsafe {
+                let mut v_prefix_max =
+                    _mm512_max_ps(simd_angle.into(), mm512_slli_si512::<1>(simd_angle.into()));
+
+                v_prefix_max = _mm512_max_ps(v_prefix_max, mm512_slli_si512::<2>(v_prefix_max));
+
+                v_prefix_max = _mm512_max_ps(v_prefix_max, mm512_slli_si512::<4>(v_prefix_max));
+
+                v_prefix_max = _mm512_max_ps(v_prefix_max, mm512_slli_si512::<8>(v_prefix_max));
+
+                let simd_prefix_max: f32x16 = v_prefix_max.into();
+                simd_prefix_max.copy_to_slice(prefix);
+            }
+        }
+
+        let mut local_acc = f32x16::splat(highest);
+
+        // accumulate the prefix maxes for blocks, re-computing all prefix maxes
+        // to include the accumulated value
+        for prefix in vector_prefix.iter_mut() {
+            let cur_prefix: f32x16 = Simd::from_array(*prefix);
+            let cur_max: f32x16 = Simd::splat(cur_prefix[15]);
+
+            Self::max(local_acc, cur_prefix).copy_to_slice(prefix);
+            local_acc = Self::max(local_acc, cur_max);
+        }
+    }
+}
+
+impl<const WIDTH: usize> Angle for VectorLos<WIDTH>
+where
+    LaneCount<WIDTH>: SupportedLaneCount,
+{
+    #[inline]
+    default fn calculate_angles(
+        pov_height: f32,
+        elevations: &[i16],
+        distances: &[f32],
+        adjustments: &[f32],
+        angles_out: &mut [f32],
+    ) {
+        debug_assert!(
+            elevations.len().is_multiple_of(WIDTH),
+            "expected elevations to be a multiple of {WIDTH}",
+        );
+
+        debug_assert!(
+            distances.len().is_multiple_of(WIDTH),
+            "expected distances to be a multiple of {WIDTH}",
+        );
+
+        debug_assert!(
+            adjustments.len().is_multiple_of(WIDTH),
+            "expected adjustments to be a multiple of {WIDTH}",
+        );
+
+        debug_assert!(
+            angles_out.len().is_multiple_of(WIDTH),
+            "expected angles buf to be a multiple of {WIDTH}",
+        );
+
+        let (vector_angles, _) = angles_out.as_chunks_mut::<{ WIDTH }>();
+        let (vector_elevations, _) = elevations.as_chunks::<{ WIDTH }>();
+        let (vector_adjustments, _) = adjustments.as_chunks::<{ WIDTH }>();
+        let (vector_distances, _) = distances.as_chunks::<{ WIDTH }>();
+
+        for (angle, &elevation, &distance, &adjustment) in izip!(
+            vector_angles.iter_mut(),
+            vector_elevations.iter(),
+            vector_distances.iter(),
+            vector_adjustments.iter()
+        ) {
+            let float_elevation: Simd<f32, { WIDTH }> = Simd::from(elevation).cast();
+
+            let adjusted = float_elevation - Simd::splat(pov_height);
+
+            let res = (adjusted / Simd::from_array(distance)) - Simd::from_array(adjustment);
+
+            res.copy_to_slice(angle);
+        }
+    }
+}
+
+impl<const WIDTH: usize> Accumulate<(f32, f32)> for VectorLos<WIDTH>
+where
+    LaneCount<WIDTH>: SupportedLaneCount,
+{
+    #[inline]
+    fn accumulate(
+        init: (f32, f32),
+        angles: &[f32],
+        prefix: &[f32],
+        distances: &[f32],
+        bitmap: &mut Vec<bool>,
+        output_sector: bool,
+    ) -> (f32, f32) {
+        debug_assert!(angles.len().is_multiple_of(WIDTH), "");
+        debug_assert!(prefix.len().is_multiple_of(WIDTH), "");
+        debug_assert!(distances.len().is_multiple_of(WIDTH), "");
+
+        let (vector_angles, _) = angles.as_chunks::<{ WIDTH }>();
+        let (vector_prefix, _) = prefix.as_chunks::<{ WIDTH }>();
+        let (vector_dists, _) = distances.as_chunks::<{ WIDTH }>();
+
+        izip!(vector_angles, vector_prefix, vector_dists,).fold(
+            init,
+            |acc, (&angle_arr, &prefix_arr, &distances_arr)| {
+                let mask = Self::gt(Simd::from_array(angle_arr), Simd::from_array(prefix_arr));
+                if output_sector {
+                    bitmap.extend(mask.to_array());
+                }
+
+                if !mask.any() {
+                    return acc;
+                }
+
+                let dist = mask.select(Simd::from_array(distances_arr), Simd::splat(0.0f32));
+                (
+                    acc.0 + (dist * Simd::<f32, { WIDTH }>::splat(TAN_ONE_RAD)).reduce_sum(),
+                    acc.1.max(dist.reduce_max()),
+                )
+            },
+        )
+    }
+}
+
+/// `GenericExpr` lets a const generic expression be evaluated in its
+/// `CONDITION` parameter for traits that need to evaluate constant expressions
+/// as part of their trait bounds
+struct GenericExpr<const CONDITION: bool>;
+
+/// `IsTrue `is a "marker" trait for trait bounds for when a const generic expr
+/// evaluates to true, and is only implemented for `GenericExpr<{true}>`
+trait IsTrue {}
+impl IsTrue for GenericExpr<true> {}
+
+impl<const SIZE: usize, const WIDTH: usize> Accumulate<Unroll<SIZE>> for VectorLos<WIDTH>
+where
+    LaneCount<WIDTH>: SupportedLaneCount,
+    GenericExpr<{ SIZE.is_multiple_of(WIDTH) }>: IsTrue,
+{
+    #[inline]
+    fn accumulate(
+        mut init: Unroll<SIZE>,
+        angles: &[f32],
+        prefix: &[f32],
+        distances: &[f32],
+        bitmap: &mut Vec<bool>,
+        output_sector: bool,
+    ) -> Unroll<SIZE> {
+        debug_assert!(
+            angles.len().is_multiple_of(WIDTH),
+            "distance unroll should be multiple of width"
+        );
+        debug_assert!(
+            prefix.len().is_multiple_of(WIDTH),
+            "distance unroll should be multiple of width"
+        );
+        debug_assert!(
+            distances.len().is_multiple_of(WIDTH),
+            "distance unroll should be multiple of width"
+        );
+        debug_assert!(angles.len() <= SIZE, "angles must be less than unroll size");
+
+        let (vector_sum, _) = init.heatmap.as_chunks_mut::<{ WIDTH }>();
+        let (vector_longest, _) = init.longest.as_chunks_mut::<{ WIDTH }>();
+
+        let (vector_angles, _) = angles.as_chunks::<{ WIDTH }>();
+        let (vector_prefix, _) = prefix.as_chunks::<{ WIDTH }>();
+        let (vector_dists, _) = distances.as_chunks::<{ WIDTH }>();
+
+        izip!(
+            vector_sum,
+            vector_longest,
+            vector_angles,
+            vector_prefix,
+            vector_dists,
+        )
+        .for_each(
+            |(sum_arr, longest_arr, &angle_arr, &prefix_arr, &distances_arr)| {
+                let mask = Self::gt(Simd::from_array(angle_arr), Simd::from_array(prefix_arr));
+
+                if output_sector {
+                    bitmap.extend(mask.to_array());
+                }
+
+                if !mask.any() {
+                    return;
+                }
+
+                let dist = mask.select(Simd::from_array(distances_arr), Simd::splat(0.0f32));
+
+                Self::max(Simd::from_array(*longest_arr), dist).copy_to_slice(longest_arr);
+
+                let acc = Simd::from(*sum_arr) + dist;
+
+                acc.copy_to_slice(sum_arr);
+            },
+        );
+
+        init
+    }
+}
+
+/// `DEFAULT_VECTOR_LENGTH` determines the CPU Kernel's default vector length based off
+/// the architecture that the binary is built for
+pub const DEFAULT_VECTOR_LENGTH: usize = const {
+    if cfg!(target_feature = "avx512f") {
+        16
+    } else if cfg!(target_feature = "sse") && cfg!(target_feature = "sse2") {
+        8
+    } else {
+        4
+    }
+};
+
+impl<const UNROLL: usize> From<Unroll<UNROLL>> for (f32, f32)
+where
+    GenericExpr<{ UNROLL.is_multiple_of(DEFAULT_VECTOR_LENGTH) }>: IsTrue,
+{
+    fn from(val: Unroll<UNROLL>) -> Self {
+        let (heatmap, _) = val.heatmap.as_chunks::<DEFAULT_VECTOR_LENGTH>();
+        let (longest, _) = val.longest.as_chunks::<DEFAULT_VECTOR_LENGTH>();
+
+        let heat = heatmap
+            .iter()
+            .fold(Simd::splat(0.0f32), |acc, &heat| {
+                acc + Simd::from_array(heat)
+            })
+            .reduce_sum();
+
+        let long = longest
+            .iter()
+            .fold(Simd::splat(0.0f32), |acc, &long| {
+                VectorLos::<DEFAULT_VECTOR_LENGTH>::max(acc, Simd::from_array(long))
+            })
+            .reduce_max();
+
+        (heat, long)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::cpu::los::{LineOfSight as _, UnrolledLOS};
+    use crate::cpu::vector::VectorLos;
+
+    #[test]
+    fn line_of_sight_four() {
+        let mut vs = UnrolledLOS::<64>::new(16);
+        let (visibility, longest, sector) = vs.line_of_sight::<VectorLos<4>>(
+            0.0f32,
+            &[
+                1000, 4000, 9000, 12000, 3000, 30000, 3000, 3000, 1000, 4000, 9000, 12000, 3000,
+                30000, 3000, 3000,
+            ],
+            true,
+        );
+        println!("{:?} {:?} {:?}", visibility, longest, sector);
+    }
+
+    #[test]
+    #[cfg(all(
+        target_feature = "sse",
+        target_feature = "avx",
+        target_feature = "avx2"
+    ))]
+    fn line_of_sight_eight() {
+        let mut vs = UnrolledLOS::<64>::new(16);
+        let (visibility, longest, sector) = vs.line_of_sight::<VectorLos<8>>(
+            0.0f32,
+            &[
+                1000, 4000, 9000, 12000, 3000, 30000, 3000, 3000, 1000, 4000, 9000, 12000, 3000,
+                30000, 3000, 3000,
+            ],
+            true,
+        );
+        println!("{:?} {:?} {:?}", visibility, longest, sector);
+    }
+}
diff --git a/crates/total-viewsheds/src/main.rs b/crates/total-viewsheds/src/main.rs
index c55cd38..68803dc 100644
--- a/crates/total-viewsheds/src/main.rs
+++ b/crates/total-viewsheds/src/main.rs
@@ -1,5 +1,6 @@
 //! Total Viewshed Calculator
 #![feature(portable_simd)]
+#![feature(specialization)]
 #![expect(
     incomplete_features,
     reason = "our usage isn't crazy and unlikely to break"
@@ -47,7 +48,10 @@ mod output {
     pub mod ring_data;
     pub mod viewshed;
 }
+
+/// cpu implements a CPU kernel for the longest line of sight
 mod cpu;
+
 mod projection;
 
 fn main() -> Result<()> {
diff --git a/scripts/apt_init.sh b/scripts/apt_init.sh
index a4a64a6..ad5ed7f 100644
--- a/scripts/apt_init.sh
+++ b/scripts/apt_init.sh
@@ -2,6 +2,7 @@
 
 sudo apt install -y git clang
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+source ~/.bashrc
 
 git clone https://github.com/AllTheLines/CacheTVS
-cd CacheTVS && git checkout cpu-clean
+cd CacheTVS && git checkout rberger/cpu-clean