diff --git a/external/parthenon b/external/parthenon
index a92df6e4..437e02bf 160000
--- a/external/parthenon
+++ b/external/parthenon
@@ -1 +1 @@
-Subproject commit a92df6e4163291963b5d84136362782d56427484
+Subproject commit 437e02bf62734f7a7962b9e5d0fae0ab36a34dfc
diff --git a/external/patches/parthenon-use-gr-coordinates.patch b/external/patches/parthenon-use-gr-coordinates.patch
index 3b4b816c..fbb4bb3b 100644
--- a/external/patches/parthenon-use-gr-coordinates.patch
+++ b/external/patches/parthenon-use-gr-coordinates.patch
@@ -32,29 +32,4 @@ index d1290dee..50bfc840 100644
  
  namespace parthenon {
  
-diff --git a/src/interface/data_collection.cpp b/src/interface/data_collection.cpp
-index 6a1d72c9..b5ba609b 100644
---- a/src/interface/data_collection.cpp
-+++ b/src/interface/data_collection.cpp
-@@ -48,7 +48,7 @@ std::shared_ptr<T> DataCollection<T>::Add(const std::string &name,
-   if (it != containers_.end()) {
-     // check to make sure they are the same
-     if (!(*src == *(it->second))) {
--      PARTHENON_THROW("Error attempting to add a Container to a Collection");
-+      //PARTHENON_THROW("Error attempting to add a Container to a Collection");
-     }
-     return it->second;
-   }
-diff --git a/src/interface/meshblock_data.cpp b/src/interface/meshblock_data.cpp
-index 8d5dca57..0ab7dad8 100644
---- a/src/interface/meshblock_data.cpp
-+++ b/src/interface/meshblock_data.cpp
-@@ -440,7 +440,7 @@ MeshBlockData<T>::GetVariablesByUid(const std::vector<Uid_t> &uids) {
- 
- template <typename T>
- void MeshBlockData<T>::Remove(const std::string &label) {
--  throw std::runtime_error("MeshBlockData<T>::Remove not yet implemented");
-+  varMap_.erase(label);
- }
- 
- template <typename T>
+
diff --git a/kharma/b_cd/b_cd.cpp b/kharma/b_cd/b_cd.cpp
index a89a0a6d..bb7d8f7e 100644
--- a/kharma/b_cd/b_cd.cpp
+++ b/kharma/b_cd/b_cd.cpp
@@ -223,20 +223,7 @@ Real MaxDivB(MeshData<Real> *md)
 
 TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
 {
-    auto pmesh = md->GetMeshPointer();
-
-    // Print this unless we quash everything
-    int verbose = pmesh->packages.Get("Globals")->Param<int>("verbose");
-    if (verbose >= 0) {
-        static Reduce<Real> max_divb;
-        max_divb.val = B_CD::MaxDivB(md);
-        max_divb.StartReduce(0, MPI_MAX);
-        while (max_divb.CheckReduce() == TaskStatus::incomplete);
-
-        if(MPIRank0()) {
-            std::cout << "Max DivB: " << max_divb.val << std::endl;
-        }
-    }
+    // TODO. Unify w/other B?
 
     return TaskStatus::complete;
 }
@@ -280,16 +267,17 @@ void FillOutput(MeshBlock *pmb, ParameterInput *pin)
 
 void UpdateCtopMax(Mesh *pmesh, ParameterInput *pin, const SimTime &tm)
 {
+    // TODO use new Reductions stuff for this
     // Reduce and record the maximum sound speed on the grid, to propagate
     // phi at that speed next step.
     // Just needs to run after every step, so we use the KHARMA callback at that point.
-    auto& params = pmesh->packages.Get("B_CD")->AllParams();
-    static AllReduce<Real> ctop_max_last_r;
-    ctop_max_last_r.val = params.Get<Real>("ctop_max");
-    ctop_max_last_r.StartReduce(MPI_MAX);
-    while (ctop_max_last_r.CheckReduce() == TaskStatus::incomplete);
-    params.Update<Real>("ctop_max_last", ctop_max_last_r.val);
-    params.Update<Real>("ctop_max", 0.0); // Reset for next max calculation
+    // auto& params = pmesh->packages.Get("B_CD")->AllParams();
+    // static AllReduce<Real> ctop_max_last_r;
+    // ctop_max_last_r.val = params.Get<Real>("ctop_max");
+    // ctop_max_last_r.StartReduce(MPI_MAX);
+    // while (ctop_max_last_r.CheckReduce() == TaskStatus::incomplete);
+    // params.Update<Real>("ctop_max_last", ctop_max_last_r.val);
+    // params.Update<Real>("ctop_max", 0.0); // Reset for next max calculation
 }
 
 } // namespace B_CD
diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
index e9d445c3..23de26ef 100644
--- a/kharma/b_cleanup/b_cleanup.cpp
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -38,6 +38,7 @@
 
 #include "boundaries.hpp"
 #include "decs.hpp"
+#include "kharma.hpp"
 #include "kharma_driver.hpp"
 #include "grmhd.hpp"
 #include "kharma.hpp"
@@ -217,7 +218,7 @@ TaskStatus B_Cleanup::CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     }
 
     // Calculate/print inital max divB exactly as we would during run
-    const double divb_start = B_FluxCT::GlobalMaxDivB(md.get());
+    const double divb_start = B_FluxCT::GlobalMaxDivB(md.get(), true);
     if (divb_start < rel_tolerance && !always_solve) {
         // If divB is "pretty good" and we allow not solving...
         if (MPIRank0())
@@ -236,22 +237,8 @@ TaskStatus B_Cleanup::CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     KHARMADriver::SyncAllBounds(md);
 
     // Add a solver container and associated MeshData
-    for (auto& pmb : pmesh->block_list) {
-        auto &base = pmb->meshblock_data.Get();
-        pmb->meshblock_data.Add("solve", base);
-    }
-    // The "solve" container really only needs the RHS, the solution, and the scratch array dB
-    // This does not affect the main container, but saves a *lot* of time not syncing
-    // static variables.
-    // There's no MeshData-wide 'Remove' so we go block-by-block
-    for (auto& pmb : pmesh->block_list) {
-        auto rc_s = pmb->meshblock_data.Get("solve");
-        auto vars = rc_s->GetVariablesByFlag({Metadata::GetUserFlag("MHD")}).vars();
-        for (auto var : vars) {
-            rc_s->Remove(var->label());
-        }
-    }
-    auto &msolve = pmesh->mesh_data.GetOrAdd("solve", 0);
+    std::vector<std::string> names = KHARMA::GetVariableNames(&pmesh->packages, Metadata::GetUserFlag("B_Cleanup"));
+    auto &msolve = pmesh->mesh_data.Add("solve", names);
 
     // Create a TaskCollection of just the solve,
     // execute it to perform BiCGStab iteration
@@ -285,24 +272,6 @@ TaskStatus B_Cleanup::CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     return TaskStatus::complete;
 }
 
-TaskStatus B_Cleanup::RemoveExtraFields(BlockList_t &blocks)
-{
-    // If we aren't needed to clean anything...
-    if (! (blocks[0]->packages.Get("B_Cleanup")->Param<int>("cleanup_interval") > 0)) {
-        // remove the internal BiCGStab variables by name,
-        // to prevent them weighing down MPI exchanges
-        // TODO anything FillGhost & not Conserved or Primitive
-        for (auto& pmb : blocks) {
-            auto rc_s = pmb->meshblock_data.Get();
-            for (auto varlabel : {"pk0", "res0", "temp0", "divB_RHS", "p"}) {
-                if (rc_s->HasVariable(varlabel))
-                    rc_s->Remove(varlabel);
-            }
-        }
-    }
-    return TaskStatus::complete;
-}
-
 TaskStatus B_Cleanup::ApplyP(MeshData<Real> *msolve, MeshData<Real> *md)
 {
     // Apply on physical zones only, we'll be syncing/updating ghosts
diff --git a/kharma/b_cleanup/b_cleanup.hpp b/kharma/b_cleanup/b_cleanup.hpp
index ebb4037f..b75652d8 100644
--- a/kharma/b_cleanup/b_cleanup.hpp
+++ b/kharma/b_cleanup/b_cleanup.hpp
@@ -67,14 +67,8 @@ TaskStatus CleanupDivergence(std::shared_ptr<MeshData<Real>>& md);
 bool CleanupThisStep(Mesh* pmesh, int nstep);
 
 /**
- * Remove the extra solver fields which B_Cleanup added during initialization.
- * Must be run before every step as the meshblocks are reconstructed per-step from
- * package variable lists.
- */
-TaskStatus RemoveExtraFields(BlockList_t &blocks);
-
-/**
- * Calculate the laplacian using divergence at corners
+ * Calculate the laplacian using divergence at corners.
+ * Extra MeshData arg is just to satisfy Parthenon solver calling convention
  */
 TaskStatus CornerLaplacian(MeshData<Real>* md, const std::string& p_var, MeshData<Real>* md_again, const std::string& lap_var);
 
diff --git a/kharma/b_ct/b_ct.cpp b/kharma/b_ct/b_ct.cpp
index 2b91ed72..2440bfd6 100644
--- a/kharma/b_ct/b_ct.cpp
+++ b/kharma/b_ct/b_ct.cpp
@@ -383,13 +383,15 @@ double B_CT::BlockMaxDivB(MeshBlockData<Real> *rc)
     return max_divb;
 }
 
-double B_CT::GlobalMaxDivB(MeshData<Real> *md)
+double B_CT::GlobalMaxDivB(MeshData<Real> *md, bool all_reduce)
 {
-    static AllReduce<Real> max_divb;
-    max_divb.val = MaxDivB(md);
-    max_divb.StartReduce(MPI_MAX);
-    while (max_divb.CheckReduce() == TaskStatus::incomplete);
-    return max_divb.val;
+    if (all_reduce) {
+        Reductions::StartToAll<Real>(md, 2, MaxDivB(md), MPI_MAX);
+        return Reductions::CheckOnAll<Real>(md, 2);
+    } else {
+        Reductions::Start<Real>(md, 2, MaxDivB(md), MPI_MAX);
+        return Reductions::Check<Real>(md, 2);
+    }
 }
 
 TaskStatus B_CT::PrintGlobalMaxDivB(MeshData<Real> *md, bool kill_on_large_divb)
diff --git a/kharma/b_ct/b_ct.hpp b/kharma/b_ct/b_ct.hpp
index b5b51dbb..0d45a5d3 100644
--- a/kharma/b_ct/b_ct.hpp
+++ b/kharma/b_ct/b_ct.hpp
@@ -98,7 +98,7 @@ double BlockMaxDivB(MeshBlockData<Real> *rc);
 /**
  * Returns the global maximum value, rather than the maximum over this rank's MeshData
  */
-double GlobalMaxDivB(MeshData<Real> *md);
+double GlobalMaxDivB(MeshData<Real> *md, bool all_reduce=false);
 
 /**
  * Diagnostics printed/computed after each step
@@ -124,22 +124,6 @@ void FillOutput(MeshBlock *pmb, ParameterInput *pin);
  */
 void CalcDivB(MeshData<Real> *md, std::string divb_field_name="divB");
 
-// Reductions: FOR LATER
-// KOKKOS_INLINE_FUNCTION Real phi(REDUCE_FUNCTION_ARGS_EH)
-// {
-//     // \Phi == \int |*F^1^0| * gdet * dx2 * dx3 == \int |B1| * gdet * dx2 * dx3
-//     return 0.5 * m::abs(U(m_u.B1, k, j, i)); // factor of gdet already in cons.B
-// }
-
-// inline Real ReducePhi0(MeshData<Real> *md)
-// {
-//     return Reductions::EHReduction(md, UserHistoryOperation::sum, phi, 0);
-// }
-// inline Real ReducePhi5(MeshData<Real> *md)
-// {
-//     return Reductions::EHReduction(md, UserHistoryOperation::sum, phi, 5);
-// }
-
 // Device functions
 template<typename Global>
 KOKKOS_INLINE_FUNCTION Real face_div(const GRCoordinates &G, Global &v, const int &ndim, const int &k, const int &j, const int &i)
diff --git a/kharma/b_flux_ct/b_flux_ct.cpp b/kharma/b_flux_ct/b_flux_ct.cpp
index af2f93e6..6f846c02 100644
--- a/kharma/b_flux_ct/b_flux_ct.cpp
+++ b/kharma/b_flux_ct/b_flux_ct.cpp
@@ -478,7 +478,8 @@ double MaxDivB(MeshData<Real> *md)
     const IndexRange kb = IndexRange{kbl.s, kbl.e + (ndim > 2)};
     const IndexRange block = IndexRange{0, B_U.GetDim(5)-1};
 
-    // TODO Keep zone of max!  Also applies to ctop.
+    // TODO Keep zone of max! See timestep calc
+    // Will need to translate them back to KS to make them useful though
 
     // This is one kernel call per block, because each block will have different bounds.
     // Could consolidate at the cost of lots of bounds checking.
@@ -504,13 +505,15 @@ double MaxDivB(MeshData<Real> *md)
     return max_divb;
 }
 
-double GlobalMaxDivB(MeshData<Real> *md)
+double GlobalMaxDivB(MeshData<Real> *md, bool all_reduce)
 {
-    static AllReduce<Real> max_divb;
-    max_divb.val = MaxDivB(md);
-    max_divb.StartReduce(MPI_MAX);
-    while (max_divb.CheckReduce() == TaskStatus::incomplete);
-    return max_divb.val;
+    if (all_reduce) {
+        Reductions::StartToAll<Real>(md, 2, MaxDivB(md), MPI_MAX);
+        return Reductions::CheckOnAll<Real>(md, 2);
+    } else {
+        Reductions::Start<Real>(md, 2, MaxDivB(md), MPI_MAX);
+        return Reductions::Check<Real>(md, 2);
+    }
 }
 
 TaskStatus PrintGlobalMaxDivB(MeshData<Real> *md, bool kill_on_large_divb)
diff --git a/kharma/b_flux_ct/b_flux_ct.hpp b/kharma/b_flux_ct/b_flux_ct.hpp
index 0fa89728..ffdd4c3f 100644
--- a/kharma/b_flux_ct/b_flux_ct.hpp
+++ b/kharma/b_flux_ct/b_flux_ct.hpp
@@ -110,8 +110,10 @@ double MaxDivB(MeshData<Real> *md);
 
 /**
  * Returns the global maximum value, rather than the maximum over this rank's MeshData
+ * 
+ * By default, only returns the correct value on rank 0 for printing
  */
-double GlobalMaxDivB(MeshData<Real> *md);
+double GlobalMaxDivB(MeshData<Real> *md, bool all_reduce=false);
 
 /**
  * Diagnostics printed/computed after each step
@@ -124,7 +126,7 @@ TaskStatus PrintGlobalMaxDivB(MeshData<Real> *md, bool kill_on_large_divb=false)
  */
 inline TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
 {
-    auto& params = md->GetMeshPointer()->block_list[0]->packages.Get("B_FluxCT")->AllParams();
+    auto& params = md->GetMeshPointer()->packages.Get("B_FluxCT")->AllParams();
     return PrintGlobalMaxDivB(md, params.Get<bool>("kill_on_large_divb"));
 }
 
@@ -133,25 +135,17 @@ inline TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
  */
 void FillOutput(MeshBlock *pmb, ParameterInput *pin);
 /**
- * Fill field "name" with divB
+ * Fill the field 'divb_field_name' with divB
  */
 void CalcDivB(MeshData<Real> *md, std::string divb_field_name="divB");
 
-// Reductions: phi uses global machinery, but divB is too 
-// Can also sum the hemispheres independently to be fancy (TODO?)
-KOKKOS_INLINE_FUNCTION Real phi(REDUCE_FUNCTION_ARGS_EH)
-{
-    // \Phi == \int |*F^1^0| * gdet * dx2 * dx3 == \int |B1| * gdet * dx2 * dx3
-    return 0.5 * m::abs(U(m_u.B1, k, j, i)); // factor of gdet already in cons.B
-}
-
 inline Real ReducePhi0(MeshData<Real> *md)
 {
-    return Reductions::EHReduction(md, UserHistoryOperation::sum, phi, 0);
+    return Reductions::EHReduction<Reductions::Var::phi, Real>(md, UserHistoryOperation::sum, 0);
 }
 inline Real ReducePhi5(MeshData<Real> *md)
 {
-    return Reductions::EHReduction(md, UserHistoryOperation::sum, phi, 5);
+    return Reductions::EHReduction<Reductions::Var::phi, Real>(md, UserHistoryOperation::sum, 5);
 }
 
 /**
diff --git a/kharma/boundaries/boundaries.cpp b/kharma/boundaries/boundaries.cpp
index 20561573..e3ac6c15 100644
--- a/kharma/boundaries/boundaries.cpp
+++ b/kharma/boundaries/boundaries.cpp
@@ -76,8 +76,8 @@ std::shared_ptr<KHARMAPackage> KBoundaries::Initialize(ParameterInput *pin, std:
 
     Metadata m_x1, m_x2, m_x3;
     {
-        // We can't use GetVariablesByFlag yet, so walk through and count manually
-        int nvar = KHARMA::CountVars(packages.get(), Metadata::FillGhost);
+        // We can't use GetVariablesByFlag yet, so ask the packages
+        int nvar = KHARMA::PackDimension(packages.get(), Metadata::FillGhost);
 
         // We also don't know the mesh size, since it's not constructed.  We infer.
         const int ng = pin->GetInteger("parthenon/mesh", "nghost");
diff --git a/kharma/coordinates/coordinate_utils.hpp b/kharma/coordinates/coordinate_utils.hpp
index 98e6a54e..79e95fec 100644
--- a/kharma/coordinates/coordinate_utils.hpp
+++ b/kharma/coordinates/coordinate_utils.hpp
@@ -34,6 +34,7 @@
 #pragma once
 
 #include "decs.hpp"
+#include "matrix.hpp"
 
 /**
  * Rotate a set of coordinates 'Xin' by 'angle' about the *y-axis*
diff --git a/kharma/debug.cpp b/kharma/debug.cpp
deleted file mode 100644
index be3aa6e7..00000000
--- a/kharma/debug.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/* 
- *  File: debug.cpp
- *  
- *  BSD 3-Clause License
- *  
- *  Copyright (c) 2020, AFD Group at UIUC
- *  All rights reserved.
- *  
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *  
- *  1. Redistributions of source code must retain the above copyright notice, this
- *     list of conditions and the following disclaimer.
- *  
- *  2. Redistributions in binary form must reproduce the above copyright notice,
- *     this list of conditions and the following disclaimer in the documentation
- *     and/or other materials provided with the distribution.
- *  
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *  
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "debug.hpp"
-
-#include "decs.hpp"
-
-#include "floors.hpp"
-#include "grmhd_functions.hpp"
-#include "types.hpp"
-
-// TODO make this a DomainReduce, and add better verbosity options
-// TODO 
-
-TaskStatus CheckNaN(MeshData<Real> *md, int dir, IndexDomain domain)
-{
-    Flag("CheckNaN");
-    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
-
-    // Pack variables
-    auto& cmax = md->PackVariables(std::vector<std::string>{"Flux.cmax"});
-    auto& cmin = md->PackVariables(std::vector<std::string>{"Flux.cmin"});
-
-    // Get sizes
-    IndexRange ib = md->GetBoundsI(IndexDomain::interior);
-    IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
-    IndexRange kb = md->GetBoundsK(IndexDomain::interior);
-    IndexRange block = IndexRange{0, cmax.GetDim(5) - 1};
-
-    // TODO these two kernels can be one with some Kokkos magic
-    int nzero = 0, nnan = 0;
-    Kokkos::Sum<int> zero_reducer(nzero);
-    Kokkos::Sum<int> nan_reducer(nnan);
-    pmb0->par_reduce("ctop_zeros", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-        KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, int &local_result) {
-            if (m::max(cmax(b, dir-1, k, j, i), cmin(b, dir-1, k, j, i)) <= 0.) {
-                ++local_result;
-            }
-        }
-    , zero_reducer);
-    pmb0->par_reduce("ctop_nans", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-        KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, int &local_result) {
-            if (m::isnan(m::max(cmax(b, dir-1, k, j, i), cmin(b, dir-1, k, j, i)))) {
-                ++local_result;
-                printf("ctop NaN at %d %d %d along dir %d\n", i, j, k, dir); // EDIT
-            }
-        }
-    , nan_reducer);
-
-    // Reductions in parallel
-    // Only need to reduce to head node, saves time
-    static Reduce<int> nzero_tot, nnan_tot;
-    nzero_tot.val = nzero;
-    nnan_tot.val = nnan;
-    nzero_tot.StartReduce(0, MPI_SUM);
-    nnan_tot.StartReduce(0, MPI_SUM);
-    while (nzero_tot.CheckReduce() == TaskStatus::incomplete);
-    while (nnan_tot.CheckReduce() == TaskStatus::incomplete);
-    nzero = nzero_tot.val;
-    nnan = nnan_tot.val;
-
-    if (MPIRank0() && (nzero > 0 || nnan > 0)) {
-        // TODO string formatting in C++ that doesn't suck
-        fprintf(stderr, "Max signal speed ctop was 0 or NaN, direction %d (%d zero, %d NaN)", dir, nzero, nnan);
-        throw std::runtime_error("Bad ctop!");
-    }
-
-    // TODO reimplement printing *where* these values were hit?
-    // May not even be that useful, as the cause is usually much earlier
-
-    EndFlag();
-    return TaskStatus::complete;
-}
-
-TaskStatus CheckNegative(MeshData<Real> *md, IndexDomain domain)
-{
-    Flag("CheckNegative");
-    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
-    // Pack variables
-    auto rho_p = md->PackVariables(std::vector<std::string>{"prims.rho"});
-    auto u_p = md->PackVariables(std::vector<std::string>{"prims.u"});
-    auto rho_c = md->PackVariables(std::vector<std::string>{"cons.rho"});
-    // Get sizes
-    IndexRange ib = md->GetBoundsI(domain);
-    IndexRange jb = md->GetBoundsJ(domain);
-    IndexRange kb = md->GetBoundsK(domain);
-    IndexRange block = IndexRange{0, rho_p.GetDim(5)-1};
-
-    // Check for negative values in the conserved vars
-    int nless = 0;
-    Kokkos::Sum<int> sum_reducer(nless);
-    pmb0->par_reduce("count_negative_U", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-        KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, int &local_result) {
-            if (rho_c(b, 0, k, j, i) < 0.) ++local_result;
-        }
-    , sum_reducer);
-
-    int nless_rho = 0, nless_u = 0;
-    Kokkos::Sum<int> sum_reducer_rho(nless_rho);
-    Kokkos::Sum<int> sum_reducer_u(nless_u);
-    pmb0->par_reduce("count_negative_RHO", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-        KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, int &local_result) {
-            if (rho_p(b, 0, k, j, i) < 0.) ++local_result;
-        }
-    , sum_reducer_rho);
-    pmb0->par_reduce("count_negative_UU", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-        KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, int &local_result) {
-            if (u_p(b, 0, k, j, i) < 0.) ++local_result;
-        }
-    , sum_reducer_u);
-
-    // Reductions in parallel
-    static Reduce<int> nless_tot, nless_rho_tot, nless_u_tot;
-    nless_tot.val = nless;
-    nless_rho_tot.val = nless_rho;
-    nless_u_tot.val = nless_u;
-    nless_tot.StartReduce(0, MPI_SUM);
-    nless_rho_tot.StartReduce(0, MPI_SUM);
-    nless_u_tot.StartReduce(0, MPI_SUM);
-    while (nless_tot.CheckReduce() == TaskStatus::incomplete);
-    while (nless_rho_tot.CheckReduce() == TaskStatus::incomplete);
-    while (nless_u_tot.CheckReduce() == TaskStatus::incomplete);
-    nless = nless_tot.val;
-    nless_rho = nless_rho_tot.val;
-    nless_u = nless_u_tot.val;
-
-    if (MPIRank0() && nless > 0) {
-        std::cout << "Number of negative conserved rho: " << nless << std::endl;
-    }
-    if (MPIRank0() && (nless_rho > 0 || nless_u > 0)) {
-        std::cout << "Number of negative primitive rho, u: " << nless_rho << "," << nless_u << std::endl;
-    }
-
-    EndFlag();
-    return TaskStatus::complete;
-}
diff --git a/kharma/debug.hpp b/kharma/debug.hpp
deleted file mode 100644
index ec792836..00000000
--- a/kharma/debug.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/* 
- *  File: debug.hpp
- *  
- *  BSD 3-Clause License
- *  
- *  Copyright (c) 2020, AFD Group at UIUC
- *  All rights reserved.
- *  
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *  
- *  1. Redistributions of source code must retain the above copyright notice, this
- *     list of conditions and the following disclaimer.
- *  
- *  2. Redistributions in binary form must reproduce the above copyright notice,
- *     this list of conditions and the following disclaimer in the documentation
- *     and/or other materials provided with the distribution.
- *  
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *  
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#pragma once
-
-#include "decs.hpp"
-#include "types.hpp"
-
-// TODO TODO Namespace
-
-/**
- * Check the max signal speed (ctop) for 0-values or NaNs.
- * This is a final warning that something is very wrong and we should crash.
- */
-TaskStatus CheckNaN(MeshData<Real> *md, int dir, IndexDomain domain=IndexDomain::interior);
-
-/**
- * Check the primitive and conserved variables for negative values that definitely shouldn't be negative
- * That is: primitive rho, u, conserved rho*u^t
- */
-TaskStatus CheckNegative(MeshData<Real> *md, IndexDomain domain=IndexDomain::interior);
diff --git a/kharma/driver/imex_step.cpp b/kharma/driver/imex_step.cpp
index 8d86489a..3928a2c5 100644
--- a/kharma/driver/imex_step.cpp
+++ b/kharma/driver/imex_step.cpp
@@ -44,7 +44,6 @@
 #include "wind.hpp"
 // Other headers
 #include "boundaries.hpp"
-#include "debug.hpp"
 #include "flux.hpp"
 #include "resize_restart.hpp"
 #include "implicit.hpp"
@@ -70,37 +69,28 @@ TaskCollection KHARMADriver::MakeImExTaskCollection(BlockList_t &blocks, int sta
     const bool use_jcon = pkgs.count("Current");
     const bool use_linesearch = (use_implicit) ? pkgs.at("Implicit")->Param<bool>("linesearch") : false;
 
-    // If we cleaned up, this added other fields marked FillDerived
-    // Remove them before we allocate the space
-    if (use_b_cleanup) {
-        B_Cleanup::RemoveExtraFields(blocks);
-    }
-
-    // Allocate the fluid states ("containers") we need for each block
-    for (auto& pmb : blocks) {
-        // first make other useful containers
-        auto &base = pmb->meshblock_data.Get();
-        if (stage == 1) {
-            pmb->meshblock_data.Add("dUdt", base);
-            for (int i = 1; i < integrator->nstages; i++)
-                pmb->meshblock_data.Add(integrator->stage_name[i], base);
-            
-            if (use_jcon) {
-                // At the end of the step, updating "mbd_sub_step_final" updates the base
-                // So we have to keep a copy at the beginning to calculate jcon
-                pmb->meshblock_data.Add("preserve", base);
-                // Above only copies on allocate -- ensure we copy every step
-                Copy<MeshBlockData<Real>>({}, base.get(), pmb->meshblock_data.Get("preserve").get());
-            }
-
-            if (use_implicit) {
-                // When solving, we need a temporary copy with any explicit updates,
-                // but not overwriting the beginning- or mid-step values
-                pmb->meshblock_data.Add("solver", base);
-                if (use_linesearch) {
-                    // Need an additional state for linesearch
-                    pmb->meshblock_data.Add("linesearch", base);
-                }
+    // Allocate/copy the things we need
+    // TODO these can now be reduced by including the var lists/flags which actually need to be allocated
+    // TODO except the Copy they can be run on step 1 only
+    if (stage == 1) {
+        auto &base = pmesh->mesh_data.Get();
+        // Fluxes
+        pmesh->mesh_data.Add("dUdt");
+        for (int i = 1; i < integrator->nstages; i++)
+            pmesh->mesh_data.Add(integrator->stage_name[i]);
+        // Preserve state for time derivatives if we need to output current
+        if (use_jcon) {
+            pmesh->mesh_data.Add("preserve");
+            // Above only copies on allocate -- ensure we copy every step
+            Copy<MeshData<Real>>({}, base.get(), pmesh->mesh_data.Get("preserve").get());
+        }
+        if (use_implicit) {
+            // When solving, we need a temporary copy with any explicit updates,
+            // but not overwriting the beginning- or mid-step values
+            pmesh->mesh_data.Add("solver");
+            if (use_linesearch) {
+                // Need an additional state for linesearch
+                pmesh->mesh_data.Add("linesearch");
             }
         }
     }
diff --git a/kharma/driver/kharma_driver.cpp b/kharma/driver/kharma_driver.cpp
index d60ed54d..37a28dec 100644
--- a/kharma/driver/kharma_driver.cpp
+++ b/kharma/driver/kharma_driver.cpp
@@ -266,5 +266,35 @@ TaskID KHARMADriver::AddFluxCalculations(TaskID& t_start, TaskList& tl, KReconst
                   << "donor_cell, linear_mc, weno5" << std::endl;
         throw std::invalid_argument("Unsupported reconstruction algorithm!");
     }
-    return t_calculate_flux1 | t_calculate_flux2 | t_calculate_flux3;
+    auto t_calc_fluxes = t_calculate_flux1 | t_calculate_flux2 | t_calculate_flux3;
+
+    auto t_ctop = t_calc_fluxes;
+    if (md->GetMeshPointer()->packages.Get("Globals")->Param<int>("extra_checks") > 0) {
+        auto t_ctop = tl.AddTask(t_calc_fluxes, Flux::CheckCtop, md);
+    }
+
+    return t_ctop;
+}
+
+void KHARMADriver::SetGlobalTimeStep()
+{
+  // TODO TODO apply the limits from GRMHD package here
+  if (tm.dt < 0.1 * std::numeric_limits<Real>::max()) {
+    tm.dt *= 2.0;
+  }
+  Real big = std::numeric_limits<Real>::max();
+  for (auto const &pmb : pmesh->block_list) {
+    tm.dt = std::min(tm.dt, pmb->NewDt());
+    pmb->SetAllowedDt(big);
+  }
+
+    // TODO start reduce at the end of the per-meshblock stuff, check here
+#ifdef MPI_PARALLEL
+  PARTHENON_MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE, &tm.dt, 1, MPI_PARTHENON_REAL, MPI_MIN,
+                                    MPI_COMM_WORLD));
+#endif
+
+  if (tm.time < tm.tlim &&
+      (tm.tlim - tm.time) < tm.dt) // timestep would take us past desired endpoint
+    tm.dt = tm.tlim - tm.time;
 }
diff --git a/kharma/driver/kharma_driver.hpp b/kharma/driver/kharma_driver.hpp
index e5a2c64e..6c695ea4 100644
--- a/kharma/driver/kharma_driver.hpp
+++ b/kharma/driver/kharma_driver.hpp
@@ -59,6 +59,9 @@ class KHARMADriver : public MultiStageDriver {
         // Eliminate Parthenon's print statements when starting up the driver, we have a bunch of our own
         void PreExecute() { timer_main.reset(); }
 
+        // Also override the timestep calculation, so we can start moving options etc out of GRMHD package
+        void SetGlobalTimeStep();
+
         /**
          * A Driver object orchestrates everything that has to be done to a mesh to take a step.
          * The function MakeTaskCollection outlines everything to be done in one sub-step,
@@ -74,19 +77,22 @@ class KHARMADriver : public MultiStageDriver {
          * 4. Recover primtive variables
          * 4a. Apply any stability limits (floors)
          * 4b. Fix any errors in recovering the primitives, re-apply floors
-         * 5. Apply any source terms (KEL), or calculate outputs (jcon) which require the change in primitive values
+         * 5. Apply any source terms (KEL), or calculate outputs (jcon) which use the primitive variables
          * 
          * This is before any synchronization between different blocks, etc, etc.
-         * Both task lists proceed roughly in this order, and you'll see the same broad outlines in both.
+         * All task lists proceed roughly in this order, but differ in which variables they synchronize via MPI,
+         * or whether they synchronize at all.
          */
         TaskCollection MakeTaskCollection(BlockList_t &blocks, int stage);
+
+        /**
+         * The default step, synchronizing conserved variables and then recovering primitive variables in the ghost zones.
+         */
         TaskCollection MakeDefaultTaskCollection(BlockList_t &blocks, int stage);
 
         /**
-         * This "TaskCollection" (step) 
-         * ImexDriver syncs primitive variables and treats them as fundamental, whereas HARMDriver syncs conserved variables.
-         * This allows ImexDriver to optionally use a semi-implicit step, adding a per-zone implicit solve via the 'Implicit'
-         * package, instead of just explicit RK2 time-stepping.  This driver also allows explicit-only RK2 operation
+         * This step syncs primitive variables and treats them as fundamental
+         * This accommodates semi-implicit stepping, allowing evolving theories with implicit source terms such as extended MHD
          */
         TaskCollection MakeImExTaskCollection(BlockList_t &blocks, int stage);
 
@@ -95,6 +101,8 @@ class KHARMADriver : public MultiStageDriver {
          */
         TaskCollection MakeSimpleTaskCollection(BlockList_t &blocks, int stage);
 
+        // The different drivers share substantially similar portions of the full task list, which we gather into
+
         /**
          * Add the flux calculations in each direction.  Since the flux functions are templated on which
          * reconstruction is being used, this amounts to a lot of shared lines.
@@ -102,10 +110,10 @@ class KHARMADriver : public MultiStageDriver {
         static TaskID AddFluxCalculations(TaskID& t_start, TaskList& tl, KReconstruction::Type recon, MeshData<Real> *md);
 
         /**
-         * Add just the synchronization step to a task list tl, dependent upon taskID t_start, syncing mesh mc1
+         * Add a synchronization retion to an existing TaskCollection tc.
+         * Since the region is self-contained, does not return a TaskID
          * 
-         * This sequence is used identically in several places, so it makes sense
-         * to define once and use elsewhere.
+         * This function polls the 'integrator' member or it would be static too
          */
         void AddFullSyncRegion(Mesh* pmesh, TaskCollection& tc, int stage);
 
diff --git a/kharma/driver/kharma_step.cpp b/kharma/driver/kharma_step.cpp
index 8bb2591b..19a6992c 100644
--- a/kharma/driver/kharma_step.cpp
+++ b/kharma/driver/kharma_step.cpp
@@ -44,7 +44,6 @@
 #include "wind.hpp"
 // Other headers
 #include "boundaries.hpp"
-#include "debug.hpp"
 #include "flux.hpp"
 #include "resize_restart.hpp"
 #include "implicit.hpp"
@@ -88,12 +87,6 @@ TaskCollection KHARMADriver::MakeDefaultTaskCollection(BlockList_t &blocks, int
     const bool use_electrons = pkgs.count("Electrons");
     const bool use_jcon = pkgs.count("Current");
 
-    // If we cleaned up, this added other fields marked FillDerived
-    // Remove them before we allocate the space
-    if (use_b_cleanup) {
-        B_Cleanup::RemoveExtraFields(blocks);
-    }
-
     // Allocate the fluid states ("containers") we need for each block
     for (auto& pmb : blocks) {
         // first make other useful containers
diff --git a/kharma/emhd/emhd.cpp b/kharma/emhd/emhd.cpp
index 34153ce0..3150a198 100644
--- a/kharma/emhd/emhd.cpp
+++ b/kharma/emhd/emhd.cpp
@@ -122,15 +122,16 @@ std::shared_ptr<KHARMAPackage> Initialize(ParameterInput *pin, std::shared_ptr<P
     // Only enable limits internally if we're actually doing EMHD
     params.Add("enable_emhd_limits", enable_emhd_limits);
 
-    Metadata::AddUserFlag("EMHD");
+
+    Metadata::AddUserFlag("EMHDVar");
 
     // General options for primitive and conserved scalar variables in ImEx driver
     // EMHD is supported only with imex driver and implicit evolution,
     // synchronizing primitive variables
     Metadata m_con  = Metadata({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::GetUserFlag("Implicit"),
-                                Metadata::WithFluxes, Metadata::Conserved, Metadata::GetUserFlag("EMHD")});
+                                Metadata::WithFluxes, Metadata::Conserved, Metadata::GetUserFlag("EMHDVar")});
     Metadata m_prim = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::GetUserFlag("Implicit"),
-                                Metadata::Restart, Metadata::FillGhost, Metadata::GetUserFlag("Primitive"), Metadata::GetUserFlag("EMHD")});
+                                Metadata::Restart, Metadata::FillGhost, Metadata::GetUserFlag("Primitive"), Metadata::GetUserFlag("EMHDVar")});
 
     // Heat conduction
     if (conduction) {
@@ -184,7 +185,7 @@ std::shared_ptr<KHARMAPackage> Initialize(ParameterInput *pin, std::shared_ptr<P
 //     auto pmb = rc->GetBlockPointer();
 
 //     PackIndexMap prims_map, cons_map;
-//     auto U_E = rc->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("EMHD"), Metadata::Conserved}, cons_map);
+//     auto U_E = rc->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("EMHDVar"), Metadata::Conserved}, cons_map);
 //     auto P = rc->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("Primitive")}, prims_map);
 //     const VarMap m_p(prims_map, false), m_u(cons_map, true);
 
@@ -216,7 +217,7 @@ void BlockPtoU(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
     auto pmb = rc->GetBlockPointer();
 
     PackIndexMap prims_map, cons_map;
-    auto U_E = rc->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("EMHD"), Metadata::Conserved}, cons_map);
+    auto U_E = rc->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("EMHDVar"), Metadata::Conserved}, cons_map);
     auto P = rc->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("Primitive")}, prims_map);
     const VarMap m_p(prims_map, false), m_u(cons_map, true);
 
diff --git a/kharma/floors/floors.cpp b/kharma/floors/floors.cpp
index 97f2412c..1dd4e557 100644
--- a/kharma/floors/floors.cpp
+++ b/kharma/floors/floors.cpp
@@ -34,7 +34,6 @@
 #include "floors.hpp"
 #include "floors_functions.hpp"
 
-#include "debug.hpp"
 #include "grmhd.hpp"
 #include "grmhd_functions.hpp"
 #include "pack.hpp"
@@ -43,7 +42,7 @@
 
 int CountFFlags(MeshData<Real> *md)
 {
-    return Reductions::CountFlags(md, "fflag", FFlag::flag_names, IndexDomain::interior, 0, true);
+    return Reductions::CountFlags(md, "fflag", FFlag::flag_names, IndexDomain::interior, true)[0];
 }
 
 std::shared_ptr<KHARMAPackage> Floors::Initialize(ParameterInput *pin, std::shared_ptr<Packages_t>& packages)
@@ -225,7 +224,7 @@ TaskStatus Floors::ApplyInitialFloors(ParameterInput *pin, MeshBlockData<Real> *
 
 TaskStatus Floors::ApplyGRMHDFloors(MeshBlockData<Real> *mbd, IndexDomain domain)
 {
-    auto pmb                 = mbd->GetBlockPointer();
+    auto pmb = mbd->GetBlockPointer();
 
     PackIndexMap prims_map, cons_map;
     auto P = mbd->PackVariables({Metadata::GetUserFlag("Primitive")}, prims_map);
@@ -286,6 +285,9 @@ TaskStatus Floors::ApplyGRMHDFloors(MeshBlockData<Real> *mbd, IndexDomain domain
         }
     );
 
+    //if (flag_verbose)
+    //Reductions::StartFlagReduce(md, "fflag", FFlag::flag_names, IndexDomain::interior, true, 0);
+
     return TaskStatus::complete;
 }
 
@@ -297,9 +299,15 @@ TaskStatus Floors::PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
     const auto& pars = pmesh->packages.Get("Globals")->AllParams();
     const int flag_verbose = pars.Get<int>("flag_verbose");
 
-    // Debugging/diagnostic info about floor and inversion flags
-    if (flag_verbose >= 1) {
-        Reductions::CountFlags(md, "fflag", FFlag::flag_names, IndexDomain::interior, flag_verbose, true);
+    // Debugging/diagnostic info about floor flags
+    if (flag_verbose > 0) {
+        // TODO this should move to ApplyGRMHDFloors when everything goes MeshData
+        Reductions::StartFlagReduce(md, "fflag", FFlag::flag_names, IndexDomain::interior, true, 0);
+        // Debugging/diagnostic info about floor and inversion flags
+        Reductions::CheckFlagReduceAndPrintHits(md, "fflag", FFlag::flag_names, IndexDomain::interior, true, 0);
     }
+
+    // Anything else (energy conservation? Added material stats?)
+
     return TaskStatus::complete;
 }
diff --git a/kharma/floors/floors.hpp b/kharma/floors/floors.hpp
index 3e3d9edc..4325680c 100644
--- a/kharma/floors/floors.hpp
+++ b/kharma/floors/floors.hpp
@@ -80,7 +80,8 @@ static const std::map<int, std::string> flag_names = {
     {TEMP, "TEMPERATURE"},
     {KTOT, "ENTROPY"},
     {GEOM_RHO_FLUX, "GEOM_RHO_ON_RECON"},
-    {GEOM_U_FLUX, "GEOM_U_ON_RECON"}};
+    {GEOM_U_FLUX, "GEOM_U_ON_RECON"}
+};
 }
 
 namespace Floors {
diff --git a/kharma/flux/flux.cpp b/kharma/flux/flux.cpp
index 68bc6196..a49d704a 100644
--- a/kharma/flux/flux.cpp
+++ b/kharma/flux/flux.cpp
@@ -50,7 +50,7 @@ std::shared_ptr<KHARMAPackage> Flux::Initialize(ParameterInput *pin, std::shared
 
     // We can't just use GetVariables or something since there's no mesh yet.
     // That's what this function is for.
-    int nvar = KHARMA::CountVars(packages.get(), Metadata::WithFluxes);
+    int nvar = KHARMA::PackDimension(packages.get(), Metadata::WithFluxes);
     std::cout << "Allocating fluxes with nvar: " << nvar << std::endl;
     std::vector<int> s_flux({nvar});
     // TODO optionally move all these to faces? Not important yet, no output, more memory
@@ -266,3 +266,36 @@ void Flux::AddGeoSource(MeshData<Real> *md, MeshData<Real> *mdudt)
         }
     );
 }
+
+TaskStatus Flux::CheckCtop(MeshData<Real> *md)
+{
+    Reductions::DomainReduction<Reductions::Var::nan_ctop, int>(md, UserHistoryOperation::sum, 0);
+    Reductions::DomainReduction<Reductions::Var::zero_ctop, int>(md, UserHistoryOperation::sum, 1);
+    return TaskStatus::complete;
+}
+
+TaskStatus Flux::PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
+{
+    auto pmesh = md->GetMeshPointer();
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+    // Options
+    const auto& pars = pmesh->packages.Get("Globals")->AllParams();
+    const int extra_checks = pars.Get<int>("extra_checks");
+
+    // Check for a soundspeed (ctop) of 0 or NaN
+    // This functions as a "last resort" check to stop a
+    // simulation on obviously bad data
+    if (extra_checks >= 1) {
+        int nnan = Reductions::Check<int>(md, 0);
+        int nzero = Reductions::Check<int>(md, 1);
+
+        if (MPIRank0() && (nzero > 0 || nnan > 0)) {
+            // TODO string formatting in C++ that doesn't suck
+            fprintf(stderr, "Max signal speed ctop of 0 or NaN (%d zero, %d NaN)", nzero, nnan);
+            throw std::runtime_error("Bad ctop!");
+        }
+
+    }
+
+    return TaskStatus::complete;
+}
diff --git a/kharma/flux/flux.hpp b/kharma/flux/flux.hpp
index 289aa43d..d8907c8c 100644
--- a/kharma/flux/flux.hpp
+++ b/kharma/flux/flux.hpp
@@ -37,7 +37,6 @@
 
 #include <parthenon/parthenon.hpp>
 
-#include "debug.hpp"
 #include "floors.hpp"
 #include "flux_functions.hpp"
 #include "pack.hpp"
@@ -48,6 +47,10 @@ namespace Flux {
 
 std::shared_ptr<KHARMAPackage> Initialize(ParameterInput *pin, std::shared_ptr<Packages_t>& packages);
 
+TaskStatus CheckCtop(MeshData<Real> *md);
+
+TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md);
+
 /**
  * Add the geometric source term present in the covariant derivative of the stress-energy tensor,
  * S_nu = sqrt(-g) T^kap_lam Gamma^lam_nu_kap
diff --git a/kharma/grmhd/grmhd.cpp b/kharma/grmhd/grmhd.cpp
index 82b464d9..9c9f1c41 100644
--- a/kharma/grmhd/grmhd.cpp
+++ b/kharma/grmhd/grmhd.cpp
@@ -41,7 +41,6 @@
 
 #include "boundaries.hpp"
 #include "current.hpp"
-#include "debug.hpp"
 #include "floors.hpp"
 #include "flux.hpp"
 #include "gr_coordinates.hpp"
@@ -50,7 +49,6 @@
 
 #include <memory>
 
-
 /**
  * GRMHD package.  Global operations on General Relativistic Magnetohydrodynamic systems.
  */
@@ -258,26 +256,32 @@ Real EstimateTimestep(MeshBlockData<Real> *rc)
         return globals.Get<double>("dt_light");
     }
 
-    typename Kokkos::MinMax<Real>::value_type minmax;
+    Reductions::Reduce3v minmax;
     pmb->par_reduce("ndt_min", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA (const int k, const int j, const int i,
-                      typename Kokkos::MinMax<Real>::value_type &lminmax) {
+                      Reductions::Reduce3v &lminmax) {
             double ndt_zone = 1 / (1 / (G.Dxc<1>(i) /  m::max(cmax(0, k, j, i), cmin(0, k, j, i))) +
                                    1 / (G.Dxc<2>(j) /  m::max(cmax(1, k, j, i), cmin(1, k, j, i))) +
                                    1 / (G.Dxc<3>(k) /  m::max(cmax(2, k, j, i), cmin(2, k, j, i))));
             // Effective "max speed" used for the timestep
             double ctop_max_zone = m::min(G.Dxc<1>(i), m::min(G.Dxc<2>(j), G.Dxc<3>(k))) / ndt_zone;
 
-            if (!m::isnan(ndt_zone) && (ndt_zone < lminmax.min_val))
+            if (!m::isnan(ndt_zone) && (ndt_zone < lminmax.min_val)) {
                 lminmax.min_val = ndt_zone;
-            if (!m::isnan(ctop_max_zone) && (ctop_max_zone > lminmax.max_val))
+                lminmax.min_loc = std::tuple<int, int, int>{i, j, k};
+            }
+            if (!m::isnan(ctop_max_zone) && (ctop_max_zone > lminmax.max_val)) {
                 lminmax.max_val = ctop_max_zone;
+                lminmax.max_loc = std::tuple<int, int, int>{i, j, k};
+            }
         }
-    , Kokkos::MinMax<Real>(minmax));
+    , Reductions::Reduce3(minmax));
     // Keep dt to do some checks below
     const double min_ndt = minmax.min_val;
     const double nctop = minmax.max_val;
 
+    // TODO print tuples
+
     // Apply limits
     const double cfl = grmhd_pars.Get<double>("cfl");
     const double dt_min = grmhd_pars.Get<double>("dt_min");
@@ -401,19 +405,27 @@ TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
     const auto& pars = pmesh->packages.Get("Globals")->AllParams();
     const int extra_checks = pars.Get<int>("extra_checks");
 
-    // Check for a soundspeed (ctop) of 0 or NaN
-    // This functions as a "last resort" check to stop a
-    // simulation on obviously bad data
-    if (extra_checks >= 1) {
-        CheckNaN(md, X1DIR);
-        if (pmesh->ndim > 1) CheckNaN(md, X2DIR);
-        if (pmesh->ndim > 2) CheckNaN(md, X3DIR);
-    }
-
-    // Further checking for any negative values.  Floors should
+    // Checking for any negative values.  Floors should
     // prevent this, so we save it for dire debugging
     if (extra_checks >= 2) {
-        CheckNegative(md, IndexDomain::interior);
+        // Not sure when I'd do the check to hide latency, it's a step-end sort of deal
+        // Just as well it's behind extra_checks 2
+        // This may happen while ch0-1 are in flight from floors, but ch2-4 are now reusable
+        Reductions::DomainReduction<Reductions::Var::neg_rho, int>(md, UserHistoryOperation::sum, 2);
+        Reductions::DomainReduction<Reductions::Var::neg_u, int>(md, UserHistoryOperation::sum, 3);
+        Reductions::DomainReduction<Reductions::Var::neg_rhout, int>(md, UserHistoryOperation::sum, 4);
+        int nless_rho = Reductions::Check<int>(md, 2);
+        int nless_u = Reductions::Check<int>(md, 3);
+        int nless_rhout = Reductions::Check<int>(md, 4);
+
+        if (MPIRank0()) {
+            if (nless_rhout > 0) {
+                std::cout << "Number of negative conserved rho: " << nless_rhout << std::endl;
+            }
+            if (nless_rho > 0 || nless_u > 0) {
+                std::cout << "Number of negative primitive rho, u: " << nless_rho << "," << nless_u << std::endl;
+            }
+        }
     }
 
     return TaskStatus::complete;
diff --git a/kharma/grmhd/grmhd.hpp b/kharma/grmhd/grmhd.hpp
index 51736a2f..53878118 100644
--- a/kharma/grmhd/grmhd.hpp
+++ b/kharma/grmhd/grmhd.hpp
@@ -74,7 +74,7 @@ void FillOutput(MeshBlock *pmb, ParameterInput *pin);
 
 /**
  * Diagnostics performed after each step.
- * Currently finds any negative flags or 0/NaN values in ctop
+ * Currently just looks for negative density/internal energy
  */
 TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *rc);
 }
diff --git a/kharma/grmhd/grmhd_reductions.hpp b/kharma/grmhd/grmhd_reductions.hpp
index 2db2cad8..a98f40a3 100644
--- a/kharma/grmhd/grmhd_reductions.hpp
+++ b/kharma/grmhd/grmhd_reductions.hpp
@@ -59,7 +59,7 @@ KOKKOS_INLINE_FUNCTION Real edot(REDUCE_FUNCTION_ARGS_EH)
     FourVectors Dtmp;
     Real T1[GR_DIM];
     GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
-    Flux::calc_tensor(P, m_p, Dtmp, gam, k, j, i, X1DIR, T1);
+    Flux::calc_tensor(G, P, m_p, Dtmp, gam, k, j, i, X1DIR, T1);
     // \dot{E} == \int - T^1_0 * gdet * dx2 * dx3
     return -T1[X0DIR] * G.gdet(Loci::center, j, i);
 }
@@ -68,7 +68,7 @@ KOKKOS_INLINE_FUNCTION Real ldot(REDUCE_FUNCTION_ARGS_EH)
     FourVectors Dtmp;
     Real T1[GR_DIM];
     GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
-    Flux::calc_tensor(P, m_p, Dtmp, gam, k, j, i, X1DIR, T1);
+    Flux::calc_tensor(G, P, m_p, Dtmp, gam, k, j, i, X1DIR, T1);
     // \dot{L} == \int T^1_3 * gdet * dx2 * dx3
     return T1[X3DIR] * G.gdet(Loci::center, j, i);
 }
@@ -121,7 +121,7 @@ KOKKOS_INLINE_FUNCTION Real jet_lum(REDUCE_FUNCTION_ARGS_MESH)
         FourVectors Dtmp;
         Real T1[GR_DIM];
         GRMHD::calc_4vecs(G, P(b), m_p, k, j, i, Loci::center, Dtmp);
-        Flux::calc_tensor(P(b), m_p, Dtmp, gam, k, j, i, X1DIR, T1);
+        Flux::calc_tensor(G, P(b), m_p, Dtmp, gam, k, j, i, X1DIR, T1);
         // If sigma > 1...
         if ((dot(Dtmp.bcon, Dtmp.bcov) / P(b, m_p.RHO, k, j, i)) > 1.) {
             // Energy flux, like at EH. 2D integral jacobian, so we have to take X1 off of auto-applied dV
diff --git a/kharma/implicit/fixup.cpp b/kharma/implicit/fix_solve.cpp
similarity index 96%
rename from kharma/implicit/fixup.cpp
rename to kharma/implicit/fix_solve.cpp
index 81871696..9c9d7104 100644
--- a/kharma/implicit/fixup.cpp
+++ b/kharma/implicit/fix_solve.cpp
@@ -1,5 +1,5 @@
 /* 
- *  File: fixup.cpp
+ *  File: fix_solve.cpp
  *  
  *  BSD 3-Clause License
  *  
@@ -86,7 +86,7 @@ TaskStatus Implicit::FixSolve(MeshBlockData<Real> *mbd) {
                 sum_x(ip, k, j, i) = 0.;
             }
             // Fix only bad zones
-            if ((solve_fail(k, j, i)) == SolverStatus::fail) {
+            if (failed(solve_fail(k, j, i))) {
                 //printf("Fixing zone %d %d %d!\n", i, j, k);
                 double wsum = 0., wsum_x = 0.;
                 // double sum[nfvar] = {0.}, sum_x[nfvar] = {0.};
@@ -102,7 +102,7 @@ TaskStatus Implicit::FixSolve(MeshBlockData<Real> *mbd) {
                                 double w = 1./(m::abs(l) + m::abs(m) + m::abs(n) + 1);
 
                                 // Count only the good cells, if we can
-                                if ((solve_fail(kk, jj, ii)) != SolverStatus::fail) {
+                                if (!failed(solve_fail(kk, jj, ii))) {
                                     // Weight by distance.  Note interpolated "fixed" cells stay flagged
                                     wsum += w;
                                     FLOOP sum(ip, k, j, i) += w * P(ip, kk, jj, ii);
@@ -140,7 +140,7 @@ TaskStatus Implicit::FixSolve(MeshBlockData<Real> *mbd) {
 
     pmb->par_for("fix_solver_failures_PtoU", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA (const int& k, const int& j, const int& i) {
-            if (solve_fail(k, j, i) == SolverStatus::fail)
+            if (failed(solve_fail(k, j, i)))
                 Flux::p_to_u(G, P_all, m_p, emhd_params, gam, k, j, i, U_all, m_u);
         }
     );
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
index 6254055a..826a4665 100644
--- a/kharma/implicit/implicit.cpp
+++ b/kharma/implicit/implicit.cpp
@@ -34,7 +34,6 @@
 
 #include "implicit.hpp"
 
-#include "debug.hpp"
 #include "grmhd.hpp"
 #include "grmhd_functions.hpp"
 #include "kharma.hpp"
@@ -125,7 +124,7 @@ std::shared_ptr<KHARMAPackage> Implicit::Initialize(ParameterInput *pin, std::sh
     bool save_residual = pin->GetOrAddBoolean("implicit", "save_residual", false);
     params.Add("save_residual", save_residual);
     if (save_residual) {
-        int nvars_implicit  = KHARMA::CountVars(packages.get(), Metadata::GetUserFlag("Implicit"));
+        int nvars_implicit  = KHARMA::PackDimension(packages.get(), Metadata::GetUserFlag("Implicit"));
 
         std::vector<int> s_vars_implicit({nvars_implicit});
         Metadata m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, s_vars_implicit);
@@ -153,7 +152,9 @@ TaskStatus Implicit::Step(MeshData<Real> *md_full_step_init, MeshData<Real> *md_
     const Real delta         = implicit_par.Get<Real>("jacobian_delta");
     const Real rootfind_tol  = implicit_par.Get<Real>("rootfind_tol");
     const bool use_qr        = implicit_par.Get<bool>("use_qr");
-    const int verbose       = pmb_full_step_init->packages.Get("Globals")->Param<int>("verbose");
+    const auto& globals      = pmb_full_step_init->packages.Get("Globals")->AllParams();
+    const int verbose        = globals.Get<int>("verbose");
+    const int flag_verbose   = globals.Get<int>("flag_verbose");
     const Real gam           = pmb_full_step_init->packages.Get("GRMHD")->Param<Real>("gamma");
 
     const bool linesearch         = implicit_par.Get<bool>("linesearch");
@@ -296,7 +297,7 @@ TaskStatus Implicit::Step(MeshData<Real> *md_full_step_init, MeshData<Real> *md_
                 ScratchPad2D<Real> P_linesearch_s(member.team_scratch(scratch_level), n1, nvar);
                 // Scratchpads for solver performance diagnostics
                 ScratchPad1D<Real> solve_norm_s(member.team_scratch(scratch_level), n1);
-                ScratchPad1D<int> solve_fail_s(member.team_scratch(scratch_level), n1);
+                ScratchPad1D<SolverStatus> solve_fail_s(member.team_scratch(scratch_level), n1);
 
                 // Copy some file contents to scratchpads, so we can slice them
                 for(int ip=0; ip < nvar; ++ip) {
@@ -321,7 +322,7 @@ TaskStatus Implicit::Step(MeshData<Real> *md_full_step_init, MeshData<Real> *md_
                             } else {
                                 // Need this to check if the zone had failed in any of the previous iterations.
                                 // If so, we don't attempt to update it again in the implicit solver.
-                                solve_fail_s(i) = solve_fail_all(b, 0, k, j, i);
+                                solve_fail_s(i) = (SolverStatus) solve_fail_all(b, 0, k, j, i);
                             }
                         }
                     );
@@ -534,7 +535,7 @@ TaskStatus Implicit::Step(MeshData<Real> *md_full_step_init, MeshData<Real> *md_
 
                                 // Did we converge to required tolerance? If not, update solve_fail accordingly
                                 if (solve_norm() > rootfind_tol) {
-                                    solve_fail() += SolverStatus::beyond_tol;
+                                    solve_fail() = SolverStatus::beyond_tol; // TODO was changed from +=. Valid?
                                 }
                             }
                         }
@@ -555,7 +556,7 @@ TaskStatus Implicit::Step(MeshData<Real> *md_full_step_init, MeshData<Real> *md_
                 parthenon::par_for_inner(member, ib.s, ib.e,
                     [&](const int& i) {
                         solve_norm_all(b, 0, k, j, i) = solve_norm_s(i);
-                        solve_fail_all(b, 0, k, j, i) = solve_fail_s(i);
+                        solve_fail_all(b, 0, k, j, i) = (Real) solve_fail_s(i);
                     }
                 );
             }
@@ -564,42 +565,44 @@ TaskStatus Implicit::Step(MeshData<Real> *md_full_step_init, MeshData<Real> *md_
         // If we need to print or exit on the max norm...
         if (iter >= iter_min || verbose >= 1) {
             // Take the maximum L2 norm on this rank
-            static AllReduce<Real> max_norm;
-            Kokkos::Max<Real> norm_max(max_norm.val);
+            Real lmax_norm = 0.0;
             pmb_sub_step_init->par_reduce("max_norm", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
                 KOKKOS_LAMBDA (const int& b, const int& k, const int& j, const int& i, Real& local_result) {
                     if (solve_norm_all(b, 0, k, j, i) > local_result) local_result = solve_norm_all(b, 0, k, j, i);
                 }
-            , norm_max);
-            // Then MPI reduce AllReduce to copy the global max to every rank
-            max_norm.StartReduce(MPI_MAX);
-            while (max_norm.CheckReduce() == TaskStatus::incomplete);
-            if (verbose >= 1 && MPIRank0()) printf("Iteration %d max L2 norm: %g\n", iter, max_norm.val);
-
-            // Count total number of solver fails
-            // TODO move reductions like this to PostStep
-            int nfails = 0;
-            Kokkos::Sum<int> sum_reducer(nfails);
-            pmb_sub_step_init->par_reduce("count_solver_fails", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-                KOKKOS_LAMBDA (const int& b, const int& k, const int& j, const int& i, int& local_result) {
-                    if (solve_fail_all(b, 0, k, j, i) == SolverStatus::fail) ++local_result;
+            , Kokkos::Max<Real>(lmax_norm));
+            // Then MPI AllReduce to copy the global max to every rank
+            Reductions::StartToAll<Real>(md_solver, 4, lmax_norm, MPI_MAX);
+            Real max_norm = Reductions::CheckOnAll<Real>(md_solver, 4);
+
+            if (verbose >= 1) {
+                // Count total number of solver fails
+                int lnfails = 0;
+                pmb_sub_step_init->par_reduce("count_solver_fails", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+                    KOKKOS_LAMBDA (const int& b, const int& k, const int& j, const int& i, int& local_result) {
+                        if ((SolverStatus) solve_fail_all(b, 0, k, j, i) == SolverStatus::fail) ++local_result;
+                    }
+                , Kokkos::Sum<int>(lnfails));
+                // Then reduce to rank 0 to print the iteration by iteration
+                Reductions::Start<int>(md_solver, 5, lnfails, MPI_SUM);
+                int nfails = Reductions::Check<int>(md_solver, 5);
+                if (MPIRank0()) {
+                    printf("Iteration %d max L2 norm: %g, failed zones: %d\n", iter, max_norm, nfails);
                 }
-            , sum_reducer);
-            // Then MPI reduce AllReduce to copy the global max to every rank
-            static AllReduce<int> nfails_tot;
-            nfails_tot.val = nfails;
-            nfails_tot.StartReduce(MPI_SUM);
-            while (nfails_tot.CheckReduce() == TaskStatus::incomplete);
-            if (verbose >= 1 && MPIRank0()) printf("Number of failed zones: %d\n", nfails_tot.val);
-
-            // Break if max_norm is less than the total tolerance we set.  TODO per-zone version of this?
-            if (iter >= iter_min && max_norm.val < rootfind_tol) break;
+            }
+
+            // Finally, break if max_norm is less than the total tolerance we set
+            // TODO per-zone tolerance with masks?
+            if (iter >= iter_min && max_norm < rootfind_tol) break;
         }
         EndFlag();
     }
 
-    EndFlag();
+    if (flag_verbose > 0) {
+        Reductions::CheckFlagReduceAndPrintHits(md_solver, "solve_fail", Implicit::status_names, IndexDomain::interior, false, 2);
+    }
 
+    EndFlag();
     return TaskStatus::complete;
 
 }
@@ -613,11 +616,9 @@ TaskStatus Implicit::PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
     const int flag_verbose = pars.Get<int>("flag_verbose");
 
     // Debugging/diagnostic info about implicit solver
-    // TODO status names
-    // if (flag_verbose >= 1) {
-    //     int nflags = Reductions::CountFlags(md, "solve_fail", Implicit::status_names, IndexDomain::interior, flag_verbose, false);
-    //     // TODO TODO yell here if there are too many flags
-    // }
+    if (flag_verbose > 0) {
+        Reductions::CheckFlagReduceAndPrintHits(md, "solve_fail", Implicit::status_names, IndexDomain::interior, false, 2);
+    }
 
     return TaskStatus::complete;
 }
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
index 7004cd9f..91b4413b 100644
--- a/kharma/implicit/implicit.hpp
+++ b/kharma/implicit/implicit.hpp
@@ -65,7 +65,20 @@ namespace Implicit
 // `fail`: manual backtracking wasn't good enough. FixSolve will be called
 // `beyond_tol`: solver didn't converge to prescribed tolerance but didn't fail
 // `backtrack`: step length of 1 gave negative rho/uu, but manual backtracking (0.1) sufficed
-enum SolverStatus{converged=0, fail, beyond_tol, backtrack};
+enum class SolverStatus{converged=0, fail, beyond_tol, backtrack};
+
+static const std::map<int, std::string> status_names = {
+    {(int) SolverStatus::fail, "failed"},
+    {(int) SolverStatus::beyond_tol, "beyond tolerance"},
+    {(int) SolverStatus::backtrack, "backtrack"}
+};
+
+template <typename T>
+KOKKOS_INLINE_FUNCTION bool failed(T status_flag)
+{
+    // Return only zones which outright failed
+    return static_cast<int>(status_flag) == static_cast<int>(SolverStatus::fail);
+}
 
 /**
  * Initialization.  Set parameters.
diff --git a/kharma/inverter/invert_template.hpp b/kharma/inverter/invert_template.hpp
index 38a99239..163ee8d7 100644
--- a/kharma/inverter/invert_template.hpp
+++ b/kharma/inverter/invert_template.hpp
@@ -58,7 +58,9 @@ static const std::map<int, std::string> status_names = {
     {(int) Status::bad_gamma, "Gamma invalid"},
     {(int) Status::neg_rho, "Negative rho"},
     {(int) Status::neg_u, "Negative U"},
-    {(int) Status::neg_rhou, "Negative rho & U"}};
+    {(int) Status::neg_rhou, "Negative rho & U"}
+};
+
 template <typename T>
 KOKKOS_INLINE_FUNCTION bool failed(T status_flag)
 {
diff --git a/kharma/inverter/inverter.cpp b/kharma/inverter/inverter.cpp
index c120d448..62f70e84 100644
--- a/kharma/inverter/inverter.cpp
+++ b/kharma/inverter/inverter.cpp
@@ -39,45 +39,6 @@
 #include "domain.hpp"
 #include "reductions.hpp"
 
-/**
- * Internal inversion fn, templated on inverter type.  Calls through to templated u_to_p
- * This is called with the correct template argument from BlockUtoP
- */
-template<Inverter::Type inverter>
-inline void BlockPerformInversion(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
-{
-    auto pmb = rc->GetBlockPointer();
-    const auto& G = pmb->coords;
-
-    PackIndexMap prims_map, cons_map;
-    auto U = GRMHD::PackMHDCons(rc, cons_map);
-    auto P = GRMHD::PackHDPrims(rc, prims_map);
-    const VarMap m_u(cons_map, true), m_p(prims_map, false);
-
-    GridScalar pflag = rc->Get("pflag").data;
-
-    const Real gam = pmb->packages.Get("GRMHD")->Param<Real>("gamma");
-
-    const Real err_tol = pmb->packages.Get("Inverter")->Param<Real>("err_tol");
-    const int iter_max = pmb->packages.Get("Inverter")->Param<int>("iter_max");
-    const Real stepsize = pmb->packages.Get("Inverter")->Param<Real>("stepsize");
-
-    // Get the primitives from our conserved versions
-    // Notice we recover variables for only the physical (interior or MPI-boundary)
-    // zones!  These are the only ones which are filled at our point in the step
-    auto bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;
-    const IndexRange3 b = KDomain::GetPhysicalRange(rc);
-
-    pmb->par_for("U_to_P", b.ks, b.ke, b.js, b.je, b.is, b.ie,
-        KOKKOS_LAMBDA (const int &k, const int &j, const int &i) {
-            if (KDomain::inside(k, j, i, b)) {
-                // Run over all interior zones and any initialized ghosts
-                pflag(k, j, i) = static_cast<double>(Inverter::u_to_p<inverter>(G, U, m_u, gam, k, j, i, P, m_p, Loci::center));
-            }
-        }
-    );
-}
-
 std::shared_ptr<KHARMAPackage> Inverter::Initialize(ParameterInput *pin, std::shared_ptr<Packages_t>& packages)
 {
     auto pkg = std::make_shared<KHARMAPackage>("Inverter");
@@ -126,6 +87,45 @@ std::shared_ptr<KHARMAPackage> Inverter::Initialize(ParameterInput *pin, std::sh
     return pkg;
 }
 
+/**
+ * Internal inversion fn, templated on inverter type.  Calls through to templated u_to_p
+ * This is called with the correct template argument from BlockUtoP
+ */
+template<Inverter::Type inverter>
+inline void BlockPerformInversion(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
+{
+    auto pmb = rc->GetBlockPointer();
+    const auto& G = pmb->coords;
+
+    PackIndexMap prims_map, cons_map;
+    auto U = GRMHD::PackMHDCons(rc, cons_map);
+    auto P = GRMHD::PackHDPrims(rc, prims_map);
+    const VarMap m_u(cons_map, true), m_p(prims_map, false);
+
+    GridScalar pflag = rc->Get("pflag").data;
+
+    const Real gam = pmb->packages.Get("GRMHD")->Param<Real>("gamma");
+
+    const Real err_tol = pmb->packages.Get("Inverter")->Param<Real>("err_tol");
+    const int iter_max = pmb->packages.Get("Inverter")->Param<int>("iter_max");
+    const Real stepsize = pmb->packages.Get("Inverter")->Param<Real>("stepsize");
+
+    // Get the primitives from our conserved versions
+    // Notice we recover variables for only the physical (interior or MPI-boundary)
+    // zones!  These are the only ones which are filled at our point in the step
+    auto bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;
+    const IndexRange3 b = KDomain::GetPhysicalRange(rc);
+
+    pmb->par_for("U_to_P", b.ks, b.ke, b.js, b.je, b.is, b.ie,
+        KOKKOS_LAMBDA (const int &k, const int &j, const int &i) {
+            if (KDomain::inside(k, j, i, b)) {
+                // Run over all interior zones and any initialized ghosts
+                pflag(k, j, i) = static_cast<double>(Inverter::u_to_p<inverter>(G, U, m_u, gam, k, j, i, P, m_p, Loci::center));
+            }
+        }
+    );
+}
+
 void Inverter::BlockUtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
 {
     // This only chooses an implementation.  See BlockPerformInversion and implementations e.g. onedw.hpp
@@ -137,6 +137,7 @@ void Inverter::BlockUtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coars
     case Type::none:
         break;
     }
+    //Reductions::StartFlagReduce(md, "pflag", Inverter::status_names, IndexDomain::interior, false, 1);
 }
 
 TaskStatus Inverter::PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
@@ -148,9 +149,11 @@ TaskStatus Inverter::PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
     const int flag_verbose = pars.Get<int>("flag_verbose");
 
     // Debugging/diagnostic info about floor and inversion flags
+    // TODO grab the total and die on too many
     if (flag_verbose >= 1) {
-        int nflags = Reductions::CountFlags(md, "pflag", Inverter::status_names, IndexDomain::interior, flag_verbose, false);
-        // TODO TODO yell here if there are too many flags
+        // TODO this should move into BlockUtoP when everything goes MeshData
+        Reductions::StartFlagReduce(md, "pflag", Inverter::status_names, IndexDomain::interior, false, 1);
+        Reductions::CheckFlagReduceAndPrintHits(md, "pflag", Inverter::status_names, IndexDomain::interior, false, 1);
     }
 
     return TaskStatus::complete;
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index d7fc2ffe..6bc4d950 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -301,6 +301,8 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput> &pin)
     auto t_grmhd = tl.AddTask(t_globals | t_driver, KHARMA::AddPackage, packages, GRMHD::Initialize, pin.get());
     // Inverter (TODO: split out fixups, then don't load this when GRMHD isn't loaded)
     auto t_inverter = tl.AddTask(t_grmhd, KHARMA::AddPackage, packages, Inverter::Initialize, pin.get());
+    // Reductions, needed for most other packages
+    auto t_reductions = tl.AddTask(t_none, KHARMA::AddPackage, packages, Reductions::Initialize, pin.get());
 
     // B field solvers, to ensure divB ~= 0.
     // Bunch of logic here: basically we want to load <=1 solver with an encoded order of preference
@@ -335,12 +337,8 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput> &pin)
         if (t_b_field == t_none) t_b_field = t_b_cleanup;
     }
 
-    // Enable calculating jcon iff it is in any list of outputs (and there's even B to calculate it).
-    // Since it is never required to restart, this is the only time we'd write (hence, need) it
-    if (FieldIsOutput(pin.get(), "jcon") && t_b_field != t_none) {
-        auto t_current = tl.AddTask(t_b_field, KHARMA::AddPackage, packages, Current::Initialize, pin.get());
-    }
-    // Electrons are usually boring but not impossible without a B field (TODO add a test?)
+    // Optional standalone packages
+    // Electrons are boring but not impossible without a B field (TODO add a test?)
     if (pin->GetOrAddBoolean("electrons", "on", false)) {
         auto t_electrons = tl.AddTask(t_grmhd, KHARMA::AddPackage, packages, Electrons::Initialize, pin.get());
     }
@@ -350,6 +348,11 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput> &pin)
     if (pin->GetOrAddBoolean("wind", "on", false)) {
         auto t_electrons = tl.AddTask(t_grmhd, KHARMA::AddPackage, packages, Wind::Initialize, pin.get());
     }
+    // Enable calculating jcon iff it is in any list of outputs (and there's even B to calculate it).
+    // Since it is never required to restart, this is the only time we'd write (hence, need) it
+    if (FieldIsOutput(pin.get(), "jcon") && t_b_field != t_none) {
+        auto t_current = tl.AddTask(t_b_field, KHARMA::AddPackage, packages, Current::Initialize, pin.get());
+    }
 
     // Execute the whole collection (just in case we do something fancy?)
     while (!tr.Execute()); // TODO this will inf-loop on error
@@ -365,7 +368,8 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput> &pin)
     KHARMA::AddPackage(packages, KBoundaries::Initialize, pin.get());
 
     // Load the implicit package last, and only if there are any variables which need implicit evolution
-    int n_implicit = CountVars(packages.get(), Metadata::GetUserFlag("Implicit"));
+    auto all_implicit = Metadata::FlagCollection(Metadata::GetUserFlag("Implicit"));
+    int n_implicit = PackDimension(packages.get(), Metadata::GetUserFlag("Implicit"));
     if (n_implicit > 0) {
         KHARMA::AddPackage(packages, Implicit::Initialize, pin.get());
     }
diff --git a/kharma/kharma.hpp b/kharma/kharma.hpp
index 5928ec3a..3e30e7a2 100644
--- a/kharma/kharma.hpp
+++ b/kharma/kharma.hpp
@@ -31,6 +31,7 @@
  *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+#pragma once
 
 #include "decs.hpp"
 #include "types.hpp"
@@ -109,32 +110,36 @@ inline bool FieldIsOutput(ParameterInput *pin, std::string name)
  * This fn calculates the size a VariablePack *would* be, without making one --
  * it uses only the package list, and counts through each variable in each package.
  * Mostly useful for initialization.
- * TODO can this take flagcollections?  Move to Parthenon...
  */
-inline int CountVars(Packages_t* packages, MetadataFlag flag)
+inline int PackDimension(Packages_t* packages, Metadata::FlagCollection fc)
 {
+    // We want to exclude anything specific to B field cleanup & not used elsewhere
+    // (confusingly, this isn't *necessarily* everything in the B_Cleanup package)
+    if (packages->AllPackages().count("B_Cleanup"))
+        fc = fc - Metadata::GetUserFlag("B_Cleanup");
+
+    // Count dimensions (1 for scalars + vector lengths) of each package's variables
     int nvar = 0;
     for (auto pkg : packages->AllPackages()) {
-        for (auto field : pkg.second->AllFields()) {
-            // Specifically ignore the B_Cleanup variables, we'll never want them separately like this
-            bool is_not_cleanup = packages->AllPackages().count("B_Cleanup")
-                                    ? !field.second.IsSet(Metadata::GetUserFlag("B_Cleanup"))
-                                    : true;
-            if (field.second.IsSet(flag) && is_not_cleanup) {
-                int var_len = 0;
-                if (field.second.IsSet(Metadata::Face)) {
-                    var_len = 3; // TODO non-scalar face fields?
-                } else if (field.second.Shape().size() < 1) {
-                    var_len = 1;
-                } else {
-                    var_len = field.second.Shape()[0];
-                }
-                //std::cout << "flag: " << flag << " var: " << field.first.label() << " size: " << var_len << std::endl;
-                nvar += var_len;
-            }
-        }
+        nvar += pkg.second->GetPackDimension(fc);
     }
     return nvar;
 }
 
+/**
+ * This fn calculates the size a VariablePack *would* be, without making one --
+ * it uses only the package list, and counts through each variable in each package.
+ * Mostly useful for initialization.
+ */
+inline std::vector<std::string> GetVariableNames(Packages_t* packages, Metadata::FlagCollection fc)
+{
+    // Count dimensions (1 for scalars + vector lengths) of each package's variables
+    std::vector<std::string> names;
+    for (auto pkg : packages->AllPackages()) {
+        std::vector<std::string> pnames = pkg.second->GetVariableNames(fc);
+        names.insert(names.end(), pnames.begin(), pnames.end());
+    }
+    return names;
+}
+
 }
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index 7a09c3dc..5944bbe1 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -41,7 +41,6 @@
 #include "b_field_tools.hpp"
 #include "blob.hpp"
 #include "boundaries.hpp"
-#include "debug.hpp"
 #include "floors.hpp"
 #include "flux.hpp"
 #include "gr_coordinates.hpp"
@@ -56,6 +55,7 @@
  * Should only be used in initialization code, as the
  * reducer object & MPI comm are created on entry &
  * cleaned on exit
+ * TODO use Reductions stuff?
  */
 template<typename T>
 inline T MPIReduce_once(T f, MPI_Op O)
@@ -69,35 +69,18 @@ inline T MPIReduce_once(T f, MPI_Op O)
     return reduction.val;
 }
 
-// Define reductions we need just for PostInitialize code.
-// TODO namespace...
-KOKKOS_INLINE_FUNCTION Real bsq(REDUCE_FUNCTION_ARGS_MESH)
-{
-    FourVectors Dtmp;
-    GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
-    return dot(Dtmp.bcon, Dtmp.bcov);
-}
-KOKKOS_INLINE_FUNCTION Real gas_pres(REDUCE_FUNCTION_ARGS_MESH)
-{
-    return (gam - 1) * P(m_p.UU, k, j, i);
-}
-KOKKOS_INLINE_FUNCTION Real gas_beta(REDUCE_FUNCTION_ARGS_MESH)
-{
-    FourVectors Dtmp;
-    GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
-    return ((gam - 1) * P(m_p.UU, k, j, i))/(0.5*(dot(Dtmp.bcon, Dtmp.bcov) + SMALL));
-}
+// Shorter names for the reductions we use here
 Real MaxBsq(MeshData<Real> *md)
 {
-    return Reductions::DomainReduction(md, UserHistoryOperation::max, bsq, 0.0);
+    return Reductions::DomainReduction<Reductions::Var::bsq, Real>(md, UserHistoryOperation::max);
 }
 Real MaxPressure(MeshData<Real> *md)
 {
-    return Reductions::DomainReduction(md, UserHistoryOperation::max, gas_pres, 0.0);
+    return Reductions::DomainReduction<Reductions::Var::gas_pressure, Real>(md, UserHistoryOperation::max);
 }
 Real MinBeta(MeshData<Real> *md)
 {
-    return Reductions::DomainReduction(md, UserHistoryOperation::min, gas_beta, 0.0);
+    return Reductions::DomainReduction<Reductions::Var::beta, Real>(md, UserHistoryOperation::min);
 }
 
 void KHARMA::SeedAndNormalizeB(ParameterInput *pin, std::shared_ptr<MeshData<Real>> md)
@@ -109,8 +92,6 @@ void KHARMA::SeedAndNormalizeB(ParameterInput *pin, std::shared_ptr<MeshData<Rea
     const bool use_b_cd = pmesh->packages.AllPackages().count("B_CD");
     const int verbose = pmesh->packages.Get("Globals")->Param<int>("verbose");
 
-    // TODO this should be restructured...
-
     Flag("SeedBField");
     // Seed the magnetic field on each block
     for (auto &pmb : pmesh->block_list) {
@@ -142,7 +123,6 @@ void KHARMA::SeedAndNormalizeB(ParameterInput *pin, std::shared_ptr<MeshData<Rea
         // Calculate current beta_min value
         Real bsq_max, p_max, beta_min;
         if (beta_calc_legacy) {
-            std::cout << "Max is " << MaxBsq(md.get()) << std::endl;
             bsq_max = MPIReduce_once(MaxBsq(md.get()), MPI_MAX);
             p_max = MPIReduce_once(MaxPressure(md.get()), MPI_MAX);
             beta_min = p_max / (0.5 * bsq_max);
@@ -207,8 +187,7 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart)
     // If your problem requires custom boundary conditions, these should be implemented
     // with the problem and assigned to the relevant functions in the "Boundaries" package.
 
-    // Make sure we've built the MeshData object we'll be synchronizing/updating
-    auto &md = pmesh->mesh_data.GetOrAdd("base", 0);
+    auto &md = pmesh->mesh_data.Get();
 
     auto& pkgs = pmesh->packages.AllPackages();
 
@@ -269,8 +248,6 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart)
 
         // This does its own MPI syncs
         B_Cleanup::CleanupDivergence(md);
-
-        B_Cleanup::RemoveExtraFields(pmesh->block_list);
     }
 
     // Finally, synchronize boundary values.
diff --git a/kharma/prob/problem.cpp b/kharma/prob/problem.cpp
index db19c618..b6e57c6e 100644
--- a/kharma/prob/problem.cpp
+++ b/kharma/prob/problem.cpp
@@ -36,7 +36,6 @@
 
 #include "b_field_tools.hpp"
 #include "boundaries.hpp"
-#include "debug.hpp"
 #include "electrons.hpp"
 #include "floors.hpp"
 #include "flux.hpp"
diff --git a/kharma/prob/resize_restart.cpp b/kharma/prob/resize_restart.cpp
index 0a60caaf..07add691 100644
--- a/kharma/prob/resize_restart.cpp
+++ b/kharma/prob/resize_restart.cpp
@@ -35,7 +35,6 @@
 #include "resize_restart.hpp"
 
 #include "b_flux_ct.hpp"
-#include "debug.hpp"
 #include "hdf5_utils.h"
 #include "kharma_utils.hpp"
 #include "interpolation.hpp"
diff --git a/kharma/reductions/reductions.cpp b/kharma/reductions/reductions.cpp
index 07f7f1f7..ebf754b8 100644
--- a/kharma/reductions/reductions.cpp
+++ b/kharma/reductions/reductions.cpp
@@ -36,138 +36,39 @@
 
 #include <parthenon/parthenon.hpp>
 
+// TODO none of this machinery preserves zone locations,
+// which we pretty often would like...
 
-#pragma hd_warning_disable
-Real Reductions::EHReduction(MeshData<Real> *md, UserHistoryOperation op, std::function<Real(REDUCE_FUNCTION_ARGS_EH)> fn, int zone)
+std::shared_ptr<KHARMAPackage> Reductions::Initialize(ParameterInput *pin, std::shared_ptr<Packages_t>& packages)
 {
-    Flag("EHReduction");
-    auto pmesh = md->GetMeshPointer();
-
-    Real result = 0.;
-    for (auto &pmb : pmesh->block_list) {
-        // If we're on the inner edge
-        if (pmb->boundary_flag[parthenon::BoundaryFace::inner_x1] == BoundaryFlag::user) {
-            const auto& pars = pmb->packages.Get("GRMHD")->AllParams();
-            const Real gam = pars.Get<Real>("gamma");
-
-            auto& rc = pmb->meshblock_data.Get();
-            PackIndexMap prims_map, cons_map;
-            const auto& P = rc->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("Primitive")}, prims_map);
-            const auto& U = rc->PackVariablesAndFluxes(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
-            const VarMap m_u(cons_map, true), m_p(prims_map, false);
-
-            IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
-            IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
-            IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);
-            const auto& G = pmb->coords;
-
-            Real block_result; 
-            switch(op) {
-            case UserHistoryOperation::sum: {
-                Kokkos::Sum<Real> sum_reducer(block_result);
-                pmb->par_reduce("accretion_sum", kb.s, kb.e, jb.s, jb.e, ib.s+zone, ib.s+zone,
-                    KOKKOS_LAMBDA (const int &k, const int &j, const int &i, double &local_result) {
-                        local_result += fn(G, P, m_p, U, m_u, gam, k, j, i) * G.Dxc<3>(k) * G.Dxc<2>(j);
-                    }
-                , sum_reducer);
-                result += block_result;
-                break;
-            }
-            case UserHistoryOperation::max: {
-                Kokkos::Max<Real> max_reducer(block_result);
-                pmb->par_reduce("accretion_sum", kb.s, kb.e, jb.s, jb.e, ib.s+zone, ib.s+zone,
-                    KOKKOS_LAMBDA (const int &k, const int &j, const int &i, double &local_result) {
-                        const Real val = fn(G, P, m_p, U, m_u, gam, k, j, i) * G.Dxc<3>(k) * G.Dxc<2>(j);
-                        if (val > local_result) local_result = val;
-                    }
-                , max_reducer);
-                if (block_result > result) result = block_result;
-                break;
-            }
-            case UserHistoryOperation::min: {
-                Kokkos::Min<Real> min_reducer(block_result);
-                pmb->par_reduce("accretion_sum", kb.s, kb.e, jb.s, jb.e, ib.s+zone, ib.s+zone,
-                    KOKKOS_LAMBDA (const int &k, const int &j, const int &i, double &local_result) {
-                        const Real val = fn(G, P, m_p, U, m_u, gam, k, j, i) * G.Dxc<3>(k) * G.Dxc<2>(j);
-                        if (val < local_result) local_result = val;
-                    }
-                , min_reducer);
-                if (block_result < result) result = block_result;
-                break;
-            }
-            }
-        }
-    }
-
-    EndFlag();
-    return result;
-}
-
-#pragma hd_warning_disable
-Real Reductions::DomainReduction(MeshData<Real> *md, UserHistoryOperation op, std::function<Real(REDUCE_FUNCTION_ARGS_MESH)> fn, Real arg)
-{
-    Flag("DomainReduction");
-    auto pmesh = md->GetMeshPointer();
-
-    // TODO TODO MESHDATA THIS
-    Real result = 0.;
-    const auto& pars = pmesh->packages.Get("GRMHD")->AllParams();
-    const Real gam = pars.Get<Real>("gamma");
-
-    PackIndexMap prims_map, cons_map;
-    const auto& P = md->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("Primitive")}, prims_map);
-    const auto& U = md->PackVariablesAndFluxes(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
-    const VarMap m_u(cons_map, true), m_p(prims_map, false);
-
-    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
-    IndexRange ib = pmb0->cellbounds.GetBoundsI(IndexDomain::interior);
-    IndexRange jb = pmb0->cellbounds.GetBoundsJ(IndexDomain::interior);
-    IndexRange kb = pmb0->cellbounds.GetBoundsK(IndexDomain::interior);
-    IndexRange block = IndexRange{0, U.GetDim(5) - 1};
-    
-    switch(op) {
-    case UserHistoryOperation::sum: {
-        Kokkos::Sum<Real> sum_reducer(result);
-        pmb0->par_reduce("domain_sum", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-            KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, double &local_result) {
-                const auto& G = U.GetCoords(b);
-                local_result += fn(G, P(b), m_p, U(b), m_u, gam, k, j, i, arg) * G.Dxc<3>(k) * G.Dxc<2>(j) * G.Dxc<1>(i);
-            }
-        , sum_reducer);
-        break;
-    }
-    case UserHistoryOperation::max: {
-        Kokkos::Max<Real> max_reducer(result);
-        pmb0->par_reduce("domain_max", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-            KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, double &local_result) {
-                const auto& G = U.GetCoords(b);
-                const Real val = fn(G, P(b), m_p, U(b), m_u, gam, k, j, i, arg) * G.Dxc<3>(k) * G.Dxc<2>(j) * G.Dxc<1>(i);
-                if (val > local_result) local_result = val;
-            }
-        , max_reducer);
-        break;
-    }
-    case UserHistoryOperation::min: {
-        Kokkos::Min<Real> min_reducer(result);
-        pmb0->par_reduce("domain_min", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-            KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, double &local_result) {
-                const auto& G = U.GetCoords(b);
-                const Real val = fn(G, P(b), m_p, U(b), m_u, gam, k, j, i, arg) * G.Dxc<3>(k) * G.Dxc<2>(j) * G.Dxc<1>(i);
-                if (val < local_result) local_result = val;
-            }
-        , min_reducer);
-        break;
-    }
-    }
-
-    EndFlag();
-    return result;
+    auto pkg = std::make_shared<KHARMAPackage>("Reductions");
+    Params &params = pkg->AllParams();
+
+    // These pools are vectors of Reducers which operate on vectors (or scalars)
+    // They exist to allow several reductions to be in-flight at once to hide latency
+    // (even reductions over vectors, as with the different flags)
+    std::vector<Reduce<std::vector<int>>> vector_int_reduce_pool;
+    params.Add("vector_int_reduce_pool", vector_int_reduce_pool, true);
+    std::vector<Reduce<std::vector<Real>>> vector_reduce_pool;
+    params.Add("vector_reduce_pool", vector_reduce_pool, true);
+    std::vector<Reduce<int>> int_reduce_pool;
+    params.Add("int_reduce_pool", int_reduce_pool, true);
+    std::vector<Reduce<Real>> reduce_pool;
+    params.Add("reduce_pool", reduce_pool, true);
+
+    std::vector<AllReduce<std::vector<int>>> vector_int_allreduce_pool;
+    params.Add("vector_int_allreduce_pool", vector_int_allreduce_pool, true);
+    std::vector<AllReduce<std::vector<Real>>> vector_allreduce_pool;
+    params.Add("vector_allreduce_pool", vector_allreduce_pool, true);
+    std::vector<AllReduce<int>> int_allreduce_pool;
+    params.Add("int_allreduce_pool", int_allreduce_pool, true);
+    std::vector<AllReduce<Real>> allreduce_pool;
+    params.Add("allreduce_pool", allreduce_pool, true);
+
+    return pkg;
 }
 
-/**
- * Counts occurrences of a particular flag value
- * 
- */
+// Flag reductions: local
 int Reductions::CountFlag(MeshData<Real> *md, std::string field_name, const int& flag_val, IndexDomain domain, bool is_bitflag)
 {
     auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
@@ -193,10 +94,11 @@ int Reductions::CountFlag(MeshData<Real> *md, std::string field_name, const int&
     return n_flag;
 }
 
-int Reductions::CountFlags(MeshData<Real> *md, std::string field_name, std::map<int, std::string> flag_values, IndexDomain domain, int verbose, bool is_bitflag)
+#define MAX_NFLAGS 20
+
+std::vector<int> Reductions::CountFlags(MeshData<Real> *md, std::string field_name, const std::map<int, std::string> &flag_values, IndexDomain domain, bool is_bitflag)
 {
     Flag("CountFlags_"+field_name);
-    int nflags = 0;
     auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
 
     // Pack variables
@@ -209,85 +111,82 @@ int Reductions::CountFlags(MeshData<Real> *md, std::string field_name, std::map<
     IndexRange kb = md->GetBoundsK(domain);
     IndexRange block = IndexRange{0, flag.GetDim(5) - 1};
 
-    // Count all nonzero (technically, >0) values
+    const int n_of_flags = flag_values.size();
+    int flag_val_list[MAX_NFLAGS];
+    int f=0;
+    for (auto &flag : flag_values) {
+        flag_val_list[f] = flag.first;
+        f++;
+    }
+
+    // Count all nonzero (technically, >0) values,
+    // and all values of each 
     // This works for pflags or fflags, so long as they're separate
     // We don't count negative pflags as they denote zones that shouldn't be fixed
-    Kokkos::Sum<int> sum_reducer(nflags);
+    Reductions::array_type<int, MAX_NFLAGS> flag_reducer;
     pmb0->par_reduce("count_flags", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-        KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, int &local_result) {
-            if ((int) flag(b, 0, k, j, i) > 0) ++local_result;
+        KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, 
+                       Reductions::array_type<int, MAX_NFLAGS> &local_result) {
+            if ((int) flag(b, 0, k, j, i) > 0) ++local_result.my_array[0];
+            for (int f=0; f<n_of_flags; f++)
+                if ((is_bitflag && static_cast<int>(flag(b, 0, k, j, i)) & flag_val_list[f]) ||
+                    (!is_bitflag && static_cast<int>(flag(b, 0, k, j, i)) == flag_val_list[f]))
+                ++local_result.my_array[f+1];
         }
-    , sum_reducer);
-
-    // TODO TODO REPLACE ABOVE WITH SOMETHING LIKE:
-    // array_sum::array_type<Real, 2> res;
-    // parthenon::par_reduce(parthenon::loop_pattern_mdrange_tag, "RadiationResidual1",
-    //                         DevExecSpace(), 0, mout->NumBlocks()-1,
-    //                         0, nang1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-    // KOKKOS_LAMBDA(const int b, const int n, const int k, const int j, const int i,
-    //                 array_sum::array_type<Real, 2>& dsum) {
-    //     dsum.my_array[0] += m::abs(iiter(b,n,k,j,i) - iout(b,n,k,j,i));
-    //     dsum.my_array[1] += iout(b,n,k,j,i);
-    // }, array_sum::GlobalSum<Real, Kokkos::HostSpace, 2>(res));
+    , Reductions::ArraySum<int, DevExecSpace, MAX_NFLAGS>(flag_reducer));
+    
+    std::vector<int> n_each_flag;
+    for (int f=0; f<n_of_flags+1; f++)
+        n_each_flag.push_back(flag_reducer.my_array[f]);
+    
+    EndFlag();
+    return n_each_flag;
+}
 
-    // Need the total on all ranks to evaluate the if statement below
-    static AllReduce<int> n_tot;
-    n_tot.val = nflags;
-    n_tot.StartReduce(MPI_SUM);
-    while (n_tot.CheckReduce() == TaskStatus::incomplete);
-    nflags = n_tot.val;
+// Flag reductions: global
+void Reductions::StartFlagReduce(MeshData<Real> *md, std::string field_name, const std::map<int, std::string> &flag_values, IndexDomain domain, bool is_bitflag, int channel)
+{
+    Start<std::vector<int>>(md, channel, CountFlags(md, field_name, flag_values, domain, is_bitflag), MPI_SUM);
+}
 
-    // If necessary, count each flag
-    // This is slow, but it can be slow: it's not called for normal operation
-    if (verbose > 0 && nflags > 0) {
-        // Overlap reductions to save time
-        // ...at the cost of considerable complexity...
+std::vector<int> Reductions::CheckFlagReduceAndPrintHits(MeshData<Real> *md, std::string field_name, const std::map<int, std::string> &flag_values,
+                                                     IndexDomain domain, bool is_bitflag, int channel)
+{
+    Flag("CheckFlagReduce");
+    const auto& pmesh = md->GetMeshPointer();
+    const auto& verbose = pmesh->packages.Get("Globals")->Param<int>("flag_verbose");
 
-        // TODO TODO eliminate static reducers, they crash the program after it finishes
-        static Reduce<int> n_cells_r;
-        n_cells_r.val = (block.e - block.s + 1) * (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1);
-        n_cells_r.StartReduce(0, MPI_SUM);
+    // Get the relevant reducer and result
+    auto& pars = md->GetMeshPointer()->packages.Get("Reductions")->AllParams();
+    auto *vector_int_reduce_pool = pars.GetMutable<std::vector<Reduce<std::vector<int>>>>("vector_int_reduce_pool");
+    auto& vector_int_reduce = (*vector_int_reduce_pool)[channel];
 
-        static std::vector<std::shared_ptr<Reduce<int>>> reducers;
-        // Initialize reducers if they haven't been
-        if (reducers.size() == 0) {
-            for (auto& status : flag_values) {
-                std::shared_ptr<Reduce<int>> reducer = std::make_shared<Reduce<int>>();
-                reducers.push_back(reducer);
-            }
-        }
-        // Count occurrences of each flag value, assign to a reducer in order
-        int i = 0;
-        for (auto& status : flag_values) {
-            reducers[i]->val = CountFlag(md, field_name, (int) status.first, domain, is_bitflag);
-            reducers[i]->StartReduce(0, MPI_SUM);
-            ++i;
-        }
-        while (n_cells_r.CheckReduce() == TaskStatus::incomplete);
-        const int n_cells = n_cells_r.val;
-        // Check each reducer in order, add to a vector
-        std::vector<int> n_status_present;
-        for (std::shared_ptr<Reduce<int>> reducer : reducers) {
-            while (reducer->CheckReduce() == TaskStatus::incomplete);
-            n_status_present.push_back(reducer->val);
-        }
+    while (vector_int_reduce.CheckReduce() == TaskStatus::incomplete);
+    const std::vector<int> &total_flag_counts = vector_int_reduce.val;
 
+    // Print flags 
+    if (total_flag_counts[0] > 0 && verbose > 0) {
         if (MPIRank0()) {
+            // Always our domain size times total number of blocks
+            IndexRange ib = md->GetBoundsI(domain);
+            IndexRange jb = md->GetBoundsJ(domain);
+            IndexRange kb = md->GetBoundsK(domain);
+            int n_cells = pmesh->nbtotal * (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1);
+
+            int nflags = total_flag_counts[0];
             std::cout << field_name << ": " << nflags << " (" << (int)(((double) nflags )/n_cells * 100) << "% of all cells)" << std::endl;
             if (verbose > 1) {
                 // Print nonzero vector contents against flag names in order
-                int i = 0;
+                int i = 1;
                 for (auto& status : flag_values) {
-                    if (n_status_present[i] > 0) std::cout << status.second << ": " << n_status_present[i] << std::endl;
+                    if (total_flag_counts[i] > 0) std::cout << status.second << ": " << total_flag_counts[i] << std::endl;
                     ++i;
                 }
                 std::cout << std::endl;
             }
         }
-
-        // TODO Print zone locations of bad inversions
     }
 
     EndFlag();
-    return nflags;
+    return total_flag_counts;
 }
diff --git a/kharma/reductions/reductions.hpp b/kharma/reductions/reductions.hpp
index 11dd709f..b84a8518 100644
--- a/kharma/reductions/reductions.hpp
+++ b/kharma/reductions/reductions.hpp
@@ -33,26 +33,22 @@
  */
 #pragma once
 
-#include "debug.hpp"
+#include "reductions_variables.hpp"
 
 #include "flux_functions.hpp"
 #include "grmhd_functions.hpp"
 #include "types.hpp"
 
-// This is for flux/accretion rate 
-#define REDUCE_FUNCTION_ARGS_EH const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p, \
-                        const VariableFluxPack<Real>& U, const VarMap& m_u, const Real& gam, \
-                        const int& k, const int& j, const int& i
+namespace Reductions {
 
-// Notice this list also includes a generic Real-type argument: this is for denoting a radius or placement.
-// Provided as argument in case reductions at/within/etc multiple places are desired
-// (e.g., disk and jet, inner & outer, multiple radii)
-// TODO take off 'b' from arg list and pass block contents?
-#define REDUCE_FUNCTION_ARGS_MESH const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p, \
-                        const VariableFluxPack<Real>& U, const VarMap& m_u, const Real& gam, \
-                        const int& k, const int& j, const int& i, const Real& arg
+// Think about how to do channels as not ints
+//constexpr enum class Channel{fflag, pflag, iflag, };
 
-namespace Reductions {
+/**
+ * These, too, are a package.
+ * Mostly it exists to keep track of Reducers, so we can clean them up to keep MPI happy.
+ */
+std::shared_ptr<KHARMAPackage> Initialize(ParameterInput *pin, std::shared_ptr<Packages_t>& packages);
 
 /**
  * Perform a reduction using operation 'op' over a spherical shell at the given zone, measured from left side of
@@ -60,15 +56,48 @@ namespace Reductions {
  * As this only runs on innermost blocks, this is intended for accretion/event horizon
  * measurements in black hole simulations.
  */
-Real EHReduction(MeshData<Real> *md, UserHistoryOperation op, std::function<Real(REDUCE_FUNCTION_ARGS_EH)> fn, int zone);
+template<Var var, typename T>
+T EHReduction(MeshData<Real> *md, UserHistoryOperation op, int zone);
 
 /**
- * Perform a reduction using operation 'op' over all zones.
- * The extra 'arg' is passed as the last argument to the device-side function.
- * It is generally used to denote a radius inside, outside, or at which the reduction should be taken.
- * This should be used for 2D shell sums not at the EH: just divide the function result by the zone spacing dx1.
+ * Perform a reduction using operation 'op' over a given domain
+ * This should be used for all 2D shell sums not around the EH:
+ * Just set equal min/max, 2D slices are detected
  */
-Real DomainReduction(MeshData<Real> *md, UserHistoryOperation op, std::function<Real(REDUCE_FUNCTION_ARGS_MESH)> fn, Real arg);
+template<Var var, typename T>
+T DomainReduction(MeshData<Real> *md, UserHistoryOperation op, const GReal startx[3], const GReal stopx[3], int channel=-1);
+template<Var var, typename T>
+T DomainReduction(MeshData<Real> *md, UserHistoryOperation op, int channel=-1) {
+    const GReal startx[3] = {std::numeric_limits<Real>::min(), std::numeric_limits<Real>::min(), std::numeric_limits<Real>::min()};
+    const GReal stopx[3] = {std::numeric_limits<Real>::max(), std::numeric_limits<Real>::max(), std::numeric_limits<Real>::max()};
+    return DomainReduction<var, T>(md, op, startx, stopx, channel);
+}
+
+/**
+ * Start reductions with a value you have on hand
+ */
+template<typename T>
+void Start(MeshData<Real> *md, int channel, T val, MPI_Op op);
+template<typename T>
+void StartToAll(MeshData<Real> *md, int channel, T val, MPI_Op op);
+
+/**
+ * Check the results of reductions that have been started.
+ * Remember channels are COUNTED SEPARATELY between the 4 lists:
+ * Real/default, int, vector<Real> and vector<int> (i.e. Flags)
+ */
+template<typename T>
+T Check(MeshData<Real> *md, int channel);
+template<typename T>
+T CheckOnAll(MeshData<Real> *md, int channel);
+
+/**
+ * Check the results of reductions that have been started.
+ * Remember channels are COUNTED SEPARATELY between the 4 lists:
+ * Real/default, int, vector<Real> and vector<int> (i.e. Flags)
+ */
+template<typename T>
+T Check(MeshData<Real> *md, int channel);
 
 /**
  * Count instances of a particular flag value in the named field.
@@ -78,11 +107,24 @@ Real DomainReduction(MeshData<Real> *md, UserHistoryOperation op, std::function<
 int CountFlag(MeshData<Real> *md, std::string field_name, const int& flag_val, IndexDomain domain, bool is_bitflag);
 
 /**
- * Count instances of a particular flag value in the named field.
+ * Count instances of all flags in the named field.
  * is_bitflag specifies whether multiple flags may be present and will be orthogonal (e.g. FFlag),
  * or whether flags receive consecutive integer values.
- * TODO could return numbers for all flags instead of just printing
  */
-int CountFlags(MeshData<Real> *md, std::string field_name, std::map<int, std::string> flag_values, IndexDomain domain, int verbose, bool is_bitflag);
+std::vector<int> CountFlags(MeshData<Real> *md, std::string field_name, const std::map<int, std::string> &flag_values, IndexDomain domain, bool is_bitflag);
+
+/**
+ * Determine number of local flags hit with CountFlags, and send the value over MPI reducer 'channel'
+ */
+void StartFlagReduce(MeshData<Real> *md, std::string field_name, const std::map<int, std::string> &flag_values, IndexDomain domain, bool is_bitflag, int channel);
+
+/**
+ * Check a flag's MPI reduction and print any flags hit
+ */
+std::vector<int> CheckFlagReduceAndPrintHits(MeshData<Real> *md, std::string field_name, const std::map<int, std::string> &flag_values,
+                                             IndexDomain domain, bool is_bitflag, int channel);
 
 } // namespace Reductions
+
+// See the file for why we do this
+#include "reductions_impl.hpp"
diff --git a/kharma/reductions/reductions_impl.hpp b/kharma/reductions/reductions_impl.hpp
new file mode 100644
index 00000000..df42b1b8
--- /dev/null
+++ b/kharma/reductions/reductions_impl.hpp
@@ -0,0 +1,296 @@
+/* 
+ *  File: reductions_variables.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+// This is a weird header. It's included at the end of reductions.hpp, in order to provide
+// the source of the below templates to other files to instantiate.
+// Otherwise, it operates like a normal cpp (NOT hpp) file.
+
+// Satisfy IDE parsers who aren't wise to our schemes
+#include "reductions.hpp"
+
+template<typename T, bool all_reduce>
+inline std::string GetPoolName()
+{
+    if constexpr (all_reduce) {
+        if constexpr (std::is_same<T, Real>::value)
+            return "allreduce_pool";
+        if constexpr (std::is_same<T, int>::value)
+            return "int_allreduce_pool";
+        if constexpr (std::is_same<T, std::vector<Real>>::value)
+            return "vector_allreduce_pool";
+        if constexpr (std::is_same<T, std::vector<int>>::value)
+            return "vector_int_allreduce_pool";
+    } else {
+        if constexpr (std::is_same<T, Real>::value)
+            return "reduce_pool";
+        if constexpr (std::is_same<T, int>::value)
+            return "int_reduce_pool";
+        if constexpr (std::is_same<T, std::vector<Real>>::value)
+            return "vector_reduce_pool";
+        if constexpr (std::is_same<T, std::vector<int>>::value)
+            return "vector_int_reduce_pool";
+    }
+}
+
+// MPI reduction starts
+template<typename T>
+void Reductions::Start(MeshData<Real> *md, int channel, T val, MPI_Op op)
+{
+    // Get the relevant reducer
+    const std::string pool_name = GetPoolName<T, false>();
+    auto& pars = md->GetMeshPointer()->packages.Get("Reductions")->AllParams();
+    auto *reduce_pool = pars.GetMutable<std::vector<Reduce<T>>>(pool_name);
+    while (reduce_pool->size() <= channel) reduce_pool->push_back(Reduce<T>());
+    auto& reduce = (*reduce_pool)[channel];
+    // Fill with flags
+    reduce.val = val;
+    reduce.StartReduce(0, op);
+}
+template<typename T>
+void Reductions::StartToAll(MeshData<Real> *md, int channel, T val, MPI_Op op)
+{
+    // Get the relevant reducer
+    const std::string pool_name = GetPoolName<T, true>();
+    auto& pars = md->GetMeshPointer()->packages.Get("Reductions")->AllParams();
+    auto *allreduce_pool = pars.GetMutable<std::vector<AllReduce<T>>>(pool_name);
+    while (allreduce_pool->size() <= channel) allreduce_pool->push_back(AllReduce<T>());
+    auto& reduce = (*allreduce_pool)[channel];
+    // Fill with flags
+    reduce.val = val;
+    reduce.StartReduce(op);
+}
+
+// MPI reduction checks
+template<typename T>
+T Reductions::Check(MeshData<Real> *md, int channel)
+{
+    // Get the relevant reducer and result
+    const std::string pool_name = GetPoolName<T, false>();
+    auto& pars = md->GetMeshPointer()->packages.Get("Reductions")->AllParams();
+    auto *reduce_pool = pars.GetMutable<std::vector<Reduce<T>>>(pool_name);
+    auto& reducer = (*reduce_pool)[channel];
+
+    while (reducer.CheckReduce() == TaskStatus::incomplete);
+    return reducer.val;
+}
+template<typename T>
+T Reductions::CheckOnAll(MeshData<Real> *md, int channel)
+{
+    // Get the relevant reducer and result
+    const std::string pool_name = GetPoolName<T, true>();
+    auto& pars = md->GetMeshPointer()->packages.Get("Reductions")->AllParams();
+    auto *reduce_pool = pars.GetMutable<std::vector<AllReduce<T>>>(pool_name);
+    auto& reducer = (*reduce_pool)[channel];
+
+    while (reducer.CheckReduce() == TaskStatus::incomplete);
+    return reducer.val;
+}
+
+#define REDUCE_FUNCTION_CALL G, P(b), m_p, U(b), m_u, cmax(b), cmin(b), emhd_params, gam, k, j, i
+
+template<Reductions::Var var, typename T>
+T Reductions::EHReduction(MeshData<Real> *md, UserHistoryOperation op, int zone)
+{
+    Flag("EHReduction");
+    auto pmesh = md->GetMeshPointer();
+
+    const auto& pars = pmesh->packages.Get("GRMHD")->AllParams();
+    const Real gam = pars.Get<Real>("gamma");
+    const auto& emhd_params = EMHD::GetEMHDParameters(pmesh->packages);
+
+    PackIndexMap prims_map, cons_map;
+    const auto& P = md->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("Primitive")}, prims_map);
+    const auto& U = md->PackVariablesAndFluxes(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
+    const VarMap m_u(cons_map, true), m_p(prims_map, false);
+    const auto& cmax = md->PackVariables(std::vector<std::string>{"Flux.cmax"});
+    const auto& cmin = md->PackVariables(std::vector<std::string>{"Flux.cmin"});
+
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+    IndexRange ib = pmb0->cellbounds.GetBoundsI(IndexDomain::interior);
+    IndexRange jb = pmb0->cellbounds.GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = pmb0->cellbounds.GetBoundsK(IndexDomain::interior);
+    IndexRange block = IndexRange{0, U.GetDim(5) - 1};
+
+    T result(0);
+    int nb = pmesh->GetNumMeshBlocksThisRank();
+    for (int iblock=0; iblock < nb; iblock++) {
+        const auto &pmb = pmesh->block_list[iblock];
+        // Inner-edge blocks only for speed
+        if (pmb->boundary_flag[parthenon::BoundaryFace::inner_x1] == BoundaryFlag::user) {
+            const auto& G = pmb->coords;
+            T block_result;
+            switch(op) {
+            case UserHistoryOperation::sum: {
+                Kokkos::Sum<T> sum_reducer(block_result);
+                pmb->par_reduce("accretion_sum", iblock, iblock, kb.s, kb.e, jb.s, jb.e, ib.s+zone, ib.s+zone,
+                    KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, T &local_result) {
+                        local_result += reduction_var<var>(REDUCE_FUNCTION_CALL) * G.Dxc<3>(k) * G.Dxc<2>(j);
+                    }
+                , sum_reducer);
+                result += block_result;
+                break;
+            }
+            case UserHistoryOperation::max: {
+                Kokkos::Max<T> max_reducer(block_result);
+                pmb->par_reduce("accretion_sum", iblock, iblock, kb.s, kb.e, jb.s, jb.e, ib.s+zone, ib.s+zone,
+                    KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, T &local_result) {
+                        const T val = reduction_var<var>(REDUCE_FUNCTION_CALL) * G.Dxc<3>(k) * G.Dxc<2>(j);
+                        if (val > local_result) local_result = val;
+                    }
+                , max_reducer);
+                if (block_result > result) result = block_result;
+                break;
+            }
+            case UserHistoryOperation::min: {
+                Kokkos::Min<T> min_reducer(block_result);
+                pmb->par_reduce("accretion_sum", iblock, iblock, kb.s, kb.e, jb.s, jb.e, ib.s+zone, ib.s+zone,
+                    KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, T &local_result) {
+                        const T val = reduction_var<var>(REDUCE_FUNCTION_CALL) * G.Dxc<3>(k) * G.Dxc<2>(j);
+                        if (val < local_result) local_result = val;
+                    }
+                , min_reducer);
+                if (block_result < result) result = block_result;
+                break;
+            }
+            }
+        }
+    }
+
+    EndFlag();
+    return result;
+}
+
+#define INSIDE (x[1] > startx[0] && x[2] > startx[1] && x[3] > startx[2]) && \
+                (trivial[0] ? x[1] < startx[0] + G.Dxc<1>(i) : x[1] < stopx[0]) && \
+                (trivial[1] ? x[2] < startx[1] + G.Dxc<2>(j) : x[2] < stopx[1]) && \
+                (trivial[2] ? x[3] < startx[2] + G.Dxc<3>(k) : x[3] < stopx[2])
+
+// TODO additionally template on return type to avoid counting flags with Reals
+template<Reductions::Var var, typename T>
+T Reductions::DomainReduction(MeshData<Real> *md, UserHistoryOperation op, const GReal startx[3], const GReal stopx[3], int channel)
+{
+    Flag("DomainReduction");
+    auto pmesh = md->GetMeshPointer();
+
+    const auto& pars = pmesh->packages.Get("GRMHD")->AllParams();
+    const Real gam = pars.Get<Real>("gamma");
+    const auto& emhd_params = EMHD::GetEMHDParameters(pmesh->packages);
+
+    // Just pass in everything we might want. Probably slow?
+    PackIndexMap prims_map, cons_map;
+    const auto& P = md->PackVariables(std::vector<MetadataFlag>{Metadata::GetUserFlag("Primitive")}, prims_map);
+    const auto& U = md->PackVariablesAndFluxes(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
+    const VarMap m_u(cons_map, true), m_p(prims_map, false);
+    const auto& cmax = md->PackVariables(std::vector<std::string>{"Flux.cmax"});
+    const auto& cmin = md->PackVariables(std::vector<std::string>{"Flux.cmin"});
+
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+    IndexRange ib = pmb0->cellbounds.GetBoundsI(IndexDomain::interior);
+    IndexRange jb = pmb0->cellbounds.GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = pmb0->cellbounds.GetBoundsK(IndexDomain::interior);
+    IndexRange block = IndexRange{0, U.GetDim(5) - 1};
+
+    bool trivial_tmp[3] = {false, false, false};
+    VLOOP if(startx[v] == stopx[v]) {
+        trivial_tmp[v] = true;
+    }
+    const bool trivial[3] = {trivial_tmp[0], trivial_tmp[1], trivial_tmp[2]};
+
+    T result = 0.;
+    MPI_Op mop;
+    switch(op) {
+    case UserHistoryOperation::sum: {
+        Kokkos::Sum<T> sum_reducer(result);
+        pmb0->par_reduce("domain_sum", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, T &local_result) {
+                const auto& G = U.GetCoords(b);
+                GReal x[4];
+                G.coord_embed(k, j, i, Loci::center, x);
+                if(INSIDE) {
+                    local_result += reduction_var<var>(REDUCE_FUNCTION_CALL) *
+                        (!trivial[2]) * G.Dxc<3>(k) * (!trivial[1]) * G.Dxc<2>(j) * (!trivial[0]) * G.Dxc<1>(i);
+                }
+            }
+        , sum_reducer);
+        mop = MPI_SUM;
+        break;
+    }
+    case UserHistoryOperation::max: {
+        Kokkos::Max<T> max_reducer(result);
+        pmb0->par_reduce("domain_max", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, T &local_result) {
+                const auto& G = U.GetCoords(b);
+                GReal x[4];
+                G.coord_embed(k, j, i, Loci::center, x);
+                if(INSIDE) {
+                    const Real val = reduction_var<var>(REDUCE_FUNCTION_CALL) *
+                        (!trivial[2]) * G.Dxc<3>(k) * (!trivial[1]) * G.Dxc<2>(j) * (!trivial[0]) * G.Dxc<1>(i);
+                    if (val > local_result) local_result = val;
+                }
+            }
+        , max_reducer);
+        mop = MPI_MAX;
+        break;
+    }
+    case UserHistoryOperation::min: {
+        Kokkos::Min<T> min_reducer(result);
+        pmb0->par_reduce("domain_min", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, T &local_result) {
+                const auto& G = U.GetCoords(b);
+                GReal x[4];
+                G.coord_embed(k, j, i, Loci::center, x);
+                if(INSIDE) {
+                    const Real val = reduction_var<var>(REDUCE_FUNCTION_CALL) *
+                        (!trivial[2]) * G.Dxc<3>(k) * (!trivial[1]) * G.Dxc<2>(j) * (!trivial[0]) * G.Dxc<1>(i);
+                    if (val < local_result) local_result = val;
+                }
+            }
+        , min_reducer);
+        mop = MPI_MIN;
+        break;
+    }
+    }
+
+    // Optionally start an MPI reducer w/given index, so the mesh-wide result is ready when we want it
+    if (channel >= 0) {
+        Start<T>(md, channel, result, mop);
+    }
+
+    EndFlag();
+    return result;
+}
+
+#undef INSIDE
+#undef REDUCE_FUNCTION_CALL
diff --git a/kharma/reductions/reductions_types.hpp b/kharma/reductions/reductions_types.hpp
new file mode 100644
index 00000000..7ebca674
--- /dev/null
+++ b/kharma/reductions/reductions_types.hpp
@@ -0,0 +1,121 @@
+/* 
+ *  File: reductions_variables.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+// This file is included with types.hpp,
+// so that all files have access to the extra Kokkos reduction machinery
+
+#include "decs.hpp"
+
+// Reduction types: teach Kokkos to keep a 3-int index, make it usable
+// See grmhd.cpp timestep calc for example
+namespace Kokkos {
+template <>
+struct reduction_identity<std::tuple<int, int, int>> {
+    KOKKOS_FORCEINLINE_FUNCTION constexpr static std::tuple<int, int, int> min() {
+        int max = std::numeric_limits<int>::max();
+        return std::tuple<int, int, int>{max, max, max};
+    }
+};
+}
+namespace Reductions {
+// Types for 3-index reduction
+typedef Kokkos::MinMaxLoc<Real, std::tuple<int, int, int>> Reduce3;
+typedef Reduce3::value_type Reduce3v;
+
+// Array type for reducing arbitrary numbers of reals
+template <class ScalarType, int N>
+struct array_type {
+    ScalarType my_array[N];
+
+    KOKKOS_INLINE_FUNCTION
+    array_type() { init(); }
+
+    KOKKOS_INLINE_FUNCTION
+    array_type(const array_type& rhs) {
+        for (int i = 0; i < N; i++) {
+            my_array[i] = rhs.my_array[i];
+        }
+    }
+
+    KOKKOS_INLINE_FUNCTION void init() {
+        for (int i = 0; i < N; i++) {
+            my_array[i] = 0;
+        }
+    }
+
+    KOKKOS_INLINE_FUNCTION array_type&
+    operator+=(const array_type& src) {
+        for (int i = 0; i < N; i++) {
+            my_array[i] += src.my_array[i];
+        }
+        return *this;
+    }
+};
+
+template <class T, class Space, int N>
+struct ArraySum {
+ public:
+  // Required
+  typedef ArraySum reducer;
+  typedef array_type<T, N> value_type;
+  typedef Kokkos::View<value_type*, Space, Kokkos::MemoryUnmanaged>
+      result_view_type;
+
+ private:
+  value_type& value;
+
+ public:
+  KOKKOS_INLINE_FUNCTION
+  ArraySum(value_type& value_) : value(value_) {}
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    dest += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const { val.init(); }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const { return value; }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const { return result_view_type(&value, 1); }
+
+  KOKKOS_INLINE_FUNCTION
+  bool references_scalar() const { return true; }
+};
+}
\ No newline at end of file
diff --git a/kharma/reductions/reductions_variables.hpp b/kharma/reductions/reductions_variables.hpp
new file mode 100644
index 00000000..118ae5a9
--- /dev/null
+++ b/kharma/reductions/reductions_variables.hpp
@@ -0,0 +1,249 @@
+/* 
+ *  File: reductions_variables.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include "decs.hpp"
+#include "types.hpp"
+
+#include "emhd.hpp"
+#include "flux_functions.hpp"
+
+using namespace parthenon;
+
+#define REDUCE_FUNCTION_ARGS const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p, \
+                        const VariableFluxPack<Real>& U, const VarMap& m_u, \
+                        const VariablePack<Real>& cmax, const VariablePack<Real>& cmin,\
+                        const EMHD::EMHD_parameters& emhd_params, const Real& gam, const int& k, const int& j, const int& i
+
+namespace Reductions {
+
+// Add any new reduction variables to this list, then implementations below
+// Not elegant, but fast & portable.
+// HIPCC doesn't like passing function pointers as we used to do,
+// and it doesn't vectorize anyway. Look forward to more of this pattern in the code
+enum class Var{phi, bsq, gas_pressure, mag_pressure, beta,
+               mdot, edot, ldot, mdot_flux, edot_flux, ldot_flux, eht_lum, jet_lum,
+               nan_ctop, zero_ctop, neg_rho, neg_u, neg_rhout};
+
+// Function template for all reductions.
+template<Var T>
+Real reduction_var(REDUCE_FUNCTION_ARGS);
+
+// Can also sum the hemispheres independently to be fancy (TODO?)
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::phi>(REDUCE_FUNCTION_ARGS)
+{
+    // \Phi == \int |*F^1^0| * gdet * dx2 * dx3 == \int |B1| * gdet * dx2 * dx3
+    return 0.5 * m::abs(U(m_u.B1, k, j, i)); // factor of gdet already in cons.B
+}
+
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::bsq>(REDUCE_FUNCTION_ARGS)
+{
+    FourVectors Dtmp;
+    GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
+    return dot(Dtmp.bcon, Dtmp.bcov);
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::gas_pressure>(REDUCE_FUNCTION_ARGS)
+{
+    return (gam - 1) * P(m_p.UU, k, j, i);
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::beta>(REDUCE_FUNCTION_ARGS)
+{
+    FourVectors Dtmp;
+    GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
+    return ((gam - 1) * P(m_p.UU, k, j, i))/(0.5*(dot(Dtmp.bcon, Dtmp.bcov) + SMALL));
+}
+
+// Accretion rates: return a zone's contribution to the surface integral
+// forming each rate measurement.
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::mdot>(REDUCE_FUNCTION_ARGS)
+{
+    Real ucon[GR_DIM];
+    GRMHD::calc_ucon(G, P, m_p, k, j, i, Loci::center, ucon);
+    // \dot{M} == \int rho * u^1 * gdet * dx2 * dx3
+    return -P(m_p.RHO, k, j, i) * ucon[X1DIR] * G.gdet(Loci::center, j, i);
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::edot>(REDUCE_FUNCTION_ARGS)
+{
+    FourVectors Dtmp;
+    Real T1[GR_DIM];
+    GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
+    Flux::calc_tensor(P, m_p, Dtmp, emhd_params, gam, k, j, i, X1DIR, T1);
+    // \dot{E} == \int - T^1_0 * gdet * dx2 * dx3
+    return -T1[X0DIR] * G.gdet(Loci::center, j, i);
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::ldot>(REDUCE_FUNCTION_ARGS)
+{
+    FourVectors Dtmp;
+    Real T1[GR_DIM];
+    GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
+    Flux::calc_tensor(P, m_p, Dtmp, emhd_params, gam, k, j, i, X1DIR, T1);
+    // \dot{L} == \int T^1_3 * gdet * dx2 * dx3
+    return T1[X3DIR] * G.gdet(Loci::center, j, i);
+}
+
+// Then we can define the same with fluxes.
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::mdot_flux>(REDUCE_FUNCTION_ARGS)
+{
+    return -U.flux(X1DIR, m_u.RHO, k, j, i);
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::edot_flux>(REDUCE_FUNCTION_ARGS)
+{
+    return (U.flux(X1DIR, m_u.UU, k, j, i) - U.flux(X1DIR, m_u.RHO, k, j, i));
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::ldot_flux>(REDUCE_FUNCTION_ARGS)
+{
+    return U.flux(X1DIR, m_u.U3, k, j, i);
+}
+
+// Luminosity proxy from (for example) Porth et al 2019.
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::eht_lum>(REDUCE_FUNCTION_ARGS)
+{
+    FourVectors Dtmp;
+    GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
+    Real rho = P(m_p.RHO, k, j, i);
+    Real Pg = (gam - 1.) * P(m_p.UU, k, j, i);
+    Real Bmag = m::sqrt(dot(Dtmp.bcon, Dtmp.bcov));
+    Real j_eht = rho*rho*rho/Pg/Pg * m::exp(-0.2 * m::cbrt(rho * rho / (Bmag * Pg * Pg)));
+    return j_eht;
+}
+
+// Example of checking extra conditions before adding local results:
+// sums total jet power only at exactly r=radius, for areas with sig > 1
+// TODO version w/E&M power only.  Needs "calc_tensor_EM"
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::jet_lum>(REDUCE_FUNCTION_ARGS)
+{
+    FourVectors Dtmp;
+    Real T1[GR_DIM];
+    GRMHD::calc_4vecs(G, P, m_p, k, j, i, Loci::center, Dtmp);
+    Flux::calc_tensor(P, m_p, Dtmp, emhd_params, gam, k, j, i, X1DIR, T1);
+    // If sigma > 1...
+    if ((dot(Dtmp.bcon, Dtmp.bcov) / P(m_p.RHO, k, j, i)) > 1.) {
+        // Energy flux, like at EH
+        return -T1[X0DIR];
+    } else {
+        return 0.;
+    }
+}
+
+// Diagnostics.  Still have to return Real so we get creative.
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::zero_ctop>(REDUCE_FUNCTION_ARGS)
+{
+    Real is_zero = 0;
+    VLOOP {
+        if(m::max(cmax(v, k, j, i), cmin(v, k, j, i)) <= 0.) {
+            is_zero = 1.; // once per zone
+#if DEBUG
+#ifndef KOKKOS_ENABLE_SYCL
+            printf("ctop zero at %d %d %d along dir %d\n", i, j, k, v+1);
+#endif
+#endif
+        }
+    }
+
+    return is_zero;
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::nan_ctop>(REDUCE_FUNCTION_ARGS)
+{
+    Real is_nan = 0.;
+    VLOOP {
+        if(m::isnan(m::max(cmax(v, k, j, i), cmin(v, k, j, i)))) {
+            is_nan = 1.;
+#if DEBUG
+#ifndef KOKKOS_ENABLE_SYCL
+            printf("ctop NaN at %d %d %d along dir %d\n", i, j, k, v+1);
+#endif
+#endif
+        }
+    }
+
+    return is_nan;
+}
+
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::neg_rhout>(REDUCE_FUNCTION_ARGS)
+{
+    Real is_neg = 0.;
+    if (U(m_u.RHO, k, j, i) < 0.) {
+        is_neg = 1.;
+#if DEBUG
+#ifndef KOKKOS_ENABLE_SYCL
+        printf("Negative rho*u^0 (cons.rho) at %d %d %d\n", i, j, k);
+#endif
+#endif
+    }
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::neg_u>(REDUCE_FUNCTION_ARGS)
+{
+    Real is_neg = 0.;
+    if (P(m_p.UU, k, j, i) < 0.) {
+        is_neg = 1.;
+#if DEBUG
+#ifndef KOKKOS_ENABLE_SYCL
+        printf("Negative internal energy (prims.u) at %d %d %d\n", i, j, k);
+#endif
+#endif
+    }
+}
+template <>
+KOKKOS_INLINE_FUNCTION Real reduction_var<Var::neg_rho>(REDUCE_FUNCTION_ARGS)
+{
+    Real is_neg = 0.;
+    if (P(m_p.RHO, k, j, i) < 0.) {
+        is_neg = 1.;
+#if DEBUG
+#ifndef KOKKOS_ENABLE_SYCL
+        printf("Negative density (prims.rho) at %d %d %d\n", i, j, k);
+#endif
+#endif
+    }
+}
+
+}
+
+#undef REDUCE_FUNCTION_ARGS
\ No newline at end of file
diff --git a/kharma/types.hpp b/kharma/types.hpp
index adeee780..c8eafeab 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -37,6 +37,7 @@
 
 #include "boundary_types.hpp"
 #include "kharma_package.hpp"
+#include "reductions_types.hpp"
 
 #include <parthenon/parthenon.hpp>
 
diff --git a/machines/bp.sh b/machines/bp.sh
index 9eed4109..f7997917 100644
--- a/machines/bp.sh
+++ b/machines/bp.sh
@@ -27,6 +27,14 @@ if [[ $METAL_HOSTNAME == "fermium" ]]; then
   DEVICE_ARCH="TURING75"
   # Nvidia MPI hangs unless I do this
   MPI_EXE=mpirun
+
+  if [[ "$ARGS" == *"cuda"* ]]; then
+    echo "Nothing special for cuda"
+  else
+    # AMD for CPUs
+    CXX_NATIVE=clang++
+    C_NATIVE=clang
+  fi
 fi
 
 if [[ $METAL_HOSTNAME == "ferrum" ]]; then
diff --git a/machines/illinois.sh b/machines/illinois.sh
index f0355603..66506a85 100644
--- a/machines/illinois.sh
+++ b/machines/illinois.sh
@@ -13,9 +13,11 @@ elif [[ $HOST == *".astro.illinois.edu" ]]; then
     HOST_ARCH="ZEN2"
     # BH29 benefits from using just 1 thread/core
     export OMP_NUM_THREADS=64
+    NPROC=64
   else
     # Other machines are Skylake
     HOST_ARCH="SKX"
+    NPROC=36
   fi
 
   # Compile our own HDF5 by default
diff --git a/make.sh b/make.sh
index a891a6bc..35e4df3d 100755
--- a/make.sh
+++ b/make.sh
@@ -110,7 +110,7 @@ if [[ -z "$CXX_NATIVE" ]]; then
   elif which icpx >/dev/null 2>&1; then
     CXX_NATIVE=icpx
     C_NATIVE=icx
-    OMP_FLAG="-fopenmp"
+    OMP_FLAG="-fiopenmp"
   elif which icpc >/dev/null 2>&1; then
     CXX_NATIVE=icpc
     C_NATIVE=icc
@@ -201,9 +201,10 @@ fi
 # Allow for a custom linker program, but use CXX by
 # default as system linker may be older/incompatible
 if [[ -v LINKER ]]; then
-  LINKER="$LINKER"
-else
-  LINKER="$CXX"
+  EXTRA_FLAGS="-DCMAKE_LINKER=$LINKER"
+fi
+if [[ "$ARGS" == *"special_link_line"* ]]; then
+  EXTRA_FLAGS="-DCMAKE_CXX_LINK_EXECUTABLE='<CMAKE_LINKER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>'"
 fi
 
 # Avoid warning on nvcc pragmas Intel doesn't like
@@ -255,6 +256,7 @@ if [[ "$ARGS" == *"hdf5"* && "$ARGS" == *"clean"* ]]; then
 
   echo Configuring HDF5...
 
+  export CFLAGS="-fPIC $CFLAGS"
   CC=$HDF_CC sh configure -C $HDF_EXTRA --prefix=$SOURCE_DIR/external/hdf5 --enable-build-mode=production \
   --disable-dependency-tracking --disable-hl --disable-tests --disable-tools --disable-shared --disable-deprecated-symbols > build-hdf5.log
   sleep 1
@@ -304,8 +306,6 @@ if [[ "$ARGS" == *"clean"* ]]; then
   cmake ..\
     -DCMAKE_C_COMPILER="$CC" \
     -DCMAKE_CXX_COMPILER="$CXX" \
-    -DCMAKE_LINKER="$LINKER" \
-    -DCMAKE_CXX_LINK_EXECUTABLE='<CMAKE_LINKER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>' \
     -DCMAKE_PREFIX_PATH="$PREFIX_PATH;$CMAKE_PREFIX_PATH" \
     -DCMAKE_BUILD_TYPE=$TYPE \
     -DPAR_LOOP_LAYOUT=$OUTER_LAYOUT \