flatironinstitute · DiamonDinoia · Oct 24, 2025 · ahbarnett · Oct 28, 2025 · DiamonDinoia
diff --git a/devel/foldrescale.cpp b/devel/foldrescale.cpp
@@ -15,6 +15,9 @@
    The macro wins hands-down on i7, even for modern GCC9.
    This should be done in C++ not as a macro, someday.
 */
+
+using finufft::common::INV_2PI;
+
 #define FOLDRESCALE(x, N, p)                                                \
   (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)INV_2PI * N) \
      : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N))

diff --git a/docs/opts.rst b/docs/opts.rst
@@ -238,17 +238,15 @@ However, with FFTW as the FFT library, FINUFFT is thread safe so long as no othe
   #include <finufft.h>
   #include <omp.h>
 
-  using namespace std;
-
   constexpr int N = 65384;
 
-  void locker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->lock(); }
-  void unlocker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->unlock(); }
+  void locker(void *lck) { reinterpret_cast<std::recursive_mutex *>(lck)->lock(); }
+  void unlocker(void *lck) { reinterpret_cast<std::recursive_mutex *>(lck)->unlock(); }
 
   int main() {
     int64_t Ns[3]; // guru describes mode array by vector [N1,N2..]
     Ns[0] = N;
-    recursive_mutex lck;
+    std::recursive_mutex lck;
 
     finufft_opts opts;
     finufft_default_opts(&opts);
@@ -259,7 +257,7 @@ However, with FFTW as the FFT library, FINUFFT is thread safe so long as no othe
     opts.fftw_lock_data = reinterpret_cast<void *>(&lck);
 
     // random nonuniform points (x) and complex strengths (c)
-    vector<complex<double>> c(N);
+    std::vector<std::complex<double>> c(N);
 
     // init FFTW threads
     fftw_init_threads();
@@ -268,7 +266,7 @@ However, with FFTW as the FFT library, FINUFFT is thread safe so long as no othe
     #pragma omp parallel for
     for (int j = 0; j < 100; ++j) {
       // allocate output array for FFTW...
-      vector<complex<double>> F1(N);
+      std::vector<std::complex<double>> F1(N);
 
       // FFTW plan
       lck.lock();

diff --git a/examples/cuda/example2d1many.cpp b/examples/cuda/example2d1many.cpp
@@ -100,12 +100,12 @@ int main(int argc, char *argv[])
     std::complex<float> Ft = std::complex<float>(0, 0),
                         J  = std::complex<float>(0, 1) * (float)iflag;
     for (auto j = 0UL; j < M; ++j)
-      Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
+      Ft += c[j + i * M] * std::exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
     int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array
     printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2,
-           abs(Ft - fk[it + i * N]));
+           std::abs(Ft - fk[it + i * N]));
     printf("[gpu %3d] one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2,
-           abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N));
+           std::abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N));
   }
 
   cudaFreeHost(x);

diff --git a/examples/cuda/example2d2many.cpp b/examples/cuda/example2d2many.cpp
@@ -109,10 +109,10 @@ int main(int argc, char *argv[])
     int m = 0;
     for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
       for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
-        ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
+        ct += fkstart[m++] * std::exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
 
     printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", t, jt,
-           abs(cstart[jt] - ct) / infnorm(M, c));
+           std::abs(cstart[jt] - ct) / infnorm(M, c));
   }
 
   cudaFreeHost(x);

diff --git a/examples/cuda/example2d3many.cpp b/examples/cuda/example2d3many.cpp
@@ -112,10 +112,10 @@ int main(int argc, char *argv[])
     int jt  = N / 2; // check arbitrary choice of one targ pt
     std::complex<double> J(0, iflag * 1);
     std::complex<double> fkt(0, 0);
-    for (int m = 0; m < M; m++) fkt += cstart[m] * exp(J * (s[jt] * x[m] + t[jt] * y[m]));
+    for (int m = 0; m < M; m++) fkt += cstart[m] * std::exp(J * (s[jt] * x[m] + t[jt] * y[m]));
 
     printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", tr, jt,
-           abs(fkstart[jt] - fkt) / infnorm(N, fkstart));
+           std::abs(fkstart[jt] - fkt) / infnorm(N, fkstart));
   }
 
   cudaFreeHost(x);

diff --git a/examples/cuda/getting_started.cpp b/examples/cuda/getting_started.cpp
@@ -1,31 +1,19 @@
 /*
-
-  Simple example of the 1D type-1 transform. To compile, run
-
-       nvcc -o getting_started getting_started.cpp -lcufinufft
-
-  followed by
-
-       ./getting_started
-
-  with the necessary paths set if the library is not installed in the standard
-  directories. If the library has been compiled in the standard way, this means
-
-       export CPATH="${CPATH:+${CPATH}:}../../include"
-       export LIBRARY_PATH="${LIBRARY_PATH:+${LIBRARY_PATH}:}../../build"
-       export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}../../build"
-
- */
-
-#include <complex.h>
+  Simple example of the 1D type-1 transform using std::complex.
+  To compile:
+      nvcc -o getting_started getting_started.cpp -lcufinufft
+*/
+
+#include <cmath>
+#include <complex>
+#include <cstdio>
+#include <cstdlib>
 #include <cuComplex.h>
 #include <cuda_runtime.h>
 #include <cufinufft.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
+#include <vector>
 
-static const double PI = 3.141592653589793238462643383279502884;
+static constexpr double PI = 3.141592653589793238462643383279502884;
 
 int main() {
   // Problem size: number of nonuniform points (M) and grid size (N).
@@ -35,9 +23,9 @@ int main() {
   int64_t modes[1] = {N};
 
   // Host pointers: frequencies (x), coefficients (c), and output (f).
-  float *x;
-  float _Complex *c;
-  float _Complex *f;
+  std::vector<float> x(M);
+  std::vector<std::complex<float>> c(M);
+  std::vector<std::complex<float>> f(N);
 
   // Device pointers.
   float *d_x;
@@ -48,71 +36,61 @@ int main() {
 
   // Manual calculation at a single point idx.
   int idx;
-  float _Complex f0;
-
-  // Allocate the host arrays.
-  x = (float *)malloc(M * sizeof(float));
-  c = (float _Complex *)malloc(M * sizeof(float _Complex));
-  f = (float _Complex *)malloc(N * sizeof(float _Complex));
-
-  // Fill with random numbers. Frequencies must be in the interval [-pi, pi)
-  // while strengths can be any value.
-  srand(0);
+  std::complex<float> f0;
 
+  // Fill with random numbers.
+  std::srand(42);
   for (int j = 0; j < M; ++j) {
-    x[j] = 2 * PI * (((float)rand()) / RAND_MAX - 1);
-    c[j] =
-        (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1);
+    x[j]     = 2 * PI * ((float)std::rand() / RAND_MAX - 1);
+    float re = 2 * ((float)std::rand()) / RAND_MAX - 1;
+    float im = 2 * ((float)std::rand()) / RAND_MAX - 1;
+    c[j]     = std::complex<float>(re, im);
   }
 
-  // Allocate the device arrays and copy the x and c arrays.
+  // Allocate the device arrays and copy x and c.
   cudaMalloc(&d_x, M * sizeof(float));
-  cudaMalloc(&d_c, M * sizeof(float _Complex));
-  cudaMalloc(&d_f, N * sizeof(float _Complex));
+  cudaMalloc(&d_c, M * sizeof(cuFloatComplex));
+  cudaMalloc(&d_f, N * sizeof(cuFloatComplex));
+
+  std::vector<cuFloatComplex> c_dev(M);
+  for (int j = 0; j < M; ++j) c_dev[j] = make_cuFloatComplex(c[j].real(), c[j].imag());
 
-  cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x.data(), M * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_c, c_dev.data(), M * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
 
-  // Make the cufinufft plan for a 1D type-1 transform with six digits of
-  // tolerance.
-  cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL);
+  // Make the cufinufft plan for 1D type-1 transform.
+  cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, nullptr);
 
   // Set the frequencies of the nonuniform points.
-  cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL);
+  cufinufftf_setpts(plan, M, d_x, nullptr, nullptr, 0, nullptr, nullptr, nullptr);
 
   // Actually execute the plan on the given coefficients and store the result
   // in the d_f array.
   cufinufftf_execute(plan, d_c, d_f);
 
   // Copy the result back onto the host.
-  cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost);
+  cudaMemcpy(f.data(), d_f, N * sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
 
   // Destroy the plan and free the device arrays after we're done.
   cufinufftf_destroy(plan);
-
   cudaFree(d_x);
   cudaFree(d_c);
   cudaFree(d_f);
 
   // Pick an index to check the result of the calculation.
   idx = 4 * N / 7;
-
-  printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx]));
+  printf("f[%d] = %lf + %lfi\n", idx, std::real(f[idx]), std::imag(f[idx]));
 
   // Calculate the result manually using the formula for the type-1
   // transform.
   f0 = 0;
 
+  std::complex<float> I(-0.0, 1.0);
   for (int j = 0; j < M; ++j) {
-    f0 += c[j] * cexp(I * x[j] * (idx - N / 2));
+    f0 += c[j] * std::exp(I * x[j] * float(idx - N / 2));
   }
 
-  printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0));
-
-  // Finally free the host arrays.
-  free(x);
-  free(c);
-  free(f);
+  printf("f0[%d] = %lf + %lfi\n", idx, std::real(f0), std::imag(f0));
 
   return 0;
 }
diff --git a/examples/guru1d1.cpp b/examples/guru1d1.cpp
@@ -4,17 +4,13 @@
 
 // specific to this example...
 #include <cassert>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
 #include <vector>
 
-// only good for small projects...
-using namespace std;
-
 static const double PI = 3.141592653589793238462643383279502884;
-// allows 1i to be the imaginary unit... (C++14 onwards)
-using namespace std::complex_literals;
 
 int main(int argc, char *argv[])
 /* Example calling guru C++ interface to FINUFFT library, passing
@@ -42,43 +38,47 @@ int main(int argc, char *argv[])
   } else            // or, NULL here means use default opts...
     finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL);
 
+  const std::complex<double> I(0.0, 1.0);
+
   // generate some random nonuniform points
-  vector<double> x(M);
+  std::vector<double> x(M);
   for (int j = 0; j < M; ++j)
-    x[j] = PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+    x[j] = PI * (2 * ((double)std::rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
   // note FINUFFT doesn't use std::vector types, so we need to make a pointer...
   finufft_setpts(plan, M, x.data(), NULL, NULL, 0, NULL, NULL, NULL);
 
   // generate some complex strengths
-  vector<complex<double>> c(M);
+  std::vector<std::complex<double>> c(M);
   for (int j = 0; j < M; ++j)
-    c[j] =
-        2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1);
+    c[j] = 2 * ((double)std::rand() / RAND_MAX) - 1 +
+           std::complex<double>(0.0, 1.0) *
+               (2 * ((double)std::rand() / RAND_MAX) - 1);
 
   // alloc output array for the Fourier modes, then do the transform
-  vector<complex<double>> F(N);
+  std::vector<std::complex<double>> F(N);
   int ier = finufft_execute(plan, c.data(), F.data());
 
   // for fun, do another with same NU pts (no re-sorting), but new strengths...
   for (int j = 0; j < M; ++j)
-    c[j] =
-        2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1);
+    c[j] = 2 * ((double)std::rand() / RAND_MAX) - 1 +
+           std::complex<double>(0.0, 1.0) *
+               (2 * ((double)std::rand() / RAND_MAX) - 1);
   ier = finufft_execute(plan, c.data(), F.data());
 
   finufft_destroy(plan); // don't forget! done with transforms of this size
 
   // rest is math checking and reporting...
   int n = 142519;                                   // check the answer just for this mode
   assert(n >= -(double)N / 2 && n < (double)N / 2); // ensure meaningful test
-  complex<double> Ftest = complex<double>(0, 0);
-  for (int j = 0; j < M; ++j) Ftest += c[j] * exp(1i * (double)n * x[j]);
+  std::complex<double> Ftest(0, 0);
+  for (int j = 0; j < M; ++j) Ftest += c[j] * std::exp(I * (double)n * x[j]);
   int nout    = n + N / 2; // index in output array for freq mode n
   double Fmax = 0.0;       // compute inf norm of F
   for (int m = 0; m < N; ++m) {
-    double aF = abs(F[m]);
+    double aF = std::abs(F[m]);
     if (aF > Fmax) Fmax = aF;
   }
-  double err = abs(F[nout] - Ftest) / Fmax;
+  double err = std::abs(F[nout] - Ftest) / Fmax;
   printf("guru 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier,
          n, err);