Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions devel/foldrescale.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
The macro wins hands-down on i7, even for modern GCC9.
This should be done in C++ not as a macro, someday.
*/

using finufft::common::INV_2PI;

#define FOLDRESCALE(x, N, p) \
(p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)INV_2PI * N) \
: (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N))
Expand Down
12 changes: 5 additions & 7 deletions docs/opts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -238,17 +238,15 @@ However, with FFTW as the FFT library, FINUFFT is thread safe so long as no othe
#include <finufft.h>
#include <omp.h>

using namespace std;

constexpr int N = 65384;

void locker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->lock(); }
void unlocker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->unlock(); }
void locker(void *lck) { reinterpret_cast<std::recursive_mutex *>(lck)->lock(); }
void unlocker(void *lck) { reinterpret_cast<std::recursive_mutex *>(lck)->unlock(); }

int main() {
int64_t Ns[3]; // guru describes mode array by vector [N1,N2..]
Ns[0] = N;
recursive_mutex lck;
std::recursive_mutex lck;

finufft_opts opts;
finufft_default_opts(&opts);
Expand All @@ -259,7 +257,7 @@ However, with FFTW as the FFT library, FINUFFT is thread safe so long as no othe
opts.fftw_lock_data = reinterpret_cast<void *>(&lck);

// random nonuniform points (x) and complex strengths (c)
vector<complex<double>> c(N);
std::vector<std::complex<double>> c(N);

// init FFTW threads
fftw_init_threads();
Expand All @@ -268,7 +266,7 @@ However, with FFTW as the FFT library, FINUFFT is thread safe so long as no othe
#pragma omp parallel for
for (int j = 0; j < 100; ++j) {
// allocate output array for FFTW...
vector<complex<double>> F1(N);
std::vector<std::complex<double>> F1(N);

// FFTW plan
lck.lock();
Expand Down
6 changes: 3 additions & 3 deletions examples/cuda/example2d1many.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,12 @@ int main(int argc, char *argv[])
std::complex<float> Ft = std::complex<float>(0, 0),
J = std::complex<float>(0, 1) * (float)iflag;
for (auto j = 0UL; j < M; ++j)
Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
Ft += c[j + i * M] * std::exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array
printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2,
abs(Ft - fk[it + i * N]));
std::abs(Ft - fk[it + i * N]));
printf("[gpu %3d] one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2,
abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N));
std::abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N));
}

cudaFreeHost(x);
Expand Down
4 changes: 2 additions & 2 deletions examples/cuda/example2d2many.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,10 @@ int main(int argc, char *argv[])
int m = 0;
for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
ct += fkstart[m++] * std::exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct

printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", t, jt,
abs(cstart[jt] - ct) / infnorm(M, c));
std::abs(cstart[jt] - ct) / infnorm(M, c));
}

cudaFreeHost(x);
Expand Down
4 changes: 2 additions & 2 deletions examples/cuda/example2d3many.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,10 @@ int main(int argc, char *argv[])
int jt = N / 2; // check arbitrary choice of one targ pt
std::complex<double> J(0, iflag * 1);
std::complex<double> fkt(0, 0);
for (int m = 0; m < M; m++) fkt += cstart[m] * exp(J * (s[jt] * x[m] + t[jt] * y[m]));
for (int m = 0; m < M; m++) fkt += cstart[m] * std::exp(J * (s[jt] * x[m] + t[jt] * y[m]));

printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", tr, jt,
abs(fkstart[jt] - fkt) / infnorm(N, fkstart));
std::abs(fkstart[jt] - fkt) / infnorm(N, fkstart));
}

cudaFreeHost(x);
Expand Down
96 changes: 37 additions & 59 deletions examples/cuda/getting_started.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,19 @@
/*

Simple example of the 1D type-1 transform. To compile, run

nvcc -o getting_started getting_started.cpp -lcufinufft

followed by

./getting_started

with the necessary paths set if the library is not installed in the standard
directories. If the library has been compiled in the standard way, this means

export CPATH="${CPATH:+${CPATH}:}../../include"
export LIBRARY_PATH="${LIBRARY_PATH:+${LIBRARY_PATH}:}../../build"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}../../build"

*/

#include <complex.h>
Simple example of the 1D type-1 transform using std::complex.
To compile:
nvcc -o getting_started getting_started.cpp -lcufinufft
*/

#include <cmath>
#include <complex>
#include <cstdio>
#include <cstdlib>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <cufinufft.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>

static const double PI = 3.141592653589793238462643383279502884;
static constexpr double PI = 3.141592653589793238462643383279502884;

int main() {
// Problem size: number of nonuniform points (M) and grid size (N).
Expand All @@ -35,9 +23,9 @@ int main() {
int64_t modes[1] = {N};

// Host pointers: frequencies (x), coefficients (c), and output (f).
float *x;
float _Complex *c;
float _Complex *f;
std::vector<float> x(M);
std::vector<std::complex<float>> c(M);
std::vector<std::complex<float>> f(N);

// Device pointers.
float *d_x;
Expand All @@ -48,71 +36,61 @@ int main() {

// Manual calculation at a single point idx.
int idx;
float _Complex f0;

// Allocate the host arrays.
x = (float *)malloc(M * sizeof(float));
c = (float _Complex *)malloc(M * sizeof(float _Complex));
f = (float _Complex *)malloc(N * sizeof(float _Complex));

// Fill with random numbers. Frequencies must be in the interval [-pi, pi)
// while strengths can be any value.
srand(0);
std::complex<float> f0;

// Fill with random numbers.
std::srand(42);
for (int j = 0; j < M; ++j) {
x[j] = 2 * PI * (((float)rand()) / RAND_MAX - 1);
c[j] =
(2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1);
x[j] = 2 * PI * ((float)std::rand() / RAND_MAX - 1);
float re = 2 * ((float)std::rand()) / RAND_MAX - 1;
float im = 2 * ((float)std::rand()) / RAND_MAX - 1;
c[j] = std::complex<float>(re, im);
}

// Allocate the device arrays and copy the x and c arrays.
// Allocate the device arrays and copy x and c.
cudaMalloc(&d_x, M * sizeof(float));
cudaMalloc(&d_c, M * sizeof(float _Complex));
cudaMalloc(&d_f, N * sizeof(float _Complex));
cudaMalloc(&d_c, M * sizeof(cuFloatComplex));
cudaMalloc(&d_f, N * sizeof(cuFloatComplex));

std::vector<cuFloatComplex> c_dev(M);
for (int j = 0; j < M; ++j) c_dev[j] = make_cuFloatComplex(c[j].real(), c[j].imag());

cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, x.data(), M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c_dev.data(), M * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);

// Make the cufinufft plan for a 1D type-1 transform with six digits of
// tolerance.
cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL);
// Make the cufinufft plan for 1D type-1 transform.
cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, nullptr);

// Set the frequencies of the nonuniform points.
cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL);
cufinufftf_setpts(plan, M, d_x, nullptr, nullptr, 0, nullptr, nullptr, nullptr);

// Actually execute the plan on the given coefficients and store the result
// in the d_f array.
cufinufftf_execute(plan, d_c, d_f);

// Copy the result back onto the host.
cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost);
cudaMemcpy(f.data(), d_f, N * sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);

// Destroy the plan and free the device arrays after we're done.
cufinufftf_destroy(plan);

cudaFree(d_x);
cudaFree(d_c);
cudaFree(d_f);

// Pick an index to check the result of the calculation.
idx = 4 * N / 7;

printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx]));
printf("f[%d] = %lf + %lfi\n", idx, std::real(f[idx]), std::imag(f[idx]));

// Calculate the result manually using the formula for the type-1
// transform.
f0 = 0;

std::complex<float> I(-0.0, 1.0);
for (int j = 0; j < M; ++j) {
f0 += c[j] * cexp(I * x[j] * (idx - N / 2));
f0 += c[j] * std::exp(I * x[j] * float(idx - N / 2));
}

printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0));

// Finally free the host arrays.
free(x);
free(c);
free(f);
printf("f0[%d] = %lf + %lfi\n", idx, std::real(f0), std::imag(f0));

return 0;
}
40 changes: 20 additions & 20 deletions examples/guru1d1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,13 @@

// specific to this example...
#include <cassert>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

// only good for small projects...
using namespace std;

static const double PI = 3.141592653589793238462643383279502884;
// allows 1i to be the imaginary unit... (C++14 onwards)
using namespace std::complex_literals;

int main(int argc, char *argv[])
/* Example calling guru C++ interface to FINUFFT library, passing
Expand Down Expand Up @@ -42,43 +38,47 @@ int main(int argc, char *argv[])
} else // or, NULL here means use default opts...
finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL);

const std::complex<double> I(0.0, 1.0);

// generate some random nonuniform points
vector<double> x(M);
std::vector<double> x(M);
for (int j = 0; j < M; ++j)
x[j] = PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
x[j] = PI * (2 * ((double)std::rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
// note FINUFFT doesn't use std::vector types, so we need to make a pointer...
finufft_setpts(plan, M, x.data(), NULL, NULL, 0, NULL, NULL, NULL);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nullptr ? Surely it doesn't matter since setpts ignores them.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I missed it. I'll fix it.


// generate some complex strengths
vector<complex<double>> c(M);
std::vector<std::complex<double>> c(M);
for (int j = 0; j < M; ++j)
c[j] =
2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1);
c[j] = 2 * ((double)std::rand() / RAND_MAX) - 1 +
std::complex<double>(0.0, 1.0) *
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use I which was defined above for a reason? I must say this std:: everywhere is highly unpleasant! I'm not sure why in our example codes we're killing using namespace std; ....

(2 * ((double)std::rand() / RAND_MAX) - 1);

// alloc output array for the Fourier modes, then do the transform
vector<complex<double>> F(N);
std::vector<std::complex<double>> F(N);
int ier = finufft_execute(plan, c.data(), F.data());

// for fun, do another with same NU pts (no re-sorting), but new strengths...
for (int j = 0; j < M; ++j)
c[j] =
2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1);
c[j] = 2 * ((double)std::rand() / RAND_MAX) - 1 +
std::complex<double>(0.0, 1.0) *
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I

(2 * ((double)std::rand() / RAND_MAX) - 1);
ier = finufft_execute(plan, c.data(), F.data());

finufft_destroy(plan); // don't forget! done with transforms of this size

// rest is math checking and reporting...
int n = 142519; // check the answer just for this mode
assert(n >= -(double)N / 2 && n < (double)N / 2); // ensure meaningful test
complex<double> Ftest = complex<double>(0, 0);
for (int j = 0; j < M; ++j) Ftest += c[j] * exp(1i * (double)n * x[j]);
std::complex<double> Ftest(0, 0);
for (int j = 0; j < M; ++j) Ftest += c[j] * std::exp(I * (double)n * x[j]);
int nout = n + N / 2; // index in output array for freq mode n
double Fmax = 0.0; // compute inf norm of F
for (int m = 0; m < N; ++m) {
double aF = abs(F[m]);
double aF = std::abs(F[m]);
if (aF > Fmax) Fmax = aF;
}
double err = abs(F[nout] - Ftest) / Fmax;
double err = std::abs(F[nout] - Ftest) / Fmax;
printf("guru 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier,
n, err);

Expand Down
Loading
Loading