diff --git a/Makefile b/Makefile index bc05734a..d1cc9f72 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,7 @@ tests: examples: $(CC) src/ftutilities.c examples/additiontheorem.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o additiontheorem + $(CC) src/ftutilities.c examples/annulus.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o annulus $(CC) src/ftutilities.c examples/calculus.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o calculus $(CC) src/ftutilities.c examples/holomorphic.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o holomorphic $(CC) src/ftutilities.c examples/nonlocaldiffusion.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o nonlocaldiffusion @@ -74,6 +75,7 @@ runtests: runexamples: ./additiontheorem + ./annulus ./calculus ./holomorphic ./nonlocaldiffusion @@ -86,6 +88,7 @@ coverage: clean: rm -f lib$(LIB).* rm -f additiontheorem + rm -f annulus rm -f calculus rm -f holomorphic rm -f nonlocaldiffusion diff --git a/docs/docs.doc b/docs/docs.doc index 5b603590..6f928aeb 100755 --- a/docs/docs.doc +++ b/docs/docs.doc @@ -246,6 +246,46 @@ f_{2n-2}(r,\theta) = \sum_{\ell=0}^{n-1}\sum_{m=2-2n}^{2n-2} g_{2\ell}^m \left\{ and \ref ft_execute_cxf2disk converts them back. +\subsection ann2cxf + +Consider \f$P_n^{t,(\alpha,\beta,\gamma)}(x)\f$ to be orthogonal polynomials in \f$L^2([-1,1], (1-x)^\alpha(1+x)^\beta(t+x)^\gamma\ud x)\f$, for parameter values \f$\{t>1, \alpha,\beta>-1, \gamma\in\mathbb{R}\}\cup\{t=1, \alpha,\beta+\gamma>-1\}\f$. + +Real orthonormal annulus polynomials in \f$L^2(\{(r,\theta) : \rho < r < 1, 0 < \theta < 2\pi\}, r^{2\gamma+1}(r^2-\rho^2)^\alpha(1-r^2)^\beta\ud r\ud\theta)\f$ are: +\f[ +Z_{\ell,m}^{\rho,(\alpha,\beta,\gamma)}(r,\theta) = \sqrt{2} \left(\frac{2}{1-\rho^2}\right)^{\frac{\abs{m}+\alpha+\beta+\gamma+1}{2}} r^{\abs{m}}\tilde{P}_{\frac{\ell-\abs{m}}{2}}^{\frac{1+\rho^2}{1-\rho^2},(\beta,\alpha,\abs{m}+\gamma)}\left(\frac{2r^2-1-\rho^2}{1-\rho^2}\right) \sqrt{\frac{2-\delta_{m,0}}{2\pi}} \left\{\begin{array}{ccc} \cos(m\theta) & {\rm for} & m \ge 0,\\ \sin(\abs{m}\theta) & {\rm for} & m < 0,\end{array}\right. +\f] +where the tilde implies that \f$\tilde{P}_n^{t,(\alpha,\beta,\gamma)}(x)\f$ are orthonormal. In the limit as \f$\rho\to0\f$, the annulus polynomials converge to the Zernike polynomials, \f$Z_{\ell,m}^{0,(\alpha,\beta,\gamma)}(r, \theta) = Z_{\ell,m}^{(\alpha+\gamma,\beta)}(r, \theta)\f$. A degree-\f$(2n-2)\f$ expansion in annulus polynomials is given by: +\f[ +f_{2n-2}(r,\theta) = \sum_{\ell=0}^{2n-2}\sum_{m=-\ell,2}^{+\ell} f_\ell^m Z_{\ell,m}^{\rho,(\alpha,\beta,\gamma)}(r,\theta), +\f] +where the \f$,2\f$ in the inner summation index implies that the inner summation runs from \f$m=-\ell\f$ in steps of \f$2\f$ up to \f$+\ell\f$. If annulus polynomial expansion coefficients are organized into the array: +\f[ +F = \begin{pmatrix} +f_0^0 & f_1^{-1} & f_1^1 & f_2^{-2} & f_2^2 & \cdots & f_{2n-2}^{2-2n} & f_{2n-2}^{2n-2}\\ +f_2^0 & f_3^{-1} & f_3^1 & f_4^{-2} & f_4^2 & \cdots & 0 & 0\\ +\vdots & \vdots & \vdots & \vdots & \vdots & \ddots & \vdots & \vdots\\ +f_{2n-6}^0 & f_{2n-5}^{-1} & f_{2n-5}^1 & f_{2n-4}^{-2} & f_{2n-4}^2 & & \vdots & \vdots\\ +f_{2n-4}^0 & f_{2n-3}^{-1} & f_{2n-3}^1 & f_{2n-2}^{-2} & f_{2n-2}^2 & \cdots & 0 & 0\\ +f_{2n-2}^0 & 0 & 0 & 0 & 0 & \cdots & 0 & 0\\ +\end{pmatrix}, +\f] +then \ref ft_plan_ann2cxf creates the appropriate \ref ft_harmonic_plan, and \ref ft_execute_ann2cxf returns the even Chebyshev--Fourier coefficients: +\f[ +G = \begin{pmatrix} +g_0^0 & g_0^{-1} & g_0^1 & g_0^{-2} & g_0^2 & \cdots & g_0^{2-2n} & g_0^{2n-2}\\ +g_2^0 & g_2^{-1} & g_2^1 & g_2^{-2} & g_2^2 & \cdots & g_2^{2-2n} & g_2^{2n-2}\\ +\vdots & \vdots & \vdots & \vdots & \vdots & \ddots & \vdots & \vdots\\ +g_{2n-4}^0 & g_{2n-4}^{-1} & g_{2n-4}^1 & g_{2n-4}^{-2} & g_{2n-4}^2 & \cdots & g_{2n-4}^{2-2n} & g_{2n-4}^{2n-2}\\ +g_{2n-2}^0 & 0 & 0 & g_{2n-2}^{-2} & g_{2n-2}^2 & \cdots & g_{2n-2}^{2-2n} & g_{2n-2}^{2n-2}\\ +\end{pmatrix}. +\f] +That is: +\f[ +f_{2n-2}(r,\theta) = \sum_{\ell=0}^{n-1}\sum_{m=2-2n}^{2n-2} g_{2\ell}^m \left\{\begin{array}{ccc} T_{\ell}\left(\frac{2r^2-1-\rho^2}{1-\rho^2}\right) & {\rm for} & m~{\rm even},\\ \sqrt{\frac{2}{1-\rho^2}}rT_{\ell}\left(\frac{2r^2-1-\rho^2}{1-\rho^2}\right) & {\rm for} & m~{\rm odd},\end{array}\right\} \sqrt{\frac{2-\delta_{m,0}}{2\pi}}\left\{\begin{array}{ccc} \cos(m\theta) & {\rm for} & m \ge 0,\\ \sin(\abs{m}\theta) & {\rm for} & m < 0,\end{array}\right. +\f] +and \ref ft_execute_cxf2ann converts them back. + + \subsection rectdisk2cheb Real orthonormal Dunkl--Xu polynomials in \f$L^2(\mathbb{D}^2, (1-x^2-y^2)^\beta\ud x\ud y)\f$ are: @@ -436,6 +476,15 @@ r_n & = \cos\left(\frac{2n+1}{4N}\pi\right) = \sin\left(\frac{2N-2n-1}{4N}\pi\ri \theta_m & = \frac{2m}{M}\pi,\quad {\rm for} \quad 0 \le m < M. \f} +\subsection ft_fftw_annulus_plan + +On the annulus, the \f$N\times M\f$ radial--azimuthal grids are: +\f{align*} +r_n & = \sqrt{\cos^2\left(\frac{2n+1}{4N}\pi\right) + \rho^2\sin^2\left(\frac{2n+1}{4N}\pi\right)},\\ +& = \sqrt{\sin^2\left(\frac{2N-2n-1}{4N}\pi\right) + \rho^2\cos^2\left(\frac{2N-2n-1}{4N}\pi\right)},\quad {\rm for} \quad 0 \le n < N,\\ +\theta_m & = \frac{2m}{M}\pi,\quad {\rm for} \quad 0 \le m < M. +\f} + \subsection ft_fftw_rectdisk_plan On the rectangularized disk, the \f$N\times M\f$ mapped tensor product grids are: diff --git a/examples/annulus.c b/examples/annulus.c new file mode 100644 index 00000000..e27d6bc0 --- /dev/null +++ b/examples/annulus.c @@ -0,0 +1,102 @@ +#include +#include + +double f(double x, double y) {return (pow(x, 3))/(x*x+y*y-0.25);}; + +/*! + \example annulus.c + In this example, we explore integration of the function: + \f[ + f(x,y) = \frac{x^3}{x^2+y^2-\frac{1}{4}}, + \f] + over the annulus defined by \f$\{(r,\theta) : \frac{2}{3} < r < 1, 0 < \theta < 2\pi\}\f$. We will calculate the integral: + \f[ + \int_0^{2\pi}\int_{\frac{2}{3}}^1 f(r\cos\theta,r\sin\theta)^2r{\rm\,d}r{\rm\,d}\theta, + \f] + by analyzing the function in an annulus polynomial series. +*/ +int main(void) { + printf("In this example, we explore square integration of a function over \n"); + printf("the annulus with parameter "MAGENTA("ρ = 2/3")". We analyze the function:\n"); + printf("\n"); + printf("\t"MAGENTA("f(x,y) = x³/(x²-y²-1/4)")",\n"); + printf("\n"); + printf("on an "MAGENTA("N×M")" tensor product grid defined by:\n"); + printf("\n"); + printf("\t"MAGENTA("rₙ = √{cos²[(n+1/2)π/4N] + ρ²sin²[(n+1/2)π/4N]}")", for "MAGENTA("0 ≤ n < N")",\n"); + printf("\n"); + printf("and\n"); + printf("\n"); + printf("\t"MAGENTA("θₘ = 2π m/M")", for "MAGENTA("0 ≤ m < M")";\n"); + printf("\n"); + printf("we convert the function samples to Chebyshev×Fourier coefficients using\n"); + printf(CYAN("ft_plan_annulus_analysis")" and "CYAN("ft_execute_annulus_analysis")"; and finally, we transform\n"); + printf("the Chebyshev×Fourier coefficients to annulus harmonic coefficients using\n"); + printf(CYAN("ft_plan_ann2cxf")" and "CYAN("ft_execute_cxf2ann")".\n"); + printf("\n"); + printf("N.B. for the storage pattern of the printed arrays, please consult the\n"); + printf("documentation. (Arrays are stored in column-major ordering.)\n"); + + char * FMT = "%1.3f"; + + int N = 8; + int M = 4*N-3; + + printf("\n\n"MAGENTA("N = %i")", and "MAGENTA("M = %i")"\n\n", N, M); + + double rho = 2.0/3.0; + double r[N], theta[M], F[4*N*N]; + + for (int n = 0; n < N; n++) { + double t = (N-n-0.5)*M_PI/(2*N); + double ct2 = sin(t); + double st2 = cos(t); + r[n] = sqrt(ct2*ct2+rho*rho*st2*st2); + } + for (int m = 0; m < M; m++) + theta[m] = 2.0*M_PI*m/M; + + printmat("Radial grid "MAGENTA("r"), FMT, r, N, 1); + printf("\n"); + printmat("Azimuthal grid "MAGENTA("θ"), FMT, theta, 1, M); + printf("\n"); + + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) + F[n+N*m] = f(r[n]*cos(theta[m]), r[n]*sin(theta[m])); + + printf("On the tensor product grid, our function samples are:\n\n"); + + printmat("F", FMT, F, N, M); + printf("\n"); + + double alpha = 0.0, beta = 0.0, gamma = 0.0; + ft_harmonic_plan * P = ft_plan_ann2cxf(N, alpha, beta, gamma, rho); + ft_annulus_fftw_plan * PA = ft_plan_annulus_analysis(N, M, rho, FT_FFTW_FLAGS); + + ft_execute_annulus_analysis('N', PA, F, N, M); + ft_execute_cxf2ann('N', P, F, N, M); + + printf("Its annulus polynomial coefficients are:\n\n"); + + printmat("U", FMT, F, N, M); + printf("\n"); + + printf("The annulus polynomial coefficients are useful for integration.\n"); + printf("The integral of "MAGENTA("[f(x,y)]^2")" over the annulus is\n"); + printf("approximately the square of the 2-norm of the coefficients, \n\t"); + double val = pow(ft_norm_1arg(F, N*M), 2); + printf("%1.16f", val); + printf(".\n"); + printf("This compares favourably to the exact result, \n\t"); + double tval = 5.0*M_PI/8.0*(1675.0/4536.0+9.0*log(3.0)/32.0-3.0*log(7.0)/32.0); + printf("%1.16f", tval); + printf(".\n"); + printf("The relative error in the integral is %4.2e.\n", fabs(val-tval)/fabs(tval)); + printf("This error can be improved upon by increasing "MAGENTA("N")" and "MAGENTA("M")".\n"); + + ft_destroy_harmonic_plan(P); + ft_destroy_annulus_fftw_plan(PA); + + return 0; +} diff --git a/src/banded_source.c b/src/banded_source.c index ae42bb86..8c79cf2b 100644 --- a/src/banded_source.c +++ b/src/banded_source.c @@ -1502,17 +1502,13 @@ void X(mpsm)(char TRANS, X(modified_plan) * P, FLT * B, int LDB, int N) { } X(modified_plan) * X(plan_modified)(const int n, X(banded) * (*operator_clenshaw)(const int n, const int nc, const FLT * c, const int incc, const X(cop_params) params), const X(cop_params) params, const int nu, const FLT * u, const int nv, const FLT * v, const int verbose) { + X(modified_plan) * P = malloc(sizeof(X(modified_plan))); if (nv < 1) { // polynomial case X(banded) * U = operator_clenshaw(n, nu, u, 1, params); X(banded_cholfact)(U); X(triangular_banded) * R = X(convert_banded_to_triangular_banded)(U); - X(modified_plan) * P = malloc(sizeof(X(modified_plan))); P->R = R; - P->n = n; - P->nu = nu; - P->nv = nv; - return P; } else { // rational case @@ -1597,14 +1593,13 @@ X(modified_plan) * X(plan_modified)(const int n, X(banded) * (*operator_clenshaw X(destroy_banded)(Lt); X(destroy_banded)(ULt); X(destroy_banded_ql)(F); - X(modified_plan) * P = malloc(sizeof(X(modified_plan))); - P->n = n; - P->nu = nu; - P->nv = nv; P->K = K; P->R = R; - return P; } + P->n = n; + P->nu = nu; + P->nv = nv; + return P; } void Y(execute_jacobi_similarity)(const X(modified_plan) * P, const int n, const FLT * ap, const FLT * bp, FLT * aq, FLT * bq) { diff --git a/src/drivers.c b/src/drivers.c index f580facb..2e42364a 100644 --- a/src/drivers.c +++ b/src/drivers.c @@ -791,16 +791,25 @@ void execute_spinsph_lo2hi_NEON(const ft_spin_rotation_plan * SRP, ft_complex * } void ft_destroy_harmonic_plan(ft_harmonic_plan * P) { - for (int i = 0; i < P->NRP; i++) - ft_destroy_rotation_plan(P->RP[i]); - free(P->RP); + if (P->NRP > 0) { + for (int i = 0; i < P->NRP; i++) + ft_destroy_rotation_plan(P->RP[i]); + free(P->RP); + } + if (P->NMP > 0) { + for (int i = 0; i < P->NMP; i++) + ft_destroy_modified_plan(P->MP[i]); + free(P->MP); + } VFREE(P->B); - for (int i = 0; i < P->NP; i++) { - free(P->P[i]); - free(P->Pinv[i]); + if (P->NP > 0) { + for (int i = 0; i < P->NP; i++) { + free(P->P[i]); + free(P->Pinv[i]); + } + free(P->P); + free(P->Pinv); } - free(P->P); - free(P->Pinv); free(P); } @@ -816,6 +825,7 @@ ft_harmonic_plan * ft_plan_sph2fourier(const int n) { P->Pinv[0] = plan_chebyshev_to_legendre(0, 1, n); P->Pinv[1] = plan_ultraspherical_to_ultraspherical(0, 1, n, 1.0, 1.5); P->NRP = 1; + P->NMP = 0; P->NP = 2; return P; } @@ -903,6 +913,7 @@ ft_harmonic_plan * ft_plan_tri2cheb(const int n, const double alpha, const doubl P->beta = beta; P->gamma = gamma; P->NRP = 1; + P->NMP = 0; P->NP = 2; return P; } @@ -970,6 +981,7 @@ ft_harmonic_plan * ft_plan_disk2cxf(const int n, const double alpha, const doubl P->alpha = alpha; P->beta = beta; P->NRP = 1; + P->NMP = 0; P->NP = 2; return P; } @@ -1008,6 +1020,106 @@ void ft_execute_cxf2disk(const char TRANS, const ft_harmonic_plan * P, double * } } +ft_harmonic_plan * ft_plan_ann2cxf(const int n, const double alpha, const double beta, const double gamma, const double rho) { + ft_harmonic_plan * P = malloc(sizeof(ft_harmonic_plan)); + P->RP = malloc(sizeof(ft_rotation_plan *)); + P->RP[0] = ft_plan_rotannulus(n, alpha, beta, gamma, rho); + if (gamma == 0.0) { + // For even m, it is the identity. + P->MP = malloc(2 * sizeof(ft_modified_plan *)); + P->MP[0] = malloc(sizeof(ft_modified_plan)); + P->MP[0]->R = ft_create_I_triangular_banded(n, 0); + P->MP[0]->n = n; + P->MP[0]->nu = 0; + P->MP[0]->nv = 0; + // For odd m, we must convert from P^{(1-ρ^2)/(1+ρ^2), (β,α,1)}(x) ↘ P^{(β,α)}(x) which is the Cholesky factor of (tI+X_{β,α}) + double t = 1.0 + 2.0*rho*rho/(1.0-rho*rho); + ft_banded * U = ft_create_jacobi_multiplication(1, n, n, beta, alpha); + for (int i = 0; i < n; i++) + ft_set_banded_index(U, t + ft_get_banded_index(U, i, i), i, i); + ft_banded_cholfact(U); + P->MP[1] = malloc(sizeof(ft_modified_plan)); + P->MP[1]->R = ft_convert_banded_to_triangular_banded(U); + P->MP[1]->n = n; + P->MP[1]->nu = 1; + P->MP[1]->nv = 0; + P->NMP = 2; + } + else { + P->NMP = 0; + warning("plan_ann2cxf: γ≠0 not implemented."); + } + P->B = VMALLOC(VALIGN(n) * (4*n-3) * sizeof(double)); + P->P = malloc(sizeof(double *)); + P->P[0] = plan_jacobi_to_chebyshev(1, 0, n, beta, alpha); + P->Pinv = malloc(sizeof(double *)); + P->Pinv[0] = plan_chebyshev_to_jacobi(0, 1, n, beta, alpha); + double cst = sqrt(2.0)*pow(2.0/(1.0-rho*rho), 0.5*(alpha+beta+gamma+1.0)); + double cstinv = 1.0/cst; + for (int j = 0; j < n; j++) + for (int i = 0; i <= j; i++) { + P->P[0][i+j*n] *= cst; + P->Pinv[0][i+j*n] *= cstinv; + } + P->alpha = alpha; + P->beta = beta; + P->gamma = gamma; + P->rho = rho; + P->NRP = 1; + P->NP = 1; + return P; +} + +void ft_execute_ann2cxf(const char TRANS, const ft_harmonic_plan * P, double * A, const int N, const int M) { + if (TRANS == 'N') { + // Hi 2 lo + ft_execute_disk_hi2lo(P->RP[0], A, P->B, M); + // Semi-classical to classical + ft_mpmm('N', P->MP[0], A, 4*N, (M+3)/4); + ft_mpmm('N', P->MP[1], A+N, 4*N, (M+2)/4); + ft_mpmm('N', P->MP[1], A+2*N, 4*N, (M+1)/4); + ft_mpmm('N', P->MP[0], A+3*N, 4*N, M/4); + // Classical to Chebyshev + cblas_dtrmm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, N, M, 1.0, P->P[0], N, A, N); + } + else if (TRANS == 'T') { + // Chebyshev to classical + cblas_dtrmm(CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, N, M, 1.0, P->P[0], N, A, N); + // Classical to semi-classical + ft_mpmm('T', P->MP[0], A, 4*N, (M+3)/4); + ft_mpmm('T', P->MP[1], A+N, 4*N, (M+2)/4); + ft_mpmm('T', P->MP[1], A+2*N, 4*N, (M+1)/4); + ft_mpmm('T', P->MP[0], A+3*N, 4*N, M/4); + // Lo 2 hi + ft_execute_disk_lo2hi(P->RP[0], A, P->B, M); + } +} + +void ft_execute_cxf2ann(const char TRANS, const ft_harmonic_plan * P, double * A, const int N, const int M) { + if (TRANS == 'N') { + // Chebyshev to classical + cblas_dtrmm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, N, M, 1.0, P->Pinv[0], N, A, N); + // Classical to semi-classical + ft_mpsm('N', P->MP[0], A, 4*N, (M+3)/4); + ft_mpsm('N', P->MP[1], A+N, 4*N, (M+2)/4); + ft_mpsm('N', P->MP[1], A+2*N, 4*N, (M+1)/4); + ft_mpsm('N', P->MP[0], A+3*N, 4*N, M/4); + // Lo 2 hi + ft_execute_disk_lo2hi(P->RP[0], A, P->B, M); + } + else if (TRANS == 'T') { + // Hi 2 lo + ft_execute_disk_hi2lo(P->RP[0], A, P->B, M); + // Semi-classical to classical + ft_mpsm('T', P->MP[0], A, 4*N, (M+3)/4); + ft_mpsm('T', P->MP[1], A+N, 4*N, (M+2)/4); + ft_mpsm('T', P->MP[1], A+2*N, 4*N, (M+1)/4); + ft_mpsm('T', P->MP[0], A+3*N, 4*N, M/4); + // Classical to Chebyshev + cblas_dtrmm(CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, N, M, 1.0, P->Pinv[0], N, A, N); + } +} + ft_harmonic_plan * ft_plan_rectdisk2cheb(const int n, const double beta) { ft_harmonic_plan * P = malloc(sizeof(ft_harmonic_plan)); P->RP = malloc(sizeof(ft_rotation_plan *)); @@ -1023,6 +1135,7 @@ ft_harmonic_plan * ft_plan_rectdisk2cheb(const int n, const double beta) { P->Pinv[2] = plan_chebyshev_to_ultraspherical(0, 1, n, beta + 0.5); P->beta = beta; P->NRP = 1; + P->NMP = 0; P->NP = 3; return P; } @@ -1076,6 +1189,7 @@ ft_harmonic_plan * ft_plan_tet2cheb(const int n, const double alpha, const doubl P->gamma = gamma; P->delta = delta; P->NRP = 2; + P->NMP = 0; P->NP = 3; return P; } diff --git a/src/fasttransforms.h b/src/fasttransforms.h index 62b1c7ce..02ee1972 100644 --- a/src/fasttransforms.h +++ b/src/fasttransforms.h @@ -315,6 +315,7 @@ void ft_kernel_tri_hi2lo(const ft_rotation_plan * RP, const int m1, const int m2 void ft_kernel_tri_lo2hi(const ft_rotation_plan * RP, const int m1, const int m2, double * A, const int S); ft_rotation_plan * ft_plan_rotdisk(const int n, const double alpha, const double beta); +ft_rotation_plan * ft_plan_rotannulus(const int n, const double alpha, const double beta, const double gamma, const double rho); /// Convert a single vector of disk harmonic coefficients in A with stride S from order m2 down to order m1. void ft_kernel_disk_hi2lo(const ft_rotation_plan * RP, const int m1, const int m2, double * A, const int S); @@ -369,6 +370,7 @@ void ft_execute_spinsph_lo2hi(const ft_spin_rotation_plan * SRP, ft_complex * A, /// Data structure to store \ref ft_rotation_plan "ft_rotation_plan"s, arrays to represent 1D orthogonal polynomial transforms and their inverses, and Greek parameters. typedef struct { ft_rotation_plan ** RP; + ft_modified_plan ** MP; double * B; double ** P; double ** Pinv; @@ -376,7 +378,9 @@ typedef struct { double beta; double gamma; double delta; + double rho; int NRP; + int NMP; int NP; } ft_harmonic_plan; @@ -410,6 +414,14 @@ void ft_execute_disk2cxf(const char TRANS, const ft_harmonic_plan * P, double * /// Transform a Chebyshev--Fourier series to a disk harmonic expansion. void ft_execute_cxf2disk(const char TRANS, const ft_harmonic_plan * P, double * A, const int N, const int M); +/// Plan an annulus harmonic transform. +ft_harmonic_plan * ft_plan_ann2cxf(const int n, const double alpha, const double beta, const double gamma, const double rho); + +/// Transform an annulus harmonic expansion to a Chebyshev--Fourier series. +void ft_execute_ann2cxf(const char TRANS, const ft_harmonic_plan * P, double * A, const int N, const int M); +/// Transform a Chebyshev--Fourier series to an annulus harmonic expansion. +void ft_execute_cxf2ann(const char TRANS, const ft_harmonic_plan * P, double * A, const int N, const int M); + /// Plan a rectangularized disk harmonic transform. ft_harmonic_plan * ft_plan_rectdisk2cheb(const int n, const double beta); @@ -534,6 +546,27 @@ void ft_execute_disk_synthesis(const char TRANS, const ft_disk_fftw_plan * P, do /// Execute FFTW analysis on the disk. void ft_execute_disk_analysis(const char TRANS, const ft_disk_fftw_plan * P, double * X, const int N, const int M); +typedef struct { + fftw_plan planr; + fftw_plan plantheta; + double * w; + double * Y; +} ft_annulus_fftw_plan; + +/// Destroy a \ref ft_annulus_fftw_plan. +void ft_destroy_annulus_fftw_plan(ft_annulus_fftw_plan * P); + +ft_annulus_fftw_plan * ft_plan_annulus_with_kind(const int N, const int M, const double rho, const fftw_r2r_kind kind[3][1], const unsigned flags); +/// Plan FFTW synthesis on the annulus. +ft_annulus_fftw_plan * ft_plan_annulus_synthesis(const int N, const int M, const double rho, const unsigned flags); +/// Plan FFTW analysis on the annulus. +ft_annulus_fftw_plan * ft_plan_annulus_analysis(const int N, const int M, const double rho, const unsigned flags); + +/// Execute FFTW synthesis on the annulus. +void ft_execute_annulus_synthesis(const char TRANS, const ft_annulus_fftw_plan * P, double * X, const int N, const int M); +/// Execute FFTW analysis on the annulus. +void ft_execute_annulus_analysis(const char TRANS, const ft_annulus_fftw_plan * P, double * X, const int N, const int M); + typedef struct { fftw_plan planx1; fftw_plan planx2; diff --git a/src/fftw.c b/src/fftw.c index 88fefc18..7399d17e 100644 --- a/src/fftw.c +++ b/src/fftw.c @@ -521,6 +521,132 @@ void ft_execute_disk_analysis(const char TRANS, const ft_disk_fftw_plan * P, dou } } +void ft_destroy_annulus_fftw_plan(ft_annulus_fftw_plan * P) { + fftw_destroy_plan(P->planr); + fftw_destroy_plan(P->plantheta); + free(P->w); + fftw_free(P->Y); + free(P); +} + +ft_annulus_fftw_plan * ft_plan_annulus_with_kind(const int N, const int M, const double rho, const fftw_r2r_kind kind[2][1], const unsigned flags) { + int rank = 1; // not 2: we are computing 1d transforms // + int n[] = {N}; // 1d transforms of length n // + int idist = N, odist = N; + int istride = 1, ostride = 1; // distance between two elements in the same column // + int * inembed = n, * onembed = n; + + ft_annulus_fftw_plan * P = malloc(sizeof(ft_annulus_fftw_plan)); + + // θ = (2n+1)/(2N)π + // (2r^2-1-ρ^2)/(1-ρ^2) == cosθ + // w = sqrt(2/(1-ρ^2))*r + P->w = malloc(N*sizeof(double)); + for (int n = 0; n < N; n++) { + double theta2 = (n+0.5)/(2.0*N); + double ct2 = cos(M_PI*theta2); + double st2 = sin(M_PI*theta2); + P->w[n] = sqrt(2.0*(ct2*ct2+rho*rho*st2*st2)/(1.0-rho*rho)); + } + + P->Y = fftw_malloc(N*2*(M/2+1)*sizeof(double)); + + int howmany = M; + P->planr = fftw_plan_many_r2r(rank, n, howmany, P->Y, inembed, istride, idist, P->Y, onembed, ostride, odist, kind[0], flags); + + n[0] = M; + idist = odist = 1; + istride = ostride = N; + howmany = N; + double * Z = fftw_malloc(N*M*sizeof(double)); + if (kind[1][0] == FFTW_HC2R) + P->plantheta = fftw_plan_many_dft_c2r(rank, n, howmany, (fftw_complex *) P->Y, inembed, istride, idist, Z, onembed, ostride, odist, flags); + else if (kind[1][0] == FFTW_R2HC) + P->plantheta = fftw_plan_many_dft_r2c(rank, n, howmany, Z, inembed, istride, idist, (fftw_complex *) P->Y, onembed, ostride, odist, flags); + fftw_free(Z); + return P; +} + +ft_annulus_fftw_plan * ft_plan_annulus_synthesis(const int N, const int M, const double rho, const unsigned flags) { + const fftw_r2r_kind kind[2][1] = {{FFTW_REDFT01}, {FFTW_HC2R}}; + return ft_plan_annulus_with_kind(N, M, rho, kind, flags); +} + +ft_annulus_fftw_plan * ft_plan_annulus_analysis(const int N, const int M, const double rho, const unsigned flags) { + const fftw_r2r_kind kind[2][1] = {{FFTW_REDFT10}, {FFTW_R2HC}}; + return ft_plan_annulus_with_kind(N, M, rho, kind, flags); +} + +void ft_execute_annulus_synthesis(const char TRANS, const ft_annulus_fftw_plan * P, double * X, const int N, const int M) { + if (TRANS == 'N') { + for (int j = 0; j < M; j++) + X[j*N] *= 2.0; + fftw_execute_r2r(P->planr, X, X); + double * w = P->w; + for (int j = 1; j < M; j += 4) + for (int i = 0; i < N; i++) { + X[i+j*N] *= w[i]; + X[i+(j+1)*N] *= w[i]; + } + for (int i = 0; i < N*M; i++) + X[i] *= M_1_4_SQRT_PI; + for (int i = 0; i < N; i++) + X[i] *= M_SQRT2; + data_r2c(X, P->Y, N, M); + fftw_execute_dft_c2r(P->plantheta, (fftw_complex *) P->Y, X); + } + else if (TRANS == 'T') { + fftw_execute_dft_r2c(P->plantheta, X, (fftw_complex *) P->Y); + data_c2r(X, P->Y, N, M); + for (int i = 0; i < N*M; i++) + X[i] *= M_1_2_SQRT_PI; + for (int i = 0; i < N; i++) + X[i] *= M_SQRT1_2; + double * w = P->w; + for (int j = 1; j < M; j += 4) + for (int i = 0; i < N; i++) { + X[i+j*N] *= w[i]; + X[i+(j+1)*N] *= w[i]; + } + fftw_execute_r2r(P->planr, X, X); + } +} + +void ft_execute_annulus_analysis(const char TRANS, const ft_annulus_fftw_plan * P, double * X, const int N, const int M) { + if (TRANS == 'N') { + fftw_execute_dft_r2c(P->plantheta, X, (fftw_complex *) P->Y); + data_c2r(X, P->Y, N, M); + for (int i = 0; i < N*M; i++) + X[i] *= M_4_SQRT_PI/(2*N*M); + for (int i = 0; i < N; i++) + X[i] *= M_SQRT1_2; + double * w = P->w; + for (int j = 1; j < M; j += 4) + for (int i = 0; i < N; i++) { + X[i+j*N] /= w[i]; + X[i+(j+1)*N] /= w[i]; + } + fftw_execute_r2r(P->planr, X, X); + for (int j = 0; j < M; j++) + X[j*N] *= 0.5; + } + else if (TRANS == 'T') { + fftw_execute_r2r(P->planr, X, X); + double * w = P->w; + for (int j = 1; j < M; j += 4) + for (int i = 0; i < N; i++) { + X[i+j*N] /= w[i]; + X[i+(j+1)*N] /= w[i]; + } + for (int i = 0; i < N*M; i++) + X[i] *= M_2_SQRT_PI/(2*N*M); + for (int i = 0; i < N; i++) + X[i] *= M_SQRT2; + data_r2c(X, P->Y, N, M); + fftw_execute_dft_c2r(P->plantheta, (fftw_complex *) P->Y, X); + } +} + void ft_destroy_rectdisk_fftw_plan(ft_rectdisk_fftw_plan * P) { fftw_destroy_plan(P->planx1); fftw_destroy_plan(P->planx2); diff --git a/src/hierarchical_source.h b/src/hierarchical_source.h index 90dd9a00..efd2c5d6 100644 --- a/src/hierarchical_source.h +++ b/src/hierarchical_source.h @@ -108,7 +108,7 @@ FLT X(coulombkernel)(FLT x, FLT y); FLT X(coulombprimekernel)(FLT x, FLT y); FLT X(logkernel)(FLT x, FLT y); -static FLT X(diff)(FLT x, FLT y); +//static FLT X(diff)(FLT x, FLT y); FLT X(cauchykernel2)(FLT x, FLT ylo, FLT yhi); FLT X(coulombkernel2)(FLT x, FLT ylo, FLT yhi); diff --git a/src/rotations.c b/src/rotations.c index b526ca91..282acf17 100644 --- a/src/rotations.c +++ b/src/rotations.c @@ -167,8 +167,8 @@ void ft_kernel_tri_lo2hi(const ft_rotation_plan * RP, const int m1, const int m2 kernel_tri_lo2hi_default(RP, m1, m2, A, S); } -#define sd(l,m) s[l+(m)*n-(m)/2*((m)+1)/2] -#define cd(l,m) c[l+(m)*n-(m)/2*((m)+1)/2] +#define sd(l,m) s[l+(m)*n-((m)/2)*(((m)+1)/2)] +#define cd(l,m) c[l+(m)*n-((m)/2)*(((m)+1)/2)] ft_rotation_plan * ft_plan_rotdisk(const int n, const double alpha, const double beta) { double * s = malloc(n*n * sizeof(double)); @@ -189,6 +189,76 @@ ft_rotation_plan * ft_plan_rotdisk(const int n, const double alpha, const double return RP; } +// Orthonormal semi-classical Jacobi hierarchy with respect to w(x) = (1-x)^β (1+x)^α (t+x)^(γ+m). +// If t = 1, then the sines and cosines should match (1-x)^β (1+x)^(α+γ+m), which is the rotdisk with (α+γ, β). +ft_rotation_plan * ft_plan_rotannulus(const int n, const double alpha, const double beta, const double gamma, const double rho) { + double * s = malloc(n*n * sizeof(double)); + double * c = malloc(n*n * sizeof(double)); + double t = 1.0 + 2.0*rho*rho/(1.0-rho*rho); + if (gamma == 0.0) { + ft_modified_plan * P = malloc(sizeof(ft_modified_plan)); + P->nu = 1; + P->nv = 0; + ft_banded * XP = ft_create_jacobi_multiplication(1, n+1, n+1, beta, alpha); + ft_symmetric_tridiagonal * JP = ft_convert_banded_to_symmetric_tridiagonal(XP); + for (int i = 0; i < n+1; i++) + JP->a[i] += t; + for (int m = 0; m < 2*n-1; m += 2) { + ft_symmetric_tridiagonal_qr * QR = ft_symmetric_tridiagonal_qrfact(JP); + for (int l = 0; l < n-(m+1)/2; l++) { + sd(l, m) = QR->s[l]; + cd(l, m) = QR->c[l]; + } + P->R = QR->R; + P->n = QR->n; + ft_symmetric_tridiagonal * JP2 = ft_execute_jacobi_similarity(P, JP); + ft_destroy_symmetric_tridiagonal(JP); + JP = JP2; + QR->R = P->R; + ft_destroy_symmetric_tridiagonal_qr(QR); + } + ft_destroy_symmetric_tridiagonal(JP); + XP = ft_create_jacobi_multiplication(1, n+1, n+1, beta, alpha); + JP = ft_convert_banded_to_symmetric_tridiagonal(XP); + for (int i = 0; i < n+1; i++) + JP->a[i] += t; + XP = ft_create_jacobi_multiplication(1, n+1, n+1, beta, alpha); + for (int i = 0; i < n+1; i++) + ft_set_banded_index(XP, t + ft_get_banded_index(XP, i, i), i, i); + ft_banded_cholfact(XP); + P->n = XP->n; + P->R = ft_convert_banded_to_triangular_banded(XP); + ft_symmetric_tridiagonal * JP1 = ft_execute_jacobi_similarity(P, JP); + ft_destroy_symmetric_tridiagonal(JP); + ft_destroy_triangular_banded(P->R); + JP = JP1; + for (int m = 1; m < 2*n-1; m += 2) { + ft_symmetric_tridiagonal_qr * QR = ft_symmetric_tridiagonal_qrfact(JP); + for (int l = 0; l < n-(m+1)/2; l++) { + sd(l, m) = QR->s[l]; + cd(l, m) = QR->c[l]; + } + P->R = QR->R; + P->n = QR->n; + ft_symmetric_tridiagonal * JP2 = ft_execute_jacobi_similarity(P, JP); + ft_destroy_symmetric_tridiagonal(JP); + JP = JP2; + QR->R = P->R; + ft_destroy_symmetric_tridiagonal_qr(QR); + } + ft_destroy_symmetric_tridiagonal(JP); + free(P); + } + else { + warning("plan_rotannulus: γ≠0 not implemented."); + } + ft_rotation_plan * RP = malloc(sizeof(ft_rotation_plan)); + RP->s = s; + RP->c = c; + RP->n = n; + return RP; +} + void ft_kernel_disk_hi2lo(const ft_rotation_plan * RP, const int m1, const int m2, double * A, const int S) { kernel_disk_hi2lo_default(RP, m1, m2, A, S); } diff --git a/src/rotations/rotations.h b/src/rotations/rotations.h index dcf6b2c3..157fb523 100644 --- a/src/rotations/rotations.h +++ b/src/rotations/rotations.h @@ -6,8 +6,8 @@ #define s(l,m) s[l+(m)*(2*n+1-(m))/2] #define c(l,m) c[l+(m)*(2*n+1-(m))/2] -#define sd(l,m) s[l+(m)*n-(m)/2*((m)+1)/2] -#define cd(l,m) c[l+(m)*n-(m)/2*((m)+1)/2] +#define sd(l,m) s[l+(m)*n-((m)/2)*(((m)+1)/2)] +#define cd(l,m) c[l+(m)*n-((m)/2)*(((m)+1)/2)] #define s1(l,m) s1[l+(m)*n] #define c1(l,m) c1[l+(m)*n] diff --git a/src/transforms_source.c b/src/transforms_source.c index aabb7bbd..cf1a7d29 100644 --- a/src/transforms_source.c +++ b/src/transforms_source.c @@ -212,13 +212,13 @@ X(btb_eigen_FMM) * X(plan_associated_hermite_to_hermite)(const int norm1, const } X(modified_plan) * X(plan_modified_jacobi_to_jacobi)(const int n, const FLT alpha, const FLT beta, const int nu, const FLT * u, const int nv, const FLT * v, const int verbose) { - X(plan_modified)(n, X(operator_normalized_jacobi_clenshaw), (X(cop_params)) {alpha, beta}, nu, u, nv, v, verbose); + return X(plan_modified)(n, X(operator_normalized_jacobi_clenshaw), (X(cop_params)) {alpha, beta}, nu, u, nv, v, verbose); } X(modified_plan) * X(plan_modified_laguerre_to_laguerre)(const int n, const FLT alpha, const int nu, const FLT * u, const int nv, const FLT * v, const int verbose) { - X(plan_modified)(n, X(operator_normalized_laguerre_clenshaw), (X(cop_params)) {alpha, 0}, nu, u, nv, v, verbose); + return X(plan_modified)(n, X(operator_normalized_laguerre_clenshaw), (X(cop_params)) {alpha, 0}, nu, u, nv, v, verbose); } X(modified_plan) * X(plan_modified_hermite_to_hermite)(const int n, const int nu, const FLT * u, const int nv, const FLT * v, const int verbose) { - X(plan_modified)(n, X(operator_normalized_hermite_clenshaw), (X(cop_params)) {0, 0}, nu, u, nv, v, verbose); + return X(plan_modified)(n, X(operator_normalized_hermite_clenshaw), (X(cop_params)) {0, 0}, nu, u, nv, v, verbose); } diff --git a/test/test_banded_source.c b/test/test_banded_source.c index de05514e..acabba2f 100644 --- a/test/test_banded_source.c +++ b/test/test_banded_source.c @@ -549,7 +549,7 @@ void Y(test_banded)(int * checksum) { X(inner_test_banded)(checksum, n); if (sizeof(FLT) == sizeof(double)) { printf("\n\n"); - for (int n = 1024; n < 131072; n *= 2) { + for (int n = 1024; n < 32768; n *= 2) { X(inner_timing_test_banded_FMM)(checksum, n); X(inner_timing_test_banded_ADI)(checksum, n); } diff --git a/test/test_dprk_source.c b/test/test_dprk_source.c index a3a9ccae..8a9177f8 100644 --- a/test/test_dprk_source.c +++ b/test/test_dprk_source.c @@ -64,7 +64,6 @@ void Y(test_dprk)(int * checksum) { printf("---------------------------------------------------------|----------------------\n"); int n = 256; - struct timeval start, end; X(test_permute)(checksum); diff --git a/test/test_drivers.c b/test/test_drivers.c index 4893d7e4..4d93e720 100644 --- a/test/test_drivers.c +++ b/test/test_drivers.c @@ -17,8 +17,8 @@ int main(int argc, const char * argv[]) { ft_spin_rotation_plan * SRP; ft_harmonic_plan * P; ft_spin_harmonic_plan * SP; - //double alpha = -0.5, beta = -0.5, gamma = -0.5, delta = -0.5; // best case scenario - double alpha = 0.0, beta = 0.0, gamma = 0.0, delta = 0.0; // not as good. perhaps better to transform to second kind Chebyshev + //double alpha = -0.5, beta = -0.5, gamma = -0.5, delta = -0.5, rho = sqrt(1.0/3.0); // best case scenario + double alpha = 0.0, beta = 0.0, gamma = 0.0, delta = 0.0, rho = sqrt(1.0/3.0); // not as good. perhaps better to transform to second kind Chebyshev int IERR, ITIME, J, N, L, M, NTIMES; @@ -982,6 +982,200 @@ int main(int argc, const char * argv[]) { } printf("];\n"); + printf("\nTesting the accuracy of annulus drivers.\n\n"); + printf("\t\t\t Test \t\t\t\t | Relative Error\n"); + printf("---------------------------------------------------------|----------------------\n"); + for (int i = 0; i < IERR; i++) { + N = 64*pow(2, i)+J; + M = 4*N-3; + + A = diskones(N, M); + Ac = aligned_copymat(A, N, M); + B = copymat(A, N, M); + + RP = ft_plan_rotannulus(N, alpha, beta, 0.0, 0.0); + RP1 = ft_plan_rotdisk(N, alpha, beta); + + err = hypot(ft_norm_2arg(RP->s, RP1->s, N*N), ft_norm_2arg(RP->c, RP1->c, N*N))/hypot(ft_norm_1arg(RP1->s, N*N), ft_norm_1arg(RP1->c, N*N)); + printf("ϵ_2 Givens rotations \t (N×N) = (%5ix%5i): \t |%20.2e ", N, N, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = MAX(ft_normInf_2arg(RP->s, RP1->s, N*N), ft_normInf_2arg(RP->c, RP1->c, N*N))/MAX(ft_normInf_1arg(RP1->s, N*N), ft_normInf_1arg(RP1->c, N*N)); + printf("ϵ_∞ Givens rotations \t (N×N) = (%5ix%5i): \t |%20.2e ", N, N, err); + ft_checktest(err, 5*N, &checksum); + ft_destroy_rotation_plan(RP); + ft_destroy_rotation_plan(RP1); + + RP = ft_plan_rotannulus(N, alpha, beta, 0.0, rho); + + execute_disk_hi2lo_default(RP, A, M); + execute_disk_lo2hi_default(RP, A, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 default-default \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ default-default \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + #if defined(__i386__) || defined(__x86_64__) + execute_disk_hi2lo_SSE2(RP, A, Ac, M); + execute_disk_lo2hi_default(RP, A, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 SSE2-default \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ SSE2-default \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + execute_disk_hi2lo_default(RP, A, M); + execute_disk_lo2hi_SSE2(RP, A, Ac, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 default-SSE2 \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ default-SSE2 \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + execute_disk_hi2lo_AVX(RP, A, Ac, M); + execute_disk_lo2hi_SSE2(RP, A, Ac, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 AVX-SSE2 \t\t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ AVX-SSE2 \t\t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + execute_disk_hi2lo_SSE2(RP, A, Ac, M); + execute_disk_lo2hi_AVX(RP, A, Ac, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 SSE2-AVX \t\t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ SSE2-AVX \t\t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + execute_disk_hi2lo_AVX_FMA(RP, A, Ac, M); + execute_disk_lo2hi_AVX(RP, A, Ac, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 AVX_FMA-AVX \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ AVX_FMA-AVX \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + execute_disk_hi2lo_AVX(RP, A, Ac, M); + execute_disk_lo2hi_AVX_FMA(RP, A, Ac, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 AVX-AVX_FMA \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ AVX-AVX_FMA \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + execute_disk_hi2lo_AVX512F(RP, A, Ac, M); + execute_disk_lo2hi_AVX_FMA(RP, A, Ac, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 AVX512F-AVX_FMA \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ AVX512F-AVX_FMA \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + execute_disk_hi2lo_AVX_FMA(RP, A, Ac, M); + execute_disk_lo2hi_AVX512F(RP, A, Ac, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 AVX_FMA-AVX512F \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ AVX_FMA-AVX512F \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + #elif defined(__aarch64__) + execute_disk_hi2lo_NEON(RP, A, Ac, M); + execute_disk_lo2hi_default(RP, A, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 NEON-default \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ NEON-default \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + + execute_disk_hi2lo_default(RP, A, M); + execute_disk_lo2hi_NEON(RP, A, Ac, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 default-NEON \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ default-NEON \t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 3*N, &checksum); + #endif + + free(A); + VFREE(Ac); + free(B); + ft_destroy_rotation_plan(RP); + } + + printf("\nTesting the accuracy of annulus transforms.\n\n"); + printf("\t\t\t Test \t\t\t\t | Relative Error\n"); + printf("---------------------------------------------------------|----------------------\n"); + for (int i = 0; i < IERR; i++) { + N = 64*pow(2, i)+J; + M = 4*N-3; + + A = diskrand(N, M); + B = copymat(A, N, M); + P = ft_plan_ann2cxf(N, alpha, beta, 0.0, rho); + + ft_execute_ann2cxf('N', P, A, N, M); + ft_execute_cxf2ann('N', P, A, N, M); + + ft_execute_ann2cxf('T', P, A, N, M); + ft_execute_cxf2ann('T', P, A, N, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 \t\t\t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 12*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ \t\t\t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 6*N, &checksum); + + free(A); + free(B); + ft_destroy_harmonic_plan(P); + } + + printf("\nTiming annulus transforms.\n\n"); + printf("t9 = [\n"); + for (int i = 0; i < ITIME; i++) { + N = 64*pow(2, i)+J; + M = 4*N-3; + NTIMES = 1 + pow(2048/N, 2); + + A = diskrand(N, M); + P = ft_plan_ann2cxf(N, alpha, beta, 0.0, rho); + + FT_TIME(ft_execute_ann2cxf('N', P, A, N, M), start, end, NTIMES) + printf("%d %.6f", N, elapsed(&start, &end, NTIMES)); + + FT_TIME(ft_execute_cxf2ann('N', P, A, N, M), start, end, NTIMES) + printf(" %.6f", elapsed(&start, &end, NTIMES)); + + printf("\n"); + free(A); + ft_destroy_harmonic_plan(P); + } + printf("];\n"); + printf("\nTesting the accuracy of Dunkl-Xu drivers.\n\n"); printf("\t\t\t Test \t\t\t\t | Relative Error\n"); printf("---------------------------------------------------------|----------------------\n"); @@ -1011,7 +1205,7 @@ int main(int argc, const char * argv[]) { } printf("\nTiming Dunkl-Xu drivers.\n\n"); - printf("t9 = [\n"); + printf("t10 = [\n"); for (int i = 0; i < ITIME; i++) { N = 64*pow(2, i)+J; M = N; @@ -1064,7 +1258,7 @@ int main(int argc, const char * argv[]) { } printf("\nTiming Dunkl-Xu transforms.\n\n"); - printf("t10 = [\n"); + printf("t11 = [\n"); for (int i = 0; i < ITIME; i++) { N = 64*pow(2, i)+J; M = N; @@ -1177,7 +1371,7 @@ int main(int argc, const char * argv[]) { } printf("\nTiming Proriol³ drivers.\n\n"); - printf("t11 = [\n"); + printf("t12 = [\n"); for (int i = 0; i < ITIME; i++) { N = 16*pow(2, i)+J; L = M = N; @@ -1258,7 +1452,7 @@ int main(int argc, const char * argv[]) { } printf("\nTiming Proriol³ transforms.\n\n"); - printf("t12 = [\n"); + printf("t13 = [\n"); for (int i = 0; i < ITIME; i++) { N = 16*pow(2, i)+J; L = M = N; @@ -1395,7 +1589,7 @@ int main(int argc, const char * argv[]) { } printf("\nTiming spin-weighted spherical harmonic drivers.\n\n"); - printf("t13 = [\n"); + printf("t14 = [\n"); for (int i = 0; i < ITIME; i++) { N = 64*pow(2, i)+J; M = 2*N-1; @@ -1483,7 +1677,7 @@ int main(int argc, const char * argv[]) { } printf("\nTiming spin-weighted spherical harmonic transforms.\n\n"); - printf("t14 = [\n"); + printf("t15 = [\n"); for (int i = 0; i < ITIME; i++) { N = 64*pow(2, i)+J; M = 2*N-1; diff --git a/test/test_elliptic_source.c b/test/test_elliptic_source.c index c7f9caaa..2a7b4fab 100644 --- a/test/test_elliptic_source.c +++ b/test/test_elliptic_source.c @@ -70,7 +70,6 @@ void Y(test_elliptic)(int * checksum) { FLT y = 2.0; FLT sny, cny, dny; X(jacobian_elliptic_functions)(y, k, &sny, &cny, &dny, FT_SN | FT_CN | FT_DN); - FLT xy = x+y; FLT snxy, cnxy, dnxy; X(jacobian_elliptic_functions)(x+y, k, &snxy, &cnxy, &dnxy, FT_SN | FT_CN | FT_DN); err = Y(fabs)(snxy-(snx*cny*dny+sny*cnx*dnx)/(1-k*k*snx*snx*sny*sny))/Y(fabs)(snxy); diff --git a/test/test_fftw.c b/test/test_fftw.c index 4d6adf87..748feb99 100644 --- a/test/test_fftw.c +++ b/test/test_fftw.c @@ -13,11 +13,12 @@ int main(int argc, const char * argv[]) { ft_sphere_fftw_plan * PS, * PA; ft_triangle_fftw_plan * QS, * QA; ft_disk_fftw_plan * RS, * RA; + ft_annulus_fftw_plan * RS1, * RA1; ft_rectdisk_fftw_plan * SS, * SA; ft_tetrahedron_fftw_plan * TS, * TA; ft_spinsphere_fftw_plan * US, * UA; - //double alpha = -0.5, beta = -0.5, gamma = -0.5, delta = -0.5; // best case scenario - double alpha = 0.0, beta = 0.0, gamma = 0.0, delta = 0.0; // not as good. perhaps better to transform to second kind Chebyshev + //double alpha = -0.5, beta = -0.5, gamma = -0.5, delta = -0.5, rho = sqrt(1.0/3.0); // best case scenario + double alpha = 0.0, beta = 0.0, gamma = 0.0, delta = 0.0, rho = sqrt(1.0/3.0); // not as good. perhaps better to transform to second kind Chebyshev int IERR, ITIME, J, N, L, M, NTIMES; @@ -284,6 +285,68 @@ int main(int argc, const char * argv[]) { } printf("];\n"); + printf("\nTesting the accuracy of annulus transforms + FFTW synthesis and analysis.\n\n"); + printf("\t\t\t Test \t\t\t\t | Relative Error\n"); + printf("---------------------------------------------------------|----------------------\n"); + for (int i = 0; i < IERR; i++) { + N = 64*pow(2, i)+J; + M = 4*N-3; + + A = diskrand(N, M); + B = copymat(A, N, M); + P = ft_plan_ann2cxf(N, alpha, beta, 0.0, rho); + RS1 = ft_plan_annulus_synthesis(N, M, rho, FT_FFTW_FLAGS); + RA1 = ft_plan_annulus_analysis(N, M, rho, FT_FFTW_FLAGS); + + ft_execute_ann2cxf('N', P, A, N, M); + ft_execute_annulus_synthesis('N', RS1, A, N, M); + ft_execute_annulus_analysis('N', RA1, A, N, M); + ft_execute_cxf2ann('N', P, A, N, M); + + ft_execute_ann2cxf('T', P, A, N, M); + ft_execute_annulus_synthesis('T', RA1, A, N, M); + ft_execute_annulus_analysis('T', RS1, A, N, M); + ft_execute_cxf2ann('T', P, A, N, M); + + err = ft_norm_2arg(A, B, N*M)/ft_norm_1arg(B, N*M); + printf("ϵ_2 \t\t\t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 8*sqrt(N), &checksum); + err = ft_normInf_2arg(A, B, N*M)/ft_normInf_1arg(B, N*M); + printf("ϵ_∞ \t\t\t (N×M) = (%5ix%5i): \t |%20.2e ", N, M, err); + ft_checktest(err, 4*N, &checksum); + + free(A); + free(B); + ft_destroy_harmonic_plan(P); + ft_destroy_annulus_fftw_plan(RS1); + ft_destroy_annulus_fftw_plan(RA1); + } + + printf("\nTiming annulus transforms + FFTW synthesis and analysis.\n\n"); + printf("t4 = [\n"); + for (int i = 0; i < ITIME; i++) { + N = 64*pow(2, i)+J; + M = 4*N-3; + NTIMES = 1 + pow(2048/N, 2); + + A = diskrand(N, M); + P = ft_plan_ann2cxf(N, alpha, beta, 0.0, rho); + RS1 = ft_plan_annulus_synthesis(N, M, rho, FT_FFTW_FLAGS); + RA1 = ft_plan_annulus_analysis(N, M, rho, FT_FFTW_FLAGS); + + FT_TIME({ft_execute_ann2cxf('N', P, A, N, M); ft_execute_annulus_synthesis('N', RS1, A, N, M);}, start, end, NTIMES) + printf("%d %.6f", N, elapsed(&start, &end, NTIMES)); + + FT_TIME({ft_execute_annulus_analysis('N', RA1, A, N, M); ft_execute_cxf2ann('N', P, A, N, M);}, start, end, NTIMES) + printf(" %.6f\n", elapsed(&start, &end, NTIMES)); + + free(A); + ft_destroy_harmonic_plan(P); + ft_destroy_annulus_fftw_plan(RS1); + ft_destroy_annulus_fftw_plan(RA1); + } + printf("];\n"); + printf("\nTesting the accuracy of Dunkl-Xu transforms + FFTW synthesis and analysis.\n\n"); printf("\t\t\t Test \t\t\t\t | Relative Error\n"); printf("---------------------------------------------------------|----------------------\n"); diff --git a/test/test_recurrence.c b/test/test_recurrence.c index 66ed8b74..28dac4a5 100644 --- a/test/test_recurrence.c +++ b/test/test_recurrence.c @@ -429,7 +429,6 @@ int main(void) { float * x = malloc(m*sizeof(float)); float * phi0 = malloc(m*sizeof(float)); float * f = malloc(m*sizeof(float)); - float err = 0.0f; for (int k = 0; k < n; k++) { c[k] = 1.0f/(k+1.0f); A[k] = (2*k+1.0f)/(k+1.0f); @@ -545,7 +544,6 @@ int main(void) { double * x = malloc(m*sizeof(double)); double * phi0 = malloc(m*sizeof(double)); double * f = malloc(m*sizeof(double)); - double err = 0.0; for (int k = 0; k < n; k++) { c[k] = 1.0/(k+1.0); A[k] = (2*k+1.0)/(k+1.0); diff --git a/test/test_tdc_source.c b/test/test_tdc_source.c index cd0f8c88..69c4f441 100644 --- a/test/test_tdc_source.c +++ b/test/test_tdc_source.c @@ -3,7 +3,6 @@ void Y(test_tdc)(int * checksum) { printf("---------------------------------------------------------|----------------------\n"); int n = 256; - struct timeval start, end; int mu = n/2; int m = 0; diff --git a/test/test_tdc_source2.c b/test/test_tdc_source2.c index df9bc41e..5cc59174 100644 --- a/test/test_tdc_source2.c +++ b/test/test_tdc_source2.c @@ -7,10 +7,6 @@ void X(inner_test_tdc_drop_precision)(int * checksum, int n) { int mu = n/2; int m = 0; char PARITY = 'E'; - int shft; - if (PARITY == 'E') shft = 0; - else if (PARITY == 'O') shft = 1; - else shft = -1; X(symmetric_tridiagonal) * T = X(create_A_shtsdtev)(n, mu, m, PARITY); X(symmetric_tridiagonal) * S = X(create_B_shtsdtev)(n, m, PARITY); X2(symmetric_tridiagonal) * T2 = X2(create_A_shtsdtev)(n, mu, m, PARITY); diff --git a/test/test_transforms_mpfr.c b/test/test_transforms_mpfr.c index 13cc5819..8de57f5a 100644 --- a/test/test_transforms_mpfr.c +++ b/test/test_transforms_mpfr.c @@ -52,7 +52,7 @@ void ft_mpfr_checktest(mpfr_t * err, double cst, int * checksum, mpfr_prec_t pre } void test_transforms_mpfr(int * checksum, int N, mpfr_prec_t prec, mpfr_rnd_t rnd) { - mpfr_t err, t1, t2, t3; + mpfr_t err, t1, t2; mpfr_init2(err, prec); mpfr_init2(t1, prec); mpfr_init2(t2, prec); diff --git a/test/test_transforms_source.c b/test/test_transforms_source.c index 8ea58ab4..276be2e7 100644 --- a/test/test_transforms_source.c +++ b/test/test_transforms_source.c @@ -397,7 +397,6 @@ void Y(test_modified_transforms)(int * checksum, int N) { X(checktest)(err, Y(pow)(n+1, 2), checksum); X(destroy_symmetric_tridiagonal)(JQ); X(destroy_modified_plan)(P); - /* P = X(plan_modified_jacobi_to_jacobi)(n, alpha, beta, 4, u, 0, NULL, 0); X(mpsm)('N', P, DP, n, n); X(mpmm)('N', P, IDP, n, n); @@ -440,12 +439,10 @@ void Y(test_modified_transforms)(int * checksum, int N) { err = X(norm_2arg)(DP, Id, n*n)/X(norm_1arg)(Id, n*n); printf("Rational vs. raised recip. polynomial weight \t n = %3i |%20.2e ", n, (double) err); X(checktest)(err, Y(pow)(n+1, 3), checksum); - */ free(Id); free(DP); free(IDP); } - printf("\nTesting the accuracy of modified Laguerre--Laguerre transforms.\n\n"); printf("\t\t\t Test \t\t\t\t | 2-norm Relative Error\n"); printf("---------------------------------------------------------|----------------------\n"); @@ -519,7 +516,6 @@ void Y(test_modified_transforms)(int * checksum, int N) { free(DP); free(IDP); } - printf("\nTesting the accuracy of modified Hermite--Hermite transforms.\n\n"); printf("\t\t\t Test \t\t\t\t | 2-norm Relative Error\n"); printf("---------------------------------------------------------|----------------------\n");