/* * Copyright (c) 1997-1999, 2003 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ /* $Id: fftwnd.c,v 1.1 2008/10/17 06:13:18 scuri Exp $ */ #include "fftw-int.h" /* the number of buffers to use for buffered transforms: */ #define FFTWND_NBUFFERS 8 /* the default number of buffers to use: */ #define FFTWND_DEFAULT_NBUFFERS 0 /* the number of "padding" elements between consecutive buffer lines */ #define FFTWND_BUFFER_PADDING 8 static void destroy_plan_array(int rank, fftw_plan *plans); static void init_test_array(fftw_complex *arr, int stride, int n) { int j; for (j = 0; j < n; ++j) { c_re(arr[stride * j]) = 0.0; c_im(arr[stride * j]) = 0.0; } } /* * Same as fftw_measure_runtime, except for fftwnd plan. */ double fftwnd_measure_runtime(fftwnd_plan plan, fftw_complex *in, int istride, fftw_complex *out, int ostride) { fftw_time begin, end, start; double t, tmax, tmin; int i, iter; int n; int repeat; if (plan->rank == 0) return 0.0; n = 1; for (i = 0; i < plan->rank; ++i) n *= plan->n[i]; iter = 1; for (;;) { tmin = 1.0E10; tmax = -1.0E10; init_test_array(in, istride, n); start = fftw_get_time(); /* repeat the measurement FFTW_TIME_REPEAT times */ for (repeat = 0; repeat < FFTW_TIME_REPEAT; ++repeat) { begin = fftw_get_time(); for (i = 0; i < iter; ++i) { fftwnd(plan, 1, in, istride, 0, out, ostride, 0); } end = fftw_get_time(); t = fftw_time_to_sec(fftw_time_diff(end, begin)); if (t < tmin) tmin = t; if (t > tmax) tmax = t; /* do not run for too long */ t = fftw_time_to_sec(fftw_time_diff(end, start)); if (t > FFTW_TIME_LIMIT) break; } if (tmin >= FFTW_TIME_MIN) break; iter *= 2; } tmin /= (double) iter; tmax /= (double) iter; return tmin; } /********************** Initializing the FFTWND Plan ***********************/ /* Initialize everything except for the 1D plans and the work array: */ fftwnd_plan fftwnd_create_plan_aux(int rank, const int *n, fftw_direction dir, int flags) { int i; fftwnd_plan p; if (rank < 0) return 0; for (i = 0; i < rank; ++i) if (n[i] <= 0) return 0; p = (fftwnd_plan) fftw_malloc(sizeof(fftwnd_data)); p->n = 0; p->n_before = 0; p->n_after = 0; p->plans = 0; p->work = 0; p->dir = dir; p->rank = rank; p->is_in_place = flags & FFTW_IN_PLACE; p->nwork = 0; p->nbuffers = 0; if (rank == 0) return 0; p->n = (int *) fftw_malloc(sizeof(int) * rank); p->n_before = (int *) fftw_malloc(sizeof(int) * rank); p->n_after = (int *) fftw_malloc(sizeof(int) * rank); p->n_before[0] = 1; p->n_after[rank - 1] = 1; for (i = 0; i < rank; ++i) { p->n[i] = n[i]; if (i) { p->n_before[i] = p->n_before[i - 1] * n[i - 1]; p->n_after[rank - 1 - i] = p->n_after[rank - i] * n[rank - i]; } } return p; } /* create an empty new array of rank 1d plans */ fftw_plan *fftwnd_new_plan_array(int rank) { fftw_plan *plans; int i; plans = (fftw_plan *) fftw_malloc(rank * sizeof(fftw_plan)); if (!plans) return 0; for (i = 0; i < rank; ++i) plans[i] = 0; return plans; } /* * create an array of plans using the ordinary 1d fftw_create_plan, * which allocates its own array and creates plans optimized for * contiguous data. */ fftw_plan *fftwnd_create_plans_generic(fftw_plan *plans, int rank, const int *n, fftw_direction dir, int flags) { if (rank <= 0) return 0; if (plans) { int i, j; int cur_flags; for (i = 0; i < rank; ++i) { if (i < rank - 1 || (flags & FFTW_IN_PLACE)) { /* * fft's except the last dimension are always in-place */ cur_flags = flags | FFTW_IN_PLACE; for (j = i - 1; j >= 0 && n[i] != n[j]; --j); } else { cur_flags = flags; /* * we must create a separate plan for the last * dimension */ j = -1; } if (j >= 0) { /* * If a plan already exists for this size * array, reuse it: */ plans[i] = plans[j]; } else { /* generate a new plan: */ plans[i] = fftw_create_plan(n[i], dir, cur_flags); if (!plans[i]) { destroy_plan_array(rank, plans); return 0; } } } } return plans; } static int get_maxdim(int rank, const int *n, int flags) { int i; int maxdim = 0; for (i = 0; i < rank - 1; ++i) if (n[i] > maxdim) maxdim = n[i]; if (rank > 0 && flags & FFTW_IN_PLACE && n[rank - 1] > maxdim) maxdim = n[rank - 1]; return maxdim; } /* compute number of elements required for work array (has to be big enough to hold ncopies of the largest dimension in n that will need an in-place transform. */ int fftwnd_work_size(int rank, const int *n, int flags, int ncopies) { return (ncopies * get_maxdim(rank, n, flags) + (ncopies - 1) * FFTWND_BUFFER_PADDING); } /* * create plans using the fftw_create_plan_specific planner, which * allows us to create plans for each dimension that are specialized * for the strides that we are going to use. */ fftw_plan *fftwnd_create_plans_specific(fftw_plan *plans, int rank, const int *n, const int *n_after, fftw_direction dir, int flags, fftw_complex *in, int istride, fftw_complex *out, int ostride) { if (rank <= 0) return 0; if (plans) { int i, stride, cur_flags; fftw_complex *work = 0; int nwork; nwork = fftwnd_work_size(rank, n, flags, 1); if (nwork) work = (fftw_complex*)fftw_malloc(nwork * sizeof(fftw_complex)); for (i = 0; i < rank; ++i) { /* fft's except the last dimension are always in-place */ if (i < rank - 1) cur_flags = flags | FFTW_IN_PLACE; else cur_flags = flags; /* stride for transforming ith dimension */ stride = n_after[i]; if (cur_flags & FFTW_IN_PLACE) plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags, in, istride * stride, work, 1); else plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags, in, istride * stride, out, ostride * stride); if (!plans[i]) { destroy_plan_array(rank, plans); fftw_free(work); return 0; } } if (work) fftw_free(work); } return plans; } /* * Create an fftwnd_plan specialized for specific arrays. (These * arrays are ignored, however, if they are NULL or if the flags do * not include FFTW_MEASURE.) The main advantage of being provided * arrays like this is that we can do runtime timing measurements of * our options, without worrying about allocating excessive scratch * space. */ fftwnd_plan fftwnd_create_plan_specific(int rank, const int *n, fftw_direction dir, int flags, fftw_complex *in, int istride, fftw_complex *out, int ostride) { fftwnd_plan p; if (!(p = fftwnd_create_plan_aux(rank, n, dir, flags))) return 0; if (!(flags & FFTW_MEASURE) || in == 0 || (!p->is_in_place && out == 0)) { /**** use default plan ****/ p->plans = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank), rank, n, dir, flags); if (!p->plans) { fftwnd_destroy_plan(p); return 0; } if (flags & FFTWND_FORCE_BUFFERED) p->nbuffers = FFTWND_NBUFFERS; else p->nbuffers = FFTWND_DEFAULT_NBUFFERS; p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1); if (p->nwork && !(flags & FFTW_THREADSAFE)) { p->work = (fftw_complex*) fftw_malloc(p->nwork * sizeof(fftw_complex)); if (!p->work) { fftwnd_destroy_plan(p); return 0; } } } else { /**** use runtime measurements to pick plan ****/ fftw_plan *plans_buf, *plans_nobuf; double t_buf, t_nobuf; p->nwork = fftwnd_work_size(rank, n, flags, FFTWND_NBUFFERS + 1); if (p->nwork && !(flags & FFTW_THREADSAFE)) { p->work = (fftw_complex*) fftw_malloc(p->nwork * sizeof(fftw_complex)); if (!p->work) { fftwnd_destroy_plan(p); return 0; } } else p->work = (fftw_complex*) NULL; /* two possible sets of 1D plans: */ plans_buf = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank), rank, n, dir, flags); plans_nobuf = fftwnd_create_plans_specific(fftwnd_new_plan_array(rank), rank, n, p->n_after, dir, flags, in, istride, out, ostride); if (!plans_buf || !plans_nobuf) { destroy_plan_array(rank, plans_nobuf); destroy_plan_array(rank, plans_buf); fftwnd_destroy_plan(p); return 0; } /* time the two possible plans */ p->plans = plans_nobuf; p->nbuffers = 0; p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1); t_nobuf = fftwnd_measure_runtime(p, in, istride, out, ostride); p->plans = plans_buf; p->nbuffers = FFTWND_NBUFFERS; p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1); t_buf = fftwnd_measure_runtime(p, in, istride, out, ostride); /* pick the better one: */ if (t_nobuf < t_buf) { /* use unbuffered transform */ p->plans = plans_nobuf; p->nbuffers = 0; /* work array is unnecessarily large */ if (p->work) fftw_free(p->work); p->work = 0; destroy_plan_array(rank, plans_buf); /* allocate a work array of the correct size: */ p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1); if (p->nwork && !(flags & FFTW_THREADSAFE)) { p->work = (fftw_complex*) fftw_malloc(p->nwork * sizeof(fftw_complex)); if (!p->work) { fftwnd_destroy_plan(p); return 0; } } } else { /* use buffered transform */ destroy_plan_array(rank, plans_nobuf); } } return p; } fftwnd_plan fftw2d_create_plan_specific(int nx, int ny, fftw_direction dir, int flags, fftw_complex *in, int istride, fftw_complex *out, int ostride) { int n[2]; n[0] = nx; n[1] = ny; return fftwnd_create_plan_specific(2, n, dir, flags, in, istride, out, ostride); } fftwnd_plan fftw3d_create_plan_specific(int nx, int ny, int nz, fftw_direction dir, int flags, fftw_complex *in, int istride, fftw_complex *out, int ostride) { int n[3]; n[0] = nx; n[1] = ny; n[2] = nz; return fftwnd_create_plan_specific(3, n, dir, flags, in, istride, out, ostride); } /* Create a generic fftwnd plan: */ fftwnd_plan fftwnd_create_plan(int rank, const int *n, fftw_direction dir, int flags) { return fftwnd_create_plan_specific(rank, n, dir, flags, 0, 1, 0, 1); } fftwnd_plan fftw2d_create_plan(int nx, int ny, fftw_direction dir, int flags) { return fftw2d_create_plan_specific(nx, ny, dir, flags, 0, 1, 0, 1); } fftwnd_plan fftw3d_create_plan(int nx, int ny, int nz, fftw_direction dir, int flags) { return fftw3d_create_plan_specific(nx, ny, nz, dir, flags, 0, 1, 0, 1); } /************************ Freeing the FFTWND Plan ************************/ static void destroy_plan_array(int rank, fftw_plan *plans) { if (plans) { int i, j; for (i = 0; i < rank; ++i) { for (j = i - 1; j >= 0 && plans[i] != plans[j]; --j); if (j < 0 && plans[i]) fftw_destroy_plan(plans[i]); } fftw_free(plans); } } void fftwnd_destroy_plan(fftwnd_plan plan) { if (plan) { destroy_plan_array(plan->rank, plan->plans); if (plan->n) fftw_free(plan->n); if (plan->n_before) fftw_free(plan->n_before); if (plan->n_after) fftw_free(plan->n_after); if (plan->work) fftw_free(plan->work); fftw_free(plan); } } /************************ Printing the FFTWND Plan ************************/ void fftwnd_fprint_plan(FILE *f, fftwnd_plan plan) { if (plan) { int i, j; if (plan->rank == 0) { fprintf(f, "plan for rank 0 (null) transform.\n"); return; } fprintf(f, "plan for "); for (i = 0; i < plan->rank; ++i) fprintf(f, "%s%d", i ? "x" : "", plan->n[i]); fprintf(f, " transform:\n"); if (plan->nbuffers > 0) fprintf(f, " -- using buffered transforms (%d buffers)\n", plan->nbuffers); else fprintf(f, " -- using unbuffered transform\n"); for (i = 0; i < plan->rank; ++i) { fprintf(f, "* dimension %d (size %d) ", i, plan->n[i]); for (j = i - 1; j >= 0; --j) if (plan->plans[j] == plan->plans[i]) break; if (j < 0) fftw_fprint_plan(f, plan->plans[i]); else fprintf(f, "plan is same as dimension %d plan.\n", j); } } } void fftwnd_print_plan(fftwnd_plan plan) { fftwnd_fprint_plan(stdout, plan); } /********************* Buffered FFTW (in-place) *********************/ void fftw_buffered(fftw_plan p, int howmany, fftw_complex *in, int istride, int idist, fftw_complex *work, int nbuffers, fftw_complex *buffers) { int i = 0, n, nb; n = p->n; nb = n + FFTWND_BUFFER_PADDING; do { for (; i <= howmany - nbuffers; i += nbuffers) { fftw_complex *cur_in = in + i * idist; int j, buf; /* * First, copy nbuffers strided arrays to the * contiguous buffer arrays (reading consecutive * locations, assuming that idist is 1): */ for (j = 0; j < n; ++j) { fftw_complex *cur_in2 = cur_in + j * istride; fftw_complex *cur_buffers = buffers + j; for (buf = 0; buf <= nbuffers - 4; buf += 4) { fftw_real r0, i0, r1, i1, r2, i2, r3, i3; r0 = c_re(cur_in2[0]); i0 = c_im(cur_in2[0]); r1 = c_re(cur_in2[idist]); i1 = c_im(cur_in2[idist]); r2 = c_re(cur_in2[2 * idist]); i2 = c_im(cur_in2[2 * idist]); r3 = c_re(cur_in2[3 * idist]); i3 = c_im(cur_in2[3 * idist]); c_re(cur_buffers[0]) = r0; c_im(cur_buffers[0]) = i0; c_re(cur_buffers[nb]) = r1; c_im(cur_buffers[nb]) = i1; c_re(cur_buffers[2 * nb]) = r2; c_im(cur_buffers[2 * nb]) = i2; c_re(cur_buffers[3 * nb]) = r3; c_im(cur_buffers[3 * nb]) = i3; cur_buffers += 4 * nb; cur_in2 += 4 * idist; } for (; buf < nbuffers; ++buf) { *cur_buffers = *cur_in2; cur_buffers += nb; cur_in2 += idist; } } /* * Now, compute the FFTs in the buffers (in-place * using work): */ fftw(p, nbuffers, buffers, 1, nb, work, 1, 0); /* * Finally, copy the results back from the contiguous * buffers to the strided arrays (writing consecutive * locations): */ for (j = 0; j < n; ++j) { fftw_complex *cur_in2 = cur_in + j * istride; fftw_complex *cur_buffers = buffers + j; for (buf = 0; buf <= nbuffers - 4; buf += 4) { fftw_real r0, i0, r1, i1, r2, i2, r3, i3; r0 = c_re(cur_buffers[0]); i0 = c_im(cur_buffers[0]); r1 = c_re(cur_buffers[nb]); i1 = c_im(cur_buffers[nb]); r2 = c_re(cur_buffers[2 * nb]); i2 = c_im(cur_buffers[2 * nb]); r3 = c_re(cur_buffers[3 * nb]); i3 = c_im(cur_buffers[3 * nb]); c_re(cur_in2[0]) = r0; c_im(cur_in2[0]) = i0; c_re(cur_in2[idist]) = r1; c_im(cur_in2[idist]) = i1; c_re(cur_in2[2 * idist]) = r2; c_im(cur_in2[2 * idist]) = i2; c_re(cur_in2[3 * idist]) = r3; c_im(cur_in2[3 * idist]) = i3; cur_buffers += 4 * nb; cur_in2 += 4 * idist; } for (; buf < nbuffers; ++buf) { *cur_in2 = *cur_buffers; cur_buffers += nb; cur_in2 += idist; } } } /* * we skip howmany % nbuffers ffts at the end of the loop, * so we have to go back and do them: */ nbuffers = howmany - i; } while (i < howmany); } /********************* Computing the N-Dimensional FFT *********************/ void fftwnd_aux(fftwnd_plan p, int cur_dim, fftw_complex *in, int istride, fftw_complex *out, int ostride, fftw_complex *work) { int n_after = p->n_after[cur_dim], n = p->n[cur_dim]; if (cur_dim == p->rank - 2) { /* just do the last dimension directly: */ if (p->is_in_place) fftw(p->plans[p->rank - 1], n, in, istride, n_after * istride, work, 1, 0); else fftw(p->plans[p->rank - 1], n, in, istride, n_after * istride, out, ostride, n_after * ostride); } else { /* we have at least two dimensions to go */ int i; /* * process the subsequent dimensions recursively, in hyperslabs, * to get maximum locality: */ for (i = 0; i < n; ++i) fftwnd_aux(p, cur_dim + 1, in + i * n_after * istride, istride, out + i * n_after * ostride, ostride, work); } /* do the current dimension (in-place): */ if (p->nbuffers == 0) { fftw(p->plans[cur_dim], n_after, out, n_after * ostride, ostride, work, 1, 0); } else /* using contiguous copy buffers: */ fftw_buffered(p->plans[cur_dim], n_after, out, n_after * ostride, ostride, work, p->nbuffers, work + n); } /* * alternate version of fftwnd_aux -- this version pushes the howmany * loop down to the leaves of the computation, for greater locality in * cases where dist < stride */ void fftwnd_aux_howmany(fftwnd_plan p, int cur_dim, int howmany, fftw_complex *in, int istride, int idist, fftw_complex *out, int ostride, int odist, fftw_complex *work) { int n_after = p->n_after[cur_dim], n = p->n[cur_dim]; int k; if (cur_dim == p->rank - 2) { /* just do the last dimension directly: */ if (p->is_in_place) for (k = 0; k < n; ++k) fftw(p->plans[p->rank - 1], howmany, in + k * n_after * istride, istride, idist, work, 1, 0); else for (k = 0; k < n; ++k) fftw(p->plans[p->rank - 1], howmany, in + k * n_after * istride, istride, idist, out + k * n_after * ostride, ostride, odist); } else { /* we have at least two dimensions to go */ int i; /* * process the subsequent dimensions recursively, in * hyperslabs, to get maximum locality: */ for (i = 0; i < n; ++i) fftwnd_aux_howmany(p, cur_dim + 1, howmany, in + i * n_after * istride, istride, idist, out + i * n_after * ostride, ostride, odist, work); } /* do the current dimension (in-place): */ if (p->nbuffers == 0) for (k = 0; k < n_after; ++k) fftw(p->plans[cur_dim], howmany, out + k * ostride, n_after * ostride, odist, work, 1, 0); else /* using contiguous copy buffers: */ for (k = 0; k < n_after; ++k) fftw_buffered(p->plans[cur_dim], howmany, out + k * ostride, n_after * ostride, odist, work, p->nbuffers, work + n); } void fftwnd(fftwnd_plan p, int howmany, fftw_complex *in, int istride, int idist, fftw_complex *out, int ostride, int odist) { fftw_complex *work; #ifdef FFTW_DEBUG if (p->rank > 0 && (p->plans[0]->flags & FFTW_THREADSAFE) && p->nwork && p->work) fftw_die("bug with FFTW_THREADSAFE flag\n"); #endif if (p->nwork && !p->work) work = (fftw_complex *) fftw_malloc(p->nwork * sizeof(fftw_complex)); else work = p->work; switch (p->rank) { case 0: break; case 1: if (p->is_in_place) /* fft is in-place */ fftw(p->plans[0], howmany, in, istride, idist, work, 1, 0); else fftw(p->plans[0], howmany, in, istride, idist, out, ostride, odist); break; default: /* rank >= 2 */ { if (p->is_in_place) { out = in; ostride = istride; odist = idist; } if (howmany > 1 && odist < ostride) fftwnd_aux_howmany(p, 0, howmany, in, istride, idist, out, ostride, odist, work); else { int i; for (i = 0; i < howmany; ++i) fftwnd_aux(p, 0, in + i * idist, istride, out + i * odist, ostride, work); } } } if (p->nwork && !p->work) fftw_free(work); } void fftwnd_one(fftwnd_plan p, fftw_complex *in, fftw_complex *out) { fftwnd(p, 1, in, 1, 1, out, 1, 1); }