diff options
Diffstat (limited to 'src/fftw3/kernel')
40 files changed, 5400 insertions, 0 deletions
diff --git a/src/fftw3/kernel/align.c b/src/fftw3/kernel/align.c new file mode 100644 index 0000000..7f475d5 --- /dev/null +++ b/src/fftw3/kernel/align.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: align.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +#if HAVE_3DNOW +# define ALGN 8 +#elif HAVE_SIMD +# define ALGN 16 +#elif HAVE_K7 +# define ALGN 8 +#else +# define ALGN (sizeof(R)) +#endif + +/* NONPORTABLE */ +int X(alignment_of)(R *p) +{ + return (int)(((uintptr_t) p) % ALGN); +} diff --git a/src/fftw3/kernel/alloc.c b/src/fftw3/kernel/alloc.c new file mode 100644 index 0000000..a95e0e8 --- /dev/null +++ b/src/fftw3/kernel/alloc.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: alloc.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN +# if defined(HAVE_MALLOC_H) +# include <malloc.h> +# else +extern void *memalign(size_t, size_t); +# endif +#endif + +#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN +extern int posix_memalign(void **, size_t, size_t); +#endif + +#if defined(macintosh) /* MacOS 9 */ +# include <Multiprocessing.h> +#endif + +#define real_malloc X(malloc) +#define real_free free /* memalign and malloc use ordinary free */ + +#if defined(WITH_OUR_MALLOC16) && (MIN_ALIGNMENT == 16) +/* Our own 16-byte aligned malloc/free. Assumes sizeof(void*) is a + power of two <= 8 and that malloc is at least sizeof(void*)-aligned. + + The main reason for this routine is that, as of this writing, + Windows does not include any aligned allocation routines in its + system libraries, and instead provides an implementation with a + Visual C++ "Processor Pack" that you have to statically link into + your program. We do not want to require users to have VC++ + (e.g. gcc/MinGW should be fine). Our code should be at least as good + as the MS _aligned_malloc, in any case, according to second-hand + reports of the algorithm it employs (also based on plain malloc). */ +static void *our_malloc16(size_t n) +{ + void *p0, *p; + if (!(p0 = malloc(n + 16))) return (void *) 0; + p = (void *) (((uintptr_t) p0 + 16) & (~((uintptr_t) 15))); + *((void **) p - 1) = p0; + return p; +} +static void our_free16(void *p) +{ + if (p) free(*((void **) p - 1)); +} +#endif + +/* part of user-callable API */ +void *X(malloc)(size_t n) +{ + void *p; + +#if defined(MIN_ALIGNMENT) + +# if defined(WITH_OUR_MALLOC16) && (MIN_ALIGNMENT == 16) + p = our_malloc16(n); +# undef real_free +# define real_free our_free16 + +# elif defined(HAVE_MEMALIGN) + p = memalign(MIN_ALIGNMENT, n); + +# elif defined(HAVE_POSIX_MEMALIGN) + /* note: posix_memalign is broken in glibc 2.2.5: it constrains + the size, not the alignment, to be (power of two) * sizeof(void*). + The bug seems to have been fixed as of glibc 2.3.1. */ + if (posix_memalign(&p, MIN_ALIGNMENT, n)) + p = (void*) 0; + +# elif defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC) + /* Intel's C compiler defines _mm_malloc and _mm_free intrinsics */ + p = (void *) _mm_malloc(n, MIN_ALIGNMENT); +# undef real_free +# define real_free _mm_free + +# elif defined(_MSC_VER) + /* MS Visual C++ 6.0 with a "Processor Pack" supports SIMD + and _aligned_malloc/free (uses malloc.h) */ + p = (void *) _aligned_malloc(n, MIN_ALIGNMENT); +# undef real_free +# define real_free _aligned_free + +# elif (defined(__MACOSX__) || defined(__APPLE__)) && (MIN_ALIGNMENT <= 16) + /* MacOS X malloc is already 16-byte aligned */ + p = malloc(n); + +# elif defined(macintosh) /* MacOS 9 */ + p = (void *) MPAllocateAligned(n, +# if MIN_ALIGNMENT == 8 + kMPAllocate8ByteAligned, +# elif MIN_ALIGNMENT == 16 + kMPAllocate16ByteAligned, +# elif MIN_ALIGNMENT == 32 + kMPAllocate32ByteAligned, +# else +# error "Unknown alignment for MPAllocateAligned" +# endif + 0); +# undef real_free +# define real_free MPFree + +# else + /* Add your machine here and send a patch to fftw@fftw.org + or (e.g. for Windows) configure --with-our-malloc16 */ +# error "Don't know how to malloc() aligned memory." +# endif + +#else /* !defined(MIN_ALIGMENT) */ + p = malloc(n); +#endif + + return p; +} + +/* part of user-callable API */ +void X(free)(void *p) +{ + real_free(p); +} + +/********************************************************** + * DEBUGGING CODE + **********************************************************/ +#if defined(FFTW_DEBUG_MALLOC) + +#include <stdio.h> + +/* + debugging malloc/free. + + 1) Initialize every malloced and freed area to random values, just + to make sure we are not using uninitialized pointers. + + 2) check for blocks freed twice. + + 3) Check for writes past the ends of allocated blocks + + 4) destroy contents of freed blocks in order to detect incorrect reuse. + + 5) keep track of who allocates what and report memory leaks + + This code is a quick and dirty hack. May be nonportable. + Use at your own risk. + +*/ + +#define MAGIC ((size_t)0xABadCafe) +#define PAD_FACTOR 2 +#define SZ_HEADER (4 * sizeof(size_t)) +#define HASHSZ 1031 + +static unsigned int hashaddr(void *p) +{ + return ((unsigned long)p) % HASHSZ; +} + +struct mstat { + int siz; + int maxsiz; + int cnt; + int maxcnt; +}; + +static struct mstat mstat[MALLOC_WHAT_LAST]; + +struct minfo { + const char *file; + int line; + size_t n; + void *p; + struct minfo *next; +}; + +static struct minfo *minfo[HASHSZ] = {0}; + +#ifdef HAVE_THREADS +int X(in_thread) = 0; +#endif + +void *X(malloc_debug)(size_t n, enum malloc_tag what, + const char *file, int line) +{ + char *p; + size_t i; + struct minfo *info; + struct mstat *stat = mstat + what; + struct mstat *estat = mstat + EVERYTHING; + + if (n == 0) + n = 1; + + if (!IN_THREAD) { + stat->siz += n; + if (stat->siz > stat->maxsiz) + stat->maxsiz = stat->siz; + estat->siz += n; + if (estat->siz > estat->maxsiz) + estat->maxsiz = estat->siz; + } + + p = (char *) real_malloc(PAD_FACTOR * n + SZ_HEADER); + A(p); + + /* store the sz in a known position */ + ((size_t *) p)[0] = n; + ((size_t *) p)[1] = MAGIC; + ((size_t *) p)[2] = what; + + /* fill with junk */ + for (i = 0; i < PAD_FACTOR * n; i++) + p[i + SZ_HEADER] = (char) (i ^ 0xEF); + + if (!IN_THREAD) { + ++stat->cnt; + ++estat->cnt; + + if (stat->cnt > stat->maxcnt) + stat->maxcnt = stat->cnt; + if (estat->cnt > estat->maxcnt) + estat->maxcnt = estat->cnt; + } + + /* skip the info we stored previously */ + p = p + SZ_HEADER; + + if (!IN_THREAD) { + unsigned int h = hashaddr(p); + /* record allocation in allocation list */ + info = (struct minfo *) malloc(sizeof(struct minfo)); + info->n = n; + info->file = file; + info->line = line; + info->p = p; + info->next = minfo[h]; + minfo[h] = info; + } + + return (void *) p; +} + +void X(ifree)(void *p) +{ + char *q; + + A(p); + + q = ((char *) p) - SZ_HEADER; + A(q); + + { + size_t n = ((size_t *) q)[0]; + size_t magic = ((size_t *) q)[1]; + int what = ((size_t *) q)[2]; + size_t i; + struct mstat *stat = mstat + what; + struct mstat *estat = mstat + EVERYTHING; + + /* set to zero to detect duplicate free's */ + ((size_t *) q)[0] = 0; + + A(magic == MAGIC); + ((size_t *) q)[1] = ~MAGIC; + + if (!IN_THREAD) { + stat->siz -= n; + A(stat->siz >= 0); + estat->siz -= n; + A(estat->siz >= 0); + } + + /* check for writing past end of array: */ + for (i = n; i < PAD_FACTOR * n; ++i) + if (q[i + SZ_HEADER] != (char) (i ^ 0xEF)) { + A(0 /* array bounds overwritten */ ); + } + for (i = 0; i < PAD_FACTOR * n; ++i) + q[i + SZ_HEADER] = (char) (i ^ 0xAD); + + if (!IN_THREAD) { + --stat->cnt; + --estat->cnt; + + A(stat->cnt >= 0); + A((stat->cnt == 0 && stat->siz == 0) || + (stat->cnt > 0 && stat->siz > 0)); + A(estat->cnt >= 0); + A((estat->cnt == 0 && estat->siz == 0) || + (estat->cnt > 0 && estat->siz > 0)); + } + + real_free(q); + } + + if (!IN_THREAD) { + /* delete minfo entry */ + unsigned int h = hashaddr(p); + struct minfo **i; + + for (i = minfo + h; *i; i = &((*i)->next)) { + if ((*i)->p == p) { + struct minfo *i0 = (*i)->next; + free(*i); + *i = i0; + return; + } + } + + A(0 /* no entry in minfo list */ ); + } +} + +void X(malloc_print_minfo)(int verbose) +{ + struct minfo *info; + int what; + unsigned int h; + + if (verbose) { + static const char *names[MALLOC_WHAT_LAST] = { + "EVERYTHING", + "PLANS", "SOLVERS", "PROBLEMS", "BUFFERS", + "HASHT", "TENSORS", "PLANNERS", "SLVDSC", "TWIDDLES", + "STRIDES", "OTHER" + }; + + printf("%12s %8s %8s %10s %10s\n", + "what", "cnt", "maxcnt", "siz", "maxsiz"); + + for (what = 0; what < MALLOC_WHAT_LAST; ++what) { + struct mstat *stat = mstat + what; + printf("%12s %8d %8d %10d %10d\n", + names[what], stat->cnt, stat->maxcnt, + stat->siz, stat->maxsiz); + } + } + + for (h = 0; h < HASHSZ; ++h) + if (minfo[h]) { + printf("\nUnfreed allocations:\n"); + break; + } + + for (h = 0; h < HASHSZ; ++h) + for (info = minfo[h]; info; info = info->next) { + printf("%s:%d: %d bytes at %p\n", + info->file, info->line, info->n, info->p); + } +} + +#else +/********************************************************** + * NON DEBUGGING CODE + **********************************************************/ +/* production version, no hacks */ + +void *X(malloc_plain)(size_t n) +{ + void *p; + if (n == 0) + n = 1; + p = real_malloc(n); + CK(p); + +#ifdef MIN_ALIGMENT + A((((uintptr_t)p) % MIN_ALIGNMENT) == 0); +#endif + + return p; +} + +void X(ifree)(void *p) +{ + real_free(p); +} + +#endif + +void X(ifree0)(void *p) +{ + /* common pattern */ + if (p) X(ifree)(p); +} diff --git a/src/fftw3/kernel/assert.c b/src/fftw3/kernel/assert.c new file mode 100644 index 0000000..a066db0 --- /dev/null +++ b/src/fftw3/kernel/assert.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: assert.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ +#include "ifftw.h" +#include <stdio.h> +#include <stdlib.h> + +void X(assertion_failed)(const char *s, int line, const char *file) +{ + fflush(stdout); + fprintf(stderr, "fftw: %s:%d: assertion failed: %s\n", file, line, s); + exit(EXIT_FAILURE); +} diff --git a/src/fftw3/kernel/awake.c b/src/fftw3/kernel/awake.c new file mode 100644 index 0000000..7ee716f --- /dev/null +++ b/src/fftw3/kernel/awake.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: awake.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +void X(null_awake)(plan *ego, int awake) +{ + UNUSED(ego); + UNUSED(awake); + /* do nothing */ +} diff --git a/src/fftw3/kernel/cycle.h b/src/fftw3/kernel/cycle.h new file mode 100644 index 0000000..a324c2a --- /dev/null +++ b/src/fftw3/kernel/cycle.h @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/* $Id: cycle.h,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +/* machine-dependent cycle counters code. Needs to be inlined. */ + +/***************************************************************************/ +/* To use the cycle counters in your code, simply #include "cycle.h" (this + file), and then use the functions/macros: + + ticks getticks(void); + + ticks is an opaque typedef defined below, representing the current time. + You extract the elapsed time between two calls to gettick() via: + + double elapsed(ticks t1, ticks t0); + + which returns a double-precision variable in arbitrary units. You + are not expected to convert this into human units like seconds; it + is intended only for *comparisons* of time intervals. + + (In order to use some of the OS-dependent timer routines like + Solaris' gethrtime, you need to paste the autoconf snippet below + into your configure.ac file and #include "config.h" before cycle.h, + or define the relevant macros manually if you are not using autoconf.) +*/ + +/***************************************************************************/ +/* This file uses macros like HAVE_GETHRTIME that are assumed to be + defined according to whether the corresponding function/type/header + is available on your system. The necessary macros are most + conveniently defined if you are using GNU autoconf, via the tests: + + dnl --------------------------------------------------------------------- + + AC_C_INLINE + AC_HEADER_TIME + AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h]) + + AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H +#include <sys/time.h> +#endif]) + + AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime]) + + dnl Cray UNICOS _rtc() (real-time clock) intrinsic + AC_MSG_CHECKING([for _rtc intrinsic]) + rtc_ok=yes + AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H +#include <intrinsics.h> +#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no]) + AC_MSG_RESULT($rtc_ok) + + dnl --------------------------------------------------------------------- +*/ + +/***************************************************************************/ + +#if TIME_WITH_SYS_TIME +# include <sys/time.h> +# include <time.h> +#else +# if HAVE_SYS_TIME_H +# include <sys/time.h> +# else +# include <time.h> +# endif +#endif + +#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \ +{ \ + return (double)(t1 - t0); \ +} + +/*----------------------------------------------------------------*/ +/* Solaris */ +#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER) +typedef hrtime_t ticks; + +#define getticks gethrtime + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* AIX v. 4+ routines to read the real-time clock or time-base register */ +#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER) +typedef timebasestruct_t ticks; + +static inline ticks getticks(void) +{ + ticks t; + read_real_time(&t, TIMEBASE_SZ); + return t; +} + +static inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */ +{ + time_base_to_time(&t1, TIMEBASE_SZ); + time_base_to_time(&t0, TIMEBASE_SZ); + return ((t1.tb_high - t0.tb_high) * 1e9 + (t1.tb_low - t0.tb_low)); +} + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PowerPC ``cycle'' counter using the time base register. + */ +#if ((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh))) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned int tbl, tbu0, tbu1; + + do { + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); + __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); + } while (tbu0 != tbu1); + + return (((unsigned long long)tbu0) << 32) | tbl; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +/* + * Pentium cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("rdtsc": "=A" (ret)); + /* no input, nothing else clobbered */ + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* Visual C++ -- thanks to Morten Nissov for his help with this */ +#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER) +#include <windows.h> +typedef LARGE_INTEGER ticks; +#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */ + +static __inline ticks getticks(void) +{ + ticks ret; + + __asm { + RDTSC + mov ret.HighPart, edx + mov ret.LowPart, eax + } + return ret; +} + +static __inline double elapsed(ticks t1, ticks t0) +{ + return (double)(t1.QuadPart - t0.QuadPart); +} + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * X86-64 cycle counter + */ +#if defined(__GNUC__) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned a, d; + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((ticks)a) | (((ticks)d) << 32); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* Visual C++ (FIXME: how to detect compilation for x86-64?) */ +#if _MSC_VER >= 1400 && !defined(HAVE_TICK_COUNTER) +typedef ULONG64 ticks; + +#define getticks __rdtsc + +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * IA64 cycle counter + */ +#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */ +#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER) +#include <machine/sys/inline.h> +typedef unsigned long ticks; + +static inline ticks getticks(void) +{ + ticks ret; + + ret = _Asm_mov_from_ar (_AREG_ITC); + return ret; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/* intel's ecc compiler */ +#if defined(__ECC) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; +#include <ia64intrin.h> + +static __inline__ ticks getticks(void) +{ + return __getReg(_IA64_REG_AR_ITC); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PA-RISC cycle counter + */ +#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +# ifdef __GNUC__ +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("mfctl 16, %0": "=r" (ret)); + /* no input, nothing else clobbered */ + return ret; +} +# else +# include <machine/inline.h> +static inline unsigned long getticks(void) +{ + register ticks ret; + _MFCTL(16, ret); + return ret; +} +# endif + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* S390, courtesy of James Treacy */ +#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks cycles; + __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc"); + return cycles; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER) +/* + * The 32-bit cycle counter on alpha overflows pretty quickly, + * unfortunately. A 1GHz machine overflows in 4 seconds. + */ +typedef unsigned int ticks; + +static __inline__ ticks getticks(void) +{ + unsigned long cc; + __asm__ __volatile__ ("rpcc %0" : "=r"(cc)); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + __asm__("rd %%tick, %0" : "=r" (ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if defined(__DECC) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) +# include <c_asm.h> +typedef unsigned int ticks; + +static __inline ticks getticks(void) +{ + unsigned long cc; + cc = asm("rpcc %v0"); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +/* SGI/Irix */ +#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER) +typedef struct timespec ticks; + +static inline ticks getticks(void) +{ + struct timespec t; + clock_gettime(CLOCK_SGI_CYCLE, &t); + return t; +} + +static inline double elapsed(ticks t1, ticks t0) +{ + return (double)(t1.tv_sec - t0.tv_sec) * 1.0E9 + + (double)(t1.tv_nsec - t0.tv_nsec); +} +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* Cray UNICOS _rtc() intrinsic function */ +#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER) +#ifdef HAVE_INTRINSICS_H +# include <intrinsics.h> +#endif + +typedef long long ticks; + +#define getticks _rtc + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + diff --git a/src/fftw3/kernel/debug.c b/src/fftw3/kernel/debug.c new file mode 100644 index 0000000..63e4a22 --- /dev/null +++ b/src/fftw3/kernel/debug.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: debug.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ +#include "ifftw.h" + +#ifdef FFTW_DEBUG +#include <stdio.h> + +typedef struct { + printer super; + FILE *f; +} P_file; + +static void putchr_file(printer *p_, char c) +{ + P_file *p = (P_file *) p_; + fputc(c, p->f); +} + +static printer *mkprinter_file(FILE *f) +{ + P_file *p = (P_file *) X(mkprinter)(sizeof(P_file), putchr_file, 0); + p->f = f; + return &p->super; +} + +void X(debug)(const char *format, ...) +{ + va_list ap; + printer *p = mkprinter_file(stderr); + va_start(ap, format); + p->vprint(p, format, ap); + va_end(ap); + X(printer_destroy)(p); +} +#endif diff --git a/src/fftw3/kernel/hash.c b/src/fftw3/kernel/hash.c new file mode 100644 index 0000000..f12d5a4 --- /dev/null +++ b/src/fftw3/kernel/hash.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "ifftw.h" + +unsigned X(hash)(const char *s) +{ + unsigned h = 0xDEADBEEFu; + do { + h = h * 17 + (int)*s; + } while (*s++); + return h; +} + diff --git a/src/fftw3/kernel/iabs.c b/src/fftw3/kernel/iabs.c new file mode 100644 index 0000000..d2bdb7c --- /dev/null +++ b/src/fftw3/kernel/iabs.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: iabs.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +int X(iabs)(int a) +{ + return (int) (a < 0 ? -a : a); +} diff --git a/src/fftw3/kernel/ifftw.h b/src/fftw3/kernel/ifftw.h new file mode 100644 index 0000000..0269e18 --- /dev/null +++ b/src/fftw3/kernel/ifftw.h @@ -0,0 +1,848 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: ifftw.h,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +/* FFTW internal header file */ +#ifndef __IFFTW_H__ +#define __IFFTW_H__ + +#include "config.h" + +#include <stdlib.h> /* size_t */ +#include <stdarg.h> /* va_list */ +#include <stddef.h> /* ptrdiff_t */ + +#if HAVE_SYS_TYPES_H +# include <sys/types.h> +#endif + +#if HAVE_STDINT_H +# include <stdint.h> /* uintptr_t, maybe */ +#endif + +#if HAVE_INTTYPES_H +# include <inttypes.h> /* uintptr_t, maybe */ +#endif + +/* determine precision and name-mangling scheme */ +#define CONCAT(prefix, name) prefix ## name +#if defined(FFTW_SINGLE) +typedef float R; +#define X(name) CONCAT(fftwf_, name) +#elif defined(FFTW_LDOUBLE) +typedef long double R; +#define X(name) CONCAT(fftwl_, name) +#else +typedef double R; +#define X(name) CONCAT(fftw_, name) +#endif + +/* dummy use of unused parameters to silence compiler warnings */ +#define UNUSED(x) (void)x + +#define FFT_SIGN (-1) /* sign convention for forward transforms */ + +/* get rid of that object-oriented stink: */ +#define REGISTER_SOLVER(p, s) X(solver_register)(p, s) + +#define STRINGIZEx(x) #x +#define STRINGIZE(x) STRINGIZEx(x) + +#ifndef HAVE_K7 +#define HAVE_K7 0 +#endif + +#if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || defined(HAVE_3DNOW) +#define HAVE_SIMD 1 +#else +#define HAVE_SIMD 0 +#endif + +/* forward declarations */ +typedef struct problem_s problem; +typedef struct plan_s plan; +typedef struct solver_s solver; +typedef struct planner_s planner; +typedef struct printer_s printer; +typedef struct scanner_s scanner; + +/*-----------------------------------------------------------------------*/ +/* alloca: */ +#if HAVE_SIMD +#define MIN_ALIGNMENT 16 +#endif + +#ifdef HAVE_ALLOCA + /* use alloca if available */ + +#ifndef alloca +#ifdef __GNUC__ +# define alloca __builtin_alloca +#else +# ifdef _MSC_VER +# include <malloc.h> +# define alloca _alloca +# else +# if HAVE_ALLOCA_H +# include <alloca.h> +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +void *alloca(size_t); +# endif +# endif +# endif +# endif +#endif +#endif + +# ifdef MIN_ALIGNMENT +# define STACK_MALLOC(T, p, x) \ + { \ + p = (T)alloca((x) + MIN_ALIGNMENT); \ + p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \ + (~(uintptr_t)(MIN_ALIGNMENT - 1))); \ + } +# define STACK_FREE(x) +# else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */ +# define STACK_MALLOC(T, p, x) p = (T)alloca(x) +# define STACK_FREE(x) +# endif + +#else /* ! HAVE_ALLOCA */ + /* use malloc instead of alloca */ +# define STACK_MALLOC(T, p, x) p = (T)MALLOC(x, OTHER) +# define STACK_FREE(x) X(ifree)(x) +#endif /* ! HAVE_ALLOCA */ + +/*-----------------------------------------------------------------------*/ +/* define uintptr_t if it is not already defined */ + +#ifndef HAVE_UINTPTR_T +# if SIZEOF_VOID_P == 0 +# error sizeof void* is unknown! +# elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P + typedef unsigned int uintptr_t; +# elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P + typedef unsigned long uintptr_t; +# elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P + typedef unsigned long long uintptr_t; +# else +# error no unsigned integer type matches void* sizeof! +# endif +#endif + +/*-----------------------------------------------------------------------*/ +/* assert.c: */ +extern void X(assertion_failed)(const char *s, int line, const char *file); + +/* always check */ +#define CK(ex) \ + (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0)) + +#ifdef FFTW_DEBUG +/* check only if debug enabled */ +#define A(ex) \ + (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0)) +#else +#define A(ex) /* nothing */ +#endif + +extern void X(debug)(const char *format, ...); +#define D X(debug) + +/*-----------------------------------------------------------------------*/ +/* alloc.c: */ + +/* objects allocated by malloc, for statistical purposes */ +enum malloc_tag { + EVERYTHING, + PLANS, + SOLVERS, + PROBLEMS, + BUFFERS, + HASHT, + TENSORS, + PLANNERS, + SLVDESCS, + TWIDDLES, + STRIDES, + OTHER, + MALLOC_WHAT_LAST /* must be last */ +}; + +extern void X(ifree)(void *ptr); +extern void X(ifree0)(void *ptr); + +#ifdef FFTW_DEBUG_MALLOC + +extern void *X(malloc_debug)(size_t n, enum malloc_tag what, + const char *file, int line); +#define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__) +#define NATIVE_MALLOC(n, what) MALLOC(n, what) +void X(malloc_print_minfo)(int vrbose); + +#else /* ! FFTW_DEBUG_MALLOC */ + +extern void *X(malloc_plain)(size_t sz); +#define MALLOC(n, what) X(malloc_plain)(n) +#define NATIVE_MALLOC(n, what) malloc(n) + +#endif + +#if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && defined(HAVE_THREADS) +extern int X(in_thread); +# define IN_THREAD X(in_thread) +# define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1 +# define THREAD_OFF X(in_thread) = in_thread_save; } +#else +# define IN_THREAD 0 +# define THREAD_ON +# define THREAD_OFF +#endif + +/*-----------------------------------------------------------------------*/ +/* ops.c: */ +/* + * ops counter. The total number of additions is add + fma + * and the total number of multiplications is mul + fma. + * Total flops = add + mul + 2 * fma + */ +typedef struct { + double add; + double mul; + double fma; + double other; +} opcnt; + +void X(ops_zero)(opcnt *dst); +void X(ops_other)(int o, opcnt *dst); +void X(ops_cpy)(const opcnt *src, opcnt *dst); + +void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst); +void X(ops_add2)(const opcnt *a, opcnt *dst); + +/* dst = m * a + b */ +void X(ops_madd)(int m, const opcnt *a, const opcnt *b, opcnt *dst); + +/* dst += m * a */ +void X(ops_madd2)(int m, const opcnt *a, opcnt *dst); + + +/*-----------------------------------------------------------------------*/ +/* minmax.c: */ +int X(imax)(int a, int b); +int X(imin)(int a, int b); + +/*-----------------------------------------------------------------------*/ +/* iabs.c: */ +int X(iabs)(int a); + +/*-----------------------------------------------------------------------*/ +/* md5.c */ + +#if SIZEOF_UNSIGNED_INT >= 4 +typedef unsigned int md5uint; +#else +typedef unsigned long md5uint; /* at least 32 bits as per C standard */ +#endif + +typedef md5uint md5sig[4]; + +typedef struct { + md5sig s; /* state and signature */ + + /* fields not meant to be used outside md5.c: */ + unsigned char c[64]; /* stuff not yet processed */ + unsigned l; /* total length. Should be 64 bits long, but this is + good enough for us */ +} md5; + +void X(md5begin)(md5 *p); +void X(md5putb)(md5 *p, const void *d_, int len); +void X(md5puts)(md5 *p, const char *s); +void X(md5putc)(md5 *p, unsigned char c); +void X(md5int)(md5 *p, int i); +void X(md5unsigned)(md5 *p, unsigned i); +void X(md5ptrdiff)(md5 *p, ptrdiff_t d); +void X(md5end)(md5 *p); + +/*-----------------------------------------------------------------------*/ +/* tensor.c: */ +#define STRUCT_HACK_KR +#undef STRUCT_HACK_C99 + +typedef struct { + int n; + int is; /* input stride */ + int os; /* output stride */ +} iodim; + +typedef struct { + int rnk; +#if defined(STRUCT_HACK_KR) + iodim dims[1]; +#elif defined(STRUCT_HACK_C99) + iodim dims[]; +#else + iodim *dims; +#endif +} tensor; + +/* + Definition of rank -infinity. + This definition has the property that if you want rank 0 or 1, + you can simply test for rank <= 1. This is a common case. + + A tensor of rank -infinity has size 0. +*/ +#define RNK_MINFTY ((int)(((unsigned) -1) >> 1)) +#define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY) + +typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind; + +tensor *X(mktensor)(int rnk); +tensor *X(mktensor_0d)(void); +tensor *X(mktensor_1d)(int n, int is, int os); +tensor *X(mktensor_2d)(int n0, int is0, int os0, + int n1, int is1, int os1); +int X(tensor_sz)(const tensor *sz); +void X(tensor_md5)(md5 *p, const tensor *t); +int X(tensor_max_index)(const tensor *sz); +int X(tensor_min_istride)(const tensor *sz); +int X(tensor_min_ostride)(const tensor *sz); +int X(tensor_min_stride)(const tensor *sz); +int X(tensor_inplace_strides)(const tensor *sz); +int X(tensor_inplace_strides2)(const tensor *a, const tensor *b); +tensor *X(tensor_copy)(const tensor *sz); +int X(tensor_kosherp)(const tensor *x); + +tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k); +tensor *X(tensor_copy_except)(const tensor *sz, int except_dim); +tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk); +tensor *X(tensor_compress)(const tensor *sz); +tensor *X(tensor_compress_contiguous)(const tensor *sz); +tensor *X(tensor_append)(const tensor *a, const tensor *b); +void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b); +int X(tensor_tornk1)(const tensor *t, int *n, int *is, int *os); +void X(tensor_destroy)(tensor *sz); +void X(tensor_destroy2)(tensor *a, tensor *b); +void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d); +void X(tensor_print)(const tensor *sz, printer *p); +int X(dimcmp)(const iodim *a, const iodim *b); + +/*-----------------------------------------------------------------------*/ +/* problem.c: */ +typedef struct { + void (*hash) (const problem *ego, md5 *p); + void (*zero) (const problem *ego); + void (*print) (problem *ego, printer *p); + void (*destroy) (problem *ego); +} problem_adt; + +struct problem_s { + const problem_adt *adt; +}; + +problem *X(mkproblem)(size_t sz, const problem_adt *adt); +void X(problem_destroy)(problem *ego); + +/*-----------------------------------------------------------------------*/ +/* print.c */ +struct printer_s { + void (*print)(printer *p, const char *format, ...); + void (*vprint)(printer *p, const char *format, va_list ap); + void (*putchr)(printer *p, char c); + void (*cleanup)(printer *p); + int indent; + int indent_incr; +}; + +printer *X(mkprinter)(size_t size, + void (*putchr)(printer *p, char c), + void (*cleanup)(printer *p)); +void X(printer_destroy)(printer *p); + +/*-----------------------------------------------------------------------*/ +/* scan.c */ +struct scanner_s { + int (*scan)(scanner *sc, const char *format, ...); + int (*vscan)(scanner *sc, const char *format, va_list ap); + int (*getchr)(scanner *sc); + int ungotc; +}; + +scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc)); +void X(scanner_destroy)(scanner *sc); + +/*-----------------------------------------------------------------------*/ +/* plan.c: */ +typedef struct { + void (*solve)(const plan *ego, const problem *p); + void (*awake)(plan *ego, int flag); + void (*print)(const plan *ego, printer *p); + void (*destroy)(plan *ego); +} plan_adt; + +struct plan_s { + const plan_adt *adt; + int awake_refcnt; + opcnt ops; + double pcost; +}; + +plan *X(mkplan)(size_t size, const plan_adt *adt); +void X(plan_destroy_internal)(plan *ego); +void X(plan_awake)(plan *ego, int flag); +#define AWAKE(plan, flag) X(plan_awake)(plan, flag) +void X(plan_null_destroy)(plan *ego); + +/*-----------------------------------------------------------------------*/ +/* solver.c: */ +typedef struct { + plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr); +} solver_adt; + +struct solver_s { + const solver_adt *adt; + int refcnt; +}; + +solver *X(mksolver)(size_t size, const solver_adt *adt); +void X(solver_use)(solver *ego); +void X(solver_destroy)(solver *ego); +void X(solver_register)(planner *plnr, solver *s); + +/* shorthand */ +#define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt) + +/*-----------------------------------------------------------------------*/ +/* planner.c */ + +typedef struct slvdesc_s { + solver *slv; + const char *reg_nam; + unsigned nam_hash; + int reg_id; +} slvdesc; + +typedef struct solution_s solution; /* opaque */ + +/* values for problem_flags: */ +enum { + DESTROY_INPUT = 0x1, + NO_SIMD = 0x2, + CONSERVE_MEMORY = 0x4, + NO_DHT_R2HC = 0x8 +}; + +#define DESTROY_INPUTP(plnr) ((plnr)->problem_flags & DESTROY_INPUT) +#define NO_SIMDP(plnr) ((plnr)->problem_flags & NO_SIMD) +#define CONSERVE_MEMORYP(plnr) ((plnr)->problem_flags & CONSERVE_MEMORY) +#define NO_DHT_R2HCP(plnr) ((plnr)->problem_flags & NO_DHT_R2HC) + +/* values for planner_flags: */ +enum { + /* impatience flags */ + + BELIEVE_PCOST = 0x1, + DFT_R2HC_ICKY = 0x2, + NONTHREADED_ICKY = 0x4, + NO_BUFFERING = 0x8, + NO_EXHAUSTIVE = 0x10, + NO_INDIRECT_OP = 0x20, + NO_LARGE_GENERIC = 0x40, + NO_RANK_SPLITS = 0x80, + NO_VRANK_SPLITS = 0x100, + NO_VRECURSE = 0x200, + + /* flags that control the search */ + NO_UGLY = 0x400, /* avoid plans we are 99% sure are suboptimal */ + NO_SEARCH = 0x800, /* avoid searching altogether---use wisdom entries + only */ + + ESTIMATE = 0x1000, + IMPATIENCE_FLAGS = (ESTIMATE | (ESTIMATE - 1)), + + BLESSING = 0x4000, /* save this entry */ + H_VALID = 0x8000, /* valid hastable entry */ + NONIMPATIENCE_FLAGS = BLESSING +}; + +#define BELIEVE_PCOSTP(plnr) ((plnr)->planner_flags & BELIEVE_PCOST) +#define DFT_R2HC_ICKYP(plnr) ((plnr)->planner_flags & DFT_R2HC_ICKY) +#define ESTIMATEP(plnr) ((plnr)->planner_flags & ESTIMATE) +#define NONTHREADED_ICKYP(plnr) (((plnr)->planner_flags & NONTHREADED_ICKY) \ + && (plnr)->nthr > 1) +#define NO_BUFFERINGP(plnr) ((plnr)->planner_flags & NO_BUFFERING) +#define NO_EXHAUSTIVEP(plnr) ((plnr)->planner_flags & NO_EXHAUSTIVE) +#define NO_INDIRECT_OP_P(plnr) ((plnr)->planner_flags & NO_INDIRECT_OP) +#define NO_LARGE_GENERICP(plnr) ((plnr)->planner_flags & NO_LARGE_GENERIC) +#define NO_RANK_SPLITSP(plnr) ((plnr)->planner_flags & NO_RANK_SPLITS) +#define NO_UGLYP(plnr) ((plnr)->planner_flags & NO_UGLY) +#define NO_SEARCHP(plnr) ((plnr)->planner_flags & NO_SEARCH) +#define NO_VRANK_SPLITSP(plnr) ((plnr)->planner_flags & NO_VRANK_SPLITS) +#define NO_VRECURSEP(plnr) ((plnr)->planner_flags & NO_VRECURSE) + +typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia; + +typedef struct { + void (*register_solver)(planner *ego, solver *s); + plan *(*mkplan)(planner *ego, problem *p); + void (*forget)(planner *ego, amnesia a); + void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved + word in C++. */ + int (*imprt)(planner *ego, scanner *sc); +} planner_adt; + +struct planner_s { + const planner_adt *adt; + void (*hook)(plan *pln, const problem *p, int optimalp); + + /* solver descriptors */ + slvdesc *slvdescs; + unsigned nslvdesc, slvdescsiz; + const char *cur_reg_nam; + int cur_reg_id; + + /* hash table of solutions */ + solution *solutions; + unsigned hashsiz, nelem; + + int nthr; + unsigned problem_flags; + unsigned short planner_flags; /* matches type of solution.flags in + planner.c */ + /* various statistics */ + int nplan; /* number of plans evaluated */ + double pcost, epcost; /* total pcost of measured/estimated plans */ + int nprob; /* number of problems evaluated */ + int lookup, succ_lookup, lookup_iter; + int insert, insert_iter, insert_unknown; + int nrehash; +}; + +planner *X(mkplanner)(void); +void X(planner_destroy)(planner *ego); + +#ifdef FFTW_DEBUG +void X(planner_dump)(planner *ego, int vrbose); +#endif + +/* + Iterate over all solvers. Read: + + @article{ baker93iterators, + author = "Henry G. Baker, Jr.", + title = "Iterators: Signs of Weakness in Object-Oriented Languages", + journal = "{ACM} {OOPS} Messenger", + volume = "4", + number = "3", + pages = "18--25" + } +*/ +#define FORALL_SOLVERS(ego, s, p, what) \ +{ \ + unsigned _cnt; \ + for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \ + slvdesc *p = ego->slvdescs + _cnt; \ + solver *s = p->slv; \ + what; \ + } \ +} + +/* make plan, destroy problem */ +plan *X(mkplan_d)(planner *ego, problem *p); + +/*-----------------------------------------------------------------------*/ +/* stride.c: */ + +/* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */ +#if (defined(__i386__) || _M_IX86 >= 500) && !HAVE_K7 && !defined(FFTW_LDOUBLE) +#define PRECOMPUTE_ARRAY_INDICES +#endif + +#ifdef PRECOMPUTE_ARRAY_INDICES +typedef int *stride; +#define WS(stride, i) (stride[i]) +extern stride X(mkstride)(int n, int s); +void X(stride_destroy)(stride p); + +#else + +typedef int stride; +#define WS(stride, i) (stride * i) +#define fftwf_mkstride(n, stride) stride +#define fftw_mkstride(n, stride) stride +#define fftwl_mkstride(n, stride) stride +#define fftwf_stride_destroy(p) ((void) p) +#define fftw_stride_destroy(p) ((void) p) +#define fftwl_stride_destroy(p) ((void) p) + +#endif /* PRECOMPUTE_ARRAY_INDICES */ + +/*-----------------------------------------------------------------------*/ +/* solvtab.c */ + +struct solvtab_s { void (*reg)(planner *); const char *reg_nam; }; +typedef struct solvtab_s solvtab[]; +void X(solvtab_exec)(solvtab tbl, planner *p); +#define SOLVTAB(s) { s, STRINGIZE(s) } +#define SOLVTAB_END { 0, 0 } + +/*-----------------------------------------------------------------------*/ +/* pickdim.c */ +int X(pickdim)(int which_dim, const int *buddies, int nbuddies, + const tensor *sz, int oop, int *dp); + +/*-----------------------------------------------------------------------*/ +/* twiddle.c */ +/* little language to express twiddle factors computation */ +enum { TW_COS = 0, TW_SIN = 1, TW_TAN = 2, TW_NEXT = 3, + TW_FULL = 4, TW_GENERIC = 5 }; + +typedef struct { + unsigned char op; + unsigned char v; + short i; +} tw_instr; + +typedef struct twid_s { + R *W; /* array of twiddle factors */ + int n, r, m; /* transform order, radix, # twiddle rows */ + int refcnt; + const tw_instr *instr; + struct twid_s *cdr; +} twid; + +void X(mktwiddle)(twid **pp, const tw_instr *instr, int n, int r, int m); +void X(twiddle_destroy)(twid **pp); +int X(twiddle_length)(int r, const tw_instr *p); +void X(twiddle_awake)(int flg, twid **pp, + const tw_instr *instr, int n, int r, int m); + +/*-----------------------------------------------------------------------*/ +/* trig.c */ +#ifdef FFTW_LDOUBLE +typedef long double trigreal; +#else +typedef double trigreal; +#endif + +extern trigreal X(cos2pi)(int, int); +extern trigreal X(sin2pi)(int, int); +extern trigreal X(tan2pi)(int, int); +extern trigreal X(sincos)(trigreal m, trigreal n, int sinp); + +/*-----------------------------------------------------------------------*/ +/* primes.c: */ + +#if defined(FFTW_ENABLE_UNSAFE_MULMOD) +# define MULMOD(x,y,p) (((x) * (y)) % (p)) +#elif ((SIZEOF_INT != 0) && (SIZEOF_LONG >= 2 * SIZEOF_INT)) +# define MULMOD(x,y,p) ((int) ((((long) (x)) * ((long) (y))) % ((long) (p)))) +#elif ((SIZEOF_INT != 0) && (SIZEOF_LONG_LONG >= 2 * SIZEOF_INT)) +# define MULMOD(x,y,p) ((int) ((((long long) (x)) * ((long long) (y))) \ + % ((long long) (p)))) +#elif defined(_MSC_VER) +# define MULMOD(x,y,p) ((int) ((((__int64) (x)) * ((__int64) (y))) \ + % ((__int64) (p)))) +#else /* 'long long' unavailable */ +# define SAFE_MULMOD 1 +int X(safe_mulmod)(int x, int y, int p); +# define MULMOD(x,y,p) X(safe_mulmod)(x,y,p) +#endif + +int X(power_mod)(int n, int m, int p); +int X(find_generator)(int p); +int X(first_divisor)(int n); +int X(is_prime)(int n); +int X(next_prime)(int n); + +#define GENERIC_MIN_BAD 71 /* min prime for which generic becomes bad */ + +/*-----------------------------------------------------------------------*/ +/* rader.c: */ +typedef struct rader_tls rader_tl; + +void X(rader_tl_insert)(int k1, int k2, int k3, R *W, rader_tl **tl); +R *X(rader_tl_find)(int k1, int k2, int k3, rader_tl *t); +void X(rader_tl_delete)(R *W, rader_tl **tl); + +/*-----------------------------------------------------------------------*/ +/* transpose.c: */ + +void X(transpose)(R *A, int n, int m, int d, int N, R *buf); +void X(transpose_slow)(R *a, int nx, int ny, int N, + char *move, int move_size, R *buf); +int X(transposable)(const iodim *a, const iodim *b, + int vl, int s, R *ri, R *ii); +void X(transpose_dims)(const iodim *a, const iodim *b, + int *n, int *m, int *d, int *nd, int *md); +int X(transpose_simplep)(const iodim *a, const iodim *b, int vl, int s, + R *ri, R *ii); +int X(transpose_slowp)(const iodim *a, const iodim *b, int N); + +/*-----------------------------------------------------------------------*/ +/* misc stuff */ +void X(null_awake)(plan *ego, int awake); +int X(square)(int x); +double X(measure_execution_time)(plan *pln, const problem *p); +int X(alignment_of)(R *p); +unsigned X(hash)(const char *s); +int X(compute_nbuf)(int n, int vl, int nbuf, int maxbufsz); +int X(ct_uglyp)(int min_n, int n, int r); + +#if HAVE_SIMD +R *X(taint)(R *p, int s); +R *X(join_taint)(R *p1, R *p2); +#define TAINT(p, s) X(taint)(p, s) +#define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3)) +#define TAINTOF(p) (((uintptr_t)(p)) & 3) +#define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2) +#else +#define TAINT(p, s) (p) +#define UNTAINT(p) (p) +#define TAINTOF(p) 0 +#define JOIN_TAINT(p1, p2) p1 +#endif + +#ifdef FFTW_DEBUG_ALIGNMENT +# define ASSERT_ALIGNED_DOUBLE { \ + double __foo; \ + CK(!(((uintptr_t) &__foo) & 0x7)); \ +} +#else +# define ASSERT_ALIGNED_DOUBLE +#endif /* FFTW_DEBUG_ALIGNMENT */ + + + +/*-----------------------------------------------------------------------*/ +/* macros used in codelets to reduce source code size */ + +typedef R E; /* internal precision of codelets. */ + +#ifdef FFTW_LDOUBLE +# define K(x) ((E) x##L) +#else +# define K(x) ((E) x) +#endif +#define DK(name, value) const E name = K(value) + +/* FMA macros */ + +#if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)) +/* this peculiar coding seems to do the right thing on all of + gcc-2.95, gcc-3.1, and gcc-3.2. + + The obvious expression a * b + c does not work. If both x = a * b + + c and y = a * b - c appear in the source, gcc computes t = a * b, + x = t + c, y = t - c, thus destroying the fma. +*/ +static __inline__ E FMA(E a, E b, E c) +{ + E x = a * b; + x = x + c; + return x; +} + +static __inline__ E FMS(E a, E b, E c) +{ + E x = a * b; + x = x - c; + return x; +} + +static __inline__ E FNMA(E a, E b, E c) +{ + E x = a * b; + x = - (x + c); + return x; +} + +static __inline__ E FNMS(E a, E b, E c) +{ + E x = a * b; + x = - (x - c); + return x; +} +#else +#define FMA(a, b, c) (((a) * (b)) + (c)) +#define FMS(a, b, c) (((a) * (b)) - (c)) +#define FNMA(a, b, c) (- (((a) * (b)) + (c))) +#define FNMS(a, b, c) ((c) - ((a) * (b))) +#endif + + +/* stack-alignment hackery */ +#if defined(__GNUC__) && defined(__i386__) +/* + * horrible hack to align the stack to a 16-byte boundary. + * + * We assume a gcc version >= 2.95 so that + * -mpreferred-stack-boundary works. Otherwise, all bets are + * off. However, -mpreferred-stack-boundary does not create a + * stack alignment, but it only preserves it. Unfortunately, + * many versions of libc on linux call main() with the wrong + * initial stack alignment, with the result that the code is now + * pessimally aligned instead of having a 50% chance of being + * correct. + */ + +#define WITH_ALIGNED_STACK(what) \ +{ \ + /* \ + * Use alloca to allocate some memory on the stack. \ + * This alerts gcc that something funny is going \ + * on, so that it does not omit the frame pointer \ + * etc. \ + */ \ + (void)__builtin_alloca(16); \ + \ + /* \ + * Now align the stack pointer \ + */ \ + __asm__ __volatile__ ("andl $-16, %esp"); \ + \ + what \ +} +#endif + +#ifdef __ICC /* Intel's compiler for ia32 */ +#define WITH_ALIGNED_STACK(what) \ +{ \ + /* \ + * Simply calling alloca seems to do the right thing. \ + * The size of the allocated block seems to be irrelevant. \ + */ \ + _alloca(16); \ + what \ +} +#endif + +#ifndef WITH_ALIGNED_STACK +#define WITH_ALIGNED_STACK(what) what +#endif + +#endif /* __IFFTW_H__ */ diff --git a/src/fftw3/kernel/kbuffered.c b/src/fftw3/kernel/kbuffered.c new file mode 100644 index 0000000..d20403f --- /dev/null +++ b/src/fftw3/kernel/kbuffered.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* routines shared by the various buffered solvers */ + +#include "ifftw.h" + +int X(compute_nbuf)(int n, int vl, int nbuf, int maxbufsz) +{ + int i; + + if (nbuf * n > maxbufsz) + nbuf = X(imax)((int)1, maxbufsz / n); + + /* + * Look for a buffer number (not too big) that divides the + * vector length, in order that we only need one child plan: + */ + for (i = nbuf; i < vl && i < 2 * nbuf; ++i) + if (vl % i == 0) + return i; + + /* whatever... */ + nbuf = X(imin)(nbuf, vl); + return nbuf; +} + diff --git a/src/fftw3/kernel/kct.c b/src/fftw3/kernel/kct.c new file mode 100644 index 0000000..ba3bf68 --- /dev/null +++ b/src/fftw3/kernel/kct.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* common routines for Cooley-Tukey algorithms */ + +#include "ifftw.h" + +#define POW2P(n) (((n) > 0) && (((n) & ((n) - 1)) == 0)) + +/* TRUE if radix-r is ugly for size n */ +int X(ct_uglyp)(int min_n, int n, int r) +{ + return (n <= min_n) || (POW2P(n) && (n / r) <= 4); +} diff --git a/src/fftw3/kernel/kplan.c b/src/fftw3/kernel/kplan.c new file mode 100644 index 0000000..4c17107 --- /dev/null +++ b/src/fftw3/kernel/kplan.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: kplan.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +/* "Plan: To bother about the best method of accomplishing an + accidental result." (Ambrose Bierce, The Enlarged Devil's + Dictionary). */ + +plan *X(mkplan)(size_t size, const plan_adt *adt) +{ + plan *p = (plan *)MALLOC(size, PLANS); + + A(adt->destroy); + p->awake_refcnt = 0; + p->adt = adt; + X(ops_zero)(&p->ops); + p->pcost = 0.0; + + return p; +} + +/* + * destroy a plan + */ +void X(plan_destroy_internal)(plan *ego) +{ + if (ego) { + if (ego->awake_refcnt > 0) + ego->adt->awake(ego, 0); + ego->adt->destroy(ego); + X(ifree)(ego); + } +} + +/* dummy destroy routine for plans with no local state */ +void X(plan_null_destroy)(plan *ego) +{ + UNUSED(ego); + /* nothing */ +} + +void X(plan_awake)(plan *ego, int flag) +{ + if (flag) { + if (!ego->awake_refcnt) + ego->adt->awake(ego, flag); + ++ego->awake_refcnt; + } else { + --ego->awake_refcnt; + if (!ego->awake_refcnt) + ego->adt->awake(ego, flag); + } +} + diff --git a/src/fftw3/kernel/kproblem.c b/src/fftw3/kernel/kproblem.c new file mode 100644 index 0000000..4686692 --- /dev/null +++ b/src/fftw3/kernel/kproblem.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: kproblem.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +/* constructor */ +problem *X(mkproblem)(size_t sz, const problem_adt *adt) +{ + problem *p = (problem *)MALLOC(sz, PROBLEMS); + + p->adt = adt; + return p; +} + +/* destructor */ +void X(problem_destroy)(problem *ego) +{ + if (ego) + ego->adt->destroy(ego); +} + diff --git a/src/fftw3/kernel/krader.c b/src/fftw3/kernel/krader.c new file mode 100644 index 0000000..354edfc --- /dev/null +++ b/src/fftw3/kernel/krader.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "ifftw.h" + +/* + common routines for Rader solvers +*/ + + +/* shared twiddle and omega lists, keyed by two/three integers. */ +struct rader_tls { + int k1, k2, k3; + R *W; + int refcnt; + rader_tl *cdr; +}; + +void X(rader_tl_insert)(int k1, int k2, int k3, R *W, rader_tl **tl) +{ + rader_tl *t = (rader_tl *) MALLOC(sizeof(rader_tl), TWIDDLES); + t->k1 = k1; t->k2 = k2; t->k3 = k3; t->W = W; + t->refcnt = 1; t->cdr = *tl; *tl = t; +} + +R *X(rader_tl_find)(int k1, int k2, int k3, rader_tl *t) +{ + while (t && (t->k1 != k1 || t->k2 != k2 || t->k3 != k3)) + t = t->cdr; + if (t) { + ++t->refcnt; + return t->W; + } else + return 0; +} + +void X(rader_tl_delete)(R *W, rader_tl **tl) +{ + if (W) { + rader_tl **tp, *t; + + for (tp = tl; (t = *tp) && t->W != W; tp = &t->cdr) + ; + + if (t && --t->refcnt <= 0) { + *tp = t->cdr; + X(ifree)(t->W); + X(ifree)(t); + } + } +} diff --git a/src/fftw3/kernel/md5-1.c b/src/fftw3/kernel/md5-1.c new file mode 100644 index 0000000..a2e87a1 --- /dev/null +++ b/src/fftw3/kernel/md5-1.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "ifftw.h" + + +void X(md5putb)(md5 *p, const void *d_, int len) +{ + int i; + const unsigned char *d = (const unsigned char *)d_; + for (i = 0; i < len; ++i) + X(md5putc)(p, d[i]); +} + +void X(md5puts)(md5 *p, const char *s) +{ + /* also hash final '\0' */ + do { + X(md5putc)(p, *s); + } while(*s++); +} + +void X(md5int)(md5 *p, int i) +{ + X(md5putb)(p, &i, sizeof(i)); +} + +void X(md5unsigned)(md5 *p, unsigned i) +{ + X(md5putb)(p, &i, sizeof(i)); +} + +void X(md5ptrdiff)(md5 *p, ptrdiff_t d) +{ + X(md5putb)(p, &d, sizeof(d)); +} + diff --git a/src/fftw3/kernel/md5.c b/src/fftw3/kernel/md5.c new file mode 100644 index 0000000..edbd811 --- /dev/null +++ b/src/fftw3/kernel/md5.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* + independent implementation of Ron Rivest's MD5 message-digest + algorithm, based on rfc 1321. + + Optimized for small code size, not speed. Works as long as + sizeof(md5uint) >= 4. +*/ + +#include "ifftw.h" + +/* sintab[i] = 4294967296.0 * abs(sin((double)(i + 1))) */ +static const md5uint sintab[64] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, + 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, + 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, + 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, + 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, + 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, + 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, + 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 +}; + +/* see rfc 1321 section 3.4 */ +static const struct roundtab { + char k; + char s; +} roundtab[64] = { + { 0, 7}, { 1, 12}, { 2, 17}, { 3, 22}, + { 4, 7}, { 5, 12}, { 6, 17}, { 7, 22}, + { 8, 7}, { 9, 12}, { 10, 17}, { 11, 22}, + { 12, 7}, { 13, 12}, { 14, 17}, { 15, 22}, + { 1, 5}, { 6, 9}, { 11, 14}, { 0, 20}, + { 5, 5}, { 10, 9}, { 15, 14}, { 4, 20}, + { 9, 5}, { 14, 9}, { 3, 14}, { 8, 20}, + { 13, 5}, { 2, 9}, { 7, 14}, { 12, 20}, + { 5, 4}, { 8, 11}, { 11, 16}, { 14, 23}, + { 1, 4}, { 4, 11}, { 7, 16}, { 10, 23}, + { 13, 4}, { 0, 11}, { 3, 16}, { 6, 23}, + { 9, 4}, { 12, 11}, { 15, 16}, { 2, 23}, + { 0, 6}, { 7, 10}, { 14, 15}, { 5, 21}, + { 12, 6}, { 3, 10}, { 10, 15}, { 1, 21}, + { 8, 6}, { 15, 10}, { 6, 15}, { 13, 21}, + { 4, 6}, { 11, 10}, { 2, 15}, { 9, 21} +}; + +#define rol(a, s) ((a << (int)(s)) | (a >> (32 - (int)(s)))) + +static void doblock(md5sig state, const unsigned char *data) +{ + md5uint a, b, c, d, t, x[16]; + const md5uint msk = 0xffffffffUL; + int i; + + /* encode input bytes into md5uint */ + for (i = 0; i < 16; ++i) { + const unsigned char *p = data + 4 * i; + x[i] = p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); + } + + a = state[0]; b = state[1]; c = state[2]; d = state[3]; + for (i = 0; i < 64; ++i) { + const struct roundtab *p = roundtab + i; + int round = i / 16; + switch (round) { + case 0: a += (b & c) | (~b & d); break; + case 1: a += (b & d) | (c & ~d); break; + case 2: a += b ^ c ^ d; break; + case 3: a += c ^ (b | ~d); break; + } + a += sintab[i]; + a += x[(int)(p->k)]; + a &= msk; + t = b + rol(a, p->s); + a = d; d = c; c = b; b = t; + } + state[0] = (state[0] + a) & msk; + state[1] = (state[1] + b) & msk; + state[2] = (state[2] + c) & msk; + state[3] = (state[3] + d) & msk; +} + + +void X(md5begin)(md5 *p) +{ + p->s[0] = 0x67452301; + p->s[1] = 0xefcdab89; + p->s[2] = 0x98badcfe; + p->s[3] = 0x10325476; + p->l = 0; +} + +void X(md5putc)(md5 *p, unsigned char c) +{ + p->c[p->l % 64] = c; + if (((++p->l) % 64) == 0) doblock(p->s, p->c); +} + +void X(md5end)(md5 *p) +{ + unsigned l, i; + + l = 8 * p->l; /* length before padding, in bits */ + + /* rfc 1321 section 3.1: padding */ + X(md5putc)(p, 0x80); + while ((p->l % 64) != 56) X(md5putc)(p, 0x00); + + /* rfc 1321 section 3.2: length (little endian) */ + for (i = 0; i < 8; ++i) { + X(md5putc)(p, l & 0xFF); + l = l >> 8; + } + + /* Now p->l % 64 == 0 and signature is in p->s */ +} diff --git a/src/fftw3/kernel/minmax.c b/src/fftw3/kernel/minmax.c new file mode 100644 index 0000000..fc6f3a3 --- /dev/null +++ b/src/fftw3/kernel/minmax.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: minmax.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +int X(imax)(int a, int b) +{ + return (a > b) ? a : b; +} + +int X(imin)(int a, int b) +{ + return (a < b) ? a : b; +} diff --git a/src/fftw3/kernel/ops.c b/src/fftw3/kernel/ops.c new file mode 100644 index 0000000..9e927c1 --- /dev/null +++ b/src/fftw3/kernel/ops.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: ops.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +void X(ops_zero)(opcnt *dst) +{ + dst->add = dst->mul = dst->fma = dst->other = 0; +} + +void X(ops_cpy)(const opcnt *src, opcnt *dst) +{ + *dst = *src; +} + +void X(ops_other)(int o, opcnt *dst) +{ + X(ops_zero)(dst); + dst->other = o; +} + +void X(ops_madd)(int m, const opcnt *a, const opcnt *b, opcnt *dst) +{ + dst->add = m * a->add + b->add; + dst->mul = m * a->mul + b->mul; + dst->fma = m * a->fma + b->fma; + dst->other = m * a->other + b->other; +} + +void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst) +{ + X(ops_madd)(1, a, b, dst); +} + +void X(ops_add2)(const opcnt *a, opcnt *dst) +{ + X(ops_add)(a, dst, dst); +} + +void X(ops_madd2)(int m, const opcnt *a, opcnt *dst) +{ + X(ops_madd)(m, a, dst, dst); +} + diff --git a/src/fftw3/kernel/pickdim.c b/src/fftw3/kernel/pickdim.c new file mode 100644 index 0000000..034e6db --- /dev/null +++ b/src/fftw3/kernel/pickdim.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "ifftw.h" + +/* $Id: pickdim.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +/* Given a solver which_dim, a vector sz, and whether or not the + transform is out-of-place, return the actual dimension index that + it corresponds to. The basic idea here is that we return the + which_dim'th valid dimension, starting from the end if + which_dim < 0. */ +static int really_pickdim(int which_dim, const tensor *sz, int oop, int *dp) +{ + int i; + int count_ok = 0; + if (which_dim > 0) { + for (i = 0; i < sz->rnk; ++i) { + if (oop || sz->dims[i].is == sz->dims[i].os) + if (++count_ok == which_dim) { + *dp = i; + return 1; + } + } + } + else if (which_dim < 0) { + for (i = sz->rnk; i > 0; --i) { + if (oop || sz->dims[i - 1].is == sz->dims[i - 1].os) + if (++count_ok == -which_dim) { + *dp = i - 1; + return 1; + } + } + } + else { /* zero: pick the middle, if valid */ + i = sz->rnk / 2 - 1; + if (i < sz->rnk && (oop || sz->dims[i].is == sz->dims[i].os)) { + *dp = i; + return 1; + } + } + return 0; +} + +/* Like really_pickdim, but only returns 1 if no previous "buddy" + which_dim in the buddies list would give the same dim. */ +int X(pickdim)(int which_dim, const int *buddies, int nbuddies, + const tensor *sz, int oop, int *dp) +{ + int i, d1; + + if (!really_pickdim(which_dim, sz, oop, dp)) + return 0; + + /* check whether some buddy solver would produce the same dim. + If so, consider this solver unapplicable and let the buddy + take care of it. The smallest-indexed buddy is applicable. */ + for (i = 0; i < nbuddies; ++i) { + if (buddies[i] == which_dim) + break; /* found self */ + if (really_pickdim(buddies[i], sz, oop, &d1) && *dp == d1) + return 0; /* found equivalent buddy */ + } + return 1; +} diff --git a/src/fftw3/kernel/planner.c b/src/fftw3/kernel/planner.c new file mode 100644 index 0000000..1d01cb9 --- /dev/null +++ b/src/fftw3/kernel/planner.c @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2000 Matteo Frigo + * Copyright (c) 2000 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: planner.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ +#include "ifftw.h" +#include <string.h> + +/* GNU Coding Standards, Sec. 5.2: "Please write the comments in a GNU + program in English, because English is the one language that nearly + all programmers in all countries can read." + + ingemisco tanquam reus + culpa rubet vultus meus + supplicanti parce [rms] +*/ + +#define BLESSEDP(solution) ((solution)->flags & BLESSING) +#define VALIDP(solution) ((solution)->flags & H_VALID) +#define UNBLESS(flags) flags &= ~BLESSING + +#define MAXNAM 64 /* maximum length of registrar's name. + Used for reading wisdom. There is no point + in doing this right */ + +/* Flags f1 subsumes flags f2 iff f1 is less/equally impatient than + f2, defining a partial ordering. */ +#define IMPATIENCE(flags) ((flags) & IMPATIENCE_FLAGS) +#define NONIMPATIENCE(flags) ((flags) & NONIMPATIENCE_FLAGS) +#define ORDERED(f1, f2) (SUBSUMES(f1, f2) || SUBSUMES(f2, f1)) +#define SUBSUMES(f1, f2) ((IMPATIENCE(f1) & (f2)) == IMPATIENCE(f1)) + +static unsigned addmod(unsigned a, unsigned b, unsigned p) +{ + /* gcc-2.95/sparc produces incorrect code for the fast version below. */ +#if defined(__sparc__) && defined(__GNUC__) + /* slow version */ + return (a + b) % p; +#else + /* faster version */ + unsigned c = a + b; + return c >= p ? c - p : c; +#endif +} + +/* + slvdesc management: +*/ +static void sgrow(planner *ego) +{ + unsigned osiz = ego->slvdescsiz, nsiz = 1 + osiz + osiz / 4; + slvdesc *ntab = (slvdesc *)MALLOC(nsiz * sizeof(slvdesc), SLVDESCS); + slvdesc *otab = ego->slvdescs; + unsigned i; + + ego->slvdescs = ntab; + ego->slvdescsiz = nsiz; + for (i = 0; i < osiz; ++i) + ntab[i] = otab[i]; + X(ifree0)(otab); +} + +static void register_solver(planner *ego, solver *s) +{ + slvdesc *n; + if (s) { /* add s to solver list */ + X(solver_use)(s); + + if (ego->nslvdesc >= ego->slvdescsiz) + sgrow(ego); + + n = ego->slvdescs + ego->nslvdesc++; + + n->slv = s; + n->reg_nam = ego->cur_reg_nam; + n->reg_id = ego->cur_reg_id++; + + A(strlen(n->reg_nam) < MAXNAM); + n->nam_hash = X(hash)(n->reg_nam); + } +} + +static int slookup(planner *ego, char *nam, int id) +{ + unsigned h = X(hash)(nam); /* used to avoid strcmp in the common case */ + FORALL_SOLVERS(ego, s, sp, { + UNUSED(s); + if (sp->reg_id == id && sp->nam_hash == h + && !strcmp(sp->reg_nam, nam)) + return sp - ego->slvdescs; + }); + return -1; +} + +/* + md5-related stuff: +*/ + +/* first hash function */ +static unsigned h1(planner *ego, const md5sig s) +{ + return s[0] % ego->hashsiz; +} + +/* second hash function (for double hashing) */ +static unsigned h2(planner *ego, const md5sig s) +{ + return 1U + s[1] % (ego->hashsiz - 1); +} + +static void md5hash(md5 *m, const problem *p, const planner *plnr) +{ + X(md5begin)(m); + X(md5unsigned)(m, sizeof(R)); /* so we don't mix different precisions */ + X(md5unsigned)(m, plnr->problem_flags); + X(md5int)(m, plnr->nthr); + p->adt->hash(p, m); + X(md5end)(m); +} + +static int md5eq(const md5sig a, const md5sig b) +{ + return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3]; +} + +static void sigcpy(const md5sig a, md5sig b) +{ + b[0] = a[0]; b[1] = a[1]; b[2] = a[2]; b[3] = a[3]; +} + +/* + memoization routines : +*/ + +/* + liber scriptus proferetur + in quo totum continetur + unde mundus iudicetur +*/ +struct solution_s { + md5sig s; + unsigned short flags; + short slvndx; +}; + +static solution *hlookup(planner *ego, const md5sig s, unsigned short flags) +{ + unsigned g, h = h1(ego, s), d = h2(ego, s); + + ++ego->lookup; + + for (g = h; ; g = addmod(g, d, ego->hashsiz)) { + solution *l = ego->solutions + g; + ++ego->lookup_iter; + if (VALIDP(l)) { + if (md5eq(s, l->s) && ORDERED(l->flags, flags)) { + ++ego->succ_lookup; + return l; + } + } else { + return 0; + } + A((g + d) % ego->hashsiz != h); + } +} + + +static void hinsert0(planner *ego, const md5sig s, unsigned short flags, + int slvndx, solution *l) +{ + ++ego->insert; + if (!l) { + /* search for nonfull slot */ + unsigned g, h = h1(ego, s), d = h2(ego, s); + ++ego->insert_unknown; + for (g = h; ; g = addmod(g, d, ego->hashsiz)) { + ++ego->insert_iter; + l = ego->solutions + g; + if (!VALIDP(l)) break; + A((g + d) % ego->hashsiz != h); + } + } + + /* fill slot */ + l->flags = flags | H_VALID; + l->slvndx = (short)slvndx; + sigcpy(s, l->s); +} + +static void rehash(planner *ego, unsigned nsiz) +{ + unsigned osiz = ego->hashsiz, h; + solution *osol = ego->solutions, *nsol; + + nsiz = (unsigned)X(next_prime)((int)nsiz); + nsol = (solution *)MALLOC(nsiz * sizeof(solution), HASHT); + ++ego->nrehash; + + /* init new table */ + for (h = 0; h < nsiz; ++h) + nsol[h].flags = 0; + + /* install new table */ + ego->hashsiz = nsiz; + ego->solutions = nsol; + + /* copy table */ + for (h = 0; h < osiz; ++h) { + solution *l = osol + h; + if (VALIDP(l)) + hinsert0(ego, l->s, l->flags, l->slvndx, 0); + } + + X(ifree0)(osol); +} + +static unsigned minsz(unsigned nelem) +{ + return 1U + nelem + nelem / 8U; +} + +static unsigned nextsz(unsigned nelem) +{ + return minsz(minsz(nelem)); +} + +static void hgrow(planner *ego) +{ + unsigned nelem = ego->nelem; + if (minsz(nelem) >= ego->hashsiz) + rehash(ego, nextsz(nelem)); +} + +static void hshrink(planner *ego) +{ + unsigned nelem = ego->nelem; + /* always rehash after deletions */ + rehash(ego, nextsz(nelem)); +} + +/* inherit blessing, but only if the solver is the same */ +static unsigned short merge_flags(unsigned short dstflags, int dstndx, + unsigned short srcflags, int srcndx) +{ + if (srcndx == dstndx) + dstflags |= (srcflags & BLESSING); /* ne me perdas illa die */ + return dstflags; +} + +static void hinsert(planner *ego, const md5sig s, + unsigned short flags, int slvndx) +{ + solution *l; + + if ((l = hlookup(ego, s, flags))) { + if (SUBSUMES(flags, l->flags)) { + /* overwrite old solution */ + flags = merge_flags(flags, slvndx, l->flags, l->slvndx); + } else { + A(SUBSUMES(l->flags, flags)); + l->flags = merge_flags(l->flags, l->slvndx, flags, slvndx); + return; + } + } else { + ++ego->nelem; + hgrow(ego); + } + hinsert0(ego, s, flags, slvndx, l); +} + +static void hcurse_subsumed(planner *ego) +{ + unsigned h; + + /* unbless any entries that are unreachable because they + are subsumed by less-impatient ones. */ + for (h = 0; h < ego->hashsiz; ++h) { + solution *l = ego->solutions + h; + if (VALIDP(l)) { + unsigned d = h2(ego, l->s), g = addmod(h, d, ego->hashsiz); + for (; ; g = addmod(g, d, ego->hashsiz)) { + solution *m = ego->solutions + g; + if (VALIDP(m)) { + if (md5eq(l->s, m->s) && + SUBSUMES(l->flags, m->flags)) { + /* ne cadant in obscurum */ + l->flags = merge_flags(l->flags, l->slvndx, + m->flags, m->slvndx); + + /* cum vix justus sit securus */ + UNBLESS(m->flags); + } + } + else break; + A((g + d) % ego->hashsiz != h); + } + } + } +} + + +static void invoke_hook(planner *ego, plan *pln, const problem *p, + int optimalp) +{ + if (ego->hook) + ego->hook(pln, p, optimalp); +} + +static void evaluate_plan(planner *ego, plan *pln, const problem *p) +{ + if (!BELIEVE_PCOSTP(ego) || pln->pcost == 0.0) { + ego->nplan++; + + if (ESTIMATEP(ego)) { + estimate: + /* heuristic */ + pln->pcost = 0.0 + + pln->ops.add + + pln->ops.mul + + 2 * pln->ops.fma + + pln->ops.other; + ego->epcost += pln->pcost; + } else { + double t = X(measure_execution_time)(pln, p); + + if (t < 0) { /* unavailable cycle counter */ + /* Real programmers can write FORTRAN in any language */ + goto estimate; + } + + pln->pcost = t; + ego->pcost += t; + } + } + + invoke_hook(ego, pln, p, 0); +} + +/* maintain dynamic scoping of flags, nthr: */ +static plan *invoke_solver(planner *ego, problem *p, solver *s, + unsigned short nflags) +{ + unsigned short planner_flags = ego->planner_flags; + unsigned problem_flags = ego->problem_flags; + int nthr = ego->nthr; + plan *pln; + ego->planner_flags = nflags; + pln = s->adt->mkplan(s, p, ego); + ego->problem_flags = problem_flags; + ego->nthr = nthr; + ego->planner_flags = planner_flags; + return pln; +} + +static plan *search(planner *ego, problem *p, slvdesc **descp) +{ + plan *best = 0; + int best_not_yet_timed = 1; + int pass; + + if (NO_SEARCHP(ego)) { + /* D("invalid search for %P %x\n", p, ego->planner_flags); */ + return 0; + } + + for (pass = 0; pass < 2; ++pass) { + unsigned short nflags = ego->planner_flags; + + if (best) break; + + switch (pass) { + case 0: + /* skip pass 0 during exhaustive search */ + if (!NO_EXHAUSTIVEP(ego)) continue; + nflags |= NO_UGLY; + break; + case 1: + /* skip pass 1 if NO_UGLY */ + if (NO_UGLYP(ego)) continue; + break; + } + + FORALL_SOLVERS(ego, s, sp, { + plan *pln = invoke_solver(ego, p, s, nflags); + + if (pln) { + if (best) { + if (best_not_yet_timed) { + evaluate_plan(ego, best, p); + best_not_yet_timed = 0; + } + evaluate_plan(ego, pln, p); + if (pln->pcost < best->pcost) { + X(plan_destroy_internal)(best); + best = pln; + *descp = sp; + } else { + X(plan_destroy_internal)(pln); + } + } else { + best = pln; + *descp = sp; + } + } + }); + } + + return best; +} + +static plan *mkplan(planner *ego, problem *p) +{ + plan *pln; + md5 m; + slvdesc *sp; + unsigned short flags; + ASSERT_ALIGNED_DOUBLE; + + /* Canonical form. */ + if (!NO_EXHAUSTIVEP(ego)) ego->planner_flags &= ~NO_UGLY; + + ++ego->nprob; + md5hash(&m, p, ego); + + pln = 0; + + { + solution *sol; /* new scope for sol */ + + if ((sol = hlookup(ego, m.s, ego->planner_flags))) { + if (SUBSUMES(sol->flags, ego->planner_flags)) { + /* wisdom is acceptable */ + if (sol->slvndx < 0) + return 0; /* known to be infeasible */ + + /* use solver to obtain a plan */ + sp = ego->slvdescs + sol->slvndx; + pln = + invoke_solver(ego, p, sp->slv, + (0 + | NO_SEARCH + | IMPATIENCE(sol->flags) + | NONIMPATIENCE(ego->planner_flags) )); + + /* if (!pln) then the entry is bogus, but + we currently do nothing about it. */ + /* CAVEAS: Do not use ``sol'' here, because the + pointer is possibly dangling after the call to + invoke_solver(). */ + } else { + A(SUBSUMES(ego->planner_flags, sol->flags)); + } + } + } + + + if (!pln) + pln = search(ego, p, &sp); + + flags = ego->planner_flags; + + if (pln) { + /* Postulate de iure that NO_UGLY subsumes ~NO_UGLY if the + problem is feasible. Also postulate that NO_SEARCH + subsumes ~NO_SEARCH. */ + flags &= ~(NO_UGLY | NO_SEARCH); + } + + hinsert(ego, m.s, flags, pln ? sp - ego->slvdescs : -1); + + if (pln) + invoke_hook(ego, pln, p, 1); + return pln; +} + +/* destroy hash table entries. If FORGET_EVERYTHING, destroy the whole + table. If FORGET_ACCURSED, then destroy entries that are not blessed. */ +static void forget(planner *ego, amnesia a) +{ + unsigned h; + + /* garbage-collect while we are at it */ + if (a != FORGET_EVERYTHING) + hcurse_subsumed(ego); + + for (h = 0; h < ego->hashsiz; ++h) { + solution *l = ego->solutions + h; + if (VALIDP(l)) { + if (a == FORGET_EVERYTHING || + (a == FORGET_ACCURSED && !BLESSEDP(l))) { + /* confutatis maledictis + flammis acribus addictis */ + l->flags &= ~H_VALID; + --ego->nelem; + } + } + } + /* nil inultum remanebit */ + + hshrink(ego); +} + +static void htab_destroy(planner *ego) +{ + forget(ego, FORGET_EVERYTHING); + X(ifree)(ego->solutions); + ego->nelem = 0U; +} + +/* FIXME: what sort of version information should we write? */ +#define WISDOM_PREAMBLE PACKAGE "-" VERSION " " STRINGIZE(X(wisdom)) + +/* tantus labor non sit cassus */ +static void exprt(planner *ego, printer *p) +{ + unsigned h; + + hcurse_subsumed(ego); + + p->print(p, "(" WISDOM_PREAMBLE "%("); + for (h = 0; h < ego->hashsiz; ++h) { + solution *l = ego->solutions + h; + if (VALIDP(l) && BLESSEDP(l) && l->slvndx >= 0) { + slvdesc *sp = ego->slvdescs + l->slvndx; + /* qui salvandos salvas gratis + salva me fons pietatis */ + p->print(p, "(%s %d #x%x #x%M #x%M #x%M #x%M)\n", + sp->reg_nam, sp->reg_id, (int)l->flags, + l->s[0], l->s[1], l->s[2], l->s[3]); + } + } + p->print(p, "%))\n"); +} + +/* mors stupebit et natura + cum resurget creatura */ +static int imprt(planner *ego, scanner *sc) +{ + char buf[MAXNAM + 1]; + md5uint sig[4]; + int flags; + int reg_id; + int slvndx; + solution *sol; + + if (!sc->scan(sc, "(" WISDOM_PREAMBLE)) + return 0; /* don't need to restore hashtable */ + + /* make a backup copy of the hash table (cache the hash) */ + { + unsigned h, hsiz = ego->hashsiz; + sol = (solution *)MALLOC(hsiz * sizeof(solution), HASHT); + for (h = 0; h < hsiz; ++h) + sol[h] = ego->solutions[h]; + } + + while (1) { + if (sc->scan(sc, ")")) + break; + + /* qua resurget ex favilla */ + if (!sc->scan(sc, "(%*s %d #x%x #x%M #x%M #x%M #x%M)", + MAXNAM, buf, ®_id, &flags, + sig + 0, sig + 1, sig + 2, sig + 3)) + goto bad; + + if ((slvndx = slookup(ego, buf, reg_id)) < 0) + goto bad; + + /* inter oves locum praesta */ + hinsert(ego, sig, (unsigned short)flags, slvndx); + } + + X(ifree0)(sol); + return 1; + + bad: + /* ``The wisdom of FFTW must be above suspicion.'' */ + X(ifree0)(ego->solutions); + ego->solutions = sol; + return 0; +} + +/* + * create a planner + */ +planner *X(mkplanner)(void) +{ + static const planner_adt padt = { + register_solver, mkplan, forget, exprt, imprt + }; + + planner *p = (planner *) MALLOC(sizeof(planner), PLANNERS); + + p->adt = &padt; + p->nplan = p->nprob = p->nrehash = 0; + p->pcost = p->epcost = 0.0; + p->succ_lookup = p->lookup = p->lookup_iter = 0; + p->insert = p->insert_iter = p->insert_unknown = 0; + p->hook = 0; + p->cur_reg_nam = 0; + + p->slvdescs = 0; + p->nslvdesc = p->slvdescsiz = 0; + + p->solutions = 0; + p->hashsiz = p->nelem = 0U; + + p->problem_flags = 0; + p->planner_flags = 0; + p->nthr = 1; + + hgrow(p); /* so that hashsiz > 0 */ + + return p; +} + +void X(planner_destroy)(planner *ego) +{ + /* destroy hash table */ + htab_destroy(ego); + + /* destroy solvdesc table */ + FORALL_SOLVERS(ego, s, sp, { + UNUSED(sp); + X(solver_destroy)(s); + }); + + X(ifree0)(ego->slvdescs); + X(ifree)(ego); /* dona eis requiem */ +} + +plan *X(mkplan_d)(planner *ego, problem *p) +{ + plan *pln = ego->adt->mkplan(ego, p); + X(problem_destroy)(p); + return pln; +} + +/* + * Debugging code: + */ +#ifdef FFTW_DEBUG + +void X(planner_dump)(planner *ego, int verbose) +{ + unsigned valid = 0, empty = 0, infeasible = 0; + unsigned h; + UNUSED(verbose); /* historical */ + + for (h = 0; h < ego->hashsiz; ++h) { + solution *l = ego->solutions + h; + if (VALIDP(l)) { + ++valid; + if (l->slvndx < 0) ++infeasible; + } else + ++empty; + + } + + D("nplan = %d\n", ego->nplan); + D("nprob = %d\n", ego->nprob); + D("pcost = %g\n", ego->pcost); + D("epcost = %g\n", ego->epcost); + D("lookup = %d\n", ego->lookup); + D("succ_lookup = %d\n", ego->succ_lookup); + D("lookup_iter = %d\n", ego->lookup_iter); + D("insert = %d\n", ego->insert); + D("insert_iter = %d\n", ego->insert_iter); + D("insert_unknown = %d\n", ego->insert_unknown); + D("nrehash = %d\n", ego->nrehash); + D("hashsiz = %u\n", ego->hashsiz); + D("empty = %d\n", empty); + D("valid = %d\n", valid); + D("infeasible = %d\n", infeasible); + A(ego->nelem == valid); +} + +#endif diff --git a/src/fftw3/kernel/primes.c b/src/fftw3/kernel/primes.c new file mode 100644 index 0000000..608e51c --- /dev/null +++ b/src/fftw3/kernel/primes.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: primes.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +/***************************************************************************/ + +/* Rader's algorithm requires lots of modular arithmetic, and if we + aren't careful we can have errors due to integer overflows. */ + +#ifdef SAFE_MULMOD + +# include <limits.h> + +/* compute (x * y) mod p, but watch out for integer overflows; we must + have x, y >= 0, p > 0. This routine is slow. */ +int X(safe_mulmod)(int x, int y, int p) +{ + if (y == 0 || x <= INT_MAX / y) + return((x * y) % p); + else { + int y2 = y/2; + return((X(safe_mulmod)(x, y2, p) + + X(safe_mulmod)(x, y - y2, p)) % p); + } +} +#endif /* safe_mulmod ('long long' unavailable) */ + +/***************************************************************************/ + +/* Compute n^m mod p, where m >= 0 and p > 0. If we really cared, we + could make this tail-recursive. */ +int X(power_mod)(int n, int m, int p) +{ + A(p > 0); + if (m == 0) + return 1; + else if (m % 2 == 0) { + int x = X(power_mod)(n, m / 2, p); + return MULMOD(x, x, p); + } + else + return MULMOD(n, X(power_mod)(n, m - 1, p), p); +} + +/* the following two routines were contributed by Greg Dionne. */ +static int get_prime_factors(int n, int *primef) +{ + int i; + int size = 0; + + primef[size++] = 2; + do + n >>= 1; + while ((n & 1) == 0); + + if (n == 1) + return size; + + for (i = 3; i * i <= n; i += 2) + if (!(n % i)) { + primef[size++] = i; + do + n /= i; + while (!(n % i)); + } + if (n == 1) + return size; + primef[size++] = n; + return size; +} + +int X(find_generator)(int p) +{ + int n, i, size; + int primef[16]; /* smallest number = 32589158477190044730 > 2^64 */ + int pm1 = p - 1; + + if (p == 2) + return 1; + + size = get_prime_factors(pm1, primef); + n = 2; + for (i = 0; i < size; i++) + if (X(power_mod)(n, pm1 / primef[i], p) == 1) { + i = -1; + n++; + } + return n; +} + +/* Return first prime divisor of n (It would be at best slightly faster to + search a static table of primes; there are 6542 primes < 2^16.) */ +int X(first_divisor)(int n) +{ + int i; + if (n <= 1) + return n; + if (n % 2 == 0) + return 2; + for (i = 3; i*i <= n; i += 2) + if (n % i == 0) + return i; + return n; +} + +int X(is_prime)(int n) +{ + return(n > 1 && X(first_divisor)(n) == n); +} + +int X(next_prime)(int n) +{ + while (!X(is_prime)(n)) ++n; + return n; +} diff --git a/src/fftw3/kernel/print.c b/src/fftw3/kernel/print.c new file mode 100644 index 0000000..314be7c --- /dev/null +++ b/src/fftw3/kernel/print.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: print.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" +#include <stddef.h> +#include <stdarg.h> +#include <stdio.h> + +#define BSZ 64 + +static void myputs(printer *p, const char *s) +{ + char c; + while ((c = *s++)) + p->putchr(p, c); +} + +static void vprint(printer *p, const char *format, va_list ap) +{ + char buf[BSZ]; + const char *s = format; + char c; + int i; + + for (i = 0; i < p->indent; ++i) + p->putchr(p, ' '); + + while ((c = *s++)) { + switch (c) { + case '%': + switch ((c = *s++)) { + case 'M': { + /* md5 value */ + md5uint x = va_arg(ap, md5uint); + x = 0xffffffffUL & x; + sprintf(buf, "%8.8lx", (unsigned long)x); + goto putbuf; + } + case 'c': { + int x = va_arg(ap, int); + p->putchr(p, x); + break; + } + case 's': { + char *x = va_arg(ap, char *); + if (x) + myputs(p, x); + else + goto putnull; + break; + } + case 'd': { + int x = va_arg(ap, int); + sprintf(buf, "%d", x); + goto putbuf; + } + case 't': { + ptrdiff_t x; + A(*s == 'd'); + s += 1; + x = va_arg(ap, ptrdiff_t); + /* should use C99 %td here, but + this is not yet widespread enough */ + sprintf(buf, "%ld", (long) x); + goto putbuf; + } + case 'f': case 'e': case 'g': { + char fmt[3] = "%x"; + double x = va_arg(ap, double); + fmt[1] = c; + sprintf(buf, fmt, x); + goto putbuf; + } + case 'v': { + /* print optional vector length */ + int x = va_arg(ap, int); + if (x > 1) { + sprintf(buf, "-x%d", x); + goto putbuf; + } + break; + } + case 'o': { + /* integer option. Usage: %oNAME= */ + int x = va_arg(ap, int); + if (x) + p->putchr(p, '/'); + while ((c = *s++) != '=') + if (x) + p->putchr(p, c); + if (x) { + sprintf(buf, "=%d", x); + goto putbuf; + } + break; + } + case 'u': { + unsigned x = va_arg(ap, unsigned); + sprintf(buf, "%u", x); + goto putbuf; + } + case 'x': { + unsigned x = va_arg(ap, unsigned); + sprintf(buf, "%x", x); + goto putbuf; + } + case '(': { + /* newline, augment indent level */ + p->putchr(p, '\n'); + p->indent += p->indent_incr; + break; + } + case ')': { + /* decrement indent level */ + p->indent -= p->indent_incr; + break; + } + case 'p': { /* note difference from C's %p */ + /* print plan */ + plan *x = va_arg(ap, plan *); + if (x) + x->adt->print(x, p); + else + goto putnull; + break; + } + case 'P': { + /* print problem */ + problem *x = va_arg(ap, problem *); + if (x) + x->adt->print(x, p); + else + goto putnull; + break; + } + case 'T': { + /* print tensor */ + tensor *x = va_arg(ap, tensor *); + if (x) + X(tensor_print)(x, p); + else + goto putnull; + break; + } + default: + A(0 /* unknown format */); + break; + + putbuf: + myputs(p, buf); + break; + putnull: + myputs(p, "(null)"); + break; + } + break; + default: + p->putchr(p, c); + break; + } + } +} + +static void print(printer *p, const char *format, ...) +{ + va_list ap; + va_start(ap, format); + vprint(p, format, ap); + va_end(ap); +} + +printer *X(mkprinter)(size_t size, + void (*putchr)(printer *p, char c), + void (*cleanup)(printer *p)) +{ + printer *s = (printer *)MALLOC(size, OTHER); + s->print = print; + s->vprint = vprint; + s->putchr = putchr; + s->cleanup = cleanup; + s->indent = 0; + s->indent_incr = 2; + return s; +} + +void X(printer_destroy)(printer *p) +{ + if (p->cleanup) + p->cleanup(p); + X(ifree)(p); +} diff --git a/src/fftw3/kernel/scan.c b/src/fftw3/kernel/scan.c new file mode 100644 index 0000000..f5fa6e1 --- /dev/null +++ b/src/fftw3/kernel/scan.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: scan.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" +#include <string.h> +#include <stddef.h> +#include <stdarg.h> +#include <stdio.h> + +#ifdef USE_CTYPE +#include <ctype.h> +#else +/* Screw ctype. On linux, the is* functions call a routine that gets + the ctype map in the current locale. Because this operation is + expensive, the map is cached on a per-thread basis. I am not + willing to link this crap with FFTW. Not over my dead body. + + Sic transit gloria mundi. +*/ +#undef isspace +#define isspace(x) ((x) >= 0 && (x) <= ' ') +#undef isdigit +#define isdigit(x) ((x) >= '0' && (x) <= '9') +#undef isupper +#define isupper(x) ((x) >= 'A' && (x) <= 'Z') +#undef islower +#define islower(x) ((x) >= 'a' && (x) <= 'z') +#endif + +static int mygetc(scanner *sc) +{ + if (sc->ungotc != EOF) { + int c = sc->ungotc; + sc->ungotc = EOF; + return c; + } + return(sc->getchr(sc)); +} + +#define GETCHR(sc) mygetc(sc) + +static void myungetc(scanner *sc, int c) +{ + sc->ungotc = c; +} + +#define UNGETCHR(sc, c) myungetc(sc, c) + +static void eat_blanks(scanner *sc) +{ + int ch; + while (ch = GETCHR(sc), isspace(ch)) + ; + UNGETCHR(sc, ch); +} + +static void mygets(scanner *sc, char *s, size_t maxlen) +{ + char *s0 = s; + int ch; + + A(maxlen > 0); + while ((ch = GETCHR(sc)) != EOF && !isspace(ch) + && ch != ')' && ch != '(' && s < s0 + maxlen) + *s++ = ch; + *s = 0; + UNGETCHR(sc, ch); +} + +static long getlong(scanner *sc, int base, int *ret) +{ + int sign = 1, ch, count; + long x = 0; + + ch = GETCHR(sc); + if (ch == '-' || ch == '+') { + sign = ch == '-' ? -1 : 1; + ch = GETCHR(sc); + } + for (count = 0; ; ++count) { + if (isdigit(ch)) + ch -= '0'; + else if (isupper(ch)) + ch -= 'A' - 10; + else if (islower(ch)) + ch -= 'a' - 10; + else + break; + x = x * base + ch; + ch = GETCHR(sc); + } + x *= sign; + UNGETCHR(sc, ch); + *ret = count > 0; + return x; +} + +/* vscan is mostly scanf-like, with our additional format specifiers, + but with a few twists. It returns simply 0 or 1 indicating whether + the match was successful. '(' and ')' in the format string match + those characters preceded by any whitespace. Finally, if a + character match fails, it will ungetchr() the last character back + onto the stream. */ +static int vscan(scanner *sc, const char *format, va_list ap) +{ + const char *s = format; + char c; + int ch = 0; + size_t fmt_len; + + while ((c = *s++)) { + fmt_len = 0; + switch (c) { + case '%': + getformat: + switch ((c = *s++)) { + case 's': { + char *x = va_arg(ap, char *); + mygets(sc, x, fmt_len); + break; + } + case 'd': { + int *x = va_arg(ap, int *); + *x = (int) getlong(sc, 10, &ch); + if (!ch) return 0; + break; + } + case 'x': { + int *x = va_arg(ap, int *); + *x = (int) getlong(sc, 16, &ch); + if (!ch) return 0; + break; + } + case 'M': { + md5uint *x = va_arg(ap, md5uint *); + *x = 0xffffffffUL & getlong(sc, 16, &ch); + if (!ch) return 0; + break; + } + case '*': { + if ((fmt_len = va_arg(ap, int)) <= 0) return 0; + goto getformat; + } + default: + A(0 /* unknown format */); + break; + } + break; + default: + if (isspace(c) || c == '(' || c == ')') + eat_blanks(sc); + if (!isspace(c) && (ch = GETCHR(sc)) != c) { + UNGETCHR(sc, ch); + return 0; + } + break; + } + } + return 1; +} + +static int scan(scanner *sc, const char *format, ...) +{ + int ret; + va_list ap; + va_start(ap, format); + ret = vscan(sc, format, ap); + va_end(ap); + return ret; +} + +scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc)) +{ + scanner *s = (scanner *)MALLOC(size, OTHER); + s->scan = scan; + s->vscan = vscan; + s->getchr = getchr; + s->ungotc = EOF; + return s; +} + +void X(scanner_destroy)(scanner *sc) +{ + X(ifree)(sc); +} diff --git a/src/fftw3/kernel/solver.c b/src/fftw3/kernel/solver.c new file mode 100644 index 0000000..4bfb899 --- /dev/null +++ b/src/fftw3/kernel/solver.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: solver.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +solver *X(mksolver)(size_t size, const solver_adt *adt) +{ + solver *s = (solver *)MALLOC(size, SOLVERS); + + s->adt = adt; + s->refcnt = 0; + return s; +} + +void X(solver_use)(solver *ego) +{ + ++ego->refcnt; +} + +void X(solver_destroy)(solver *ego) +{ + if ((--ego->refcnt) == 0) + X(ifree)(ego); +} + +void X(solver_register)(planner *plnr, solver *s) +{ + plnr->adt->register_solver(plnr, s); +} diff --git a/src/fftw3/kernel/solvtab.c b/src/fftw3/kernel/solvtab.c new file mode 100644 index 0000000..496915e --- /dev/null +++ b/src/fftw3/kernel/solvtab.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: solvtab.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +void X(solvtab_exec)(solvtab tbl, planner *p) +{ + for (; tbl->reg_nam; ++tbl) { + p->cur_reg_nam = tbl->reg_nam; + p->cur_reg_id = 0; + tbl->reg(p); + } + p->cur_reg_nam = 0; +} diff --git a/src/fftw3/kernel/square.c b/src/fftw3/kernel/square.c new file mode 100644 index 0000000..4b5afab --- /dev/null +++ b/src/fftw3/kernel/square.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: square.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +int X(square)(int x) +{ + return x * x; +} diff --git a/src/fftw3/kernel/stride.c b/src/fftw3/kernel/stride.c new file mode 100644 index 0000000..fda049f --- /dev/null +++ b/src/fftw3/kernel/stride.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: stride.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ +#include "ifftw.h" + +#ifdef PRECOMPUTE_ARRAY_INDICES +stride X(mkstride)(int n, int s) +{ + int i; + int *p = (int *) MALLOC(n * sizeof(int), STRIDES); + + for (i = 0; i < n; ++i) + p[i] = s * i; + + return p; +} + +void X(stride_destroy)(stride p) +{ + X(ifree0)(p); +} + +#endif diff --git a/src/fftw3/kernel/tensor.c b/src/fftw3/kernel/tensor.c new file mode 100644 index 0000000..1161963 --- /dev/null +++ b/src/fftw3/kernel/tensor.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: tensor.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +tensor *X(mktensor)(int rnk) +{ + tensor *x; + + A(rnk >= 0); + +#if defined(STRUCT_HACK_KR) + if (FINITE_RNK(rnk) && rnk > 1) + x = (tensor *)MALLOC(sizeof(tensor) + (rnk - 1) * sizeof(iodim), + TENSORS); + else + x = (tensor *)MALLOC(sizeof(tensor), TENSORS); +#elif defined(STRUCT_HACK_C99) + if (FINITE_RNK(rnk)) + x = (tensor *)MALLOC(sizeof(tensor) + rnk * sizeof(iodim), + TENSORS); + else + x = (tensor *)MALLOC(sizeof(tensor), TENSORS); +#else + x = (tensor *)MALLOC(sizeof(tensor), TENSORS); + if (FINITE_RNK(rnk) && rnk > 0) + x->dims = (iodim *)MALLOC(sizeof(iodim) * rnk, TENSORS); + else + x->dims = 0; +#endif + + x->rnk = rnk; + return x; +} + +void X(tensor_destroy)(tensor *sz) +{ +#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR) + X(ifree0)(sz->dims); +#endif + X(ifree)(sz); +} + +int X(tensor_sz)(const tensor *sz) +{ + int i, n = 1; + + if (!FINITE_RNK(sz->rnk)) + return 0; + + for (i = 0; i < sz->rnk; ++i) + n *= sz->dims[i].n; + return n; +} + +void X(tensor_md5)(md5 *p, const tensor *t) +{ + int i; + X(md5int)(p, t->rnk); + if (FINITE_RNK(t->rnk)) { + for (i = 0; i < t->rnk; ++i) { + const iodim *q = t->dims + i; + X(md5int)(p, q->n); + X(md5int)(p, q->is); + X(md5int)(p, q->os); + } + } +} + +/* treat a (rank <= 1)-tensor as a rank-1 tensor, extracting + appropriate n, is, and os components */ +int X(tensor_tornk1)(const tensor *t, int *n, int *is, int *os) +{ + A(t->rnk <= 1); + if (t->rnk == 1) { + const iodim *vd = t->dims; + *n = vd[0].n; + *is = vd[0].is; + *os = vd[0].os; + } else { + *n = 1; + *is = *os = 0; + } + return 1; +} + +void X(tensor_print)(const tensor *x, printer *p) +{ + if (FINITE_RNK(x->rnk)) { + int i; + int first = 1; + p->print(p, "("); + for (i = 0; i < x->rnk; ++i) { + const iodim *d = x->dims + i; + p->print(p, "%s(%d %d %d)", + first ? "" : " ", + d->n, d->is, d->os); + first = 0; + } + p->print(p, ")"); + } else { + p->print(p, "rank-minfty"); + } +} diff --git a/src/fftw3/kernel/tensor1.c b/src/fftw3/kernel/tensor1.c new file mode 100644 index 0000000..06ad4dc --- /dev/null +++ b/src/fftw3/kernel/tensor1.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: tensor1.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +tensor *X(mktensor_0d)(void) +{ + return X(mktensor(0)); +} + +tensor *X(mktensor_1d)(int n, int is, int os) +{ + tensor *x = X(mktensor)(1); + x->dims[0].n = n; + x->dims[0].is = is; + x->dims[0].os = os; + return x; +} diff --git a/src/fftw3/kernel/tensor2.c b/src/fftw3/kernel/tensor2.c new file mode 100644 index 0000000..a45e164 --- /dev/null +++ b/src/fftw3/kernel/tensor2.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: tensor2.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +tensor *X(mktensor_2d)(int n0, int is0, int os0, + int n1, int is1, int os1) +{ + tensor *x = X(mktensor)(2); + x->dims[0].n = n0; + x->dims[0].is = is0; + x->dims[0].os = os0; + x->dims[1].n = n1; + x->dims[1].is = is1; + x->dims[1].os = os1; + return x; +} + diff --git a/src/fftw3/kernel/tensor4.c b/src/fftw3/kernel/tensor4.c new file mode 100644 index 0000000..9b6cd28 --- /dev/null +++ b/src/fftw3/kernel/tensor4.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: tensor4.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +int X(tensor_max_index)(const tensor *sz) +{ + int i; + int n = 0; + + A(FINITE_RNK(sz->rnk)); + for (i = 0; i < sz->rnk; ++i) { + const iodim *p = sz->dims + i; + n += (p->n - 1) * X(imax)(X(iabs)(p->is), X(iabs)(p->os)); + } + return n; +} + +#define tensor_min_xstride(sz, xs) { \ + A(FINITE_RNK(sz->rnk)); \ + if (sz->rnk == 0) return 0; \ + else { \ + int i; \ + int s = X(iabs)(sz->dims[0].xs); \ + for (i = 1; i < sz->rnk; ++i) \ + s = X(imin)(s, X(iabs)(sz->dims[i].xs)); \ + return s; \ + } \ +} + +int X(tensor_min_istride)(const tensor *sz) tensor_min_xstride(sz, is) +int X(tensor_min_ostride)(const tensor *sz) tensor_min_xstride(sz, os) + +int X(tensor_min_stride)(const tensor *sz) +{ + return X(imin)(X(tensor_min_istride)(sz), X(tensor_min_ostride)(sz)); +} + +int X(tensor_inplace_strides)(const tensor *sz) +{ + int i; + A(FINITE_RNK(sz->rnk)); + for (i = 0; i < sz->rnk; ++i) { + const iodim *p = sz->dims + i; + if (p->is != p->os) + return 0; + } + return 1; +} + +int X(tensor_inplace_strides2)(const tensor *a, const tensor *b) +{ + return X(tensor_inplace_strides(a)) && X(tensor_inplace_strides(b)); +} diff --git a/src/fftw3/kernel/tensor5.c b/src/fftw3/kernel/tensor5.c new file mode 100644 index 0000000..8144930 --- /dev/null +++ b/src/fftw3/kernel/tensor5.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: tensor5.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +static void dimcpy(iodim *dst, const iodim *src, int rnk) +{ + int i; + if (FINITE_RNK(rnk)) + for (i = 0; i < rnk; ++i) + dst[i] = src[i]; +} + +tensor *X(tensor_copy)(const tensor *sz) +{ + tensor *x = X(mktensor)(sz->rnk); + dimcpy(x->dims, sz->dims, sz->rnk); + return x; +} + +/* like X(tensor_copy), but makes strides in-place by + setting os = is if k == INPLACE_IS or is = os if k == INPLACE_OS. */ +tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k) +{ + tensor *x = X(tensor_copy)(sz); + if (FINITE_RNK(x->rnk)) { + int i; + if (k == INPLACE_OS) + for (i = 0; i < x->rnk; ++i) + x->dims[i].is = x->dims[i].os; + else + for (i = 0; i < x->rnk; ++i) + x->dims[i].os = x->dims[i].is; + } + return x; +} + +/* Like X(tensor_copy), but copy all of the dimensions *except* + except_dim. */ +tensor *X(tensor_copy_except)(const tensor *sz, int except_dim) +{ + tensor *x; + + A(FINITE_RNK(sz->rnk) && sz->rnk >= 1 && except_dim < sz->rnk); + x = X(mktensor)(sz->rnk - 1); + dimcpy(x->dims, sz->dims, except_dim); + dimcpy(x->dims + except_dim, sz->dims + except_dim + 1, + x->rnk - except_dim); + return x; +} + +/* Like X(tensor_copy), but copy only rnk dimensions starting + with start_dim. */ +tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk) +{ + tensor *x; + + A(FINITE_RNK(sz->rnk) && start_dim + rnk <= sz->rnk); + x = X(mktensor)(rnk); + dimcpy(x->dims, sz->dims + start_dim, rnk); + return x; +} + +tensor *X(tensor_append)(const tensor *a, const tensor *b) +{ + if (!FINITE_RNK(a->rnk) || !FINITE_RNK(b->rnk)) { + return X(mktensor)(RNK_MINFTY); + } else { + tensor *x = X(mktensor)(a->rnk + b->rnk); + dimcpy(x->dims, a->dims, a->rnk); + dimcpy(x->dims + a->rnk, b->dims, b->rnk); + return x; + } +} diff --git a/src/fftw3/kernel/tensor7.c b/src/fftw3/kernel/tensor7.c new file mode 100644 index 0000000..ceae2c4 --- /dev/null +++ b/src/fftw3/kernel/tensor7.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: tensor7.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +/* total order among iodim's */ +int X(dimcmp)(const iodim *a, const iodim *b) +{ + if (b->is != a->is) + return (b->is - a->is); /* shorter strides go later */ + if (b->os != a->os) + return (b->os - a->os); /* shorter strides go later */ + return (int)(a->n - b->n); /* larger n's go later */ +} + +/* Like tensor_copy, but eliminate n == 1 dimensions, which + never affect any transform or transform vector. + + Also, we sort the tensor into a canonical order of decreasing + is. In general, processing a loop/array in order of + decreasing stride will improve locality; sorting also makes the + analysis in fftw_tensor_contiguous (below) easier. The choice + of is over os is mostly arbitrary, and hopefully + shouldn't affect things much. Normally, either the os will be + in the same order as is (for e.g. multi-dimensional + transforms) or will be in opposite order (e.g. for Cooley-Tukey + recursion). (Both forward and backwards traversal of the tensor + are considered e.g. by vrank-geq1, so sorting in increasing + vs. decreasing order is not really important.) */ +tensor *X(tensor_compress)(const tensor *sz) +{ + int i, rnk; + tensor *x; + + A(FINITE_RNK(sz->rnk)); + for (i = rnk = 0; i < sz->rnk; ++i) { + A(sz->dims[i].n > 0); + if (sz->dims[i].n != 1) + ++rnk; + } + + x = X(mktensor)(rnk); + for (i = rnk = 0; i < sz->rnk; ++i) { + if (sz->dims[i].n != 1) + x->dims[rnk++] = sz->dims[i]; + } + + if (rnk) { + /* God knows how qsort() behaves if n==0 */ + qsort(x->dims, (size_t)x->rnk, sizeof(iodim), + (int (*)(const void *, const void *))X(dimcmp)); + } + + return x; +} + +/* Return whether the strides of a and b are such that they form an + effective contiguous 1d array. Assumes that a.is >= b.is. */ +static int strides_contig(iodim *a, iodim *b) +{ + return (a->is == b->is * (int)b->n && + a->os == b->os * (int)b->n); +} + +/* Like tensor_compress, but also compress into one dimension any + group of dimensions that form a contiguous block of indices with + some stride. (This can safely be done for transform vector sizes.) */ +tensor *X(tensor_compress_contiguous)(const tensor *sz) +{ + int i, rnk; + tensor *sz2, *x; + + if (X(tensor_sz)(sz) == 0) + return X(mktensor)(RNK_MINFTY); + + sz2 = X(tensor_compress)(sz); + A(FINITE_RNK(sz2->rnk)); + + if (sz2->rnk < 2) /* nothing to compress */ + return sz2; + + for (i = rnk = 1; i < sz2->rnk; ++i) + if (!strides_contig(sz2->dims + i - 1, sz2->dims + i)) + ++rnk; + + x = X(mktensor)(rnk); + x->dims[0] = sz2->dims[0]; + for (i = rnk = 1; i < sz2->rnk; ++i) { + if (strides_contig(sz2->dims + i - 1, sz2->dims + i)) { + x->dims[rnk - 1].n *= sz2->dims[i].n; + x->dims[rnk - 1].is = sz2->dims[i].is; + x->dims[rnk - 1].os = sz2->dims[i].os; + } else { + A(rnk < x->rnk); + x->dims[rnk++] = sz2->dims[i]; + } + } + + X(tensor_destroy)(sz2); + return x; +} + +/* The inverse of X(tensor_append): splits the sz tensor into + tensor a followed by tensor b, where a's rank is arnk. */ +void X(tensor_split)(const tensor *sz, tensor **a, int arnk, tensor **b) +{ + A(FINITE_RNK(sz->rnk) && FINITE_RNK(arnk)); + + *a = X(tensor_copy_sub)(sz, 0, arnk); + *b = X(tensor_copy_sub)(sz, arnk, sz->rnk - arnk); +} diff --git a/src/fftw3/kernel/tensor8.c b/src/fftw3/kernel/tensor8.c new file mode 100644 index 0000000..05e9b47 --- /dev/null +++ b/src/fftw3/kernel/tensor8.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: tensor8.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +void X(tensor_destroy2)(tensor *a, tensor *b) +{ + X(tensor_destroy)(a); + X(tensor_destroy)(b); +} + +void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d) +{ + X(tensor_destroy2)(a, b); + X(tensor_destroy2)(c, d); +} diff --git a/src/fftw3/kernel/tensor9.c b/src/fftw3/kernel/tensor9.c new file mode 100644 index 0000000..33ddf45 --- /dev/null +++ b/src/fftw3/kernel/tensor9.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: tensor9.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +int X(tensor_kosherp)(const tensor *x) +{ + int i; + + if (x->rnk < 0) return 0; + + if (FINITE_RNK(x->rnk)) { + for (i = 0; i < x->rnk; ++i) + if (x->dims[i].n < 0) + return 0; + } + return 1; +} diff --git a/src/fftw3/kernel/timer.c b/src/fftw3/kernel/timer.c new file mode 100644 index 0000000..72969ad --- /dev/null +++ b/src/fftw3/kernel/timer.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: timer.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +#include "ifftw.h" + +#ifdef HAVE_UNISTD_H +# include <unistd.h> +#endif + +#ifndef WITH_SLOW_TIMER +# include "cycle.h" +#else +# if TIME_WITH_SYS_TIME +# include <sys/time.h> +# include <time.h> +# else +# if HAVE_SYS_TIME_H +# include <sys/time.h> +# else +# include <time.h> +# endif +# endif +#endif + +#ifndef FFTW_TIME_LIMIT +#define FFTW_TIME_LIMIT 2.0 /* don't run for more than two seconds */ +#endif + +#ifdef HAVE_BSDGETTIMEOFDAY +#ifndef HAVE_GETTIMEOFDAY +#define gettimeofday BSDgettimeofday +#define HAVE_GETTIMEOFDAY 1 +#endif +#endif + +#if defined(HAVE_GETTIMEOFDAY) && !defined(HAVE_SECONDS_TIMER) +typedef struct timeval seconds; + +static seconds getseconds(void) +{ + struct timeval tv; + gettimeofday(&tv, 0); + return tv; +} + +static double elapsed_sec(seconds t1, seconds t0) +{ + return (double)(t1.tv_sec - t0.tv_sec) + + (double)(t1.tv_usec - t0.tv_usec) * 1.0E-6; +} + +# define TIME_MIN_SEC 1.0e-2 /* from fftw2 */ +# define HAVE_SECONDS_TIMER +#endif + +#ifndef HAVE_SECONDS_TIMER +# include <time.h> + +typedef clock_t seconds; + +static seconds getseconds(void) { return clock(); } + +static double elapsed_sec(seconds t1, seconds t0) +{ + return ((double) (t1 - t0)) / CLOCKS_PER_SEC; +} + +# define TIME_MIN_SEC 2.0e-1 /* from fftw2 */ +# define HAVE_SECONDS_TIMER +#endif + +#ifdef WITH_SLOW_TIMER +/* excruciatingly slow; only use this if there is no choice! */ +typedef seconds ticks; +# define getticks getseconds +# define elapsed elapsed_sec +# define TIME_MIN TIME_MIN_SEC +# define TIME_REPEAT 4 /* from fftw2 */ +# define HAVE_TICK_COUNTER +#endif + +#ifdef HAVE_TICK_COUNTER + +# ifndef TIME_MIN +# define TIME_MIN 100.0 +# endif + +# ifndef TIME_REPEAT +# define TIME_REPEAT 8 +# endif + + static double measure(plan *pln, const problem *p, int iter) + { + ticks t0, t1; + int i; + + t0 = getticks(); + for (i = 0; i < iter; ++i) + pln->adt->solve(pln, p); + t1 = getticks(); + return elapsed(t1, t0); + } + + + double X(measure_execution_time)(plan *pln, const problem *p) + { + seconds begin, now; + double t, tmax, tmin; + int iter; + int repeat; + + AWAKE(pln, 1); + p->adt->zero(p); + + start_over: + for (iter = 1; iter; iter *= 2) { + tmin = 1.0E10; + tmax = -1.0E10; + + begin = getseconds(); + /* repeat the measurement TIME_REPEAT times */ + for (repeat = 0; repeat < TIME_REPEAT; ++repeat) { + t = measure(pln, p, iter); + + if (t < 0) + goto start_over; + + if (t < tmin) + tmin = t; + if (t > tmax) + tmax = t; + + /* do not run for too long */ + now = getseconds(); + t = elapsed_sec(now, begin); + + if (t > FFTW_TIME_LIMIT) + break; + } + + if (tmin >= TIME_MIN) { + tmin /= (double) iter; + tmax /= (double) iter; + AWAKE(pln, 0); + return tmin; + } + } + goto start_over; /* may happen if timer is screwed up */ + } + +#else /* no cycle counter */ + + double X(measure_execution_time)(plan *pln, const problem *p) + { + UNUSED(p); + UNUSED(pln); + return -1.0; + } + +#endif diff --git a/src/fftw3/kernel/transpose.c b/src/fftw3/kernel/transpose.c new file mode 100644 index 0000000..fb489bd --- /dev/null +++ b/src/fftw3/kernel/transpose.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* transposes of unit-stride arrays, including arrays of N-tuples and + non-square matrices, using cache-oblivious recursive algorithms */ + +#include "ifftw.h" +#include <string.h> /* memcpy */ + +#define CUTOFF 8 /* size below which we do a naive transpose */ + +/*************************************************************************/ +/* some utilities for the solvers */ + +static int Ntuple_transposable(const iodim *a, const iodim *b, + int vl, int s, R *ri, R *ii) +{ + return(2 == s && (ii == ri + 1 || ri == ii + 1) + && + ((a->is == b->os && a->is == (vl*2) + && a->os == b->n * (vl*2) && b->is == a->n * (vl*2)) + || + (a->os == b->is && a->os == (vl*2) + && a->is == b->n * (vl*2) && b->os == a->n * (vl*2)))); +} + + +/* our solvers' transpose routines work for square matrices of arbitrary + stride, or for non-square matrices of a given vl*vl2 corresponding + to the N of the Ntuple with vl2 == s. */ +int X(transposable)(const iodim *a, const iodim *b, + int vl, int s, R *ri, R *ii) +{ + return ((a->n == b->n && a->os == b->is && a->is == b->os) + || Ntuple_transposable(a, b, vl, s, ri, ii)); +} + +static int gcd(int a, int b) +{ + int r; + do { + r = a % b; + a = b; + b = r; + } while (r != 0); + + return a; +} + +/* all of the solvers need to extract n, m, d, n/d, and m/d from the + two iodims, so we put it here to save code space */ +void X(transpose_dims)(const iodim *a, const iodim *b, + int *n, int *m, int *d, int *nd, int *md) +{ + int n0, m0, d0; + /* matrix should be n x m, row-major */ + if (a->is < b->is) { + *n = n0 = b->n; + *m = m0 = a->n; + } + else { + *n = n0 = a->n; + *m = m0 = b->n; + } + *d = d0 = gcd(n0, m0); + *nd = n0 / d0; + *md = m0 / d0; +} + +/* use the simple square transpose in the solver for square matrices + that aren't too big or which have the wrong stride */ +int X(transpose_simplep)(const iodim *a, const iodim *b, int vl, int s, + R *ri, R *ii) +{ + return (a->n == b->n && + (a->n*(vl*2) < CUTOFF + || !Ntuple_transposable(a, b, vl, s, ri, ii))); +} + +/* use the slow general transpose if the buffer would be more than 1/8 + the whole transpose and the transpose is fairly big. + (FIXME: use the CONSERVE_MEMORY flag?) */ +int X(transpose_slowp)(const iodim *a, const iodim *b, int N) +{ + int d = gcd(a->n, b->n); + return (d < 8 && (a->n * b->n * N) / d > 65536); +} + +/*************************************************************************/ +/* Out-of-place transposes: */ + +/* Transpose A (n x m) to B (m x n), where A and B are stored + as n x fda and m x fda arrays, respectively, operating on N-tuples: */ +static void rec_transpose_Ntuple(R *A, R *B, int n, int m, int fda, int fdb, + int N) +{ + if (n == 1 || m == 1 || (n + m) * N < CUTOFF*2) { + int i, j, k; + for (i = 0; i < n; ++i) { + for (j = 0; j < m; ++j) { + for (k = 0; k < N; ++k) { /* FIXME: unroll */ + B[(j*fdb + i) * N + k] = A[(i*fda + j) * N + k]; + } + } + } + } + else if (n > m) { + int n2 = n / 2; + rec_transpose_Ntuple(A, B, n2, m, fda, fdb, N); + rec_transpose_Ntuple(A + n2*N*fda, B + n2*N, n - n2, m, fda, fdb, N); + } + else { + int m2 = m / 2; + rec_transpose_Ntuple(A, B, n, m2, fda, fdb, N); + rec_transpose_Ntuple(A + m2*N, B + m2*N*fdb, n, m - m2, fda, fdb, N); + } +} + +/*************************************************************************/ +/* In-place transposes of square matrices of N-tuples: */ + +/* Transpose both A and B, where A is n x m and B is m x n, storing + the transpose of A in B and the transpose of B in A. A and B + are actually stored as n x fda and m x fda arrays. */ +static void rec_transpose_swap_Ntuple(R *A, R *B, int n, int m, int fda, int N) +{ + if (n == 1 || m == 1 || (n + m) * N <= CUTOFF*2) { + switch (N) { + case 1: { + int i, j; + for (i = 0; i < n; ++i) { + for (j = 0; j < m; ++j) { + R a = A[(i*fda + j)]; + A[(i*fda + j)] = B[(j*fda + i)]; + B[(j*fda + i)] = a; + } + } + break; + } + case 2: { + int i, j; + for (i = 0; i < n; ++i) { + for (j = 0; j < m; ++j) { + R a0 = A[(i*fda + j) * 2 + 0]; + R a1 = A[(i*fda + j) * 2 + 1]; + A[(i*fda + j) * 2 + 0] = B[(j*fda + i) * 2 + 0]; + A[(i*fda + j) * 2 + 1] = B[(j*fda + i) * 2 + 1]; + B[(j*fda + i) * 2 + 0] = a0; + B[(j*fda + i) * 2 + 1] = a1; + } + } + break; + } + default: { + int i, j, k; + for (i = 0; i < n; ++i) { + for (j = 0; j < m; ++j) { + for (k = 0; k < N; ++k) { + R a = A[(i*fda + j) * N + k]; + A[(i*fda + j) * N + k] = + B[(j*fda + i) * N + k]; + B[(j*fda + i) * N + k] = a; + } + } + } + } + } + } else if (n > m) { + int n2 = n / 2; + rec_transpose_swap_Ntuple(A, B, n2, m, fda, N); + rec_transpose_swap_Ntuple(A + n2*N*fda, B + n2*N, n - n2, m, fda, N); + } + else { + int m2 = m / 2; + rec_transpose_swap_Ntuple(A, B, n, m2, fda, N); + rec_transpose_swap_Ntuple(A + m2*N, B + m2*N*fda, n, m - m2, fda, N); + } +} + +/* Transpose A, an n x n matrix (stored as n x fda), in-place. */ +static void rec_transpose_sq_ip_Ntuple(R *A, int n, int fda, int N) +{ + if (n == 1) + return; + else if (n*N <= CUTOFF) { + switch (N) { + case 1: { + int i, j; + for (i = 0; i < n; ++i) { + for (j = i + 1; j < n; ++j) { + R a = A[(i*fda + j)]; + A[(i*fda + j)] = A[(j*fda + i)]; + A[(j*fda + i)] = a; + } + } + break; + } + case 2: { + int i, j; + for (i = 0; i < n; ++i) { + for (j = i + 1; j < n; ++j) { + R a0 = A[(i*fda + j) * 2 + 0]; + R a1 = A[(i*fda + j) * 2 + 1]; + A[(i*fda + j) * 2 + 0] = A[(j*fda + i) * 2 + 0]; + A[(i*fda + j) * 2 + 1] = A[(j*fda + i) * 2 + 1]; + A[(j*fda + i) * 2 + 0] = a0; + A[(j*fda + i) * 2 + 1] = a1; + } + } + break; + } + default: { + int i, j, k; + for (i = 0; i < n; ++i) { + for (j = i + 1; j < n; ++j) { + for (k = 0; k < N; ++k) { + R a = A[(i*fda + j) * N + k]; + A[(i*fda + j) * N + k] = + A[(j*fda + i) * N + k]; + A[(j*fda + i) * N + k] = a; + } + } + } + } + } + } else { + int n2 = n / 2; + rec_transpose_sq_ip_Ntuple(A, n2, fda, N); + rec_transpose_sq_ip_Ntuple((A + n2*N) + n2*N*fda, n - n2, fda, N); + rec_transpose_swap_Ntuple(A + n2*N, A + n2*N*fda, n2, n - n2, fda,N); + } +} + +/*************************************************************************/ +/* In-place transposes of non-square matrices: */ + +/* Transpose the matrix A in-place, where A is an (n*d) x (m*d) matrix + of N-tuples and buf contains at least n*m*d*N elements. In + general, to transpose a p x q matrix, you should call this routine + with d = gcd(p, q), n = p/d, and m = q/d. */ +void X(transpose)(R *A, int n, int m, int d, int N, R *buf) +{ + A(n > 0 && m > 0 && N > 0 && d > 0); + if (d == 1) { + rec_transpose_Ntuple(A, buf, n,m, m,n, N); + memcpy(A, buf, m*n*N*sizeof(R)); + } + else if (n*m == 1) { + rec_transpose_sq_ip_Ntuple(A, d, d, N); + } + else { + int i, num_el = n*m*d*N; + + /* treat as (d x n) x (d' x m) matrix. (d' = d) */ + + /* First, transpose d x (n x d') x m to d x (d' x n) x m, + using the buf matrix. This consists of d transposes + of contiguous n x d' matrices of m-tuples. */ + if (n > 1) { + for (i = 0; i < d; ++i) { + rec_transpose_Ntuple(A + i*num_el, buf, + n,d, d,n, m*N); + memcpy(A + i*num_el, buf, num_el*sizeof(R)); + } + } + + /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which + is a square in-place transpose of n*m-tuples: */ + rec_transpose_sq_ip_Ntuple(A, d, d, n*m*N); + + /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)), + using the buf matrix. This consists of d' transposes + of contiguous d*n x m matrices. */ + if (m > 1) { + for (i = 0; i < d; ++i) { + rec_transpose_Ntuple(A + i*num_el, buf, + d*n,m, m,d*n, N); + memcpy(A + i*num_el, buf, num_el*sizeof(R)); + } + } + } +} + +/*************************************************************************/ +/* In-place transpose routine from TOMS. This routine is much slower + than the cache-oblivious algorithm above, but is has the advantage + of requiring less buffer space for the case of gcd(nx,ny) small. */ + +/* + * TOMS Transpose. Revised version of algorithm 380. + * + * These routines do in-place transposes of arrays. + * + * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, + * vol. 3, no. 1, 104-110 (1977) ] + * + * C version by Steven G. Johnson. February 1997. + */ + +/* + * "a" is a 1D array of length ny*nx*N which constains the nx x ny + * matrix of N-tuples to be transposed. "a" is stored in row-major + * order (last index varies fastest). move is a 1D array of length + * move_size used to store information to speed up the process. The + * value move_size=(ny+nx)/2 is recommended. buf should be an array + * of length 2*N. + * + */ + +void X(transpose_slow)(R *a, int nx, int ny, int N, + char *move, int move_size, R *buf) +{ + int i, j, im, mn; + R *b, *c, *d; + int ncount; + int k; + + /* check arguments and initialize: */ + A(ny > 0 && nx > 0 && N > 0 && move_size > 0); + + b = buf; + + if (ny == nx) { + /* + * if matrix is square, exchange elements a(i,j) and a(j,i): + */ + for (i = 0; i < nx; ++i) + for (j = i + 1; j < nx; ++j) { + memcpy(b, &a[N * (i + j * nx)], N * sizeof(R)); + memcpy(&a[N * (i + j * nx)], &a[N * (j + i * nx)], N * sizeof(R)); + memcpy(&a[N * (j + i * nx)], b, N * sizeof(R)); + } + return; + } + c = buf + N; + ncount = 2; /* always at least 2 fixed points */ + k = (mn = ny * nx) - 1; + + for (i = 0; i < move_size; ++i) + move[i] = 0; + + if (ny >= 3 && nx >= 3) + ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */ + + i = 1; + im = ny; + + while (1) { + int i1, i2, i1c, i2c; + int kmi; + + /** Rearrange the elements of a loop + and its companion loop: **/ + + i1 = i; + kmi = k - i; + memcpy(b, &a[N * i1], N * sizeof(R)); + i1c = kmi; + memcpy(c, &a[N * i1c], N * sizeof(R)); + + while (1) { + i2 = ny * i1 - k * (i1 / nx); + i2c = k - i2; + if (i1 < move_size) + move[i1] = 1; + if (i1c < move_size) + move[i1c] = 1; + ncount += 2; + if (i2 == i) + break; + if (i2 == kmi) { + d = b; + b = c; + c = d; + break; + } + memcpy(&a[N * i1], &a[N * i2], + N * sizeof(R)); + memcpy(&a[N * i1c], &a[N * i2c], + N * sizeof(R)); + i1 = i2; + i1c = i2c; + } + memcpy(&a[N * i1], b, N * sizeof(R)); + memcpy(&a[N * i1c], c, N * sizeof(R)); + + if (ncount >= mn) + break; /* we've moved all elements */ + + /** Search for loops to rearrange: **/ + + while (1) { + int max = k - i; + ++i; + A(i <= max); + im += ny; + if (im > k) + im -= k; + i2 = im; + if (i == i2) + continue; + if (i >= move_size) { + while (i2 > i && i2 < max) { + i1 = i2; + i2 = ny * i1 - k * (i1 / nx); + } + if (i2 == i) + break; + } else if (!move[i]) + break; + } + } +} diff --git a/src/fftw3/kernel/trig.c b/src/fftw3/kernel/trig.c new file mode 100644 index 0000000..0d4c3b2 --- /dev/null +++ b/src/fftw3/kernel/trig.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: trig.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +/* trigonometric functions */ +#include "ifftw.h" +#include <math.h> + +trigreal X(cos2pi)(int m, int n) +{ + return X(sincos)((trigreal)m, (trigreal)n, 0); +} + +trigreal X(sin2pi)(int m, int n) +{ + return X(sincos)((trigreal)m, (trigreal)n, 1); +} + +trigreal X(tan2pi)(int m, int n) +{ +#if 0 /* unimplemented, unused */ + trigreal dm = m, dn = n; + return TAN(by2pi(dm, dn)); +#endif + UNUSED(m); UNUSED(n); + return 0.0; +} diff --git a/src/fftw3/kernel/trig1.c b/src/fftw3/kernel/trig1.c new file mode 100644 index 0000000..b86362c --- /dev/null +++ b/src/fftw3/kernel/trig1.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: trig1.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +/* trigonometric functions */ +#include "ifftw.h" +#include <math.h> + +#ifdef FFTW_LDOUBLE +# define COS cosl +# define SIN sinl +# define TAN tanl +# define KTRIG(x) (x##L) +#else +# define COS cos +# define SIN sin +# define TAN tan +# define KTRIG(x) (x) +#endif + +static const trigreal K2PI = + KTRIG(6.2831853071795864769252867665590057683943388); +#define by2pi(m, n) ((K2PI * (m)) / (n)) + +/* + * Improve accuracy by reducing x to range [0..1/8] + * before multiplication by 2 * PI. + */ + +trigreal X(sincos)(trigreal m, trigreal n, int sinp) +{ + /* waiting for C to get tail recursion... */ + trigreal half_n = n * KTRIG(0.5); + trigreal quarter_n = half_n * KTRIG(0.5); + trigreal eighth_n = quarter_n * KTRIG(0.5); + trigreal sgn = KTRIG(1.0); + + if (sinp) goto sin; + cos: + if (m < 0) { m = -m; /* goto cos; */ } + if (m > half_n) { m = n - m; goto cos; } + if (m > eighth_n) { m = quarter_n - m; goto sin; } + return sgn * COS(by2pi(m, n)); + + msin: + sgn = -sgn; + sin: + if (m < 0) { m = -m; goto msin; } + if (m > half_n) { m = n - m; goto msin; } + if (m > eighth_n) { m = quarter_n - m; goto cos; } + return sgn * SIN(by2pi(m, n)); +} diff --git a/src/fftw3/kernel/twiddle.c b/src/fftw3/kernel/twiddle.c new file mode 100644 index 0000000..785bdd7 --- /dev/null +++ b/src/fftw3/kernel/twiddle.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2003 Matteo Frigo + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* $Id: twiddle.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */ + +/* Twiddle manipulation */ + +#include "ifftw.h" +#include <math.h> + +/* table of known twiddle factors */ +static twid *twlist = (twid *) 0; + +static int equal_instr(const tw_instr *p, const tw_instr *q) +{ + if (p == q) + return 1; + + for (;; ++p, ++q) { + if (p->op != q->op || p->v != q->v || p->i != q->i) + return 0; + if (p->op == TW_NEXT) /* == q->op */ + return 1; + } + A(0 /* can't happen */); +} + +static int ok_twid(const twid *t, const tw_instr *q, int n, int r, int m) +{ + return (n == t->n && r == t->r && m <= t->m && equal_instr(t->instr, q)); +} + +static twid *lookup(const tw_instr *q, int n, int r, int m) +{ + twid *p; + + for (p = twlist; p && !ok_twid(p, q, n, r, m); p = p->cdr) + ; + return p; +} + +static int twlen0(int r, const tw_instr **pp) +{ + int ntwiddle = 0; + const tw_instr *p = *pp; + + /* compute length of bytecode program */ + A(r > 0); + for ( ; p->op != TW_NEXT; ++p) { + switch (p->op) { + case TW_FULL: + ntwiddle += (r - 1) * 2; + break; + case TW_GENERIC: + ntwiddle += r * 2; + break; + default: + ++ntwiddle; + } + } + + *pp = p; + return ntwiddle; +} + +int X(twiddle_length)(int r, const tw_instr *p) +{ + return twlen0(r, &p); +} + +static R *compute(const tw_instr *instr, int n, int r, int m) +{ + int ntwiddle, j; + R *W, *W0; + const tw_instr *p; + + static trigreal (*const f[])(int, int) = { + X(cos2pi), X(sin2pi), X(tan2pi) + }; + + p = instr; + ntwiddle = twlen0(r, &p); + + W0 = W = (R *)MALLOC(ntwiddle * (m / p->v) * sizeof(R), TWIDDLES); + + for (j = 0; j < m; j += p->v) { + for (p = instr; p->op != TW_NEXT; ++p) { + switch (p->op) { + case TW_FULL: + { + int i; + A((int)p->i == r); /* consistency check */ + for (i = 1; i < r; ++i) { + *W++ = f[TW_COS]((j + p->v) * i, n); + *W++ = f[TW_SIN]((j + p->v) * i, n); + } + break; + } + + case TW_GENERIC: + { + int i; + A(p->v == 0); /* unused */ + A(p->i == 0); /* unused */ + for (i = 0; i < r; ++i) { + int k = j * r + i; + *W++ = f[TW_COS](k, n); + *W++ = FFT_SIGN * f[TW_SIN](k, n); + } + break; + } + + default: + *W++ = f[p->op](((signed int)(j + p->v)) * p->i, n); + break; + } + } + A(m % p->v == 0); + } + + return W0; +} + +void X(mktwiddle)(twid **pp, const tw_instr *instr, int n, int r, int m) +{ + twid *p; + + if (*pp) return; /* already created */ + + if ((p = lookup(instr, n, r, m))) { + ++p->refcnt; + goto done; + } + + p = (twid *) MALLOC(sizeof(twid), TWIDDLES); + p->n = n; + p->r = r; + p->m = m; + p->instr = instr; + p->refcnt = 1; + p->W = compute(instr, n, r, m); + + /* cons! onto twlist */ + p->cdr = twlist; + twlist = p; + + done: + *pp = p; + return; +} + +void X(twiddle_destroy)(twid **pp) +{ + twid *p = *pp; + if (p) { + twid **q; + if ((--p->refcnt) == 0) { + /* remove p from twiddle list */ + for (q = &twlist; *q; q = &((*q)->cdr)) { + if (*q == p) { + *q = p->cdr; + X(ifree)(p->W); + X(ifree)(p); + goto done; + } + } + A(0 /* can't happen */ ); + } + } + done: + *pp = 0; /* destroy pointer */ + return; +} + + +void X(twiddle_awake)(int flg, twid **pp, + const tw_instr *instr, int n, int r, int m) +{ + if (flg) + X(mktwiddle)(pp, instr, n, r, m); + else + X(twiddle_destroy)(pp); +} |