40 files changed, 5400 insertions, 0 deletions
diff --git a/src/fftw3/kernel/align.c b/src/fftw3/kernel/align.c
new file mode 100644
index 0000000..7f475d5
--- /dev/null
+++ b/src/fftw3/kernel/align.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: align.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+#if HAVE_3DNOW
+#  define ALGN 8
+#elif HAVE_SIMD
+#  define ALGN 16
+#elif HAVE_K7
+#  define ALGN 8
+#else
+#  define ALGN (sizeof(R))
+#endif
+
+/* NONPORTABLE */
+int X(alignment_of)(R *p)
+{
+     return (int)(((uintptr_t) p) % ALGN);
+}
diff --git a/src/fftw3/kernel/alloc.c b/src/fftw3/kernel/alloc.c
new file mode 100644
index 0000000..a95e0e8
--- /dev/null
+++ b/src/fftw3/kernel/alloc.c
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: alloc.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN
+#  if defined(HAVE_MALLOC_H)
+#    include <malloc.h>
+#  else
+extern void *memalign(size_t, size_t);
+#  endif
+#endif
+
+#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN
+extern int posix_memalign(void **, size_t, size_t);
+#endif
+
+#if defined(macintosh) /* MacOS 9 */
+#  include <Multiprocessing.h>
+#endif
+
+#define real_malloc X(malloc)
+#define real_free free /* memalign and malloc use ordinary free */
+
+#if defined(WITH_OUR_MALLOC16) && (MIN_ALIGNMENT == 16)
+/* Our own 16-byte aligned malloc/free.  Assumes sizeof(void*) is a
+   power of two <= 8 and that malloc is at least sizeof(void*)-aligned.
+
+   The main reason for this routine is that, as of this writing,
+   Windows does not include any aligned allocation routines in its
+   system libraries, and instead provides an implementation with a
+   Visual C++ "Processor Pack" that you have to statically link into
+   your program.  We do not want to require users to have VC++
+   (e.g. gcc/MinGW should be fine).  Our code should be at least as good
+   as the MS _aligned_malloc, in any case, according to second-hand
+   reports of the algorithm it employs (also based on plain malloc). */
+static void *our_malloc16(size_t n)
+{
+     void *p0, *p;
+     if (!(p0 = malloc(n + 16))) return (void *) 0;
+     p = (void *) (((uintptr_t) p0 + 16) & (~((uintptr_t) 15)));
+     *((void **) p - 1) = p0;
+     return p;
+}
+static void our_free16(void *p)
+{
+     if (p) free(*((void **) p - 1));
+}
+#endif
+
+/* part of user-callable API */
+void *X(malloc)(size_t n)
+{
+     void *p;
+
+#if defined(MIN_ALIGNMENT)
+
+#  if defined(WITH_OUR_MALLOC16) && (MIN_ALIGNMENT == 16)
+     p = our_malloc16(n);
+#    undef real_free
+#    define real_free our_free16
+
+#  elif defined(HAVE_MEMALIGN)
+     p = memalign(MIN_ALIGNMENT, n);
+
+#  elif defined(HAVE_POSIX_MEMALIGN)
+     /* note: posix_memalign is broken in glibc 2.2.5: it constrains
+	the size, not the alignment, to be (power of two) * sizeof(void*).
+        The bug seems to have been fixed as of glibc 2.3.1. */
+     if (posix_memalign(&p, MIN_ALIGNMENT, n))
+	  p = (void*) 0;
+
+#  elif defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+     /* Intel's C compiler defines _mm_malloc and _mm_free intrinsics */
+     p = (void *) _mm_malloc(n, MIN_ALIGNMENT);
+#    undef real_free
+#    define real_free _mm_free
+
+#  elif defined(_MSC_VER)
+     /* MS Visual C++ 6.0 with a "Processor Pack" supports SIMD
+	and _aligned_malloc/free (uses malloc.h) */
+     p = (void *) _aligned_malloc(n, MIN_ALIGNMENT);
+#    undef real_free
+#    define real_free _aligned_free
+
+#  elif (defined(__MACOSX__) || defined(__APPLE__)) && (MIN_ALIGNMENT <= 16)
+     /* MacOS X malloc is already 16-byte aligned */
+     p = malloc(n);
+
+#  elif defined(macintosh) /* MacOS 9 */
+     p = (void *) MPAllocateAligned(n,
+#    if MIN_ALIGNMENT == 8
+				    kMPAllocate8ByteAligned,
+#    elif MIN_ALIGNMENT == 16
+				    kMPAllocate16ByteAligned,
+#    elif MIN_ALIGNMENT == 32
+				    kMPAllocate32ByteAligned,
+#    else
+#      error "Unknown alignment for MPAllocateAligned"
+#    endif
+				    0);
+#    undef real_free
+#    define real_free MPFree
+
+#  else
+     /* Add your machine here and send a patch to fftw@fftw.org 
+        or (e.g. for Windows) configure --with-our-malloc16 */
+#    error "Don't know how to malloc() aligned memory."
+#  endif
+
+#else /* !defined(MIN_ALIGMENT) */
+     p = malloc(n);
+#endif
+
+     return p;
+}
+
+/* part of user-callable API */
+void X(free)(void *p)
+{
+     real_free(p);
+}
+
+/**********************************************************
+ *   DEBUGGING CODE
+ **********************************************************/
+#if defined(FFTW_DEBUG_MALLOC)
+
+#include <stdio.h>
+
+/*
+  debugging malloc/free. 
+ 
+  1) Initialize every malloced and freed area to random values, just
+  to make sure we are not using uninitialized pointers.
+ 
+  2) check for blocks freed twice.
+ 
+  3) Check for writes past the ends of allocated blocks
+ 
+  4) destroy contents of freed blocks in order to detect incorrect reuse.
+ 
+  5) keep track of who allocates what and report memory leaks
+ 
+  This code is a quick and dirty hack.  May be nonportable. 
+  Use at your own risk.
+ 
+*/
+
+#define MAGIC ((size_t)0xABadCafe)
+#define PAD_FACTOR 2
+#define SZ_HEADER (4 * sizeof(size_t))
+#define HASHSZ 1031
+
+static unsigned int hashaddr(void *p)
+{
+     return ((unsigned long)p) % HASHSZ;
+}
+
+struct mstat {
+     int siz;
+     int maxsiz;
+     int cnt;
+     int maxcnt;
+};
+
+static struct mstat mstat[MALLOC_WHAT_LAST];
+
+struct minfo {
+     const char *file;
+     int line;
+     size_t n;
+     void *p;
+     struct minfo *next;
+};
+
+static struct minfo *minfo[HASHSZ] = {0};
+
+#ifdef HAVE_THREADS
+int X(in_thread) = 0;
+#endif
+
+void *X(malloc_debug)(size_t n, enum malloc_tag what,
+                      const char *file, int line)
+{
+     char *p;
+     size_t i;
+     struct minfo *info;
+     struct mstat *stat = mstat + what;
+     struct mstat *estat = mstat + EVERYTHING;
+
+     if (n == 0)
+          n = 1;
+
+     if (!IN_THREAD) {
+	  stat->siz += n;
+	  if (stat->siz > stat->maxsiz)
+	       stat->maxsiz = stat->siz;
+	  estat->siz += n;
+	  if (estat->siz > estat->maxsiz)
+	       estat->maxsiz = estat->siz;
+     }
+
+     p = (char *) real_malloc(PAD_FACTOR * n + SZ_HEADER);
+     A(p);
+
+     /* store the sz in a known position */
+     ((size_t *) p)[0] = n;
+     ((size_t *) p)[1] = MAGIC;
+     ((size_t *) p)[2] = what;
+
+     /* fill with junk */
+     for (i = 0; i < PAD_FACTOR * n; i++)
+          p[i + SZ_HEADER] = (char) (i ^ 0xEF);
+
+     if (!IN_THREAD) {
+	  ++stat->cnt;
+	  ++estat->cnt;
+	  
+	  if (stat->cnt > stat->maxcnt)
+	       stat->maxcnt = stat->cnt;
+	  if (estat->cnt > estat->maxcnt)
+	       estat->maxcnt = estat->cnt;
+     }
+
+     /* skip the info we stored previously */
+     p = p + SZ_HEADER;
+
+     if (!IN_THREAD) {
+	  unsigned int h = hashaddr(p);
+	  /* record allocation in allocation list */
+	  info = (struct minfo *) malloc(sizeof(struct minfo));
+	  info->n = n;
+	  info->file = file;
+	  info->line = line;
+	  info->p = p;
+	  info->next = minfo[h];
+	  minfo[h] = info;
+     }
+
+     return (void *) p;
+}
+
+void X(ifree)(void *p)
+{
+     char *q;
+
+     A(p);
+
+     q = ((char *) p) - SZ_HEADER;
+     A(q);
+
+     {
+          size_t n = ((size_t *) q)[0];
+          size_t magic = ((size_t *) q)[1];
+          int what = ((size_t *) q)[2];
+          size_t i;
+          struct mstat *stat = mstat + what;
+          struct mstat *estat = mstat + EVERYTHING;
+
+          /* set to zero to detect duplicate free's */
+          ((size_t *) q)[0] = 0;
+
+          A(magic == MAGIC);
+          ((size_t *) q)[1] = ~MAGIC;
+
+	  if (!IN_THREAD) {
+	       stat->siz -= n;
+	       A(stat->siz >= 0);
+	       estat->siz -= n;
+	       A(estat->siz >= 0);
+	  }
+
+          /* check for writing past end of array: */
+          for (i = n; i < PAD_FACTOR * n; ++i)
+               if (q[i + SZ_HEADER] != (char) (i ^ 0xEF)) {
+                    A(0 /* array bounds overwritten */ );
+               }
+          for (i = 0; i < PAD_FACTOR * n; ++i)
+               q[i + SZ_HEADER] = (char) (i ^ 0xAD);
+
+	  if (!IN_THREAD) {
+	       --stat->cnt;
+	       --estat->cnt;
+	       
+	       A(stat->cnt >= 0);
+	       A((stat->cnt == 0 && stat->siz == 0) ||
+		 (stat->cnt > 0 && stat->siz > 0));
+	       A(estat->cnt >= 0);
+	       A((estat->cnt == 0 && estat->siz == 0) ||
+		 (estat->cnt > 0 && estat->siz > 0));
+	  }
+
+          real_free(q);
+     }
+
+     if (!IN_THREAD) {
+          /* delete minfo entry */
+	  unsigned int h = hashaddr(p);
+          struct minfo **i;
+
+          for (i = minfo + h; *i; i = &((*i)->next)) {
+               if ((*i)->p == p) {
+                    struct minfo *i0 = (*i)->next;
+                    free(*i);
+                    *i = i0;
+                    return;
+               }
+          }
+
+          A(0 /* no entry in minfo list */ );
+     }
+}
+
+void X(malloc_print_minfo)(int verbose)
+{
+     struct minfo *info;
+     int what;
+     unsigned int h;
+
+     if (verbose) {
+	  static const char *names[MALLOC_WHAT_LAST] = {
+	       "EVERYTHING",
+	       "PLANS", "SOLVERS", "PROBLEMS", "BUFFERS",
+	       "HASHT", "TENSORS", "PLANNERS", "SLVDSC", "TWIDDLES",
+	       "STRIDES", "OTHER"
+	  };
+
+	  printf("%12s %8s %8s %10s %10s\n",
+		 "what", "cnt", "maxcnt", "siz", "maxsiz");
+
+	  for (what = 0; what < MALLOC_WHAT_LAST; ++what) {
+	       struct mstat *stat = mstat + what;
+	       printf("%12s %8d %8d %10d %10d\n",
+		      names[what], stat->cnt, stat->maxcnt,
+		      stat->siz, stat->maxsiz);
+	  }
+     }
+
+     for (h = 0; h < HASHSZ; ++h) 
+	  if (minfo[h]) {
+	       printf("\nUnfreed allocations:\n");
+	       break;
+	  }
+
+     for (h = 0; h < HASHSZ; ++h) 
+	  for (info = minfo[h]; info; info = info->next) {
+	       printf("%s:%d:  %d bytes at %p\n",
+		      info->file, info->line, info->n, info->p);
+	  }
+}
+
+#else
+/**********************************************************
+ *   NON DEBUGGING CODE
+ **********************************************************/
+/* production version, no hacks */
+
+void *X(malloc_plain)(size_t n)
+{
+     void *p;
+     if (n == 0)
+          n = 1;
+     p = real_malloc(n);
+     CK(p);
+
+#ifdef MIN_ALIGMENT
+     A((((uintptr_t)p) % MIN_ALIGNMENT) == 0);
+#endif
+
+     return p;
+}
+
+void X(ifree)(void *p)
+{
+     real_free(p);
+}
+
+#endif
+
+void X(ifree0)(void *p)
+{
+     /* common pattern */
+     if (p) X(ifree)(p);
+}
diff --git a/src/fftw3/kernel/assert.c b/src/fftw3/kernel/assert.c
new file mode 100644
index 0000000..a066db0
--- /dev/null
+++ b/src/fftw3/kernel/assert.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: assert.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+#include "ifftw.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+void X(assertion_failed)(const char *s, int line, const char *file)
+{
+     fflush(stdout);
+     fprintf(stderr, "fftw: %s:%d: assertion failed: %s\n", file, line, s);
+     exit(EXIT_FAILURE);
+}
diff --git a/src/fftw3/kernel/awake.c b/src/fftw3/kernel/awake.c
new file mode 100644
index 0000000..7ee716f
--- /dev/null
+++ b/src/fftw3/kernel/awake.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: awake.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+void X(null_awake)(plan *ego, int awake)
+{
+     UNUSED(ego);
+     UNUSED(awake);
+     /* do nothing */
+}
diff --git a/src/fftw3/kernel/cycle.h b/src/fftw3/kernel/cycle.h
new file mode 100644
index 0000000..a324c2a
--- /dev/null
+++ b/src/fftw3/kernel/cycle.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/* $Id: cycle.h,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+/* machine-dependent cycle counters code. Needs to be inlined. */
+
+/***************************************************************************/
+/* To use the cycle counters in your code, simply #include "cycle.h" (this
+   file), and then use the functions/macros:
+
+                 ticks getticks(void);
+
+   ticks is an opaque typedef defined below, representing the current time.
+   You extract the elapsed time between two calls to gettick() via:
+
+                 double elapsed(ticks t1, ticks t0);
+
+   which returns a double-precision variable in arbitrary units.  You
+   are not expected to convert this into human units like seconds; it
+   is intended only for *comparisons* of time intervals.
+
+   (In order to use some of the OS-dependent timer routines like
+   Solaris' gethrtime, you need to paste the autoconf snippet below
+   into your configure.ac file and #include "config.h" before cycle.h,
+   or define the relevant macros manually if you are not using autoconf.)
+*/
+
+/***************************************************************************/
+/* This file uses macros like HAVE_GETHRTIME that are assumed to be
+   defined according to whether the corresponding function/type/header
+   is available on your system.  The necessary macros are most
+   conveniently defined if you are using GNU autoconf, via the tests:
+   
+   dnl ---------------------------------------------------------------------
+
+   AC_C_INLINE
+   AC_HEADER_TIME
+   AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h])
+
+   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif])
+
+   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime])
+
+   dnl Cray UNICOS _rtc() (real-time clock) intrinsic
+   AC_MSG_CHECKING([for _rtc intrinsic])
+   rtc_ok=yes
+   AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+   AC_MSG_RESULT($rtc_ok)
+
+   dnl ---------------------------------------------------------------------
+*/
+
+/***************************************************************************/
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
+{									  \
+     return (double)(t1 - t0);						  \
+}
+
+/*----------------------------------------------------------------*/
+/* Solaris */
+#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
+typedef hrtime_t ticks;
+
+#define getticks gethrtime
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* AIX v. 4+ routines to read the real-time clock or time-base register */
+#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
+typedef timebasestruct_t ticks;
+
+static inline ticks getticks(void)
+{
+     ticks t;
+     read_real_time(&t, TIMEBASE_SZ);
+     return t;
+}
+
+static inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
+{
+     time_base_to_time(&t1, TIMEBASE_SZ);
+     time_base_to_time(&t0, TIMEBASE_SZ);
+     return ((t1.tb_high - t0.tb_high) * 1e9 + (t1.tb_low - t0.tb_low));
+}
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PowerPC ``cycle'' counter using the time base register.
+ */
+#if ((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned int tbl, tbu0, tbu1;
+
+     do {
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+	  __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+     } while (tbu0 != tbu1);
+
+     return (((unsigned long long)tbu0) << 32) | tbl;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/*
+ * Pentium cycle counter 
+ */
+#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("rdtsc": "=A" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Visual C++ -- thanks to Morten Nissov for his help with this */
+#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
+#include <windows.h>
+typedef LARGE_INTEGER ticks;
+#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
+
+static __inline ticks getticks(void)
+{
+     ticks ret;
+
+     __asm {
+	  RDTSC
+	  mov ret.HighPart, edx
+	  mov ret.LowPart, eax
+     }
+     return ret;
+}
+
+static __inline double elapsed(ticks t1, ticks t0)
+{  
+     return (double)(t1.QuadPart - t0.QuadPart);
+}  
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * X86-64 cycle counter
+ */
+#if defined(__GNUC__) && defined(__x86_64__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned a, d; 
+     asm volatile("rdtsc" : "=a" (a), "=d" (d)); 
+     return ((ticks)a) | (((ticks)d) << 32); 
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Visual C++ (FIXME: how to detect compilation for x86-64?) */
+#if _MSC_VER >= 1400 && !defined(HAVE_TICK_COUNTER)
+typedef ULONG64 ticks;
+
+#define getticks __rdtsc
+
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * IA64 cycle counter
+ */
+#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
+#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
+#include <machine/sys/inline.h>
+typedef unsigned long ticks;
+
+static inline ticks getticks(void)
+{
+     ticks ret;
+
+     ret = _Asm_mov_from_ar (_AREG_ITC);
+     return ret;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* intel's ecc compiler */
+#if defined(__ECC) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+#include <ia64intrin.h>
+
+static __inline__ ticks getticks(void)
+{
+     return __getReg(_IA64_REG_AR_ITC);
+}
+ 
+INLINE_ELAPSED(__inline__)
+ 
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PA-RISC cycle counter 
+ */
+#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+#  ifdef __GNUC__
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("mfctl 16, %0": "=r" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+#  else
+#  include <machine/inline.h>
+static inline unsigned long getticks(void)
+{
+     register ticks ret;
+     _MFCTL(16, ret);
+     return ret;
+}
+#  endif
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* S390, courtesy of James Treacy */
+#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks cycles;
+     __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
+     return cycles;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
+/*
+ * The 32-bit cycle counter on alpha overflows pretty quickly, 
+ * unfortunately.  A 1GHz machine overflows in 4 seconds.
+ */
+typedef unsigned int ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned long cc;
+     __asm__ __volatile__ ("rpcc %0" : "=r"(cc));
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+     __asm__("rd %%tick, %0" : "=r" (ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if defined(__DECC) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
+#  include <c_asm.h>
+typedef unsigned int ticks;
+
+static __inline ticks getticks(void)
+{
+     unsigned long cc;
+     cc = asm("rpcc %v0");
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/* SGI/Irix */
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
+typedef struct timespec ticks;
+
+static inline ticks getticks(void)
+{
+     struct timespec t;
+     clock_gettime(CLOCK_SGI_CYCLE, &t);
+     return t;
+}
+
+static inline double elapsed(ticks t1, ticks t0)
+{
+     return (double)(t1.tv_sec - t0.tv_sec) * 1.0E9 +
+	  (double)(t1.tv_nsec - t0.tv_nsec);
+}
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Cray UNICOS _rtc() intrinsic function */
+#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
+#ifdef HAVE_INTRINSICS_H
+#  include <intrinsics.h>
+#endif
+
+typedef long long ticks;
+
+#define getticks _rtc
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
diff --git a/src/fftw3/kernel/debug.c b/src/fftw3/kernel/debug.c
new file mode 100644
index 0000000..63e4a22
--- /dev/null
+++ b/src/fftw3/kernel/debug.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: debug.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+#include "ifftw.h"
+
+#ifdef FFTW_DEBUG
+#include <stdio.h>
+
+typedef struct {
+     printer super;
+     FILE *f;
+} P_file;
+
+static void putchr_file(printer *p_, char c)
+{
+     P_file *p = (P_file *) p_;
+     fputc(c, p->f);
+}
+
+static printer *mkprinter_file(FILE *f)
+{
+     P_file *p = (P_file *) X(mkprinter)(sizeof(P_file), putchr_file, 0);
+     p->f = f;
+     return &p->super;
+}
+
+void X(debug)(const char *format, ...)
+{
+     va_list ap;
+     printer *p = mkprinter_file(stderr);
+     va_start(ap, format);
+     p->vprint(p, format, ap);
+     va_end(ap);
+     X(printer_destroy)(p);
+}
+#endif
diff --git a/src/fftw3/kernel/hash.c b/src/fftw3/kernel/hash.c
new file mode 100644
index 0000000..f12d5a4
--- /dev/null
+++ b/src/fftw3/kernel/hash.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "ifftw.h"
+
+unsigned X(hash)(const char *s)
+{
+     unsigned h = 0xDEADBEEFu;
+     do {
+	  h = h * 17 + (int)*s;
+     } while (*s++);
+     return h;
+}
+
diff --git a/src/fftw3/kernel/iabs.c b/src/fftw3/kernel/iabs.c
new file mode 100644
index 0000000..d2bdb7c
--- /dev/null
+++ b/src/fftw3/kernel/iabs.c
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: iabs.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+int X(iabs)(int a)
+{
+     return (int) (a < 0 ? -a : a);
+}
diff --git a/src/fftw3/kernel/ifftw.h b/src/fftw3/kernel/ifftw.h
new file mode 100644
index 0000000..0269e18
--- /dev/null
+++ b/src/fftw3/kernel/ifftw.h
@@ -0,0 +1,848 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: ifftw.h,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+/* FFTW internal header file */
+#ifndef __IFFTW_H__
+#define __IFFTW_H__
+
+#include "config.h"
+
+#include <stdlib.h>		/* size_t */
+#include <stdarg.h>		/* va_list */
+#include <stddef.h>             /* ptrdiff_t */
+
+#if HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+
+#if HAVE_STDINT_H
+# include <stdint.h>             /* uintptr_t, maybe */
+#endif
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>           /* uintptr_t, maybe */
+#endif
+
+/* determine precision and name-mangling scheme */
+#define CONCAT(prefix, name) prefix ## name
+#if defined(FFTW_SINGLE)
+typedef float R;
+#define X(name) CONCAT(fftwf_, name)
+#elif defined(FFTW_LDOUBLE)
+typedef long double R;
+#define X(name) CONCAT(fftwl_, name)
+#else
+typedef double R;
+#define X(name) CONCAT(fftw_, name)
+#endif
+
+/* dummy use of unused parameters to silence compiler warnings */
+#define UNUSED(x) (void)x
+
+#define FFT_SIGN (-1)  /* sign convention for forward transforms */
+
+/* get rid of that object-oriented stink: */
+#define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
+
+#define STRINGIZEx(x) #x
+#define STRINGIZE(x) STRINGIZEx(x)
+
+#ifndef HAVE_K7
+#define HAVE_K7 0
+#endif
+
+#if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || defined(HAVE_3DNOW)
+#define HAVE_SIMD 1
+#else
+#define HAVE_SIMD 0
+#endif
+
+/* forward declarations */
+typedef struct problem_s problem;
+typedef struct plan_s plan;
+typedef struct solver_s solver;
+typedef struct planner_s planner;
+typedef struct printer_s printer;
+typedef struct scanner_s scanner;
+
+/*-----------------------------------------------------------------------*/
+/* alloca: */
+#if HAVE_SIMD
+#define MIN_ALIGNMENT 16
+#endif
+
+#ifdef HAVE_ALLOCA
+   /* use alloca if available */
+
+#ifndef alloca
+#ifdef __GNUC__
+# define alloca __builtin_alloca
+#else
+# ifdef _MSC_VER
+#  include <malloc.h>
+#  define alloca _alloca
+# else
+#  if HAVE_ALLOCA_H
+#   include <alloca.h>
+#  else
+#   ifdef _AIX
+ #pragma alloca
+#   else
+#    ifndef alloca /* predefined by HP cc +Olibcalls */
+void *alloca(size_t);
+#    endif
+#   endif
+#  endif
+# endif
+#endif
+#endif
+
+#  ifdef MIN_ALIGNMENT
+#    define STACK_MALLOC(T, p, x)				\
+     {								\
+         p = (T)alloca((x) + MIN_ALIGNMENT);			\
+         p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) &	\
+               (~(uintptr_t)(MIN_ALIGNMENT - 1)));		\
+     }
+#    define STACK_FREE(x) 
+#  else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
+#    define STACK_MALLOC(T, p, x) p = (T)alloca(x) 
+#    define STACK_FREE(x) 
+#  endif
+
+#else /* ! HAVE_ALLOCA */
+   /* use malloc instead of alloca */
+#  define STACK_MALLOC(T, p, x) p = (T)MALLOC(x, OTHER)
+#  define STACK_FREE(x) X(ifree)(x)
+#endif /* ! HAVE_ALLOCA */
+
+/*-----------------------------------------------------------------------*/
+/* define uintptr_t if it is not already defined */
+
+#ifndef HAVE_UINTPTR_T
+#  if SIZEOF_VOID_P == 0
+#    error sizeof void* is unknown!
+#  elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
+     typedef unsigned int uintptr_t;
+#  elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
+     typedef unsigned long uintptr_t;
+#  elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
+     typedef unsigned long long uintptr_t;
+#  else
+#    error no unsigned integer type matches void* sizeof!
+#  endif
+#endif
+
+/*-----------------------------------------------------------------------*/
+/* assert.c: */
+extern void X(assertion_failed)(const char *s, int line, const char *file);
+
+/* always check */
+#define CK(ex)						 \
+      (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
+
+#ifdef FFTW_DEBUG
+/* check only if debug enabled */
+#define A(ex)						 \
+      (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
+#else
+#define A(ex) /* nothing */
+#endif
+
+extern void X(debug)(const char *format, ...);
+#define D X(debug)
+
+/*-----------------------------------------------------------------------*/
+/* alloc.c: */
+
+/* objects allocated by malloc, for statistical purposes */
+enum malloc_tag {
+     EVERYTHING,
+     PLANS,
+     SOLVERS,
+     PROBLEMS,
+     BUFFERS,
+     HASHT,
+     TENSORS,
+     PLANNERS,
+     SLVDESCS,
+     TWIDDLES,
+     STRIDES,
+     OTHER,
+     MALLOC_WHAT_LAST		/* must be last */
+};
+
+extern void X(ifree)(void *ptr);
+extern void X(ifree0)(void *ptr);
+
+#ifdef FFTW_DEBUG_MALLOC
+
+extern void *X(malloc_debug)(size_t n, enum malloc_tag what,
+			     const char *file, int line);
+#define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
+#define NATIVE_MALLOC(n, what) MALLOC(n, what)
+void X(malloc_print_minfo)(int vrbose);
+
+#else /* ! FFTW_DEBUG_MALLOC */
+
+extern void *X(malloc_plain)(size_t sz);
+#define MALLOC(n, what)  X(malloc_plain)(n)
+#define NATIVE_MALLOC(n, what) malloc(n)
+
+#endif
+
+#if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && defined(HAVE_THREADS)
+extern int X(in_thread);
+#  define IN_THREAD X(in_thread)
+#  define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
+#  define THREAD_OFF X(in_thread) = in_thread_save; }
+#else
+#  define IN_THREAD 0
+#  define THREAD_ON 
+#  define THREAD_OFF 
+#endif
+
+/*-----------------------------------------------------------------------*/
+/* ops.c: */
+/*
+ * ops counter.  The total number of additions is add + fma
+ * and the total number of multiplications is mul + fma.
+ * Total flops = add + mul + 2 * fma
+ */
+typedef struct {
+     double add;
+     double mul;
+     double fma;
+     double other;
+} opcnt;
+
+void X(ops_zero)(opcnt *dst);
+void X(ops_other)(int o, opcnt *dst);
+void X(ops_cpy)(const opcnt *src, opcnt *dst);
+
+void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
+void X(ops_add2)(const opcnt *a, opcnt *dst);
+
+/* dst = m * a + b */
+void X(ops_madd)(int m, const opcnt *a, const opcnt *b, opcnt *dst);
+
+/* dst += m * a */
+void X(ops_madd2)(int m, const opcnt *a, opcnt *dst);
+
+
+/*-----------------------------------------------------------------------*/
+/* minmax.c: */
+int X(imax)(int a, int b);
+int X(imin)(int a, int b);
+
+/*-----------------------------------------------------------------------*/
+/* iabs.c: */
+int X(iabs)(int a);
+
+/*-----------------------------------------------------------------------*/
+/* md5.c */
+
+#if SIZEOF_UNSIGNED_INT >= 4
+typedef unsigned int md5uint;
+#else
+typedef unsigned long md5uint; /* at least 32 bits as per C standard */
+#endif
+
+typedef md5uint md5sig[4];
+
+typedef struct {
+     md5sig s; /* state and signature */
+
+     /* fields not meant to be used outside md5.c: */
+     unsigned char c[64]; /* stuff not yet processed */
+     unsigned l;  /* total length.  Should be 64 bits long, but this is
+		     good enough for us */
+} md5;
+
+void X(md5begin)(md5 *p);
+void X(md5putb)(md5 *p, const void *d_, int len);
+void X(md5puts)(md5 *p, const char *s);
+void X(md5putc)(md5 *p, unsigned char c);
+void X(md5int)(md5 *p, int i);
+void X(md5unsigned)(md5 *p, unsigned i);
+void X(md5ptrdiff)(md5 *p, ptrdiff_t d);
+void X(md5end)(md5 *p);
+
+/*-----------------------------------------------------------------------*/
+/* tensor.c: */
+#define STRUCT_HACK_KR
+#undef STRUCT_HACK_C99
+
+typedef struct {
+     int n;
+     int is;			/* input stride */
+     int os;			/* output stride */
+} iodim;
+
+typedef struct {
+     int rnk;
+#if defined(STRUCT_HACK_KR)
+     iodim dims[1];
+#elif defined(STRUCT_HACK_C99)
+     iodim dims[];
+#else
+     iodim *dims;
+#endif
+} tensor;
+
+/*
+  Definition of rank -infinity.
+  This definition has the property that if you want rank 0 or 1,
+  you can simply test for rank <= 1.  This is a common case.
+ 
+  A tensor of rank -infinity has size 0.
+*/
+#define RNK_MINFTY  ((int)(((unsigned) -1) >> 1))
+#define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
+
+typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
+
+tensor *X(mktensor)(int rnk);
+tensor *X(mktensor_0d)(void);
+tensor *X(mktensor_1d)(int n, int is, int os);
+tensor *X(mktensor_2d)(int n0, int is0, int os0,
+                      int n1, int is1, int os1);
+int X(tensor_sz)(const tensor *sz);
+void X(tensor_md5)(md5 *p, const tensor *t);
+int X(tensor_max_index)(const tensor *sz);
+int X(tensor_min_istride)(const tensor *sz);
+int X(tensor_min_ostride)(const tensor *sz);
+int X(tensor_min_stride)(const tensor *sz);
+int X(tensor_inplace_strides)(const tensor *sz);
+int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
+tensor *X(tensor_copy)(const tensor *sz);
+int X(tensor_kosherp)(const tensor *x);
+
+tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
+tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
+tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
+tensor *X(tensor_compress)(const tensor *sz);
+tensor *X(tensor_compress_contiguous)(const tensor *sz);
+tensor *X(tensor_append)(const tensor *a, const tensor *b);
+void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
+int X(tensor_tornk1)(const tensor *t, int *n, int *is, int *os);
+void X(tensor_destroy)(tensor *sz);
+void X(tensor_destroy2)(tensor *a, tensor *b);
+void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
+void X(tensor_print)(const tensor *sz, printer *p);
+int X(dimcmp)(const iodim *a, const iodim *b);
+
+/*-----------------------------------------------------------------------*/
+/* problem.c: */
+typedef struct {
+     void (*hash) (const problem *ego, md5 *p);
+     void (*zero) (const problem *ego);
+     void (*print) (problem *ego, printer *p);
+     void (*destroy) (problem *ego);
+} problem_adt;
+
+struct problem_s {
+     const problem_adt *adt;
+};
+
+problem *X(mkproblem)(size_t sz, const problem_adt *adt);
+void X(problem_destroy)(problem *ego);
+
+/*-----------------------------------------------------------------------*/
+/* print.c */
+struct printer_s {
+     void (*print)(printer *p, const char *format, ...);
+     void (*vprint)(printer *p, const char *format, va_list ap);
+     void (*putchr)(printer *p, char c);
+     void (*cleanup)(printer *p);
+     int indent;
+     int indent_incr;
+};
+
+printer *X(mkprinter)(size_t size, 
+		      void (*putchr)(printer *p, char c),
+		      void (*cleanup)(printer *p));
+void X(printer_destroy)(printer *p);
+
+/*-----------------------------------------------------------------------*/
+/* scan.c */
+struct scanner_s {
+     int (*scan)(scanner *sc, const char *format, ...);
+     int (*vscan)(scanner *sc, const char *format, va_list ap);
+     int (*getchr)(scanner *sc);
+     int ungotc;
+};
+
+scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
+void X(scanner_destroy)(scanner *sc);
+
+/*-----------------------------------------------------------------------*/
+/* plan.c: */
+typedef struct {
+     void (*solve)(const plan *ego, const problem *p);
+     void (*awake)(plan *ego, int flag);
+     void (*print)(const plan *ego, printer *p);
+     void (*destroy)(plan *ego);
+} plan_adt;
+
+struct plan_s {
+     const plan_adt *adt;
+     int awake_refcnt;
+     opcnt ops;
+     double pcost;
+};
+
+plan *X(mkplan)(size_t size, const plan_adt *adt);
+void X(plan_destroy_internal)(plan *ego);
+void X(plan_awake)(plan *ego, int flag);
+#define AWAKE(plan, flag) X(plan_awake)(plan, flag)
+void X(plan_null_destroy)(plan *ego);
+
+/*-----------------------------------------------------------------------*/
+/* solver.c: */
+typedef struct {
+     plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
+} solver_adt;
+
+struct solver_s {
+     const solver_adt *adt;
+     int refcnt;
+};
+
+solver *X(mksolver)(size_t size, const solver_adt *adt);
+void X(solver_use)(solver *ego);
+void X(solver_destroy)(solver *ego);
+void X(solver_register)(planner *plnr, solver *s);
+
+/* shorthand */
+#define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
+
+/*-----------------------------------------------------------------------*/
+/* planner.c */
+
+typedef struct slvdesc_s {
+     solver *slv;
+     const char *reg_nam;
+     unsigned nam_hash;
+     int reg_id;
+} slvdesc;
+
+typedef struct solution_s solution; /* opaque */
+
+/* values for problem_flags: */
+enum { 
+     DESTROY_INPUT = 0x1,
+     NO_SIMD = 0x2,
+     CONSERVE_MEMORY = 0x4,
+     NO_DHT_R2HC = 0x8
+};
+
+#define DESTROY_INPUTP(plnr) ((plnr)->problem_flags & DESTROY_INPUT)
+#define NO_SIMDP(plnr) ((plnr)->problem_flags & NO_SIMD)
+#define CONSERVE_MEMORYP(plnr) ((plnr)->problem_flags & CONSERVE_MEMORY)
+#define NO_DHT_R2HCP(plnr) ((plnr)->problem_flags & NO_DHT_R2HC)
+
+/* values for planner_flags: */
+enum {
+     /* impatience flags  */
+
+     BELIEVE_PCOST = 0x1,
+     DFT_R2HC_ICKY = 0x2,
+     NONTHREADED_ICKY = 0x4,
+     NO_BUFFERING = 0x8,
+     NO_EXHAUSTIVE = 0x10,
+     NO_INDIRECT_OP = 0x20,
+     NO_LARGE_GENERIC = 0x40,
+     NO_RANK_SPLITS = 0x80,
+     NO_VRANK_SPLITS = 0x100,
+     NO_VRECURSE = 0x200,
+     
+     /* flags that control the search */
+     NO_UGLY = 0x400,  /* avoid plans we are 99% sure are suboptimal */
+     NO_SEARCH = 0x800,  /* avoid searching altogether---use wisdom entries 
+			    only */
+
+     ESTIMATE = 0x1000,
+     IMPATIENCE_FLAGS = (ESTIMATE | (ESTIMATE - 1)),
+     
+     BLESSING = 0x4000,  /* save this entry */
+     H_VALID = 0x8000,    /* valid hastable entry */
+     NONIMPATIENCE_FLAGS = BLESSING
+};
+
+#define BELIEVE_PCOSTP(plnr) ((plnr)->planner_flags & BELIEVE_PCOST)
+#define DFT_R2HC_ICKYP(plnr) ((plnr)->planner_flags & DFT_R2HC_ICKY)
+#define ESTIMATEP(plnr) ((plnr)->planner_flags & ESTIMATE)
+#define NONTHREADED_ICKYP(plnr) (((plnr)->planner_flags & NONTHREADED_ICKY) \
+                                  && (plnr)->nthr > 1)
+#define NO_BUFFERINGP(plnr) ((plnr)->planner_flags & NO_BUFFERING)
+#define NO_EXHAUSTIVEP(plnr) ((plnr)->planner_flags & NO_EXHAUSTIVE)
+#define NO_INDIRECT_OP_P(plnr) ((plnr)->planner_flags & NO_INDIRECT_OP)
+#define NO_LARGE_GENERICP(plnr) ((plnr)->planner_flags & NO_LARGE_GENERIC)
+#define NO_RANK_SPLITSP(plnr) ((plnr)->planner_flags & NO_RANK_SPLITS)
+#define NO_UGLYP(plnr) ((plnr)->planner_flags & NO_UGLY)
+#define NO_SEARCHP(plnr) ((plnr)->planner_flags & NO_SEARCH)
+#define NO_VRANK_SPLITSP(plnr) ((plnr)->planner_flags & NO_VRANK_SPLITS)
+#define NO_VRECURSEP(plnr) ((plnr)->planner_flags & NO_VRECURSE)
+
+typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
+
+typedef struct {
+     void (*register_solver)(planner *ego, solver *s);
+     plan *(*mkplan)(planner *ego, problem *p);
+     void (*forget)(planner *ego, amnesia a);
+     void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
+						 word in C++. */
+     int (*imprt)(planner *ego, scanner *sc);
+} planner_adt;
+
+struct planner_s {
+     const planner_adt *adt;
+     void (*hook)(plan *pln, const problem *p, int optimalp);
+
+     /* solver descriptors */
+     slvdesc *slvdescs;
+     unsigned nslvdesc, slvdescsiz;
+     const char *cur_reg_nam;
+     int cur_reg_id;
+
+     /* hash table of solutions */
+     solution *solutions;
+     unsigned hashsiz, nelem;
+
+     int nthr;
+     unsigned problem_flags;
+     unsigned short planner_flags; /* matches type of solution.flags in
+				      planner.c */
+     /* various statistics */
+     int nplan;    /* number of plans evaluated */
+     double pcost, epcost; /* total pcost of measured/estimated plans */
+     int nprob;    /* number of problems evaluated */
+     int lookup, succ_lookup, lookup_iter;
+     int insert, insert_iter, insert_unknown;
+     int nrehash;
+};
+
+planner *X(mkplanner)(void);
+void X(planner_destroy)(planner *ego);
+
+#ifdef FFTW_DEBUG
+void X(planner_dump)(planner *ego, int vrbose);
+#endif
+
+/*
+  Iterate over all solvers.   Read:
+ 
+  @article{ baker93iterators,
+  author = "Henry G. Baker, Jr.",
+  title = "Iterators: Signs of Weakness in Object-Oriented Languages",
+  journal = "{ACM} {OOPS} Messenger",
+  volume = "4",
+  number = "3",
+  pages = "18--25"
+  }
+*/
+#define FORALL_SOLVERS(ego, s, p, what)			\
+{							\
+     unsigned _cnt;					\
+     for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) {	\
+	  slvdesc *p = ego->slvdescs + _cnt;		\
+	  solver *s = p->slv;				\
+	  what;						\
+     }							\
+}
+
+/* make plan, destroy problem */
+plan *X(mkplan_d)(planner *ego, problem *p);
+
+/*-----------------------------------------------------------------------*/
+/* stride.c: */
+
+/* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
+#if (defined(__i386__) || _M_IX86 >= 500) && !HAVE_K7 && !defined(FFTW_LDOUBLE)
+#define PRECOMPUTE_ARRAY_INDICES
+#endif
+
+#ifdef PRECOMPUTE_ARRAY_INDICES
+typedef int *stride;
+#define WS(stride, i)  (stride[i])
+extern stride X(mkstride)(int n, int s);
+void X(stride_destroy)(stride p);
+
+#else
+
+typedef int stride;
+#define WS(stride, i)  (stride * i)
+#define fftwf_mkstride(n, stride) stride
+#define fftw_mkstride(n, stride) stride
+#define fftwl_mkstride(n, stride) stride
+#define fftwf_stride_destroy(p) ((void) p)
+#define fftw_stride_destroy(p) ((void) p)
+#define fftwl_stride_destroy(p) ((void) p)
+
+#endif /* PRECOMPUTE_ARRAY_INDICES */
+
+/*-----------------------------------------------------------------------*/
+/* solvtab.c */
+
+struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
+typedef struct solvtab_s solvtab[];
+void X(solvtab_exec)(solvtab tbl, planner *p);
+#define SOLVTAB(s) { s, STRINGIZE(s) }
+#define SOLVTAB_END { 0, 0 }
+
+/*-----------------------------------------------------------------------*/
+/* pickdim.c */
+int X(pickdim)(int which_dim, const int *buddies, int nbuddies,
+	       const tensor *sz, int oop, int *dp);
+
+/*-----------------------------------------------------------------------*/
+/* twiddle.c */
+/* little language to express twiddle factors computation */
+enum { TW_COS = 0, TW_SIN = 1, TW_TAN = 2, TW_NEXT = 3,
+       TW_FULL = 4, TW_GENERIC = 5 };
+
+typedef struct {
+     unsigned char op;
+     unsigned char v;
+     short i;
+} tw_instr;
+
+typedef struct twid_s {
+     R *W;                     /* array of twiddle factors */
+     int n, r, m;                /* transform order, radix, # twiddle rows */
+     int refcnt;
+     const tw_instr *instr;
+     struct twid_s *cdr;
+} twid;
+
+void X(mktwiddle)(twid **pp, const tw_instr *instr, int n, int r, int m);
+void X(twiddle_destroy)(twid **pp);
+int X(twiddle_length)(int r, const tw_instr *p);
+void X(twiddle_awake)(int flg, twid **pp, 
+		      const tw_instr *instr, int n, int r, int m);
+
+/*-----------------------------------------------------------------------*/
+/* trig.c */
+#ifdef FFTW_LDOUBLE
+typedef long double trigreal;
+#else
+typedef double trigreal;
+#endif
+
+extern trigreal X(cos2pi)(int, int);
+extern trigreal X(sin2pi)(int, int);
+extern trigreal X(tan2pi)(int, int);
+extern trigreal X(sincos)(trigreal m, trigreal n, int sinp);
+
+/*-----------------------------------------------------------------------*/
+/* primes.c: */
+
+#if defined(FFTW_ENABLE_UNSAFE_MULMOD)
+#  define MULMOD(x,y,p) (((x) * (y)) % (p))
+#elif ((SIZEOF_INT != 0) && (SIZEOF_LONG >= 2 * SIZEOF_INT))
+#  define MULMOD(x,y,p) ((int) ((((long) (x)) * ((long) (y))) % ((long) (p))))
+#elif ((SIZEOF_INT != 0) && (SIZEOF_LONG_LONG >= 2 * SIZEOF_INT))
+#  define MULMOD(x,y,p) ((int) ((((long long) (x)) * ((long long) (y))) \
+				 % ((long long) (p))))
+#elif defined(_MSC_VER)
+#  define MULMOD(x,y,p) ((int) ((((__int64) (x)) * ((__int64) (y))) \
+                                 % ((__int64) (p))))
+#else /* 'long long' unavailable */
+#  define SAFE_MULMOD 1
+int X(safe_mulmod)(int x, int y, int p);
+#  define MULMOD(x,y,p) X(safe_mulmod)(x,y,p)
+#endif
+
+int X(power_mod)(int n, int m, int p);
+int X(find_generator)(int p);
+int X(first_divisor)(int n);
+int X(is_prime)(int n);
+int X(next_prime)(int n);
+
+#define GENERIC_MIN_BAD 71 /* min prime for which generic becomes bad */
+
+/*-----------------------------------------------------------------------*/
+/* rader.c: */
+typedef struct rader_tls rader_tl;
+
+void X(rader_tl_insert)(int k1, int k2, int k3, R *W, rader_tl **tl);
+R *X(rader_tl_find)(int k1, int k2, int k3, rader_tl *t);
+void X(rader_tl_delete)(R *W, rader_tl **tl);
+
+/*-----------------------------------------------------------------------*/
+/* transpose.c: */
+
+void X(transpose)(R *A, int n, int m, int d, int N, R *buf);
+void X(transpose_slow)(R *a, int nx, int ny, int N,
+		       char *move, int move_size, R *buf);
+int X(transposable)(const iodim *a, const iodim *b,
+		    int vl, int s, R *ri, R *ii);
+void X(transpose_dims)(const iodim *a, const iodim *b,
+                       int *n, int *m, int *d, int *nd, int *md);
+int X(transpose_simplep)(const iodim *a, const iodim *b, int vl, int s,
+			 R *ri, R *ii);
+int X(transpose_slowp)(const iodim *a, const iodim *b, int N);
+
+/*-----------------------------------------------------------------------*/
+/* misc stuff */
+void X(null_awake)(plan *ego, int awake);
+int X(square)(int x);
+double X(measure_execution_time)(plan *pln, const problem *p);
+int X(alignment_of)(R *p);
+unsigned X(hash)(const char *s);
+int X(compute_nbuf)(int n, int vl, int nbuf, int maxbufsz);
+int X(ct_uglyp)(int min_n, int n, int r);
+
+#if HAVE_SIMD
+R *X(taint)(R *p, int s);
+R *X(join_taint)(R *p1, R *p2);
+#define TAINT(p, s) X(taint)(p, s)
+#define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
+#define TAINTOF(p) (((uintptr_t)(p)) & 3)
+#define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
+#else
+#define TAINT(p, s) (p)
+#define UNTAINT(p) (p)
+#define TAINTOF(p) 0
+#define JOIN_TAINT(p1, p2) p1
+#endif
+
+#ifdef FFTW_DEBUG_ALIGNMENT
+#  define ASSERT_ALIGNED_DOUBLE {		\
+     double __foo;				\
+     CK(!(((uintptr_t) &__foo) & 0x7));		\
+}
+#else
+#  define ASSERT_ALIGNED_DOUBLE 
+#endif /* FFTW_DEBUG_ALIGNMENT */
+
+
+
+/*-----------------------------------------------------------------------*/
+/* macros used in codelets to reduce source code size */
+
+typedef R E;  /* internal precision of codelets. */
+
+#ifdef FFTW_LDOUBLE
+#  define K(x) ((E) x##L)
+#else
+#  define K(x) ((E) x)
+#endif
+#define DK(name, value) const E name = K(value)
+
+/* FMA macros */
+
+#if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))
+/* this peculiar coding seems to do the right thing on all of
+   gcc-2.95, gcc-3.1, and gcc-3.2.  
+
+   The obvious expression a * b + c does not work.  If both x = a * b
+   + c and y = a * b - c appear in the source, gcc computes t = a * b,
+   x = t + c, y = t - c, thus destroying the fma.
+*/
+static __inline__ E FMA(E a, E b, E c)
+{
+     E x = a * b;
+     x = x + c;
+     return x;
+}
+
+static __inline__ E FMS(E a, E b, E c)
+{
+     E x = a * b;
+     x = x - c;
+     return x;
+}
+
+static __inline__ E FNMA(E a, E b, E c)
+{
+     E x = a * b;
+     x = - (x + c);
+     return x;
+}
+
+static __inline__ E FNMS(E a, E b, E c)
+{
+     E x = a * b;
+     x = - (x - c);
+     return x;
+}
+#else
+#define FMA(a, b, c) (((a) * (b)) + (c))
+#define FMS(a, b, c) (((a) * (b)) - (c))
+#define FNMA(a, b, c) (- (((a) * (b)) + (c)))
+#define FNMS(a, b, c) ((c) - ((a) * (b)))
+#endif
+
+
+/* stack-alignment hackery */
+#if defined(__GNUC__) && defined(__i386__)
+/*
+ * horrible hack to align the stack to a 16-byte boundary.
+ *
+ * We assume a gcc version >= 2.95 so that
+ * -mpreferred-stack-boundary works.  Otherwise, all bets are
+ * off.  However, -mpreferred-stack-boundary does not create a
+ * stack alignment, but it only preserves it.  Unfortunately,
+ * many versions of libc on linux call main() with the wrong
+ * initial stack alignment, with the result that the code is now
+ * pessimally aligned instead of having a 50% chance of being
+ * correct.
+ */
+
+#define WITH_ALIGNED_STACK(what)				\
+{								\
+     /*								\
+      * Use alloca to allocate some memory on the stack.	\
+      * This alerts gcc that something funny is going		\
+      * on, so that it does not omit the frame pointer		\
+      * etc.							\
+      */							\
+     (void)__builtin_alloca(16);				\
+								\
+     /*								\
+      * Now align the stack pointer				\
+      */							\
+     __asm__ __volatile__ ("andl $-16, %esp");			\
+								\
+     what							\
+}
+#endif
+
+#ifdef __ICC /* Intel's compiler for ia32 */
+#define WITH_ALIGNED_STACK(what)				\
+{								\
+     /*								\
+      * Simply calling alloca seems to do the right thing.	\
+      * The size of the allocated block seems to be irrelevant.	\
+      */							\
+     _alloca(16);						\
+     what							\
+}
+#endif
+
+#ifndef WITH_ALIGNED_STACK
+#define WITH_ALIGNED_STACK(what) what
+#endif
+
+#endif /* __IFFTW_H__ */
diff --git a/src/fftw3/kernel/kbuffered.c b/src/fftw3/kernel/kbuffered.c
new file mode 100644
index 0000000..d20403f
--- /dev/null
+++ b/src/fftw3/kernel/kbuffered.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* routines shared by the various buffered solvers */
+
+#include "ifftw.h"
+
+int X(compute_nbuf)(int n, int vl, int nbuf, int maxbufsz)
+{
+     int i; 
+
+     if (nbuf * n > maxbufsz)
+          nbuf = X(imax)((int)1, maxbufsz / n);
+
+     /*
+      * Look for a buffer number (not too big) that divides the
+      * vector length, in order that we only need one child plan:
+      */
+     for (i = nbuf; i < vl && i < 2 * nbuf; ++i)
+          if (vl % i == 0)
+               return i;
+
+     /* whatever... */
+     nbuf = X(imin)(nbuf, vl);
+     return nbuf;
+}
+
diff --git a/src/fftw3/kernel/kct.c b/src/fftw3/kernel/kct.c
new file mode 100644
index 0000000..ba3bf68
--- /dev/null
+++ b/src/fftw3/kernel/kct.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* common routines for Cooley-Tukey algorithms */
+
+#include "ifftw.h"
+
+#define POW2P(n) (((n) > 0) && (((n) & ((n) - 1)) == 0))
+
+/* TRUE if radix-r is ugly for size n */
+int X(ct_uglyp)(int min_n, int n, int r)
+{
+     return (n <= min_n) || (POW2P(n) && (n / r) <= 4);
+}
diff --git a/src/fftw3/kernel/kplan.c b/src/fftw3/kernel/kplan.c
new file mode 100644
index 0000000..4c17107
--- /dev/null
+++ b/src/fftw3/kernel/kplan.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: kplan.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+/* "Plan: To bother about the best method of accomplishing an
+   accidental result."  (Ambrose Bierce, The Enlarged Devil's
+   Dictionary). */
+
+plan *X(mkplan)(size_t size, const plan_adt *adt)
+{
+     plan *p = (plan *)MALLOC(size, PLANS);
+
+     A(adt->destroy);
+     p->awake_refcnt = 0;
+     p->adt = adt;
+     X(ops_zero)(&p->ops);
+     p->pcost = 0.0;
+     
+     return p;
+}
+
+/*
+ * destroy a plan
+ */
+void X(plan_destroy_internal)(plan *ego)
+{
+     if (ego) {
+	  if (ego->awake_refcnt > 0)
+	       ego->adt->awake(ego, 0);
+          ego->adt->destroy(ego);
+	  X(ifree)(ego);
+     }
+}
+
+/* dummy destroy routine for plans with no local state */
+void X(plan_null_destroy)(plan *ego)
+{
+     UNUSED(ego);
+     /* nothing */
+}
+
+void X(plan_awake)(plan *ego, int flag)
+{
+     if (flag) {
+	  if (!ego->awake_refcnt)
+	       ego->adt->awake(ego, flag);
+	  ++ego->awake_refcnt;
+     } else {
+	  --ego->awake_refcnt;
+	  if (!ego->awake_refcnt)
+	       ego->adt->awake(ego, flag);
+     }
+}
+
diff --git a/src/fftw3/kernel/kproblem.c b/src/fftw3/kernel/kproblem.c
new file mode 100644
index 0000000..4686692
--- /dev/null
+++ b/src/fftw3/kernel/kproblem.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: kproblem.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+/* constructor */
+problem *X(mkproblem)(size_t sz, const problem_adt *adt)
+{
+     problem *p = (problem *)MALLOC(sz, PROBLEMS);
+
+     p->adt = adt;
+     return p;
+}
+
+/* destructor */
+void X(problem_destroy)(problem *ego)
+{
+     if (ego)
+	  ego->adt->destroy(ego);
+}
+
diff --git a/src/fftw3/kernel/krader.c b/src/fftw3/kernel/krader.c
new file mode 100644
index 0000000..354edfc
--- /dev/null
+++ b/src/fftw3/kernel/krader.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "ifftw.h"
+
+/*
+  common routines for Rader solvers 
+*/
+
+
+/* shared twiddle and omega lists, keyed by two/three integers. */
+struct rader_tls {
+     int k1, k2, k3;
+     R *W;
+     int refcnt;
+     rader_tl *cdr; 
+};
+
+void X(rader_tl_insert)(int k1, int k2, int k3, R *W, rader_tl **tl)
+{
+     rader_tl *t = (rader_tl *) MALLOC(sizeof(rader_tl), TWIDDLES);
+     t->k1 = k1; t->k2 = k2; t->k3 = k3; t->W = W;
+     t->refcnt = 1; t->cdr = *tl; *tl = t;
+}
+
+R *X(rader_tl_find)(int k1, int k2, int k3, rader_tl *t)
+{
+     while (t && (t->k1 != k1 || t->k2 != k2 || t->k3 != k3))
+	  t = t->cdr;
+     if (t) {
+	  ++t->refcnt;
+	  return t->W;
+     } else 
+	  return 0;
+}
+
+void X(rader_tl_delete)(R *W, rader_tl **tl)
+{
+     if (W) {
+	  rader_tl **tp, *t;
+
+	  for (tp = tl; (t = *tp) && t->W != W; tp = &t->cdr)
+	       ;
+
+	  if (t && --t->refcnt <= 0) {
+	       *tp = t->cdr;
+	       X(ifree)(t->W);
+	       X(ifree)(t);
+	  }
+     }
+}
diff --git a/src/fftw3/kernel/md5-1.c b/src/fftw3/kernel/md5-1.c
new file mode 100644
index 0000000..a2e87a1
--- /dev/null
+++ b/src/fftw3/kernel/md5-1.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "ifftw.h"
+
+
+void X(md5putb)(md5 *p, const void *d_, int len)
+{
+     int i;
+     const unsigned char *d = (const unsigned char *)d_;
+     for (i = 0; i < len; ++i)
+	  X(md5putc)(p, d[i]);
+}
+
+void X(md5puts)(md5 *p, const char *s)
+{
+     /* also hash final '\0' */
+     do {
+	  X(md5putc)(p, *s);
+     } while(*s++);
+}
+
+void X(md5int)(md5 *p, int i)
+{
+     X(md5putb)(p, &i, sizeof(i));
+}
+
+void X(md5unsigned)(md5 *p, unsigned i)
+{
+     X(md5putb)(p, &i, sizeof(i));
+}
+
+void X(md5ptrdiff)(md5 *p, ptrdiff_t d)
+{
+     X(md5putb)(p, &d, sizeof(d));
+}
+
diff --git a/src/fftw3/kernel/md5.c b/src/fftw3/kernel/md5.c
new file mode 100644
index 0000000..edbd811
--- /dev/null
+++ b/src/fftw3/kernel/md5.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* 
+   independent implementation of Ron Rivest's MD5 message-digest
+   algorithm, based on rfc 1321.
+
+   Optimized for small code size, not speed.  Works as long as
+   sizeof(md5uint) >= 4.
+*/
+
+#include "ifftw.h"
+
+/* sintab[i] = 4294967296.0 * abs(sin((double)(i + 1))) */
+static const md5uint sintab[64] = {
+     0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
+     0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
+     0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+     0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
+     0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
+     0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+     0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
+     0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
+     0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+     0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
+     0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
+     0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+     0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
+     0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
+     0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+     0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+}; 
+
+/* see rfc 1321 section 3.4 */
+static const struct roundtab {
+     char k; 
+     char s;
+} roundtab[64] = {
+     {  0,  7}, {  1, 12}, {  2, 17}, {  3, 22},
+     {  4,  7}, {  5, 12}, {  6, 17}, {  7, 22},
+     {  8,  7}, {  9, 12}, { 10, 17}, { 11, 22},
+     { 12,  7}, { 13, 12}, { 14, 17}, { 15, 22},
+     {  1,  5}, {  6,  9}, { 11, 14}, {  0, 20},
+     {  5,  5}, { 10,  9}, { 15, 14}, {  4, 20},
+     {  9,  5}, { 14,  9}, {  3, 14}, {  8, 20},
+     { 13,  5}, {  2,  9}, {  7, 14}, { 12, 20},
+     {  5,  4}, {  8, 11}, { 11, 16}, { 14, 23},
+     {  1,  4}, {  4, 11}, {  7, 16}, { 10, 23},
+     { 13,  4}, {  0, 11}, {  3, 16}, {  6, 23},
+     {  9,  4}, { 12, 11}, { 15, 16}, {  2, 23},
+     {  0,  6}, {  7, 10}, { 14, 15}, {  5, 21},
+     { 12,  6}, {  3, 10}, { 10, 15}, {  1, 21},
+     {  8,  6}, { 15, 10}, {  6, 15}, { 13, 21},
+     {  4,  6}, { 11, 10}, {  2, 15}, {  9, 21}
+};
+
+#define rol(a, s) ((a << (int)(s)) | (a >> (32 - (int)(s))))
+
+static void doblock(md5sig state, const unsigned char *data)
+{
+     md5uint a, b, c, d, t, x[16];
+     const md5uint msk = 0xffffffffUL;
+     int i;
+
+     /* encode input bytes into md5uint */
+     for (i = 0; i < 16; ++i) {
+	  const unsigned char *p = data + 4 * i;
+	  x[i] = p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
+     }
+
+     a = state[0]; b = state[1]; c = state[2]; d = state[3];
+     for (i = 0; i < 64; ++i) {
+	  const struct roundtab *p = roundtab + i;
+	  int round = i / 16;
+	  switch (round) {
+	      case 0: a += (b & c) | (~b & d); break;
+	      case 1: a += (b & d) | (c & ~d); break;
+	      case 2: a += b ^ c ^ d; break;
+	      case 3: a += c ^ (b | ~d); break;
+	  }
+	  a += sintab[i];
+	  a += x[(int)(p->k)];
+	  a &= msk;
+	  t = b + rol(a, p->s);
+	  a = d; d = c; c = b; b = t;
+     }
+     state[0] = (state[0] + a) & msk;
+     state[1] = (state[1] + b) & msk;
+     state[2] = (state[2] + c) & msk;
+     state[3] = (state[3] + d) & msk;
+}
+
+
+void X(md5begin)(md5 *p)
+{
+     p->s[0] = 0x67452301;
+     p->s[1] = 0xefcdab89;
+     p->s[2] = 0x98badcfe;
+     p->s[3] = 0x10325476;
+     p->l = 0;
+}
+
+void X(md5putc)(md5 *p, unsigned char c)
+{
+     p->c[p->l % 64] = c;
+     if (((++p->l) % 64) == 0) doblock(p->s, p->c);
+}
+
+void X(md5end)(md5 *p)
+{
+     unsigned l, i;
+
+     l = 8 * p->l; /* length before padding, in bits */
+
+     /* rfc 1321 section 3.1: padding */
+     X(md5putc)(p, 0x80);
+     while ((p->l % 64) != 56) X(md5putc)(p, 0x00);
+
+     /* rfc 1321 section 3.2: length (little endian) */
+     for (i = 0; i < 8; ++i) {
+	  X(md5putc)(p, l & 0xFF);
+	  l = l >> 8;
+     }
+
+     /* Now p->l % 64 == 0 and signature is in p->s */
+}
diff --git a/src/fftw3/kernel/minmax.c b/src/fftw3/kernel/minmax.c
new file mode 100644
index 0000000..fc6f3a3
--- /dev/null
+++ b/src/fftw3/kernel/minmax.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: minmax.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+int X(imax)(int a, int b)
+{
+     return (a > b) ? a : b;
+}
+
+int X(imin)(int a, int b)
+{
+     return (a < b) ? a : b;
+}
diff --git a/src/fftw3/kernel/ops.c b/src/fftw3/kernel/ops.c
new file mode 100644
index 0000000..9e927c1
--- /dev/null
+++ b/src/fftw3/kernel/ops.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: ops.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+void X(ops_zero)(opcnt *dst)
+{
+     dst->add = dst->mul = dst->fma = dst->other = 0;
+}
+
+void X(ops_cpy)(const opcnt *src, opcnt *dst)
+{
+     *dst = *src;
+}
+
+void X(ops_other)(int o, opcnt *dst)
+{
+     X(ops_zero)(dst);
+     dst->other = o;
+}
+
+void X(ops_madd)(int m, const opcnt *a, const opcnt *b, opcnt *dst)
+{
+     dst->add = m * a->add + b->add;
+     dst->mul = m * a->mul + b->mul;
+     dst->fma = m * a->fma + b->fma;
+     dst->other = m * a->other + b->other;
+}
+
+void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst)
+{
+     X(ops_madd)(1, a, b, dst);
+}
+
+void X(ops_add2)(const opcnt *a, opcnt *dst)
+{
+     X(ops_add)(a, dst, dst);
+}
+
+void X(ops_madd2)(int m, const opcnt *a, opcnt *dst)
+{
+     X(ops_madd)(m, a, dst, dst);
+}
+
diff --git a/src/fftw3/kernel/pickdim.c b/src/fftw3/kernel/pickdim.c
new file mode 100644
index 0000000..034e6db
--- /dev/null
+++ b/src/fftw3/kernel/pickdim.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "ifftw.h"
+
+/* $Id: pickdim.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+/* Given a solver which_dim, a vector sz, and whether or not the
+   transform is out-of-place, return the actual dimension index that
+   it corresponds to.  The basic idea here is that we return the
+   which_dim'th valid dimension, starting from the end if
+   which_dim < 0. */
+static int really_pickdim(int which_dim, const tensor *sz, int oop, int *dp)
+{
+     int i;
+     int count_ok = 0;
+     if (which_dim > 0) {
+          for (i = 0; i < sz->rnk; ++i) {
+               if (oop || sz->dims[i].is == sz->dims[i].os)
+                    if (++count_ok == which_dim) {
+                         *dp = i;
+                         return 1;
+                    }
+          }
+     }
+     else if (which_dim < 0) {
+          for (i = sz->rnk; i > 0; --i) {
+               if (oop || sz->dims[i - 1].is == sz->dims[i - 1].os)
+                    if (++count_ok == -which_dim) {
+                         *dp = i - 1;
+                         return 1;
+                    }
+          }
+     }
+     else { /* zero: pick the middle, if valid */
+	  i = sz->rnk / 2 - 1;
+	  if (i < sz->rnk && (oop || sz->dims[i].is == sz->dims[i].os)) {
+	       *dp = i;
+	       return 1;
+	  }
+     }
+     return 0;
+}
+
+/* Like really_pickdim, but only returns 1 if no previous "buddy"
+   which_dim in the buddies list would give the same dim. */
+int X(pickdim)(int which_dim, const int *buddies, int nbuddies,
+	       const tensor *sz, int oop, int *dp)
+{
+     int i, d1;
+
+     if (!really_pickdim(which_dim, sz, oop, dp))
+          return 0;
+
+     /* check whether some buddy solver would produce the same dim.
+        If so, consider this solver unapplicable and let the buddy
+        take care of it.  The smallest-indexed buddy is applicable. */
+     for (i = 0; i < nbuddies; ++i) {
+          if (buddies[i] == which_dim)
+               break;  /* found self */
+          if (really_pickdim(buddies[i], sz, oop, &d1) && *dp == d1)
+               return 0; /* found equivalent buddy */
+     }
+     return 1;
+}
diff --git a/src/fftw3/kernel/planner.c b/src/fftw3/kernel/planner.c
new file mode 100644
index 0000000..1d01cb9
--- /dev/null
+++ b/src/fftw3/kernel/planner.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2000 Matteo Frigo
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: planner.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+#include "ifftw.h"
+#include <string.h>
+
+/* GNU Coding Standards, Sec. 5.2: "Please write the comments in a GNU
+   program in English, because English is the one language that nearly
+   all programmers in all countries can read."
+
+                    ingemisco tanquam reus
+		    culpa rubet vultus meus
+		    supplicanti parce [rms]
+*/
+
+#define BLESSEDP(solution) ((solution)->flags & BLESSING)
+#define VALIDP(solution) ((solution)->flags & H_VALID)
+#define UNBLESS(flags)  flags &= ~BLESSING
+
+#define MAXNAM 64  /* maximum length of registrar's name.
+		      Used for reading wisdom.  There is no point
+		      in doing this right */
+
+/* Flags f1 subsumes flags f2 iff f1 is less/equally impatient than
+   f2, defining a partial ordering. */
+#define IMPATIENCE(flags) ((flags) & IMPATIENCE_FLAGS)
+#define NONIMPATIENCE(flags) ((flags) & NONIMPATIENCE_FLAGS)
+#define ORDERED(f1, f2) (SUBSUMES(f1, f2) || SUBSUMES(f2, f1))
+#define SUBSUMES(f1, f2) ((IMPATIENCE(f1) & (f2)) == IMPATIENCE(f1))
+
+static unsigned addmod(unsigned a, unsigned b, unsigned p)
+{
+     /* gcc-2.95/sparc produces incorrect code for the fast version below. */
+#if defined(__sparc__) && defined(__GNUC__)
+     /* slow version  */
+     return (a + b) % p;
+#else
+     /* faster version */
+     unsigned c = a + b;
+     return c >= p ? c - p : c;
+#endif
+}
+
+/*
+  slvdesc management:
+*/
+static void sgrow(planner *ego)
+{
+     unsigned osiz = ego->slvdescsiz, nsiz = 1 + osiz + osiz / 4;
+     slvdesc *ntab = (slvdesc *)MALLOC(nsiz * sizeof(slvdesc), SLVDESCS);
+     slvdesc *otab = ego->slvdescs;
+     unsigned i;
+
+     ego->slvdescs = ntab;
+     ego->slvdescsiz = nsiz;
+     for (i = 0; i < osiz; ++i)
+	  ntab[i] = otab[i];
+     X(ifree0)(otab);
+}
+
+static void register_solver(planner *ego, solver *s)
+{
+     slvdesc *n;
+     if (s) { /* add s to solver list */
+	  X(solver_use)(s);
+
+	  if (ego->nslvdesc >= ego->slvdescsiz)
+	       sgrow(ego);
+
+	  n = ego->slvdescs + ego->nslvdesc++;
+
+	  n->slv = s;
+	  n->reg_nam = ego->cur_reg_nam;
+	  n->reg_id = ego->cur_reg_id++;
+	  
+	  A(strlen(n->reg_nam) < MAXNAM);
+	  n->nam_hash = X(hash)(n->reg_nam);
+     }
+}
+
+static int slookup(planner *ego, char *nam, int id)
+{
+     unsigned h = X(hash)(nam); /* used to avoid strcmp in the common case */
+     FORALL_SOLVERS(ego, s, sp, {
+	  UNUSED(s);
+	  if (sp->reg_id == id && sp->nam_hash == h
+	      && !strcmp(sp->reg_nam, nam))
+	       return sp - ego->slvdescs;
+     });
+     return -1;
+}
+
+/*
+  md5-related stuff:
+*/
+
+/* first hash function */
+static unsigned h1(planner *ego, const md5sig s)
+{
+     return s[0] % ego->hashsiz;
+}
+
+/* second hash function (for double hashing) */
+static unsigned h2(planner *ego, const md5sig s)
+{
+     return 1U + s[1] % (ego->hashsiz - 1);
+}
+
+static void md5hash(md5 *m, const problem *p, const planner *plnr)
+{
+     X(md5begin)(m);
+     X(md5unsigned)(m, sizeof(R)); /* so we don't mix different precisions */
+     X(md5unsigned)(m, plnr->problem_flags);
+     X(md5int)(m, plnr->nthr);
+     p->adt->hash(p, m);
+     X(md5end)(m);
+}
+
+static int md5eq(const md5sig a, const md5sig b)
+{
+     return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
+}
+
+static void sigcpy(const md5sig a, md5sig b)
+{
+     b[0] = a[0]; b[1] = a[1]; b[2] = a[2]; b[3] = a[3];
+}
+
+/*
+  memoization routines :
+*/
+
+/*
+   liber scriptus proferetur
+   in quo totum continetur
+   unde mundus iudicetur
+*/
+struct solution_s {
+     md5sig s;
+     unsigned short flags;
+     short slvndx;
+};
+
+static solution *hlookup(planner *ego, const md5sig s, unsigned short flags)
+{
+     unsigned g, h = h1(ego, s), d = h2(ego, s);
+
+     ++ego->lookup;
+
+     for (g = h; ; g = addmod(g, d, ego->hashsiz)) {
+	  solution *l = ego->solutions + g;
+	  ++ego->lookup_iter;
+	  if (VALIDP(l)) {
+	       if (md5eq(s, l->s) && ORDERED(l->flags, flags)) { 
+		    ++ego->succ_lookup;
+		    return l; 
+	       }
+	  } else {
+	       return 0;
+	  }
+	  A((g + d) % ego->hashsiz != h);
+     }
+}
+
+
+static void hinsert0(planner *ego, const md5sig s, unsigned short flags,
+		     int slvndx, solution *l)
+{
+     ++ego->insert;
+     if (!l) { 	 
+	  /* search for nonfull slot */
+	  unsigned g, h = h1(ego, s), d = h2(ego, s); 
+	  ++ego->insert_unknown;
+	  for (g = h; ; g = addmod(g, d, ego->hashsiz)) {
+	       ++ego->insert_iter;
+	       l = ego->solutions + g;
+	       if (!VALIDP(l)) break;
+	       A((g + d) % ego->hashsiz != h);
+	  }
+     }
+
+     /* fill slot */
+     l->flags = flags | H_VALID;
+     l->slvndx = (short)slvndx;
+     sigcpy(s, l->s);
+}
+
+static void rehash(planner *ego, unsigned nsiz)
+{
+     unsigned osiz = ego->hashsiz, h;
+     solution *osol = ego->solutions, *nsol;
+
+     nsiz = (unsigned)X(next_prime)((int)nsiz);
+     nsol = (solution *)MALLOC(nsiz * sizeof(solution), HASHT);
+     ++ego->nrehash;
+
+     /* init new table */
+     for (h = 0; h < nsiz; ++h) 
+	  nsol[h].flags = 0;
+
+     /* install new table */
+     ego->hashsiz = nsiz;
+     ego->solutions = nsol;
+
+     /* copy table */
+     for (h = 0; h < osiz; ++h) {
+	  solution *l = osol + h;
+	  if (VALIDP(l))
+	       hinsert0(ego, l->s, l->flags, l->slvndx, 0);
+     }
+
+     X(ifree0)(osol);
+}
+
+static unsigned minsz(unsigned nelem)
+{
+     return 1U + nelem + nelem / 8U;
+}
+
+static unsigned nextsz(unsigned nelem)
+{
+     return minsz(minsz(nelem));
+}
+
+static void hgrow(planner *ego)
+{
+     unsigned nelem = ego->nelem;
+     if (minsz(nelem) >= ego->hashsiz)
+	  rehash(ego, nextsz(nelem));
+}
+
+static void hshrink(planner *ego)
+{
+     unsigned nelem = ego->nelem;
+     /* always rehash after deletions */
+     rehash(ego, nextsz(nelem));
+}
+
+/* inherit blessing, but only if the solver is the same */
+static unsigned short merge_flags(unsigned short dstflags, int dstndx,
+				  unsigned short srcflags, int srcndx)
+{
+     if (srcndx == dstndx)
+	  dstflags |= (srcflags & BLESSING); /* ne me perdas illa die */
+     return dstflags;
+}
+
+static void hinsert(planner *ego, const md5sig s, 
+		    unsigned short flags, int slvndx)
+{
+     solution *l;
+
+     if ((l = hlookup(ego, s, flags))) {
+	  if (SUBSUMES(flags, l->flags)) {
+	       /* overwrite old solution */
+	       flags = merge_flags(flags, slvndx, l->flags, l->slvndx);
+	  } else {
+	       A(SUBSUMES(l->flags, flags));
+	       l->flags = merge_flags(l->flags, l->slvndx, flags, slvndx);
+	       return;
+	  }
+     } else {
+	  ++ego->nelem;
+	  hgrow(ego);
+     }
+     hinsert0(ego, s, flags, slvndx, l);
+}
+
+static void hcurse_subsumed(planner *ego)
+{
+     unsigned h;
+
+     /* unbless any entries that are unreachable because they
+        are subsumed by less-impatient ones.  */
+     for (h = 0; h < ego->hashsiz; ++h) {
+	  solution *l = ego->solutions + h;
+	  if (VALIDP(l)) {
+	       unsigned d = h2(ego, l->s), g = addmod(h, d, ego->hashsiz);
+	       for (; ; g = addmod(g, d, ego->hashsiz)) {
+		    solution *m = ego->solutions + g;
+		    if (VALIDP(m)) {
+			 if (md5eq(l->s, m->s) &&
+			     SUBSUMES(l->flags, m->flags)) {
+			      /* ne cadant in obscurum */
+			      l->flags = merge_flags(l->flags, l->slvndx,
+						     m->flags, m->slvndx);
+
+			      /* cum vix justus sit securus */
+			      UNBLESS(m->flags);
+			 }
+		    }
+		    else break;
+		    A((g + d) % ego->hashsiz != h);
+	       }
+	  }
+     }
+}
+
+
+static void invoke_hook(planner *ego, plan *pln, const problem *p, 
+			int optimalp)
+{
+     if (ego->hook)
+	  ego->hook(pln, p, optimalp);
+}
+
+static void evaluate_plan(planner *ego, plan *pln, const problem *p)
+{
+     if (!BELIEVE_PCOSTP(ego) || pln->pcost == 0.0) {
+	  ego->nplan++;
+
+	  if (ESTIMATEP(ego)) {
+	  estimate:
+	       /* heuristic */
+	       pln->pcost = 0.0
+		    + pln->ops.add
+		    + pln->ops.mul
+		    + 2 * pln->ops.fma
+		    + pln->ops.other;
+	       ego->epcost += pln->pcost;
+	  } else {
+	       double t = X(measure_execution_time)(pln, p);
+
+	       if (t < 0) {  /* unavailable cycle counter */
+		    /* Real programmers can write FORTRAN in any language */
+		    goto estimate;
+	       }
+
+	       pln->pcost = t;
+	       ego->pcost += t;
+	  }
+     }
+     
+     invoke_hook(ego, pln, p, 0);
+}
+
+/* maintain dynamic scoping of flags, nthr: */
+static plan *invoke_solver(planner *ego, problem *p, solver *s, 
+			   unsigned short nflags)
+{
+     unsigned short planner_flags = ego->planner_flags;
+     unsigned problem_flags = ego->problem_flags;
+     int nthr = ego->nthr;
+     plan *pln;
+     ego->planner_flags = nflags;
+     pln = s->adt->mkplan(s, p, ego);
+     ego->problem_flags = problem_flags;
+     ego->nthr = nthr;
+     ego->planner_flags = planner_flags;
+     return pln;
+}
+
+static plan *search(planner *ego, problem *p, slvdesc **descp)
+{
+     plan *best = 0;
+     int best_not_yet_timed = 1;
+     int pass;
+
+     if (NO_SEARCHP(ego)) {
+	  /* D("invalid search for %P %x\n", p, ego->planner_flags); */
+	  return 0;
+     }
+
+     for (pass = 0; pass < 2; ++pass) {
+	  unsigned short nflags = ego->planner_flags;
+	  
+	  if (best) break;
+
+	  switch (pass) {
+	      case 0: 
+		   /* skip pass 0 during exhaustive search */
+		   if (!NO_EXHAUSTIVEP(ego)) continue;
+		   nflags |= NO_UGLY;
+		   break;
+	      case 1:
+		   /* skip pass 1 if NO_UGLY */
+		   if (NO_UGLYP(ego)) continue;
+		   break;
+	  }
+
+          FORALL_SOLVERS(ego, s, sp, {
+	       plan *pln = invoke_solver(ego, p, s, nflags);
+
+	       if (pln) {
+		    if (best) {
+			 if (best_not_yet_timed) {
+			      evaluate_plan(ego, best, p);
+			      best_not_yet_timed = 0;
+			 }
+			 evaluate_plan(ego, pln, p);
+			 if (pln->pcost < best->pcost) {
+			      X(plan_destroy_internal)(best);
+			      best = pln;
+			      *descp = sp;
+			 } else {
+			      X(plan_destroy_internal)(pln);
+			 }
+		    } else {
+			 best = pln;
+			 *descp = sp;
+		    }
+	       }
+	  });
+     }
+
+     return best;
+}
+
+static plan *mkplan(planner *ego, problem *p)
+{
+     plan *pln;
+     md5 m;
+     slvdesc *sp;
+     unsigned short flags;
+     ASSERT_ALIGNED_DOUBLE;
+
+     /* Canonical form. */
+     if (!NO_EXHAUSTIVEP(ego)) ego->planner_flags &= ~NO_UGLY;
+	  
+     ++ego->nprob;
+     md5hash(&m, p, ego);
+
+     pln = 0;
+
+     {
+	  solution *sol; /* new scope for sol */
+
+	  if ((sol = hlookup(ego, m.s, ego->planner_flags))) {
+	       if (SUBSUMES(sol->flags, ego->planner_flags)) {
+		    /* wisdom is acceptable */
+		    if (sol->slvndx < 0) 
+			 return 0;   /* known to be infeasible */
+
+		    /* use solver to obtain a plan */
+		    sp = ego->slvdescs + sol->slvndx;
+		    pln = 
+			 invoke_solver(ego, p, sp->slv, 
+				       (0
+					| NO_SEARCH 
+					| IMPATIENCE(sol->flags)
+					| NONIMPATIENCE(ego->planner_flags) ));
+
+		    /* if (!pln) then the entry is bogus, but
+		       we currently do nothing about it. */
+		    /* CAVEAS: Do not use ``sol'' here, because the
+		       pointer is possibly dangling after the call to
+		       invoke_solver(). */
+	       } else {
+		    A(SUBSUMES(ego->planner_flags, sol->flags));
+	       }
+	  }
+     }
+
+
+     if (!pln)
+	  pln = search(ego, p, &sp);
+
+     flags = ego->planner_flags;
+
+     if (pln) {
+	  /* Postulate de iure that NO_UGLY subsumes ~NO_UGLY if the
+	     problem is feasible. Also postulate that NO_SEARCH
+	     subsumes ~NO_SEARCH. */
+	  flags &= ~(NO_UGLY | NO_SEARCH);
+     }
+
+     hinsert(ego, m.s, flags, pln ? sp - ego->slvdescs : -1);
+
+     if (pln)
+	  invoke_hook(ego, pln, p, 1);
+     return pln;
+}
+
+/* destroy hash table entries.  If FORGET_EVERYTHING, destroy the whole
+   table.  If FORGET_ACCURSED, then destroy entries that are not blessed. */
+static void forget(planner *ego, amnesia a)
+{
+     unsigned h;
+
+     /* garbage-collect while we are at it */ 
+     if (a != FORGET_EVERYTHING)
+	  hcurse_subsumed(ego);
+
+     for (h = 0; h < ego->hashsiz; ++h) {
+	  solution *l = ego->solutions + h;
+	  if (VALIDP(l)) {
+	       if (a == FORGET_EVERYTHING ||
+		   (a == FORGET_ACCURSED && !BLESSEDP(l))) {
+		    /* confutatis maledictis
+		       flammis acribus addictis */
+		    l->flags &= ~H_VALID;
+		    --ego->nelem;
+	       }
+	  }
+     }
+     /* nil inultum remanebit */
+
+     hshrink(ego);
+}
+
+static void htab_destroy(planner *ego)
+{
+     forget(ego, FORGET_EVERYTHING);
+     X(ifree)(ego->solutions);
+     ego->nelem = 0U;
+}
+
+/* FIXME: what sort of version information should we write? */
+#define WISDOM_PREAMBLE PACKAGE "-" VERSION " " STRINGIZE(X(wisdom))
+
+/* tantus labor non sit cassus */
+static void exprt(planner *ego, printer *p)
+{
+     unsigned h;
+
+     hcurse_subsumed(ego);
+
+     p->print(p, "(" WISDOM_PREAMBLE "%(");
+     for (h = 0; h < ego->hashsiz; ++h) {
+	  solution *l = ego->solutions + h;
+	  if (VALIDP(l) && BLESSEDP(l) && l->slvndx >= 0) {
+	       slvdesc *sp = ego->slvdescs + l->slvndx;
+	       /* qui salvandos salvas gratis
+		  salva me fons pietatis */
+	       p->print(p, "(%s %d #x%x #x%M #x%M #x%M #x%M)\n",
+			sp->reg_nam, sp->reg_id, (int)l->flags,
+			l->s[0], l->s[1], l->s[2], l->s[3]);
+	  }
+     }
+     p->print(p, "%))\n");
+}
+
+/* mors stupebit et natura
+   cum resurget creatura */
+static int imprt(planner *ego, scanner *sc)
+{
+     char buf[MAXNAM + 1];
+     md5uint sig[4];
+     int flags;
+     int reg_id;
+     int slvndx;
+     solution *sol;
+
+     if (!sc->scan(sc, "(" WISDOM_PREAMBLE))
+	  return 0; /* don't need to restore hashtable */
+
+     /* make a backup copy of the hash table (cache the hash) */
+     {
+	  unsigned h, hsiz = ego->hashsiz;
+	  sol = (solution *)MALLOC(hsiz * sizeof(solution), HASHT);
+	  for (h = 0; h < hsiz; ++h)
+	       sol[h] = ego->solutions[h];
+     }
+
+     while (1) {
+	  if (sc->scan(sc, ")"))
+	       break;
+
+	  /* qua resurget ex favilla */
+	  if (!sc->scan(sc, "(%*s %d #x%x #x%M #x%M #x%M #x%M)",
+			MAXNAM, buf, &reg_id, &flags, 
+			sig + 0, sig + 1, sig + 2, sig + 3))
+	       goto bad;
+
+	  if ((slvndx = slookup(ego, buf, reg_id)) < 0)
+	       goto bad;
+
+	  /* inter oves locum praesta */
+	  hinsert(ego, sig, (unsigned short)flags, slvndx);
+     }
+
+     X(ifree0)(sol);
+     return 1;
+
+ bad:
+     /* ``The wisdom of FFTW must be above suspicion.'' */
+     X(ifree0)(ego->solutions);
+     ego->solutions = sol;
+     return 0;
+}
+
+/*
+ * create a planner
+ */
+planner *X(mkplanner)(void)
+{
+     static const planner_adt padt = {
+	  register_solver, mkplan, forget, exprt, imprt
+     };
+
+     planner *p = (planner *) MALLOC(sizeof(planner), PLANNERS);
+
+     p->adt = &padt;
+     p->nplan = p->nprob = p->nrehash = 0;
+     p->pcost = p->epcost = 0.0;
+     p->succ_lookup = p->lookup = p->lookup_iter = 0;
+     p->insert = p->insert_iter = p->insert_unknown = 0;
+     p->hook = 0;
+     p->cur_reg_nam = 0;
+
+     p->slvdescs = 0;
+     p->nslvdesc = p->slvdescsiz = 0;
+
+     p->solutions = 0;
+     p->hashsiz = p->nelem = 0U;
+
+     p->problem_flags = 0;
+     p->planner_flags = 0;
+     p->nthr = 1;
+
+     hgrow(p);			/* so that hashsiz > 0 */
+
+     return p;
+}
+
+void X(planner_destroy)(planner *ego)
+{
+     /* destroy hash table */
+     htab_destroy(ego);
+
+     /* destroy solvdesc table */
+     FORALL_SOLVERS(ego, s, sp, {
+	  UNUSED(sp);
+	  X(solver_destroy)(s);
+     });
+
+     X(ifree0)(ego->slvdescs);
+     X(ifree)(ego); /* dona eis requiem */
+}
+
+plan *X(mkplan_d)(planner *ego, problem *p)
+{
+     plan *pln = ego->adt->mkplan(ego, p);
+     X(problem_destroy)(p);
+     return pln;
+}
+
+/*
+ * Debugging code:
+ */
+#ifdef FFTW_DEBUG
+
+void X(planner_dump)(planner *ego, int verbose)
+{
+     unsigned valid = 0, empty = 0, infeasible = 0;
+     unsigned h;
+     UNUSED(verbose); /* historical */
+
+     for (h = 0; h < ego->hashsiz; ++h) {
+	  solution *l = ego->solutions + h; 
+	  if (VALIDP(l)) {
+	       ++valid; 
+	       if (l->slvndx < 0) ++infeasible;
+	  } else
+	       ++empty;
+	  
+     }
+
+     D("nplan = %d\n", ego->nplan);
+     D("nprob = %d\n", ego->nprob);
+     D("pcost = %g\n", ego->pcost);
+     D("epcost = %g\n", ego->epcost);
+     D("lookup = %d\n", ego->lookup);
+     D("succ_lookup = %d\n", ego->succ_lookup);
+     D("lookup_iter = %d\n", ego->lookup_iter);
+     D("insert = %d\n", ego->insert);
+     D("insert_iter = %d\n", ego->insert_iter);
+     D("insert_unknown = %d\n", ego->insert_unknown);
+     D("nrehash = %d\n", ego->nrehash);
+     D("hashsiz = %u\n", ego->hashsiz);
+     D("empty = %d\n", empty);
+     D("valid = %d\n", valid);
+     D("infeasible = %d\n", infeasible);
+     A(ego->nelem == valid);
+}
+
+#endif
diff --git a/src/fftw3/kernel/primes.c b/src/fftw3/kernel/primes.c
new file mode 100644
index 0000000..608e51c
--- /dev/null
+++ b/src/fftw3/kernel/primes.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: primes.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+/***************************************************************************/
+
+/* Rader's algorithm requires lots of modular arithmetic, and if we
+   aren't careful we can have errors due to integer overflows. */
+
+#ifdef SAFE_MULMOD
+
+#  include <limits.h>
+
+/* compute (x * y) mod p, but watch out for integer overflows; we must
+   have x, y >= 0, p > 0.  This routine is slow. */
+int X(safe_mulmod)(int x, int y, int p)
+{
+     if (y == 0 || x <= INT_MAX / y)
+	  return((x * y) % p);
+     else {
+	  int y2 = y/2;
+	  return((X(safe_mulmod)(x, y2, p) +
+		  X(safe_mulmod)(x, y - y2, p)) % p);
+     }
+}
+#endif /* safe_mulmod ('long long' unavailable) */
+
+/***************************************************************************/
+
+/* Compute n^m mod p, where m >= 0 and p > 0.  If we really cared, we
+   could make this tail-recursive. */
+int X(power_mod)(int n, int m, int p)
+{
+     A(p > 0);
+     if (m == 0)
+	  return 1;
+     else if (m % 2 == 0) {
+	  int x = X(power_mod)(n, m / 2, p);
+	  return MULMOD(x, x, p);
+     }
+     else
+	  return MULMOD(n, X(power_mod)(n, m - 1, p), p);
+}
+
+/* the following two routines were contributed by Greg Dionne. */
+static int get_prime_factors(int n, int *primef)
+{
+     int i;
+     int size = 0;
+
+     primef[size++] = 2;
+     do
+	  n >>= 1;
+     while ((n & 1) == 0);
+
+     if (n == 1)
+	  return size;
+
+     for (i = 3; i * i <= n; i += 2)
+	  if (!(n % i)) {
+	       primef[size++] = i;
+	       do
+		    n /= i;
+	       while (!(n % i));
+	  }
+     if (n == 1)
+	  return size;
+     primef[size++] = n;
+     return size;
+}
+
+int X(find_generator)(int p)
+{
+    int n, i, size;
+    int primef[16];     /* smallest number = 32589158477190044730 > 2^64 */
+    int pm1 = p - 1;
+
+    if (p == 2)
+	 return 1;
+
+    size = get_prime_factors(pm1, primef);
+    n = 2;
+    for (i = 0; i < size; i++)
+        if (X(power_mod)(n, pm1 / primef[i], p) == 1) {
+            i = -1;
+            n++;
+        }
+    return n;
+}
+
+/* Return first prime divisor of n  (It would be at best slightly faster to
+   search a static table of primes; there are 6542 primes < 2^16.)  */
+int X(first_divisor)(int n)
+{
+     int i;
+     if (n <= 1)
+	  return n;
+     if (n % 2 == 0)
+	  return 2;
+     for (i = 3; i*i <= n; i += 2)
+	  if (n % i == 0)
+	       return i;
+     return n;
+}
+
+int X(is_prime)(int n)
+{
+     return(n > 1 && X(first_divisor)(n) == n);
+}
+
+int X(next_prime)(int n)
+{
+     while (!X(is_prime)(n)) ++n;
+     return n;
+}
diff --git a/src/fftw3/kernel/print.c b/src/fftw3/kernel/print.c
new file mode 100644
index 0000000..314be7c
--- /dev/null
+++ b/src/fftw3/kernel/print.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: print.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#define BSZ 64
+
+static void myputs(printer *p, const char *s)
+{
+     char c;
+     while ((c = *s++))
+          p->putchr(p, c);
+}
+
+static void vprint(printer *p, const char *format, va_list ap)
+{
+     char buf[BSZ];
+     const char *s = format;
+     char c;
+     int i;
+
+     for (i = 0; i < p->indent; ++i)
+          p->putchr(p, ' ');
+
+     while ((c = *s++)) {
+          switch (c) {
+	      case '%':
+		   switch ((c = *s++)) {
+		       case 'M': {
+			    /* md5 value */
+			    md5uint x = va_arg(ap, md5uint);
+			    x = 0xffffffffUL & x;
+			    sprintf(buf, "%8.8lx", (unsigned long)x);
+			    goto putbuf;
+		       }
+		       case 'c': {
+			    int x = va_arg(ap, int);
+			    p->putchr(p, x);
+			    break;
+		       }
+		       case 's': {
+			    char *x = va_arg(ap, char *);
+			    if (x)
+				 myputs(p, x);
+			    else
+				 goto putnull;
+			    break;
+		       }
+		       case 'd': {
+			    int x = va_arg(ap, int);
+			    sprintf(buf, "%d", x);
+			    goto putbuf;
+		       }
+		       case 't': {
+			    ptrdiff_t x;
+			    A(*s == 'd');
+			    s += 1;
+			    x = va_arg(ap, ptrdiff_t);
+			    /* should use C99 %td here, but
+			       this is not yet widespread enough */
+			    sprintf(buf, "%ld", (long) x);
+			    goto putbuf;
+		       }
+		       case 'f': case 'e': case 'g': {
+			    char fmt[3] = "%x";
+			    double x = va_arg(ap, double);
+			    fmt[1] = c;
+			    sprintf(buf, fmt, x);
+			    goto putbuf;
+		       }
+		       case 'v': {
+			    /* print optional vector length */
+			    int x = va_arg(ap, int);
+			    if (x > 1) {
+				 sprintf(buf, "-x%d", x);
+				 goto putbuf;
+			    }
+			    break;
+		       }
+		       case 'o': {
+			    /* integer option.  Usage: %oNAME= */
+			    int x = va_arg(ap, int);
+			    if (x)
+				 p->putchr(p, '/');
+			    while ((c = *s++) != '=')
+				 if (x)
+				      p->putchr(p, c);
+			    if (x) {
+				 sprintf(buf, "=%d", x);
+				 goto putbuf;
+			    }
+			    break;
+		       }
+		       case 'u': {
+			    unsigned x = va_arg(ap, unsigned);
+			    sprintf(buf, "%u", x);
+			    goto putbuf;
+		       }
+		       case 'x': {
+			    unsigned x = va_arg(ap, unsigned);
+			    sprintf(buf, "%x", x);
+			    goto putbuf;
+		       }
+		       case '(': {
+			    /* newline, augment indent level */
+			    p->putchr(p, '\n');
+			    p->indent += p->indent_incr;
+			    break;
+		       }
+		       case ')': {
+			    /* decrement indent level */
+			    p->indent -= p->indent_incr;
+			    break;
+		       }
+		       case 'p': {  /* note difference from C's %p */
+			    /* print plan */
+			    plan *x = va_arg(ap, plan *);
+			    if (x) 
+				 x->adt->print(x, p);
+			    else 
+				 goto putnull;
+			    break;
+		       }
+		       case 'P': {
+			    /* print problem */
+			    problem *x = va_arg(ap, problem *);
+			    if (x)
+				 x->adt->print(x, p);
+			    else
+				 goto putnull;
+			    break;
+		       }
+		       case 'T': {
+			    /* print tensor */
+			    tensor *x = va_arg(ap, tensor *);
+			    if (x)
+				 X(tensor_print)(x, p);
+			    else
+				 goto putnull;
+			    break;
+		       }
+		       default:
+			    A(0 /* unknown format */);
+			    break;
+
+		   putbuf:
+			    myputs(p, buf);
+			    break;
+		   putnull:
+			    myputs(p, "(null)");
+			    break;
+		   }
+		   break;
+	      default:
+		   p->putchr(p, c);
+		   break;
+          }
+     }
+}
+
+static void print(printer *p, const char *format, ...)
+{
+     va_list ap;
+     va_start(ap, format);
+     vprint(p, format, ap);
+     va_end(ap);
+}
+
+printer *X(mkprinter)(size_t size, 
+		      void (*putchr)(printer *p, char c),
+		      void (*cleanup)(printer *p))
+{
+     printer *s = (printer *)MALLOC(size, OTHER);
+     s->print = print;
+     s->vprint = vprint;
+     s->putchr = putchr;
+     s->cleanup = cleanup;
+     s->indent = 0;
+     s->indent_incr = 2;
+     return s;
+}
+
+void X(printer_destroy)(printer *p)
+{
+     if (p->cleanup)
+	  p->cleanup(p);
+     X(ifree)(p);
+}
diff --git a/src/fftw3/kernel/scan.c b/src/fftw3/kernel/scan.c
new file mode 100644
index 0000000..f5fa6e1
--- /dev/null
+++ b/src/fftw3/kernel/scan.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: scan.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+#include <string.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#ifdef USE_CTYPE
+#include <ctype.h>
+#else
+/* Screw ctype. On linux, the is* functions call a routine that gets
+   the ctype map in the current locale.  Because this operation is
+   expensive, the map is cached on a per-thread basis.  I am not
+   willing to link this crap with FFTW.  Not over my dead body.
+
+   Sic transit gloria mundi.
+*/
+#undef isspace
+#define isspace(x) ((x) >= 0 && (x) <= ' ')
+#undef isdigit
+#define isdigit(x) ((x) >= '0' && (x) <= '9')
+#undef isupper
+#define isupper(x) ((x) >= 'A' && (x) <= 'Z')
+#undef islower
+#define islower(x) ((x) >= 'a' && (x) <= 'z')
+#endif
+
+static int mygetc(scanner *sc)
+{
+     if (sc->ungotc != EOF) {
+	  int c = sc->ungotc;
+	  sc->ungotc = EOF;
+	  return c;
+     }
+     return(sc->getchr(sc));
+}
+
+#define GETCHR(sc) mygetc(sc)
+
+static void myungetc(scanner *sc, int c)
+{
+     sc->ungotc = c;
+}
+
+#define UNGETCHR(sc, c) myungetc(sc, c)
+
+static void eat_blanks(scanner *sc)
+{
+     int ch;
+     while (ch = GETCHR(sc), isspace(ch))
+          ;
+     UNGETCHR(sc, ch);
+}
+
+static void mygets(scanner *sc, char *s, size_t maxlen)
+{
+     char *s0 = s;
+     int ch;
+
+     A(maxlen > 0);
+     while ((ch = GETCHR(sc)) != EOF && !isspace(ch)
+	    && ch != ')' && ch != '(' && s < s0 + maxlen)
+	  *s++ = ch;
+     *s = 0;
+     UNGETCHR(sc, ch);
+}
+
+static long getlong(scanner *sc, int base, int *ret)
+{
+     int sign = 1, ch, count;
+     long x = 0;     
+
+     ch = GETCHR(sc);
+     if (ch == '-' || ch == '+') {
+	  sign = ch == '-' ? -1 : 1;
+	  ch = GETCHR(sc);
+     }
+     for (count = 0; ; ++count) {
+	  if (isdigit(ch)) 
+	       ch -= '0';
+	  else if (isupper(ch))
+	       ch -= 'A' - 10;
+	  else if (islower(ch))
+	       ch -= 'a' - 10;
+	  else
+	       break;
+	  x = x * base + ch;
+	  ch = GETCHR(sc);
+     }
+     x *= sign;
+     UNGETCHR(sc, ch);
+     *ret = count > 0;
+     return x;
+}
+
+/* vscan is mostly scanf-like, with our additional format specifiers,
+   but with a few twists.  It returns simply 0 or 1 indicating whether
+   the match was successful. '(' and ')' in the format string match
+   those characters preceded by any whitespace.  Finally, if a
+   character match fails, it will ungetchr() the last character back
+   onto the stream. */
+static int vscan(scanner *sc, const char *format, va_list ap)
+{
+     const char *s = format;
+     char c;
+     int ch = 0;
+     size_t fmt_len;
+
+     while ((c = *s++)) {
+	  fmt_len = 0;
+          switch (c) {
+	      case '%':
+	  getformat:
+		   switch ((c = *s++)) {
+		       case 's': {
+			    char *x = va_arg(ap, char *);
+			    mygets(sc, x, fmt_len);
+			    break;
+		       }
+		       case 'd': {
+			    int *x = va_arg(ap, int *);
+			    *x = (int) getlong(sc, 10, &ch);
+			    if (!ch) return 0;
+			    break;
+		       }
+		       case 'x': {
+			    int *x = va_arg(ap, int *);
+			    *x = (int) getlong(sc, 16, &ch);
+			    if (!ch) return 0;
+			    break;
+		       }
+		       case 'M': {
+			    md5uint *x = va_arg(ap, md5uint *);
+			    *x = 0xffffffffUL & getlong(sc, 16, &ch);
+			    if (!ch) return 0;
+			    break;
+		       }
+		       case '*': {
+			    if ((fmt_len = va_arg(ap, int)) <= 0) return 0;
+			    goto getformat;
+		       }
+		       default:
+			    A(0 /* unknown format */);
+			    break;
+		   }
+		   break;
+	      default:
+		   if (isspace(c) || c == '(' || c == ')')
+			eat_blanks(sc);
+		   if (!isspace(c) && (ch = GETCHR(sc)) != c) {
+			UNGETCHR(sc, ch);
+			return 0;
+		   }
+		   break;
+          }
+     }
+     return 1;
+}
+
+static int scan(scanner *sc, const char *format, ...)
+{
+     int ret;
+     va_list ap;
+     va_start(ap, format);
+     ret = vscan(sc, format, ap);
+     va_end(ap);
+     return ret;
+}
+
+scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc))
+{
+     scanner *s = (scanner *)MALLOC(size, OTHER);
+     s->scan = scan;
+     s->vscan = vscan;
+     s->getchr = getchr;
+     s->ungotc = EOF;
+     return s;
+}
+
+void X(scanner_destroy)(scanner *sc)
+{
+     X(ifree)(sc);
+}
diff --git a/src/fftw3/kernel/solver.c b/src/fftw3/kernel/solver.c
new file mode 100644
index 0000000..4bfb899
--- /dev/null
+++ b/src/fftw3/kernel/solver.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: solver.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+solver *X(mksolver)(size_t size, const solver_adt *adt)
+{
+     solver *s = (solver *)MALLOC(size, SOLVERS);
+
+     s->adt = adt;
+     s->refcnt = 0;
+     return s;
+}
+
+void X(solver_use)(solver *ego)
+{
+     ++ego->refcnt;
+}
+
+void X(solver_destroy)(solver *ego)
+{
+     if ((--ego->refcnt) == 0)
+          X(ifree)(ego);
+}
+
+void X(solver_register)(planner *plnr, solver *s)
+{
+     plnr->adt->register_solver(plnr, s);
+}
diff --git a/src/fftw3/kernel/solvtab.c b/src/fftw3/kernel/solvtab.c
new file mode 100644
index 0000000..496915e
--- /dev/null
+++ b/src/fftw3/kernel/solvtab.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: solvtab.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+void X(solvtab_exec)(solvtab tbl, planner *p)
+{
+     for (; tbl->reg_nam; ++tbl) {
+	  p->cur_reg_nam = tbl->reg_nam;
+	  p->cur_reg_id = 0;
+	  tbl->reg(p);
+     }
+     p->cur_reg_nam = 0;
+}
diff --git a/src/fftw3/kernel/square.c b/src/fftw3/kernel/square.c
new file mode 100644
index 0000000..4b5afab
--- /dev/null
+++ b/src/fftw3/kernel/square.c
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: square.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+int X(square)(int x)
+{
+     return x * x;
+}
diff --git a/src/fftw3/kernel/stride.c b/src/fftw3/kernel/stride.c
new file mode 100644
index 0000000..fda049f
--- /dev/null
+++ b/src/fftw3/kernel/stride.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: stride.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+#include "ifftw.h"
+
+#ifdef PRECOMPUTE_ARRAY_INDICES
+stride X(mkstride)(int n, int s)
+{
+     int i;
+     int *p = (int *) MALLOC(n * sizeof(int), STRIDES);
+
+     for (i = 0; i < n; ++i)
+          p[i] = s * i;
+
+     return p;
+}
+
+void X(stride_destroy)(stride p)
+{
+     X(ifree0)(p);
+}
+
+#endif
diff --git a/src/fftw3/kernel/tensor.c b/src/fftw3/kernel/tensor.c
new file mode 100644
index 0000000..1161963
--- /dev/null
+++ b/src/fftw3/kernel/tensor.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: tensor.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+tensor *X(mktensor)(int rnk) 
+{
+     tensor *x;
+
+     A(rnk >= 0);
+
+#if defined(STRUCT_HACK_KR)
+     if (FINITE_RNK(rnk) && rnk > 1)
+	  x = (tensor *)MALLOC(sizeof(tensor) + (rnk - 1) * sizeof(iodim),
+				    TENSORS);
+     else
+	  x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
+#elif defined(STRUCT_HACK_C99)
+     if (FINITE_RNK(rnk))
+	  x = (tensor *)MALLOC(sizeof(tensor) + rnk * sizeof(iodim),
+				    TENSORS);
+     else
+	  x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
+#else
+     x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
+     if (FINITE_RNK(rnk) && rnk > 0)
+          x->dims = (iodim *)MALLOC(sizeof(iodim) * rnk, TENSORS);
+     else
+          x->dims = 0;
+#endif
+
+     x->rnk = rnk;
+     return x;
+}
+
+void X(tensor_destroy)(tensor *sz)
+{
+#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
+     X(ifree0)(sz->dims);
+#endif
+     X(ifree)(sz);
+}
+
+int X(tensor_sz)(const tensor *sz)
+{
+     int i, n = 1;
+
+     if (!FINITE_RNK(sz->rnk))
+          return 0;
+
+     for (i = 0; i < sz->rnk; ++i)
+          n *= sz->dims[i].n;
+     return n;
+}
+
+void X(tensor_md5)(md5 *p, const tensor *t)
+{
+     int i;
+     X(md5int)(p, t->rnk);
+     if (FINITE_RNK(t->rnk)) {
+	  for (i = 0; i < t->rnk; ++i) {
+	       const iodim *q = t->dims + i;
+	       X(md5int)(p, q->n);
+	       X(md5int)(p, q->is);
+	       X(md5int)(p, q->os);
+	  }
+     }
+}
+
+/* treat a (rank <= 1)-tensor as a rank-1 tensor, extracting
+   appropriate n, is, and os components */
+int X(tensor_tornk1)(const tensor *t, int *n, int *is, int *os)
+{
+     A(t->rnk <= 1);
+     if (t->rnk == 1) {
+	  const iodim *vd = t->dims;
+          *n = vd[0].n;
+          *is = vd[0].is;
+          *os = vd[0].os;
+     } else {
+          *n = 1;
+          *is = *os = 0;
+     }
+     return 1;
+}
+
+void X(tensor_print)(const tensor *x, printer *p)
+{
+     if (FINITE_RNK(x->rnk)) {
+	  int i;
+	  int first = 1;
+	  p->print(p, "(");
+	  for (i = 0; i < x->rnk; ++i) {
+	       const iodim *d = x->dims + i;
+	       p->print(p, "%s(%d %d %d)", 
+			first ? "" : " ",
+			d->n, d->is, d->os);
+	       first = 0;
+	  }
+	  p->print(p, ")");
+     } else {
+	  p->print(p, "rank-minfty"); 
+     }
+}
diff --git a/src/fftw3/kernel/tensor1.c b/src/fftw3/kernel/tensor1.c
new file mode 100644
index 0000000..06ad4dc
--- /dev/null
+++ b/src/fftw3/kernel/tensor1.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: tensor1.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+tensor *X(mktensor_0d)(void)
+{
+     return X(mktensor(0));
+}
+
+tensor *X(mktensor_1d)(int n, int is, int os)
+{
+     tensor *x = X(mktensor)(1);
+     x->dims[0].n = n;
+     x->dims[0].is = is;
+     x->dims[0].os = os;
+     return x;
+}
diff --git a/src/fftw3/kernel/tensor2.c b/src/fftw3/kernel/tensor2.c
new file mode 100644
index 0000000..a45e164
--- /dev/null
+++ b/src/fftw3/kernel/tensor2.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: tensor2.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+tensor *X(mktensor_2d)(int n0, int is0, int os0,
+                      int n1, int is1, int os1)
+{
+     tensor *x = X(mktensor)(2);
+     x->dims[0].n = n0;
+     x->dims[0].is = is0;
+     x->dims[0].os = os0;
+     x->dims[1].n = n1;
+     x->dims[1].is = is1;
+     x->dims[1].os = os1;
+     return x;
+}
+
diff --git a/src/fftw3/kernel/tensor4.c b/src/fftw3/kernel/tensor4.c
new file mode 100644
index 0000000..9b6cd28
--- /dev/null
+++ b/src/fftw3/kernel/tensor4.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: tensor4.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+int X(tensor_max_index)(const tensor *sz)
+{
+     int i;
+     int n = 0;
+
+     A(FINITE_RNK(sz->rnk));
+     for (i = 0; i < sz->rnk; ++i) {
+          const iodim *p = sz->dims + i;
+          n += (p->n - 1) * X(imax)(X(iabs)(p->is), X(iabs)(p->os));
+     }
+     return n;
+}
+
+#define tensor_min_xstride(sz, xs) {			\
+     A(FINITE_RNK(sz->rnk));				\
+     if (sz->rnk == 0) return 0;			\
+     else {						\
+          int i;					\
+          int s = X(iabs)(sz->dims[0].xs);		\
+          for (i = 1; i < sz->rnk; ++i)			\
+               s = X(imin)(s, X(iabs)(sz->dims[i].xs));	\
+          return s;					\
+     }							\
+}
+
+int X(tensor_min_istride)(const tensor *sz) tensor_min_xstride(sz, is)
+int X(tensor_min_ostride)(const tensor *sz) tensor_min_xstride(sz, os)
+
+int X(tensor_min_stride)(const tensor *sz)
+{
+     return X(imin)(X(tensor_min_istride)(sz), X(tensor_min_ostride)(sz));
+}
+
+int X(tensor_inplace_strides)(const tensor *sz)
+{
+     int i;
+     A(FINITE_RNK(sz->rnk));
+     for (i = 0; i < sz->rnk; ++i) {
+          const iodim *p = sz->dims + i;
+          if (p->is != p->os)
+               return 0;
+     }
+     return 1;
+}
+
+int X(tensor_inplace_strides2)(const tensor *a, const tensor *b)
+{
+     return X(tensor_inplace_strides(a)) && X(tensor_inplace_strides(b));
+}
diff --git a/src/fftw3/kernel/tensor5.c b/src/fftw3/kernel/tensor5.c
new file mode 100644
index 0000000..8144930
--- /dev/null
+++ b/src/fftw3/kernel/tensor5.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: tensor5.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+static void dimcpy(iodim *dst, const iodim *src, int rnk)
+{
+     int i;
+     if (FINITE_RNK(rnk))
+          for (i = 0; i < rnk; ++i)
+               dst[i] = src[i];
+}
+
+tensor *X(tensor_copy)(const tensor *sz)
+{
+     tensor *x = X(mktensor)(sz->rnk);
+     dimcpy(x->dims, sz->dims, sz->rnk);
+     return x;
+}
+
+/* like X(tensor_copy), but makes strides in-place by
+   setting os = is if k == INPLACE_IS or is = os if k == INPLACE_OS. */
+tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k)
+{
+     tensor *x = X(tensor_copy)(sz);
+     if (FINITE_RNK(x->rnk)) {
+	  int i;
+	  if (k == INPLACE_OS)
+	       for (i = 0; i < x->rnk; ++i)
+		    x->dims[i].is = x->dims[i].os;
+	  else
+	       for (i = 0; i < x->rnk; ++i)
+		    x->dims[i].os = x->dims[i].is;
+     }
+     return x;
+}
+
+/* Like X(tensor_copy), but copy all of the dimensions *except*
+   except_dim. */
+tensor *X(tensor_copy_except)(const tensor *sz, int except_dim)
+{
+     tensor *x;
+
+     A(FINITE_RNK(sz->rnk) && sz->rnk >= 1 && except_dim < sz->rnk);
+     x = X(mktensor)(sz->rnk - 1);
+     dimcpy(x->dims, sz->dims, except_dim);
+     dimcpy(x->dims + except_dim, sz->dims + except_dim + 1,
+            x->rnk - except_dim);
+     return x;
+}
+
+/* Like X(tensor_copy), but copy only rnk dimensions starting
+   with start_dim. */
+tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk)
+{
+     tensor *x;
+
+     A(FINITE_RNK(sz->rnk) && start_dim + rnk <= sz->rnk);
+     x = X(mktensor)(rnk);
+     dimcpy(x->dims, sz->dims + start_dim, rnk);
+     return x;
+}
+
+tensor *X(tensor_append)(const tensor *a, const tensor *b)
+{
+     if (!FINITE_RNK(a->rnk) || !FINITE_RNK(b->rnk)) {
+          return X(mktensor)(RNK_MINFTY);
+     } else {
+	  tensor *x = X(mktensor)(a->rnk + b->rnk);
+          dimcpy(x->dims, a->dims, a->rnk);
+          dimcpy(x->dims + a->rnk, b->dims, b->rnk);
+	  return x;
+     }
+}
diff --git a/src/fftw3/kernel/tensor7.c b/src/fftw3/kernel/tensor7.c
new file mode 100644
index 0000000..ceae2c4
--- /dev/null
+++ b/src/fftw3/kernel/tensor7.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: tensor7.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+/* total order among iodim's */
+int X(dimcmp)(const iodim *a, const iodim *b)
+{
+     if (b->is != a->is)
+          return (b->is - a->is);	/* shorter strides go later */
+     if (b->os != a->os)
+          return (b->os - a->os);	/* shorter strides go later */
+     return (int)(a->n - b->n);	        /* larger n's go later */
+}
+
+/* Like tensor_copy, but eliminate n == 1 dimensions, which
+   never affect any transform or transform vector.
+ 
+   Also, we sort the tensor into a canonical order of decreasing
+   is.  In general, processing a loop/array in order of
+   decreasing stride will improve locality; sorting also makes the
+   analysis in fftw_tensor_contiguous (below) easier.  The choice
+   of is over os is mostly arbitrary, and hopefully
+   shouldn't affect things much.  Normally, either the os will be
+   in the same order as is (for e.g. multi-dimensional
+   transforms) or will be in opposite order (e.g. for Cooley-Tukey
+   recursion).  (Both forward and backwards traversal of the tensor
+   are considered e.g. by vrank-geq1, so sorting in increasing
+   vs. decreasing order is not really important.) */
+tensor *X(tensor_compress)(const tensor *sz)
+{
+     int i, rnk;
+     tensor *x;
+
+     A(FINITE_RNK(sz->rnk));
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+          A(sz->dims[i].n > 0);
+          if (sz->dims[i].n != 1)
+               ++rnk;
+     }
+
+     x = X(mktensor)(rnk);
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+          if (sz->dims[i].n != 1)
+               x->dims[rnk++] = sz->dims[i];
+     }
+
+     if (rnk) {
+	  /* God knows how qsort() behaves if n==0 */
+	  qsort(x->dims, (size_t)x->rnk, sizeof(iodim),
+		(int (*)(const void *, const void *))X(dimcmp));
+     }
+
+     return x;
+}
+
+/* Return whether the strides of a and b are such that they form an
+   effective contiguous 1d array.  Assumes that a.is >= b.is. */
+static int strides_contig(iodim *a, iodim *b)
+{
+     return (a->is == b->is * (int)b->n &&
+             a->os == b->os * (int)b->n);
+}
+
+/* Like tensor_compress, but also compress into one dimension any
+   group of dimensions that form a contiguous block of indices with
+   some stride.  (This can safely be done for transform vector sizes.) */
+tensor *X(tensor_compress_contiguous)(const tensor *sz)
+{
+     int i, rnk;
+     tensor *sz2, *x;
+
+     if (X(tensor_sz)(sz) == 0) 
+	  return X(mktensor)(RNK_MINFTY);
+
+     sz2 = X(tensor_compress)(sz);
+     A(FINITE_RNK(sz2->rnk));
+
+     if (sz2->rnk < 2)		/* nothing to compress */
+          return sz2;
+
+     for (i = rnk = 1; i < sz2->rnk; ++i)
+          if (!strides_contig(sz2->dims + i - 1, sz2->dims + i))
+               ++rnk;
+
+     x = X(mktensor)(rnk);
+     x->dims[0] = sz2->dims[0];
+     for (i = rnk = 1; i < sz2->rnk; ++i) {
+          if (strides_contig(sz2->dims + i - 1, sz2->dims + i)) {
+               x->dims[rnk - 1].n *= sz2->dims[i].n;
+               x->dims[rnk - 1].is = sz2->dims[i].is;
+               x->dims[rnk - 1].os = sz2->dims[i].os;
+          } else {
+               A(rnk < x->rnk);
+               x->dims[rnk++] = sz2->dims[i];
+          }
+     }
+
+     X(tensor_destroy)(sz2);
+     return x;
+}
+
+/* The inverse of X(tensor_append): splits the sz tensor into
+   tensor a followed by tensor b, where a's rank is arnk. */
+void X(tensor_split)(const tensor *sz, tensor **a, int arnk, tensor **b)
+{
+     A(FINITE_RNK(sz->rnk) && FINITE_RNK(arnk));
+
+     *a = X(tensor_copy_sub)(sz, 0, arnk);
+     *b = X(tensor_copy_sub)(sz, arnk, sz->rnk - arnk);
+}
diff --git a/src/fftw3/kernel/tensor8.c b/src/fftw3/kernel/tensor8.c
new file mode 100644
index 0000000..05e9b47
--- /dev/null
+++ b/src/fftw3/kernel/tensor8.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: tensor8.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+void X(tensor_destroy2)(tensor *a, tensor *b)
+{
+     X(tensor_destroy)(a);
+     X(tensor_destroy)(b);
+}
+
+void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d)
+{
+     X(tensor_destroy2)(a, b);
+     X(tensor_destroy2)(c, d);
+}
diff --git a/src/fftw3/kernel/tensor9.c b/src/fftw3/kernel/tensor9.c
new file mode 100644
index 0000000..33ddf45
--- /dev/null
+++ b/src/fftw3/kernel/tensor9.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: tensor9.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+int X(tensor_kosherp)(const tensor *x)
+{
+     int i;
+
+     if (x->rnk < 0) return 0;
+
+     if (FINITE_RNK(x->rnk)) {
+	  for (i = 0; i < x->rnk; ++i)
+	       if (x->dims[i].n < 0)
+		    return 0;
+     }
+     return 1;
+}
diff --git a/src/fftw3/kernel/timer.c b/src/fftw3/kernel/timer.c
new file mode 100644
index 0000000..72969ad
--- /dev/null
+++ b/src/fftw3/kernel/timer.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: timer.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+#include "ifftw.h"
+
+#ifdef HAVE_UNISTD_H
+#  include <unistd.h>
+#endif
+
+#ifndef WITH_SLOW_TIMER
+#  include "cycle.h"
+#else
+#  if TIME_WITH_SYS_TIME
+#   include <sys/time.h>
+#   include <time.h>
+#  else
+#   if HAVE_SYS_TIME_H
+#    include <sys/time.h>
+#   else
+#    include <time.h>
+#   endif
+#  endif
+#endif
+
+#ifndef FFTW_TIME_LIMIT
+#define FFTW_TIME_LIMIT 2.0  /* don't run for more than two seconds */
+#endif
+
+#ifdef HAVE_BSDGETTIMEOFDAY
+#ifndef HAVE_GETTIMEOFDAY
+#define gettimeofday BSDgettimeofday
+#define HAVE_GETTIMEOFDAY 1
+#endif
+#endif
+
+#if defined(HAVE_GETTIMEOFDAY) && !defined(HAVE_SECONDS_TIMER)
+typedef struct timeval seconds;
+
+static seconds getseconds(void)
+{
+     struct timeval tv;
+     gettimeofday(&tv, 0);
+     return tv;
+}
+
+static double elapsed_sec(seconds t1, seconds t0)
+{
+     return (double)(t1.tv_sec - t0.tv_sec) +
+	  (double)(t1.tv_usec - t0.tv_usec) * 1.0E-6;
+}
+
+#  define TIME_MIN_SEC 1.0e-2 /* from fftw2 */
+#  define HAVE_SECONDS_TIMER
+#endif
+
+#ifndef HAVE_SECONDS_TIMER
+#  include <time.h>
+
+typedef clock_t seconds;
+
+static seconds getseconds(void) { return clock(); }
+
+static double elapsed_sec(seconds t1, seconds t0)
+{
+     return ((double) (t1 - t0)) / CLOCKS_PER_SEC;
+}
+
+#  define TIME_MIN_SEC 2.0e-1 /* from fftw2 */
+#  define HAVE_SECONDS_TIMER
+#endif
+
+#ifdef WITH_SLOW_TIMER
+/* excruciatingly slow; only use this if there is no choice! */
+typedef seconds ticks;
+#  define getticks getseconds
+#  define elapsed elapsed_sec
+#  define TIME_MIN TIME_MIN_SEC
+#  define TIME_REPEAT 4 /* from fftw2 */
+#  define HAVE_TICK_COUNTER
+#endif
+
+#ifdef HAVE_TICK_COUNTER
+
+#  ifndef TIME_MIN
+#    define TIME_MIN 100.0
+#  endif
+
+#  ifndef TIME_REPEAT
+#    define TIME_REPEAT 8
+#  endif
+
+  static double measure(plan *pln, const problem *p, int iter)
+  {
+       ticks t0, t1;
+       int i;
+
+       t0 = getticks();
+       for (i = 0; i < iter; ++i) 
+	    pln->adt->solve(pln, p);
+       t1 = getticks();
+       return elapsed(t1, t0);
+  }
+
+
+  double X(measure_execution_time)(plan *pln, const problem *p)
+  {
+       seconds begin, now;
+       double t, tmax, tmin;
+       int iter;
+       int repeat;
+
+       AWAKE(pln, 1);
+       p->adt->zero(p);
+
+  start_over:
+       for (iter = 1; iter; iter *= 2) {
+	    tmin = 1.0E10;
+	    tmax = -1.0E10;
+
+	    begin = getseconds();
+	    /* repeat the measurement TIME_REPEAT times */
+	    for (repeat = 0; repeat < TIME_REPEAT; ++repeat) {
+		 t = measure(pln, p, iter);
+
+		 if (t < 0)
+		      goto start_over;
+
+		 if (t < tmin)
+		      tmin = t;
+		 if (t > tmax)
+		      tmax = t;
+
+		 /* do not run for too long */
+		 now = getseconds();
+		 t = elapsed_sec(now, begin);
+
+		 if (t > FFTW_TIME_LIMIT)
+		      break;
+	    }
+
+	    if (tmin >= TIME_MIN) {
+		 tmin /= (double) iter;
+		 tmax /= (double) iter;
+		 AWAKE(pln, 0);
+		 return tmin;
+	    }
+       }
+       goto start_over; /* may happen if timer is screwed up */
+  }
+
+#else /* no cycle counter */
+
+  double X(measure_execution_time)(plan *pln, const problem *p)
+  {
+       UNUSED(p);
+       UNUSED(pln);
+       return -1.0;
+  }
+
+#endif
diff --git a/src/fftw3/kernel/transpose.c b/src/fftw3/kernel/transpose.c
new file mode 100644
index 0000000..fb489bd
--- /dev/null
+++ b/src/fftw3/kernel/transpose.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* transposes of unit-stride arrays, including arrays of N-tuples and
+   non-square matrices, using cache-oblivious recursive algorithms */
+
+#include "ifftw.h"
+#include <string.h> /* memcpy */
+
+#define CUTOFF 8 /* size below which we do a naive transpose */
+
+/*************************************************************************/
+/* some utilities for the solvers */
+
+static int Ntuple_transposable(const iodim *a, const iodim *b,
+			       int vl, int s, R *ri, R *ii)
+{
+     return(2 == s && (ii == ri + 1 || ri == ii + 1)
+	    &&
+	    ((a->is == b->os && a->is == (vl*2)
+	      && a->os == b->n * (vl*2) && b->is == a->n * (vl*2))
+	     ||
+	     (a->os == b->is && a->os == (vl*2)
+	      && a->is == b->n * (vl*2) && b->os == a->n * (vl*2))));
+}
+
+
+/* our solvers' transpose routines work for square matrices of arbitrary
+   stride, or for non-square matrices of a given vl*vl2 corresponding
+   to the N of the Ntuple with vl2 == s. */
+int X(transposable)(const iodim *a, const iodim *b,
+		    int vl, int s, R *ri, R *ii)
+{
+     return ((a->n == b->n && a->os == b->is && a->is == b->os)
+	     || Ntuple_transposable(a, b, vl, s, ri, ii));
+}
+
+static int gcd(int a, int b)
+{
+     int r;
+     do {
+	  r = a % b;
+	  a = b;
+	  b = r;
+     } while (r != 0);
+     
+     return a;
+}
+
+/* all of the solvers need to extract n, m, d, n/d, and m/d from the
+   two iodims, so we put it here to save code space */
+void X(transpose_dims)(const iodim *a, const iodim *b,
+		       int *n, int *m, int *d, int *nd, int *md)
+{
+     int n0, m0, d0;
+     /* matrix should be n x m, row-major */
+     if (a->is < b->is) {
+	  *n = n0 = b->n;
+	  *m = m0 = a->n;
+     }
+     else {
+	  *n = n0 = a->n;
+	  *m = m0 = b->n;
+     }
+     *d = d0 = gcd(n0, m0);
+     *nd = n0 / d0;
+     *md = m0 / d0;
+}
+
+/* use the simple square transpose in the solver for square matrices
+   that aren't too big or which have the wrong stride */
+int X(transpose_simplep)(const iodim *a, const iodim *b, int vl, int s,
+			 R *ri, R *ii)
+{
+     return (a->n == b->n &&
+	     (a->n*(vl*2) < CUTOFF 
+	      ||  !Ntuple_transposable(a, b, vl, s, ri, ii)));
+}
+
+/* use the slow general transpose if the buffer would be more than 1/8
+   the whole transpose and the transpose is fairly big.
+   (FIXME: use the CONSERVE_MEMORY flag?) */
+int X(transpose_slowp)(const iodim *a, const iodim *b, int N)
+{
+     int d = gcd(a->n, b->n);
+     return (d < 8 && (a->n * b->n * N) / d > 65536);
+}
+
+/*************************************************************************/
+/* Out-of-place transposes: */
+
+/* Transpose A (n x m) to B (m x n), where A and B are stored
+   as n x fda and m x fda arrays, respectively, operating on N-tuples: */
+static void rec_transpose_Ntuple(R *A, R *B, int n, int m, int fda, int fdb,
+			  int N)
+{
+     if (n == 1 || m == 1 || (n + m) * N < CUTOFF*2) {
+	  int i, j, k;
+	  for (i = 0; i < n; ++i) {
+	       for (j = 0; j < m; ++j) {
+		    for (k = 0; k < N; ++k) { /* FIXME: unroll */
+			 B[(j*fdb + i) * N + k] = A[(i*fda + j) * N + k];
+		    }
+	       }
+	  }
+     }
+     else if (n > m) {
+	  int n2 = n / 2;
+	  rec_transpose_Ntuple(A, B, n2, m, fda, fdb, N);
+	  rec_transpose_Ntuple(A + n2*N*fda, B + n2*N, n - n2, m, fda, fdb, N);
+     }
+     else {
+	  int m2 = m / 2;
+	  rec_transpose_Ntuple(A, B, n, m2, fda, fdb, N);
+	  rec_transpose_Ntuple(A + m2*N, B + m2*N*fdb, n, m - m2, fda, fdb, N);
+     }
+}
+
+/*************************************************************************/
+/* In-place transposes of square matrices of N-tuples: */
+
+/* Transpose both A and B, where A is n x m and B is m x n, storing
+   the transpose of A in B and the transpose of B in A.  A and B
+   are actually stored as n x fda and m x fda arrays. */
+static void rec_transpose_swap_Ntuple(R *A, R *B, int n, int m, int fda, int N)
+{
+     if (n == 1 || m == 1 || (n + m) * N <= CUTOFF*2) {
+	  switch (N) {
+	      case 1: {
+		   int i, j;
+		   for (i = 0; i < n; ++i) {
+			for (j = 0; j < m; ++j) {
+			     R a = A[(i*fda + j)];
+			     A[(i*fda + j)] = B[(j*fda + i)];
+			     B[(j*fda + i)] = a;
+			}
+		   }
+		   break;
+	      }
+	      case 2: {
+		   int i, j;
+		   for (i = 0; i < n; ++i) {
+			for (j = 0; j < m; ++j) {
+			     R a0 = A[(i*fda + j) * 2 + 0];
+			     R a1 = A[(i*fda + j) * 2 + 1];
+			     A[(i*fda + j) * 2 + 0] = B[(j*fda + i) * 2 + 0];
+			     A[(i*fda + j) * 2 + 1] = B[(j*fda + i) * 2 + 1];
+			     B[(j*fda + i) * 2 + 0] = a0;
+			     B[(j*fda + i) * 2 + 1] = a1;
+			}
+		   }
+		   break;
+	      }
+	      default: {
+		   int i, j, k;
+		   for (i = 0; i < n; ++i) {
+			for (j = 0; j < m; ++j) {
+			     for (k = 0; k < N; ++k) {
+				  R a = A[(i*fda + j) * N + k];
+				  A[(i*fda + j) * N + k] = 
+				       B[(j*fda + i) * N + k];
+				  B[(j*fda + i) * N + k] = a;
+			     }
+			}
+		   }
+	      }
+	  }
+     } else if (n > m) {
+	  int n2 = n / 2;
+	  rec_transpose_swap_Ntuple(A, B, n2, m, fda, N);
+	  rec_transpose_swap_Ntuple(A + n2*N*fda, B + n2*N, n - n2, m, fda, N);
+     }
+     else {
+	  int m2 = m / 2;
+	  rec_transpose_swap_Ntuple(A, B, n, m2, fda, N);
+	  rec_transpose_swap_Ntuple(A + m2*N, B + m2*N*fda, n, m - m2, fda, N);
+     }
+}
+
+/* Transpose A, an n x n matrix (stored as n x fda), in-place. */
+static void rec_transpose_sq_ip_Ntuple(R *A, int n, int fda, int N)
+{
+     if (n == 1)
+	  return;
+     else if (n*N <= CUTOFF) {
+	  switch (N) {
+	      case 1: {
+		   int i, j;
+		   for (i = 0; i < n; ++i) {
+			for (j = i + 1; j < n; ++j) {
+			     R a = A[(i*fda + j)];
+			     A[(i*fda + j)] = A[(j*fda + i)];
+			     A[(j*fda + i)] = a;
+			}
+		   }
+		   break;
+	      }
+	      case 2: {
+		   int i, j;
+		   for (i = 0; i < n; ++i) {
+			for (j = i + 1; j < n; ++j) {
+			     R a0 = A[(i*fda + j) * 2 + 0];
+			     R a1 = A[(i*fda + j) * 2 + 1];
+			     A[(i*fda + j) * 2 + 0] = A[(j*fda + i) * 2 + 0];
+			     A[(i*fda + j) * 2 + 1] = A[(j*fda + i) * 2 + 1];
+			     A[(j*fda + i) * 2 + 0] = a0;
+			     A[(j*fda + i) * 2 + 1] = a1;
+			}
+		   }
+		   break;
+	      }
+	      default: {
+		   int i, j, k;
+		   for (i = 0; i < n; ++i) {
+			for (j = i + 1; j < n; ++j) {
+			     for (k = 0; k < N; ++k) {
+				  R a = A[(i*fda + j) * N + k];
+				  A[(i*fda + j) * N + k] = 
+				       A[(j*fda + i) * N + k];
+				  A[(j*fda + i) * N + k] = a;
+			     }
+			}
+		   }
+	      }
+	  }
+     } else {
+	  int n2 = n / 2;
+	  rec_transpose_sq_ip_Ntuple(A, n2, fda, N);
+	  rec_transpose_sq_ip_Ntuple((A + n2*N) + n2*N*fda, n - n2, fda, N);
+	  rec_transpose_swap_Ntuple(A + n2*N, A + n2*N*fda, n2, n - n2, fda,N);
+     }
+}
+
+/*************************************************************************/
+/* In-place transposes of non-square matrices: */
+
+/* Transpose the matrix A in-place, where A is an (n*d) x (m*d) matrix
+   of N-tuples and buf contains at least n*m*d*N elements.  In
+   general, to transpose a p x q matrix, you should call this routine
+   with d = gcd(p, q), n = p/d, and m = q/d. */
+void X(transpose)(R *A, int n, int m, int d, int N, R *buf)
+{
+     A(n > 0 && m > 0 && N > 0 && d > 0);
+     if (d == 1) {
+	  rec_transpose_Ntuple(A, buf, n,m, m,n, N);
+	  memcpy(A, buf, m*n*N*sizeof(R));
+     }
+     else if (n*m == 1) {
+	  rec_transpose_sq_ip_Ntuple(A, d, d, N);
+     }
+     else {
+	  int i, num_el = n*m*d*N;
+
+	  /* treat as (d x n) x (d' x m) matrix.  (d' = d) */
+
+	  /* First, transpose d x (n x d') x m to d x (d' x n) x m,
+	     using the buf matrix.  This consists of d transposes
+	     of contiguous n x d' matrices of m-tuples. */
+	  if (n > 1) {
+	       for (i = 0; i < d; ++i) {
+		    rec_transpose_Ntuple(A + i*num_el, buf,
+					 n,d, d,n, m*N);
+		    memcpy(A + i*num_el, buf, num_el*sizeof(R));
+	       }
+	  }
+	  
+	  /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
+	     is a square in-place transpose of n*m-tuples: */
+	  rec_transpose_sq_ip_Ntuple(A, d, d, n*m*N);
+
+	  /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
+	     using the buf matrix.  This consists of d' transposes
+	     of contiguous d*n x m matrices. */
+	  if (m > 1) {
+	       for (i = 0; i < d; ++i) {
+		    rec_transpose_Ntuple(A + i*num_el, buf,
+					 d*n,m, m,d*n, N);
+		    memcpy(A + i*num_el, buf, num_el*sizeof(R));
+	       }
+	  }
+     }
+}
+
+/*************************************************************************/
+/* In-place transpose routine from TOMS.  This routine is much slower
+   than the cache-oblivious algorithm above, but is has the advantage
+   of requiring less buffer space for the case of gcd(nx,ny) small. */
+
+/*
+ * TOMS Transpose.  Revised version of algorithm 380.
+ * 
+ * These routines do in-place transposes of arrays.
+ * 
+ * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, 
+ *   vol. 3, no. 1, 104-110 (1977) ]
+ * 
+ * C version by Steven G. Johnson. February 1997.
+ */
+
+/*
+ * "a" is a 1D array of length ny*nx*N which constains the nx x ny
+ * matrix of N-tuples to be transposed.  "a" is stored in row-major
+ * order (last index varies fastest).  move is a 1D array of length
+ * move_size used to store information to speed up the process.  The
+ * value move_size=(ny+nx)/2 is recommended.  buf should be an array
+ * of length 2*N.
+ * 
+ */
+
+void X(transpose_slow)(R *a, int nx, int ny, int N,
+		       char *move, int move_size, R *buf)
+{
+     int i, j, im, mn;
+     R *b, *c, *d;
+     int ncount;
+     int k;
+     
+     /* check arguments and initialize: */
+     A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
+     
+     b = buf;
+     
+     if (ny == nx) {
+	  /*
+	   * if matrix is square, exchange elements a(i,j) and a(j,i):
+	   */
+	  for (i = 0; i < nx; ++i)
+	       for (j = i + 1; j < nx; ++j) {
+		    memcpy(b, &a[N * (i + j * nx)], N * sizeof(R));
+		    memcpy(&a[N * (i + j * nx)], &a[N * (j + i * nx)], N * sizeof(R));
+		    memcpy(&a[N * (j + i * nx)], b, N * sizeof(R));
+	       }
+	  return;
+     }
+     c = buf + N;
+     ncount = 2;		/* always at least 2 fixed points */
+     k = (mn = ny * nx) - 1;
+     
+     for (i = 0; i < move_size; ++i)
+	  move[i] = 0;
+     
+     if (ny >= 3 && nx >= 3)
+	  ncount += gcd(ny - 1, nx - 1) - 1;	/* # fixed points */
+     
+     i = 1;
+     im = ny;
+     
+     while (1) {
+	  int i1, i2, i1c, i2c;
+	  int kmi;
+	  
+	  /** Rearrange the elements of a loop
+	      and its companion loop: **/
+	  
+	  i1 = i;
+	  kmi = k - i;
+	  memcpy(b, &a[N * i1], N * sizeof(R));
+	  i1c = kmi;
+	  memcpy(c, &a[N * i1c], N * sizeof(R));
+	  
+	  while (1) {
+	       i2 = ny * i1 - k * (i1 / nx);
+	       i2c = k - i2;
+	       if (i1 < move_size)
+		    move[i1] = 1;
+	       if (i1c < move_size)
+		    move[i1c] = 1;
+	       ncount += 2;
+	       if (i2 == i)
+		    break;
+	       if (i2 == kmi) {
+		    d = b;
+		    b = c;
+		    c = d;
+		    break;
+	       }
+	       memcpy(&a[N * i1], &a[N * i2], 
+		      N * sizeof(R));
+	       memcpy(&a[N * i1c], &a[N * i2c], 
+		      N * sizeof(R));
+	       i1 = i2;
+	       i1c = i2c;
+	  }
+	  memcpy(&a[N * i1], b, N * sizeof(R));
+	  memcpy(&a[N * i1c], c, N * sizeof(R));
+	  
+	  if (ncount >= mn)
+	       break;	/* we've moved all elements */
+	  
+	  /** Search for loops to rearrange: **/
+	  
+	  while (1) {
+	       int max = k - i;
+	       ++i;
+	       A(i <= max);
+	       im += ny;
+	       if (im > k)
+		    im -= k;
+	       i2 = im;
+	       if (i == i2)
+		    continue;
+	       if (i >= move_size) {
+		    while (i2 > i && i2 < max) {
+			 i1 = i2;
+			 i2 = ny * i1 - k * (i1 / nx);
+		    }
+		    if (i2 == i)
+			 break;
+	       } else if (!move[i])
+		    break;
+	  }
+     }
+}
diff --git a/src/fftw3/kernel/trig.c b/src/fftw3/kernel/trig.c
new file mode 100644
index 0000000..0d4c3b2
--- /dev/null
+++ b/src/fftw3/kernel/trig.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: trig.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+/* trigonometric functions */
+#include "ifftw.h"
+#include <math.h>
+
+trigreal X(cos2pi)(int m, int n)
+{
+     return X(sincos)((trigreal)m, (trigreal)n, 0);
+}
+
+trigreal X(sin2pi)(int m, int n)
+{
+     return X(sincos)((trigreal)m, (trigreal)n, 1);
+}
+
+trigreal X(tan2pi)(int m, int n)
+{
+#if 0      /* unimplemented, unused */
+     trigreal dm = m, dn = n;
+     return TAN(by2pi(dm, dn));
+#endif
+     UNUSED(m); UNUSED(n);
+     return 0.0;
+}
diff --git a/src/fftw3/kernel/trig1.c b/src/fftw3/kernel/trig1.c
new file mode 100644
index 0000000..b86362c
--- /dev/null
+++ b/src/fftw3/kernel/trig1.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: trig1.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+/* trigonometric functions */
+#include "ifftw.h"
+#include <math.h>
+
+#ifdef FFTW_LDOUBLE
+#  define COS cosl
+#  define SIN sinl
+#  define TAN tanl
+#  define KTRIG(x) (x##L)
+#else
+#  define COS cos
+#  define SIN sin
+#  define TAN tan
+#  define KTRIG(x) (x)
+#endif
+
+static const trigreal K2PI =
+    KTRIG(6.2831853071795864769252867665590057683943388);
+#define by2pi(m, n) ((K2PI * (m)) / (n))
+
+/*
+ * Improve accuracy by reducing x to range [0..1/8]
+ * before multiplication by 2 * PI.
+ */
+
+trigreal X(sincos)(trigreal m, trigreal n, int sinp)
+{
+     /* waiting for C to get tail recursion... */
+     trigreal half_n = n * KTRIG(0.5);
+     trigreal quarter_n = half_n * KTRIG(0.5);
+     trigreal eighth_n = quarter_n * KTRIG(0.5);
+     trigreal sgn = KTRIG(1.0);
+
+     if (sinp) goto sin;
+ cos:
+     if (m < 0) { m = -m; /* goto cos; */ }
+     if (m > half_n) { m = n - m; goto cos; }
+     if (m > eighth_n) { m = quarter_n - m; goto sin; }
+     return sgn * COS(by2pi(m, n));
+
+ msin:
+     sgn = -sgn;
+ sin:
+     if (m < 0) { m = -m; goto msin; }
+     if (m > half_n) { m = n - m; goto msin; }
+     if (m > eighth_n) { m = quarter_n - m; goto cos; }
+     return sgn * SIN(by2pi(m, n));
+}
diff --git a/src/fftw3/kernel/twiddle.c b/src/fftw3/kernel/twiddle.c
new file mode 100644
index 0000000..785bdd7
--- /dev/null
+++ b/src/fftw3/kernel/twiddle.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2003 Matteo Frigo
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* $Id: twiddle.c,v 1.1 2008/10/17 06:11:29 scuri Exp $ */
+
+/* Twiddle manipulation */
+
+#include "ifftw.h"
+#include <math.h>
+
+/* table of known twiddle factors */
+static twid *twlist = (twid *) 0;
+
+static int equal_instr(const tw_instr *p, const tw_instr *q)
+{
+     if (p == q)
+          return 1;
+
+     for (;; ++p, ++q) {
+          if (p->op != q->op || p->v != q->v || p->i != q->i)
+               return 0;
+          if (p->op == TW_NEXT)  /* == q->op */
+               return 1;
+     }
+     A(0 /* can't happen */);
+}
+
+static int ok_twid(const twid *t, const tw_instr *q, int n, int r, int m)
+{
+     return (n == t->n && r == t->r && m <= t->m && equal_instr(t->instr, q));
+}
+
+static twid *lookup(const tw_instr *q, int n, int r, int m)
+{
+     twid *p;
+
+     for (p = twlist; p && !ok_twid(p, q, n, r, m); p = p->cdr)
+          ;
+     return p;
+}
+
+static int twlen0(int r, const tw_instr **pp)
+{
+     int ntwiddle = 0;
+     const tw_instr *p = *pp;
+
+     /* compute length of bytecode program */
+     A(r > 0);
+     for ( ; p->op != TW_NEXT; ++p) {
+	  switch (p->op) {
+	      case TW_FULL:
+		   ntwiddle += (r - 1) * 2;
+		   break;
+	      case TW_GENERIC:
+		   ntwiddle += r * 2;
+		   break;
+	      default:
+		   ++ntwiddle;
+	  }
+     }
+
+     *pp = p;
+     return ntwiddle;
+}
+
+int X(twiddle_length)(int r, const tw_instr *p)
+{
+     return twlen0(r, &p);
+}
+
+static R *compute(const tw_instr *instr, int n, int r, int m)
+{
+     int ntwiddle, j;
+     R *W, *W0;
+     const tw_instr *p;
+
+     static trigreal (*const f[])(int, int) = { 
+	  X(cos2pi), X(sin2pi), X(tan2pi) 
+     };
+
+     p = instr;
+     ntwiddle = twlen0(r, &p);
+
+     W0 = W = (R *)MALLOC(ntwiddle * (m / p->v) * sizeof(R), TWIDDLES);
+
+     for (j = 0; j < m; j += p->v) {
+          for (p = instr; p->op != TW_NEXT; ++p) {
+	       switch (p->op) {
+		   case TW_FULL:
+		   {
+			int i;
+			A((int)p->i == r); /* consistency check */
+			for (i = 1; i < r; ++i) {
+			     *W++ = f[TW_COS]((j + p->v) * i, n);
+			     *W++ = f[TW_SIN]((j + p->v) * i, n);
+			}
+			break;
+		   }
+
+		   case TW_GENERIC:
+		   {
+			int i;
+			A(p->v == 0); /* unused */
+			A(p->i == 0); /* unused */
+			for (i = 0; i < r; ++i) {
+			     int k = j * r + i;
+			     *W++ = f[TW_COS](k, n);
+			     *W++ = FFT_SIGN * f[TW_SIN](k, n);
+			}
+			break;
+		   }
+		   
+		   default:
+			*W++ = f[p->op](((signed int)(j + p->v)) * p->i, n);
+			break;
+	       }
+	  }
+          A(m % p->v == 0);
+     }
+
+     return W0;
+}
+
+void X(mktwiddle)(twid **pp, const tw_instr *instr, int n, int r, int m)
+{
+     twid *p;
+
+     if (*pp) return;  /* already created */
+
+     if ((p = lookup(instr, n, r, m))) {
+          ++p->refcnt;
+	  goto done;
+     }
+
+     p = (twid *) MALLOC(sizeof(twid), TWIDDLES);
+     p->n = n;
+     p->r = r;
+     p->m = m;
+     p->instr = instr;
+     p->refcnt = 1;
+     p->W = compute(instr, n, r, m);
+
+     /* cons! onto twlist */
+     p->cdr = twlist;
+     twlist = p;
+
+ done:
+     *pp = p;
+     return;
+}
+
+void X(twiddle_destroy)(twid **pp)
+{
+     twid *p = *pp;
+     if (p) {
+          twid **q;
+          if ((--p->refcnt) == 0) {
+               /* remove p from twiddle list */
+               for (q = &twlist; *q; q = &((*q)->cdr)) {
+                    if (*q == p) {
+                         *q = p->cdr;
+                         X(ifree)(p->W);
+                         X(ifree)(p);
+			 goto done;
+                    }
+               }
+               A(0 /* can't happen */ );
+          }
+     }
+ done:
+     *pp = 0; /* destroy pointer */
+     return;
+}
+
+
+void X(twiddle_awake)(int flg, twid **pp, 
+		      const tw_instr *instr, int n, int r, int m)
+{
+     if (flg) 
+	  X(mktwiddle)(pp, instr, n, r, m);
+     else 
+	  X(twiddle_destroy)(pp);
+}