author    | scuri <scuri> | 2009-08-20 01:10:27 +0000
committer | scuri <scuri> | 2009-08-20 01:10:27 +0000
commit    | 83b3c8b629d96f5fdf754d35d5f4f5369dbfef1d (patch)
tree      | 81242227fb731da262139283c301e58f861e9727
parent    | b014c9fa4544c111796aed1406e704338bacf456 (diff)
*** empty log message ***
37 files changed, 12053 insertions, 1168 deletions
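The bulk of this commit replaces the bundled libJPEG 6b sources with libJPEG 7: the project files gain the new arithmetic-coding modules (jaricom.c, jcarith.c, jdarith.c), jcdctmgr.c switches to per-component forward-DCT routines selected by scaled block size, and jchuff.c absorbs the progressive encoder that previously lived in jcphuff.c. A recurring detail in the jcdctmgr.c hunks is the rounded quantization of DCT coefficients, where the dividend is forced positive because C leaves the rounding of negative integer division implementation-defined. A minimal standalone sketch of that technique (illustrative names, not libJPEG's internal API):

```c
/* Round-half-away-from-zero division of a DCT coefficient by its quantizer,
 * mirroring the DIVIDE_BY logic in jcdctmgr.c below.
 * quantize_coef() is an illustrative helper name, not part of libJPEG. */
#include <stdio.h>

static int quantize_coef(int coef, int qval)
{
  int temp = coef < 0 ? -coef : coef;       /* force dividend positive */
  temp += qval >> 1;                        /* add half the quantizer for rounding */
  temp = (temp >= qval) ? temp / qval : 0;  /* cheap compare: most results are zero */
  return coef < 0 ? -temp : temp;
}

int main(void)
{
  /* -37/16 rounds to -2, 7/16 rounds to 0, 40/16 rounds away from zero to 3 */
  printf("%d %d %d\n", quantize_coef(-37, 16), quantize_coef(7, 16), quantize_coef(40, 16));
  return 0;
}
```

In the library itself this logic sits inside the per-block loop of forward_DCT, and the zero shortcut is dropped when FAST_DIVIDE is defined, as the hunks below show.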
diff --git a/html/en/history.html b/html/en/history.html
index 96404bd..78acfd4 100644
--- a/html/en/history.html
+++ b/html/en/history.html
@@ -26,7 +26,8 @@
     1.2.39. Removed changes to the library that made it incompatible with
     other libPNG distributions.</li>
   <li><span style="color: #008000">Changed:</span> libLZF updated to
-    version 3.5.</li>
+    version 3.5. libJPEG updated to
+    version 7.</li>
   <li><span style="color: #008000">Changed: <span style="color: #000000">
     <strong>imImageCreateBased</strong> and <strong>imConvertColorSpace</strong> now also
diff --git a/html/en/libjpeg.txt b/html/en/libjpeg.txt
index 3937918..2afec09 100644
--- a/html/en/libjpeg.txt
+++ b/html/en/libjpeg.txt
@@ -17,7 +17,7 @@
 with respect to this software, its quality, accuracy, merchantability, or
 fitness for a particular purpose. This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
-This software is copyright (C) 1991-1998, Thomas G. Lane.
+This software is copyright (C) 1991-2009, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 
 Permission is hereby granted to use, copy, modify, and distribute this
diff --git a/include/im_format_all.h b/include/im_format_all.h
index 2a6ab96..881694b 100644
--- a/include/im_format_all.h
+++ b/include/im_format_all.h
@@ -118,9 +118,9 @@ void imFormatRegisterTIFF(void);
 * ISO/IEC 10918 (1994, 1995, 1997, 1999)\n
 * http://www.jpeg.org/
 * \par
- * Access to the JPEG file format uses libJPEG version 6b. \n
+ * Access to the JPEG file format uses libJPEG version 7. \n
 * http://www.ijg.org \n
- * Copyright (C) 1991-1998, Thomas G. Lane \n
+ * Copyright (C) 1994-2009, Thomas G. Lane, Guido Vollbeding \n
 * from the Independent JPEG Group.
 * \par
 * Access to the EXIF attributes uses libEXIF version 0.6.12. \n
@@ -153,7 +153,7 @@ void imFormatRegisterTIFF(void);
 Changes to libJPEG:
   jdatadst.c - fflush and ferror replaced by macros JFFLUSH and JFERROR.
   jinclude.h - standard JFFLUSH and JFERROR definitions, and new macro HAVE_JFIO.
-  jmorecfg.h - changed definition of INT32 to JINT32 for better compatibility.
+  -- jmorecfg.h - changed definition of INT32 to JINT32 for better compatibility.
   jdhuf.c - added support for OJPEG_SUPPORT in libTIFF.
new file created: jconfig.h diff --git a/mak.vc9/im.vcproj b/mak.vc9/im.vcproj index a42553c..a54feb4 100644 --- a/mak.vc9/im.vcproj +++ b/mak.vc9/im.vcproj @@ -403,6 +403,10 @@ Name="libJPEG" > <File + RelativePath="..\src\libjpeg\jaricom.c" + > + </File> + <File RelativePath="..\src\libjpeg\jcapimin.c" > </File> @@ -411,6 +415,10 @@ > </File> <File + RelativePath="..\src\libjpeg\jcarith.c" + > + </File> + <File RelativePath="..\src\libjpeg\jccoefct.c" > </File> @@ -475,6 +483,10 @@ > </File> <File + RelativePath="..\src\libjpeg\jdarith.c" + > + </File> + <File RelativePath="..\src\libjpeg\jdatadst.c" > </File> @@ -495,10 +507,6 @@ > </File> <File - RelativePath="..\src\libjpeg\jdhuff.c" - > - </File> - <File RelativePath="..\src\libjpeg\jdinput.c" > </File> @@ -519,10 +527,6 @@ > </File> <File - RelativePath="..\src\libjpeg\jdphuff.c" - > - </File> - <File RelativePath="..\src\libjpeg\jdpostct.c" > </File> @@ -563,10 +567,6 @@ > </File> <File - RelativePath="..\src\libjpeg\jidctred.c" - > - </File> - <File RelativePath="..\src\libjpeg\jmemmgr.c" > </File> diff --git a/src/config.mak b/src/config.mak index 5e730bb..d2c3c0d 100644 --- a/src/config.mak +++ b/src/config.mak @@ -29,14 +29,14 @@ SRCTIFF := $(addprefix libtiff/, $(SRCTIFF)) im_format_tiff.cpp INCLUDES += libtiff SRCJPEG = \ - jcapimin.c jcmarker.c jdapimin.c jdinput.c jdtrans.c jidctred.c \ + jcapimin.c jcmarker.c jdapimin.c jdinput.c jdtrans.c \ jcapistd.c jcmaster.c jdapistd.c jdmainct.c jerror.c jmemmgr.c \ jccoefct.c jcomapi.c jdatadst.c jdmarker.c jfdctflt.c jmemnobs.c \ jccolor.c jcparam.c jdatasrc.c jdmaster.c jfdctfst.c jquant1.c \ - jcdctmgr.c jcphuff.c jdcoefct.c jdmerge.c jfdctint.c jquant2.c \ - jchuff.c jcprepct.c jdcolor.c jdphuff.c jidctflt.c jutils.c \ - jcinit.c jcsample.c jddctmgr.c jdpostct.c jidctfst.c \ - jcmainct.c jctrans.c jdhuff.c jdsample.c jidctint.c + jcdctmgr.c jdcoefct.c jdmerge.c jfdctint.c jquant2.c \ + jchuff.c jcprepct.c jdcolor.c jidctflt.c jutils.c jdarith.c \ + jcinit.c jcsample.c jddctmgr.c jdpostct.c jidctfst.c jaricom.c \ + jcmainct.c jctrans.c jdhuff.c jdsample.c jidctint.c jcarith.c SRCJPEG := $(addprefix libjpeg/, $(SRCJPEG)) im_format_jpeg.cpp INCLUDES += libjpeg diff --git a/src/libjpeg/jcapimin.c b/src/libjpeg/jcapimin.c index 54fb8c5..563ab42 100644 --- a/src/libjpeg/jcapimin.c +++ b/src/libjpeg/jcapimin.c @@ -63,8 +63,10 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize) cinfo->comp_info = NULL; - for (i = 0; i < NUM_QUANT_TBLS; i++) + for (i = 0; i < NUM_QUANT_TBLS; i++) { cinfo->quant_tbl_ptrs[i] = NULL; + cinfo->q_scale_factor[i] = 100; + } for (i = 0; i < NUM_HUFF_TBLS; i++) { cinfo->dc_huff_tbl_ptrs[i] = NULL; diff --git a/src/libjpeg/jccoefct.c b/src/libjpeg/jccoefct.c index 1963ddb..d775313 100644 --- a/src/libjpeg/jccoefct.c +++ b/src/libjpeg/jccoefct.c @@ -149,6 +149,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) int blkn, bi, ci, yindex, yoffset, blockcnt; JDIMENSION ypos, xpos; jpeg_component_info *compptr; + forward_DCT_ptr forward_DCT; /* Loop to write as much as one whole iMCU row */ for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; @@ -167,17 +168,19 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) blkn = 0; for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; + forward_DCT = cinfo->fdct->forward_DCT[compptr->component_index]; blockcnt = (MCU_col_num < last_MCU_col) ? 
compptr->MCU_width : compptr->last_col_width; xpos = MCU_col_num * compptr->MCU_sample_width; - ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */ + ypos = yoffset * compptr->DCT_v_scaled_size; + /* ypos == (yoffset+yindex) * DCTSIZE */ for (yindex = 0; yindex < compptr->MCU_height; yindex++) { if (coef->iMCU_row_num < last_iMCU_row || yoffset+yindex < compptr->last_row_height) { - (*cinfo->fdct->forward_DCT) (cinfo, compptr, - input_buf[compptr->component_index], - coef->MCU_buffer[blkn], - ypos, xpos, (JDIMENSION) blockcnt); + (*forward_DCT) (cinfo, compptr, + input_buf[compptr->component_index], + coef->MCU_buffer[blkn], + ypos, xpos, (JDIMENSION) blockcnt); if (blockcnt < compptr->MCU_width) { /* Create some dummy blocks at the right edge of the image. */ jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt], @@ -195,7 +198,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) } } blkn += compptr->MCU_width; - ypos += DCTSIZE; + ypos += compptr->DCT_v_scaled_size; } } /* Try to write the MCU. In event of a suspension failure, we will @@ -252,6 +255,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf) jpeg_component_info *compptr; JBLOCKARRAY buffer; JBLOCKROW thisblockrow, lastblockrow; + forward_DCT_ptr forward_DCT; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { @@ -274,15 +278,15 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf) ndummy = (int) (blocks_across % h_samp_factor); if (ndummy > 0) ndummy = h_samp_factor - ndummy; + forward_DCT = cinfo->fdct->forward_DCT[ci]; /* Perform DCT for all non-dummy blocks in this iMCU row. Each call * on forward_DCT processes a complete horizontal row of DCT blocks. */ for (block_row = 0; block_row < block_rows; block_row++) { thisblockrow = buffer[block_row]; - (*cinfo->fdct->forward_DCT) (cinfo, compptr, - input_buf[ci], thisblockrow, - (JDIMENSION) (block_row * DCTSIZE), - (JDIMENSION) 0, blocks_across); + (*forward_DCT) (cinfo, compptr, input_buf[ci], thisblockrow, + (JDIMENSION) (block_row * compptr->DCT_v_scaled_size), + (JDIMENSION) 0, blocks_across); if (ndummy > 0) { /* Create dummy blocks at the right edge of the image. */ thisblockrow += blocks_across; /* => first dummy block */ diff --git a/src/libjpeg/jcdctmgr.c b/src/libjpeg/jcdctmgr.c index 61fa79b..0bbdbb6 100644 --- a/src/libjpeg/jcdctmgr.c +++ b/src/libjpeg/jcdctmgr.c @@ -23,7 +23,7 @@ typedef struct { struct jpeg_forward_dct pub; /* public fields */ /* Pointer to the DCT routine actually in use */ - forward_DCT_method_ptr do_dct; + forward_DCT_method_ptr do_dct[MAX_COMPONENTS]; /* The actual post-DCT divisors --- not identical to the quant table * entries, because of scaling (especially for an unnormalized DCT). @@ -33,7 +33,7 @@ typedef struct { #ifdef DCT_FLOAT_SUPPORTED /* Same as above for the floating-point case. */ - float_DCT_method_ptr do_float_dct; + float_DCT_method_ptr do_float_dct[MAX_COMPONENTS]; FAST_FLOAT * float_divisors[NUM_QUANT_TBLS]; #endif } my_fdct_controller; @@ -41,6 +41,132 @@ typedef struct { typedef my_fdct_controller * my_fdct_ptr; +/* The current scaled-DCT routines require ISLOW-style divisor tables, + * so be sure to compile that code if either ISLOW or SCALING is requested. + */ +#ifdef DCT_ISLOW_SUPPORTED +#define PROVIDE_ISLOW_TABLES +#else +#ifdef DCT_SCALING_SUPPORTED +#define PROVIDE_ISLOW_TABLES +#endif +#endif + + +/* + * Perform forward DCT on one or more blocks of a component. 
+ * + * The input samples are taken from the sample_data[] array starting at + * position start_row/start_col, and moving to the right for any additional + * blocks. The quantized coefficients are returned in coef_blocks[]. + */ + +METHODDEF(void) +forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks) +/* This version is used for integer DCT implementations. */ +{ + /* This routine is heavily used, so it's worth coding it tightly. */ + my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; + forward_DCT_method_ptr do_dct = fdct->do_dct[compptr->component_index]; + DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no]; + DCTELEM workspace[DCTSIZE2]; /* work area for FDCT subroutine */ + JDIMENSION bi; + + sample_data += start_row; /* fold in the vertical offset once */ + + for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) { + /* Perform the DCT */ + (*do_dct) (workspace, sample_data, start_col); + + /* Quantize/descale the coefficients, and store into coef_blocks[] */ + { register DCTELEM temp, qval; + register int i; + register JCOEFPTR output_ptr = coef_blocks[bi]; + + for (i = 0; i < DCTSIZE2; i++) { + qval = divisors[i]; + temp = workspace[i]; + /* Divide the coefficient value by qval, ensuring proper rounding. + * Since C does not specify the direction of rounding for negative + * quotients, we have to force the dividend positive for portability. + * + * In most files, at least half of the output values will be zero + * (at default quantization settings, more like three-quarters...) + * so we should ensure that this case is fast. On many machines, + * a comparison is enough cheaper than a divide to make a special test + * a win. Since both inputs will be nonnegative, we need only test + * for a < b to discover whether a/b is 0. + * If your machine's division is fast enough, define FAST_DIVIDE. + */ +#ifdef FAST_DIVIDE +#define DIVIDE_BY(a,b) a /= b +#else +#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0 +#endif + if (temp < 0) { + temp = -temp; + temp += qval>>1; /* for rounding */ + DIVIDE_BY(temp, qval); + temp = -temp; + } else { + temp += qval>>1; /* for rounding */ + DIVIDE_BY(temp, qval); + } + output_ptr[i] = (JCOEF) temp; + } + } + } +} + + +#ifdef DCT_FLOAT_SUPPORTED + +METHODDEF(void) +forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks) +/* This version is used for floating-point DCT implementations. */ +{ + /* This routine is heavily used, so it's worth coding it tightly. 
*/ + my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; + float_DCT_method_ptr do_dct = fdct->do_float_dct[compptr->component_index]; + FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no]; + FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */ + JDIMENSION bi; + + sample_data += start_row; /* fold in the vertical offset once */ + + for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) { + /* Perform the DCT */ + (*do_dct) (workspace, sample_data, start_col); + + /* Quantize/descale the coefficients, and store into coef_blocks[] */ + { register FAST_FLOAT temp; + register int i; + register JCOEFPTR output_ptr = coef_blocks[bi]; + + for (i = 0; i < DCTSIZE2; i++) { + /* Apply the quantization and scaling factor */ + temp = workspace[i] * divisors[i]; + /* Round to nearest integer. + * Since C does not specify the direction of rounding for negative + * quotients, we have to force the dividend positive for portability. + * The maximum coefficient size is +-16K (for 12-bit data), so this + * code should work for either 16-bit or 32-bit ints. + */ + output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384); + } + } + } +} + +#endif /* DCT_FLOAT_SUPPORTED */ + + /* * Initialize for a processing pass. * Verify that all referenced Q-tables are present, and set up @@ -56,11 +182,170 @@ start_pass_fdctmgr (j_compress_ptr cinfo) my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; int ci, qtblno, i; jpeg_component_info *compptr; + int method = 0; JQUANT_TBL * qtbl; DCTELEM * dtbl; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { + /* Select the proper DCT routine for this component's scaling */ + switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) { +#ifdef DCT_SCALING_SUPPORTED + case ((1 << 8) + 1): + fdct->do_dct[ci] = jpeg_fdct_1x1; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((2 << 8) + 2): + fdct->do_dct[ci] = jpeg_fdct_2x2; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((3 << 8) + 3): + fdct->do_dct[ci] = jpeg_fdct_3x3; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((4 << 8) + 4): + fdct->do_dct[ci] = jpeg_fdct_4x4; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((5 << 8) + 5): + fdct->do_dct[ci] = jpeg_fdct_5x5; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((6 << 8) + 6): + fdct->do_dct[ci] = jpeg_fdct_6x6; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((7 << 8) + 7): + fdct->do_dct[ci] = jpeg_fdct_7x7; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((9 << 8) + 9): + fdct->do_dct[ci] = jpeg_fdct_9x9; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((10 << 8) + 10): + fdct->do_dct[ci] = jpeg_fdct_10x10; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((11 << 8) + 11): + fdct->do_dct[ci] = jpeg_fdct_11x11; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((12 << 8) + 12): + fdct->do_dct[ci] = jpeg_fdct_12x12; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((13 << 8) + 13): + fdct->do_dct[ci] = jpeg_fdct_13x13; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((14 << 8) + 14): + fdct->do_dct[ci] = jpeg_fdct_14x14; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((15 << 8) + 15): + fdct->do_dct[ci] = 
jpeg_fdct_15x15; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((16 << 8) + 16): + fdct->do_dct[ci] = jpeg_fdct_16x16; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((16 << 8) + 8): + fdct->do_dct[ci] = jpeg_fdct_16x8; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((14 << 8) + 7): + fdct->do_dct[ci] = jpeg_fdct_14x7; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((12 << 8) + 6): + fdct->do_dct[ci] = jpeg_fdct_12x6; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((10 << 8) + 5): + fdct->do_dct[ci] = jpeg_fdct_10x5; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((8 << 8) + 4): + fdct->do_dct[ci] = jpeg_fdct_8x4; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((6 << 8) + 3): + fdct->do_dct[ci] = jpeg_fdct_6x3; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((4 << 8) + 2): + fdct->do_dct[ci] = jpeg_fdct_4x2; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((2 << 8) + 1): + fdct->do_dct[ci] = jpeg_fdct_2x1; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((8 << 8) + 16): + fdct->do_dct[ci] = jpeg_fdct_8x16; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((7 << 8) + 14): + fdct->do_dct[ci] = jpeg_fdct_7x14; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((6 << 8) + 12): + fdct->do_dct[ci] = jpeg_fdct_6x12; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((5 << 8) + 10): + fdct->do_dct[ci] = jpeg_fdct_5x10; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((4 << 8) + 8): + fdct->do_dct[ci] = jpeg_fdct_4x8; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((3 << 8) + 6): + fdct->do_dct[ci] = jpeg_fdct_3x6; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((2 << 8) + 4): + fdct->do_dct[ci] = jpeg_fdct_2x4; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; + case ((1 << 8) + 2): + fdct->do_dct[ci] = jpeg_fdct_1x2; + method = JDCT_ISLOW; /* jfdctint uses islow-style table */ + break; +#endif + case ((DCTSIZE << 8) + DCTSIZE): + switch (cinfo->dct_method) { +#ifdef DCT_ISLOW_SUPPORTED + case JDCT_ISLOW: + fdct->do_dct[ci] = jpeg_fdct_islow; + method = JDCT_ISLOW; + break; +#endif +#ifdef DCT_IFAST_SUPPORTED + case JDCT_IFAST: + fdct->do_dct[ci] = jpeg_fdct_ifast; + method = JDCT_IFAST; + break; +#endif +#ifdef DCT_FLOAT_SUPPORTED + case JDCT_FLOAT: + fdct->do_float_dct[ci] = jpeg_fdct_float; + method = JDCT_FLOAT; + break; +#endif + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } + break; + default: + ERREXIT2(cinfo, JERR_BAD_DCTSIZE, + compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size); + break; + } qtblno = compptr->quant_tbl_no; /* Make sure specified quantization table is present */ if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS || @@ -69,8 +354,8 @@ start_pass_fdctmgr (j_compress_ptr cinfo) qtbl = cinfo->quant_tbl_ptrs[qtblno]; /* Compute divisors for this quant table */ /* We may do this more than once for same table, but it's not a big deal */ - switch (cinfo->dct_method) { -#ifdef DCT_ISLOW_SUPPORTED + switch (method) { +#ifdef PROVIDE_ISLOW_TABLES case JDCT_ISLOW: /* For LL&M IDCT method, divisors are equal to raw quantization * coefficients multiplied by 8 (to counteract scaling). 
@@ -84,6 +369,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo) for (i = 0; i < DCTSIZE2; i++) { dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3; } + fdct->pub.forward_DCT[ci] = forward_DCT; break; #endif #ifdef DCT_IFAST_SUPPORTED @@ -122,6 +408,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo) CONST_BITS-3); } } + fdct->pub.forward_DCT[ci] = forward_DCT; break; #endif #ifdef DCT_FLOAT_SUPPORTED @@ -158,6 +445,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo) } } } + fdct->pub.forward_DCT[ci] = forward_DCT_float; break; #endif default: @@ -169,175 +457,6 @@ start_pass_fdctmgr (j_compress_ptr cinfo) /* - * Perform forward DCT on one or more blocks of a component. - * - * The input samples are taken from the sample_data[] array starting at - * position start_row/start_col, and moving to the right for any additional - * blocks. The quantized coefficients are returned in coef_blocks[]. - */ - -METHODDEF(void) -forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY sample_data, JBLOCKROW coef_blocks, - JDIMENSION start_row, JDIMENSION start_col, - JDIMENSION num_blocks) -/* This version is used for integer DCT implementations. */ -{ - /* This routine is heavily used, so it's worth coding it tightly. */ - my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; - forward_DCT_method_ptr do_dct = fdct->do_dct; - DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no]; - DCTELEM workspace[DCTSIZE2]; /* work area for FDCT subroutine */ - JDIMENSION bi; - - sample_data += start_row; /* fold in the vertical offset once */ - - for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { - /* Load data into workspace, applying unsigned->signed conversion */ - { register DCTELEM *workspaceptr; - register JSAMPROW elemptr; - register int elemr; - - workspaceptr = workspace; - for (elemr = 0; elemr < DCTSIZE; elemr++) { - elemptr = sample_data[elemr] + start_col; -#if DCTSIZE == 8 /* unroll the inner loop */ - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; -#else - { register int elemc; - for (elemc = DCTSIZE; elemc > 0; elemc--) { - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - } - } -#endif - } - } - - /* Perform the DCT */ - (*do_dct) (workspace); - - /* Quantize/descale the coefficients, and store into coef_blocks[] */ - { register DCTELEM temp, qval; - register int i; - register JCOEFPTR output_ptr = coef_blocks[bi]; - - for (i = 0; i < DCTSIZE2; i++) { - qval = divisors[i]; - temp = workspace[i]; - /* Divide the coefficient value by qval, ensuring proper rounding. - * Since C does not specify the direction of rounding for negative - * quotients, we have to force the dividend positive for portability. - * - * In most files, at least half of the output values will be zero - * (at default quantization settings, more like three-quarters...) - * so we should ensure that this case is fast. On many machines, - * a comparison is enough cheaper than a divide to make a special test - * a win. Since both inputs will be nonnegative, we need only test - * for a < b to discover whether a/b is 0. - * If your machine's division is fast enough, define FAST_DIVIDE. 
- */ -#ifdef FAST_DIVIDE -#define DIVIDE_BY(a,b) a /= b -#else -#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0 -#endif - if (temp < 0) { - temp = -temp; - temp += qval>>1; /* for rounding */ - DIVIDE_BY(temp, qval); - temp = -temp; - } else { - temp += qval>>1; /* for rounding */ - DIVIDE_BY(temp, qval); - } - output_ptr[i] = (JCOEF) temp; - } - } - } -} - - -#ifdef DCT_FLOAT_SUPPORTED - -METHODDEF(void) -forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY sample_data, JBLOCKROW coef_blocks, - JDIMENSION start_row, JDIMENSION start_col, - JDIMENSION num_blocks) -/* This version is used for floating-point DCT implementations. */ -{ - /* This routine is heavily used, so it's worth coding it tightly. */ - my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; - float_DCT_method_ptr do_dct = fdct->do_float_dct; - FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no]; - FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */ - JDIMENSION bi; - - sample_data += start_row; /* fold in the vertical offset once */ - - for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { - /* Load data into workspace, applying unsigned->signed conversion */ - { register FAST_FLOAT *workspaceptr; - register JSAMPROW elemptr; - register int elemr; - - workspaceptr = workspace; - for (elemr = 0; elemr < DCTSIZE; elemr++) { - elemptr = sample_data[elemr] + start_col; -#if DCTSIZE == 8 /* unroll the inner loop */ - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); -#else - { register int elemc; - for (elemc = DCTSIZE; elemc > 0; elemc--) { - *workspaceptr++ = (FAST_FLOAT) - (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - } - } -#endif - } - } - - /* Perform the DCT */ - (*do_dct) (workspace); - - /* Quantize/descale the coefficients, and store into coef_blocks[] */ - { register FAST_FLOAT temp; - register int i; - register JCOEFPTR output_ptr = coef_blocks[bi]; - - for (i = 0; i < DCTSIZE2; i++) { - /* Apply the quantization and scaling factor */ - temp = workspace[i] * divisors[i]; - /* Round to nearest integer. - * Since C does not specify the direction of rounding for negative - * quotients, we have to force the dividend positive for portability. - * The maximum coefficient size is +-16K (for 12-bit data), so this - * code should work for either 16-bit or 32-bit ints. - */ - output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384); - } - } - } -} - -#endif /* DCT_FLOAT_SUPPORTED */ - - -/* * Initialize FDCT manager. 
*/ @@ -353,30 +472,6 @@ jinit_forward_dct (j_compress_ptr cinfo) cinfo->fdct = (struct jpeg_forward_dct *) fdct; fdct->pub.start_pass = start_pass_fdctmgr; - switch (cinfo->dct_method) { -#ifdef DCT_ISLOW_SUPPORTED - case JDCT_ISLOW: - fdct->pub.forward_DCT = forward_DCT; - fdct->do_dct = jpeg_fdct_islow; - break; -#endif -#ifdef DCT_IFAST_SUPPORTED - case JDCT_IFAST: - fdct->pub.forward_DCT = forward_DCT; - fdct->do_dct = jpeg_fdct_ifast; - break; -#endif -#ifdef DCT_FLOAT_SUPPORTED - case JDCT_FLOAT: - fdct->pub.forward_DCT = forward_DCT_float; - fdct->do_float_dct = jpeg_fdct_float; - break; -#endif - default: - ERREXIT(cinfo, JERR_NOT_COMPILED); - break; - } - /* Mark divisor tables unallocated */ for (i = 0; i < NUM_QUANT_TBLS; i++) { fdct->divisors[i] = NULL; diff --git a/src/libjpeg/jchuff.c b/src/libjpeg/jchuff.c index f235250..0fd578e 100644 --- a/src/libjpeg/jchuff.c +++ b/src/libjpeg/jchuff.c @@ -2,22 +2,48 @@ * jchuff.c * * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 2006-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * * This file contains Huffman entropy encoding routines. + * Both sequential and progressive modes are supported in this single module. * * Much of the complexity here has to do with supporting output suspension. * If the data destination module demands suspension, we want to be able to * back up to the start of the current MCU. To do this, we copy state * variables into local working storage, and update them back to the * permanent JPEG objects only upon successful completion of an MCU. + * + * We do not support output suspension for the progressive JPEG mode, since + * the library currently does not allow multiple-scan files to be written + * with output suspension. */ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jchuff.h" /* Declarations shared with jcphuff.c */ + + +/* The legal range of a DCT coefficient is + * -1024 .. +1023 for 8-bit data; + * -16384 .. +16383 for 12-bit data. + * Hence the magnitude should always fit in 10 or 14 bits respectively. + */ + +#if BITS_IN_JSAMPLE == 8 +#define MAX_COEF_BITS 10 +#else +#define MAX_COEF_BITS 14 +#endif + +/* Derived data constructed for each Huffman table */ + +typedef struct { + unsigned int ehufco[256]; /* code for each symbol */ + char ehufsi[256]; /* length of code for each symbol */ + /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */ +} c_derived_tbl; /* Expanded entropy encoder object for Huffman encoding. @@ -61,19 +87,47 @@ typedef struct { unsigned int restarts_to_go; /* MCUs left in this restart interval */ int next_restart_num; /* next restart number to write (0-7) */ + /* Following four fields used only in sequential mode */ + /* Pointers to derived tables (these workspaces have image lifespan) */ c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS]; c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS]; -#ifdef ENTROPY_OPT_SUPPORTED /* Statistics tables for optimization */ + /* Statistics tables for optimization */ long * dc_count_ptrs[NUM_HUFF_TBLS]; long * ac_count_ptrs[NUM_HUFF_TBLS]; -#endif + + /* Following fields used only in progressive mode */ + + /* Mode flag: TRUE for optimization, FALSE for actual data output */ + boolean gather_statistics; + + /* next_output_byte/free_in_buffer are local copies of cinfo->dest fields. 
+ */ + JOCTET * next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + j_compress_ptr cinfo; /* link to cinfo (needed for dump_buffer) */ + + /* Coding status for AC components */ + int ac_tbl_no; /* the table number of the single component */ + unsigned int EOBRUN; /* run length of EOBs */ + unsigned int BE; /* # of buffered correction bits before MCU */ + char * bit_buffer; /* buffer for correction bits (1 per char) */ + /* packing correction bits tightly would save some space but cost time... */ + + /* Pointers to derived tables (these workspaces have image lifespan). + * Since any one scan in progressive mode codes only DC or only AC, + * we only need one set of tables, not one for DC and one for AC. + */ + c_derived_tbl * derived_tbls[NUM_HUFF_TBLS]; + + /* Statistics tables for optimization; again, one set is enough */ + long * count_ptrs[NUM_HUFF_TBLS]; } huff_entropy_encoder; typedef huff_entropy_encoder * huff_entropy_ptr; -/* Working state while writing an MCU. +/* Working state while writing an MCU (sequential mode). * This struct contains all the fields that are needed by subroutines. */ @@ -84,98 +138,37 @@ typedef struct { j_compress_ptr cinfo; /* dump_buffer needs access to this */ } working_state; +/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit + * buffer can hold. Larger sizes may slightly improve compression, but + * 1000 is already well into the realm of overkill. + * The minimum safe size is 64 bits. + */ -/* Forward declarations */ -METHODDEF(boolean) encode_mcu_huff JPP((j_compress_ptr cinfo, - JBLOCKROW *MCU_data)); -METHODDEF(void) finish_pass_huff JPP((j_compress_ptr cinfo)); -#ifdef ENTROPY_OPT_SUPPORTED -METHODDEF(boolean) encode_mcu_gather JPP((j_compress_ptr cinfo, - JBLOCKROW *MCU_data)); -METHODDEF(void) finish_pass_gather JPP((j_compress_ptr cinfo)); -#endif - +#define MAX_CORR_BITS 1000 /* Max # of correction bits I can buffer */ -/* - * Initialize for a Huffman-compressed scan. - * If gather_statistics is TRUE, we do not output anything during the scan, - * just count the Huffman symbols used and generate Huffman code tables. +/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32. + * We assume that int right shift is unsigned if INT32 right shift is, + * which should be safe. */ -METHODDEF(void) -start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics) -{ - huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; - int ci, dctbl, actbl; - jpeg_component_info * compptr; - - if (gather_statistics) { -#ifdef ENTROPY_OPT_SUPPORTED - entropy->pub.encode_mcu = encode_mcu_gather; - entropy->pub.finish_pass = finish_pass_gather; +#ifdef RIGHT_SHIFT_IS_UNSIGNED +#define ISHIFT_TEMPS int ishift_temp; +#define IRIGHT_SHIFT(x,shft) \ + ((ishift_temp = (x)) < 0 ? 
\ + (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \ + (ishift_temp >> (shft))) #else - ERREXIT(cinfo, JERR_NOT_COMPILED); +#define ISHIFT_TEMPS +#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) #endif - } else { - entropy->pub.encode_mcu = encode_mcu_huff; - entropy->pub.finish_pass = finish_pass_huff; - } - - for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - compptr = cinfo->cur_comp_info[ci]; - dctbl = compptr->dc_tbl_no; - actbl = compptr->ac_tbl_no; - if (gather_statistics) { -#ifdef ENTROPY_OPT_SUPPORTED - /* Check for invalid table indexes */ - /* (make_c_derived_tbl does this in the other path) */ - if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS) - ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl); - if (actbl < 0 || actbl >= NUM_HUFF_TBLS) - ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl); - /* Allocate and zero the statistics tables */ - /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */ - if (entropy->dc_count_ptrs[dctbl] == NULL) - entropy->dc_count_ptrs[dctbl] = (long *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - 257 * SIZEOF(long)); - MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long)); - if (entropy->ac_count_ptrs[actbl] == NULL) - entropy->ac_count_ptrs[actbl] = (long *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - 257 * SIZEOF(long)); - MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long)); -#endif - } else { - /* Compute derived values for Huffman tables */ - /* We may do this more than once for a table, but it's not expensive */ - jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl, - & entropy->dc_derived_tbls[dctbl]); - jpeg_make_c_derived_tbl(cinfo, FALSE, actbl, - & entropy->ac_derived_tbls[actbl]); - } - /* Initialize DC predictions to 0 */ - entropy->saved.last_dc_val[ci] = 0; - } - - /* Initialize bit buffer to empty */ - entropy->saved.put_buffer = 0; - entropy->saved.put_bits = 0; - - /* Initialize restart stuff */ - entropy->restarts_to_go = cinfo->restart_interval; - entropy->next_restart_num = 0; -} /* * Compute the derived values for a Huffman table. * This routine also performs some validation checks on the table. - * - * Note this is also used by jcphuff.c. */ -GLOBAL(void) +LOCAL(void) jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, c_derived_tbl ** pdtbl) { @@ -264,18 +257,27 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, } -/* Outputting bytes to the file */ +/* Outputting bytes to the file. + * NB: these must be called only when actually outputting, + * that is, entropy->gather_statistics == FALSE. + */ /* Emit a byte, taking 'action' if must suspend. */ -#define emit_byte(state,val,action) \ +#define emit_byte_s(state,val,action) \ { *(state)->next_output_byte++ = (JOCTET) (val); \ if (--(state)->free_in_buffer == 0) \ - if (! dump_buffer(state)) \ + if (! dump_buffer_s(state)) \ { action; } } +/* Emit a byte */ +#define emit_byte_e(entropy,val) \ + { *(entropy)->next_output_byte++ = (JOCTET) (val); \ + if (--(entropy)->free_in_buffer == 0) \ + dump_buffer_e(entropy); } + LOCAL(boolean) -dump_buffer (working_state * state) +dump_buffer_s (working_state * state) /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */ { struct jpeg_destination_mgr * dest = state->cinfo->dest; @@ -289,6 +291,20 @@ dump_buffer (working_state * state) } +LOCAL(void) +dump_buffer_e (huff_entropy_ptr entropy) +/* Empty the output buffer; we do not support suspension in this case. 
*/ +{ + struct jpeg_destination_mgr * dest = entropy->cinfo->dest; + + if (! (*dest->empty_output_buffer) (entropy->cinfo)) + ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND); + /* After a successful buffer dump, must reset buffer pointers */ + entropy->next_output_byte = dest->next_output_byte; + entropy->free_in_buffer = dest->free_in_buffer; +} + + /* Outputting bits to the file */ /* Only the right 24 bits of put_buffer are used; the valid bits are @@ -299,7 +315,7 @@ dump_buffer (working_state * state) INLINE LOCAL(boolean) -emit_bits (working_state * state, unsigned int code, int size) +emit_bits_s (working_state * state, unsigned int code, int size) /* Emit some bits; return TRUE if successful, FALSE if must suspend */ { /* This routine is heavily used, so it's worth coding tightly. */ @@ -321,9 +337,9 @@ emit_bits (working_state * state, unsigned int code, int size) while (put_bits >= 8) { int c = (int) ((put_buffer >> 16) & 0xFF); - emit_byte(state, c, return FALSE); + emit_byte_s(state, c, return FALSE); if (c == 0xFF) { /* need to stuff a zero byte? */ - emit_byte(state, 0, return FALSE); + emit_byte_s(state, 0, return FALSE); } put_buffer <<= 8; put_bits -= 8; @@ -336,17 +352,554 @@ emit_bits (working_state * state, unsigned int code, int size) } +INLINE +LOCAL(void) +emit_bits_e (huff_entropy_ptr entropy, unsigned int code, int size) +/* Emit some bits, unless we are in gather mode */ +{ + /* This routine is heavily used, so it's worth coding tightly. */ + register INT32 put_buffer = (INT32) code; + register int put_bits = entropy->saved.put_bits; + + /* if size is 0, caller used an invalid Huffman table entry */ + if (size == 0) + ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE); + + if (entropy->gather_statistics) + return; /* do nothing if we're only getting stats */ + + put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */ + + put_bits += size; /* new number of bits in buffer */ + + put_buffer <<= 24 - put_bits; /* align incoming bits */ + + /* and merge with old buffer contents */ + put_buffer |= entropy->saved.put_buffer; + + while (put_bits >= 8) { + int c = (int) ((put_buffer >> 16) & 0xFF); + + emit_byte_e(entropy, c); + if (c == 0xFF) { /* need to stuff a zero byte? */ + emit_byte_e(entropy, 0); + } + put_buffer <<= 8; + put_bits -= 8; + } + + entropy->saved.put_buffer = put_buffer; /* update variables */ + entropy->saved.put_bits = put_bits; +} + + LOCAL(boolean) -flush_bits (working_state * state) +flush_bits_s (working_state * state) { - if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */ + if (! emit_bits_s(state, 0x7F, 7)) /* fill any partial byte with ones */ return FALSE; - state->cur.put_buffer = 0; /* and reset bit-buffer to empty */ + state->cur.put_buffer = 0; /* and reset bit-buffer to empty */ state->cur.put_bits = 0; return TRUE; } +LOCAL(void) +flush_bits_e (huff_entropy_ptr entropy) +{ + emit_bits_e(entropy, 0x7F, 7); /* fill any partial byte with ones */ + entropy->saved.put_buffer = 0; /* and reset bit-buffer to empty */ + entropy->saved.put_bits = 0; +} + + +/* + * Emit (or just count) a Huffman symbol. + */ + +INLINE +LOCAL(void) +emit_symbol (huff_entropy_ptr entropy, int tbl_no, int symbol) +{ + if (entropy->gather_statistics) + entropy->count_ptrs[tbl_no][symbol]++; + else { + c_derived_tbl * tbl = entropy->derived_tbls[tbl_no]; + emit_bits_e(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]); + } +} + + +/* + * Emit bits from a correction bit buffer. 
+ */ + +LOCAL(void) +emit_buffered_bits (huff_entropy_ptr entropy, char * bufstart, + unsigned int nbits) +{ + if (entropy->gather_statistics) + return; /* no real work */ + + while (nbits > 0) { + emit_bits_e(entropy, (unsigned int) (*bufstart), 1); + bufstart++; + nbits--; + } +} + + +/* + * Emit any pending EOBRUN symbol. + */ + +LOCAL(void) +emit_eobrun (huff_entropy_ptr entropy) +{ + register int temp, nbits; + + if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */ + temp = entropy->EOBRUN; + nbits = 0; + while ((temp >>= 1)) + nbits++; + /* safety check: shouldn't happen given limited correction-bit buffer */ + if (nbits > 14) + ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE); + + emit_symbol(entropy, entropy->ac_tbl_no, nbits << 4); + if (nbits) + emit_bits_e(entropy, entropy->EOBRUN, nbits); + + entropy->EOBRUN = 0; + + /* Emit any buffered correction bits */ + emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE); + entropy->BE = 0; + } +} + + +/* + * Emit a restart marker & resynchronize predictions. + */ + +LOCAL(boolean) +emit_restart_s (working_state * state, int restart_num) +{ + int ci; + + if (! flush_bits_s(state)) + return FALSE; + + emit_byte_s(state, 0xFF, return FALSE); + emit_byte_s(state, JPEG_RST0 + restart_num, return FALSE); + + /* Re-initialize DC predictions to 0 */ + for (ci = 0; ci < state->cinfo->comps_in_scan; ci++) + state->cur.last_dc_val[ci] = 0; + + /* The restart counter is not updated until we successfully write the MCU. */ + + return TRUE; +} + + +LOCAL(void) +emit_restart_e (huff_entropy_ptr entropy, int restart_num) +{ + int ci; + + emit_eobrun(entropy); + + if (! entropy->gather_statistics) { + flush_bits_e(entropy); + emit_byte_e(entropy, 0xFF); + emit_byte_e(entropy, JPEG_RST0 + restart_num); + } + + if (entropy->cinfo->Ss == 0) { + /* Re-initialize DC predictions to 0 */ + for (ci = 0; ci < entropy->cinfo->comps_in_scan; ci++) + entropy->saved.last_dc_val[ci] = 0; + } else { + /* Re-initialize all AC-related fields to 0 */ + entropy->EOBRUN = 0; + entropy->BE = 0; + } +} + + +/* + * MCU encoding for DC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + register int temp, temp2; + register int nbits; + int blkn, ci; + int Al = cinfo->Al; + JBLOCKROW block; + jpeg_component_info * compptr; + ISHIFT_TEMPS + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) + if (entropy->restarts_to_go == 0) + emit_restart_e(entropy, entropy->next_restart_num); + + /* Encode the MCU data blocks */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + + /* Compute the DC value after the required point transform by Al. + * This is simply an arithmetic right shift. + */ + temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al); + + /* DC differences are figured on the point-transformed values. 
*/ + temp = temp2 - entropy->saved.last_dc_val[ci]; + entropy->saved.last_dc_val[ci] = temp2; + + /* Encode the DC coefficient difference per section G.1.2.1 */ + temp2 = temp; + if (temp < 0) { + temp = -temp; /* temp is abs value of input */ + /* For a negative input, want temp2 = bitwise complement of abs(input) */ + /* This code assumes we are on a two's complement machine */ + temp2--; + } + + /* Find the number of bits needed for the magnitude of the coefficient */ + nbits = 0; + while (temp) { + nbits++; + temp >>= 1; + } + /* Check for out-of-range coefficient values. + * Since we're encoding a difference, the range limit is twice as much. + */ + if (nbits > MAX_COEF_BITS+1) + ERREXIT(cinfo, JERR_BAD_DCT_COEF); + + /* Count/emit the Huffman-coded symbol for the number of bits */ + emit_symbol(entropy, compptr->dc_tbl_no, nbits); + + /* Emit that number of bits of the value, if positive, */ + /* or the complement of its magnitude, if negative. */ + if (nbits) /* emit_bits rejects calls with size 0 */ + emit_bits_e(entropy, (unsigned int) temp2, nbits); + } + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + +/* + * MCU encoding for AC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + register int temp, temp2; + register int nbits; + register int r, k; + int Se = cinfo->Se; + int Al = cinfo->Al; + JBLOCKROW block; + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) + if (entropy->restarts_to_go == 0) + emit_restart_e(entropy, entropy->next_restart_num); + + /* Encode the MCU data block */ + block = MCU_data[0]; + + /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */ + + r = 0; /* r = run length of zeros */ + + for (k = cinfo->Ss; k <= Se; k++) { + if ((temp = (*block)[jpeg_natural_order[k]]) == 0) { + r++; + continue; + } + /* We must apply the point transform by Al. For AC coefficients this + * is an integer division with rounding towards 0. To do this portably + * in C, we shift after obtaining the absolute value; so the code is + * interwoven with finding the abs value (temp) and output bits (temp2). 
+ */ + if (temp < 0) { + temp = -temp; /* temp is abs value of input */ + temp >>= Al; /* apply the point transform */ + /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ + temp2 = ~temp; + } else { + temp >>= Al; /* apply the point transform */ + temp2 = temp; + } + /* Watch out for case that nonzero coef is zero after point transform */ + if (temp == 0) { + r++; + continue; + } + + /* Emit any pending EOBRUN */ + if (entropy->EOBRUN > 0) + emit_eobrun(entropy); + /* if run length > 15, must emit special run-length-16 codes (0xF0) */ + while (r > 15) { + emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); + r -= 16; + } + + /* Find the number of bits needed for the magnitude of the coefficient */ + nbits = 1; /* there must be at least one 1 bit */ + while ((temp >>= 1)) + nbits++; + /* Check for out-of-range coefficient values */ + if (nbits > MAX_COEF_BITS) + ERREXIT(cinfo, JERR_BAD_DCT_COEF); + + /* Count/emit Huffman symbol for run length / number of bits */ + emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); + + /* Emit that number of bits of the value, if positive, */ + /* or the complement of its magnitude, if negative. */ + emit_bits_e(entropy, (unsigned int) temp2, nbits); + + r = 0; /* reset zero run length */ + } + + if (r > 0) { /* If there are trailing zeroes, */ + entropy->EOBRUN++; /* count an EOB */ + if (entropy->EOBRUN == 0x7FFF) + emit_eobrun(entropy); /* force it out to avoid overflow */ + } + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + +/* + * MCU encoding for DC successive approximation refinement scan. + * Note: we assume such scans can be multi-component, although the spec + * is not very clear on the point. + */ + +METHODDEF(boolean) +encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + register int temp; + int blkn; + int Al = cinfo->Al; + JBLOCKROW block; + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) + if (entropy->restarts_to_go == 0) + emit_restart_e(entropy, entropy->next_restart_num); + + /* Encode the MCU data blocks */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + + /* We simply emit the Al'th bit of the DC coefficient value. */ + temp = (*block)[0]; + emit_bits_e(entropy, (unsigned int) (temp >> Al), 1); + } + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + +/* + * MCU encoding for AC successive approximation refinement scan. 
+ */ + +METHODDEF(boolean) +encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + register int temp; + register int r, k; + int EOB; + char *BR_buffer; + unsigned int BR; + int Se = cinfo->Se; + int Al = cinfo->Al; + JBLOCKROW block; + int absvalues[DCTSIZE2]; + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) + if (entropy->restarts_to_go == 0) + emit_restart_e(entropy, entropy->next_restart_num); + + /* Encode the MCU data block */ + block = MCU_data[0]; + + /* It is convenient to make a pre-pass to determine the transformed + * coefficients' absolute values and the EOB position. + */ + EOB = 0; + for (k = cinfo->Ss; k <= Se; k++) { + temp = (*block)[jpeg_natural_order[k]]; + /* We must apply the point transform by Al. For AC coefficients this + * is an integer division with rounding towards 0. To do this portably + * in C, we shift after obtaining the absolute value. + */ + if (temp < 0) + temp = -temp; /* temp is abs value of input */ + temp >>= Al; /* apply the point transform */ + absvalues[k] = temp; /* save abs value for main pass */ + if (temp == 1) + EOB = k; /* EOB = index of last newly-nonzero coef */ + } + + /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */ + + r = 0; /* r = run length of zeros */ + BR = 0; /* BR = count of buffered bits added now */ + BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */ + + for (k = cinfo->Ss; k <= Se; k++) { + if ((temp = absvalues[k]) == 0) { + r++; + continue; + } + + /* Emit any required ZRLs, but not if they can be folded into EOB */ + while (r > 15 && k <= EOB) { + /* emit any pending EOBRUN and the BE correction bits */ + emit_eobrun(entropy); + /* Emit ZRL */ + emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); + r -= 16; + /* Emit buffered correction bits that must be associated with ZRL */ + emit_buffered_bits(entropy, BR_buffer, BR); + BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ + BR = 0; + } + + /* If the coef was previously nonzero, it only needs a correction bit. + * NOTE: a straight translation of the spec's figure G.7 would suggest + * that we also need to test r > 15. But if r > 15, we can only get here + * if k > EOB, which implies that this coefficient is not 1. + */ + if (temp > 1) { + /* The correction bit is the next bit of the absolute value. */ + BR_buffer[BR++] = (char) (temp & 1); + continue; + } + + /* Emit any pending EOBRUN and the BE correction bits */ + emit_eobrun(entropy); + + /* Count/emit Huffman symbol for run length / number of bits */ + emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); + + /* Emit output bit for newly-nonzero coef */ + temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1; + emit_bits_e(entropy, (unsigned int) temp, 1); + + /* Emit buffered correction bits that must be associated with this code */ + emit_buffered_bits(entropy, BR_buffer, BR); + BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ + BR = 0; + r = 0; /* reset zero run length */ + } + + if (r > 0 || BR > 0) { /* If there are trailing zeroes, */ + entropy->EOBRUN++; /* count an EOB */ + entropy->BE += BR; /* concat my correction bits to older ones */ + /* We force out the EOB if we risk either: + * 1. overflow of the EOB counter; + * 2. overflow of the correction bit buffer during the next MCU. 
+ */ + if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1)) + emit_eobrun(entropy); + } + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + /* Encode a single block's worth of coefficients */ LOCAL(boolean) @@ -356,9 +909,9 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, register int temp, temp2; register int nbits; register int k, r, i; - + /* Encode the DC coefficient difference per section F.1.2.1 */ - + temp = temp2 = block[0] - last_dc_val; if (temp < 0) { @@ -367,7 +920,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, /* This code assumes we are on a two's complement machine */ temp2--; } - + /* Find the number of bits needed for the magnitude of the coefficient */ nbits = 0; while (temp) { @@ -379,28 +932,28 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, */ if (nbits > MAX_COEF_BITS+1) ERREXIT(state->cinfo, JERR_BAD_DCT_COEF); - + /* Emit the Huffman-coded symbol for the number of bits */ - if (! emit_bits(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits])) + if (! emit_bits_s(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits])) return FALSE; /* Emit that number of bits of the value, if positive, */ /* or the complement of its magnitude, if negative. */ if (nbits) /* emit_bits rejects calls with size 0 */ - if (! emit_bits(state, (unsigned int) temp2, nbits)) + if (! emit_bits_s(state, (unsigned int) temp2, nbits)) return FALSE; /* Encode the AC coefficients per section F.1.2.2 */ - + r = 0; /* r = run length of zeros */ - + for (k = 1; k < DCTSIZE2; k++) { if ((temp = block[jpeg_natural_order[k]]) == 0) { r++; } else { /* if run length > 15, must emit special run-length-16 codes (0xF0) */ while (r > 15) { - if (! emit_bits(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0])) + if (! emit_bits_s(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0])) return FALSE; r -= 16; } @@ -411,7 +964,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, /* This code assumes we are on a two's complement machine */ temp2--; } - + /* Find the number of bits needed for the magnitude of the coefficient */ nbits = 1; /* there must be at least one 1 bit */ while ((temp >>= 1)) @@ -419,24 +972,24 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, /* Check for out-of-range coefficient values */ if (nbits > MAX_COEF_BITS) ERREXIT(state->cinfo, JERR_BAD_DCT_COEF); - + /* Emit Huffman symbol for run length / number of bits */ i = (r << 4) + nbits; - if (! emit_bits(state, actbl->ehufco[i], actbl->ehufsi[i])) + if (! emit_bits_s(state, actbl->ehufco[i], actbl->ehufsi[i])) return FALSE; /* Emit that number of bits of the value, if positive, */ /* or the complement of its magnitude, if negative. */ - if (! emit_bits(state, (unsigned int) temp2, nbits)) + if (! emit_bits_s(state, (unsigned int) temp2, nbits)) return FALSE; - + r = 0; } } /* If the last coef(s) were zero, emit an end-of-block code */ if (r > 0) - if (! emit_bits(state, actbl->ehufco[0], actbl->ehufsi[0])) + if (! 
emit_bits_s(state, actbl->ehufco[0], actbl->ehufsi[0])) return FALSE; return TRUE; @@ -444,31 +997,6 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, /* - * Emit a restart marker & resynchronize predictions. - */ - -LOCAL(boolean) -emit_restart (working_state * state, int restart_num) -{ - int ci; - - if (! flush_bits(state)) - return FALSE; - - emit_byte(state, 0xFF, return FALSE); - emit_byte(state, JPEG_RST0 + restart_num, return FALSE); - - /* Re-initialize DC predictions to 0 */ - for (ci = 0; ci < state->cinfo->comps_in_scan; ci++) - state->cur.last_dc_val[ci] = 0; - - /* The restart counter is not updated until we successfully write the MCU. */ - - return TRUE; -} - - -/* * Encode and output one MCU's worth of Huffman-compressed coefficients. */ @@ -489,7 +1017,7 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* Emit restart marker if needed */ if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) - if (! emit_restart(&state, entropy->next_restart_num)) + if (! emit_restart_s(&state, entropy->next_restart_num)) return FALSE; } @@ -535,20 +1063,32 @@ finish_pass_huff (j_compress_ptr cinfo) huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; working_state state; - /* Load up working state ... flush_bits needs it */ - state.next_output_byte = cinfo->dest->next_output_byte; - state.free_in_buffer = cinfo->dest->free_in_buffer; - ASSIGN_STATE(state.cur, entropy->saved); - state.cinfo = cinfo; + if (cinfo->progressive_mode) { + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; - /* Flush out the last data */ - if (! flush_bits(&state)) - ERREXIT(cinfo, JERR_CANT_SUSPEND); + /* Flush out any buffered data */ + emit_eobrun(entropy); + flush_bits_e(entropy); - /* Update state */ - cinfo->dest->next_output_byte = state.next_output_byte; - cinfo->dest->free_in_buffer = state.free_in_buffer; - ASSIGN_STATE(entropy->saved, state.cur); + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + } else { + /* Load up working state ... flush_bits needs it */ + state.next_output_byte = cinfo->dest->next_output_byte; + state.free_in_buffer = cinfo->dest->free_in_buffer; + ASSIGN_STATE(state.cur, entropy->saved); + state.cinfo = cinfo; + + /* Flush out the last data */ + if (! flush_bits_s(&state)) + ERREXIT(cinfo, JERR_CANT_SUSPEND); + + /* Update state */ + cinfo->dest->next_output_byte = state.next_output_byte; + cinfo->dest->free_in_buffer = state.free_in_buffer; + ASSIGN_STATE(entropy->saved, state.cur); + } } @@ -563,8 +1103,6 @@ finish_pass_huff (j_compress_ptr cinfo) * the compressed data. */ -#ifdef ENTROPY_OPT_SUPPORTED - /* Process a single block's worth of coefficients */ @@ -675,7 +1213,6 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* * Generate the best Huffman code table for the given counts, fill htbl. - * Note this is also used by jcphuff.c. * * The JPEG standard requires that no symbol be assigned a codeword of all * one bits (so that padding bits added at the end of a compressed segment @@ -701,7 +1238,7 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data) * So the extra complexity of an optimal algorithm doesn't seem worthwhile. 
*/ -GLOBAL(void) +LOCAL(void) jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) { #define MAX_CLEN 32 /* assumed maximum initial code length */ @@ -846,41 +1383,200 @@ METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo) { huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; - int ci, dctbl, actbl; + int ci, dctbl, actbl, tbl; jpeg_component_info * compptr; JHUFF_TBL **htblptr; boolean did_dc[NUM_HUFF_TBLS]; boolean did_ac[NUM_HUFF_TBLS]; + boolean did[NUM_HUFF_TBLS]; /* It's important not to apply jpeg_gen_optimal_table more than once * per table, because it clobbers the input frequency counts! */ - MEMZERO(did_dc, SIZEOF(did_dc)); - MEMZERO(did_ac, SIZEOF(did_ac)); - - for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - compptr = cinfo->cur_comp_info[ci]; - dctbl = compptr->dc_tbl_no; - actbl = compptr->ac_tbl_no; - if (! did_dc[dctbl]) { - htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl]; - if (*htblptr == NULL) - *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); - jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]); - did_dc[dctbl] = TRUE; - } - if (! did_ac[actbl]) { - htblptr = & cinfo->ac_huff_tbl_ptrs[actbl]; - if (*htblptr == NULL) - *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); - jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]); - did_ac[actbl] = TRUE; + if (cinfo->progressive_mode) { + /* Flush out buffered data (all we care about is counting the EOB symbol) */ + emit_eobrun(entropy); + + MEMZERO(did, SIZEOF(did)); + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + if (cinfo->Ss == 0) { + if (cinfo->Ah != 0) /* DC refinement needs no table */ + continue; + tbl = compptr->dc_tbl_no; + } else { + tbl = compptr->ac_tbl_no; + } + if (! did[tbl]) { + if (cinfo->Ss == 0) + htblptr = & cinfo->dc_huff_tbl_ptrs[tbl]; + else + htblptr = & cinfo->ac_huff_tbl_ptrs[tbl]; + if (*htblptr == NULL) + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]); + did[tbl] = TRUE; + } + } + } else { + MEMZERO(did_dc, SIZEOF(did_dc)); + MEMZERO(did_ac, SIZEOF(did_ac)); + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + dctbl = compptr->dc_tbl_no; + actbl = compptr->ac_tbl_no; + if (! did_dc[dctbl]) { + htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl]; + if (*htblptr == NULL) + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]); + did_dc[dctbl] = TRUE; + } + if (! did_ac[actbl]) { + htblptr = & cinfo->ac_huff_tbl_ptrs[actbl]; + if (*htblptr == NULL) + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]); + did_ac[actbl] = TRUE; + } } } } -#endif /* ENTROPY_OPT_SUPPORTED */ +/* + * Initialize for a Huffman-compressed scan. + * If gather_statistics is TRUE, we do not output anything during the scan, + * just count the Huffman symbols used and generate Huffman code tables. 
+ */ + +METHODDEF(void) +start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int ci, dctbl, actbl, tbl; + jpeg_component_info * compptr; + + if (gather_statistics) + entropy->pub.finish_pass = finish_pass_gather; + else + entropy->pub.finish_pass = finish_pass_huff; + + if (cinfo->progressive_mode) { + entropy->cinfo = cinfo; + entropy->gather_statistics = gather_statistics; + + /* We assume jcmaster.c already validated the scan parameters. */ + + /* Select execution routine */ + if (cinfo->Ah == 0) { + if (cinfo->Ss == 0) + entropy->pub.encode_mcu = encode_mcu_DC_first; + else + entropy->pub.encode_mcu = encode_mcu_AC_first; + } else { + if (cinfo->Ss == 0) + entropy->pub.encode_mcu = encode_mcu_DC_refine; + else { + entropy->pub.encode_mcu = encode_mcu_AC_refine; + /* AC refinement needs a correction bit buffer */ + if (entropy->bit_buffer == NULL) + entropy->bit_buffer = (char *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + MAX_CORR_BITS * SIZEOF(char)); + } + } + + /* Only DC coefficients may be interleaved, so cinfo->comps_in_scan = 1 + * for AC coefficients. + */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* Initialize DC predictions to 0 */ + entropy->saved.last_dc_val[ci] = 0; + /* Get table index */ + if (cinfo->Ss == 0) { + if (cinfo->Ah != 0) /* DC refinement needs no table */ + continue; + tbl = compptr->dc_tbl_no; + } else { + entropy->ac_tbl_no = tbl = compptr->ac_tbl_no; + } + if (gather_statistics) { + /* Check for invalid table index */ + /* (make_c_derived_tbl does this in the other path) */ + if (tbl < 0 || tbl >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl); + /* Allocate and zero the statistics tables */ + /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */ + if (entropy->count_ptrs[tbl] == NULL) + entropy->count_ptrs[tbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * SIZEOF(long)); + MEMZERO(entropy->count_ptrs[tbl], 257 * SIZEOF(long)); + } else { + /* Compute derived values for Huffman table */ + /* We may do this more than once for a table, but it's not expensive */ + jpeg_make_c_derived_tbl(cinfo, cinfo->Ss == 0, tbl, + & entropy->derived_tbls[tbl]); + } + } + + /* Initialize AC stuff */ + entropy->EOBRUN = 0; + entropy->BE = 0; + } else { + if (gather_statistics) + entropy->pub.encode_mcu = encode_mcu_gather; + else + entropy->pub.encode_mcu = encode_mcu_huff; + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + dctbl = compptr->dc_tbl_no; + actbl = compptr->ac_tbl_no; + if (gather_statistics) { + /* Check for invalid table indexes */ + /* (make_c_derived_tbl does this in the other path) */ + if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl); + if (actbl < 0 || actbl >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl); + /* Allocate and zero the statistics tables */ + /* Note that jpeg_gen_optimal_table expects 257 entries in each table! 
*/ + if (entropy->dc_count_ptrs[dctbl] == NULL) + entropy->dc_count_ptrs[dctbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * SIZEOF(long)); + MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long)); + if (entropy->ac_count_ptrs[actbl] == NULL) + entropy->ac_count_ptrs[actbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * SIZEOF(long)); + MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long)); + } else { + /* Compute derived values for Huffman tables */ + /* We may do this more than once for a table, but it's not expensive */ + jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl, + & entropy->dc_derived_tbls[dctbl]); + jpeg_make_c_derived_tbl(cinfo, FALSE, actbl, + & entropy->ac_derived_tbls[actbl]); + } + /* Initialize DC predictions to 0 */ + entropy->saved.last_dc_val[ci] = 0; + } + } + + /* Initialize bit buffer to empty */ + entropy->saved.put_buffer = 0; + entropy->saved.put_bits = 0; + + /* Initialize restart stuff */ + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num = 0; +} /* @@ -899,11 +1595,18 @@ jinit_huff_encoder (j_compress_ptr cinfo) cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; entropy->pub.start_pass = start_pass_huff; - /* Mark tables unallocated */ - for (i = 0; i < NUM_HUFF_TBLS; i++) { - entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL; -#ifdef ENTROPY_OPT_SUPPORTED - entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL; -#endif + if (cinfo->progressive_mode) { + /* Mark tables unallocated */ + for (i = 0; i < NUM_HUFF_TBLS; i++) { + entropy->derived_tbls[i] = NULL; + entropy->count_ptrs[i] = NULL; + } + entropy->bit_buffer = NULL; /* needed only in AC refinement scan */ + } else { + /* Mark tables unallocated */ + for (i = 0; i < NUM_HUFF_TBLS; i++) { + entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL; + entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL; + } } } diff --git a/src/libjpeg/jcinit.c b/src/libjpeg/jcinit.c index 5efffe3..0ba310f 100644 --- a/src/libjpeg/jcinit.c +++ b/src/libjpeg/jcinit.c @@ -41,17 +41,10 @@ jinit_compress_master (j_compress_ptr cinfo) /* Forward DCT */ jinit_forward_dct(cinfo); /* Entropy encoding: either Huffman or arithmetic coding. */ - if (cinfo->arith_code) { - ERREXIT(cinfo, JERR_ARITH_NOTIMPL); - } else { - if (cinfo->progressive_mode) { -#ifdef C_PROGRESSIVE_SUPPORTED - jinit_phuff_encoder(cinfo); -#else - ERREXIT(cinfo, JERR_NOT_COMPILED); -#endif - } else - jinit_huff_encoder(cinfo); + if (cinfo->arith_code) + jinit_arith_encoder(cinfo); + else { + jinit_huff_encoder(cinfo); } /* Need a full-image coefficient buffer in any multi-pass mode. */ diff --git a/src/libjpeg/jcmainct.c b/src/libjpeg/jcmainct.c index e0279a7..7de75d1 100644 --- a/src/libjpeg/jcmainct.c +++ b/src/libjpeg/jcmainct.c @@ -118,17 +118,17 @@ process_data_simple_main (j_compress_ptr cinfo, while (main->cur_iMCU_row < cinfo->total_iMCU_rows) { /* Read input data if we haven't filled the main buffer yet */ - if (main->rowgroup_ctr < DCTSIZE) + if (main->rowgroup_ctr < (JDIMENSION) cinfo->min_DCT_v_scaled_size) (*cinfo->prep->pre_process_data) (cinfo, input_buf, in_row_ctr, in_rows_avail, main->buffer, &main->rowgroup_ctr, - (JDIMENSION) DCTSIZE); + (JDIMENSION) cinfo->min_DCT_v_scaled_size); /* If we don't have a full iMCU row buffered, return to application for * more data. Note that preprocessor will always pad to fill the iMCU row * at the bottom of the image. 
*/ - if (main->rowgroup_ctr != DCTSIZE) + if (main->rowgroup_ctr != (JDIMENSION) cinfo->min_DCT_v_scaled_size) return; /* Send the completed row to the compressor */ @@ -269,10 +269,10 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer) ci++, compptr++) { main->whole_image[ci] = (*cinfo->mem->request_virt_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, - compptr->width_in_blocks * DCTSIZE, + compptr->width_in_blocks * compptr->DCT_h_scaled_size, (JDIMENSION) jround_up((long) compptr->height_in_blocks, (long) compptr->v_samp_factor) * DCTSIZE, - (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); + (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size)); } #else ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); @@ -286,8 +286,8 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer) ci++, compptr++) { main->buffer[ci] = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, - compptr->width_in_blocks * DCTSIZE, - (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); + compptr->width_in_blocks * compptr->DCT_h_scaled_size, + (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size)); } } } diff --git a/src/libjpeg/jcmarker.c b/src/libjpeg/jcmarker.c index 3d1e6c6..54d109a 100644 --- a/src/libjpeg/jcmarker.c +++ b/src/libjpeg/jcmarker.c @@ -285,13 +285,13 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code) emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */ /* Make sure image isn't bigger than SOF field can handle */ - if ((long) cinfo->image_height > 65535L || - (long) cinfo->image_width > 65535L) + if ((long) cinfo->jpeg_height > 65535L || + (long) cinfo->jpeg_width > 65535L) ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535); emit_byte(cinfo, cinfo->data_precision); - emit_2bytes(cinfo, (int) cinfo->image_height); - emit_2bytes(cinfo, (int) cinfo->image_width); + emit_2bytes(cinfo, (int) cinfo->jpeg_height); + emit_2bytes(cinfo, (int) cinfo->jpeg_width); emit_byte(cinfo, cinfo->num_components); @@ -529,7 +529,10 @@ write_frame_header (j_compress_ptr cinfo) /* Emit the proper SOF marker */ if (cinfo->arith_code) { - emit_sof(cinfo, M_SOF9); /* SOF code for arithmetic coding */ + if (cinfo->progressive_mode) + emit_sof(cinfo, M_SOF10); /* SOF code for progressive arithmetic */ + else + emit_sof(cinfo, M_SOF9); /* SOF code for sequential arithmetic */ } else { if (cinfo->progressive_mode) emit_sof(cinfo, M_SOF2); /* SOF code for progressive Huffman */ diff --git a/src/libjpeg/jcmaster.c b/src/libjpeg/jcmaster.c index aab4020..db3e61a 100644 --- a/src/libjpeg/jcmaster.c +++ b/src/libjpeg/jcmaster.c @@ -2,6 +2,7 @@ * jcmaster.c * * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 2003-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -42,23 +43,172 @@ typedef my_comp_master * my_master_ptr; * Support routines that do various essential calculations. */ +/* + * Compute JPEG image dimensions and related values. + * NOTE: this is exported for possible use by application. + * Hence it mustn't do anything that can't be done twice. + */ + +GLOBAL(void) +jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo) +/* Do computations that are needed before master selection phase */ +{ +#ifdef DCT_SCALING_SUPPORTED + + /* Compute actual JPEG image dimensions and DCT scaling choices. 
*/ + if (cinfo->scale_num >= cinfo->scale_denom * 8) { + /* Provide 8/1 scaling */ + cinfo->jpeg_width = cinfo->image_width << 3; + cinfo->jpeg_height = cinfo->image_height << 3; + cinfo->min_DCT_h_scaled_size = 1; + cinfo->min_DCT_v_scaled_size = 1; + } else if (cinfo->scale_num >= cinfo->scale_denom * 4) { + /* Provide 4/1 scaling */ + cinfo->jpeg_width = cinfo->image_width << 2; + cinfo->jpeg_height = cinfo->image_height << 2; + cinfo->min_DCT_h_scaled_size = 2; + cinfo->min_DCT_v_scaled_size = 2; + } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 8) { + /* Provide 8/3 scaling */ + cinfo->jpeg_width = (cinfo->image_width << 1) + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 2, 3L); + cinfo->jpeg_height = (cinfo->image_height << 1) + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 2, 3L); + cinfo->min_DCT_h_scaled_size = 3; + cinfo->min_DCT_v_scaled_size = 3; + } else if (cinfo->scale_num >= cinfo->scale_denom * 2) { + /* Provide 2/1 scaling */ + cinfo->jpeg_width = cinfo->image_width << 1; + cinfo->jpeg_height = cinfo->image_height << 1; + cinfo->min_DCT_h_scaled_size = 4; + cinfo->min_DCT_v_scaled_size = 4; + } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * 8) { + /* Provide 8/5 scaling */ + cinfo->jpeg_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3, 5L); + cinfo->jpeg_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3, 5L); + cinfo->min_DCT_h_scaled_size = 5; + cinfo->min_DCT_v_scaled_size = 5; + } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 4) { + /* Provide 4/3 scaling */ + cinfo->jpeg_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 3L); + cinfo->jpeg_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 3L); + cinfo->min_DCT_h_scaled_size = 6; + cinfo->min_DCT_v_scaled_size = 6; + } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * 8) { + /* Provide 8/7 scaling */ + cinfo->jpeg_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 7L); + cinfo->jpeg_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 7L); + cinfo->min_DCT_h_scaled_size = 7; + cinfo->min_DCT_v_scaled_size = 7; + } else if (cinfo->scale_num >= cinfo->scale_denom) { + /* Provide 1/1 scaling */ + cinfo->jpeg_width = cinfo->image_width; + cinfo->jpeg_height = cinfo->image_height; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; + } else if (cinfo->scale_num * 9 >= cinfo->scale_denom * 8) { + /* Provide 8/9 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8, 9L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8, 9L); + cinfo->min_DCT_h_scaled_size = 9; + cinfo->min_DCT_v_scaled_size = 9; + } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * 4) { + /* Provide 4/5 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 4, 5L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 4, 5L); + cinfo->min_DCT_h_scaled_size = 10; + cinfo->min_DCT_v_scaled_size = 10; + } else if (cinfo->scale_num * 11 >= cinfo->scale_denom * 8) { + /* Provide 8/11 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8, 11L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8, 11L); + cinfo->min_DCT_h_scaled_size = 11; + 
cinfo->min_DCT_v_scaled_size = 11; + } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 2) { + /* Provide 2/3 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 2, 3L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 2, 3L); + cinfo->min_DCT_h_scaled_size = 12; + cinfo->min_DCT_v_scaled_size = 12; + } else if (cinfo->scale_num * 13 >= cinfo->scale_denom * 8) { + /* Provide 8/13 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8, 13L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8, 13L); + cinfo->min_DCT_h_scaled_size = 13; + cinfo->min_DCT_v_scaled_size = 13; + } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * 4) { + /* Provide 4/7 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 4, 7L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 4, 7L); + cinfo->min_DCT_h_scaled_size = 14; + cinfo->min_DCT_v_scaled_size = 14; + } else if (cinfo->scale_num * 15 >= cinfo->scale_denom * 8) { + /* Provide 8/15 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8, 15L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8, 15L); + cinfo->min_DCT_h_scaled_size = 15; + cinfo->min_DCT_v_scaled_size = 15; + } else { + /* Provide 1/2 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 2L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 2L); + cinfo->min_DCT_h_scaled_size = 16; + cinfo->min_DCT_v_scaled_size = 16; + } + +#else /* !DCT_SCALING_SUPPORTED */ + + /* Hardwire it to "no scaling" */ + cinfo->jpeg_width = cinfo->image_width; + cinfo->jpeg_height = cinfo->image_height; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; + +#endif /* DCT_SCALING_SUPPORTED */ +} + + LOCAL(void) initial_setup (j_compress_ptr cinfo) /* Do computations that are needed before master selection phase */ { - int ci; + int ci, ssize; jpeg_component_info *compptr; long samplesperrow; JDIMENSION jd_samplesperrow; + jpeg_calc_jpeg_dimensions(cinfo); + /* Sanity check on image dimensions */ - if (cinfo->image_height <= 0 || cinfo->image_width <= 0 + if (cinfo->jpeg_height <= 0 || cinfo->jpeg_width <= 0 || cinfo->num_components <= 0 || cinfo->input_components <= 0) ERREXIT(cinfo, JERR_EMPTY_IMAGE); /* Make sure image isn't bigger than I can handle */ - if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION || - (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION) + if ((long) cinfo->jpeg_height > (long) JPEG_MAX_DIMENSION || + (long) cinfo->jpeg_width > (long) JPEG_MAX_DIMENSION) ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION); /* Width of an input scanline must be representable as JDIMENSION. */ @@ -95,22 +245,52 @@ initial_setup (j_compress_ptr cinfo) ci++, compptr++) { /* Fill in the correct component_index value; don't rely on application */ compptr->component_index = ci; - /* For compression, we never do DCT scaling. */ - compptr->DCT_scaled_size = DCTSIZE; + /* In selecting the actual DCT scaling for each component, we try to + * scale down the chroma components via DCT scaling rather than downsampling. + * This saves time if the downsampler gets to use 1:1 scaling. + * Note this code adapts subsampling ratios which are powers of 2. 
+ */ + ssize = 1; +#ifdef DCT_SCALING_SUPPORTED + while (cinfo->min_DCT_h_scaled_size * ssize <= + (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) && + (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) { + ssize = ssize * 2; + } +#endif + compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize; + ssize = 1; +#ifdef DCT_SCALING_SUPPORTED + while (cinfo->min_DCT_v_scaled_size * ssize <= + (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) && + (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) { + ssize = ssize * 2; + } +#endif + compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize; + + /* We don't support DCT ratios larger than 2. */ + if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2) + compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2; + else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2) + compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2; + /* Size in DCT blocks */ compptr->width_in_blocks = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, + jdiv_round_up((long) cinfo->jpeg_width * (long) compptr->h_samp_factor, (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->height_in_blocks = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor, + jdiv_round_up((long) cinfo->jpeg_height * (long) compptr->v_samp_factor, (long) (cinfo->max_v_samp_factor * DCTSIZE)); /* Size in samples */ compptr->downsampled_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, - (long) cinfo->max_h_samp_factor); + jdiv_round_up((long) cinfo->jpeg_width * + (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size), + (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->downsampled_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor, - (long) cinfo->max_v_samp_factor); + jdiv_round_up((long) cinfo->jpeg_height * + (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size), + (long) (cinfo->max_v_samp_factor * DCTSIZE)); /* Mark component needed (this flag isn't actually used for compression) */ compptr->component_needed = TRUE; } @@ -119,7 +299,7 @@ initial_setup (j_compress_ptr cinfo) * main controller will call coefficient controller). */ cinfo->total_iMCU_rows = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height, + jdiv_round_up((long) cinfo->jpeg_height, (long) (cinfo->max_v_samp_factor*DCTSIZE)); } @@ -325,7 +505,7 @@ per_scan_setup (j_compress_ptr cinfo) compptr->MCU_width = 1; compptr->MCU_height = 1; compptr->MCU_blocks = 1; - compptr->MCU_sample_width = DCTSIZE; + compptr->MCU_sample_width = compptr->DCT_h_scaled_size; compptr->last_col_width = 1; /* For noninterleaved scans, it is convenient to define last_row_height * as the number of block rows present in the last iMCU row. 
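The initial_setup() hunks above are easier to follow outside diff form. The following standalone C sketch is not part of the patch; its names are simplified and hypothetical. It reproduces only the ssize selection loop, to show the intent of the new heuristic: with the default do_fancy_downsampling = TRUE and a 1:1 coding scale, a 4:2:0 chroma component gets a 16x16 forward DCT, so the chroma reduction is done by DCT scaling rather than by the pixel downsampler (which the jcsample.c changes further down in this patch then run at 1:1).

/*
 * Illustrative sketch only -- not part of the patch.  Mirrors the
 * component-scaling loop from initial_setup() with simplified names.
 */
#include <stdio.h>

#define DCTSIZE 8

/* Pick the DCT size for one component, as the loop above does. */
static int choose_dct_size(int min_dct_scaled_size, int samp_factor,
                           int max_samp_factor, int do_fancy_downsampling)
{
  int ssize = 1;
  int limit = do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2;

  while (min_dct_scaled_size * ssize <= limit &&
         (max_samp_factor % (samp_factor * ssize * 2)) == 0)
    ssize *= 2;

  return min_dct_scaled_size * ssize;
}

int main(void)
{
  /* Hypothetical 4:2:0 setup: luma sampled 2x2, chroma 1x1,
   * coded at 1:1 scale (min_DCT_scaled_size == DCTSIZE),
   * fancy downsampling enabled (the jpeg_set_defaults default). */
  int min_dct = DCTSIZE;
  int max_samp = 2;

  printf("luma   DCT size: %d\n",
         choose_dct_size(min_dct, 2, max_samp, 1));  /* stays 8 */
  printf("chroma DCT size: %d\n",
         choose_dct_size(min_dct, 1, max_samp, 1));  /* grows to 16 */
  return 0;
}

Compiled with any C compiler, this prints 8 for luma and 16 for chroma. The 2:1 ratio clamp in the patch does not come into play here because both directions scale by the same factor.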
@@ -347,10 +527,10 @@ per_scan_setup (j_compress_ptr cinfo) /* Overall image size in MCUs */ cinfo->MCUs_per_row = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width, + jdiv_round_up((long) cinfo->jpeg_width, (long) (cinfo->max_h_samp_factor*DCTSIZE)); cinfo->MCU_rows_in_scan = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height, + jdiv_round_up((long) cinfo->jpeg_height, (long) (cinfo->max_v_samp_factor*DCTSIZE)); cinfo->blocks_in_MCU = 0; @@ -361,7 +541,7 @@ per_scan_setup (j_compress_ptr cinfo) compptr->MCU_width = compptr->h_samp_factor; compptr->MCU_height = compptr->v_samp_factor; compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height; - compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE; + compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size; /* Figure number of non-dummy blocks in last MCU column & row */ tmp = (int) (compptr->width_in_blocks % compptr->MCU_width); if (tmp == 0) tmp = compptr->MCU_width; @@ -433,7 +613,7 @@ prepare_for_pass (j_compress_ptr cinfo) /* Do Huffman optimization for a scan after the first one. */ select_scan_parameters(cinfo); per_scan_setup(cinfo); - if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) { + if (cinfo->Ss != 0 || cinfo->Ah == 0) { (*cinfo->entropy->start_pass) (cinfo, TRUE); (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST); master->pub.call_pass_startup = FALSE; @@ -567,7 +747,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only) cinfo->num_scans = 1; } - if (cinfo->progressive_mode) /* TEMPORARY HACK ??? */ + if (cinfo->progressive_mode && cinfo->arith_code == 0) /* TEMPORARY HACK ??? */ cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */ /* Initialize my private state */ diff --git a/src/libjpeg/jcparam.c b/src/libjpeg/jcparam.c index 6fc48f5..c5e85dd 100644 --- a/src/libjpeg/jcparam.c +++ b/src/libjpeg/jcparam.c @@ -2,6 +2,7 @@ * jcparam.c * * Copyright (C) 1991-1998, Thomas G. Lane. + * Modified 2003-2008 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -60,6 +61,47 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl, } +/* These are the sample quantization tables given in JPEG spec section K.1. + * The spec says that the values given produce "good" quality, and + * when divided by 2, "very good" quality. + */ +static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = { + 16, 11, 10, 16, 24, 40, 51, 61, + 12, 12, 14, 19, 26, 58, 60, 55, + 14, 13, 16, 24, 40, 57, 69, 56, + 14, 17, 22, 29, 51, 87, 80, 62, + 18, 22, 37, 56, 68, 109, 103, 77, + 24, 35, 55, 64, 81, 104, 113, 92, + 49, 64, 78, 87, 103, 121, 120, 101, + 72, 92, 95, 98, 112, 100, 103, 99 +}; +static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = { + 17, 18, 24, 47, 99, 99, 99, 99, + 18, 21, 26, 66, 99, 99, 99, 99, + 24, 26, 56, 99, 99, 99, 99, 99, + 47, 66, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99 +}; + + +GLOBAL(void) +jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline) +/* Set or change the 'quality' (quantization) setting, using default tables + * and straight percentage-scaling quality scales. + * This entry point allows different scalings for luminance and chrominance. 
+ */ +{ + /* Set up two quantization tables using the specified scaling */ + jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, + cinfo->q_scale_factor[0], force_baseline); + jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, + cinfo->q_scale_factor[1], force_baseline); +} + + GLOBAL(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, boolean force_baseline) @@ -69,31 +111,6 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, * applications that insist on a linear percentage scaling. */ { - /* These are the sample quantization tables given in JPEG spec section K.1. - * The spec says that the values given produce "good" quality, and - * when divided by 2, "very good" quality. - */ - static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = { - 16, 11, 10, 16, 24, 40, 51, 61, - 12, 12, 14, 19, 26, 58, 60, 55, - 14, 13, 16, 24, 40, 57, 69, 56, - 14, 17, 22, 29, 51, 87, 80, 62, - 18, 22, 37, 56, 68, 109, 103, 77, - 24, 35, 55, 64, 81, 104, 113, 92, - 49, 64, 78, 87, 103, 121, 120, 101, - 72, 92, 95, 98, 112, 100, 103, 99 - }; - static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = { - 17, 18, 24, 47, 99, 99, 99, 99, - 18, 21, 26, 66, 99, 99, 99, 99, - 24, 26, 56, 99, 99, 99, 99, 99, - 47, 66, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99 - }; - /* Set up two quantization tables using the specified scaling */ jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, scale_factor, force_baseline); @@ -284,6 +301,8 @@ jpeg_set_defaults (j_compress_ptr cinfo) /* Initialize everything not dependent on the color space */ + cinfo->scale_num = 1; /* 1:1 scaling */ + cinfo->scale_denom = 1; cinfo->data_precision = BITS_IN_JSAMPLE; /* Set up two quantization tables using default quality of 75 */ jpeg_set_quality(cinfo, 75, TRUE); @@ -320,6 +339,9 @@ jpeg_set_defaults (j_compress_ptr cinfo) /* By default, use the simpler non-cosited sampling alignment */ cinfo->CCIR601_sampling = FALSE; + /* By default, apply fancy downsampling */ + cinfo->do_fancy_downsampling = TRUE; + /* No input smoothing */ cinfo->smoothing_factor = 0; diff --git a/src/libjpeg/jcprepct.c b/src/libjpeg/jcprepct.c index fa93333..be44cc4 100644 --- a/src/libjpeg/jcprepct.c +++ b/src/libjpeg/jcprepct.c @@ -173,10 +173,12 @@ pre_process_data (j_compress_ptr cinfo, *out_row_group_ctr < out_row_groups_avail) { for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { + numrows = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; expand_bottom_edge(output_buf[ci], - compptr->width_in_blocks * DCTSIZE, - (int) (*out_row_group_ctr * compptr->v_samp_factor), - (int) (out_row_groups_avail * compptr->v_samp_factor)); + compptr->width_in_blocks * compptr->DCT_h_scaled_size, + (int) (*out_row_group_ctr * numrows), + (int) (out_row_groups_avail * numrows)); } *out_row_group_ctr = out_row_groups_avail; break; /* can exit outer loop without test */ @@ -288,7 +290,8 @@ create_context_buffer (j_compress_ptr cinfo) */ true_buffer = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * + (JDIMENSION) (((long) compptr->width_in_blocks * + cinfo->min_DCT_h_scaled_size * cinfo->max_h_samp_factor) / compptr->h_samp_factor), (JDIMENSION) (3 * rgroup_height)); /* Copy true buffer row pointers into the middle of the fake row array */ @@ -346,7 +349,8 @@ 
jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer) ci++, compptr++) { prep->color_buf[ci] = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * + (JDIMENSION) (((long) compptr->width_in_blocks * + cinfo->min_DCT_h_scaled_size * cinfo->max_h_samp_factor) / compptr->h_samp_factor), (JDIMENSION) cinfo->max_v_samp_factor); } diff --git a/src/libjpeg/jcsample.c b/src/libjpeg/jcsample.c index 212ec87..4d36f85 100644 --- a/src/libjpeg/jcsample.c +++ b/src/libjpeg/jcsample.c @@ -62,6 +62,15 @@ typedef struct { /* Downsampling method pointers, one per component */ downsample1_ptr methods[MAX_COMPONENTS]; + + /* Height of an output row group for each component. */ + int rowgroup_height[MAX_COMPONENTS]; + + /* These arrays save pixel expansion factors so that int_downsample need not + * recompute them each time. They are unused for other downsampling methods. + */ + UINT8 h_expand[MAX_COMPONENTS]; + UINT8 v_expand[MAX_COMPONENTS]; } my_downsampler; typedef my_downsampler * my_downsample_ptr; @@ -123,7 +132,8 @@ sep_downsample (j_compress_ptr cinfo, for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { in_ptr = input_buf[ci] + in_row_index; - out_ptr = output_buf[ci] + (out_row_group_index * compptr->v_samp_factor); + out_ptr = output_buf[ci] + + (out_row_group_index * downsample->rowgroup_height[ci]); (*downsample->methods[ci]) (cinfo, compptr, in_ptr, out_ptr); } } @@ -140,14 +150,15 @@ METHODDEF(void) int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample; int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v; JDIMENSION outcol, outcol_h; /* outcol_h == outcol*h_expand */ - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; JSAMPROW inptr, outptr; INT32 outvalue; - h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor; - v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor; + h_expand = downsample->h_expand[compptr->component_index]; + v_expand = downsample->v_expand[compptr->component_index]; numpix = h_expand * v_expand; numpix2 = numpix/2; @@ -158,8 +169,8 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width, output_cols * h_expand); - inrow = 0; - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + inrow = outrow = 0; + while (inrow < cinfo->max_v_samp_factor) { outptr = output_data[outrow]; for (outcol = 0, outcol_h = 0; outcol < output_cols; outcol++, outcol_h += h_expand) { @@ -173,6 +184,7 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix); } inrow += v_expand; + outrow++; } } @@ -191,8 +203,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, jcopy_sample_rows(input_data, 0, output_data, 0, cinfo->max_v_samp_factor, cinfo->image_width); /* Edge-expand */ - expand_right_edge(output_data, cinfo->max_v_samp_factor, - cinfo->image_width, compptr->width_in_blocks * DCTSIZE); + expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width, + compptr->width_in_blocks * compptr->DCT_h_scaled_size); } @@ -212,9 +224,9 @@ METHODDEF(void) h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, 
JSAMPARRAY input_data, JSAMPARRAY output_data) { - int outrow; + int inrow; JDIMENSION outcol; - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; register JSAMPROW inptr, outptr; register int bias; @@ -225,9 +237,9 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width, output_cols * 2); - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { - outptr = output_data[outrow]; - inptr = input_data[outrow]; + for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { + outptr = output_data[inrow]; + inptr = input_data[inrow]; bias = 0; /* bias = 0,1,0,1,... for successive samples */ for (outcol = 0; outcol < output_cols; outcol++) { *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) @@ -251,7 +263,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, { int inrow, outrow; JDIMENSION outcol; - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; register JSAMPROW inptr0, inptr1, outptr; register int bias; @@ -262,8 +274,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width, output_cols * 2); - inrow = 0; - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + inrow = outrow = 0; + while (inrow < cinfo->max_v_samp_factor) { outptr = output_data[outrow]; inptr0 = input_data[inrow]; inptr1 = input_data[inrow+1]; @@ -276,6 +288,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, inptr0 += 2; inptr1 += 2; } inrow += 2; + outrow++; } } @@ -294,7 +307,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, { int inrow, outrow; JDIMENSION colctr; - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr; INT32 membersum, neighsum, memberscale, neighscale; @@ -321,8 +334,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, memberscale = 16384 - cinfo->smoothing_factor * 80; /* scaled (1-5*SF)/4 */ neighscale = cinfo->smoothing_factor * 16; /* scaled SF/4 */ - inrow = 0; - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + inrow = outrow = 0; + while (inrow < cinfo->max_v_samp_factor) { outptr = output_data[outrow]; inptr0 = input_data[inrow]; inptr1 = input_data[inrow+1]; @@ -378,6 +391,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, *outptr = (JSAMPLE) ((membersum + 32768) >> 16); inrow += 2; + outrow++; } } @@ -392,9 +406,9 @@ METHODDEF(void) fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { - int outrow; + int inrow; JDIMENSION colctr; - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; register JSAMPROW inptr, above_ptr, below_ptr, outptr; INT32 membersum, neighsum, memberscale, neighscale; int colsum, lastcolsum, nextcolsum; @@ -415,11 +429,11 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, memberscale = 65536L - cinfo->smoothing_factor * 512L; /* scaled 1-8*SF */ neighscale = cinfo->smoothing_factor * 64; 
/* scaled SF */ - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { - outptr = output_data[outrow]; - inptr = input_data[outrow]; - above_ptr = input_data[outrow-1]; - below_ptr = input_data[outrow+1]; + for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { + outptr = output_data[inrow]; + inptr = input_data[inrow]; + above_ptr = input_data[inrow-1]; + below_ptr = input_data[inrow+1]; /* Special case for first column */ colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) + @@ -467,6 +481,7 @@ jinit_downsampler (j_compress_ptr cinfo) int ci; jpeg_component_info * compptr; boolean smoothok = TRUE; + int h_in_group, v_in_group, h_out_group, v_out_group; downsample = (my_downsample_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, @@ -482,8 +497,17 @@ jinit_downsampler (j_compress_ptr cinfo) /* Verify we can handle the sampling factors, and set up method pointers */ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - if (compptr->h_samp_factor == cinfo->max_h_samp_factor && - compptr->v_samp_factor == cinfo->max_v_samp_factor) { + /* Compute size of an "output group" for DCT scaling. This many samples + * are to be converted from max_h_samp_factor * max_v_samp_factor pixels. + */ + h_out_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) / + cinfo->min_DCT_h_scaled_size; + v_out_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; + h_in_group = cinfo->max_h_samp_factor; + v_in_group = cinfo->max_v_samp_factor; + downsample->rowgroup_height[ci] = v_out_group; /* save for use later */ + if (h_in_group == h_out_group && v_in_group == v_out_group) { #ifdef INPUT_SMOOTHING_SUPPORTED if (cinfo->smoothing_factor) { downsample->methods[ci] = fullsize_smooth_downsample; @@ -491,12 +515,12 @@ jinit_downsampler (j_compress_ptr cinfo) } else #endif downsample->methods[ci] = fullsize_downsample; - } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor && - compptr->v_samp_factor == cinfo->max_v_samp_factor) { + } else if (h_in_group == h_out_group * 2 && + v_in_group == v_out_group) { smoothok = FALSE; downsample->methods[ci] = h2v1_downsample; - } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor && - compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) { + } else if (h_in_group == h_out_group * 2 && + v_in_group == v_out_group * 2) { #ifdef INPUT_SMOOTHING_SUPPORTED if (cinfo->smoothing_factor) { downsample->methods[ci] = h2v2_smooth_downsample; @@ -504,10 +528,12 @@ jinit_downsampler (j_compress_ptr cinfo) } else #endif downsample->methods[ci] = h2v2_downsample; - } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 && - (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) { + } else if ((h_in_group % h_out_group) == 0 && + (v_in_group % v_out_group) == 0) { smoothok = FALSE; downsample->methods[ci] = int_downsample; + downsample->h_expand[ci] = (UINT8) (h_in_group / h_out_group); + downsample->v_expand[ci] = (UINT8) (v_in_group / v_out_group); } else ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL); } diff --git a/src/libjpeg/jctrans.c b/src/libjpeg/jctrans.c index 0e6d707..74bb12c 100644 --- a/src/libjpeg/jctrans.c +++ b/src/libjpeg/jctrans.c @@ -167,16 +167,9 @@ transencode_master_selection (j_compress_ptr cinfo, /* Entropy encoding: either Huffman or arithmetic coding. 
*/ if (cinfo->arith_code) { - ERREXIT(cinfo, JERR_ARITH_NOTIMPL); + jinit_arith_encoder(cinfo); } else { - if (cinfo->progressive_mode) { -#ifdef C_PROGRESSIVE_SUPPORTED - jinit_phuff_encoder(cinfo); -#else - ERREXIT(cinfo, JERR_NOT_COMPILED); -#endif - } else - jinit_huff_encoder(cinfo); + jinit_huff_encoder(cinfo); } /* We need a special coefficient buffer controller. */ diff --git a/src/libjpeg/jdapimin.c b/src/libjpeg/jdapimin.c index cadb59f..a55498f 100644 --- a/src/libjpeg/jdapimin.c +++ b/src/libjpeg/jdapimin.c @@ -2,6 +2,7 @@ * jdapimin.c * * Copyright (C) 1994-1998, Thomas G. Lane. + * Modified 2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -185,8 +186,8 @@ default_decompress_parms (j_decompress_ptr cinfo) } /* Set defaults for other decompression parameters. */ - cinfo->scale_num = 1; /* 1:1 scaling */ - cinfo->scale_denom = 1; + cinfo->scale_num = DCTSIZE; /* 1:1 scaling */ + cinfo->scale_denom = DCTSIZE; cinfo->output_gamma = 1.0; cinfo->buffered_image = FALSE; cinfo->raw_data_out = FALSE; diff --git a/src/libjpeg/jdapistd.c b/src/libjpeg/jdapistd.c index c8e3fa0..9d74537 100644 --- a/src/libjpeg/jdapistd.c +++ b/src/libjpeg/jdapistd.c @@ -202,7 +202,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data, } /* Verify that at least one iMCU row can be returned. */ - lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size; + lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_v_scaled_size; if (max_lines < lines_per_iMCU_row) ERREXIT(cinfo, JERR_BUFFER_SIZE); diff --git a/src/libjpeg/jdcoefct.c b/src/libjpeg/jdcoefct.c index 4938d20..462e92c 100644 --- a/src/libjpeg/jdcoefct.c +++ b/src/libjpeg/jdcoefct.c @@ -187,7 +187,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) useful_width = (MCU_col_num < last_MCU_col) ? 
compptr->MCU_width : compptr->last_col_width; output_ptr = output_buf[compptr->component_index] + - yoffset * compptr->DCT_scaled_size; + yoffset * compptr->DCT_v_scaled_size; start_col = MCU_col_num * compptr->MCU_sample_width; for (yindex = 0; yindex < compptr->MCU_height; yindex++) { if (cinfo->input_iMCU_row < last_iMCU_row || @@ -197,11 +197,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) coef->MCU_buffer[blkn+xindex], output_ptr, output_col); - output_col += compptr->DCT_scaled_size; + output_col += compptr->DCT_h_scaled_size; } } blkn += compptr->MCU_width; - output_ptr += compptr->DCT_scaled_size; + output_ptr += compptr->DCT_v_scaled_size; } } } @@ -362,9 +362,9 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr, output_ptr, output_col); buffer_ptr++; - output_col += compptr->DCT_scaled_size; + output_col += compptr->DCT_h_scaled_size; } - output_ptr += compptr->DCT_scaled_size; + output_ptr += compptr->DCT_v_scaled_size; } } @@ -654,9 +654,9 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) DC4 = DC5; DC5 = DC6; DC7 = DC8; DC8 = DC9; buffer_ptr++, prev_block_row++, next_block_row++; - output_col += compptr->DCT_scaled_size; + output_col += compptr->DCT_h_scaled_size; } - output_ptr += compptr->DCT_scaled_size; + output_ptr += compptr->DCT_v_scaled_size; } } diff --git a/src/libjpeg/jdct.h b/src/libjpeg/jdct.h index 04192a2..360dec8 100644 --- a/src/libjpeg/jdct.h +++ b/src/libjpeg/jdct.h @@ -14,11 +14,16 @@ /* - * A forward DCT routine is given a pointer to a work area of type DCTELEM[]; - * the DCT is to be performed in-place in that buffer. Type DCTELEM is int - * for 8-bit samples, INT32 for 12-bit samples. (NOTE: Floating-point DCT - * implementations use an array of type FAST_FLOAT, instead.) - * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE). + * A forward DCT routine is given a pointer to an input sample array and + * a pointer to a work area of type DCTELEM[]; the DCT is to be performed + * in-place in that buffer. Type DCTELEM is int for 8-bit samples, INT32 + * for 12-bit samples. (NOTE: Floating-point DCT implementations use an + * array of type FAST_FLOAT, instead.) + * The input data is to be fetched from the sample array starting at a + * specified column. (Any row offset needed will be applied to the array + * pointer before it is passed to the FDCT code.) + * Note that the number of samples fetched by the FDCT routine is + * DCT_h_scaled_size * DCT_v_scaled_size. * The DCT outputs are returned scaled up by a factor of 8; they therefore * have a range of +-8K for 8-bit data, +-128K for 12-bit data. This * convention improves accuracy in integer implementations and saves some @@ -32,8 +37,12 @@ typedef int DCTELEM; /* 16 or 32 bits is fine */ typedef INT32 DCTELEM; /* must have 32 bits */ #endif -typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data)); -typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data)); +typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data, + JSAMPARRAY sample_data, + JDIMENSION start_col)); +typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data, + JSAMPARRAY sample_data, + JDIMENSION start_col)); /* @@ -44,7 +53,7 @@ typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data)); * sample array starting at a specified column. (Any row offset needed will * be applied to the array pointer before it is passed to the IDCT code.) 
* Note that the number of samples emitted by the IDCT routine is - * DCT_scaled_size * DCT_scaled_size. + * DCT_h_scaled_size * DCT_v_scaled_size. */ /* typedef inverse_DCT_method_ptr is declared in jpegint.h */ @@ -84,19 +93,143 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ #define jpeg_fdct_islow jFDislow #define jpeg_fdct_ifast jFDifast #define jpeg_fdct_float jFDfloat +#define jpeg_fdct_7x7 jFD7x7 +#define jpeg_fdct_6x6 jFD6x6 +#define jpeg_fdct_5x5 jFD5x5 +#define jpeg_fdct_4x4 jFD4x4 +#define jpeg_fdct_3x3 jFD3x3 +#define jpeg_fdct_2x2 jFD2x2 +#define jpeg_fdct_1x1 jFD1x1 +#define jpeg_fdct_9x9 jFD9x9 +#define jpeg_fdct_10x10 jFD10x10 +#define jpeg_fdct_11x11 jFD11x11 +#define jpeg_fdct_12x12 jFD12x12 +#define jpeg_fdct_13x13 jFD13x13 +#define jpeg_fdct_14x14 jFD14x14 +#define jpeg_fdct_15x15 jFD15x15 +#define jpeg_fdct_16x16 jFD16x16 +#define jpeg_fdct_16x8 jFD16x8 +#define jpeg_fdct_14x7 jFD14x7 +#define jpeg_fdct_12x6 jFD12x6 +#define jpeg_fdct_10x5 jFD10x5 +#define jpeg_fdct_8x4 jFD8x4 +#define jpeg_fdct_6x3 jFD6x3 +#define jpeg_fdct_4x2 jFD4x2 +#define jpeg_fdct_2x1 jFD2x1 +#define jpeg_fdct_8x16 jFD8x16 +#define jpeg_fdct_7x14 jFD7x14 +#define jpeg_fdct_6x12 jFD6x12 +#define jpeg_fdct_5x10 jFD5x10 +#define jpeg_fdct_4x8 jFD4x8 +#define jpeg_fdct_3x6 jFD3x6 +#define jpeg_fdct_2x4 jFD2x4 +#define jpeg_fdct_1x2 jFD1x2 #define jpeg_idct_islow jRDislow #define jpeg_idct_ifast jRDifast #define jpeg_idct_float jRDfloat +#define jpeg_idct_7x7 jRD7x7 +#define jpeg_idct_6x6 jRD6x6 +#define jpeg_idct_5x5 jRD5x5 #define jpeg_idct_4x4 jRD4x4 +#define jpeg_idct_3x3 jRD3x3 #define jpeg_idct_2x2 jRD2x2 #define jpeg_idct_1x1 jRD1x1 +#define jpeg_idct_9x9 jRD9x9 +#define jpeg_idct_10x10 jRD10x10 +#define jpeg_idct_11x11 jRD11x11 +#define jpeg_idct_12x12 jRD12x12 +#define jpeg_idct_13x13 jRD13x13 +#define jpeg_idct_14x14 jRD14x14 +#define jpeg_idct_15x15 jRD15x15 +#define jpeg_idct_16x16 jRD16x16 +#define jpeg_idct_16x8 jRD16x8 +#define jpeg_idct_14x7 jRD14x7 +#define jpeg_idct_12x6 jRD12x6 +#define jpeg_idct_10x5 jRD10x5 +#define jpeg_idct_8x4 jRD8x4 +#define jpeg_idct_6x3 jRD6x3 +#define jpeg_idct_4x2 jRD4x2 +#define jpeg_idct_2x1 jRD2x1 +#define jpeg_idct_8x16 jRD8x16 +#define jpeg_idct_7x14 jRD7x14 +#define jpeg_idct_6x12 jRD6x12 +#define jpeg_idct_5x10 jRD5x10 +#define jpeg_idct_4x8 jRD4x8 +#define jpeg_idct_3x6 jRD3x8 +#define jpeg_idct_2x4 jRD2x4 +#define jpeg_idct_1x2 jRD1x2 #endif /* NEED_SHORT_EXTERNAL_NAMES */ /* Extern declarations for the forward and inverse DCT routines. 
*/ -EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data)); -EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data)); -EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data)); +EXTERN(void) jpeg_fdct_islow + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_ifast + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_float + JPP((FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_7x7 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_6x6 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_5x5 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_4x4 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_3x3 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_2x2 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_1x1 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_9x9 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_10x10 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_11x11 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_12x12 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_13x13 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_14x14 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_15x15 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_16x16 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_16x8 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_14x7 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_12x6 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_10x5 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_8x4 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_6x3 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_4x2 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_2x1 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_8x16 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_7x14 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_6x12 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_5x10 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_4x8 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_3x6 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_2x4 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_1x2 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION 
start_col)); EXTERN(void) jpeg_idct_islow JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, @@ -107,15 +240,99 @@ EXTERN(void) jpeg_idct_ifast EXTERN(void) jpeg_idct_float JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_7x7 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_6x6 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_5x5 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_4x4 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_3x3 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_2x2 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_1x1 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_9x9 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_10x10 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_11x11 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_12x12 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_13x13 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_14x14 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_15x15 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_16x16 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_16x8 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_14x7 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_12x6 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_10x5 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_8x4 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) 
jpeg_idct_6x3 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_4x2 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_2x1 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_8x16 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_7x14 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_6x12 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_5x10 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_4x8 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_3x6 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_2x4 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_1x2 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); /* diff --git a/src/libjpeg/jddctmgr.c b/src/libjpeg/jddctmgr.c index bbf8d0e..bdbde34 100644 --- a/src/libjpeg/jddctmgr.c +++ b/src/libjpeg/jddctmgr.c @@ -98,22 +98,134 @@ start_pass (j_decompress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { /* Select the proper IDCT routine for this component's scaling */ - switch (compptr->DCT_scaled_size) { + switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) { #ifdef IDCT_SCALING_SUPPORTED - case 1: + case ((1 << 8) + 1): method_ptr = jpeg_idct_1x1; - method = JDCT_ISLOW; /* jidctred uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; - case 2: + case ((2 << 8) + 2): method_ptr = jpeg_idct_2x2; - method = JDCT_ISLOW; /* jidctred uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; - case 4: + case ((3 << 8) + 3): + method_ptr = jpeg_idct_3x3; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((4 << 8) + 4): method_ptr = jpeg_idct_4x4; - method = JDCT_ISLOW; /* jidctred uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((5 << 8) + 5): + method_ptr = jpeg_idct_5x5; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((6 << 8) + 6): + method_ptr = jpeg_idct_6x6; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((7 << 8) + 7): + method_ptr = jpeg_idct_7x7; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((9 << 8) + 9): + method_ptr = jpeg_idct_9x9; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((10 << 8) + 10): + method_ptr = jpeg_idct_10x10; + method = JDCT_ISLOW; /* jidctint uses 
islow-style table */ + break; + case ((11 << 8) + 11): + method_ptr = jpeg_idct_11x11; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((12 << 8) + 12): + method_ptr = jpeg_idct_12x12; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((13 << 8) + 13): + method_ptr = jpeg_idct_13x13; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((14 << 8) + 14): + method_ptr = jpeg_idct_14x14; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((15 << 8) + 15): + method_ptr = jpeg_idct_15x15; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((16 << 8) + 16): + method_ptr = jpeg_idct_16x16; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((16 << 8) + 8): + method_ptr = jpeg_idct_16x8; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((14 << 8) + 7): + method_ptr = jpeg_idct_14x7; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((12 << 8) + 6): + method_ptr = jpeg_idct_12x6; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((10 << 8) + 5): + method_ptr = jpeg_idct_10x5; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((8 << 8) + 4): + method_ptr = jpeg_idct_8x4; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((6 << 8) + 3): + method_ptr = jpeg_idct_6x3; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((4 << 8) + 2): + method_ptr = jpeg_idct_4x2; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((2 << 8) + 1): + method_ptr = jpeg_idct_2x1; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((8 << 8) + 16): + method_ptr = jpeg_idct_8x16; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((7 << 8) + 14): + method_ptr = jpeg_idct_7x14; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((6 << 8) + 12): + method_ptr = jpeg_idct_6x12; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((5 << 8) + 10): + method_ptr = jpeg_idct_5x10; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((4 << 8) + 8): + method_ptr = jpeg_idct_4x8; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((3 << 8) + 6): + method_ptr = jpeg_idct_3x6; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((2 << 8) + 4): + method_ptr = jpeg_idct_2x4; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case ((1 << 8) + 2): + method_ptr = jpeg_idct_1x2; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; #endif - case DCTSIZE: + case ((DCTSIZE << 8) + DCTSIZE): switch (cinfo->dct_method) { #ifdef DCT_ISLOW_SUPPORTED case JDCT_ISLOW: @@ -139,7 +251,8 @@ start_pass (j_decompress_ptr cinfo) } break; default: - ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->DCT_scaled_size); + ERREXIT2(cinfo, JERR_BAD_DCTSIZE, + compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size); break; } idct->pub.inverse_DCT[ci] = method_ptr; diff --git a/src/libjpeg/jdhuff.c b/src/libjpeg/jdhuff.c index b960874..a24181b 100644 --- a/src/libjpeg/jdhuff.c +++ b/src/libjpeg/jdhuff.c @@ -2,10 +2,12 @@ * jdhuff.c * * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 2006-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. 
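A note on the jddctmgr.c hunk above: the old switch on the single DCT_scaled_size value is replaced by a switch keyed on (DCT_h_scaled_size << 8) + DCT_v_scaled_size, so one integer selects among the square and the new rectangular IDCT sizes. A minimal sketch of the same packing trick (select_idct_name is a hypothetical stand-in for the method_ptr assignments, not libjpeg code):

#include <stdio.h>

/* Pack a horizontal/vertical scaled-size pair into one switch key,
 * mirroring the (h << 8) + v expression used in start_pass() above.
 */
static const char *select_idct_name(int h_size, int v_size)
{
  switch ((h_size << 8) + v_size) {
  case ((8 << 8) + 8):   return "full 8x8 (islow/ifast/float per dct_method)";
  case ((4 << 8) + 4):   return "jpeg_idct_4x4";
  case ((16 << 8) + 8):  return "jpeg_idct_16x8";   /* rectangular case */
  case ((1 << 8) + 2):   return "jpeg_idct_1x2";
  default:               return "unsupported -> JERR_BAD_DCTSIZE";
  }
}

int main(void)
{
  printf("%s\n", select_idct_name(8, 8));
  printf("%s\n", select_idct_name(16, 8));
  printf("%s\n", select_idct_name(5, 7));   /* no such routine: error path */
  return 0;
}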
* For conditions of distribution and use, see the accompanying README file. * * This file contains Huffman entropy decoding routines. + * Both sequential and progressive modes are supported in this single module. * * Much of the complexity here has to do with supporting input suspension. * If the data source module demands suspension, we want to be able to back @@ -17,7 +19,173 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdhuff.h" /* Declarations shared with jdphuff.c */ + + +/* Derived data constructed for each Huffman table */ + +#define HUFF_LOOKAHEAD 8 /* # of bits of lookahead */ + +typedef struct { + /* Basic tables: (element [0] of each array is unused) */ + INT32 maxcode[18]; /* largest code of length k (-1 if none) */ + /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */ + INT32 valoffset[17]; /* huffval[] offset for codes of length k */ + /* valoffset[k] = huffval[] index of 1st symbol of code length k, less + * the smallest code of length k; so given a code of length k, the + * corresponding symbol is huffval[code + valoffset[k]] + */ + + /* Link to public Huffman table (needed only in jpeg_huff_decode) */ + JHUFF_TBL *pub; + + /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of + * the input data stream. If the next Huffman code is no more + * than HUFF_LOOKAHEAD bits long, we can obtain its length and + * the corresponding symbol directly from these tables. + */ + int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */ + UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */ +} d_derived_tbl; + + +/* + * Fetching the next N bits from the input stream is a time-critical operation + * for the Huffman decoders. We implement it with a combination of inline + * macros and out-of-line subroutines. Note that N (the number of bits + * demanded at one time) never exceeds 15 for JPEG use. + * + * We read source bytes into get_buffer and dole out bits as needed. + * If get_buffer already contains enough bits, they are fetched in-line + * by the macros CHECK_BIT_BUFFER and GET_BITS. When there aren't enough + * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer + * as full as possible (not just to the number of bits needed; this + * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer). + * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension. + * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains + * at least the requested number of bits --- dummy zeroes are inserted if + * necessary. + */ + +typedef INT32 bit_buf_type; /* type of bit-extraction buffer */ +#define BIT_BUF_SIZE 32 /* size of buffer in bits */ + +/* If long is > 32 bits on your machine, and shifting/masking longs is + * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE + * appropriately should be a win. Unfortunately we can't define the size + * with something like #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8) + * because not all machines measure sizeof in 8-bit bytes. 
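The comment above describes how the decoder accumulates source bytes in get_buffer and hands them out a few bits at a time. A stripped-down sketch of that mechanism (toy_* names are hypothetical; JPEG marker stuffing and suspension handling are ignored):

#include <stdio.h>

/* Toy bit buffer: fill from a byte array, then peel off n bits at a time,
 * most significant bits first, the way GET_BITS() consumes get_buffer.
 */
typedef struct {
  const unsigned char *next_byte;
  size_t bytes_left;
  long get_buffer;   /* holds up to 32 bits */
  int bits_left;
} toy_bitreader;

static int toy_fill(toy_bitreader *br, int nbits)
{
  while (br->bits_left < nbits) {
    if (br->bytes_left == 0) return 0;           /* libjpeg would suspend or pad */
    br->get_buffer = (br->get_buffer << 8) | *br->next_byte++;
    br->bytes_left--;
    br->bits_left += 8;
  }
  return 1;
}

static int toy_get_bits(toy_bitreader *br, int nbits)
{
  /* same shape as GET_BITS: shift down, mask off nbits */
  return (int) (br->get_buffer >> (br->bits_left -= nbits)) & ((1 << nbits) - 1);
}

int main(void)
{
  static const unsigned char data[] = { 0xA5, 0x3C };  /* 1010 0101 0011 1100 */
  toy_bitreader br = { data, sizeof data, 0, 0 };
  if (toy_fill(&br, 3)) printf("first 3 bits: %d\n", toy_get_bits(&br, 3)); /* 5  */
  if (toy_fill(&br, 5)) printf("next 5 bits:  %d\n", toy_get_bits(&br, 5)); /* 5  */
  if (toy_fill(&br, 8)) printf("next 8 bits:  %d\n", toy_get_bits(&br, 8)); /* 60 */
  return 0;
}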
+ */ + +typedef struct { /* Bitreading state saved across MCUs */ + bit_buf_type get_buffer; /* current bit-extraction buffer */ + int bits_left; /* # of unused bits in it */ +} bitread_perm_state; + +typedef struct { /* Bitreading working state within an MCU */ + /* Current data source location */ + /* We need a copy, rather than munging the original, in case of suspension */ + const JOCTET * next_input_byte; /* => next byte to read from source */ + size_t bytes_in_buffer; /* # of bytes remaining in source buffer */ + /* Bit input buffer --- note these values are kept in register variables, + * not in this struct, inside the inner loops. + */ + bit_buf_type get_buffer; /* current bit-extraction buffer */ + int bits_left; /* # of unused bits in it */ + /* Pointer needed by jpeg_fill_bit_buffer. */ + j_decompress_ptr cinfo; /* back link to decompress master record */ +} bitread_working_state; + +/* Macros to declare and load/save bitread local variables. */ +#define BITREAD_STATE_VARS \ + register bit_buf_type get_buffer; \ + register int bits_left; \ + bitread_working_state br_state + +#define BITREAD_LOAD_STATE(cinfop,permstate) \ + br_state.cinfo = cinfop; \ + br_state.next_input_byte = cinfop->src->next_input_byte; \ + br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \ + get_buffer = permstate.get_buffer; \ + bits_left = permstate.bits_left; + +#define BITREAD_SAVE_STATE(cinfop,permstate) \ + cinfop->src->next_input_byte = br_state.next_input_byte; \ + cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \ + permstate.get_buffer = get_buffer; \ + permstate.bits_left = bits_left + +/* + * These macros provide the in-line portion of bit fetching. + * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer + * before using GET_BITS, PEEK_BITS, or DROP_BITS. + * The variables get_buffer and bits_left are assumed to be locals, + * but the state struct might not be (jpeg_huff_decode needs this). + * CHECK_BIT_BUFFER(state,n,action); + * Ensure there are N bits in get_buffer; if suspend, take action. + * val = GET_BITS(n); + * Fetch next N bits. + * val = PEEK_BITS(n); + * Fetch next N bits without removing them from the buffer. + * DROP_BITS(n); + * Discard next N bits. + * The value N should be a simple variable, not an expression, because it + * is evaluated multiple times. + */ + +#define CHECK_BIT_BUFFER(state,nbits,action) \ + { if (bits_left < (nbits)) { \ + if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits)) \ + { action; } \ + get_buffer = (state).get_buffer; bits_left = (state).bits_left; } } + +#define GET_BITS(nbits) \ + (((int) (get_buffer >> (bits_left -= (nbits)))) & BIT_MASK(nbits)) + +#define PEEK_BITS(nbits) \ + (((int) (get_buffer >> (bits_left - (nbits)))) & BIT_MASK(nbits)) + +#define DROP_BITS(nbits) \ + (bits_left -= (nbits)) + + +/* + * Code for extracting next Huffman-coded symbol from input bit stream. + * Again, this is time-critical and we make the main paths be macros. + * + * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits + * without looping. Usually, more than 95% of the Huffman codes will be 8 + * or fewer bits long. The few overlength codes are handled with a loop, + * which need not be inline code. + * + * Notes about the HUFF_DECODE macro: + * 1. Near the end of the data segment, we may fail to get enough bits + * for a lookahead. In that case, we do it the hard way. + * 2. If the lookahead table contains no entry, the next code must be + * more than HUFF_LOOKAHEAD bits long. + * 3. 
jpeg_huff_decode returns -1 if forced to suspend. + */ + +#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \ +{ register int nb, look; \ + if (bits_left < HUFF_LOOKAHEAD) { \ + if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \ + get_buffer = state.get_buffer; bits_left = state.bits_left; \ + if (bits_left < HUFF_LOOKAHEAD) { \ + nb = 1; goto slowlabel; \ + } \ + } \ + look = PEEK_BITS(HUFF_LOOKAHEAD); \ + if ((nb = htbl->look_nbits[look]) != 0) { \ + DROP_BITS(nb); \ + result = htbl->look_sym[look]; \ + } else { \ + nb = HUFF_LOOKAHEAD+1; \ +slowlabel: \ + if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \ + { failaction; } \ + get_buffer = state.get_buffer; bits_left = state.bits_left; \ + } \ +} /* @@ -28,7 +196,8 @@ */ typedef struct { - int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ + unsigned int EOBRUN; /* remaining EOBs in EOBRUN */ + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ } savable_state; /* This macro is to work around compilers with missing or broken @@ -41,7 +210,8 @@ typedef struct { #else #if MAX_COMPS_IN_SCAN == 4 #define ASSIGN_STATE(dest,src) \ - ((dest).last_dc_val[0] = (src).last_dc_val[0], \ + ((dest).EOBRUN = (src).EOBRUN, \ + (dest).last_dc_val[0] = (src).last_dc_val[0], \ (dest).last_dc_val[1] = (src).last_dc_val[1], \ (dest).last_dc_val[2] = (src).last_dc_val[2], \ (dest).last_dc_val[3] = (src).last_dc_val[3]) @@ -61,6 +231,15 @@ typedef struct { /* These fields are NOT loaded into local working state. */ unsigned int restarts_to_go; /* MCUs left in this restart interval */ + /* Following two fields used only in progressive mode */ + + /* Pointers to derived tables (these workspaces have image lifespan) */ + d_derived_tbl * derived_tbls[NUM_HUFF_TBLS]; + + d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */ + + /* Following fields used only in sequential mode */ + /* Pointers to derived tables (these workspaces have image lifespan) */ d_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS]; d_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS]; @@ -71,81 +250,30 @@ typedef struct { d_derived_tbl * dc_cur_tbls[D_MAX_BLOCKS_IN_MCU]; d_derived_tbl * ac_cur_tbls[D_MAX_BLOCKS_IN_MCU]; /* Whether we care about the DC and AC coefficient values for each block */ - boolean dc_needed[D_MAX_BLOCKS_IN_MCU]; - boolean ac_needed[D_MAX_BLOCKS_IN_MCU]; + int coef_limit[D_MAX_BLOCKS_IN_MCU]; } huff_entropy_decoder; typedef huff_entropy_decoder * huff_entropy_ptr; -/* - * Initialize for a Huffman-compressed scan. - */ - -METHODDEF(void) -start_pass_huff_decoder (j_decompress_ptr cinfo) -{ - huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; - int ci, blkn, dctbl, actbl; - jpeg_component_info * compptr; - - /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG. - * This ought to be an error condition, but we make it a warning because - * there are some baseline files out there with all zeroes in these bytes. 
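HUFF_DECODE above resolves any code of at most HUFF_LOOKAHEAD (8) bits with a single probe into look_nbits[]/look_sym[]; only longer codes fall through to jpeg_huff_decode. A toy version of that lookahead idea for a three-symbol prefix code (illustrative only, not the jpeg_make_d_derived_tbl construction):

#include <stdio.h>

/* Toy lookahead decoder for the prefix code 'A' = 0, 'B' = 10, 'C' = 11.
 * With LOOKAHEAD = 3 bits, every possible 3-bit prefix maps directly to
 * (code length, symbol), so decoding is one table probe plus a shift --
 * the same idea as look_nbits[]/look_sym[] in the hunk above.
 */
#define LOOKAHEAD 3

static int  look_nbits[1 << LOOKAHEAD];
static char look_sym[1 << LOOKAHEAD];

static void build_table(void)
{
  int i;
  for (i = 0; i < (1 << LOOKAHEAD); i++) {
    if ((i >> 2) == 0)      { look_nbits[i] = 1; look_sym[i] = 'A'; } /* 0xx */
    else if ((i >> 1) == 2) { look_nbits[i] = 2; look_sym[i] = 'B'; } /* 10x */
    else                    { look_nbits[i] = 2; look_sym[i] = 'C'; } /* 11x */
  }
}

int main(void)
{
  /* bitstream 0 10 11 0, padded to 8 bits: 01011000 = 0x58 */
  unsigned int buf = 0x58;
  int bits_left = 8;
  build_table();
  while (bits_left >= LOOKAHEAD) {
    int look = (buf >> (bits_left - LOOKAHEAD)) & ((1 << LOOKAHEAD) - 1);
    int nb = look_nbits[look];
    printf("%c", look_sym[look]);
    bits_left -= nb;              /* DROP_BITS(nb) */
  }
  printf("\n");                   /* prints "ABCA" */
  return 0;
}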
- */ - if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 || - cinfo->Ah != 0 || cinfo->Al != 0) - WARNMS(cinfo, JWRN_NOT_SEQUENTIAL); - - for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - compptr = cinfo->cur_comp_info[ci]; - dctbl = compptr->dc_tbl_no; - actbl = compptr->ac_tbl_no; - /* Compute derived values for Huffman tables */ - /* We may do this more than once for a table, but it's not expensive */ - jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl, - & entropy->dc_derived_tbls[dctbl]); - jpeg_make_d_derived_tbl(cinfo, FALSE, actbl, - & entropy->ac_derived_tbls[actbl]); - /* Initialize DC predictions to 0 */ - entropy->saved.last_dc_val[ci] = 0; - } - - /* Precalculate decoding info for each block in an MCU of this scan */ - for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { - ci = cinfo->MCU_membership[blkn]; - compptr = cinfo->cur_comp_info[ci]; - /* Precalculate which table to use for each block */ - entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no]; - entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no]; - /* Decide whether we really care about the coefficient values */ - if (compptr->component_needed) { - entropy->dc_needed[blkn] = TRUE; - /* we don't need the ACs if producing a 1/8th-size image */ - entropy->ac_needed[blkn] = (compptr->DCT_scaled_size > 1); - } else { - entropy->dc_needed[blkn] = entropy->ac_needed[blkn] = FALSE; - } - } - - /* Initialize bitread state variables */ - entropy->bitstate.bits_left = 0; - entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */ - entropy->pub.insufficient_data = FALSE; - - /* Initialize restart counter */ - entropy->restarts_to_go = cinfo->restart_interval; -} +static const int jpeg_zigzag_order[8][8] = { + { 0, 1, 5, 6, 14, 15, 27, 28 }, + { 2, 4, 7, 13, 16, 26, 29, 42 }, + { 3, 8, 12, 17, 25, 30, 41, 43 }, + { 9, 11, 18, 24, 31, 40, 44, 53 }, + { 10, 19, 23, 32, 39, 45, 52, 54 }, + { 20, 22, 33, 38, 46, 51, 55, 60 }, + { 21, 34, 37, 47, 50, 56, 59, 61 }, + { 35, 36, 48, 49, 57, 58, 62, 63 } +}; /* * Compute the derived values for a Huffman table. * This routine also performs some validation checks on the table. - * - * Note this is also used by jdphuff.c. */ -GLOBAL(void) +LOCAL(void) jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, d_derived_tbl ** pdtbl) { @@ -267,8 +395,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, /* - * Out-of-line code for bit fetching (shared with jdphuff.c). - * See jdhuff.h for info about usage. + * Out-of-line code for bit fetching. * Note: current values of get_buffer and bits_left are passed as parameters, * but are returned in the corresponding fields of the state struct. * @@ -288,7 +415,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, #endif -GLOBAL(boolean) +LOCAL(boolean) jpeg_fill_bit_buffer (bitread_working_state * state, register bit_buf_type get_buffer, register int bits_left, int nbits) @@ -390,11 +517,32 @@ jpeg_fill_bit_buffer (bitread_working_state * state, /* + * Figure F.12: extend sign bit. + * On some machines, a shift and sub will be faster than a table lookup. + */ + +#ifdef AVOID_TABLES + +#define BIT_MASK(nbits) ((1<<(nbits))-1) +#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) - ((1<<(s))-1) : (x)) + +#else + +#define BIT_MASK(nbits) bmask[nbits] +#define HUFF_EXTEND(x,s) ((x) <= bmask[(s) - 1] ? 
(x) - bmask[s] : (x)) + +static const int bmask[16] = /* bmask[n] is mask for n rightmost bits */ + { 0, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, + 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF }; + +#endif /* AVOID_TABLES */ + + +/* * Out-of-line code for Huffman code decoding. - * See jdhuff.h for info about usage. */ -GLOBAL(int) +LOCAL(int) jpeg_huff_decode (bitread_working_state * state, register bit_buf_type get_buffer, register int bits_left, d_derived_tbl * htbl, int min_bits) @@ -434,32 +582,6 @@ jpeg_huff_decode (bitread_working_state * state, /* - * Figure F.12: extend sign bit. - * On some machines, a shift and add will be faster than a table lookup. - */ - -#ifdef AVOID_TABLES - -#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x)) - -#else - -#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x)) - -static const int extend_test[16] = /* entry n is 2**(n-1) */ - { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, - 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 }; - -static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */ - { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1, - ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1, - ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1, - ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 }; - -#endif /* AVOID_TABLES */ - - -/* * Check for a restart marker & resynchronize decoder. * Returns FALSE if must suspend. */ @@ -482,6 +604,8 @@ process_restart (j_decompress_ptr cinfo) /* Re-initialize DC predictions to 0 */ for (ci = 0; ci < cinfo->comps_in_scan; ci++) entropy->saved.last_dc_val[ci] = 0; + /* Re-init EOB run count, too */ + entropy->saved.EOBRUN = 0; /* Reset restart counter */ entropy->restarts_to_go = cinfo->restart_interval; @@ -499,27 +623,40 @@ process_restart (j_decompress_ptr cinfo) /* - * Decode and return one MCU's worth of Huffman-compressed coefficients. + * Huffman MCU decoding. + * Each of these routines decodes and returns one MCU's worth of + * Huffman-compressed coefficients. * The coefficients are reordered from zigzag order into natural array order, * but are not dequantized. * * The i'th block of the MCU is stored into the block pointed to by - * MCU_data[i]. WE ASSUME THIS AREA HAS BEEN ZEROED BY THE CALLER. + * MCU_data[i]. WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER. * (Wholesale zeroing is usually a little faster than retail...) * - * Returns FALSE if data source requested suspension. In that case no + * We return FALSE if data source requested suspension. In that case no * changes have been made to permanent state. (Exception: some output * coefficients may already have been assigned. This is harmless for - * this module, since we'll just re-assign them on the next call.) + * spectral selection, since we'll just re-assign them on the next call. + * Successive approximation AC refinement has to be more careful, however.) + */ + +/* + * MCU decoding for DC initial scan (either spectral selection, + * or first pass of successive approximation). 
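The new BIT_MASK/HUFF_EXTEND definitions above implement the Figure F.12 sign extension: an s-bit magnitude field whose value is at most bmask[s-1] encodes a negative coefficient. A small worked example of the shift-and-sub (AVOID_TABLES) variant:

#include <stdio.h>

/* If the top bit of the s-bit field is 0 the value is negative, namely
 * x - (2^s - 1); otherwise the value is x itself.
 */
static int huff_extend(int x, int s)
{
  int half = 1 << (s - 1);          /* 2^(s-1) */
  int full = (1 << s) - 1;          /* bmask[s] */
  return (x < half) ? x - full : x;
}

int main(void)
{
  int s = 3, x;
  for (x = 0; x < (1 << s); x++)
    printf("s=%d  bits=%d  ->  coefficient %d\n", s, x, huff_extend(x, s));
  /* prints -7 -6 -5 -4  4 5 6 7 : the two ranges of magnitude category 3 */
  return 0;
}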
*/ METHODDEF(boolean) -decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) -{ +decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; - int blkn; + int Al = cinfo->Al; + register int s, r; + int blkn, ci; + JBLOCKROW block; BITREAD_STATE_VARS; savable_state state; + d_derived_tbl * tbl; + jpeg_component_info * compptr; /* Process restart marker if needed; may have to suspend */ if (cinfo->restart_interval) { @@ -540,40 +677,391 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) /* Outer loop handles each block in the MCU */ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { - JBLOCKROW block = MCU_data[blkn]; - d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn]; - d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn]; - register int s, k, r; + block = MCU_data[blkn]; + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + tbl = entropy->derived_tbls[compptr->dc_tbl_no]; /* Decode a single block's worth of coefficients */ /* Section F.2.2.1: decode the DC coefficient difference */ - HUFF_DECODE(s, br_state, dctbl, return FALSE, label1); + HUFF_DECODE(s, br_state, tbl, return FALSE, label1); if (s) { CHECK_BIT_BUFFER(br_state, s, return FALSE); r = GET_BITS(s); s = HUFF_EXTEND(r, s); } - if (entropy->dc_needed[blkn]) { + /* Convert DC difference to actual value, update last_dc_val */ + s += state.last_dc_val[ci]; + state.last_dc_val[ci] = s; + /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */ + (*block)[0] = (JCOEF) (s << Al); + } + + /* Completed MCU, so update state */ + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + ASSIGN_STATE(entropy->saved, state); + } + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; +} + + +/* + * MCU decoding for AC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int Se = cinfo->Se; + int Al = cinfo->Al; + register int s, k, r; + unsigned int EOBRUN; + JBLOCKROW block; + BITREAD_STATE_VARS; + d_derived_tbl * tbl; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + } + + /* If we've run out of data, just leave the MCU set to zeroes. + * This way, we return uniform gray for the remainder of the segment. + */ + if (! entropy->pub.insufficient_data) { + + /* Load up working state. + * We can avoid loading/saving bitread state if in an EOB run. + */ + EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */ + + /* There is always only one block per MCU */ + + if (EOBRUN > 0) /* if it's a band of zeroes... 
*/ + EOBRUN--; /* ...process it now (we do nothing) */ + else { + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + block = MCU_data[0]; + tbl = entropy->ac_derived_tbl; + + for (k = cinfo->Ss; k <= Se; k++) { + HUFF_DECODE(s, br_state, tbl, return FALSE, label2); + r = s >> 4; + s &= 15; + if (s) { + k += r; + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + /* Scale and output coefficient in natural (dezigzagged) order */ + (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al); + } else { + if (r == 15) { /* ZRL */ + k += 15; /* skip 15 zeroes in band */ + } else { /* EOBr, run length is 2^r + appended bits */ + EOBRUN = 1 << r; + if (r) { /* EOBr, r > 0 */ + CHECK_BIT_BUFFER(br_state, r, return FALSE); + r = GET_BITS(r); + EOBRUN += r; + } + EOBRUN--; /* this band is processed at this moment */ + break; /* force end-of-band */ + } + } + } + + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + } + + /* Completed MCU, so update state */ + entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */ + } + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; +} + + +/* + * MCU decoding for DC successive approximation refinement scan. + * Note: we assume such scans can be multi-component, although the spec + * is not very clear on the point. + */ + +METHODDEF(boolean) +decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + int blkn; + JBLOCKROW block; + BITREAD_STATE_VARS; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + } + + /* Not worth the cycles to check insufficient_data here, + * since we will not change the data anyway if we read zeroes. + */ + + /* Load up working state */ + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + + /* Outer loop handles each block in the MCU */ + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + + /* Encoded data is simply the next bit of the two's-complement DC value */ + CHECK_BIT_BUFFER(br_state, 1, return FALSE); + if (GET_BITS(1)) + (*block)[0] |= p1; + /* Note: since we use |=, repeating the assignment later is safe */ + } + + /* Completed MCU, so update state */ + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; +} + + +/* + * MCU decoding for AC successive approximation refinement scan. + */ + +METHODDEF(boolean) +decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int Se = cinfo->Se; + int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + int m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */ + register int s, k, r; + unsigned int EOBRUN; + JBLOCKROW block; + JCOEFPTR thiscoef; + BITREAD_STATE_VARS; + d_derived_tbl * tbl; + int num_newnz; + int newnz_pos[DCTSIZE2]; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + } + + /* If we've run out of data, don't modify the MCU. + */ + if (! 
entropy->pub.insufficient_data) { + + /* Load up working state */ + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */ + + /* There is always only one block per MCU */ + block = MCU_data[0]; + tbl = entropy->ac_derived_tbl; + + /* If we are forced to suspend, we must undo the assignments to any newly + * nonzero coefficients in the block, because otherwise we'd get confused + * next time about which coefficients were already nonzero. + * But we need not undo addition of bits to already-nonzero coefficients; + * instead, we can test the current bit to see if we already did it. + */ + num_newnz = 0; + + /* initialize coefficient loop counter to start of band */ + k = cinfo->Ss; + + if (EOBRUN == 0) { + for (; k <= Se; k++) { + HUFF_DECODE(s, br_state, tbl, goto undoit, label3); + r = s >> 4; + s &= 15; + if (s) { + if (s != 1) /* size of new coef should always be 1 */ + WARNMS(cinfo, JWRN_HUFF_BAD_CODE); + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) + s = p1; /* newly nonzero coef is positive */ + else + s = m1; /* newly nonzero coef is negative */ + } else { + if (r != 15) { + EOBRUN = 1 << r; /* EOBr, run length is 2^r + appended bits */ + if (r) { + CHECK_BIT_BUFFER(br_state, r, goto undoit); + r = GET_BITS(r); + EOBRUN += r; + } + break; /* rest of block is handled by EOB logic */ + } + /* note s = 0 for processing ZRL */ + } + /* Advance over already-nonzero coefs and r still-zero coefs, + * appending correction bits to the nonzeroes. A correction bit is 1 + * if the absolute value of the coefficient must be increased. + */ + do { + thiscoef = *block + jpeg_natural_order[k]; + if (*thiscoef != 0) { + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) { + if ((*thiscoef & p1) == 0) { /* do nothing if already set it */ + if (*thiscoef >= 0) + *thiscoef += p1; + else + *thiscoef += m1; + } + } + } else { + if (--r < 0) + break; /* reached target zero coefficient */ + } + k++; + } while (k <= Se); + if (s) { + int pos = jpeg_natural_order[k]; + /* Output newly nonzero coefficient */ + (*block)[pos] = (JCOEF) s; + /* Remember its position in case we have to suspend */ + newnz_pos[num_newnz++] = pos; + } + } + } + + if (EOBRUN > 0) { + /* Scan any remaining coefficient positions after the end-of-band + * (the last newly nonzero coefficient, if any). Append a correction + * bit to each already-nonzero coefficient. A correction bit is 1 + * if the absolute value of the coefficient must be increased. + */ + for (; k <= Se; k++) { + thiscoef = *block + jpeg_natural_order[k]; + if (*thiscoef != 0) { + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) { + if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */ + if (*thiscoef >= 0) + *thiscoef += p1; + else + *thiscoef += m1; + } + } + } + } + /* Count one block completed in EOB run */ + EOBRUN--; + } + + /* Completed MCU, so update state */ + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */ + } + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; + +undoit: + /* Re-zero any output coefficients that we made newly nonzero */ + while (num_newnz > 0) + (*block)[newnz_pos[--num_newnz]] = 0; + + return FALSE; +} + + +/* + * Decode one MCU's worth of Huffman-compressed coefficients. 
+ */ + +METHODDEF(boolean) +decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int blkn; + BITREAD_STATE_VARS; + savable_state state; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + } + + /* If we've run out of data, just leave the MCU set to zeroes. + * This way, we return uniform gray for the remainder of the segment. + */ + if (! entropy->pub.insufficient_data) { + + /* Load up working state */ + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + ASSIGN_STATE(state, entropy->saved); + + /* Outer loop handles each block in the MCU */ + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + JBLOCKROW block = MCU_data[blkn]; + d_derived_tbl * htbl; + register int s, k, r; + int coef_limit, ci; + + /* Decode a single block's worth of coefficients */ + + /* Section F.2.2.1: decode the DC coefficient difference */ + htbl = entropy->dc_cur_tbls[blkn]; + HUFF_DECODE(s, br_state, htbl, return FALSE, label1); + + htbl = entropy->ac_cur_tbls[blkn]; + k = 1; + coef_limit = entropy->coef_limit[blkn]; + if (coef_limit) { /* Convert DC difference to actual value, update last_dc_val */ - int ci = cinfo->MCU_membership[blkn]; + if (s) { + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + } + ci = cinfo->MCU_membership[blkn]; s += state.last_dc_val[ci]; state.last_dc_val[ci] = s; - /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */ + /* Output the DC coefficient */ (*block)[0] = (JCOEF) s; - } - - if (entropy->ac_needed[blkn]) { /* Section F.2.2.2: decode the AC coefficients */ /* Since zeroes are skipped, output area must be cleared beforehand */ - for (k = 1; k < DCTSIZE2; k++) { - HUFF_DECODE(s, br_state, actbl, return FALSE, label2); - + for (; k < coef_limit; k++) { + HUFF_DECODE(s, br_state, htbl, return FALSE, label2); + r = s >> 4; s &= 15; - + if (s) { k += r; CHECK_BIT_BUFFER(br_state, s, return FALSE); @@ -586,33 +1074,37 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) (*block)[jpeg_natural_order[k]] = (JCOEF) s; } else { if (r != 15) - break; + goto EndOfBlock; k += 15; } } - } else { - - /* Section F.2.2.2: decode the AC coefficients */ - /* In this path we just discard the values */ - for (k = 1; k < DCTSIZE2; k++) { - HUFF_DECODE(s, br_state, actbl, return FALSE, label3); - - r = s >> 4; - s &= 15; - - if (s) { - k += r; - CHECK_BIT_BUFFER(br_state, s, return FALSE); - DROP_BITS(s); - } else { - if (r != 15) - break; - k += 15; - } + if (s) { + CHECK_BIT_BUFFER(br_state, s, return FALSE); + DROP_BITS(s); } + } + + /* Section F.2.2.2: decode the AC coefficients */ + /* In this path we just discard the values */ + for (; k < DCTSIZE2; k++) { + HUFF_DECODE(s, br_state, htbl, return FALSE, label3); + + r = s >> 4; + s &= 15; + if (s) { + k += r; + CHECK_BIT_BUFFER(br_state, s, return FALSE); + DROP_BITS(s); + } else { + if (r != 15) + break; + k += 15; + } } + + EndOfBlock: ; } /* Completed MCU, so update state */ @@ -628,6 +1120,156 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) /* + * Initialize for a Huffman-compressed scan. 
+ */ + +METHODDEF(void) +start_pass_huff_decoder (j_decompress_ptr cinfo) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int ci, blkn, dctbl, actbl, i; + jpeg_component_info * compptr; + + if (cinfo->progressive_mode) { + /* Validate progressive scan parameters */ + if (cinfo->Ss == 0) { + if (cinfo->Se != 0) + goto bad; + } else { + /* need not check Ss/Se < 0 since they came from unsigned bytes */ + if (cinfo->Se < cinfo->Ss || cinfo->Se >= DCTSIZE2) + goto bad; + /* AC scans may have only one component */ + if (cinfo->comps_in_scan != 1) + goto bad; + } + if (cinfo->Ah != 0) { + /* Successive approximation refinement scan: must have Al = Ah-1. */ + if (cinfo->Ah-1 != cinfo->Al) + goto bad; + } + if (cinfo->Al > 13) { /* need not check for < 0 */ + /* Arguably the maximum Al value should be less than 13 for 8-bit precision, + * but the spec doesn't say so, and we try to be liberal about what we + * accept. Note: large Al values could result in out-of-range DC + * coefficients during early scans, leading to bizarre displays due to + * overflows in the IDCT math. But we won't crash. + */ + bad: + ERREXIT4(cinfo, JERR_BAD_PROGRESSION, + cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al); + } + /* Update progression status, and verify that scan order is legal. + * Note that inter-scan inconsistencies are treated as warnings + * not fatal errors ... not clear if this is right way to behave. + */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + int coefi, cindex = cinfo->cur_comp_info[ci]->component_index; + int *coef_bit_ptr = & cinfo->coef_bits[cindex][0]; + if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */ + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0); + for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) { + int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi]; + if (cinfo->Ah != expected) + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi); + coef_bit_ptr[coefi] = cinfo->Al; + } + } + + /* Select MCU decoding routine */ + if (cinfo->Ah == 0) { + if (cinfo->Ss == 0) + entropy->pub.decode_mcu = decode_mcu_DC_first; + else + entropy->pub.decode_mcu = decode_mcu_AC_first; + } else { + if (cinfo->Ss == 0) + entropy->pub.decode_mcu = decode_mcu_DC_refine; + else + entropy->pub.decode_mcu = decode_mcu_AC_refine; + } + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* Make sure requested tables are present, and compute derived tables. + * We may build same derived table more than once, but it's not expensive. + */ + if (cinfo->Ss == 0) { + if (cinfo->Ah == 0) { /* DC refinement needs no table */ + i = compptr->dc_tbl_no; + jpeg_make_d_derived_tbl(cinfo, TRUE, i, + & entropy->derived_tbls[i]); + } + } else { + i = compptr->ac_tbl_no; + jpeg_make_d_derived_tbl(cinfo, FALSE, i, + & entropy->derived_tbls[i]); + /* remember the single active table */ + entropy->ac_derived_tbl = entropy->derived_tbls[i]; + } + /* Initialize DC predictions to 0 */ + entropy->saved.last_dc_val[ci] = 0; + } + + /* Initialize private state variables */ + entropy->saved.EOBRUN = 0; + } else { + /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG. + * This ought to be an error condition, but we make it a warning because + * there are some baseline files out there with all zeroes in these bytes. 
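The progressive branch of start_pass_huff_decoder above enforces the scan-header rules before choosing one of the four MCU routines: DC scans have Ss = Se = 0, AC scans cover 1 <= Ss <= Se <= 63 with a single component, and refinement scans must have Al = Ah - 1. A standalone sketch of those checks (the real code raises JERR_BAD_PROGRESSION instead of returning a flag):

#include <stdio.h>

static int scan_params_ok(int Ss, int Se, int Ah, int Al, int comps_in_scan)
{
  if (Ss == 0) {
    if (Se != 0) return 0;                    /* DC scan must have Se = 0 */
  } else {
    if (Se < Ss || Se > 63) return 0;         /* bad spectral band */
    if (comps_in_scan != 1) return 0;         /* AC scans: one component only */
  }
  if (Ah != 0 && Ah - 1 != Al) return 0;      /* refinement: Al = Ah - 1 */
  if (Al > 13) return 0;                      /* liberal upper bound used above */
  return 1;
}

int main(void)
{
  printf("%d\n", scan_params_ok(0, 0, 0, 1, 3));   /* 1: DC first scan, Al=1    */
  printf("%d\n", scan_params_ok(1, 5, 2, 1, 1));   /* 1: AC refinement scan     */
  printf("%d\n", scan_params_ok(1, 5, 2, 1, 3));   /* 0: AC scan, >1 component  */
  printf("%d\n", scan_params_ok(0, 63, 0, 0, 3));  /* 0: DC scan with Se != 0   */
  return 0;
}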
+ */ + if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 || + cinfo->Ah != 0 || cinfo->Al != 0) + WARNMS(cinfo, JWRN_NOT_SEQUENTIAL); + + /* Select MCU decoding routine */ + entropy->pub.decode_mcu = decode_mcu; + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + dctbl = compptr->dc_tbl_no; + actbl = compptr->ac_tbl_no; + /* Compute derived values for Huffman tables */ + /* We may do this more than once for a table, but it's not expensive */ + jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl, + & entropy->dc_derived_tbls[dctbl]); + jpeg_make_d_derived_tbl(cinfo, FALSE, actbl, + & entropy->ac_derived_tbls[actbl]); + /* Initialize DC predictions to 0 */ + entropy->saved.last_dc_val[ci] = 0; + } + + /* Precalculate decoding info for each block in an MCU of this scan */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + /* Precalculate which table to use for each block */ + entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no]; + entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no]; + /* Decide whether we really care about the coefficient values */ + if (compptr->component_needed) { + ci = compptr->DCT_v_scaled_size; + if (ci <= 0 || ci > 8) ci = 8; + i = compptr->DCT_h_scaled_size; + if (i <= 0 || i > 8) i = 8; + entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order[ci - 1][i - 1]; + } else { + entropy->coef_limit[blkn] = 0; + } + } + } + + /* Initialize bitread state variables */ + entropy->bitstate.bits_left = 0; + entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */ + entropy->pub.insufficient_data = FALSE; + + /* Initialize restart counter */ + entropy->restarts_to_go = cinfo->restart_interval; +} + + +/* * Module initialization routine for Huffman entropy decoding. */ @@ -642,11 +1284,27 @@ jinit_huff_decoder (j_decompress_ptr cinfo) SIZEOF(huff_entropy_decoder)); cinfo->entropy = (struct jpeg_entropy_decoder *) entropy; entropy->pub.start_pass = start_pass_huff_decoder; - entropy->pub.decode_mcu = decode_mcu; - /* Mark tables unallocated */ - for (i = 0; i < NUM_HUFF_TBLS; i++) { - entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL; + if (cinfo->progressive_mode) { + /* Create progression status table */ + int *coef_bit_ptr, ci; + cinfo->coef_bits = (int (*)[DCTSIZE2]) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->num_components*DCTSIZE2*SIZEOF(int)); + coef_bit_ptr = & cinfo->coef_bits[0][0]; + for (ci = 0; ci < cinfo->num_components; ci++) + for (i = 0; i < DCTSIZE2; i++) + *coef_bit_ptr++ = -1; + + /* Mark derived tables unallocated */ + for (i = 0; i < NUM_HUFF_TBLS; i++) { + entropy->derived_tbls[i] = NULL; + } + } else { + /* Mark tables unallocated */ + for (i = 0; i < NUM_HUFF_TBLS; i++) { + entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL; + } } } diff --git a/src/libjpeg/jdinput.c b/src/libjpeg/jdinput.c index 0c2ac8f..08a9b37 100644 --- a/src/libjpeg/jdinput.c +++ b/src/libjpeg/jdinput.c @@ -2,13 +2,14 @@ * jdinput.c * * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 2002-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * * This file contains input control logic for the JPEG decompressor. * These routines are concerned with controlling the decompressor's input * processing (marker reading and coefficient decoding). 
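In the sequential branch above, coef_limit[blkn] = 1 + jpeg_zigzag_order[v-1][h-1] bounds how many zigzag-ordered coefficients must actually be decoded for a component whose IDCT is scaled to h x v; everything past the limit is skipped with DROP_BITS. A quick check of a few sizes (table copied from the hunk further up):

#include <stdio.h>

static const int jpeg_zigzag_order[8][8] = {
  {  0,  1,  5,  6, 14, 15, 27, 28 },
  {  2,  4,  7, 13, 16, 26, 29, 42 },
  {  3,  8, 12, 17, 25, 30, 41, 43 },
  {  9, 11, 18, 24, 31, 40, 44, 53 },
  { 10, 19, 23, 32, 39, 45, 52, 54 },
  { 20, 22, 33, 38, 46, 51, 55, 60 },
  { 21, 34, 37, 47, 50, 56, 59, 61 },
  { 35, 36, 48, 49, 57, 58, 62, 63 }
};

int main(void)
{
  static const int sizes[][2] = { {1,1}, {2,2}, {4,4}, {8,8} }; /* h, v */
  int i;
  for (i = 0; i < 4; i++) {
    int h = sizes[i][0], v = sizes[i][1];
    printf("%dx%d IDCT: decode %2d of 64 coefficients\n",
           h, v, 1 + jpeg_zigzag_order[v - 1][h - 1]);
  }
  return 0;   /* 1, 5, 25 and 64 coefficients respectively */
}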
The actual input - * reading is done in jdmarker.c, jdhuff.c, and jdphuff.c. + * reading is done in jdmarker.c, jdhuff.c, and jdarith.c. */ #define JPEG_INTERNALS @@ -74,12 +75,14 @@ initial_setup (j_decompress_ptr cinfo) * In the full decompressor, this will be overridden by jdmaster.c; * but in the transcoder, jdmaster.c is not used, so we must do it here. */ - cinfo->min_DCT_scaled_size = DCTSIZE; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; /* Compute dimensions of components */ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - compptr->DCT_scaled_size = DCTSIZE; + compptr->DCT_h_scaled_size = DCTSIZE; + compptr->DCT_v_scaled_size = DCTSIZE; /* Size in DCT blocks */ compptr->width_in_blocks = (JDIMENSION) jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, @@ -138,7 +141,7 @@ per_scan_setup (j_decompress_ptr cinfo) compptr->MCU_width = 1; compptr->MCU_height = 1; compptr->MCU_blocks = 1; - compptr->MCU_sample_width = compptr->DCT_scaled_size; + compptr->MCU_sample_width = compptr->DCT_h_scaled_size; compptr->last_col_width = 1; /* For noninterleaved scans, it is convenient to define last_row_height * as the number of block rows present in the last iMCU row. @@ -174,7 +177,7 @@ per_scan_setup (j_decompress_ptr cinfo) compptr->MCU_width = compptr->h_samp_factor; compptr->MCU_height = compptr->v_samp_factor; compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height; - compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_scaled_size; + compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size; /* Figure number of non-dummy blocks in last MCU column & row */ tmp = (int) (compptr->width_in_blocks % compptr->MCU_width); if (tmp == 0) tmp = compptr->MCU_width; diff --git a/src/libjpeg/jdmainct.c b/src/libjpeg/jdmainct.c index 13c956f..02723ca 100644 --- a/src/libjpeg/jdmainct.c +++ b/src/libjpeg/jdmainct.c @@ -161,7 +161,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo) { my_main_ptr main = (my_main_ptr) cinfo->main; int ci, rgroup; - int M = cinfo->min_DCT_scaled_size; + int M = cinfo->min_DCT_v_scaled_size; jpeg_component_info *compptr; JSAMPARRAY xbuf; @@ -175,8 +175,8 @@ alloc_funny_pointers (j_decompress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; /* height of a row group of component */ + rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; /* height of a row group of component */ /* Get space for pointer lists --- M+4 row groups in each list. * We alloc both pointer lists with one call to save a few cycles. 
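The jdmainct.c changes above size each component's row group from its vertical scaled DCT size: rgroup = (v_samp_factor * DCT_v_scaled_size) / min_DCT_v_scaled_size sample rows. A tiny illustration, assuming a 4:2:0 image decoded at full size where the chroma IDCTs come out scaled to 16, as the jdmaster.c hunk further down can arrange:

#include <stdio.h>

int main(void)
{
  int min_DCT_v_scaled_size = 8;
  struct { const char *name; int v_samp, dct_v; } comp[] = {
    { "Y ", 2,  8 },
    { "Cb", 1, 16 },
    { "Cr", 1, 16 },
  };
  int ci;
  for (ci = 0; ci < 3; ci++) {
    int rgroup = (comp[ci].v_samp * comp[ci].dct_v) / min_DCT_v_scaled_size;
    printf("%s: rgroup = %d sample rows\n", comp[ci].name, rgroup);
  }
  return 0;   /* every component contributes 2 rows per row group here */
}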
*/ @@ -202,14 +202,14 @@ make_funny_pointers (j_decompress_ptr cinfo) { my_main_ptr main = (my_main_ptr) cinfo->main; int ci, i, rgroup; - int M = cinfo->min_DCT_scaled_size; + int M = cinfo->min_DCT_v_scaled_size; jpeg_component_info *compptr; JSAMPARRAY buf, xbuf0, xbuf1; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; /* height of a row group of component */ + rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; /* height of a row group of component */ xbuf0 = main->xbuffer[0][ci]; xbuf1 = main->xbuffer[1][ci]; /* First copy the workspace pointers as-is */ @@ -242,14 +242,14 @@ set_wraparound_pointers (j_decompress_ptr cinfo) { my_main_ptr main = (my_main_ptr) cinfo->main; int ci, i, rgroup; - int M = cinfo->min_DCT_scaled_size; + int M = cinfo->min_DCT_v_scaled_size; jpeg_component_info *compptr; JSAMPARRAY xbuf0, xbuf1; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; /* height of a row group of component */ + rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; /* height of a row group of component */ xbuf0 = main->xbuffer[0][ci]; xbuf1 = main->xbuffer[1][ci]; for (i = 0; i < rgroup; i++) { @@ -277,8 +277,8 @@ set_bottom_pointers (j_decompress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { /* Count sample rows in one iMCU row and in one row group */ - iMCUheight = compptr->v_samp_factor * compptr->DCT_scaled_size; - rgroup = iMCUheight / cinfo->min_DCT_scaled_size; + iMCUheight = compptr->v_samp_factor * compptr->DCT_v_scaled_size; + rgroup = iMCUheight / cinfo->min_DCT_v_scaled_size; /* Count nondummy sample rows remaining for this component */ rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight); if (rows_left == 0) rows_left = iMCUheight; @@ -357,7 +357,7 @@ process_data_simple_main (j_decompress_ptr cinfo, } /* There are always min_DCT_scaled_size row groups in an iMCU row. */ - rowgroups_avail = (JDIMENSION) cinfo->min_DCT_scaled_size; + rowgroups_avail = (JDIMENSION) cinfo->min_DCT_v_scaled_size; /* Note: at the bottom of the image, we may pass extra garbage row groups * to the postprocessor. The postprocessor has to check for bottom * of image anyway (at row resolution), so no point in us doing it too. @@ -417,7 +417,7 @@ process_data_context_main (j_decompress_ptr cinfo, case CTX_PREPARE_FOR_IMCU: /* Prepare to process first M-1 row groups of this iMCU row */ main->rowgroup_ctr = 0; - main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size - 1); + main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size - 1); /* Check for bottom of image: if so, tweak pointers to "duplicate" * the last sample row, and adjust rowgroups_avail to ignore padding rows. 
*/ @@ -440,8 +440,8 @@ process_data_context_main (j_decompress_ptr cinfo, main->buffer_full = FALSE; /* Still need to process last row group of this iMCU row, */ /* which is saved at index M+1 of the other xbuffer */ - main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_scaled_size + 1); - main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size + 2); + main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 1); + main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 2); main->context_state = CTX_POSTPONED_ROW; } } @@ -492,21 +492,21 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer) * ngroups is the number of row groups we need. */ if (cinfo->upsample->need_context_rows) { - if (cinfo->min_DCT_scaled_size < 2) /* unsupported, see comments above */ + if (cinfo->min_DCT_v_scaled_size < 2) /* unsupported, see comments above */ ERREXIT(cinfo, JERR_NOTIMPL); alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */ - ngroups = cinfo->min_DCT_scaled_size + 2; + ngroups = cinfo->min_DCT_v_scaled_size + 2; } else { - ngroups = cinfo->min_DCT_scaled_size; + ngroups = cinfo->min_DCT_v_scaled_size; } for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; /* height of a row group of component */ + rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; /* height of a row group of component */ main->buffer[ci] = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, - compptr->width_in_blocks * compptr->DCT_scaled_size, + compptr->width_in_blocks * compptr->DCT_h_scaled_size, (JDIMENSION) (rgroup * ngroups)); } } diff --git a/src/libjpeg/jdmaster.c b/src/libjpeg/jdmaster.c index 2802c5b..04f8312 100644 --- a/src/libjpeg/jdmaster.c +++ b/src/libjpeg/jdmaster.c @@ -2,6 +2,7 @@ * jdmaster.c * * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 2002-2008 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -61,9 +62,12 @@ use_merged_upsample (j_decompress_ptr cinfo) cinfo->comp_info[2].v_samp_factor != 1) return FALSE; /* furthermore, it doesn't work if we've scaled the IDCTs differently */ - if (cinfo->comp_info[0].DCT_scaled_size != cinfo->min_DCT_scaled_size || - cinfo->comp_info[1].DCT_scaled_size != cinfo->min_DCT_scaled_size || - cinfo->comp_info[2].DCT_scaled_size != cinfo->min_DCT_scaled_size) + if (cinfo->comp_info[0].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size || + cinfo->comp_info[1].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size || + cinfo->comp_info[2].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size || + cinfo->comp_info[0].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size || + cinfo->comp_info[1].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size || + cinfo->comp_info[2].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size) return FALSE; /* ??? also need to test for upsample-time rescaling, when & if supported */ return TRUE; /* by golly, it'll work... 
*/ @@ -102,43 +106,152 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo) jdiv_round_up((long) cinfo->image_width, 8L); cinfo->output_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height, 8L); - cinfo->min_DCT_scaled_size = 1; + cinfo->min_DCT_h_scaled_size = 1; + cinfo->min_DCT_v_scaled_size = 1; } else if (cinfo->scale_num * 4 <= cinfo->scale_denom) { /* Provide 1/4 scaling */ cinfo->output_width = (JDIMENSION) jdiv_round_up((long) cinfo->image_width, 4L); cinfo->output_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height, 4L); - cinfo->min_DCT_scaled_size = 2; + cinfo->min_DCT_h_scaled_size = 2; + cinfo->min_DCT_v_scaled_size = 2; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 3) { + /* Provide 3/8 scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, 8L); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, 8L); + cinfo->min_DCT_h_scaled_size = 3; + cinfo->min_DCT_v_scaled_size = 3; } else if (cinfo->scale_num * 2 <= cinfo->scale_denom) { /* Provide 1/2 scaling */ cinfo->output_width = (JDIMENSION) jdiv_round_up((long) cinfo->image_width, 2L); cinfo->output_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height, 2L); - cinfo->min_DCT_scaled_size = 4; - } else { + cinfo->min_DCT_h_scaled_size = 4; + cinfo->min_DCT_v_scaled_size = 4; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 5) { + /* Provide 5/8 scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 5L, 8L); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 5L, 8L); + cinfo->min_DCT_h_scaled_size = 5; + cinfo->min_DCT_v_scaled_size = 5; + } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 3) { + /* Provide 3/4 scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, 4L); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, 4L); + cinfo->min_DCT_h_scaled_size = 6; + cinfo->min_DCT_v_scaled_size = 6; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 7) { + /* Provide 7/8 scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 7L, 8L); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 7L, 8L); + cinfo->min_DCT_h_scaled_size = 7; + cinfo->min_DCT_v_scaled_size = 7; + } else if (cinfo->scale_num <= cinfo->scale_denom) { /* Provide 1/1 scaling */ cinfo->output_width = cinfo->image_width; cinfo->output_height = cinfo->image_height; - cinfo->min_DCT_scaled_size = DCTSIZE; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 9) { + /* Provide 9/8 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 8L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 8L); + cinfo->min_DCT_h_scaled_size = 9; + cinfo->min_DCT_v_scaled_size = 9; + } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 5) { + /* Provide 5/4 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 4L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 4L); + cinfo->min_DCT_h_scaled_size = 10; + cinfo->min_DCT_v_scaled_size = 10; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 11) { + /* Provide 11/8 scaling */ + 
cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, 8L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, 8L); + cinfo->min_DCT_h_scaled_size = 11; + cinfo->min_DCT_v_scaled_size = 11; + } else if (cinfo->scale_num * 2 <= cinfo->scale_denom * 3) { + /* Provide 3/2 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 2L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 2L); + cinfo->min_DCT_h_scaled_size = 12; + cinfo->min_DCT_v_scaled_size = 12; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 13) { + /* Provide 13/8 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 5L, 8L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 5L, 8L); + cinfo->min_DCT_h_scaled_size = 13; + cinfo->min_DCT_v_scaled_size = 13; + } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 7) { + /* Provide 7/4 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, 4L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, 4L); + cinfo->min_DCT_h_scaled_size = 14; + cinfo->min_DCT_v_scaled_size = 14; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 15) { + /* Provide 15/8 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 7L, 8L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 7L, 8L); + cinfo->min_DCT_h_scaled_size = 15; + cinfo->min_DCT_v_scaled_size = 15; + } else { + /* Provide 2/1 scaling */ + cinfo->output_width = cinfo->image_width << 1; + cinfo->output_height = cinfo->image_height << 1; + cinfo->min_DCT_h_scaled_size = 16; + cinfo->min_DCT_v_scaled_size = 16; } /* In selecting the actual DCT scaling for each component, we try to * scale up the chroma components via IDCT scaling rather than upsampling. * This saves time if the upsampler gets to use 1:1 scaling. - * Note this code assumes that the supported DCT scalings are powers of 2. + * Note this code adapts subsampling ratios which are powers of 2. */ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - int ssize = cinfo->min_DCT_scaled_size; - while (ssize < DCTSIZE && - (compptr->h_samp_factor * ssize * 2 <= - cinfo->max_h_samp_factor * cinfo->min_DCT_scaled_size) && - (compptr->v_samp_factor * ssize * 2 <= - cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size)) { + int ssize = 1; + while (cinfo->min_DCT_h_scaled_size * ssize <= + (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) && + (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) { + ssize = ssize * 2; + } + compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize; + ssize = 1; + while (cinfo->min_DCT_v_scaled_size * ssize <= + (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) && + (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) { ssize = ssize * 2; } - compptr->DCT_scaled_size = ssize; + compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize; + + /* We don't support IDCT ratios larger than 2. 
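The per-component loop above now scales chroma up inside the IDCT whenever the sampling ratio is a power of two, rather than leaving the work to the upsampler. A hypothetical helper showing the same selection (pick_scaled_size is not a libjpeg function):

#include <stdio.h>

#define DCTSIZE 8

static int pick_scaled_size(int min_scaled, int samp_factor, int max_samp,
                            int do_fancy_upsampling)
{
  int limit = do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2;
  int ssize = 1;
  while (min_scaled * ssize <= limit &&
         (max_samp % (samp_factor * ssize * 2)) == 0)
    ssize *= 2;
  return min_scaled * ssize;
}

int main(void)
{
  /* 4:2:0 image (luma 2x2, chroma 1x1 sampling), full-size output */
  printf("luma   : %d\n", pick_scaled_size(8, 2, 2, 1)); /* 8  : plain IDCT     */
  printf("chroma : %d\n", pick_scaled_size(8, 1, 2, 1)); /* 16 : IDCT upsamples */
  /* 1/2-size output (min scaled size 4) */
  printf("chroma : %d\n", pick_scaled_size(4, 1, 2, 1)); /* 8 */
  return 0;
}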
*/ + if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2) + compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2; + else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2) + compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2; } /* Recompute downsampled dimensions of components; @@ -149,11 +262,11 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo) /* Size in samples, after IDCT scaling */ compptr->downsampled_width = (JDIMENSION) jdiv_round_up((long) cinfo->image_width * - (long) (compptr->h_samp_factor * compptr->DCT_scaled_size), + (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size), (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->downsampled_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height * - (long) (compptr->v_samp_factor * compptr->DCT_scaled_size), + (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size), (long) (cinfo->max_v_samp_factor * DCTSIZE)); } @@ -373,16 +486,9 @@ master_selection (j_decompress_ptr cinfo) jinit_inverse_dct(cinfo); /* Entropy decoding: either Huffman or arithmetic coding. */ if (cinfo->arith_code) { - ERREXIT(cinfo, JERR_ARITH_NOTIMPL); + jinit_arith_decoder(cinfo); } else { - if (cinfo->progressive_mode) { -#ifdef D_PROGRESSIVE_SUPPORTED - jinit_phuff_decoder(cinfo); -#else - ERREXIT(cinfo, JERR_NOT_COMPILED); -#endif - } else - jinit_huff_decoder(cinfo); + jinit_huff_decoder(cinfo); } /* Initialize principal buffer controllers. */ diff --git a/src/libjpeg/jdsample.c b/src/libjpeg/jdsample.c index 80ffefb..7bc8885 100644 --- a/src/libjpeg/jdsample.c +++ b/src/libjpeg/jdsample.c @@ -2,13 +2,14 @@ * jdsample.c * * Copyright (C) 1991-1996, Thomas G. Lane. + * Modified 2002-2008 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * * This file contains upsampling routines. * * Upsampling input data is counted in "row groups". A row group - * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size) + * is defined to be (v_samp_factor * DCT_v_scaled_size / min_DCT_v_scaled_size) * sample rows of each component. Upsampling will normally produce * max_v_samp_factor pixel rows from each row group (but this could vary * if the upsampler is applying a scale factor of its own). @@ -237,11 +238,11 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, register JSAMPROW inptr, outptr; register JSAMPLE invalue; JSAMPROW outend; - int inrow; + int outrow; - for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { - inptr = input_data[inrow]; - outptr = output_data[inrow]; + for (outrow = 0; outrow < cinfo->max_v_samp_factor; outrow++) { + inptr = input_data[outrow]; + outptr = output_data[outrow]; outend = outptr + cinfo->output_width; while (outptr < outend) { invalue = *inptr++; /* don't need GETJSAMPLE() here */ @@ -286,112 +287,6 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* - * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. - * - * The upsampling algorithm is linear interpolation between pixel centers, - * also known as a "triangle filter". This is a good compromise between - * speed and visual quality. The centers of the output pixels are 1/4 and 3/4 - * of the way between input pixel centers. - * - * A note about the "bias" calculations: when rounding fractional values to - * integer, we do not want to always round 0.5 up to the next integer. 
- * If we did that, we'd introduce a noticeable bias towards larger values. - * Instead, this code is arranged so that 0.5 will be rounded up or down at - * alternate pixel locations (a simple ordered dither pattern). - */ - -METHODDEF(void) -h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) -{ - JSAMPARRAY output_data = *output_data_ptr; - register JSAMPROW inptr, outptr; - register int invalue; - register JDIMENSION colctr; - int inrow; - - for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { - inptr = input_data[inrow]; - outptr = output_data[inrow]; - /* Special case for first column */ - invalue = GETJSAMPLE(*inptr++); - *outptr++ = (JSAMPLE) invalue; - *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2); - - for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) { - /* General case: 3/4 * nearer pixel + 1/4 * further pixel */ - invalue = GETJSAMPLE(*inptr++) * 3; - *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2); - *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2); - } - - /* Special case for last column */ - invalue = GETJSAMPLE(*inptr); - *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2); - *outptr++ = (JSAMPLE) invalue; - } -} - - -/* - * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. - * Again a triangle filter; see comments for h2v1 case, above. - * - * It is OK for us to reference the adjacent input rows because we demanded - * context from the main buffer controller (see initialization code). - */ - -METHODDEF(void) -h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) -{ - JSAMPARRAY output_data = *output_data_ptr; - register JSAMPROW inptr0, inptr1, outptr; -#if BITS_IN_JSAMPLE == 8 - register int thiscolsum, lastcolsum, nextcolsum; -#else - register INT32 thiscolsum, lastcolsum, nextcolsum; -#endif - register JDIMENSION colctr; - int inrow, outrow, v; - - inrow = outrow = 0; - while (outrow < cinfo->max_v_samp_factor) { - for (v = 0; v < 2; v++) { - /* inptr0 points to nearest input row, inptr1 points to next nearest */ - inptr0 = input_data[inrow]; - if (v == 0) /* next nearest is row above */ - inptr1 = input_data[inrow-1]; - else /* next nearest is row below */ - inptr1 = input_data[inrow+1]; - outptr = output_data[outrow++]; - - /* Special case for first column */ - thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); - nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); - *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4); - *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4); - lastcolsum = thiscolsum; thiscolsum = nextcolsum; - - for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) { - /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */ - /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */ - nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); - *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4); - *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4); - lastcolsum = thiscolsum; thiscolsum = nextcolsum; - } - - /* Special case for last column */ - *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4); - *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4); - } - inrow++; - } -} - - -/* * Module initialization routine for upsampling. 
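 * (Aside on the code removed above: the "fancy" upsamplers interpolated
 * with 3/4 and 1/4 weights, e.g. (3 * nearer + further + bias) >> 2; with
 * them gone, the selection logic in jinit_upsampler below always picks the
 * plain replicating h2v1_upsample / h2v2_upsample methods for 2:1
 * sampling.)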
*/ @@ -401,7 +296,7 @@ jinit_upsampler (j_decompress_ptr cinfo) my_upsample_ptr upsample; int ci; jpeg_component_info * compptr; - boolean need_buffer, do_fancy; + boolean need_buffer; int h_in_group, v_in_group, h_out_group, v_out_group; upsample = (my_upsample_ptr) @@ -415,11 +310,6 @@ jinit_upsampler (j_decompress_ptr cinfo) if (cinfo->CCIR601_sampling) /* this isn't supported */ ERREXIT(cinfo, JERR_CCIR601_NOTIMPL); - /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1, - * so don't ask for it. - */ - do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_scaled_size > 1; - /* Verify we can handle the sampling factors, select per-component methods, * and create storage as needed. */ @@ -428,10 +318,10 @@ jinit_upsampler (j_decompress_ptr cinfo) /* Compute size of an "input group" after IDCT scaling. This many samples * are to be converted to max_h_samp_factor * max_v_samp_factor pixels. */ - h_in_group = (compptr->h_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; - v_in_group = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; + h_in_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) / + cinfo->min_DCT_h_scaled_size; + v_in_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; h_out_group = cinfo->max_h_samp_factor; v_out_group = cinfo->max_v_samp_factor; upsample->rowgroup_height[ci] = v_in_group; /* save for use later */ @@ -446,19 +336,12 @@ jinit_upsampler (j_decompress_ptr cinfo) need_buffer = FALSE; } else if (h_in_group * 2 == h_out_group && v_in_group == v_out_group) { - /* Special cases for 2h1v upsampling */ - if (do_fancy && compptr->downsampled_width > 2) - upsample->methods[ci] = h2v1_fancy_upsample; - else - upsample->methods[ci] = h2v1_upsample; + /* Special case for 2h1v upsampling */ + upsample->methods[ci] = h2v1_upsample; } else if (h_in_group * 2 == h_out_group && v_in_group * 2 == v_out_group) { - /* Special cases for 2h2v upsampling */ - if (do_fancy && compptr->downsampled_width > 2) { - upsample->methods[ci] = h2v2_fancy_upsample; - upsample->pub.need_context_rows = TRUE; - } else - upsample->methods[ci] = h2v2_upsample; + /* Special case for 2h2v upsampling */ + upsample->methods[ci] = h2v2_upsample; } else if ((h_out_group % h_in_group) == 0 && (v_out_group % v_in_group) == 0) { /* Generic integral-factors upsampling method */ diff --git a/src/libjpeg/jdtrans.c b/src/libjpeg/jdtrans.c index 6c0ab71..385c336 100644 --- a/src/libjpeg/jdtrans.c +++ b/src/libjpeg/jdtrans.c @@ -100,17 +100,10 @@ transdecode_master_selection (j_decompress_ptr cinfo) cinfo->buffered_image = TRUE; /* Entropy decoding: either Huffman or arithmetic coding. */ - if (cinfo->arith_code) { - ERREXIT(cinfo, JERR_ARITH_NOTIMPL); - } else { - if (cinfo->progressive_mode) { -#ifdef D_PROGRESSIVE_SUPPORTED - jinit_phuff_decoder(cinfo); -#else - ERREXIT(cinfo, JERR_NOT_COMPILED); -#endif - } else - jinit_huff_decoder(cinfo); + if (cinfo->arith_code) + jinit_arith_decoder(cinfo); + else { + jinit_huff_decoder(cinfo); } /* Always get a full-image coefficient buffer. */ diff --git a/src/libjpeg/jerror.h b/src/libjpeg/jerror.h index fc2fffe..1cfb2b1 100644 --- a/src/libjpeg/jerror.h +++ b/src/libjpeg/jerror.h @@ -2,6 +2,7 @@ * jerror.h * * Copyright (C) 1994-1997, Thomas G. Lane. + * Modified 1997-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. 
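 * (Aside on the message-table changes below: with arithmetic decoding now
 * wired up via jinit_arith_decoder above, JERR_ARITH_NOTIMPL is retired
 * and JERR_NO_ARITH_TABLE / JWRN_ARITH_BAD_CODE take its place; the new
 * six-parameter ERREXIT6 macro matches the six format arguments of the
 * JERR_BAD_DROP_SAMPLING message.)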
* @@ -39,14 +40,15 @@ typedef enum { JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */ /* For maintenance convenience, list is alphabetical by message code name */ -JMESSAGE(JERR_ARITH_NOTIMPL, - "Sorry, there are legal restrictions on arithmetic coding") JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix") JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix") JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode") JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS") +JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request") JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range") -JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported") +JMESSAGE(JERR_BAD_DCTSIZE, "DCT scaled block size %dx%d not supported") +JMESSAGE(JERR_BAD_DROP_SAMPLING, + "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c") JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition") JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace") JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace") @@ -93,6 +95,7 @@ JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data") JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change") JMESSAGE(JERR_NOTIMPL, "Not implemented yet") JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time") +JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined") JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported") JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined") JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image") @@ -170,6 +173,7 @@ JMESSAGE(JTRC_UNKNOWN_IDS, JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u") JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u") JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d") +JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") JMESSAGE(JWRN_BOGUS_PROGRESSION, "Inconsistent progression sequence for component %d coefficient %d") JMESSAGE(JWRN_EXTRANEOUS_DATA, @@ -227,6 +231,15 @@ JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines") (cinfo)->err->msg_parm.i[2] = (p3), \ (cinfo)->err->msg_parm.i[3] = (p4), \ (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) +#define ERREXIT6(cinfo,code,p1,p2,p3,p4,p5,p6) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (cinfo)->err->msg_parm.i[1] = (p2), \ + (cinfo)->err->msg_parm.i[2] = (p3), \ + (cinfo)->err->msg_parm.i[3] = (p4), \ + (cinfo)->err->msg_parm.i[4] = (p5), \ + (cinfo)->err->msg_parm.i[5] = (p6), \ + (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) #define ERREXITS(cinfo,code,str) \ ((cinfo)->err->msg_code = (code), \ strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \ diff --git a/src/libjpeg/jfdctflt.c b/src/libjpeg/jfdctflt.c index 79d7a00..74d0d86 100644 --- a/src/libjpeg/jfdctflt.c +++ b/src/libjpeg/jfdctflt.c @@ -2,6 +2,7 @@ * jfdctflt.c * * Copyright (C) 1994-1996, Thomas G. Lane. + * Modified 2003-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. 
* @@ -56,41 +57,46 @@ */ GLOBAL(void) -jpeg_fdct_float (FAST_FLOAT * data) +jpeg_fdct_float (FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col) { FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; FAST_FLOAT tmp10, tmp11, tmp12, tmp13; FAST_FLOAT z1, z2, z3, z4, z5, z11, z13; FAST_FLOAT *dataptr; + JSAMPROW elemptr; int ctr; /* Pass 1: process rows. */ dataptr = data; - for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { - tmp0 = dataptr[0] + dataptr[7]; - tmp7 = dataptr[0] - dataptr[7]; - tmp1 = dataptr[1] + dataptr[6]; - tmp6 = dataptr[1] - dataptr[6]; - tmp2 = dataptr[2] + dataptr[5]; - tmp5 = dataptr[2] - dataptr[5]; - tmp3 = dataptr[3] + dataptr[4]; - tmp4 = dataptr[3] - dataptr[4]; - + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Load data into workspace */ + tmp0 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7])); + tmp7 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7])); + tmp1 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6])); + tmp6 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6])); + tmp2 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5])); + tmp5 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5])); + tmp3 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4])); + tmp4 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4])); + /* Even part */ - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - - dataptr[0] = tmp10 + tmp11; /* phase 3 */ + + /* Apply unsigned->signed conversion */ + dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */ dataptr[4] = tmp10 - tmp11; - + z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */ dataptr[2] = tmp13 + z1; /* phase 5 */ dataptr[6] = tmp13 - z1; - + /* Odd part */ tmp10 = tmp4 + tmp5; /* phase 2 */ @@ -126,21 +132,21 @@ jpeg_fdct_float (FAST_FLOAT * data) tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - + /* Even part */ - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ dataptr[DCTSIZE*4] = tmp10 - tmp11; - + z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */ dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ dataptr[DCTSIZE*6] = tmp13 - z1; - + /* Odd part */ tmp10 = tmp4 + tmp5; /* phase 2 */ diff --git a/src/libjpeg/jfdctfst.c b/src/libjpeg/jfdctfst.c index ccb378a..8cad5f2 100644 --- a/src/libjpeg/jfdctfst.c +++ b/src/libjpeg/jfdctfst.c @@ -2,6 +2,7 @@ * jfdctfst.c * * Copyright (C) 1994-1996, Thomas G. Lane. + * Modified 2003-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -111,42 +112,47 @@ */ GLOBAL(void) -jpeg_fdct_ifast (DCTELEM * data) +jpeg_fdct_ifast (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) { DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; DCTELEM tmp10, tmp11, tmp12, tmp13; DCTELEM z1, z2, z3, z4, z5, z11, z13; DCTELEM *dataptr; + JSAMPROW elemptr; int ctr; SHIFT_TEMPS /* Pass 1: process rows. 
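 * As in jpeg_fdct_float above, samples are loaded unshifted and the level
 * shift is applied only to the DC term (the "- 8 * CENTERJSAMPLE" below):
 * subtracting a constant from all eight inputs of a 1-D DCT changes only
 * the zero-frequency output, so one subtraction per row replaces eight
 * per-sample subtractions.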
*/ dataptr = data; - for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { - tmp0 = dataptr[0] + dataptr[7]; - tmp7 = dataptr[0] - dataptr[7]; - tmp1 = dataptr[1] + dataptr[6]; - tmp6 = dataptr[1] - dataptr[6]; - tmp2 = dataptr[2] + dataptr[5]; - tmp5 = dataptr[2] - dataptr[5]; - tmp3 = dataptr[3] + dataptr[4]; - tmp4 = dataptr[3] - dataptr[4]; - + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Load data into workspace */ + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]); + tmp7 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]); + tmp6 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]); + tmp5 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]); + tmp4 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]); + /* Even part */ - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - - dataptr[0] = tmp10 + tmp11; /* phase 3 */ + + /* Apply unsigned->signed conversion */ + dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */ dataptr[4] = tmp10 - tmp11; - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ dataptr[2] = tmp13 + z1; /* phase 5 */ dataptr[6] = tmp13 - z1; - + /* Odd part */ tmp10 = tmp4 + tmp5; /* phase 2 */ @@ -182,21 +188,21 @@ jpeg_fdct_ifast (DCTELEM * data) tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - + /* Even part */ - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ dataptr[DCTSIZE*4] = tmp10 - tmp11; - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ dataptr[DCTSIZE*6] = tmp13 - z1; - + /* Odd part */ tmp10 = tmp4 + tmp5; /* phase 2 */ diff --git a/src/libjpeg/jfdctint.c b/src/libjpeg/jfdctint.c index 0a78b64..1dde58c 100644 --- a/src/libjpeg/jfdctint.c +++ b/src/libjpeg/jfdctint.c @@ -2,6 +2,7 @@ * jfdctint.c * * Copyright (C) 1991-1996, Thomas G. Lane. + * Modification developed 2003-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -21,6 +22,23 @@ * The advantage of this method is that no data path contains more than one * multiplication; this allows a very simple and accurate implementation in * scaled fixed-point arithmetic, with a minimal number of shifts. + * + * We also provide FDCT routines with various input sample block sizes for + * direct resolution reduction or enlargement and for direct resolving the + * common 2x1 and 1x2 subsampling cases without additional resampling: NxN + * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block. + * + * For N<8 we fill the remaining block coefficients with zero. + * For N>8 we apply a partial N-point FDCT on the input samples, computing + * just the lower 8 frequency coefficients and discarding the rest. + * + * We must scale the output coefficients of the N-point FDCT appropriately + * to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling + * is folded into the constant multipliers (pass 2) and/or final/initial + * shifting. 
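 * Worked out for two of the sizes below: with N=7 each 1-D pass is scaled
 * by 8/7, so the 2-D transform carries (8/7)**2 = 64/49, and with N=6 it
 * carries (8/6)**2 = 16/9; these are exactly the factors folded into the
 * pass-2 multipliers of jpeg_fdct_7x7 and jpeg_fdct_6x6.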
+ * + * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases + * since there would be too many additional constants to pre-calculate. */ #define JPEG_INTERNALS @@ -36,7 +54,7 @@ */ #if DCTSIZE != 8 - Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ + Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */ #endif @@ -137,12 +155,13 @@ */ GLOBAL(void) -jpeg_fdct_islow (DCTELEM * data) +jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) { - INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp0, tmp1, tmp2, tmp3; INT32 tmp10, tmp11, tmp12, tmp13; - INT32 z1, z2, z3, z4, z5; + INT32 z1; DCTELEM *dataptr; + JSAMPROW elemptr; int ctr; SHIFT_TEMPS @@ -151,62 +170,74 @@ jpeg_fdct_islow (DCTELEM * data) /* furthermore, we scale the results by 2**PASS1_BITS. */ dataptr = data; - for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { - tmp0 = dataptr[0] + dataptr[7]; - tmp7 = dataptr[0] - dataptr[7]; - tmp1 = dataptr[1] + dataptr[6]; - tmp6 = dataptr[1] - dataptr[6]; - tmp2 = dataptr[2] + dataptr[5]; - tmp5 = dataptr[2] - dataptr[5]; - tmp3 = dataptr[3] + dataptr[4]; - tmp4 = dataptr[3] - dataptr[4]; - + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + /* Even part per LL&M figure 1 --- note that published figure is faulty; * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". */ - + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]); + tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; + tmp12 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - - dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); + tmp13 = tmp1 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS); dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); - dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), - CONST_BITS-PASS1_BITS); - dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), - CONST_BITS-PASS1_BITS); - + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), + CONST_BITS-PASS1_BITS); + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). - * cK represents cos(K*pi/16). - * i0..i3 in the paper are tmp4..tmp7 here. + * cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. 
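 * (On the "fudge factor" additions above and below: the library's
 * DESCALE(x,n) macro rounds by adding ONE << (n-1) before shifting, so
 * adding that constant to z1 once, up front, lets several outputs share
 * the rounding term and use a bare RIGHT_SHIFT instead of DESCALE.)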
*/ - - z1 = tmp4 + tmp7; - z2 = tmp5 + tmp6; - z3 = tmp4 + tmp6; - z4 = tmp5 + tmp7; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - - tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); - dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); - dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); - dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); - + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) + RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) + RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) + RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS); + dataptr += DCTSIZE; /* advance pointer to next row */ } @@ -217,67 +248,4101 @@ jpeg_fdct_islow (DCTELEM * data) dataptr = data; for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; - tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; - tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; - tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; - tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - + + /* Add fudge factor here for final descale. */ + tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1)); + tmp12 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp13 = tmp1 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + /* Add fudge factor here for final descale. 
*/ + z1 += ONE << (CONST_BITS+PASS1_BITS-1); + dataptr[DCTSIZE*2] = (DCTELEM) + RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) + RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. + */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS+PASS1_BITS-1); + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[DCTSIZE*1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) + RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*7] = (DCTELEM) + RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + +#ifdef DCT_SCALING_SUPPORTED + + +/* + * Perform the forward DCT on a 7x7 sample block. + */ + +GLOBAL(void) +jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* cK represents sqrt(2) * cos(K*pi/14). 
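 * Note the MEMZERO call above: these reduced-size routines write only the
 * upper-left NxN coefficients, so pre-zeroing the block is what leaves the
 * remaining positions at zero, as promised in the jfdctint.c header
 * comment.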
*/ + + dataptr = data; + for (ctr = 0; ctr < 7; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]); + tmp3 = GETJSAMPLE(elemptr[3]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]); + + z1 = tmp0 + tmp2; + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS); + tmp3 += tmp3; + z1 -= tmp3; + z1 -= tmp3; + z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */ + z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */ + dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS); + z1 -= z2; + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */ + dataptr[4] = (DCTELEM) + DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */ + tmp0 += tmp3; + tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/7)**2 = 64/49, which we fold + * into the constant multipliers: + * cK now represents sqrt(2) * cos(K*pi/14) * 64/49. 
+ */ + + dataptr = data; + for (ctr = 0; ctr < 7; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4]; + tmp3 = dataptr[DCTSIZE*3]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5]; + tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4]; + + z1 = tmp0 + tmp2; + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */ + CONST_BITS+PASS1_BITS); + tmp3 += tmp3; + z1 -= tmp3; + z1 -= tmp3; + z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */ + z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS); + z1 -= z2; + z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */ + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */ + tmp1 += tmp2; + tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */ + tmp0 += tmp3; + tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 6x6 sample block. + */ + +GLOBAL(void) +jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + INT32 tmp10, tmp11, tmp12; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* cK represents sqrt(2) * cos(K*pi/12). 
*/ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]); + tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]); + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */ + CONST_BITS-PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */ + CONST_BITS-PASS1_BITS); + + dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS)); + dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS); + dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS)); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/6)**2 = 16/9, which we fold + * into the constant multipliers: + * cK now represents sqrt(2) * cos(K*pi/12) * 16/9. + */ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5]; + tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 5x5 sample block. + */ + +GLOBAL(void) +jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + INT32 tmp10, tmp11; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/10). 
*/ + + dataptr = data; + for (ctr = 0; ctr < 5; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]); + tmp2 = GETJSAMPLE(elemptr[2]); + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1)); + tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */ + tmp10 -= tmp2 << 2; + tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */ + dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1); + dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */ + + dataptr[1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */ + CONST_BITS-PASS1_BITS-1); + dataptr[3] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */ + CONST_BITS-PASS1_BITS-1); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/5)**2 = 64/25, which we partially + * fold into the constant multipliers (other part was done in pass 1): + * cK now represents sqrt(2) * cos(K*pi/10) * 32/25. + */ + + dataptr = data; + for (ctr = 0; ctr < 5; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3]; + tmp2 = dataptr[DCTSIZE*2]; + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */ + CONST_BITS+PASS1_BITS); + tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */ + tmp10 -= tmp2 << 2; + tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */ + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 4x4 sample block. + */ + +GLOBAL(void) +jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */ + /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. 
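 * (The bracketed remark above means the 4-point angles K*pi/8 coincide
 * with the even angles 2K*pi/16 of the 8-point transform, so the existing
 * FIX_0_541196100 / FIX_0_765366865 / FIX_1_847759065 constants are simply
 * reused here.)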
*/ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2)); + dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2)); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-3); + + dataptr[1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS-2); + dataptr[3] = (DCTELEM) + RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS-PASS1_BITS-2); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1)); + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS+PASS1_BITS-1); + + dataptr[DCTSIZE*1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 3x3 sample block. + */ + +GLOBAL(void) +jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We scale the results further by 2**2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/6). */ + + dataptr = data; + for (ctr = 0; ctr < 3; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]); + tmp1 = GETJSAMPLE(elemptr[1]); + + tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2)); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */ + CONST_BITS-PASS1_BITS-2); + + /* Odd part */ + + dataptr[1] = (DCTELEM) + DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */ + CONST_BITS-PASS1_BITS-2); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. 
+ * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/3)**2 = 64/9, which we partially + * fold into the constant multipliers (other part was done in pass 1): + * cK now represents sqrt(2) * cos(K*pi/6) * 16/9. + */ + + dataptr = data; + for (ctr = 0; ctr < 3; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2]; + tmp1 = dataptr[DCTSIZE*1]; + + tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 2x2 sample block. + */ + +GLOBAL(void) +jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + JSAMPROW elemptr; + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + + /* Row 0 */ + elemptr = sample_data[0] + start_col; + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]); + tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]); + + /* Row 1 */ + elemptr = sample_data[1] + start_col; + + tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]); + tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]); + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/2)**2 = 2**4. + */ + + /* Column 0 */ + /* Apply unsigned->signed conversion */ + data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4); + data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4); + + /* Column 1 */ + data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4); + data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4); +} + + +/* + * Perform the forward DCT on a 1x1 sample block. + */ + +GLOBAL(void) +jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* We leave the result scaled up by an overall factor of 8. */ + /* We must also scale the output by (8/1)**2 = 2**6. */ + /* Apply unsigned->signed conversion */ + data[0] = (DCTELEM) + ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6); +} + + +/* + * Perform the forward DCT on a 9x9 sample block. + */ + +GLOBAL(void) +jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1, z2; + DCTELEM workspace[8]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* we scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/18). 
*/ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]); + tmp4 = GETJSAMPLE(elemptr[4]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]); + tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]); + + z1 = tmp0 + tmp2 + tmp3; + z2 = tmp1 + tmp4; + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1); + dataptr[6] = (DCTELEM) + DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */ + CONST_BITS-1); + z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */ + z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */ + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */ + + z1 + z2, CONST_BITS-1); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */ + + z1 - z2, CONST_BITS-1); + + /* Odd part */ + + dataptr[3] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */ + CONST_BITS-1); + + tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */ + tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */ + tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1); + + tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */ + + dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1); + dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 9) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/9)**2 = 64/81, which we partially + * fold into the constant multipliers and final/initial shifting: + * cK now represents sqrt(2) * cos(K*pi/18) * 128/81. 
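 * A note on the workspace handling: pass 1 above writes rows 0..7 into
 * data[] and the ninth row into workspace[], so the column pass below
 * pairs dataptr[DCTSIZE*0] with wsptr[DCTSIZE*0] (rows 0 and 8) when it
 * forms tmp0 and tmp10; the larger block sizes further down use the same
 * scheme with bigger workspaces.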
+ */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5]; + tmp4 = dataptr[DCTSIZE*4]; + + tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7]; + tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6]; + tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5]; + + z1 = tmp0 + tmp2 + tmp3; + z2 = tmp1 + tmp4; + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */ + CONST_BITS+2); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */ + CONST_BITS+2); + z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */ + z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */ + + z1 + z2, CONST_BITS+2); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */ + + z1 - z2, CONST_BITS+2); + + /* Odd part */ + + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */ + CONST_BITS+2); + + tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */ + tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */ + tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2); + + tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */ + + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) + DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 10x10 sample block. + */ + +GLOBAL(void) +jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + DCTELEM workspace[8*2]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* we scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/20). 
*/ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]); + tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]); + + tmp10 = tmp0 + tmp4; + tmp13 = tmp0 - tmp4; + tmp11 = tmp1 + tmp3; + tmp14 = tmp1 - tmp3; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1); + tmp12 += tmp12; + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */ + MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */ + CONST_BITS-1); + tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */ + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */ + CONST_BITS-1); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */ + CONST_BITS-1); + + /* Odd part */ + + tmp10 = tmp0 + tmp4; + tmp11 = tmp1 - tmp3; + dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1); + tmp2 <<= CONST_BITS; + dataptr[1] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */ + MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */ + MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */ + MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */ + CONST_BITS-1); + tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */ + MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */ + tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */ + (tmp11 << (CONST_BITS - 1)) - tmp2; + dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1); + dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 10) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/10)**2 = 16/25, which we partially + * fold into the constant multipliers and final/initial shifting: + * cK now represents sqrt(2) * cos(K*pi/20) * 32/25. 
+ */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0]; + tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5]; + + tmp10 = tmp0 + tmp4; + tmp13 = tmp0 - tmp4; + tmp11 = tmp1 + tmp3; + tmp14 = tmp1 - tmp3; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6]; + tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */ + CONST_BITS+2); + tmp12 += tmp12; + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */ + MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */ + CONST_BITS+2); + tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */ + CONST_BITS+2); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */ + CONST_BITS+2); + + /* Odd part */ + + tmp10 = tmp0 + tmp4; + tmp11 = tmp1 - tmp3; + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */ + CONST_BITS+2); + tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */ + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */ + MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */ + MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */ + MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */ + CONST_BITS+2); + tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */ + MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */ + tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */ + MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */ + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on an 11x11 sample block. + */ + +GLOBAL(void) +jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 z1, z2, z3; + DCTELEM workspace[8*3]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* we scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/22). 
*/ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]); + tmp5 = GETJSAMPLE(elemptr[5]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]); + tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]); + tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1); + tmp5 += tmp5; + tmp0 -= tmp5; + tmp1 -= tmp5; + tmp2 -= tmp5; + tmp3 -= tmp5; + tmp4 -= tmp5; + z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */ + MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */ + z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */ + z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */ + dataptr[2] = (DCTELEM) + DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */ + - MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */ + CONST_BITS-1); + dataptr[4] = (DCTELEM) + DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */ + - MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */ + + MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */ + CONST_BITS-1); + dataptr[6] = (DCTELEM) + DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */ + - MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */ + CONST_BITS-1); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */ + tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */ + tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */ + tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */ + + MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */ + tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */ + tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */ + tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */ + - MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */ + tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */ + tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */ + + MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */ + tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */ + - MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1); + dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 11) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/11)**2 = 64/121, which we partially + * fold into the constant multipliers and final/initial shifting: + * cK now represents sqrt(2) * cos(K*pi/22) * 128/121. 
+ */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6]; + tmp5 = dataptr[DCTSIZE*5]; + + tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2]; + tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1]; + tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0]; + tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7]; + tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5, + FIX(1.057851240)), /* 128/121 */ + CONST_BITS+2); + tmp5 += tmp5; + tmp0 -= tmp5; + tmp1 -= tmp5; + tmp2 -= tmp5; + tmp3 -= tmp5; + tmp4 -= tmp5; + z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */ + MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */ + z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */ + z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */ + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */ + - MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */ + CONST_BITS+2); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */ + - MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */ + + MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */ + CONST_BITS+2); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */ + - MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */ + CONST_BITS+2); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */ + tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */ + tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */ + tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */ + + MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */ + tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */ + tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */ + tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */ + - MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */ + tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */ + tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */ + + MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */ + tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */ + - MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 12x12 sample block. + */ + +GLOBAL(void) +jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + DCTELEM workspace[8*4]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* cK represents sqrt(2) * cos(K*pi/24). 
*/ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]); + + tmp10 = tmp0 + tmp5; + tmp13 = tmp0 - tmp5; + tmp11 = tmp1 + tmp4; + tmp14 = tmp1 - tmp4; + tmp12 = tmp2 + tmp3; + tmp15 = tmp2 - tmp3; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE); + dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */ + CONST_BITS); + dataptr[2] = (DCTELEM) + DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */ + CONST_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */ + tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */ + tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */ + tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */ + + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */ + tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */ + tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */ + + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */ + tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */ + - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */ + tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */ + - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 12) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/12)**2 = 4/9, which we partially + * fold into the constant multipliers and final shifting: + * cK now represents sqrt(2) * cos(K*pi/24) * 8/9. 
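 *
 * Editor's note (not from the upstream sources): the full output scale here
 * is (8/12)**2 = 4/9.  FIX(0.888888889) supplies the 8/9 part, and the extra
 * "+1" in each final DESCALE count supplies the remaining factor of 1/2.
 * As a spot check, sqrt(2.0) * cos(4.0*M_PI/24.0) * 8.0/9.0 = 1.088662108,
 * the folded c4 multiplier used for dataptr[DCTSIZE*4] below.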
+ */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7]; + tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6]; + + tmp10 = tmp0 + tmp5; + tmp13 = tmp0 - tmp5; + tmp11 = tmp1 + tmp4; + tmp14 = tmp1 - tmp4; + tmp12 = tmp2 + tmp3; + tmp15 = tmp2 - tmp3; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0]; + tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7]; + tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */ + CONST_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */ + CONST_BITS+1); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */ + CONST_BITS+1); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */ + MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */ + CONST_BITS+1); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */ + tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */ + tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */ + tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */ + + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */ + tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */ + tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */ + + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */ + tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */ + - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */ + tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */ + - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 13x13 sample block. + */ + +GLOBAL(void) +jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 z1, z2; + DCTELEM workspace[8*5]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* cK represents sqrt(2) * cos(K*pi/26). 
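 *
 * Editor's note (not from the upstream sources): in the even part below,
 * z1 and z2 are built from half-sum/half-difference constants such as
 * (c4+c6)/2 and (c4-c6)/2, so that dataptr[4] and dataptr[6] can then be
 * formed cheaply as z1 + z2 and z1 - z2.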
*/ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]); + tmp6 = GETJSAMPLE(elemptr[6]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]); + tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]); + tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]); + tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE); + tmp6 += tmp6; + tmp0 -= tmp6; + tmp1 -= tmp6; + tmp2 -= tmp6; + tmp3 -= tmp6; + tmp4 -= tmp6; + tmp5 -= tmp6; + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */ + MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */ + MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */ + MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */ + MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */ + MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */ + CONST_BITS); + z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */ + MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */ + MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */ + z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */ + MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */ + MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */ + + dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */ + tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */ + tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */ + MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */ + MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */ + tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */ + MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */ + tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */ + tmp1 += tmp4 + tmp5 + + MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */ + MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */ + tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */ + tmp2 += tmp4 + tmp6 - + MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */ + MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */ + tmp3 += tmp5 + tmp6 + + MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */ + MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 13) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. 
+ * We must also scale the output by (8/13)**2 = 64/169, which we partially + * fold into the constant multipliers and final shifting: + * cK now represents sqrt(2) * cos(K*pi/26) * 128/169. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0]; + tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7]; + tmp6 = dataptr[DCTSIZE*6]; + + tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4]; + tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3]; + tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2]; + tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1]; + tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0]; + tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6, + FIX(0.757396450)), /* 128/169 */ + CONST_BITS+1); + tmp6 += tmp6; + tmp0 -= tmp6; + tmp1 -= tmp6; + tmp2 -= tmp6; + tmp3 -= tmp6; + tmp4 -= tmp6; + tmp5 -= tmp6; + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */ + MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */ + MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */ + MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */ + MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */ + MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */ + CONST_BITS+1); + z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */ + MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */ + MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */ + z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */ + MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */ + MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */ + + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */ + tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */ + tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */ + MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */ + MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */ + tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */ + MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */ + tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */ + tmp1 += tmp4 + tmp5 + + MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */ + MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */ + tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */ + tmp2 += tmp4 + tmp6 - + MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */ + MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */ + tmp3 += tmp5 + tmp6 + + MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */ + MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 14x14 sample block. 
+ */ + +GLOBAL(void) +jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + DCTELEM workspace[8*6]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* cK represents sqrt(2) * cos(K*pi/28). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]); + tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]); + + tmp10 = tmp0 + tmp6; + tmp14 = tmp0 - tmp6; + tmp11 = tmp1 + tmp5; + tmp15 = tmp1 - tmp5; + tmp12 = tmp2 + tmp4; + tmp16 = tmp2 - tmp4; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]); + tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE); + tmp13 += tmp13; + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */ + MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */ + MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */ + CONST_BITS); + + tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */ + + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */ + + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */ + CONST_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */ + - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */ + CONST_BITS); + + /* Odd part */ + + tmp10 = tmp1 + tmp2; + tmp11 = tmp5 - tmp4; + dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6); + tmp3 <<= CONST_BITS; + tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */ + tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */ + tmp10 += tmp11 - tmp3; + tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */ + MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */ + dataptr[5] = (DCTELEM) + DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */ + + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */ + CONST_BITS); + tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */ + MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */ + dataptr[3] = (DCTELEM) + DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */ + - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */ + CONST_BITS); + dataptr[1] = (DCTELEM) + DESCALE(tmp11 + tmp12 + tmp3 + tmp6 - + MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */ + CONST_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 14) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. 
+ * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/14)**2 = 16/49, which we partially + * fold into the constant multipliers and final shifting: + * cK now represents sqrt(2) * cos(K*pi/28) * 32/49. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3]; + tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0]; + tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7]; + + tmp10 = tmp0 + tmp6; + tmp14 = tmp0 - tmp6; + tmp11 = tmp1 + tmp5; + tmp15 = tmp1 - tmp5; + tmp12 = tmp2 + tmp4; + tmp16 = tmp2 - tmp4; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2]; + tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0]; + tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13, + FIX(0.653061224)), /* 32/49 */ + CONST_BITS+1); + tmp13 += tmp13; + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */ + MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */ + MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */ + CONST_BITS+1); + + tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */ + + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */ + + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */ + CONST_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */ + - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */ + CONST_BITS+1); + + /* Odd part */ + + tmp10 = tmp1 + tmp2; + tmp11 = tmp5 - tmp4; + dataptr[DCTSIZE*7] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6, + FIX(0.653061224)), /* 32/49 */ + CONST_BITS+1); + tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */ + tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */ + tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */ + tmp10 += tmp11 - tmp3; + tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */ + MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */ + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */ + + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */ + CONST_BITS+1); + tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */ + MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */ + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */ + - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */ + CONST_BITS+1); + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp11 + tmp12 + tmp3 + - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */ + - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */ + CONST_BITS+1); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 15x15 sample block. 
+ */ + +GLOBAL(void) +jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 z1, z2, z3; + DCTELEM workspace[8*7]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* cK represents sqrt(2) * cos(K*pi/30). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]); + tmp7 = GETJSAMPLE(elemptr[7]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]); + tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]); + tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]); + tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]); + tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]); + + z1 = tmp0 + tmp4 + tmp5; + z2 = tmp1 + tmp3 + tmp6; + z3 = tmp2 + tmp7; + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE); + z3 += z3; + dataptr[6] = (DCTELEM) + DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */ + MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */ + CONST_BITS); + tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7; + z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */ + MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */ + z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */ + MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */ + z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */ + MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */ + MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */ + + dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS); + dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS); + + /* Odd part */ + + tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16, + FIX(1.224744871)); /* c5 */ + tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */ + MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */ + tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */ + tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */ + MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */ + MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */ + tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */ + MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */ + MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */ + tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */ + MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */ + MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 15) + break; /* Done. 
*/ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/15)**2 = 64/225, which we partially + * fold into the constant multipliers and final shifting: + * cK now represents sqrt(2) * cos(K*pi/30) * 256/225. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1]; + tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0]; + tmp7 = dataptr[DCTSIZE*7]; + + tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6]; + tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5]; + tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4]; + tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3]; + tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2]; + tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1]; + tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0]; + + z1 = tmp0 + tmp4 + tmp5; + z2 = tmp1 + tmp3 + tmp6; + z3 = tmp2 + tmp7; + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */ + CONST_BITS+2); + z3 += z3; + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */ + MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */ + CONST_BITS+2); + tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7; + z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */ + MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */ + z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */ + MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */ + z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */ + MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */ + MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */ + + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2); + + /* Odd part */ + + tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16, + FIX(1.393487498)); /* c5 */ + tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */ + MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */ + tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */ + tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */ + MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */ + MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */ + tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */ + MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */ + MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */ + tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */ + MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */ + MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 16x16 sample block. 
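 *
 * Editor's note (not part of the upstream sources): the 16 input rows do not
 * fit the 8x8 coefficient array, so pass 1 writes the first 8 transformed
 * rows into data[] and the last 8 into the local workspace[]; pass 2 then
 * combines dataptr[] and wsptr[] column by column, as the smaller NxN
 * routines above do.  The even-part multipliers reuse the 8-point constants,
 * e.g. c4[16] = sqrt(2)*cos(4*pi/32) = sqrt(2)*cos(pi/8) = c2[8] = 1.306562965.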
+ */ + +GLOBAL(void) +jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17; + DCTELEM workspace[DCTSIZE2]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* cK represents sqrt(2) * cos(K*pi/32). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]); + tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]); + + tmp10 = tmp0 + tmp7; + tmp14 = tmp0 - tmp7; + tmp11 = tmp1 + tmp6; + tmp15 = tmp1 - tmp6; + tmp12 = tmp2 + tmp5; + tmp16 = tmp2 - tmp5; + tmp13 = tmp3 + tmp4; + tmp17 = tmp3 - tmp4; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]); + tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]); + tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */ + MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */ + CONST_BITS-PASS1_BITS); + + tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */ + MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */ + + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */ + + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */ + - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */ + MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */ + MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */ + MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */ + tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */ + MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */ + tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */ + MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */ + tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */ + MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */ + MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */ + tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */ + - MULTIPLY(tmp6, 
FIX(1.663905119)); /* c7+c13+c1-c5 */ + tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */ + + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */ + tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */ + + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == DCTSIZE * 2) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/16)**2 = 1/2**2. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2]; + tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1]; + tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0]; + + tmp10 = tmp0 + tmp7; + tmp14 = tmp0 - tmp7; + tmp11 = tmp1 + tmp6; + tmp15 = tmp1 - tmp6; + tmp12 = tmp2 + tmp5; + tmp16 = tmp2 - tmp5; + tmp13 = tmp3 + tmp4; + tmp17 = tmp3 - tmp4; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3]; + tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2]; + tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1]; + tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */ + MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */ + CONST_BITS+PASS1_BITS+2); + + tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */ + MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */ + + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */ + + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */ + CONST_BITS+PASS1_BITS+2); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */ + - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */ + CONST_BITS+PASS1_BITS+2); + + /* Odd part */ + + tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */ + MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */ + MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */ + MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */ + tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */ + MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */ + tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */ + MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */ + tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */ + MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */ + 
tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */ + MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */ + tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */ + - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */ + tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */ + + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */ + tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */ + + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 16x8 sample block. + * + * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17; + INT32 z1; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */ + + dataptr = data; + ctr = 0; + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]); + tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]); + + tmp10 = tmp0 + tmp7; + tmp14 = tmp0 - tmp7; + tmp11 = tmp1 + tmp6; + tmp15 = tmp1 - tmp6; + tmp12 = tmp2 + tmp5; + tmp16 = tmp2 - tmp5; + tmp13 = tmp3 + tmp4; + tmp17 = tmp3 - tmp4; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]); + tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]); + tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */ + MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */ + CONST_BITS-PASS1_BITS); + + tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */ + MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */ + + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */ + + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) 
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */ + - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */ + MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */ + MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */ + MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */ + tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */ + MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */ + tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */ + MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */ + tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */ + MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */ + MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */ + tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */ + - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */ + tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */ + + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */ + tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */ + + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by 8/16 = 1/2. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { /* Even part per LL&M figure 1 --- note that published figure is faulty; * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". */ - + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + + tmp10 = tmp0 + tmp3; + tmp12 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp13 = tmp1 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), + CONST_BITS+PASS1_BITS+1); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. 
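 *
 * Editor's note (not from the upstream sources): this column pass is
 * essentially the standard LL&M 8-point kernel (as in jpeg_fdct_islow), with
 * an extra "+1" in each DESCALE count to apply the 8/16 = 1/2 output scaling
 * noted above.  For reference, FIX_1_175875602 is sqrt(2)*cos(3*pi/16) (c3)
 * and FIX_0_541196100 is sqrt(2)*cos(6*pi/16) (c6).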
+ */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, + CONST_BITS+PASS1_BITS+1); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 14x7 sample block. + * + * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 z1, z2, z3; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Zero bottom row of output coefficient block. */ + MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). 
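 *
 * Editor's note (not part of the upstream sources): for these rectangular
 * blocks the two passes use different kernel lengths, here a 14-point row
 * FDCT over the 7 sample rows followed by a 7-point column FDCT.  Only
 * coefficient rows 0..6 are ever written, which is why the MEMZERO above
 * clears the bottom row of data[] up front.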
*/ + + dataptr = data; + for (ctr = 0; ctr < 7; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]); + tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]); + + tmp10 = tmp0 + tmp6; + tmp14 = tmp0 - tmp6; + tmp11 = tmp1 + tmp5; + tmp15 = tmp1 - tmp5; + tmp12 = tmp2 + tmp4; + tmp16 = tmp2 - tmp4; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]); + tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS); + tmp13 += tmp13; + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */ + MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */ + MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */ + CONST_BITS-PASS1_BITS); + + tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */ + + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */ + + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */ + - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = tmp1 + tmp2; + tmp11 = tmp5 - tmp4; + dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS); + tmp3 <<= CONST_BITS; + tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */ + tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */ + tmp10 += tmp11 - tmp3; + tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */ + MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */ + dataptr[5] = (DCTELEM) + DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */ + + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */ + CONST_BITS-PASS1_BITS); + tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */ + MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */ + dataptr[3] = (DCTELEM) + DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */ + - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */ + CONST_BITS-PASS1_BITS); + dataptr[1] = (DCTELEM) + DESCALE(tmp11 + tmp12 + tmp3 + tmp6 - + MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */ + CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/14)*(8/7) = 32/49, which we + * partially fold into the constant multipliers and final shifting: + * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49. 
+ */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4]; + tmp3 = dataptr[DCTSIZE*3]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5]; + tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4]; + + z1 = tmp0 + tmp2; + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */ + CONST_BITS+PASS1_BITS+1); + tmp3 += tmp3; + z1 -= tmp3; + z1 -= tmp3; + z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */ + z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1); + z1 -= z2; + z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */ + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */ + tmp1 += tmp2; + tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */ + tmp0 += tmp3; + tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 12x6 sample block. + * + * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Zero 2 bottom rows of output coefficient block. */ + MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). 
*/ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]); + + tmp10 = tmp0 + tmp5; + tmp13 = tmp0 - tmp5; + tmp11 = tmp1 + tmp4; + tmp14 = tmp1 - tmp4; + tmp12 = tmp2 + tmp3; + tmp15 = tmp2 - tmp3; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */ + CONST_BITS-PASS1_BITS); + dataptr[2] = (DCTELEM) + DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */ + tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */ + tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */ + tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */ + + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */ + tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */ + tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */ + + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */ + tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */ + - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */ + tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */ + - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/12)*(8/6) = 8/9, which we + * partially fold into the constant multipliers and final shifting: + * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9. 
+ */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5]; + tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */ + CONST_BITS+PASS1_BITS+1); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS+1); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 10x5 sample block. + * + * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Zero 3 bottom rows of output coefficient block. */ + MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). 
*/ + + dataptr = data; + for (ctr = 0; ctr < 5; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]); + tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]); + + tmp10 = tmp0 + tmp4; + tmp13 = tmp0 - tmp4; + tmp11 = tmp1 + tmp3; + tmp14 = tmp1 - tmp3; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS); + tmp12 += tmp12; + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */ + MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */ + CONST_BITS-PASS1_BITS); + tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */ + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = tmp0 + tmp4; + tmp11 = tmp1 - tmp3; + dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS); + tmp2 <<= CONST_BITS; + dataptr[1] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */ + MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */ + MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */ + MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */ + CONST_BITS-PASS1_BITS); + tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */ + MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */ + tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */ + (tmp11 << (CONST_BITS - 1)) - tmp2; + dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/10)*(8/5) = 32/25, which we + * fold into the constant multipliers: + * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25. 
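 *
 * Editor's note (not from the upstream sources): unlike most of the cases
 * above, the composite scale (8/10)*(8/5) = 32/25 is folded entirely into
 * the multipliers (FIX(1.28) is literally 32/25), so the DESCALE counts in
 * this pass stay at CONST_BITS+PASS1_BITS with no extra shift.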
+ */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3]; + tmp2 = dataptr[DCTSIZE*2]; + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */ + CONST_BITS+PASS1_BITS); + tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */ + tmp10 -= tmp2 << 2; + tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */ + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on an 8x4 sample block. + * + * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Zero 4 bottom rows of output coefficient block. */ + MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We must also scale the output by 8/4 = 2, which we add here. */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]); + + tmp10 = tmp0 + tmp3; + tmp12 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp13 = tmp1 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1)); + dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1)); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-2); + dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), + CONST_BITS-PASS1_BITS-1); + dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), + CONST_BITS-PASS1_BITS-1); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. 
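(The fudge term ONE << (CONST_BITS-PASS1_BITS-2) added to z1 is half the weight dropped by the RIGHT_SHIFT(..., CONST_BITS-PASS1_BITS-1) descales, so the results round to nearest rather than truncate.)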
+ */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-2); + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1); + dataptr[3] = (DCTELEM) + RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1); + dataptr[5] = (DCTELEM) + RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1); + dataptr[7] = (DCTELEM) + RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1)); + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS+PASS1_BITS-1); + + dataptr[DCTSIZE*1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 6x3 sample block. + * + * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + INT32 tmp10, tmp11, tmp12; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). 
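(Numerically, c2 = sqrt(2)*cos(pi/6) = 1.224744871, c4 = sqrt(2)*cos(pi/3) = 0.707106781 and c5 = sqrt(2)*cos(5*pi/12) = 0.366025404, the three FIX() constants used below.)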
*/ + + dataptr = data; + for (ctr = 0; ctr < 3; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]); + tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]); + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1)); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */ + CONST_BITS-PASS1_BITS-1); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */ + CONST_BITS-PASS1_BITS-1); + + /* Odd part */ + + tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */ + CONST_BITS-PASS1_BITS-1); + + dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1))); + dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1)); + dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1))); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially + * fold into the constant multipliers (other part was done in pass 1): + * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9. + */ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2]; + tmp1 = dataptr[DCTSIZE*1]; + + tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 4x2 sample block. + * + * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */ + /* 4-point FDCT kernel, */ + /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. 
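(Indexing the constants on the 8-point basis lets this kernel reuse FIX_0_541196100 (c6), FIX_0_765366865 (c2-c6) and FIX_1_847759065 (c2+c6) instead of introducing new FIX() literals.)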
*/ + + dataptr = data; + for (ctr = 0; ctr < 2; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3)); + dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3)); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-4); + + dataptr[1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS-3); + dataptr[3] = (DCTELEM) + RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS-PASS1_BITS-3); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1)); + tmp1 = dataptr[DCTSIZE*1]; + + dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS); + + /* Odd part */ + + dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 2x1 sample block. + * + * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + JSAMPROW elemptr; + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + elemptr = sample_data[0] + start_col; + + tmp0 = GETJSAMPLE(elemptr[0]); + tmp1 = GETJSAMPLE(elemptr[1]); + + /* We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/2)*(8/1) = 2**5. + */ + + /* Even part */ + /* Apply unsigned->signed conversion */ + data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5); + + /* Odd part */ + data[1] = (DCTELEM) ((tmp0 - tmp1) << 5); +} + + +/* + * Perform the forward DCT on an 8x16 sample block. + * + * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17; + INT32 z1; + DCTELEM workspace[DCTSIZE2]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". 
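(LL&M refers to Loeffler, Ligtenberg and Moshytz, whose fast 1-D DCT factorization this row pass follows.)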
+ */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]); + tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; + tmp12 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - - dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); - dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); - + tmp13 = tmp1 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); - dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), - CONST_BITS+PASS1_BITS); - dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), - CONST_BITS+PASS1_BITS); - + dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), + CONST_BITS-PASS1_BITS); + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). - * cK represents cos(K*pi/16). - * i0..i3 in the paper are tmp4..tmp7 here. + * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. */ - - z1 = tmp4 + tmp7; - z2 = tmp5 + tmp6; - z3 = tmp4 + tmp6; - z4 = tmp5 + tmp7; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - - tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, - CONST_BITS+PASS1_BITS); - dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, - CONST_BITS+PASS1_BITS); - dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, - CONST_BITS+PASS1_BITS); - dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, - CONST_BITS+PASS1_BITS); - + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) 
DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == DCTSIZE * 2) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by 8/16 = 1/2. + * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2]; + tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1]; + tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0]; + + tmp10 = tmp0 + tmp7; + tmp14 = tmp0 - tmp7; + tmp11 = tmp1 + tmp6; + tmp15 = tmp1 - tmp6; + tmp12 = tmp2 + tmp5; + tmp16 = tmp2 - tmp5; + tmp13 = tmp3 + tmp4; + tmp17 = tmp3 - tmp4; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3]; + tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2]; + tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1]; + tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */ + MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */ + CONST_BITS+PASS1_BITS+1); + + tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */ + MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */ + + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */ + + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */ + - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */ + CONST_BITS+PASS1_BITS+1); + + /* Odd part */ + + tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */ + MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */ + MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */ + MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */ + tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */ + MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */ + tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */ + MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */ + tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */ + MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */ + MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */ + tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */ + - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */ + tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */ + + MULTIPLY(tmp5, 
FIX(1.227391138)); /* c9-c11+c1-c13 */ + tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */ + + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1); + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ } } + +/* + * Perform the forward DCT on a 7x14 sample block. + * + * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 z1, z2, z3; + DCTELEM workspace[8*6]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]); + tmp3 = GETJSAMPLE(elemptr[3]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]); + + z1 = tmp0 + tmp2; + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS); + tmp3 += tmp3; + z1 -= tmp3; + z1 -= tmp3; + z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */ + z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */ + dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS); + z1 -= z2; + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */ + dataptr[4] = (DCTELEM) + DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */ + tmp0 += tmp3; + tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 14) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. 
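(Each 1-D pass contributes a factor of sqrt(8), so the two passes together account for the factor of 8.)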
+ * We must also scale the output by (8/7)*(8/14) = 32/49, which we + * fold into the constant multipliers: + * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3]; + tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0]; + tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7]; + + tmp10 = tmp0 + tmp6; + tmp14 = tmp0 - tmp6; + tmp11 = tmp1 + tmp5; + tmp15 = tmp1 - tmp5; + tmp12 = tmp2 + tmp4; + tmp16 = tmp2 - tmp4; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2]; + tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0]; + tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13, + FIX(0.653061224)), /* 32/49 */ + CONST_BITS+PASS1_BITS); + tmp13 += tmp13; + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */ + MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */ + MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */ + CONST_BITS+PASS1_BITS); + + tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */ + + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */ + + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */ + - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = tmp1 + tmp2; + tmp11 = tmp5 - tmp4; + dataptr[DCTSIZE*7] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6, + FIX(0.653061224)), /* 32/49 */ + CONST_BITS+PASS1_BITS); + tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */ + tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */ + tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */ + tmp10 += tmp11 - tmp3; + tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */ + MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */ + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */ + + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */ + CONST_BITS+PASS1_BITS); + tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */ + MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */ + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */ + - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp11 + tmp12 + tmp3 + - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */ + - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 6x12 sample block. + * + * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns). 
+ */ + +GLOBAL(void) +jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + DCTELEM workspace[8*4]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]); + tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]); + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */ + CONST_BITS-PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */ + CONST_BITS-PASS1_BITS); + + dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS)); + dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS); + dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS)); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 12) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/6)*(8/12) = 8/9, which we + * fold into the constant multipliers: + * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9. 
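(8/9 = 0.888888889, the FIX() constant used for the DC and row-6 outputs below.)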
+ */ + + dataptr = data; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7]; + tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6]; + + tmp10 = tmp0 + tmp5; + tmp13 = tmp0 - tmp5; + tmp11 = tmp1 + tmp4; + tmp14 = tmp1 - tmp4; + tmp12 = tmp2 + tmp3; + tmp15 = tmp2 - tmp3; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0]; + tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7]; + tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */ + MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */ + tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */ + tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */ + tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */ + + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */ + tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */ + tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */ + + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */ + tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */ + - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */ + tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */ + - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 5x10 sample block. + * + * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + DCTELEM workspace[8*2]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). 
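(Numerically c2 = 1.144123 and c4 = 0.437016, so the half-sum (c2+c4)/2 = 0.790569 and half-difference (c2-c4)/2 = 0.353553 appear as the even-part multipliers below.)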
*/ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]); + tmp2 = GETJSAMPLE(elemptr[2]); + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS); + tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */ + tmp10 -= tmp2 << 2; + tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */ + dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS); + dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */ + + dataptr[1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */ + CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */ + CONST_BITS-PASS1_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 10) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/5)*(8/10) = 32/25, which we + * fold into the constant multipliers: + * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0]; + tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5]; + + tmp10 = tmp0 + tmp4; + tmp13 = tmp0 - tmp4; + tmp11 = tmp1 + tmp3; + tmp14 = tmp1 - tmp3; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6]; + tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */ + CONST_BITS+PASS1_BITS); + tmp12 += tmp12; + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */ + MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */ + CONST_BITS+PASS1_BITS); + tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = tmp0 + tmp4; + tmp11 = tmp1 - tmp3; + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */ + CONST_BITS+PASS1_BITS); + tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */ + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */ + MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */ + MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */ + MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */ + CONST_BITS+PASS1_BITS); + tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 
*/ + MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */ + tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */ + MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */ + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 4x8 sample block. + * + * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We must also scale the output by 8/4 = 2, which we add here. */ + /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */ + + dataptr = data; + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1)); + dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1)); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-2); + + dataptr[1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS-1); + dataptr[3] = (DCTELEM) + RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS-PASS1_BITS-1); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + + /* Add fudge factor here for final descale. */ + tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1)); + tmp12 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp13 = tmp1 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + /* Add fudge factor here for final descale. 
*/ + z1 += ONE << (CONST_BITS+PASS1_BITS-1); + dataptr[DCTSIZE*2] = (DCTELEM) + RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) + RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. + */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS+PASS1_BITS-1); + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[DCTSIZE*1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) + RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*7] = (DCTELEM) + RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 3x6 sample block. + * + * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + INT32 tmp10, tmp11, tmp12; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]); + tmp1 = GETJSAMPLE(elemptr[1]); + + tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1)); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */ + CONST_BITS-PASS1_BITS-1); + + /* Odd part */ + + dataptr[1] = (DCTELEM) + DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */ + CONST_BITS-PASS1_BITS-1); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially + * fold into the constant multipliers (other part was done in pass 1): + * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9. 
+ */ + + dataptr = data; + for (ctr = 0; ctr < 3; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5]; + tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 2x4 sample block. + * + * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]); + tmp1 = GETJSAMPLE(elemptr[1]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3); + + /* Odd part */ + + dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * 4-point FDCT kernel, + * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. + */ + + dataptr = data; + for (ctr = 0; ctr < 2; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1); + dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-1); + + dataptr[DCTSIZE*1] = (DCTELEM) + RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 1x2 sample block. + * + * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns). 
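The transform reduces to a scaled sum (the DC term) and difference (the single AC term) of the two samples, so no multiplications are needed.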
+ */ + +GLOBAL(void) +jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + tmp0 = GETJSAMPLE(sample_data[0][start_col]); + tmp1 = GETJSAMPLE(sample_data[1][start_col]); + + /* We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/1)*(8/2) = 2**5. + */ + + /* Even part */ + /* Apply unsigned->signed conversion */ + data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5); + + /* Odd part */ + data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5); +} + +#endif /* DCT_SCALING_SUPPORTED */ #endif /* DCT_ISLOW_SUPPORTED */ diff --git a/src/libjpeg/jidctint.c b/src/libjpeg/jidctint.c index a72b320..dcdf7ce 100644 --- a/src/libjpeg/jidctint.c +++ b/src/libjpeg/jidctint.c @@ -2,6 +2,7 @@ * jidctint.c * * Copyright (C) 1991-1998, Thomas G. Lane. + * Modification developed 2002-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -23,6 +24,28 @@ * The advantage of this method is that no data path contains more than one * multiplication; this allows a very simple and accurate implementation in * scaled fixed-point arithmetic, with a minimal number of shifts. + * + * We also provide IDCT routines with various output sample block sizes for + * direct resolution reduction or enlargement and for direct resolving the + * common 2x1 and 1x2 subsampling cases without additional resampling: NxN + * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block. + * + * For N<8 we simply take the corresponding low-frequency coefficients of + * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block + * to yield the downscaled outputs. + * This can be seen as direct low-pass downsampling from the DCT domain + * point of view rather than the usual spatial domain point of view, + * yielding significant computational savings and results at least + * as good as common bilinear (averaging) spatial downsampling. + * + * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as + * lower frequencies and higher frequencies assumed to be zero. + * It turns out that the computational effort is similar to the 8x8 IDCT + * regarding the output size. + * Furthermore, the scaling and descaling is the same for all IDCT sizes. + * + * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases + * since there would be too many additional constants to pre-calculate. */ #define JPEG_INTERNALS @@ -38,7 +61,7 @@ */ #if DCTSIZE != 8 - Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ + Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */ #endif @@ -151,7 +174,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, { INT32 tmp0, tmp1, tmp2, tmp3; INT32 tmp10, tmp11, tmp12, tmp13; - INT32 z1, z2, z3, z4, z5; + INT32 z1, z2, z3; JCOEFPTR inptr; ISLOW_MULT_TYPE * quantptr; int * wsptr; @@ -177,14 +200,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, * With typical images and quantization tables, half or more of the * column DCT calculations can be simplified this way. 
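(A column whose seven AC coefficients are all zero collapses to replicating its dequantized, left-shifted DC value into all eight work-array entries.)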
*/ - + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) { /* AC terms all zero */ int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; - + wsptr[DCTSIZE*0] = dcval; wsptr[DCTSIZE*1] = dcval; wsptr[DCTSIZE*2] = dcval; @@ -193,82 +216,84 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, wsptr[DCTSIZE*5] = dcval; wsptr[DCTSIZE*6] = dcval; wsptr[DCTSIZE*7] = dcval; - + inptr++; /* advance pointers to next column */ quantptr++; wsptr++; continue; } - + /* Even part: reverse the even part of the forward DCT. */ /* The rotator is sqrt(2)*c(-6). */ z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); - + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); - tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); - tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); - + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); + tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); + z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z2 <<= CONST_BITS; + z3 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z2 += ONE << (CONST_BITS-PASS1_BITS-1); + + tmp0 = z2 + z3; + tmp1 = z2 - z3; + + tmp10 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + tmp11 = tmp1 + tmp3; + tmp12 = tmp1 - tmp3; - tmp0 = (z2 + z3) << CONST_BITS; - tmp1 = (z2 - z3) << CONST_BITS; - - tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; - tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - /* Odd part per figure 8; the matrix is unitary and hence its * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */ - + tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); - z1 = tmp0 + tmp3; - z2 = tmp1 + tmp2; - z3 = tmp0 + tmp2; - z4 = tmp1 + tmp3; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - + z2 = tmp0 + tmp2; + z3 = tmp1 + tmp3; + + z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */ + z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + z2 += z1; + z3 += z1; + + z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + tmp0 += z1 + z2; + tmp3 += z1 + z3; + + z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - tmp0 += z1 + z3; - tmp1 += z2 + z4; - tmp2 += z2 + z3; - tmp3 += z1 + z4; - + tmp1 += z1 + z3; + tmp2 += z1 + z2; + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - - wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); - 
wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + + wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS); inptr++; /* advance pointers to next column */ quantptr++; wsptr++; } - + /* Pass 2: process rows from work array, store into output array. */ /* Note that we must descale the results by a factor of 8 == 2**3, */ /* and also undo the PASS1_BITS scaling. */ @@ -283,14 +308,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, * test takes more time than it's worth. In that case this section * may be commented out. */ - + #ifndef NO_ZERO_ROW_TEST if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { /* AC terms all zero */ JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3) & RANGE_MASK]; - + outptr[0] = dcval; outptr[1] = dcval; outptr[2] = dcval; @@ -304,86 +329,4809 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, continue; } #endif - + /* Even part: reverse the even part of the forward DCT. */ /* The rotator is sqrt(2)*c(-6). */ z2 = (INT32) wsptr[2]; z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); + tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); + + /* Add fudge factor here for final descale. */ + z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 = (INT32) wsptr[4]; + + tmp0 = (z2 + z3) << CONST_BITS; + tmp1 = (z2 - z3) << CONST_BITS; + + tmp10 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + tmp11 = tmp1 + tmp3; + tmp12 = tmp1 - tmp3; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
+ */ + + tmp0 = (INT32) wsptr[7]; + tmp1 = (INT32) wsptr[5]; + tmp2 = (INT32) wsptr[3]; + tmp3 = (INT32) wsptr[1]; + + z2 = tmp0 + tmp2; + z3 = tmp1 + tmp3; + + z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */ + z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + z2 += z1; + z3 += z1; + + z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + tmp0 += z1 + z2; + tmp3 += z1 + z3; + + z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp1 += z1 + z3; + tmp2 += z1 + z2; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + +#ifdef IDCT_SCALING_SUPPORTED + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 7x7 output block. + * + * Optimized algorithm with 12 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/14). + */ + +GLOBAL(void) +jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[7*7]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp13 <<= CONST_BITS; + /* Add fudge factor here for final descale. 
*/ + tmp13 += ONE << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp0 = z1 + z3; + z2 -= tmp0; + tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */ + tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + + tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp0 += z2; + tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 7 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
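+ * In this pass the bias is ONE << (PASS1_BITS+2); after the
+ * << CONST_BITS below it equals ONE << (CONST_BITS+PASS1_BITS+2),
+ * i.e. half of the final divisor 2^(CONST_BITS+PASS1_BITS+3), and
+ * because the DC term enters every output of the row it rounds the
+ * whole row with a single add.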
*/ + tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp13 <<= CONST_BITS; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[4]; + z3 = (INT32) wsptr[6]; + + tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp0 = z1 + z3; + z2 -= tmp0; + tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */ + tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + + tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp0 += z2; + tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 7; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 6x6 output block. + * + * Optimized algorithm with 3 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/12). + */ + +GLOBAL(void) +jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[6*6]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + /* Add fudge factor here for final descale. 
*/ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS); + tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp0 = tmp1 + ((z1 + z2) << CONST_BITS); + tmp2 = tmp1 + ((z3 - z2) << CONST_BITS); + tmp1 = (z1 - z2 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*1] = (int) (tmp11 + tmp1); + wsptr[6*4] = (int) (tmp11 - tmp1); + wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 6 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 <<= CONST_BITS; + tmp2 = (INT32) wsptr[4]; + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = tmp0 - tmp10 - tmp10; + tmp10 = (INT32) wsptr[2]; + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp0 = tmp1 + ((z1 + z2) << CONST_BITS); + tmp2 = tmp1 + ((z3 - z2) << CONST_BITS); + tmp1 = (z1 - z2 - z3) << CONST_BITS; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 6; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 5x5 output block. + * + * Optimized algorithm with 5 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/10). + */ + +GLOBAL(void) +jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[5*5]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. 
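+ * As in the 8x8 routine, the 2-D transform is done separably: this
+ * pass runs a 1-D 5-point IDCT down each of the 5 coefficient
+ * columns, keeping PASS1_BITS extra fraction bits in workspace[],
+ * and pass 2 repeats the kernel across the 5 rows, descaling and
+ * range-limiting into the caller's output buffer.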
*/ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp12 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + tmp12 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= z2 << 2; + + /* Odd part */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 5 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp12 <<= CONST_BITS; + tmp0 = (INT32) wsptr[2]; + tmp1 = (INT32) wsptr[4]; + z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= z2 << 2; + + /* Odd part */ + + z2 = (INT32) wsptr[1]; + z3 = (INT32) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 5; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 4x4 output block. + * + * Optimized algorithm with 3 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT]. 
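+ * Only the top-left 4x4 corner of the 8x8 coefficient block is read.
+ * The named constants are the ones already defined for the 8-point
+ * kernel; e.g. FIX_0_541196100 is sqrt(2)*cos(6*pi/16) = 0.541196100,
+ * held as 4433 when CONST_BITS is 13 (the 8-bit-sample build).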
+ */ + +GLOBAL(void) +jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp2, tmp10, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[4*4]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + + tmp10 = (tmp0 + tmp2) << PASS1_BITS; + tmp12 = (tmp0 - tmp2) << PASS1_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS); + tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */ + CONST_BITS-PASS1_BITS); + + /* Final output stage */ + + wsptr[4*0] = (int) (tmp10 + tmp0); + wsptr[4*3] = (int) (tmp10 - tmp0); + wsptr[4*1] = (int) (tmp12 + tmp2); + wsptr[4*2] = (int) (tmp12 - tmp2); + } + + /* Pass 2: process 4 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp2 = (INT32) wsptr[2]; + + tmp10 = (tmp0 + tmp2) << CONST_BITS; + tmp12 = (tmp0 - tmp2) << CONST_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = (INT32) wsptr[1]; + z3 = (INT32) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 4; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 3x3 output block. + * + * Optimized algorithm with 2 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/6). + */ + +GLOBAL(void) +jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp2, tmp10, tmp12; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[3*3]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. 
*/ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 3 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 <<= CONST_BITS; + tmp2 = (INT32) wsptr[2]; + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = (INT32) wsptr[1]; + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 3; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 2x2 output block. + * + * Multiplication-less algorithm. + */ + +GLOBAL(void) +jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + ISLOW_MULT_TYPE * quantptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* Pass 1: process columns from input. */ + + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + + /* Column 0 */ + tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]); + /* Add fudge factor here for final descale. */ + tmp4 += ONE << 2; + + tmp0 = tmp4 + tmp5; + tmp2 = tmp4 - tmp5; + + /* Column 1 */ + tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]); + tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]); + + tmp1 = tmp4 + tmp5; + tmp3 = tmp4 - tmp5; + + /* Pass 2: process 2 rows, store into output array. */ + + /* Row 0 */ + outptr = output_buf[0] + output_col; + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK]; + + /* Row 1 */ + outptr = output_buf[1] + output_col; + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK]; +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 1x1 output block. 
+ * + * We hardly need an inverse DCT routine for this: just take the + * average pixel value, which is one-eighth of the DC coefficient. + */ + +GLOBAL(void) +jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + int dcval; + ISLOW_MULT_TYPE * quantptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* 1x1 is trivial: just take the DC coefficient divided by 8. */ + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + dcval = DEQUANTIZE(coef_block[0], quantptr[0]); + dcval = (int) DESCALE((INT32) dcval, 3); + + output_buf[0][output_col] = range_limit[dcval & RANGE_MASK]; +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 9x9 output block. + * + * Optimized algorithm with 10 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/18). + */ + +GLOBAL(void) +jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*9]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */ + tmp1 = tmp0 + tmp3; + tmp2 = tmp0 - tmp3 - tmp3; + + tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */ + tmp11 = tmp2 + tmp0; + tmp14 = tmp2 - tmp0 - tmp0; + + tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */ + tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */ + tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */ + + tmp10 = tmp1 + tmp0 - tmp3; + tmp12 = tmp1 - tmp0 + tmp2; + tmp13 = tmp1 - tmp2 + tmp3; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */ + + tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */ + tmp0 = tmp2 + tmp3 - z2; + tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */ + tmp2 += z2 - tmp1; + tmp3 += z2 + tmp1; + tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 
+ tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 9 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 9; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 <<= CONST_BITS; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[4]; + z3 = (INT32) wsptr[6]; + + tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */ + tmp1 = tmp0 + tmp3; + tmp2 = tmp0 - tmp3 - tmp3; + + tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */ + tmp11 = tmp2 + tmp0; + tmp14 = tmp2 - tmp0 - tmp0; + + tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */ + tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */ + tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */ + + tmp10 = tmp1 + tmp0 - tmp3; + tmp12 = tmp1 - tmp0 + tmp2; + tmp13 = tmp1 - tmp2 + tmp3; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */ + + tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */ + tmp0 = tmp2 + tmp3 - z2; + tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */ + tmp2 += z2 - tmp1; + tmp3 += z2 + tmp1; + tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 10x10 output block. + * + * Optimized algorithm with 12 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/20). + */ + +GLOBAL(void) +jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24; + INT32 z1, z2, z3, z4, z5; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*10]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. 
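+ * For output sizes above 8 the input is still just the 8x8
+ * coefficient block, so this pass walks its 8 columns and expands
+ * each into a 10-point column of workspace[8*10]; pass 2 then emits
+ * the 10 output rows of 10 samples each.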
*/ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z3 += ONE << (CONST_BITS-PASS1_BITS-1); + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */ + tmp10 = z3 + z1; + tmp11 = z3 - z2; + + tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1), /* c0 = (c4-c8)*2 */ + CONST_BITS-PASS1_BITS); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp12; + tmp24 = tmp10 - tmp12; + tmp21 = tmp11 + tmp13; + tmp23 = tmp11 - tmp13; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z2 + z4; + tmp13 = z2 - z4; + + tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */ + z5 = z3 << CONST_BITS; + + z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = z5 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1)); + + tmp12 = (z1 - tmp13 - z3) << PASS1_BITS; + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) (tmp22 + tmp12); + wsptr[8*7] = (int) (tmp22 - tmp12); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 10 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 10; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 <<= CONST_BITS; + z4 = (INT32) wsptr[4]; + z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */ + tmp10 = z3 + z1; + tmp11 = z3 - z2; + + tmp22 = z3 - ((z1 - z2) << 1); /* c0 = (c4-c8)*2 */ + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp12; + tmp24 = tmp10 - tmp12; + tmp21 = tmp11 + tmp13; + tmp23 = tmp11 - tmp13; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z3 <<= CONST_BITS; + z4 = (INT32) wsptr[7]; + + tmp11 = z2 + z4; + tmp13 = z2 - z4; + + tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */ + + z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = z3 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1)); + + tmp12 = ((z1 - tmp13) << CONST_BITS) - z3; + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 11x11 output block. + * + * Optimized algorithm with 24 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/22). + */ + +GLOBAL(void) +jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*11]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. 
*/ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp10 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + tmp10 += ONE << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */ + tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */ + z4 = z1 + z3; + tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */ + z4 -= z2; + tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */ + tmp21 = tmp20 + tmp23 + tmp25 - + MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ + tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */ + tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */ + tmp24 += tmp25; + tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */ + tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */ + MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ + tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z1 + z2; + tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */ + tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */ + tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ + z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */ + tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */ + tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */ + z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */ + tmp11 += z1; + tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */ + tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */ + MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ + MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 11 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 11; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp10 <<= CONST_BITS; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[4]; + z3 = (INT32) wsptr[6]; + + tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */ + tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */ + z4 = z1 + z3; + tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */ + z4 -= z2; + tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */ + tmp21 = tmp20 + tmp23 + tmp25 - + MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ + tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */ + tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */ + tmp24 += tmp25; + tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */ + tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */ + MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ + tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = z1 + z2; + tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */ + tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */ + tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ + z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */ + tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */ + tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */ + z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */ + tmp11 += z1; + tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */ + tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */ + MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ + MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 12x12 output block. + * + * Optimized algorithm with 15 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/24). 
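+ * The kernel mixes FIX() constants specific to N=12 with the named
+ * 8-point constants: since cos(3K*pi/24) = cos(K*pi/8), the c3/c9
+ * rotator below reuses FIX_0_541196100, FIX_0_765366865 and
+ * FIX_1_847759065 from the 8x8 code.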
+ */ + +GLOBAL(void) +jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*12]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z3 += ONE << (CONST_BITS-PASS1_BITS-1); + + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */ + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 <<= CONST_BITS; + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z2 <<= CONST_BITS; + + tmp12 = z1 - z2; + + tmp21 = z3 + tmp12; + tmp24 = z3 - tmp12; + + tmp12 = z4 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = z4 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 + z3; + tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + z1 -= z4; + z2 -= z3; + z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */ + tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */ + tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 12 rows from work array, store into 
output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 12; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 <<= CONST_BITS; + + z4 = (INT32) wsptr[4]; + z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */ + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + z1 = (INT32) wsptr[2]; + z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 <<= CONST_BITS; + z2 = (INT32) wsptr[6]; + z2 <<= CONST_BITS; + + tmp12 = z1 - z2; + + tmp21 = z3 + tmp12; + tmp24 = z3 - tmp12; + + tmp12 = z4 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = z4 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 + z3; + tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + z1 -= z4; + z2 -= z3; + z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */ + tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */ + tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 13x13 output block. + * + * Optimized algorithm with 29 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/26). 
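+ * As in the other integer IDCT routines, each descaled result is
+ * looked up through range_limit[... & RANGE_MASK]: the table from
+ * IDCT_range_limit() clamps values to the legal sample range, and
+ * the mask keeps the index inside that table even when corrupt
+ * input drives the arithmetic far out of range.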
+ */ + +GLOBAL(void) +jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*13]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */ + + tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */ + tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */ + + tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */ + tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */ + + tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */ + tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */ + + tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */ + tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */ + tmp15 = z1 + z4; + tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ + tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */ + tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */ + tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */ + tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */ + tmp11 += tmp14; + tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */ + tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */ + tmp12 += tmp14; + tmp13 += tmp14; + tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */ + tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */ + MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ + z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */ + tmp14 += z1; + tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */ + MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, 
CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 13 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 13; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z1 <<= CONST_BITS; + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[4]; + z4 = (INT32) wsptr[6]; + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */ + + tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */ + tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */ + + tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */ + tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */ + + tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */ + tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */ + + tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */ + tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */ + tmp15 = z1 + z4; + tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ + tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */ + tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */ + tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */ + tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */ + tmp11 += tmp14; + tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */ + tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */ + tmp12 += tmp14; + tmp13 += tmp14; + tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */ + tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */ + MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ + z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */ + tmp14 += z1; + tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */ + MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & 
RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 14x14 output block. + * + * Optimized algorithm with 20 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/28). + */ + +GLOBAL(void) +jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*14]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 <<= CONST_BITS; + /* Add fudge factor here for final descale. 
*/ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */ + CONST_BITS-PASS1_BITS); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp13 = z4 << CONST_BITS; + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */ + tmp16 += tmp15; + z1 += z4; + z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */ + tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = (z1 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) (tmp23 + tmp13); + wsptr[8*10] = (int) (tmp23 - tmp13); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 14 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 14; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z1 <<= CONST_BITS; + z4 = (INT32) wsptr[4]; + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = z1 - ((z2 + z3 - z4) << 1); /* c0 = (c4+c12-c8)*2 */ + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[6]; + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + z4 <<= CONST_BITS; + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */ + tmp16 += tmp15; + tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */ + tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = ((z1 - z3) << CONST_BITS) + z4; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 15x15 output block. 
+ * + * Optimized algorithm with 22 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/30). + */ + +GLOBAL(void) +jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*15]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */ + tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */ + + tmp12 = z1 - tmp10; + tmp13 = z1 + tmp11; + z1 -= (tmp11 - tmp10) << 1; /* c0 = (c6-c12)*2 */ + + z4 = z2 - z3; + z3 += z2; + tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */ + z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */ + + tmp20 = tmp13 + tmp10 + tmp11; + tmp23 = tmp12 - tmp10 + tmp11 + z2; + + tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */ + + tmp25 = tmp13 - tmp10 - tmp11; + tmp26 = tmp12 + tmp10 - tmp11 - z2; + + tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */ + + tmp21 = tmp12 + tmp10 + tmp11; + tmp24 = tmp13 - tmp10 + tmp11; + tmp11 += tmp11; + tmp22 = z1 + tmp11; /* c10 = c6-c12 */ + tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */ + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp13 = z2 - z4; + tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */ + tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */ + tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */ + + tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */ + tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */ + z2 = z1 - z4; + tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */ + + tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */ + tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */ + tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */ + z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */ + tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */ + tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) 
RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 15 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 15; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z1 <<= CONST_BITS; + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[4]; + z4 = (INT32) wsptr[6]; + + tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */ + tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */ + + tmp12 = z1 - tmp10; + tmp13 = z1 + tmp11; + z1 -= (tmp11 - tmp10) << 1; /* c0 = (c6-c12)*2 */ + + z4 = z2 - z3; + z3 += z2; + tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */ + z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */ + + tmp20 = tmp13 + tmp10 + tmp11; + tmp23 = tmp12 - tmp10 + tmp11 + z2; + + tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */ + + tmp25 = tmp13 - tmp10 - tmp11; + tmp26 = tmp12 + tmp10 - tmp11 - z2; + + tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */ + + tmp21 = tmp12 + tmp10 + tmp11; + tmp24 = tmp13 - tmp10 + tmp11; + tmp11 += tmp11; + tmp22 = z1 + tmp11; /* c10 = c6-c12 */ + tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z4 = (INT32) wsptr[5]; + z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */ + z4 = (INT32) wsptr[7]; + + tmp13 = z2 - z4; + tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */ + tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */ + tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */ + + tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */ + tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */ + z2 = z1 - z4; + tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */ + + tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */ + tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */ + tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */ + z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */ + tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */ + tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + 
CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 16x16 output block. + * + * Optimized algorithm with 28 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/32). + */ + +GLOBAL(void) +jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*16]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + /* Add fudge factor here for final descale. 
*/ + tmp0 += 1 << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, 
CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 16 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 16; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 <<= CONST_BITS; + + z1 = (INT32) wsptr[4]; + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[6]; + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2, + 
CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 16x8 output block. + * + * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*8]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = DCTSIZE; ctr > 0; ctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any column in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * column DCT calculations can be simplified this way. + */ + + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { + /* AC terms all zero */ + int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; + + wsptr[DCTSIZE*0] = dcval; + wsptr[DCTSIZE*1] = dcval; + wsptr[DCTSIZE*2] = dcval; + wsptr[DCTSIZE*3] = dcval; + wsptr[DCTSIZE*4] = dcval; + wsptr[DCTSIZE*5] = dcval; + wsptr[DCTSIZE*6] = dcval; + wsptr[DCTSIZE*7] = dcval; + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + continue; + } + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). 
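+    * Written out, the rotation computed below is
+    *   tmp2 = z2*c2 + z3*c6,   tmp3 = z2*c6 - z3*c2,
+    * obtained with three multiplies as z1 = (z2+z3)*c6,
+    * tmp2 = z1 + z2*(c2-c6), tmp3 = z1 - z3*(c2+c6); hence the constants
+    * FIX_0_541196100 (c6), FIX_0_765366865 (c2-c6) and FIX_1_847759065 (c2+c6).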
*/ + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); z1 = MULTIPLY(z2 + z3, FIX_0_541196100); - tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); - tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); + tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); - tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS; - tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS; + z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z2 <<= CONST_BITS; + z3 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z2 += ONE << (CONST_BITS-PASS1_BITS-1); + + tmp0 = z2 + z3; + tmp1 = z2 - z3; - tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; - tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; + tmp10 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + tmp11 = tmp1 + tmp3; + tmp12 = tmp1 - tmp3; /* Odd part per figure 8; the matrix is unitary and hence its * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */ + tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + + z2 = tmp0 + tmp2; + z3 = tmp1 + tmp3; + + z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */ + z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + z2 += z1; + z3 += z1; + + z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + tmp0 += z1 + z2; + tmp3 += z1 + z3; + + z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp1 += z1 + z3; + tmp2 += z1 + z2; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + } + + /* Pass 2: process 8 rows from work array, store into output array. + * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32). + */ + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 <<= CONST_BITS; + + z1 = (INT32) wsptr[4]; + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[6]; + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + 
outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 14x7 output block. + * + * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*7]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp23 <<= CONST_BITS; + /* Add fudge factor here for final descale. 
*/ + tmp23 += ONE << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp10 = z1 + z3; + z2 -= tmp10; + tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */ + tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp23 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + + tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp10 = tmp11 - tmp12; + tmp11 += tmp12; + tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp11 += tmp12; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp10 += z2; + tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 7 rows from work array, store into output array. + * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28). + */ + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
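+      * (This row pass is the same 14-point kernel as pass 2 of
+      * jpeg_idct_14x14 above, applied to 7 rows here; the bias added to
+      * wsptr[0] below rounds the final descale in the same way.)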
*/ + z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z1 <<= CONST_BITS; + z4 = (INT32) wsptr[4]; + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = z1 - ((z2 + z3 - z4) << 1); /* c0 = (c4+c12-c8)*2 */ + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[6]; + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + z4 <<= CONST_BITS; + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */ + tmp16 += tmp15; + tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */ + tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = ((z1 - z3) << CONST_BITS) + z4; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 12x6 output block. 
+ * + * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*6]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp10 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + tmp10 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp20 = MULTIPLY(tmp12, FIX(0.707106781)); /* c4 */ + tmp11 = tmp10 + tmp20; + tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS); + tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp10 = MULTIPLY(tmp20, FIX(1.224744871)); /* c2 */ + tmp20 = tmp11 + tmp10; + tmp22 = tmp11 - tmp10; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp10 = tmp11 + ((z1 + z2) << CONST_BITS); + tmp12 = tmp11 + ((z3 - z2) << CONST_BITS); + tmp11 = (z1 - z2 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) (tmp21 + tmp11); + wsptr[8*4] = (int) (tmp21 - tmp11); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 6 rows from work array, store into output array. + * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24). + */ + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 <<= CONST_BITS; + + z4 = (INT32) wsptr[4]; + z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */ + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + z1 = (INT32) wsptr[2]; + z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 <<= CONST_BITS; + z2 = (INT32) wsptr[6]; + z2 <<= CONST_BITS; + + tmp12 = z1 - z2; + + tmp21 = z3 + tmp12; + tmp24 = z3 - tmp12; + + tmp12 = z4 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = z4 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 + z3; + tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + z1 -= z4; + z2 -= z3; + z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */ + tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */ + tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 10x5 output block. + * + * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows). 
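+ * As with the 16x8, 14x7 and 12x6 routines above, the column pass uses the
+ * half-length kernel and the row pass the full-length kernel, producing a
+ * block half as tall as it is wide.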
+ */ + +GLOBAL(void) +jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*5]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp12 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + tmp12 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= z2 << 2; + + /* Odd part */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 5 rows from work array, store into output array. + * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20). + */ + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 <<= CONST_BITS; + z4 = (INT32) wsptr[4]; + z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */ + tmp10 = z3 + z1; + tmp11 = z3 - z2; + + tmp22 = z3 - ((z1 - z2) << 1); /* c0 = (c4-c8)*2 */ + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp12; + tmp24 = tmp10 - tmp12; + tmp21 = tmp11 + tmp13; + tmp23 = tmp11 - tmp13; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z3 <<= CONST_BITS; + z4 = (INT32) wsptr[7]; + + tmp11 = z2 + z4; + tmp13 = z2 - z4; + + tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */ + + z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = z3 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1)); + + tmp12 = ((z1 - tmp13) << CONST_BITS) - z3; + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 8x4 output block. + * + * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*4]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16). 
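+    * For reference, FIX(x) is x represented in the CONST_BITS fixed-point
+    * format; with this convention c2 = sqrt(2)*cos(2*pi/16) ~= 1.306563 and
+    * c6 = sqrt(2)*cos(6*pi/16) ~= 0.541196, which is why the rotation below
+    * is annotated with c6, c2-c6 and c2+c6.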
+ */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + + tmp10 = (tmp0 + tmp2) << PASS1_BITS; + tmp12 = (tmp0 - tmp2) << PASS1_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS); + tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */ + CONST_BITS-PASS1_BITS); + + /* Final output stage */ + + wsptr[8*0] = (int) (tmp10 + tmp0); + wsptr[8*3] = (int) (tmp10 - tmp0); + wsptr[8*1] = (int) (tmp12 + tmp2); + wsptr[8*2] = (int) (tmp12 - tmp2); + } + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); + tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); + + /* Add fudge factor here for final descale. */ + z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 = (INT32) wsptr[4]; + + tmp0 = (z2 + z3) << CONST_BITS; + tmp1 = (z2 - z3) << CONST_BITS; + + tmp10 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + tmp11 = tmp1 + tmp3; + tmp12 = tmp1 - tmp3; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
+ */ + tmp0 = (INT32) wsptr[7]; tmp1 = (INT32) wsptr[5]; tmp2 = (INT32) wsptr[3]; tmp3 = (INT32) wsptr[1]; - - z1 = tmp0 + tmp3; - z2 = tmp1 + tmp2; - z3 = tmp0 + tmp2; - z4 = tmp1 + tmp3; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - + + z2 = tmp0 + tmp2; + z3 = tmp1 + tmp3; + + z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */ + z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + z2 += z1; + z3 += z1; + + z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + tmp0 += z1 + z2; + tmp3 += z1 + z3; + + z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + tmp1 += z1 + z3; + tmp2 += z1 + z2; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 6x3 output block. + * + * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[6*3]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + /* Add fudge factor here for final descale. 
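+    * Note that only coefficient rows 0..2 of the first 6 columns are read
+    * in this pass, so the 6x3 scaling works from the 3-row by 6-column
+    * low-frequency corner of the block; the bias added to tmp0 below rounds
+    * the pass-1 descale as in the larger kernels.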
*/ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 3 rows from work array, store into output array. + * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12). + */ + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 <<= CONST_BITS; + tmp2 = (INT32) wsptr[4]; + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = tmp0 - tmp10 - tmp10; + tmp10 = (INT32) wsptr[2]; + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp0 = tmp1 + ((z1 + z2) << CONST_BITS); + tmp2 = tmp1 + ((z3 - z2) << CONST_BITS); + tmp1 = (z1 - z2 - z3) << CONST_BITS; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 6; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 4x2 output block. + * + * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp2, tmp10, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + INT32 * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + INT32 workspace[4*2]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + + /* Odd part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + + /* Final output stage */ + + wsptr[4*0] = tmp10 + tmp0; + wsptr[4*1] = tmp10 - tmp0; + } + + /* Pass 2: process 2 rows from work array, store into output array. + * 4-point IDCT kernel, + * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT]. 
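+    * Pass 1 above is just a 2-point butterfly (sum and difference of the two
+    * dequantized coefficients) and applies no PASS1_BITS scaling, so the
+    * final descale below shifts by CONST_BITS+3 rather than the usual
+    * CONST_BITS+PASS1_BITS+3.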
+ */ + wsptr = workspace; + for (ctr = 0; ctr < 2; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = wsptr[0] + (ONE << 2); + tmp2 = wsptr[2]; + + tmp10 = (tmp0 + tmp2) << CONST_BITS; + tmp12 = (tmp0 - tmp2) << CONST_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = wsptr[1]; + z3 = wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+3) + & RANGE_MASK]; + + wsptr += 4; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 2x1 output block. + * + * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp10; + ISLOW_MULT_TYPE * quantptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* Pass 1: empty. */ + + /* Pass 2: process 1 row from input, store into output array. */ + + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + outptr = output_buf[0] + output_col; + + /* Even part */ + + tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]); + /* Add fudge factor here for final descale. */ + tmp10 += ONE << 2; + + /* Odd part */ + + tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]); + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) & RANGE_MASK]; +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 8x16 output block. + * + * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*16]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + /* Add fudge factor here for final descale. 
*/ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, 
CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + wsptr = workspace; + for (ctr = 0; ctr < 16; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); + tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); + + /* Add fudge factor here for final descale. */ + z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 = (INT32) wsptr[4]; + + tmp0 = (z2 + z3) << CONST_BITS; + tmp1 = (z2 - z3) << CONST_BITS; + + tmp10 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + tmp11 = tmp1 + tmp3; + tmp12 = tmp1 - tmp3; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ - z3 += z5; - z4 += z5; + tmp0 = (INT32) wsptr[7]; + tmp1 = (INT32) wsptr[5]; + tmp2 = (INT32) wsptr[3]; + tmp3 = (INT32) wsptr[1]; - tmp0 += z1 + z3; - tmp1 += z2 + z4; - tmp2 += z2 + z3; - tmp3 += z1 + z4; + z2 = tmp0 + tmp2; + z3 = tmp1 + tmp3; + + z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */ + z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + z2 += z1; + z3 += z1; + + z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + tmp0 += z1 + z2; + tmp3 += z1 + z3; + + z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp1 += z1 + z3; + tmp2 += z1 + z2; /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, - CONST_BITS+PASS1_BITS+3) + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3, + CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; - outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, - CONST_BITS+PASS1_BITS+3) + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3, + CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; - outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, - CONST_BITS+PASS1_BITS+3) + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2, + CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; - outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, - CONST_BITS+PASS1_BITS+3) + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2, + CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; - outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, - CONST_BITS+PASS1_BITS+3) + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1, + CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; - outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, - CONST_BITS+PASS1_BITS+3) + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1, + CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; - outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, - CONST_BITS+PASS1_BITS+3) + outptr[3] = 
range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0, + CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; - outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, - CONST_BITS+PASS1_BITS+3) + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0, + CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; wsptr += DCTSIZE; /* advance pointer to next row */ } } + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 7x14 output block. + * + * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[7*14]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */ + CONST_BITS-PASS1_BITS); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp13 = z4 << CONST_BITS; + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */ + tmp16 += tmp15; + z1 += z4; + z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */ + tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = (z1 - z3) << PASS1_BITS; + 
+ /* Final output stage */ + + wsptr[7*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[7*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[7*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[7*3] = (int) (tmp23 + tmp13); + wsptr[7*10] = (int) (tmp23 - tmp13); + wsptr[7*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[7*9] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[7*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[7*8] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[7*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS); + wsptr[7*7] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 14 rows from work array, store into output array. + * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14). + */ + wsptr = workspace; + for (ctr = 0; ctr < 14; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp23 <<= CONST_BITS; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[4]; + z3 = (INT32) wsptr[6]; + + tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp10 = z1 + z3; + z2 -= tmp10; + tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */ + tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp23 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + + tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp10 = tmp11 - tmp12; + tmp11 += tmp12; + tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp11 += tmp12; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp10 += z2; + tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 7; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 6x12 output block. + * + * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows). 
+ */ + +GLOBAL(void) +jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[6*12]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z3 += ONE << (CONST_BITS-PASS1_BITS-1); + + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */ + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 <<= CONST_BITS; + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z2 <<= CONST_BITS; + + tmp12 = z1 - z2; + + tmp21 = z3 + tmp12; + tmp24 = z3 - tmp12; + + tmp12 = z4 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = z4 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 + z3; + tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + z1 -= z4; + z2 -= z3; + z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */ + tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */ + tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */ + + /* Final output stage */ + + wsptr[6*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[6*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[6*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[6*9] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[6*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[6*8] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[6*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[6*7] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[6*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[6*6] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); 
+ } + + /* Pass 2: process 12 rows from work array, store into output array. + * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12). + */ + wsptr = workspace; + for (ctr = 0; ctr < 12; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp10 <<= CONST_BITS; + tmp12 = (INT32) wsptr[4]; + tmp20 = MULTIPLY(tmp12, FIX(0.707106781)); /* c4 */ + tmp11 = tmp10 + tmp20; + tmp21 = tmp10 - tmp20 - tmp20; + tmp20 = (INT32) wsptr[2]; + tmp10 = MULTIPLY(tmp20, FIX(1.224744871)); /* c2 */ + tmp20 = tmp11 + tmp10; + tmp22 = tmp11 - tmp10; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp10 = tmp11 + ((z1 + z2) << CONST_BITS); + tmp12 = tmp11 + ((z3 - z2) << CONST_BITS); + tmp11 = (z1 - z2 - z3) << CONST_BITS; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 6; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 5x10 output block. + * + * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24; + INT32 z1, z2, z3, z4, z5; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[5*10]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 <<= CONST_BITS; + /* Add fudge factor here for final descale. 
*/ + z3 += ONE << (CONST_BITS-PASS1_BITS-1); + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */ + tmp10 = z3 + z1; + tmp11 = z3 - z2; + + tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1), /* c0 = (c4-c8)*2 */ + CONST_BITS-PASS1_BITS); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp12; + tmp24 = tmp10 - tmp12; + tmp21 = tmp11 + tmp13; + tmp23 = tmp11 - tmp13; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z2 + z4; + tmp13 = z2 - z4; + + tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */ + z5 = z3 << CONST_BITS; + + z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = z5 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1)); + + tmp12 = (z1 - tmp13 - z3) << PASS1_BITS; + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + /* Final output stage */ + + wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[5*2] = (int) (tmp22 + tmp12); + wsptr[5*7] = (int) (tmp22 - tmp12); + wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 10 rows from work array, store into output array. + * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10). + */ + wsptr = workspace; + for (ctr = 0; ctr < 10; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp12 <<= CONST_BITS; + tmp13 = (INT32) wsptr[2]; + tmp14 = (INT32) wsptr[4]; + z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= z2 << 2; + + /* Odd part */ + + z2 = (INT32) wsptr[1]; + z3 = (INT32) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 5; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 4x8 output block. + * + * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[4*8]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 4; ctr > 0; ctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any column in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * column DCT calculations can be simplified this way. + */ + + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { + /* AC terms all zero */ + int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; + + wsptr[4*0] = dcval; + wsptr[4*1] = dcval; + wsptr[4*2] = dcval; + wsptr[4*3] = dcval; + wsptr[4*4] = dcval; + wsptr[4*5] = dcval; + wsptr[4*6] = dcval; + wsptr[4*7] = dcval; + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + continue; + } + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). 
*/ + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); + tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); + + z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z2 <<= CONST_BITS; + z3 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + z2 += ONE << (CONST_BITS-PASS1_BITS-1); + + tmp0 = z2 + z3; + tmp1 = z2 - z3; + + tmp10 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + tmp11 = tmp1 + tmp3; + tmp12 = tmp1 - tmp3; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + + z2 = tmp0 + tmp2; + z3 = tmp1 + tmp3; + + z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */ + z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + z2 += z1; + z3 += z1; + + z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + tmp0 += z1 + z2; + tmp3 += z1 + z3; + + z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp1 += z1 + z3; + tmp2 += z1 + z2; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + } + + /* Pass 2: process 8 rows from work array, store into output array. + * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + */ + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp2 = (INT32) wsptr[2]; + + tmp10 = (tmp0 + tmp2) << CONST_BITS; + tmp12 = (tmp0 - tmp2) << CONST_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = (INT32) wsptr[1]; + z3 = (INT32) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 4; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 3x6 output block. + * + * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[3*6]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS); + tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp0 = tmp1 + ((z1 + z2) << CONST_BITS); + tmp2 = tmp1 + ((z3 - z2) << CONST_BITS); + tmp1 = (z1 - z2 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*1] = (int) (tmp11 + tmp1); + wsptr[3*4] = (int) (tmp11 - tmp1); + wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 6 rows from work array, store into output array. + * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6). + */ + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. 
*/ + tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 <<= CONST_BITS; + tmp2 = (INT32) wsptr[2]; + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = (INT32) wsptr[1]; + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 3; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 2x4 output block. + * + * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp2, tmp10, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + INT32 * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + INT32 workspace[2*4]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 4-point IDCT kernel, + * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT]. + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + + tmp10 = (tmp0 + tmp2) << CONST_BITS; + tmp12 = (tmp0 - tmp2) << CONST_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */ + + /* Final output stage */ + + wsptr[2*0] = tmp10 + tmp0; + wsptr[2*3] = tmp10 - tmp0; + wsptr[2*1] = tmp12 + tmp2; + wsptr[2*2] = tmp12 - tmp2; + } + + /* Pass 2: process 4 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp10 = wsptr[0] + (ONE << (CONST_BITS+2)); + + /* Odd part */ + + tmp0 = wsptr[1]; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3) + & RANGE_MASK]; + + wsptr += 2; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 1x2 output block. + * + * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp10; + ISLOW_MULT_TYPE * quantptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* Process 1 column from input, store into output array. 
*/ + + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + + /* Even part */ + + tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]); + /* Add fudge factor here for final descale. */ + tmp10 += ONE << 2; + + /* Odd part */ + + tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]); + + /* Final output stage */ + + output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) + & RANGE_MASK]; + output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) + & RANGE_MASK]; +} + +#endif /* IDCT_SCALING_SUPPORTED */ #endif /* DCT_ISLOW_SUPPORTED */ diff --git a/src/libjpeg/jmorecfg.h b/src/libjpeg/jmorecfg.h index 2182f15..c610bc2 100644 --- a/src/libjpeg/jmorecfg.h +++ b/src/libjpeg/jmorecfg.h @@ -2,6 +2,7 @@ * jmorecfg.h * * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 1997-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -158,8 +159,11 @@ typedef short INT16; /* INT32 must hold at least signed 32-bit values. */ #ifndef XMD_H /* X11/xmd.h correctly defines INT32 */ -typedef int JINT32; -#define INT32 JINT32 +#ifndef _BASETSD_H_ /* Microsoft defines it in basetsd.h */ +#ifndef QGLOBAL_H /* Qt defines it in qglobal.h */ +typedef long INT32; +#endif +#endif #endif /* Datatype used for image dimensions. The JPEG standard only supports @@ -210,11 +214,13 @@ typedef unsigned int JDIMENSION; * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol. */ +#ifndef FAR #ifdef NEED_FAR_POINTERS #define FAR far #else #define FAR #endif +#endif /* @@ -257,8 +263,6 @@ typedef int boolean; * (You may HAVE to do that if your compiler doesn't like null source files.) */ -/* Arithmetic coding is unsupported for legal reasons. Complaints to IBM. */ - /* Capability options common to encoder and decoder: */ #define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */ @@ -267,9 +271,10 @@ typedef int boolean; /* Encoder capability options: */ -#undef C_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */ +#define C_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */ #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */ #define C_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ +#define DCT_SCALING_SUPPORTED /* Input rescaling via DCT? (Requires DCT_ISLOW)*/ #define ENTROPY_OPT_SUPPORTED /* Optimization of entropy coding parms? */ /* Note: if you selected 12-bit data precision, it is dangerous to turn off * ENTROPY_OPT_SUPPORTED. The standard Huffman tables are only good for 8-bit @@ -283,12 +288,12 @@ typedef int boolean; /* Decoder capability options: */ -#undef D_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */ +#define D_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */ #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */ #define D_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ +#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */ #define SAVE_MARKERS_SUPPORTED /* jpeg_save_markers() needed? */ #define BLOCK_SMOOTHING_SUPPORTED /* Block smoothing? (Progressive only) */ -#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */ #undef UPSAMPLE_SCALING_SUPPORTED /* Output rescaling at upsample stage? */ #define UPSAMPLE_MERGING_SUPPORTED /* Fast path for sloppy upsampling? */ #define QUANT_1PASS_SUPPORTED /* 1-pass color quantization? 
*/ diff --git a/src/libjpeg/jpegint.h b/src/libjpeg/jpegint.h index 95b00d4..69b781b 100644 --- a/src/libjpeg/jpegint.h +++ b/src/libjpeg/jpegint.h @@ -2,6 +2,7 @@ * jpegint.h * * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 1997-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -99,14 +100,16 @@ struct jpeg_downsampler { }; /* Forward DCT (also controls coefficient quantization) */ +typedef JMETHOD(void, forward_DCT_ptr, + (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks)); + struct jpeg_forward_dct { JMETHOD(void, start_pass, (j_compress_ptr cinfo)); - /* perhaps this should be an array??? */ - JMETHOD(void, forward_DCT, (j_compress_ptr cinfo, - jpeg_component_info * compptr, - JSAMPARRAY sample_data, JBLOCKROW coef_blocks, - JDIMENSION start_row, JDIMENSION start_col, - JDIMENSION num_blocks)); + /* It is useful to allow each component to have a separate FDCT method. */ + forward_DCT_ptr forward_DCT[MAX_COMPONENTS]; }; /* Entropy encoding */ @@ -303,7 +306,7 @@ struct jpeg_color_quantizer { #define jinit_downsampler jIDownsampler #define jinit_forward_dct jIFDCT #define jinit_huff_encoder jIHEncoder -#define jinit_phuff_encoder jIPHEncoder +#define jinit_arith_encoder jIAEncoder #define jinit_marker_writer jIMWriter #define jinit_master_decompress jIDMaster #define jinit_d_main_controller jIDMainC @@ -312,7 +315,7 @@ struct jpeg_color_quantizer { #define jinit_input_controller jIInCtlr #define jinit_marker_reader jIMReader #define jinit_huff_decoder jIHDecoder -#define jinit_phuff_decoder jIPHDecoder +#define jinit_arith_decoder jIADecoder #define jinit_inverse_dct jIIDCT #define jinit_upsampler jIUpsampler #define jinit_color_deconverter jIDColor @@ -344,7 +347,7 @@ EXTERN(void) jinit_color_converter JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo)); -EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo)); +EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo)); /* Decompression module initialization routines */ EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo)); @@ -357,7 +360,7 @@ EXTERN(void) jinit_d_post_controller JPP((j_decompress_ptr cinfo, EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo)); -EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo)); +EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo)); diff --git a/src/libjpeg/jpeglib.h b/src/libjpeg/jpeglib.h index d1be8dd..341a19c 100644 --- a/src/libjpeg/jpeglib.h +++ b/src/libjpeg/jpeglib.h @@ -2,6 +2,7 @@ * jpeglib.h * * Copyright (C) 1991-1998, Thomas G. Lane. + * Modified 2002-2009 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. 
* @@ -26,11 +27,17 @@ #include "jmorecfg.h" /* seldom changed options */ +#ifdef __cplusplus +#ifndef DONT_USE_EXTERN_C +extern "C" { +#endif +#endif + /* Version ID for the JPEG library. - * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60". + * Might be useful for tests like "#if JPEG_LIB_VERSION >= 70". */ -#define JPEG_LIB_VERSION 62 /* Version 6b */ +#define JPEG_LIB_VERSION 70 /* Version 7.0 */ /* Various constants determining the sizes of things. @@ -138,18 +145,18 @@ typedef struct { */ JDIMENSION width_in_blocks; JDIMENSION height_in_blocks; - /* Size of a DCT block in samples. Always DCTSIZE for compression. - * For decompression this is the size of the output from one DCT block, - * reflecting any scaling we choose to apply during the IDCT step. - * Values of 1,2,4,8 are likely to be supported. Note that different - * components may receive different IDCT scalings. + /* Size of a DCT block in samples, + * reflecting any scaling we choose to apply during the DCT step. + * Values from 1 to 16 are supported. + * Note that different components may receive different DCT scalings. */ - int DCT_scaled_size; + int DCT_h_scaled_size; + int DCT_v_scaled_size; /* The downsampled dimensions are the component's actual, unpadded number - * of samples at the main buffer (preprocessing/compression interface), thus - * downsampled_width = ceil(image_width * Hi/Hmax) - * and similarly for height. For decompression, IDCT scaling is included, so - * downsampled_width = ceil(image_width * Hi/Hmax * DCT_scaled_size/DCTSIZE) + * of samples at the main buffer (preprocessing/compression interface); + * DCT scaling is included, so + * downsampled_width = ceil(image_width * Hi/Hmax * DCT_h_scaled_size/DCTSIZE) + * and similarly for height. */ JDIMENSION downsampled_width; /* actual width in samples */ JDIMENSION downsampled_height; /* actual height in samples */ @@ -291,6 +298,17 @@ struct jpeg_compress_struct { * helper routines to simplify changing parameters. */ + unsigned int scale_num, scale_denom; /* fraction by which to scale image */ + + JDIMENSION jpeg_width; /* scaled JPEG image width */ + JDIMENSION jpeg_height; /* scaled JPEG image height */ + /* Dimensions of actual JPEG image that will be written to file, + * derived from input dimensions by scaling factors above. + * These fields are computed by jpeg_start_compress(). + * You can also use jpeg_calc_jpeg_dimensions() to determine these values + * in advance of calling jpeg_start_compress(). + */ + int data_precision; /* bits of precision in image data */ int num_components; /* # of color components in JPEG image */ @@ -298,14 +316,17 @@ struct jpeg_compress_struct { jpeg_component_info * comp_info; /* comp_info[i] describes component that appears i'th in SOF */ - + JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS]; - /* ptrs to coefficient quantization tables, or NULL if not defined */ - + int q_scale_factor[NUM_QUANT_TBLS]; + /* ptrs to coefficient quantization tables, or NULL if not defined, + * and corresponding scale factors (percentage, initialized 100). 
+ */ + JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS]; JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS]; /* ptrs to Huffman coding tables, or NULL if not defined */ - + UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */ UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */ UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */ @@ -321,6 +342,7 @@ struct jpeg_compress_struct { boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ boolean optimize_coding; /* TRUE=optimize entropy encoding parms */ boolean CCIR601_sampling; /* TRUE=first samples are cosited */ + boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */ int smoothing_factor; /* 1..100, or 0 for no input smoothing */ J_DCT_METHOD dct_method; /* DCT algorithm selector */ @@ -364,6 +386,9 @@ struct jpeg_compress_struct { int max_h_samp_factor; /* largest h_samp_factor */ int max_v_samp_factor; /* largest v_samp_factor */ + int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ + JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */ /* The coefficient controller receives data in units of MCU rows as defined * for fully interleaved scans (whether the JPEG file is interleaved or not). @@ -575,7 +600,8 @@ struct jpeg_decompress_struct { int max_h_samp_factor; /* largest h_samp_factor */ int max_v_samp_factor; /* largest v_samp_factor */ - int min_DCT_scaled_size; /* smallest DCT_scaled_size of any component */ + int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ JDIMENSION total_iMCU_rows; /* # of iMCU rows in image */ /* The coefficient controller's input and output progress is measured in @@ -583,7 +609,7 @@ struct jpeg_decompress_struct { * in fully interleaved JPEG scans, but are used whether the scan is * interleaved or not. We define an iMCU row as v_samp_factor DCT block * rows of each component. Therefore, the IDCT output contains - * v_samp_factor*DCT_scaled_size sample rows of a component per iMCU row. + * v_samp_factor*DCT_v_scaled_size sample rows of a component per iMCU row. 
*/ JSAMPLE * sample_range_limit; /* table for fast range-limiting */ @@ -841,6 +867,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo)); #define jpeg_default_colorspace jDefColorspace #define jpeg_set_quality jSetQuality #define jpeg_set_linear_quality jSetLQuality +#define jpeg_default_qtables jDefQTables #define jpeg_add_quant_table jAddQuantTable #define jpeg_quality_scaling jQualityScaling #define jpeg_simple_progression jSimProgress @@ -850,6 +877,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo)); #define jpeg_start_compress jStrtCompress #define jpeg_write_scanlines jWrtScanlines #define jpeg_finish_compress jFinCompress +#define jpeg_calc_jpeg_dimensions jCjpegDimensions #define jpeg_write_raw_data jWrtRawData #define jpeg_write_marker jWrtMarker #define jpeg_write_m_header jWrtMHeader @@ -921,6 +949,8 @@ EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality, EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo, int scale_factor, boolean force_baseline)); +EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo, + boolean force_baseline)); EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl, const unsigned int *basic_table, int scale_factor, @@ -940,12 +970,15 @@ EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo, JDIMENSION num_lines)); EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo)); +/* Precalculate JPEG dimensions for current compression parameters. */ +EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo)); + /* Replaces jpeg_write_scanlines when writing raw downsampled data. */ EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo, JSAMPIMAGE data, JDIMENSION num_lines)); -/* Write a special marker. See libjpeg.doc concerning safe usage. */ +/* Write a special marker. See libjpeg.txt concerning safe usage. */ EXTERN(void) jpeg_write_marker JPP((j_compress_ptr cinfo, int marker, const JOCTET * dataptr, unsigned int datalen)); @@ -1093,4 +1126,10 @@ struct jpeg_color_quantizer { long dummy; }; #include "jerror.h" /* fetch error codes too */ #endif +#ifdef __cplusplus +#ifndef DONT_USE_EXTERN_C +} +#endif +#endif + #endif /* JPEGLIB_H */ diff --git a/src/libjpeg/jversion.h b/src/libjpeg/jversion.h index 6472c58..5151731 100644 --- a/src/libjpeg/jversion.h +++ b/src/libjpeg/jversion.h @@ -1,7 +1,7 @@ /* * jversion.h * - * Copyright (C) 1991-1998, Thomas G. Lane. + * Copyright (C) 1991-2009, Thomas G. Lane, Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -9,6 +9,6 @@ */ -#define JVERSION "6b 27-Mar-1998" +#define JVERSION "7 27-Jun-2009" -#define JCOPYRIGHT "Copyright (C) 1998, Thomas G. Lane" +#define JCOPYRIGHT "Copyright (C) 2009, Thomas G. Lane, Guido Vollbeding" |
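
The jpeglib.h and jmorecfg.h hunks above expose the new libJPEG 7 compression-side features: the scale_num/scale_denom fraction with jpeg_calc_jpeg_dimensions() (DCT_SCALING_SUPPORTED), per-table quantization scale factors q_scale_factor[] with jpeg_default_qtables() (percentage, initialized 100), and arithmetic coding via arith_code now that C_ARITH_CODING_SUPPORTED / D_ARITH_CODING_SUPPORTED are defined. Below is a minimal usage sketch, not taken from the diff: it assumes the standard libjpeg compression sequence (jpeg_stdio_dest, jpeg_quality_scaling, the scanline loop, an interleaved RGB input buffer) and omits error handling.

  #include <stdio.h>
  #include "jpeglib.h"

  /* Sketch: compress an RGB buffer using the v7 additions shown above.
   * "image" is width*height*3 interleaved RGB samples; "outfile" is open for writing. */
  void compress_sketch(FILE *outfile, JSAMPLE *image, int width, int height)
  {
    struct jpeg_compress_struct cinfo;
    struct jpeg_error_mgr jerr;

    cinfo.err = jpeg_std_error(&jerr);
    jpeg_create_compress(&cinfo);
    jpeg_stdio_dest(&cinfo, outfile);

    cinfo.image_width = width;
    cinfo.image_height = height;
    cinfo.input_components = 3;
    cinfo.in_color_space = JCS_RGB;
    jpeg_set_defaults(&cinfo);

    /* New in v7: scale the image during the DCT step (DCT_SCALING_SUPPORTED). */
    cinfo.scale_num = 1;
    cinfo.scale_denom = 2;              /* write the JPEG at half the input size */
    jpeg_calc_jpeg_dimensions(&cinfo);  /* fills jpeg_width/jpeg_height before start_compress */

    /* New in v7: per-table quality scaling (percentage factors, initialized to 100). */
    cinfo.q_scale_factor[0] = jpeg_quality_scaling(80); /* luminance table */
    cinfo.q_scale_factor[1] = jpeg_quality_scaling(70); /* chrominance table */
    jpeg_default_qtables(&cinfo, TRUE);

    /* Arithmetic entropy coding is now compiled in (C_ARITH_CODING_SUPPORTED). */
    cinfo.arith_code = TRUE;

    jpeg_start_compress(&cinfo, TRUE);
    while (cinfo.next_scanline < cinfo.image_height) {
      JSAMPROW row = image + (size_t) cinfo.next_scanline * width * 3;
      jpeg_write_scanlines(&cinfo, &row, 1);
    }
    jpeg_finish_compress(&cinfo);
    jpeg_destroy_compress(&cinfo);
  }

The half-size values chosen for scale_num/scale_denom and the 80/70 quality split are illustrative only; applications that leave the new fields untouched get the documented defaults (1/1 scaling and scale factors of 100) and therefore the previous 6b behaviour.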