author     scuri <scuri>    2009-08-20 01:10:27 +0000
committer  scuri <scuri>    2009-08-20 01:10:27 +0000
commit     83b3c8b629d96f5fdf754d35d5f4f5369dbfef1d (patch)
tree       81242227fb731da262139283c301e58f861e9727 /src
parent     b014c9fa4544c111796aed1406e704338bacf456 (diff)
*** empty log message ***
Diffstat (limited to 'src')
-rw-r--r--  src/config.mak          |   10
-rw-r--r--  src/libjpeg/jcapimin.c  |    4
-rw-r--r--  src/libjpeg/jccoefct.c  |   24
-rw-r--r--  src/libjpeg/jcdctmgr.c  |  489
-rw-r--r--  src/libjpeg/jchuff.c    | 1059
-rw-r--r--  src/libjpeg/jcinit.c    |   15
-rw-r--r--  src/libjpeg/jcmainct.c  |   14
-rw-r--r--  src/libjpeg/jcmarker.c  |   13
-rw-r--r--  src/libjpeg/jcmaster.c  |  218
-rw-r--r--  src/libjpeg/jcparam.c   |   72
-rw-r--r--  src/libjpeg/jcprepct.c  |   14
-rw-r--r--  src/libjpeg/jcsample.c  |   94
-rw-r--r--  src/libjpeg/jctrans.c   |   11
-rw-r--r--  src/libjpeg/jdapimin.c  |    5
-rw-r--r--  src/libjpeg/jdapistd.c  |    2
-rw-r--r--  src/libjpeg/jdcoefct.c  |   14
-rw-r--r--  src/libjpeg/jdct.h      |  239
-rw-r--r--  src/libjpeg/jddctmgr.c  |  131
-rw-r--r--  src/libjpeg/jdhuff.c    |  944
-rw-r--r--  src/libjpeg/jdinput.c   |   13
-rw-r--r--  src/libjpeg/jdmainct.c  |   42
-rw-r--r--  src/libjpeg/jdmaster.c  |  160
-rw-r--r--  src/libjpeg/jdsample.c  |  147
-rw-r--r--  src/libjpeg/jdtrans.c   |   15
-rw-r--r--  src/libjpeg/jerror.h    |   19
-rw-r--r--  src/libjpeg/jfdctflt.c  |   48
-rw-r--r--  src/libjpeg/jfdctfst.c  |   48
-rw-r--r--  src/libjpeg/jfdctint.c  | 4261
-rw-r--r--  src/libjpeg/jidctint.c  | 4936
-rw-r--r--  src/libjpeg/jmorecfg.h  |   19
-rw-r--r--  src/libjpeg/jpegint.h   |   23
-rw-r--r--  src/libjpeg/jpeglib.h   |   77
-rw-r--r--  src/libjpeg/jversion.h  |    6
33 files changed, 12035 insertions(+), 1151 deletions(-)
diff --git a/src/config.mak b/src/config.mak
index 5e730bb..d2c3c0d 100644
--- a/src/config.mak
+++ b/src/config.mak
@@ -29,14 +29,14 @@ SRCTIFF := $(addprefix libtiff/, $(SRCTIFF)) im_format_tiff.cpp
INCLUDES += libtiff
SRCJPEG = \
- jcapimin.c jcmarker.c jdapimin.c jdinput.c jdtrans.c jidctred.c \
+ jcapimin.c jcmarker.c jdapimin.c jdinput.c jdtrans.c \
jcapistd.c jcmaster.c jdapistd.c jdmainct.c jerror.c jmemmgr.c \
jccoefct.c jcomapi.c jdatadst.c jdmarker.c jfdctflt.c jmemnobs.c \
jccolor.c jcparam.c jdatasrc.c jdmaster.c jfdctfst.c jquant1.c \
- jcdctmgr.c jcphuff.c jdcoefct.c jdmerge.c jfdctint.c jquant2.c \
- jchuff.c jcprepct.c jdcolor.c jdphuff.c jidctflt.c jutils.c \
- jcinit.c jcsample.c jddctmgr.c jdpostct.c jidctfst.c \
- jcmainct.c jctrans.c jdhuff.c jdsample.c jidctint.c
+ jcdctmgr.c jdcoefct.c jdmerge.c jfdctint.c jquant2.c \
+ jchuff.c jcprepct.c jdcolor.c jidctflt.c jutils.c jdarith.c \
+ jcinit.c jcsample.c jddctmgr.c jdpostct.c jidctfst.c jaricom.c \
+ jcmainct.c jctrans.c jdhuff.c jdsample.c jidctint.c jcarith.c
SRCJPEG := $(addprefix libjpeg/, $(SRCJPEG)) im_format_jpeg.cpp
INCLUDES += libjpeg
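
The jcphuff.c/jdphuff.c progressive-Huffman modules and the reduced-size IDCT jidctred.c drop out of the source list because their functionality is folded into jchuff.c, jdhuff.c and jidctint.c further down in this patch, while jaricom.c, jcarith.c and jdarith.c add arithmetic entropy coding. A minimal sketch of how an application would opt into the new arithmetic coder (assumes an otherwise fully initialized compressor; error handling and destination setup omitted):

    #include <stdio.h>
    #include "jpeglib.h"

    /* Sketch only: cinfo is assumed to be set up by jpeg_create_compress()
     * and jpeg_set_defaults() as usual. */
    void select_arithmetic_coding (j_compress_ptr cinfo)
    {
      /* With jcarith.c/jaricom.c compiled in, this one flag switches the
       * entropy coder from the Huffman modules to arithmetic coding. */
      cinfo->arith_code = TRUE;
    }
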
diff --git a/src/libjpeg/jcapimin.c b/src/libjpeg/jcapimin.c
index 54fb8c5..563ab42 100644
--- a/src/libjpeg/jcapimin.c
+++ b/src/libjpeg/jcapimin.c
@@ -63,8 +63,10 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
cinfo->comp_info = NULL;
- for (i = 0; i < NUM_QUANT_TBLS; i++)
+ for (i = 0; i < NUM_QUANT_TBLS; i++) {
cinfo->quant_tbl_ptrs[i] = NULL;
+ cinfo->q_scale_factor[i] = 100;
+ }
for (i = 0; i < NUM_HUFF_TBLS; i++) {
cinfo->dc_huff_tbl_ptrs[i] = NULL;
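
The q_scale_factor[] array initialized here (one scale per quantization table, defaulting to 100%) backs the per-table quality scaling added elsewhere in this patch (see the jcparam.c and jpeglib.h entries in the diffstat). A usage sketch, assuming the jpeg_default_qtables()/jpeg_quality_scaling() interface this libjpeg update appears to introduce:

    #include <stdio.h>
    #include "jpeglib.h"

    /* Sketch: give luminance (table 0) and chrominance (table 1) different
     * qualities. Intended to be called after jpeg_set_defaults(). */
    void set_split_quality (j_compress_ptr cinfo)
    {
      cinfo->q_scale_factor[0] = jpeg_quality_scaling(90);  /* luma   */
      cinfo->q_scale_factor[1] = jpeg_quality_scaling(70);  /* chroma */
      /* Rebuild both quantization tables from the scale factors above. */
      jpeg_default_qtables(cinfo, TRUE /* force_baseline */);
    }
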
diff --git a/src/libjpeg/jccoefct.c b/src/libjpeg/jccoefct.c
index 1963ddb..d775313 100644
--- a/src/libjpeg/jccoefct.c
+++ b/src/libjpeg/jccoefct.c
@@ -149,6 +149,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
int blkn, bi, ci, yindex, yoffset, blockcnt;
JDIMENSION ypos, xpos;
jpeg_component_info *compptr;
+ forward_DCT_ptr forward_DCT;
/* Loop to write as much as one whole iMCU row */
for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
@@ -167,17 +168,19 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
blkn = 0;
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
+ forward_DCT = cinfo->fdct->forward_DCT[compptr->component_index];
blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
: compptr->last_col_width;
xpos = MCU_col_num * compptr->MCU_sample_width;
- ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
+ ypos = yoffset * compptr->DCT_v_scaled_size;
+ /* ypos == (yoffset+yindex) * DCTSIZE */
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
if (coef->iMCU_row_num < last_iMCU_row ||
yoffset+yindex < compptr->last_row_height) {
- (*cinfo->fdct->forward_DCT) (cinfo, compptr,
- input_buf[compptr->component_index],
- coef->MCU_buffer[blkn],
- ypos, xpos, (JDIMENSION) blockcnt);
+ (*forward_DCT) (cinfo, compptr,
+ input_buf[compptr->component_index],
+ coef->MCU_buffer[blkn],
+ ypos, xpos, (JDIMENSION) blockcnt);
if (blockcnt < compptr->MCU_width) {
/* Create some dummy blocks at the right edge of the image. */
jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt],
@@ -195,7 +198,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
}
}
blkn += compptr->MCU_width;
- ypos += DCTSIZE;
+ ypos += compptr->DCT_v_scaled_size;
}
}
/* Try to write the MCU. In event of a suspension failure, we will
@@ -252,6 +255,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
jpeg_component_info *compptr;
JBLOCKARRAY buffer;
JBLOCKROW thisblockrow, lastblockrow;
+ forward_DCT_ptr forward_DCT;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
@@ -274,15 +278,15 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
ndummy = (int) (blocks_across % h_samp_factor);
if (ndummy > 0)
ndummy = h_samp_factor - ndummy;
+ forward_DCT = cinfo->fdct->forward_DCT[ci];
/* Perform DCT for all non-dummy blocks in this iMCU row. Each call
* on forward_DCT processes a complete horizontal row of DCT blocks.
*/
for (block_row = 0; block_row < block_rows; block_row++) {
thisblockrow = buffer[block_row];
- (*cinfo->fdct->forward_DCT) (cinfo, compptr,
- input_buf[ci], thisblockrow,
- (JDIMENSION) (block_row * DCTSIZE),
- (JDIMENSION) 0, blocks_across);
+ (*forward_DCT) (cinfo, compptr, input_buf[ci], thisblockrow,
+ (JDIMENSION) (block_row * compptr->DCT_v_scaled_size),
+ (JDIMENSION) 0, blocks_across);
if (ndummy > 0) {
/* Create dummy blocks at the right edge of the image. */
thisblockrow += blocks_across; /* => first dummy block */
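
Both hunks above replace the single cinfo->fdct->forward_DCT method with a per-component pointer fetched once before the block loop, so each component can run a DCT matched to its own scaled block size. The corresponding declaration is not shown here, but judging from the jpegint.h entry in the diffstat it presumably becomes something like:

    /* Assumed shape of the jpegint.h change (exact wording may differ). */
    typedef JMETHOD(void, forward_DCT_ptr,
                    (j_compress_ptr cinfo, jpeg_component_info * compptr,
                     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
                     JDIMENSION start_row, JDIMENSION start_col,
                     JDIMENSION num_blocks));

    struct jpeg_forward_dct {
      JMETHOD(void, start_pass, (j_compress_ptr cinfo));
      /* One routine per component instead of one global method. */
      forward_DCT_ptr forward_DCT[MAX_COMPONENTS];
    };
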
diff --git a/src/libjpeg/jcdctmgr.c b/src/libjpeg/jcdctmgr.c
index 61fa79b..0bbdbb6 100644
--- a/src/libjpeg/jcdctmgr.c
+++ b/src/libjpeg/jcdctmgr.c
@@ -23,7 +23,7 @@ typedef struct {
struct jpeg_forward_dct pub; /* public fields */
/* Pointer to the DCT routine actually in use */
- forward_DCT_method_ptr do_dct;
+ forward_DCT_method_ptr do_dct[MAX_COMPONENTS];
/* The actual post-DCT divisors --- not identical to the quant table
* entries, because of scaling (especially for an unnormalized DCT).
@@ -33,7 +33,7 @@ typedef struct {
#ifdef DCT_FLOAT_SUPPORTED
/* Same as above for the floating-point case. */
- float_DCT_method_ptr do_float_dct;
+ float_DCT_method_ptr do_float_dct[MAX_COMPONENTS];
FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
#endif
} my_fdct_controller;
@@ -41,6 +41,132 @@ typedef struct {
typedef my_fdct_controller * my_fdct_ptr;
+/* The current scaled-DCT routines require ISLOW-style divisor tables,
+ * so be sure to compile that code if either ISLOW or SCALING is requested.
+ */
+#ifdef DCT_ISLOW_SUPPORTED
+#define PROVIDE_ISLOW_TABLES
+#else
+#ifdef DCT_SCALING_SUPPORTED
+#define PROVIDE_ISLOW_TABLES
+#endif
+#endif
+
+
+/*
+ * Perform forward DCT on one or more blocks of a component.
+ *
+ * The input samples are taken from the sample_data[] array starting at
+ * position start_row/start_col, and moving to the right for any additional
+ * blocks. The quantized coefficients are returned in coef_blocks[].
+ */
+
+METHODDEF(void)
+forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+ JDIMENSION start_row, JDIMENSION start_col,
+ JDIMENSION num_blocks)
+/* This version is used for integer DCT implementations. */
+{
+ /* This routine is heavily used, so it's worth coding it tightly. */
+ my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+ forward_DCT_method_ptr do_dct = fdct->do_dct[compptr->component_index];
+ DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
+ DCTELEM workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+ JDIMENSION bi;
+
+ sample_data += start_row; /* fold in the vertical offset once */
+
+ for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
+ /* Perform the DCT */
+ (*do_dct) (workspace, sample_data, start_col);
+
+ /* Quantize/descale the coefficients, and store into coef_blocks[] */
+ { register DCTELEM temp, qval;
+ register int i;
+ register JCOEFPTR output_ptr = coef_blocks[bi];
+
+ for (i = 0; i < DCTSIZE2; i++) {
+ qval = divisors[i];
+ temp = workspace[i];
+ /* Divide the coefficient value by qval, ensuring proper rounding.
+ * Since C does not specify the direction of rounding for negative
+ * quotients, we have to force the dividend positive for portability.
+ *
+ * In most files, at least half of the output values will be zero
+ * (at default quantization settings, more like three-quarters...)
+ * so we should ensure that this case is fast. On many machines,
+ * a comparison is enough cheaper than a divide to make a special test
+ * a win. Since both inputs will be nonnegative, we need only test
+ * for a < b to discover whether a/b is 0.
+ * If your machine's division is fast enough, define FAST_DIVIDE.
+ */
+#ifdef FAST_DIVIDE
+#define DIVIDE_BY(a,b) a /= b
+#else
+#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
+#endif
+ if (temp < 0) {
+ temp = -temp;
+ temp += qval>>1; /* for rounding */
+ DIVIDE_BY(temp, qval);
+ temp = -temp;
+ } else {
+ temp += qval>>1; /* for rounding */
+ DIVIDE_BY(temp, qval);
+ }
+ output_ptr[i] = (JCOEF) temp;
+ }
+ }
+ }
+}
+
+
+#ifdef DCT_FLOAT_SUPPORTED
+
+METHODDEF(void)
+forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+ JDIMENSION start_row, JDIMENSION start_col,
+ JDIMENSION num_blocks)
+/* This version is used for floating-point DCT implementations. */
+{
+ /* This routine is heavily used, so it's worth coding it tightly. */
+ my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+ float_DCT_method_ptr do_dct = fdct->do_float_dct[compptr->component_index];
+ FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
+ FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+ JDIMENSION bi;
+
+ sample_data += start_row; /* fold in the vertical offset once */
+
+ for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
+ /* Perform the DCT */
+ (*do_dct) (workspace, sample_data, start_col);
+
+ /* Quantize/descale the coefficients, and store into coef_blocks[] */
+ { register FAST_FLOAT temp;
+ register int i;
+ register JCOEFPTR output_ptr = coef_blocks[bi];
+
+ for (i = 0; i < DCTSIZE2; i++) {
+ /* Apply the quantization and scaling factor */
+ temp = workspace[i] * divisors[i];
+ /* Round to nearest integer.
+ * Since C does not specify the direction of rounding for negative
+ * quotients, we have to force the dividend positive for portability.
+ * The maximum coefficient size is +-16K (for 12-bit data), so this
+ * code should work for either 16-bit or 32-bit ints.
+ */
+ output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+ }
+ }
+ }
+}
+
+#endif /* DCT_FLOAT_SUPPORTED */
+
+
/*
* Initialize for a processing pass.
* Verify that all referenced Q-tables are present, and set up
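
The two quantization loops above rest on the portability tricks their comments describe: integer division is applied to the magnitude because C89 leaves the rounding of negative quotients implementation-defined, and the float path rounds by adding 16384.5 and subtracting 16384 after truncation. A small self-contained demonstration, independent of the libjpeg types (FAST_DIVIDE variant shown for brevity):

    #include <stdio.h>

    /* Divide coef by qval with rounding on the magnitude, as in the integer
     * forward_DCT above (the FAST_DIVIDE variant, for brevity). */
    static int quantize_int (int coef, int qval)
    {
      int temp = coef < 0 ? -coef : coef;   /* force the dividend positive */
      temp = (temp + (qval >> 1)) / qval;   /* rounded magnitude */
      return coef < 0 ? -temp : temp;
    }

    /* Round a scaled float coefficient to the nearest int without floor():
     * valid for |x| < 16K, matching the comment in forward_DCT_float above. */
    static int quantize_float (double scaled)
    {
      return (int) (scaled + 16384.5) - 16384;
    }

    int main (void)
    {
      printf("%d %d\n", quantize_int(-13, 8), quantize_int(13, 8));   /* -2 2 */
      printf("%d %d\n", quantize_float(-1.4), quantize_float(1.6));   /* -1 2 */
      return 0;
    }
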
@@ -56,11 +182,170 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
int ci, qtblno, i;
jpeg_component_info *compptr;
+ int method = 0;
JQUANT_TBL * qtbl;
DCTELEM * dtbl;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
+ /* Select the proper DCT routine for this component's scaling */
+ switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) {
+#ifdef DCT_SCALING_SUPPORTED
+ case ((1 << 8) + 1):
+ fdct->do_dct[ci] = jpeg_fdct_1x1;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((2 << 8) + 2):
+ fdct->do_dct[ci] = jpeg_fdct_2x2;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((3 << 8) + 3):
+ fdct->do_dct[ci] = jpeg_fdct_3x3;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((4 << 8) + 4):
+ fdct->do_dct[ci] = jpeg_fdct_4x4;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((5 << 8) + 5):
+ fdct->do_dct[ci] = jpeg_fdct_5x5;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((6 << 8) + 6):
+ fdct->do_dct[ci] = jpeg_fdct_6x6;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((7 << 8) + 7):
+ fdct->do_dct[ci] = jpeg_fdct_7x7;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((9 << 8) + 9):
+ fdct->do_dct[ci] = jpeg_fdct_9x9;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((10 << 8) + 10):
+ fdct->do_dct[ci] = jpeg_fdct_10x10;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((11 << 8) + 11):
+ fdct->do_dct[ci] = jpeg_fdct_11x11;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((12 << 8) + 12):
+ fdct->do_dct[ci] = jpeg_fdct_12x12;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((13 << 8) + 13):
+ fdct->do_dct[ci] = jpeg_fdct_13x13;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((14 << 8) + 14):
+ fdct->do_dct[ci] = jpeg_fdct_14x14;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((15 << 8) + 15):
+ fdct->do_dct[ci] = jpeg_fdct_15x15;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((16 << 8) + 16):
+ fdct->do_dct[ci] = jpeg_fdct_16x16;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((16 << 8) + 8):
+ fdct->do_dct[ci] = jpeg_fdct_16x8;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((14 << 8) + 7):
+ fdct->do_dct[ci] = jpeg_fdct_14x7;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((12 << 8) + 6):
+ fdct->do_dct[ci] = jpeg_fdct_12x6;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((10 << 8) + 5):
+ fdct->do_dct[ci] = jpeg_fdct_10x5;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((8 << 8) + 4):
+ fdct->do_dct[ci] = jpeg_fdct_8x4;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((6 << 8) + 3):
+ fdct->do_dct[ci] = jpeg_fdct_6x3;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((4 << 8) + 2):
+ fdct->do_dct[ci] = jpeg_fdct_4x2;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((2 << 8) + 1):
+ fdct->do_dct[ci] = jpeg_fdct_2x1;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((8 << 8) + 16):
+ fdct->do_dct[ci] = jpeg_fdct_8x16;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((7 << 8) + 14):
+ fdct->do_dct[ci] = jpeg_fdct_7x14;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((6 << 8) + 12):
+ fdct->do_dct[ci] = jpeg_fdct_6x12;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((5 << 8) + 10):
+ fdct->do_dct[ci] = jpeg_fdct_5x10;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((4 << 8) + 8):
+ fdct->do_dct[ci] = jpeg_fdct_4x8;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((3 << 8) + 6):
+ fdct->do_dct[ci] = jpeg_fdct_3x6;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((2 << 8) + 4):
+ fdct->do_dct[ci] = jpeg_fdct_2x4;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+ case ((1 << 8) + 2):
+ fdct->do_dct[ci] = jpeg_fdct_1x2;
+ method = JDCT_ISLOW; /* jfdctint uses islow-style table */
+ break;
+#endif
+ case ((DCTSIZE << 8) + DCTSIZE):
+ switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+ case JDCT_ISLOW:
+ fdct->do_dct[ci] = jpeg_fdct_islow;
+ method = JDCT_ISLOW;
+ break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+ case JDCT_IFAST:
+ fdct->do_dct[ci] = jpeg_fdct_ifast;
+ method = JDCT_IFAST;
+ break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+ case JDCT_FLOAT:
+ fdct->do_float_dct[ci] = jpeg_fdct_float;
+ method = JDCT_FLOAT;
+ break;
+#endif
+ default:
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+ break;
+ }
+ break;
+ default:
+ ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+ compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
+ break;
+ }
qtblno = compptr->quant_tbl_no;
/* Make sure specified quantization table is present */
if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
@@ -69,8 +354,8 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
qtbl = cinfo->quant_tbl_ptrs[qtblno];
/* Compute divisors for this quant table */
/* We may do this more than once for same table, but it's not a big deal */
- switch (cinfo->dct_method) {
-#ifdef DCT_ISLOW_SUPPORTED
+ switch (method) {
+#ifdef PROVIDE_ISLOW_TABLES
case JDCT_ISLOW:
/* For LL&M IDCT method, divisors are equal to raw quantization
* coefficients multiplied by 8 (to counteract scaling).
@@ -84,6 +369,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
for (i = 0; i < DCTSIZE2; i++) {
dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
}
+ fdct->pub.forward_DCT[ci] = forward_DCT;
break;
#endif
#ifdef DCT_IFAST_SUPPORTED
@@ -122,6 +408,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
CONST_BITS-3);
}
}
+ fdct->pub.forward_DCT[ci] = forward_DCT;
break;
#endif
#ifdef DCT_FLOAT_SUPPORTED
@@ -158,6 +445,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
}
}
}
+ fdct->pub.forward_DCT[ci] = forward_DCT_float;
break;
#endif
default:
@@ -169,175 +457,6 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
/*
- * Perform forward DCT on one or more blocks of a component.
- *
- * The input samples are taken from the sample_data[] array starting at
- * position start_row/start_col, and moving to the right for any additional
- * blocks. The quantized coefficients are returned in coef_blocks[].
- */
-
-METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
- JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
- JDIMENSION start_row, JDIMENSION start_col,
- JDIMENSION num_blocks)
-/* This version is used for integer DCT implementations. */
-{
- /* This routine is heavily used, so it's worth coding it tightly. */
- my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
- forward_DCT_method_ptr do_dct = fdct->do_dct;
- DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
- DCTELEM workspace[DCTSIZE2]; /* work area for FDCT subroutine */
- JDIMENSION bi;
-
- sample_data += start_row; /* fold in the vertical offset once */
-
- for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
- /* Load data into workspace, applying unsigned->signed conversion */
- { register DCTELEM *workspaceptr;
- register JSAMPROW elemptr;
- register int elemr;
-
- workspaceptr = workspace;
- for (elemr = 0; elemr < DCTSIZE; elemr++) {
- elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8 /* unroll the inner loop */
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
- { register int elemc;
- for (elemc = DCTSIZE; elemc > 0; elemc--) {
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- }
- }
-#endif
- }
- }
-
- /* Perform the DCT */
- (*do_dct) (workspace);
-
- /* Quantize/descale the coefficients, and store into coef_blocks[] */
- { register DCTELEM temp, qval;
- register int i;
- register JCOEFPTR output_ptr = coef_blocks[bi];
-
- for (i = 0; i < DCTSIZE2; i++) {
- qval = divisors[i];
- temp = workspace[i];
- /* Divide the coefficient value by qval, ensuring proper rounding.
- * Since C does not specify the direction of rounding for negative
- * quotients, we have to force the dividend positive for portability.
- *
- * In most files, at least half of the output values will be zero
- * (at default quantization settings, more like three-quarters...)
- * so we should ensure that this case is fast. On many machines,
- * a comparison is enough cheaper than a divide to make a special test
- * a win. Since both inputs will be nonnegative, we need only test
- * for a < b to discover whether a/b is 0.
- * If your machine's division is fast enough, define FAST_DIVIDE.
- */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b) a /= b
-#else
-#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
-#endif
- if (temp < 0) {
- temp = -temp;
- temp += qval>>1; /* for rounding */
- DIVIDE_BY(temp, qval);
- temp = -temp;
- } else {
- temp += qval>>1; /* for rounding */
- DIVIDE_BY(temp, qval);
- }
- output_ptr[i] = (JCOEF) temp;
- }
- }
- }
-}
-
-
-#ifdef DCT_FLOAT_SUPPORTED
-
-METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
- JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
- JDIMENSION start_row, JDIMENSION start_col,
- JDIMENSION num_blocks)
-/* This version is used for floating-point DCT implementations. */
-{
- /* This routine is heavily used, so it's worth coding it tightly. */
- my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
- float_DCT_method_ptr do_dct = fdct->do_float_dct;
- FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
- FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
- JDIMENSION bi;
-
- sample_data += start_row; /* fold in the vertical offset once */
-
- for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
- /* Load data into workspace, applying unsigned->signed conversion */
- { register FAST_FLOAT *workspaceptr;
- register JSAMPROW elemptr;
- register int elemr;
-
- workspaceptr = workspace;
- for (elemr = 0; elemr < DCTSIZE; elemr++) {
- elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8 /* unroll the inner loop */
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
- { register int elemc;
- for (elemc = DCTSIZE; elemc > 0; elemc--) {
- *workspaceptr++ = (FAST_FLOAT)
- (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- }
- }
-#endif
- }
- }
-
- /* Perform the DCT */
- (*do_dct) (workspace);
-
- /* Quantize/descale the coefficients, and store into coef_blocks[] */
- { register FAST_FLOAT temp;
- register int i;
- register JCOEFPTR output_ptr = coef_blocks[bi];
-
- for (i = 0; i < DCTSIZE2; i++) {
- /* Apply the quantization and scaling factor */
- temp = workspace[i] * divisors[i];
- /* Round to nearest integer.
- * Since C does not specify the direction of rounding for negative
- * quotients, we have to force the dividend positive for portability.
- * The maximum coefficient size is +-16K (for 12-bit data), so this
- * code should work for either 16-bit or 32-bit ints.
- */
- output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
- }
- }
- }
-}
-
-#endif /* DCT_FLOAT_SUPPORTED */
-
-
-/*
* Initialize FDCT manager.
*/
@@ -353,30 +472,6 @@ jinit_forward_dct (j_compress_ptr cinfo)
cinfo->fdct = (struct jpeg_forward_dct *) fdct;
fdct->pub.start_pass = start_pass_fdctmgr;
- switch (cinfo->dct_method) {
-#ifdef DCT_ISLOW_SUPPORTED
- case JDCT_ISLOW:
- fdct->pub.forward_DCT = forward_DCT;
- fdct->do_dct = jpeg_fdct_islow;
- break;
-#endif
-#ifdef DCT_IFAST_SUPPORTED
- case JDCT_IFAST:
- fdct->pub.forward_DCT = forward_DCT;
- fdct->do_dct = jpeg_fdct_ifast;
- break;
-#endif
-#ifdef DCT_FLOAT_SUPPORTED
- case JDCT_FLOAT:
- fdct->pub.forward_DCT = forward_DCT_float;
- fdct->do_float_dct = jpeg_fdct_float;
- break;
-#endif
- default:
- ERREXIT(cinfo, JERR_NOT_COMPILED);
- break;
- }
-
/* Mark divisor tables unallocated */
for (i = 0; i < NUM_QUANT_TBLS; i++) {
fdct->divisors[i] = NULL;
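
The long switch in start_pass_fdctmgr above packs the horizontal scaled size into the high byte and the vertical size into the low byte, so a single integer key dispatches to the matching jpeg_fdct_NxM routine; only the plain 8x8 case still consults cinfo->dct_method. A stripped-down sketch of the same dispatch pattern (hypothetical names, outside libjpeg):

    /* The (h << 8) + v key used by start_pass_fdctmgr, shown in isolation. */
    #define DCT_SIZE_KEY(h, v)  (((h) << 8) + (v))

    /* Hypothetical dispatcher illustrating the pattern; the real switch above
     * covers every supported N x M combination and stores a function pointer
     * in fdct->do_dct[ci] instead of returning a name. */
    static const char *
    pick_fdct_name (int h_scaled_size, int v_scaled_size)
    {
      switch (DCT_SIZE_KEY(h_scaled_size, v_scaled_size)) {
      case DCT_SIZE_KEY(8, 8):   return "jpeg_fdct_islow / ifast / float";
      case DCT_SIZE_KEY(4, 4):   return "jpeg_fdct_4x4";
      case DCT_SIZE_KEY(16, 8):  return "jpeg_fdct_16x8";
      default:                   return "unsupported scaling";
      }
    }
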
diff --git a/src/libjpeg/jchuff.c b/src/libjpeg/jchuff.c
index f235250..0fd578e 100644
--- a/src/libjpeg/jchuff.c
+++ b/src/libjpeg/jchuff.c
@@ -2,22 +2,48 @@
* jchuff.c
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2006-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains Huffman entropy encoding routines.
+ * Both sequential and progressive modes are supported in this single module.
*
* Much of the complexity here has to do with supporting output suspension.
* If the data destination module demands suspension, we want to be able to
* back up to the start of the current MCU. To do this, we copy state
* variables into local working storage, and update them back to the
* permanent JPEG objects only upon successful completion of an MCU.
+ *
+ * We do not support output suspension for the progressive JPEG mode, since
+ * the library currently does not allow multiple-scan files to be written
+ * with output suspension.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
-#include "jchuff.h" /* Declarations shared with jcphuff.c */
+
+
+/* The legal range of a DCT coefficient is
+ * -1024 .. +1023 for 8-bit data;
+ * -16384 .. +16383 for 12-bit data.
+ * Hence the magnitude should always fit in 10 or 14 bits respectively.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define MAX_COEF_BITS 10
+#else
+#define MAX_COEF_BITS 14
+#endif
+
+/* Derived data constructed for each Huffman table */
+
+typedef struct {
+ unsigned int ehufco[256]; /* code for each symbol */
+ char ehufsi[256]; /* length of code for each symbol */
+ /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
+} c_derived_tbl;
/* Expanded entropy encoder object for Huffman encoding.
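
MAX_COEF_BITS above is the largest magnitude category a legal coefficient can require (10 bits for 8-bit samples, 14 for 12-bit); every encoder below computes that category with a shift loop and emits either the value itself or, for negatives, the complement of its magnitude. A standalone illustration of the nbits/temp2 logic (two's complement assumed, as in the original code):

    #include <stdio.h>

    /* Return the JPEG magnitude category (number of bits) of value, and the
     * bit pattern emitted after the category symbol: the value itself when
     * positive, the complement of its magnitude when negative. Mirrors the
     * nbits/temp2 computation in the encoders below. */
    static int magnitude_category (int value, unsigned int * emitted_bits)
    {
      int temp = value, temp2 = value, nbits = 0;
      if (temp < 0) {
        temp = -temp;
        temp2--;              /* bitwise complement of abs(value) */
      }
      while (temp) {          /* count bits in the magnitude */
        nbits++;
        temp >>= 1;
      }
      *emitted_bits = ((unsigned int) temp2) & ((1u << nbits) - 1u);
      return nbits;
    }

    int main (void)
    {
      unsigned int bits;
      int n = magnitude_category(-5, &bits);
      printf("category %d, bits 0x%X\n", n, bits);  /* category 3, bits 0x2 */
      return 0;
    }
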
@@ -61,19 +87,47 @@ typedef struct {
unsigned int restarts_to_go; /* MCUs left in this restart interval */
int next_restart_num; /* next restart number to write (0-7) */
+ /* Following four fields used only in sequential mode */
+
/* Pointers to derived tables (these workspaces have image lifespan) */
c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
-#ifdef ENTROPY_OPT_SUPPORTED /* Statistics tables for optimization */
+ /* Statistics tables for optimization */
long * dc_count_ptrs[NUM_HUFF_TBLS];
long * ac_count_ptrs[NUM_HUFF_TBLS];
-#endif
+
+ /* Following fields used only in progressive mode */
+
+ /* Mode flag: TRUE for optimization, FALSE for actual data output */
+ boolean gather_statistics;
+
+ /* next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
+ */
+ JOCTET * next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ j_compress_ptr cinfo; /* link to cinfo (needed for dump_buffer) */
+
+ /* Coding status for AC components */
+ int ac_tbl_no; /* the table number of the single component */
+ unsigned int EOBRUN; /* run length of EOBs */
+ unsigned int BE; /* # of buffered correction bits before MCU */
+ char * bit_buffer; /* buffer for correction bits (1 per char) */
+ /* packing correction bits tightly would save some space but cost time... */
+
+ /* Pointers to derived tables (these workspaces have image lifespan).
+ * Since any one scan in progressive mode codes only DC or only AC,
+ * we only need one set of tables, not one for DC and one for AC.
+ */
+ c_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
+
+ /* Statistics tables for optimization; again, one set is enough */
+ long * count_ptrs[NUM_HUFF_TBLS];
} huff_entropy_encoder;
typedef huff_entropy_encoder * huff_entropy_ptr;
-/* Working state while writing an MCU.
+/* Working state while writing an MCU (sequential mode).
* This struct contains all the fields that are needed by subroutines.
*/
@@ -84,98 +138,37 @@ typedef struct {
j_compress_ptr cinfo; /* dump_buffer needs access to this */
} working_state;
+/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
+ * buffer can hold. Larger sizes may slightly improve compression, but
+ * 1000 is already well into the realm of overkill.
+ * The minimum safe size is 64 bits.
+ */
-/* Forward declarations */
-METHODDEF(boolean) encode_mcu_huff JPP((j_compress_ptr cinfo,
- JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_huff JPP((j_compress_ptr cinfo));
-#ifdef ENTROPY_OPT_SUPPORTED
-METHODDEF(boolean) encode_mcu_gather JPP((j_compress_ptr cinfo,
- JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_gather JPP((j_compress_ptr cinfo));
-#endif
-
+#define MAX_CORR_BITS 1000 /* Max # of correction bits I can buffer */
-/*
- * Initialize for a Huffman-compressed scan.
- * If gather_statistics is TRUE, we do not output anything during the scan,
- * just count the Huffman symbols used and generate Huffman code tables.
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
+ * We assume that int right shift is unsigned if INT32 right shift is,
+ * which should be safe.
*/
-METHODDEF(void)
-start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
-{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
- int ci, dctbl, actbl;
- jpeg_component_info * compptr;
-
- if (gather_statistics) {
-#ifdef ENTROPY_OPT_SUPPORTED
- entropy->pub.encode_mcu = encode_mcu_gather;
- entropy->pub.finish_pass = finish_pass_gather;
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS int ishift_temp;
+#define IRIGHT_SHIFT(x,shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+ (ishift_temp >> (shft)))
#else
- ERREXIT(cinfo, JERR_NOT_COMPILED);
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
#endif
- } else {
- entropy->pub.encode_mcu = encode_mcu_huff;
- entropy->pub.finish_pass = finish_pass_huff;
- }
-
- for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
- compptr = cinfo->cur_comp_info[ci];
- dctbl = compptr->dc_tbl_no;
- actbl = compptr->ac_tbl_no;
- if (gather_statistics) {
-#ifdef ENTROPY_OPT_SUPPORTED
- /* Check for invalid table indexes */
- /* (make_c_derived_tbl does this in the other path) */
- if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS)
- ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
- if (actbl < 0 || actbl >= NUM_HUFF_TBLS)
- ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl);
- /* Allocate and zero the statistics tables */
- /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
- if (entropy->dc_count_ptrs[dctbl] == NULL)
- entropy->dc_count_ptrs[dctbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- 257 * SIZEOF(long));
- MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long));
- if (entropy->ac_count_ptrs[actbl] == NULL)
- entropy->ac_count_ptrs[actbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- 257 * SIZEOF(long));
- MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long));
-#endif
- } else {
- /* Compute derived values for Huffman tables */
- /* We may do this more than once for a table, but it's not expensive */
- jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
- & entropy->dc_derived_tbls[dctbl]);
- jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
- & entropy->ac_derived_tbls[actbl]);
- }
- /* Initialize DC predictions to 0 */
- entropy->saved.last_dc_val[ci] = 0;
- }
-
- /* Initialize bit buffer to empty */
- entropy->saved.put_buffer = 0;
- entropy->saved.put_bits = 0;
-
- /* Initialize restart stuff */
- entropy->restarts_to_go = cinfo->restart_interval;
- entropy->next_restart_num = 0;
-}
/*
* Compute the derived values for a Huffman table.
* This routine also performs some validation checks on the table.
- *
- * Note this is also used by jcphuff.c.
*/
-GLOBAL(void)
+LOCAL(void)
jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
c_derived_tbl ** pdtbl)
{
@@ -264,18 +257,27 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
}
-/* Outputting bytes to the file */
+/* Outputting bytes to the file.
+ * NB: these must be called only when actually outputting,
+ * that is, entropy->gather_statistics == FALSE.
+ */
/* Emit a byte, taking 'action' if must suspend. */
-#define emit_byte(state,val,action) \
+#define emit_byte_s(state,val,action) \
{ *(state)->next_output_byte++ = (JOCTET) (val); \
if (--(state)->free_in_buffer == 0) \
- if (! dump_buffer(state)) \
+ if (! dump_buffer_s(state)) \
{ action; } }
+/* Emit a byte */
+#define emit_byte_e(entropy,val) \
+ { *(entropy)->next_output_byte++ = (JOCTET) (val); \
+ if (--(entropy)->free_in_buffer == 0) \
+ dump_buffer_e(entropy); }
+
LOCAL(boolean)
-dump_buffer (working_state * state)
+dump_buffer_s (working_state * state)
/* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
{
struct jpeg_destination_mgr * dest = state->cinfo->dest;
@@ -289,6 +291,20 @@ dump_buffer (working_state * state)
}
+LOCAL(void)
+dump_buffer_e (huff_entropy_ptr entropy)
+/* Empty the output buffer; we do not support suspension in this case. */
+{
+ struct jpeg_destination_mgr * dest = entropy->cinfo->dest;
+
+ if (! (*dest->empty_output_buffer) (entropy->cinfo))
+ ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
+ /* After a successful buffer dump, must reset buffer pointers */
+ entropy->next_output_byte = dest->next_output_byte;
+ entropy->free_in_buffer = dest->free_in_buffer;
+}
+
+
/* Outputting bits to the file */
/* Only the right 24 bits of put_buffer are used; the valid bits are
@@ -299,7 +315,7 @@ dump_buffer (working_state * state)
INLINE
LOCAL(boolean)
-emit_bits (working_state * state, unsigned int code, int size)
+emit_bits_s (working_state * state, unsigned int code, int size)
/* Emit some bits; return TRUE if successful, FALSE if must suspend */
{
/* This routine is heavily used, so it's worth coding tightly. */
@@ -321,9 +337,9 @@ emit_bits (working_state * state, unsigned int code, int size)
while (put_bits >= 8) {
int c = (int) ((put_buffer >> 16) & 0xFF);
- emit_byte(state, c, return FALSE);
+ emit_byte_s(state, c, return FALSE);
if (c == 0xFF) { /* need to stuff a zero byte? */
- emit_byte(state, 0, return FALSE);
+ emit_byte_s(state, 0, return FALSE);
}
put_buffer <<= 8;
put_bits -= 8;
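
The while loop above also implements JPEG's byte-stuffing rule: any 0xFF produced inside entropy-coded data must be followed by a 0x00 so decoders cannot mistake it for a marker. The same rule applied to a plain memory buffer, as a hypothetical helper for illustration only:

    #include <stddef.h>

    /* Copy entropy-coded bytes into dst, inserting a 0x00 after each 0xFF.
     * Returns the number of bytes written; dst must be able to hold up to
     * 2 * len bytes in the worst case. */
    static size_t stuff_bytes (unsigned char * dst, const unsigned char * src,
                               size_t len)
    {
      size_t out = 0, i;
      for (i = 0; i < len; i++) {
        dst[out++] = src[i];
        if (src[i] == 0xFF)
          dst[out++] = 0x00;       /* stuffed zero byte, as in emit_bits above */
      }
      return out;
    }
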
@@ -336,17 +352,554 @@ emit_bits (working_state * state, unsigned int code, int size)
}
+INLINE
+LOCAL(void)
+emit_bits_e (huff_entropy_ptr entropy, unsigned int code, int size)
+/* Emit some bits, unless we are in gather mode */
+{
+ /* This routine is heavily used, so it's worth coding tightly. */
+ register INT32 put_buffer = (INT32) code;
+ register int put_bits = entropy->saved.put_bits;
+
+ /* if size is 0, caller used an invalid Huffman table entry */
+ if (size == 0)
+ ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
+
+ if (entropy->gather_statistics)
+ return; /* do nothing if we're only getting stats */
+
+ put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
+
+ put_bits += size; /* new number of bits in buffer */
+
+ put_buffer <<= 24 - put_bits; /* align incoming bits */
+
+ /* and merge with old buffer contents */
+ put_buffer |= entropy->saved.put_buffer;
+
+ while (put_bits >= 8) {
+ int c = (int) ((put_buffer >> 16) & 0xFF);
+
+ emit_byte_e(entropy, c);
+ if (c == 0xFF) { /* need to stuff a zero byte? */
+ emit_byte_e(entropy, 0);
+ }
+ put_buffer <<= 8;
+ put_bits -= 8;
+ }
+
+ entropy->saved.put_buffer = put_buffer; /* update variables */
+ entropy->saved.put_bits = put_bits;
+}
+
+
LOCAL(boolean)
-flush_bits (working_state * state)
+flush_bits_s (working_state * state)
{
- if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */
+ if (! emit_bits_s(state, 0x7F, 7)) /* fill any partial byte with ones */
return FALSE;
- state->cur.put_buffer = 0; /* and reset bit-buffer to empty */
+ state->cur.put_buffer = 0; /* and reset bit-buffer to empty */
state->cur.put_bits = 0;
return TRUE;
}
+LOCAL(void)
+flush_bits_e (huff_entropy_ptr entropy)
+{
+ emit_bits_e(entropy, 0x7F, 7); /* fill any partial byte with ones */
+ entropy->saved.put_buffer = 0; /* and reset bit-buffer to empty */
+ entropy->saved.put_bits = 0;
+}
+
+
+/*
+ * Emit (or just count) a Huffman symbol.
+ */
+
+INLINE
+LOCAL(void)
+emit_symbol (huff_entropy_ptr entropy, int tbl_no, int symbol)
+{
+ if (entropy->gather_statistics)
+ entropy->count_ptrs[tbl_no][symbol]++;
+ else {
+ c_derived_tbl * tbl = entropy->derived_tbls[tbl_no];
+ emit_bits_e(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
+ }
+}
+
+
+/*
+ * Emit bits from a correction bit buffer.
+ */
+
+LOCAL(void)
+emit_buffered_bits (huff_entropy_ptr entropy, char * bufstart,
+ unsigned int nbits)
+{
+ if (entropy->gather_statistics)
+ return; /* no real work */
+
+ while (nbits > 0) {
+ emit_bits_e(entropy, (unsigned int) (*bufstart), 1);
+ bufstart++;
+ nbits--;
+ }
+}
+
+
+/*
+ * Emit any pending EOBRUN symbol.
+ */
+
+LOCAL(void)
+emit_eobrun (huff_entropy_ptr entropy)
+{
+ register int temp, nbits;
+
+ if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */
+ temp = entropy->EOBRUN;
+ nbits = 0;
+ while ((temp >>= 1))
+ nbits++;
+ /* safety check: shouldn't happen given limited correction-bit buffer */
+ if (nbits > 14)
+ ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
+
+ emit_symbol(entropy, entropy->ac_tbl_no, nbits << 4);
+ if (nbits)
+ emit_bits_e(entropy, entropy->EOBRUN, nbits);
+
+ entropy->EOBRUN = 0;
+
+ /* Emit any buffered correction bits */
+ emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE);
+ entropy->BE = 0;
+ }
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(boolean)
+emit_restart_s (working_state * state, int restart_num)
+{
+ int ci;
+
+ if (! flush_bits_s(state))
+ return FALSE;
+
+ emit_byte_s(state, 0xFF, return FALSE);
+ emit_byte_s(state, JPEG_RST0 + restart_num, return FALSE);
+
+ /* Re-initialize DC predictions to 0 */
+ for (ci = 0; ci < state->cinfo->comps_in_scan; ci++)
+ state->cur.last_dc_val[ci] = 0;
+
+ /* The restart counter is not updated until we successfully write the MCU. */
+
+ return TRUE;
+}
+
+
+LOCAL(void)
+emit_restart_e (huff_entropy_ptr entropy, int restart_num)
+{
+ int ci;
+
+ emit_eobrun(entropy);
+
+ if (! entropy->gather_statistics) {
+ flush_bits_e(entropy);
+ emit_byte_e(entropy, 0xFF);
+ emit_byte_e(entropy, JPEG_RST0 + restart_num);
+ }
+
+ if (entropy->cinfo->Ss == 0) {
+ /* Re-initialize DC predictions to 0 */
+ for (ci = 0; ci < entropy->cinfo->comps_in_scan; ci++)
+ entropy->saved.last_dc_val[ci] = 0;
+ } else {
+ /* Re-initialize all AC-related fields to 0 */
+ entropy->EOBRUN = 0;
+ entropy->BE = 0;
+ }
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ register int temp, temp2;
+ register int nbits;
+ int blkn, ci;
+ int Al = cinfo->Al;
+ JBLOCKROW block;
+ jpeg_component_info * compptr;
+ ISHIFT_TEMPS
+
+ entropy->next_output_byte = cinfo->dest->next_output_byte;
+ entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval)
+ if (entropy->restarts_to_go == 0)
+ emit_restart_e(entropy, entropy->next_restart_num);
+
+ /* Encode the MCU data blocks */
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+ ci = cinfo->MCU_membership[blkn];
+ compptr = cinfo->cur_comp_info[ci];
+
+ /* Compute the DC value after the required point transform by Al.
+ * This is simply an arithmetic right shift.
+ */
+ temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
+
+ /* DC differences are figured on the point-transformed values. */
+ temp = temp2 - entropy->saved.last_dc_val[ci];
+ entropy->saved.last_dc_val[ci] = temp2;
+
+ /* Encode the DC coefficient difference per section G.1.2.1 */
+ temp2 = temp;
+ if (temp < 0) {
+ temp = -temp; /* temp is abs value of input */
+ /* For a negative input, want temp2 = bitwise complement of abs(input) */
+ /* This code assumes we are on a two's complement machine */
+ temp2--;
+ }
+
+ /* Find the number of bits needed for the magnitude of the coefficient */
+ nbits = 0;
+ while (temp) {
+ nbits++;
+ temp >>= 1;
+ }
+ /* Check for out-of-range coefficient values.
+ * Since we're encoding a difference, the range limit is twice as much.
+ */
+ if (nbits > MAX_COEF_BITS+1)
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+ /* Count/emit the Huffman-coded symbol for the number of bits */
+ emit_symbol(entropy, compptr->dc_tbl_no, nbits);
+
+ /* Emit that number of bits of the value, if positive, */
+ /* or the complement of its magnitude, if negative. */
+ if (nbits) /* emit_bits rejects calls with size 0 */
+ emit_bits_e(entropy, (unsigned int) temp2, nbits);
+ }
+
+ cinfo->dest->next_output_byte = entropy->next_output_byte;
+ cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+ /* Update restart-interval state too */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7;
+ }
+ entropy->restarts_to_go--;
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ register int temp, temp2;
+ register int nbits;
+ register int r, k;
+ int Se = cinfo->Se;
+ int Al = cinfo->Al;
+ JBLOCKROW block;
+
+ entropy->next_output_byte = cinfo->dest->next_output_byte;
+ entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval)
+ if (entropy->restarts_to_go == 0)
+ emit_restart_e(entropy, entropy->next_restart_num);
+
+ /* Encode the MCU data block */
+ block = MCU_data[0];
+
+ /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
+
+ r = 0; /* r = run length of zeros */
+
+ for (k = cinfo->Ss; k <= Se; k++) {
+ if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
+ r++;
+ continue;
+ }
+ /* We must apply the point transform by Al. For AC coefficients this
+ * is an integer division with rounding towards 0. To do this portably
+ * in C, we shift after obtaining the absolute value; so the code is
+ * interwoven with finding the abs value (temp) and output bits (temp2).
+ */
+ if (temp < 0) {
+ temp = -temp; /* temp is abs value of input */
+ temp >>= Al; /* apply the point transform */
+ /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
+ temp2 = ~temp;
+ } else {
+ temp >>= Al; /* apply the point transform */
+ temp2 = temp;
+ }
+ /* Watch out for case that nonzero coef is zero after point transform */
+ if (temp == 0) {
+ r++;
+ continue;
+ }
+
+ /* Emit any pending EOBRUN */
+ if (entropy->EOBRUN > 0)
+ emit_eobrun(entropy);
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */
+ while (r > 15) {
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
+ r -= 16;
+ }
+
+ /* Find the number of bits needed for the magnitude of the coefficient */
+ nbits = 1; /* there must be at least one 1 bit */
+ while ((temp >>= 1))
+ nbits++;
+ /* Check for out-of-range coefficient values */
+ if (nbits > MAX_COEF_BITS)
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+ /* Count/emit Huffman symbol for run length / number of bits */
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
+
+ /* Emit that number of bits of the value, if positive, */
+ /* or the complement of its magnitude, if negative. */
+ emit_bits_e(entropy, (unsigned int) temp2, nbits);
+
+ r = 0; /* reset zero run length */
+ }
+
+ if (r > 0) { /* If there are trailing zeroes, */
+ entropy->EOBRUN++; /* count an EOB */
+ if (entropy->EOBRUN == 0x7FFF)
+ emit_eobrun(entropy); /* force it out to avoid overflow */
+ }
+
+ cinfo->dest->next_output_byte = entropy->next_output_byte;
+ cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+ /* Update restart-interval state too */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7;
+ }
+ entropy->restarts_to_go--;
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * MCU encoding for DC successive approximation refinement scan.
+ * Note: we assume such scans can be multi-component, although the spec
+ * is not very clear on the point.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ register int temp;
+ int blkn;
+ int Al = cinfo->Al;
+ JBLOCKROW block;
+
+ entropy->next_output_byte = cinfo->dest->next_output_byte;
+ entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval)
+ if (entropy->restarts_to_go == 0)
+ emit_restart_e(entropy, entropy->next_restart_num);
+
+ /* Encode the MCU data blocks */
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+
+ /* We simply emit the Al'th bit of the DC coefficient value. */
+ temp = (*block)[0];
+ emit_bits_e(entropy, (unsigned int) (temp >> Al), 1);
+ }
+
+ cinfo->dest->next_output_byte = entropy->next_output_byte;
+ cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+ /* Update restart-interval state too */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7;
+ }
+ entropy->restarts_to_go--;
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ register int temp;
+ register int r, k;
+ int EOB;
+ char *BR_buffer;
+ unsigned int BR;
+ int Se = cinfo->Se;
+ int Al = cinfo->Al;
+ JBLOCKROW block;
+ int absvalues[DCTSIZE2];
+
+ entropy->next_output_byte = cinfo->dest->next_output_byte;
+ entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval)
+ if (entropy->restarts_to_go == 0)
+ emit_restart_e(entropy, entropy->next_restart_num);
+
+ /* Encode the MCU data block */
+ block = MCU_data[0];
+
+ /* It is convenient to make a pre-pass to determine the transformed
+ * coefficients' absolute values and the EOB position.
+ */
+ EOB = 0;
+ for (k = cinfo->Ss; k <= Se; k++) {
+ temp = (*block)[jpeg_natural_order[k]];
+ /* We must apply the point transform by Al. For AC coefficients this
+ * is an integer division with rounding towards 0. To do this portably
+ * in C, we shift after obtaining the absolute value.
+ */
+ if (temp < 0)
+ temp = -temp; /* temp is abs value of input */
+ temp >>= Al; /* apply the point transform */
+ absvalues[k] = temp; /* save abs value for main pass */
+ if (temp == 1)
+ EOB = k; /* EOB = index of last newly-nonzero coef */
+ }
+
+ /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
+
+ r = 0; /* r = run length of zeros */
+ BR = 0; /* BR = count of buffered bits added now */
+ BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
+
+ for (k = cinfo->Ss; k <= Se; k++) {
+ if ((temp = absvalues[k]) == 0) {
+ r++;
+ continue;
+ }
+
+ /* Emit any required ZRLs, but not if they can be folded into EOB */
+ while (r > 15 && k <= EOB) {
+ /* emit any pending EOBRUN and the BE correction bits */
+ emit_eobrun(entropy);
+ /* Emit ZRL */
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
+ r -= 16;
+ /* Emit buffered correction bits that must be associated with ZRL */
+ emit_buffered_bits(entropy, BR_buffer, BR);
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
+ BR = 0;
+ }
+
+ /* If the coef was previously nonzero, it only needs a correction bit.
+ * NOTE: a straight translation of the spec's figure G.7 would suggest
+ * that we also need to test r > 15. But if r > 15, we can only get here
+ * if k > EOB, which implies that this coefficient is not 1.
+ */
+ if (temp > 1) {
+ /* The correction bit is the next bit of the absolute value. */
+ BR_buffer[BR++] = (char) (temp & 1);
+ continue;
+ }
+
+ /* Emit any pending EOBRUN and the BE correction bits */
+ emit_eobrun(entropy);
+
+ /* Count/emit Huffman symbol for run length / number of bits */
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
+
+ /* Emit output bit for newly-nonzero coef */
+ temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
+ emit_bits_e(entropy, (unsigned int) temp, 1);
+
+ /* Emit buffered correction bits that must be associated with this code */
+ emit_buffered_bits(entropy, BR_buffer, BR);
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
+ BR = 0;
+ r = 0; /* reset zero run length */
+ }
+
+ if (r > 0 || BR > 0) { /* If there are trailing zeroes, */
+ entropy->EOBRUN++; /* count an EOB */
+ entropy->BE += BR; /* concat my correction bits to older ones */
+ /* We force out the EOB if we risk either:
+ * 1. overflow of the EOB counter;
+ * 2. overflow of the correction bit buffer during the next MCU.
+ */
+ if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
+ emit_eobrun(entropy);
+ }
+
+ cinfo->dest->next_output_byte = entropy->next_output_byte;
+ cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+ /* Update restart-interval state too */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7;
+ }
+ entropy->restarts_to_go--;
+ }
+
+ return TRUE;
+}
+
+
/* Encode a single block's worth of coefficients */
LOCAL(boolean)
@@ -356,9 +909,9 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
register int temp, temp2;
register int nbits;
register int k, r, i;
-
+
/* Encode the DC coefficient difference per section F.1.2.1 */
-
+
temp = temp2 = block[0] - last_dc_val;
if (temp < 0) {
@@ -367,7 +920,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
/* This code assumes we are on a two's complement machine */
temp2--;
}
-
+
/* Find the number of bits needed for the magnitude of the coefficient */
nbits = 0;
while (temp) {
@@ -379,28 +932,28 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
*/
if (nbits > MAX_COEF_BITS+1)
ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-
+
/* Emit the Huffman-coded symbol for the number of bits */
- if (! emit_bits(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
+ if (! emit_bits_s(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
return FALSE;
/* Emit that number of bits of the value, if positive, */
/* or the complement of its magnitude, if negative. */
if (nbits) /* emit_bits rejects calls with size 0 */
- if (! emit_bits(state, (unsigned int) temp2, nbits))
+ if (! emit_bits_s(state, (unsigned int) temp2, nbits))
return FALSE;
/* Encode the AC coefficients per section F.1.2.2 */
-
+
r = 0; /* r = run length of zeros */
-
+
for (k = 1; k < DCTSIZE2; k++) {
if ((temp = block[jpeg_natural_order[k]]) == 0) {
r++;
} else {
/* if run length > 15, must emit special run-length-16 codes (0xF0) */
while (r > 15) {
- if (! emit_bits(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
+ if (! emit_bits_s(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
return FALSE;
r -= 16;
}
@@ -411,7 +964,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
/* This code assumes we are on a two's complement machine */
temp2--;
}
-
+
/* Find the number of bits needed for the magnitude of the coefficient */
nbits = 1; /* there must be at least one 1 bit */
while ((temp >>= 1))
@@ -419,24 +972,24 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
/* Check for out-of-range coefficient values */
if (nbits > MAX_COEF_BITS)
ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-
+
/* Emit Huffman symbol for run length / number of bits */
i = (r << 4) + nbits;
- if (! emit_bits(state, actbl->ehufco[i], actbl->ehufsi[i]))
+ if (! emit_bits_s(state, actbl->ehufco[i], actbl->ehufsi[i]))
return FALSE;
/* Emit that number of bits of the value, if positive, */
/* or the complement of its magnitude, if negative. */
- if (! emit_bits(state, (unsigned int) temp2, nbits))
+ if (! emit_bits_s(state, (unsigned int) temp2, nbits))
return FALSE;
-
+
r = 0;
}
}
/* If the last coef(s) were zero, emit an end-of-block code */
if (r > 0)
- if (! emit_bits(state, actbl->ehufco[0], actbl->ehufsi[0]))
+ if (! emit_bits_s(state, actbl->ehufco[0], actbl->ehufsi[0]))
return FALSE;
return TRUE;
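
For sequential scans, encode_one_block() above codes each nonzero AC coefficient as a composite run/size symbol: the preceding zero run in the high nibble (first reduced below 16 with ZRL symbols, 0xF0) and the magnitude category in the low nibble, followed by the magnitude bits themselves. A minimal illustration of the symbol composition:

    #include <stdio.h>

    /* Compose the run/size symbol used for a nonzero AC coefficient in
     * encode_one_block() above. zero_run must already be 0..15. */
    static int ac_run_size_symbol (int zero_run, int coef)
    {
      int temp = coef < 0 ? -coef : coef;
      int nbits = 0;
      while (temp) {               /* magnitude category of the coefficient */
        nbits++;
        temp >>= 1;
      }
      return (zero_run << 4) + nbits;
    }

    int main (void)
    {
      /* A coefficient of 3 preceded by 6 zeros codes as symbol 0x62,
       * followed by the 2 magnitude bits "11". */
      printf("0x%02X\n", ac_run_size_symbol(6, 3));
      return 0;
    }
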
@@ -444,31 +997,6 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
/*
- * Emit a restart marker & resynchronize predictions.
- */
-
-LOCAL(boolean)
-emit_restart (working_state * state, int restart_num)
-{
- int ci;
-
- if (! flush_bits(state))
- return FALSE;
-
- emit_byte(state, 0xFF, return FALSE);
- emit_byte(state, JPEG_RST0 + restart_num, return FALSE);
-
- /* Re-initialize DC predictions to 0 */
- for (ci = 0; ci < state->cinfo->comps_in_scan; ci++)
- state->cur.last_dc_val[ci] = 0;
-
- /* The restart counter is not updated until we successfully write the MCU. */
-
- return TRUE;
-}
-
-
-/*
* Encode and output one MCU's worth of Huffman-compressed coefficients.
*/
@@ -489,7 +1017,7 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Emit restart marker if needed */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! emit_restart(&state, entropy->next_restart_num))
+ if (! emit_restart_s(&state, entropy->next_restart_num))
return FALSE;
}
@@ -535,20 +1063,32 @@ finish_pass_huff (j_compress_ptr cinfo)
huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
working_state state;
- /* Load up working state ... flush_bits needs it */
- state.next_output_byte = cinfo->dest->next_output_byte;
- state.free_in_buffer = cinfo->dest->free_in_buffer;
- ASSIGN_STATE(state.cur, entropy->saved);
- state.cinfo = cinfo;
+ if (cinfo->progressive_mode) {
+ entropy->next_output_byte = cinfo->dest->next_output_byte;
+ entropy->free_in_buffer = cinfo->dest->free_in_buffer;
- /* Flush out the last data */
- if (! flush_bits(&state))
- ERREXIT(cinfo, JERR_CANT_SUSPEND);
+ /* Flush out any buffered data */
+ emit_eobrun(entropy);
+ flush_bits_e(entropy);
- /* Update state */
- cinfo->dest->next_output_byte = state.next_output_byte;
- cinfo->dest->free_in_buffer = state.free_in_buffer;
- ASSIGN_STATE(entropy->saved, state.cur);
+ cinfo->dest->next_output_byte = entropy->next_output_byte;
+ cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+ } else {
+ /* Load up working state ... flush_bits needs it */
+ state.next_output_byte = cinfo->dest->next_output_byte;
+ state.free_in_buffer = cinfo->dest->free_in_buffer;
+ ASSIGN_STATE(state.cur, entropy->saved);
+ state.cinfo = cinfo;
+
+ /* Flush out the last data */
+ if (! flush_bits_s(&state))
+ ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+ /* Update state */
+ cinfo->dest->next_output_byte = state.next_output_byte;
+ cinfo->dest->free_in_buffer = state.free_in_buffer;
+ ASSIGN_STATE(entropy->saved, state.cur);
+ }
}
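
Note the asymmetry above: the sequential path can suspend (flush_bits_s returns FALSE and the caller retries later), while the progressive path calls dump_buffer_e, which raises JERR_CANT_SUSPEND if empty_output_buffer cannot make room. A minimal sketch of an application-side destination manager that never suspends, using only the standard jpeg_destination_mgr callbacks (names are illustrative, not part of this patch):

    #include <stdio.h>
    #include "jpeglib.h"

    #define OUT_BUF_SIZE 4096

    static JOCTET out_buf[OUT_BUF_SIZE];
    static FILE * out_file;            /* assumed opened by the caller */
    static struct jpeg_destination_mgr dest;

    METHODDEF(void)
    my_init_destination (j_compress_ptr cinfo)
    {
      cinfo->dest->next_output_byte = out_buf;
      cinfo->dest->free_in_buffer = OUT_BUF_SIZE;
    }

    METHODDEF(boolean)
    my_empty_output_buffer (j_compress_ptr cinfo)
    {
      /* Always drain the whole buffer and return TRUE, so neither the
       * sequential nor the progressive encoder ever needs to suspend. */
      fwrite(out_buf, 1, OUT_BUF_SIZE, out_file);
      cinfo->dest->next_output_byte = out_buf;
      cinfo->dest->free_in_buffer = OUT_BUF_SIZE;
      return TRUE;
    }

    METHODDEF(void)
    my_term_destination (j_compress_ptr cinfo)
    {
      fwrite(out_buf, 1, OUT_BUF_SIZE - cinfo->dest->free_in_buffer, out_file);
    }

    void attach_never_suspending_dest (j_compress_ptr cinfo, FILE * f)
    {
      out_file = f;
      dest.init_destination = my_init_destination;
      dest.empty_output_buffer = my_empty_output_buffer;
      dest.term_destination = my_term_destination;
      cinfo->dest = &dest;
    }
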
@@ -563,8 +1103,6 @@ finish_pass_huff (j_compress_ptr cinfo)
* the compressed data.
*/
-#ifdef ENTROPY_OPT_SUPPORTED
-
/* Process a single block's worth of coefficients */
@@ -675,7 +1213,6 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/*
* Generate the best Huffman code table for the given counts, fill htbl.
- * Note this is also used by jcphuff.c.
*
* The JPEG standard requires that no symbol be assigned a codeword of all
* one bits (so that padding bits added at the end of a compressed segment
@@ -701,7 +1238,7 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
* So the extra complexity of an optimal algorithm doesn't seem worthwhile.
*/
-GLOBAL(void)
+LOCAL(void)
jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[])
{
#define MAX_CLEN 32 /* assumed maximum initial code length */
@@ -846,41 +1383,200 @@ METHODDEF(void)
finish_pass_gather (j_compress_ptr cinfo)
{
huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
- int ci, dctbl, actbl;
+ int ci, dctbl, actbl, tbl;
jpeg_component_info * compptr;
JHUFF_TBL **htblptr;
boolean did_dc[NUM_HUFF_TBLS];
boolean did_ac[NUM_HUFF_TBLS];
+ boolean did[NUM_HUFF_TBLS];
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
- MEMZERO(did_dc, SIZEOF(did_dc));
- MEMZERO(did_ac, SIZEOF(did_ac));
-
- for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
- compptr = cinfo->cur_comp_info[ci];
- dctbl = compptr->dc_tbl_no;
- actbl = compptr->ac_tbl_no;
- if (! did_dc[dctbl]) {
- htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
- if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
- jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
- did_dc[dctbl] = TRUE;
- }
- if (! did_ac[actbl]) {
- htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
- if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
- jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
- did_ac[actbl] = TRUE;
+ if (cinfo->progressive_mode) {
+ /* Flush out buffered data (all we care about is counting the EOB symbol) */
+ emit_eobrun(entropy);
+
+ MEMZERO(did, SIZEOF(did));
+
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ if (cinfo->Ss == 0) {
+ if (cinfo->Ah != 0) /* DC refinement needs no table */
+ continue;
+ tbl = compptr->dc_tbl_no;
+ } else {
+ tbl = compptr->ac_tbl_no;
+ }
+ if (! did[tbl]) {
+ if (cinfo->Ss == 0)
+ htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
+ else
+ htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
+ if (*htblptr == NULL)
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
+ did[tbl] = TRUE;
+ }
+ }
+ } else {
+ MEMZERO(did_dc, SIZEOF(did_dc));
+ MEMZERO(did_ac, SIZEOF(did_ac));
+
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ dctbl = compptr->dc_tbl_no;
+ actbl = compptr->ac_tbl_no;
+ if (! did_dc[dctbl]) {
+ htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
+ if (*htblptr == NULL)
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
+ did_dc[dctbl] = TRUE;
+ }
+ if (! did_ac[actbl]) {
+ htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
+ if (*htblptr == NULL)
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
+ did_ac[actbl] = TRUE;
+ }
}
}
}
-#endif /* ENTROPY_OPT_SUPPORTED */
+/*
+ * Initialize for a Huffman-compressed scan.
+ * If gather_statistics is TRUE, we do not output anything during the scan,
+ * just count the Huffman symbols used and generate Huffman code tables.
+ */
+
+METHODDEF(void)
+start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ int ci, dctbl, actbl, tbl;
+ jpeg_component_info * compptr;
+
+ if (gather_statistics)
+ entropy->pub.finish_pass = finish_pass_gather;
+ else
+ entropy->pub.finish_pass = finish_pass_huff;
+
+ if (cinfo->progressive_mode) {
+ entropy->cinfo = cinfo;
+ entropy->gather_statistics = gather_statistics;
+
+ /* We assume jcmaster.c already validated the scan parameters. */
+
+ /* Select execution routine */
+ if (cinfo->Ah == 0) {
+ if (cinfo->Ss == 0)
+ entropy->pub.encode_mcu = encode_mcu_DC_first;
+ else
+ entropy->pub.encode_mcu = encode_mcu_AC_first;
+ } else {
+ if (cinfo->Ss == 0)
+ entropy->pub.encode_mcu = encode_mcu_DC_refine;
+ else {
+ entropy->pub.encode_mcu = encode_mcu_AC_refine;
+ /* AC refinement needs a correction bit buffer */
+ if (entropy->bit_buffer == NULL)
+ entropy->bit_buffer = (char *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ MAX_CORR_BITS * SIZEOF(char));
+ }
+ }
+
+ /* Only DC coefficients may be interleaved, so cinfo->comps_in_scan = 1
+ * for AC coefficients.
+ */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ /* Initialize DC predictions to 0 */
+ entropy->saved.last_dc_val[ci] = 0;
+ /* Get table index */
+ if (cinfo->Ss == 0) {
+ if (cinfo->Ah != 0) /* DC refinement needs no table */
+ continue;
+ tbl = compptr->dc_tbl_no;
+ } else {
+ entropy->ac_tbl_no = tbl = compptr->ac_tbl_no;
+ }
+ if (gather_statistics) {
+ /* Check for invalid table index */
+ /* (make_c_derived_tbl does this in the other path) */
+ if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
+ ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
+ /* Allocate and zero the statistics tables */
+ /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
+ if (entropy->count_ptrs[tbl] == NULL)
+ entropy->count_ptrs[tbl] = (long *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ 257 * SIZEOF(long));
+ MEMZERO(entropy->count_ptrs[tbl], 257 * SIZEOF(long));
+ } else {
+ /* Compute derived values for Huffman table */
+ /* We may do this more than once for a table, but it's not expensive */
+ jpeg_make_c_derived_tbl(cinfo, cinfo->Ss == 0, tbl,
+ & entropy->derived_tbls[tbl]);
+ }
+ }
+
+ /* Initialize AC stuff */
+ entropy->EOBRUN = 0;
+ entropy->BE = 0;
+ } else {
+ if (gather_statistics)
+ entropy->pub.encode_mcu = encode_mcu_gather;
+ else
+ entropy->pub.encode_mcu = encode_mcu_huff;
+
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ dctbl = compptr->dc_tbl_no;
+ actbl = compptr->ac_tbl_no;
+ if (gather_statistics) {
+ /* Check for invalid table indexes */
+ /* (make_c_derived_tbl does this in the other path) */
+ if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS)
+ ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
+ if (actbl < 0 || actbl >= NUM_HUFF_TBLS)
+ ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl);
+ /* Allocate and zero the statistics tables */
+ /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
+ if (entropy->dc_count_ptrs[dctbl] == NULL)
+ entropy->dc_count_ptrs[dctbl] = (long *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ 257 * SIZEOF(long));
+ MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long));
+ if (entropy->ac_count_ptrs[actbl] == NULL)
+ entropy->ac_count_ptrs[actbl] = (long *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ 257 * SIZEOF(long));
+ MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long));
+ } else {
+ /* Compute derived values for Huffman tables */
+ /* We may do this more than once for a table, but it's not expensive */
+ jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
+ & entropy->dc_derived_tbls[dctbl]);
+ jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
+ & entropy->ac_derived_tbls[actbl]);
+ }
+ /* Initialize DC predictions to 0 */
+ entropy->saved.last_dc_val[ci] = 0;
+ }
+ }
+
+ /* Initialize bit buffer to empty */
+ entropy->saved.put_buffer = 0;
+ entropy->saved.put_bits = 0;
+
+ /* Initialize restart stuff */
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num = 0;
+}
/*
@@ -899,11 +1595,18 @@ jinit_huff_encoder (j_compress_ptr cinfo)
cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
entropy->pub.start_pass = start_pass_huff;
- /* Mark tables unallocated */
- for (i = 0; i < NUM_HUFF_TBLS; i++) {
- entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
-#ifdef ENTROPY_OPT_SUPPORTED
- entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL;
-#endif
+ if (cinfo->progressive_mode) {
+ /* Mark tables unallocated */
+ for (i = 0; i < NUM_HUFF_TBLS; i++) {
+ entropy->derived_tbls[i] = NULL;
+ entropy->count_ptrs[i] = NULL;
+ }
+ entropy->bit_buffer = NULL; /* needed only in AC refinement scan */
+ } else {
+ /* Mark tables unallocated */
+ for (i = 0; i < NUM_HUFF_TBLS; i++) {
+ entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
+ entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL;
+ }
}
}
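[Editor's note: with jcphuff.c folded into jchuff.c, progressive Huffman output is requested through the same public calls as before. A minimal sketch, assuming the usual jpeg_create_compress and destination setup has already been done:]

    #include <stdio.h>
    #include "jpeglib.h"

    static void setup_progressive_huffman (j_compress_ptr cinfo)
    {
      jpeg_set_defaults(cinfo);
      jpeg_simple_progression(cinfo);  /* standard progressive scan script */
      cinfo->optimize_coding = TRUE;   /* jcmaster.c forces this for progressive
                                          Huffman anyway; see its hunk below */
    }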
diff --git a/src/libjpeg/jcinit.c b/src/libjpeg/jcinit.c
index 5efffe3..0ba310f 100644
--- a/src/libjpeg/jcinit.c
+++ b/src/libjpeg/jcinit.c
@@ -41,17 +41,10 @@ jinit_compress_master (j_compress_ptr cinfo)
/* Forward DCT */
jinit_forward_dct(cinfo);
/* Entropy encoding: either Huffman or arithmetic coding. */
- if (cinfo->arith_code) {
- ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
- } else {
- if (cinfo->progressive_mode) {
-#ifdef C_PROGRESSIVE_SUPPORTED
- jinit_phuff_encoder(cinfo);
-#else
- ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
- } else
- jinit_huff_encoder(cinfo);
+ if (cinfo->arith_code)
+ jinit_arith_encoder(cinfo);
+ else {
+ jinit_huff_encoder(cinfo);
}
/* Need a full-image coefficient buffer in any multi-pass mode. */
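[Editor's note: since jinit_arith_encoder is now hooked up here, arithmetic coding is enabled through the existing public flag. A minimal sketch:]

    #include <stdio.h>
    #include "jpeglib.h"

    static void setup_arithmetic_coding (j_compress_ptr cinfo)
    {
      jpeg_set_defaults(cinfo);
      cinfo->arith_code = TRUE;   /* jcmarker.c then emits SOF9 (sequential) or
                                     SOF10 (progressive) instead of SOF0/SOF2 */
    }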
diff --git a/src/libjpeg/jcmainct.c b/src/libjpeg/jcmainct.c
index e0279a7..7de75d1 100644
--- a/src/libjpeg/jcmainct.c
+++ b/src/libjpeg/jcmainct.c
@@ -118,17 +118,17 @@ process_data_simple_main (j_compress_ptr cinfo,
while (main->cur_iMCU_row < cinfo->total_iMCU_rows) {
/* Read input data if we haven't filled the main buffer yet */
- if (main->rowgroup_ctr < DCTSIZE)
+ if (main->rowgroup_ctr < (JDIMENSION) cinfo->min_DCT_v_scaled_size)
(*cinfo->prep->pre_process_data) (cinfo,
input_buf, in_row_ctr, in_rows_avail,
main->buffer, &main->rowgroup_ctr,
- (JDIMENSION) DCTSIZE);
+ (JDIMENSION) cinfo->min_DCT_v_scaled_size);
/* If we don't have a full iMCU row buffered, return to application for
* more data. Note that preprocessor will always pad to fill the iMCU row
* at the bottom of the image.
*/
- if (main->rowgroup_ctr != DCTSIZE)
+ if (main->rowgroup_ctr != (JDIMENSION) cinfo->min_DCT_v_scaled_size)
return;
/* Send the completed row to the compressor */
@@ -269,10 +269,10 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
ci++, compptr++) {
main->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
- compptr->width_in_blocks * DCTSIZE,
+ compptr->width_in_blocks * compptr->DCT_h_scaled_size,
(JDIMENSION) jround_up((long) compptr->height_in_blocks,
(long) compptr->v_samp_factor) * DCTSIZE,
- (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+ (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size));
}
#else
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -286,8 +286,8 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
ci++, compptr++) {
main->buffer[ci] = (*cinfo->mem->alloc_sarray)
((j_common_ptr) cinfo, JPOOL_IMAGE,
- compptr->width_in_blocks * DCTSIZE,
- (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+ compptr->width_in_blocks * compptr->DCT_h_scaled_size,
+ (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size));
}
}
}
diff --git a/src/libjpeg/jcmarker.c b/src/libjpeg/jcmarker.c
index 3d1e6c6..54d109a 100644
--- a/src/libjpeg/jcmarker.c
+++ b/src/libjpeg/jcmarker.c
@@ -285,13 +285,13 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
/* Make sure image isn't bigger than SOF field can handle */
- if ((long) cinfo->image_height > 65535L ||
- (long) cinfo->image_width > 65535L)
+ if ((long) cinfo->jpeg_height > 65535L ||
+ (long) cinfo->jpeg_width > 65535L)
ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);
emit_byte(cinfo, cinfo->data_precision);
- emit_2bytes(cinfo, (int) cinfo->image_height);
- emit_2bytes(cinfo, (int) cinfo->image_width);
+ emit_2bytes(cinfo, (int) cinfo->jpeg_height);
+ emit_2bytes(cinfo, (int) cinfo->jpeg_width);
emit_byte(cinfo, cinfo->num_components);
@@ -529,7 +529,10 @@ write_frame_header (j_compress_ptr cinfo)
/* Emit the proper SOF marker */
if (cinfo->arith_code) {
- emit_sof(cinfo, M_SOF9); /* SOF code for arithmetic coding */
+ if (cinfo->progressive_mode)
+ emit_sof(cinfo, M_SOF10); /* SOF code for progressive arithmetic */
+ else
+ emit_sof(cinfo, M_SOF9); /* SOF code for sequential arithmetic */
} else {
if (cinfo->progressive_mode)
emit_sof(cinfo, M_SOF2); /* SOF code for progressive Huffman */
diff --git a/src/libjpeg/jcmaster.c b/src/libjpeg/jcmaster.c
index aab4020..db3e61a 100644
--- a/src/libjpeg/jcmaster.c
+++ b/src/libjpeg/jcmaster.c
@@ -2,6 +2,7 @@
* jcmaster.c
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -42,23 +43,172 @@ typedef my_comp_master * my_master_ptr;
* Support routines that do various essential calculations.
*/
+/*
+ * Compute JPEG image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+/* Do computations that are needed before master selection phase */
+{
+#ifdef DCT_SCALING_SUPPORTED
+
+ /* Compute actual JPEG image dimensions and DCT scaling choices. */
+ if (cinfo->scale_num >= cinfo->scale_denom * 8) {
+ /* Provide 8/1 scaling */
+ cinfo->jpeg_width = cinfo->image_width << 3;
+ cinfo->jpeg_height = cinfo->image_height << 3;
+ cinfo->min_DCT_h_scaled_size = 1;
+ cinfo->min_DCT_v_scaled_size = 1;
+ } else if (cinfo->scale_num >= cinfo->scale_denom * 4) {
+ /* Provide 4/1 scaling */
+ cinfo->jpeg_width = cinfo->image_width << 2;
+ cinfo->jpeg_height = cinfo->image_height << 2;
+ cinfo->min_DCT_h_scaled_size = 2;
+ cinfo->min_DCT_v_scaled_size = 2;
+ } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 8) {
+ /* Provide 8/3 scaling */
+ cinfo->jpeg_width = (cinfo->image_width << 1) + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 2, 3L);
+ cinfo->jpeg_height = (cinfo->image_height << 1) + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 2, 3L);
+ cinfo->min_DCT_h_scaled_size = 3;
+ cinfo->min_DCT_v_scaled_size = 3;
+ } else if (cinfo->scale_num >= cinfo->scale_denom * 2) {
+ /* Provide 2/1 scaling */
+ cinfo->jpeg_width = cinfo->image_width << 1;
+ cinfo->jpeg_height = cinfo->image_height << 1;
+ cinfo->min_DCT_h_scaled_size = 4;
+ cinfo->min_DCT_v_scaled_size = 4;
+ } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * 8) {
+ /* Provide 8/5 scaling */
+ cinfo->jpeg_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 3, 5L);
+ cinfo->jpeg_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 3, 5L);
+ cinfo->min_DCT_h_scaled_size = 5;
+ cinfo->min_DCT_v_scaled_size = 5;
+ } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 4) {
+ /* Provide 4/3 scaling */
+ cinfo->jpeg_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width, 3L);
+ cinfo->jpeg_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height, 3L);
+ cinfo->min_DCT_h_scaled_size = 6;
+ cinfo->min_DCT_v_scaled_size = 6;
+ } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * 8) {
+ /* Provide 8/7 scaling */
+ cinfo->jpeg_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width, 7L);
+ cinfo->jpeg_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height, 7L);
+ cinfo->min_DCT_h_scaled_size = 7;
+ cinfo->min_DCT_v_scaled_size = 7;
+ } else if (cinfo->scale_num >= cinfo->scale_denom) {
+ /* Provide 1/1 scaling */
+ cinfo->jpeg_width = cinfo->image_width;
+ cinfo->jpeg_height = cinfo->image_height;
+ cinfo->min_DCT_h_scaled_size = DCTSIZE;
+ cinfo->min_DCT_v_scaled_size = DCTSIZE;
+ } else if (cinfo->scale_num * 9 >= cinfo->scale_denom * 8) {
+ /* Provide 8/9 scaling */
+ cinfo->jpeg_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 8, 9L);
+ cinfo->jpeg_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 8, 9L);
+ cinfo->min_DCT_h_scaled_size = 9;
+ cinfo->min_DCT_v_scaled_size = 9;
+ } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * 4) {
+ /* Provide 4/5 scaling */
+ cinfo->jpeg_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 4, 5L);
+ cinfo->jpeg_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 4, 5L);
+ cinfo->min_DCT_h_scaled_size = 10;
+ cinfo->min_DCT_v_scaled_size = 10;
+ } else if (cinfo->scale_num * 11 >= cinfo->scale_denom * 8) {
+ /* Provide 8/11 scaling */
+ cinfo->jpeg_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 8, 11L);
+ cinfo->jpeg_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 8, 11L);
+ cinfo->min_DCT_h_scaled_size = 11;
+ cinfo->min_DCT_v_scaled_size = 11;
+ } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 2) {
+ /* Provide 2/3 scaling */
+ cinfo->jpeg_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 2, 3L);
+ cinfo->jpeg_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 2, 3L);
+ cinfo->min_DCT_h_scaled_size = 12;
+ cinfo->min_DCT_v_scaled_size = 12;
+ } else if (cinfo->scale_num * 13 >= cinfo->scale_denom * 8) {
+ /* Provide 8/13 scaling */
+ cinfo->jpeg_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 8, 13L);
+ cinfo->jpeg_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 8, 13L);
+ cinfo->min_DCT_h_scaled_size = 13;
+ cinfo->min_DCT_v_scaled_size = 13;
+ } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * 4) {
+ /* Provide 4/7 scaling */
+ cinfo->jpeg_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 4, 7L);
+ cinfo->jpeg_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 4, 7L);
+ cinfo->min_DCT_h_scaled_size = 14;
+ cinfo->min_DCT_v_scaled_size = 14;
+ } else if (cinfo->scale_num * 15 >= cinfo->scale_denom * 8) {
+ /* Provide 8/15 scaling */
+ cinfo->jpeg_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 8, 15L);
+ cinfo->jpeg_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 8, 15L);
+ cinfo->min_DCT_h_scaled_size = 15;
+ cinfo->min_DCT_v_scaled_size = 15;
+ } else {
+ /* Provide 1/2 scaling */
+ cinfo->jpeg_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width, 2L);
+ cinfo->jpeg_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height, 2L);
+ cinfo->min_DCT_h_scaled_size = 16;
+ cinfo->min_DCT_v_scaled_size = 16;
+ }
+
+#else /* !DCT_SCALING_SUPPORTED */
+
+ /* Hardwire it to "no scaling" */
+ cinfo->jpeg_width = cinfo->image_width;
+ cinfo->jpeg_height = cinfo->image_height;
+ cinfo->min_DCT_h_scaled_size = DCTSIZE;
+ cinfo->min_DCT_v_scaled_size = DCTSIZE;
+
+#endif /* DCT_SCALING_SUPPORTED */
+}
+
+
LOCAL(void)
initial_setup (j_compress_ptr cinfo)
/* Do computations that are needed before master selection phase */
{
- int ci;
+ int ci, ssize;
jpeg_component_info *compptr;
long samplesperrow;
JDIMENSION jd_samplesperrow;
+ jpeg_calc_jpeg_dimensions(cinfo);
+
/* Sanity check on image dimensions */
- if (cinfo->image_height <= 0 || cinfo->image_width <= 0
+ if (cinfo->jpeg_height <= 0 || cinfo->jpeg_width <= 0
|| cinfo->num_components <= 0 || cinfo->input_components <= 0)
ERREXIT(cinfo, JERR_EMPTY_IMAGE);
/* Make sure image isn't bigger than I can handle */
- if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
- (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
+ if ((long) cinfo->jpeg_height > (long) JPEG_MAX_DIMENSION ||
+ (long) cinfo->jpeg_width > (long) JPEG_MAX_DIMENSION)
ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
/* Width of an input scanline must be representable as JDIMENSION. */
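[Editor's note: a worked example of the ladder above, with hypothetical numbers. Requesting 1/2 scaling on a 1001 x 769 input falls through to the final branch, giving jpeg_width = ceil(1001/2) = 501, jpeg_height = 385, and 16x16 DCT blocks (16 input samples reduce to 8 coefficients per axis). A sketch of the corresponding setup:]

    #include <stdio.h>
    #include "jpeglib.h"

    static void request_half_size_compression (j_compress_ptr cinfo)
    {
      cinfo->image_width = 1001;      /* hypothetical source dimensions */
      cinfo->image_height = 769;
      cinfo->input_components = 3;
      cinfo->in_color_space = JCS_RGB;
      jpeg_set_defaults(cinfo);       /* resets scale_num/scale_denom to 1/1 */
      cinfo->scale_num = 1;           /* then request 1/2 scaling */
      cinfo->scale_denom = 2;
    }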
@@ -95,22 +245,52 @@ initial_setup (j_compress_ptr cinfo)
ci++, compptr++) {
/* Fill in the correct component_index value; don't rely on application */
compptr->component_index = ci;
- /* For compression, we never do DCT scaling. */
- compptr->DCT_scaled_size = DCTSIZE;
+ /* In selecting the actual DCT scaling for each component, we try to
+ * scale down the chroma components via DCT scaling rather than downsampling.
+ * This saves time if the downsampler gets to use 1:1 scaling.
+ * Note this code adapts subsampling ratios which are powers of 2.
+ */
+ ssize = 1;
+#ifdef DCT_SCALING_SUPPORTED
+ while (cinfo->min_DCT_h_scaled_size * ssize <=
+ (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
+ (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) {
+ ssize = ssize * 2;
+ }
+#endif
+ compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize;
+ ssize = 1;
+#ifdef DCT_SCALING_SUPPORTED
+ while (cinfo->min_DCT_v_scaled_size * ssize <=
+ (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
+ (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) {
+ ssize = ssize * 2;
+ }
+#endif
+ compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize;
+
+ /* We don't support DCT ratios larger than 2. */
+ if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2)
+ compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2;
+ else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2)
+ compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2;
+
/* Size in DCT blocks */
compptr->width_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
+ jdiv_round_up((long) cinfo->jpeg_width * (long) compptr->h_samp_factor,
(long) (cinfo->max_h_samp_factor * DCTSIZE));
compptr->height_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
+ jdiv_round_up((long) cinfo->jpeg_height * (long) compptr->v_samp_factor,
(long) (cinfo->max_v_samp_factor * DCTSIZE));
/* Size in samples */
compptr->downsampled_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
- (long) cinfo->max_h_samp_factor);
+ jdiv_round_up((long) cinfo->jpeg_width *
+ (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
+ (long) (cinfo->max_h_samp_factor * DCTSIZE));
compptr->downsampled_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
- (long) cinfo->max_v_samp_factor);
+ jdiv_round_up((long) cinfo->jpeg_height *
+ (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size),
+ (long) (cinfo->max_v_samp_factor * DCTSIZE));
/* Mark component needed (this flag isn't actually used for compression) */
compptr->component_needed = TRUE;
}
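[Editor's note: to see what the ssize loop does in the common 4:2:0 case with fancy downsampling at 1:1 scaling (min_DCT_h_scaled_size = 8): luminance has h_samp_factor = 2, the modulus test fails immediately and it keeps an 8-point DCT, while chroma (h_samp_factor = 1) doubles once to a 16-point DCT, folding the 2:1 reduction into the FDCT itself. A standalone sketch of that arithmetic, not libjpeg code:]

    #include <stdio.h>

    #define DCTSIZE 8

    int main (void)
    {
      int min_DCT_h_scaled_size = DCTSIZE;   /* 1:1 scaling selected above */
      int max_h_samp_factor = 2;             /* 4:2:0 */
      int h_samp_factor = 1;                 /* chroma component */
      int do_fancy_downsampling = 1;
      int ssize = 1;

      while (min_DCT_h_scaled_size * ssize <=
             (do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
             (max_h_samp_factor % (h_samp_factor * ssize * 2)) == 0)
        ssize = ssize * 2;

      printf("chroma DCT_h_scaled_size = %d\n",
             min_DCT_h_scaled_size * ssize);   /* prints 16 */
      return 0;
    }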
@@ -119,7 +299,7 @@ initial_setup (j_compress_ptr cinfo)
* main controller will call coefficient controller).
*/
cinfo->total_iMCU_rows = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height,
+ jdiv_round_up((long) cinfo->jpeg_height,
(long) (cinfo->max_v_samp_factor*DCTSIZE));
}
@@ -325,7 +505,7 @@ per_scan_setup (j_compress_ptr cinfo)
compptr->MCU_width = 1;
compptr->MCU_height = 1;
compptr->MCU_blocks = 1;
- compptr->MCU_sample_width = DCTSIZE;
+ compptr->MCU_sample_width = compptr->DCT_h_scaled_size;
compptr->last_col_width = 1;
/* For noninterleaved scans, it is convenient to define last_row_height
* as the number of block rows present in the last iMCU row.
@@ -347,10 +527,10 @@ per_scan_setup (j_compress_ptr cinfo)
/* Overall image size in MCUs */
cinfo->MCUs_per_row = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width,
+ jdiv_round_up((long) cinfo->jpeg_width,
(long) (cinfo->max_h_samp_factor*DCTSIZE));
cinfo->MCU_rows_in_scan = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height,
+ jdiv_round_up((long) cinfo->jpeg_height,
(long) (cinfo->max_v_samp_factor*DCTSIZE));
cinfo->blocks_in_MCU = 0;
@@ -361,7 +541,7 @@ per_scan_setup (j_compress_ptr cinfo)
compptr->MCU_width = compptr->h_samp_factor;
compptr->MCU_height = compptr->v_samp_factor;
compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
- compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
+ compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size;
/* Figure number of non-dummy blocks in last MCU column & row */
tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
if (tmp == 0) tmp = compptr->MCU_width;
@@ -433,7 +613,7 @@ prepare_for_pass (j_compress_ptr cinfo)
/* Do Huffman optimization for a scan after the first one. */
select_scan_parameters(cinfo);
per_scan_setup(cinfo);
- if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) {
+ if (cinfo->Ss != 0 || cinfo->Ah == 0) {
(*cinfo->entropy->start_pass) (cinfo, TRUE);
(*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST);
master->pub.call_pass_startup = FALSE;
@@ -567,7 +747,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
cinfo->num_scans = 1;
}
- if (cinfo->progressive_mode) /* TEMPORARY HACK ??? */
+ if (cinfo->progressive_mode && cinfo->arith_code == 0) /* TEMPORARY HACK ??? */
cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */
/* Initialize my private state */
diff --git a/src/libjpeg/jcparam.c b/src/libjpeg/jcparam.c
index 6fc48f5..c5e85dd 100644
--- a/src/libjpeg/jcparam.c
+++ b/src/libjpeg/jcparam.c
@@ -2,6 +2,7 @@
* jcparam.c
*
* Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2008 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -60,6 +61,47 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
}
+/* These are the sample quantization tables given in JPEG spec section K.1.
+ * The spec says that the values given produce "good" quality, and
+ * when divided by 2, "very good" quality.
+ */
+static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
+ 16, 11, 10, 16, 24, 40, 51, 61,
+ 12, 12, 14, 19, 26, 58, 60, 55,
+ 14, 13, 16, 24, 40, 57, 69, 56,
+ 14, 17, 22, 29, 51, 87, 80, 62,
+ 18, 22, 37, 56, 68, 109, 103, 77,
+ 24, 35, 55, 64, 81, 104, 113, 92,
+ 49, 64, 78, 87, 103, 121, 120, 101,
+ 72, 92, 95, 98, 112, 100, 103, 99
+};
+static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
+ 17, 18, 24, 47, 99, 99, 99, 99,
+ 18, 21, 26, 66, 99, 99, 99, 99,
+ 24, 26, 56, 99, 99, 99, 99, 99,
+ 47, 66, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99
+};
+
+
+GLOBAL(void)
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+/* Set or change the 'quality' (quantization) setting, using default tables
+ * and straight percentage-scaling quality scales.
+ * This entry point allows different scalings for luminance and chrominance.
+ */
+{
+ /* Set up two quantization tables using the specified scaling */
+ jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+ cinfo->q_scale_factor[0], force_baseline);
+ jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+ cinfo->q_scale_factor[1], force_baseline);
+}
+
+
GLOBAL(void)
jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
boolean force_baseline)
@@ -69,31 +111,6 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
* applications that insist on a linear percentage scaling.
*/
{
- /* These are the sample quantization tables given in JPEG spec section K.1.
- * The spec says that the values given produce "good" quality, and
- * when divided by 2, "very good" quality.
- */
- static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
- 16, 11, 10, 16, 24, 40, 51, 61,
- 12, 12, 14, 19, 26, 58, 60, 55,
- 14, 13, 16, 24, 40, 57, 69, 56,
- 14, 17, 22, 29, 51, 87, 80, 62,
- 18, 22, 37, 56, 68, 109, 103, 77,
- 24, 35, 55, 64, 81, 104, 113, 92,
- 49, 64, 78, 87, 103, 121, 120, 101,
- 72, 92, 95, 98, 112, 100, 103, 99
- };
- static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
- 17, 18, 24, 47, 99, 99, 99, 99,
- 18, 21, 26, 66, 99, 99, 99, 99,
- 24, 26, 56, 99, 99, 99, 99, 99,
- 47, 66, 99, 99, 99, 99, 99, 99,
- 99, 99, 99, 99, 99, 99, 99, 99,
- 99, 99, 99, 99, 99, 99, 99, 99,
- 99, 99, 99, 99, 99, 99, 99, 99,
- 99, 99, 99, 99, 99, 99, 99, 99
- };
-
/* Set up two quantization tables using the specified scaling */
jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
scale_factor, force_baseline);
@@ -284,6 +301,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)
/* Initialize everything not dependent on the color space */
+ cinfo->scale_num = 1; /* 1:1 scaling */
+ cinfo->scale_denom = 1;
cinfo->data_precision = BITS_IN_JSAMPLE;
/* Set up two quantization tables using default quality of 75 */
jpeg_set_quality(cinfo, 75, TRUE);
@@ -320,6 +339,9 @@ jpeg_set_defaults (j_compress_ptr cinfo)
/* By default, use the simpler non-cosited sampling alignment */
cinfo->CCIR601_sampling = FALSE;
+ /* By default, apply fancy downsampling */
+ cinfo->do_fancy_downsampling = TRUE;
+
/* No input smoothing */
cinfo->smoothing_factor = 0;
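[Editor's note: the new jpeg_default_qtables entry point reads the per-table q_scale_factor[] values set on the compress object. A minimal sketch giving luminance and chrominance different qualities; the quality numbers are arbitrary:]

    #include <stdio.h>
    #include "jpeglib.h"

    static void set_split_quality (j_compress_ptr cinfo)
    {
      jpeg_set_defaults(cinfo);
      cinfo->q_scale_factor[0] = jpeg_quality_scaling(90);  /* luminance table   */
      cinfo->q_scale_factor[1] = jpeg_quality_scaling(70);  /* chrominance table */
      jpeg_default_qtables(cinfo, TRUE);                    /* force_baseline    */
    }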
diff --git a/src/libjpeg/jcprepct.c b/src/libjpeg/jcprepct.c
index fa93333..be44cc4 100644
--- a/src/libjpeg/jcprepct.c
+++ b/src/libjpeg/jcprepct.c
@@ -173,10 +173,12 @@ pre_process_data (j_compress_ptr cinfo,
*out_row_group_ctr < out_row_groups_avail) {
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
+ numrows = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+ cinfo->min_DCT_v_scaled_size;
expand_bottom_edge(output_buf[ci],
- compptr->width_in_blocks * DCTSIZE,
- (int) (*out_row_group_ctr * compptr->v_samp_factor),
- (int) (out_row_groups_avail * compptr->v_samp_factor));
+ compptr->width_in_blocks * compptr->DCT_h_scaled_size,
+ (int) (*out_row_group_ctr * numrows),
+ (int) (out_row_groups_avail * numrows));
}
*out_row_group_ctr = out_row_groups_avail;
break; /* can exit outer loop without test */
@@ -288,7 +290,8 @@ create_context_buffer (j_compress_ptr cinfo)
*/
true_buffer = (*cinfo->mem->alloc_sarray)
((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+ (JDIMENSION) (((long) compptr->width_in_blocks *
+ cinfo->min_DCT_h_scaled_size *
cinfo->max_h_samp_factor) / compptr->h_samp_factor),
(JDIMENSION) (3 * rgroup_height));
/* Copy true buffer row pointers into the middle of the fake row array */
@@ -346,7 +349,8 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
ci++, compptr++) {
prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+ (JDIMENSION) (((long) compptr->width_in_blocks *
+ cinfo->min_DCT_h_scaled_size *
cinfo->max_h_samp_factor) / compptr->h_samp_factor),
(JDIMENSION) cinfo->max_v_samp_factor);
}
diff --git a/src/libjpeg/jcsample.c b/src/libjpeg/jcsample.c
index 212ec87..4d36f85 100644
--- a/src/libjpeg/jcsample.c
+++ b/src/libjpeg/jcsample.c
@@ -62,6 +62,15 @@ typedef struct {
/* Downsampling method pointers, one per component */
downsample1_ptr methods[MAX_COMPONENTS];
+
+ /* Height of an output row group for each component. */
+ int rowgroup_height[MAX_COMPONENTS];
+
+ /* These arrays save pixel expansion factors so that int_downsample need not
+ * recompute them each time. They are unused for other downsampling methods.
+ */
+ UINT8 h_expand[MAX_COMPONENTS];
+ UINT8 v_expand[MAX_COMPONENTS];
} my_downsampler;
typedef my_downsampler * my_downsample_ptr;
@@ -123,7 +132,8 @@ sep_downsample (j_compress_ptr cinfo,
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
in_ptr = input_buf[ci] + in_row_index;
- out_ptr = output_buf[ci] + (out_row_group_index * compptr->v_samp_factor);
+ out_ptr = output_buf[ci] +
+ (out_row_group_index * downsample->rowgroup_height[ci]);
(*downsample->methods[ci]) (cinfo, compptr, in_ptr, out_ptr);
}
}
@@ -140,14 +150,15 @@ METHODDEF(void)
int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
+ my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
JDIMENSION outcol, outcol_h; /* outcol_h == outcol*h_expand */
- JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+ JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
JSAMPROW inptr, outptr;
INT32 outvalue;
- h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
- v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
+ h_expand = downsample->h_expand[compptr->component_index];
+ v_expand = downsample->v_expand[compptr->component_index];
numpix = h_expand * v_expand;
numpix2 = numpix/2;
@@ -158,8 +169,8 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
expand_right_edge(input_data, cinfo->max_v_samp_factor,
cinfo->image_width, output_cols * h_expand);
- inrow = 0;
- for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+ inrow = outrow = 0;
+ while (inrow < cinfo->max_v_samp_factor) {
outptr = output_data[outrow];
for (outcol = 0, outcol_h = 0; outcol < output_cols;
outcol++, outcol_h += h_expand) {
@@ -173,6 +184,7 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
*outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
}
inrow += v_expand;
+ outrow++;
}
}
@@ -191,8 +203,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
jcopy_sample_rows(input_data, 0, output_data, 0,
cinfo->max_v_samp_factor, cinfo->image_width);
/* Edge-expand */
- expand_right_edge(output_data, cinfo->max_v_samp_factor,
- cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+ expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ compptr->width_in_blocks * compptr->DCT_h_scaled_size);
}
@@ -212,9 +224,9 @@ METHODDEF(void)
h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
- int outrow;
+ int inrow;
JDIMENSION outcol;
- JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+ JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
register JSAMPROW inptr, outptr;
register int bias;
@@ -225,9 +237,9 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
expand_right_edge(input_data, cinfo->max_v_samp_factor,
cinfo->image_width, output_cols * 2);
- for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
- outptr = output_data[outrow];
- inptr = input_data[outrow];
+ for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+ outptr = output_data[inrow];
+ inptr = input_data[inrow];
bias = 0; /* bias = 0,1,0,1,... for successive samples */
for (outcol = 0; outcol < output_cols; outcol++) {
*outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
@@ -251,7 +263,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
{
int inrow, outrow;
JDIMENSION outcol;
- JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+ JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
register JSAMPROW inptr0, inptr1, outptr;
register int bias;
@@ -262,8 +274,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
expand_right_edge(input_data, cinfo->max_v_samp_factor,
cinfo->image_width, output_cols * 2);
- inrow = 0;
- for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+ inrow = outrow = 0;
+ while (inrow < cinfo->max_v_samp_factor) {
outptr = output_data[outrow];
inptr0 = input_data[inrow];
inptr1 = input_data[inrow+1];
@@ -276,6 +288,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
inptr0 += 2; inptr1 += 2;
}
inrow += 2;
+ outrow++;
}
}
@@ -294,7 +307,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
{
int inrow, outrow;
JDIMENSION colctr;
- JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+ JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
INT32 membersum, neighsum, memberscale, neighscale;
@@ -321,8 +334,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
memberscale = 16384 - cinfo->smoothing_factor * 80; /* scaled (1-5*SF)/4 */
neighscale = cinfo->smoothing_factor * 16; /* scaled SF/4 */
- inrow = 0;
- for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+ inrow = outrow = 0;
+ while (inrow < cinfo->max_v_samp_factor) {
outptr = output_data[outrow];
inptr0 = input_data[inrow];
inptr1 = input_data[inrow+1];
@@ -378,6 +391,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
*outptr = (JSAMPLE) ((membersum + 32768) >> 16);
inrow += 2;
+ outrow++;
}
}
@@ -392,9 +406,9 @@ METHODDEF(void)
fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
- int outrow;
+ int inrow;
JDIMENSION colctr;
- JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+ JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
register JSAMPROW inptr, above_ptr, below_ptr, outptr;
INT32 membersum, neighsum, memberscale, neighscale;
int colsum, lastcolsum, nextcolsum;
@@ -415,11 +429,11 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
memberscale = 65536L - cinfo->smoothing_factor * 512L; /* scaled 1-8*SF */
neighscale = cinfo->smoothing_factor * 64; /* scaled SF */
- for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
- outptr = output_data[outrow];
- inptr = input_data[outrow];
- above_ptr = input_data[outrow-1];
- below_ptr = input_data[outrow+1];
+ for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+ outptr = output_data[inrow];
+ inptr = input_data[inrow];
+ above_ptr = input_data[inrow-1];
+ below_ptr = input_data[inrow+1];
/* Special case for first column */
colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
@@ -467,6 +481,7 @@ jinit_downsampler (j_compress_ptr cinfo)
int ci;
jpeg_component_info * compptr;
boolean smoothok = TRUE;
+ int h_in_group, v_in_group, h_out_group, v_out_group;
downsample = (my_downsample_ptr)
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -482,8 +497,17 @@ jinit_downsampler (j_compress_ptr cinfo)
/* Verify we can handle the sampling factors, and set up method pointers */
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- if (compptr->h_samp_factor == cinfo->max_h_samp_factor &&
- compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+ /* Compute size of an "output group" for DCT scaling. This many samples
+ * are to be converted from max_h_samp_factor * max_v_samp_factor pixels.
+ */
+ h_out_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) /
+ cinfo->min_DCT_h_scaled_size;
+ v_out_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+ cinfo->min_DCT_v_scaled_size;
+ h_in_group = cinfo->max_h_samp_factor;
+ v_in_group = cinfo->max_v_samp_factor;
+ downsample->rowgroup_height[ci] = v_out_group; /* save for use later */
+ if (h_in_group == h_out_group && v_in_group == v_out_group) {
#ifdef INPUT_SMOOTHING_SUPPORTED
if (cinfo->smoothing_factor) {
downsample->methods[ci] = fullsize_smooth_downsample;
@@ -491,12 +515,12 @@ jinit_downsampler (j_compress_ptr cinfo)
} else
#endif
downsample->methods[ci] = fullsize_downsample;
- } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
- compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+ } else if (h_in_group == h_out_group * 2 &&
+ v_in_group == v_out_group) {
smoothok = FALSE;
downsample->methods[ci] = h2v1_downsample;
- } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
- compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
+ } else if (h_in_group == h_out_group * 2 &&
+ v_in_group == v_out_group * 2) {
#ifdef INPUT_SMOOTHING_SUPPORTED
if (cinfo->smoothing_factor) {
downsample->methods[ci] = h2v2_smooth_downsample;
@@ -504,10 +528,12 @@ jinit_downsampler (j_compress_ptr cinfo)
} else
#endif
downsample->methods[ci] = h2v2_downsample;
- } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
- (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
+ } else if ((h_in_group % h_out_group) == 0 &&
+ (v_in_group % v_out_group) == 0) {
smoothok = FALSE;
downsample->methods[ci] = int_downsample;
+ downsample->h_expand[ci] = (UINT8) (h_in_group / h_out_group);
+ downsample->v_expand[ci] = (UINT8) (v_in_group / v_out_group);
} else
ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
}
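[Editor's note: continuing the 4:2:0 example with the chroma DCT sizes chosen in jcmaster.c (16x16), the output group works out equal to the input group, so fullsize_downsample is selected and the 2:1 reduction really does happen inside the FDCT. A standalone sketch of the group arithmetic, with hypothetical values:]

    #include <stdio.h>

    int main (void)
    {
      int max_h_samp_factor = 2, max_v_samp_factor = 2;   /* input group (4:2:0)  */
      int h_samp_factor = 1, v_samp_factor = 1;           /* chroma component     */
      int DCT_h_scaled_size = 16, DCT_v_scaled_size = 16; /* chosen in jcmaster.c */
      int min_DCT_h_scaled_size = 8, min_DCT_v_scaled_size = 8;

      int h_out_group = (h_samp_factor * DCT_h_scaled_size) / min_DCT_h_scaled_size;
      int v_out_group = (v_samp_factor * DCT_v_scaled_size) / min_DCT_v_scaled_size;

      printf("in group %dx%d, out group %dx%d\n",
             max_h_samp_factor, max_v_samp_factor, h_out_group, v_out_group);
      return 0;            /* 2x2 vs 2x2 -> fullsize_downsample is used */
    }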
diff --git a/src/libjpeg/jctrans.c b/src/libjpeg/jctrans.c
index 0e6d707..74bb12c 100644
--- a/src/libjpeg/jctrans.c
+++ b/src/libjpeg/jctrans.c
@@ -167,16 +167,9 @@ transencode_master_selection (j_compress_ptr cinfo,
/* Entropy encoding: either Huffman or arithmetic coding. */
if (cinfo->arith_code) {
- ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+ jinit_arith_encoder(cinfo);
} else {
- if (cinfo->progressive_mode) {
-#ifdef C_PROGRESSIVE_SUPPORTED
- jinit_phuff_encoder(cinfo);
-#else
- ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
- } else
- jinit_huff_encoder(cinfo);
+ jinit_huff_encoder(cinfo);
}
/* We need a special coefficient buffer controller. */
diff --git a/src/libjpeg/jdapimin.c b/src/libjpeg/jdapimin.c
index cadb59f..a55498f 100644
--- a/src/libjpeg/jdapimin.c
+++ b/src/libjpeg/jdapimin.c
@@ -2,6 +2,7 @@
* jdapimin.c
*
* Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -185,8 +186,8 @@ default_decompress_parms (j_decompress_ptr cinfo)
}
/* Set defaults for other decompression parameters. */
- cinfo->scale_num = 1; /* 1:1 scaling */
- cinfo->scale_denom = 1;
+ cinfo->scale_num = DCTSIZE; /* 1:1 scaling */
+ cinfo->scale_denom = DCTSIZE;
cinfo->output_gamma = 1.0;
cinfo->buffered_image = FALSE;
cinfo->raw_data_out = FALSE;
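[Editor's note: the decompression default is now written as DCTSIZE/DCTSIZE, which is still 1:1 but expressed on the same M/DCTSIZE footing the scaler uses. An application requests reduced-size output the same way as before; a sketch, to be called between jpeg_read_header and jpeg_start_decompress:]

    #include <stdio.h>
    #include "jpeglib.h"

    static void request_half_size_output (j_decompress_ptr cinfo)
    {
      cinfo->scale_num = 1;     /* 1/2-size output, i.e. 4/DCTSIZE */
      cinfo->scale_denom = 2;
    }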
diff --git a/src/libjpeg/jdapistd.c b/src/libjpeg/jdapistd.c
index c8e3fa0..9d74537 100644
--- a/src/libjpeg/jdapistd.c
+++ b/src/libjpeg/jdapistd.c
@@ -202,7 +202,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
}
/* Verify that at least one iMCU row can be returned. */
- lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size;
+ lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_v_scaled_size;
if (max_lines < lines_per_iMCU_row)
ERREXIT(cinfo, JERR_BUFFER_SIZE);
diff --git a/src/libjpeg/jdcoefct.c b/src/libjpeg/jdcoefct.c
index 4938d20..462e92c 100644
--- a/src/libjpeg/jdcoefct.c
+++ b/src/libjpeg/jdcoefct.c
@@ -187,7 +187,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
: compptr->last_col_width;
output_ptr = output_buf[compptr->component_index] +
- yoffset * compptr->DCT_scaled_size;
+ yoffset * compptr->DCT_v_scaled_size;
start_col = MCU_col_num * compptr->MCU_sample_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
if (cinfo->input_iMCU_row < last_iMCU_row ||
@@ -197,11 +197,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
(*inverse_DCT) (cinfo, compptr,
(JCOEFPTR) coef->MCU_buffer[blkn+xindex],
output_ptr, output_col);
- output_col += compptr->DCT_scaled_size;
+ output_col += compptr->DCT_h_scaled_size;
}
}
blkn += compptr->MCU_width;
- output_ptr += compptr->DCT_scaled_size;
+ output_ptr += compptr->DCT_v_scaled_size;
}
}
}
@@ -362,9 +362,9 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
output_ptr, output_col);
buffer_ptr++;
- output_col += compptr->DCT_scaled_size;
+ output_col += compptr->DCT_h_scaled_size;
}
- output_ptr += compptr->DCT_scaled_size;
+ output_ptr += compptr->DCT_v_scaled_size;
}
}
@@ -654,9 +654,9 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
DC4 = DC5; DC5 = DC6;
DC7 = DC8; DC8 = DC9;
buffer_ptr++, prev_block_row++, next_block_row++;
- output_col += compptr->DCT_scaled_size;
+ output_col += compptr->DCT_h_scaled_size;
}
- output_ptr += compptr->DCT_scaled_size;
+ output_ptr += compptr->DCT_v_scaled_size;
}
}
diff --git a/src/libjpeg/jdct.h b/src/libjpeg/jdct.h
index 04192a2..360dec8 100644
--- a/src/libjpeg/jdct.h
+++ b/src/libjpeg/jdct.h
@@ -14,11 +14,16 @@
/*
- * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
- * the DCT is to be performed in-place in that buffer. Type DCTELEM is int
- * for 8-bit samples, INT32 for 12-bit samples. (NOTE: Floating-point DCT
- * implementations use an array of type FAST_FLOAT, instead.)
- * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
+ * A forward DCT routine is given a pointer to an input sample array and
+ * a pointer to a work area of type DCTELEM[]; the DCT is to be performed
+ * in-place in that buffer. Type DCTELEM is int for 8-bit samples, INT32
+ * for 12-bit samples. (NOTE: Floating-point DCT implementations use an
+ * array of type FAST_FLOAT, instead.)
+ * The input data is to be fetched from the sample array starting at a
+ * specified column. (Any row offset needed will be applied to the array
+ * pointer before it is passed to the FDCT code.)
+ * Note that the number of samples fetched by the FDCT routine is
+ * DCT_h_scaled_size * DCT_v_scaled_size.
* The DCT outputs are returned scaled up by a factor of 8; they therefore
* have a range of +-8K for 8-bit data, +-128K for 12-bit data. This
* convention improves accuracy in integer implementations and saves some
@@ -32,8 +37,12 @@ typedef int DCTELEM; /* 16 or 32 bits is fine */
typedef INT32 DCTELEM; /* must have 32 bits */
#endif
-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data,
+ JSAMPARRAY sample_data,
+ JDIMENSION start_col));
+typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data,
+ JSAMPARRAY sample_data,
+ JDIMENSION start_col));
/*
@@ -44,7 +53,7 @@ typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
* sample array starting at a specified column. (Any row offset needed will
* be applied to the array pointer before it is passed to the IDCT code.)
* Note that the number of samples emitted by the IDCT routine is
- * DCT_scaled_size * DCT_scaled_size.
+ * DCT_h_scaled_size * DCT_v_scaled_size.
*/
/* typedef inverse_DCT_method_ptr is declared in jpegint.h */
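[Editor's note: a hedged sketch of the new forward-DCT method signature; it uses libjpeg's internal headers and conventions, and my_fdct_stub is hypothetical. The point is that a routine now fetches and level-shifts its own input samples instead of receiving a pre-filled workspace.]

    #define JPEG_INTERNALS
    #include "jinclude.h"
    #include "jpeglib.h"
    #include "jdct.h"

    METHODDEF(void)
    my_fdct_stub (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
    {
      int row, col;
      DCTELEM *dataptr = data;

      /* Fetch one 8x8 block (a real routine fetches its own
       * DCT_h_scaled_size x DCT_v_scaled_size block), centering samples
       * around zero; a real FDCT would then transform the block in place. */
      for (row = 0; row < DCTSIZE; row++) {
        JSAMPROW elemptr = sample_data[row] + start_col;
        for (col = 0; col < DCTSIZE; col++)
          *dataptr++ = GETJSAMPLE(elemptr[col]) - CENTERJSAMPLE;
      }
    }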
@@ -84,19 +93,143 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
#define jpeg_fdct_islow jFDislow
#define jpeg_fdct_ifast jFDifast
#define jpeg_fdct_float jFDfloat
+#define jpeg_fdct_7x7 jFD7x7
+#define jpeg_fdct_6x6 jFD6x6
+#define jpeg_fdct_5x5 jFD5x5
+#define jpeg_fdct_4x4 jFD4x4
+#define jpeg_fdct_3x3 jFD3x3
+#define jpeg_fdct_2x2 jFD2x2
+#define jpeg_fdct_1x1 jFD1x1
+#define jpeg_fdct_9x9 jFD9x9
+#define jpeg_fdct_10x10 jFD10x10
+#define jpeg_fdct_11x11 jFD11x11
+#define jpeg_fdct_12x12 jFD12x12
+#define jpeg_fdct_13x13 jFD13x13
+#define jpeg_fdct_14x14 jFD14x14
+#define jpeg_fdct_15x15 jFD15x15
+#define jpeg_fdct_16x16 jFD16x16
+#define jpeg_fdct_16x8 jFD16x8
+#define jpeg_fdct_14x7 jFD14x7
+#define jpeg_fdct_12x6 jFD12x6
+#define jpeg_fdct_10x5 jFD10x5
+#define jpeg_fdct_8x4 jFD8x4
+#define jpeg_fdct_6x3 jFD6x3
+#define jpeg_fdct_4x2 jFD4x2
+#define jpeg_fdct_2x1 jFD2x1
+#define jpeg_fdct_8x16 jFD8x16
+#define jpeg_fdct_7x14 jFD7x14
+#define jpeg_fdct_6x12 jFD6x12
+#define jpeg_fdct_5x10 jFD5x10
+#define jpeg_fdct_4x8 jFD4x8
+#define jpeg_fdct_3x6 jFD3x6
+#define jpeg_fdct_2x4 jFD2x4
+#define jpeg_fdct_1x2 jFD1x2
#define jpeg_idct_islow jRDislow
#define jpeg_idct_ifast jRDifast
#define jpeg_idct_float jRDfloat
+#define jpeg_idct_7x7 jRD7x7
+#define jpeg_idct_6x6 jRD6x6
+#define jpeg_idct_5x5 jRD5x5
#define jpeg_idct_4x4 jRD4x4
+#define jpeg_idct_3x3 jRD3x3
#define jpeg_idct_2x2 jRD2x2
#define jpeg_idct_1x1 jRD1x1
+#define jpeg_idct_9x9 jRD9x9
+#define jpeg_idct_10x10 jRD10x10
+#define jpeg_idct_11x11 jRD11x11
+#define jpeg_idct_12x12 jRD12x12
+#define jpeg_idct_13x13 jRD13x13
+#define jpeg_idct_14x14 jRD14x14
+#define jpeg_idct_15x15 jRD15x15
+#define jpeg_idct_16x16 jRD16x16
+#define jpeg_idct_16x8 jRD16x8
+#define jpeg_idct_14x7 jRD14x7
+#define jpeg_idct_12x6 jRD12x6
+#define jpeg_idct_10x5 jRD10x5
+#define jpeg_idct_8x4 jRD8x4
+#define jpeg_idct_6x3 jRD6x3
+#define jpeg_idct_4x2 jRD4x2
+#define jpeg_idct_2x1 jRD2x1
+#define jpeg_idct_8x16 jRD8x16
+#define jpeg_idct_7x14 jRD7x14
+#define jpeg_idct_6x12 jRD6x12
+#define jpeg_idct_5x10 jRD5x10
+#define jpeg_idct_4x8 jRD4x8
+#define jpeg_idct_3x6 jRD3x6
+#define jpeg_idct_2x4 jRD2x4
+#define jpeg_idct_1x2 jRD1x2
#endif /* NEED_SHORT_EXTERNAL_NAMES */
/* Extern declarations for the forward and inverse DCT routines. */
-EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data));
+EXTERN(void) jpeg_fdct_islow
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_ifast
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_float
+ JPP((FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_7x7
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x6
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_5x5
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x4
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_3x3
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x2
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_1x1
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_9x9
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_10x10
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_11x11
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_12x12
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_13x13
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_14x14
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_15x15
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_16x16
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_16x8
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_14x7
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_12x6
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_10x5
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_8x4
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x3
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x2
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x1
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_8x16
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_7x14
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x12
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_5x10
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x8
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_3x6
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x4
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_1x2
+ JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
EXTERN(void) jpeg_idct_islow
JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
@@ -107,15 +240,99 @@ EXTERN(void) jpeg_idct_ifast
EXTERN(void) jpeg_idct_float
JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_7x7
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x6
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_5x5
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
EXTERN(void) jpeg_idct_4x4
JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_3x3
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
EXTERN(void) jpeg_idct_2x2
JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
EXTERN(void) jpeg_idct_1x1
JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_9x9
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_10x10
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_11x11
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_12x12
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_13x13
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_14x14
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_15x15
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_16x16
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_16x8
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_14x7
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_12x6
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_10x5
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_8x4
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x3
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x2
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x1
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_8x16
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_7x14
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x12
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_5x10
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x8
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_3x6
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x4
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_1x2
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
/*
diff --git a/src/libjpeg/jddctmgr.c b/src/libjpeg/jddctmgr.c
index bbf8d0e..bdbde34 100644
--- a/src/libjpeg/jddctmgr.c
+++ b/src/libjpeg/jddctmgr.c
@@ -98,22 +98,134 @@ start_pass (j_decompress_ptr cinfo)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Select the proper IDCT routine for this component's scaling */
- switch (compptr->DCT_scaled_size) {
+ switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) {
#ifdef IDCT_SCALING_SUPPORTED
- case 1:
+ case ((1 << 8) + 1):
method_ptr = jpeg_idct_1x1;
- method = JDCT_ISLOW; /* jidctred uses islow-style table */
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
break;
- case 2:
+ case ((2 << 8) + 2):
method_ptr = jpeg_idct_2x2;
- method = JDCT_ISLOW; /* jidctred uses islow-style table */
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
break;
- case 4:
+ case ((3 << 8) + 3):
+ method_ptr = jpeg_idct_3x3;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((4 << 8) + 4):
method_ptr = jpeg_idct_4x4;
- method = JDCT_ISLOW; /* jidctred uses islow-style table */
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((5 << 8) + 5):
+ method_ptr = jpeg_idct_5x5;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((6 << 8) + 6):
+ method_ptr = jpeg_idct_6x6;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((7 << 8) + 7):
+ method_ptr = jpeg_idct_7x7;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((9 << 8) + 9):
+ method_ptr = jpeg_idct_9x9;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((10 << 8) + 10):
+ method_ptr = jpeg_idct_10x10;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((11 << 8) + 11):
+ method_ptr = jpeg_idct_11x11;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((12 << 8) + 12):
+ method_ptr = jpeg_idct_12x12;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((13 << 8) + 13):
+ method_ptr = jpeg_idct_13x13;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((14 << 8) + 14):
+ method_ptr = jpeg_idct_14x14;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((15 << 8) + 15):
+ method_ptr = jpeg_idct_15x15;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((16 << 8) + 16):
+ method_ptr = jpeg_idct_16x16;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((16 << 8) + 8):
+ method_ptr = jpeg_idct_16x8;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((14 << 8) + 7):
+ method_ptr = jpeg_idct_14x7;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((12 << 8) + 6):
+ method_ptr = jpeg_idct_12x6;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((10 << 8) + 5):
+ method_ptr = jpeg_idct_10x5;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((8 << 8) + 4):
+ method_ptr = jpeg_idct_8x4;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((6 << 8) + 3):
+ method_ptr = jpeg_idct_6x3;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((4 << 8) + 2):
+ method_ptr = jpeg_idct_4x2;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((2 << 8) + 1):
+ method_ptr = jpeg_idct_2x1;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((8 << 8) + 16):
+ method_ptr = jpeg_idct_8x16;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((7 << 8) + 14):
+ method_ptr = jpeg_idct_7x14;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((6 << 8) + 12):
+ method_ptr = jpeg_idct_6x12;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((5 << 8) + 10):
+ method_ptr = jpeg_idct_5x10;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((4 << 8) + 8):
+ method_ptr = jpeg_idct_4x8;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((3 << 8) + 6):
+ method_ptr = jpeg_idct_3x6;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((2 << 8) + 4):
+ method_ptr = jpeg_idct_2x4;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case ((1 << 8) + 2):
+ method_ptr = jpeg_idct_1x2;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
break;
#endif
- case DCTSIZE:
+ case ((DCTSIZE << 8) + DCTSIZE):
switch (cinfo->dct_method) {
#ifdef DCT_ISLOW_SUPPORTED
case JDCT_ISLOW:
@@ -139,7 +251,8 @@ start_pass (j_decompress_ptr cinfo)
}
break;
default:
- ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->DCT_scaled_size);
+ ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+ compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
break;
}
idct->pub.inverse_DCT[ci] = method_ptr;
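
The rewritten selector above keys the switch on both scaled sizes packed into a single integer. A minimal sketch of that packing, using a hypothetical DCT_SIZE_KEY macro (not a libjpeg name), shows why the case labels look the way they do:

/* Hypothetical helper, for illustration only: both sizes are 1..16, so the
 * pair fits comfortably in one int with the horizontal size in bits 8..15.
 */
#define DCT_SIZE_KEY(h, v)  (((h) << 8) + (v))

/* DCT_SIZE_KEY(16, 8) == 0x1008 matches the ((16 << 8) + 8) case above,
 * and the unscaled case is DCT_SIZE_KEY(DCTSIZE, DCTSIZE) == 0x0808.
 */
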
diff --git a/src/libjpeg/jdhuff.c b/src/libjpeg/jdhuff.c
index b960874..a24181b 100644
--- a/src/libjpeg/jdhuff.c
+++ b/src/libjpeg/jdhuff.c
@@ -2,10 +2,12 @@
* jdhuff.c
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2006-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains Huffman entropy decoding routines.
+ * Both sequential and progressive modes are supported in this single module.
*
* Much of the complexity here has to do with supporting input suspension.
* If the data source module demands suspension, we want to be able to back
@@ -17,7 +19,173 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
-#include "jdhuff.h" /* Declarations shared with jdphuff.c */
+
+
+/* Derived data constructed for each Huffman table */
+
+#define HUFF_LOOKAHEAD 8 /* # of bits of lookahead */
+
+typedef struct {
+ /* Basic tables: (element [0] of each array is unused) */
+ INT32 maxcode[18]; /* largest code of length k (-1 if none) */
+ /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
+ INT32 valoffset[17]; /* huffval[] offset for codes of length k */
+ /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
+ * the smallest code of length k; so given a code of length k, the
+ * corresponding symbol is huffval[code + valoffset[k]]
+ */
+
+ /* Link to public Huffman table (needed only in jpeg_huff_decode) */
+ JHUFF_TBL *pub;
+
+ /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
+ * the input data stream. If the next Huffman code is no more
+ * than HUFF_LOOKAHEAD bits long, we can obtain its length and
+ * the corresponding symbol directly from these tables.
+ */
+ int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
+ UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
+} d_derived_tbl;
+
+
+/*
+ * Fetching the next N bits from the input stream is a time-critical operation
+ * for the Huffman decoders. We implement it with a combination of inline
+ * macros and out-of-line subroutines. Note that N (the number of bits
+ * demanded at one time) never exceeds 15 for JPEG use.
+ *
+ * We read source bytes into get_buffer and dole out bits as needed.
+ * If get_buffer already contains enough bits, they are fetched in-line
+ * by the macros CHECK_BIT_BUFFER and GET_BITS. When there aren't enough
+ * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer
+ * as full as possible (not just to the number of bits needed; this
+ * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer).
+ * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension.
+ * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains
+ * at least the requested number of bits --- dummy zeroes are inserted if
+ * necessary.
+ */
+
+typedef INT32 bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE 32 /* size of buffer in bits */
+
+/* If long is > 32 bits on your machine, and shifting/masking longs is
+ * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
+ * appropriately should be a win. Unfortunately we can't define the size
+ * with something like #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8)
+ * because not all machines measure sizeof in 8-bit bytes.
+ */
+
+typedef struct { /* Bitreading state saved across MCUs */
+ bit_buf_type get_buffer; /* current bit-extraction buffer */
+ int bits_left; /* # of unused bits in it */
+} bitread_perm_state;
+
+typedef struct { /* Bitreading working state within an MCU */
+ /* Current data source location */
+ /* We need a copy, rather than munging the original, in case of suspension */
+ const JOCTET * next_input_byte; /* => next byte to read from source */
+ size_t bytes_in_buffer; /* # of bytes remaining in source buffer */
+ /* Bit input buffer --- note these values are kept in register variables,
+ * not in this struct, inside the inner loops.
+ */
+ bit_buf_type get_buffer; /* current bit-extraction buffer */
+ int bits_left; /* # of unused bits in it */
+ /* Pointer needed by jpeg_fill_bit_buffer. */
+ j_decompress_ptr cinfo; /* back link to decompress master record */
+} bitread_working_state;
+
+/* Macros to declare and load/save bitread local variables. */
+#define BITREAD_STATE_VARS \
+ register bit_buf_type get_buffer; \
+ register int bits_left; \
+ bitread_working_state br_state
+
+#define BITREAD_LOAD_STATE(cinfop,permstate) \
+ br_state.cinfo = cinfop; \
+ br_state.next_input_byte = cinfop->src->next_input_byte; \
+ br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+ get_buffer = permstate.get_buffer; \
+ bits_left = permstate.bits_left;
+
+#define BITREAD_SAVE_STATE(cinfop,permstate) \
+ cinfop->src->next_input_byte = br_state.next_input_byte; \
+ cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+ permstate.get_buffer = get_buffer; \
+ permstate.bits_left = bits_left
+
+/*
+ * These macros provide the in-line portion of bit fetching.
+ * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer
+ * before using GET_BITS, PEEK_BITS, or DROP_BITS.
+ * The variables get_buffer and bits_left are assumed to be locals,
+ * but the state struct might not be (jpeg_huff_decode needs this).
+ * CHECK_BIT_BUFFER(state,n,action);
+ * Ensure there are N bits in get_buffer; if suspend, take action.
+ * val = GET_BITS(n);
+ * Fetch next N bits.
+ * val = PEEK_BITS(n);
+ * Fetch next N bits without removing them from the buffer.
+ * DROP_BITS(n);
+ * Discard next N bits.
+ * The value N should be a simple variable, not an expression, because it
+ * is evaluated multiple times.
+ */
+
+#define CHECK_BIT_BUFFER(state,nbits,action) \
+ { if (bits_left < (nbits)) { \
+ if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits)) \
+ { action; } \
+ get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+
+#define GET_BITS(nbits) \
+ (((int) (get_buffer >> (bits_left -= (nbits)))) & BIT_MASK(nbits))
+
+#define PEEK_BITS(nbits) \
+ (((int) (get_buffer >> (bits_left - (nbits)))) & BIT_MASK(nbits))
+
+#define DROP_BITS(nbits) \
+ (bits_left -= (nbits))
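
A short worked trace of these macros may help; the byte-loading behaviour of jpeg_fill_bit_buffer summarized here is described with that routine further below and is not visible in this hunk.

/* Worked trace (BIT_BUF_SIZE == 32): suppose bits_left == 3 and the decoder
 * executes CHECK_BIT_BUFFER(br_state, 5, return FALSE).  Since 3 < 5,
 * jpeg_fill_bit_buffer() is called; it appends whole source bytes (dropping
 * the zero byte stuffed after any 0xFF) until the buffer is nearly full,
 * say bits_left == 27.  GET_BITS(5) then evaluates to
 * (get_buffer >> 22) & 0x1F and leaves bits_left == 22, while PEEK_BITS(5)
 * would return the same value without decrementing bits_left.
 */
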
+
+
+/*
+ * Code for extracting next Huffman-coded symbol from input bit stream.
+ * Again, this is time-critical and we make the main paths be macros.
+ *
+ * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
+ * without looping. Usually, more than 95% of the Huffman codes will be 8
+ * or fewer bits long. The few overlength codes are handled with a loop,
+ * which need not be inline code.
+ *
+ * Notes about the HUFF_DECODE macro:
+ * 1. Near the end of the data segment, we may fail to get enough bits
+ * for a lookahead. In that case, we do it the hard way.
+ * 2. If the lookahead table contains no entry, the next code must be
+ * more than HUFF_LOOKAHEAD bits long.
+ * 3. jpeg_huff_decode returns -1 if forced to suspend.
+ */
+
+#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
+{ register int nb, look; \
+ if (bits_left < HUFF_LOOKAHEAD) { \
+ if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
+ get_buffer = state.get_buffer; bits_left = state.bits_left; \
+ if (bits_left < HUFF_LOOKAHEAD) { \
+ nb = 1; goto slowlabel; \
+ } \
+ } \
+ look = PEEK_BITS(HUFF_LOOKAHEAD); \
+ if ((nb = htbl->look_nbits[look]) != 0) { \
+ DROP_BITS(nb); \
+ result = htbl->look_sym[look]; \
+ } else { \
+ nb = HUFF_LOOKAHEAD+1; \
+slowlabel: \
+ if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
+ { failaction; } \
+ get_buffer = state.get_buffer; bits_left = state.bits_left; \
+ } \
+}
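
The fast path of HUFF_DECODE depends on look_nbits[]/look_sym[] being prefilled. A rough sketch of that construction follows, paraphrasing what jpeg_make_d_derived_tbl (below) does; huffcode[] and the loop variables are locals of that routine and appear here only for illustration.

/* Sketch: every HUFF_LOOKAHEAD-bit string beginning with a code of length
 * l <= HUFF_LOOKAHEAD decodes to that code's symbol, so each such code
 * fills 1 << (HUFF_LOOKAHEAD - l) consecutive slots.  Slots left at zero
 * send HUFF_DECODE to the jpeg_huff_decode slow path.
 */
MEMZERO(dtbl->look_nbits, SIZEOF(dtbl->look_nbits));
p = 0;                          /* index into huffcode[] / htbl->huffval[] */
for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
  for (i = 1; i <= (int) htbl->bits[l]; i++, p++) {
    /* left-justify the code, then enumerate every possible suffix */
    lookbits = huffcode[p] << (HUFF_LOOKAHEAD - l);
    for (ctr = 1 << (HUFF_LOOKAHEAD - l); ctr > 0; ctr--, lookbits++) {
      dtbl->look_nbits[lookbits] = l;
      dtbl->look_sym[lookbits] = (UINT8) htbl->huffval[p];
    }
  }
}
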
/*
@@ -28,7 +196,8 @@
*/
typedef struct {
- int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+ unsigned int EOBRUN; /* remaining EOBs in EOBRUN */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;
/* This macro is to work around compilers with missing or broken
@@ -41,7 +210,8 @@ typedef struct {
#else
#if MAX_COMPS_IN_SCAN == 4
#define ASSIGN_STATE(dest,src) \
- ((dest).last_dc_val[0] = (src).last_dc_val[0], \
+ ((dest).EOBRUN = (src).EOBRUN, \
+ (dest).last_dc_val[0] = (src).last_dc_val[0], \
(dest).last_dc_val[1] = (src).last_dc_val[1], \
(dest).last_dc_val[2] = (src).last_dc_val[2], \
(dest).last_dc_val[3] = (src).last_dc_val[3])
@@ -61,6 +231,15 @@ typedef struct {
/* These fields are NOT loaded into local working state. */
unsigned int restarts_to_go; /* MCUs left in this restart interval */
+ /* Following two fields used only in progressive mode */
+
+ /* Pointers to derived tables (these workspaces have image lifespan) */
+ d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
+
+ d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
+
+ /* Following fields used only in sequential mode */
+
/* Pointers to derived tables (these workspaces have image lifespan) */
d_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
d_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
@@ -71,81 +250,30 @@ typedef struct {
d_derived_tbl * dc_cur_tbls[D_MAX_BLOCKS_IN_MCU];
d_derived_tbl * ac_cur_tbls[D_MAX_BLOCKS_IN_MCU];
/* Whether we care about the DC and AC coefficient values for each block */
- boolean dc_needed[D_MAX_BLOCKS_IN_MCU];
- boolean ac_needed[D_MAX_BLOCKS_IN_MCU];
+ int coef_limit[D_MAX_BLOCKS_IN_MCU];
} huff_entropy_decoder;
typedef huff_entropy_decoder * huff_entropy_ptr;
-/*
- * Initialize for a Huffman-compressed scan.
- */
-
-METHODDEF(void)
-start_pass_huff_decoder (j_decompress_ptr cinfo)
-{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
- int ci, blkn, dctbl, actbl;
- jpeg_component_info * compptr;
-
- /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
- * This ought to be an error condition, but we make it a warning because
- * there are some baseline files out there with all zeroes in these bytes.
- */
- if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 ||
- cinfo->Ah != 0 || cinfo->Al != 0)
- WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
-
- for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
- compptr = cinfo->cur_comp_info[ci];
- dctbl = compptr->dc_tbl_no;
- actbl = compptr->ac_tbl_no;
- /* Compute derived values for Huffman tables */
- /* We may do this more than once for a table, but it's not expensive */
- jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl,
- & entropy->dc_derived_tbls[dctbl]);
- jpeg_make_d_derived_tbl(cinfo, FALSE, actbl,
- & entropy->ac_derived_tbls[actbl]);
- /* Initialize DC predictions to 0 */
- entropy->saved.last_dc_val[ci] = 0;
- }
-
- /* Precalculate decoding info for each block in an MCU of this scan */
- for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
- ci = cinfo->MCU_membership[blkn];
- compptr = cinfo->cur_comp_info[ci];
- /* Precalculate which table to use for each block */
- entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no];
- entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no];
- /* Decide whether we really care about the coefficient values */
- if (compptr->component_needed) {
- entropy->dc_needed[blkn] = TRUE;
- /* we don't need the ACs if producing a 1/8th-size image */
- entropy->ac_needed[blkn] = (compptr->DCT_scaled_size > 1);
- } else {
- entropy->dc_needed[blkn] = entropy->ac_needed[blkn] = FALSE;
- }
- }
-
- /* Initialize bitread state variables */
- entropy->bitstate.bits_left = 0;
- entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
- entropy->pub.insufficient_data = FALSE;
-
- /* Initialize restart counter */
- entropy->restarts_to_go = cinfo->restart_interval;
-}
+static const int jpeg_zigzag_order[8][8] = {
+ { 0, 1, 5, 6, 14, 15, 27, 28 },
+ { 2, 4, 7, 13, 16, 26, 29, 42 },
+ { 3, 8, 12, 17, 25, 30, 41, 43 },
+ { 9, 11, 18, 24, 31, 40, 44, 53 },
+ { 10, 19, 23, 32, 39, 45, 52, 54 },
+ { 20, 22, 33, 38, 46, 51, 55, 60 },
+ { 21, 34, 37, 47, 50, 56, 59, 61 },
+ { 35, 36, 48, 49, 57, 58, 62, 63 }
+};
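
This table is the inverse of jpeg_natural_order: it maps a natural (row, column) position to its zigzag index. start_pass_huff_decoder below uses it to bound how many coefficients a reduced-size IDCT actually needs; a small worked example:

/* Example: a component decoded with a 4x4 scaled IDCT uses only the
 * coefficients in the top-left 4x4 corner of the block.  The largest
 * zigzag index in that corner is jpeg_zigzag_order[3][3] == 24, so
 * coef_limit is set to 25 and decode_mcu() stores the first 25
 * zigzag-ordered coefficients, merely skipping over the rest.
 */
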
/*
* Compute the derived values for a Huffman table.
* This routine also performs some validation checks on the table.
- *
- * Note this is also used by jdphuff.c.
*/
-GLOBAL(void)
+LOCAL(void)
jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
d_derived_tbl ** pdtbl)
{
@@ -267,8 +395,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
/*
- * Out-of-line code for bit fetching (shared with jdphuff.c).
- * See jdhuff.h for info about usage.
+ * Out-of-line code for bit fetching.
* Note: current values of get_buffer and bits_left are passed as parameters,
* but are returned in the corresponding fields of the state struct.
*
@@ -288,7 +415,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
#endif
-GLOBAL(boolean)
+LOCAL(boolean)
jpeg_fill_bit_buffer (bitread_working_state * state,
register bit_buf_type get_buffer, register int bits_left,
int nbits)
@@ -390,11 +517,32 @@ jpeg_fill_bit_buffer (bitread_working_state * state,
/*
+ * Figure F.12: extend sign bit.
+ * On some machines, a shift and sub will be faster than a table lookup.
+ */
+
+#ifdef AVOID_TABLES
+
+#define BIT_MASK(nbits) ((1<<(nbits))-1)
+#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) - ((1<<(s))-1) : (x))
+
+#else
+
+#define BIT_MASK(nbits) bmask[nbits]
+#define HUFF_EXTEND(x,s) ((x) <= bmask[(s) - 1] ? (x) - bmask[s] : (x))
+
+static const int bmask[16] = /* bmask[n] is mask for n rightmost bits */
+ { 0, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF,
+ 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF };
+
+#endif /* AVOID_TABLES */
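
Both variants give the same result; a brief worked example for a 3-bit magnitude:

/* Worked example, s == 3: received values 0..3 extend to -7..-4 and
 * values 4..7 stay positive.  E.g. HUFF_EXTEND(2, 3): 2 <= bmask[2] (== 3),
 * so the result is 2 - bmask[3] == 2 - 7 == -5; the AVOID_TABLES form
 * computes 2 - ((1 << 3) - 1) == -5 as well.
 */
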
+
+
+/*
* Out-of-line code for Huffman code decoding.
- * See jdhuff.h for info about usage.
*/
-GLOBAL(int)
+LOCAL(int)
jpeg_huff_decode (bitread_working_state * state,
register bit_buf_type get_buffer, register int bits_left,
d_derived_tbl * htbl, int min_bits)
@@ -434,32 +582,6 @@ jpeg_huff_decode (bitread_working_state * state,
/*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] = /* entry n is 2**(n-1) */
- { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
- 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
- { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
- ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
- ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
- ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
-/*
* Check for a restart marker & resynchronize decoder.
* Returns FALSE if must suspend.
*/
@@ -482,6 +604,8 @@ process_restart (j_decompress_ptr cinfo)
/* Re-initialize DC predictions to 0 */
for (ci = 0; ci < cinfo->comps_in_scan; ci++)
entropy->saved.last_dc_val[ci] = 0;
+ /* Re-init EOB run count, too */
+ entropy->saved.EOBRUN = 0;
/* Reset restart counter */
entropy->restarts_to_go = cinfo->restart_interval;
@@ -499,27 +623,40 @@ process_restart (j_decompress_ptr cinfo)
/*
- * Decode and return one MCU's worth of Huffman-compressed coefficients.
+ * Huffman MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * Huffman-compressed coefficients.
* The coefficients are reordered from zigzag order into natural array order,
* but are not dequantized.
*
* The i'th block of the MCU is stored into the block pointed to by
- * MCU_data[i]. WE ASSUME THIS AREA HAS BEEN ZEROED BY THE CALLER.
+ * MCU_data[i]. WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
* (Wholesale zeroing is usually a little faster than retail...)
*
- * Returns FALSE if data source requested suspension. In that case no
+ * We return FALSE if data source requested suspension. In that case no
* changes have been made to permanent state. (Exception: some output
* coefficients may already have been assigned. This is harmless for
- * this module, since we'll just re-assign them on the next call.)
+ * spectral selection, since we'll just re-assign them on the next call.
+ * Successive approximation AC refinement has to be more careful, however.)
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
*/
METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
- int blkn;
+ int Al = cinfo->Al;
+ register int s, r;
+ int blkn, ci;
+ JBLOCKROW block;
BITREAD_STATE_VARS;
savable_state state;
+ d_derived_tbl * tbl;
+ jpeg_component_info * compptr;
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
@@ -540,40 +677,391 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Outer loop handles each block in the MCU */
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
- JBLOCKROW block = MCU_data[blkn];
- d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
- d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
- register int s, k, r;
+ block = MCU_data[blkn];
+ ci = cinfo->MCU_membership[blkn];
+ compptr = cinfo->cur_comp_info[ci];
+ tbl = entropy->derived_tbls[compptr->dc_tbl_no];
/* Decode a single block's worth of coefficients */
/* Section F.2.2.1: decode the DC coefficient difference */
- HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
+ HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
if (s) {
CHECK_BIT_BUFFER(br_state, s, return FALSE);
r = GET_BITS(s);
s = HUFF_EXTEND(r, s);
}
- if (entropy->dc_needed[blkn]) {
+ /* Convert DC difference to actual value, update last_dc_val */
+ s += state.last_dc_val[ci];
+ state.last_dc_val[ci] = s;
+ /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
+ (*block)[0] = (JCOEF) (s << Al);
+ }
+
+ /* Completed MCU, so update state */
+ BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ ASSIGN_STATE(entropy->saved, state);
+ }
+
+ /* Account for restart interval (no-op if not using restarts) */
+ entropy->restarts_to_go--;
+
+ return TRUE;
+}
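
A brief worked example of the point transform this routine undoes, restricted to a nonnegative coefficient for simplicity; the numbers are illustrative:

/* Worked example, Al == 1, true DC value 13 (binary 1101): the first DC
 * scan transmits 13 >> 1 == 6, stored above as 6 << 1 == 12.  A later DC
 * refinement scan (Ah == 1, Al == 0, handled by decode_mcu_DC_refine
 * below) reads one bit per block and ORs in p1 == 1, giving 12 | 1 == 13.
 */
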
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ int Se = cinfo->Se;
+ int Al = cinfo->Al;
+ register int s, k, r;
+ unsigned int EOBRUN;
+ JBLOCKROW block;
+ BITREAD_STATE_VARS;
+ d_derived_tbl * tbl;
+
+ /* Process restart marker if needed; may have to suspend */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ if (! process_restart(cinfo))
+ return FALSE;
+ }
+
+ /* If we've run out of data, just leave the MCU set to zeroes.
+ * This way, we return uniform gray for the remainder of the segment.
+ */
+ if (! entropy->pub.insufficient_data) {
+
+ /* Load up working state.
+ * We can avoid loading/saving bitread state if in an EOB run.
+ */
+ EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
+
+ /* There is always only one block per MCU */
+
+ if (EOBRUN > 0) /* if it's a band of zeroes... */
+ EOBRUN--; /* ...process it now (we do nothing) */
+ else {
+ BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ block = MCU_data[0];
+ tbl = entropy->ac_derived_tbl;
+
+ for (k = cinfo->Ss; k <= Se; k++) {
+ HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
+ r = s >> 4;
+ s &= 15;
+ if (s) {
+ k += r;
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ /* Scale and output coefficient in natural (dezigzagged) order */
+ (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al);
+ } else {
+ if (r == 15) { /* ZRL */
+ k += 15; /* skip 15 zeroes in band */
+ } else { /* EOBr, run length is 2^r + appended bits */
+ EOBRUN = 1 << r;
+ if (r) { /* EOBr, r > 0 */
+ CHECK_BIT_BUFFER(br_state, r, return FALSE);
+ r = GET_BITS(r);
+ EOBRUN += r;
+ }
+ EOBRUN--; /* this band is processed at this moment */
+ break; /* force end-of-band */
+ }
+ }
+ }
+
+ BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ }
+
+ /* Completed MCU, so update state */
+ entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
+ }
+
+ /* Account for restart interval (no-op if not using restarts) */
+ entropy->restarts_to_go--;
+
+ return TRUE;
+}
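
A short worked example of the EOB-run bookkeeping above (numbers chosen arbitrarily):

/* Worked example: an EOB3 symbol (r == 3, s == 0) sets EOBRUN = 1 << 3 == 8;
 * three appended bits with value 5 raise it to 13.  The current block is
 * counted at once (EOBRUN--), so the following 12 calls of this routine
 * find EOBRUN > 0, decrement it, and return without reading any bits.
 */
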
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ * Note: we assume such scans can be multi-component, although the spec
+ * is not very clear on the point.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
+ int blkn;
+ JBLOCKROW block;
+ BITREAD_STATE_VARS;
+
+ /* Process restart marker if needed; may have to suspend */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ if (! process_restart(cinfo))
+ return FALSE;
+ }
+
+ /* Not worth the cycles to check insufficient_data here,
+ * since we will not change the data anyway if we read zeroes.
+ */
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+
+ /* Outer loop handles each block in the MCU */
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+
+ /* Encoded data is simply the next bit of the two's-complement DC value */
+ CHECK_BIT_BUFFER(br_state, 1, return FALSE);
+ if (GET_BITS(1))
+ (*block)[0] |= p1;
+ /* Note: since we use |=, repeating the assignment later is safe */
+ }
+
+ /* Completed MCU, so update state */
+ BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+
+ /* Account for restart interval (no-op if not using restarts) */
+ entropy->restarts_to_go--;
+
+ return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ int Se = cinfo->Se;
+ int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
+ int m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */
+ register int s, k, r;
+ unsigned int EOBRUN;
+ JBLOCKROW block;
+ JCOEFPTR thiscoef;
+ BITREAD_STATE_VARS;
+ d_derived_tbl * tbl;
+ int num_newnz;
+ int newnz_pos[DCTSIZE2];
+
+ /* Process restart marker if needed; may have to suspend */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ if (! process_restart(cinfo))
+ return FALSE;
+ }
+
+ /* If we've run out of data, don't modify the MCU.
+ */
+ if (! entropy->pub.insufficient_data) {
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
+
+ /* There is always only one block per MCU */
+ block = MCU_data[0];
+ tbl = entropy->ac_derived_tbl;
+
+ /* If we are forced to suspend, we must undo the assignments to any newly
+ * nonzero coefficients in the block, because otherwise we'd get confused
+ * next time about which coefficients were already nonzero.
+ * But we need not undo addition of bits to already-nonzero coefficients;
+ * instead, we can test the current bit to see if we already did it.
+ */
+ num_newnz = 0;
+
+ /* initialize coefficient loop counter to start of band */
+ k = cinfo->Ss;
+
+ if (EOBRUN == 0) {
+ for (; k <= Se; k++) {
+ HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
+ r = s >> 4;
+ s &= 15;
+ if (s) {
+ if (s != 1) /* size of new coef should always be 1 */
+ WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+ CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+ if (GET_BITS(1))
+ s = p1; /* newly nonzero coef is positive */
+ else
+ s = m1; /* newly nonzero coef is negative */
+ } else {
+ if (r != 15) {
+ EOBRUN = 1 << r; /* EOBr, run length is 2^r + appended bits */
+ if (r) {
+ CHECK_BIT_BUFFER(br_state, r, goto undoit);
+ r = GET_BITS(r);
+ EOBRUN += r;
+ }
+ break; /* rest of block is handled by EOB logic */
+ }
+ /* note s = 0 for processing ZRL */
+ }
+ /* Advance over already-nonzero coefs and r still-zero coefs,
+ * appending correction bits to the nonzeroes. A correction bit is 1
+ * if the absolute value of the coefficient must be increased.
+ */
+ do {
+ thiscoef = *block + jpeg_natural_order[k];
+ if (*thiscoef != 0) {
+ CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+ if (GET_BITS(1)) {
+ if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
+ if (*thiscoef >= 0)
+ *thiscoef += p1;
+ else
+ *thiscoef += m1;
+ }
+ }
+ } else {
+ if (--r < 0)
+ break; /* reached target zero coefficient */
+ }
+ k++;
+ } while (k <= Se);
+ if (s) {
+ int pos = jpeg_natural_order[k];
+ /* Output newly nonzero coefficient */
+ (*block)[pos] = (JCOEF) s;
+ /* Remember its position in case we have to suspend */
+ newnz_pos[num_newnz++] = pos;
+ }
+ }
+ }
+
+ if (EOBRUN > 0) {
+ /* Scan any remaining coefficient positions after the end-of-band
+ * (the last newly nonzero coefficient, if any). Append a correction
+ * bit to each already-nonzero coefficient. A correction bit is 1
+ * if the absolute value of the coefficient must be increased.
+ */
+ for (; k <= Se; k++) {
+ thiscoef = *block + jpeg_natural_order[k];
+ if (*thiscoef != 0) {
+ CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+ if (GET_BITS(1)) {
+ if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
+ if (*thiscoef >= 0)
+ *thiscoef += p1;
+ else
+ *thiscoef += m1;
+ }
+ }
+ }
+ }
+ /* Count one block completed in EOB run */
+ EOBRUN--;
+ }
+
+ /* Completed MCU, so update state */
+ BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
+ }
+
+ /* Account for restart interval (no-op if not using restarts) */
+ entropy->restarts_to_go--;
+
+ return TRUE;
+
+undoit:
+ /* Re-zero any output coefficients that we made newly nonzero */
+ while (num_newnz > 0)
+ (*block)[newnz_pos[--num_newnz]] = 0;
+
+ return FALSE;
+}
+
+
+/*
+ * Decode one MCU's worth of Huffman-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ int blkn;
+ BITREAD_STATE_VARS;
+ savable_state state;
+
+ /* Process restart marker if needed; may have to suspend */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ if (! process_restart(cinfo))
+ return FALSE;
+ }
+
+ /* If we've run out of data, just leave the MCU set to zeroes.
+ * This way, we return uniform gray for the remainder of the segment.
+ */
+ if (! entropy->pub.insufficient_data) {
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ ASSIGN_STATE(state, entropy->saved);
+
+ /* Outer loop handles each block in the MCU */
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ JBLOCKROW block = MCU_data[blkn];
+ d_derived_tbl * htbl;
+ register int s, k, r;
+ int coef_limit, ci;
+
+ /* Decode a single block's worth of coefficients */
+
+ /* Section F.2.2.1: decode the DC coefficient difference */
+ htbl = entropy->dc_cur_tbls[blkn];
+ HUFF_DECODE(s, br_state, htbl, return FALSE, label1);
+
+ htbl = entropy->ac_cur_tbls[blkn];
+ k = 1;
+ coef_limit = entropy->coef_limit[blkn];
+ if (coef_limit) {
/* Convert DC difference to actual value, update last_dc_val */
- int ci = cinfo->MCU_membership[blkn];
+ if (s) {
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ }
+ ci = cinfo->MCU_membership[blkn];
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
- /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+ /* Output the DC coefficient */
(*block)[0] = (JCOEF) s;
- }
-
- if (entropy->ac_needed[blkn]) {
/* Section F.2.2.2: decode the AC coefficients */
/* Since zeroes are skipped, output area must be cleared beforehand */
- for (k = 1; k < DCTSIZE2; k++) {
- HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
-
+ for (; k < coef_limit; k++) {
+ HUFF_DECODE(s, br_state, htbl, return FALSE, label2);
+
r = s >> 4;
s &= 15;
-
+
if (s) {
k += r;
CHECK_BIT_BUFFER(br_state, s, return FALSE);
@@ -586,33 +1074,37 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
(*block)[jpeg_natural_order[k]] = (JCOEF) s;
} else {
if (r != 15)
- break;
+ goto EndOfBlock;
k += 15;
}
}
-
} else {
-
- /* Section F.2.2.2: decode the AC coefficients */
- /* In this path we just discard the values */
- for (k = 1; k < DCTSIZE2; k++) {
- HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
-
- r = s >> 4;
- s &= 15;
-
- if (s) {
- k += r;
- CHECK_BIT_BUFFER(br_state, s, return FALSE);
- DROP_BITS(s);
- } else {
- if (r != 15)
- break;
- k += 15;
- }
+ if (s) {
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ DROP_BITS(s);
}
+ }
+
+ /* Section F.2.2.2: decode the AC coefficients */
+ /* In this path we just discard the values */
+ for (; k < DCTSIZE2; k++) {
+ HUFF_DECODE(s, br_state, htbl, return FALSE, label3);
+
+ r = s >> 4;
+ s &= 15;
+ if (s) {
+ k += r;
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ DROP_BITS(s);
+ } else {
+ if (r != 15)
+ break;
+ k += 15;
+ }
}
+
+ EndOfBlock: ;
}
/* Completed MCU, so update state */
@@ -628,6 +1120,156 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/*
+ * Initialize for a Huffman-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass_huff_decoder (j_decompress_ptr cinfo)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ int ci, blkn, dctbl, actbl, i;
+ jpeg_component_info * compptr;
+
+ if (cinfo->progressive_mode) {
+ /* Validate progressive scan parameters */
+ if (cinfo->Ss == 0) {
+ if (cinfo->Se != 0)
+ goto bad;
+ } else {
+ /* need not check Ss/Se < 0 since they came from unsigned bytes */
+ if (cinfo->Se < cinfo->Ss || cinfo->Se >= DCTSIZE2)
+ goto bad;
+ /* AC scans may have only one component */
+ if (cinfo->comps_in_scan != 1)
+ goto bad;
+ }
+ if (cinfo->Ah != 0) {
+ /* Successive approximation refinement scan: must have Al = Ah-1. */
+ if (cinfo->Ah-1 != cinfo->Al)
+ goto bad;
+ }
+ if (cinfo->Al > 13) { /* need not check for < 0 */
+ /* Arguably the maximum Al value should be less than 13 for 8-bit precision,
+ * but the spec doesn't say so, and we try to be liberal about what we
+ * accept. Note: large Al values could result in out-of-range DC
+ * coefficients during early scans, leading to bizarre displays due to
+ * overflows in the IDCT math. But we won't crash.
+ */
+ bad:
+ ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+ cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+ }
+ /* Update progression status, and verify that scan order is legal.
+ * Note that inter-scan inconsistencies are treated as warnings
+ * not fatal errors ... not clear if this is right way to behave.
+ */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+ int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+ if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+ WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+ for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+ int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+ if (cinfo->Ah != expected)
+ WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+ coef_bit_ptr[coefi] = cinfo->Al;
+ }
+ }
+
+ /* Select MCU decoding routine */
+ if (cinfo->Ah == 0) {
+ if (cinfo->Ss == 0)
+ entropy->pub.decode_mcu = decode_mcu_DC_first;
+ else
+ entropy->pub.decode_mcu = decode_mcu_AC_first;
+ } else {
+ if (cinfo->Ss == 0)
+ entropy->pub.decode_mcu = decode_mcu_DC_refine;
+ else
+ entropy->pub.decode_mcu = decode_mcu_AC_refine;
+ }
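
A few example scan headers and the routine each one selects, consistent with the validation above (the parameter sets are illustrative):

/* (Ss, Se, Ah, Al) -> selected routine:
 *   (0,  0, 0, 1)  DC first scan, low bit withheld  -> decode_mcu_DC_first
 *   (1, 63, 0, 1)  AC first scan over the full band -> decode_mcu_AC_first
 *   (0,  0, 1, 0)  DC refinement (Ah - 1 == Al)     -> decode_mcu_DC_refine
 *   (1, 63, 1, 0)  AC refinement, one component     -> decode_mcu_AC_refine
 */
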
+
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ /* Make sure requested tables are present, and compute derived tables.
+ * We may build same derived table more than once, but it's not expensive.
+ */
+ if (cinfo->Ss == 0) {
+ if (cinfo->Ah == 0) { /* DC refinement needs no table */
+ i = compptr->dc_tbl_no;
+ jpeg_make_d_derived_tbl(cinfo, TRUE, i,
+ & entropy->derived_tbls[i]);
+ }
+ } else {
+ i = compptr->ac_tbl_no;
+ jpeg_make_d_derived_tbl(cinfo, FALSE, i,
+ & entropy->derived_tbls[i]);
+ /* remember the single active table */
+ entropy->ac_derived_tbl = entropy->derived_tbls[i];
+ }
+ /* Initialize DC predictions to 0 */
+ entropy->saved.last_dc_val[ci] = 0;
+ }
+
+ /* Initialize private state variables */
+ entropy->saved.EOBRUN = 0;
+ } else {
+ /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+ * This ought to be an error condition, but we make it a warning because
+ * there are some baseline files out there with all zeroes in these bytes.
+ */
+ if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 ||
+ cinfo->Ah != 0 || cinfo->Al != 0)
+ WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+
+ /* Select MCU decoding routine */
+ entropy->pub.decode_mcu = decode_mcu;
+
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ dctbl = compptr->dc_tbl_no;
+ actbl = compptr->ac_tbl_no;
+ /* Compute derived values for Huffman tables */
+ /* We may do this more than once for a table, but it's not expensive */
+ jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl,
+ & entropy->dc_derived_tbls[dctbl]);
+ jpeg_make_d_derived_tbl(cinfo, FALSE, actbl,
+ & entropy->ac_derived_tbls[actbl]);
+ /* Initialize DC predictions to 0 */
+ entropy->saved.last_dc_val[ci] = 0;
+ }
+
+ /* Precalculate decoding info for each block in an MCU of this scan */
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ ci = cinfo->MCU_membership[blkn];
+ compptr = cinfo->cur_comp_info[ci];
+ /* Precalculate which table to use for each block */
+ entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no];
+ entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no];
+ /* Decide whether we really care about the coefficient values */
+ if (compptr->component_needed) {
+ ci = compptr->DCT_v_scaled_size;
+ if (ci <= 0 || ci > 8) ci = 8;
+ i = compptr->DCT_h_scaled_size;
+ if (i <= 0 || i > 8) i = 8;
+ entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order[ci - 1][i - 1];
+ } else {
+ entropy->coef_limit[blkn] = 0;
+ }
+ }
+ }
+
+ /* Initialize bitread state variables */
+ entropy->bitstate.bits_left = 0;
+ entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
+ entropy->pub.insufficient_data = FALSE;
+
+ /* Initialize restart counter */
+ entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
* Module initialization routine for Huffman entropy decoding.
*/
@@ -642,11 +1284,27 @@ jinit_huff_decoder (j_decompress_ptr cinfo)
SIZEOF(huff_entropy_decoder));
cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
entropy->pub.start_pass = start_pass_huff_decoder;
- entropy->pub.decode_mcu = decode_mcu;
- /* Mark tables unallocated */
- for (i = 0; i < NUM_HUFF_TBLS; i++) {
- entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
+ if (cinfo->progressive_mode) {
+ /* Create progression status table */
+ int *coef_bit_ptr, ci;
+ cinfo->coef_bits = (int (*)[DCTSIZE2])
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ cinfo->num_components*DCTSIZE2*SIZEOF(int));
+ coef_bit_ptr = & cinfo->coef_bits[0][0];
+ for (ci = 0; ci < cinfo->num_components; ci++)
+ for (i = 0; i < DCTSIZE2; i++)
+ *coef_bit_ptr++ = -1;
+
+ /* Mark derived tables unallocated */
+ for (i = 0; i < NUM_HUFF_TBLS; i++) {
+ entropy->derived_tbls[i] = NULL;
+ }
+ } else {
+ /* Mark tables unallocated */
+ for (i = 0; i < NUM_HUFF_TBLS; i++) {
+ entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
+ }
}
}
diff --git a/src/libjpeg/jdinput.c b/src/libjpeg/jdinput.c
index 0c2ac8f..08a9b37 100644
--- a/src/libjpeg/jdinput.c
+++ b/src/libjpeg/jdinput.c
@@ -2,13 +2,14 @@
* jdinput.c
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains input control logic for the JPEG decompressor.
* These routines are concerned with controlling the decompressor's input
* processing (marker reading and coefficient decoding). The actual input
- * reading is done in jdmarker.c, jdhuff.c, and jdphuff.c.
+ * reading is done in jdmarker.c, jdhuff.c, and jdarith.c.
*/
#define JPEG_INTERNALS
@@ -74,12 +75,14 @@ initial_setup (j_decompress_ptr cinfo)
* In the full decompressor, this will be overridden by jdmaster.c;
* but in the transcoder, jdmaster.c is not used, so we must do it here.
*/
- cinfo->min_DCT_scaled_size = DCTSIZE;
+ cinfo->min_DCT_h_scaled_size = DCTSIZE;
+ cinfo->min_DCT_v_scaled_size = DCTSIZE;
/* Compute dimensions of components */
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- compptr->DCT_scaled_size = DCTSIZE;
+ compptr->DCT_h_scaled_size = DCTSIZE;
+ compptr->DCT_v_scaled_size = DCTSIZE;
/* Size in DCT blocks */
compptr->width_in_blocks = (JDIMENSION)
jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
@@ -138,7 +141,7 @@ per_scan_setup (j_decompress_ptr cinfo)
compptr->MCU_width = 1;
compptr->MCU_height = 1;
compptr->MCU_blocks = 1;
- compptr->MCU_sample_width = compptr->DCT_scaled_size;
+ compptr->MCU_sample_width = compptr->DCT_h_scaled_size;
compptr->last_col_width = 1;
/* For noninterleaved scans, it is convenient to define last_row_height
* as the number of block rows present in the last iMCU row.
@@ -174,7 +177,7 @@ per_scan_setup (j_decompress_ptr cinfo)
compptr->MCU_width = compptr->h_samp_factor;
compptr->MCU_height = compptr->v_samp_factor;
compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
- compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_scaled_size;
+ compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size;
/* Figure number of non-dummy blocks in last MCU column & row */
tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
if (tmp == 0) tmp = compptr->MCU_width;
diff --git a/src/libjpeg/jdmainct.c b/src/libjpeg/jdmainct.c
index 13c956f..02723ca 100644
--- a/src/libjpeg/jdmainct.c
+++ b/src/libjpeg/jdmainct.c
@@ -161,7 +161,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
{
my_main_ptr main = (my_main_ptr) cinfo->main;
int ci, rgroup;
- int M = cinfo->min_DCT_scaled_size;
+ int M = cinfo->min_DCT_v_scaled_size;
jpeg_component_info *compptr;
JSAMPARRAY xbuf;
@@ -175,8 +175,8 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
- cinfo->min_DCT_scaled_size; /* height of a row group of component */
+ rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+ cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
/* Get space for pointer lists --- M+4 row groups in each list.
* We alloc both pointer lists with one call to save a few cycles.
*/
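
A worked example of the rgroup arithmetic for a full-size 4:2:0 decode, assuming the IDCT-scaling selection made in jdmaster.c (chroma decoded with a 16x16 IDCT when fancy upsampling is on):

/* Worked example: min_DCT_v_scaled_size == 8.  Luma (v_samp_factor 2,
 * DCT_v_scaled_size 8) gives rgroup = (2*8)/8 == 2 sample rows; chroma
 * (v_samp_factor 1, DCT_v_scaled_size 16) gives rgroup = (1*16)/8 == 2 as
 * well, so each component contributes two sample rows per row group here.
 */
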
@@ -202,14 +202,14 @@ make_funny_pointers (j_decompress_ptr cinfo)
{
my_main_ptr main = (my_main_ptr) cinfo->main;
int ci, i, rgroup;
- int M = cinfo->min_DCT_scaled_size;
+ int M = cinfo->min_DCT_v_scaled_size;
jpeg_component_info *compptr;
JSAMPARRAY buf, xbuf0, xbuf1;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
- cinfo->min_DCT_scaled_size; /* height of a row group of component */
+ rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+ cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
xbuf0 = main->xbuffer[0][ci];
xbuf1 = main->xbuffer[1][ci];
/* First copy the workspace pointers as-is */
@@ -242,14 +242,14 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
{
my_main_ptr main = (my_main_ptr) cinfo->main;
int ci, i, rgroup;
- int M = cinfo->min_DCT_scaled_size;
+ int M = cinfo->min_DCT_v_scaled_size;
jpeg_component_info *compptr;
JSAMPARRAY xbuf0, xbuf1;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
- cinfo->min_DCT_scaled_size; /* height of a row group of component */
+ rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+ cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
xbuf0 = main->xbuffer[0][ci];
xbuf1 = main->xbuffer[1][ci];
for (i = 0; i < rgroup; i++) {
@@ -277,8 +277,8 @@ set_bottom_pointers (j_decompress_ptr cinfo)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Count sample rows in one iMCU row and in one row group */
- iMCUheight = compptr->v_samp_factor * compptr->DCT_scaled_size;
- rgroup = iMCUheight / cinfo->min_DCT_scaled_size;
+ iMCUheight = compptr->v_samp_factor * compptr->DCT_v_scaled_size;
+ rgroup = iMCUheight / cinfo->min_DCT_v_scaled_size;
/* Count nondummy sample rows remaining for this component */
rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
if (rows_left == 0) rows_left = iMCUheight;
@@ -357,7 +357,7 @@ process_data_simple_main (j_decompress_ptr cinfo,
}
/* There are always min_DCT_scaled_size row groups in an iMCU row. */
- rowgroups_avail = (JDIMENSION) cinfo->min_DCT_scaled_size;
+ rowgroups_avail = (JDIMENSION) cinfo->min_DCT_v_scaled_size;
/* Note: at the bottom of the image, we may pass extra garbage row groups
* to the postprocessor. The postprocessor has to check for bottom
* of image anyway (at row resolution), so no point in us doing it too.
@@ -417,7 +417,7 @@ process_data_context_main (j_decompress_ptr cinfo,
case CTX_PREPARE_FOR_IMCU:
/* Prepare to process first M-1 row groups of this iMCU row */
main->rowgroup_ctr = 0;
- main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size - 1);
+ main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size - 1);
/* Check for bottom of image: if so, tweak pointers to "duplicate"
* the last sample row, and adjust rowgroups_avail to ignore padding rows.
*/
@@ -440,8 +440,8 @@ process_data_context_main (j_decompress_ptr cinfo,
main->buffer_full = FALSE;
/* Still need to process last row group of this iMCU row, */
/* which is saved at index M+1 of the other xbuffer */
- main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_scaled_size + 1);
- main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size + 2);
+ main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 1);
+ main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 2);
main->context_state = CTX_POSTPONED_ROW;
}
}
@@ -492,21 +492,21 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
* ngroups is the number of row groups we need.
*/
if (cinfo->upsample->need_context_rows) {
- if (cinfo->min_DCT_scaled_size < 2) /* unsupported, see comments above */
+ if (cinfo->min_DCT_v_scaled_size < 2) /* unsupported, see comments above */
ERREXIT(cinfo, JERR_NOTIMPL);
alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */
- ngroups = cinfo->min_DCT_scaled_size + 2;
+ ngroups = cinfo->min_DCT_v_scaled_size + 2;
} else {
- ngroups = cinfo->min_DCT_scaled_size;
+ ngroups = cinfo->min_DCT_v_scaled_size;
}
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
- cinfo->min_DCT_scaled_size; /* height of a row group of component */
+ rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+ cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
main->buffer[ci] = (*cinfo->mem->alloc_sarray)
((j_common_ptr) cinfo, JPOOL_IMAGE,
- compptr->width_in_blocks * compptr->DCT_scaled_size,
+ compptr->width_in_blocks * compptr->DCT_h_scaled_size,
(JDIMENSION) (rgroup * ngroups));
}
}
diff --git a/src/libjpeg/jdmaster.c b/src/libjpeg/jdmaster.c
index 2802c5b..04f8312 100644
--- a/src/libjpeg/jdmaster.c
+++ b/src/libjpeg/jdmaster.c
@@ -2,6 +2,7 @@
* jdmaster.c
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2008 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -61,9 +62,12 @@ use_merged_upsample (j_decompress_ptr cinfo)
cinfo->comp_info[2].v_samp_factor != 1)
return FALSE;
/* furthermore, it doesn't work if we've scaled the IDCTs differently */
- if (cinfo->comp_info[0].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
- cinfo->comp_info[1].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
- cinfo->comp_info[2].DCT_scaled_size != cinfo->min_DCT_scaled_size)
+ if (cinfo->comp_info[0].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+ cinfo->comp_info[1].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+ cinfo->comp_info[2].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+ cinfo->comp_info[0].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size ||
+ cinfo->comp_info[1].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size ||
+ cinfo->comp_info[2].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size)
return FALSE;
/* ??? also need to test for upsample-time rescaling, when & if supported */
return TRUE; /* by golly, it'll work... */
@@ -102,43 +106,152 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
jdiv_round_up((long) cinfo->image_width, 8L);
cinfo->output_height = (JDIMENSION)
jdiv_round_up((long) cinfo->image_height, 8L);
- cinfo->min_DCT_scaled_size = 1;
+ cinfo->min_DCT_h_scaled_size = 1;
+ cinfo->min_DCT_v_scaled_size = 1;
} else if (cinfo->scale_num * 4 <= cinfo->scale_denom) {
/* Provide 1/4 scaling */
cinfo->output_width = (JDIMENSION)
jdiv_round_up((long) cinfo->image_width, 4L);
cinfo->output_height = (JDIMENSION)
jdiv_round_up((long) cinfo->image_height, 4L);
- cinfo->min_DCT_scaled_size = 2;
+ cinfo->min_DCT_h_scaled_size = 2;
+ cinfo->min_DCT_v_scaled_size = 2;
+ } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 3) {
+ /* Provide 3/8 scaling */
+ cinfo->output_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 3L, 8L);
+ cinfo->output_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 3L, 8L);
+ cinfo->min_DCT_h_scaled_size = 3;
+ cinfo->min_DCT_v_scaled_size = 3;
} else if (cinfo->scale_num * 2 <= cinfo->scale_denom) {
/* Provide 1/2 scaling */
cinfo->output_width = (JDIMENSION)
jdiv_round_up((long) cinfo->image_width, 2L);
cinfo->output_height = (JDIMENSION)
jdiv_round_up((long) cinfo->image_height, 2L);
- cinfo->min_DCT_scaled_size = 4;
- } else {
+ cinfo->min_DCT_h_scaled_size = 4;
+ cinfo->min_DCT_v_scaled_size = 4;
+ } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 5) {
+ /* Provide 5/8 scaling */
+ cinfo->output_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 5L, 8L);
+ cinfo->output_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 5L, 8L);
+ cinfo->min_DCT_h_scaled_size = 5;
+ cinfo->min_DCT_v_scaled_size = 5;
+ } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 3) {
+ /* Provide 3/4 scaling */
+ cinfo->output_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 3L, 4L);
+ cinfo->output_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 3L, 4L);
+ cinfo->min_DCT_h_scaled_size = 6;
+ cinfo->min_DCT_v_scaled_size = 6;
+ } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 7) {
+ /* Provide 7/8 scaling */
+ cinfo->output_width = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 7L, 8L);
+ cinfo->output_height = (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 7L, 8L);
+ cinfo->min_DCT_h_scaled_size = 7;
+ cinfo->min_DCT_v_scaled_size = 7;
+ } else if (cinfo->scale_num <= cinfo->scale_denom) {
/* Provide 1/1 scaling */
cinfo->output_width = cinfo->image_width;
cinfo->output_height = cinfo->image_height;
- cinfo->min_DCT_scaled_size = DCTSIZE;
+ cinfo->min_DCT_h_scaled_size = DCTSIZE;
+ cinfo->min_DCT_v_scaled_size = DCTSIZE;
+ } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 9) {
+ /* Provide 9/8 scaling */
+ cinfo->output_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width, 8L);
+ cinfo->output_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height, 8L);
+ cinfo->min_DCT_h_scaled_size = 9;
+ cinfo->min_DCT_v_scaled_size = 9;
+ } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 5) {
+ /* Provide 5/4 scaling */
+ cinfo->output_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width, 4L);
+ cinfo->output_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height, 4L);
+ cinfo->min_DCT_h_scaled_size = 10;
+ cinfo->min_DCT_v_scaled_size = 10;
+ } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 11) {
+ /* Provide 11/8 scaling */
+ cinfo->output_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 3L, 8L);
+ cinfo->output_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 3L, 8L);
+ cinfo->min_DCT_h_scaled_size = 11;
+ cinfo->min_DCT_v_scaled_size = 11;
+ } else if (cinfo->scale_num * 2 <= cinfo->scale_denom * 3) {
+ /* Provide 3/2 scaling */
+ cinfo->output_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width, 2L);
+ cinfo->output_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height, 2L);
+ cinfo->min_DCT_h_scaled_size = 12;
+ cinfo->min_DCT_v_scaled_size = 12;
+ } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 13) {
+ /* Provide 13/8 scaling */
+ cinfo->output_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 5L, 8L);
+ cinfo->output_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 5L, 8L);
+ cinfo->min_DCT_h_scaled_size = 13;
+ cinfo->min_DCT_v_scaled_size = 13;
+ } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 7) {
+ /* Provide 7/4 scaling */
+ cinfo->output_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 3L, 4L);
+ cinfo->output_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 3L, 4L);
+ cinfo->min_DCT_h_scaled_size = 14;
+ cinfo->min_DCT_v_scaled_size = 14;
+ } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 15) {
+ /* Provide 15/8 scaling */
+ cinfo->output_width = cinfo->image_width + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_width * 7L, 8L);
+ cinfo->output_height = cinfo->image_height + (JDIMENSION)
+ jdiv_round_up((long) cinfo->image_height * 7L, 8L);
+ cinfo->min_DCT_h_scaled_size = 15;
+ cinfo->min_DCT_v_scaled_size = 15;
+ } else {
+ /* Provide 2/1 scaling */
+ cinfo->output_width = cinfo->image_width << 1;
+ cinfo->output_height = cinfo->image_height << 1;
+ cinfo->min_DCT_h_scaled_size = 16;
+ cinfo->min_DCT_v_scaled_size = 16;
}
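
In effect the ladder rounds any requested ratio up to the next supported N/8 factor, N = 1..16. A worked example (scale_num and scale_denom are the standard fields of the decompress struct):

/* Worked example: scale_num == 1, scale_denom == 3 requests 1/3 scaling.
 * The 1/8 test (1*8 <= 3) and the 1/4 test (1*4 <= 3) fail, but the 3/8
 * test (1*8 <= 3*3) succeeds, so min_DCT_h/v_scaled_size become 3 and a
 * 100x100 image comes out as jdiv_round_up(100*3, 8) == 38 pixels square.
 */
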
/* In selecting the actual DCT scaling for each component, we try to
* scale up the chroma components via IDCT scaling rather than upsampling.
* This saves time if the upsampler gets to use 1:1 scaling.
- * Note this code assumes that the supported DCT scalings are powers of 2.
+ * Note this code adapts subsampling ratios which are powers of 2.
*/
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- int ssize = cinfo->min_DCT_scaled_size;
- while (ssize < DCTSIZE &&
- (compptr->h_samp_factor * ssize * 2 <=
- cinfo->max_h_samp_factor * cinfo->min_DCT_scaled_size) &&
- (compptr->v_samp_factor * ssize * 2 <=
- cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size)) {
+ int ssize = 1;
+ while (cinfo->min_DCT_h_scaled_size * ssize <=
+ (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
+ (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) {
+ ssize = ssize * 2;
+ }
+ compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize;
+ ssize = 1;
+ while (cinfo->min_DCT_v_scaled_size * ssize <=
+ (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
+ (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) {
ssize = ssize * 2;
}
- compptr->DCT_scaled_size = ssize;
+ compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize;
+
+ /* We don't support IDCT ratios larger than 2. */
+ if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2)
+ compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2;
+ else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2)
+ compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2;
}
/* Recompute downsampled dimensions of components;
@@ -149,11 +262,11 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
/* Size in samples, after IDCT scaling */
compptr->downsampled_width = (JDIMENSION)
jdiv_round_up((long) cinfo->image_width *
- (long) (compptr->h_samp_factor * compptr->DCT_scaled_size),
+ (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
(long) (cinfo->max_h_samp_factor * DCTSIZE));
compptr->downsampled_height = (JDIMENSION)
jdiv_round_up((long) cinfo->image_height *
- (long) (compptr->v_samp_factor * compptr->DCT_scaled_size),
+ (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size),
(long) (cinfo->max_v_samp_factor * DCTSIZE));
}
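
For reference, the ratio tests above pick the smallest supported DCT output size N in 8..16 with N/8 >= scale_num/scale_denom and then derive the output dimensions by ceiling division. Below is a standalone sketch, not part of the patch, that reproduces the upscaling half of that selection for a made-up 100-pixel-wide image and a requested scale of 4/3 (the downscaling cases N < 8 sit above this excerpt).

/* Standalone sketch (not part of the patch): the N/8 upscaling selection
 * above, with jdiv_round_up() modeled as a plain ceiling division and
 * made-up input values.
 */
#include <stdio.h>

static long ceil_div (long a, long b)
{
  return (a + b - 1L) / b;              /* same result as jdiv_round_up() */
}

int main (void)
{
  long scale_num = 4, scale_denom = 3;  /* requested scale, e.g. 4/3 */
  long image_width = 100;
  int N;

  /* Smallest N in 8..16 with scale_num * 8 <= scale_denom * N,
   * i.e. N/8 >= scale_num/scale_denom (same test as in the patch). */
  for (N = 8; N < 16; N++)
    if (scale_num * 8 <= scale_denom * N)
      break;

  /* output_width = ceil(image_width * N / 8), written as in the patch:
   * image_width + ceil(image_width * (N - 8) / 8). */
  printf("DCT size %d, output width %ld\n",
         N, image_width + ceil_div(image_width * (N - 8), 8L));
  /* Prints: DCT size 11, output width 138 (4/3 is rounded up to 11/8). */
  return 0;
}
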
@@ -373,16 +486,9 @@ master_selection (j_decompress_ptr cinfo)
jinit_inverse_dct(cinfo);
/* Entropy decoding: either Huffman or arithmetic coding. */
if (cinfo->arith_code) {
- ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+ jinit_arith_decoder(cinfo);
} else {
- if (cinfo->progressive_mode) {
-#ifdef D_PROGRESSIVE_SUPPORTED
- jinit_phuff_decoder(cinfo);
-#else
- ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
- } else
- jinit_huff_decoder(cinfo);
+ jinit_huff_decoder(cinfo);
}
/* Initialize principal buffer controllers. */
diff --git a/src/libjpeg/jdsample.c b/src/libjpeg/jdsample.c
index 80ffefb..7bc8885 100644
--- a/src/libjpeg/jdsample.c
+++ b/src/libjpeg/jdsample.c
@@ -2,13 +2,14 @@
* jdsample.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2002-2008 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains upsampling routines.
*
* Upsampling input data is counted in "row groups". A row group
- * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size)
+ * is defined to be (v_samp_factor * DCT_v_scaled_size / min_DCT_v_scaled_size)
* sample rows of each component. Upsampling will normally produce
* max_v_samp_factor pixel rows from each row group (but this could vary
* if the upsampler is applying a scale factor of its own).
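
Taken together with the per-axis DCT_v_scaled_size computed in jdmaster.c, this row-group definition is what lets the decoder enlarge the chroma planes inside the IDCT instead of in the upsampler. A standalone sketch, not part of the patch, with assumed 4:2:0 sampling factors decoded at 1/1 scale; the chroma DCT_v_scaled_size of 16 follows from the component loop shown earlier.

/* Standalone sketch (not part of the patch): row-group arithmetic for an
 * assumed 4:2:0 image decoded at 1/1 scale with fancy upsampling enabled.
 * The chroma components get DCT_v_scaled_size = 16 from the component
 * loop in jpeg_calc_output_dimensions, so their row groups already match
 * the output row count and only fullsize upsampling is needed.
 */
#include <stdio.h>

int main (void)
{
  int max_v_samp_factor = 2, min_DCT_v_scaled_size = 8;
  /* component:                Y   Cb  Cr */
  int v_samp_factor[3]     = { 2,  1,  1 };
  int DCT_v_scaled_size[3] = { 8, 16, 16 };   /* chroma doubled via IDCT */
  int ci;

  for (ci = 0; ci < 3; ci++) {
    int rowgroup = v_samp_factor[ci] * DCT_v_scaled_size[ci] /
                   min_DCT_v_scaled_size;
    printf("component %d: %d rows in per group, %d rows out\n",
           ci, rowgroup, max_v_samp_factor);   /* 2 in, 2 out for all three */
  }
  return 0;
}
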
@@ -237,11 +238,11 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
register JSAMPROW inptr, outptr;
register JSAMPLE invalue;
JSAMPROW outend;
- int inrow;
+ int outrow;
- for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
- inptr = input_data[inrow];
- outptr = output_data[inrow];
+ for (outrow = 0; outrow < cinfo->max_v_samp_factor; outrow++) {
+ inptr = input_data[outrow];
+ outptr = output_data[outrow];
outend = outptr + cinfo->output_width;
while (outptr < outend) {
invalue = *inptr++; /* don't need GETJSAMPLE() here */
@@ -286,112 +287,6 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
/*
- * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
- *
- * The upsampling algorithm is linear interpolation between pixel centers,
- * also known as a "triangle filter". This is a good compromise between
- * speed and visual quality. The centers of the output pixels are 1/4 and 3/4
- * of the way between input pixel centers.
- *
- * A note about the "bias" calculations: when rounding fractional values to
- * integer, we do not want to always round 0.5 up to the next integer.
- * If we did that, we'd introduce a noticeable bias towards larger values.
- * Instead, this code is arranged so that 0.5 will be rounded up or down at
- * alternate pixel locations (a simple ordered dither pattern).
- */
-
-METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
- JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- register JSAMPROW inptr, outptr;
- register int invalue;
- register JDIMENSION colctr;
- int inrow;
-
- for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
- inptr = input_data[inrow];
- outptr = output_data[inrow];
- /* Special case for first column */
- invalue = GETJSAMPLE(*inptr++);
- *outptr++ = (JSAMPLE) invalue;
- *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
-
- for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
- /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
- invalue = GETJSAMPLE(*inptr++) * 3;
- *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
- *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
- }
-
- /* Special case for last column */
- invalue = GETJSAMPLE(*inptr);
- *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
- *outptr++ = (JSAMPLE) invalue;
- }
-}
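
The deleted h2v1_fancy_upsample above is the triangle filter with the alternating rounding bias described in its header comment: the two outputs derived from one input pixel add +1 and +2 respectively before the final >>2, so an exact half rounds down at one location and up at the next. A standalone check of that behaviour, not part of the patch, with made-up sample values:

/* Standalone sketch (not part of the patch): the 3/4 + 1/4 interpolation
 * with the alternating +1/+2 bias used by the removed h2v1_fancy_upsample.
 * Sample values are made up; the exact interpolated value here is 100.5.
 */
#include <stdio.h>

int main (void)
{
  int nearer = 100, further = 102;              /* neighbouring input samples */
  int out_a = (nearer * 3 + further + 1) >> 2;  /* bias +1: the half rounds down */
  int out_b = (nearer * 3 + further + 2) >> 2;  /* bias +2: the half rounds up */
  printf("%d %d\n", out_a, out_b);              /* prints: 100 101 */
  return 0;
}
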
-
-
-/*
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- * Again a triangle filter; see comments for h2v1 case, above.
- *
- * It is OK for us to reference the adjacent input rows because we demanded
- * context from the main buffer controller (see initialization code).
- */
-
-METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
- JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- register JSAMPROW inptr0, inptr1, outptr;
-#if BITS_IN_JSAMPLE == 8
- register int thiscolsum, lastcolsum, nextcolsum;
-#else
- register INT32 thiscolsum, lastcolsum, nextcolsum;
-#endif
- register JDIMENSION colctr;
- int inrow, outrow, v;
-
- inrow = outrow = 0;
- while (outrow < cinfo->max_v_samp_factor) {
- for (v = 0; v < 2; v++) {
- /* inptr0 points to nearest input row, inptr1 points to next nearest */
- inptr0 = input_data[inrow];
- if (v == 0) /* next nearest is row above */
- inptr1 = input_data[inrow-1];
- else /* next nearest is row below */
- inptr1 = input_data[inrow+1];
- outptr = output_data[outrow++];
-
- /* Special case for first column */
- thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
- lastcolsum = thiscolsum; thiscolsum = nextcolsum;
-
- for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
- /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
- /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
- nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
- lastcolsum = thiscolsum; thiscolsum = nextcolsum;
- }
-
- /* Special case for last column */
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4);
- }
- inrow++;
- }
-}
-
-
-/*
* Module initialization routine for upsampling.
*/
@@ -401,7 +296,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
my_upsample_ptr upsample;
int ci;
jpeg_component_info * compptr;
- boolean need_buffer, do_fancy;
+ boolean need_buffer;
int h_in_group, v_in_group, h_out_group, v_out_group;
upsample = (my_upsample_ptr)
@@ -415,11 +310,6 @@ jinit_upsampler (j_decompress_ptr cinfo)
if (cinfo->CCIR601_sampling) /* this isn't supported */
ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
- /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1,
- * so don't ask for it.
- */
- do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_scaled_size > 1;
-
/* Verify we can handle the sampling factors, select per-component methods,
* and create storage as needed.
*/
@@ -428,10 +318,10 @@ jinit_upsampler (j_decompress_ptr cinfo)
/* Compute size of an "input group" after IDCT scaling. This many samples
* are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
*/
- h_in_group = (compptr->h_samp_factor * compptr->DCT_scaled_size) /
- cinfo->min_DCT_scaled_size;
- v_in_group = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
- cinfo->min_DCT_scaled_size;
+ h_in_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) /
+ cinfo->min_DCT_h_scaled_size;
+ v_in_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+ cinfo->min_DCT_v_scaled_size;
h_out_group = cinfo->max_h_samp_factor;
v_out_group = cinfo->max_v_samp_factor;
upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
@@ -446,19 +336,12 @@ jinit_upsampler (j_decompress_ptr cinfo)
need_buffer = FALSE;
} else if (h_in_group * 2 == h_out_group &&
v_in_group == v_out_group) {
- /* Special cases for 2h1v upsampling */
- if (do_fancy && compptr->downsampled_width > 2)
- upsample->methods[ci] = h2v1_fancy_upsample;
- else
- upsample->methods[ci] = h2v1_upsample;
+ /* Special case for 2h1v upsampling */
+ upsample->methods[ci] = h2v1_upsample;
} else if (h_in_group * 2 == h_out_group &&
v_in_group * 2 == v_out_group) {
- /* Special cases for 2h2v upsampling */
- if (do_fancy && compptr->downsampled_width > 2) {
- upsample->methods[ci] = h2v2_fancy_upsample;
- upsample->pub.need_context_rows = TRUE;
- } else
- upsample->methods[ci] = h2v2_upsample;
+ /* Special case for 2h2v upsampling */
+ upsample->methods[ci] = h2v2_upsample;
} else if ((h_out_group % h_in_group) == 0 &&
(v_out_group % v_in_group) == 0) {
/* Generic integral-factors upsampling method */
diff --git a/src/libjpeg/jdtrans.c b/src/libjpeg/jdtrans.c
index 6c0ab71..385c336 100644
--- a/src/libjpeg/jdtrans.c
+++ b/src/libjpeg/jdtrans.c
@@ -100,17 +100,10 @@ transdecode_master_selection (j_decompress_ptr cinfo)
cinfo->buffered_image = TRUE;
/* Entropy decoding: either Huffman or arithmetic coding. */
- if (cinfo->arith_code) {
- ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
- } else {
- if (cinfo->progressive_mode) {
-#ifdef D_PROGRESSIVE_SUPPORTED
- jinit_phuff_decoder(cinfo);
-#else
- ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
- } else
- jinit_huff_decoder(cinfo);
+ if (cinfo->arith_code)
+ jinit_arith_decoder(cinfo);
+ else {
+ jinit_huff_decoder(cinfo);
}
/* Always get a full-image coefficient buffer. */
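
With this hunk (and the matching one in jdmaster.c above) the decoder hands arithmetic-coded scans to jinit_arith_decoder instead of failing with JERR_ARITH_NOTIMPL; cinfo->arith_code is filled in from the datastream during header parsing, so decompression needs no application changes. On the compression side, the sketch below, not part of the patch, shows how arithmetic coding would be requested now that jcarith.c and jaricom.c are in the build; destination setup, image I/O and error handling are omitted.

/* Sketch (not part of the patch): selecting arithmetic entropy coding on
 * the compression side.  The only relevant line is the arith_code flag.
 */
#include <stdio.h>
#include "jpeglib.h"

void request_arithmetic_coding (j_compress_ptr cinfo)
{
  jpeg_set_defaults(cinfo);     /* defaults select Huffman coding */
  cinfo->arith_code = TRUE;     /* switch this image to arithmetic coding */
}
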
diff --git a/src/libjpeg/jerror.h b/src/libjpeg/jerror.h
index fc2fffe..1cfb2b1 100644
--- a/src/libjpeg/jerror.h
+++ b/src/libjpeg/jerror.h
@@ -2,6 +2,7 @@
* jerror.h
*
* Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -39,14 +40,15 @@ typedef enum {
JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
/* For maintenance convenience, list is alphabetical by message code name */
-JMESSAGE(JERR_ARITH_NOTIMPL,
- "Sorry, there are legal restrictions on arithmetic coding")
JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
-JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
+JMESSAGE(JERR_BAD_DCTSIZE, "DCT scaled block size %dx%d not supported")
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+ "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace")
JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace")
@@ -93,6 +95,7 @@ JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
@@ -170,6 +173,7 @@ JMESSAGE(JTRC_UNKNOWN_IDS,
JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
JMESSAGE(JWRN_BOGUS_PROGRESSION,
"Inconsistent progression sequence for component %d coefficient %d")
JMESSAGE(JWRN_EXTRANEOUS_DATA,
@@ -227,6 +231,15 @@ JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines")
(cinfo)->err->msg_parm.i[2] = (p3), \
(cinfo)->err->msg_parm.i[3] = (p4), \
(*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXIT6(cinfo,code,p1,p2,p3,p4,p5,p6) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (cinfo)->err->msg_parm.i[2] = (p3), \
+ (cinfo)->err->msg_parm.i[3] = (p4), \
+ (cinfo)->err->msg_parm.i[4] = (p5), \
+ (cinfo)->err->msg_parm.i[5] = (p6), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
#define ERREXITS(cinfo,code,str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
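
The new ERREXIT6 macro exists because JERR_BAD_DROP_SAMPLING carries six message parameters; like its smaller siblings it is a single comma expression that stores the parameters in msg_parm.i[] and then jumps to error_exit. A reduced standalone sketch of that pattern, not part of the patch and using hypothetical toy_* names so it compiles without the library:

/* Standalone sketch (not part of the patch): the comma-expression pattern
 * behind ERREXIT6, reduced to two parameters and hypothetical names.
 */
#include <stdio.h>

struct toy_err {
  int code;
  int parm[6];
  void (*error_exit) (struct toy_err * err);
};

#define TOY_ERREXIT2(err,c,p1,p2) \
  ((err)->code = (c), \
   (err)->parm[0] = (p1), \
   (err)->parm[1] = (p2), \
   (*(err)->error_exit) (err))

static void toy_exit (struct toy_err * err)
{
  printf("error %d: %d %d\n", err->code, err->parm[0], err->parm[1]);
}

int main (void)
{
  struct toy_err err = { 0, { 0 }, toy_exit };
  TOY_ERREXIT2(&err, 42, 3, 7);   /* prints: error 42: 3 7 */
  return 0;
}
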
diff --git a/src/libjpeg/jfdctflt.c b/src/libjpeg/jfdctflt.c
index 79d7a00..74d0d86 100644
--- a/src/libjpeg/jfdctflt.c
+++ b/src/libjpeg/jfdctflt.c
@@ -2,6 +2,7 @@
* jfdctflt.c
*
* Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -56,41 +57,46 @@
*/
GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT * data)
+jpeg_fdct_float (FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
FAST_FLOAT z1, z2, z3, z4, z5, z11, z13;
FAST_FLOAT *dataptr;
+ JSAMPROW elemptr;
int ctr;
/* Pass 1: process rows. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[0] + dataptr[7];
- tmp7 = dataptr[0] - dataptr[7];
- tmp1 = dataptr[1] + dataptr[6];
- tmp6 = dataptr[1] - dataptr[6];
- tmp2 = dataptr[2] + dataptr[5];
- tmp5 = dataptr[2] - dataptr[5];
- tmp3 = dataptr[3] + dataptr[4];
- tmp4 = dataptr[3] - dataptr[4];
-
+ for (ctr = 0; ctr < DCTSIZE; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Load data into workspace */
+ tmp0 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]));
+ tmp7 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]));
+ tmp1 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]));
+ tmp6 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]));
+ tmp2 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]));
+ tmp5 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]));
+ tmp3 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]));
+ tmp4 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]));
+
/* Even part */
-
+
tmp10 = tmp0 + tmp3; /* phase 2 */
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
-
- dataptr[0] = tmp10 + tmp11; /* phase 3 */
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */
dataptr[4] = tmp10 - tmp11;
-
+
z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
dataptr[2] = tmp13 + z1; /* phase 5 */
dataptr[6] = tmp13 - z1;
-
+
/* Odd part */
tmp10 = tmp4 + tmp5; /* phase 2 */
@@ -126,21 +132,21 @@ jpeg_fdct_float (FAST_FLOAT * data)
tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-
+
/* Even part */
-
+
tmp10 = tmp0 + tmp3; /* phase 2 */
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
-
+
dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
dataptr[DCTSIZE*4] = tmp10 - tmp11;
-
+
z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
dataptr[DCTSIZE*6] = tmp13 - z1;
-
+
/* Odd part */
tmp10 = tmp4 + tmp5; /* phase 2 */
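
The signature change above applies to all three FDCT flavours in this patch: they now read the samples themselves via sample_data/start_col and fold the unsigned-to-signed level shift (the 8 * CENTERJSAMPLE term) into the DC path of pass 1, rather than having jcdctmgr.c pre-load a level-shifted workspace. A hedged sketch of a call under the new signature follows; the buffer name and wrapper function are hypothetical.

/* Sketch (not part of the patch): invoking the float FDCT under its new
 * signature.  "sample_rows" is a hypothetical JSAMPARRAY with at least
 * 8 rows of valid data starting at column start_col; the level shift now
 * happens inside the transform.
 */
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"

void fdct_one_float_block (JSAMPARRAY sample_rows, JDIMENSION start_col)
{
  FAST_FLOAT block[DCTSIZE2];   /* receives the 64 output coefficients */
  jpeg_fdct_float(block, sample_rows, start_col);
  /* block[] would normally be quantized next, as jcdctmgr.c does. */
}
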
diff --git a/src/libjpeg/jfdctfst.c b/src/libjpeg/jfdctfst.c
index ccb378a..8cad5f2 100644
--- a/src/libjpeg/jfdctfst.c
+++ b/src/libjpeg/jfdctfst.c
@@ -2,6 +2,7 @@
* jfdctfst.c
*
* Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -111,42 +112,47 @@
*/
GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM * data)
+jpeg_fdct_ifast (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
DCTELEM tmp10, tmp11, tmp12, tmp13;
DCTELEM z1, z2, z3, z4, z5, z11, z13;
DCTELEM *dataptr;
+ JSAMPROW elemptr;
int ctr;
SHIFT_TEMPS
/* Pass 1: process rows. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[0] + dataptr[7];
- tmp7 = dataptr[0] - dataptr[7];
- tmp1 = dataptr[1] + dataptr[6];
- tmp6 = dataptr[1] - dataptr[6];
- tmp2 = dataptr[2] + dataptr[5];
- tmp5 = dataptr[2] - dataptr[5];
- tmp3 = dataptr[3] + dataptr[4];
- tmp4 = dataptr[3] - dataptr[4];
-
+ for (ctr = 0; ctr < DCTSIZE; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Load data into workspace */
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+ tmp7 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+ tmp6 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+ tmp5 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+ tmp4 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
/* Even part */
-
+
tmp10 = tmp0 + tmp3; /* phase 2 */
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
-
- dataptr[0] = tmp10 + tmp11; /* phase 3 */
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */
dataptr[4] = tmp10 - tmp11;
-
+
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
dataptr[2] = tmp13 + z1; /* phase 5 */
dataptr[6] = tmp13 - z1;
-
+
/* Odd part */
tmp10 = tmp4 + tmp5; /* phase 2 */
@@ -182,21 +188,21 @@ jpeg_fdct_ifast (DCTELEM * data)
tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-
+
/* Even part */
-
+
tmp10 = tmp0 + tmp3; /* phase 2 */
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
-
+
dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
dataptr[DCTSIZE*4] = tmp10 - tmp11;
-
+
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
dataptr[DCTSIZE*6] = tmp13 - z1;
-
+
/* Odd part */
tmp10 = tmp4 + tmp5; /* phase 2 */
diff --git a/src/libjpeg/jfdctint.c b/src/libjpeg/jfdctint.c
index 0a78b64..1dde58c 100644
--- a/src/libjpeg/jfdctint.c
+++ b/src/libjpeg/jfdctint.c
@@ -2,6 +2,7 @@
* jfdctint.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modification developed 2003-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -21,6 +22,23 @@
* The advantage of this method is that no data path contains more than one
* multiplication; this allows a very simple and accurate implementation in
* scaled fixed-point arithmetic, with a minimal number of shifts.
+ *
+ * We also provide FDCT routines with various input sample block sizes for
+ * direct resolution reduction or enlargement and for direct resolving the
+ * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
+ * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
+ *
+ * For N<8 we fill the remaining block coefficients with zero.
+ * For N>8 we apply a partial N-point FDCT on the input samples, computing
+ * just the lower 8 frequency coefficients and discarding the rest.
+ *
+ * We must scale the output coefficients of the N-point FDCT appropriately
+ * to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
+ * is folded into the constant multipliers (pass 2) and/or final/initial
+ * shifting.
+ *
+ * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
+ * since there would be too many additional constants to pre-calculate.
*/
#define JPEG_INTERNALS
@@ -36,7 +54,7 @@
*/
#if DCTSIZE != 8
- Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+ Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
#endif
@@ -137,12 +155,13 @@
*/
GLOBAL(void)
-jpeg_fdct_islow (DCTELEM * data)
+jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
{
- INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ INT32 tmp0, tmp1, tmp2, tmp3;
INT32 tmp10, tmp11, tmp12, tmp13;
- INT32 z1, z2, z3, z4, z5;
+ INT32 z1;
DCTELEM *dataptr;
+ JSAMPROW elemptr;
int ctr;
SHIFT_TEMPS
@@ -151,62 +170,74 @@ jpeg_fdct_islow (DCTELEM * data)
/* furthermore, we scale the results by 2**PASS1_BITS. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[0] + dataptr[7];
- tmp7 = dataptr[0] - dataptr[7];
- tmp1 = dataptr[1] + dataptr[6];
- tmp6 = dataptr[1] - dataptr[6];
- tmp2 = dataptr[2] + dataptr[5];
- tmp5 = dataptr[2] - dataptr[5];
- tmp3 = dataptr[3] + dataptr[4];
- tmp4 = dataptr[3] - dataptr[4];
-
+ for (ctr = 0; ctr < DCTSIZE; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
/* Even part per LL&M figure 1 --- note that published figure is faulty;
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
*/
-
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
+ tmp12 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
-
- dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
+ tmp13 = tmp1 - tmp2;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
-
+
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
- dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
- CONST_BITS-PASS1_BITS);
- dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
- CONST_BITS-PASS1_BITS);
-
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+ CONST_BITS-PASS1_BITS);
+ dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+ CONST_BITS-PASS1_BITS);
+
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
- * cK represents cos(K*pi/16).
- * i0..i3 in the paper are tmp4..tmp7 here.
+ * cK represents sqrt(2) * cos(K*pi/16).
+ * i0..i3 in the paper are tmp0..tmp3 here.
*/
-
- z1 = tmp4 + tmp7;
- z2 = tmp5 + tmp6;
- z3 = tmp4 + tmp6;
- z4 = tmp5 + tmp7;
- z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-
- tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
- tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
- tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
- tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-
- z3 += z5;
- z4 += z5;
-
- dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
- dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
- dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
- dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
-
+
+ tmp10 = tmp0 + tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp1 + tmp3;
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
+ tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
+ tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
+ tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
+ tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
+ tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
+ tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
+ tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
+
+ tmp12 += z1;
+ tmp13 += z1;
+
+ dataptr[1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
+ dataptr[3] = (DCTELEM)
+ RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
+ dataptr[5] = (DCTELEM)
+ RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
+ dataptr[7] = (DCTELEM)
+ RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+
dataptr += DCTSIZE; /* advance pointer to next row */
}
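
The "fudge factor" additions in the rewritten pass above let a bare RIGHT_SHIFT replace DESCALE: since DESCALE(x,n) is RIGHT_SHIFT(x + (ONE << (n-1)), n), adding the rounding constant to z1 once covers every term that is later shifted by the same amount. A standalone check with made-up intermediate values (CONST_BITS-PASS1_BITS is 11 in this file):

/* Standalone sketch (not part of the patch): pre-adding the rounding
 * constant ("fudge factor") and shifting once equals DESCALE'ing the
 * final sum.  A plain >> on non-negative longs stands in for RIGHT_SHIFT.
 */
#include <stdio.h>

#define SHIFT 11   /* CONST_BITS - PASS1_BITS for the 8x8 case */
#define DESCALE_SKETCH(x,n)  (((x) + (1L << ((n) - 1))) >> (n))

int main (void)
{
  long z1 = 123456L, term = 7890L;   /* made-up intermediate products */
  long a = DESCALE_SKETCH(z1 + term, SHIFT);
  long b = (z1 + (1L << (SHIFT - 1)) + term) >> SHIFT;   /* fudge added to z1 */
  printf("%ld %ld\n", a, b);         /* prints: 64 64 */
  return 0;
}
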
@@ -217,67 +248,4101 @@ jpeg_fdct_islow (DCTELEM * data)
dataptr = data;
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part per LL&M figure 1 --- note that published figure is faulty;
+ * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+ */
+
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
- tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
- tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
- tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
- tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-
+
+ /* Add fudge factor here for final descale. */
+ tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
+ tmp12 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp13 = tmp1 - tmp2;
+
+ tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+ dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+ dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
+
+ /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+ * cK represents sqrt(2) * cos(K*pi/16).
+ * i0..i3 in the paper are tmp0..tmp3 here.
+ */
+
+ tmp10 = tmp0 + tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp1 + tmp3;
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+ tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
+ tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
+ tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
+ tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
+ tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
+ tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
+ tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
+ tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
+
+ tmp12 += z1;
+ tmp13 += z1;
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*7] = (DCTELEM)
+ RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+#ifdef DCT_SCALING_SUPPORTED
+
+
+/*
+ * Perform the forward DCT on a 7x7 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3;
+ INT32 tmp10, tmp11, tmp12;
+ INT32 z1, z2, z3;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* cK represents sqrt(2) * cos(K*pi/14). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 7; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
+ tmp3 = GETJSAMPLE(elemptr[3]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
+ tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
+
+ z1 = tmp0 + tmp2;
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
+ tmp3 += tmp3;
+ z1 -= tmp3;
+ z1 -= tmp3;
+ z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
+ z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
+ z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
+ dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
+ z1 -= z2;
+ z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
+ dataptr[4] = (DCTELEM)
+ DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
+ tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
+ tmp0 = tmp1 - tmp2;
+ tmp1 += tmp2;
+ tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
+ tmp1 += tmp2;
+ tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
+ tmp0 += tmp3;
+ tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/7)**2 = 64/49, which we fold
+ * into the constant multipliers:
+ * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 7; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
+ tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
+ tmp3 = dataptr[DCTSIZE*3];
+
+ tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
+ tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
+ tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
+
+ z1 = tmp0 + tmp2;
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
+ CONST_BITS+PASS1_BITS);
+ tmp3 += tmp3;
+ z1 -= tmp3;
+ z1 -= tmp3;
+ z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
+ z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
+ z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
+ dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
+ z1 -= z2;
+ z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
+ tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
+ tmp0 = tmp1 - tmp2;
+ tmp1 += tmp2;
+ tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
+ tmp1 += tmp2;
+ tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
+ tmp0 += tmp3;
+ tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 6x6 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2;
+ INT32 tmp10, tmp11, tmp12;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* cK represents sqrt(2) * cos(K*pi/12). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 6; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
+ tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
+
+ tmp10 = tmp0 + tmp2;
+ tmp12 = tmp0 - tmp2;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
+ dataptr[2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
+ CONST_BITS-PASS1_BITS);
+
+ dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
+ dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
+ dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/6)**2 = 16/9, which we fold
+ * into the constant multipliers:
+ * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 6; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+ tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+ tmp10 = tmp0 + tmp2;
+ tmp12 = tmp0 - tmp2;
+
+ tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+ tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+ CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 5x5 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2;
+ INT32 tmp10, tmp11;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* We scale the results further by 2 as part of output adaption */
+ /* scaling for different DCT size. */
+ /* cK represents sqrt(2) * cos(K*pi/10). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 5; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
+ tmp2 = GETJSAMPLE(elemptr[2]);
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
+ tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
+ tmp10 -= tmp2 << 2;
+ tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
+ dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
+ dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
+
+ dataptr[1] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
+ CONST_BITS-PASS1_BITS-1);
+ dataptr[3] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
+ CONST_BITS-PASS1_BITS-1);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/5)**2 = 64/25, which we partially
+ * fold into the constant multipliers (other part was done in pass 1):
+ * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 5; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
+ tmp2 = dataptr[DCTSIZE*2];
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+
+ tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
+ tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
+ CONST_BITS+PASS1_BITS);
+ tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
+ tmp10 -= tmp2 << 2;
+ tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
+ dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 4x4 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1;
+ INT32 tmp10, tmp11;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */
+ /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 4; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
+ dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
+
+ /* Odd part */
+
+ tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
+
+ dataptr[1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+ CONST_BITS-PASS1_BITS-2);
+ dataptr[3] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+ CONST_BITS-PASS1_BITS-2);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 4; ctr++) {
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+ tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+ tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+ dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
+ dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
+
+ /* Odd part */
+
+ tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 3x3 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* We scale the results further by 2**2 as part of output adaption */
+ /* scaling for different DCT size. */
+ /* cK represents sqrt(2) * cos(K*pi/6). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 3; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
+ tmp1 = GETJSAMPLE(elemptr[1]);
+
+ tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
+ dataptr[2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
+ CONST_BITS-PASS1_BITS-2);
+
+ /* Odd part */
+
+ dataptr[1] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
+ CONST_BITS-PASS1_BITS-2);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/3)**2 = 64/9, which we partially
+ * fold into the constant multipliers (other part was done in pass 1):
+ * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 3; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
+ tmp1 = dataptr[DCTSIZE*1];
+
+ tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
+ CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 2x2 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3;
+ JSAMPROW elemptr;
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+
+ /* Row 0 */
+ elemptr = sample_data[0] + start_col;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
+ tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
+
+ /* Row 1 */
+ elemptr = sample_data[1] + start_col;
+
+ tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
+ tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/2)**2 = 2**4.
+ */
+
+ /* Column 0 */
+ /* Apply unsigned->signed conversion */
+ data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
+ data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
+
+ /* Column 1 */
+ data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
+ data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
+}
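
Since jpeg_fdct_2x2 is the simplest of the new scaled routines, a quick numeric check is easy to do by hand. The standalone sketch below, not part of the patch, uses made-up 8-bit samples (CENTERJSAMPLE == 128); the << 4 supplies the (8/2)**2 scaling and the DC term carries the level shift.

/* Standalone sketch (not part of the patch): jpeg_fdct_2x2 on a made-up
 * 2x2 block of 8-bit samples, following the code above literally.
 */
#include <stdio.h>

int main (void)
{
  int s00 = 140, s01 = 130,            /* row 0 samples (made up) */
      s10 = 136, s11 = 130;            /* row 1 samples (made up) */
  int tmp0 = s00 + s01, tmp1 = s00 - s01;
  int tmp2 = s10 + s11, tmp3 = s10 - s11;

  printf("DC %d, horiz %d, vert %d, diag %d\n",
         (tmp0 + tmp2 - 4 * 128) << 4, /* data[0]: level-shifted DC */
         (tmp1 + tmp3) << 4,           /* data[1] */
         (tmp0 - tmp2) << 4,           /* data[DCTSIZE]   */
         (tmp1 - tmp3) << 4);          /* data[DCTSIZE+1] */
  /* Prints: DC 384, horiz 256, vert 64, diag 64 */
  return 0;
}
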
+
+
+/*
+ * Perform the forward DCT on a 1x1 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* We leave the result scaled up by an overall factor of 8. */
+ /* We must also scale the output by (8/1)**2 = 2**6. */
+ /* Apply unsigned->signed conversion */
+ data[0] = (DCTELEM)
+ ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
+}
+
+
+/*
+ * Perform the forward DCT on a 9x9 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+ INT32 tmp10, tmp11, tmp12, tmp13;
+ INT32 z1, z2;
+ DCTELEM workspace[8];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* we scale the results further by 2 as part of output adaption */
+ /* scaling for different DCT size. */
+ /* cK represents sqrt(2) * cos(K*pi/18). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
+ tmp4 = GETJSAMPLE(elemptr[4]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
+ tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
+ tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
+
+ z1 = tmp0 + tmp2 + tmp3;
+ z2 = tmp1 + tmp4;
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
+ dataptr[6] = (DCTELEM)
+ DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */
+ CONST_BITS-1);
+ z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */
+ z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
+ dataptr[2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */
+ + z1 + z2, CONST_BITS-1);
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */
+ + z1 - z2, CONST_BITS-1);
+
+ /* Odd part */
+
+ dataptr[3] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
+ CONST_BITS-1);
+
+ tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */
+ tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
+ tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
+
+ tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
+
+ dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
+ dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 9)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/9)**2 = 64/81, which we partially
+ * fold into the constant multipliers and final/initial shifting:
+ * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
+ tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
+ tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
+ tmp4 = dataptr[DCTSIZE*4];
+
+ tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
+ tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
+ tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
+ tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
+
+ z1 = tmp0 + tmp2 + tmp3;
+ z2 = tmp1 + tmp4;
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */
+ CONST_BITS+2);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */
+ CONST_BITS+2);
+ z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */
+ z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */
+ + z1 + z2, CONST_BITS+2);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */
+ + z1 - z2, CONST_BITS+2);
+
+ /* Odd part */
+
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
+ CONST_BITS+2);
+
+ tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */
+ tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
+ tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
+
+ tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
+
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
+ dataptr[DCTSIZE*7] = (DCTELEM)
+ DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 10x10 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+ DCTELEM workspace[8*2];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* we scale the results further by 2 as part of output adaption */
+ /* scaling for different DCT size. */
+ /* cK represents sqrt(2) * cos(K*pi/20). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
+ tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
+
+ tmp10 = tmp0 + tmp4;
+ tmp13 = tmp0 - tmp4;
+ tmp11 = tmp1 + tmp3;
+ tmp14 = tmp1 - tmp3;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
+ tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
+ tmp12 += tmp12;
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
+ MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
+ CONST_BITS-1);
+ tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
+ dataptr[2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
+ CONST_BITS-1);
+ dataptr[6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
+ CONST_BITS-1);
+
+ /* Odd part */
+
+ tmp10 = tmp0 + tmp4;
+ tmp11 = tmp1 - tmp3;
+ dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
+ tmp2 <<= CONST_BITS;
+ dataptr[1] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
+ MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
+ MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
+ MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
+ CONST_BITS-1);
+ tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
+ MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
+ tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
+ (tmp11 << (CONST_BITS - 1)) - tmp2;
+ dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
+ dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 10)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/10)**2 = 16/25, which we partially
+ * fold into the constant multipliers and final/initial shifting:
+ * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
+ tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
+ tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
+ tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
+
+ tmp10 = tmp0 + tmp4;
+ tmp13 = tmp0 - tmp4;
+ tmp11 = tmp1 + tmp3;
+ tmp14 = tmp1 - tmp3;
+
+ tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
+ tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
+ tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
+ tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
+ tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
+ CONST_BITS+2);
+ tmp12 += tmp12;
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
+ MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
+ CONST_BITS+2);
+ tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
+ CONST_BITS+2);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
+ CONST_BITS+2);
+
+ /* Odd part */
+
+ tmp10 = tmp0 + tmp4;
+ tmp11 = tmp1 - tmp3;
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
+ CONST_BITS+2);
+ tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
+ MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
+ MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
+ MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
+ CONST_BITS+2);
+ tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
+ MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
+ tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
+ MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
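The FIX() literals throughout these scaled FDCT routines are exactly the cK values named in the per-term comments: sqrt(2)*cos(K*pi/(2*N)) for an N-point kernel, times whatever part of the output scale is folded into the pass-2 constants (32/25 for the 10x10 column pass above). A minimal standalone check of that reading, assuming nothing beyond the C standard library:

#include <math.h>
#include <stdio.h>

/* Print sqrt(2)*cos(K*pi/(2*N)) * scale for K = 1..N-1.  With N = 10 and
 * scale = 32/25 this reproduces the pass-2 constants of the 10x10 kernel
 * above (e.g. K = 4 gives 1.464477191, K = 8 gives 0.559380511). */
static void print_scaled_dct_constants (int n, double scale)
{
  const double pi = 3.14159265358979323846;
  int k;
  for (k = 1; k < n; k++)
    printf("c%d = %.9f\n", k, sqrt(2.0) * cos(k * pi / (2.0 * n)) * scale);
}

int main (void)
{
  print_scaled_dct_constants(10, 1.0);         /* 10x10 row pass */
  print_scaled_dct_constants(10, 32.0 / 25.0); /* 10x10 column pass */
  return 0;
}
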
+/*
+ * Perform the forward DCT on an 11x11 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+ INT32 z1, z2, z3;
+ DCTELEM workspace[8*3];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* we scale the results further by 2 as part of output adaption */
+ /* scaling for different DCT size. */
+ /* cK represents sqrt(2) * cos(K*pi/22). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
+ tmp5 = GETJSAMPLE(elemptr[5]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
+ tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
+ tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
+ tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
+ tmp5 += tmp5;
+ tmp0 -= tmp5;
+ tmp1 -= tmp5;
+ tmp2 -= tmp5;
+ tmp3 -= tmp5;
+ tmp4 -= tmp5;
+ z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */
+ MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */
+ z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */
+ z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */
+ dataptr[2] = (DCTELEM)
+ DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
+ - MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */
+ CONST_BITS-1);
+ dataptr[4] = (DCTELEM)
+ DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
+ - MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */
+ + MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */
+ CONST_BITS-1);
+ dataptr[6] = (DCTELEM)
+ DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
+ - MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */
+ CONST_BITS-1);
+
+ /* Odd part */
+
+ tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */
+ tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */
+ tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */
+ tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
+ + MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */
+ tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */
+ tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */
+ tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
+ - MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */
+ tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */
+ tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
+ + MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */
+ tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
+ - MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
+ dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
+ dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
+ dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 11)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/11)**2 = 64/121, which we partially
+ * fold into the constant multipliers and final/initial shifting:
+ * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
+ tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
+ tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
+ tmp5 = dataptr[DCTSIZE*5];
+
+ tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
+ tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
+ tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
+ tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
+ tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
+ FIX(1.057851240)), /* 128/121 */
+ CONST_BITS+2);
+ tmp5 += tmp5;
+ tmp0 -= tmp5;
+ tmp1 -= tmp5;
+ tmp2 -= tmp5;
+ tmp3 -= tmp5;
+ tmp4 -= tmp5;
+ z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */
+ MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */
+ z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */
+ z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
+ - MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */
+ CONST_BITS+2);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
+ - MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */
+ + MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */
+ CONST_BITS+2);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
+ - MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */
+ CONST_BITS+2);
+
+ /* Odd part */
+
+ tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */
+ tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */
+ tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */
+ tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
+ + MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */
+ tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */
+ tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */
+ tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
+ - MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */
+ tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */
+ tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
+ + MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */
+ tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
+ - MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
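All of the arithmetic above is 32-bit fixed point. For reference, this is roughly what the macros reduce to in the common 8-bit configuration; the authoritative definitions live in jdct.h and at the top of this file, so treat the following as a paraphrase rather than a copy:

typedef long INT32;                     /* stand-in for the jmorecfg.h type */

#define CONST_BITS  13                  /* fraction bits carried by FIX() */
#define PASS1_BITS  2                   /* extra scaling kept between passes */

#define FIX(x)         ((INT32) ((x) * (1L << CONST_BITS) + 0.5))
#define MULTIPLY(var,cnst)  ((var) * (cnst))                     /* plain 32-bit product */
#define DESCALE(x,n)   (((x) + ((INT32) 1 << ((n)-1))) >> (n))   /* round, then shift */

/* So DESCALE(MULTIPLY(t, FIX(1.435427942)), CONST_BITS+2) multiplies t by
 * roughly 1.4354/4 with round-to-nearest, which is how the 128/121 constant
 * scale and the final shift combine in the 11x11 column pass above. */
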
+/*
+ * Perform the forward DCT on a 12x12 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ DCTELEM workspace[8*4];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+ /* cK represents sqrt(2) * cos(K*pi/24). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
+ tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
+
+ tmp10 = tmp0 + tmp5;
+ tmp13 = tmp0 - tmp5;
+ tmp11 = tmp1 + tmp4;
+ tmp14 = tmp1 - tmp4;
+ tmp12 = tmp2 + tmp3;
+ tmp15 = tmp2 - tmp3;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
+ tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
+ tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
+ dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
+ CONST_BITS);
+ dataptr[2] = (DCTELEM)
+ DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
+ CONST_BITS);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
+ tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
+ tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
+ tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
+ tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
+ tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
+ + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
+ tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
+ tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
+ + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
+ tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
+ - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
+ tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
+ - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
+ dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 12)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/12)**2 = 4/9, which we partially
+ * fold into the constant multipliers and final shifting:
+ * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
+ tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
+ tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
+ tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
+
+ tmp10 = tmp0 + tmp5;
+ tmp13 = tmp0 - tmp5;
+ tmp11 = tmp1 + tmp4;
+ tmp14 = tmp1 - tmp4;
+ tmp12 = tmp2 + tmp3;
+ tmp15 = tmp2 - tmp3;
+
+ tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
+ tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
+ tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
+ tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
+ tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
+ tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
+ CONST_BITS+1);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
+ CONST_BITS+1);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
+ CONST_BITS+1);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
+ MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
+ CONST_BITS+1);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
+ tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
+ tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
+ tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
+ tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
+ tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
+ + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
+ tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
+ tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
+ + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
+ tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
+ - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
+ tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
+ - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
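The odd part of this 12-point kernel reuses the pre-computed 8-point constants FIX_0_541196100, FIX_0_765366865 and FIX_1_847759065; the comments relabel them c9, c3-c9 and c3+c9 because the 12-point angle 9*pi/24 equals the 8-point angle 6*pi/16 (both are 3*pi/8), so the values coincide. A quick check of that identity:

#include <math.h>
#include <stdio.h>

int main (void)
{
  const double pi = 3.14159265358979323846;
  /* 12-point c9 and 8-point c6 are the same angle, hence the same constant. */
  printf("sqrt(2)*cos(9*pi/24) = %.9f\n", sqrt(2.0) * cos(9.0 * pi / 24.0));
  printf("sqrt(2)*cos(6*pi/16) = %.9f\n", sqrt(2.0) * cos(6.0 * pi / 16.0));
  /* both print 0.541196100, the value behind FIX_0_541196100 */
  return 0;
}
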
+/*
+ * Perform the forward DCT on a 13x13 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ INT32 z1, z2;
+ DCTELEM workspace[8*5];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+ /* cK represents sqrt(2) * cos(K*pi/26). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
+ tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
+ tmp6 = GETJSAMPLE(elemptr[6]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
+ tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
+ tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
+ tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
+ tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
+ tmp6 += tmp6;
+ tmp0 -= tmp6;
+ tmp1 -= tmp6;
+ tmp2 -= tmp6;
+ tmp3 -= tmp6;
+ tmp4 -= tmp6;
+ tmp5 -= tmp6;
+ dataptr[2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */
+ MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */
+ MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */
+ MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */
+ MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */
+ MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */
+ CONST_BITS);
+ z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
+ MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
+ MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */
+ z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
+ MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
+ MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */
+
+ dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
+ dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
+
+ /* Odd part */
+
+ tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */
+ tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */
+ tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */
+ MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */
+ tmp0 = tmp1 + tmp2 + tmp3 -
+ MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */
+ MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */
+ tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */
+ MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */
+ tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
+ tmp1 += tmp4 + tmp5 +
+ MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */
+ MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */
+ tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
+ tmp2 += tmp4 + tmp6 -
+ MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */
+ MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */
+ tmp3 += tmp5 + tmp6 +
+ MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */
+ MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
+ dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 13)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/13)**2 = 64/169, which we partially
+ * fold into the constant multipliers and final shifting:
+ * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
+ tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
+ tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
+ tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
+ tmp6 = dataptr[DCTSIZE*6];
+
+ tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
+ tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
+ tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
+ tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
+ tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
+ tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
+ FIX(0.757396450)), /* 128/169 */
+ CONST_BITS+1);
+ tmp6 += tmp6;
+ tmp0 -= tmp6;
+ tmp1 -= tmp6;
+ tmp2 -= tmp6;
+ tmp3 -= tmp6;
+ tmp4 -= tmp6;
+ tmp5 -= tmp6;
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */
+ MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */
+ MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */
+ MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */
+ MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */
+ MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */
+ CONST_BITS+1);
+ z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
+ MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
+ MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */
+ z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
+ MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
+ MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */
+
+ dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
+ dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
+
+ /* Odd part */
+
+ tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */
+ tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */
+ tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */
+ MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */
+ tmp0 = tmp1 + tmp2 + tmp3 -
+ MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */
+ MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */
+ tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */
+ MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */
+ tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
+ tmp1 += tmp4 + tmp5 +
+ MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */
+ MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */
+ tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
+ tmp2 += tmp4 + tmp6 -
+ MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */
+ MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */
+ tmp3 += tmp5 + tmp6 +
+ MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */
+ MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
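The "partially fold" bookkeeping in the pass-2 comment is worth spelling out once with numbers: 128/169 is carried by the FIX() constants, the leftover factor 1/2 becomes the extra bit in DESCALE(..., CONST_BITS+1), and the product is exactly the required (8/13)**2. The other scaled sizes make the same kind of split between constants and shift bits (the power-of-two sizes can put everything into the shift). A one-off check:

#include <stdio.h>

int main (void)
{
  double folded = 128.0 / 169.0;  /* carried by the FIX() constants: 0.757396450 */
  double shift  = 0.5;            /* the extra +1 in DESCALE(..., CONST_BITS+1) */
  printf("folded * shift = %.9f\n", folded * shift);
  printf("(8/13)**2      = %.9f\n", (8.0 / 13.0) * (8.0 / 13.0));
  /* both print 0.378698225 */
  return 0;
}
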
+/*
+ * Perform the forward DCT on a 14x14 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ DCTELEM workspace[8*6];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+ /* cK represents sqrt(2) * cos(K*pi/28). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
+ tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
+ tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
+ tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
+
+ tmp10 = tmp0 + tmp6;
+ tmp14 = tmp0 - tmp6;
+ tmp11 = tmp1 + tmp5;
+ tmp15 = tmp1 - tmp5;
+ tmp12 = tmp2 + tmp4;
+ tmp16 = tmp2 - tmp4;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
+ tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
+ tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
+ tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
+ tmp13 += tmp13;
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
+ MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
+ MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
+ CONST_BITS);
+
+ tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
+
+ dataptr[2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
+ + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
+ CONST_BITS);
+ dataptr[6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
+ - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
+ CONST_BITS);
+
+ /* Odd part */
+
+ tmp10 = tmp1 + tmp2;
+ tmp11 = tmp5 - tmp4;
+ dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
+ tmp3 <<= CONST_BITS;
+ tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
+ tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
+ tmp10 += tmp11 - tmp3;
+ tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
+ MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
+ dataptr[5] = (DCTELEM)
+ DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
+ + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
+ CONST_BITS);
+ tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
+ MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
+ dataptr[3] = (DCTELEM)
+ DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
+ - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
+ CONST_BITS);
+ dataptr[1] = (DCTELEM)
+ DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
+ MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
+ CONST_BITS);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 14)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/14)**2 = 16/49, which we partially
+ * fold into the constant multipliers and final shifting:
+ * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
+ tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
+ tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
+ tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
+ tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
+
+ tmp10 = tmp0 + tmp6;
+ tmp14 = tmp0 - tmp6;
+ tmp11 = tmp1 + tmp5;
+ tmp15 = tmp1 - tmp5;
+ tmp12 = tmp2 + tmp4;
+ tmp16 = tmp2 - tmp4;
+
+ tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
+ tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
+ tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
+ tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
+ tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
+ tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
+ FIX(0.653061224)), /* 32/49 */
+ CONST_BITS+1);
+ tmp13 += tmp13;
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
+ MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
+ MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
+ CONST_BITS+1);
+
+ tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
+
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
+ + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
+ CONST_BITS+1);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
+ - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
+ CONST_BITS+1);
+
+ /* Odd part */
+
+ tmp10 = tmp1 + tmp2;
+ tmp11 = tmp5 - tmp4;
+ dataptr[DCTSIZE*7] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
+ FIX(0.653061224)), /* 32/49 */
+ CONST_BITS+1);
+ tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
+ tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
+ tmp10 += tmp11 - tmp3;
+ tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
+ MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
+ + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
+ CONST_BITS+1);
+ tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
+ MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
+ - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
+ CONST_BITS+1);
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(tmp11 + tmp12 + tmp3
+ - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
+ - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
+ CONST_BITS+1);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 15x15 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ INT32 z1, z2, z3;
+ DCTELEM workspace[8*7];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+ /* cK represents sqrt(2) * cos(K*pi/30). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
+ tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
+ tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
+ tmp7 = GETJSAMPLE(elemptr[7]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
+ tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
+ tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
+ tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
+ tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
+ tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
+
+ z1 = tmp0 + tmp4 + tmp5;
+ z2 = tmp1 + tmp3 + tmp6;
+ z3 = tmp2 + tmp7;
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
+ z3 += z3;
+ dataptr[6] = (DCTELEM)
+ DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
+ MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */
+ CONST_BITS);
+ tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
+ z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */
+ MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */
+ z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */
+ MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */
+ z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */
+ MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */
+ MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */
+
+ dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
+ dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
+
+ /* Odd part */
+
+ tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
+ FIX(1.224744871)); /* c5 */
+ tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
+ MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */
+ tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */
+ tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */
+ MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */
+ MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */
+ tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */
+ MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */
+ MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */
+ tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */
+ MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */
+ MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
+ dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 15)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/15)**2 = 64/225, which we partially
+ * fold into the constant multipliers and final shifting:
+ * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
+ tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
+ tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
+ tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
+ tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
+ tmp7 = dataptr[DCTSIZE*7];
+
+ tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
+ tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
+ tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
+ tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
+ tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
+ tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
+ tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
+
+ z1 = tmp0 + tmp4 + tmp5;
+ z2 = tmp1 + tmp3 + tmp6;
+ z3 = tmp2 + tmp7;
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
+ CONST_BITS+2);
+ z3 += z3;
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
+ MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */
+ CONST_BITS+2);
+ tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
+ z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */
+ MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */
+ z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */
+ MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */
+ z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */
+ MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */
+ MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */
+
+ dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
+ dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
+
+ /* Odd part */
+
+ tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
+ FIX(1.393487498)); /* c5 */
+ tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
+ MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */
+ tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */
+ tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */
+ MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */
+ MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */
+ tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */
+ MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */
+ MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */
+ tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */
+ MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */
+ MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 16x16 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+ DCTELEM workspace[DCTSIZE2];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* cK represents sqrt(2) * cos(K*pi/32). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
+ tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
+ tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
+ tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
+
+ tmp10 = tmp0 + tmp7;
+ tmp14 = tmp0 - tmp7;
+ tmp11 = tmp1 + tmp6;
+ tmp15 = tmp1 - tmp6;
+ tmp12 = tmp2 + tmp5;
+ tmp16 = tmp2 - tmp5;
+ tmp13 = tmp3 + tmp4;
+ tmp17 = tmp3 - tmp4;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
+ tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
+ tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
+ tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
+ tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+ MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
+ CONST_BITS-PASS1_BITS);
+
+ tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
+ MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ dataptr[2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
+ + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
+ - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
+ MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
+ tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
+ MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
+ tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
+ MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
+ tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
+ MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
+ tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
+ MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
+ tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
+ MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
+ MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
+ tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+ - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
+ tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+ + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
+ tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+ + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+ dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == DCTSIZE * 2)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/16)**2 = 1/2**2.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
+ tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
+ tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
+ tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
+ tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
+
+ tmp10 = tmp0 + tmp7;
+ tmp14 = tmp0 - tmp7;
+ tmp11 = tmp1 + tmp6;
+ tmp15 = tmp1 - tmp6;
+ tmp12 = tmp2 + tmp5;
+ tmp16 = tmp2 - tmp5;
+ tmp13 = tmp3 + tmp4;
+ tmp17 = tmp3 - tmp4;
+
+ tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
+ tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
+ tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
+ tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
+ tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+ MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
+ CONST_BITS+PASS1_BITS+2);
+
+ tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
+ MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),         /* c2+c10 */
+ CONST_BITS+PASS1_BITS+2);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
+ - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
+ CONST_BITS+PASS1_BITS+2);
+
+ /* Odd part */
+
+ tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
+ MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
+ tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
+ MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
+ tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
+ MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
+ tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
+ MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
+ tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
+ MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
+ tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
+ MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
+ MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
+ tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+ - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
+ tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+ + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
+ tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+ + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
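All of these jpeg_fdct_NxM routines share one calling convention: sample_data is an array of row pointers into the input block, start_col selects its first column, and the result is always an 8x8 coefficient block written into data[]. A hypothetical driver for the 16x16 case, assuming the library's internal headers are on the include path and that jdct.h (extended by this patch) declares the new routine:

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"               /* assumed to declare jpeg_fdct_16x16 */

/* Hypothetical helper, not part of the patch: run the 16x16 forward DCT on
 * one block of samples. */
void fdct_one_16x16_block (JSAMPLE samples[16][16], DCTELEM coef[DCTSIZE2])
{
  JSAMPROW rows[16];
  int i;

  for (i = 0; i < 16; i++)
    rows[i] = samples[i];       /* build the JSAMPARRAY row-pointer list */

  /* coef[] receives the 8x8 coefficient block, scaled as described in the
   * routine's pass-2 comment, ready for quantization. */
  jpeg_fdct_16x16(coef, rows, (JDIMENSION) 0);
}
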
+/*
+ * Perform the forward DCT on a 16x8 sample block.
+ *
+ * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+ INT32 z1;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */
+
+ dataptr = data;
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
+ tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
+ tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
+ tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
+
+ tmp10 = tmp0 + tmp7;
+ tmp14 = tmp0 - tmp7;
+ tmp11 = tmp1 + tmp6;
+ tmp15 = tmp1 - tmp6;
+ tmp12 = tmp2 + tmp5;
+ tmp16 = tmp2 - tmp5;
+ tmp13 = tmp3 + tmp4;
+ tmp17 = tmp3 - tmp4;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
+ tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
+ tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
+ tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
+ tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+ MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
+ CONST_BITS-PASS1_BITS);
+
+ tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
+ MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ dataptr[2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
+ + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
+ - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
+ MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
+ tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
+ MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
+ tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
+ MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
+ tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
+ MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
+ tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
+ MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
+ tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
+ MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
+ MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
+ tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+ - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
+ tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+ + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
+ tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+ + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+ dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by 8/16 = 1/2.
+ */
+
+ dataptr = data;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
/* Even part per LL&M figure 1 --- note that published figure is faulty;
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
*/
-
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
+
+ tmp10 = tmp0 + tmp3;
+ tmp12 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp13 = tmp1 - tmp2;
+
+ tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+ dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
+ dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
+
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+ dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+ CONST_BITS+PASS1_BITS+1);
+
+ /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+ * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+ * i0..i3 in the paper are tmp0..tmp3 here.
+ */
+
+ tmp10 = tmp0 + tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp1 + tmp3;
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
+
+ tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
+ tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
+ tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
+ tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
+ tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
+ tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
+ tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
+ tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
+
+ tmp12 += z1;
+ tmp13 += z1;
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
+ CONST_BITS+PASS1_BITS+1);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 14x7 sample block.
+ *
+ * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ INT32 z1, z2, z3;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Zero bottom row of output coefficient block. */
+ MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 7; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
+ tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
+ tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
+ tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
+
+ tmp10 = tmp0 + tmp6;
+ tmp14 = tmp0 - tmp6;
+ tmp11 = tmp1 + tmp5;
+ tmp15 = tmp1 - tmp5;
+ tmp12 = tmp2 + tmp4;
+ tmp16 = tmp2 - tmp4;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
+ tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
+ tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
+ tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
+ tmp13 += tmp13;
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
+ MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
+ MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
+ CONST_BITS-PASS1_BITS);
+
+ tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
+
+ dataptr[2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
+ + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
+ - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = tmp1 + tmp2;
+ tmp11 = tmp5 - tmp4;
+ dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
+ tmp3 <<= CONST_BITS;
+ tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
+ tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
+ tmp10 += tmp11 - tmp3;
+ tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
+ MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
+ dataptr[5] = (DCTELEM)
+ DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
+ + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
+ CONST_BITS-PASS1_BITS);
+ tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
+ MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
+ dataptr[3] = (DCTELEM)
+ DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
+ - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[1] = (DCTELEM)
+ DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
+ MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
+ CONST_BITS-PASS1_BITS);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/14)*(8/7) = 32/49, which we
+ * partially fold into the constant multipliers and final shifting:
+ * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
+ */
+
+ dataptr = data;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
+ tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
+ tmp3 = dataptr[DCTSIZE*3];
+
+ tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
+ tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
+ tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
+
+ z1 = tmp0 + tmp2;
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
+ CONST_BITS+PASS1_BITS+1);
+ tmp3 += tmp3;
+ z1 -= tmp3;
+ z1 -= tmp3;
+ z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
+ z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
+ z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
+ dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
+ z1 -= z2;
+ z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
+
+ /* Odd part */
+
+ tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
+ tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
+ tmp0 = tmp1 - tmp2;
+ tmp1 += tmp2;
+ tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
+ tmp1 += tmp2;
+ tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
+ tmp0 += tmp3;
+ tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
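When the column pass is shorter than 8 points (14x7 above, 12x6 below), the routine clears up front the output rows it will never write, so downstream quantization and entropy coding see zero coefficients there. The same idea as the per-size MEMZERO calls, expressed as a hypothetical generic helper (DCTELEM, DCTSIZE as defined in jdct.h/jpeglib.h):

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"
#include <string.h>

/* Hypothetical equivalent of the MEMZERO calls in jpeg_fdct_14x7 and
 * jpeg_fdct_12x6: with an m-point column pass (m < 8), rows m..7 of the
 * 8x8 output block stay zero. */
static void clear_unused_coef_rows (DCTELEM * data, int m)
{
  if (m < DCTSIZE)
    memset(&data[DCTSIZE * m], 0, sizeof(DCTELEM) * DCTSIZE * (DCTSIZE - m));
}
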
+/*
+ * Perform the forward DCT on a 12x6 sample block.
+ *
+ * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Zero 2 bottom rows of output coefficient block. */
+ MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 6; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
+ tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
+
+ tmp10 = tmp0 + tmp5;
+ tmp13 = tmp0 - tmp5;
+ tmp11 = tmp1 + tmp4;
+ tmp14 = tmp1 - tmp4;
+ tmp12 = tmp2 + tmp3;
+ tmp15 = tmp2 - tmp3;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
+ tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
+ tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
+ dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[2] = (DCTELEM)
+ DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
+ tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
+ tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
+ tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
+ tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
+ tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
+ + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
+ tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
+ tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
+ + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
+ tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
+ - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
+ tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
+ - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+ dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/12)*(8/6) = 8/9, which we
+ * partially fold into the constant multipliers and final shifting:
+ * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
+ */
+
+ dataptr = data;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+ tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+ tmp10 = tmp0 + tmp2;
+ tmp12 = tmp0 - tmp2;
+
+ tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+ tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+ CONST_BITS+PASS1_BITS+1);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS+1);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
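
A worked check of the fold described in the pass-2 comment above, using only the quoted constants:

    \frac{8}{12}\cdot\frac{8}{6} = \frac{8}{9} = \frac{16}{9}\cdot\frac{1}{2},
    \qquad
    c_2 = \sqrt{2}\,\cos\frac{2\pi}{12}\cdot\frac{16}{9}
        = 1.41421356 \times 0.86602540 \times 1.77777778
        \approx 2.177324216 .

The 16/9 part is carried by the constants (hence FIX(2.177324216) for c2 and FIX(1.777777778) for the DC term), and the remaining factor 1/2 is removed by the extra "+1" in the final descale shift, CONST_BITS+PASS1_BITS+1.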
+
+
+/*
+ * Perform the forward DCT on a 10x5 sample block.
+ *
+ * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Zero 3 bottom rows of output coefficient block. */
+ MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 5; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
+ tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
+ tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
+
+ tmp10 = tmp0 + tmp4;
+ tmp13 = tmp0 - tmp4;
+ tmp11 = tmp1 + tmp3;
+ tmp14 = tmp1 - tmp3;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
+ tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
+ tmp12 += tmp12;
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
+ MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
+ CONST_BITS-PASS1_BITS);
+ tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
+ dataptr[2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = tmp0 + tmp4;
+ tmp11 = tmp1 - tmp3;
+ dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
+ tmp2 <<= CONST_BITS;
+ dataptr[1] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
+ MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
+ MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
+ MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
+ CONST_BITS-PASS1_BITS);
+ tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
+ MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
+ tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
+ (tmp11 << (CONST_BITS - 1)) - tmp2;
+ dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
+ dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/10)*(8/5) = 32/25, which we
+ * fold into the constant multipliers:
+ * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
+ */
+
+ dataptr = data;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
+ tmp2 = dataptr[DCTSIZE*2];
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+
+ tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
+ tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
+ CONST_BITS+PASS1_BITS);
+ tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
+ tmp10 -= tmp2 << 2;
+ tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
+ dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on an 8x4 sample block.
+ *
+ * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3;
+ INT32 tmp10, tmp11, tmp12, tmp13;
+ INT32 z1;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Zero 4 bottom rows of output coefficient block. */
+ MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* We must also scale the output by 8/4 = 2, which we add here. */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 4; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part per LL&M figure 1 --- note that published figure is faulty;
+ * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+ */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
+ tmp10 = tmp0 + tmp3;
+ tmp12 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp13 = tmp1 - tmp2;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
+ dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
+
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-2);
+ dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+ CONST_BITS-PASS1_BITS-1);
+ dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+ CONST_BITS-PASS1_BITS-1);
+
+ /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+ * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+ * i0..i3 in the paper are tmp0..tmp3 here.
+ */
+
+ tmp10 = tmp0 + tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp1 + tmp3;
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-2);
+
+ tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
+ tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
+ tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
+ tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
+ tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
+ tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
+ tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
+ tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
+
+ tmp12 += z1;
+ tmp13 += z1;
+
+ dataptr[1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
+ dataptr[3] = (DCTELEM)
+ RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
+ dataptr[5] = (DCTELEM)
+ RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
+ dataptr[7] = (DCTELEM)
+ RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+ */
+
+ dataptr = data;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+ tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+ tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+ dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
+ dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
+
+ /* Odd part */
+
+ tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
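
The "Add fudge factor here for final descale" pattern above simply applies the rounding term of DESCALE() early, so every output that shares the intermediate can be stored with a bare right shift. A minimal standalone sketch of the equivalence, mirroring the DESCALE()/RIGHT_SHIFT() helpers from the library headers (and assuming, as the library does, arithmetic right shift on signed values):

    #include <assert.h>

    #define CONST_BITS 13
    #define PASS1_BITS 2
    #define ONE ((long) 1)
    #define RIGHT_SHIFT(x, shft)  ((x) >> (shft))
    #define DESCALE(x, n)  RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n)

    int main (void)
    {
      long z1 = 123456789L;              /* a shared scaled intermediate */
      long product = 987654L;            /* e.g. a MULTIPLY() result */
      int n = CONST_BITS - PASS1_BITS - 1;

      /* Style used elsewhere: round at the very end. */
      long a = DESCALE(z1 + product, n);

      /* Style used in jpeg_fdct_8x4 above: fold the rounding term into z1
       * once, then use a bare RIGHT_SHIFT for each output derived from z1.
       */
      long z1_fudged = z1 + (ONE << (n - 1));
      long b = RIGHT_SHIFT(z1_fudged + product, n);

      assert(a == b);                    /* identical results */
      return 0;
    }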
+
+
+/*
+ * Perform the forward DCT on a 6x3 sample block.
+ *
+ * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2;
+ INT32 tmp10, tmp11, tmp12;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* We scale the results further by 2 as part of output adaption */
+ /* scaling for different DCT size. */
+ /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 3; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
+ tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
+
+ tmp10 = tmp0 + tmp2;
+ tmp12 = tmp0 - tmp2;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
+ dataptr[2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
+ CONST_BITS-PASS1_BITS-1);
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
+ CONST_BITS-PASS1_BITS-1);
+
+ /* Odd part */
+
+ tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
+ CONST_BITS-PASS1_BITS-1);
+
+ dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
+ dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
+ dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
+ * fold into the constant multipliers (other part was done in pass 1):
+ * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 6; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
+ tmp1 = dataptr[DCTSIZE*1];
+
+ tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
+ CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
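
The "other part was done in pass 1" remark above refers to this split of the adaptation factor:

    \frac{8}{6}\cdot\frac{8}{3} = \frac{32}{9} = 2\cdot\frac{16}{9} .

The factor 2 is applied in pass 1 through the extra left shift (PASS1_BITS+1 on the stores, CONST_BITS-PASS1_BITS-1 on the descales), while the 16/9 is folded into the pass-2 constants such as FIX(1.777777778) and FIX(2.177324216).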
+
+
+/*
+ * Perform the forward DCT on a 4x2 sample block.
+ *
+ * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1;
+ INT32 tmp10, tmp11;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */
+ /* 4-point FDCT kernel, */
+ /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 2; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
+ dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
+
+ /* Odd part */
+
+ tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
+
+ dataptr[1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+ CONST_BITS-PASS1_BITS-3);
+ dataptr[3] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+ CONST_BITS-PASS1_BITS-3);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 4; ctr++) {
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
+ tmp1 = dataptr[DCTSIZE*1];
+
+ dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
+
+ /* Odd part */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 2x1 sample block.
+ *
+ * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1;
+ JSAMPROW elemptr;
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ elemptr = sample_data[0] + start_col;
+
+ tmp0 = GETJSAMPLE(elemptr[0]);
+ tmp1 = GETJSAMPLE(elemptr[1]);
+
+ /* We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/2)*(8/1) = 2**5.
+ */
+
+ /* Even part */
+ /* Apply unsigned->signed conversion */
+ data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
+
+ /* Odd part */
+ data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
+}
+
+
+/*
+ * Perform the forward DCT on an 8x16 sample block.
+ *
+ * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+ INT32 z1;
+ DCTELEM workspace[DCTSIZE2];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part per LL&M figure 1 --- note that published figure is faulty;
+ * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+ */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+ tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
+ tmp12 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
-
- dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
- dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
-
+ tmp13 = tmp1 - tmp2;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+ tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
+ dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
+
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
- dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
- CONST_BITS+PASS1_BITS);
-
+ dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+ CONST_BITS-PASS1_BITS);
+ dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+ CONST_BITS-PASS1_BITS);
+
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
- * cK represents cos(K*pi/16).
- * i0..i3 in the paper are tmp4..tmp7 here.
+ * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+ * i0..i3 in the paper are tmp0..tmp3 here.
*/
-
- z1 = tmp4 + tmp7;
- z2 = tmp5 + tmp6;
- z3 = tmp4 + tmp6;
- z4 = tmp5 + tmp7;
- z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-
- tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
- tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
- tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
- tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-
- z3 += z5;
- z4 += z5;
-
- dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
- CONST_BITS+PASS1_BITS);
-
+
+ tmp10 = tmp0 + tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp1 + tmp3;
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
+
+ tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
+ tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
+ tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
+ tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
+ tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
+ tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
+ tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
+ tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
+
+ tmp12 += z1;
+ tmp13 += z1;
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
+ dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == DCTSIZE * 2)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by 8/16 = 1/2.
+ * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
+ tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
+ tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
+ tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
+ tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
+
+ tmp10 = tmp0 + tmp7;
+ tmp14 = tmp0 - tmp7;
+ tmp11 = tmp1 + tmp6;
+ tmp15 = tmp1 - tmp6;
+ tmp12 = tmp2 + tmp5;
+ tmp16 = tmp2 - tmp5;
+ tmp13 = tmp3 + tmp4;
+ tmp17 = tmp3 - tmp4;
+
+ tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
+ tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
+ tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
+ tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
+ tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+ MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
+ CONST_BITS+PASS1_BITS+1);
+
+ tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
+ MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
+ + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
+ CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
+ - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
+ CONST_BITS+PASS1_BITS+1);
+
+ /* Odd part */
+
+ tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
+ MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
+ tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
+ MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
+ tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
+ MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
+ tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
+ MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
+ tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
+ MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
+ tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
+ MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
+ MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
+ tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+ - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
+ tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+ + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
+ tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+ + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
+
dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
}
}
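
The for(;;) row loop of jpeg_fdct_8x16 above (and of the 7x14, 6x12 and 5x10 routines that follow) writes the first DCTSIZE rows of results into data[] and the remaining rows into workspace[], so that pass 2 can address the tall block as two stacked 8-row arrays. A stripped-down standalone sketch of just that pointer-switch pattern; process_row() is a placeholder rather than a library function, and N is assumed to be between 9 and 16:

    #include <stdio.h>

    #define DCTSIZE 8
    typedef int DCTELEM;

    /* Placeholder for the per-row kernel; just records which row landed where. */
    static void
    process_row (DCTELEM *row, int ctr)
    {
      row[0] = (DCTELEM) ctr;
    }

    static void
    tall_block_rows (DCTELEM *data, DCTELEM *workspace, int N)
    {
      DCTELEM *dataptr = data;
      int ctr = 0;

      for (;;) {
        process_row(dataptr, ctr);
        ctr++;
        if (ctr != DCTSIZE) {
          if (ctr == N)
            break;                /* all N rows done */
          dataptr += DCTSIZE;     /* next row in the current array */
        } else
          dataptr = workspace;    /* rows DCTSIZE..N-1 go to the workspace */
      }
    }

    int main (void)
    {
      DCTELEM data[DCTSIZE * DCTSIZE] = { 0 };
      DCTELEM workspace[DCTSIZE * DCTSIZE] = { 0 };

      tall_block_rows(data, workspace, 14);   /* e.g. the 7x14 case */
      printf("last row written to workspace slot 5: %d\n",
             workspace[DCTSIZE * 5]);
      return 0;
    }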
+
+/*
+ * Perform the forward DCT on a 7x14 sample block.
+ *
+ * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ INT32 z1, z2, z3;
+ DCTELEM workspace[8*6];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
+ tmp3 = GETJSAMPLE(elemptr[3]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
+ tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
+
+ z1 = tmp0 + tmp2;
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
+ tmp3 += tmp3;
+ z1 -= tmp3;
+ z1 -= tmp3;
+ z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
+ z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
+ z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
+ dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
+ z1 -= z2;
+ z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
+ dataptr[4] = (DCTELEM)
+ DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
+ tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
+ tmp0 = tmp1 - tmp2;
+ tmp1 += tmp2;
+ tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
+ tmp1 += tmp2;
+ tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
+ tmp0 += tmp3;
+ tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
+
+ dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
+ dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
+ dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 14)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/7)*(8/14) = 32/49, which we
+ * fold into the constant multipliers:
+ * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 7; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
+ tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
+ tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
+ tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
+ tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
+
+ tmp10 = tmp0 + tmp6;
+ tmp14 = tmp0 - tmp6;
+ tmp11 = tmp1 + tmp5;
+ tmp15 = tmp1 - tmp5;
+ tmp12 = tmp2 + tmp4;
+ tmp16 = tmp2 - tmp4;
+
+ tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
+ tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
+ tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
+ tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
+ tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
+ tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
+ FIX(0.653061224)), /* 32/49 */
+ CONST_BITS+PASS1_BITS);
+ tmp13 += tmp13;
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
+ MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
+ MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
+ CONST_BITS+PASS1_BITS);
+
+ tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
+
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
+ + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
+ - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
+ CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = tmp1 + tmp2;
+ tmp11 = tmp5 - tmp4;
+ dataptr[DCTSIZE*7] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
+ FIX(0.653061224)), /* 32/49 */
+ CONST_BITS+PASS1_BITS);
+ tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
+ tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
+ tmp10 += tmp11 - tmp3;
+ tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
+ MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
+ + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
+ CONST_BITS+PASS1_BITS);
+ tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
+ MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
+ - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(tmp11 + tmp12 + tmp3
+ - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
+ - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 6x12 sample block.
+ *
+ * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ DCTELEM workspace[8*4];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
+ tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
+ tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
+
+ tmp10 = tmp0 + tmp2;
+ tmp12 = tmp0 - tmp2;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
+ tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
+ dataptr[2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
+ CONST_BITS-PASS1_BITS);
+
+ dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
+ dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
+ dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 12)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/6)*(8/12) = 8/9, which we
+ * fold into the constant multipliers:
+ * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
+ tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
+ tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
+ tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
+ tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
+
+ tmp10 = tmp0 + tmp5;
+ tmp13 = tmp0 - tmp5;
+ tmp11 = tmp1 + tmp4;
+ tmp14 = tmp1 - tmp4;
+ tmp12 = tmp2 + tmp3;
+ tmp15 = tmp2 - tmp3;
+
+ tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
+ tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
+ tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
+ tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
+ tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
+ tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
+ MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
+ CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
+ tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
+ tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
+ tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
+ tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
+ tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
+ + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
+ tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
+ tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
+ + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
+ tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
+ - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
+ tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
+ - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 5x10 sample block.
+ *
+ * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+ DCTELEM workspace[8*2];
+ DCTELEM *dataptr;
+ DCTELEM *wsptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */
+
+ dataptr = data;
+ ctr = 0;
+ for (;;) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
+ tmp2 = GETJSAMPLE(elemptr[2]);
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+
+ tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
+ tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
+ tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
+ tmp10 -= tmp2 << 2;
+ tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
+ dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
+ dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
+
+ dataptr[1] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
+ CONST_BITS-PASS1_BITS);
+ dataptr[3] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
+ CONST_BITS-PASS1_BITS);
+
+ ctr++;
+
+ if (ctr != DCTSIZE) {
+ if (ctr == 10)
+ break; /* Done. */
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ } else
+ dataptr = workspace; /* switch pointer to extended workspace */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/5)*(8/10) = 32/25, which we
+ * fold into the constant multipliers:
+ * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
+ */
+
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 5; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
+ tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
+ tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
+ tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
+ tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
+
+ tmp10 = tmp0 + tmp4;
+ tmp13 = tmp0 - tmp4;
+ tmp11 = tmp1 + tmp3;
+ tmp14 = tmp1 - tmp3;
+
+ tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
+ tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
+ tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
+ tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
+ tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
+ CONST_BITS+PASS1_BITS);
+ tmp12 += tmp12;
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
+ MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
+ CONST_BITS+PASS1_BITS);
+ tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
+ CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = tmp0 + tmp4;
+ tmp11 = tmp1 - tmp3;
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
+ CONST_BITS+PASS1_BITS);
+ tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
+ MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
+ MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
+ MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
+ CONST_BITS+PASS1_BITS);
+ tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
+ MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
+ tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
+ MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
+ dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ wsptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 4x8 sample block.
+ *
+ * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3;
+ INT32 tmp10, tmp11, tmp12, tmp13;
+ INT32 z1;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* We must also scale the output by 8/4 = 2, which we add here. */
+ /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < DCTSIZE; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
+ tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
+
+ tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
+ tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
+ dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
+
+ /* Odd part */
+
+ tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
+
+ dataptr[1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+ CONST_BITS-PASS1_BITS-1);
+ dataptr[3] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+ CONST_BITS-PASS1_BITS-1);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 4; ctr++) {
+ /* Even part per LL&M figure 1 --- note that published figure is faulty;
+ * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+ */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
+
+ /* Add fudge factor here for final descale. */
+ tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
+ tmp12 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp13 = tmp1 - tmp2;
+
+ tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+ tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+ tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+ tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+ dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+ dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*6] = (DCTELEM)
+ RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
+
+ /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+ * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+ * i0..i3 in the paper are tmp0..tmp3 here.
+ */
+
+ tmp10 = tmp0 + tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp1 + tmp3;
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+ tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
+ tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
+ tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
+ tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
+ tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
+ tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
+ tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
+ tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
+
+ tmp12 += z1;
+ tmp13 += z1;
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*7] = (DCTELEM)
+ RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 3x6 sample block.
+ *
+ * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1, tmp2;
+ INT32 tmp10, tmp11, tmp12;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ /* We scale the results further by 2 as part of output adaption */
+ /* scaling for different DCT size. */
+ /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 6; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
+ tmp1 = GETJSAMPLE(elemptr[1]);
+
+ tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM)
+ ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
+ dataptr[2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
+ CONST_BITS-PASS1_BITS-1);
+
+ /* Odd part */
+
+ dataptr[1] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
+ CONST_BITS-PASS1_BITS-1);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
+ * fold into the constant multipliers (other part was done in pass 1):
+ * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 3; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+ tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+ tmp10 = tmp0 + tmp2;
+ tmp12 = tmp0 - tmp2;
+
+ tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+ tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+ tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+ dataptr[DCTSIZE*0] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*2] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*4] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+ CONST_BITS+PASS1_BITS);
+
+ /* Odd part */
+
+ tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE*5] = (DCTELEM)
+ DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
+ CONST_BITS+PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 2x4 sample block.
+ *
+ * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1;
+ INT32 tmp10, tmp11;
+ DCTELEM *dataptr;
+ JSAMPROW elemptr;
+ int ctr;
+ SHIFT_TEMPS
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+ /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 4; ctr++) {
+ elemptr = sample_data[ctr] + start_col;
+
+ /* Even part */
+
+ tmp0 = GETJSAMPLE(elemptr[0]);
+ tmp1 = GETJSAMPLE(elemptr[1]);
+
+ /* Apply unsigned->signed conversion */
+ dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
+
+ /* Odd part */
+
+ dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We leave the results scaled up by an overall factor of 8.
+ * 4-point FDCT kernel,
+ * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
+ */
+
+ dataptr = data;
+ for (ctr = 0; ctr < 2; ctr++) {
+ /* Even part */
+
+ tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
+ tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+ tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+ tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+ dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
+ dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
+
+ /* Odd part */
+
+ tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-1);
+
+ dataptr[DCTSIZE*1] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+ CONST_BITS);
+ dataptr[DCTSIZE*3] = (DCTELEM)
+ RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+ CONST_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+
+/*
+ * Perform the forward DCT on a 1x2 sample block.
+ *
+ * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+ INT32 tmp0, tmp1;
+
+ /* Pre-zero output coefficient block. */
+ MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+ tmp0 = GETJSAMPLE(sample_data[0][start_col]);
+ tmp1 = GETJSAMPLE(sample_data[1][start_col]);
+
+ /* We leave the results scaled up by an overall factor of 8.
+ * We must also scale the output by (8/1)*(8/2) = 2**5.
+ */
+
+ /* Even part */
+ /* Apply unsigned->signed conversion */
+ data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
+
+ /* Odd part */
+ data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
+}
+
+#endif /* DCT_SCALING_SUPPORTED */
#endif /* DCT_ISLOW_SUPPORTED */
diff --git a/src/libjpeg/jidctint.c b/src/libjpeg/jidctint.c
index a72b320..dcdf7ce 100644
--- a/src/libjpeg/jidctint.c
+++ b/src/libjpeg/jidctint.c
@@ -2,6 +2,7 @@
* jidctint.c
*
* Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modification developed 2002-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -23,6 +24,28 @@
* The advantage of this method is that no data path contains more than one
* multiplication; this allows a very simple and accurate implementation in
* scaled fixed-point arithmetic, with a minimal number of shifts.
+ *
+ * We also provide IDCT routines with various output sample block sizes for
+ * direct resolution reduction or enlargement, and for directly resolving the
+ * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
+ * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
+ *
+ * For N<8 we simply take the corresponding low-frequency coefficients of
+ * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
+ * to yield the downscaled outputs.
+ * This can be seen as direct low-pass downsampling from the DCT domain
+ * point of view rather than the usual spatial domain point of view,
+ * yielding significant computational savings and results at least
+ * as good as common bilinear (averaging) spatial downsampling.
+ *
+ * For N>8 we apply a partial NxN IDCT to the 8 input coefficients,
+ * treating them as the lower frequencies and assuming the higher
+ * frequencies to be zero.
+ * It turns out that the computational effort is similar to that of
+ * the 8x8 IDCT, relative to the output size.
+ * Furthermore, the scaling and descaling are the same for all IDCT sizes.
+ *
+ * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
+ * since there would be too many additional constants to pre-calculate.
*/
#define JPEG_INTERNALS
@@ -38,7 +61,7 @@
*/
#if DCTSIZE != 8
- Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+ Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
#endif
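
On the decoding side, the reduced- and enlarged-size integer IDCT kernels added to this file are what the decompressor's scale_num/scale_denom request ultimately selects between. A minimal usage sketch (the file name is a placeholder, error and scanline handling are omitted, and the N/8 output scaling of this libjpeg version is assumed):

    #include <stdio.h>
    #include "jpeglib.h"

    int main (void)
    {
      struct jpeg_decompress_struct cinfo;
      struct jpeg_error_mgr jerr;
      FILE *infile = fopen("in.jpg", "rb");   /* placeholder file name */

      if (infile == NULL)
        return 1;

      cinfo.err = jpeg_std_error(&jerr);
      jpeg_create_decompress(&cinfo);
      jpeg_stdio_src(&cinfo, infile);
      (void) jpeg_read_header(&cinfo, TRUE);

      cinfo.scale_num = 4;             /* request 4/8 scaling, so a  */
      cinfo.scale_denom = 8;           /* reduced-size IDCT is used  */
      cinfo.dct_method = JDCT_ISLOW;   /* integer IDCT of jidctint.c */

      jpeg_calc_output_dimensions(&cinfo);
      printf("scaled output: %u x %u\n",
             (unsigned int) cinfo.output_width,
             (unsigned int) cinfo.output_height);

      /* A full decode would continue with jpeg_start_decompress() and
       * jpeg_read_scanlines() here.
       */
      jpeg_destroy_decompress(&cinfo);
      fclose(infile);
      return 0;
    }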
@@ -151,7 +174,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
{
INT32 tmp0, tmp1, tmp2, tmp3;
INT32 tmp10, tmp11, tmp12, tmp13;
- INT32 z1, z2, z3, z4, z5;
+ INT32 z1, z2, z3;
JCOEFPTR inptr;
ISLOW_MULT_TYPE * quantptr;
int * wsptr;
@@ -177,14 +200,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
* With typical images and quantization tables, half or more of the
* column DCT calculations can be simplified this way.
*/
-
+
if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
inptr[DCTSIZE*7] == 0) {
/* AC terms all zero */
int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-
+
wsptr[DCTSIZE*0] = dcval;
wsptr[DCTSIZE*1] = dcval;
wsptr[DCTSIZE*2] = dcval;
@@ -193,82 +216,84 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
wsptr[DCTSIZE*5] = dcval;
wsptr[DCTSIZE*6] = dcval;
wsptr[DCTSIZE*7] = dcval;
-
+
inptr++; /* advance pointers to next column */
quantptr++;
wsptr++;
continue;
}
-
+
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-
+
z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
- tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
- tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-
+ tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z2 <<= CONST_BITS;
+ z3 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ tmp0 = z2 + z3;
+ tmp1 = z2 - z3;
+
+ tmp10 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+ tmp11 = tmp1 + tmp3;
+ tmp12 = tmp1 - tmp3;
- tmp0 = (z2 + z3) << CONST_BITS;
- tmp1 = (z2 - z3) << CONST_BITS;
-
- tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
- tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
-
/* Odd part per figure 8; the matrix is unitary and hence its
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
*/
-
+
tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z1 = tmp0 + tmp3;
- z2 = tmp1 + tmp2;
- z3 = tmp0 + tmp2;
- z4 = tmp1 + tmp3;
- z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-
+ z2 = tmp0 + tmp2;
+ z3 = tmp1 + tmp3;
+
+ z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+ z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z2 += z1;
+ z3 += z1;
+
+ z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp0 += z1 + z2;
+ tmp3 += z1 + z3;
+
+ z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
- tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-
- z3 += z5;
- z4 += z5;
-
- tmp0 += z1 + z3;
- tmp1 += z2 + z4;
- tmp2 += z2 + z3;
- tmp3 += z1 + z4;
-
+ tmp1 += z1 + z3;
+ tmp2 += z1 + z2;
+
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+
+ wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
inptr++; /* advance pointers to next column */
quantptr++;
wsptr++;
}
-
+
/* Pass 2: process rows from work array, store into output array. */
/* Note that we must descale the results by a factor of 8 == 2**3, */
/* and also undo the PASS1_BITS scaling. */
@@ -283,14 +308,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
* test takes more time than it's worth. In that case this section
* may be commented out.
*/
-
+
#ifndef NO_ZERO_ROW_TEST
if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
& RANGE_MASK];
-
+
outptr[0] = dcval;
outptr[1] = dcval;
outptr[2] = dcval;
@@ -304,86 +329,4809 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
continue;
}
#endif
-
+
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
z2 = (INT32) wsptr[2];
z3 = (INT32) wsptr[6];
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+ tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
+ /* Add fudge factor here for final descale. */
+ z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 = (INT32) wsptr[4];
+
+ tmp0 = (z2 + z3) << CONST_BITS;
+ tmp1 = (z2 - z3) << CONST_BITS;
+
+ tmp10 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+ tmp11 = tmp1 + tmp3;
+ tmp12 = tmp1 - tmp3;
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ tmp0 = (INT32) wsptr[7];
+ tmp1 = (INT32) wsptr[5];
+ tmp2 = (INT32) wsptr[3];
+ tmp3 = (INT32) wsptr[1];
+
+ z2 = tmp0 + tmp2;
+ z3 = tmp1 + tmp3;
+
+ z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+ z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z2 += z1;
+ z3 += z1;
+
+ z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+ tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp0 += z1 + z2;
+ tmp3 += z1 + z3;
+
+ z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp1 += z1 + z3;
+ tmp2 += z1 + z2;
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += DCTSIZE; /* advance pointer to next row */
+ }
+}
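Editor's note: all of the "Add fudge factor here for final descale" lines above follow one pattern: ONE << (shift-1) is added before the value is RIGHT_SHIFTed by shift, so the descale rounds to nearest instead of flooring; the DESCALE macro used by the replaced lines bundled exactly these two steps. A tiny stand-alone illustration (a sketch only, with plain C stand-ins for ONE/RIGHT_SHIFT and an arithmetic, sign-preserving right shift assumed):

#include <stdio.h>

static long descale_round(long x, int n)    { return (x + (1L << (n - 1))) >> n; }
static long descale_truncate(long x, int n) { return x >> n; }

int main(void)
{
  /*  200/256 ~  0.78: rounds to 1, a bare shift floors to  0 */
  /* -100/256 ~ -0.39: rounds to 0, a bare shift floors to -1 */
  printf("%ld %ld\n", descale_round(200, 8),  descale_truncate(200, 8));
  printf("%ld %ld\n", descale_round(-100, 8), descale_truncate(-100, 8));
  return 0;
}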
+
+#ifdef IDCT_SCALING_SUPPORTED
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 7x7 output block.
+ *
+ * Optimized algorithm with 12 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/14).
+ */
+
+GLOBAL(void)
+jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[7*7]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp13 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
+ tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+ tmp0 = z1 + z3;
+ z2 -= tmp0;
+ tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
+ tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */
+ tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */
+ tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
+ tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
+ tmp0 = tmp1 - tmp2;
+ tmp1 += tmp2;
+ tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
+ tmp1 += tmp2;
+ z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
+ tmp0 += z2;
+ tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */
+
+ /* Final output stage */
+
+ wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 7 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 7; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp13 <<= CONST_BITS;
+
+ z1 = (INT32) wsptr[2];
+ z2 = (INT32) wsptr[4];
+ z3 = (INT32) wsptr[6];
+
+ tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
+ tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+ tmp0 = z1 + z3;
+ z2 -= tmp0;
+ tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
+ tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */
+ tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */
+ tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
+ tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
+ tmp0 = tmp1 - tmp2;
+ tmp1 += tmp2;
+ tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
+ tmp1 += tmp2;
+ z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
+ tmp0 += z2;
+ tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 7; /* advance pointer to next row */
+ }
+}
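Editor's note: the comment heading jpeg_idct_7x7 states the constant convention used throughout these kernels, cK = sqrt(2) * cos(K*pi/14) for the 7-point case. The FIX() literals can be checked directly against it; a quick stand-alone check (not library code):

#include <math.h>
#include <stdio.h>

int main(void)
{
  int k;
  /* Prints c1..c6; e.g. c2 = 1.274162392 and c4 = 0.881747734 match the
   * FIX() literals annotated above. */
  for (k = 1; k <= 6; k++)
    printf("c%d = %.9f\n", k, sqrt(2.0) * cos(k * acos(-1.0) / 14.0));
  return 0;
}

The composite annotations check out the same way, e.g. c2+c4-c6 = 1.274162392 + 0.881747734 - 0.314692123 = 1.841218003, the constant used for tmp11.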
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 6x6 output block.
+ *
+ * Optimized algorithm with 3 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/12).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[6*6]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
+ tmp1 = tmp0 + tmp10;
+ tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
+ tmp10 = tmp1 + tmp0;
+ tmp12 = tmp1 - tmp0;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+ tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+ tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+ tmp1 = (z1 - z2 - z3) << PASS1_BITS;
+
+ /* Final output stage */
+
+ wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[6*1] = (int) (tmp11 + tmp1);
+ wsptr[6*4] = (int) (tmp11 - tmp1);
+ wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 6 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 <<= CONST_BITS;
+ tmp2 = (INT32) wsptr[4];
+ tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
+ tmp1 = tmp0 + tmp10;
+ tmp11 = tmp0 - tmp10 - tmp10;
+ tmp10 = (INT32) wsptr[2];
+ tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
+ tmp10 = tmp1 + tmp0;
+ tmp12 = tmp1 - tmp0;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+ tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+ tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+ tmp1 = (z1 - z2 - z3) << CONST_BITS;
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 6; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 5x5 output block.
+ *
+ * Optimized algorithm with 5 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/10).
+ */
+
+GLOBAL(void)
+jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[5*5]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp12 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
+ z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
+ z3 = tmp12 + z2;
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z1;
+ tmp12 -= z2 << 2;
+
+ /* Odd part */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
+ tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
+ tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */
+
+ /* Final output stage */
+
+ wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 5 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 5; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp12 <<= CONST_BITS;
+ tmp0 = (INT32) wsptr[2];
+ tmp1 = (INT32) wsptr[4];
+ z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
+ z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
+ z3 = tmp12 + z2;
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z1;
+ tmp12 -= z2 << 2;
+
+ /* Odd part */
+
+ z2 = (INT32) wsptr[1];
+ z3 = (INT32) wsptr[3];
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
+ tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
+ tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 5; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 4x4 output block.
+ *
+ * Optimized algorithm with 3 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+ */
+
+GLOBAL(void)
+jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp2, tmp10, tmp12;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[4*4]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+
+ tmp10 = (tmp0 + tmp2) << PASS1_BITS;
+ tmp12 = (tmp0 - tmp2) << PASS1_BITS;
+
+ /* Odd part */
+ /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
+ CONST_BITS-PASS1_BITS);
+ tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Final output stage */
+
+ wsptr[4*0] = (int) (tmp10 + tmp0);
+ wsptr[4*3] = (int) (tmp10 - tmp0);
+ wsptr[4*1] = (int) (tmp12 + tmp2);
+ wsptr[4*2] = (int) (tmp12 - tmp2);
+ }
+
+ /* Pass 2: process 4 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 4; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp2 = (INT32) wsptr[2];
+
+ tmp10 = (tmp0 + tmp2) << CONST_BITS;
+ tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+ /* Odd part */
+ /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+ z2 = (INT32) wsptr[1];
+ z3 = (INT32) wsptr[3];
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
+ tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+ tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 4; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 3x3 output block.
+ *
+ * Optimized algorithm with 2 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/6).
+ */
+
+GLOBAL(void)
+jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp2, tmp10, tmp12;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[3*3]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+ tmp10 = tmp0 + tmp12;
+ tmp2 = tmp0 - tmp12 - tmp12;
+
+ /* Odd part */
+
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+ /* Final output stage */
+
+ wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 3 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 3; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 <<= CONST_BITS;
+ tmp2 = (INT32) wsptr[2];
+ tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+ tmp10 = tmp0 + tmp12;
+ tmp2 = tmp0 - tmp12 - tmp12;
+
+ /* Odd part */
+
+ tmp12 = (INT32) wsptr[1];
+ tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 3; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 2x2 output block.
+ *
+ * Multiplication-less algorithm.
+ */
+
+GLOBAL(void)
+jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ ISLOW_MULT_TYPE * quantptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input. */
+
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+
+ /* Column 0 */
+ tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ /* Add fudge factor here for final descale. */
+ tmp4 += ONE << 2;
+
+ tmp0 = tmp4 + tmp5;
+ tmp2 = tmp4 - tmp5;
+
+ /* Column 1 */
+ tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
+ tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
+
+ tmp1 = tmp4 + tmp5;
+ tmp3 = tmp4 - tmp5;
+
+ /* Pass 2: process 2 rows, store into output array. */
+
+ /* Row 0 */
+ outptr = output_buf[0] + output_col;
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
+
+ /* Row 1 */
+ outptr = output_buf[1] + output_col;
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 1x1 output block.
+ *
+ * We hardly need an inverse DCT routine for this: just take the
+ * average pixel value, which is one-eighth of the DC coefficient.
+ */
+
+GLOBAL(void)
+jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ int dcval;
+ ISLOW_MULT_TYPE * quantptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ SHIFT_TEMPS
+
+ /* 1x1 is trivial: just take the DC coefficient divided by 8. */
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
+ dcval = (int) DESCALE((INT32) dcval, 3);
+
+ output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
+}
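Editor's note: as the comment says, the 1x1 case needs no transform at all. With the standard 8x8 DCT normalization, S(0,0) is one-eighth of the sum of the 64 samples, i.e. eight times their average, so dividing the dequantized DC value by 8 (the rounded shift DESCALE(dcval, 3) above) yields the average sample directly. A small numeric check (a sketch, not library code):

#include <stdio.h>

int main(void)
{
  int i, sum = 0, dc, mean;
  for (i = 0; i < 64; i++)
    sum += 100;                     /* flat 8x8 block of value 100 */
  dc = sum / 8;                     /* DC coefficient under that normalization: 800 */
  mean = (dc + 4) >> 3;             /* the DESCALE(dcval, 3) step: back to 100 */
  printf("dc=%d mean=%d\n", dc, mean);
  return 0;
}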
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 9x9 output block.
+ *
+ * Optimized algorithm with 10 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/18).
+ */
+
+GLOBAL(void)
+jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*9]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
+ tmp1 = tmp0 + tmp3;
+ tmp2 = tmp0 - tmp3 - tmp3;
+
+ tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
+ tmp11 = tmp2 + tmp0;
+ tmp14 = tmp2 - tmp0 - tmp0;
+
+ tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
+ tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */
+ tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */
+
+ tmp10 = tmp1 + tmp0 - tmp3;
+ tmp12 = tmp1 - tmp0 + tmp2;
+ tmp13 = tmp1 - tmp2 + tmp3;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */
+
+ tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
+ tmp0 = tmp2 + tmp3 - z2;
+ tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */
+ tmp2 += z2 - tmp1;
+ tmp3 += z2 + tmp1;
+ tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 9 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 9; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 <<= CONST_BITS;
+
+ z1 = (INT32) wsptr[2];
+ z2 = (INT32) wsptr[4];
+ z3 = (INT32) wsptr[6];
+
+ tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
+ tmp1 = tmp0 + tmp3;
+ tmp2 = tmp0 - tmp3 - tmp3;
+
+ tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
+ tmp11 = tmp2 + tmp0;
+ tmp14 = tmp2 - tmp0 - tmp0;
+
+ tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
+ tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */
+ tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */
+
+ tmp10 = tmp1 + tmp0 - tmp3;
+ tmp12 = tmp1 - tmp0 + tmp2;
+ tmp13 = tmp1 - tmp2 + tmp3;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+
+ z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */
+
+ tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
+ tmp0 = tmp2 + tmp3 - z2;
+ tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */
+ tmp2 += z2 - tmp1;
+ tmp3 += z2 + tmp1;
+ tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 10x10 output block.
+ *
+ * Optimized algorithm with 12 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/20).
+ */
+
+GLOBAL(void)
+jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+ INT32 z1, z2, z3, z4, z5;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*10]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
+ z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z2;
+
+ tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1), /* c0 = (c4-c8)*2 */
+ CONST_BITS-PASS1_BITS);
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
+ tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+ tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+ tmp20 = tmp10 + tmp12;
+ tmp24 = tmp10 - tmp12;
+ tmp21 = tmp11 + tmp13;
+ tmp23 = tmp11 - tmp13;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp11 = z2 + z4;
+ tmp13 = z2 - z4;
+
+ tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */
+ z5 = z3 << CONST_BITS;
+
+ z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */
+ z4 = z5 + tmp12;
+
+ tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+ tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */
+ z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+ tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
+
+ tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+ tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) (tmp22 + tmp12);
+ wsptr[8*7] = (int) (tmp22 - tmp12);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 10 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 10; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 <<= CONST_BITS;
+ z4 = (INT32) wsptr[4];
+ z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
+ z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z2;
+
+ tmp22 = z3 - ((z1 - z2) << 1); /* c0 = (c4-c8)*2 */
+
+ z2 = (INT32) wsptr[2];
+ z3 = (INT32) wsptr[6];
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
+ tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+ tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+ tmp20 = tmp10 + tmp12;
+ tmp24 = tmp10 - tmp12;
+ tmp21 = tmp11 + tmp13;
+ tmp23 = tmp11 - tmp13;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z3 <<= CONST_BITS;
+ z4 = (INT32) wsptr[7];
+
+ tmp11 = z2 + z4;
+ tmp13 = z2 - z4;
+
+ tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */
+ z4 = z3 + tmp12;
+
+ tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+ tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */
+ z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+ tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
+
+ tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+ tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing an 11x11 output block.
+ *
+ * Optimized algorithm with 24 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/22).
+ */
+
+GLOBAL(void)
+jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*11]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp10 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
+ tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
+ z4 = z1 + z3;
+ tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */
+ z4 -= z2;
+ tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
+ tmp21 = tmp20 + tmp23 + tmp25 -
+ MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */
+ tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
+ tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
+ tmp24 += tmp25;
+ tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */
+ tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */
+ MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */
+ tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp11 = z1 + z2;
+ tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */
+ tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */
+ tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */
+ z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
+ tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
+ tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
+ z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */
+ tmp11 += z1;
+ tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
+ tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */
+ MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
+ MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*9] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*7] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 11 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 11; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp10 <<= CONST_BITS;
+
+ z1 = (INT32) wsptr[2];
+ z2 = (INT32) wsptr[4];
+ z3 = (INT32) wsptr[6];
+
+ tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
+ tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
+ z4 = z1 + z3;
+ tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */
+ z4 -= z2;
+ tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
+ tmp21 = tmp20 + tmp23 + tmp25 -
+ MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */
+ tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
+ tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
+ tmp24 += tmp25;
+ tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */
+ tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */
+ MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */
+ tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+
+ tmp11 = z1 + z2;
+ tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */
+ tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */
+ tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */
+ z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
+ tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
+ tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
+ z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */
+ tmp11 += z1;
+ tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
+ tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */
+ MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
+ MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 12x12 output block.
+ *
+ * Optimized algorithm with 15 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/24).
+ */
+
+GLOBAL(void)
+jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*12]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+ z1 <<= CONST_BITS;
+ z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 <<= CONST_BITS;
+
+ tmp12 = z1 - z2;
+
+ tmp21 = z3 + tmp12;
+ tmp24 = z3 - tmp12;
+
+ tmp12 = z4 + z2;
+
+ tmp20 = tmp10 + tmp12;
+ tmp25 = tmp10 - tmp12;
+
+ tmp12 = z4 - z1 - z2;
+
+ tmp22 = tmp11 + tmp12;
+ tmp23 = tmp11 - tmp12;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
+ tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
+
+ tmp10 = z1 + z3;
+ tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
+ tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
+ tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
+ tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
+ tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+ tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+ tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
+ MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */
+
+ z1 -= z4;
+ z2 -= z3;
+ z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */
+ tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */
+ tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*9] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*7] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 12 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 12; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 <<= CONST_BITS;
+
+ z4 = (INT32) wsptr[4];
+ z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ z1 = (INT32) wsptr[2];
+ z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+ z1 <<= CONST_BITS;
+ z2 = (INT32) wsptr[6];
+ z2 <<= CONST_BITS;
+
+ tmp12 = z1 - z2;
+
+ tmp21 = z3 + tmp12;
+ tmp24 = z3 - tmp12;
+
+ tmp12 = z4 + z2;
+
+ tmp20 = tmp10 + tmp12;
+ tmp25 = tmp10 - tmp12;
+
+ tmp12 = z4 - z1 - z2;
+
+ tmp22 = tmp11 + tmp12;
+ tmp23 = tmp11 - tmp12;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+
+ tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
+ tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
+
+ tmp10 = z1 + z3;
+ tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
+ tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
+ tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
+ tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
+ tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+ tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+ tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
+ MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */
+
+ z1 -= z4;
+ z2 -= z3;
+ z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */
+ tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */
+ tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 13x13 output block.
+ *
+ * Optimized algorithm with 29 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/26).
+ */
+
+GLOBAL(void)
+jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*13]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */
+
+ tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */
+ tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */
+
+ tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
+
+ tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
+ tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+
+ tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
+
+ tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+ tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+
+ tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
+ tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
+ tmp15 = z1 + z4;
+ tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
+ tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */
+ tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
+ tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
+ tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */
+ tmp11 += tmp14;
+ tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
+ tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */
+ tmp12 += tmp14;
+ tmp13 += tmp14;
+ tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
+ tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
+ MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */
+ z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */
+ tmp14 += z1;
+ tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */
+ MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*9] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8*7] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 13 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 13; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 <<= CONST_BITS;
+
+ z2 = (INT32) wsptr[2];
+ z3 = (INT32) wsptr[4];
+ z4 = (INT32) wsptr[6];
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */
+
+ tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */
+ tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */
+
+ tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
+
+ tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
+ tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+
+ tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
+
+ tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+ tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+
+ tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
+ tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
+ tmp15 = z1 + z4;
+ tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
+ tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */
+ tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
+ tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
+ tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */
+ tmp11 += tmp14;
+ tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
+ tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */
+ tmp12 += tmp14;
+ tmp13 += tmp14;
+ tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
+ tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
+ MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */
+ z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */
+ tmp14 += z1;
+ tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */
+ MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 14x14 output block.
+ *
+ * Optimized algorithm with 20 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/28).
+ */
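+
+/* The work array is 8 columns by 14 rows: pass 1 writes wsptr[8*row]
+ * while stepping one column per iteration, and pass 2 then reads 8
+ * consecutive ints per row.  Constants follow the same cK convention;
+ * e.g. FIX(1.274162392) below is c4 = sqrt(2)*cos(4*pi/28)
+ * = sqrt(2)*cos(pi/7).
+ */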
+
+GLOBAL(void)
+jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*14]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
+ z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
+ z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
+
+ tmp10 = z1 + z2;
+ tmp11 = z1 + z3;
+ tmp12 = z1 - z4;
+
+ tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
+ CONST_BITS-PASS1_BITS);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
+
+ tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+ tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+ tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */
+ MULTIPLY(z2, FIX(1.378756276)); /* c2 */
+
+ tmp20 = tmp10 + tmp13;
+ tmp26 = tmp10 - tmp13;
+ tmp21 = tmp11 + tmp14;
+ tmp25 = tmp11 - tmp14;
+ tmp22 = tmp12 + tmp15;
+ tmp24 = tmp12 - tmp15;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ tmp13 = z4 << CONST_BITS;
+
+ tmp14 = z1 + z3;
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */
+ tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */
+ tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+ tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */
+ tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */
+ z1 -= z2;
+ tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */
+ tmp16 += tmp15;
+ z1 += z4;
+ z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+ tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
+ tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
+ z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
+ tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+ tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */
+
+ tmp13 = (z1 - z3) << PASS1_BITS;
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) (tmp23 + tmp13);
+ wsptr[8*10] = (int) (tmp23 - tmp13);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*9] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+ wsptr[8*7] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 14 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 14; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 <<= CONST_BITS;
+ z4 = (INT32) wsptr[4];
+ z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
+ z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
+ z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
+
+ tmp10 = z1 + z2;
+ tmp11 = z1 + z3;
+ tmp12 = z1 - z4;
+
+ tmp23 = z1 - ((z2 + z3 - z4) << 1); /* c0 = (c4+c12-c8)*2 */
+
+ z1 = (INT32) wsptr[2];
+ z2 = (INT32) wsptr[6];
+
+ z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
+
+ tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+ tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+ tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */
+ MULTIPLY(z2, FIX(1.378756276)); /* c2 */
+
+ tmp20 = tmp10 + tmp13;
+ tmp26 = tmp10 - tmp13;
+ tmp21 = tmp11 + tmp14;
+ tmp25 = tmp11 - tmp14;
+ tmp22 = tmp12 + tmp15;
+ tmp24 = tmp12 - tmp15;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+ z4 <<= CONST_BITS;
+
+ tmp14 = z1 + z3;
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */
+ tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */
+ tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+ tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */
+ tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */
+ z1 -= z2;
+ tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */
+ tmp16 += tmp15;
+ tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */
+ tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
+ tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
+ tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
+ tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+ tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */
+
+ tmp13 = ((z1 - z3) << CONST_BITS) + z4;
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 15x15 output block.
+ *
+ * Optimized algorithm with 22 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/30).
+ */
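+
+/* With an odd-length kernel the centre output has no mirrored partner:
+ * pass 1 stores tmp27 alone into wsptr[8*7] (row 7 of rows 0..14) and
+ * pass 2 writes the lone tmp27 term to outptr[7], while every other
+ * output is formed as a tmp2N +/- tmp1N pair around that centre, as in
+ * the other odd-sized kernels in this file.
+ */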
+
+GLOBAL(void)
+jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*15]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
+ tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
+
+ tmp12 = z1 - tmp10;
+ tmp13 = z1 + tmp11;
+ z1 -= (tmp11 - tmp10) << 1; /* c0 = (c6-c12)*2 */
+
+ z4 = z2 - z3;
+ z3 += z2;
+ tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
+ z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */
+
+ tmp20 = tmp13 + tmp10 + tmp11;
+ tmp23 = tmp12 - tmp10 + tmp11 + z2;
+
+ tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
+
+ tmp25 = tmp13 - tmp10 - tmp11;
+ tmp26 = tmp12 + tmp10 - tmp11 - z2;
+
+ tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
+
+ tmp21 = tmp12 + tmp10 + tmp11;
+ tmp24 = tmp13 - tmp10 + tmp11;
+ tmp11 += tmp11;
+ tmp22 = z1 + tmp11; /* c10 = c6-c12 */
+ tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp13 = z2 - z4;
+ tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
+ tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
+ tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
+
+ tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */
+ tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */
+ z2 = z1 - z4;
+ tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
+
+ tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
+ tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
+ tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */
+ z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */
+ tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */
+ tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8*9] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+ wsptr[8*7] = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 15 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 15; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 <<= CONST_BITS;
+
+ z2 = (INT32) wsptr[2];
+ z3 = (INT32) wsptr[4];
+ z4 = (INT32) wsptr[6];
+
+ tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
+ tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
+
+ tmp12 = z1 - tmp10;
+ tmp13 = z1 + tmp11;
+ z1 -= (tmp11 - tmp10) << 1; /* c0 = (c6-c12)*2 */
+
+ z4 = z2 - z3;
+ z3 += z2;
+ tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
+ z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */
+
+ tmp20 = tmp13 + tmp10 + tmp11;
+ tmp23 = tmp12 - tmp10 + tmp11 + z2;
+
+ tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
+
+ tmp25 = tmp13 - tmp10 - tmp11;
+ tmp26 = tmp12 + tmp10 - tmp11 - z2;
+
+ tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
+
+ tmp21 = tmp12 + tmp10 + tmp11;
+ tmp24 = tmp13 - tmp10 + tmp11;
+ tmp11 += tmp11;
+ tmp22 = z1 + tmp11; /* c10 = c6-c12 */
+ tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z4 = (INT32) wsptr[5];
+ z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
+ z4 = (INT32) wsptr[7];
+
+ tmp13 = z2 - z4;
+ tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
+ tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
+ tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
+
+ tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */
+ tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */
+ z2 = z1 - z4;
+ tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
+
+ tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
+ tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
+ tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */
+ z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */
+ tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */
+ tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 16x16 output block.
+ *
+ * Optimized algorithm with 28 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/32).
+ */
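+
+/* Every even-index 16-point cosine coincides with an 8-point one,
+ * c(2K)[16] == cK[8], so the even part below can reuse the named 8x8
+ * constants (FIX_0_541196100, FIX_0_899976223, FIX_2_562915447) next to
+ * the 16-point-specific FIX(...) values; the "c4[16] = c2[8]" style
+ * annotations mark those identities.
+ */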
+
+GLOBAL(void)
+jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*16]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
+ tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z3 = z1 - z2;
+ z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
+ z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */
+ tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */
+ tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+ tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+ tmp20 = tmp10 + tmp0;
+ tmp27 = tmp10 - tmp0;
+ tmp21 = tmp12 + tmp1;
+ tmp26 = tmp12 - tmp1;
+ tmp22 = tmp13 + tmp2;
+ tmp25 = tmp13 - tmp2;
+ tmp23 = tmp11 + tmp3;
+ tmp24 = tmp11 - tmp3;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp11 = z1 + z3;
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */
+ tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */
+ tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */
+ tmp0 = tmp1 + tmp2 + tmp3 -
+ MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */
+ tmp13 = tmp10 + tmp11 + tmp12 -
+ MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */
+ z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */
+ tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */
+ tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */
+ z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */
+ tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
+ tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
+ z2 += z4;
+ z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
+ tmp1 += z1;
+ tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
+ z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
+ tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
+ tmp12 += z2;
+ z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+ tmp2 += z2;
+ tmp3 += z2;
+ z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
+ tmp10 += z2;
+ tmp11 += z2;
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 16 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 16; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 <<= CONST_BITS;
+
+ z1 = (INT32) wsptr[4];
+ tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
+ tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+
+ z1 = (INT32) wsptr[2];
+ z2 = (INT32) wsptr[6];
+ z3 = z1 - z2;
+ z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
+ z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */
+ tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */
+ tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+ tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+ tmp20 = tmp10 + tmp0;
+ tmp27 = tmp10 - tmp0;
+ tmp21 = tmp12 + tmp1;
+ tmp26 = tmp12 - tmp1;
+ tmp22 = tmp13 + tmp2;
+ tmp25 = tmp13 - tmp2;
+ tmp23 = tmp11 + tmp3;
+ tmp24 = tmp11 - tmp3;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+
+ tmp11 = z1 + z3;
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */
+ tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */
+ tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */
+ tmp0 = tmp1 + tmp2 + tmp3 -
+ MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */
+ tmp13 = tmp10 + tmp11 + tmp12 -
+ MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */
+ z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */
+ tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */
+ tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */
+ z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */
+ tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
+ tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
+ z2 += z4;
+ z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
+ tmp1 += z1;
+ tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
+ z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
+ tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
+ tmp12 += z2;
+ z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+ tmp2 += z2;
+ tmp3 += z2;
+ z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
+ tmp10 += z2;
+ tmp11 += z2;
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 16x8 output block.
+ *
+ * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
+ */
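+
+/* The block is 16 samples wide and 8 high, so the column pass can use
+ * the ordinary 8-point kernel (including the all-zero-AC shortcut below)
+ * and only the row pass needs a 16-point kernel.  As with the other
+ * scaled kernels in this file, pass 2 ends with the common descale
+ * RIGHT_SHIFT by CONST_BITS+PASS1_BITS+3; the +3 removes the factor of
+ * 8 left by the scaled-up 1-D transforms.
+ */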
+
+GLOBAL(void)
+jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*8]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+ /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = DCTSIZE; ctr > 0; ctr--) {
+ /* Due to quantization, we will usually find that many of the input
+ * coefficients are zero, especially the AC terms. We can exploit this
+ * by short-circuiting the IDCT calculation for any column in which all
+ * the AC terms are zero. In that case each output is equal to the
+ * DC coefficient (with scale factor as needed).
+ * With typical images and quantization tables, half or more of the
+ * column DCT calculations can be simplified this way.
+ */
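+ /* No corresponding test is made in pass 2: the column pass normally
+ * leaves nonzero values in most rows of the work array, so an all-zero
+ * row test there would succeed too rarely to be worth its cost.
+ */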
+
+ if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
+ inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+ inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+ inptr[DCTSIZE*7] == 0) {
+ /* AC terms all zero */
+ int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
+
+ wsptr[DCTSIZE*0] = dcval;
+ wsptr[DCTSIZE*1] = dcval;
+ wsptr[DCTSIZE*2] = dcval;
+ wsptr[DCTSIZE*3] = dcval;
+ wsptr[DCTSIZE*4] = dcval;
+ wsptr[DCTSIZE*5] = dcval;
+ wsptr[DCTSIZE*6] = dcval;
+ wsptr[DCTSIZE*7] = dcval;
+
+ inptr++; /* advance pointers to next column */
+ quantptr++;
+ wsptr++;
+ continue;
+ }
+
+ /* Even part: reverse the even part of the forward DCT. */
+ /* The rotator is sqrt(2)*c(-6). */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
- tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
- tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
- tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
- tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
+ z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z2 <<= CONST_BITS;
+ z3 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ tmp0 = z2 + z3;
+ tmp1 = z2 - z3;
- tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
- tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
+ tmp10 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+ tmp11 = tmp1 + tmp3;
+ tmp12 = tmp1 - tmp3;
/* Odd part per figure 8; the matrix is unitary and hence its
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
*/
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+ z2 = tmp0 + tmp2;
+ z3 = tmp1 + tmp3;
+
+ z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+ z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z2 += z1;
+ z3 += z1;
+
+ z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+ tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp0 += z1 + z2;
+ tmp3 += z1 + z3;
+
+ z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp1 += z1 + z3;
+ tmp2 += z1 + z2;
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+
+ inptr++; /* advance pointers to next column */
+ quantptr++;
+ wsptr++;
+ }
+
+ /* Pass 2: process 8 rows from work array, store into output array.
+ * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 <<= CONST_BITS;
+
+ z1 = (INT32) wsptr[4];
+ tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
+ tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+
+ z1 = (INT32) wsptr[2];
+ z2 = (INT32) wsptr[6];
+ z3 = z1 - z2;
+ z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
+ z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */
+ tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */
+ tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+ tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+ tmp20 = tmp10 + tmp0;
+ tmp27 = tmp10 - tmp0;
+ tmp21 = tmp12 + tmp1;
+ tmp26 = tmp12 - tmp1;
+ tmp22 = tmp13 + tmp2;
+ tmp25 = tmp13 - tmp2;
+ tmp23 = tmp11 + tmp3;
+ tmp24 = tmp11 - tmp3;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+
+ tmp11 = z1 + z3;
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */
+ tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */
+ tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */
+ tmp0 = tmp1 + tmp2 + tmp3 -
+ MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */
+ tmp13 = tmp10 + tmp11 + tmp12 -
+ MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */
+ z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */
+ tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */
+ tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */
+ z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */
+ tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
+ tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
+ z2 += z4;
+ z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
+ tmp1 += z1;
+ tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
+ z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
+ tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
+ tmp12 += z2;
+ z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+ tmp2 += z2;
+ tmp3 += z2;
+ z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
+ tmp10 += z2;
+ tmp11 += z2;
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 14x7 output block.
+ *
+ * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
+ */
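+
+/* The two passes use different kernel lengths: pass 1 is a 7-point IDCT
+ * (cK = sqrt(2)*cos(K*pi/14)) over the 8 input columns, giving a 7-row
+ * work array, and pass 2 is the same 14-point row kernel used by
+ * jpeg_idct_14x14 above, so the block comes out 14 wide by 7 high.
+ */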
+
+GLOBAL(void)
+jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*7]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp23 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
+ tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
+ tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+ tmp10 = z1 + z3;
+ z2 -= tmp10;
+ tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
+ tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */
+ tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */
+ tmp23 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+
+ tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
+ tmp10 = tmp11 - tmp12;
+ tmp11 += tmp12;
+ tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
+ tmp11 += tmp12;
+ z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
+ tmp10 += z2;
+ tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 7 rows from work array, store into output array.
+ * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 7; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 <<= CONST_BITS;
+ z4 = (INT32) wsptr[4];
+ z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
+ z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
+ z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
+
+ tmp10 = z1 + z2;
+ tmp11 = z1 + z3;
+ tmp12 = z1 - z4;
+
+ tmp23 = z1 - ((z2 + z3 - z4) << 1); /* c0 = (c4+c12-c8)*2 */
+
+ z1 = (INT32) wsptr[2];
+ z2 = (INT32) wsptr[6];
+
+ z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
+
+ tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+ tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+ tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */
+ MULTIPLY(z2, FIX(1.378756276)); /* c2 */
+
+ tmp20 = tmp10 + tmp13;
+ tmp26 = tmp10 - tmp13;
+ tmp21 = tmp11 + tmp14;
+ tmp25 = tmp11 - tmp14;
+ tmp22 = tmp12 + tmp15;
+ tmp24 = tmp12 - tmp15;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+ z4 <<= CONST_BITS;
+
+ tmp14 = z1 + z3;
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */
+ tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */
+ tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+ tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */
+ tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */
+ z1 -= z2;
+ tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */
+ tmp16 += tmp15;
+ tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */
+ tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
+ tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
+ tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
+ tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+ tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */
+
+ tmp13 = ((z1 - z3) << CONST_BITS) + z4;
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 12x6 output block.
+ *
+ * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
+ */
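+
+/* The fudge factors added ahead of each pass implement round-to-nearest
+ * for the later right shifts: ONE << (CONST_BITS-PASS1_BITS-1) is half of
+ * the pass-1 descale step 2^(CONST_BITS-PASS1_BITS), and the
+ * ONE << (PASS1_BITS+2) added to wsptr[0], once shifted left by
+ * CONST_BITS, is half of the pass-2 descale step
+ * 2^(CONST_BITS+PASS1_BITS+3).
+ */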
+
+GLOBAL(void)
+jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*6]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp10 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp20 = MULTIPLY(tmp12, FIX(0.707106781)); /* c4 */
+ tmp11 = tmp10 + tmp20;
+ tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
+ tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp10 = MULTIPLY(tmp20, FIX(1.224744871)); /* c2 */
+ tmp20 = tmp11 + tmp10;
+ tmp22 = tmp11 - tmp10;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+ tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
+ tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
+ tmp11 = (z1 - z2 - z3) << PASS1_BITS;
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) (tmp21 + tmp11);
+ wsptr[8*4] = (int) (tmp21 - tmp11);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 6 rows from work array, store into output array.
+ * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 <<= CONST_BITS;
+
+ z4 = (INT32) wsptr[4];
+ z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ z1 = (INT32) wsptr[2];
+ z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+ z1 <<= CONST_BITS;
+ z2 = (INT32) wsptr[6];
+ z2 <<= CONST_BITS;
+
+ tmp12 = z1 - z2;
+
+ tmp21 = z3 + tmp12;
+ tmp24 = z3 - tmp12;
+
+ tmp12 = z4 + z2;
+
+ tmp20 = tmp10 + tmp12;
+ tmp25 = tmp10 - tmp12;
+
+ tmp12 = z4 - z1 - z2;
+
+ tmp22 = tmp11 + tmp12;
+ tmp23 = tmp11 - tmp12;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z4 = (INT32) wsptr[7];
+
+ tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
+ tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
+
+ tmp10 = z1 + z3;
+ tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
+ tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
+ tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
+ tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
+ tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+ tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+ tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
+ MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */
+
+ z1 -= z4;
+ z2 -= z3;
+ z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */
+ tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */
+ tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 10x5 output block.
+ *
+ * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
+ */
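+
+/* The final range_limit[... & RANGE_MASK] lookups (table obtained via
+ * IDCT_range_limit(cinfo)) both clamp the descaled value to the legal
+ * sample range and fold the CENTERJSAMPLE level shift back in, so no
+ * explicit +CENTERJSAMPLE appears here; the RANGE_MASK just keeps wild
+ * out-of-range values inside the table.
+ */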
+
+GLOBAL(void)
+jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*5]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp12 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
+ z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
+ z3 = tmp12 + z2;
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z1;
+ tmp12 -= z2 << 2;
+
+ /* Odd part */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
+ tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
+ tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 5 rows from work array, store into output array.
+ * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 5; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 <<= CONST_BITS;
+ z4 = (INT32) wsptr[4];
+ z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
+ z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z2;
+
+ tmp22 = z3 - ((z1 - z2) << 1); /* c0 = (c4-c8)*2 */
+
+ z2 = (INT32) wsptr[2];
+ z3 = (INT32) wsptr[6];
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
+ tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+ tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+ tmp20 = tmp10 + tmp12;
+ tmp24 = tmp10 - tmp12;
+ tmp21 = tmp11 + tmp13;
+ tmp23 = tmp11 - tmp13;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ z3 <<= CONST_BITS;
+ z4 = (INT32) wsptr[7];
+
+ tmp11 = z2 + z4;
+ tmp13 = z2 - z4;
+
+ tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */
+ z4 = z3 + tmp12;
+
+ tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+ tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */
+ z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+ tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
+
+ tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+ tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
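All of these kernels share one fixed-point idiom: real constants are premultiplied by 2^CONST_BITS through FIX(), PASS1_BITS of extra precision are carried between the two passes, and a half-unit fudge term is added before each RIGHT_SHIFT so the shift rounds to nearest instead of truncating. A minimal standalone sketch of that idiom follows; the EX_-prefixed names are illustrative only and assume the 8-bit-sample setting (CONST_BITS == 13).

#define EX_CONST_BITS 13                  /* precision of the fixed-point constants */
#define EX_ONE        ((long) 1)
#define EX_FIX(x)     ((long) ((x) * (double) (EX_ONE << EX_CONST_BITS) + 0.5))

/* Multiply an integer coefficient by a real constant and descale with
 * rounding: the MULTIPLY + "add fudge factor" + RIGHT_SHIFT pattern above.
 */
static long ex_round_scaled_product (long coef, double c)
{
  long prod = coef * EX_FIX(c);           /* product scaled by 2^EX_CONST_BITS */
  return (prod + (EX_ONE << (EX_CONST_BITS - 1))) >> EX_CONST_BITS;
}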
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing an 8x4 output block.
+ *
+ * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3;
+ INT32 tmp10, tmp11, tmp12, tmp13;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*4]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+
+ tmp10 = (tmp0 + tmp2) << PASS1_BITS;
+ tmp12 = (tmp0 - tmp2) << PASS1_BITS;
+
+ /* Odd part */
+ /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
+ CONST_BITS-PASS1_BITS);
+ tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
+ CONST_BITS-PASS1_BITS);
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) (tmp10 + tmp0);
+ wsptr[8*3] = (int) (tmp10 - tmp0);
+ wsptr[8*1] = (int) (tmp12 + tmp2);
+ wsptr[8*2] = (int) (tmp12 - tmp2);
+ }
+
+ /* Pass 2: process rows from work array, store into output array. */
+ /* Note that we must descale the results by a factor of 8 == 2**3, */
+ /* and also undo the PASS1_BITS scaling. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 4; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part: reverse the even part of the forward DCT. */
+ /* The rotator is sqrt(2)*c(-6). */
+
+ z2 = (INT32) wsptr[2];
+ z3 = (INT32) wsptr[6];
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+ tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
+ /* Add fudge factor here for final descale. */
+ z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 = (INT32) wsptr[4];
+
+ tmp0 = (z2 + z3) << CONST_BITS;
+ tmp1 = (z2 - z3) << CONST_BITS;
+
+ tmp10 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+ tmp11 = tmp1 + tmp3;
+ tmp12 = tmp1 - tmp3;
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
tmp0 = (INT32) wsptr[7];
tmp1 = (INT32) wsptr[5];
tmp2 = (INT32) wsptr[3];
tmp3 = (INT32) wsptr[1];
-
- z1 = tmp0 + tmp3;
- z2 = tmp1 + tmp2;
- z3 = tmp0 + tmp2;
- z4 = tmp1 + tmp3;
- z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-
+
+ z2 = tmp0 + tmp2;
+ z3 = tmp1 + tmp3;
+
+ z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+ z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z2 += z1;
+ z3 += z1;
+
+ z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp0 += z1 + z2;
+ tmp3 += z1 + z3;
+
+ z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
- tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ tmp1 += z1 + z3;
+ tmp2 += z1 + z2;
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += DCTSIZE; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 6x3 output block.
+ *
+ * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[6*3]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+ tmp10 = tmp0 + tmp12;
+ tmp2 = tmp0 - tmp12 - tmp12;
+
+ /* Odd part */
+
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+ /* Final output stage */
+
+ wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 3 rows from work array, store into output array.
+ * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 3; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 <<= CONST_BITS;
+ tmp2 = (INT32) wsptr[4];
+ tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
+ tmp1 = tmp0 + tmp10;
+ tmp11 = tmp0 - tmp10 - tmp10;
+ tmp10 = (INT32) wsptr[2];
+ tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
+ tmp10 = tmp1 + tmp0;
+ tmp12 = tmp1 - tmp0;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+ tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+ tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+ tmp1 = (z1 - z2 - z3) << CONST_BITS;
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 6; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 4x2 output block.
+ *
+ * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp2, tmp10, tmp12;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ INT32 * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ INT32 workspace[4*2]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+
+ /* Odd part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+ /* Final output stage */
+
+ wsptr[4*0] = tmp10 + tmp0;
+ wsptr[4*1] = tmp10 - tmp0;
+ }
+
+ /* Pass 2: process 2 rows from work array, store into output array.
+ * 4-point IDCT kernel,
+ * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 2; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = wsptr[0] + (ONE << 2);
+ tmp2 = wsptr[2];
+
+ tmp10 = (tmp0 + tmp2) << CONST_BITS;
+ tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+ /* Odd part */
+ /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+ z2 = wsptr[1];
+ z3 = wsptr[3];
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
+ tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+ tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 4; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 2x1 output block.
+ *
+ * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp10;
+ ISLOW_MULT_TYPE * quantptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ SHIFT_TEMPS
+
+ /* Pass 1: empty. */
+
+ /* Pass 2: process 1 row from input, store into output array. */
+
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ outptr = output_buf[0] + output_col;
+
+ /* Even part */
+
+ tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]);
+ /* Add fudge factor here for final descale. */
+ tmp10 += ONE << 2;
+
+ /* Odd part */
+
+ tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]);
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) & RANGE_MASK];
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing an 8x16 output block.
+ *
+ * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8*16]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
+ tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z3 = z1 - z2;
+ z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
+ z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */
+ tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */
+ tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+ tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+ tmp20 = tmp10 + tmp0;
+ tmp27 = tmp10 - tmp0;
+ tmp21 = tmp12 + tmp1;
+ tmp26 = tmp12 - tmp1;
+ tmp22 = tmp13 + tmp2;
+ tmp25 = tmp13 - tmp2;
+ tmp23 = tmp11 + tmp3;
+ tmp24 = tmp11 - tmp3;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp11 = z1 + z3;
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */
+ tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */
+ tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */
+ tmp0 = tmp1 + tmp2 + tmp3 -
+ MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */
+ tmp13 = tmp10 + tmp11 + tmp12 -
+ MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */
+ z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */
+ tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */
+ tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */
+ z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */
+ tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
+ tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
+ z2 += z4;
+ z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
+ tmp1 += z1;
+ tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
+ z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
+ tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
+ tmp12 += z2;
+ z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+ tmp2 += z2;
+ tmp3 += z2;
+ z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
+ tmp10 += z2;
+ tmp11 += z2;
+
+ /* Final output stage */
+
+ wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process rows from work array, store into output array. */
+ /* Note that we must descale the results by a factor of 8 == 2**3, */
+ /* and also undo the PASS1_BITS scaling. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 16; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part: reverse the even part of the forward DCT. */
+ /* The rotator is sqrt(2)*c(-6). */
+
+ z2 = (INT32) wsptr[2];
+ z3 = (INT32) wsptr[6];
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+ tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
+ /* Add fudge factor here for final descale. */
+ z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 = (INT32) wsptr[4];
+
+ tmp0 = (z2 + z3) << CONST_BITS;
+ tmp1 = (z2 - z3) << CONST_BITS;
+
+ tmp10 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+ tmp11 = tmp1 + tmp3;
+ tmp12 = tmp1 - tmp3;
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
- z3 += z5;
- z4 += z5;
+ tmp0 = (INT32) wsptr[7];
+ tmp1 = (INT32) wsptr[5];
+ tmp2 = (INT32) wsptr[3];
+ tmp3 = (INT32) wsptr[1];
- tmp0 += z1 + z3;
- tmp1 += z2 + z4;
- tmp2 += z2 + z3;
- tmp3 += z1 + z4;
+ z2 = tmp0 + tmp2;
+ z3 = tmp1 + tmp3;
+
+ z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+ z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z2 += z1;
+ z3 += z1;
+
+ z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+ tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp0 += z1 + z2;
+ tmp3 += z1 + z3;
+
+ z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp1 += z1 + z3;
+ tmp2 += z1 + z2;
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
- outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
- CONST_BITS+PASS1_BITS+3)
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+ CONST_BITS+PASS1_BITS+3)
& RANGE_MASK];
- outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
- CONST_BITS+PASS1_BITS+3)
+ outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+ CONST_BITS+PASS1_BITS+3)
& RANGE_MASK];
- outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
- CONST_BITS+PASS1_BITS+3)
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
& RANGE_MASK];
- outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
- CONST_BITS+PASS1_BITS+3)
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
& RANGE_MASK];
- outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
- CONST_BITS+PASS1_BITS+3)
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+ CONST_BITS+PASS1_BITS+3)
& RANGE_MASK];
- outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
- CONST_BITS+PASS1_BITS+3)
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+ CONST_BITS+PASS1_BITS+3)
& RANGE_MASK];
- outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
- CONST_BITS+PASS1_BITS+3)
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
& RANGE_MASK];
- outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
- CONST_BITS+PASS1_BITS+3)
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
& RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
}
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 7x14 output block.
+ *
+ * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[7*14]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
+ z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
+ z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
+
+ tmp10 = z1 + z2;
+ tmp11 = z1 + z3;
+ tmp12 = z1 - z4;
+
+ tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
+ CONST_BITS-PASS1_BITS);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
+
+ tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+ tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+ tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */
+ MULTIPLY(z2, FIX(1.378756276)); /* c2 */
+
+ tmp20 = tmp10 + tmp13;
+ tmp26 = tmp10 - tmp13;
+ tmp21 = tmp11 + tmp14;
+ tmp25 = tmp11 - tmp14;
+ tmp22 = tmp12 + tmp15;
+ tmp24 = tmp12 - tmp15;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ tmp13 = z4 << CONST_BITS;
+
+ tmp14 = z1 + z3;
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */
+ tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */
+ tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+ tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */
+ tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */
+ z1 -= z2;
+ tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */
+ tmp16 += tmp15;
+ z1 += z4;
+ z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+ tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
+ tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
+ z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
+ tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.690643133)); /* c1+c9-c11 */
+ tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */
+
+ tmp13 = (z1 - z3) << PASS1_BITS;
+
+ /* Final output stage */
+
+ wsptr[7*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[7*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[7*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[7*3] = (int) (tmp23 + tmp13);
+ wsptr[7*10] = (int) (tmp23 - tmp13);
+ wsptr[7*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[7*9] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[7*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[7*8] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[7*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+ wsptr[7*7] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 14 rows from work array, store into output array.
+ * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 14; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp23 <<= CONST_BITS;
+
+ z1 = (INT32) wsptr[2];
+ z2 = (INT32) wsptr[4];
+ z3 = (INT32) wsptr[6];
+
+ tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
+ tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
+ tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+ tmp10 = z1 + z3;
+ z2 -= tmp10;
+ tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
+ tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */
+ tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */
+ tmp23 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+
+ tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
+ tmp10 = tmp11 - tmp12;
+ tmp11 += tmp12;
+ tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
+ tmp11 += tmp12;
+ z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
+ tmp10 += z2;
+ tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 7; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 6x12 output block.
+ *
+ * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+ INT32 z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[6*12]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+ z1 <<= CONST_BITS;
+ z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 <<= CONST_BITS;
+
+ tmp12 = z1 - z2;
+
+ tmp21 = z3 + tmp12;
+ tmp24 = z3 - tmp12;
+
+ tmp12 = z4 + z2;
+
+ tmp20 = tmp10 + tmp12;
+ tmp25 = tmp10 - tmp12;
+
+ tmp12 = z4 - z1 - z2;
+
+ tmp22 = tmp11 + tmp12;
+ tmp23 = tmp11 - tmp12;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
+ tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
+
+ tmp10 = z1 + z3;
+ tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
+ tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
+ tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
+ tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
+ tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+ tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+ tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
+ MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */
+
+ z1 -= z4;
+ z2 -= z3;
+ z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */
+ tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */
+ tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */
+
+ /* Final output stage */
+
+ wsptr[6*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[6*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[6*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[6*9] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[6*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[6*8] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[6*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[6*7] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[6*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[6*6] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 12 rows from work array, store into output array.
+ * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 12; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp10 <<= CONST_BITS;
+ tmp12 = (INT32) wsptr[4];
+ tmp20 = MULTIPLY(tmp12, FIX(0.707106781)); /* c4 */
+ tmp11 = tmp10 + tmp20;
+ tmp21 = tmp10 - tmp20 - tmp20;
+ tmp20 = (INT32) wsptr[2];
+ tmp10 = MULTIPLY(tmp20, FIX(1.224744871)); /* c2 */
+ tmp20 = tmp11 + tmp10;
+ tmp22 = tmp11 - tmp10;
+
+ /* Odd part */
+
+ z1 = (INT32) wsptr[1];
+ z2 = (INT32) wsptr[3];
+ z3 = (INT32) wsptr[5];
+ tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+ tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
+ tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
+ tmp11 = (z1 - z2 - z3) << CONST_BITS;
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 6; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 5x10 output block.
+ *
+ * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+ INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+ INT32 z1, z2, z3, z4, z5;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[5*10]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
+ z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z2;
+
+ tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1), /* c0 = (c4-c8)*2 */
+ CONST_BITS-PASS1_BITS);
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
+ tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+ tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+ tmp20 = tmp10 + tmp12;
+ tmp24 = tmp10 - tmp12;
+ tmp21 = tmp11 + tmp13;
+ tmp23 = tmp11 - tmp13;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+ tmp11 = z2 + z4;
+ tmp13 = z2 - z4;
+
+ tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */
+ z5 = z3 << CONST_BITS;
+
+ z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */
+ z4 = z5 + tmp12;
+
+ tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+ tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */
+ z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+ tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
+
+ tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+ tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+ /* Final output stage */
+
+ wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+ wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+ wsptr[5*2] = (int) (tmp22 + tmp12);
+ wsptr[5*7] = (int) (tmp22 - tmp12);
+ wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 10 rows from work array, store into output array.
+ * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 10; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp12 <<= CONST_BITS;
+ tmp13 = (INT32) wsptr[2];
+ tmp14 = (INT32) wsptr[4];
+ z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
+ z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
+ z3 = tmp12 + z2;
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z1;
+ tmp12 -= z2 << 2;
+
+ /* Odd part */
+
+ z2 = (INT32) wsptr[1];
+ z3 = (INT32) wsptr[3];
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
+ tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
+ tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 5; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 4x8 output block.
+ *
+ * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp3;
+ INT32 tmp10, tmp11, tmp12, tmp13;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[4*8]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+ /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 4; ctr > 0; ctr--) {
+ /* Due to quantization, we will usually find that many of the input
+ * coefficients are zero, especially the AC terms. We can exploit this
+ * by short-circuiting the IDCT calculation for any column in which all
+ * the AC terms are zero. In that case each output is equal to the
+ * DC coefficient (with scale factor as needed).
+ * With typical images and quantization tables, half or more of the
+ * column DCT calculations can be simplified this way.
+ */
+
+ if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
+ inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+ inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+ inptr[DCTSIZE*7] == 0) {
+ /* AC terms all zero */
+ int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
+
+ wsptr[4*0] = dcval;
+ wsptr[4*1] = dcval;
+ wsptr[4*2] = dcval;
+ wsptr[4*3] = dcval;
+ wsptr[4*4] = dcval;
+ wsptr[4*5] = dcval;
+ wsptr[4*6] = dcval;
+ wsptr[4*7] = dcval;
+
+ inptr++; /* advance pointers to next column */
+ quantptr++;
+ wsptr++;
+ continue;
+ }
+
+ /* Even part: reverse the even part of the forward DCT. */
+ /* The rotator is sqrt(2)*c(-6). */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+ tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z2 <<= CONST_BITS;
+ z3 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+ tmp0 = z2 + z3;
+ tmp1 = z2 - z3;
+
+ tmp10 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+ tmp11 = tmp1 + tmp3;
+ tmp12 = tmp1 - tmp3;
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+ z2 = tmp0 + tmp2;
+ z3 = tmp1 + tmp3;
+
+ z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+ z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z2 += z1;
+ z3 += z1;
+
+ z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+ tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp0 += z1 + z2;
+ tmp3 += z1 + z3;
+
+ z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp1 += z1 + z3;
+ tmp2 += z1 + z2;
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+ wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+ wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+
+ inptr++; /* advance pointers to next column */
+ quantptr++;
+ wsptr++;
+ }
+
+ /* Pass 2: process 8 rows from work array, store into output array.
+ * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp2 = (INT32) wsptr[2];
+
+ tmp10 = (tmp0 + tmp2) << CONST_BITS;
+ tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+ /* Odd part */
+ /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+ z2 = (INT32) wsptr[1];
+ z3 = (INT32) wsptr[3];
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
+ tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+ tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 4; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 3x6 output block.
+ *
+ * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ int * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[3*6]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 <<= CONST_BITS;
+ /* Add fudge factor here for final descale. */
+ tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
+ tmp1 = tmp0 + tmp10;
+ tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
+ tmp10 = tmp1 + tmp0;
+ tmp12 = tmp1 - tmp0;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+ tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+ tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+ tmp1 = (z1 - z2 - z3) << PASS1_BITS;
+
+ /* Final output stage */
+
+ wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[3*1] = (int) (tmp11 + tmp1);
+ wsptr[3*4] = (int) (tmp11 - tmp1);
+ wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+ }
+
+ /* Pass 2: process 6 rows from work array, store into output array.
+ * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
+ */
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 <<= CONST_BITS;
+ tmp2 = (INT32) wsptr[2];
+ tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+ tmp10 = tmp0 + tmp12;
+ tmp2 = tmp0 - tmp12 - tmp12;
+
+ /* Odd part */
+
+ tmp12 = (INT32) wsptr[1];
+ tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
+ CONST_BITS+PASS1_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 3; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 2x4 output block.
+ *
+ * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp2, tmp10, tmp12;
+ INT32 z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ INT32 * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ INT32 workspace[2*4]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * 4-point IDCT kernel,
+ * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+ */
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+
+ tmp10 = (tmp0 + tmp2) << CONST_BITS;
+ tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+ /* Odd part */
+ /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+ z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
+ tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+ tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+ /* Final output stage */
+
+ wsptr[2*0] = tmp10 + tmp0;
+ wsptr[2*3] = tmp10 - tmp0;
+ wsptr[2*1] = tmp12 + tmp2;
+ wsptr[2*2] = tmp12 - tmp2;
+ }
+
+ /* Pass 2: process 4 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 4; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp10 = wsptr[0] + (ONE << (CONST_BITS+2));
+
+ /* Odd part */
+
+ tmp0 = wsptr[1];
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
+ & RANGE_MASK];
+
+ wsptr += 2; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 1x2 output block.
+ *
+ * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ INT32 tmp0, tmp10;
+ ISLOW_MULT_TYPE * quantptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ SHIFT_TEMPS
+
+ /* Process 1 column from input, store into output array. */
+
+ quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+
+ /* Even part */
+
+ tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ /* Add fudge factor here for final descale. */
+ tmp10 += ONE << 2;
+
+ /* Odd part */
+
+ tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+ /* Final output stage */
+
+ output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3)
+ & RANGE_MASK];
+ output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3)
+ & RANGE_MASK];
+}
+
+#endif /* IDCT_SCALING_SUPPORTED */
#endif /* DCT_ISLOW_SUPPORTED */
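These reduced-size kernels back the decoder's fractional output scaling. A rough caller-side sketch, assuming an already initialized jpeg_decompress_struct named cinfo with its data source set and jpeg_read_header() already called:

  /* Ask for output at 5/8 of nominal size; the library then installs an
   * appropriately scaled IDCT method per component, such as the kernels above.
   */
  cinfo.scale_num = 5;
  cinfo.scale_denom = 8;
  jpeg_calc_output_dimensions(&cinfo);    /* fills output_width / output_height */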
diff --git a/src/libjpeg/jmorecfg.h b/src/libjpeg/jmorecfg.h
index 2182f15..c610bc2 100644
--- a/src/libjpeg/jmorecfg.h
+++ b/src/libjpeg/jmorecfg.h
@@ -2,6 +2,7 @@
* jmorecfg.h
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -158,8 +159,11 @@ typedef short INT16;
/* INT32 must hold at least signed 32-bit values. */
#ifndef XMD_H /* X11/xmd.h correctly defines INT32 */
-typedef int JINT32;
-#define INT32 JINT32
+#ifndef _BASETSD_H_ /* Microsoft defines it in basetsd.h */
+#ifndef QGLOBAL_H /* Qt defines it in qglobal.h */
+typedef long INT32;
+#endif
+#endif
#endif
/* Datatype used for image dimensions. The JPEG standard only supports
@@ -210,11 +214,13 @@ typedef unsigned int JDIMENSION;
* explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
*/
+#ifndef FAR
#ifdef NEED_FAR_POINTERS
#define FAR far
#else
#define FAR
#endif
+#endif
/*
@@ -257,8 +263,6 @@ typedef int boolean;
* (You may HAVE to do that if your compiler doesn't like null source files.)
*/
-/* Arithmetic coding is unsupported for legal reasons. Complaints to IBM. */
-
/* Capability options common to encoder and decoder: */
#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */
@@ -267,9 +271,10 @@ typedef int boolean;
/* Encoder capability options: */
-#undef C_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */
+#define C_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */
#define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
#define C_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/
+#define DCT_SCALING_SUPPORTED /* Input rescaling via DCT? (Requires DCT_ISLOW)*/
#define ENTROPY_OPT_SUPPORTED /* Optimization of entropy coding parms? */
/* Note: if you selected 12-bit data precision, it is dangerous to turn off
* ENTROPY_OPT_SUPPORTED. The standard Huffman tables are only good for 8-bit
@@ -283,12 +288,12 @@ typedef int boolean;
/* Decoder capability options: */
-#undef D_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */
+#define D_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */
#define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
#define D_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/
+#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */
#define SAVE_MARKERS_SUPPORTED /* jpeg_save_markers() needed? */
#define BLOCK_SMOOTHING_SUPPORTED /* Block smoothing? (Progressive only) */
-#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */
#undef UPSAMPLE_SCALING_SUPPORTED /* Output rescaling at upsample stage? */
#define UPSAMPLE_MERGING_SUPPORTED /* Fast path for sloppy upsampling? */
#define QUANT_1PASS_SUPPORTED /* 1-pass color quantization? */
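With C_ARITH_CODING_SUPPORTED and D_ARITH_CODING_SUPPORTED defined, the arithmetic entropy back ends are compiled in; whether they are used remains a per-object run-time choice. Sketch, assuming a normally initialized compressor cinfo:

  cinfo.arith_code = TRUE;    /* select arithmetic rather than Huffman coding */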
diff --git a/src/libjpeg/jpegint.h b/src/libjpeg/jpegint.h
index 95b00d4..69b781b 100644
--- a/src/libjpeg/jpegint.h
+++ b/src/libjpeg/jpegint.h
@@ -2,6 +2,7 @@
* jpegint.h
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -99,14 +100,16 @@ struct jpeg_downsampler {
};
/* Forward DCT (also controls coefficient quantization) */
+typedef JMETHOD(void, forward_DCT_ptr,
+ (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+ JDIMENSION start_row, JDIMENSION start_col,
+ JDIMENSION num_blocks));
+
struct jpeg_forward_dct {
JMETHOD(void, start_pass, (j_compress_ptr cinfo));
- /* perhaps this should be an array??? */
- JMETHOD(void, forward_DCT, (j_compress_ptr cinfo,
- jpeg_component_info * compptr,
- JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
- JDIMENSION start_row, JDIMENSION start_col,
- JDIMENSION num_blocks));
+ /* It is useful to allow each component to have a separate FDCT method. */
+ forward_DCT_ptr forward_DCT[MAX_COMPONENTS];
};
/* Entropy encoding */
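Since the forward DCT method is now an array indexed by component, a coefficient controller is expected to fetch the per-component pointer before its block loop. A sketch of the implied call site, reusing the parameter names from the typedef above (sample_data, coef_blocks, start_row, start_col and num_blocks stand for the controller's own buffers and counters):

  forward_DCT_ptr fdct_method =
    cinfo->fdct->forward_DCT[compptr->component_index];
  (*fdct_method) (cinfo, compptr, sample_data, coef_blocks,
                  start_row, start_col, num_blocks);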
@@ -303,7 +306,7 @@ struct jpeg_color_quantizer {
#define jinit_downsampler jIDownsampler
#define jinit_forward_dct jIFDCT
#define jinit_huff_encoder jIHEncoder
-#define jinit_phuff_encoder jIPHEncoder
+#define jinit_arith_encoder jIAEncoder
#define jinit_marker_writer jIMWriter
#define jinit_master_decompress jIDMaster
#define jinit_d_main_controller jIDMainC
@@ -312,7 +315,7 @@ struct jpeg_color_quantizer {
#define jinit_input_controller jIInCtlr
#define jinit_marker_reader jIMReader
#define jinit_huff_decoder jIHDecoder
-#define jinit_phuff_decoder jIPHDecoder
+#define jinit_arith_decoder jIADecoder
#define jinit_inverse_dct jIIDCT
#define jinit_upsampler jIUpsampler
#define jinit_color_deconverter jIDColor
@@ -344,7 +347,7 @@ EXTERN(void) jinit_color_converter JPP((j_compress_ptr cinfo));
EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo));
EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo));
EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo));
EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo));
/* Decompression module initialization routines */
EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo));
@@ -357,7 +360,7 @@ EXTERN(void) jinit_d_post_controller JPP((j_decompress_ptr cinfo,
EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo));
EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo));
EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo));
EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo));
EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo));
EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo));
diff --git a/src/libjpeg/jpeglib.h b/src/libjpeg/jpeglib.h
index d1be8dd..341a19c 100644
--- a/src/libjpeg/jpeglib.h
+++ b/src/libjpeg/jpeglib.h
@@ -2,6 +2,7 @@
* jpeglib.h
*
* Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -26,11 +27,17 @@
#include "jmorecfg.h" /* seldom changed options */
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+extern "C" {
+#endif
+#endif
+
/* Version ID for the JPEG library.
- * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 70".
*/
-#define JPEG_LIB_VERSION 62 /* Version 6b */
+#define JPEG_LIB_VERSION 70 /* Version 7.0 */
/* Various constants determining the sizes of things.
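
Applications can key off the bumped version macro, and C++ consumers no longer need to wrap the include in extern "C" themselves unless they define DONT_USE_EXTERN_C. A hedged compile-time check in C (the HAVE_LIBJPEG_V7_API name is an assumption for illustration):

#include <stdio.h>
#include "jpeglib.h"

/* Gate use of v7-only parameters (e.g. compression-side scaling) on the
 * library version macro. */
#if JPEG_LIB_VERSION >= 70
#define HAVE_LIBJPEG_V7_API 1
#else
#define HAVE_LIBJPEG_V7_API 0
#endif
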
@@ -138,18 +145,18 @@ typedef struct {
*/
JDIMENSION width_in_blocks;
JDIMENSION height_in_blocks;
- /* Size of a DCT block in samples. Always DCTSIZE for compression.
- * For decompression this is the size of the output from one DCT block,
- * reflecting any scaling we choose to apply during the IDCT step.
- * Values of 1,2,4,8 are likely to be supported. Note that different
- * components may receive different IDCT scalings.
+ /* Size of a DCT block in samples,
+ * reflecting any scaling we choose to apply during the DCT step.
+ * Values from 1 to 16 are supported.
+ * Note that different components may receive different DCT scalings.
*/
- int DCT_scaled_size;
+ int DCT_h_scaled_size;
+ int DCT_v_scaled_size;
/* The downsampled dimensions are the component's actual, unpadded number
- * of samples at the main buffer (preprocessing/compression interface), thus
- * downsampled_width = ceil(image_width * Hi/Hmax)
- * and similarly for height. For decompression, IDCT scaling is included, so
- * downsampled_width = ceil(image_width * Hi/Hmax * DCT_scaled_size/DCTSIZE)
+ * of samples at the main buffer (preprocessing/compression interface);
+ * DCT scaling is included, so
+ * downsampled_width = ceil(image_width * Hi/Hmax * DCT_h_scaled_size/DCTSIZE)
+ * and similarly for height.
*/
JDIMENSION downsampled_width; /* actual width in samples */
JDIMENSION downsampled_height; /* actual height in samples */
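
The downsampled-size formula in the revised comment can be written out directly with the jdiv_round_up() helper declared in jpegint.h; the wrapper below is illustrative only and simply transcribes the comment:

/* Sketch of downsampled_width =
 *   ceil(image_width * Hi/Hmax * DCT_h_scaled_size/DCTSIZE). */
LOCAL(JDIMENSION)
calc_downsampled_width (j_decompress_ptr cinfo, jpeg_component_info * compptr)
{
  return (JDIMENSION)
    jdiv_round_up((long) cinfo->image_width *
                  (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
                  (long) (cinfo->max_h_samp_factor * DCTSIZE));
}
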
@@ -291,6 +298,17 @@ struct jpeg_compress_struct {
* helper routines to simplify changing parameters.
*/
+ unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+ JDIMENSION jpeg_width; /* scaled JPEG image width */
+ JDIMENSION jpeg_height; /* scaled JPEG image height */
+ /* Dimensions of actual JPEG image that will be written to file,
+ * derived from input dimensions by scaling factors above.
+ * These fields are computed by jpeg_start_compress().
+ * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+ * in advance of calling jpeg_start_compress().
+ */
+
int data_precision; /* bits of precision in image data */
int num_components; /* # of color components in JPEG image */
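
These fields give the compressor its own scaling control. A hedged usage sketch, assuming image_width/image_height and the usual defaults are already set on cinfo:

/* Sketch: request a 1/2-scale JPEG and query the output size up front. */
static void
request_half_size (j_compress_ptr cinfo)
{
  cinfo->scale_num = 1;
  cinfo->scale_denom = 2;
  jpeg_calc_jpeg_dimensions(cinfo);
  /* cinfo->jpeg_width and cinfo->jpeg_height now hold the scaled size;
   * jpeg_start_compress() computes the same values. */
}
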
@@ -298,14 +316,17 @@ struct jpeg_compress_struct {
jpeg_component_info * comp_info;
/* comp_info[i] describes component that appears i'th in SOF */
-
+
JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
- /* ptrs to coefficient quantization tables, or NULL if not defined */
-
+ int q_scale_factor[NUM_QUANT_TBLS];
+ /* ptrs to coefficient quantization tables, or NULL if not defined,
+ * and corresponding scale factors (percentage, initialized 100).
+ */
+
JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
/* ptrs to Huffman coding tables, or NULL if not defined */
-
+
UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
@@ -321,6 +342,7 @@ struct jpeg_compress_struct {
boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */
boolean optimize_coding; /* TRUE=optimize entropy encoding parms */
boolean CCIR601_sampling; /* TRUE=first samples are cosited */
+ boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
int smoothing_factor; /* 1..100, or 0 for no input smoothing */
J_DCT_METHOD dct_method; /* DCT algorithm selector */
@@ -364,6 +386,9 @@ struct jpeg_compress_struct {
int max_h_samp_factor; /* largest h_samp_factor */
int max_v_samp_factor; /* largest v_samp_factor */
+ int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */
+ int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */
+
JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */
/* The coefficient controller receives data in units of MCU rows as defined
* for fully interleaved scans (whether the JPEG file is interleaved or not).
@@ -575,7 +600,8 @@ struct jpeg_decompress_struct {
int max_h_samp_factor; /* largest h_samp_factor */
int max_v_samp_factor; /* largest v_samp_factor */
- int min_DCT_scaled_size; /* smallest DCT_scaled_size of any component */
+ int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */
+ int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */
JDIMENSION total_iMCU_rows; /* # of iMCU rows in image */
/* The coefficient controller's input and output progress is measured in
@@ -583,7 +609,7 @@ struct jpeg_decompress_struct {
* in fully interleaved JPEG scans, but are used whether the scan is
* interleaved or not. We define an iMCU row as v_samp_factor DCT block
* rows of each component. Therefore, the IDCT output contains
- * v_samp_factor*DCT_scaled_size sample rows of a component per iMCU row.
+ * v_samp_factor*DCT_v_scaled_size sample rows of a component per iMCU row.
*/
JSAMPLE * sample_range_limit; /* table for fast range-limiting */
@@ -841,6 +867,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
#define jpeg_default_colorspace jDefColorspace
#define jpeg_set_quality jSetQuality
#define jpeg_set_linear_quality jSetLQuality
+#define jpeg_default_qtables jDefQTables
#define jpeg_add_quant_table jAddQuantTable
#define jpeg_quality_scaling jQualityScaling
#define jpeg_simple_progression jSimProgress
@@ -850,6 +877,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
#define jpeg_start_compress jStrtCompress
#define jpeg_write_scanlines jWrtScanlines
#define jpeg_finish_compress jFinCompress
+#define jpeg_calc_jpeg_dimensions jCjpegDimensions
#define jpeg_write_raw_data jWrtRawData
#define jpeg_write_marker jWrtMarker
#define jpeg_write_m_header jWrtMHeader
@@ -921,6 +949,8 @@ EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality,
EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo,
int scale_factor,
boolean force_baseline));
+EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo,
+ boolean force_baseline));
EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl,
const unsigned int *basic_table,
int scale_factor,
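
jpeg_default_qtables() works together with the q_scale_factor[] array added to the compression struct. A hedged sketch of setting separate luminance and chrominance qualities (using table slots 0 and 1 as in the default table layout, an assumption here):

/* Sketch: rebuild the default quantization tables with independent
 * scalings for table 0 and table 1. */
static void
set_split_quality (j_compress_ptr cinfo, int luma_quality, int chroma_quality)
{
  cinfo->q_scale_factor[0] = jpeg_quality_scaling(luma_quality);
  cinfo->q_scale_factor[1] = jpeg_quality_scaling(chroma_quality);
  jpeg_default_qtables(cinfo, TRUE);  /* TRUE forces baseline-compatible tables */
}
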
@@ -940,12 +970,15 @@ EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo,
JDIMENSION num_lines));
EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo));
+/* Precalculate JPEG dimensions for current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo));
+
/* Replaces jpeg_write_scanlines when writing raw downsampled data. */
EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo,
JSAMPIMAGE data,
JDIMENSION num_lines));
-/* Write a special marker. See libjpeg.doc concerning safe usage. */
+/* Write a special marker. See libjpeg.txt concerning safe usage. */
EXTERN(void) jpeg_write_marker
JPP((j_compress_ptr cinfo, int marker,
const JOCTET * dataptr, unsigned int datalen));
@@ -1093,4 +1126,10 @@ struct jpeg_color_quantizer { long dummy; };
#include "jerror.h" /* fetch error codes too */
#endif
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+}
+#endif
+#endif
+
#endif /* JPEGLIB_H */
diff --git a/src/libjpeg/jversion.h b/src/libjpeg/jversion.h
index 6472c58..5151731 100644
--- a/src/libjpeg/jversion.h
+++ b/src/libjpeg/jversion.h
@@ -1,7 +1,7 @@
/*
* jversion.h
*
- * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 1991-2009, Thomas G. Lane, Guido Vollbeding.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -9,6 +9,6 @@
*/
-#define JVERSION "6b 27-Mar-1998"
+#define JVERSION "7 27-Jun-2009"
-#define JCOPYRIGHT "Copyright (C) 1998, Thomas G. Lane"
+#define JCOPYRIGHT "Copyright (C) 2009, Thomas G. Lane, Guido Vollbeding"
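
JVERSION and JCOPYRIGHT are plain string macros picked up by the library's error-message table; a trivial check that a build sees the new header (assumes the libjpeg source directory is on the include path):

#include <stdio.h>
#include "jversion.h"

int main (void)
{
  printf("%s\n%s\n", JVERSION, JCOPYRIGHT);
  return 0;
}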