/*---------------------------------------------------------------------------* | PDFlib - A library for generating PDF on the fly | +---------------------------------------------------------------------------+ | Copyright (c) 1997-2006 Thomas Merz and PDFlib GmbH. All rights reserved. | +---------------------------------------------------------------------------+ | | | This software is subject to the PDFlib license. It is NOT in the | | public domain. Extended versions and commercial licenses are | | available, please check http://www.pdflib.com. | | | *---------------------------------------------------------------------------*/ /* $Id: pc_unicode.c,v 1.1 2008/10/17 06:10:43 scuri Exp $ * * PDFlib Unicode converting routines * */ #define PC_UNICODE_C #include "pc_util.h" #if defined(WIN32) #define WIN32_LEAN_AND_MEAN #include #endif /* WIN32 */ /* * The following source is based on Unicode's original source * code ConvertUTF.c. It has been adapted to PDFlib programming * conventions. * * The original file had the following notice: * * Copyright 2001 Unicode, Inc. * * Limitations on Rights to Redistribute This Code * * Author: Mark E. Davis, 1994. * Rev History: Rick McGowan, fixes & updates May 2001. * * * Functions for conversions between UTF32, UTF-16, and UTF-8. * These funtions forming a complete set of conversions between * the three formats. UTF-7 is not included here. * * Each of these routines takes pointers to input buffers and output * buffers. The input buffers are const. * * Each routine converts the text between *sourceStart and sourceEnd, * putting the result into the buffer between *targetStart and * targetEnd. Note: the end pointers are *after* the last item: e.g. * *(sourceEnd - 1) is the last item. * * The return result indicates whether the conversion was successful, * and if not, whether the problem was in the source or target buffers. * (Only the first encountered problem is indicated.) 
*
 * After the conversion, *sourceStart and *targetStart are both
 * updated to point to the end of the last text successfully converted in
 * the respective buffers.
 *
 * Input parameters:
 *     sourceStart - pointer to a pointer to the source buffer.
 *             The contents of this are modified on return so that
 *             it points at the next thing to be converted.
 *     targetStart - similarly, pointer to pointer to the target buffer.
 *     sourceEnd, targetEnd - respectively pointers to the ends of the
 *             two buffers, for overflow checking only.
 *
 * These conversion functions take a pdc_convers_flags argument. When this
 * flag is set to strict, both irregular sequences and isolated surrogates
 * will cause an error. When the flag is set to lenient, both irregular
 * sequences and isolated surrogates are converted.
 *
 * Whether the flag is strict or lenient, all illegal sequences will cause
 * an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
 * or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
 * must check for illegal sequences.
 *
 * When the flag is set to lenient, characters over 0x10FFFF are converted
 * to the replacement character; otherwise (when the flag is set to strict)
 * they constitute an error.
 *
 * Output parameters:
 *     The value "sourceIllegal" is returned from some routines if the input
 *     sequence is malformed. When "sourceIllegal" is returned, the source
 *     value will point to the illegal value that caused the problem. E.g.,
 *     in UTF-8 when a sequence is malformed, it points to the start of the
 *     malformed sequence.
 *
 * Author: Mark E. Davis, 1994.
 * Rev History: Rick McGowan, fixes & updates May 2001.
 *
 */

/*
 * The following 4 definitions are compiler-specific.
 * The C standard does not guarantee that wchar_t has at least
 * 16 bits, so wchar_t is no less portable than unsigned short!
 * All should be unsigned values to avoid sign extension during
 * bit mask & shift operations.
*/ /* Unicode original: typedef unsigned long UTF32; at least 32 bits typedef unsigned short UTF16; at least 16 bits */ typedef unsigned int UTF32; /* 32 bits */ typedef unsigned short UTF16; /* 16 bits */ typedef unsigned char UTF8; /* typically 8 bits */ /* Some fundamental constants */ #define UNI_SUR_HIGH_START (UTF32)0xD800 #define UNI_SUR_HIGH_END (UTF32)0xDBFF #define UNI_SUR_LOW_START (UTF32)0xDC00 #define UNI_SUR_LOW_END (UTF32)0xDFFF #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD #define UNI_MAX_BMP (UTF32)0x0000FFFF #define UNI_MAX_UTF16 (UTF32)0x0010FFFF #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF static const int halfShift = 10; /* used for shifting by 10 bits */ static const UTF32 halfBase = 0x0010000UL; static const UTF32 halfMask = 0x3FFUL; /* --------------------------------------------------------------------- */ static pdc_convers_result pdc_convertUTF32toUTF16 ( UTF32** sourceStart, const UTF32* sourceEnd, UTF16** targetStart, const UTF16* targetEnd, const pdc_convers_flags flags) { pdc_convers_result result = conversionOK; UTF32* source = *sourceStart; UTF16* target = *targetStart; while (source < sourceEnd) { UTF32 ch; if (target >= targetEnd) { result = targetExhausted; break; } ch = *source++; if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) { --source; /* return to the illegal value itself */ result = sourceIllegal; break; } else { *target++ = (UTF16) ch; /* normal case */ } } else if (ch > UNI_MAX_UTF16) { if (flags == strictConversion) { result = sourceIllegal; } else { *target++ = UNI_REPLACEMENT_CHAR; } } else { /* target is a character in range 0xFFFF - 0x10FFFF. 
*/ if (target + 1 >= targetEnd) { result = targetExhausted; break; } ch -= halfBase; *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START); *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START); } } *sourceStart = source; *targetStart = target; return result; } /* --------------------------------------------------------------------- */ static pdc_convers_result pdc_convertUTF16toUTF32 ( UTF16** sourceStart, UTF16* sourceEnd, UTF32** targetStart, const UTF32* targetEnd, const pdc_convers_flags flags) { pdc_convers_result result = conversionOK; UTF16* source = *sourceStart; UTF32* target = *targetStart; UTF32 ch, ch2; while (source < sourceEnd) { ch = *source++; if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { ch2 = *source; if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase; ++source; } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ --source; /* return to the illegal value itself */ result = sourceIllegal; break; } } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) { /* an unpaired low surrogate */ --source; /* return to the illegal value itself */ result = sourceIllegal; break; } if (target >= targetEnd) { result = targetExhausted; break; } *target++ = ch; } *sourceStart = source; *targetStart = target; #ifdef CVTUTF_DEBUG if (result == sourceIllegal) { fprintf(stderr, "pdc_convertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); fflush(stderr); } #endif return result; } /* --------------------------------------------------------------------- */ /* * Index into the table below with the first byte of a UTF-8 sequence to * get the number of trailing bytes that are supposed to follow it. 
*/
/*
 * Layout of the rows below (16 entries per row):
 *   0x00 - 0xBF: 0 trailing bytes
 *   0xC0 - 0xDF: 1 trailing byte
 *   0xE0 - 0xEF: 2 trailing bytes
 *   0xF0 - 0xF7: 3 trailing bytes
 *   0xF8 - 0xFB: 4 trailing bytes
 *   0xFC - 0xFF: 5 trailing bytes
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/* Disabled accessor for the table above; compiled out by "#if 0". */
#if 0
static const char
pdc_get_trailingBytesForUTF8(int i) {
    return (trailingBytesForUTF8[i]);
}
#endif

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence. The UTF-8 readers in this file index it with
 * the number of trailing bytes of the sequence just accumulated.
 */
static const UTF32 offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow. There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... six byte sequence.)
 * The UTF-8 writers in this file index it with the total number of
 * bytes to write; entry 0 is unused padding.
 */
static const UTF8 firstByteMark[7] = {
    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};

/* --------------------------------------------------------------------- */

/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" at the bottom of the file for equivalent code.)
 * If your compiler supports it, the "pdc_islegalUTF8" call can be turned
 * into an inline function.
*/

/* --------------------------------------------------------------------- */

/*
 * Converts UTF-16 code units in [*sourceStart, sourceEnd) to UTF-8 bytes
 * written to [*targetStart, targetEnd).
 *
 * Surrogate pairs are combined first and then written as one 4-byte
 * sequence. In strict mode any unpaired surrogate (including a high
 * surrogate that is the last unit of the input) yields sourceIllegal; in
 * lenient mode unpaired surrogates are encoded as 3-byte sequences.
 *
 * On return *sourceStart and *targetStart point just behind the consumed
 * input and the written output. Whenever an error is returned,
 * *sourceStart points at the first code unit of the character that could
 * not be converted.
 *
 * Returns conversionOK, targetExhausted or sourceIllegal.
 */
static pdc_convers_result
pdc_convertUTF16toUTF8(
    UTF16 **sourceStart, const UTF16 *sourceEnd,
    UTF8 **targetStart, const UTF8 *targetEnd,
    const pdc_convers_flags flags)
{
    pdc_convers_result result = conversionOK;
    UTF16 *source = *sourceStart;
    UTF8 *target = *targetStart;

    while (source < sourceEnd)
    {
        UTF16 *oldSource = source;  /* start of the current character */
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;

        ch = *source++;

        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
        {
            if (source < sourceEnd)
            {
                UTF32 ch2 = *source;

                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
                {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift) +
                         (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                }
                else if (flags == strictConversion)
                {
                    /* it's an unpaired high surrogate */
                    source = oldSource;
                    result = sourceIllegal;
                    break;
                }
            }
            else if (flags == strictConversion)
            {
                /* high surrogate at the very end of the input */
                source = oldSource;
                result = sourceIllegal;
                break;
            }
        }
        else if (flags == strictConversion &&
                 ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
        {
            /* an unpaired low surrogate */
            source = oldSource;
            result = sourceIllegal;
            break;
        }

        /* Figure out how many bytes the result will require */
        if (ch < (UTF32) 0x80)
        {
            bytesToWrite = 1;
        }
        else if (ch < (UTF32) 0x800)
        {
            bytesToWrite = 2;
        }
        else if (ch < (UTF32) 0x10000)
        {
            bytesToWrite = 3;
        }
        else if (ch <= UNI_MAX_UTF16)
        {
            bytesToWrite = 4;
        }
        else
        {
            /* cannot happen for UTF-16 input; U+FFFD needs 3 bytes */
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
        }

        /* compare sizes instead of forming a pointer behind the buffer */
        if (targetEnd - target < (int) bytesToWrite)
        {
            source = oldSource;     /* nothing written: un-read the input */
            result = targetExhausted;
            break;
        }

        /* fill the sequence from its last byte backwards */
        target += bytesToWrite;
        switch (bytesToWrite)
        {
            /* note: everything falls through. */
            case 4:
                *--target = (UTF8) ((ch | byteMark) & byteMask);
                ch >>= 6;
                /* fall through */
            case 3:
                *--target = (UTF8) ((ch | byteMark) & byteMask);
                ch >>= 6;
                /* fall through */
            case 2:
                *--target = (UTF8) ((ch | byteMark) & byteMask);
                ch >>= 6;
                /* fall through */
            case 1:
                *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}

/* --------------------------------------------------------------------- */

/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from pdc_convertUTF8to*, then the length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 * If presented with a length > 4, this returns pdc_false. The Unicode
 * definition of UTF-8 goes up to 4-byte sequences.
 *
 * The bytes are checked from the last one backwards: bytes 4 and 3 must
 * lie in 0x80..0xBF, byte 2 has a lead-byte dependent lower or upper
 * bound (which rejects overlong forms and values above 0x10FFFF), and
 * the lead byte must not be in 0x80..0xC1 or above 0xF4.
 * Encoded surrogates (lead byte 0xED) are not rejected here; the strict
 * converters check for them after decoding.
 */
static pdc_bool
pdc_islegalUTF8(UTF8 *source, int length)
{
    UTF8 a;
    UTF8 *srcptr = source + length;

    switch (length)
    {
        default:
            return pdc_false;

        /* Everything else falls through when "pdc_true"... */
        case 4:
            if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
            /* fall through */
        case 3:
            if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
            /* fall through */
        case 2:
            if ((a = (*--srcptr)) > 0xBF) return pdc_false;
            switch (*source)
            {
                /* no fall-through in this inner switch */
                case 0xE0: if (a < 0xA0) return pdc_false; break;
                case 0xF0: if (a < 0x90) return pdc_false; break;
                case 0xF4: if (a > 0x8F) return pdc_false; break;
                default:   if (a < 0x80) return pdc_false;
            }
            /* fall through */
        case 1:
            if (*source >= 0x80 && *source < 0xC2) return pdc_false;
            if (*source > 0xF4) return pdc_false;
    }
    return pdc_true;
}

/* --------------------------------------------------------------------- */

/*
 * Exported function to return whether a UTF-8 sequence is legal or not.
 * This is not used here; it's just exported.
*/ #if 0 static pdc_bool pdc_islegalUTF8sequence(UTF8 *source, UTF8 *sourceEnd) { int length = trailingBytesForUTF8[*source]+1; if (source+length > sourceEnd) { return pdc_false; } return pdc_islegalUTF8(source, length); } #endif /* --------------------------------------------------------------------- */ static pdc_convers_result pdc_convertUTF8toUTF16 ( UTF8** sourceStart, UTF8* sourceEnd, UTF16** targetStart, const UTF16* targetEnd, const pdc_convers_flags flags) { pdc_convers_result result = conversionOK; UTF8* source = *sourceStart; UTF16* target = *targetStart; while (source < sourceEnd) { UTF32 ch = 0L; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; if (source + extraBytesToRead >= sourceEnd) { result = sourceExhausted; break; } /* Do this check whether lenient or strict */ if (! pdc_islegalUTF8(source, extraBytesToRead+1)) { result = sourceIllegal; break; } /* * The cases all fall through. See "Note A" below. */ switch (extraBytesToRead) { case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; case 0: ch += *source++; } ch -= offsetsFromUTF8[extraBytesToRead]; if (target >= targetEnd) { result = targetExhausted; break; } if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) { --source; /* return to the illegal value itself */ result = sourceIllegal; break; } else { *target++ = (UTF16) ch; /* normal case */ } } else if (ch > UNI_MAX_UTF16) { if (flags == strictConversion) { result = sourceIllegal; source -= extraBytesToRead; /* return to the start */ } else { *target++ = UNI_REPLACEMENT_CHAR; } } else { /* target is a character in range 0xFFFF - 0x10FFFF. 
*/ if (target + 1 >= targetEnd) { result = targetExhausted; break; } ch -= halfBase; *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START); *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START); } } *sourceStart = source; *targetStart = target; return result; } /* --------------------------------------------------------------------- */ static pdc_convers_result pdc_convertUTF32toUTF8 ( UTF32** sourceStart, const UTF32* sourceEnd, UTF8** targetStart, const UTF8* targetEnd, const pdc_convers_flags flags) { pdc_convers_result result = conversionOK; UTF32* source = *sourceStart; UTF8* target = *targetStart; while (source < sourceEnd) { UTF32 ch; unsigned short bytesToWrite = 0; const UTF32 byteMask = 0x000000BF; const UTF32 byteMark = 0x00000080; ch = *source++; /* surrogates of any stripe are not legal UTF32 characters */ if (flags == strictConversion ) { if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) { --source; /* return to the illegal value itself */ result = sourceIllegal; break; } } /* Figure out how many bytes the result will require */ if (ch < (UTF32)0x80) { bytesToWrite = 1; } else if (ch < (UTF32)0x800) { bytesToWrite = 2; } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; } else { bytesToWrite = 2; ch = UNI_REPLACEMENT_CHAR; } target += bytesToWrite; if (target > targetEnd) { target -= bytesToWrite; result = targetExhausted; break; } switch (bytesToWrite) { /* note: everything falls through. 
*/ case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); } target += bytesToWrite; } *sourceStart = source; *targetStart = target; return result; } /* --------------------------------------------------------------------- */ static pdc_convers_result pdc_convertUTF8toUTF32 ( UTF8** sourceStart, UTF8* sourceEnd, UTF32** targetStart, const UTF32* targetEnd, const pdc_convers_flags flags) { pdc_convers_result result = conversionOK; UTF8* source = *sourceStart; UTF32* target = *targetStart; (void) flags; while (source < sourceEnd) { UTF32 ch = 0; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; if (source + extraBytesToRead >= sourceEnd) { result = sourceExhausted; break; } /* Do this check whether lenient or strict */ if (! pdc_islegalUTF8(source, extraBytesToRead+1)) { result = sourceIllegal; break; } /* * The cases all fall through. See "Note A" below. */ switch (extraBytesToRead) { case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; case 0: ch += *source++; } ch -= offsetsFromUTF8[extraBytesToRead]; if (target >= targetEnd) { result = targetExhausted; break; } if (ch <= UNI_MAX_UTF32) { *target++ = ch; } else if (ch > UNI_MAX_UTF32) { *target++ = UNI_REPLACEMENT_CHAR; } else { if (target + 1 >= targetEnd) { result = targetExhausted; break; } ch -= halfBase; *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; *target++ = (ch & halfMask) + UNI_SUR_LOW_START; } } *sourceStart = source; *targetStart = target; return result; } /* --------------------------------------------------------------------- Note A. The fall-through switches in UTF-8 reading code save a temp variable, some decrements & conditionals. 
    The switches are equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */

/*
 * Keyword list mapping the UTF format numbers "8", "16" and "32" to
 * pdc_utf8, pdc_utf16 and pdc_utf32. It is passed to pdc_get_keyword()
 * when the text of a UTF conversion error is built.
 */
static const pdc_keyconn pdc_utfformat_keylist[] =
{
    {"8",    pdc_utf8},
    {"16",   pdc_utf16},
    {"32",   pdc_utf32},
    {NULL, 0}
};


/*
 *  pdc_convert_string converts an arbitrarily encoded string (maybe UTF) to
 *  another encoded string.
 *
 *  The new converted string is allocated and terminated by the required
 *  number of zeros.
 *
 *  The caller is responsible for freeing the resulting string buffer.
 *
 *
 *  LBP: low byte picking
 *
 *  Input-Parameter:
 *
 *  inutf:      input string format (see pc_unicode.h):
 *
 *              pdc_auto:     If codepage != 0:
 *                                see <codepage> below.
 *                            Otherwise:
 *                            If a BOM is recognized:
 *                                pdc_utf8 or pdc_utf16xx resp.
 *                            Otherwise if input encoding <inev> is specified
 *                            and flag PDC_CONV_FORCEUTF16 not set:
 *                                pdc_bytes
 *                            Otherwise:
 *                                pdc_utf16
 *
 *              pdc_auto2:    If input encoding is not specified:
 *                                pdc_utf16
 *                            Otherwise after successful LBP:
 *                                pdc_auto
 *                            Otherwise:
 *                                pdc_utf16
 *
 *              pdc_bytes:    8-bit string. Encoding is <inev> if specified.
 *
 *              pdc_bytes2:   After successful LBP:
 *                                pdc_bytes
 *                            Otherwise:
 *                                pdc_utf16
 *
 *              pdc_utf8:     UTF-8 formatted string.
 *
 *              pdc_ebcdicutf8: EBCDIC-UTF-8 formatted string.
 *
 *              pdc_utf16:    If a UTF16 BOM is recognized:
 *                                pdc_utf16be or pdc_utf16le
 *                            Otherwise UTF-16 machine byte ordered string.
 *
 *              pdc_utf16be   UTF-16 big endian formatted string.
 *
 *              pdc_utf16le   UTF-16 little endian formatted string.
 *
 *  codepage:   OEM multi byte code-page number. If > 0 and
 *              <inutf> = pdc_auto, text will be converted to UTF-16.
 *
 *  inev:       Encoding vector for input pdc_bytes string.
*
 *  glyphtab:   Mapping table for character reference names
 *
 *  tabsize:    Size of mapping table
 *
 *  replchar:   Treatment of non resolvable character references:
 *              >= 0: replacement character
 *              == text_error: error message
 *              == text_nocheck: will be ignored
 *              (see also pdc_charref2unicodelist())
 *
 *  instring:   Input string.
 *
 *  inlen:      Length of input string in bytes.
 *
 *  oututf:     Target format for output string.
 *              pdc_auto, pdc_auto2 and pdc_bytes2 are not supported.
 *
 *  outev:      Encoding vector for output pdc_bytes string.
 *
 *  flags:      PDC_CONV_FORCEUTF16:
 *              In the case of <inutf> = pdc_auto[2] and <inev> != NULL
 *              <inutf> = pdc_utf16 will be forced.
 *
 *              PDC_CONV_TRY7BYTES:
 *              UTF-8 output strings will have no BOM if each byte
 *              is smaller than x80.
 *              *oututf: pdc_byte.
 *
 *              PDC_CONV_TRYBYTES:
 *              UTF-16xx output strings will be converted by LBP
 *              if each character is smaller than x0100.
 *              *oututf: pdc_byte.
 *
 *              PDC_CONV_WITHBOM:
 *              UTF-8 or UTF-16xx output strings will be armed
 *              with an appropriate BOM.
 *
 *              PDC_CONV_NOBOM:
 *              In UTF-8 or UTF-16xx output strings any BOM sequence
 *              will be removed. PDC_CONV_WITHBOM is dominant.
 *
 *              PDC_CONV_AUTOBOM:
 *              BOM sequence will be set automatically if input string
 *              has a BOM.
 *
 *              PDC_CONV_ANALYZE:
 *              Only analyzing BOMs of input string and dissolving auto
 *              textformats.
 *
 *              PDC_CONV_TMPALLOC
 *              Temporary memory functions (pdc_malloc_tmp) are used
 *              rather than pdc_malloc etc.
 *
 *              PDC_CONV_HTMLCHAR
 *              If input encoding vector <inev> is specified HTML character
 *              entities will be substituted.
 *
 *              PDC_CONV_NEWALLOC
 *              Input string must be allocated at first to guarantee
 *              pointer alignment.
 *
 *              PDC_CONV_INFLATE
 *              Invalid UTF-8 to UTF-16xx conversion will not cause
 *              an exception but rather an inflated byte string will
 *              be output.
 *
 *              PDC_CONV_ESCSEQU
 *              Unicode sequences framed by escape character U+001B
 *              (found in PDF text strings) will be skipped.
 *
 *              PDC_CONV_BSSEQU
 *              Code sequences beginning with backslash '\'
 *              will be substituted.
* * PDC_CONV_ENCERROR * If an 8-bit code cannot be converted to Unicode by * or a Unicode cannot be converted to an 8-bit code by * an error message will be created. * * PDC_CONV_KEEPLBCHAR * In the case of PDC_CONV_ENCERROR relevant characters for * line breaking do not lead to an error message. * * PDC_CONV_LOGGING * Enables logging. * * verbose: Error messages are put out. Otherwise they are saved only. * * Output-Parameter: * * oututf: Reached format for output string. * * outstring: Pointer of allocated output string * * outlen: Length of output string. * */ #if defined(_MSC_VER) && defined(_MANAGED) #pragma unmanaged #endif int pdc_convert_string(pdc_core *pdc, pdc_text_format inutf, int codepage, pdc_encodingvector *inev, pdc_byte *instring, int inlen, pdc_text_format *oututf_p, pdc_encodingvector *outev, pdc_byte **outstring, int *outlen, int flags, pdc_bool verbose) { return pdc_convert_textstring(pdc, inutf, codepage, inev, NULL, 0, -1, instring, inlen, oututf_p, outev, outstring, outlen, flags, verbose); } int pdc_convert_textstring(pdc_core *pdc, pdc_text_format inutf, int codepage, pdc_encodingvector *inev, const pdc_glyph_tab *glyphtab, int tabsize, int replchar, pdc_byte *instring, int inlen, pdc_text_format *oututf_p, pdc_encodingvector *outev, pdc_byte **outstring, int *outlen, int flags, pdc_bool verbose) { static const char *fn = "pdc_convert_textstring"; pdc_bool logg = flags & PDC_CONV_LOGGING; const char *stemp1 = NULL, *stemp2 = NULL; pdc_text_format oututf = *oututf_p; pdc_text_format oututf_s; pdc_ushort *usinstr = (pdc_ushort *) instring; pdc_ushort uv = 0; pdc_byte *instr = NULL; pdc_bool inalloc = pdc_false; pdc_bool hasbom = pdc_false; pdc_bool toswap = pdc_false; int errcode = 0; int i, j, n, len = 0; (void) glyphtab; (void) tabsize; (void) replchar; if (logg) pdc_logg(pdc, "\t\tinput textformat for string conversion: %s\n", pdc_get_keyword(inutf, pdc_textformat_keylist)); /* prophylactic */ if (!inlen) { instring = (pdc_byte *) 
((flags & PDC_CONV_TMPALLOC) ? pdc_calloc_tmp(pdc, 4, fn, NULL, NULL) : pdc_calloc(pdc, 4, fn)); inalloc = pdc_true; } else if ((flags & PDC_CONV_NEWALLOC) || (flags & PDC_CONV_TMPALLOC) || (flags & PDC_CONV_BSSEQU)) { instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (inlen + 2), fn)); memcpy(instr, instring, (size_t) inlen); inalloc = pdc_true; instring = instr; instr = NULL; usinstr = (pdc_ushort *) instring; } switch(inutf) { /* analyzing 2 byte textformat */ case pdc_auto2: case pdc_bytes2: if ((inutf == pdc_auto2 && (inev == NULL || (flags & PDC_CONV_FORCEUTF16))) || (flags & PDC_CONV_ANALYZE)) { inutf = pdc_utf16; } else { if (logg) pdc_logg(pdc, "\t\ttry to pick low bytes\n"); len = inlen / 2; if (2 * len != inlen) { errcode = PDC_E_CONV_ILLUTF16; goto PDC_CONV_ERROR; } for (i = 0; i < len; i++) if (usinstr[i] > PDC_UNICODE_MAXLATIN1) break; /* low byte picking */ if (i == len) { instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? 
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (len + 2), fn)); for (i = 0; i < len; i++) instr[i] = (pdc_byte) usinstr[i]; if (inalloc) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instring); else pdc_free(pdc, instring); } inalloc = pdc_true; instring = instr; instr = NULL; inlen = len; if (inutf == pdc_bytes2) inutf = pdc_bytes; else inutf = pdc_auto; } else { inutf = pdc_utf16; } } break; /* OEM multi byte text strings */ case pdc_auto: case pdc_bytes: if (codepage > 0) { #if defined(WIN32) if (!(flags & PDC_CONV_ANALYZE)) { if (logg) pdc_logg(pdc, "\t\tconverting according Windows codepage %d\n", codepage); len = MultiByteToWideChar((UINT) codepage, (DWORD) 0, (LPCSTR) instring, inlen, NULL, 0); if (len == 0) { DWORD lasterror = GetLastError(); stemp1 = pdc_errprintf(pdc, "cp%d", codepage); if (lasterror == ERROR_INVALID_PARAMETER) { errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM; } else { errcode = PDC_E_CONV_ILL_MBTEXTSTRING; } goto PDC_CONV_ERROR; } len *= 2; instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? 
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (len + 2), fn)); MultiByteToWideChar((UINT) codepage, (DWORD) 0, (LPCSTR) instring, inlen, (LPWSTR) instr, len); if (inalloc) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instring); else pdc_free(pdc, instring); } inalloc = pdc_true; instring = instr; instr = NULL; inlen = len; inutf = pdc_utf16; } else { inutf = pdc_bytes; } #else /* WIN32 */ errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM; goto PDC_CONV_ERROR; #endif /* !WIN32 */ } break; default: break; } /* analyzing UTF-16 textformat */ if (inutf == pdc_utf16) { if (pdc_is_utf16be_unicode(instring)) inutf = pdc_utf16be; else if (pdc_is_utf16le_unicode(instring)) inutf = pdc_utf16le; } /* analyzing auto textformat */ else if (inutf == pdc_auto) { if (pdc_is_utf8_bytecode(instring)) inutf = PDC_UTF8; else if (pdc_is_utf16be_unicode(instring)) inutf = pdc_utf16be; else if (pdc_is_utf16le_unicode(instring)) inutf = pdc_utf16le; else if (inev && !(flags & PDC_CONV_FORCEUTF16)) inutf = pdc_bytes; else inutf = pdc_utf16; } if (logg) pdc_logg(pdc, "\t\tdetermined textformat: %s\n", pdc_get_keyword(inutf, pdc_textformat_keylist)); /* only analyzing */ if (flags & PDC_CONV_ANALYZE) goto PDC_CONV_EXIT; /* conversion to UTF-16 by swapping */ if ((inutf == pdc_utf16be || inutf == pdc_utf16le) && (inutf != oututf || flags & PDC_CONV_TRYBYTES || flags & PDC_CONV_HTMLCHAR)) { if (inlen && ((inutf == pdc_utf16be && !PDC_ISBIGENDIAN) || (inutf == pdc_utf16le && PDC_ISBIGENDIAN))) { if (inalloc) pdc_swap_bytes((char *) instring, inlen, NULL); else { instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? 
pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (inlen + 2), fn)); pdc_swap_bytes((char *) instring, inlen, (char *) instr); inalloc = pdc_true; instring = instr; instr = NULL; } } inutf = pdc_utf16; } /* illegal UTF-16 */ if (inutf >= pdc_utf16 && inlen % 2) { errcode = PDC_E_CONV_ILLUTF16; goto PDC_CONV_ERROR; } /* conversion to UTF-16 by inflation or encoding vector */ if (inutf == pdc_bytes && (oututf != pdc_bytes || flags & PDC_CONV_HTMLCHAR || inev != outev)) { if (logg) { if (flags & PDC_CONV_HTMLCHAR) pdc_logg(pdc, "\t\tbyte character entity substitution\n"); } len = 2 * inlen; instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (len + 2), fn)); usinstr = (pdc_ushort *) instr; j = 0; for (i = 0; i < inlen; i++) { uv = (pdc_ushort) instring[i]; if (inev) { uv = inev->codes[uv]; if (!uv && (flags & PDC_CONV_ENCERROR) && (!(flags & PDC_CONV_KEEPLBCHAR) || !pdc_is_linebreaking_relchar(uv))) { errcode = PDC_E_ENC_NOTDEF_CODE; stemp1 = pdc_errprintf(pdc, "x%02X", instring[i]); stemp2 = inev->apiname; goto PDC_CONV_ERROR; } } usinstr[j] = uv; j++; } if (inalloc) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instring); else pdc_free(pdc, instring); } inalloc = pdc_true; instring = instr; instr = NULL; inlen = 2 * j; inutf = pdc_utf16; } /* UTF conversion */ oututf_s = oututf; if ((oututf_s == pdc_bytes && inutf == pdc_utf8) || oututf_s == pdc_utf16be || oututf_s == pdc_utf16le) oututf_s = pdc_utf16; if (inutf != oututf_s && oututf_s != pdc_bytes) { len = 4 * (inlen + 1); instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? 
pdc_calloc_tmp(pdc, (size_t) len, fn, NULL, NULL) : pdc_calloc(pdc, (size_t) len, fn)); if (inlen) { pdc_convers_result result = conversionOK; pdc_byte *instringa, *instra, *instringe, *instre; UTF8 *isa8, *ise8; UTF16 *isa16, *ise16; UTF32 *isa32, *ise32; if (logg) pdc_logg(pdc, "\t\tUTF conversion\n"); instringa = instring; instringe = instring + inlen; instra = instr; instre = instr + len; if (inutf == pdc_utf8) { isa8 = (UTF8 *) instringa; ise8 = (UTF8 *) instringe; if (oututf_s == pdc_utf16) { isa16 = (UTF16 *) instra; ise16 = (UTF16 *) instre; result = pdc_convertUTF8toUTF16(&isa8, ise8, &isa16, ise16, strictConversion); instra = (pdc_byte *) isa16; instre = (pdc_byte *) ise16; } else { isa32 = (UTF32 *) instra; ise32 = (UTF32 *) instre; result = pdc_convertUTF8toUTF32(&isa8, ise8, &isa32, ise32, strictConversion); instra = (pdc_byte *) isa32; instre = (pdc_byte *) ise32; } } else if (inutf == pdc_utf16) { isa16 = (UTF16 *) instringa; ise16 = (UTF16 *) instringe; if (oututf_s == pdc_utf8) { isa8 = (UTF8 *) instra; ise8 = (UTF8 *) instre; result = pdc_convertUTF16toUTF8(&isa16, ise16, &isa8, ise8, strictConversion); instra = (pdc_byte *) isa8; instre = (pdc_byte *) ise8; } else { isa32 = (UTF32 *) instra; ise32 = (UTF32 *) instre; result = pdc_convertUTF16toUTF32(&isa16, ise16, &isa32, ise32, strictConversion); instra = (pdc_byte *) isa32; instre = (pdc_byte *) ise32; } } else if (inutf == pdc_utf32) { isa32 = (UTF32 *) instringa; ise32 = (UTF32 *) instringe; if (oututf_s == pdc_utf8) { isa8 = (UTF8 *) instra; ise8 = (UTF8 *) instre; result = pdc_convertUTF32toUTF8(&isa32, ise32, &isa8, ise8, strictConversion); instra = (pdc_byte *) isa8; instre = (pdc_byte *) ise8; } else { isa16 = (UTF16 *) instra; ise16 = (UTF16 *) instre; result = pdc_convertUTF32toUTF16(&isa32, ise32, &isa16, ise16, strictConversion); instra = (pdc_byte *) isa16; instre = (pdc_byte *) ise16; } } switch (result) { case targetExhausted: errcode = PDC_E_CONV_MEMOVERFLOW; break; case 
sourceExhausted: case sourceIllegal: if (inutf == pdc_utf8 && (flags & PDC_CONV_INFLATE)) { pdc_inflate_ascii((char *) instring, inlen, (char *) instr, pdc_utf16); instra = instr + 2 * inlen; } else { errcode = PDC_E_CONV_ILLUTF; stemp1 = pdc_get_keyword((int)inutf, pdc_utfformat_keylist); } break; default: break; } if (errcode) { if (logg) pdc_logg(pdc, "\t\tUTF conversion error %d\n", result); goto PDC_CONV_ERROR; } inlen = instra - instr; } if (inalloc) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instring); else pdc_free(pdc, instring); } len = (oututf == pdc_utf32) ? inlen + 4 : inlen + 2; if (inlen + 4 != len) instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? pdc_realloc_tmp(pdc, instr, (size_t) len, fn) : pdc_realloc(pdc, instr, (size_t) len, fn)); instr[inlen] = 0; instr[inlen + 1] = 0; if (oututf == pdc_utf32) { instr[inlen + 2] = 0; instr[inlen + 3] = 0; } inalloc = pdc_true; instring = instr; instr = NULL; inutf = oututf_s; } if (inutf == pdc_bytes) { if (!inalloc) { instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (inlen + 2), fn)); memcpy(instr, instring, (size_t) inlen); inalloc = pdc_true; instring = instr; instr = NULL; } } /* trying to reduce UTF-16 string to bytes string */ if (inutf == pdc_utf16 && (oututf == pdc_bytes || flags & PDC_CONV_TRYBYTES)) { if (logg) pdc_logg(pdc, "\t\ttry to reduce UTF-16 to bytes\n"); if (pdc_is_utf16be_unicode(instring) || pdc_is_utf16le_unicode(instring)) n = 1; else n = 0; len = (inlen - n) / 2; instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? 
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (len + 2), fn)); usinstr = (pdc_ushort *) instring; for (i = 0; i < len; i++) { uv = usinstr[i + n]; if (outev && uv) { j = pdc_get_encoding_bytecode(pdc, outev, uv); if (j < 0 && (flags & PDC_CONV_ENCERROR) && oututf == pdc_bytes) { errcode = PDC_E_ENC_NOTDEF_UNICODE; stemp1 = pdc_errprintf(pdc, "%04X", uv); stemp2 = outev->apiname; goto PDC_CONV_ERROR; } uv = (pdc_ushort) j; } if (uv > PDC_UNICODE_MAXLATIN1) break; instr[i] = (pdc_byte) uv; } if (i == len) { if (inalloc) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instring); else pdc_free(pdc, instring); } inalloc = pdc_true; instring = instr; instr = NULL; inlen = len; inutf = pdc_bytes; } else { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instr); else pdc_free(pdc, instr); instr = NULL; } } /* UTF-8 format */ if (inutf == pdc_utf8) { hasbom = pdc_is_utf8_unicode(instring); if (flags & PDC_CONV_TRY7BYTES) { if (logg) pdc_logg(pdc, "\t\ttry to reduce UTF-8 to 7-bit\n"); for (i = hasbom ? 3 : 0; i < inlen; i++) if (instring[i] > PDC_UNICODE_MAXASCII) break; if (i == inlen) { flags &= ~PDC_CONV_WITHBOM; flags |= PDC_CONV_NOBOM; inutf = pdc_bytes; } } else if (hasbom && (flags & PDC_CONV_AUTOBOM)) { flags &= ~PDC_CONV_NOBOM; flags |= PDC_CONV_WITHBOM; } else if ((flags & PDC_CONV_WITHBOM) && (flags & PDC_CONV_NOBOM)) { flags &= ~PDC_CONV_NOBOM; } if (!inalloc || flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM) { i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 3 : 0; j = (flags & PDC_CONV_NOBOM && hasbom) ? 3 : 0; len = inlen + i - j; instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? 
pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (len + 2), fn)); memcpy(&instr[i], &instring[j], (size_t) (inlen - j)); instr[len] = 0; if (inalloc) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instring); else pdc_free(pdc, instring); } instring = instr; instr = NULL; inlen = len; hasbom = (flags & PDC_CONV_WITHBOM); } if (hasbom) { instring[0] = PDF_BOM2; instring[1] = PDF_BOM3; instring[2] = PDF_BOM4; } } /* UTF-16 formats */ if (inutf == pdc_utf16 || inutf == pdc_utf16be || inutf == pdc_utf16le) { hasbom = pdc_is_utf16be_unicode(instring) || pdc_is_utf16le_unicode(instring); if (hasbom && (flags & PDC_CONV_AUTOBOM)) { flags &= ~PDC_CONV_NOBOM; flags |= PDC_CONV_WITHBOM; } else if ((flags & PDC_CONV_WITHBOM) && (flags & PDC_CONV_NOBOM)) { flags &= ~PDC_CONV_NOBOM; } if (!inalloc || oututf == pdc_utf16be || oututf == pdc_utf16le || flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM) { i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 2 : 0; j = (flags & PDC_CONV_NOBOM && hasbom) ? 2 : 0; len = inlen + i - j; instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ? pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) : pdc_calloc(pdc, (size_t) (len + 2), fn)); memcpy(&instr[i], &instring[j], (size_t) (inlen - j)); if (inalloc) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instring); else pdc_free(pdc, instring); } instring = instr; instr = NULL; inlen = len; hasbom = (flags & PDC_CONV_WITHBOM); } i = hasbom ? 
2 : 0; if (inutf == pdc_utf16) { if (oututf == pdc_utf16be) { inutf = pdc_utf16be; toswap = !PDC_ISBIGENDIAN; } if (oututf == pdc_utf16le) { inutf = pdc_utf16le; toswap = PDC_ISBIGENDIAN; } if (toswap) pdc_swap_bytes((char *) &instring[i], inlen - i, NULL); } if (hasbom) { if (inutf == pdc_utf16be || (inutf == pdc_utf16 && PDC_ISBIGENDIAN)) { instring[0] = PDF_BOM0; instring[1] = PDF_BOM1; } if (inutf == pdc_utf16le || (inutf == pdc_utf16 && !PDC_ISBIGENDIAN)) { instring[0] = PDF_BOM1; instring[1] = PDF_BOM0; } } } if (logg) pdc_logg(pdc, "\t\ttextformat of converted string: %s\n", pdc_get_keyword(inutf, pdc_textformat_keylist)); PDC_CONV_EXIT: *oututf_p = inutf; if (outlen) *outlen = inlen; *outstring = instring; return 0; PDC_CONV_ERROR: if (outlen) *outlen = 0; *outstring = NULL; if (errcode > 0) pdc_set_errmsg(pdc, errcode, stemp1, stemp2, 0, 0); if (instr != NULL) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instr); else pdc_free(pdc, instr); } if (inalloc) { if (flags & PDC_CONV_TMPALLOC) pdc_free_tmp(pdc, instring); else pdc_free(pdc, instring); } if (verbose) PDC_RETHROW(pdc); return errcode; } #if defined(_MSC_VER) && defined(_MANAGED) #pragma managed #endif /* * pdc_convert_name_ext converts a string of name data type to UTF-8 * * flags & PDC_CONV_EBCDIC: converts to EBCDIC-UTF-8 * * len == 0: If the string has a [EBCDIC-]UTF-8 BOM or * flags & PDC_CONV_ISUTF8 is set the string will be duplicated. * Otherwise the string has encoding enc and codepage * codepage. * If enc < pdc_winansi the string is "host" encoded. * * len > 0: The string is a UTF-16 string of len bytes. 
* */ char * pdc_convert_name_ext(pdc_core *pdc, const char *name, int len, pdc_encoding enc, int codepage, int flags) { pdc_encodingvector *ev = NULL; pdc_text_format nameformat = pdc_utf16; pdc_text_format outnameformat = pdc_utf8; pdc_byte *convname; char *outname = NULL; int outlen; if (name == NULL) return NULL; if (len == 0) { pdc_bool hasbom = pdc_is_utf8_bytecode(name); pdc_bool withbom = (flags & PDC_CONV_WITHBOM) ? pdc_true : pdc_false; /* already [EBCDIC-]UTF-8 encoded */ if ((flags & PDC_CONV_ISUTF8) || hasbom) { if ((hasbom && withbom) || (!hasbom && !withbom)) outname = pdc_strdup(pdc, name); else if (hasbom && !withbom) outname = pdc_strdup(pdc, &name[3]); else if (!hasbom && withbom) outname = pdc_strdup_withbom(pdc, name); if (outname != NULL) { return outname; } } /* 8-bit encoded string */ nameformat = pdc_bytes; if (enc < pdc_winansi) ev = pdc_get_encoding_vector(pdc, pdc_find_encoding(pdc, "host")); else ev = pdc_get_encoding_vector(pdc, enc); len = (int) strlen(name); } if (flags & PDC_CONV_EBCDIC) outnameformat = PDC_UTF8; flags |= PDC_CONV_TRY7BYTES; if (pdc->charref) flags |= PDC_CONV_HTMLCHAR; if (pdc->escapesequ) flags |= PDC_CONV_BSSEQU; /* convert to UTF-8 */ pdc_convert_string(pdc, nameformat, codepage, ev, (pdc_byte *) name, len, &outnameformat, NULL, &convname, &outlen, flags, pdc_true); return (char *) convname; } char * pdc_convert_name(pdc_core *pdc, const char *name, int len, int flags) { return pdc_convert_name_ext(pdc, name, len, pdc_invalidenc, 0, flags); } char * pdc_utf8_to_hostbytes(pdc_core *pdc, pdc_bool honorlang, char *name) { static const char fn[] = "pdc_utf8_to_hostbytes"; pdc_encoding outenc = pdc_invalidenc; pdc_encodingvector *outev = NULL; pdc_text_format informat = PDC_UTF8; pdc_text_format outformat = pdc_utf16; pdc_byte *outname = NULL; int len = (int) strlen(name); { (void) fn; (void) honorlang; outenc = pdc_find_encoding(pdc, "host"); } outev = pdc_get_encoding_vector(pdc, outenc); pdc_convert_string(pdc, 
informat, 0, NULL, (pdc_byte *) name, len, &outformat, outev, &outname, &len, PDC_CONV_TRYBYTES | PDC_CONV_NOBOM, pdc_true); if (outformat == pdc_utf16) { pdc_free(pdc, outname); outname = NULL; } return (char *) outname; } char * pdc_hostbytes_to_utf8(pdc_core *pdc, pdc_bool honorlang, char *name) { static const char fn[] = "pdc_hostbytes_to_utf8"; pdc_encoding inenc = pdc_invalidenc; pdc_encodingvector *inev = NULL; pdc_text_format informat = pdc_bytes; pdc_text_format outformat = PDC_UTF8; pdc_byte *outname = NULL; int len = (int) strlen(name); { (void) fn; (void) honorlang; inenc = pdc_find_encoding(pdc, "host"); } inev = pdc_get_encoding_vector(pdc, inenc); pdc_convert_string(pdc, informat, 0, inev, (pdc_byte *) name, len, &outformat, NULL, &outname, &len, PDC_CONV_NOBOM, pdc_true); return (char *) outname; } /* --------------------- basic UTF conversion functions --------------------- */ char * pdc_utf16_to_utf8(pdc_core *pdc, const char *utf16string, int len, int flags, int *size) { pdc_text_format outtextformat = pdc_utf8; pdc_byte *utf8string = NULL; int outlen; if (!utf16string) pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf16string", 0, 0, 0); if (flags & PDC_CONV_EBCDIC) outtextformat = PDC_UTF8; flags |= PDC_CONV_AUTOBOM; pdc_convert_string(pdc, pdc_utf16, 0, NULL, (pdc_byte *) utf16string, len, &outtextformat, NULL, &utf8string, &outlen, flags, pdc_true); if (size) *size = outlen; return (char *) utf8string; } char * pdc_utf8_to_utf16(pdc_core *pdc, const char *utf8string, const char *format, int flags, int *size) { pdc_text_format textformat = pdc_utf8; pdc_text_format outtextformat = pdc_utf16; pdc_byte *utf16string = NULL; int len; if (!utf8string) pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf8string", 0, 0, 0); len = (int) strlen(utf8string); if (format && *format) { int k = pdc_get_keycode_ci(format, pdc_textformat_keylist); if (k == PDC_KEY_NOTFOUND || ((pdc_text_format) k != pdc_utf16 && (pdc_text_format) k != pdc_utf16be && (pdc_text_format) k != 
pdc_utf16le)) pdc_error(pdc, PDC_E_ILLARG_STRING, "format", format, 0, 0); outtextformat = (pdc_text_format) k; } if (flags & PDC_CONV_EBCDIC) textformat = PDC_UTF8; if (outtextformat == pdc_utf16) flags |= PDC_CONV_AUTOBOM; else flags |= PDC_CONV_WITHBOM; pdc_convert_string(pdc, textformat, 0, NULL, (pdc_byte *) utf8string, len, &outtextformat, NULL, &utf16string, size, flags, pdc_true); return (char *) utf16string; } char * pdc_utf16_to_utf32(pdc_core *pdc, const char *utf16string, int len, int *size) { pdc_text_format outtextformat = pdc_utf32; pdc_byte *utf32string = NULL; if (!utf16string) pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf16string", 0, 0, 0); pdc_convert_string(pdc, pdc_utf16, 0, NULL, (pdc_byte *) utf16string, len, &outtextformat, NULL, &utf32string, size, 0, pdc_true); return (char *) utf32string; } char * pdc_utf32_to_utf8(pdc_core *pdc, const char *utf32string, int len, int flags, int *size) { pdc_text_format outtextformat = pdc_utf8; pdc_byte *utf8string = NULL; int outlen; if (!utf32string) pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf32string", 0, 0, 0); if (flags & PDC_CONV_EBCDIC) outtextformat = PDC_UTF8; flags |= PDC_CONV_AUTOBOM; pdc_convert_string(pdc, pdc_utf32, 0, NULL, (pdc_byte *) utf32string, len, &outtextformat, NULL, &utf8string, &outlen, flags, pdc_true); if (size) *size = outlen; return (char *) utf8string; } char * pdc_utf32_to_utf16(pdc_core *pdc, const char *utf32string, int len, const char *format, int flags, int *size) { pdc_text_format textformat = pdc_utf32; pdc_text_format outtextformat = pdc_utf16; pdc_byte *utf16string = NULL; if (!utf32string) pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf32string", 0, 0, 0); if (format && *format) { int k = pdc_get_keycode_ci(format, pdc_textformat_keylist); if (k == PDC_KEY_NOTFOUND || ((pdc_text_format) k != pdc_utf16 && (pdc_text_format) k != pdc_utf16be && (pdc_text_format) k != pdc_utf16le)) pdc_error(pdc, PDC_E_ILLARG_STRING, "format", format, 0, 0); outtextformat = (pdc_text_format) k; } if 
(outtextformat == pdc_utf16) flags |= PDC_CONV_AUTOBOM; else flags |= PDC_CONV_WITHBOM; pdc_convert_string(pdc, textformat, 0, NULL, (pdc_byte *) utf32string, len, &outtextformat, NULL, &utf16string, size, flags, pdc_true); return (char *) utf16string; } int pdc_char16_to_char32(pdc_core *pdc, const pdc_ushort *ustext, int *ic, int len, pdc_bool verbose) { pdc_ushort uvh = ustext[*ic]; if (uvh < PDC_UNICODE_MINHIGHSUR || uvh > PDC_UNICODE_MAXLOWSUR) { return (int) uvh; } else { UTF16 *isa16 = (UTF16 *) &ustext[*ic]; pdc_ushort uvl = 0; int icn = *ic + 1; if (icn < len) { uvl = ustext[icn]; if (uvh <= PDC_UNICODE_MAXHIGHSUR) { if (uvl >= PDC_UNICODE_MINLOWSUR && uvl <= PDC_UNICODE_MAXLOWSUR) { int usv; UTF16 *ise16 = isa16 + 2; UTF32 *isa32 = (UTF32 *) &usv; UTF32 *ise32 = isa32 + 1; pdc_convers_result result = pdc_convertUTF16toUTF32( &isa16, ise16, &isa32, ise32, strictConversion); if (result == conversionOK) { *ic = icn; return usv; } } } } pdc_set_errmsg(pdc, PDC_E_CONV_ILLUTF16SUR, pdc_errprintf(pdc, "%04X", uvh), pdc_errprintf(pdc, "%04X", uvl), 0, 0); if (verbose) pdc_error(pdc, -1, 0, 0, 0, 0); } return -1; } int pdc_char32_to_char16(pdc_core *pdc, int usv, pdc_ushort *uvlist, pdc_bool verbose) { if (usv < PDC_NUM_BMPVAL) { uvlist[0] = (pdc_ushort) usv; return 1; } else { UTF32 *isa32 = (UTF32 *) &usv; UTF32 *ise32 = isa32 + 1; UTF16 *isa16 = (UTF16 *) uvlist; UTF16 *ise16 = isa16 + 2; pdc_convers_result result = pdc_convertUTF32toUTF16( &isa32, ise32, &isa16, ise16, strictConversion); if (result == conversionOK) { return 2; } pdc_set_errmsg(pdc, PDC_E_CONV_ILLUTF32, pdc_errprintf(pdc, "%05X", usv), 0, 0, 0); if (verbose) pdc_error(pdc, -1, 0, 0, 0, 0); } return 0; }