diff options
Diffstat (limited to 'MSVC/iconv/iso2022_jp2.h')
-rw-r--r-- | MSVC/iconv/iso2022_jp2.h | 688 |
1 files changed, 688 insertions, 0 deletions
diff --git a/MSVC/iconv/iso2022_jp2.h b/MSVC/iconv/iso2022_jp2.h new file mode 100644 index 0000000..276d04c --- /dev/null +++ b/MSVC/iconv/iso2022_jp2.h @@ -0,0 +1,688 @@ +/* + * Copyright (C) 1999-2001 Free Software Foundation, Inc. + * This file is part of the GNU LIBICONV Library. + * + * The GNU LIBICONV Library is free software; you can redistribute it + * and/or modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * The GNU LIBICONV Library is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU LIBICONV Library; see the file COPYING.LIB. + * If not, write to the Free Software Foundation, Inc., 59 Temple Place - + * Suite 330, Boston, MA 02111-1307, USA. + */ + +/* + * ISO-2022-JP-2 + */ + +/* Specification: RFC 1554 */ +/* ESC '(' 'I' for JISX0201 Katakana is an extension not found in RFC 1554 or + CJK.INF, but implemented in glibc-2.1 and qt-2.0. */ + +#define ESC 0x1b + +/* + * The state is composed of one of the following values + */ +#define STATE_ASCII 0 +#define STATE_JISX0201ROMAN 1 +#define STATE_JISX0201KATAKANA 2 +#define STATE_JISX0208 3 +#define STATE_JISX0212 4 +#define STATE_GB2312 5 +#define STATE_KSC5601 6 +/* + * and one of the following values, << 8 + */ +#define STATE_G2_NONE 0 +#define STATE_G2_ISO8859_1 1 +#define STATE_G2_ISO8859_7 2 + +#define SPLIT_STATE \ + unsigned int state1 = state & 0xff, state2 = state >> 8 +#define COMBINE_STATE \ + state = (state2 << 8) | state1 + +static int +iso2022_jp2_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) +{ + state_t state = conv->istate; + SPLIT_STATE; + int count = 0; + unsigned char c; + for (;;) { + c = *s; + if (c == ESC) { + if (n < count+3) + goto none; + if (s[1] == '(') { + if (s[2] == 'B') { + state1 = STATE_ASCII; + s += 3; count += 3; + if (n < count+1) + goto none; + continue; + } + if (s[2] == 'J') { + state1 = STATE_JISX0201ROMAN; + s += 3; count += 3; + if (n < count+1) + goto none; + continue; + } + if (s[2] == 'I') { + state1 = STATE_JISX0201KATAKANA; + s += 3; count += 3; + if (n < count+1) + goto none; + continue; + } + return RET_ILSEQ; + } + if (s[1] == '$') { + if (s[2] == '@' || s[2] == 'B') { + /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */ + state1 = STATE_JISX0208; + s += 3; count += 3; + if (n < count+1) + goto none; + continue; + } + if (s[2] == 'A') { + state1 = STATE_GB2312; + s += 3; count += 3; + if (n < count+1) + goto none; + continue; + } + if (s[2] == '(') { + if (n < count+4) + goto none; + if (s[3] == 'D') { + state1 = STATE_JISX0212; + s += 4; count += 4; + if (n < count+1) + goto none; + continue; + } + if (s[3] == 'C') { + state1 = STATE_KSC5601; + s += 4; count += 4; + if (n < count+1) + goto none; + continue; + } + return RET_ILSEQ; + } + return RET_ILSEQ; + } + if (s[1] == '.') { + if (n < count+3) + goto none; + if (s[2] == 'A') { + state2 = STATE_G2_ISO8859_1; + s += 3; count += 3; + if (n < count+1) + goto none; + continue; + } + if (s[2] == 'F') { + state2 = STATE_G2_ISO8859_7; + s += 3; count += 3; + if (n < count+1) + goto none; + continue; + } + return RET_ILSEQ; + } + if (s[1] == 'N') { + switch (state2) { + case STATE_G2_NONE: + return RET_ILSEQ; + case STATE_G2_ISO8859_1: + if (s[2] < 0x80) { + unsigned char buf = s[2]+0x80; + int ret = iso8859_1_mbtowc(conv,pwc,&buf,1); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 1) abort(); + COMBINE_STATE; + conv->istate = state; + return count+3; + } else + return RET_ILSEQ; + case STATE_G2_ISO8859_7: + if (s[2] < 0x80) { + unsigned char buf = s[2]+0x80; + int ret = iso8859_7_mbtowc(conv,pwc,&buf,1); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 1) abort(); + COMBINE_STATE; + conv->istate = state; + return count+3; + } else + return RET_ILSEQ; + default: abort(); + } + } + return RET_ILSEQ; + } + break; + } + switch (state1) { + case STATE_ASCII: + if (c < 0x80) { + int ret = ascii_mbtowc(conv,pwc,s,1); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 1) abort(); + if (*pwc == 0x000a || *pwc == 0x000d) + state2 = STATE_G2_NONE; + COMBINE_STATE; + conv->istate = state; + return count+1; + } else + return RET_ILSEQ; + case STATE_JISX0201ROMAN: + if (c < 0x80) { + int ret = jisx0201_mbtowc(conv,pwc,s,1); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 1) abort(); + if (*pwc == 0x000a || *pwc == 0x000d) + state2 = STATE_G2_NONE; + COMBINE_STATE; + conv->istate = state; + return count+1; + } else + return RET_ILSEQ; + case STATE_JISX0201KATAKANA: + if (c < 0x80) { + unsigned char buf = c+0x80; + int ret = jisx0201_mbtowc(conv,pwc,&buf,1); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 1) abort(); + COMBINE_STATE; + conv->istate = state; + return count+1; + } else + return RET_ILSEQ; + case STATE_JISX0208: + if (n < count+2) + goto none; + if (s[0] < 0x80 && s[1] < 0x80) { + int ret = jisx0208_mbtowc(conv,pwc,s,2); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 2) abort(); + COMBINE_STATE; + conv->istate = state; + return count+2; + } else + return RET_ILSEQ; + case STATE_JISX0212: + if (n < count+2) + goto none; + if (s[0] < 0x80 && s[1] < 0x80) { + int ret = jisx0212_mbtowc(conv,pwc,s,2); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 2) abort(); + COMBINE_STATE; + conv->istate = state; + return count+2; + } else + return RET_ILSEQ; + case STATE_GB2312: + if (n < count+2) + goto none; + if (s[0] < 0x80 && s[1] < 0x80) { + int ret = gb2312_mbtowc(conv,pwc,s,2); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 2) abort(); + COMBINE_STATE; + conv->istate = state; + return count+2; + } else + return RET_ILSEQ; + case STATE_KSC5601: + if (n < count+2) + goto none; + if (s[0] < 0x80 && s[1] < 0x80) { + int ret = ksc5601_mbtowc(conv,pwc,s,2); + if (ret == RET_ILSEQ) + return RET_ILSEQ; + if (ret != 2) abort(); + COMBINE_STATE; + conv->istate = state; + return count+2; + } else + return RET_ILSEQ; + default: abort(); + } + +none: + COMBINE_STATE; + conv->istate = state; + return RET_TOOFEW(count); +} + +#undef COMBINE_STATE +#undef SPLIT_STATE + +/* + * The state can also contain one of the following values, << 16. + * Values >= STATE_TAG_LANGUAGE are temporary tag parsing states. + */ +#define STATE_TAG_NONE 0 +#define STATE_TAG_LANGUAGE 4 +#define STATE_TAG_LANGUAGE_j 5 +#define STATE_TAG_LANGUAGE_ja 1 +#define STATE_TAG_LANGUAGE_k 6 +#define STATE_TAG_LANGUAGE_ko 2 +#define STATE_TAG_LANGUAGE_z 7 +#define STATE_TAG_LANGUAGE_zh 3 + +#define SPLIT_STATE \ + unsigned int state1 = state & 0xff, state2 = (state >> 8) & 0xff, state3 = state >> 16 +#define COMBINE_STATE \ + state = (state3 << 16) | (state2 << 8) | state1 + +static int +iso2022_jp2_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) +{ + state_t state = conv->ostate; + SPLIT_STATE; + unsigned char buf[2]; + int ret; + /* This defines the conversion preferences depending on the current + langauge tag. */ + enum conversion { none = 0, european, japanese, chinese, korean, other }; + static const unsigned int conversion_lists[STATE_TAG_LANGUAGE] = { + /* STATE_TAG_NONE */ + japanese + (european << 3) + (chinese << 6) + (korean << 9) + (other << 12), + /* STATE_TAG_LANGUAGE_ja */ + japanese + (european << 3) + (chinese << 6) + (korean << 9) + (other << 12), + /* STATE_TAG_LANGUAGE_ko */ + korean + (european << 3) + (japanese << 6) + (chinese << 9) + (other << 12), + /* STATE_TAG_LANGUAGE_zh */ + chinese + (european << 3) + (japanese << 6) + (korean << 9) + (other << 12) + }; + unsigned int conversion_list; + + /* Handle Unicode tag characters (range U+E0000..U+E007F). */ + if ((wc >> 7) == (0xe0000 >> 7)) { + char c = wc & 0x7f; + if (c >= 'A' && c <= 'Z') + c += 'a'-'A'; + switch (c) { + case 0x01: + state3 = STATE_TAG_LANGUAGE; + COMBINE_STATE; + conv->ostate = state; + return 0; + case 'j': + if (state3 == STATE_TAG_LANGUAGE) { + state3 = STATE_TAG_LANGUAGE_j; + COMBINE_STATE; + conv->ostate = state; + return 0; + } + break; + case 'a': + if (state3 == STATE_TAG_LANGUAGE_j) { + state3 = STATE_TAG_LANGUAGE_ja; + COMBINE_STATE; + conv->ostate = state; + return 0; + } + break; + case 'k': + if (state3 == STATE_TAG_LANGUAGE) { + state3 = STATE_TAG_LANGUAGE_k; + COMBINE_STATE; + conv->ostate = state; + return 0; + } + break; + case 'o': + if (state3 == STATE_TAG_LANGUAGE_k) { + state3 = STATE_TAG_LANGUAGE_ko; + COMBINE_STATE; + conv->ostate = state; + return 0; + } + break; + case 'z': + if (state3 == STATE_TAG_LANGUAGE) { + state3 = STATE_TAG_LANGUAGE_z; + COMBINE_STATE; + conv->ostate = state; + return 0; + } + break; + case 'h': + if (state3 == STATE_TAG_LANGUAGE_z) { + state3 = STATE_TAG_LANGUAGE_zh; + COMBINE_STATE; + conv->ostate = state; + return 0; + } + break; + case 0x7f: + state3 = STATE_TAG_NONE; + COMBINE_STATE; + conv->ostate = state; + return 0; + default: + break; + } + /* Other tag characters reset the tag parsing state or are ignored. */ + if (state3 >= STATE_TAG_LANGUAGE) + state3 = STATE_TAG_NONE; + COMBINE_STATE; + conv->ostate = state; + return 0; + } + if (state3 >= STATE_TAG_LANGUAGE) + state3 = STATE_TAG_NONE; + + /* Try ASCII. */ + ret = ascii_wctomb(conv,buf,wc,1); + if (ret != RET_ILUNI) { + if (ret != 1) abort(); + if (buf[0] < 0x80) { + int count = (state1 == STATE_ASCII ? 1 : 4); + if (n < count) + return RET_TOOSMALL; + if (state1 != STATE_ASCII) { + r[0] = ESC; + r[1] = '('; + r[2] = 'B'; + r += 3; + state1 = STATE_ASCII; + } + r[0] = buf[0]; + if (wc == 0x000a || wc == 0x000d) + state2 = STATE_G2_NONE; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + conversion_list = conversion_lists[state3]; + + do { + switch (conversion_list & ((1 << 3) - 1)) { + + case european: + + /* Try ISO-8859-1. */ + ret = iso8859_1_wctomb(conv,buf,wc,1); + if (ret != RET_ILUNI) { + if (ret != 1) abort(); + if (buf[0] >= 0x80) { + int count = (state2 == STATE_G2_ISO8859_1 ? 3 : 6); + if (n < count) + return RET_TOOSMALL; + if (state2 != STATE_G2_ISO8859_1) { + r[0] = ESC; + r[1] = '.'; + r[2] = 'A'; + r += 3; + state2 = STATE_G2_ISO8859_1; + } + r[0] = ESC; + r[1] = 'N'; + r[2] = buf[0]-0x80; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + /* Try ISO-8859-7. */ + ret = iso8859_7_wctomb(conv,buf,wc,1); + if (ret != RET_ILUNI) { + if (ret != 1) abort(); + if (buf[0] >= 0x80) { + int count = (state2 == STATE_G2_ISO8859_7 ? 3 : 6); + if (n < count) + return RET_TOOSMALL; + if (state2 != STATE_G2_ISO8859_7) { + r[0] = ESC; + r[1] = '.'; + r[2] = 'F'; + r += 3; + state2 = STATE_G2_ISO8859_7; + } + r[0] = ESC; + r[1] = 'N'; + r[2] = buf[0]-0x80; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + break; + + case japanese: + + /* Try JIS X 0201-1976 Roman. */ + ret = jisx0201_wctomb(conv,buf,wc,1); + if (ret != RET_ILUNI) { + if (ret != 1) abort(); + if (buf[0] < 0x80) { + int count = (state1 == STATE_JISX0201ROMAN ? 1 : 4); + if (n < count) + return RET_TOOSMALL; + if (state1 != STATE_JISX0201ROMAN) { + r[0] = ESC; + r[1] = '('; + r[2] = 'J'; + r += 3; + state1 = STATE_JISX0201ROMAN; + } + r[0] = buf[0]; + if (wc == 0x000a || wc == 0x000d) + state2 = STATE_G2_NONE; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + /* Try JIS X 0208-1990 in place of JIS X 0208-1978 and + JIS X 0208-1983. */ + ret = jisx0208_wctomb(conv,buf,wc,2); + if (ret != RET_ILUNI) { + if (ret != 2) abort(); + if (buf[0] < 0x80 && buf[1] < 0x80) { + int count = (state1 == STATE_JISX0208 ? 2 : 5); + if (n < count) + return RET_TOOSMALL; + if (state1 != STATE_JISX0208) { + r[0] = ESC; + r[1] = '$'; + r[2] = 'B'; + r += 3; + state1 = STATE_JISX0208; + } + r[0] = buf[0]; + r[1] = buf[1]; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + /* Try JIS X 0212-1990. */ + ret = jisx0212_wctomb(conv,buf,wc,2); + if (ret != RET_ILUNI) { + if (ret != 2) abort(); + if (buf[0] < 0x80 && buf[1] < 0x80) { + int count = (state1 == STATE_JISX0212 ? 2 : 6); + if (n < count) + return RET_TOOSMALL; + if (state1 != STATE_JISX0212) { + r[0] = ESC; + r[1] = '$'; + r[2] = '('; + r[3] = 'D'; + r += 4; + state1 = STATE_JISX0212; + } + r[0] = buf[0]; + r[1] = buf[1]; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + break; + + case chinese: + + /* Try GB 2312-1980. */ + ret = gb2312_wctomb(conv,buf,wc,2); + if (ret != RET_ILUNI) { + if (ret != 2) abort(); + if (buf[0] < 0x80 && buf[1] < 0x80) { + int count = (state1 == STATE_GB2312 ? 2 : 5); + if (n < count) + return RET_TOOSMALL; + if (state1 != STATE_GB2312) { + r[0] = ESC; + r[1] = '$'; + r[2] = 'A'; + r += 3; + state1 = STATE_GB2312; + } + r[0] = buf[0]; + r[1] = buf[1]; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + break; + + case korean: + + /* Try KS C 5601-1992. */ + ret = ksc5601_wctomb(conv,buf,wc,2); + if (ret != RET_ILUNI) { + if (ret != 2) abort(); + if (buf[0] < 0x80 && buf[1] < 0x80) { + int count = (state1 == STATE_KSC5601 ? 2 : 6); + if (n < count) + return RET_TOOSMALL; + if (state1 != STATE_KSC5601) { + r[0] = ESC; + r[1] = '$'; + r[2] = '('; + r[3] = 'C'; + r += 4; + state1 = STATE_KSC5601; + } + r[0] = buf[0]; + r[1] = buf[1]; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + break; + + case other: + + /* Try JIS X 0201-1976 Kana. This is not officially part of + ISO-2022-JP-2, according to RFC 1554. Therefore we try this + only after all other attempts. */ + ret = jisx0201_wctomb(conv,buf,wc,1); + if (ret != RET_ILUNI) { + if (ret != 1) abort(); + if (buf[0] >= 0x80) { + int count = (state1 == STATE_JISX0201KATAKANA ? 1 : 4); + if (n < count) + return RET_TOOSMALL; + if (state1 != STATE_JISX0201KATAKANA) { + r[0] = ESC; + r[1] = '('; + r[2] = 'I'; + r += 3; + state1 = STATE_JISX0201KATAKANA; + } + r[0] = buf[0]-0x80; + COMBINE_STATE; + conv->ostate = state; + return count; + } + } + + break; + + default: + abort(); + } + + conversion_list = conversion_list >> 3; + } while (conversion_list != 0); + + return RET_ILUNI; +} + +static int +iso2022_jp2_reset (conv_t conv, unsigned char *r, int n) +{ + state_t state = conv->ostate; + SPLIT_STATE; + (void)state2; + (void)state3; + if (state1 != STATE_ASCII) { + if (n < 3) + return RET_TOOSMALL; + r[0] = ESC; + r[1] = '('; + r[2] = 'B'; + /* conv->ostate = 0; will be done by the caller */ + return 3; + } else + return 0; +} + +#undef COMBINE_STATE +#undef SPLIT_STATE +#undef STATE_TAG_LANGUAGE_zh +#undef STATE_TAG_LANGUAGE_z +#undef STATE_TAG_LANGUAGE_ko +#undef STATE_TAG_LANGUAGE_k +#undef STATE_TAG_LANGUAGE_ja +#undef STATE_TAG_LANGUAGE_j +#undef STATE_TAG_LANGUAGE +#undef STATE_TAG_NONE +#undef STATE_G2_ISO8859_7 +#undef STATE_G2_ISO8859_1 +#undef STATE_G2_NONE +#undef STATE_KSC5601 +#undef STATE_GB2312 +#undef STATE_JISX0212 +#undef STATE_JISX0208 +#undef STATE_JISX0201KATAKANA +#undef STATE_JISX0201ROMAN +#undef STATE_ASCII |