1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
|
/*---------------------------------------------------------------------------*
| PDFlib - A library for generating PDF on the fly |
+---------------------------------------------------------------------------+
| Copyright (c) 1997-2006 Thomas Merz and PDFlib GmbH. All rights reserved. |
+---------------------------------------------------------------------------+
| |
| This software is subject to the PDFlib license. It is NOT in the |
| public domain. Extended versions and commercial licenses are |
| available, please check http://www.pdflib.com. |
| |
*---------------------------------------------------------------------------*/
/* $Id: pc_unicode.h,v 1.2 2009/10/20 18:12:26 scuri Exp $
*
* Unicode glyph name conversion routines
*
*/
#ifndef PC_UNICODE_H
#define PC_UNICODE_H
/* Sizes and upper limit of the Unicode code space */
#define PDC_NUM_BMPVAL 0x10000 /* number of code points in the BMP (U+0000..U+FFFF) */
#define PDC_NUM_UNIVAL 0x110000 /* number of Unicode code points (U+0000..U+10FFFF) */
#define PDC_MAX_UNIVAL 0x10FFFF /* highest Unicode code point */
/*
 * Named Unicode code points used by the library.
 * The trailing comment gives the Unicode character name of each value.
 */
#define PDC_UNICODE_HT 0x0009 /* CHARACTER TABULATION (horizontal tab) */
#define PDC_UNICODE_LF 0x000A /* LINE FEED */
#define PDC_UNICODE_VT 0x000B /* LINE TABULATION (vertical tab) */
#define PDC_UNICODE_FF 0x000C /* FORM FEED */
#define PDC_UNICODE_CR 0x000D /* CARRIAGE RETURN */
#define PDC_UNICODE_ETB 0x0017 /* END OF TRANSMISSION BLOCK */
#define PDC_UNICODE_ESC 0x001B /* ESCAPE */
#define PDC_UNICODE_SPACE 0x0020 /* SPACE */
#define PDC_UNICODE_QUOTMARK 0x0022 /* QUOTATION MARK */
#define PDC_UNICODE_AMPERSAND 0x0026 /* AMPERSAND */
#define PDC_UNICODE_APOSTROPHE 0x0027 /* APOSTROPHE */
#define PDC_UNICODE_HYPHEN 0x002D /* HYPHEN-MINUS */
#define PDC_UNICODE_PERIOD 0x002E /* FULL STOP */
#define PDC_UNICODE_SEMICOLON 0x003B /* SEMICOLON */
#define PDC_UNICODE_LESS_THAN 0x003C /* LESS-THAN SIGN */
#define PDC_UNICODE_GREATER_THAN 0x003E /* GREATER-THAN SIGN */
#define PDC_UNICODE_BACKSLASH 0x005C /* REVERSE SOLIDUS */
#define PDC_UNICODE_LEFT_CURLY 0x007B /* LEFT CURLY BRACKET */
#define PDC_UNICODE_RIGHT_CURLY 0x007D /* RIGHT CURLY BRACKET */
#define PDC_UNICODE_DELETE 0x007F /* DELETE */
#define PDC_UNICODE_NEL 0x0085 /* NEXT LINE */
#define PDC_UNICODE_NBSP 0x00A0 /* NO-BREAK SPACE */
#define PDC_UNICODE_SHY 0x00AD /* SOFT HYPHEN */
#define PDC_UNICODE_MACRON 0x00AF /* MACRON */
#define PDC_UNICODE_MICRO 0x00B5 /* MICRO SIGN */
#define PDC_UNICODE_MIDDLEDOT 0x00B7 /* MIDDLE DOT */
#define PDC_UNICODE_MODMACRON 0x02C9 /* MODIFIER LETTER MACRON */
#define PDC_UNICODE_CAPDELTA 0x0394 /* GREEK CAPITAL LETTER DELTA */
#define PDC_UNICODE_CAPOMEGA 0x03A9 /* GREEK CAPITAL LETTER OMEGA */
#define PDC_UNICODE_SMALLMU 0x03BC /* GREEK SMALL LETTER MU */
#define PDC_UNICODE_LS 0x2028 /* LINE SEPARATOR */
#define PDC_UNICODE_PS 0x2029 /* PARAGRAPH SEPARATOR */
#define PDC_UNICODE_NNBSP 0x202F /* NARROW NO-BREAK SPACE */
#define PDC_UNICODE_FRACSLASH 0x2044 /* FRACTION SLASH */
#define PDC_UNICODE_MMSPACE 0x205F /* MEDIUM MATHEMATICAL SPACE */
#define PDC_UNICODE_EURO 0x20AC /* EURO SIGN */
#define PDC_UNICODE_OHMSIGN 0x2126 /* OHM SIGN */
#define PDC_UNICODE_INCREMENT 0x2206 /* INCREMENT */
#define PDC_UNICODE_DIVSLASH 0x2215 /* DIVISION SLASH */
#define PDC_UNICODE_BULLETOP 0x2219 /* BULLET OPERATOR */
#define PDC_UNICODE_IDEOSPACE 0x3000 /* IDEOGRAPHIC SPACE */
/* maximal values of ASCII and Latin-1 characters */
#define PDC_UNICODE_MAXASCII 0x007F
#define PDC_UNICODE_MAXLATIN1 0x00FF
/* maximal resp. single value of Japanese HW characters
 * (HW presumably means "halfwidth"; 0x00A5 is YEN SIGN) */
#define PDC_UNICODE_MAXHW 0x007E
#define PDC_UNICODE_SINGHW 0x00A5
/* Unicode borders of fullwidth forms of ASCII characters.
 * DIFFWASCII is the offset between an ASCII character and its
 * fullwidth form; MINFWASCII is that offset applied to SPACE. */
#define PDC_UNICODE_MINFWASCII 0xFF00
#define PDC_UNICODE_MAXFWASCII 0xFF5E
#define PDC_UNICODE_DIFFWASCII 0xFEE0
/* = PDC_UNICODE_MINFWASCII - PDC_UNICODE_SPACE (0xFF00 - 0x0020) */
/* Unicode borders of fullwidth forms of Symbol characters */
#define PDC_UNICODE_MINFWSYMBOL 0xFFE0
#define PDC_UNICODE_MAXFWSYMBOL 0xFFE6
/* Unicode borders of Private Use Area (PUA) in the BMP */
#define PDC_UNICODE_MINPUA 0xE000
#define PDC_UNICODE_MAXPUA 0xF8FF
/* Begin of PDFlib PUA (lies inside the PUA range above) */
#define PDC_UNICODE_PDFPUA 0xF200
/* Unicode borders of Unicode Corporate Use Subarea as used by Adobe Systems
 * (the upper end of the PUA range above) */
#define PDC_UNICODE_MINCUS 0xF600
#define PDC_UNICODE_MAXCUS 0xF8FF
/* Unicode Surrogate ranges: high (leading) and low (trailing)
 * UTF-16 code units */
#define PDC_UNICODE_MINHIGHSUR 0xD800
#define PDC_UNICODE_MAXHIGHSUR 0xDBFF
#define PDC_UNICODE_MINLOWSUR 0xDC00
#define PDC_UNICODE_MAXLOWSUR 0xDFFF
/* Unicode borders of higher Unicode spaces
 * (U+2000 EN QUAD .. U+200B ZERO WIDTH SPACE) */
#define PDC_UNICODE_MINSPACE 0x2000
#define PDC_UNICODE_MAXSPACE 0x200B
/* Unicode borders of CJK compatibility forms and small form variants.
 * NOTE(review): the role of the MID border (0xFE48) is not visible in
 * this header - confirm against its users. */
#define PDC_UNICODE_MINCJKFORMS 0xFE30
#define PDC_UNICODE_MIDCJKFORMS 0xFE48
#define PDC_UNICODE_MAXCJKFORMS 0xFE6F
/* replacement character (U+FFFD REPLACEMENT CHARACTER) */
#define PDC_UNICODE_REPLCHAR 0xFFFD
/* special character for CRLF (U+FDD0 is a Unicode noncharacter).
 * NOTE(review): this name uses the PDF_ prefix while its neighbours use
 * PDC_; it is kept as is because callers depend on the name. */
#define PDF_UNICODE_CRLF 0xFDD0
/* not a character (U+FFFF is a Unicode noncharacter) */
#define PDC_UNICODE_NOTCHAR 0xFFFF
/* Latin and Armenian ligatures:
 * U+0132/U+0133 are the IJ/ij ligatures, U+FB00..U+FB17 covers the
 * Latin and Armenian ligatures of the Alphabetic Presentation Forms */
#define PDC_UNICODE_CAPLIGATIJ 0x0132
#define PDC_UNICODE_SMALLLIGATIJ 0x0133
#define PDC_UNICODE_MINLIGAT 0xFB00
#define PDC_UNICODE_MAXLIGAT 0xFB17
/* The Unicode byte order mark (BOM) byte parts:
 * PDF_BOM0, PDF_BOM1 are the two bytes of U+FEFF in big-endian order
 * (UTF-16BE: FE FF, UTF-16LE: FF FE);
 * PDF_BOM2, PDF_BOM3, PDF_BOM4 are the UTF-8 encoding of U+FEFF
 * (EF BB BF). */
#define PDC_UNICODE_BOM 0xFEFF
#define PDF_BOM0 0xFE
#define PDF_BOM1 0xFF
#define PDF_BOM2 0xEF
#define PDF_BOM3 0xBB
#define PDF_BOM4 0xBF
/*
 * UTF-16 detection: test whether the buffer starts with the byte order
 * mark, in big-endian (FE FF) or little-endian (FF FE) byte order.
 * s must not be NULL and must provide at least two readable bytes.
 * Note that s is evaluated more than once.
 */
#define pdc_is_utf16be_unicode(s) \
    (*((pdc_byte *)(s)) == PDF_BOM0 && \
     *((pdc_byte *)(s) + 1) == PDF_BOM1)

#define pdc_is_utf16le_unicode(s) \
    (*((pdc_byte *)(s)) == PDF_BOM1 && \
     *((pdc_byte *)(s) + 1) == PDF_BOM0)
/*
 * UTF-32 detection: test whether the buffer starts with the byte order
 * mark, in big-endian (00 00 FE FF) or little-endian (FF FE 00 00)
 * byte order.
 * s must not be NULL and must provide at least four readable bytes.
 * Note that s is evaluated more than once.
 */
#define pdc_is_utf32be_unicode(s) \
    (*((pdc_byte *)(s)) == 0x00 && \
     *((pdc_byte *)(s) + 1) == 0x00 && \
     *((pdc_byte *)(s) + 2) == PDF_BOM0 && \
     *((pdc_byte *)(s) + 3) == PDF_BOM1)

#define pdc_is_utf32le_unicode(s) \
    (*((pdc_byte *)(s)) == PDF_BOM1 && \
     *((pdc_byte *)(s) + 1) == PDF_BOM0 && \
     *((pdc_byte *)(s) + 2) == 0x00 && \
     *((pdc_byte *)(s) + 3) == 0x00)
/*
 * UTF-8 detection: test whether the buffer starts with the UTF-8 encoded
 * byte order mark (EF BB BF).
 * s must not be NULL and must provide at least three readable bytes
 * unless an earlier byte already differs.
 * Note that s is evaluated more than once.
 */
#define PDC_UTF8_STRING "\xEF\xBB\xBF"

#define pdc_is_utf8_unicode(s) \
    (*((pdc_byte *)(s)) == PDF_BOM2 && \
     *((pdc_byte *)(s) + 1) == PDF_BOM3 && \
     *((pdc_byte *)(s) + 2) == PDF_BOM4)

/* same check under a second name */
#define pdc_is_utf8_bytecode(s) pdc_is_utf8_unicode(s)
/*
 * Write the three bytes of the UTF-8 byte order mark (EF BB BF) to the
 * start of the buffer s.
 * s must not be NULL and must provide at least three writable bytes.
 * Note that s is evaluated three times.
 *
 * The expansion is a single parenthesized expression without a trailing
 * semicolon, so that "pdc_copy_utf8_bom(s);" is exactly one statement and
 * can be used as the unbraced body of an if/else.
 */
#define pdc_copy_utf8_bom(s) \
    (((pdc_byte *)(s))[0] = PDF_BOM2, \
     ((pdc_byte *)(s))[1] = PDF_BOM3, \
     ((pdc_byte *)(s))[2] = PDF_BOM4)
/* Aliases for the UTF-8 text format: the pdc_text_format enumerator,
 * its keyword string, and an associated boolean flag.
 * NOTE(review): the purpose of PDC_UTF8_FLAG is not visible in this
 * header - confirm against its users. */
#define PDC_UTF8 pdc_utf8
#define PDC_UTF8_STRG "utf8"
#define PDC_UTF8_FLAG pdc_false
/* Characters that open ('&') and close (';') an HTML character reference */
#define PDC_HTML_CTRLCHAR '&'
#define PDC_HTML_DELIMITCHAR ';'
/* Result codes of a conversion between Unicode encoding forms. */
typedef enum
{
    conversionOK    = 0,    /* the conversion succeeded */
    sourceExhausted = 1,    /* source ended within a partial character */
    targetExhausted = 2,    /* target has insufficient room for the result */
    sourceIllegal   = 3     /* source sequence is illegal or malformed */
}
pdc_convers_result;
/* Strictness mode of a conversion: strict (0) or lenient (1). */
typedef enum
{
    strictConversion  = 0,
    lenientConversion = 1
}
pdc_convers_flags;
/*
 * Option bits for the "flags" argument of pdc_convert_string(),
 * pdc_strdup_ext(), pdc_convert_name_ext() and the pdc_utf*_to_utf*()
 * converters that take a flags argument.
 * Every flag occupies its own bit (bits 0..20), so flags can be
 * combined with bitwise OR.
 */
#define PDC_CONV_FORCEUTF16     (1 << 0)
#define PDC_CONV_TRY7BYTES      (1 << 1)
#define PDC_CONV_TRYBYTES       (1 << 2)
#define PDC_CONV_WITHBOM        (1 << 3)
#define PDC_CONV_NOBOM          (1 << 4)
#define PDC_CONV_AUTOBOM        (1 << 5)
#define PDC_CONV_ANALYZE        (1 << 6)
#define PDC_CONV_TMPALLOC       (1 << 7)
#define PDC_CONV_HTMLCHAR       (1 << 8)
#define PDC_CONV_NEWALLOC       (1 << 9)
#define PDC_CONV_INFLATE        (1 << 10)
#define PDC_CONV_ESCSEQU        (1 << 11)
#define PDC_CONV_BSSEQU         (1 << 12)
#define PDC_CONV_EBCDIC         (1 << 13)
#define PDC_CONV_ENCERROR       (1 << 14)
#define PDC_CONV_KEEPLBCHAR     (1 << 15)
#define PDC_CONV_LOGGING        (1 << 16)
#define PDC_CONV_ISUTF8         (1 << 17)
#define PDC_CONV_ASCII          (1 << 18)
#define PDC_CONV_MAXSTRLEN      (1 << 19)
#define PDC_CONV_FILENAME       (1 << 20)
/*
 * Text formats known to the conversion routines.
 *
 * DON'T change the order.
 * All values are assigned explicitly; the value 6 is not used.
 * NOTE(review): pdc_utf32 has no keyword in pdc_textformat_keylist -
 * confirm whether that is intentional.
 */
typedef enum
{
pdc_auto = 1,
pdc_auto2 = 2,
pdc_bytes = 3,
pdc_bytes2 = 4,
pdc_utf8 = 5, /* UTF-8 */
pdc_utf16 = 7, /* UTF-16 */
pdc_utf16be = 8, /* UTF-16 big endian */
pdc_utf16le = 9, /* UTF-16 little endian */
pdc_utf32 = 10 /* UTF-32 */
}
pdc_text_format;
/*
 * Keyword table mapping text format names to pdc_text_format values.
 * A copy for pdflib exists in p_keyconn.h; keep both in sync.
 *
 * The table is only defined in a translation unit that defines
 * PC_UNICODE_C before including this header, and it is terminated by
 * a {NULL, 0} entry.
 * NOTE(review): there is no "utf32" keyword although pdc_utf32 is a
 * member of pdc_text_format - confirm whether that is intentional.
 */
#if defined(PC_UNICODE_C)
static const pdc_keyconn pdc_textformat_keylist[] =
{
{"auto", pdc_auto},
{"auto2", pdc_auto2},
{"bytes", pdc_bytes},
{"bytes2", pdc_bytes2},
{"utf8", pdc_utf8},
{"utf16", pdc_utf16},
{"utf16be", pdc_utf16be},
{"utf16le", pdc_utf16le},
{NULL, 0} /* end-of-table marker */
};
#endif /* PC_UNICODE_C */
/*
 * Returns a string for the given text format value.
 * NOTE(review): presumably the keyword from pdc_textformat_keylist;
 * the implementation is not part of this header - confirm.
 */
const char *pdc_get_textformat(int textformat);
/*
 * Converts the inlen bytes at instring from text format inutf (with
 * optional codepage / input encoding vector inev) and delivers the
 * result through *outstring and *outlen. *oututf_p is passed by
 * pointer, so it can carry the requested format in and the resulting
 * format out (presumably - confirm). flags is a combination of the
 * PDC_CONV_* bits.
 * NOTE(review): the meaning of the int return value and the ownership
 * of *outstring are not visible here - check the implementation.
 */
int pdc_convert_string(pdc_core *pdc,
pdc_text_format inutf, int codepage, pdc_encodingvector *inev,
pdc_byte *instring, int inlen, pdc_text_format *oututf_p,
pdc_encodingvector *outev, pdc_byte **outstring, int *outlen, int flags,
pdc_bool verbose);
/*
 * Variant of pdc_convert_string() that takes three additional
 * parameters: a glyph table (glyphtab with tabsize entries) and a
 * replacement character replchar.
 * NOTE(review): how these are applied is not visible here - check the
 * implementation.
 */
int pdc_convert_textstring(pdc_core *pdc,
pdc_text_format inutf, int codepage, pdc_encodingvector *inev,
const pdc_glyph_tab *glyphtab, int tabsize, int replchar,
pdc_byte *instring, int inlen,
pdc_text_format *oututf_p, pdc_encodingvector *outev,
pdc_byte **outstring, int *outlen, int flags,
pdc_bool verbose);
/*
 * Name conversion: both functions take a name of len bytes plus
 * PDC_CONV_* style flags and return a char pointer; the _ext variant
 * additionally takes an encoding and a codepage.
 * NOTE(review): ownership of the returned string is not visible here -
 * check the implementation.
 */
char *pdc_convert_name(pdc_core *pdc, const char *name, int len, int flags);
char *pdc_convert_name_ext(pdc_core *pdc, const char *name, int len,
pdc_encoding enc, int codepage, int flags);
/*
 * Conversion between UTF-8 and host byte strings, in both directions.
 * NOTE(review): the effect of honorlang and the ownership of the
 * returned string are not visible here - check the implementation.
 */
char *pdc_utf8_to_hostbytes(pdc_core *pdc, pdc_bool honorlang, char *name);
char *pdc_hostbytes_to_utf8(pdc_core *pdc, pdc_bool honorlang, char *name);
/*
 * String conversions between the UTF-8, UTF-16 and UTF-32 encoding
 * forms. Each function returns the converted string as a char pointer
 * and reports a size through *size (presumably the length of the
 * result in bytes - confirm). Functions producing UTF-16 take a
 * "format" string (presumably a text format keyword such as "utf16be"
 * - confirm).
 * NOTE(review): ownership of the returned buffers is not visible here -
 * check the implementation.
 */
char *pdc_utf16_to_utf8(pdc_core *pdc, const char *utf16string, int len,
int flags, int *size);
char *pdc_utf8_to_utf16(pdc_core *pdc, const char *utf8string,
const char *format, int flags, int *size);
char *pdc_utf16_to_utf32(pdc_core *pdc, const char *utf16string, int len,
int *size);
char *pdc_utf32_to_utf8(pdc_core *pdc, const char *utf32string, int len,
int flags, int *size);
char *pdc_utf32_to_utf16(pdc_core *pdc, const char *utf32string, int len,
const char *format, int flags, int *size);
/*
 * Single-character conversions between 16-bit and 32-bit values.
 * NOTE(review): presumably pdc_char16_to_char32() reads the UTF-16 unit
 * (or surrogate pair) at ustext[*ic] within len units and advances *ic,
 * and pdc_char32_to_char16() stores the one or two UTF-16 units of usv
 * in uvlist - confirm against the implementation, including the
 * meaning of the return values.
 */
int pdc_char16_to_char32(pdc_core *pdc, const pdc_ushort *ustext, int *ic,
int len, pdc_bool verbose);
int pdc_char32_to_char16(pdc_core *pdc, int usv, pdc_ushort *uvlist,
pdc_bool verbose);
#endif /* PC_UNICODE_H */
|