1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
/*
* FTGL - OpenGL font library
*
* Copyright (c) 2008 Daniel Remenak <dtremenak@users.sourceforge.net>
*
* Portions derived from ConvertUTF.c Copyright (C) 2001-2004 Unicode, Inc
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef __FTUnicode__
#define __FTUnicode__
/**
 * Provides a way to easily walk multibyte unicode strings in the various
 * Unicode encodings (UTF-8, UTF-16, UTF-32, UCS-2, and UCS-4). Encodings
 * with elements larger than one byte must already be in the correct endian
 * order for the current architecture.
 *
 * The encoding is chosen from sizeof(T): 1 selects UTF-8, 2 selects UTF-16,
 * and anything else is read as one code point per element (UTF-32 / UCS-4).
 */
template <typename T>
class FTUnicodeStringItr
{
public:
    /**
     * Constructor. Also reads the first character and stores it.
     *
     * @param string  The buffer to iterate. No copy is made, so the buffer
     *                must outlive the iterator. It is dereferenced
     *                immediately and therefore must not be null.
     */
    FTUnicodeStringItr(const T* string)
        : curPos(string), curChar(0), nextPos(string)
    {
        // Pre-increment on purpose: the post-increment form would copy the
        // half-constructed object only to throw the copy away.
        ++(*this);
    }

    /**
     * Pre-increment operator.  Reads the next unicode character and sets
     * the state appropriately.
     * Note - not protected against overruns: incrementing an iterator that
     * already sits on the terminator reads past the end of the buffer.
     *
     * @return A reference to this iterator, now positioned on the next
     *         character.
     */
    FTUnicodeStringItr& operator++()
    {
        curPos = nextPos;
        // unicode handling
        switch (sizeof(T))
        {
            case 1: // UTF-8
                // get this character
                readUTF8(); break;
            case 2: // UTF-16
                readUTF16(); break;
            case 4: // UTF-32
                // fall through
            default: // error condition really, but give it a shot anyway
                curChar = *nextPos++;
        }
        return *this;
    }

    /**
     * Post-increment operator.  Reads the next character and sets
     * the state appropriately.
     * Note - not protected against overruns.
     *
     * @return A copy of the iterator as it was before the increment.
     */
    FTUnicodeStringItr operator++(int)
    {
        FTUnicodeStringItr temp = *this;
        ++*this;
        return temp;
    }

    /**
     * Equality operator.  Two FTUnicodeStringItrs are considered equal
     * if they have the same current buffer and buffer position.
     */
    bool operator==(const FTUnicodeStringItr& right) const
    {
        return curPos == right.getBufferFromHere();
    }

    /**
     * Dereference operator.
     *
     * @return  The unicode codepoint of the character currently pointed
     *          to by the FTUnicodeStringItr.
     */
    unsigned int operator*() const
    {
        return curChar;
    }

    /**
     * Buffer-fetching getter.  You can use this to retrieve the buffer
     * starting at the currently-iterated character for functions which
     * require a Unicode string as input.
     */
    const T* getBufferFromHere() const { return curPos; }

private:
    /**
     * Helper function for reading a single UTF8 character from the string.
     * Updates internal state appropriately.
     */
    void readUTF8();

    /**
     * Helper function for reading a single UTF16 character from the string.
     * Updates internal state appropriately.
     */
    void readUTF16();

    /**
     * The buffer position of the first element in the current character.
     */
    const T* curPos;

    /**
     * The character stored at the current buffer position (prefetched on
     * increment, so there's no penalty for dereferencing more than once).
     */
    unsigned int curChar;

    /**
     * The buffer position of the first element in the next character.
     */
    const T* nextPos;

    // unicode magic numbers
    static const char utf8bytes[256];
    static const unsigned long offsetsFromUTF8[6];
    static const unsigned long highSurrogateStart;
    static const unsigned long highSurrogateEnd;
    static const unsigned long lowSurrogateStart;
    static const unsigned long lowSurrogateEnd;
    static const unsigned long highSurrogateShift;
    static const unsigned long lowSurrogateBase;
};

/* The first character in a UTF8 sequence indicates how many bytes
 * to read (among other things).  Indexed by the lead byte; the value is
 * the total length of the sequence, lead byte included. */
template <typename T>
const char FTUnicodeStringItr<T>::utf8bytes[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
};

/* Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.  Entry [n-1] removes the marker bits that an
 * n-byte sequence accumulates while its bytes are shifted together. */
template <typename T>
const unsigned long FTUnicodeStringItr<T>::offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  0x03C82080UL, 0xFA082080UL, 0x82082080UL };

// get a UTF8 character; leave the tracking pointer at the start of the
// next character
//
// Every byte is widened through unsigned char so that instantiating the
// iterator with plain (possibly signed) char decodes bytes >= 0x80
// correctly.
//
// Only partially protected against invalid UTF8: if a byte that should be
// a continuation byte (10xxxxxx) is not one -- which includes hitting the
// NUL terminator in the middle of a sequence -- the character decodes as
// U+FFFD and the offending byte is left unconsumed, so a truncated
// sequence can no longer walk past the end of a NUL-terminated buffer.
// Overlong forms and the (illegal) 5 and 6 byte forms are still decoded
// as-is.
template <typename T>
inline void FTUnicodeStringItr<T>::readUTF8()
{
    const unsigned int replacementChar = 0xFFFD;

    const unsigned char lead = static_cast<unsigned char>(*nextPos);
    const unsigned int bytesInSequence = utf8bytes[lead];

    unsigned int ch = lead;
    ++nextPos;

    for (unsigned int i = 1; i < bytesInSequence; ++i)
    {
        const unsigned int next = static_cast<unsigned char>(*nextPos);
        if ((next & 0xC0u) != 0x80u)
        {
            // malformed or truncated sequence: stop before this byte
            curChar = replacementChar;
            return;
        }
        ch = (ch << 6) + next;
        ++nextPos;
    }

    ch -= offsetsFromUTF8[bytesInSequence - 1];
    curChar = ch;
}

// Magic numbers for UTF-16 conversions
template <typename T>
const unsigned long FTUnicodeStringItr<T>::highSurrogateStart = 0xD800;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::highSurrogateEnd = 0xDBFF;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::lowSurrogateStart = 0xDC00;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::lowSurrogateEnd = 0xDFFF;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::highSurrogateShift = 10;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::lowSurrogateBase = 0x0010000UL;

// get a UTF16 character; leave the tracking pointer at the start of the
// next character
//
// A high surrogate followed by a low surrogate is combined into a single
// code point.  An unpaired surrogate is returned unchanged.  Code units are
// masked to 16 bits so that a signed 16-bit T does not sign-extend.
template <typename T>
inline void FTUnicodeStringItr<T>::readUTF16()
{
    unsigned int ch = static_cast<unsigned int>(*nextPos++) & 0xFFFFu;

    // if we have the first half of the surrogate pair
    if (ch >= highSurrogateStart && ch <= highSurrogateEnd)
    {
        // Peek at the unit FOLLOWING the high surrogate (nextPos, not
        // curPos, which still points at the high surrogate itself).  When
        // the high surrogate is the last unit of the string this reads the
        // terminator, which is still inside the buffer.
        const unsigned int ch2 = static_cast<unsigned int>(*nextPos) & 0xFFFFu;

        // complete the surrogate pair
        if (ch2 >= lowSurrogateStart && ch2 <= lowSurrogateEnd)
        {
            ch = ((ch - highSurrogateStart) << highSurrogateShift)
                + (ch2 - lowSurrogateStart) + lowSurrogateBase;
            ++nextPos;
        }
    }
    curChar = ch;
}
#endif
|