00001
00002
00003
00004
00005
00006
00007
00008 #ifndef BOOSTER_LOCALE_UTF_H_INCLUDED
00009 #define BOOSTER_LOCALE_UTF_H_INCLUDED
00010
00011 #include <booster/cstdint.h>
00012
00013 namespace booster {
00014 namespace locale {
00020 namespace utf {
// Branch-prediction hints for conditions.
// When __GNUC__ is defined the condition is routed through __builtin_expect
// with the expected outcome (1 = likely, 0 = unlikely); otherwise the macros
// expand to the plain parenthesised condition.
#if defined(__GNUC__)
#  define BOOSTER_LOCALE_LIKELY(x)   __builtin_expect((x),1)
#  define BOOSTER_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
#else
#  define BOOSTER_LOCALE_LIKELY(x)   (x)
#  define BOOSTER_LOCALE_UNLIKELY(x) (x)
#endif
00029
00030
///
/// A Unicode code point, stored as an unsigned 32-bit value.
///
typedef uint32_t code_point;

///
/// Sentinel that decoders return for a malformed sequence.
/// It lies above 0x10FFFF, so is_valid_codepoint() rejects it.
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// Sentinel that decoders return when the input range ends before a
/// sequence is complete. It lies above 0x10FFFF, so is_valid_codepoint()
/// rejects it.
///
static const code_point incomplete = 0xFFFFFFFEu;

///
/// Check whether \a v may be used as a code point.
///
/// Returns true when \a v is at most 0x10FFFF and is not inside the
/// surrogate range 0xD800..0xDFFF; returns false otherwise.
///
inline bool is_valid_codepoint(code_point v)
{
    const bool within_unicode = v <= 0x10FFFF;
    const bool is_surrogate = v >= 0xD800 && v <= 0xDFFF;
    return within_unicode && !is_surrogate;
}
00057
00058 #ifdef BOOSTER_LOCALE_DOXYGEN
00059
00060
00061
00062 template<typename CharType,int size=sizeof(CharType)>
00063 struct utf_traits {
00067 typedef CharType char_type;
00082 template<typename Iterator>
00083 static code_point decode(Iterator &p,Iterator e);
00084
00092 static const int max_width;
00099 static int width(code_point value);
00100
00106 static int trail_length(char_type c);
00110 static bool is_trail(char_type c);
00114 static bool is_lead(char_type c);
00115
00126 template<typename Iterator>
00127 static Iterator encode(code_point value,Iterator out);
00133 template<typename Iterator>
00134 static code_point decode_valid(Iterator &p);
00135 };
00136
00137 #else
00138
00139 template<typename CharType,int size=sizeof(CharType)>
00140 struct utf_traits;
00141
00142 template<typename CharType>
00143 struct utf_traits<CharType,1> {
00144
00145 typedef CharType char_type;
00146
00147 static int trail_length(char_type ci)
00148 {
00149 unsigned char c = ci;
00150 if(c < 128)
00151 return 0;
00152 if(BOOSTER_LOCALE_UNLIKELY(c < 194))
00153 return -1;
00154 if(c < 224)
00155 return 1;
00156 if(c < 240)
00157 return 2;
00158 if(BOOSTER_LOCALE_LIKELY(c <=244))
00159 return 3;
00160 return -1;
00161 }
00162
00163 static const int max_width = 4;
00164
00165 static int width(code_point value)
00166 {
00167 if(value <=0x7F) {
00168 return 1;
00169 }
00170 else if(value <=0x7FF) {
00171 return 2;
00172 }
00173 else if(BOOSTER_LOCALE_LIKELY(value <=0xFFFF)) {
00174 return 3;
00175 }
00176 else {
00177 return 4;
00178 }
00179 }
00180
00181 static bool is_trail(char_type ci)
00182 {
00183 unsigned char c=ci;
00184 return (c & 0xC0)==0x80;
00185 }
00186
00187 static bool is_lead(char_type ci)
00188 {
00189 return !is_trail(ci);
00190 }
00191
00192 template<typename Iterator>
00193 static code_point decode(Iterator &p,Iterator e)
00194 {
00195 if(BOOSTER_LOCALE_UNLIKELY(p==e))
00196 return incomplete;
00197
00198 unsigned char lead = *p++;
00199
00200
00201 int trail_size = trail_length(lead);
00202
00203 if(BOOSTER_LOCALE_UNLIKELY(trail_size < 0))
00204 return illegal;
00205
00206
00207
00208
00209
00210 if(trail_size == 0)
00211 return lead;
00212
00213 code_point c = lead & ((1<<(6-trail_size))-1);
00214
00215
00216 unsigned char tmp;
00217 switch(trail_size) {
00218 case 3:
00219 if(BOOSTER_LOCALE_UNLIKELY(p==e))
00220 return incomplete;
00221 tmp = *p++;
00222 c = (c << 6) | ( tmp & 0x3F);
00223 case 2:
00224 if(BOOSTER_LOCALE_UNLIKELY(p==e))
00225 return incomplete;
00226 tmp = *p++;
00227 c = (c << 6) | ( tmp & 0x3F);
00228 case 1:
00229 if(BOOSTER_LOCALE_UNLIKELY(p==e))
00230 return incomplete;
00231 tmp = *p++;
00232 c = (c << 6) | ( tmp & 0x3F);
00233 }
00234
00235
00236
00237 if(BOOSTER_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
00238 return illegal;
00239
00240
00241 if(BOOSTER_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
00242 return illegal;
00243
00244 return c;
00245
00246 }
00247
00248 template<typename Iterator>
00249 static code_point decode_valid(Iterator &p)
00250 {
00251 unsigned char lead = *p++;
00252 if(lead < 192)
00253 return lead;
00254
00255 int trail_size;
00256
00257 if(lead < 224)
00258 trail_size = 1;
00259 else if(BOOSTER_LOCALE_LIKELY(lead < 240))
00260 trail_size = 2;
00261 else
00262 trail_size = 3;
00263
00264 code_point c = lead & ((1<<(6-trail_size))-1);
00265
00266 switch(trail_size) {
00267 case 3:
00268 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00269 case 2:
00270 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00271 case 1:
00272 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00273 }
00274
00275 return c;
00276 }
00277
00278
00279
00280 template<typename Iterator>
00281 static Iterator encode(code_point value,Iterator out)
00282 {
00283 if(value <=0x7F) {
00284 *out++ = value;
00285 }
00286 else if(value <=0x7FF) {
00287 *out++=(value >> 6) | 0xC0;
00288 *out++=(value & 0x3F) | 0x80;
00289 }
00290 else if(BOOSTER_LOCALE_LIKELY(value <=0xFFFF)) {
00291 *out++=(value >> 12) | 0xE0;
00292 *out++=((value >> 6) & 0x3F) | 0x80;
00293 *out++=(value & 0x3F) | 0x80;
00294 }
00295 else {
00296 *out++=(value >> 18) | 0xF0;
00297 *out++=((value >> 12) & 0x3F) | 0x80;
00298 *out++=((value >> 6) & 0x3F) | 0x80;
00299 *out++=(value & 0x3F) | 0x80;
00300 }
00301 return out;
00302 }
00303 };
00304
00305 template<typename CharType>
00306 struct utf_traits<CharType,2> {
00307 typedef CharType char_type;
00308
00309
00310 static bool is_first_surrogate(uint16_t x)
00311 {
00312 return 0xD800 <=x && x<= 0xDBFF;
00313 }
00314 static bool is_second_surrogate(uint16_t x)
00315 {
00316 return 0xDC00 <=x && x<= 0xDFFF;
00317 }
00318 static code_point combine_surrogate(uint16_t w1,uint16_t w2)
00319 {
00320 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
00321 }
00322 static int trail_length(char_type c)
00323 {
00324 if(is_first_surrogate(c))
00325 return 1;
00326 if(is_second_surrogate(c))
00327 return -1;
00328 return 0;
00329 }
00333 static bool is_trail(char_type c)
00334 {
00335 return is_second_surrogate(c);
00336 }
00340 static bool is_lead(char_type c)
00341 {
00342 return !is_second_surrogate(c);
00343 }
00344
00345 template<typename It>
00346 static code_point decode(It ¤t,It last)
00347 {
00348 if(BOOSTER_LOCALE_UNLIKELY(current == last))
00349 return incomplete;
00350 uint16_t w1=*current++;
00351 if(BOOSTER_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
00352 return w1;
00353 }
00354 if(w1 > 0xDBFF)
00355 return illegal;
00356 if(current==last)
00357 return incomplete;
00358 uint16_t w2=*current++;
00359 if(w2 < 0xDC00 || 0xDFFF < w2)
00360 return illegal;
00361 return combine_surrogate(w1,w2);
00362 }
00363 template<typename It>
00364 static code_point decode_valid(It ¤t)
00365 {
00366 uint16_t w1=*current++;
00367 if(BOOSTER_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
00368 return w1;
00369 }
00370 uint16_t w2=*current++;
00371 return combine_surrogate(w1,w2);
00372 }
00373
00374 static const int max_width = 2;
00375 static int width(code_point u)
00376 {
00377 return u>=0x10000 ? 2 : 1;
00378 }
00379 template<typename It>
00380 static It encode(code_point u,It out)
00381 {
00382 if(BOOSTER_LOCALE_LIKELY(u<=0xFFFF)) {
00383 *out++ = u;
00384 }
00385 else {
00386 u-=0x10000;
00387 *out++=0xD800 | (u>>10);
00388 *out++=0xDC00 | (u & 0x3FF);
00389 }
00390 return out;
00391 }
00392 };
00393
00394
00395 template<typename CharType>
00396 struct utf_traits<CharType,4> {
00397 typedef CharType char_type;
00398 static int trail_length(char_type c)
00399 {
00400 if(is_valid_codepoint(c))
00401 return 0;
00402 return -1;
00403 }
00404 static bool is_trail(char_type )
00405 {
00406 return false;
00407 }
00408 static bool is_lead(char_type )
00409 {
00410 return true;
00411 }
00412
00413 template<typename It>
00414 static code_point decode_valid(It ¤t)
00415 {
00416 return *current++;
00417 }
00418
00419 template<typename It>
00420 static code_point decode(It ¤t,It last)
00421 {
00422 if(BOOSTER_LOCALE_UNLIKELY(current == last))
00423 return booster::locale::utf::incomplete;
00424 code_point c=*current++;
00425 if(BOOSTER_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
00426 return booster::locale::utf::illegal;
00427 return c;
00428 }
00429 static const int max_width = 1;
00430 static int width(code_point )
00431 {
00432 return 1;
00433 }
00434 template<typename It>
00435 static It encode(code_point u,It out)
00436 {
00437 *out++ = u;
00438 return out;
00439 }
00440
00441 };
00442
00443 #endif
00444
00445
00446 }
00447 }
00448 }
00449
00450
00451 #endif
00452
00453
00454