// CppCMS
00001 // 00002 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 00003 // 00004 // Distributed under the Boost Software License, Version 1.0. (See 00005 // accompanying file LICENSE_1_0.txt or copy at 00006 // http://www.boost.org/LICENSE_1_0.txt) 00007 // 00008 #ifndef BOOSTER_LOCALE_UTF_H_INCLUDED 00009 #define BOOSTER_LOCALE_UTF_H_INCLUDED 00010 00011 #include <booster/cstdint.h> 00012 00013 namespace booster { 00014 namespace locale { 00020 namespace utf { 00022 #ifdef __GNUC__ 00023 # define BOOSTER_LOCALE_LIKELY(x) __builtin_expect((x),1) 00024 # define BOOSTER_LOCALE_UNLIKELY(x) __builtin_expect((x),0) 00025 #else 00026 # define BOOSTER_LOCALE_LIKELY(x) (x) 00027 # define BOOSTER_LOCALE_UNLIKELY(x) (x) 00028 #endif 00029 00030 00034 typedef uint32_t code_point; 00035 00039 static const code_point illegal = 0xFFFFFFFFu; 00040 00044 static const code_point incomplete = 0xFFFFFFFEu; 00045 00049 inline bool is_valid_codepoint(code_point v) 00050 { 00051 if(v>0x10FFFF) 00052 return false; 00053 if(0xD800 <=v && v<= 0xDFFF) // surragates 00054 return false; 00055 return true; 00056 } 00057 00058 #ifdef BOOSTER_LOCALE_DOXYGEN 00059 00060 00061 00062 template<typename CharType,int size=sizeof(CharType)> 00063 struct utf_traits { 00067 typedef CharType char_type; 00082 template<typename Iterator> 00083 static code_point decode(Iterator &p,Iterator e); 00084 00092 static const int max_width; 00099 static int width(code_point value); 00100 00106 static int trail_length(char_type c); 00110 static bool is_trail(char_type c); 00114 static bool is_lead(char_type c); 00115 00126 template<typename Iterator> 00127 static Iterator encode(code_point value,Iterator out); 00133 template<typename Iterator> 00134 static code_point decode_valid(Iterator &p); 00135 }; 00136 00137 #else 00138 00139 template<typename CharType,int size=sizeof(CharType)> 00140 struct utf_traits; 00141 00142 template<typename CharType> 00143 struct utf_traits<CharType,1> { 00144 00145 typedef CharType 
char_type; 00146 00147 static int trail_length(char_type ci) 00148 { 00149 unsigned char c = ci; 00150 if(c < 128) 00151 return 0; 00152 if(BOOSTER_LOCALE_UNLIKELY(c < 194)) 00153 return -1; 00154 if(c < 224) 00155 return 1; 00156 if(c < 240) 00157 return 2; 00158 if(BOOSTER_LOCALE_LIKELY(c <=244)) 00159 return 3; 00160 return -1; 00161 } 00162 00163 static const int max_width = 4; 00164 00165 static int width(code_point value) 00166 { 00167 if(value <=0x7F) { 00168 return 1; 00169 } 00170 else if(value <=0x7FF) { 00171 return 2; 00172 } 00173 else if(BOOSTER_LOCALE_LIKELY(value <=0xFFFF)) { 00174 return 3; 00175 } 00176 else { 00177 return 4; 00178 } 00179 } 00180 00181 static bool is_trail(char_type ci) 00182 { 00183 unsigned char c=ci; 00184 return (c & 0xC0)==0x80; 00185 } 00186 00187 static bool is_lead(char_type ci) 00188 { 00189 return !is_trail(ci); 00190 } 00191 00192 template<typename Iterator> 00193 static code_point decode(Iterator &p,Iterator e) 00194 { 00195 if(BOOSTER_LOCALE_UNLIKELY(p==e)) 00196 return incomplete; 00197 00198 unsigned char lead = *p++; 00199 00200 // First byte is fully validated here 00201 int trail_size = trail_length(lead); 00202 00203 if(BOOSTER_LOCALE_UNLIKELY(trail_size < 0)) 00204 return illegal; 00205 00206 // 00207 // Ok as only ASCII may be of size = 0 00208 // also optimize for ASCII text 00209 // 00210 if(trail_size == 0) 00211 return lead; 00212 00213 code_point c = lead & ((1<<(6-trail_size))-1); 00214 00215 // Read the rest 00216 unsigned char tmp; 00217 switch(trail_size) { 00218 case 3: 00219 if(BOOSTER_LOCALE_UNLIKELY(p==e)) 00220 return incomplete; 00221 tmp = *p++; 00222 c = (c << 6) | ( tmp & 0x3F); 00223 case 2: 00224 if(BOOSTER_LOCALE_UNLIKELY(p==e)) 00225 return incomplete; 00226 tmp = *p++; 00227 c = (c << 6) | ( tmp & 0x3F); 00228 case 1: 00229 if(BOOSTER_LOCALE_UNLIKELY(p==e)) 00230 return incomplete; 00231 tmp = *p++; 00232 c = (c << 6) | ( tmp & 0x3F); 00233 } 00234 00235 // Check code point validity: no 
surrogates and 00236 // valid range 00237 if(BOOSTER_LOCALE_UNLIKELY(!is_valid_codepoint(c))) 00238 return illegal; 00239 00240 // make sure it is the most compact representation 00241 if(BOOSTER_LOCALE_UNLIKELY(width(c)!=trail_size + 1)) 00242 return illegal; 00243 00244 return c; 00245 00246 } 00247 00248 template<typename Iterator> 00249 static code_point decode_valid(Iterator &p) 00250 { 00251 unsigned char lead = *p++; 00252 if(lead < 192) 00253 return lead; 00254 00255 int trail_size; 00256 00257 if(lead < 224) 00258 trail_size = 1; 00259 else if(BOOSTER_LOCALE_LIKELY(lead < 240)) // non-BMP rare 00260 trail_size = 2; 00261 else 00262 trail_size = 3; 00263 00264 code_point c = lead & ((1<<(6-trail_size))-1); 00265 00266 switch(trail_size) { 00267 case 3: 00268 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 00269 case 2: 00270 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 00271 case 1: 00272 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 00273 } 00274 00275 return c; 00276 } 00277 00278 00279 00280 template<typename Iterator> 00281 static Iterator encode(code_point value,Iterator out) 00282 { 00283 if(value <= 0x7F) { 00284 *out++ = static_cast<char_type>(value); 00285 } 00286 else if(value <= 0x7FF) { 00287 *out++ = static_cast<char_type>((value >> 6) | 0xC0); 00288 *out++ = static_cast<char_type>((value & 0x3F) | 0x80); 00289 } 00290 else if(BOOSTER_LOCALE_LIKELY(value <= 0xFFFF)) { 00291 *out++ = static_cast<char_type>((value >> 12) | 0xE0); 00292 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); 00293 *out++ = static_cast<char_type>((value & 0x3F) | 0x80); 00294 } 00295 else { 00296 *out++ = static_cast<char_type>((value >> 18) | 0xF0); 00297 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80); 00298 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); 00299 *out++ = static_cast<char_type>((value & 0x3F) | 0x80); 00300 } 00301 return out; 00302 } 00303 }; // utf8 00304 00305 
template<typename CharType> 00306 struct utf_traits<CharType,2> { 00307 typedef CharType char_type; 00308 00309 // See RFC 2781 00310 static bool is_first_surrogate(uint16_t x) 00311 { 00312 return 0xD800 <=x && x<= 0xDBFF; 00313 } 00314 static bool is_second_surrogate(uint16_t x) 00315 { 00316 return 0xDC00 <=x && x<= 0xDFFF; 00317 } 00318 static code_point combine_surrogate(uint16_t w1,uint16_t w2) 00319 { 00320 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; 00321 } 00322 static int trail_length(char_type c) 00323 { 00324 if(is_first_surrogate(c)) 00325 return 1; 00326 if(is_second_surrogate(c)) 00327 return -1; 00328 return 0; 00329 } 00333 static bool is_trail(char_type c) 00334 { 00335 return is_second_surrogate(c); 00336 } 00340 static bool is_lead(char_type c) 00341 { 00342 return !is_second_surrogate(c); 00343 } 00344 00345 template<typename It> 00346 static code_point decode(It ¤t,It last) 00347 { 00348 if(BOOSTER_LOCALE_UNLIKELY(current == last)) 00349 return incomplete; 00350 uint16_t w1=*current++; 00351 if(BOOSTER_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) { 00352 return w1; 00353 } 00354 if(w1 > 0xDBFF) 00355 return illegal; 00356 if(current==last) 00357 return incomplete; 00358 uint16_t w2=*current++; 00359 if(w2 < 0xDC00 || 0xDFFF < w2) 00360 return illegal; 00361 return combine_surrogate(w1,w2); 00362 } 00363 template<typename It> 00364 static code_point decode_valid(It ¤t) 00365 { 00366 uint16_t w1=*current++; 00367 if(BOOSTER_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) { 00368 return w1; 00369 } 00370 uint16_t w2=*current++; 00371 return combine_surrogate(w1,w2); 00372 } 00373 00374 static const int max_width = 2; 00375 static int width(code_point u) 00376 { 00377 return u>=0x10000 ? 
2 : 1; 00378 } 00379 template<typename It> 00380 static It encode(code_point u,It out) 00381 { 00382 if(BOOSTER_LOCALE_LIKELY(u<=0xFFFF)) { 00383 *out++ = static_cast<char_type>(u); 00384 } 00385 else { 00386 u -= 0x10000; 00387 *out++ = static_cast<char_type>(0xD800 | (u>>10)); 00388 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF)); 00389 } 00390 return out; 00391 } 00392 }; // utf16; 00393 00394 00395 template<typename CharType> 00396 struct utf_traits<CharType,4> { 00397 typedef CharType char_type; 00398 static int trail_length(char_type c) 00399 { 00400 if(is_valid_codepoint(c)) 00401 return 0; 00402 return -1; 00403 } 00404 static bool is_trail(char_type /*c*/) 00405 { 00406 return false; 00407 } 00408 static bool is_lead(char_type /*c*/) 00409 { 00410 return true; 00411 } 00412 00413 template<typename It> 00414 static code_point decode_valid(It ¤t) 00415 { 00416 return *current++; 00417 } 00418 00419 template<typename It> 00420 static code_point decode(It ¤t,It last) 00421 { 00422 if(BOOSTER_LOCALE_UNLIKELY(current == last)) 00423 return booster::locale::utf::incomplete; 00424 code_point c=*current++; 00425 if(BOOSTER_LOCALE_UNLIKELY(!is_valid_codepoint(c))) 00426 return booster::locale::utf::illegal; 00427 return c; 00428 } 00429 static const int max_width = 1; 00430 static int width(code_point /*u*/) 00431 { 00432 return 1; 00433 } 00434 template<typename It> 00435 static It encode(code_point u,It out) 00436 { 00437 *out++ = static_cast<char_type>(u); 00438 return out; 00439 } 00440 00441 }; // utf32 00442 00443 #endif 00444 00445 00446 } // utf 00447 } // locale 00448 } // boost 00449 00450 00451 #endif 00452 00453 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 00454