CppCMS
booster/locale/utf.h
00001 //
00002 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
00003 //
00004 //  Distributed under the Boost Software License, Version 1.0. (See
00005 //  accompanying file LICENSE_1_0.txt or copy at
00006 //  http://www.boost.org/LICENSE_1_0.txt)
00007 //
00008 #ifndef BOOSTER_LOCALE_UTF_H_INCLUDED
00009 #define BOOSTER_LOCALE_UTF_H_INCLUDED
00010 
00011 #include <booster/cstdint.h>
00012 
00013 namespace booster {
00014 namespace locale {
00020 namespace utf {
// Branch-prediction hints for conditions.
//
// The argument is normalised with !! before it is handed to
// __builtin_expect: the builtin compares its first argument against the
// literal 1 (or 0), so a truthy value other than 1 -- for example the result
// of (flags & 4) -- would otherwise contradict the LIKELY hint, and a pointer
// condition would not convert to the builtin's integer parameter at all.
// Both branches of the #ifdef yield exactly 0 or 1, so the macros behave the
// same way on every compiler.
#ifdef __GNUC__
#   define BOOSTER_LOCALE_LIKELY(x)   __builtin_expect(!!(x),1)
#   define BOOSTER_LOCALE_UNLIKELY(x) __builtin_expect(!!(x),0)
#else
#   define BOOSTER_LOCALE_LIKELY(x)   (!!(x))
#   define BOOSTER_LOCALE_UNLIKELY(x) (!!(x))
#endif
00029 
00030 
/// Type used to hold a single Unicode code point (32-bit unsigned).
typedef uint32_t code_point;

/// Sentinel returned by the decoders in this header when the input is not
/// a valid sequence or does not denote a valid code point.
static const code_point illegal = 0xFFFFFFFFU;

/// Sentinel returned by the decoders in this header when the input range
/// ends before a complete sequence has been read.
static const code_point incomplete = 0xFFFFFFFEU;
00045 
00049     inline bool is_valid_codepoint(code_point v)
00050     {
00051         if(v>0x10FFFF)
00052             return false;
00053         if(0xD800 <=v && v<= 0xDFFF) // surragates
00054             return false;
00055         return true;
00056     }
00057 
00058     #ifdef BOOSTER_LOCALE_DOXYGEN
00059 
00060 
00061 
00062     template<typename CharType,int size=sizeof(CharType)>
00063     struct utf_traits {
00067         typedef CharType char_type;
00082         template<typename Iterator>
00083         static code_point decode(Iterator &p,Iterator e);
00084 
00092         static const int max_width;
00099         static int width(code_point value);
00100 
00106         static int trail_length(char_type c);
00110         static bool is_trail(char_type c);
00114         static bool is_lead(char_type c);
00115 
00126         template<typename Iterator>
00127         static Iterator encode(code_point value,Iterator out);
00133         template<typename Iterator>
00134         static code_point decode_valid(Iterator &p);
00135     };
00136     
00137     #else
00138 
/// Primary template: declared only.  The members are provided by the
/// partial specializations for size 1, 2 and 4; the default argument picks
/// one of them from sizeof(CharType).
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits;
00141 
00142     template<typename CharType>
00143     struct utf_traits<CharType,1> {
00144 
00145         typedef CharType char_type;
00146         
00147         static int trail_length(char_type ci) 
00148         {
00149             unsigned char c = ci;
00150             if(c < 128)
00151                 return 0;
00152             if(BOOSTER_LOCALE_UNLIKELY(c < 194))
00153                 return -1;
00154             if(c < 224)
00155                 return 1;
00156             if(c < 240)
00157                 return 2;
00158             if(BOOSTER_LOCALE_LIKELY(c <=244))
00159                 return 3;
00160             return -1;
00161         }
00162         
00163         static const int max_width = 4;
00164 
00165         static int width(code_point value)
00166         {
00167             if(value <=0x7F) {
00168                 return 1;
00169             }
00170             else if(value <=0x7FF) {
00171                 return 2;
00172             }
00173             else if(BOOSTER_LOCALE_LIKELY(value <=0xFFFF)) {
00174                 return 3;
00175             }
00176             else {
00177                 return 4;
00178             }
00179         }
00180 
00181         static bool is_trail(char_type ci)
00182         {
00183             unsigned char c=ci;
00184             return (c & 0xC0)==0x80;
00185         }
00186 
00187         static bool is_lead(char_type ci)
00188         {
00189             return !is_trail(ci);
00190         }
00191         
00192         template<typename Iterator>
00193         static code_point decode(Iterator &p,Iterator e)
00194         {
00195             if(BOOSTER_LOCALE_UNLIKELY(p==e))
00196                 return incomplete;
00197 
00198             unsigned char lead = *p++;
00199 
00200             // First byte is fully validated here
00201             int trail_size = trail_length(lead);
00202 
00203             if(BOOSTER_LOCALE_UNLIKELY(trail_size < 0))
00204                 return illegal;
00205 
00206             //
00207             // Ok as only ASCII may be of size = 0
00208             // also optimize for ASCII text
00209             //
00210             if(trail_size == 0)
00211                 return lead;
00212             
00213             code_point c = lead & ((1<<(6-trail_size))-1);
00214 
00215             // Read the rest
00216             unsigned char tmp;
00217             switch(trail_size) {
00218             case 3:
00219                 if(BOOSTER_LOCALE_UNLIKELY(p==e))
00220                     return incomplete;
00221                 tmp = *p++;
00222                 c = (c << 6) | ( tmp & 0x3F);
00223             case 2:
00224                 if(BOOSTER_LOCALE_UNLIKELY(p==e))
00225                     return incomplete;
00226                 tmp = *p++;
00227                 c = (c << 6) | ( tmp & 0x3F);
00228             case 1:
00229                 if(BOOSTER_LOCALE_UNLIKELY(p==e))
00230                     return incomplete;
00231                 tmp = *p++;
00232                 c = (c << 6) | ( tmp & 0x3F);
00233             }
00234 
00235             // Check code point validity: no surrogates and
00236             // valid range
00237             if(BOOSTER_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
00238                 return illegal;
00239 
00240             // make sure it is the most compact representation
00241             if(BOOSTER_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
00242                 return illegal;
00243 
00244             return c;
00245 
00246         }
00247         
00248         template<typename Iterator>
00249         static code_point decode_valid(Iterator &p)
00250         {
00251             unsigned char lead = *p++;
00252             if(lead < 192)
00253                 return lead;
00254 
00255             int trail_size;
00256 
00257             if(lead < 224)
00258                 trail_size = 1;
00259             else if(BOOSTER_LOCALE_LIKELY(lead < 240)) // non-BMP rare
00260                 trail_size = 2;
00261             else
00262                 trail_size = 3;
00263             
00264             code_point c = lead & ((1<<(6-trail_size))-1);
00265 
00266             switch(trail_size) {
00267             case 3:
00268                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00269             case 2:
00270                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00271             case 1:
00272                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00273             }
00274 
00275             return c;
00276         }
00277 
00278 
00279 
00280         template<typename Iterator>
00281         static Iterator encode(code_point value,Iterator out)
00282         {
00283             if(value <= 0x7F) {
00284                 *out++ = static_cast<char_type>(value);
00285             }
00286             else if(value <= 0x7FF) {
00287                 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
00288                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
00289             }
00290             else if(BOOSTER_LOCALE_LIKELY(value <= 0xFFFF)) {
00291                 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
00292                 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
00293                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
00294             }
00295             else {
00296                 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
00297                 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
00298                 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
00299                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
00300             }
00301             return out;
00302         }
00303     }; // utf8
00304 
00305     template<typename CharType>
00306     struct utf_traits<CharType,2> {
00307         typedef CharType char_type;
00308 
00309         // See RFC 2781
00310         static bool is_first_surrogate(uint16_t x)
00311         {
00312             return 0xD800 <=x && x<= 0xDBFF;
00313         }
00314         static bool is_second_surrogate(uint16_t x)
00315         {
00316             return 0xDC00 <=x && x<= 0xDFFF;
00317         }
00318         static code_point combine_surrogate(uint16_t w1,uint16_t w2)
00319         {
00320             return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
00321         }
00322         static int trail_length(char_type c)
00323         {
00324             if(is_first_surrogate(c))
00325                 return 1;
00326             if(is_second_surrogate(c))
00327                 return -1;
00328             return 0;
00329         }
00333         static bool is_trail(char_type c)
00334         {
00335             return is_second_surrogate(c);
00336         }
00340         static bool is_lead(char_type c)
00341         {
00342             return !is_second_surrogate(c);
00343         }
00344 
00345         template<typename It>
00346         static code_point decode(It &current,It last)
00347         {
00348             if(BOOSTER_LOCALE_UNLIKELY(current == last))
00349                 return incomplete;
00350             uint16_t w1=*current++;
00351             if(BOOSTER_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
00352                 return w1;
00353             }
00354             if(w1 > 0xDBFF)
00355                 return illegal;
00356             if(current==last)
00357                 return incomplete;
00358             uint16_t w2=*current++;
00359             if(w2 < 0xDC00 || 0xDFFF < w2)
00360                 return illegal;
00361             return combine_surrogate(w1,w2);
00362         }
00363         template<typename It>
00364         static code_point decode_valid(It &current)
00365         {
00366             uint16_t w1=*current++;
00367             if(BOOSTER_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
00368                 return w1;
00369             }
00370             uint16_t w2=*current++;
00371             return combine_surrogate(w1,w2);
00372         }
00373 
00374         static const int max_width = 2;
00375         static int width(code_point u)
00376         {
00377             return u>=0x10000 ? 2 : 1;
00378         }
00379         template<typename It>
00380         static It encode(code_point u,It out)
00381         {
00382             if(BOOSTER_LOCALE_LIKELY(u<=0xFFFF)) {
00383                 *out++ = static_cast<char_type>(u);
00384             }
00385             else {
00386                 u -= 0x10000;
00387                 *out++ = static_cast<char_type>(0xD800 | (u>>10));
00388                 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
00389             }
00390             return out;
00391         }
00392     }; // utf16;
00393 
00394         
00395     template<typename CharType>
00396     struct utf_traits<CharType,4> {
00397         typedef CharType char_type;
00398         static int trail_length(char_type c)
00399         {
00400             if(is_valid_codepoint(c))
00401                 return 0;
00402             return -1;
00403         }
00404         static bool is_trail(char_type /*c*/)
00405         {
00406             return false;
00407         }
00408         static bool is_lead(char_type /*c*/)
00409         {
00410             return true;
00411         }
00412 
00413         template<typename It>
00414         static code_point decode_valid(It &current)
00415         {
00416             return *current++;
00417         }
00418 
00419         template<typename It>
00420         static code_point decode(It &current,It last)
00421         {
00422             if(BOOSTER_LOCALE_UNLIKELY(current == last))
00423                 return booster::locale::utf::incomplete;
00424             code_point c=*current++;
00425             if(BOOSTER_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
00426                 return booster::locale::utf::illegal;
00427             return c;
00428         }
00429         static const int max_width = 1;
00430         static int width(code_point /*u*/)
00431         {
00432             return 1;
00433         }
00434         template<typename It>
00435         static It encode(code_point u,It out)
00436         {
00437             *out++ = static_cast<char_type>(u);
00438             return out;
00439         }
00440 
00441     }; // utf32
00442 
00443     #endif
00444 
00445 
00446 } // utf
00447 } // locale
00448 } // booster
00449 
00450 
00451 #endif
00452 
00453 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
00454