Boost.Nowide
utf8_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP
9 #define BOOST_NOWIDE_UTF8_CODECVT_HPP
10 
#include <boost/locale/utf.hpp>
#include <boost/cstdint.hpp>
#include <boost/static_assert.hpp>
#include <cstring>
#include <locale>
15 
16 namespace boost {
17 namespace nowide {
18 
19 //
20 // Make sure that mbstate_t can hold the 16 bits of UTF-16 conversion state
21 //
22 BOOST_STATIC_ASSERT(sizeof(std::mbstate_t)>=2);
23 
24 #if defined _MSC_VER && _MSC_VER < 1700
25 // MSVC's do_length is non-standard: it counts wide characters instead of narrow ones and does not change the mbstate
26 #define BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
27 #endif
28 
// Primary template: only declared here. The implementation is chosen by the
// partial specializations on CharSize (2 for UTF-16, 4 for UTF-32), which
// defaults to sizeof(CharType).
template<typename CharType,int CharSize=sizeof(CharType)>
class utf8_codecvt;
31 
32 template<typename CharType>
33 class utf8_codecvt<CharType,2> : public std::codecvt<CharType,char,std::mbstate_t>
34 {
35 public:
36  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType,char,std::mbstate_t>(refs)
37  {
38  }
39 protected:
40 
41  typedef CharType uchar;
42 
43  virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const
44  {
45  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&s);
46 #ifdef DEBUG_CODECVT
47  std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
48 #endif
49  if(state != 0)
50  return std::codecvt_base::error;
51  next=from;
52  return std::codecvt_base::ok;
53  }
54  virtual int do_encoding() const throw()
55  {
56  return 0;
57  }
58  virtual int do_max_length() const throw()
59  {
60  return 4;
61  }
62  virtual bool do_always_noconv() const throw()
63  {
64  return false;
65  }
66 
67  virtual int
68  do_length( std::mbstate_t
69  #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
70  const
71  #endif
72  &std_state,
73  char const *from,
74  char const *from_end,
75  size_t max) const
76  {
77  #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
78  char const *save_from = from;
79  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
80  #else
81  size_t save_max = max;
82  boost::uint16_t state = *reinterpret_cast<boost::uint16_t const *>(&std_state);
83  #endif
84  while(max > 0 && from < from_end){
85  char const *prev_from = from;
86  boost::uint32_t ch=boost::locale::utf::utf_traits<char>::decode(from,from_end);
87  if(ch==boost::locale::utf::incomplete || ch==boost::locale::utf::illegal) {
88  from = prev_from;
89  break;
90  }
91  max --;
92  if(ch > 0xFFFF) {
93  if(state == 0) {
94  from = prev_from;
95  state = 1;
96  }
97  else {
98  state = 0;
99  }
100  }
101  }
102  #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
103  return from - save_from;
104  #else
105  return save_max - max;
106  #endif
107  }
108 
109 
110  virtual std::codecvt_base::result
111  do_in( std::mbstate_t &std_state,
112  char const *from,
113  char const *from_end,
114  char const *&from_next,
115  uchar *to,
116  uchar *to_end,
117  uchar *&to_next) const
118  {
119  std::codecvt_base::result r=std::codecvt_base::ok;
120 
121  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
122  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
123  //
124  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd
125  // and first pair is written, but no input consumed
126  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
127  while(to < to_end && from < from_end)
128  {
129 #ifdef DEBUG_CODECVT
130  std::cout << "Entering IN--------------" << std::endl;
131  std::cout << "State " << std::hex << state <<std::endl;
132  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
133 #endif
134  char const *from_saved = from;
135 
136  uint32_t ch=boost::locale::utf::utf_traits<char>::decode(from,from_end);
137 
138  if(ch==boost::locale::utf::illegal) {
139  from = from_saved;
140  r=std::codecvt_base::error;
141  break;
142  }
143  if(ch==boost::locale::utf::incomplete) {
144  from = from_saved;
145  r=std::codecvt_base::partial;
146  break;
147  }
148  // Normal codepoints go direcly to stream
149  if(ch <= 0xFFFF) {
150  *to++=ch;
151  }
152  else {
153  // for other codepoints we do following
154  //
155  // 1. We can't consume our input as we may find ourselfs
156  // in state where all input consumed but not all output written,i.e. only
157  // 1st pair is written
158  // 2. We only write first pair and mark this in the state, we also revert back
159  // the from pointer in order to make sure this codepoint would be read
160  // once again and then we would consume our input together with writing
161  // second surrogate pair
162  ch-=0x10000;
163  boost::uint16_t vh = ch >> 10;
164  boost::uint16_t vl = ch & 0x3FF;
165  boost::uint16_t w1 = vh + 0xD800;
166  boost::uint16_t w2 = vl + 0xDC00;
167  if(state == 0) {
168  from = from_saved;
169  *to++ = w1;
170  state = 1;
171  }
172  else {
173  *to++ = w2;
174  state = 0;
175  }
176  }
177  }
178  from_next=from;
179  to_next=to;
180  if(r == std::codecvt_base::ok && (from!=from_end || state!=0))
181  r = std::codecvt_base::partial;
182 #ifdef DEBUG_CODECVT
183  std::cout << "Returning ";
184  switch(r) {
185  case std::codecvt_base::ok:
186  std::cout << "ok" << std::endl;
187  break;
188  case std::codecvt_base::partial:
189  std::cout << "partial" << std::endl;
190  break;
191  case std::codecvt_base::error:
192  std::cout << "error" << std::endl;
193  break;
194  default:
195  std::cout << "other" << std::endl;
196  break;
197  }
198  std::cout << "State " << std::hex << state <<std::endl;
199  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
200 #endif
201  return r;
202  }
203 
204  virtual std::codecvt_base::result
205  do_out( std::mbstate_t &std_state,
206  uchar const *from,
207  uchar const *from_end,
208  uchar const *&from_next,
209  char *to,
210  char *to_end,
211  char *&to_next) const
212  {
213  std::codecvt_base::result r=std::codecvt_base::ok;
214  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
215  // according to standard. We assume that sizeof(mbstate_t) >=2 in order
216  // to be able to store first observerd surrogate pair
217  //
218  // State: state!=0 - a first surrogate pair was observerd (state = first pair),
219  // we expect the second one to come and then zero the state
221  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
222  while(to < to_end && from < from_end)
223  {
224 #ifdef DEBUG_CODECVT
225  std::cout << "Entering OUT --------------" << std::endl;
226  std::cout << "State " << std::hex << state <<std::endl;
227  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
228 #endif
229  boost::uint32_t ch=0;
230  if(state != 0) {
231  // if the state idecates that 1st surrogate pair was written
232  // we should make sure that the second one that comes is actually
233  // second surrogate
234  boost::uint16_t w1 = state;
235  boost::uint16_t w2 = *from;
236  // we don't forward from as writing may fail to incomplete or
237  // partial conversion
238  if(0xDC00 <= w2 && w2<=0xDFFF) {
239  boost::uint16_t vh = w1 - 0xD800;
240  boost::uint16_t vl = w2 - 0xDC00;
241  ch=((uint32_t(vh) << 10) | vl) + 0x10000;
242  }
243  else {
244  // Invalid surrogate
245  r=std::codecvt_base::error;
246  break;
247  }
248  }
249  else {
250  ch = *from;
251  if(0xD800 <= ch && ch<=0xDBFF) {
252  // if this is a first surrogate pair we put
253  // it into the state and consume it, note we don't
254  // go forward as it should be illegal so we increase
255  // the from pointer manually
256  state = ch;
257  from++;
258  continue;
259  }
260  else if(0xDC00 <= ch && ch<=0xDFFF) {
261  // if we observe second surrogate pair and
262  // first only may be expected we should break from the loop with error
263  // as it is illegal input
264  r=std::codecvt_base::error;
265  break;
266  }
267  }
268  if(!boost::locale::utf::is_valid_codepoint(ch)) {
269  r=std::codecvt_base::error;
270  break;
271  }
272  int len = boost::locale::utf::utf_traits<char>::width(ch);
273  if(to_end - to < len) {
274  r=std::codecvt_base::partial;
275  break;
276  }
277  to = boost::locale::utf::utf_traits<char>::encode(ch,to);
278  state = 0;
279  from++;
280  }
281  from_next=from;
282  to_next=to;
283  if(r==std::codecvt_base::ok && from!=from_end)
284  r = std::codecvt_base::partial;
285 #ifdef DEBUG_CODECVT
286  std::cout << "Returning ";
287  switch(r) {
288  case std::codecvt_base::ok:
289  std::cout << "ok" << std::endl;
290  break;
291  case std::codecvt_base::partial:
292  std::cout << "partial" << std::endl;
293  break;
294  case std::codecvt_base::error:
295  std::cout << "error" << std::endl;
296  break;
297  default:
298  std::cout << "other" << std::endl;
299  break;
300  }
301  std::cout << "State " << std::hex << state <<std::endl;
302  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
303 #endif
304  return r;
305  }
306 
307 };
308 
309 template<typename CharType>
310 class utf8_codecvt<CharType,4> : public std::codecvt<CharType,char,std::mbstate_t>
311 {
312 public:
313  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType,char,std::mbstate_t>(refs)
314  {
315  }
316 protected:
317 
318  typedef CharType uchar;
319 
320  virtual std::codecvt_base::result do_unshift(std::mbstate_t &/*s*/,char *from,char * /*to*/,char *&next) const
321  {
322  next=from;
323  return std::codecvt_base::ok;
324  }
325  virtual int do_encoding() const throw()
326  {
327  return 0;
328  }
329  virtual int do_max_length() const throw()
330  {
331  return 4;
332  }
333  virtual bool do_always_noconv() const throw()
334  {
335  return false;
336  }
337 
338  virtual int
339  do_length( std::mbstate_t
340  #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
341  const
342  #endif
343  &/*state*/,
344  char const *from,
345  char const *from_end,
346  size_t max) const
347  {
348  #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
349  char const *start_from = from;
350  #else
351  size_t save_max = max;
352  #endif
353 
354  while(max > 0 && from < from_end){
355  char const *save_from = from;
356  boost::uint32_t ch=boost::locale::utf::utf_traits<char>::decode(from,from_end);
357  if(ch==boost::locale::utf::incomplete || ch==boost::locale::utf::illegal) {
358  from = save_from;
359  break;
360  }
361  max--;
362  }
363  #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
364  return from - start_from;
365  #else
366  return save_max - max;
367  #endif
368  }
369 
370 
371  virtual std::codecvt_base::result
372  do_in( std::mbstate_t &/*state*/,
373  char const *from,
374  char const *from_end,
375  char const *&from_next,
376  uchar *to,
377  uchar *to_end,
378  uchar *&to_next) const
379  {
380  std::codecvt_base::result r=std::codecvt_base::ok;
381 
382  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
383  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
384  //
385  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd
386  // and first pair is written, but no input consumed
387  while(to < to_end && from < from_end)
388  {
389 #ifdef DEBUG_CODECVT
390  std::cout << "Entering IN--------------" << std::endl;
391  std::cout << "State " << std::hex << state <<std::endl;
392  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
393 #endif
394  char const *from_saved = from;
395 
396  uint32_t ch=boost::locale::utf::utf_traits<char>::decode(from,from_end);
397 
398  if(ch==boost::locale::utf::illegal) {
399  r=std::codecvt_base::error;
400  from = from_saved;
401  break;
402  }
403  if(ch==boost::locale::utf::incomplete) {
404  r=std::codecvt_base::partial;
405  from=from_saved;
406  break;
407  }
408  *to++=ch;
409  }
410  from_next=from;
411  to_next=to;
412  if(r == std::codecvt_base::ok && from!=from_end)
413  r = std::codecvt_base::partial;
414 #ifdef DEBUG_CODECVT
415  std::cout << "Returning ";
416  switch(r) {
417  case std::codecvt_base::ok:
418  std::cout << "ok" << std::endl;
419  break;
420  case std::codecvt_base::partial:
421  std::cout << "partial" << std::endl;
422  break;
423  case std::codecvt_base::error:
424  std::cout << "error" << std::endl;
425  break;
426  default:
427  std::cout << "other" << std::endl;
428  break;
429  }
430  std::cout << "State " << std::hex << state <<std::endl;
431  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
432 #endif
433  return r;
434  }
435 
436  virtual std::codecvt_base::result
437  do_out( std::mbstate_t &std_state,
438  uchar const *from,
439  uchar const *from_end,
440  uchar const *&from_next,
441  char *to,
442  char *to_end,
443  char *&to_next) const
444  {
445  std::codecvt_base::result r=std::codecvt_base::ok;
446  while(to < to_end && from < from_end)
447  {
448 #ifdef DEBUG_CODECVT
449  std::cout << "Entering OUT --------------" << std::endl;
450  std::cout << "State " << std::hex << state <<std::endl;
451  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
452 #endif
453  boost::uint32_t ch=0;
454  ch = *from;
455  if(!boost::locale::utf::is_valid_codepoint(ch)) {
456  r=std::codecvt_base::error;
457  break;
458  }
459  int len = boost::locale::utf::utf_traits<char>::width(ch);
460  if(to_end - to < len) {
461  r=std::codecvt_base::partial;
462  break;
463  }
464  to = boost::locale::utf::utf_traits<char>::encode(ch,to);
465  from++;
466  }
467  from_next=from;
468  to_next=to;
469  if(r==std::codecvt_base::ok && from!=from_end)
470  r = std::codecvt_base::partial;
471 #ifdef DEBUG_CODECVT
472  std::cout << "Returning ";
473  switch(r) {
474  case std::codecvt_base::ok:
475  std::cout << "ok" << std::endl;
476  break;
477  case std::codecvt_base::partial:
478  std::cout << "partial" << std::endl;
479  break;
480  case std::codecvt_base::error:
481  std::cout << "error" << std::endl;
482  break;
483  default:
484  std::cout << "other" << std::endl;
485  break;
486  }
487  std::cout << "State " << std::hex << state <<std::endl;
488  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
489 #endif
490  return r;
491  }
492 };
493 
494 } // nowide
495 } // namespace boost
496 
497 #endif
498 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
Definition: args.hpp:18
Definition: utf8_codecvt.hpp:30