CppCMS
generic_codecvt.h
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #ifndef BOOSTER_LOCALE_GENERIC_CODECVT_HPP
9 #define BOOSTER_LOCALE_GENERIC_CODECVT_HPP
10 
11 #include <booster/locale/utf.h>
12 #include <booster/cstdint.h>
13 #include <locale>
14 
15 namespace booster {
16 namespace locale {
17 
18 
19 #if defined(_MSC_VER) && _MSC_VER < 1700
20 // Before MSVC 11 (VS 2012, _MSC_VER 1700) do_length is non-standard: it counts wide characters instead of narrow ones and does not change the mbstate
21 #define BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST
22 #endif
23 
28 public:
35  };
36 };
37 
132 template<typename CharType,typename CodecvtImpl,int CharSize=sizeof(CharType)>
134 
143 template<typename CharType,typename CodecvtImpl>
144 class generic_codecvt<CharType,CodecvtImpl,2> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
145 {
146 public:
147 
148  typedef CharType uchar;
149 
/// Constructs the facet for the 2-byte (UTF-16) specialization.
/// \param refs forwarded unchanged to the std::codecvt base-class constructor.
150  generic_codecvt(size_t refs = 0) :
151  std::codecvt<CharType,char,std::mbstate_t>(refs)
152  {
153  }
/// Returns this object viewed as the concrete CodecvtImpl.
/// The static_cast downcast is only well-formed when CodecvtImpl derives from
/// this class, so CodecvtImpl is expected to be the most-derived facet type.
154  CodecvtImpl const &implementation() const
155  {
156  return *static_cast<CodecvtImpl const *>(this);
157  }
158 
159 protected:
160 
161 
/// Override of std::codecvt::do_unshift; this facet never writes any output here.
///
/// The first 16 bits of the mbstate_t are reinterpreted as the facet's state word
/// (the same word do_in/do_out of this class use for surrogate-pair bookkeeping).
/// \return error when that word is non-zero (\a next is left unassigned on that
///         path); otherwise \a next is set to \a from and ok is returned.
/// Note: in the standard signature the three pointers are the output range
/// (to, to_end, to_next); here they are named from, (unnamed) and next.
162  virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const
163  {
164  booster::uint16_t &state = *reinterpret_cast<booster::uint16_t *>(&s);
165 #ifdef DEBUG_CODECVT
     // NOTE(review): std::cout needs <iostream>, which is not among the includes visible in this file.
166  std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
167 #endif
     // a pending (half-processed) surrogate pair cannot be flushed
168  if(state != 0)
169  return std::codecvt_base::error;
170  next=from;
171  return std::codecvt_base::ok;
172  }
/// Override of std::codecvt::do_encoding. Always returns 0, which in the
/// std::codecvt::encoding contract means the number of external characters per
/// internal character is not a constant.
173  virtual int do_encoding() const throw()
174  {
175  return 0;
176  }
/// Override of std::codecvt::do_max_length.
/// \return whatever CodecvtImpl::max_encoding_length() reports for the concrete implementation.
177  virtual int do_max_length() const throw()
178  {
179  return implementation().max_encoding_length();
180  }
/// Override of std::codecvt::do_always_noconv. Always returns false, i.e. the
/// facet never reports its conversions as identity conversions.
181  virtual bool do_always_noconv() const throw()
182  {
183  return false;
184  }
185 
/// Override of std::codecvt::do_length for UTF-16 output.
///
/// Walks [from, from_end) decoding code points with CodecvtImpl::to_unicode until
/// either \a max output units have been counted or decoding stops.
/// A code point above 0xFFFF costs two loop iterations (two UTF-16 units): the
/// first rewinds \a from and sets the state word to 1, the second keeps the
/// input consumed and resets the state word to 0.
///
/// \return without BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST: the number of input
///         chars consumed (from - save_from); with it (old MSVC): the number of
///         output units counted (save_max - max).
186  virtual int
187  do_length( std::mbstate_t
188  #ifdef BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST
189  const
190  #endif
191  &std_state,
192  char const *from,
193  char const *from_end,
194  size_t max) const
195  {
196  #ifndef BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST
197  char const *save_from = from;
     // standard signature: the state word aliases the caller's mbstate_t and is updated in place
198  booster::uint16_t &state = *reinterpret_cast<booster::uint16_t *>(&std_state);
199  #else
200  size_t save_max = max;
     // const mbstate_t variant: work on a local copy, the caller's state is not modified
201  booster::uint16_t state = *reinterpret_cast<booster::uint16_t const *>(&std_state);
202  #endif
203 
204  typedef typename CodecvtImpl::state_type state_type;
205  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
206  while(max > 0 && from < from_end){
207  char const *prev_from = from;
     // presumably to_unicode advances `from` past the decoded sequence (it is rewound below via prev_from)
208  booster::uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
     // NOTE(review): the listing jumps from line 208 to 210; the statement that opens the
     // block closed at line 212 is missing from this copy. Presumably it tests ch against
     // utf::illegal / utf::incomplete - restore it from upstream before compiling.
210  from = prev_from;
211  break;
212  }
213  max --;
214  if(ch > 0xFFFF) {
215  if(state == 0) {
     // first unit of a surrogate pair counted: do not consume the input yet
216  from = prev_from;
217  state = 1;
218  }
219  else {
     // second unit counted: the input stays consumed
220  state = 0;
221  }
222  }
223  }
224  #ifndef BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST
225  return from - save_from;
226  #else
227  return save_max - max;
228  #endif
229  }
230 
231 
/// Override of std::codecvt::do_in: decodes narrow input into UTF-16 units.
///
/// Each iteration decodes one code point with CodecvtImpl::to_unicode. Code points
/// up to 0xFFFF are stored as a single unit; larger ones are split into a
/// surrogate pair written over two iterations, tracked through the state word
/// kept in the first 16 bits of \a std_state.
///
/// \param from_next receives the position of the first unconsumed input char.
/// \param to_next   receives the position one past the last unit written.
/// \return error / partial as set inside the loop; otherwise ok, downgraded to
///         partial when input remains or a surrogate pair is half-written.
232  virtual std::codecvt_base::result
233  do_in( std::mbstate_t &std_state,
234  char const *from,
235  char const *from_end,
236  char const *&from_next,
237  uchar *to,
238  uchar *to_end,
239  uchar *&to_next) const
240  {
241  std::codecvt_base::result r=std::codecvt_base::ok;
242 
243  // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
244  // according to the standard. We use it to keep a 0/1 flag for surrogate pair writing:
245  //
246  // if 0, no code point above 0xFFFF is pending; if 1, a code point above 0xFFFF was observed
247  // and the first surrogate is written, but no input was consumed
248  booster::uint16_t &state = *reinterpret_cast<booster::uint16_t *>(&std_state);
249  typedef typename CodecvtImpl::state_type state_type;
250  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
251  while(to < to_end && from < from_end)
252  {
253 #ifdef DEBUG_CODECVT
254  std::cout << "Entering IN--------------" << std::endl;
255  std::cout << "State " << std::hex << state <<std::endl;
256  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
257 #endif
258  char const *from_saved = from;
259 
260  uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
261 
     // NOTE(review): listing lines 262 and 267 are missing from this copy; they open the two
     // blocks below (closed at lines 266 and 271). Presumably they test ch against
     // utf::illegal and utf::incomplete respectively - restore them from upstream.
263  from = from_saved;
264  r=std::codecvt_base::error;
265  break;
266  }
268  from = from_saved;
269  r=std::codecvt_base::partial;
270  break;
271  }
272  // Normal code points go directly to the stream
273  if(ch <= 0xFFFF) {
274  *to++=ch;
275  }
276  else {
277  // for other code points we do the following
278  //
279  // 1. We can't consume our input as we may find ourselves
280  // in a state where all input is consumed but not all output is written, i.e. only
281  // the first surrogate is written
282  // 2. We only write the first surrogate and mark this in the state; we also revert
283  // the from pointer in order to make sure this code point is read
284  // once again, and then we consume our input together with writing
285  // the second surrogate
286  ch-=0x10000;
287  booster::uint16_t vh = ch >> 10;
288  booster::uint16_t vl = ch & 0x3FF;
289  booster::uint16_t w1 = vh + 0xD800;
290  booster::uint16_t w2 = vl + 0xDC00;
291  if(state == 0) {
292  from = from_saved;
293  *to++ = w1;
294  state = 1;
295  }
296  else {
297  *to++ = w2;
298  state = 0;
299  }
300  }
301  }
302  from_next=from;
303  to_next=to;
     // leftover input or a half-written surrogate pair means the conversion is not complete
304  if(r == std::codecvt_base::ok && (from!=from_end || state!=0))
305  r = std::codecvt_base::partial;
306 #ifdef DEBUG_CODECVT
307  std::cout << "Returning ";
308  switch(r) {
309  case std::codecvt_base::ok:
310  std::cout << "ok" << std::endl;
311  break;
312  case std::codecvt_base::partial:
313  std::cout << "partial" << std::endl;
314  break;
315  case std::codecvt_base::error:
316  std::cout << "error" << std::endl;
317  break;
318  default:
319  std::cout << "other" << std::endl;
320  break;
321  }
322  std::cout << "State " << std::hex << state <<std::endl;
323  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
324 #endif
325  return r;
326  }
327 
/// Override of std::codecvt::do_out: encodes UTF-16 units into narrow output.
///
/// A first (high) surrogate is stored in the state word and consumed without
/// producing output; when the matching second (low) surrogate arrives the two are
/// combined into one code point and passed to CodecvtImpl::from_unicode. A low
/// surrogate without a pending high one, or a pending high surrogate followed by
/// anything else, yields error.
///
/// \param from_next receives the position of the first unconsumed input unit.
/// \param to_next   receives the position one past the last char written.
/// \return error / partial as set inside the loop; otherwise ok, downgraded to
///         partial when input remains (a pending state word alone does not
///         change the result).
328  virtual std::codecvt_base::result
329  do_out( std::mbstate_t &std_state,
330  uchar const *from,
331  uchar const *from_end,
332  uchar const *&from_next,
333  char *to,
334  char *to_end,
335  char *&to_next) const
336  {
337  std::codecvt_base::result r=std::codecvt_base::ok;
338  // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
339  // according to the standard. We assume that sizeof(mbstate_t) >=2 in order
340  // to be able to store the first observed surrogate
341  //
342  // State: state!=0 - a first surrogate was observed (state = first surrogate),
343  // we expect the second one to come and then zero the state
     // NOTE(review): listing line 344 is missing from this copy (presumably a comment line).
345  booster::uint16_t &state = *reinterpret_cast<booster::uint16_t *>(&std_state);
346  typedef typename CodecvtImpl::state_type state_type;
347  state_type cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
348  while(to < to_end && from < from_end)
349  {
350 #ifdef DEBUG_CODECVT
351  std::cout << "Entering OUT --------------" << std::endl;
352  std::cout << "State " << std::hex << state <<std::endl;
353  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
354 #endif
355  booster::uint32_t ch=0;
356  if(state != 0) {
357  // if the state indicates that the first surrogate was seen
358  // we should make sure that the unit that comes next is actually
359  // a second surrogate
360  booster::uint16_t w1 = state;
361  booster::uint16_t w2 = *from;
362  // we don't advance from, as writing may fail due to an incomplete or
363  // partial conversion
364  if(0xDC00 <= w2 && w2<=0xDFFF) {
365  booster::uint16_t vh = w1 - 0xD800;
366  booster::uint16_t vl = w2 - 0xDC00;
367  ch=((uint32_t(vh) << 10) | vl) + 0x10000;
368  }
369  else {
370  // Invalid surrogate
371  r=std::codecvt_base::error;
372  break;
373  }
374  }
375  else {
376  ch = *from;
377  if(0xD800 <= ch && ch<=0xDBFF) {
378  // if this is a first surrogate we put
379  // it into the state and consume it; note we don't
380  // pass it on for encoding as it would be illegal, so we increase
381  // the from pointer manually
382  state = ch;
383  from++;
384  continue;
385  }
386  else if(0xDC00 <= ch && ch<=0xDFFF) {
387  // if we observe a second surrogate while
388  // only a first one may be expected we should break from the loop with an error
389  // as it is illegal input
390  r=std::codecvt_base::error;
391  break;
392  }
393  }
     // NOTE(review): listing line 394 is missing from this copy; it opens the block closed at
     // line 397. Presumably it rejects ch when it is not a valid code point - restore from upstream.
395  r=std::codecvt_base::error;
396  break;
397  }
398  booster::uint32_t len = implementation().from_unicode(cvt_state,ch,to,to_end);
     // NOTE(review): listing line 399 is missing from this copy; it opens the block closed at
     // line 402. Given the else-if that follows, presumably it tests len == utf::incomplete.
400  r=std::codecvt_base::partial;
401  break;
402  }
403  else if(len == booster::locale::utf::illegal) {
404  r=std::codecvt_base::error;
405  break;
406  }
407  else
408  to+= len;
     // the code point was written: clear any pending surrogate and consume the current unit
409  state = 0;
410  from++;
411  }
412  from_next=from;
413  to_next=to;
414  if(r==std::codecvt_base::ok && from!=from_end)
415  r = std::codecvt_base::partial;
416 #ifdef DEBUG_CODECVT
417  std::cout << "Returning ";
418  switch(r) {
419  case std::codecvt_base::ok:
420  std::cout << "ok" << std::endl;
421  break;
422  case std::codecvt_base::partial:
423  std::cout << "partial" << std::endl;
424  break;
425  case std::codecvt_base::error:
426  std::cout << "error" << std::endl;
427  break;
428  default:
429  std::cout << "other" << std::endl;
430  break;
431  }
432  std::cout << "State " << std::hex << state <<std::endl;
433  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
434 #endif
435  return r;
436  }
437 
438 };
439 
446 template<typename CharType,typename CodecvtImpl>
447 class generic_codecvt<CharType,CodecvtImpl,4> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
448 {
449 public:
450  typedef CharType uchar;
451 
/// Constructs the facet for the 4-byte (UTF-32) specialization.
/// \param refs forwarded unchanged to the std::codecvt base-class constructor.
452  generic_codecvt(size_t refs = 0) :
453  std::codecvt<CharType,char,std::mbstate_t>(refs)
454  {
455  }
456 
/// Returns this object viewed as the concrete CodecvtImpl.
/// The static_cast downcast is only well-formed when CodecvtImpl derives from
/// this class, so CodecvtImpl is expected to be the most-derived facet type.
457  CodecvtImpl const &implementation() const
458  {
459  return *static_cast<CodecvtImpl const *>(this);
460  }
461 
462 protected:
463 
/// Override of std::codecvt::do_unshift. This specialization ignores the
/// mbstate_t, writes nothing, sets \a next to \a from and always returns ok.
/// Note: in the standard signature the three pointers are the output range
/// (to, to_end, to_next); here they are named from, (unnamed) and next.
464  virtual std::codecvt_base::result do_unshift(std::mbstate_t &/*s*/,char *from,char * /*to*/,char *&next) const
465  {
466  next=from;
467  return std::codecvt_base::ok;
468  }
/// Override of std::codecvt::do_encoding. Always returns 0, which in the
/// std::codecvt::encoding contract means the number of external characters per
/// internal character is not a constant.
469  virtual int do_encoding() const throw()
470  {
471  return 0;
472  }
/// Override of std::codecvt::do_max_length.
/// \return whatever CodecvtImpl::max_encoding_length() reports for the concrete implementation.
473  virtual int do_max_length() const throw()
474  {
475  return implementation().max_encoding_length();
476  }
/// Override of std::codecvt::do_always_noconv. Always returns false, i.e. the
/// facet never reports its conversions as identity conversions.
477  virtual bool do_always_noconv() const throw()
478  {
479  return false;
480  }
481 
/// Override of std::codecvt::do_length for UTF-32 output.
///
/// Decodes code points from [from, from_end) with CodecvtImpl::to_unicode, one
/// output element per code point, until \a max elements have been counted or
/// decoding stops. The mbstate_t argument is not used.
///
/// \return without BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST: the number of input
///         chars consumed (from - start_from); with it (old MSVC): the number of
///         output elements counted (save_max - max).
482  virtual int
483  do_length( std::mbstate_t
484  #ifdef BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST
485  const
486  #endif
487  &/*state*/,
488  char const *from,
489  char const *from_end,
490  size_t max) const
491  {
492  #ifndef BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST
493  char const *start_from = from;
494  #else
495  size_t save_max = max;
496  #endif
497  typedef typename CodecvtImpl::state_type state_type;
498  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
499  while(max > 0 && from < from_end){
500  char const *save_from = from;
     // presumably to_unicode advances `from` past the decoded sequence (it is rewound below via save_from)
501  booster::uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
     // NOTE(review): the listing jumps from line 501 to 503; the statement that opens the
     // block closed at line 505 is missing from this copy. Presumably it tests ch against
     // utf::illegal / utf::incomplete - restore it from upstream before compiling.
503  from = save_from;
504  break;
505  }
506  max--;
507  }
508  #ifndef BOOSTER_LOCALE_DO_LENGTH_MBSTATE_CONST
509  return from - start_from;
510  #else
511  return save_max - max;
512  #endif
513  }
514 
515 
/// Override of std::codecvt::do_in: decodes narrow input into UTF-32 elements.
///
/// Each iteration decodes one code point with CodecvtImpl::to_unicode and stores
/// it as exactly one output element. The mbstate_t argument is not used.
///
/// \param from_next receives the position of the first unconsumed input char.
/// \param to_next   receives the position one past the last element written.
/// \return error / partial as set inside the loop; otherwise ok, downgraded to
///         partial when input remains.
516  virtual std::codecvt_base::result
517  do_in( std::mbstate_t &/*state*/,
518  char const *from,
519  char const *from_end,
520  char const *&from_next,
521  uchar *to,
522  uchar *to_end,
523  uchar *&to_next) const
524  {
525  std::codecvt_base::result r=std::codecvt_base::ok;
526 
527  // Unlike the 2-byte (UTF-16) specialization, no surrogate-pair flag is kept here:
528  // the mbstate_t parameter is unnamed and never read or written in this function.
529  //
530  // Every decoded code point is stored as a single output element, so there is
531  // never a partially written code point to remember between calls.
532  typedef typename CodecvtImpl::state_type state_type;
533  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
534  while(to < to_end && from < from_end)
535  {
536 #ifdef DEBUG_CODECVT
537  std::cout << "Entering IN--------------" << std::endl;
     // NOTE(review): no declaration of `state` is visible in this function (the mbstate_t
     // parameter is unnamed), so the next line looks like it cannot compile with DEBUG_CODECVT defined.
538  std::cout << "State " << std::hex << state <<std::endl;
539  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
540 #endif
541  char const *from_saved = from;
542 
543  uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
544 
     // NOTE(review): listing lines 545 and 550 are missing from this copy; they open the two
     // blocks below (closed at lines 549 and 554). Presumably they test ch against
     // utf::illegal and utf::incomplete respectively - restore them from upstream.
546  r=std::codecvt_base::error;
547  from = from_saved;
548  break;
549  }
551  r=std::codecvt_base::partial;
552  from=from_saved;
553  break;
554  }
555  *to++=ch;
556  }
557  from_next=from;
558  to_next=to;
559  if(r == std::codecvt_base::ok && from!=from_end)
560  r = std::codecvt_base::partial;
561 #ifdef DEBUG_CODECVT
562  std::cout << "Returning ";
563  switch(r) {
564  case std::codecvt_base::ok:
565  std::cout << "ok" << std::endl;
566  break;
567  case std::codecvt_base::partial:
568  std::cout << "partial" << std::endl;
569  break;
570  case std::codecvt_base::error:
571  std::cout << "error" << std::endl;
572  break;
573  default:
574  std::cout << "other" << std::endl;
575  break;
576  }
     // NOTE(review): same undeclared `state` as in the debug output at the top of the loop.
577  std::cout << "State " << std::hex << state <<std::endl;
578  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
579 #endif
580  return r;
581  }
582 
/// Override of std::codecvt::do_out: encodes UTF-32 elements into narrow output.
///
/// Each input element is taken as one code point and passed to
/// CodecvtImpl::from_unicode; the returned length advances the output pointer.
/// The mbstate_t argument is not used.
///
/// \param from_next receives the position of the first unconsumed input element.
/// \param to_next   receives the position one past the last char written.
/// \return error / partial as set inside the loop; otherwise ok, downgraded to
///         partial when input remains.
583  virtual std::codecvt_base::result
584  do_out( std::mbstate_t &/*std_state*/,
585  uchar const *from,
586  uchar const *from_end,
587  uchar const *&from_next,
588  char *to,
589  char *to_end,
590  char *&to_next) const
591  {
592  std::codecvt_base::result r=std::codecvt_base::ok;
593  typedef typename CodecvtImpl::state_type state_type;
594  state_type cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
595  while(to < to_end && from < from_end)
596  {
597 #ifdef DEBUG_CODECVT
598  std::cout << "Entering OUT --------------" << std::endl;
     // NOTE(review): no declaration of `state` is visible in this function (the mbstate_t
     // parameter is unnamed), so the next line looks like it cannot compile with DEBUG_CODECVT defined.
599  std::cout << "State " << std::hex << state <<std::endl;
600  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
601 #endif
602  booster::uint32_t ch=0;
603  ch = *from;
     // NOTE(review): listing line 604 is missing from this copy; it opens the block closed at
     // line 607. Presumably it rejects ch when it is not a valid code point - restore from upstream.
605  r=std::codecvt_base::error;
606  break;
607  }
608  booster::uint32_t len = implementation().from_unicode(cvt_state,ch,to,to_end);
     // NOTE(review): listing line 609 is missing from this copy; it opens the block closed at
     // line 612. Given the else-if that follows, presumably it tests len == utf::incomplete.
610  r=std::codecvt_base::partial;
611  break;
612  }
613  else if(len == booster::locale::utf::illegal) {
614  r=std::codecvt_base::error;
615  break;
616  }
617  to+=len;
618  from++;
619  }
620  from_next=from;
621  to_next=to;
622  if(r==std::codecvt_base::ok && from!=from_end)
623  r = std::codecvt_base::partial;
624 #ifdef DEBUG_CODECVT
625  std::cout << "Returning ";
626  switch(r) {
627  case std::codecvt_base::ok:
628  std::cout << "ok" << std::endl;
629  break;
630  case std::codecvt_base::partial:
631  std::cout << "partial" << std::endl;
632  break;
633  case std::codecvt_base::error:
634  std::cout << "error" << std::endl;
635  break;
636  default:
637  std::cout << "other" << std::endl;
638  break;
639  }
     // NOTE(review): same undeclared `state` as in the debug output at the top of the loop.
640  std::cout << "State " << std::hex << state <<std::endl;
641  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
642 #endif
643  return r;
644  }
645 };
646 
647 
/// Specialization of generic_codecvt for one-byte character types.
///
/// It declares no do_* overrides of its own, so the conversion behaviour is the
/// one inherited from std::codecvt<CharType,char,std::mbstate_t>; it only adds
/// the uchar typedef, the implementation() accessor and the constructor.
648 template<typename CharType,typename CodecvtImpl>
649 class generic_codecvt<CharType,CodecvtImpl,1> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
650 {
651 public:
652  typedef CharType uchar;
653 
     /// Returns this object viewed as the concrete CodecvtImpl (static downcast,
     /// so CodecvtImpl is expected to derive from this class).
654  CodecvtImpl const &implementation() const
655  {
656  return *static_cast<CodecvtImpl const *>(this);
657  }
658 
     /// \param refs forwarded unchanged to the std::codecvt base-class constructor.
     /// The initializer names the base through CharType (not a hard-coded char) so it
     /// always matches the base class declared above, as in the 2- and 4-byte specializations.
659  generic_codecvt(size_t refs = 0) : std::codecvt<CharType,char,std::mbstate_t>(refs)
660  {
661  }
662 };
663 
664 } // locale
665 } // namespace booster
666 
667 #endif
668 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
initial_convertion_state
Definition: generic_codecvt.h:32
Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t...
Definition: generic_codecvt.h:133
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.h:44
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.h:39
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.h:49
A base class that is used to define constants for generic_codecvt.
Definition: generic_codecvt.h:27
The state would be used by from_unicode functions.
Definition: generic_codecvt.h:34
This is the main namespace that encloses all localization classes.
Definition: locale_fwd.h:14
The state would be used by to_unicode functions.
Definition: generic_codecvt.h:33
Booster library namespace. The library that implements a Boost-like API in an ABI backward-compatible way...
Definition: application.h:23