Boost.Nowide
utf.hpp
1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 // Copyright (c) 2020 Alexander Grund
4 //
5 // Distributed under the Boost Software License, Version 1.0. (See
6 // accompanying file LICENSE or copy at
7 // http://www.boost.org/LICENSE_1_0.txt)
8 //
9 #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
10 #define BOOST_NOWIDE_UTF_HPP_INCLUDED
11 
12 #include <boost/nowide/config.hpp>
13 #include <cstdint>
14 
15 namespace boost {
16 namespace nowide {
23  namespace utf {
24 
/// The integral type that can hold a Unicode code point.
using code_point = uint32_t;

/// Special constant that defines illegal code point.
/// Its value lies above 0x10FFFF, so it never compares equal to a valid code point.
static const code_point illegal = 0xFFFFFFFFu;

/// Special constant that defines incomplete code point.
/// Its value lies above 0x10FFFF, so it never compares equal to a valid code point.
static const code_point incomplete = 0xFFFFFFFEu;

/// Checks if \a v is a valid code point.
///
/// \param v value to test
/// \return true when \a v is at most 0x10FFFF and is not in the
///         surrogate range 0xD800-0xDFFF, false otherwise
inline bool is_valid_codepoint(code_point v)
{
    if(v > 0x10FFFF)
        return false;
    if(0xD800 <= v && v <= 0xDFFF) // surrogates
        return false;
    return true;
}
51 
52 #ifdef BOOST_NOWIDE_DOXYGEN
/// UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
///
/// This declaration is only visible when BOOST_NOWIDE_DOXYGEN is defined; it lists the
/// interface shared by the specializations in the #else branch.
///
/// \tparam CharType type of one code unit of the encoded sequence
/// \tparam size     size of a code unit, defaults to sizeof(CharType);
///                  the #else branch specializes it for 1, 2 and 4
53  template<typename CharType, int size = sizeof(CharType)>
57  struct utf_traits
58  {
    /// The type of one code unit (same as CharType).
62  using char_type = CharType;
    /// Read one code point from the range [p, e), advancing \a p past the units consumed.
    ///
    /// \return the code point, `incomplete` when the range ends before the code point
    ///         is complete (including p == e), or `illegal` when the units read do not
    ///         form a valid code point
77  template<typename Iterator>
78  static code_point decode(Iterator& p, Iterator e);
79 
    /// Maximal number of code units one code point may occupy
    /// (4, 2 and 1 in the specializations for size 1, 2 and 4).
87  static const int max_width;
    /// Number of code units needed to encode \a value.
    /// The specializations do not check that \a value is a valid code point.
94  static int width(code_point value);
95 
    /// Number of code units expected to follow the leading unit \a c,
    /// or -1 when \a c cannot start a sequence.
101  static int trail_length(char_type c);
    /// Returns true if \a c is a trailing (non-first) unit of a multi-unit sequence.
105  static bool is_trail(char_type c);
    /// Returns true if \a c is not a trailing unit (the negation of is_trail in every specialization).
109  static bool is_lead(char_type c);
110 
    /// Write the encoding of \a value through \a out and return the advanced iterator.
    /// The specializations do not check that \a value is a valid code point.
121  template<typename Iterator>
122  static Iterator encode(code_point value, Iterator out);
    /// Decode one code point starting at \a p, advancing \a p, without any end-of-range
    /// or validity checks; the caller must guarantee a complete, valid sequence.
128  template<typename Iterator>
129  static code_point decode_valid(Iterator& p);
130  };
131 
132 #else
133 
/// Primary template, declared but never defined: an implementation exists only
/// through the partial specializations for code unit sizes 1, 2 and 4.
134  template<typename CharType, int size = sizeof(CharType)>
135  struct utf_traits;
136 
137  template<typename CharType>
138  struct utf_traits<CharType, 1>
139  {
140  using char_type = CharType;
141 
142  static int trail_length(char_type ci)
143  {
144  unsigned char c = ci;
145  if(c < 128)
146  return 0;
147  if(BOOST_UNLIKELY(c < 194))
148  return -1;
149  if(c < 224)
150  return 1;
151  if(c < 240)
152  return 2;
153  if(BOOST_LIKELY(c <= 244))
154  return 3;
155  return -1;
156  }
157 
158  static const int max_width = 4;
159 
160  static int width(code_point value)
161  {
162  if(value <= 0x7F)
163  {
164  return 1;
165  } else if(value <= 0x7FF)
166  {
167  return 2;
168  } else if(BOOST_LIKELY(value <= 0xFFFF))
169  {
170  return 3;
171  } else
172  {
173  return 4;
174  }
175  }
176 
177  static bool is_trail(char_type ci)
178  {
179  unsigned char c = ci;
180  return (c & 0xC0) == 0x80;
181  }
182 
183  static bool is_lead(char_type ci)
184  {
185  return !is_trail(ci);
186  }
187 
188  template<typename Iterator>
189  static code_point decode(Iterator& p, Iterator e)
190  {
191  if(BOOST_UNLIKELY(p == e))
192  return incomplete;
193 
194  unsigned char lead = *p++;
195 
196  // First byte is fully validated here
197  int trail_size = trail_length(lead);
198 
199  if(BOOST_UNLIKELY(trail_size < 0))
200  return illegal;
201 
202  // OK as only ASCII may be of size = 0
203  // also optimize for ASCII text
204  if(trail_size == 0)
205  return lead;
206 
207  code_point c = lead & ((1 << (6 - trail_size)) - 1);
208 
209  // Read the rest
210  unsigned char tmp;
211  switch(trail_size)
212  {
213  case 3:
214  if(BOOST_UNLIKELY(p == e))
215  return incomplete;
216  tmp = *p++;
217  if(!is_trail(tmp))
218  return illegal;
219  c = (c << 6) | (tmp & 0x3F);
220  BOOST_NOWIDE_FALLTHROUGH;
221  case 2:
222  if(BOOST_UNLIKELY(p == e))
223  return incomplete;
224  tmp = *p++;
225  if(!is_trail(tmp))
226  return illegal;
227  c = (c << 6) | (tmp & 0x3F);
228  BOOST_NOWIDE_FALLTHROUGH;
229  case 1:
230  if(BOOST_UNLIKELY(p == e))
231  return incomplete;
232  tmp = *p++;
233  if(!is_trail(tmp))
234  return illegal;
235  c = (c << 6) | (tmp & 0x3F);
236  }
237 
238  // Check code point validity:
239  // - no surrogates and valid range
240  // - most compact representation
241  if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
242  {
243  p -= trail_size;
244  return illegal;
245  }
246 
247  return c;
248  }
249 
250  template<typename Iterator>
251  static code_point decode_valid(Iterator& p)
252  {
253  unsigned char lead = *p++;
254  if(lead < 192)
255  return lead;
256 
257  int trail_size;
258 
259  if(lead < 224)
260  trail_size = 1;
261  else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
262  trail_size = 2;
263  else
264  trail_size = 3;
265 
266  code_point c = lead & ((1 << (6 - trail_size)) - 1);
267 
268  switch(trail_size)
269  {
270  case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
271  case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
272  case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
273  }
274 
275  return c;
276  }
277 
278  template<typename Iterator>
279  static Iterator encode(code_point value, Iterator out)
280  {
281  if(value <= 0x7F)
282  {
283  *out++ = static_cast<char_type>(value);
284  } else if(value <= 0x7FF)
285  {
286  *out++ = static_cast<char_type>((value >> 6) | 0xC0);
287  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
288  } else if(BOOST_LIKELY(value <= 0xFFFF))
289  {
290  *out++ = static_cast<char_type>((value >> 12) | 0xE0);
291  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
292  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
293  } else
294  {
295  *out++ = static_cast<char_type>((value >> 18) | 0xF0);
296  *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
297  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
298  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
299  }
300  return out;
301  }
302  }; // utf8
303 
304  template<typename CharType>
305  struct utf_traits<CharType, 2>
306  {
307  using char_type = CharType;
308 
309  // See RFC 2781
310  static bool is_first_surrogate(uint16_t x)
311  {
312  return 0xD800 <= x && x <= 0xDBFF;
313  }
314  static bool is_second_surrogate(uint16_t x)
315  {
316  return 0xDC00 <= x && x <= 0xDFFF;
317  }
318  static code_point combine_surrogate(uint16_t w1, uint16_t w2)
319  {
320  return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
321  }
322  static int trail_length(char_type c)
323  {
324  if(is_first_surrogate(c))
325  return 1;
326  if(is_second_surrogate(c))
327  return -1;
328  return 0;
329  }
333  static bool is_trail(char_type c)
334  {
335  return is_second_surrogate(c);
336  }
340  static bool is_lead(char_type c)
341  {
342  return !is_second_surrogate(c);
343  }
344 
345  template<typename It>
346  static code_point decode(It& current, It last)
347  {
348  if(BOOST_UNLIKELY(current == last))
349  return incomplete;
350  uint16_t w1 = *current++;
351  if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
352  {
353  return w1;
354  }
355  if(w1 > 0xDBFF)
356  return illegal;
357  if(current == last)
358  return incomplete;
359  uint16_t w2 = *current++;
360  if(w2 < 0xDC00 || 0xDFFF < w2)
361  return illegal;
362  return combine_surrogate(w1, w2);
363  }
364  template<typename It>
365  static code_point decode_valid(It& current)
366  {
367  uint16_t w1 = *current++;
368  if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
369  {
370  return w1;
371  }
372  uint16_t w2 = *current++;
373  return combine_surrogate(w1, w2);
374  }
375 
376  static const int max_width = 2;
377  static int width(code_point u)
378  {
379  return u >= 0x10000 ? 2 : 1;
380  }
381  template<typename It>
382  static It encode(code_point u, It out)
383  {
384  if(BOOST_LIKELY(u <= 0xFFFF))
385  {
386  *out++ = static_cast<char_type>(u);
387  } else
388  {
389  u -= 0x10000;
390  *out++ = static_cast<char_type>(0xD800 | (u >> 10));
391  *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
392  }
393  return out;
394  }
395  }; // utf16;
396 
397  template<typename CharType>
398  struct utf_traits<CharType, 4>
399  {
400  using char_type = CharType;
401  static int trail_length(char_type c)
402  {
403  if(is_valid_codepoint(c))
404  return 0;
405  return -1;
406  }
407  static bool is_trail(char_type /*c*/)
408  {
409  return false;
410  }
411  static bool is_lead(char_type /*c*/)
412  {
413  return true;
414  }
415 
416  template<typename It>
417  static code_point decode_valid(It& current)
418  {
419  return *current++;
420  }
421 
422  template<typename It>
423  static code_point decode(It& current, It last)
424  {
425  if(BOOST_UNLIKELY(current == last))
426  return incomplete;
427  code_point c = *current++;
428  if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
429  return illegal;
430  return c;
431  }
432  static const int max_width = 1;
433  static int width(code_point /*u*/)
434  {
435  return 1;
436  }
437  template<typename It>
438  static It encode(code_point u, It out)
439  {
440  *out++ = static_cast<char_type>(u);
441  return out;
442  }
443 
444  }; // utf32
445 
446 #endif
447 
448  } // namespace utf
449 } // namespace nowide
450 } // namespace boost
451 
452 #endif
static const int max_width
Definition: utf.hpp:87
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:57
static bool is_trail(char_type c)
Namespace that holds basic operations on UTF encoded sequences.
Definition: convert.hpp:20
static Iterator encode(code_point value, Iterator out)
static bool is_lead(char_type c)
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:33
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:28
CharType char_type
Definition: utf.hpp:62
static code_point decode_valid(Iterator &p)
static int trail_length(char_type c)
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:38
static int width(code_point value)
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:43
static code_point decode(Iterator &p, Iterator e)