libstdc++
text_encoding
Go to the documentation of this file.
1 // <text_encoding> -*- C++ -*-
2 
3 // Copyright The GNU Toolchain Authors.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /** @file include/text_encoding
26  * This is a Standard C++ Library header.
27  */
28 
29 #ifndef _GLIBCXX_TEXT_ENCODING
30 #define _GLIBCXX_TEXT_ENCODING
31 
32 #ifdef _GLIBCXX_SYSHDR
33 #pragma GCC system_header
34 #endif
35 
36 #include <bits/requires_hosted.h>
37 
38 #define __glibcxx_want_text_encoding
39 #include <bits/version.h>
40 
41 #ifdef __cpp_lib_text_encoding
42 #include <compare>
43 #include <string_view>
44 #include <bits/functional_hash.h> // hash
45 #include <bits/ranges_util.h> // view_interface
46 #include <bits/unicode.h> // __charset_alias_match
47 #include <ext/numeric_traits.h> // __int_traits
48 
49 namespace std _GLIBCXX_VISIBILITY(default)
50 {
51 _GLIBCXX_BEGIN_NAMESPACE_VERSION
52 
53  /**
54  * @brief An interface for accessing the IANA Character Sets registry.
55  * @ingroup locales
56  * @since C++23
57  */
58  struct text_encoding
59  {
60  private:
// One row of the encoding table: a numeric id paired with a name.
struct _Rep
{
  using id = __INT_LEAST32_TYPE__;

  id _M_id;            // Same integer type that underlies text_encoding::id.
  const char* _M_name; // Name for this row; null only in the sentinel row.

  // Order a row against a bare id value, so that a table of rows can be
  // binary-searched by id (see _S_find_id).
  friend constexpr bool
  operator<(const _Rep& __lhs, id __value) noexcept
  { return __lhs._M_id < __value; }

  // True if the row's name and __sv hold the same characters.
  friend constexpr bool
  operator==(const _Rep& __lhs, string_view __sv) noexcept
  { return string_view(__lhs._M_name) == __sv; }
};
75 
76  public:
77  static constexpr size_t max_name_length = 63;
78 
// Identifiers for the character encodings known to text_encoding.
// The underlying type is _Rep::id, so mib() can convert a table entry's
// _M_id straight to an enumerator.  `other` (1) and `unknown` (2) match
// the two nameless entries at the start of _S_reps.
// NOTE(review): the values presumably mirror the IANA Character Sets
// registry numbers mentioned in the class documentation -- confirm
// against the registry before adding or changing entries.
// PC8CodePage437 (2011) and PCp852 (2010) appear out of numeric order;
// lookups search _S_reps rather than this list, so the order here has no
// effect on them.
79  enum class id : _Rep::id
80  {
81  other = 1,
82  unknown = 2,
83  ASCII = 3,
84  ISOLatin1 = 4,
85  ISOLatin2 = 5,
86  ISOLatin3 = 6,
87  ISOLatin4 = 7,
88  ISOLatinCyrillic = 8,
89  ISOLatinArabic = 9,
90  ISOLatinGreek = 10,
91  ISOLatinHebrew = 11,
92  ISOLatin5 = 12,
93  ISOLatin6 = 13,
94  ISOTextComm = 14,
95  HalfWidthKatakana = 15,
96  JISEncoding = 16,
97  ShiftJIS = 17,
98  EUCPkdFmtJapanese = 18,
99  EUCFixWidJapanese = 19,
100  ISO4UnitedKingdom = 20,
101  ISO11SwedishForNames = 21,
102  ISO15Italian = 22,
103  ISO17Spanish = 23,
104  ISO21German = 24,
105  ISO60DanishNorwegian = 25,
106  ISO69French = 26,
107  ISO10646UTF1 = 27,
108  ISO646basic1983 = 28,
109  INVARIANT = 29,
110  ISO2IntlRefVersion = 30,
111  NATSSEFI = 31,
112  NATSSEFIADD = 32,
113  ISO10Swedish = 35,
114  KSC56011987 = 36,
115  ISO2022KR = 37,
116  EUCKR = 38,
117  ISO2022JP = 39,
118  ISO2022JP2 = 40,
119  ISO13JISC6220jp = 41,
120  ISO14JISC6220ro = 42,
121  ISO16Portuguese = 43,
122  ISO18Greek7Old = 44,
123  ISO19LatinGreek = 45,
124  ISO25French = 46,
125  ISO27LatinGreek1 = 47,
126  ISO5427Cyrillic = 48,
127  ISO42JISC62261978 = 49,
128  ISO47BSViewdata = 50,
129  ISO49INIS = 51,
130  ISO50INIS8 = 52,
131  ISO51INISCyrillic = 53,
132  ISO54271981 = 54,
133  ISO5428Greek = 55,
134  ISO57GB1988 = 56,
135  ISO58GB231280 = 57,
136  ISO61Norwegian2 = 58,
137  ISO70VideotexSupp1 = 59,
138  ISO84Portuguese2 = 60,
139  ISO85Spanish2 = 61,
140  ISO86Hungarian = 62,
141  ISO87JISX0208 = 63,
142  ISO88Greek7 = 64,
143  ISO89ASMO449 = 65,
144  ISO90 = 66,
145  ISO91JISC62291984a = 67,
146  ISO92JISC62991984b = 68,
147  ISO93JIS62291984badd = 69,
148  ISO94JIS62291984hand = 70,
149  ISO95JIS62291984handadd = 71,
150  ISO96JISC62291984kana = 72,
151  ISO2033 = 73,
152  ISO99NAPLPS = 74,
153  ISO102T617bit = 75,
154  ISO103T618bit = 76,
155  ISO111ECMACyrillic = 77,
156  ISO121Canadian1 = 78,
157  ISO122Canadian2 = 79,
158  ISO123CSAZ24341985gr = 80,
159  ISO88596E = 81,
160  ISO88596I = 82,
161  ISO128T101G2 = 83,
162  ISO88598E = 84,
163  ISO88598I = 85,
164  ISO139CSN369103 = 86,
165  ISO141JUSIB1002 = 87,
166  ISO143IECP271 = 88,
167  ISO146Serbian = 89,
168  ISO147Macedonian = 90,
169  ISO150 = 91,
170  ISO151Cuba = 92,
171  ISO6937Add = 93,
172  ISO153GOST1976874 = 94,
173  ISO8859Supp = 95,
174  ISO10367Box = 96,
175  ISO158Lap = 97,
176  ISO159JISX02121990 = 98,
177  ISO646Danish = 99,
178  USDK = 100,
179  DKUS = 101,
180  KSC5636 = 102,
181  Unicode11UTF7 = 103,
182  ISO2022CN = 104,
183  ISO2022CNEXT = 105,
184  UTF8 = 106,
185  ISO885913 = 109,
186  ISO885914 = 110,
187  ISO885915 = 111,
188  ISO885916 = 112,
189  GBK = 113,
190  GB18030 = 114,
191  OSDEBCDICDF0415 = 115,
192  OSDEBCDICDF03IRV = 116,
193  OSDEBCDICDF041 = 117,
194  ISO115481 = 118,
195  KZ1048 = 119,
196  UCS2 = 1000,
197  UCS4 = 1001,
198  UnicodeASCII = 1002,
199  UnicodeLatin1 = 1003,
200  UnicodeJapanese = 1004,
201  UnicodeIBM1261 = 1005,
202  UnicodeIBM1268 = 1006,
203  UnicodeIBM1276 = 1007,
204  UnicodeIBM1264 = 1008,
205  UnicodeIBM1265 = 1009,
206  Unicode11 = 1010,
207  SCSU = 1011,
208  UTF7 = 1012,
209  UTF16BE = 1013,
210  UTF16LE = 1014,
211  UTF16 = 1015,
212  CESU8 = 1016,
213  UTF32 = 1017,
214  UTF32BE = 1018,
215  UTF32LE = 1019,
216  BOCU1 = 1020,
217  UTF7IMAP = 1021,
218  Windows30Latin1 = 2000,
219  Windows31Latin1 = 2001,
220  Windows31Latin2 = 2002,
221  Windows31Latin5 = 2003,
222  HPRoman8 = 2004,
223  AdobeStandardEncoding = 2005,
224  VenturaUS = 2006,
225  VenturaInternational = 2007,
226  DECMCS = 2008,
227  PC850Multilingual = 2009,
228  PC8DanishNorwegian = 2012,
229  PC862LatinHebrew = 2013,
230  PC8Turkish = 2014,
231  IBMSymbols = 2015,
232  IBMThai = 2016,
233  HPLegal = 2017,
234  HPPiFont = 2018,
235  HPMath8 = 2019,
236  HPPSMath = 2020,
237  HPDesktop = 2021,
238  VenturaMath = 2022,
239  MicrosoftPublishing = 2023,
240  Windows31J = 2024,
241  GB2312 = 2025,
242  Big5 = 2026,
243  Macintosh = 2027,
244  IBM037 = 2028,
245  IBM038 = 2029,
246  IBM273 = 2030,
247  IBM274 = 2031,
248  IBM275 = 2032,
249  IBM277 = 2033,
250  IBM278 = 2034,
251  IBM280 = 2035,
252  IBM281 = 2036,
253  IBM284 = 2037,
254  IBM285 = 2038,
255  IBM290 = 2039,
256  IBM297 = 2040,
257  IBM420 = 2041,
258  IBM423 = 2042,
259  IBM424 = 2043,
260  PC8CodePage437 = 2011,
261  IBM500 = 2044,
262  IBM851 = 2045,
263  PCp852 = 2010,
264  IBM855 = 2046,
265  IBM857 = 2047,
266  IBM860 = 2048,
267  IBM861 = 2049,
268  IBM863 = 2050,
269  IBM864 = 2051,
270  IBM865 = 2052,
271  IBM868 = 2053,
272  IBM869 = 2054,
273  IBM870 = 2055,
274  IBM871 = 2056,
275  IBM880 = 2057,
276  IBM891 = 2058,
277  IBM903 = 2059,
278  IBM904 = 2060,
279  IBM905 = 2061,
280  IBM918 = 2062,
281  IBM1026 = 2063,
282  IBMEBCDICATDE = 2064,
283  EBCDICATDEA = 2065,
284  EBCDICCAFR = 2066,
285  EBCDICDKNO = 2067,
286  EBCDICDKNOA = 2068,
287  EBCDICFISE = 2069,
288  EBCDICFISEA = 2070,
289  EBCDICFR = 2071,
290  EBCDICIT = 2072,
291  EBCDICPT = 2073,
292  EBCDICES = 2074,
293  EBCDICESA = 2075,
294  EBCDICESS = 2076,
295  EBCDICUK = 2077,
296  EBCDICUS = 2078,
297  Unknown8BiT = 2079,
298  Mnemonic = 2080,
299  Mnem = 2081,
300  VISCII = 2082,
301  VIQR = 2083,
302  KOI8R = 2084,
303  HZGB2312 = 2085,
304  IBM866 = 2086,
305  PC775Baltic = 2087,
306  KOI8U = 2088,
307  IBM00858 = 2089,
308  IBM00924 = 2090,
309  IBM01140 = 2091,
310  IBM01141 = 2092,
311  IBM01142 = 2093,
312  IBM01143 = 2094,
313  IBM01144 = 2095,
314  IBM01145 = 2096,
315  IBM01146 = 2097,
316  IBM01147 = 2098,
317  IBM01148 = 2099,
318  IBM01149 = 2100,
319  Big5HKSCS = 2101,
320  IBM1047 = 2102,
321  PTCP154 = 2103,
322  Amiga1251 = 2104,
323  KOI7switched = 2105,
324  BRF = 2106,
325  TSCII = 2107,
326  CP51932 = 2108,
327  windows874 = 2109,
328  windows1250 = 2250,
329  windows1251 = 2251,
330  windows1252 = 2252,
331  windows1253 = 2253,
332  windows1254 = 2254,
333  windows1255 = 2255,
334  windows1256 = 2256,
335  windows1257 = 2257,
336  windows1258 = 2258,
337  TIS620 = 2259,
338  CP50220 = 2260
339  };
340  using enum id;
341 
342  constexpr text_encoding() = default;
343 
344  constexpr explicit
345  text_encoding(string_view __enc) noexcept
346  : _M_rep(_S_find_name(__enc))
347  {
348  __enc.copy(_M_name, max_name_length);
349  }
350 
351  // @pre i has the value of one of the enumerators of id.
352  constexpr
353  text_encoding(id __i) noexcept
354  : _M_rep(_S_find_id(__i))
355  {
356  if (string_view __name(_M_rep->_M_name); !__name.empty())
357  __name.copy(_M_name, max_name_length);
358  }
359 
360  constexpr id mib() const noexcept { return id(_M_rep->_M_id); }
361 
362  constexpr const char* name() const noexcept { return _M_name; }
363 
364  struct aliases_view : ranges::view_interface<aliases_view>
365  {
366  private:
367  class _Iterator;
368  struct _Sentinel { };
369 
370  public:
371  constexpr _Iterator begin() const noexcept;
372  constexpr _Sentinel end() const noexcept { return {}; }
373 
374  private:
375  friend struct text_encoding;
376 
377  constexpr explicit aliases_view(const _Rep* __r) : _M_begin(__r) { }
378 
379  const _Rep* _M_begin = nullptr;
380  };
381 
382  constexpr aliases_view
383  aliases() const noexcept
384  {
385  return _M_rep->_M_name[0] ? aliases_view(_M_rep) : aliases_view{nullptr};
386  }
387 
388  friend constexpr bool
389  operator==(const text_encoding& __a,
390  const text_encoding& __b) noexcept
391  {
392  if (__a.mib() == id::other && __b.mib() == id::other) [[unlikely]]
393  return _S_comp(__a._M_name, __b._M_name);
394  else
395  return __a.mib() == __b.mib();
396  }
397 
398  friend constexpr bool
399  operator==(const text_encoding& __encoding, id __i) noexcept
400  { return __encoding.mib() == __i; }
401 
402 #if __CHAR_BIT__ == 8
403  static consteval text_encoding
404  literal() noexcept
405  {
406 #ifdef __GNUC_EXECUTION_CHARSET_NAME
407  return text_encoding(__GNUC_EXECUTION_CHARSET_NAME);
408 #elif defined __clang_literal_encoding__
409  return text_encoding(__clang_literal_encoding__);
410 #else
411  return text_encoding();
412 #endif
413  }
414 
415  static text_encoding
416  environment();
417 
418  template<id _Id>
419  static bool
420  environment_is()
421  { return text_encoding(_Id)._M_is_environment(); }
422 #else
423  static text_encoding literal() = delete;
424  static text_encoding environment() = delete;
425  template<id> static bool environment_is() = delete;
426 #endif
427 
428  private:
429  const _Rep* _M_rep = _S_reps + 1; // id::unknown
430  char _M_name[max_name_length + 1] = {0};
431 
432  bool
433  _M_is_environment() const;
434 
// Table of known encodings, searched by _S_find_name and _S_find_id.
// The first two entries are the nameless `other` (1) and `unknown` (2).
// The remaining entries come from <bits/text_encoding-data.h>, which is
// included only after defining _GLIBCXX_GET_ENCODING_DATA.
// NOTE(review): the header presumably #undefs that macro once it has
// emitted its data; the #error below fires if it is still defined.
// The final entry (id 9999, null name) is a sentinel; both lookup
// functions stop one element before the end so they never match it.
435  static inline constexpr _Rep _S_reps[] = {
436  { 1, "" }, { 2, "" },
437 #define _GLIBCXX_GET_ENCODING_DATA
438 #include <bits/text_encoding-data.h>
439 #ifdef _GLIBCXX_GET_ENCODING_DATA
440 # error "Invalid text_encoding data"
441 #endif
442  { 9999, nullptr }, // sentinel
443  };
444 
445  static constexpr bool
446  _S_comp(string_view __a, string_view __b)
447  { return __unicode::__charset_alias_match(__a, __b); }
448 
449  static constexpr const _Rep*
450  _S_find_name(string_view __name) noexcept
451  {
452 #ifdef _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET
453  // Optimize the common UTF-8 case to avoid a linear search through all
454  // strings in the table using the _S_comp function.
455  if (__name == "UTF-8")
456  return _S_reps + 2 + _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET;
457 #endif
458 
459  // The first two array elements (other and unknown) don't have names.
460  // The last element is a sentinel that can never match anything.
461  const auto __first = _S_reps + 2, __end = std::end(_S_reps) - 1;
462  for (auto __r = __first; __r != __end; ++__r)
463  if (_S_comp(__r->_M_name, __name))
464  {
465  // Might have matched an alias. Find the first entry for this ID.
466  const auto __id = __r->_M_id;
467  while (__r[-1]._M_id == __id)
468  --__r;
469  return __r;
470  }
471  return _S_reps; // id::other
472  }
473 
474  static constexpr const _Rep*
475  _S_find_id(id __id) noexcept
476  {
477  const auto __i = (_Rep::id)__id;
478  const auto __r = std::lower_bound(_S_reps, std::end(_S_reps) - 1, __i);
479  if (__r->_M_id == __i) [[likely]]
480  return __r;
481  else
482  {
483  // Preconditions: i has the value of one of the enumerators of id.
484  __glibcxx_assert(__r->_M_id == __i);
485  return _S_reps + 1; // id::unknown
486  }
487  }
488  };
489 
490  template<>
491  struct hash<text_encoding>
492  {
493  size_t
494  operator()(const text_encoding& __enc) const noexcept
495  { return std::hash<text_encoding::id>()(__enc.mib()); }
496  };
497 
498  class text_encoding::aliases_view::_Iterator
499  {
500  public:
501  using value_type = const char*;
502  using reference = const char*;
503  using difference_type = int;
504 
505  constexpr _Iterator() = default;
506 
507  constexpr value_type
508  operator*() const
509  {
510  if (_M_dereferenceable()) [[likely]]
511  return _M_rep->_M_name;
512  else
513  {
514  __glibcxx_assert(_M_dereferenceable());
515  return "";
516  }
517  }
518 
519  constexpr _Iterator&
520  operator++()
521  {
522  if (_M_dereferenceable()) [[likely]]
523  ++_M_rep;
524  else
525  {
526  __glibcxx_assert(_M_dereferenceable());
527  *this = _Iterator{};
528  }
529  return *this;
530  }
531 
532  constexpr _Iterator&
533  operator--()
534  {
535  const bool __decrementable
536  = _M_rep != nullptr && _M_rep[-1]._M_id == _M_id;
537  if (__decrementable) [[likely]]
538  --_M_rep;
539  else
540  {
541  __glibcxx_assert(__decrementable);
542  *this = _Iterator{};
543  }
544  return *this;
545  }
546 
547  constexpr _Iterator
548  operator++(int)
549  {
550  auto __it = *this;
551  ++*this;
552  return __it;
553  }
554 
555  constexpr _Iterator
556  operator--(int)
557  {
558  auto __it = *this;
559  --*this;
560  return __it;
561  }
562 
563  constexpr value_type
564  operator[](difference_type __n) const
565  { return *(*this + __n); }
566 
567  constexpr _Iterator&
568  operator+=(difference_type __n)
569  {
570  if (_M_rep != nullptr)
571  {
572  if (__n > 0)
573  {
574  if (__n < (std::end(_S_reps) - _M_rep)
575  && _M_rep[__n - 1]._M_id == _M_id) [[likely]]
576  _M_rep += __n;
577  else
578  *this = _Iterator{};
579  }
580  else if (__n < 0)
581  {
582  if (__n > (_S_reps - _M_rep)
583  && _M_rep[__n]._M_id == _M_id) [[likely]]
584  _M_rep += __n;
585  else
586  *this = _Iterator{};
587  }
588  }
589  if (__n != 0)
590  __glibcxx_assert(_M_rep != nullptr);
591  return *this;
592  }
593 
594  constexpr _Iterator&
595  operator-=(difference_type __n)
596  {
597  using _Traits = __gnu_cxx::__int_traits<difference_type>;
598  if (__n == _Traits::__min) [[unlikely]]
599  return operator+=(_Traits::__max);
600  return operator+=(-__n);
601  }
602 
603  constexpr difference_type
604  operator-(const _Iterator& __i) const
605  {
606  if (_M_id == __i._M_id)
607  return _M_rep - __i._M_rep;
608  __glibcxx_assert(_M_id == __i._M_id);
609  return __gnu_cxx::__int_traits<difference_type>::__max;
610  }
611 
612  constexpr bool
613  operator==(const _Iterator&) const = default;
614 
615  constexpr bool
616  operator==(_Sentinel) const noexcept
617  { return !_M_dereferenceable(); }
618 
619  constexpr strong_ordering
620  operator<=>(const _Iterator& __i) const
621  {
622  __glibcxx_assert(_M_id == __i._M_id);
623  return _M_rep <=> __i._M_rep;
624  }
625 
626  friend constexpr _Iterator
627  operator+(_Iterator __i, difference_type __n)
628  {
629  __i += __n;
630  return __i;
631  }
632 
633  friend constexpr _Iterator
634  operator+(difference_type __n, _Iterator __i)
635  {
636  __i += __n;
637  return __i;
638  }
639 
640  friend constexpr _Iterator
641  operator-(_Iterator __i, difference_type __n)
642  {
643  __i -= __n;
644  return __i;
645  }
646 
647  private:
648  friend struct text_encoding;
649 
650  constexpr explicit
651  _Iterator(const _Rep* __r) noexcept
652  : _M_rep(__r), _M_id(__r ? __r->_M_id : 0)
653  { }
654 
655  constexpr bool
656  _M_dereferenceable() const noexcept
657  { return _M_rep != nullptr && _M_rep->_M_id == _M_id; }
658 
659  const _Rep* _M_rep = nullptr;
660  _Rep::id _M_id = 0;
661  };
662 
663  constexpr auto
664  text_encoding::aliases_view::begin() const noexcept
665  -> _Iterator
666  { return _Iterator(_M_begin); }
667 
668 namespace ranges
669 {
670  // Opt-in to borrowed_range concept
671  template<>
672  inline constexpr bool
673  enable_borrowed_range<std::text_encoding::aliases_view> = true;
674 }
675 
676 _GLIBCXX_END_NAMESPACE_VERSION
677 } // namespace std
678 
679 #endif // __cpp_lib_text_encoding
680 #endif // _GLIBCXX_TEXT_ENCODING