OpenMW
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
utf8stream.hpp
Go to the documentation of this file.
1 #ifndef MISC_UTF8ITER_HPP
2 #define MISC_UTF8ITER_HPP
3 
4 #include <cstring>
5 #include <tuple>
6 
/// Forward-only reader that decodes a range of UTF-8 bytes one code point
/// at a time.
///
/// The stream stores only pointers into the caller's buffer; it neither
/// copies nor owns the bytes, so the buffer must stay valid while the
/// stream is in use.
class Utf8Stream
{
public:

    /// A decoded code point, or sBadChar() when decoding failed.
    typedef uint32_t UnicodeChar;

    /// A position inside the byte buffer being decoded.
    typedef unsigned char const * Point;

    /// Sentinel returned for malformed, truncated or missing input.
    //static const unicode_char sBadChar = 0xFFFFFFFF; gcc can't handle this
    static UnicodeChar sBadChar () { return UnicodeChar (0xFFFFFFFF); }

    /// Reads the bytes in the half-open range [begin, end).
    Utf8Stream (Point begin, Point end) :
        cur (begin), nxt (begin), end (end), val (Utf8Stream::sBadChar())
    {
    }

    /// Reads a NUL-terminated string; the terminator is not part of the
    /// stream. \a str must not be null (its length is taken with strlen).
    Utf8Stream (const char * str) :
        cur (reinterpret_cast <Point> (str)),
        nxt (reinterpret_cast <Point> (str)),
        end (reinterpret_cast <Point> (str) + std::strlen (str)),
        val (Utf8Stream::sBadChar())
    {
    }

    /// Reads the bytes in the half-open range [range.first, range.second).
    Utf8Stream (std::pair <Point, Point> range) :
        cur (range.first), nxt (range.first), end (range.second), val (Utf8Stream::sBadChar())
    {
    }

    /// True once every byte of the range has been consumed.
    bool eof () const
    {
        return cur == end;
    }

    /// Position of the next byte that has not been consumed yet.
    Point current () const
    {
        return cur;
    }

    /// Returns the next code point without consuming it.
    /// Yields sBadChar() for malformed input and at end of input.
    UnicodeChar peek ()
    {
        // cur == nxt means nothing has been decoded at this position yet.
        if (cur == nxt)
            next ();
        return val;
    }

    /// Returns the next code point and advances past it.
    /// At end of input this yields sBadChar() and the position stays put,
    /// so eof() remains true.
    UnicodeChar consume ()
    {
        if (cur == nxt)
            next ();
        cur = nxt;
        return val;
    }

    /// Decodes one code point from [cur, end).
    ///
    /// \return the decoded value together with the position at which
    /// decoding should resume. On failure the value is sBadChar():
    ///  - empty range: the position is returned unchanged;
    ///  - invalid lead byte or truncated sequence: the position is just
    ///    past the lead byte;
    ///  - invalid continuation byte: the position is that offending byte,
    ///    so the caller can resynchronise on it.
    ///
    /// Only the structure of the sequence is checked; lead bytes announcing
    /// up to five continuation bytes are accepted, and the decoded value is
    /// not range-checked.
    static std::pair <UnicodeChar, Point> decode (Point cur, Point end)
    {
        // Nothing left to read: never dereference the end position.
        if (cur >= end)
            return std::make_pair (sBadChar(), cur);

        // Single-byte (ASCII) fast path.
        if ((*cur & 0x80) == 0)
        {
            UnicodeChar chr = *cur++;

            return std::make_pair (chr, cur);
        }

        int octets;
        UnicodeChar chr;

        // octets = number of continuation bytes announced by the lead byte,
        // chr    = payload bits carried by the lead byte itself.
        std::tie (octets, chr) = octet_count (*cur++);

        if (octets > 5)
            return std::make_pair (sBadChar(), cur);

        // Compare distances rather than forming cur + octets first: a
        // pointer beyond one-past-the-end must not even be computed.
        if (end - cur < octets)
            return std::make_pair (sBadChar(), cur);

        Point eoc = cur + octets;

        while (cur != eoc)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair (sBadChar(), cur);

            chr = (chr << 6) | UnicodeChar ((*cur++) & 0x3F);
        }

        return std::make_pair (chr, cur);
    }

private:

    /// Classifies the lead byte of a multi-byte sequence.
    ///
    /// \return the number of continuation bytes that must follow (1 to 5)
    /// and the payload bits of the lead byte. A count of 6 means the byte
    /// matched no lead pattern (e.g. a stray continuation byte).
    static std::pair <int, UnicodeChar> octet_count (unsigned char octet)
    {
        int octets;

        // Start with the two-byte lead pattern 110xxxxx and lengthen the
        // run of leading one bits by one on every iteration.
        unsigned char mark = 0xC0;
        unsigned char mask = 0xE0;

        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;

            mark = (mark >> 1) | 0x80;
            mask = (mask >> 1) | 0x80;
        }

        return std::make_pair (octets, octet & ~mask);
    }

    /// Decodes the code point at nxt into val and moves nxt past it.
    void next ()
    {
        std::tie (val, nxt) = decode (nxt, end);
    }

    Point cur;        ///< first byte not yet consumed
    Point nxt;        ///< where decoding resumes; equals cur until a value is decoded
    Point end;        ///< one past the last byte of the range
    UnicodeChar val;  ///< most recently decoded value
};
121 
122 #endif
unsigned char const * Point
Definition: utf8stream.hpp:12
Utf8Stream(Point begin, Point end)
Definition: utf8stream.hpp:17
bool eof() const
Definition: utf8stream.hpp:32
UnicodeChar consume()
Definition: utf8stream.hpp:49
Point end
Definition: utf8stream.hpp:118
static UnicodeChar sBadChar()
Definition: utf8stream.hpp:15
Utf8Stream
Definition: utf8stream.hpp:7
Utf8Stream(const char *str)
Definition: utf8stream.hpp:22
Utf8Stream(std::pair< Point, Point > range)
Definition: utf8stream.hpp:27
static std::pair< int, UnicodeChar > octet_count(unsigned char octet)
Definition: utf8stream.hpp:92
uint32_t UnicodeChar
Definition: utf8stream.hpp:11
static std::pair< UnicodeChar, Point > decode(Point cur, Point end)
Definition: utf8stream.hpp:57
Point current() const
Definition: utf8stream.hpp:37
Point nxt
Definition: utf8stream.hpp:117
void next()
Definition: utf8stream.hpp:111
UnicodeChar peek()
Definition: utf8stream.hpp:42
Point cur
Definition: utf8stream.hpp:116
UnicodeChar val
Definition: utf8stream.hpp:119