Iros
 
Loading...
Searching...
No Matches
utf8_encoding.h
Go to the documentation of this file.
1#pragma once
2
8
9namespace di::container::string {
10// NOTE: see https://www.unicode.org/versions/Unicode14.0.0/UnicodeStandard-14.0.pdf for details on the UTF-8 encoding.
11// In particular, section 3.9, table 3-6, and table 3-7.
12namespace utf8 {
13 constexpr static auto is_start_of_one_byte_sequence(c8 byte) -> bool {
14 return byte <= 0x7F;
15 }
16
17 constexpr static auto is_start_of_two_byte_sequence(c8 byte) -> bool {
18 return byte >= 0xC2 && byte <= 0xDF;
19 }
20
21 constexpr static auto is_start_of_three_byte_sequence(c8 byte) -> bool {
22 return byte >= 0xE0 && byte <= 0xEF;
23 }
24
25 constexpr static auto is_start_of_four_byte_sequence(c8 byte) -> bool {
26 return byte >= 0xF0 && byte <= 0xF4;
27 }
28
29 constexpr static auto is_start_of_multi_byte_sequence(c8 byte) -> bool {
30 return is_start_of_two_byte_sequence(byte) || is_start_of_three_byte_sequence(byte) ||
31 is_start_of_four_byte_sequence(byte);
32 }
33
34 constexpr static auto is_valid_first_byte(c8 byte) -> bool {
35 return is_start_of_one_byte_sequence(byte) || is_start_of_multi_byte_sequence(byte);
36 }
37
38 constexpr static auto is_valid_second_byte(c8 first_byte, c8 second_byte) -> bool {
39 switch (first_byte) {
40 case 0xE0:
41 return second_byte >= 0xA0 && second_byte <= 0xBF;
42 case 0xED:
43 return second_byte >= 0x80 && second_byte <= 0x9F;
44 case 0xF0:
45 return second_byte >= 0x90 && second_byte <= 0xBF;
46 case 0xF4:
47 return second_byte >= 0x80 && second_byte <= 0x8F;
48 default:
49 return second_byte >= 0x80 && second_byte <= 0xBF;
50 }
51 }
52
53 constexpr static auto is_valid_third_byte([[maybe_unused]] c8 first_byte, c8 third_byte) -> bool {
54 return third_byte >= 0x80 && third_byte <= 0xBF;
55 }
56
57 constexpr static auto is_valid_fourth_byte([[maybe_unused]] c8 first_byte, c8 fourth_byte) -> bool {
58 return fourth_byte >= 0x80 && fourth_byte <= 0xBF;
59 }
60
61 constexpr static auto byte_sequence_length(c8 first_byte) -> u8 {
62 return is_start_of_one_byte_sequence(first_byte) ? 1
63 : is_start_of_two_byte_sequence(first_byte) ? 2
64 : is_start_of_three_byte_sequence(first_byte) ? 3
65 : 4;
66 }
67
68 class Utf8Iterator : public IteratorBase<Utf8Iterator, BidirectionalIteratorTag, c32, ssize_t> {
69 public:
70 Utf8Iterator() = default;
71 constexpr explicit Utf8Iterator(c8 const* data) : m_data(data) {}
72
73 constexpr auto operator*() const -> c32 {
74 auto length = byte_sequence_length(*m_data);
75 auto first_byte_mask = 0b11111111 >> length;
76 auto result = static_cast<c32>(*m_data & first_byte_mask);
77 for (auto i : view::range(1U, length)) {
78 result <<= 6;
79 result |= m_data[i] & 0b00111111;
80 }
81 return result;
82 }
83
84 constexpr void advance_one() { m_data += byte_sequence_length(*m_data); }
85 constexpr void back_one() {
86 do {
87 --m_data;
88 } while (!is_valid_first_byte(*m_data));
89 }
90
91 constexpr auto data() const -> c8 const* { return m_data; }
92
93 constexpr explicit operator c8 const*() const { return data(); }
94
95 private:
96 constexpr friend auto operator==(Utf8Iterator const& a, Utf8Iterator const& b) -> bool {
97 return a.data() == b.data();
98 }
99 constexpr friend auto operator<=>(Utf8Iterator const& a, Utf8Iterator const& b) {
100 return a.data() <=> b.data();
101 }
102
103 c8 const* m_data { nullptr };
104 };
105}
106
108public:
109 using CodeUnit = c8;
110 using CodePoint = c32;
112
113private:
114 template<typename = void>
116 size_t i = 0;
117 while (i < data.size()) {
118 auto first_byte = data.data()[i];
119 if (!utf8::is_valid_first_byte(first_byte)) {
120 return false;
121 }
122 auto length = utf8::byte_sequence_length(first_byte);
123 if (i + length > data.size()) {
124 return false;
125 }
126 switch (length) {
127 case 4:
128 if (!utf8::is_valid_fourth_byte(first_byte, data.data()[i + 3])) {
129 return false;
130 }
131 [[fallthrough]];
132 case 3:
133 if (!utf8::is_valid_third_byte(first_byte, data.data()[i + 2])) {
134 return false;
135 }
136 [[fallthrough]];
137 case 2:
138 if (!utf8::is_valid_second_byte(first_byte, data.data()[i + 1])) {
139 return false;
140 }
141 break;
142 default:
143 break;
144 }
145 i += length;
146 }
147 return true;
148 }
149
151 size_t offset) -> bool {
152 // NOTE: this function can assume the underlying c8 data is valid UTF-8.
153 if (offset >= data.size()) {
154 return offset == data.size();
155 }
156 return utf8::is_valid_first_byte(data[offset]);
157 }
158
159 constexpr friend auto tag_invoke(types::Tag<encoding::convert_to_code_units>, Utf8Encoding const&, c32 code_point) {
161 auto code_point_value = static_cast<u32>(code_point);
162 if (code_point_value <= 0x7F) {
163 (void) result.resize(1);
164 result[0] = code_point_value;
165 } else if (code_point_value <= 0x7FF) {
166 (void) result.resize(2);
167 result[0] = 0b11000000 | (code_point_value >> 6);
168 result[1] = 0b10000000 | (code_point_value & 0x3F);
169 } else if (code_point_value <= 0xFFFF) {
170 (void) result.resize(3);
171 result[0] = 0b11100000 | (code_point_value >> 12);
172 result[1] = 0b10000000 | ((code_point_value >> 6) & 0x3F);
173 result[2] = 0b10000000 | (code_point_value & 0x3F);
174 } else {
175 (void) result.resize(4);
176 result[0] = 0b11110000 | (code_point_value >> 18);
177 result[1] = 0b10000000 | ((code_point_value >> 12) & 0x3F);
178 result[2] = 0b10000000 | ((code_point_value >> 6) & 0x3F);
179 result[3] = 0b10000000 | (code_point_value & 0x3F);
180 }
181 return result;
182 }
183};
184}
Definition static_vector.h:17
Definition utf8_encoding.h:107
utf8::Utf8Iterator Iterator
Definition utf8_encoding.h:111
constexpr friend auto tag_invoke(types::Tag< encoding::convert_to_code_units >, Utf8Encoding const &, c32 code_point)
Definition utf8_encoding.h:159
constexpr friend auto tag_invoke(types::Tag< encoding::validate >, Utf8Encoding const &, Span< c8 const > data) -> bool
Definition utf8_encoding.h:115
constexpr friend auto tag_invoke(types::Tag< encoding::valid_byte_offset >, Utf8Encoding const &, Span< c8 const > data, size_t offset) -> bool
Definition utf8_encoding.h:150
c8 CodeUnit
Definition utf8_encoding.h:109
c32 CodePoint
Definition utf8_encoding.h:110
Definition utf8_encoding.h:68
constexpr void advance_one()
Definition utf8_encoding.h:84
constexpr void back_one()
Definition utf8_encoding.h:85
constexpr auto operator*() const -> c32
Definition utf8_encoding.h:73
constexpr friend auto operator==(Utf8Iterator const &a, Utf8Iterator const &b) -> bool
Definition utf8_encoding.h:96
constexpr auto data() const -> c8 const *
Definition utf8_encoding.h:91
constexpr friend auto operator<=>(Utf8Iterator const &a, Utf8Iterator const &b)
Definition utf8_encoding.h:99
constexpr Utf8Iterator(c8 const *data)
Definition utf8_encoding.h:71
Definition span_forward_declaration.h:10
Definition utf8_encoding.h:12
Definition constant_string_interface.h:31
constexpr auto data(concepts::detail::ConstantString auto const &string)
Definition string_data.h:6
constexpr auto range
Definition range.h:22
char8_t c8
Definition char.h:4
__UINT8_TYPE__ u8
Definition integers.h:9
char32_t c32
Definition char.h:6
__UINT32_TYPE__ u32
Definition integers.h:11
di::meta::Decay< decltype(T)> Tag
Definition tag_invoke.h:28