di 0.1.0
Loading...
Searching...
No Matches
utf8_stream_decoder.h
Go to the documentation of this file.
1#pragma once
2
5
8public:
// Code point (U+FFFD) that this decoder emits in place of input it cannot
// decode: an invalid lead byte, an out-of-range continuation byte, or a
// sequence still pending when flush() is called.
9 constexpr static auto replacement_character = U'\uFFFD';
10
11 // Decode the incoming byte stream as UTF-8. This may buffer code
12 // units if the input doesn't end on a code point boundary.
13 //
14 // Invalid UTF-8 sequences are replaced with replacement characters.
15 constexpr auto decode(Span<byte const> input) -> String {
16 auto result = ""_s;
17 for (auto byte : input) {
18 decode_byte(result, byte);
19 }
20 return result;
21 }
22
23 // Flush any pending data. If there is any pending data, a single
24 // replacement character will be output.
25 constexpr auto flush() -> String {
26 auto result = ""_s;
27 if (m_pending_code_units > 0) {
28 output_code_point(result, replacement_character);
29 }
30 return result;
31 }
32
33private:
// Inclusive range (0x80..0xBF) accepted for a continuation byte when the
// lead byte did not narrow it. decode_byte() restores these bounds after
// every accepted continuation byte.
34 constexpr static auto default_lower_bound = u8(0x80);
35 constexpr static auto default_upper_bound = u8(0xBF);
36
37 constexpr void decode_byte(String& output, byte input) {
38 if (m_pending_code_units == 0) {
39 decode_first_byte(output, input);
40 return;
41 }
42
43 auto input_u8 = di::to_integer<u8>(input);
44 if (!di::between_inclusive(input_u8, m_lower_bound, m_upper_bound)) {
45 output_code_point(output, replacement_character);
46 decode_first_byte(output, input);
47 return;
48 }
49
50 m_lower_bound = default_lower_bound;
51 m_upper_bound = default_upper_bound;
52 m_pending_code_point <<= 6;
53 m_pending_code_point |= di::to_integer<u32>(input & byte(0b0011'1111));
54 if (--m_pending_code_units == 0) {
55 output_code_point(output, m_pending_code_point);
56 }
57 }
58
59 constexpr void decode_first_byte(String& output, byte input) {
60 // Valid ranges come from the Unicode core specification, table 3-7.
61 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
62 auto input_u8 = di::to_integer<u8>(input);
63 if (di::between_inclusive(input_u8, 0x00, 0x7F)) {
64 output_code_point(output, di::to_integer<u32>(input));
65 } else if (di::between_inclusive(input_u8, 0xC2, 0xDF)) {
66 m_pending_code_units = 1;
67 m_pending_code_point = di::to_integer<u32>(input & byte(0x1F));
68 } else if (di::between_inclusive(input_u8, 0xE0, 0xEF)) {
69 m_pending_code_units = 2;
70 if (input == byte(0xE0)) {
71 m_lower_bound = 0xA0;
72 } else if (input == byte(0xED)) {
73 m_upper_bound = 0x9F;
74 }
75 m_pending_code_point = di::to_integer<u32>(input & byte(0x0F));
76 } else if (di::between_inclusive(input_u8, 0xF0, 0xF4)) {
77 m_pending_code_units = 3;
78 if (input == byte(0xF0)) {
79 m_lower_bound = 0x90;
80 } else if (input == byte(0xF4)) {
81 m_upper_bound = 0x8F;
82 }
83 m_pending_code_point = di::to_integer<u32>(input & byte(0x07));
84 } else {
85 output_code_point(output, replacement_character);
86 }
87 }
88
89 constexpr void output_code_point(String& output, c32 code_point) {
90 output.push_back(code_point);
91
92 // Reset state.
93 *this = {};
94 }
95
// Number of continuation bytes still needed to finish the sequence in
// progress; 0 means no sequence is in progress.
96 u8 m_pending_code_units { 0 };
// Payload bits collected so far for the sequence in progress.
97 u32 m_pending_code_point { 0 };
// Inclusive range the next continuation byte must fall in. Narrowed by
// certain lead bytes in decode_first_byte() and restored to the defaults
// once a continuation byte is accepted.
98 u8 m_lower_bound { default_lower_bound };
99 u8 m_upper_bound { default_upper_bound };
100};
101}
102
103namespace di {
105}
static constexpr auto replacement_character
Definition utf8_stream_decoder.h:9
Definition utf8_stream_decoder.h:7
constexpr auto flush() -> String
Definition utf8_stream_decoder.h:25
static constexpr auto replacement_character
Definition utf8_stream_decoder.h:9
constexpr auto decode(Span< byte const > input) -> String
Definition utf8_stream_decoder.h:15
Definition span_forward_declaration.h:10
Definition utf8_encoding.h:12
string::StringImpl< string::Utf8Encoding > String
Definition string.h:11
__UINT8_TYPE__ u8
Definition integers.h:9
char32_t c32
Definition char.h:6
__UINT32_TYPE__ u32
Definition integers.h:11
Definition any_storable.h:9
constexpr auto between_inclusive
Definition between_inclusive.h:23