di 0.1.0
Loading...
Searching...
No Matches
utf8_strict_stream_decoder.h
Go to the documentation of this file.
1#pragma once
2
5
8public:
9 // Decode the incoming byte stream as UTF-8. This may buffer code
10 // units if the input doesn't end on a code point boundary.
11 constexpr auto decode(byte input) -> Result<Optional<c32>> { return decode_byte(input); }
12
13 // Flush any pending data. If there is any pending data, the input is
14 // invalid utf8.
15 constexpr auto flush() -> Result<> {
16 if (m_pending_code_units > 0) {
17 return Unexpected(BasicError::InvalidArgument);
18 }
19 return {};
20 }
21
22private:
23 constexpr static auto default_lower_bound = u8(0x80);
24 constexpr static auto default_upper_bound = u8(0xBF);
25
26 constexpr auto decode_byte(byte input) -> Result<Optional<c32>> {
27 if (m_pending_code_units == 0) {
28 return decode_first_byte(input);
29 }
30
31 auto input_u8 = di::to_integer<u8>(input);
32 if (!di::between_inclusive(input_u8, m_lower_bound, m_upper_bound)) {
33 return Unexpected(BasicError::InvalidArgument);
34 }
35
36 m_lower_bound = default_lower_bound;
37 m_upper_bound = default_upper_bound;
38 m_pending_code_point <<= 6;
39 m_pending_code_point |= di::to_integer<u32>(input & byte(0b0011'1111));
40 if (--m_pending_code_units == 0) {
41 return output_code_point(m_pending_code_point);
42 }
43 return {};
44 }
45
46 constexpr auto decode_first_byte(byte input) -> Result<Optional<c32>> {
47 // Valid ranges come from the Unicode core specification, table 3-7.
48 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
49 auto input_u8 = di::to_integer<u8>(input);
50 if (di::between_inclusive(input_u8, 0x00, 0x7F)) {
51 return output_code_point(di::to_integer<u32>(input));
52 }
53 if (di::between_inclusive(input_u8, 0xC2, 0xDF)) {
54 m_pending_code_units = 1;
55 m_pending_code_point = di::to_integer<u32>(input & byte(0x1F));
56 return {};
57 }
58 if (di::between_inclusive(input_u8, 0xE0, 0xEF)) {
59 m_pending_code_units = 2;
60 if (input == byte(0xE0)) {
61 m_lower_bound = 0xA0;
62 } else if (input == byte(0xED)) {
63 m_upper_bound = 0x9F;
64 }
65 m_pending_code_point = di::to_integer<u32>(input & byte(0x0F));
66 return {};
67 }
68 if (di::between_inclusive(input_u8, 0xF0, 0xF4)) {
69 m_pending_code_units = 3;
70 if (input == byte(0xF0)) {
71 m_lower_bound = 0x90;
72 } else if (input == byte(0xF4)) {
73 m_upper_bound = 0x8F;
74 }
75 m_pending_code_point = di::to_integer<u32>(input & byte(0x07));
76 return {};
77 }
78 return Unexpected(BasicError::InvalidArgument);
79 }
80
81 constexpr auto output_code_point(c32 code_point) -> c32 {
82 // Reset state.
83 *this = {};
84
85 return code_point;
86 }
87
88 u8 m_pending_code_units { 0 };
89 u32 m_pending_code_point { 0 };
90 u8 m_lower_bound { default_lower_bound };
91 u8 m_upper_bound { default_upper_bound };
92};
93}
94
95namespace di {
97}
Definition utf8_strict_stream_decoder.h:7
constexpr auto flush() -> Result<>
Definition utf8_strict_stream_decoder.h:15
constexpr auto decode(byte input) -> Result< Optional< c32 > >
Definition utf8_strict_stream_decoder.h:11
Definition unexpected.h:14
Definition utf8_encoding.h:12
constexpr auto code_point
Definition code_point_parser.h:35
__UINT8_TYPE__ u8
Definition integers.h:9
char32_t c32
Definition char.h:6
__UINT32_TYPE__ u32
Definition integers.h:11
Expected< T, Error > Result
Definition result.h:8
Unexpected(E &&) -> Unexpected< meta::UnwrapRefDecay< E > >
Definition any_storable.h:9
constexpr auto between_inclusive
Definition between_inclusive.h:23