1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
/**
@file rexmpp_utf8.h
@brief UTF-8 helper functions
@author defanor <defanor@uberspace.net>
@date 2023
@copyright MIT license.
*/
#ifndef REXMPP_UTF8_H
#define REXMPP_UTF8_H
#include <stddef.h>
#include <stdint.h>
#ifdef HAVE_ICU
#include <unicode/utf8.h>
#define REXMPP_U8_NEXT U8_NEXT
#else
#define REXMPP_U8_NEXT(str, pos, len, out) \
rexmpp_utf8_next(str, &pos, len, &out);
/**
@brief Similar to libicu's U8_NEXT macros: reads a single UTF-8
code point, advances the position.
@param[in] str A string to read.
@param[in,out] pos Byte position within the string. Advanced by the
number of bytes read to produce a code point, not advanced on
failure.
@param[in] len String length.
@param[in,out] out A pointer to the location for writing the code
point.
@returns 0 on failure, 1 on success.
*/
inline static
void rexmpp_utf8_next (const uint8_t *str,
size_t *pos,
size_t len,
int32_t *out)
{
if (*pos >= len) {
*out = -1;
return;
}
if ((str[*pos] & 0x80) == 0
&& *pos + 1 <= len)
/* U+0000 to U+007F: 0xxxxxxx */
{
*out = str[*pos];
*pos = *pos + 1;
} else if ((str[*pos] & 0xe0) == 0xc0
&& *pos + 2 <= len
&& (str[*pos + 1] & 0xc0) == 0x80)
/* U+0080 to U+07FF: 110xxxxx 10xxxxxx */
{
*out = (((int32_t)(str[*pos] & 0x1f) << 6)
| ((int32_t)str[*pos + 1] & 0x3f));
*pos = *pos + 2;
} else if ((str[*pos] & 0xf0) == 0xe0
&& *pos + 3 <= len
&& (str[*pos + 1] & 0xc0) == 0x80
&& (str[*pos + 2] & 0xc0) == 0x80)
/* U+0800 to U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
{
*out = (((((int32_t)(str[*pos] & 0xf) << 6)
| ((int32_t)str[*pos + 1] & 0x3f)) << 6)
| ((int32_t)str[*pos + 2] & 0x3f));
*pos = *pos + 3;
} else if ((str[*pos] & 0xf8) == 0xf0
&& *pos + 4 <= len
&& (str[*pos + 1] & 0xc0) == 0x80
&& (str[*pos + 2] & 0xc0) == 0x80
&& (str[*pos + 3] & 0xc0) == 0x80)
/* U+10000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
{
*out = (((((((int32_t)(str[*pos] & 7) << 6)
| ((int32_t)str[*pos + 1] & 0x3f)) << 6)
| (((int32_t)str[*pos + 2] & 0x3f))) << 6)
| ((int32_t)str[*pos + 3] & 0x3f));
*pos = *pos + 4;
} else
/* Invalid UTF-8 */
{
*out = -1;
}
}
#endif /* HAVE_ICU */
#endif /* REXMPP_UTF8_H */
|