summaryrefslogtreecommitdiff
path: root/src/rexmpp_utf8.h
blob: 4096aac5598ca62341f398ec7f074fb8a47b25f1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
/**
   @file rexmpp_utf8.h
   @brief UTF-8 helper functions
   @author defanor <defanor@uberspace.net>
   @date 2023
   @copyright MIT license.
*/

#ifndef REXMPP_UTF8_H
#define REXMPP_UTF8_H

#include <stddef.h>
#include <stdint.h>

#ifdef HAVE_ICU

#include <unicode/utf8.h>
/* When HAVE_ICU is defined, REXMPP_U8_NEXT is a plain alias for the
   U8_NEXT macro provided by <unicode/utf8.h>; no local decoder is
   compiled in that configuration. */
#define REXMPP_U8_NEXT U8_NEXT

#else

/* Fallback used when ICU is not available: forwards to
   rexmpp_utf8_next with the same argument order as U8_NEXT. "pos" and
   "out" must be lvalues (a size_t and an int32_t respectively), since
   their addresses are taken.

   The expansion intentionally carries no trailing semicolon, so that
   "REXMPP_U8_NEXT(s, i, n, c);" is exactly one statement and remains
   safe as the body of an unbraced if/else. Arguments are
   parenthesized to keep operator precedence intact for arbitrary
   argument expressions. */
#define REXMPP_U8_NEXT(str, pos, len, out) \
  rexmpp_utf8_next((str), &(pos), (len), &(out))

/**
   @brief Similar to libicu's U8_NEXT macro: reads a single UTF-8
   code point and advances the position.

   Only well-formed UTF-8 is accepted: besides truncated sequences and
   bad lead/continuation bytes, overlong encodings, UTF-16 surrogates
   (U+D800 to U+DFFF), and values above U+10FFFF are rejected.

   @param[in] str A string to read.
   @param[in,out] pos Byte position within the string. Advanced by the
   number of bytes read to produce a code point, not advanced on
   failure (so a caller looping over a string must stop or skip on
   failure by itself).
   @param[in] len String length, in bytes.
   @param[out] out A pointer to the location for writing the code
   point. Set to -1 on failure (including when the position is at or
   past the end of the string), and to the decoded code point on
   success.
*/
static inline
void rexmpp_utf8_next (const uint8_t *str,
                       size_t *pos,
                       size_t len,
                       int32_t *out)
{
  const uint8_t *seq;
  size_t avail, need, i;
  int32_t cp, min_cp;

  /* Failure is the default outcome; only a fully validated sequence
     overwrites it. */
  *out = -1;

  if (*pos >= len) {
    return;
  }

  seq = str + *pos;
  /* *pos < len holds here, so the subtraction cannot wrap around
     (unlike "*pos + n <= len" for a huge len). */
  avail = len - *pos;

  /* Classify the lead byte: sequence length, payload bits, and the
     smallest code point that legitimately needs that length. */
  if ((seq[0] & 0x80) == 0) {
    /* U+0000 to U+007F: 0xxxxxxx */
    need = 1;
    cp = seq[0];
    min_cp = 0;
  } else if ((seq[0] & 0xe0) == 0xc0) {
    /* U+0080 to U+07FF: 110xxxxx 10xxxxxx */
    need = 2;
    cp = seq[0] & 0x1f;
    min_cp = 0x80;
  } else if ((seq[0] & 0xf0) == 0xe0) {
    /* U+0800 to U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
    need = 3;
    cp = seq[0] & 0x0f;
    min_cp = 0x800;
  } else if ((seq[0] & 0xf8) == 0xf0) {
    /* U+10000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    need = 4;
    cp = seq[0] & 0x07;
    min_cp = 0x10000;
  } else {
    /* Stray continuation byte or an invalid lead byte */
    return;
  }

  if (avail < need) {
    /* Truncated sequence */
    return;
  }

  for (i = 1; i < need; i++) {
    if ((seq[i] & 0xc0) != 0x80) {
      /* Not a continuation byte */
      return;
    }
    cp = (cp << 6) | (int32_t)(seq[i] & 0x3f);
  }

  if (cp < min_cp                        /* overlong encoding */
      || cp > 0x10ffff                   /* beyond the Unicode range */
      || (cp >= 0xd800 && cp <= 0xdfff)) /* UTF-16 surrogate */
    {
      return;
    }

  *out = cp;
  *pos += need;
}

#endif  /* HAVE_ICU */

#endif  /* REXMPP_UTF8_H */