summaryrefslogtreecommitdiff
path: root/src/rexmpp_utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/rexmpp_utf8.h')
-rw-r--r--src/rexmpp_utf8.h93
1 files changed, 93 insertions, 0 deletions
diff --git a/src/rexmpp_utf8.h b/src/rexmpp_utf8.h
new file mode 100644
index 0000000..4096aac
--- /dev/null
+++ b/src/rexmpp_utf8.h
@@ -0,0 +1,93 @@
+/**
+ @file rexmpp_utf8.h
+ @brief UTF-8 helper functions
+ @author defanor <defanor@uberspace.net>
+ @date 2023
+ @copyright MIT license.
+*/
+
+#ifndef REXMPP_UTF8_H
+#define REXMPP_UTF8_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef HAVE_ICU
+
+#include <unicode/utf8.h>
+#define REXMPP_U8_NEXT U8_NEXT
+
+#else
+
+#define REXMPP_U8_NEXT(str, pos, len, out) \
+ rexmpp_utf8_next(str, &pos, len, &out);
+
+/**
+ @brief Similar to libicu's U8_NEXT macros: reads a single UTF-8
+ code point, advances the position.
+ @param[in] str A string to read.
+ @param[in,out] pos Byte position within the string. Advanced by the
+ number of bytes read to produce a code point, not advanced on
+ failure.
+ @param[in] len String length.
+ @param[in,out] out A pointer to the location for writing the code
+ point.
+ @returns 0 on failure, 1 on success.
+*/
+inline static
+void rexmpp_utf8_next (const uint8_t *str,
+ size_t *pos,
+ size_t len,
+ int32_t *out)
+{
+ if (*pos >= len) {
+ *out = -1;
+ return;
+ }
+
+ if ((str[*pos] & 0x80) == 0
+ && *pos + 1 <= len)
+ /* U+0000 to U+007F: 0xxxxxxx */
+ {
+ *out = str[*pos];
+ *pos = *pos + 1;
+ } else if ((str[*pos] & 0xe0) == 0xc0
+ && *pos + 2 <= len
+ && (str[*pos + 1] & 0xc0) == 0x80)
+ /* U+0080 to U+07FF: 110xxxxx 10xxxxxx */
+ {
+ *out = (((int32_t)(str[*pos] & 0x1f) << 6)
+ | ((int32_t)str[*pos + 1] & 0x3f));
+ *pos = *pos + 2;
+ } else if ((str[*pos] & 0xf0) == 0xe0
+ && *pos + 3 <= len
+ && (str[*pos + 1] & 0xc0) == 0x80
+ && (str[*pos + 2] & 0xc0) == 0x80)
+ /* U+0800 to U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
+ {
+ *out = (((((int32_t)(str[*pos] & 0xf) << 6)
+ | ((int32_t)str[*pos + 1] & 0x3f)) << 6)
+ | ((int32_t)str[*pos + 2] & 0x3f));
+ *pos = *pos + 3;
+ } else if ((str[*pos] & 0xf8) == 0xf0
+ && *pos + 4 <= len
+ && (str[*pos + 1] & 0xc0) == 0x80
+ && (str[*pos + 2] & 0xc0) == 0x80
+ && (str[*pos + 3] & 0xc0) == 0x80)
+ /* U+10000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ {
+ *out = (((((((int32_t)(str[*pos] & 7) << 6)
+ | ((int32_t)str[*pos + 1] & 0x3f)) << 6)
+ | (((int32_t)str[*pos + 2] & 0x3f))) << 6)
+ | ((int32_t)str[*pos + 3] & 0x3f));
+ *pos = *pos + 4;
+ } else
+ /* Invalid UTF-8 */
+ {
+ *out = -1;
+ }
+}
+
+#endif /* HAVE_ICU */
+
+#endif /* REXMPP_UTF8_H */