From de14affe2e316f00663759100d658731fb8fc0ca Mon Sep 17 00:00:00 2001
From: "Rafael G. Martins" <rafael@rafaelmartins.eng.br>
Date: Thu, 5 Nov 2020 19:13:47 +0100
Subject: common: utf: simplified utf-8 validation

We don't need to decode codepoints; we only need to check whether the
byte sequences are valid.
---
 src/common/utf8.c         | 16 +---------------
 tests/common/check_utf8.c |  4 ++++
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/src/common/utf8.c b/src/common/utf8.c
index f029e5b..df5e2d2 100644
--- a/src/common/utf8.c
+++ b/src/common/utf8.c
@@ -56,27 +56,13 @@ static const uint8_t utf8d[] = {
 };
 
 
-static uint32_t inline
-decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
-    uint32_t type = utf8d[byte];
-
-    *codep = (*state != UTF8_ACCEPT) ?
-        (byte & 0x3fu) | (*codep << 6) :
-        (0xff >> type) & (byte);
-
-    *state = utf8d[256 + *state + type];
-    return *state;
-}
-
-
 bool
 bc_utf8_validate(const uint8_t *str, size_t len)
 {
-    uint32_t codepoint;
     uint32_t state = 0;
 
     for (size_t i = 0; i < len; i++)
-        decode(&state, &codepoint, str[i]);
+        state = utf8d[256 + state + utf8d[str[i]]];
 
     return state == UTF8_ACCEPT;
 }
diff --git a/tests/common/check_utf8.c b/tests/common/check_utf8.c
index 05bf566..803818b 100644
--- a/tests/common/check_utf8.c
+++ b/tests/common/check_utf8.c
@@ -39,6 +39,10 @@ test_utf8_invalid(void **state)
     assert_false(bc_utf8_validate(c, 4));
     const uint8_t d[8] = {0xff, 0xfe, 0x00, 0x00, 0xac, 0x20, 0x00, 0x00};  // utf-32
     assert_false(bc_utf8_validate(d, 8));
+    const uint8_t e[6] = {'a', 0xff, 0xfe, 0xac, 0x20, 'b'};  // utf-16
+    assert_false(bc_utf8_validate(e, 6));
+    const uint8_t f[10] = {'a', 0xff, 0xfe, 0x00, 0x00, 0xac, 0x20, 0x00, 0x00, 'b'};  // utf-32
+    assert_false(bc_utf8_validate(f, 10));
 }
 
 
-- 
cgit v1.2.3-18-g5258