From de14affe2e316f00663759100d658731fb8fc0ca Mon Sep 17 00:00:00 2001
From: "Rafael G. Martins"
Date: Thu, 5 Nov 2020 19:13:47 +0100
Subject: common: utf: simplified utf-8 validation

we don't need to evaluate codepoints, just to check if the byte sequences are
valid.
---
 src/common/utf8.c         | 16 +---------------
 tests/common/check_utf8.c |  4 ++++
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/src/common/utf8.c b/src/common/utf8.c
index f029e5b..df5e2d2 100644
--- a/src/common/utf8.c
+++ b/src/common/utf8.c
@@ -56,27 +56,13 @@ static const uint8_t utf8d[] = {
 };
 
 
-static uint32_t inline
-decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
-    uint32_t type = utf8d[byte];
-
-    *codep = (*state != UTF8_ACCEPT) ?
-        (byte & 0x3fu) | (*codep << 6) :
-        (0xff >> type) & (byte);
-
-    *state = utf8d[256 + *state + type];
-    return *state;
-}
-
-
 bool
 bc_utf8_validate(const uint8_t *str, size_t len)
 {
-    uint32_t codepoint;
     uint32_t state = 0;
 
     for (size_t i = 0; i < len; i++)
-        decode(&state, &codepoint, str[i]);
+        state = utf8d[256 + state + utf8d[str[i]]];
 
     return state == UTF8_ACCEPT;
 }
diff --git a/tests/common/check_utf8.c b/tests/common/check_utf8.c
index 05bf566..803818b 100644
--- a/tests/common/check_utf8.c
+++ b/tests/common/check_utf8.c
@@ -39,6 +39,10 @@ test_utf8_invalid(void **state)
     assert_false(bc_utf8_validate(c, 4));
     const uint8_t d[8] = {0xff, 0xfe, 0x00, 0x00, 0xac, 0x20, 0x00, 0x00};  // utf-32
     assert_false(bc_utf8_validate(d, 8));
+    const uint8_t e[6] = {'a', 0xff, 0xfe, 0xac, 0x20, 'b'};  // utf-16
+    assert_false(bc_utf8_validate(e, 6));
+    const uint8_t f[10] = {'a', 0xff, 0xfe, 0x00, 0x00, 0xac, 0x20, 0x00, 0x00, 'b'};  // utf-32
+    assert_false(bc_utf8_validate(f, 10));
 }
 
 
-- 
cgit v1.2.3-18-g5258