aboutsummaryrefslogtreecommitdiffstats
path: root/src/common/utf8.c
diff options
context:
space:
mode:
author: Rafael G. Martins <rafael@rafaelmartins.eng.br> 2020-11-05 19:13:47 +0100
committer: Rafael G. Martins <rafael@rafaelmartins.eng.br> 2020-11-05 19:13:49 +0100
commit: de14affe2e316f00663759100d658731fb8fc0ca (patch)
tree: b31937440eedb113f5e936a23c941c7831ee9c23 /src/common/utf8.c
parent: 39986b29f34795d346b8feb10ce4fe9caac8878e (diff)
download: blogc-de14affe2e316f00663759100d658731fb8fc0ca.tar.gz
blogc-de14affe2e316f00663759100d658731fb8fc0ca.tar.bz2
blogc-de14affe2e316f00663759100d658731fb8fc0ca.zip
common: utf: simplified utf-8 validation
we don't need to evaluate codepoints, just to check if the byte sequences are valid.
Diffstat (limited to 'src/common/utf8.c')
-rw-r--r-- src/common/utf8.c 16
1 files changed, 1 insertions, 15 deletions
diff --git a/src/common/utf8.c b/src/common/utf8.c
index f029e5b..df5e2d2 100644
--- a/src/common/utf8.c
+++ b/src/common/utf8.c
@@ -56,27 +56,13 @@ static const uint8_t utf8d[] = {
};
-static uint32_t inline
-decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
- uint32_t type = utf8d[byte];
-
- *codep = (*state != UTF8_ACCEPT) ?
- (byte & 0x3fu) | (*codep << 6) :
- (0xff >> type) & (byte);
-
- *state = utf8d[256 + *state + type];
- return *state;
-}
-
-
bool
bc_utf8_validate(const uint8_t *str, size_t len)
{
- uint32_t codepoint;
uint32_t state = 0;
for (size_t i = 0; i < len; i++)
- decode(&state, &codepoint, str[i]);
+ state = utf8d[256 + state + utf8d[str[i]]];
return state == UTF8_ACCEPT;
}