diff options
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- | src/utf8.c | 102 |
1 files changed, 0 insertions, 102 deletions
diff --git a/src/utf8.c b/src/utf8.c deleted file mode 100644 index a2f4fdd..0000000 --- a/src/utf8.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * blogc: A blog compiler. - * Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> - * Copyright (c) 2016 Rafael G. Martins <rafael@rafaelmartins.eng.br> - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -// Based on Bjoern Hoehrmann's algorithm. -// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. - -#include <stdbool.h> -#include <stddef.h> -#include <stdint.h> -#include "utils.h" - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 12 - - -static const uint8_t utf8d[] = { - // The first part of the table maps bytes to character classes that - // to reduce the size of the transition table and create bitmasks. - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, - - // The second part is a transition table that maps a combination - // of a state of the automaton and a character class to a state. - 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, -}; - - -static uint32_t inline -decode(uint32_t* state, uint32_t* codep, uint32_t byte) { - uint32_t type = utf8d[byte]; - - *codep = (*state != UTF8_ACCEPT) ? - (byte & 0x3fu) | (*codep << 6) : - (0xff >> type) & (byte); - - *state = utf8d[256 + *state + type]; - return *state; -} - - -bool -blogc_utf8_validate(const uint8_t *str, size_t len) -{ - uint32_t codepoint; - uint32_t state = 0; - - for (size_t i = 0; i < len; i++) - decode(&state, &codepoint, str[i]); - - return state == UTF8_ACCEPT; -} - - -bool -blogc_utf8_validate_str(sb_string_t *str) -{ - return blogc_utf8_validate((uint8_t*) str->str, str->len); -} - - -size_t -blogc_utf8_skip_bom(const uint8_t *str, size_t len) -{ - if (len < 3) - return 0; - - if (str[0] == 0xef && str[1] == 0xbb && str[2] == 0xbf) - return 3; - - return 0; -} |