From 1099a4d991942655c0291a74b488322d5da533bd Mon Sep 17 00:00:00 2001
From: "Rafael G. Martins"
Date: Sun, 3 Jul 2016 21:52:51 +0200
Subject: utf8: skip BOM, if found

---
 src/file.c         | 14 +++++++++++++-
 src/utf8.c         | 13 +++++++++++++
 src/utf8.h         |  1 +
 tests/check_utf8.c | 22 +++++++++++++++++++++-
 4 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/src/file.c b/src/file.c
index 7171f31..6da94a2 100644
--- a/src/file.c
+++ b/src/file.c
@@ -41,11 +41,23 @@ blogc_file_get_contents(const char *path, size_t *len, blogc_error_t **err)
 
     sb_string_t *str = sb_string_new();
     char buffer[BLOGC_FILE_CHUNK_SIZE];
+    char *tmp;
 
     while (!feof(fp)) {
         size_t read_len = fread(buffer, sizeof(char), BLOGC_FILE_CHUNK_SIZE, fp);
+
+        tmp = buffer;
+
+        if (str->len == 0 && read_len > 0) {
+            // skipping BOM before validation, for performance. should be safe
+            // enough
+            size_t skip = blogc_utf8_skip_bom((uint8_t*) buffer, read_len);
+            read_len -= skip;
+            tmp += skip;
+        }
+
         *len += read_len;
-        sb_string_append_len(str, buffer, read_len);
+        sb_string_append_len(str, tmp, read_len);
     }
     fclose(fp);
 
diff --git a/src/utf8.c b/src/utf8.c
index deea46d..0c04d60 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -86,3 +86,16 @@ blogc_utf8_validate_str(sb_string_t *str)
 {
     return blogc_utf8_validate((uint8_t*) str->str, str->len);
 }
+
+
+size_t
+blogc_utf8_skip_bom(const uint8_t *str, size_t len)
+{
+    if (len < 3)
+        return 0;
+
+    if (str[0] == 0xef && str[1] == 0xbb && str[2] == 0xbf)
+        return 3;
+
+    return 0;
+}
diff --git a/src/utf8.h b/src/utf8.h
index 837d02f..582ae1c 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -15,5 +15,6 @@
 
 bool blogc_utf8_validate(const uint8_t *str, size_t len);
 bool blogc_utf8_validate_str(sb_string_t *str);
+size_t blogc_utf8_skip_bom(const uint8_t *str, size_t len);
 
 #endif /* _UTF_8_H */
diff --git a/tests/check_utf8.c b/tests/check_utf8.c
index b0dec4e..9f98886 100644
--- a/tests/check_utf8.c
+++ b/tests/check_utf8.c
@@ -18,6 +18,8 @@
 #include "../src/utf8.h"
 #include "../src/utils.h"
 
+// this file MUST be ASCII
+
 
 
 static void
@@ -25,8 +27,10 @@ test_utf8_valid(void **state)
     const char *c = ""
         "\xc2\xab Newer posts";
     assert_true(blogc_utf8_validate((uint8_t*) c, strlen(c)));
-    const uint8_t d[3] = {0xe2, 0x82, 0xac};
+    const uint8_t d[3] = {0xe2, 0x82, 0xac}; // euro sign
     assert_true(blogc_utf8_validate(d, 3));
+    const uint8_t e[3] = {0xef, 0xbb, 0xbf}; // utf-8 bom
+    assert_true(blogc_utf8_validate(e, 3));
 }
 
 
@@ -70,6 +74,21 @@ test_utf8_invalid_str(void **state)
 }
 
 
+static void
+test_utf8_skip_bom(void **state)
+{
+    const char c[4] = {0xef, 0xbb, 0xbf, 0};
+    assert_int_equal(blogc_utf8_skip_bom(c, 2), 0);
+    assert_int_equal(blogc_utf8_skip_bom(c, 3), 3);
+    assert_string_equal(c + 3, "");
+    const char d[8] = {0xef, 0xbb, 0xbf, 'b', 'o', 'l', 'a', 0};
+    assert_int_equal(blogc_utf8_skip_bom(d, 8), 3);
+    assert_string_equal(d + 3, "bola");
+    const char e[5] = "bola";
+    assert_int_equal(blogc_utf8_skip_bom(e, 4), 0);
+}
+
+
 int
 main(void)
 {
@@ -78,6 +97,7 @@ main(void)
         unit_test(test_utf8_invalid),
         unit_test(test_utf8_valid_str),
         unit_test(test_utf8_invalid_str),
+        unit_test(test_utf8_skip_bom),
     };
     return run_tests(tests);
 }
-- 
cgit v1.2.3-18-g5258