diff options
| -rw-r--r-- | src/file.c | 14 | ||||
| -rw-r--r-- | src/utf8.c | 13 | ||||
| -rw-r--r-- | src/utf8.h | 1 | ||||
| -rw-r--r-- | tests/check_utf8.c | 22 | 
4 files changed, 48 insertions, 2 deletions
| @@ -41,11 +41,23 @@ blogc_file_get_contents(const char *path, size_t *len, blogc_error_t **err)      sb_string_t *str = sb_string_new();      char buffer[BLOGC_FILE_CHUNK_SIZE]; +    char *tmp;      while (!feof(fp)) {          size_t read_len = fread(buffer, sizeof(char), BLOGC_FILE_CHUNK_SIZE, fp); + +        tmp = buffer; + +        if (str->len == 0 && read_len > 0) { +            // skipping BOM before validation, for performance. should be safe +            // enough +            size_t skip = blogc_utf8_skip_bom((uint8_t*) buffer, read_len); +            read_len -= skip; +            tmp += skip; +        } +          *len += read_len; -        sb_string_append_len(str, buffer, read_len); +        sb_string_append_len(str, tmp, read_len);      }      fclose(fp); @@ -86,3 +86,16 @@ blogc_utf8_validate_str(sb_string_t *str)  {      return blogc_utf8_validate((uint8_t*) str->str, str->len);  } + + +size_t +blogc_utf8_skip_bom(const uint8_t *str, size_t len) +{ +    if (len < 3) +        return 0; + +    if (str[0] == 0xef && str[1] == 0xbb && str[2] == 0xbf) +        return 3; + +    return 0; +} @@ -15,5 +15,6 @@  bool blogc_utf8_validate(const uint8_t *str, size_t len);  bool blogc_utf8_validate_str(sb_string_t *str); +size_t blogc_utf8_skip_bom(const uint8_t *str, size_t len);  #endif /* _UTF_8_H */ diff --git a/tests/check_utf8.c b/tests/check_utf8.c index b0dec4e..9f98886 100644 --- a/tests/check_utf8.c +++ b/tests/check_utf8.c @@ -18,6 +18,8 @@  #include "../src/utf8.h"  #include "../src/utils.h" +// this file MUST be ASCII +  static void  test_utf8_valid(void **state) @@ -25,8 +27,10 @@ test_utf8_valid(void **state)      const char *c = "<a href=\"{{ BASE_URL }}/page/{{ PREVIOUS_PAGE }}/\">"          "\xc2\xab Newer posts</a>";      assert_true(blogc_utf8_validate((uint8_t*) c, strlen(c))); -    const uint8_t d[3] = {0xe2, 0x82, 0xac}; +    const uint8_t d[3] = {0xe2, 0x82, 0xac};  // euro sign      assert_true(blogc_utf8_validate(d, 3)); +    const uint8_t e[3] = {0xef, 0xbb, 0xbf};  // utf-8 bom +    assert_true(blogc_utf8_validate(e, 3));  } @@ -70,6 +74,21 @@ test_utf8_invalid_str(void **state)  } +static void +test_utf8_skip_bom(void **state) +{ +    const char c[4] = {0xef, 0xbb, 0xbf, 0}; +    assert_int_equal(blogc_utf8_skip_bom(c, 2), 0); +    assert_int_equal(blogc_utf8_skip_bom(c, 3), 3); +    assert_string_equal(c + 3, ""); +    const char d[8] = {0xef, 0xbb, 0xbf, 'b', 'o', 'l', 'a', 0}; +    assert_int_equal(blogc_utf8_skip_bom(d, 8), 3); +    assert_string_equal(d + 3, "bola"); +    const char e[5] = "bola"; +    assert_int_equal(blogc_utf8_skip_bom(e, 4), 0); +} + +  int  main(void)  { @@ -78,6 +97,7 @@ main(void)          unit_test(test_utf8_invalid),          unit_test(test_utf8_valid_str),          unit_test(test_utf8_invalid_str), +        unit_test(test_utf8_skip_bom),      };      return run_tests(tests);  } | 
