utf8: skip BOM, if found

author: Rafael G. Martins <rafael@rafaelmartins.eng.br> 2016-07-03 21:52:51 +0200
committer: Rafael G. Martins <rafael@rafaelmartins.eng.br> 2016-07-04 01:07:34 +0200
commit: 1099a4d991942655c0291a74b488322d5da533bd (patch)
tree: 0a91f413d3675122a15f49a87fd9413baf46c655
parent: 4aac65c4b8f2d8415ca8d9d8449e0158e0ff1e9c (diff)
download: blogc-1099a4d991942655c0291a74b488322d5da533bd.tar.gz
blogc-1099a4d991942655c0291a74b488322d5da533bd.tar.bz2
blogc-1099a4d991942655c0291a74b488322d5da533bd.zip
4 files changed, 48 insertions, 2 deletions
diff --git a/src/file.c b/src/file.c
index 7171f31..6da94a2 100644
--- a/src/file.c
+++ b/src/file.c
@@ -41,11 +41,23 @@ blogc_file_get_contents(const char *path, size_t *len, blogc_error_t **err)
 
     sb_string_t *str = sb_string_new();
     char buffer[BLOGC_FILE_CHUNK_SIZE];
+    char *tmp;
 
     while (!feof(fp)) {
         size_t read_len = fread(buffer, sizeof(char), BLOGC_FILE_CHUNK_SIZE, fp);
+
+        tmp = buffer;
+
+        if (str->len == 0 && read_len > 0) {
+            // skipping BOM before validation, for performance. should be safe
+            // enough
+            size_t skip = blogc_utf8_skip_bom((uint8_t*) buffer, read_len);
+            read_len -= skip;
+            tmp += skip;
+        }
+
         *len += read_len;
-        sb_string_append_len(str, buffer, read_len);
+        sb_string_append_len(str, tmp, read_len);
     }
     fclose(fp);
 
diff --git a/src/utf8.c b/src/utf8.c
index deea46d..0c04d60 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -86,3 +86,16 @@ blogc_utf8_validate_str(sb_string_t *str)
 {
     return blogc_utf8_validate((uint8_t*) str->str, str->len);
 }
+
+
+size_t
+blogc_utf8_skip_bom(const uint8_t *str, size_t len)  // returns 3 if str starts with the utf-8 bom, else 0
+{
+    if (len < 3)  // fewer bytes than the 3-byte bom: nothing to skip
+        return 0;
+
+    if (str[0] == 0xef && str[1] == 0xbb && str[2] == 0xbf)  // EF BB BF is the utf-8 bom
+        return 3;  // number of leading bytes the caller should skip
+
+    return 0;  // no bom at the start of the buffer
+}
diff --git a/src/utf8.h b/src/utf8.h
index 837d02f..582ae1c 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -15,5 +15,6 @@
 
 bool blogc_utf8_validate(const uint8_t *str, size_t len);
 bool blogc_utf8_validate_str(sb_string_t *str);
+size_t blogc_utf8_skip_bom(const uint8_t *str, size_t len);
 
 #endif /* _UTF_8_H */
diff --git a/tests/check_utf8.c b/tests/check_utf8.c
index b0dec4e..9f98886 100644
--- a/tests/check_utf8.c
+++ b/tests/check_utf8.c
@@ -18,6 +18,8 @@
 #include "../src/utf8.h"
 #include "../src/utils.h"
 
+// this file MUST be ASCII
+
 
 static void
 test_utf8_valid(void **state)
@@ -25,8 +27,10 @@ test_utf8_valid(void **state)
     const char *c = "<a href=\"{{ BASE_URL }}/page/{{ PREVIOUS_PAGE }}/\">"
         "\xc2\xab Newer posts</a>";
     assert_true(blogc_utf8_validate((uint8_t*) c, strlen(c)));
-    const uint8_t d[3] = {0xe2, 0x82, 0xac};
+    const uint8_t d[3] = {0xe2, 0x82, 0xac};  // euro sign
     assert_true(blogc_utf8_validate(d, 3));
+    const uint8_t e[3] = {0xef, 0xbb, 0xbf};  // utf-8 bom
+    assert_true(blogc_utf8_validate(e, 3));
 }
 
 
@@ -70,6 +74,21 @@ test_utf8_invalid_str(void **state)
 }
 
 
+static void
+test_utf8_skip_bom(void **state)  // covers: truncated bom, bare bom, bom + text, no bom
+{
+    const uint8_t c[4] = {0xef, 0xbb, 0xbf, 0};  // uint8_t: bytes > 0x7f do not fit a signed char
+    assert_int_equal(blogc_utf8_skip_bom(c, 2), 0);  // only 2 bytes given: not a full bom
+    assert_int_equal(blogc_utf8_skip_bom(c, 3), 3);
+    assert_string_equal((const char*) c + 3, "");  // nothing is left after the bom
+    const uint8_t d[8] = {0xef, 0xbb, 0xbf, 'b', 'o', 'l', 'a', 0};
+    assert_int_equal(blogc_utf8_skip_bom(d, 8), 3);
+    assert_string_equal((const char*) d + 3, "bola");  // payload starts right after the bom
+    const char e[5] = "bola";  // plain ascii, no bom
+    assert_int_equal(blogc_utf8_skip_bom((const uint8_t*) e, 4), 0);
+}
+
+
 int
 main(void)
 {
@@ -78,6 +97,7 @@ main(void)
         unit_test(test_utf8_invalid),
         unit_test(test_utf8_valid_str),
         unit_test(test_utf8_invalid_str),
+        unit_test(test_utf8_skip_bom),
     };
     return run_tests(tests);
 }
author	Rafael G. Martins <rafael@rafaelmartins.eng.br>	2016-07-03 21:52:51 +0200
committer	Rafael G. Martins <rafael@rafaelmartins.eng.br>	2016-07-04 01:07:34 +0200
commit	1099a4d991942655c0291a74b488322d5da533bd (patch)
tree	0a91f413d3675122a15f49a87fd9413baf46c655
parent	4aac65c4b8f2d8415ca8d9d8449e0158e0ff1e9c (diff)
download	blogc-1099a4d991942655c0291a74b488322d5da533bd.tar.gz blogc-1099a4d991942655c0291a74b488322d5da533bd.tar.bz2 blogc-1099a4d991942655c0291a74b488322d5da533bd.zip