diff options
author | Rafael G. Martins <rafael@rafaelmartins.eng.br> | 2016-06-27 03:01:20 +0200 |
---|---|---|
committer | Rafael G. Martins <rafael@rafaelmartins.eng.br> | 2016-06-29 01:35:45 +0200 |
commit | 519f1f8031687ebf3853817a8b2e3557c2443d67 (patch) | |
tree | 320be2942773ee95c34a798da09685728115045f | |
parent | 4cfeb39fbe99be28f22611c6146b1655549f7850 (diff) | |
download | blogc-519f1f8031687ebf3853817a8b2e3557c2443d67.tar.gz blogc-519f1f8031687ebf3853817a8b2e3557c2443d67.tar.bz2 blogc-519f1f8031687ebf3853817a8b2e3557c2443d67.zip |
content-parser: rewrote inline parser.
parser is stricter now, and won't produce invalid HTML anymore.
-rw-r--r-- | src/content-parser.c | 716 | ||||
-rw-r--r-- | src/utils.c | 41 | ||||
-rw-r--r-- | src/utils.h | 2 | ||||
-rw-r--r-- | tests/check_content_parser.c | 162 | ||||
-rw-r--r-- | tests/check_utils.c | 33 |
5 files changed, 650 insertions, 304 deletions
diff --git a/src/content-parser.c b/src/content-parser.c index 2062814..b4bc790 100644 --- a/src/content-parser.c +++ b/src/content-parser.c @@ -41,36 +41,46 @@ blogc_slugify(const char *str) } +static const char* +htmlentities(char c) +{ + switch (c) { + case '&': + return "&amp;"; + case '<': + return "&lt;"; + case '>': + return "&gt;"; + case '"': + return "&quot;"; + case '\'': + return "&#x27;"; + case '/': + return "&#x2F;"; + } + return NULL; +} + + +static void +htmlentities_append(sb_string_t *str, char c) +{ + const char *e = htmlentities(c); + if (e == NULL) + sb_string_append_c(str, c); + else + sb_string_append(str, e); +} + + char* blogc_htmlentities(const char *str) { if (str == NULL) return NULL; sb_string_t *rv = sb_string_new(); - for (size_t i = 0; str[i] != '\0'; i++) { - switch (str[i]) { - case '&': - sb_string_append(rv, "&amp;"); - break; - case '<': - sb_string_append(rv, "&lt;"); - break; - case '>': - sb_string_append(rv, "&gt;"); - break; - case '"': - sb_string_append(rv, "&quot;"); - break; - case '\'': - sb_string_append(rv, "&#x27;"); - break; - case '/': - sb_string_append(rv, "&#x2F;"); - break; - default: - sb_string_append_c(rv, str[i]); - } - } + for (size_t i = 0; str[i] != '\0'; i++) + htmlentities_append(rv, str[i]); return sb_string_free(rv, false); } @@ -145,334 +155,500 @@ typedef enum { typedef enum { - LINK_CLOSED = 1, - LINK_IMAGE, - LINK_TEXT, - LINK_TEXT_CLOSE, - LINK_URL, - LINK_AUTO, - LINK_AUTO_CLOSE, -} blogc_content_parser_link_state_t; - - -char* -blogc_content_parse_inline(const char *src) + CONTENT_INLINE_START = 1, + CONTENT_INLINE_ASTERISK, + CONTENT_INLINE_ASTERISK_DOUBLE, + CONTENT_INLINE_UNDERSCORE, + CONTENT_INLINE_UNDERSCORE_DOUBLE, + CONTENT_INLINE_BACKTICKS, + CONTENT_INLINE_BACKTICKS_DOUBLE, + CONTENT_INLINE_LINK_START, + CONTENT_INLINE_LINK_AUTO, + CONTENT_INLINE_LINK_CONTENT, + CONTENT_INLINE_LINK_URL_START, + CONTENT_INLINE_LINK_URL, + CONTENT_INLINE_IMAGE_START, + CONTENT_INLINE_IMAGE_ALT, + CONTENT_INLINE_IMAGE_URL_START, + 
CONTENT_INLINE_IMAGE_URL, + CONTENT_INLINE_ENDASH, + CONTENT_INLINE_EMDASH, + CONTENT_INLINE_LINE_BREAK_START, + CONTENT_INLINE_LINE_BREAK, +} blogc_content_parser_inline_state_t; + + +static char* +blogc_content_parse_inline_internal(const char *src, size_t src_len) { - // this function is always called by blogc_content_parse or by itself, - // then its safe to assume that src is always nul-terminated. - size_t src_len = strlen(src); - size_t current = 0; size_t start = 0; - size_t start_state = 0; - size_t end = 0; - - sb_string_t *rv = sb_string_new(); - - bool open_em_ast = false; - bool open_strong_ast = false; - bool open_em_und = false; - bool open_strong_und = false; - bool open_code = false; - bool open_code_double = false; - - blogc_content_parser_link_state_t state = LINK_CLOSED; - bool is_image = false; + size_t count = 0; - char *tmp = NULL; + const char *tmp = NULL; char *tmp2 = NULL; + char *tmp3 = NULL; + + size_t start_link = 0; + char *link1 = NULL; - unsigned int open_bracket = 0; - unsigned int spaces = 0; + sb_string_t *rv = sb_string_new(); - bool escape = false; + blogc_content_parser_inline_state_t state = CONTENT_INLINE_START; while (current < src_len) { char c = src[current]; bool is_last = current == src_len - 1; - if (escape) { - if (state == LINK_CLOSED) - sb_string_append_c(rv, c); - current++; - escape = false; - continue; - } - - if (c != ' ' && c != '\n' && c != '\r') - spaces = 0; - - if (state == LINK_TEXT_CLOSE && c != ' ' && c != '\n' && c != '\r' && - c != '(') - { - sb_string_append_c(rv, src[start_state]); - tmp = blogc_content_parse_inline(src + start_state + 1); - sb_string_append(rv, tmp); - // no need to free here, we will exit the loop! 
- break; - } - - switch (c) { - - case '\\': - if (state == LINK_CLOSED && (open_code || open_code_double)) { - sb_string_append_c(rv, c); + switch (state) { + case CONTENT_INLINE_START: + if (is_last) { + htmlentities_append(rv, c); + break; + } + if (c == '\\') { + htmlentities_append(rv, src[++current]); + break; + } + if (c == '*') { + state = CONTENT_INLINE_ASTERISK; break; } - if (!escape) - escape = true; + if (c == '_') { + state = CONTENT_INLINE_UNDERSCORE; + break; + } + if (c == '`') { + state = CONTENT_INLINE_BACKTICKS; + break; + } + if (c == '[') { + state = CONTENT_INLINE_LINK_START; + break; + } + if (c == '!') { + state = CONTENT_INLINE_IMAGE_START; + break; + } + if (c == '-') { + state = CONTENT_INLINE_ENDASH; + break; + } + if (c == ' ') { + state = CONTENT_INLINE_LINE_BREAK_START; + break; + } + htmlentities_append(rv, c); break; - case '*': - case '_': - if (state == LINK_CLOSED && (open_code || open_code_double)) { - sb_string_append_c(rv, c); + case CONTENT_INLINE_ASTERISK: + if (c == '*') { + state = CONTENT_INLINE_ASTERISK_DOUBLE; break; } - if (!is_last && src[current + 1] == c) { - current++; - if ((c == '*' && open_strong_ast) || - (c == '_' && open_strong_und)) - { - if (state == LINK_CLOSED) - sb_string_append(rv, "</strong>"); - if (c == '*') - open_strong_ast = false; - else - open_strong_und = false; + tmp = sb_str_find(src + current, '*'); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '*'); + state = CONTENT_INLINE_START; + continue; + } + tmp2 = blogc_content_parse_inline_internal( + src + current, (tmp - src) - current); + sb_string_append_printf(rv, "<em>%s</em>", tmp2); + current = tmp - src; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_ASTERISK_DOUBLE: + tmp = src + current; + do { + tmp = sb_str_find(tmp, '*'); + if (((tmp - src) < src_len) && *(tmp + 1) == '*') { break; } - if (state == LINK_CLOSED) - sb_string_append(rv, "<strong>"); 
- if (c == '*') - open_strong_ast = true; - else - open_strong_und = true; - break; + tmp++; + } while (tmp != NULL && (tmp - src) < src_len); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '*'); + sb_string_append_c(rv, '*'); + state = CONTENT_INLINE_START; + continue; } - if ((c == '*' && open_em_ast) || (c == '_' && open_em_und)) { - if (state == LINK_CLOSED) - sb_string_append(rv, "</em>"); - if (c == '*') - open_em_ast = false; - else - open_em_und = false; + tmp2 = blogc_content_parse_inline_internal( + src + current, (tmp - src) - current); + sb_string_append_printf(rv, "<strong>%s</strong>", tmp2); + current = tmp - src + 1; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_UNDERSCORE: + if (c == '_') { + state = CONTENT_INLINE_UNDERSCORE_DOUBLE; break; } - if (state == LINK_CLOSED) - sb_string_append(rv, "<em>"); - if (c == '*') - open_em_ast = true; - else - open_em_und = true; + tmp = sb_str_find(src + current, '_'); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '_'); + state = CONTENT_INLINE_START; + continue; + } + tmp2 = blogc_content_parse_inline_internal( + src + current, (tmp - src) - current); + sb_string_append_printf(rv, "<em>%s</em>", tmp2); + current = tmp - src; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; break; - case '`': - if (!is_last && src[current + 1] == c) { - current++; - if (state == LINK_CLOSED) - sb_string_append_printf(rv, "<%scode>", - open_code_double ? 
"/" : ""); - open_code_double = !open_code_double; + case CONTENT_INLINE_UNDERSCORE_DOUBLE: + tmp = src + current; + do { + tmp = sb_str_find(tmp, '_'); + if (((tmp - src) < src_len) && *(tmp + 1) == '_') { + break; + } + tmp++; + } while (tmp != NULL && (tmp - src) < src_len); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '_'); + sb_string_append_c(rv, '_'); + state = CONTENT_INLINE_START; + continue; + } + tmp2 = blogc_content_parse_inline_internal( + src + current, (tmp - src) - current); + sb_string_append_printf(rv, "<strong>%s</strong>", tmp2); + current = tmp - src + 1; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_BACKTICKS: + if (c == '`') { + state = CONTENT_INLINE_BACKTICKS_DOUBLE; break; } - if (state == LINK_CLOSED) - sb_string_append_printf(rv, "<%scode>", open_code ? "/" : ""); - open_code = !open_code; + tmp = sb_str_find(src + current, '`'); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '`'); + state = CONTENT_INLINE_START; + continue; + } + tmp3 = sb_strndup(src + current, (tmp - src) - current); + tmp2 = blogc_htmlentities(tmp3); + free(tmp3); + tmp3 = NULL; + sb_string_append(rv, "<code>"); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "</code>"); + current = tmp - src; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; break; - case '!': - if (state == LINK_CLOSED) { - if (open_code || open_code_double) { - sb_string_append_c(rv, c); + case CONTENT_INLINE_BACKTICKS_DOUBLE: + tmp = src + current; + do { + tmp = sb_str_find(tmp, '`'); + if (((tmp - src) < src_len) && *(tmp + 1) == '`') { break; } - if (!is_last && src[current + 1] != '[') { - sb_string_append_c(rv, c); + tmp++; + } while (tmp != NULL && (tmp - src) < src_len); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '`'); + sb_string_append_c(rv, '`'); + state = CONTENT_INLINE_START; + continue; + } + tmp3 = 
sb_strndup(src + current, (tmp - src) - current); + tmp2 = blogc_htmlentities(tmp3); + free(tmp3); + tmp3 = NULL; + sb_string_append(rv, "<code>"); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "</code>"); + current = tmp - src + 1; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_LINK_START: + if (c == '[') { + state = CONTENT_INLINE_LINK_AUTO; + break; + } + start_link = current; + count = 1; + state = CONTENT_INLINE_LINK_CONTENT; + break; + + case CONTENT_INLINE_LINK_AUTO: + tmp = src + current; + do { + tmp = sb_str_find(tmp, ']'); + if (((tmp - src) < src_len) && *(tmp + 1) == ']') { break; } - state = LINK_IMAGE; - is_image = true; - start_state = current; + tmp++; + } while (tmp != NULL && (tmp - src) < src_len); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '['); + sb_string_append_c(rv, '['); + state = CONTENT_INLINE_START; + continue; } + tmp2 = sb_strndup(src + current, (tmp - src) - current); + sb_string_append(rv, "<a href=\""); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "\">"); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "</a>"); + current = tmp - src + 1; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; break; - case '[': - if (state == LINK_CLOSED && (open_code || open_code_double)) { - sb_string_append_c(rv, c); + case CONTENT_INLINE_LINK_CONTENT: + if (c == '\\') { + current++; break; } - if (state == LINK_CLOSED || state == LINK_IMAGE) { - if (state == LINK_CLOSED) - start_state = current; - state = LINK_TEXT; - start = current + 1; - open_bracket = 0; + if (c == '[') { // links can be nested :/ + count++; break; } - if (state == LINK_TEXT) { - if (current == start) { - start = current + 1; - state = LINK_AUTO; - break; + if (c == ']') { + if (--count == 0) { + link1 = sb_strndup(src + start_link, current - start_link); + state = CONTENT_INLINE_LINK_URL_START; } - open_bracket++; - 
break; } break; - case ']': - if (state == LINK_AUTO) { - end = current; - state = LINK_AUTO_CLOSE; + case CONTENT_INLINE_LINK_URL_START: + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + break; + if (c == '(') { + state = CONTENT_INLINE_LINK_URL; + start = current + 1; break; } - if (state == LINK_AUTO_CLOSE) { - state = LINK_CLOSED; - tmp = sb_strndup(src + start, end - start); - sb_string_append_printf(rv, "<a href=\"%s\">%s</a>", tmp, tmp); - end = 0; - free(tmp); - tmp = NULL; - is_image = false; + sb_string_append_c(rv, '['); + state = CONTENT_INLINE_START; + current = start_link; + start_link = 0; + continue; + + case CONTENT_INLINE_LINK_URL: + if (c == '\\') { + current++; break; } - if (state == LINK_TEXT) { - if (open_bracket-- == 0) { - state = LINK_TEXT_CLOSE; - tmp = sb_strndup(src + start, current - start); - tmp2 = blogc_content_parse_inline(tmp); - free(tmp); - tmp = NULL; - } + if (c == ')') { + tmp2 = sb_strndup(src + start, current - start); + tmp3 = blogc_content_parse_inline(link1); + free(link1); + link1 = NULL; + sb_string_append(rv, "<a href=\""); + sb_string_append_escaped(rv, tmp2); + sb_string_append_printf(rv, "\">%s</a>", tmp3); + free(tmp2); + tmp2 = NULL; + free(tmp3); + tmp3 = NULL; + state = CONTENT_INLINE_START; break; } - if (state == LINK_CLOSED) - sb_string_append_c(rv, c); break; - case '(': - if (state == LINK_TEXT_CLOSE) { - state = LINK_URL; - start = current + 1; + case CONTENT_INLINE_IMAGE_START: + // we use the same variables used for links, because why not? 
+ if (c == '[') { + state = CONTENT_INLINE_IMAGE_ALT; + start_link = current + 1; + break; + } + sb_string_append_c(rv, '!'); + state = CONTENT_INLINE_START; + continue; + + case CONTENT_INLINE_IMAGE_ALT: + if (c == '\\') { + current++; break; } - if (state == LINK_CLOSED) - sb_string_append_c(rv, c); + if (c == ']') { + link1 = sb_strndup(src + start_link, current - start_link); + state = CONTENT_INLINE_IMAGE_URL_START; + } break; - case ')': - if (state == LINK_URL) { - state = LINK_CLOSED; - tmp = sb_strndup(src + start, current - start); - if (is_image) - sb_string_append_printf(rv, "<img src=\"%s\" alt=\"%s\">", - tmp, tmp2); - else - sb_string_append_printf(rv, "<a href=\"%s\">%s</a>", - tmp, tmp2); - free(tmp); - tmp = NULL; + case CONTENT_INLINE_IMAGE_URL_START: + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + break; + if (c == '(') { + state = CONTENT_INLINE_IMAGE_URL; + start = current + 1; + break; + } + sb_string_append_c(rv, '!'); + sb_string_append_c(rv, '['); + state = CONTENT_INLINE_START; + current = start_link; + start_link = 0; + continue; + + case CONTENT_INLINE_IMAGE_URL: + if (c == '\\') { + current++; + break; + } + if (c == ')') { + tmp2 = sb_strndup(src + start, current - start); + sb_string_append(rv, "<img src=\""); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "\" alt=\""); + sb_string_append_escaped(rv, link1); + sb_string_append(rv, "\">"); free(tmp2); tmp2 = NULL; - is_image = false; + free(link1); + link1 = NULL; + state = CONTENT_INLINE_START; break; } - if (state == LINK_CLOSED) - sb_string_append_c(rv, c); break; - case ' ': - if (state == LINK_CLOSED) { - spaces++; - sb_string_append_c(rv, c); + case CONTENT_INLINE_ENDASH: + if (c == '-') { + if (is_last) { + sb_string_append(rv, "&ndash;"); + state = CONTENT_INLINE_START; // wat + break; + } + state = CONTENT_INLINE_EMDASH; + break; } - if (!is_last) + sb_string_append_c(rv, '-'); + state = CONTENT_INLINE_START; + continue; + + case CONTENT_INLINE_EMDASH: + if 
(c == '-') { + sb_string_append(rv, "&mdash;"); + state = CONTENT_INLINE_START; break; + } + sb_string_append(rv, "&ndash;"); + state = CONTENT_INLINE_START; + continue; - case '\n': - case '\r': - if (state == LINK_CLOSED) { - if (spaces >= 2) { + case CONTENT_INLINE_LINE_BREAK_START: + if (c == ' ') { + if (is_last) { sb_string_append(rv, "<br />"); - spaces = 0; + state = CONTENT_INLINE_START; // wat + break; } - if (c == '\n' || c == '\r') - sb_string_append_c(rv, c); + count = 2; + state = CONTENT_INLINE_LINE_BREAK; + break; } - break; + sb_string_append_c(rv, ' '); + state = CONTENT_INLINE_START; + continue; - case '-': - if (state != LINK_CLOSED) - break; - if ((current < (src_len - 1) && src[current + 1] == '-') && - !(open_code || open_code_double)) - { - if (current < (src_len - 2) && src[current + 2] == '-') { - sb_string_append(rv, "&mdash;"); - current += 2; - } - else { - sb_string_append(rv, "&ndash;"); - current += 1; + case CONTENT_INLINE_LINE_BREAK: + if (c == ' ') { + if (is_last) { + sb_string_append(rv, "<br />"); + state = CONTENT_INLINE_START; // wat + break; } + count++; + break; } - else { - sb_string_append_c(rv, c); + if (c == '\n' || c == '\r') { + sb_string_append_printf(rv, "<br />%c", c); + state = CONTENT_INLINE_START; + break; } - break; - - case '&': - if (state == LINK_CLOSED) - sb_string_append(rv, "&amp;"); - break; - - case '<': - if (state == LINK_CLOSED) - sb_string_append(rv, "&lt;"); - break; - - case '>': - if (state == LINK_CLOSED) - sb_string_append(rv, "&gt;"); - break; - - case '"': - if (state == LINK_CLOSED) - sb_string_append(rv, "&quot;"); - break; - - case '\'': - if (state == LINK_CLOSED) - sb_string_append(rv, "&#x27;"); - break; - - case '/': - if (state == LINK_CLOSED) - sb_string_append(rv, "&#x2F;"); - break; - - default: - if (state == LINK_CLOSED) - sb_string_append_c(rv, c); - } - - if (is_last && state != LINK_CLOSED) { - sb_string_append_c(rv, src[start_state]); - tmp = blogc_content_parse_inline(src + start_state + 1); - sb_string_append(rv, tmp); - 
// no need to free here, its the last iteration + for (size_t i = 0; i < count; i++) + sb_string_append_c(rv, ' '); + state = CONTENT_INLINE_START; + continue; } current++; } - free(tmp); + switch (state) { + + // if after the end of the loop we are on any of the following states, + // we must call the parser again, from start_link + case CONTENT_INLINE_IMAGE_START: + case CONTENT_INLINE_IMAGE_ALT: + case CONTENT_INLINE_IMAGE_URL_START: + case CONTENT_INLINE_IMAGE_URL: + sb_string_append_c(rv, '!'); + + case CONTENT_INLINE_LINK_CONTENT: + case CONTENT_INLINE_LINK_URL_START: + case CONTENT_INLINE_LINK_URL: + tmp2 = blogc_content_parse_inline(src + start_link); + sb_string_append_c(rv, '['); + sb_string_append_escaped(rv, tmp2); // no need to free, as it wil be done below. + break; + + // add all the other states here explicitly, so the compiler helps us + // not missing any new state that should be handled. + case CONTENT_INLINE_START: + case CONTENT_INLINE_ASTERISK: + case CONTENT_INLINE_ASTERISK_DOUBLE: + case CONTENT_INLINE_UNDERSCORE: + case CONTENT_INLINE_UNDERSCORE_DOUBLE: + case CONTENT_INLINE_BACKTICKS: + case CONTENT_INLINE_BACKTICKS_DOUBLE: + case CONTENT_INLINE_LINK_START: + case CONTENT_INLINE_LINK_AUTO: + case CONTENT_INLINE_ENDASH: + case CONTENT_INLINE_EMDASH: + case CONTENT_INLINE_LINE_BREAK_START: + case CONTENT_INLINE_LINE_BREAK: + break; + } + free(tmp2); + free(tmp3); + free(link1); return sb_string_free(rv, false); } +char* +blogc_content_parse_inline(const char *src) +{ + return blogc_content_parse_inline_internal(src, strlen(src)); +} + + bool blogc_is_ordered_list_item(const char *str, size_t prefix_len) { diff --git a/src/utils.c b/src/utils.c index 855b503..d7362a6 100644 --- a/src/utils.c +++ b/src/utils.c @@ -287,6 +287,27 @@ sb_str_replace(const char *str, const char search, const char *replace) } +char* +sb_str_find(const char *str, char c) +{ + // this is somewhat similar to strchr, but respects '\' escaping. 
+ if (str == NULL) + return NULL; + if (c == '\0') + return (char*) str + strlen(str); + for (size_t i = 0; str[i] != '\0'; i++) { + if (str[i] == '\\') { + i++; + continue; + } + if (str[i] == c) { + return (char*) str + i; + } + } + return NULL; +} + + void sb_strv_free(char **strv) { @@ -425,6 +446,26 @@ sb_string_append_printf(sb_string_t *str, const char *format, ...) } +sb_string_t* +sb_string_append_escaped(sb_string_t *str, const char *suffix) +{ + if (str == NULL) + return NULL; + if (suffix == NULL) + return str; + bool escaped = false; + for (size_t i = 0; suffix[i] != '\0'; i++) { + if (suffix[i] == '\\' && !escaped) { + escaped = true; + continue; + } + escaped = false; + str = sb_string_append_c(str, suffix[i]); + } + return str; +} + + sb_trie_t* sb_trie_new(sb_free_func_t free_func) { diff --git a/src/utils.h b/src/utils.h index 411295a..aefcbf3 100644 --- a/src/utils.h +++ b/src/utils.h @@ -51,6 +51,7 @@ char* sb_str_rstrip(char *str); char* sb_str_strip(char *str); char** sb_str_split(const char *str, char c, unsigned int max_pieces); char* sb_str_replace(const char *str, const char search, const char *replace); +char* sb_str_find(const char *str, char c); void sb_strv_free(char **strv); char* sb_strv_join(char **strv, const char *separator); size_t sb_strv_length(char **strv); @@ -71,6 +72,7 @@ sb_string_t* sb_string_append_len(sb_string_t *str, const char *suffix, size_t l sb_string_t* sb_string_append(sb_string_t *str, const char *suffix); sb_string_t* sb_string_append_c(sb_string_t *str, char c); sb_string_t* sb_string_append_printf(sb_string_t *str, const char *format, ...); +sb_string_t* sb_string_append_escaped(sb_string_t *str, const char *suffix); // trie diff --git a/tests/check_content_parser.c b/tests/check_content_parser.c index 783d3f3..b0272fb 100644 --- a/tests/check_content_parser.c +++ b/tests/check_content_parser.c @@ -184,7 +184,7 @@ test_content_parse(void **state) "<h6 id=\"seis\">seis</h6>\n" "<p>bola\n" "chunda</p>\n" - 
"<blockquote><p>bola <br />\n" + "<blockquote><p>bola<br />\n" "guda\n" "buga</p>\n" "<pre><code>asd</code></pre>\n" @@ -276,7 +276,7 @@ test_content_parse_crlf(void **state) "<h6 id=\"seis\">seis</h6>\r\n" "<p>bola\r\n" "chunda</p>\r\n" - "<blockquote><p>bola <br />\r\n" + "<blockquote><p>bola<br />\r\n" "guda\r\n" "buga</p>\r\n" "<pre><code>asd</code></pre>\r\n" @@ -1485,14 +1485,13 @@ test_content_parse_invalid_code(void **state) static void test_content_parse_invalid_horizontal_rule(void **state) { - // this generates invalid html, but... char *html = blogc_content_parse("** asd", NULL, NULL); assert_non_null(html); - assert_string_equal(html, "<p><strong> asd</p>\n"); + assert_string_equal(html, "<p>** asd</p>\n"); free(html); html = blogc_content_parse("** asd\n", NULL, NULL); assert_non_null(html); - assert_string_equal(html, "<p><strong> asd</p>\n"); + assert_string_equal(html, "<p>** asd</p>\n"); free(html); } @@ -1500,13 +1499,12 @@ test_content_parse_invalid_horizontal_rule(void **state) static void test_content_parse_invalid_unordered_list(void **state) { - // more invalid html char *html = blogc_content_parse( "* asd\n" "1. qwe", NULL, NULL); assert_non_null(html); assert_string_equal(html, - "<p><em> asd\n" + "<p>* asd\n" "1. qwe</p>\n"); free(html); html = blogc_content_parse( @@ -1515,7 +1513,7 @@ test_content_parse_invalid_unordered_list(void **state) "\n", NULL, NULL); assert_non_null(html); assert_string_equal(html, - "<p><em> asd\n" + "<p>* asd\n" "1. qwe</p>\n"); free(html); html = blogc_content_parse( @@ -1523,7 +1521,7 @@ test_content_parse_invalid_unordered_list(void **state) "1. qwe\n", NULL, NULL); assert_non_null(html); assert_string_equal(html, - "<p><em> asd\n" + "<p>* asd\n" "1. qwe" "</p>\n"); free(html); @@ -1532,7 +1530,7 @@ test_content_parse_invalid_unordered_list(void **state) "1. qwe\n", NULL, NULL); assert_non_null(html); assert_string_equal(html, - "<p><em> asd\n" + "<p>* asd\n" "1. 
qwe" "</p>\n"); free(html); @@ -1546,7 +1544,7 @@ test_content_parse_invalid_unordered_list(void **state) assert_non_null(html); assert_string_equal(html, "<p>chunda</p>\n" - "<p><em> asd\n" + "<p>* asd\n" "1. qwe</p>\n" "<p>poi</p>\n"); free(html); @@ -1556,14 +1554,13 @@ test_content_parse_invalid_unordered_list(void **state) static void test_content_parse_invalid_ordered_list(void **state) { - // more invalid html char *html = blogc_content_parse( "1. asd\n" "* qwe", NULL, NULL); assert_non_null(html); assert_string_equal(html, "<p>1. asd\n" - "<em> qwe</p>\n"); + "* qwe</p>\n"); free(html); html = blogc_content_parse( "1. asd\n" @@ -1572,7 +1569,7 @@ test_content_parse_invalid_ordered_list(void **state) assert_non_null(html); assert_string_equal(html, "<p>1. asd\n" - "<em> qwe</p>\n"); + "* qwe</p>\n"); free(html); html = blogc_content_parse( "1. asd\n" @@ -1580,7 +1577,7 @@ test_content_parse_invalid_ordered_list(void **state) assert_non_null(html); assert_string_equal(html, "<p>1. asd\n" - "<em> qwe" + "* qwe" "</p>\n"); free(html); html = blogc_content_parse( @@ -1589,7 +1586,7 @@ test_content_parse_invalid_ordered_list(void **state) assert_non_null(html); assert_string_equal(html, "<p>1. asd\n" - "<em> qwe" + "* qwe" "</p>\n"); free(html); html = blogc_content_parse( @@ -1603,7 +1600,7 @@ test_content_parse_invalid_ordered_list(void **state) assert_string_equal(html, "<p>chunda</p>\n" "<p>1. asd\n" - "<em> qwe</p>\n" + "* qwe</p>\n" "<p>poi</p>\n"); free(html); html = blogc_content_parse( @@ -1612,7 +1609,7 @@ test_content_parse_invalid_ordered_list(void **state) assert_non_null(html); assert_string_equal(html, "<p>1 asd\n" - "<em> qwe</p>\n"); + "* qwe</p>\n"); free(html); html = blogc_content_parse( "a. 
asd\n" @@ -1677,6 +1674,10 @@ test_content_parse_inline_em(void **state) assert_non_null(html); assert_string_equal(html, "<em>bola</em>\n"); free(html); + html = blogc_content_parse_inline("*bo\\*la*\n"); + assert_non_null(html); + assert_string_equal(html, "<em>bo*la</em>\n"); + free(html); html = blogc_content_parse_inline("_bola_"); assert_non_null(html); assert_string_equal(html, "<em>bola</em>"); @@ -1685,14 +1686,25 @@ test_content_parse_inline_em(void **state) assert_non_null(html); assert_string_equal(html, "<em>bola</em>\n"); free(html); + html = blogc_content_parse_inline("_bo\\*la_\n"); + assert_non_null(html); + assert_string_equal(html, "<em>bo*la</em>\n"); + free(html); html = blogc_content_parse_inline("_**bola**_\n"); assert_non_null(html); assert_string_equal(html, "<em><strong>bola</strong></em>\n"); free(html); - // this is not really valid + html = blogc_content_parse_inline("_**bo\\_\\*la**_\n"); + assert_non_null(html); + assert_string_equal(html, "<em><strong>bo_*la</strong></em>\n"); + free(html); html = blogc_content_parse_inline("_**bola\n"); assert_non_null(html); - assert_string_equal(html, "<em><strong>bola\n"); + assert_string_equal(html, "_**bola\n"); + free(html); + html = blogc_content_parse_inline("**_bola\\*\n"); + assert_non_null(html); + assert_string_equal(html, "**_bola*\n"); free(html); } @@ -1708,6 +1720,10 @@ test_content_parse_inline_strong(void **state) assert_non_null(html); assert_string_equal(html, "<strong>bola</strong>\n"); free(html); + html = blogc_content_parse_inline("**bo\*la**\n"); + assert_non_null(html); + assert_string_equal(html, "<strong>bo*la</strong>\n"); + free(html); html = blogc_content_parse_inline("__bola__"); assert_non_null(html); assert_string_equal(html, "<strong>bola</strong>"); @@ -1716,14 +1732,25 @@ test_content_parse_inline_strong(void **state) assert_non_null(html); assert_string_equal(html, "<strong>bola</strong>\n"); free(html); + html = blogc_content_parse_inline("__bo\*la__\n"); + 
assert_non_null(html); + assert_string_equal(html, "<strong>bo*la</strong>\n"); + free(html); html = blogc_content_parse_inline("__*bola*__\n"); assert_non_null(html); assert_string_equal(html, "<strong><em>bola</em></strong>\n"); free(html); - // this is not really valid + html = blogc_content_parse_inline("__*bo\\_\\*la*__\n"); + assert_non_null(html); + assert_string_equal(html, "<strong><em>bo_*la</em></strong>\n"); + free(html); html = blogc_content_parse_inline("__*bola\n"); assert_non_null(html); - assert_string_equal(html, "<strong><em>bola\n"); + assert_string_equal(html, "__*bola\n"); + free(html); + html = blogc_content_parse_inline("__*bola\\_\n"); + assert_non_null(html); + assert_string_equal(html, "__*bola_\n"); free(html); } @@ -1751,18 +1778,29 @@ test_content_parse_inline_code(void **state) assert_non_null(html); assert_string_equal(html, "<code>bo*la</code>\n"); free(html); - // invalid + html = blogc_content_parse_inline("``bo<la``\n"); + assert_non_null(html); + assert_string_equal(html, "<code>bo<la</code>\n"); + free(html); + html = blogc_content_parse_inline("`bo\\`\\`la`\n"); + assert_non_null(html); + assert_string_equal(html, "<code>bo``la</code>\n"); + free(html); + html = blogc_content_parse_inline("``bo\\`\\`la``\n"); + assert_non_null(html); + assert_string_equal(html, "<code>bo``la</code>\n"); + free(html); html = blogc_content_parse_inline("``bola\n"); assert_non_null(html); - assert_string_equal(html, "<code>bola\n"); + assert_string_equal(html, "``bola\n"); free(html); html = blogc_content_parse_inline("`bola\n"); assert_non_null(html); - assert_string_equal(html, "<code>bola\n"); + assert_string_equal(html, "`bola\n"); free(html); html = blogc_content_parse_inline("``bola`\n"); assert_non_null(html); - assert_string_equal(html, "<code>bola<code>\n"); + assert_string_equal(html, "``bola`\n"); free(html); } @@ -1802,9 +1840,9 @@ test_content_parse_inline_link(void **state) assert_non_null(html); assert_string_equal(html, "<a 
href=\"http://example.org/\"><code>bola</code></a>\n"); free(html); - html = blogc_content_parse_inline("[``bola(2)[3]**!\\``](http://example.org/)\n"); + html = blogc_content_parse_inline("[``bola(2)[3]**!\\```](http://example.org/)\n"); assert_non_null(html); - assert_string_equal(html, "<a href=\"http://example.org/\"><code>bola(2)[3]**!\\</code></a>\n"); + assert_string_equal(html, "<a href=\"http://example.org/\"><code>bola(2)[3]**!`</code></a>\n"); free(html); html = blogc_content_parse_inline("test suite!)\n" "depends on [cmocka](http://cmocka.org/), though.\n"); @@ -1821,6 +1859,10 @@ test_content_parse_inline_link(void **state) assert_non_null(html); assert_string_equal(html, "<a href=\"\nhttp://example.org/\">bola</a>\n"); free(html); + html = blogc_content_parse_inline("[bo[]\\[\\]()la](http://example.org/?\\(\\))\n"); + assert_non_null(html); + assert_string_equal(html, "<a href=\"http://example.org/?()\">bo[][]()la</a>\n"); + free(html); html = blogc_content_parse_inline("[bola](http://example.org/\n"); assert_non_null(html); assert_string_equal(html, "[bola](http://example.org/\n"); @@ -1863,13 +1905,21 @@ test_content_parse_inline_link_auto(void **state) assert_non_null(html); assert_string_equal(html, "<a href=\"guda\">guda</a>\n"); free(html); + html = blogc_content_parse_inline("[[http://example.org/?\\[\\]]]\n"); + assert_non_null(html); + assert_string_equal(html, "<a href=\"http://example.org/?[]\">http://example.org/?[]</a>\n"); + free(html); + html = blogc_content_parse_inline("[[http://example.org/?\\[\\]a]]\n"); + assert_non_null(html); + assert_string_equal(html, "<a href=\"http://example.org/?[]a\">http://example.org/?[]a</a>\n"); + free(html); html = blogc_content_parse_inline("[[guda]asd]"); assert_non_null(html); - assert_string_equal(html, "<a href=\"guda\">guda</a>"); + assert_string_equal(html, "[[guda]asd]"); free(html); html = blogc_content_parse_inline("[[guda]asd]\n"); assert_non_null(html); - assert_string_equal(html, "<a 
href=\"guda\">guda</a>\n"); + assert_string_equal(html, "[[guda]asd]\n"); free(html); html = blogc_content_parse_inline("[[guda]asd"); assert_non_null(html); @@ -1942,6 +1992,10 @@ test_content_parse_inline_image(void **state) assert_non_null(html); assert_string_equal(html, "<img src=\"\nhttp://example.org/\" alt=\"bola\">\n"); free(html); + html = blogc_content_parse_inline("![bo\\[\\]()la](http://example.org/?\\(\\))\n"); + assert_non_null(html); + assert_string_equal(html, "<img src=\"http://example.org/?()\" alt=\"bo[]()la\">\n"); + free(html); html = blogc_content_parse_inline("![bola](http://example.org/\n"); assert_non_null(html); assert_string_equal(html, "![bola](http://example.org/\n"); @@ -1994,15 +2048,15 @@ test_content_parse_inline_line_break(void **state) { char *html = blogc_content_parse_inline("asd \n"); assert_non_null(html); - assert_string_equal(html, "asd <br />\n"); + assert_string_equal(html, "asd<br />\n"); free(html); html = blogc_content_parse_inline("asd "); assert_non_null(html); - assert_string_equal(html, "asd <br />"); + assert_string_equal(html, "asd<br />"); free(html); html = blogc_content_parse_inline("asd "); assert_non_null(html); - assert_string_equal(html, "asd <br />"); + assert_string_equal(html, "asd<br />"); free(html); // invalid html = blogc_content_parse_inline("asd "); @@ -2021,7 +2075,7 @@ test_content_parse_inline_line_break_crlf(void **state) { char *html = blogc_content_parse_inline("asd \r\n"); assert_non_null(html); - assert_string_equal(html, "asd <br />\r\n"); + assert_string_equal(html, "asd<br />\r\n"); free(html); html = blogc_content_parse_inline("asd \r\n"); assert_non_null(html); @@ -2041,6 +2095,46 @@ test_content_parse_inline_endash_emdash(void **state) assert_non_null(html); assert_string_equal(html, "foo — bar"); free(html); + html = blogc_content_parse_inline("foo --"); + assert_non_null(html); + assert_string_equal(html, "foo –"); + free(html); + html = blogc_content_parse_inline("foo ---"); + 
assert_non_null(html); + assert_string_equal(html, "foo —"); + free(html); + html = blogc_content_parse_inline("foo \\-\\-"); + assert_non_null(html); + assert_string_equal(html, "foo --"); + free(html); + html = blogc_content_parse_inline("foo \\-\\-\\-"); + assert_non_null(html); + assert_string_equal(html, "foo ---"); + free(html); + html = blogc_content_parse_inline("foo \\---"); + assert_non_null(html); + assert_string_equal(html, "foo -–"); + free(html); + html = blogc_content_parse_inline("foo \\----"); + assert_non_null(html); + assert_string_equal(html, "foo -—"); + free(html); + html = blogc_content_parse_inline("foo \\-\\- bar"); + assert_non_null(html); + assert_string_equal(html, "foo -- bar"); + free(html); + html = blogc_content_parse_inline("foo \\-\\-\\- bar"); + assert_non_null(html); + assert_string_equal(html, "foo --- bar"); + free(html); + html = blogc_content_parse_inline("foo \\--- bar"); + assert_non_null(html); + assert_string_equal(html, "foo -– bar"); + free(html); + html = blogc_content_parse_inline("foo \\---- bar"); + assert_non_null(html); + assert_string_equal(html, "foo -— bar"); + free(html); html = blogc_content_parse_inline("`foo -- bar`"); assert_non_null(html); assert_string_equal(html, "<code>foo -- bar</code>"); diff --git a/tests/check_utils.c b/tests/check_utils.c index 6a6ceca..31087f1 100644 --- a/tests/check_utils.c +++ b/tests/check_utils.c @@ -257,6 +257,18 @@ test_str_replace(void **state) static void +test_str_find(void **state) +{ + assert_null(sb_str_find(NULL, 'c')); + assert_string_equal(sb_str_find("bola", 'l'), "la"); + assert_string_equal(sb_str_find("bo\\lalala", 'l'), "lala"); + assert_string_equal(sb_str_find("bola", '\0'), ""); + assert_null(sb_str_find("bola", 'g')); + assert_null(sb_str_find("bo\\la", 'l')); +} + + +static void test_strv_join(void **state) { char *pieces[] = {"guda","bola", "chunda", NULL}; @@ -529,6 +541,25 @@ test_string_append_printf(void **state) static void 
+test_string_append_escaped(void **state) +{ + sb_string_t *str = sb_string_new(); + str = sb_string_append_escaped(str, NULL); + assert_non_null(str); + assert_string_equal(str->str, ""); + assert_int_equal(str->len, 0); + assert_int_equal(str->allocated_len, SB_STRING_CHUNK_SIZE); + str = sb_string_append_escaped(str, "foo \\a bar \\\\ lol"); + assert_non_null(str); + assert_string_equal(str->str, "foo a bar \\ lol"); + assert_int_equal(str->len, 15); + assert_int_equal(str->allocated_len, SB_STRING_CHUNK_SIZE); + assert_null(sb_string_free(str, true)); + assert_null(sb_string_append_escaped(NULL, "asd")); +} + + +static void test_trie_new(void **state) { sb_trie_t *trie = sb_trie_new(free); @@ -934,6 +965,7 @@ main(void) unit_test(test_str_strip), unit_test(test_str_split), unit_test(test_str_replace), + unit_test(test_str_find), unit_test(test_strv_join), unit_test(test_strv_length), @@ -945,6 +977,7 @@ main(void) unit_test(test_string_append), unit_test(test_string_append_c), unit_test(test_string_append_printf), + unit_test(test_string_append_escaped), // trie unit_test(test_trie_new), |