diff options
| -rw-r--r-- | src/content-parser.c | 716 | ||||
| -rw-r--r-- | src/utils.c | 41 | ||||
| -rw-r--r-- | src/utils.h | 2 | ||||
| -rw-r--r-- | tests/check_content_parser.c | 162 | ||||
| -rw-r--r-- | tests/check_utils.c | 33 | 
5 files changed, 650 insertions, 304 deletions
diff --git a/src/content-parser.c b/src/content-parser.c index 2062814..b4bc790 100644 --- a/src/content-parser.c +++ b/src/content-parser.c @@ -41,36 +41,46 @@ blogc_slugify(const char *str)  } +static const char* +htmlentities(char c) +{ +    switch (c) { +        case '&': +            return "&"; +        case '<': +            return "<"; +        case '>': +            return ">"; +        case '"': +            return """; +        case '\'': +            return "'"; +        case '/': +            return "/"; +    } +    return NULL; +} + + +static void +htmlentities_append(sb_string_t *str, char c) +{ +    const char *e = htmlentities(c); +    if (e == NULL) +        sb_string_append_c(str, c); +    else +        sb_string_append(str, e); +} + +  char*  blogc_htmlentities(const char *str)  {      if (str == NULL)          return NULL;      sb_string_t *rv = sb_string_new(); -    for (size_t i = 0; str[i] != '\0'; i++) { -        switch (str[i]) { -            case '&': -                sb_string_append(rv, "&"); -                break; -            case '<': -                sb_string_append(rv, "<"); -                break; -            case '>': -                sb_string_append(rv, ">"); -                break; -            case '"': -                sb_string_append(rv, """); -                break; -            case '\'': -                sb_string_append(rv, "'"); -                break; -            case '/': -                sb_string_append(rv, "/"); -                break; -            default: -                sb_string_append_c(rv, str[i]); -        } -    } +    for (size_t i = 0; str[i] != '\0'; i++) +        htmlentities_append(rv, str[i]);      return sb_string_free(rv, false);  } @@ -145,334 +155,500 @@ typedef enum {  typedef enum { -    LINK_CLOSED = 1, -    LINK_IMAGE, -    LINK_TEXT, -    LINK_TEXT_CLOSE, -    LINK_URL, -    LINK_AUTO, -    LINK_AUTO_CLOSE, -} blogc_content_parser_link_state_t; - - -char* -blogc_content_parse_inline(const char *src) +    CONTENT_INLINE_START = 1, +    CONTENT_INLINE_ASTERISK, +    CONTENT_INLINE_ASTERISK_DOUBLE, +    CONTENT_INLINE_UNDERSCORE, +    CONTENT_INLINE_UNDERSCORE_DOUBLE, +    CONTENT_INLINE_BACKTICKS, +    CONTENT_INLINE_BACKTICKS_DOUBLE, +    CONTENT_INLINE_LINK_START, +    CONTENT_INLINE_LINK_AUTO, +    CONTENT_INLINE_LINK_CONTENT, +    CONTENT_INLINE_LINK_URL_START, +    CONTENT_INLINE_LINK_URL, +    CONTENT_INLINE_IMAGE_START, +    CONTENT_INLINE_IMAGE_ALT, +    CONTENT_INLINE_IMAGE_URL_START, +    CONTENT_INLINE_IMAGE_URL, +    CONTENT_INLINE_ENDASH, +    CONTENT_INLINE_EMDASH, +    CONTENT_INLINE_LINE_BREAK_START, +    CONTENT_INLINE_LINE_BREAK, +} blogc_content_parser_inline_state_t; + + +static char* +blogc_content_parse_inline_internal(const char *src, size_t src_len)  { -    // this function is always called by blogc_content_parse or by itself, -    // then its safe to assume that src is always nul-terminated. -    size_t src_len = strlen(src); -      size_t current = 0;      size_t start = 0; -    size_t start_state = 0; -    size_t end = 0; - -    sb_string_t *rv = sb_string_new(); - -    bool open_em_ast = false; -    bool open_strong_ast = false; -    bool open_em_und = false; -    bool open_strong_und = false; -    bool open_code = false; -    bool open_code_double = false; - -    blogc_content_parser_link_state_t state = LINK_CLOSED; -    bool is_image = false; +    size_t count = 0; -    char *tmp = NULL; +    const char *tmp = NULL;      char *tmp2 = NULL; +    char *tmp3 = NULL; + +    size_t start_link = 0; +    char *link1 = NULL; -    unsigned int open_bracket = 0; -    unsigned int spaces = 0; +    sb_string_t *rv = sb_string_new(); -    bool escape = false; +    blogc_content_parser_inline_state_t state = CONTENT_INLINE_START;      while (current < src_len) {          char c = src[current];          bool is_last = current == src_len - 1; -        if (escape) { -            if (state == LINK_CLOSED) -                sb_string_append_c(rv, c); -            current++; -            escape = false; -            continue; -        } - -        if (c != ' ' && c != '\n' && c != '\r') -            spaces = 0; - -        if (state == LINK_TEXT_CLOSE && c != ' ' && c != '\n' && c != '\r' && -            c != '(') -        { -            sb_string_append_c(rv, src[start_state]); -            tmp = blogc_content_parse_inline(src + start_state + 1); -            sb_string_append(rv, tmp); -            // no need to free here, we will exit the loop! -            break; -        } - -        switch (c) { - -            case '\\': -                if (state == LINK_CLOSED && (open_code || open_code_double)) { -                    sb_string_append_c(rv, c); +        switch (state) { +            case CONTENT_INLINE_START: +                if (is_last) { +                    htmlentities_append(rv, c); +                    break; +                } +                if (c == '\\') { +                    htmlentities_append(rv, src[++current]); +                    break; +                } +                if (c == '*') { +                    state = CONTENT_INLINE_ASTERISK;                      break;                  } -                if (!escape) -                    escape = true; +                if (c == '_') { +                    state = CONTENT_INLINE_UNDERSCORE; +                    break; +                } +                if (c == '`') { +                    state = CONTENT_INLINE_BACKTICKS; +                    break; +                } +                if (c == '[') { +                    state = CONTENT_INLINE_LINK_START; +                    break; +                } +                if (c == '!') { +                    state = CONTENT_INLINE_IMAGE_START; +                    break; +                } +                if (c == '-') { +                    state = CONTENT_INLINE_ENDASH; +                    break; +                } +                if (c == ' ') { +                    state = CONTENT_INLINE_LINE_BREAK_START; +                    break; +                } +                htmlentities_append(rv, c);                  break; -            case '*': -            case '_': -                if (state == LINK_CLOSED && (open_code || open_code_double)) { -                    sb_string_append_c(rv, c); +            case CONTENT_INLINE_ASTERISK: +                if (c == '*') { +                    state = CONTENT_INLINE_ASTERISK_DOUBLE;                      break;                  } -                if (!is_last && src[current + 1] == c) { -                    current++; -                    if ((c == '*' && open_strong_ast) || -                        (c == '_' && open_strong_und)) -                    { -                        if (state == LINK_CLOSED) -                            sb_string_append(rv, "</strong>"); -                        if (c == '*') -                            open_strong_ast = false; -                        else -                            open_strong_und = false; +                tmp = sb_str_find(src + current, '*'); +                if (tmp == NULL || ((tmp - src) >= src_len)) { +                    sb_string_append_c(rv, '*'); +                    state = CONTENT_INLINE_START; +                    continue; +                } +                tmp2 = blogc_content_parse_inline_internal( +                    src + current, (tmp - src) - current); +                sb_string_append_printf(rv, "<em>%s</em>", tmp2); +                current = tmp - src; +                tmp = NULL; +                free(tmp2); +                tmp2 = NULL; +                state = CONTENT_INLINE_START; +                break; + +            case CONTENT_INLINE_ASTERISK_DOUBLE: +                tmp = src + current; +                do { +                    tmp = sb_str_find(tmp, '*'); +                    if (((tmp - src) < src_len) && *(tmp + 1) == '*') {                          break;                      } -                    if (state == LINK_CLOSED) -                        sb_string_append(rv, "<strong>"); -                    if (c == '*') -                        open_strong_ast = true; -                    else -                        open_strong_und = true; -                    break; +                    tmp++; +                } while (tmp != NULL && (tmp - src) < src_len); +                if (tmp == NULL || ((tmp - src) >= src_len)) { +                    sb_string_append_c(rv, '*'); +                    sb_string_append_c(rv, '*'); +                    state = CONTENT_INLINE_START; +                    continue;                  } -                if ((c == '*' && open_em_ast) || (c == '_' && open_em_und)) { -                    if (state == LINK_CLOSED) -                        sb_string_append(rv, "</em>"); -                    if (c == '*') -                        open_em_ast = false; -                    else -                        open_em_und = false; +                tmp2 = blogc_content_parse_inline_internal( +                    src + current, (tmp - src) - current); +                sb_string_append_printf(rv, "<strong>%s</strong>", tmp2); +                current = tmp - src + 1; +                tmp = NULL; +                free(tmp2); +                tmp2 = NULL; +                state = CONTENT_INLINE_START; +                break; + +            case CONTENT_INLINE_UNDERSCORE: +                if (c == '_') { +                    state = CONTENT_INLINE_UNDERSCORE_DOUBLE;                      break;                  } -                if (state == LINK_CLOSED) -                    sb_string_append(rv, "<em>"); -                if (c == '*') -                    open_em_ast = true; -                else -                    open_em_und = true; +                tmp = sb_str_find(src + current, '_'); +                if (tmp == NULL || ((tmp - src) >= src_len)) { +                    sb_string_append_c(rv, '_'); +                    state = CONTENT_INLINE_START; +                    continue; +                } +                tmp2 = blogc_content_parse_inline_internal( +                    src + current, (tmp - src) - current); +                sb_string_append_printf(rv, "<em>%s</em>", tmp2); +                current = tmp - src; +                tmp = NULL; +                free(tmp2); +                tmp2 = NULL; +                state = CONTENT_INLINE_START;                  break; -            case '`': -                if (!is_last && src[current + 1] == c) { -                    current++; -                    if (state == LINK_CLOSED) -                        sb_string_append_printf(rv, "<%scode>", -                            open_code_double ? "/" : ""); -                    open_code_double = !open_code_double; +            case CONTENT_INLINE_UNDERSCORE_DOUBLE: +                tmp = src + current; +                do { +                    tmp = sb_str_find(tmp, '_'); +                    if (((tmp - src) < src_len) && *(tmp + 1) == '_') { +                        break; +                    } +                    tmp++; +                } while (tmp != NULL && (tmp - src) < src_len); +                if (tmp == NULL || ((tmp - src) >= src_len)) { +                    sb_string_append_c(rv, '_'); +                    sb_string_append_c(rv, '_'); +                    state = CONTENT_INLINE_START; +                    continue; +                } +                tmp2 = blogc_content_parse_inline_internal( +                    src + current, (tmp - src) - current); +                sb_string_append_printf(rv, "<strong>%s</strong>", tmp2); +                current = tmp - src + 1; +                tmp = NULL; +                free(tmp2); +                tmp2 = NULL; +                state = CONTENT_INLINE_START; +                break; + +            case CONTENT_INLINE_BACKTICKS: +                if (c == '`') { +                    state = CONTENT_INLINE_BACKTICKS_DOUBLE;                      break;                  } -                if (state == LINK_CLOSED) -                    sb_string_append_printf(rv, "<%scode>", open_code ? "/" : ""); -                open_code = !open_code; +                tmp = sb_str_find(src + current, '`'); +                if (tmp == NULL || ((tmp - src) >= src_len)) { +                    sb_string_append_c(rv, '`'); +                    state = CONTENT_INLINE_START; +                    continue; +                } +                tmp3 = sb_strndup(src + current, (tmp - src) - current); +                tmp2 = blogc_htmlentities(tmp3); +                free(tmp3); +                tmp3 = NULL; +                sb_string_append(rv, "<code>"); +                sb_string_append_escaped(rv, tmp2); +                sb_string_append(rv, "</code>"); +                current = tmp - src; +                tmp = NULL; +                free(tmp2); +                tmp2 = NULL; +                state = CONTENT_INLINE_START;                  break; -            case '!': -                if (state == LINK_CLOSED) { -                    if (open_code || open_code_double) { -                        sb_string_append_c(rv, c); +            case CONTENT_INLINE_BACKTICKS_DOUBLE: +                tmp = src + current; +                do { +                    tmp = sb_str_find(tmp, '`'); +                    if (((tmp - src) < src_len) && *(tmp + 1) == '`') {                          break;                      } -                    if (!is_last && src[current + 1] != '[') { -                        sb_string_append_c(rv, c); +                    tmp++; +                } while (tmp != NULL && (tmp - src) < src_len); +                if (tmp == NULL || ((tmp - src) >= src_len)) { +                    sb_string_append_c(rv, '`'); +                    sb_string_append_c(rv, '`'); +                    state = CONTENT_INLINE_START; +                    continue; +                } +                tmp3 = sb_strndup(src + current, (tmp - src) - current); +                tmp2 = blogc_htmlentities(tmp3); +                free(tmp3); +                tmp3 = NULL; +                sb_string_append(rv, "<code>"); +                sb_string_append_escaped(rv, tmp2); +                sb_string_append(rv, "</code>"); +                current = tmp - src + 1; +                tmp = NULL; +                free(tmp2); +                tmp2 = NULL; +                state = CONTENT_INLINE_START; +                break; + +            case CONTENT_INLINE_LINK_START: +                if (c == '[') { +                    state = CONTENT_INLINE_LINK_AUTO; +                    break; +                } +                start_link = current; +                count = 1; +                state = CONTENT_INLINE_LINK_CONTENT; +                break; + +            case CONTENT_INLINE_LINK_AUTO: +                tmp = src + current; +                do { +                    tmp = sb_str_find(tmp, ']'); +                    if (((tmp - src) < src_len) && *(tmp + 1) == ']') {                          break;                      } -                    state = LINK_IMAGE; -                    is_image = true; -                    start_state = current; +                    tmp++; +                } while (tmp != NULL && (tmp - src) < src_len); +                if (tmp == NULL || ((tmp - src) >= src_len)) { +                    sb_string_append_c(rv, '['); +                    sb_string_append_c(rv, '['); +                    state = CONTENT_INLINE_START; +                    continue;                  } +                tmp2 = sb_strndup(src + current, (tmp - src) - current); +                sb_string_append(rv, "<a href=\""); +                sb_string_append_escaped(rv, tmp2); +                sb_string_append(rv, "\">"); +                sb_string_append_escaped(rv, tmp2); +                sb_string_append(rv, "</a>"); +                current = tmp - src + 1; +                tmp = NULL; +                free(tmp2); +                tmp2 = NULL; +                state = CONTENT_INLINE_START;                  break; -            case '[': -                if (state == LINK_CLOSED && (open_code || open_code_double)) { -                    sb_string_append_c(rv, c); +            case CONTENT_INLINE_LINK_CONTENT: +                if (c == '\\') { +                    current++;                      break;                  } -                if (state == LINK_CLOSED || state == LINK_IMAGE) { -                    if (state == LINK_CLOSED) -                        start_state = current; -                    state = LINK_TEXT; -                    start = current + 1; -                    open_bracket = 0; +                if (c == '[') {  // links can be nested :/ +                    count++;                      break;                  } -                if (state == LINK_TEXT) { -                    if (current == start) { -                        start = current + 1; -                        state = LINK_AUTO; -                        break; +                if (c == ']') { +                    if (--count == 0) { +                        link1 = sb_strndup(src + start_link, current - start_link); +                        state = CONTENT_INLINE_LINK_URL_START;                      } -                    open_bracket++; -                    break;                  }                  break; -            case ']': -                if (state == LINK_AUTO) { -                    end = current; -                    state = LINK_AUTO_CLOSE; +            case CONTENT_INLINE_LINK_URL_START: +                if (c == ' ' || c == '\t' || c == '\n' || c == '\r') +                    break; +                if (c == '(') { +                    state = CONTENT_INLINE_LINK_URL; +                    start = current + 1;                      break;                  } -                if (state == LINK_AUTO_CLOSE) { -                    state = LINK_CLOSED; -                    tmp = sb_strndup(src + start, end - start); -                    sb_string_append_printf(rv, "<a href=\"%s\">%s</a>", tmp, tmp); -                    end = 0; -                    free(tmp); -                    tmp = NULL; -                    is_image = false; +                sb_string_append_c(rv, '['); +                state = CONTENT_INLINE_START; +                current = start_link; +                start_link = 0; +                continue; + +            case CONTENT_INLINE_LINK_URL: +                if (c == '\\') { +                    current++;                      break;                  } -                if (state == LINK_TEXT) { -                    if (open_bracket-- == 0) { -                        state = LINK_TEXT_CLOSE; -                        tmp = sb_strndup(src + start, current - start); -                        tmp2 = blogc_content_parse_inline(tmp); -                        free(tmp); -                        tmp = NULL; -                    } +                if (c == ')') { +                    tmp2 = sb_strndup(src + start, current - start); +                    tmp3 = blogc_content_parse_inline(link1); +                    free(link1); +                    link1 = NULL; +                    sb_string_append(rv, "<a href=\""); +                    sb_string_append_escaped(rv, tmp2); +                    sb_string_append_printf(rv, "\">%s</a>", tmp3); +                    free(tmp2); +                    tmp2 = NULL; +                    free(tmp3); +                    tmp3 = NULL; +                    state = CONTENT_INLINE_START;                      break;                  } -                if (state == LINK_CLOSED) -                    sb_string_append_c(rv, c);                  break; -            case '(': -                if (state == LINK_TEXT_CLOSE) { -                    state = LINK_URL; -                    start = current + 1; +            case CONTENT_INLINE_IMAGE_START: +                // we use the same variables used for links, because why not? +                if (c == '[') { +                    state = CONTENT_INLINE_IMAGE_ALT; +                    start_link = current + 1; +                    break; +                } +                sb_string_append_c(rv, '!'); +                state = CONTENT_INLINE_START; +                continue; + +            case CONTENT_INLINE_IMAGE_ALT: +                if (c == '\\') { +                    current++;                      break;                  } -                if (state == LINK_CLOSED) -                    sb_string_append_c(rv, c); +                if (c == ']') { +                    link1 = sb_strndup(src + start_link, current - start_link); +                    state = CONTENT_INLINE_IMAGE_URL_START; +                }                  break; -            case ')': -                if (state == LINK_URL) { -                    state = LINK_CLOSED; -                    tmp = sb_strndup(src + start, current - start); -                    if (is_image) -                        sb_string_append_printf(rv, "<img src=\"%s\" alt=\"%s\">", -                            tmp, tmp2); -                    else -                        sb_string_append_printf(rv, "<a href=\"%s\">%s</a>", -                            tmp, tmp2); -                    free(tmp); -                    tmp = NULL; +            case CONTENT_INLINE_IMAGE_URL_START: +                if (c == ' ' || c == '\t' || c == '\n' || c == '\r') +                    break; +                if (c == '(') { +                    state = CONTENT_INLINE_IMAGE_URL; +                    start = current + 1; +                    break; +                } +                sb_string_append_c(rv, '!'); +                sb_string_append_c(rv, '['); +                state = CONTENT_INLINE_START; +                current = start_link; +                start_link = 0; +                continue; + +            case CONTENT_INLINE_IMAGE_URL: +                if (c == '\\') { +                    current++; +                    break; +                } +                if (c == ')') { +                    tmp2 = sb_strndup(src + start, current - start); +                    sb_string_append(rv, "<img src=\""); +                    sb_string_append_escaped(rv, tmp2); +                    sb_string_append(rv, "\" alt=\""); +                    sb_string_append_escaped(rv, link1); +                    sb_string_append(rv, "\">");                      free(tmp2);                      tmp2 = NULL; -                    is_image = false; +                    free(link1); +                    link1 = NULL; +                    state = CONTENT_INLINE_START;                      break;                  } -                if (state == LINK_CLOSED) -                    sb_string_append_c(rv, c);                  break; -            case ' ': -                if (state == LINK_CLOSED) { -                    spaces++; -                    sb_string_append_c(rv, c); +            case CONTENT_INLINE_ENDASH: +                if (c == '-') { +                    if (is_last) { +                        sb_string_append(rv, "–"); +                        state = CONTENT_INLINE_START;  // wat +                        break; +                    } +                    state = CONTENT_INLINE_EMDASH; +                    break;                  } -                if (!is_last) +                sb_string_append_c(rv, '-'); +                state = CONTENT_INLINE_START; +                continue; + +            case CONTENT_INLINE_EMDASH: +                if (c == '-') { +                    sb_string_append(rv, "—"); +                    state = CONTENT_INLINE_START;                      break; +                } +                sb_string_append(rv, "–"); +                state = CONTENT_INLINE_START; +                continue; -            case '\n': -            case '\r': -                if (state == LINK_CLOSED) { -                    if (spaces >= 2) { +            case CONTENT_INLINE_LINE_BREAK_START: +                if (c == ' ') { +                    if (is_last) {                          sb_string_append(rv, "<br />"); -                        spaces = 0; +                        state = CONTENT_INLINE_START;  // wat +                        break;                      } -                    if (c == '\n' || c == '\r') -                        sb_string_append_c(rv, c); +                    count = 2; +                    state = CONTENT_INLINE_LINE_BREAK; +                    break;                  } -                break; +                sb_string_append_c(rv, ' '); +                state = CONTENT_INLINE_START; +                continue; -            case '-': -                if (state != LINK_CLOSED) -                    break; -                if ((current < (src_len - 1) && src[current + 1] == '-') && -                    !(open_code || open_code_double)) -                { -                    if (current < (src_len - 2) && src[current + 2] == '-') { -                        sb_string_append(rv, "—"); -                        current += 2; -                    } -                    else { -                        sb_string_append(rv, "–"); -                        current += 1; +            case CONTENT_INLINE_LINE_BREAK: +                if (c == ' ') { +                    if (is_last) { +                        sb_string_append(rv, "<br />"); +                        state = CONTENT_INLINE_START;  // wat +                        break;                      } +                    count++; +                    break;                  } -                else { -                    sb_string_append_c(rv, c); +                if (c == '\n' || c == '\r') { +                    sb_string_append_printf(rv, "<br />%c", c); +                    state = CONTENT_INLINE_START; +                    break;                  } -                break; - -            case '&': -                if (state == LINK_CLOSED) -                    sb_string_append(rv, "&"); -                break; - -            case '<': -                if (state == LINK_CLOSED) -                    sb_string_append(rv, "<"); -                break; - -            case '>': -                if (state == LINK_CLOSED) -                    sb_string_append(rv, ">"); -                break; - -            case '"': -                if (state == LINK_CLOSED) -                    sb_string_append(rv, """); -                break; - -            case '\'': -                if (state == LINK_CLOSED) -                    sb_string_append(rv, "'"); -                break; - -            case '/': -                if (state == LINK_CLOSED) -                    sb_string_append(rv, "/"); -                break; - -            default: -                if (state == LINK_CLOSED) -                    sb_string_append_c(rv, c); -        } - -        if (is_last && state != LINK_CLOSED) { -            sb_string_append_c(rv, src[start_state]); -            tmp = blogc_content_parse_inline(src + start_state + 1); -            sb_string_append(rv, tmp); -            // no need to free here, its the last iteration +                for (size_t i = 0; i < count; i++) +                    sb_string_append_c(rv, ' '); +                state = CONTENT_INLINE_START; +                continue;          }          current++;      } -    free(tmp); +    switch (state) { + +        // if after the end of the loop we are on any of the following states, +        // we must call the parser again, from start_link +        case CONTENT_INLINE_IMAGE_START: +        case CONTENT_INLINE_IMAGE_ALT: +        case CONTENT_INLINE_IMAGE_URL_START: +        case CONTENT_INLINE_IMAGE_URL: +            sb_string_append_c(rv, '!'); + +        case CONTENT_INLINE_LINK_CONTENT: +        case CONTENT_INLINE_LINK_URL_START: +        case CONTENT_INLINE_LINK_URL: +            tmp2 = blogc_content_parse_inline(src + start_link); +            sb_string_append_c(rv, '['); +            sb_string_append_escaped(rv, tmp2);  // no need to free, as it wil be done below. +            break; + +        // add all the other states here explicitly, so the compiler helps us +        // not missing any new state that should be handled. +        case CONTENT_INLINE_START: +        case CONTENT_INLINE_ASTERISK: +        case CONTENT_INLINE_ASTERISK_DOUBLE: +        case CONTENT_INLINE_UNDERSCORE: +        case CONTENT_INLINE_UNDERSCORE_DOUBLE: +        case CONTENT_INLINE_BACKTICKS: +        case CONTENT_INLINE_BACKTICKS_DOUBLE: +        case CONTENT_INLINE_LINK_START: +        case CONTENT_INLINE_LINK_AUTO: +        case CONTENT_INLINE_ENDASH: +        case CONTENT_INLINE_EMDASH: +        case CONTENT_INLINE_LINE_BREAK_START: +        case CONTENT_INLINE_LINE_BREAK: +            break; +    } +      free(tmp2); +    free(tmp3); +    free(link1);      return sb_string_free(rv, false);  } +char* +blogc_content_parse_inline(const char *src) +{ +    return blogc_content_parse_inline_internal(src, strlen(src)); +} + +  bool  blogc_is_ordered_list_item(const char *str, size_t prefix_len)  { diff --git a/src/utils.c b/src/utils.c index 855b503..d7362a6 100644 --- a/src/utils.c +++ b/src/utils.c @@ -287,6 +287,27 @@ sb_str_replace(const char *str, const char search, const char *replace)  } +char* +sb_str_find(const char *str, char c) +{ +    // this is somewhat similar to strchr, but respects '\' escaping. +    if (str == NULL) +        return NULL; +    if (c == '\0') +        return (char*) str + strlen(str); +    for (size_t i = 0; str[i] != '\0'; i++) { +        if (str[i] == '\\') { +            i++; +            continue; +        } +        if (str[i] == c) { +            return (char*) str + i; +        } +    } +    return NULL; +} + +  void  sb_strv_free(char **strv)  { @@ -425,6 +446,26 @@ sb_string_append_printf(sb_string_t *str, const char *format, ...)  } +sb_string_t* +sb_string_append_escaped(sb_string_t *str, const char *suffix) +{ +    if (str == NULL) +        return NULL; +    if (suffix == NULL) +        return str; +    bool escaped = false; +    for (size_t i = 0; suffix[i] != '\0'; i++) { +        if (suffix[i] == '\\' && !escaped) { +            escaped = true; +            continue; +        } +        escaped = false; +        str = sb_string_append_c(str, suffix[i]); +    } +    return str; +} + +  sb_trie_t*  sb_trie_new(sb_free_func_t free_func)  { diff --git a/src/utils.h b/src/utils.h index 411295a..aefcbf3 100644 --- a/src/utils.h +++ b/src/utils.h @@ -51,6 +51,7 @@ char* sb_str_rstrip(char *str);  char* sb_str_strip(char *str);  char** sb_str_split(const char *str, char c, unsigned int max_pieces);  char* sb_str_replace(const char *str, const char search, const char *replace); +char* sb_str_find(const char *str, char c);  void sb_strv_free(char **strv);  char* sb_strv_join(char **strv, const char *separator);  size_t sb_strv_length(char **strv); @@ -71,6 +72,7 @@ sb_string_t* sb_string_append_len(sb_string_t *str, const char *suffix, size_t l  sb_string_t* sb_string_append(sb_string_t *str, const char *suffix);  sb_string_t* sb_string_append_c(sb_string_t *str, char c);  sb_string_t* sb_string_append_printf(sb_string_t *str, const char *format, ...); +sb_string_t* sb_string_append_escaped(sb_string_t *str, const char *suffix);  // trie diff --git a/tests/check_content_parser.c b/tests/check_content_parser.c index 783d3f3..b0272fb 100644 --- a/tests/check_content_parser.c +++ b/tests/check_content_parser.c @@ -184,7 +184,7 @@ test_content_parse(void **state)          "<h6 id=\"seis\">seis</h6>\n"          "<p>bola\n"          "chunda</p>\n" -        "<blockquote><p>bola  <br />\n" +        "<blockquote><p>bola<br />\n"          "guda\n"          "buga</p>\n"          "<pre><code>asd</code></pre>\n" @@ -276,7 +276,7 @@ test_content_parse_crlf(void **state)          "<h6 id=\"seis\">seis</h6>\r\n"          "<p>bola\r\n"          "chunda</p>\r\n" -        "<blockquote><p>bola  <br />\r\n" +        "<blockquote><p>bola<br />\r\n"          "guda\r\n"          "buga</p>\r\n"          "<pre><code>asd</code></pre>\r\n" @@ -1485,14 +1485,13 @@ test_content_parse_invalid_code(void **state)  static void  test_content_parse_invalid_horizontal_rule(void **state)  { -    // this generates invalid html, but...      char *html = blogc_content_parse("** asd", NULL, NULL);      assert_non_null(html); -    assert_string_equal(html, "<p><strong> asd</p>\n"); +    assert_string_equal(html, "<p>** asd</p>\n");      free(html);      html = blogc_content_parse("** asd\n", NULL, NULL);      assert_non_null(html); -    assert_string_equal(html, "<p><strong> asd</p>\n"); +    assert_string_equal(html, "<p>** asd</p>\n");      free(html);  } @@ -1500,13 +1499,12 @@ test_content_parse_invalid_horizontal_rule(void **state)  static void  test_content_parse_invalid_unordered_list(void **state)  { -    // more invalid html      char *html = blogc_content_parse(          "*  asd\n"          "1. qwe", NULL, NULL);      assert_non_null(html);      assert_string_equal(html, -        "<p><em>  asd\n" +        "<p>*  asd\n"          "1. qwe</p>\n");      free(html);      html = blogc_content_parse( @@ -1515,7 +1513,7 @@ test_content_parse_invalid_unordered_list(void **state)          "\n", NULL, NULL);      assert_non_null(html);      assert_string_equal(html, -        "<p><em>  asd\n" +        "<p>*  asd\n"          "1. qwe</p>\n");      free(html);      html = blogc_content_parse( @@ -1523,7 +1521,7 @@ test_content_parse_invalid_unordered_list(void **state)          "1. qwe\n", NULL, NULL);      assert_non_null(html);      assert_string_equal(html, -        "<p><em>  asd\n" +        "<p>*  asd\n"          "1. qwe"          "</p>\n");      free(html); @@ -1532,7 +1530,7 @@ test_content_parse_invalid_unordered_list(void **state)          "1. qwe\n", NULL, NULL);      assert_non_null(html);      assert_string_equal(html, -        "<p><em> asd\n" +        "<p>* asd\n"          "1. qwe"          "</p>\n");      free(html); @@ -1546,7 +1544,7 @@ test_content_parse_invalid_unordered_list(void **state)      assert_non_null(html);      assert_string_equal(html,          "<p>chunda</p>\n" -        "<p><em> asd\n" +        "<p>* asd\n"          "1. qwe</p>\n"          "<p>poi</p>\n");      free(html); @@ -1556,14 +1554,13 @@ test_content_parse_invalid_unordered_list(void **state)  static void  test_content_parse_invalid_ordered_list(void **state)  { -    // more invalid html      char *html = blogc_content_parse(          "1. asd\n"          "*  qwe", NULL, NULL);      assert_non_null(html);      assert_string_equal(html,          "<p>1. asd\n" -        "<em>  qwe</p>\n"); +        "*  qwe</p>\n");      free(html);      html = blogc_content_parse(          "1. asd\n" @@ -1572,7 +1569,7 @@ test_content_parse_invalid_ordered_list(void **state)      assert_non_null(html);      assert_string_equal(html,          "<p>1. asd\n" -        "<em>  qwe</p>\n"); +        "*  qwe</p>\n");      free(html);      html = blogc_content_parse(          "1. asd\n" @@ -1580,7 +1577,7 @@ test_content_parse_invalid_ordered_list(void **state)      assert_non_null(html);      assert_string_equal(html,          "<p>1. asd\n" -        "<em>  qwe" +        "*  qwe"          "</p>\n");      free(html);      html = blogc_content_parse( @@ -1589,7 +1586,7 @@ test_content_parse_invalid_ordered_list(void **state)      assert_non_null(html);      assert_string_equal(html,          "<p>1. asd\n" -        "<em>  qwe" +        "*  qwe"          "</p>\n");      free(html);      html = blogc_content_parse( @@ -1603,7 +1600,7 @@ test_content_parse_invalid_ordered_list(void **state)      assert_string_equal(html,          "<p>chunda</p>\n"          "<p>1. asd\n" -        "<em>  qwe</p>\n" +        "*  qwe</p>\n"          "<p>poi</p>\n");      free(html);      html = blogc_content_parse( @@ -1612,7 +1609,7 @@ test_content_parse_invalid_ordered_list(void **state)      assert_non_null(html);      assert_string_equal(html,          "<p>1 asd\n" -        "<em> qwe</p>\n"); +        "* qwe</p>\n");      free(html);      html = blogc_content_parse(          "a. asd\n" @@ -1677,6 +1674,10 @@ test_content_parse_inline_em(void **state)      assert_non_null(html);      assert_string_equal(html, "<em>bola</em>\n");      free(html); +    html = blogc_content_parse_inline("*bo\\*la*\n"); +    assert_non_null(html); +    assert_string_equal(html, "<em>bo*la</em>\n"); +    free(html);      html = blogc_content_parse_inline("_bola_");      assert_non_null(html);      assert_string_equal(html, "<em>bola</em>"); @@ -1685,14 +1686,25 @@ test_content_parse_inline_em(void **state)      assert_non_null(html);      assert_string_equal(html, "<em>bola</em>\n");      free(html); +    html = blogc_content_parse_inline("_bo\\*la_\n"); +    assert_non_null(html); +    assert_string_equal(html, "<em>bo*la</em>\n"); +    free(html);      html = blogc_content_parse_inline("_**bola**_\n");      assert_non_null(html);      assert_string_equal(html, "<em><strong>bola</strong></em>\n");      free(html); -    // this is not really valid +    html = blogc_content_parse_inline("_**bo\\_\\*la**_\n"); +    assert_non_null(html); +    assert_string_equal(html, "<em><strong>bo_*la</strong></em>\n"); +    free(html);      html = blogc_content_parse_inline("_**bola\n");      assert_non_null(html); -    assert_string_equal(html, "<em><strong>bola\n"); +    assert_string_equal(html, "_**bola\n"); +    free(html); +    html = blogc_content_parse_inline("**_bola\\*\n"); +    assert_non_null(html); +    assert_string_equal(html, "**_bola*\n");      free(html);  } @@ -1708,6 +1720,10 @@ test_content_parse_inline_strong(void **state)      assert_non_null(html);      assert_string_equal(html, "<strong>bola</strong>\n");      free(html); +    html = blogc_content_parse_inline("**bo\*la**\n"); +    assert_non_null(html); +    assert_string_equal(html, "<strong>bo*la</strong>\n"); +    free(html);      html = blogc_content_parse_inline("__bola__");      assert_non_null(html);      assert_string_equal(html, "<strong>bola</strong>"); @@ -1716,14 +1732,25 @@ test_content_parse_inline_strong(void **state)      assert_non_null(html);      assert_string_equal(html, "<strong>bola</strong>\n");      free(html); +    html = blogc_content_parse_inline("__bo\*la__\n"); +    assert_non_null(html); +    assert_string_equal(html, "<strong>bo*la</strong>\n"); +    free(html);      html = blogc_content_parse_inline("__*bola*__\n");      assert_non_null(html);      assert_string_equal(html, "<strong><em>bola</em></strong>\n");      free(html); -    // this is not really valid +    html = blogc_content_parse_inline("__*bo\\_\\*la*__\n"); +    assert_non_null(html); +    assert_string_equal(html, "<strong><em>bo_*la</em></strong>\n"); +    free(html);      html = blogc_content_parse_inline("__*bola\n");      assert_non_null(html); -    assert_string_equal(html, "<strong><em>bola\n"); +    assert_string_equal(html, "__*bola\n"); +    free(html); +    html = blogc_content_parse_inline("__*bola\\_\n"); +    assert_non_null(html); +    assert_string_equal(html, "__*bola_\n");      free(html);  } @@ -1751,18 +1778,29 @@ test_content_parse_inline_code(void **state)      assert_non_null(html);      assert_string_equal(html, "<code>bo*la</code>\n");      free(html); -    // invalid +    html = blogc_content_parse_inline("``bo<la``\n"); +    assert_non_null(html); +    assert_string_equal(html, "<code>bo<la</code>\n"); +    free(html); +    html = blogc_content_parse_inline("`bo\\`\\`la`\n"); +    assert_non_null(html); +    assert_string_equal(html, "<code>bo``la</code>\n"); +    free(html); +    html = blogc_content_parse_inline("``bo\\`\\`la``\n"); +    assert_non_null(html); +    assert_string_equal(html, "<code>bo``la</code>\n"); +    free(html);      html = blogc_content_parse_inline("``bola\n");      assert_non_null(html); -    assert_string_equal(html, "<code>bola\n"); +    assert_string_equal(html, "``bola\n");      free(html);      html = blogc_content_parse_inline("`bola\n");      assert_non_null(html); -    assert_string_equal(html, "<code>bola\n"); +    assert_string_equal(html, "`bola\n");      free(html);      html = blogc_content_parse_inline("``bola`\n");      assert_non_null(html); -    assert_string_equal(html, "<code>bola<code>\n"); +    assert_string_equal(html, "``bola`\n");      free(html);  } @@ -1802,9 +1840,9 @@ test_content_parse_inline_link(void **state)      assert_non_null(html);      assert_string_equal(html, "<a href=\"http://example.org/\"><code>bola</code></a>\n");      free(html); -    html = blogc_content_parse_inline("[``bola(2)[3]**!\\``](http://example.org/)\n"); +    html = blogc_content_parse_inline("[``bola(2)[3]**!\\```](http://example.org/)\n");      assert_non_null(html); -    assert_string_equal(html, "<a href=\"http://example.org/\"><code>bola(2)[3]**!\\</code></a>\n"); +    assert_string_equal(html, "<a href=\"http://example.org/\"><code>bola(2)[3]**!`</code></a>\n");      free(html);      html = blogc_content_parse_inline("test suite!)\n"          "depends on [cmocka](http://cmocka.org/), though.\n"); @@ -1821,6 +1859,10 @@ test_content_parse_inline_link(void **state)      assert_non_null(html);      assert_string_equal(html, "<a href=\"\nhttp://example.org/\">bola</a>\n");      free(html); +    html = blogc_content_parse_inline("[bo[]\\[\\]()la](http://example.org/?\\(\\))\n"); +    assert_non_null(html); +    assert_string_equal(html, "<a href=\"http://example.org/?()\">bo[][]()la</a>\n"); +    free(html);      html = blogc_content_parse_inline("[bola](http://example.org/\n");      assert_non_null(html);      assert_string_equal(html, "[bola](http://example.org/\n"); @@ -1863,13 +1905,21 @@ test_content_parse_inline_link_auto(void **state)      assert_non_null(html);      assert_string_equal(html, "<a href=\"guda\">guda</a>\n");      free(html); +    html = blogc_content_parse_inline("[[http://example.org/?\\[\\]]]\n"); +    assert_non_null(html); +    assert_string_equal(html, "<a href=\"http://example.org/?[]\">http://example.org/?[]</a>\n"); +    free(html); +    html = blogc_content_parse_inline("[[http://example.org/?\\[\\]a]]\n"); +    assert_non_null(html); +    assert_string_equal(html, "<a href=\"http://example.org/?[]a\">http://example.org/?[]a</a>\n"); +    free(html);      html = blogc_content_parse_inline("[[guda]asd]");      assert_non_null(html); -    assert_string_equal(html, "<a href=\"guda\">guda</a>"); +    assert_string_equal(html, "[[guda]asd]");      free(html);      html = blogc_content_parse_inline("[[guda]asd]\n");      assert_non_null(html); -    assert_string_equal(html, "<a href=\"guda\">guda</a>\n"); +    assert_string_equal(html, "[[guda]asd]\n");      free(html);      html = blogc_content_parse_inline("[[guda]asd");      assert_non_null(html); @@ -1942,6 +1992,10 @@ test_content_parse_inline_image(void **state)      assert_non_null(html);      assert_string_equal(html, "<img src=\"\nhttp://example.org/\" alt=\"bola\">\n");      free(html); +    html = blogc_content_parse_inline("![bo\\[\\]()la](http://example.org/?\\(\\))\n"); +    assert_non_null(html); +    assert_string_equal(html, "<img src=\"http://example.org/?()\" alt=\"bo[]()la\">\n"); +    free(html);      html = blogc_content_parse_inline(";      assert_non_null(html);      assert_string_equal(html, "; @@ -1994,15 +2048,15 @@ test_content_parse_inline_line_break(void **state)  {      char *html = blogc_content_parse_inline("asd  \n");      assert_non_null(html); -    assert_string_equal(html, "asd  <br />\n"); +    assert_string_equal(html, "asd<br />\n");      free(html);      html = blogc_content_parse_inline("asd  ");      assert_non_null(html); -    assert_string_equal(html, "asd  <br />"); +    assert_string_equal(html, "asd<br />");      free(html);      html = blogc_content_parse_inline("asd   ");      assert_non_null(html); -    assert_string_equal(html, "asd   <br />"); +    assert_string_equal(html, "asd<br />");      free(html);      // invalid      html = blogc_content_parse_inline("asd "); @@ -2021,7 +2075,7 @@ test_content_parse_inline_line_break_crlf(void **state)  {      char *html = blogc_content_parse_inline("asd  \r\n");      assert_non_null(html); -    assert_string_equal(html, "asd  <br />\r\n"); +    assert_string_equal(html, "asd<br />\r\n");      free(html);      html = blogc_content_parse_inline("asd \r\n");      assert_non_null(html); @@ -2041,6 +2095,46 @@ test_content_parse_inline_endash_emdash(void **state)      assert_non_null(html);      assert_string_equal(html, "foo — bar");      free(html); +    html = blogc_content_parse_inline("foo --"); +    assert_non_null(html); +    assert_string_equal(html, "foo –"); +    free(html); +    html = blogc_content_parse_inline("foo ---"); +    assert_non_null(html); +    assert_string_equal(html, "foo —"); +    free(html); +    html = blogc_content_parse_inline("foo \\-\\-"); +    assert_non_null(html); +    assert_string_equal(html, "foo --"); +    free(html); +    html = blogc_content_parse_inline("foo \\-\\-\\-"); +    assert_non_null(html); +    assert_string_equal(html, "foo ---"); +    free(html); +    html = blogc_content_parse_inline("foo \\---"); +    assert_non_null(html); +    assert_string_equal(html, "foo -–"); +    free(html); +    html = blogc_content_parse_inline("foo \\----"); +    assert_non_null(html); +    assert_string_equal(html, "foo -—"); +    free(html); +    html = blogc_content_parse_inline("foo \\-\\- bar"); +    assert_non_null(html); +    assert_string_equal(html, "foo -- bar"); +    free(html); +    html = blogc_content_parse_inline("foo \\-\\-\\- bar"); +    assert_non_null(html); +    assert_string_equal(html, "foo --- bar"); +    free(html); +    html = blogc_content_parse_inline("foo \\--- bar"); +    assert_non_null(html); +    assert_string_equal(html, "foo -– bar"); +    free(html); +    html = blogc_content_parse_inline("foo \\---- bar"); +    assert_non_null(html); +    assert_string_equal(html, "foo -— bar"); +    free(html);      html = blogc_content_parse_inline("`foo -- bar`");      assert_non_null(html);      assert_string_equal(html, "<code>foo -- bar</code>"); diff --git a/tests/check_utils.c b/tests/check_utils.c index 6a6ceca..31087f1 100644 --- a/tests/check_utils.c +++ b/tests/check_utils.c @@ -257,6 +257,18 @@ test_str_replace(void **state)  static void +test_str_find(void **state) +{ +    assert_null(sb_str_find(NULL, 'c')); +    assert_string_equal(sb_str_find("bola", 'l'), "la"); +    assert_string_equal(sb_str_find("bo\\lalala", 'l'), "lala"); +    assert_string_equal(sb_str_find("bola", '\0'), ""); +    assert_null(sb_str_find("bola", 'g')); +    assert_null(sb_str_find("bo\\la", 'l')); +} + + +static void  test_strv_join(void **state)  {      char *pieces[] = {"guda","bola", "chunda", NULL}; @@ -529,6 +541,25 @@ test_string_append_printf(void **state)  static void +test_string_append_escaped(void **state) +{ +    sb_string_t *str = sb_string_new(); +    str = sb_string_append_escaped(str, NULL); +    assert_non_null(str); +    assert_string_equal(str->str, ""); +    assert_int_equal(str->len, 0); +    assert_int_equal(str->allocated_len, SB_STRING_CHUNK_SIZE); +    str = sb_string_append_escaped(str, "foo \\a bar \\\\ lol"); +    assert_non_null(str); +    assert_string_equal(str->str, "foo a bar \\ lol"); +    assert_int_equal(str->len, 15); +    assert_int_equal(str->allocated_len, SB_STRING_CHUNK_SIZE); +    assert_null(sb_string_free(str, true)); +    assert_null(sb_string_append_escaped(NULL, "asd")); +} + + +static void  test_trie_new(void **state)  {      sb_trie_t *trie = sb_trie_new(free); @@ -934,6 +965,7 @@ main(void)          unit_test(test_str_strip),          unit_test(test_str_split),          unit_test(test_str_replace), +        unit_test(test_str_find),          unit_test(test_strv_join),          unit_test(test_strv_length), @@ -945,6 +977,7 @@ main(void)          unit_test(test_string_append),          unit_test(test_string_append_c),          unit_test(test_string_append_printf), +        unit_test(test_string_append_escaped),          // trie          unit_test(test_trie_new),  | 
