diff options
Diffstat (limited to 'src/content-parser.c')
-rw-r--r-- | src/content-parser.c | 1509 |
1 files changed, 1509 insertions, 0 deletions
diff --git a/src/content-parser.c b/src/content-parser.c new file mode 100644 index 0000000..410fc90 --- /dev/null +++ b/src/content-parser.c @@ -0,0 +1,1509 @@ +/* + * blogc: A blog compiler. + * Copyright (C) 2015-2016 Rafael G. Martins <rafael@rafaelmartins.eng.br> + * + * This program can be distributed under the terms of the BSD License. + * See the file LICENSE. + */ + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "content-parser.h" +#include "utils.h" + +// this is a half ass implementation of a markdown-like syntax. bugs are +// expected. feel free to improve the parser and add new features. + + +char* +blogc_slugify(const char *str) +{ + if (str == NULL) + return NULL; + char *new_str = sb_strdup(str); + int diff = 'a' - 'A'; // just to avoid magic numbers + for (size_t i = 0; new_str[i] != '\0'; i++) { + if (new_str[i] >= 'a' && new_str[i] <= 'z') + continue; + if (new_str[i] >= '0' && new_str[i] <= '9') + continue; + if (new_str[i] >= 'A' && new_str[i] <= 'Z') + new_str[i] += diff; + else + new_str[i] = '-'; + } + return new_str; +} + + +static const char* +htmlentities(char c) +{ + switch (c) { + case '&': + return "&"; + case '<': + return "<"; + case '>': + return ">"; + case '"': + return """; + case '\'': + return "'"; + case '/': + return "/"; + } + return NULL; +} + + +static void +htmlentities_append(sb_string_t *str, char c) +{ + const char *e = htmlentities(c); + if (e == NULL) + sb_string_append_c(str, c); + else + sb_string_append(str, e); +} + + +char* +blogc_htmlentities(const char *str) +{ + if (str == NULL) + return NULL; + sb_string_t *rv = sb_string_new(); + for (size_t i = 0; str[i] != '\0'; i++) + htmlentities_append(rv, str[i]); + return sb_string_free(rv, false); +} + + +char* +blogc_fix_description(const char *paragraph) +{ + if (paragraph == NULL) + return NULL; + sb_string_t *rv = sb_string_new(); + bool last = false; + bool newline = false; + char *tmp = NULL; + size_t start = 0; + size_t current = 0; + while (true) { + switch (paragraph[current]) { + case '\0': + last = true; + case '\r': + case '\n': + if (newline) + break; + tmp = sb_strndup(paragraph + start, current - start); + sb_string_append(rv, sb_str_strip(tmp)); + free(tmp); + tmp = NULL; + if (!last) + sb_string_append_c(rv, ' '); + start = current + 1; + newline = true; + break; + default: + newline = false; + } + if (last) + break; + current++; + } + tmp = blogc_htmlentities(sb_str_strip(rv->str)); + sb_string_free(rv, true); + return tmp; +} + + +typedef enum { + CONTENT_START_LINE = 1, + CONTENT_EXCERPT, + CONTENT_EXCERPT_END, + CONTENT_HEADER, + CONTENT_HEADER_TITLE_START, + CONTENT_HEADER_TITLE, + CONTENT_HTML, + CONTENT_HTML_END, + CONTENT_BLOCKQUOTE, + CONTENT_BLOCKQUOTE_START, + CONTENT_BLOCKQUOTE_END, + CONTENT_CODE, + CONTENT_CODE_START, + CONTENT_CODE_END, + CONTENT_UNORDERED_LIST_OR_HORIZONTAL_RULE, + CONTENT_HORIZONTAL_RULE, + CONTENT_UNORDERED_LIST_START, + CONTENT_UNORDERED_LIST_END, + CONTENT_ORDERED_LIST, + CONTENT_ORDERED_LIST_SPACE, + CONTENT_ORDERED_LIST_START, + CONTENT_ORDERED_LIST_END, + CONTENT_PARAGRAPH, + CONTENT_PARAGRAPH_END, +} blogc_content_parser_state_t; + + +typedef enum { + CONTENT_INLINE_START = 1, + CONTENT_INLINE_ASTERISK, + CONTENT_INLINE_ASTERISK_DOUBLE, + CONTENT_INLINE_UNDERSCORE, + CONTENT_INLINE_UNDERSCORE_DOUBLE, + CONTENT_INLINE_BACKTICKS, + CONTENT_INLINE_BACKTICKS_DOUBLE, + CONTENT_INLINE_LINK_START, + CONTENT_INLINE_LINK_AUTO, + CONTENT_INLINE_LINK_CONTENT, + CONTENT_INLINE_LINK_URL_START, + CONTENT_INLINE_LINK_URL, + CONTENT_INLINE_IMAGE_START, + CONTENT_INLINE_IMAGE_ALT, + CONTENT_INLINE_IMAGE_URL_START, + CONTENT_INLINE_IMAGE_URL, + CONTENT_INLINE_ENDASH, + CONTENT_INLINE_EMDASH, + CONTENT_INLINE_LINE_BREAK_START, + CONTENT_INLINE_LINE_BREAK, +} blogc_content_parser_inline_state_t; + + +static char* +blogc_content_parse_inline_internal(const char *src, size_t src_len) +{ + size_t current = 0; + size_t start = 0; + size_t count = 0; + + const char *tmp = NULL; + char *tmp2 = NULL; + char *tmp3 = NULL; + + size_t start_link = 0; + char *link1 = NULL; + + sb_string_t *rv = sb_string_new(); + + blogc_content_parser_inline_state_t state = CONTENT_INLINE_START; + + while (current < src_len) { + char c = src[current]; + bool is_last = current == src_len - 1; + + switch (state) { + case CONTENT_INLINE_START: + if (is_last) { + htmlentities_append(rv, c); + break; + } + if (c == '\\') { + htmlentities_append(rv, src[++current]); + break; + } + if (c == '*') { + state = CONTENT_INLINE_ASTERISK; + break; + } + if (c == '_') { + state = CONTENT_INLINE_UNDERSCORE; + break; + } + if (c == '`') { + state = CONTENT_INLINE_BACKTICKS; + break; + } + if (c == '[') { + state = CONTENT_INLINE_LINK_START; + break; + } + if (c == '!') { + state = CONTENT_INLINE_IMAGE_START; + break; + } + if (c == '-') { + state = CONTENT_INLINE_ENDASH; + break; + } + if (c == ' ') { + state = CONTENT_INLINE_LINE_BREAK_START; + break; + } + htmlentities_append(rv, c); + break; + + case CONTENT_INLINE_ASTERISK: + if (c == '*') { + state = CONTENT_INLINE_ASTERISK_DOUBLE; + break; + } + tmp = sb_str_find(src + current, '*'); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '*'); + state = CONTENT_INLINE_START; + continue; + } + tmp2 = blogc_content_parse_inline_internal( + src + current, (tmp - src) - current); + sb_string_append_printf(rv, "<em>%s</em>", tmp2); + current = tmp - src; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_ASTERISK_DOUBLE: + tmp = src + current; + do { + tmp = sb_str_find(tmp, '*'); + if (((tmp - src) < src_len) && *(tmp + 1) == '*') { + break; + } + tmp++; + } while (tmp != NULL && (tmp - src) < src_len); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '*'); + sb_string_append_c(rv, '*'); + state = CONTENT_INLINE_START; + continue; + } + tmp2 = blogc_content_parse_inline_internal( + src + current, (tmp - src) - current); + sb_string_append_printf(rv, "<strong>%s</strong>", tmp2); + current = tmp - src + 1; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_UNDERSCORE: + if (c == '_') { + state = CONTENT_INLINE_UNDERSCORE_DOUBLE; + break; + } + tmp = sb_str_find(src + current, '_'); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '_'); + state = CONTENT_INLINE_START; + continue; + } + tmp2 = blogc_content_parse_inline_internal( + src + current, (tmp - src) - current); + sb_string_append_printf(rv, "<em>%s</em>", tmp2); + current = tmp - src; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_UNDERSCORE_DOUBLE: + tmp = src + current; + do { + tmp = sb_str_find(tmp, '_'); + if (((tmp - src) < src_len) && *(tmp + 1) == '_') { + break; + } + tmp++; + } while (tmp != NULL && (tmp - src) < src_len); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '_'); + sb_string_append_c(rv, '_'); + state = CONTENT_INLINE_START; + continue; + } + tmp2 = blogc_content_parse_inline_internal( + src + current, (tmp - src) - current); + sb_string_append_printf(rv, "<strong>%s</strong>", tmp2); + current = tmp - src + 1; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_BACKTICKS: + if (c == '`') { + state = CONTENT_INLINE_BACKTICKS_DOUBLE; + break; + } + tmp = sb_str_find(src + current, '`'); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '`'); + state = CONTENT_INLINE_START; + continue; + } + tmp3 = sb_strndup(src + current, (tmp - src) - current); + tmp2 = blogc_htmlentities(tmp3); + free(tmp3); + tmp3 = NULL; + sb_string_append(rv, "<code>"); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "</code>"); + current = tmp - src; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_BACKTICKS_DOUBLE: + tmp = src + current; + do { + tmp = sb_str_find(tmp, '`'); + if (((tmp - src) < src_len) && *(tmp + 1) == '`') { + break; + } + tmp++; + } while (tmp != NULL && (tmp - src) < src_len); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '`'); + sb_string_append_c(rv, '`'); + state = CONTENT_INLINE_START; + continue; + } + tmp3 = sb_strndup(src + current, (tmp - src) - current); + tmp2 = blogc_htmlentities(tmp3); + free(tmp3); + tmp3 = NULL; + sb_string_append(rv, "<code>"); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "</code>"); + current = tmp - src + 1; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_LINK_START: + if (c == '[') { + state = CONTENT_INLINE_LINK_AUTO; + break; + } + start_link = current; + count = 1; + state = CONTENT_INLINE_LINK_CONTENT; + break; + + case CONTENT_INLINE_LINK_AUTO: + tmp = src + current; + do { + tmp = sb_str_find(tmp, ']'); + if (((tmp - src) < src_len) && *(tmp + 1) == ']') { + break; + } + tmp++; + } while (tmp != NULL && (tmp - src) < src_len); + if (tmp == NULL || ((tmp - src) >= src_len)) { + sb_string_append_c(rv, '['); + sb_string_append_c(rv, '['); + state = CONTENT_INLINE_START; + continue; + } + tmp2 = sb_strndup(src + current, (tmp - src) - current); + sb_string_append(rv, "<a href=\""); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "\">"); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "</a>"); + current = tmp - src + 1; + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_INLINE_START; + break; + + case CONTENT_INLINE_LINK_CONTENT: + if (c == '\\') { + current++; + break; + } + if (c == '[') { // links can be nested :/ + count++; + break; + } + if (c == ']') { + if (--count == 0) { + link1 = sb_strndup(src + start_link, current - start_link); + state = CONTENT_INLINE_LINK_URL_START; + } + } + break; + + case CONTENT_INLINE_LINK_URL_START: + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + break; + if (c == '(') { + state = CONTENT_INLINE_LINK_URL; + start = current + 1; + break; + } + sb_string_append_c(rv, '['); + state = CONTENT_INLINE_START; + current = start_link; + start_link = 0; + continue; + + case CONTENT_INLINE_LINK_URL: + if (c == '\\') { + current++; + break; + } + if (c == ')') { + tmp2 = sb_strndup(src + start, current - start); + tmp3 = blogc_content_parse_inline(link1); + free(link1); + link1 = NULL; + sb_string_append(rv, "<a href=\""); + sb_string_append_escaped(rv, tmp2); + sb_string_append_printf(rv, "\">%s</a>", tmp3); + free(tmp2); + tmp2 = NULL; + free(tmp3); + tmp3 = NULL; + state = CONTENT_INLINE_START; + break; + } + break; + + case CONTENT_INLINE_IMAGE_START: + // we use the same variables used for links, because why not? + if (c == '[') { + state = CONTENT_INLINE_IMAGE_ALT; + start_link = current + 1; + break; + } + sb_string_append_c(rv, '!'); + state = CONTENT_INLINE_START; + continue; + + case CONTENT_INLINE_IMAGE_ALT: + if (c == '\\') { + current++; + break; + } + if (c == ']') { + link1 = sb_strndup(src + start_link, current - start_link); + state = CONTENT_INLINE_IMAGE_URL_START; + } + break; + + case CONTENT_INLINE_IMAGE_URL_START: + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + break; + if (c == '(') { + state = CONTENT_INLINE_IMAGE_URL; + start = current + 1; + break; + } + sb_string_append_c(rv, '!'); + sb_string_append_c(rv, '['); + state = CONTENT_INLINE_START; + current = start_link; + start_link = 0; + continue; + + case CONTENT_INLINE_IMAGE_URL: + if (c == '\\') { + current++; + break; + } + if (c == ')') { + tmp2 = sb_strndup(src + start, current - start); + sb_string_append(rv, "<img src=\""); + sb_string_append_escaped(rv, tmp2); + sb_string_append(rv, "\" alt=\""); + sb_string_append_escaped(rv, link1); + sb_string_append(rv, "\">"); + free(tmp2); + tmp2 = NULL; + free(link1); + link1 = NULL; + state = CONTENT_INLINE_START; + break; + } + break; + + case CONTENT_INLINE_ENDASH: + if (c == '-') { + if (is_last) { + sb_string_append(rv, "–"); + state = CONTENT_INLINE_START; // wat + break; + } + state = CONTENT_INLINE_EMDASH; + break; + } + sb_string_append_c(rv, '-'); + state = CONTENT_INLINE_START; + continue; + + case CONTENT_INLINE_EMDASH: + if (c == '-') { + sb_string_append(rv, "—"); + state = CONTENT_INLINE_START; + break; + } + sb_string_append(rv, "–"); + state = CONTENT_INLINE_START; + continue; + + case CONTENT_INLINE_LINE_BREAK_START: + if (c == ' ') { + if (is_last) { + sb_string_append(rv, "<br />"); + state = CONTENT_INLINE_START; // wat + break; + } + count = 2; + state = CONTENT_INLINE_LINE_BREAK; + break; + } + sb_string_append_c(rv, ' '); + state = CONTENT_INLINE_START; + continue; + + case CONTENT_INLINE_LINE_BREAK: + if (c == ' ') { + if (is_last) { + sb_string_append(rv, "<br />"); + state = CONTENT_INLINE_START; // wat + break; + } + count++; + break; + } + if (c == '\n' || c == '\r') { + sb_string_append_printf(rv, "<br />%c", c); + state = CONTENT_INLINE_START; + break; + } + for (size_t i = 0; i < count; i++) + sb_string_append_c(rv, ' '); + state = CONTENT_INLINE_START; + continue; + } + current++; + } + + switch (state) { + + // if after the end of the loop we are on any of the following states, + // we must call the parser again, from start_link + case CONTENT_INLINE_IMAGE_START: + case CONTENT_INLINE_IMAGE_ALT: + case CONTENT_INLINE_IMAGE_URL_START: + case CONTENT_INLINE_IMAGE_URL: + sb_string_append_c(rv, '!'); + + case CONTENT_INLINE_LINK_CONTENT: + case CONTENT_INLINE_LINK_URL_START: + case CONTENT_INLINE_LINK_URL: + tmp2 = blogc_content_parse_inline(src + start_link); + sb_string_append_c(rv, '['); + sb_string_append_escaped(rv, tmp2); // no need to free, as it wil be done below. + break; + + // add all the other states here explicitly, so the compiler helps us + // not missing any new state that should be handled. + case CONTENT_INLINE_START: + case CONTENT_INLINE_ASTERISK: + case CONTENT_INLINE_ASTERISK_DOUBLE: + case CONTENT_INLINE_UNDERSCORE: + case CONTENT_INLINE_UNDERSCORE_DOUBLE: + case CONTENT_INLINE_BACKTICKS: + case CONTENT_INLINE_BACKTICKS_DOUBLE: + case CONTENT_INLINE_LINK_START: + case CONTENT_INLINE_LINK_AUTO: + case CONTENT_INLINE_ENDASH: + case CONTENT_INLINE_EMDASH: + case CONTENT_INLINE_LINE_BREAK_START: + case CONTENT_INLINE_LINE_BREAK: + break; + } + + free(tmp2); + free(tmp3); + free(link1); + + return sb_string_free(rv, false); +} + + +char* +blogc_content_parse_inline(const char *src) +{ + return blogc_content_parse_inline_internal(src, strlen(src)); +} + + +bool +blogc_is_ordered_list_item(const char *str, size_t prefix_len) +{ + if (str == NULL) + return false; + + if (strlen(str) < 2) + return false; + + size_t i; + + for (i = 0; str[i] >= '0' && str[i] <= '9'; i++); + + if (i == 0) + return false; + if (str[i] != '.') + return false; + + for (i++; i < prefix_len && (str[i] == ' ' || str[i] == '\t'); i++); + + if (str[i] == '\0') + return false; + + return i == prefix_len; +} + + +static blogc_content_node_t* +block_node_new(blogc_content_block_type_t type, char *content, sb_trie_t *parameters) +{ + blogc_content_node_t *rv = sb_malloc(sizeof(blogc_content_node_t)); + rv->node_type = BLOGC_CONTENT_BLOCK; + rv->type.block_type = type; + rv->content = content; + rv->parameters = parameters; + rv->child = NULL; + rv->next = NULL; + return rv; +} + + +static blogc_content_node_t* +inline_node_new(blogc_content_inline_type_t type, char *content, sb_trie_t *parameters) +{ + blogc_content_node_t *rv = sb_malloc(sizeof(blogc_content_node_t)); + rv->node_type = BLOGC_CONTENT_INLINE; + rv->type.inline_type = type; + rv->content = content; + rv->parameters = parameters; + rv->child = NULL; + rv->next = NULL; + return rv; +} + + +blogc_content_node_t* +blogc_content_parse_ast(const char *src, char **nl) +{ + // src is always nul-terminated. + size_t src_len = strlen(src); + + size_t current = 0; + size_t start = 0; + size_t start2 = 0; + size_t end = 0; + size_t real_end = 0; + + unsigned int header_level = 0; + char *prefix = NULL; + size_t prefix_len = 0; + char *tmp = NULL; + char *tmp2 = NULL; + char *parsed = NULL; + + // this isn't empty because we need some reasonable default value in the + // unlikely case that we need to print some line ending before evaluating + // the "real" value. + char line_ending[3] = "\n"; + bool line_ending_found = false; + + char d = '\0'; + + sb_slist_t *lines = NULL; + sb_slist_t *lines2 = NULL; + + sb_string_t *tmp_str = NULL; + + blogc_content_node_t *ast = NULL; + blogc_content_node_t *last = NULL; + + blogc_content_parser_state_t state = CONTENT_START_LINE; + + while (current < src_len) { + char c = src[current]; + bool is_last = current == src_len - 1; + + if (c == '\n' || c == '\r') { + if ((current + 1) < src_len) { + if ((c == '\n' && src[current + 1] == '\r') || + (c == '\r' && src[current + 1] == '\n')) + { + if (!line_ending_found) { + line_ending[0] = c; + line_ending[1] = src[current + 1]; + line_ending[2] = '\0'; + line_ending_found = true; + } + real_end = current; + c = src[++current]; + is_last = current == src_len - 1; + } + } + if (!line_ending_found) { + line_ending[0] = c; + line_ending[1] = '\0'; + line_ending_found = true; + } + } + + switch (state) { + + case CONTENT_START_LINE: + if (c == '\n' || c == '\r' || is_last) + break; + start = current; + if (c == '.') { + state = CONTENT_EXCERPT; + break; + } + if (c == '#') { + header_level = 1; + state = CONTENT_HEADER; + break; + } + if (c == '*' || c == '+' || c == '-') { + start2 = current; + state = CONTENT_UNORDERED_LIST_OR_HORIZONTAL_RULE; + d = c; + break; + } + if (c >= '0' && c <= '9') { + start2 = current; + state = CONTENT_ORDERED_LIST; + break; + } + if (c == ' ' || c == '\t') { + start2 = current; + state = CONTENT_CODE; + break; + } + if (c == '<') { + state = CONTENT_HTML; + break; + } + if (c == '>') { + state = CONTENT_BLOCKQUOTE; + start2 = current; + break; + } + state = CONTENT_PARAGRAPH; + break; + + case CONTENT_EXCERPT: + if (c == '.') + break; + if (c == '\n' || c == '\r') { + state = CONTENT_EXCERPT_END; + break; + } + state = CONTENT_PARAGRAPH; + break; + + case CONTENT_EXCERPT_END: + if (c == '\n' || c == '\r') { + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_EXCERPT, NULL, NULL); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_EXCERPT, NULL, NULL); + last = last->next; + } + state = CONTENT_START_LINE; + break; + } + state = CONTENT_PARAGRAPH_END; + break; + + case CONTENT_HEADER: + if (c == '#') { + header_level += 1; + break; + } + if (c == ' ' || c == '\t') { + state = CONTENT_HEADER_TITLE_START; + break; + } + state = CONTENT_PARAGRAPH; + break; + + case CONTENT_HEADER_TITLE_START: + if (c == ' ' || c == '\t') + break; + start = current; + if (c != '\n' && c != '\r') { + state = CONTENT_HEADER_TITLE; + break; + } + + case CONTENT_HEADER_TITLE: + if (c == '\n' || c == '\r' || is_last) { + end = is_last && c != '\n' && c != '\r' ? src_len : + (real_end != 0 ? real_end : current); + tmp = sb_strndup(src + start, end - start); + sb_trie_t *t = sb_trie_new(free); + sb_trie_insert(t, "level", sb_strdup_printf("%d", header_level)); + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_HEADER, blogc_content_parse_inline(tmp), t); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_HEADER, blogc_content_parse_inline(tmp), t); // TODO: inline-me + last = last->next; + } + free(tmp); + tmp = NULL; + state = CONTENT_START_LINE; + start = current; + } + break; + + case CONTENT_HTML: + if (c == '\n' || c == '\r' || is_last) { + state = CONTENT_HTML_END; + end = is_last && c != '\n' && c != '\r' ? src_len : + (real_end != 0 ? real_end : current); + } + if (!is_last) + break; + + case CONTENT_HTML_END: + if (c == '\n' || c == '\r' || is_last) { + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_RAW, + sb_strndup(src + start, end - start), NULL); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_RAW, + sb_strndup(src + start, end - start), NULL); + last = last->next; + } + state = CONTENT_START_LINE; + start = current; + } + else + state = CONTENT_HTML; + break; + + case CONTENT_BLOCKQUOTE: + if (c == ' ' || c == '\t') + break; + prefix = sb_strndup(src + start, current - start); + state = CONTENT_BLOCKQUOTE_START; + break; + + case CONTENT_BLOCKQUOTE_START: + if (c == '\n' || c == '\r' || is_last) { + end = is_last && c != '\n' && c != '\r' ? src_len : + (real_end != 0 ? real_end : current); + tmp = sb_strndup(src + start2, end - start2); + if (sb_str_starts_with(tmp, prefix)) { + lines = sb_slist_append(lines, sb_strdup(tmp + strlen(prefix))); + state = CONTENT_BLOCKQUOTE_END; + } + else { + state = CONTENT_PARAGRAPH; + free(prefix); + prefix = NULL; + sb_slist_free_full(lines, free); + lines = NULL; + if (is_last) { + free(tmp); + tmp = NULL; + continue; + } + } + free(tmp); + tmp = NULL; + } + if (!is_last) + break; + + case CONTENT_BLOCKQUOTE_END: + if (c == '\n' || c == '\r' || is_last) { + tmp_str = sb_string_new(); + for (sb_slist_t *l = lines; l != NULL; l = l->next) + sb_string_append_printf(tmp_str, "%s%s", l->data, + line_ending); + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_BLOCKQUOTE, + NULL, NULL); + ast->child = blogc_content_parse_ast(tmp_str->str, nl); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_BLOCKQUOTE, + NULL, NULL); + last->next->child = blogc_content_parse_ast(tmp_str->str, nl); + last = last->next; + } + sb_string_free(tmp_str, true); + tmp_str = NULL; + sb_slist_free_full(lines, free); + lines = NULL; + free(prefix); + prefix = NULL; + state = CONTENT_START_LINE; + start2 = current; + } + else { + start2 = current; + state = CONTENT_BLOCKQUOTE_START; + } + break; + + case CONTENT_CODE: + if (c == ' ' || c == '\t') + break; + prefix = sb_strndup(src + start, current - start); + state = CONTENT_CODE_START; + break; + + case CONTENT_CODE_START: + if (c == '\n' || c == '\r' || is_last) { + end = is_last && c != '\n' && c != '\r' ? src_len : + (real_end != 0 ? real_end : current); + tmp = sb_strndup(src + start2, end - start2); + if (sb_str_starts_with(tmp, prefix)) { + lines = sb_slist_append(lines, sb_strdup(tmp + strlen(prefix))); + state = CONTENT_CODE_END; + } + else { + state = CONTENT_PARAGRAPH; + free(prefix); + prefix = NULL; + sb_slist_free_full(lines, free); + lines = NULL; + free(tmp); + tmp = NULL; + if (is_last) + continue; + break; + } + free(tmp); + tmp = NULL; + } + if (!is_last) + break; + + case CONTENT_CODE_END: + if (c == '\n' || c == '\r' || is_last) { + tmp_str = sb_string_new(); + for (sb_slist_t *l = lines; l != NULL; l = l->next) { + if (l->next == NULL) + sb_string_append_printf(tmp_str, "%s", l->data); + else + sb_string_append_printf(tmp_str, "%s%s", l->data, + line_ending); + } + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_CODE, + sb_string_free(tmp_str, false), NULL); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_CODE, + sb_string_free(tmp_str, false), NULL); + last = last->next; + } + tmp_str = NULL; + sb_slist_free_full(lines, free); + lines = NULL; + free(prefix); + prefix = NULL; + state = CONTENT_START_LINE; + start2 = current; + } + else { + start2 = current; + state = CONTENT_CODE_START; + } + break; + + case CONTENT_UNORDERED_LIST_OR_HORIZONTAL_RULE: + if (c == d) { + state = CONTENT_HORIZONTAL_RULE; + if (is_last) + continue; + break; + } + if (c == ' ' || c == '\t') + break; + prefix = sb_strndup(src + start, current - start); + state = CONTENT_UNORDERED_LIST_START; + break; + + case CONTENT_HORIZONTAL_RULE: + if (c == d && !is_last) { + break; + } + if (c == '\n' || c == '\r' || is_last) { + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_HORIZONTAL_RULE, + NULL, NULL); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_HORIZONTAL_RULE, + NULL, NULL); + last = last->next; + } + state = CONTENT_START_LINE; + start = current; + d = '\0'; + break; + } + state = CONTENT_PARAGRAPH; + break; + + case CONTENT_UNORDERED_LIST_START: + if (c == '\n' || c == '\r' || is_last) { + end = is_last && c != '\n' && c != '\r' ? src_len : + (real_end != 0 ? real_end : current); + tmp = sb_strndup(src + start2, end - start2); + tmp2 = sb_strdup_printf("%-*s", strlen(prefix), ""); + if (sb_str_starts_with(tmp, prefix)) { + if (lines2 != NULL) { + tmp_str = sb_string_new(); + for (sb_slist_t *l = lines2; l != NULL; l = l->next) { + if (l->next == NULL) + sb_string_append_printf(tmp_str, "%s", l->data); + else + sb_string_append_printf(tmp_str, "%s%s", l->data, + line_ending); + } + sb_slist_free_full(lines2, free); + lines2 = NULL; + parsed = blogc_content_parse_inline(tmp_str->str); + sb_string_free(tmp_str, true); + lines = sb_slist_append(lines, sb_strdup(parsed)); + free(parsed); + parsed = NULL; + } + lines2 = sb_slist_append(lines2, sb_strdup(tmp + strlen(prefix))); + } + else if (sb_str_starts_with(tmp, tmp2)) { + lines2 = sb_slist_append(lines2, sb_strdup(tmp + strlen(prefix))); + } + else { + state = CONTENT_PARAGRAPH_END; + free(tmp); + tmp = NULL; + free(tmp2); + tmp2 = NULL; + free(prefix); + prefix = NULL; + sb_slist_free_full(lines, free); + sb_slist_free_full(lines2, free); + lines = NULL; + if (is_last) + continue; + break; + } + free(tmp); + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_UNORDERED_LIST_END; + } + if (!is_last) + break; + + case CONTENT_UNORDERED_LIST_END: + if (c == '\n' || c == '\r' || is_last) { + if (lines2 != NULL) { + // FIXME: avoid repeting the code below + tmp_str = sb_string_new(); + for (sb_slist_t *l = lines2; l != NULL; l = l->next) { + if (l->next == NULL) + sb_string_append_printf(tmp_str, "%s", l->data); + else + sb_string_append_printf(tmp_str, "%s%s", l->data, + line_ending); + } + sb_slist_free_full(lines2, free); + lines2 = NULL; + parsed = blogc_content_parse_inline(tmp_str->str); + sb_string_free(tmp_str, true); + lines = sb_slist_append(lines, sb_strdup(parsed)); + free(parsed); + parsed = NULL; + } + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_UNORDERED_LIST, + NULL, NULL); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_UNORDERED_LIST, + NULL, NULL); + last = last->next; + } + blogc_content_node_t *last_list = NULL; + for (sb_slist_t *l = lines; l != NULL; l = l->next) { + if (last_list == NULL) { + last->child = block_node_new(BLOGC_CONTENT_BLOCK_LIST_ITEM, + l->data, NULL); + last_list = last->child; + } + else { + last_list->next = block_node_new(BLOGC_CONTENT_BLOCK_LIST_ITEM, + l->data, NULL); + last_list = last_list->next; + } + } + sb_slist_free(lines); + lines = NULL; + free(prefix); + prefix = NULL; + state = CONTENT_START_LINE; + start2 = current; + } + else { + start2 = current; + state = CONTENT_UNORDERED_LIST_START; + } + break; + + case CONTENT_ORDERED_LIST: + if (c >= '0' && c <= '9') + break; + if (c == '.') { + state = CONTENT_ORDERED_LIST_SPACE; + break; + } + state = CONTENT_PARAGRAPH; + if (is_last) + continue; + break; + + case CONTENT_ORDERED_LIST_SPACE: + if (c == ' ' || c == '\t') + break; + prefix_len = current - start; + state = CONTENT_ORDERED_LIST_START; + if (c != '\n' && c != '\r' && !is_last) + break; + + case CONTENT_ORDERED_LIST_START: + if (c == '\n' || c == '\r' || is_last) { + end = is_last && c != '\n' && c != '\r' ? src_len : + (real_end != 0 ? real_end : current); + tmp = sb_strndup(src + start2, end - start2); + tmp2 = sb_strdup_printf("%-*s", prefix_len, ""); + if (blogc_is_ordered_list_item(tmp, prefix_len)) { + if (lines2 != NULL) { + tmp_str = sb_string_new(); + for (sb_slist_t *l = lines2; l != NULL; l = l->next) { + if (l->next == NULL) + sb_string_append_printf(tmp_str, "%s", l->data); + else + sb_string_append_printf(tmp_str, "%s%s", l->data, + line_ending); + } + sb_slist_free_full(lines2, free); + lines2 = NULL; + parsed = blogc_content_parse_inline(tmp_str->str); + sb_string_free(tmp_str, true); + lines = sb_slist_append(lines, sb_strdup(parsed)); + free(parsed); + parsed = NULL; + } + lines2 = sb_slist_append(lines2, sb_strdup(tmp + prefix_len)); + } + else if (sb_str_starts_with(tmp, tmp2)) { + lines2 = sb_slist_append(lines2, sb_strdup(tmp + prefix_len)); + } + else { + state = CONTENT_PARAGRAPH_END; + free(tmp); + tmp = NULL; + free(tmp2); + tmp2 = NULL; + free(parsed); + parsed = NULL; + sb_slist_free_full(lines, free); + sb_slist_free_full(lines2, free); + lines = NULL; + if (is_last) + continue; + break; + } + free(tmp); + tmp = NULL; + free(tmp2); + tmp2 = NULL; + state = CONTENT_ORDERED_LIST_END; + } + if (!is_last) + break; + + case CONTENT_ORDERED_LIST_END: + if (c == '\n' || c == '\r' || is_last) { + if (lines2 != NULL) { + // FIXME: avoid repeting the code below + tmp_str = sb_string_new(); + for (sb_slist_t *l = lines2; l != NULL; l = l->next) { + if (l->next == NULL) + sb_string_append_printf(tmp_str, "%s", l->data); + else + sb_string_append_printf(tmp_str, "%s%s", l->data, + line_ending); + } + sb_slist_free_full(lines2, free); + lines2 = NULL; + parsed = blogc_content_parse_inline(tmp_str->str); + sb_string_free(tmp_str, true); + lines = sb_slist_append(lines, sb_strdup(parsed)); + free(parsed); + parsed = NULL; + } + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_ORDERED_LIST, + NULL, NULL); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_ORDERED_LIST, + NULL, NULL); + last = last->next; + } + blogc_content_node_t *last_list = NULL; + for (sb_slist_t *l = lines; l != NULL; l = l->next) { + if (last_list == NULL) { + last->child = block_node_new(BLOGC_CONTENT_BLOCK_LIST_ITEM, + l->data, NULL); + last_list = last->child; + } + else { + last_list->next = block_node_new(BLOGC_CONTENT_BLOCK_LIST_ITEM, + l->data, NULL); + last_list = last_list->next; + } + } + sb_slist_free(lines); + lines = NULL; + free(prefix); + prefix = NULL; + state = CONTENT_START_LINE; + start2 = current; + } + else { + start2 = current; + state = CONTENT_ORDERED_LIST_START; + } + break; + + case CONTENT_PARAGRAPH: + if (c == '\n' || c == '\r' || is_last) { + state = CONTENT_PARAGRAPH_END; + end = is_last && c != '\n' && c != '\r' ? src_len : + (real_end != 0 ? real_end : current); + } + if (!is_last) + break; + + case CONTENT_PARAGRAPH_END: + if (c == '\n' || c == '\r' || is_last) { + char *tmp2 = sb_strndup(src + start, end - start); + sb_trie_t *t = sb_trie_new(free); + sb_trie_insert(t, "parsed", blogc_content_parse_inline(tmp2)); + if (ast == NULL) { + ast = block_node_new(BLOGC_CONTENT_BLOCK_PARAGRAPH, + tmp2, t); + last = ast; + } + else { + last->next = block_node_new(BLOGC_CONTENT_BLOCK_PARAGRAPH, + tmp2, t); + last = last->next; + } + state = CONTENT_START_LINE; + start = current; + } + else + state = CONTENT_PARAGRAPH; + break; + + } + + current++; + } + + if (nl != NULL && *nl == NULL) + *nl = sb_strdup(line_ending); + + return ast; +} + + +void +blogc_content_free_ast(blogc_content_node_t *ast) +{ + if (ast == NULL) + return; + free(ast->content); + sb_trie_free(ast->parameters); + blogc_content_free_ast(ast->child); + blogc_content_free_ast(ast->next); + free(ast); +} + + +char* +blogc_content_parse(const char *src, char **excerpt, char **description) +{ + char *nl = NULL; + blogc_content_node_t *c = blogc_content_parse_ast(src, &nl); + char *rv = blogc_content_render_html(c, nl, excerpt, description); + free(nl); + blogc_content_free_ast(c); + return rv; +} + + +char* +blogc_content_render_html(blogc_content_node_t *ast, char *nl, char **excerpt, + char **description) +{ + sb_string_t *rv = sb_string_new(); + char *tmp = NULL; + for (blogc_content_node_t *l = ast; l != NULL; l = l->next) { + switch (l->node_type) { + case BLOGC_CONTENT_BLOCK: + switch (l->type.block_type) { + case BLOGC_CONTENT_BLOCK_RAW: + sb_string_append_printf(rv, "%s%s", l->content, nl); + break; + case BLOGC_CONTENT_BLOCK_HEADER: + tmp = blogc_slugify(l->content); + sb_string_append_printf(rv, "<h%s id=\"%s\">%s</h%s>%s", + sb_trie_lookup(l->parameters, "level"), tmp, l->content, + sb_trie_lookup(l->parameters, "level"), nl); + free(tmp); + tmp = NULL; + break; + case BLOGC_CONTENT_BLOCK_BLOCKQUOTE: + tmp = blogc_content_render_html(l->child, nl, NULL, NULL); + sb_string_append_printf(rv, "<blockquote>%s</blockquote>%s", + tmp, nl); + free(tmp); + tmp = NULL; + break; + case BLOGC_CONTENT_BLOCK_CODE: + tmp = blogc_htmlentities(l->content); + sb_string_append_printf(rv, "<pre><code>%s</code></pre>%s", + tmp, nl); + free(tmp); + tmp = NULL; + break; + case BLOGC_CONTENT_BLOCK_HORIZONTAL_RULE: + sb_string_append_printf(rv, "<hr />%s", nl); + break; + case BLOGC_CONTENT_BLOCK_UNORDERED_LIST: + tmp = blogc_content_render_html(l->child, nl, NULL, NULL); + sb_string_append_printf(rv, "<ul>%s%s</ul>%s", nl, + tmp, nl); + free(tmp); + tmp = NULL; + break; + case BLOGC_CONTENT_BLOCK_ORDERED_LIST: + tmp = blogc_content_render_html(l->child, nl, NULL, NULL); + sb_string_append_printf(rv, "<ol>%s%s</ol>%s", nl, + tmp, nl); + free(tmp); + tmp = NULL; + break; + case BLOGC_CONTENT_BLOCK_LIST_ITEM: + sb_string_append_printf(rv, "<li>%s</li>%s", + l->content, nl); + break; + case BLOGC_CONTENT_BLOCK_PARAGRAPH: + if (description != NULL && *description == NULL) + *description = blogc_fix_description(l->content); + sb_string_append_printf(rv, "<p>%s</p>%s", + sb_trie_lookup(l->parameters, "parsed"), nl); + break; + case BLOGC_CONTENT_BLOCK_EXCERPT: + if (excerpt != NULL && *excerpt == NULL) + *excerpt = sb_strdup(rv->str); + break; + } + break; + case BLOGC_CONTENT_INLINE: + break; + } + } + return sb_string_free(rv, false); +} + + +void +blogc_content_debug(blogc_content_node_t *ast) +{ + for (blogc_content_node_t *l = ast; l != NULL; l = l->next) { + switch (l->node_type) { + case BLOGC_CONTENT_BLOCK: + fprintf(stderr, "DEBUG: <CONTENT BLOCK "); + switch (l->type.block_type) { + case BLOGC_CONTENT_BLOCK_RAW: + fprintf(stderr, "RAW: `%s`", l->content); + break; + case BLOGC_CONTENT_BLOCK_HEADER: + fprintf(stderr, "HEADER: \"%s\"", l->content); + break; + case BLOGC_CONTENT_BLOCK_BLOCKQUOTE: + fprintf(stderr, "BLOCKQUOTE"); + break; + case BLOGC_CONTENT_BLOCK_CODE: + fprintf(stderr, "CODE: `%s`", l->content); + break; + case BLOGC_CONTENT_BLOCK_HORIZONTAL_RULE: + fprintf(stderr, "HORIZONTAL_RULE"); + break; + case BLOGC_CONTENT_BLOCK_UNORDERED_LIST: + fprintf(stderr, "UNORDERED_LIST"); + break; + case BLOGC_CONTENT_BLOCK_ORDERED_LIST: + fprintf(stderr, "ORDERED_LIST"); + break; + case BLOGC_CONTENT_BLOCK_LIST_ITEM: + fprintf(stderr, "LIST_ITEM: `%s`", l->content); + break; + case BLOGC_CONTENT_BLOCK_PARAGRAPH: + fprintf(stderr, "PARAGRAPH: `%s`", l->content); + break; + case BLOGC_CONTENT_BLOCK_EXCERPT: + fprintf(stderr, "EXCERPT"); + break; + } + fprintf(stderr, ">\n"); + if (l->child != NULL) + blogc_content_debug(l->child); + break; + case BLOGC_CONTENT_INLINE: + break; + } + } +} |