/* * blogc: A blog compiler. * Copyright (C) 2015 Rafael G. Martins <rafael@rafaelmartins.eng.br> * * This program can be distributed under the terms of the BSD License. * See the file COPYING. */ #ifdef HAVE_CONFIG_H #include <config.h> #endif /* HAVE_CONFIG_H */ #include <stdbool.h> #include <string.h> #include "utils/utils.h" #include "content-parser.h" #include "error.h" // this is a half ass implementation of a markdown-like syntax. bugs are // expected. feel free to improve the parser and and new features. // TODO: inline elements: links, emphasis, code, images, line breaks // TODO: automatic scaping of html entities // TODO: automatic links // TODO: error handling typedef enum { CONTENT_START_LINE = 1, CONTENT_HEADER, CONTENT_HEADER_TITLE_START, CONTENT_HEADER_TITLE, CONTENT_HTML, CONTENT_HTML_END, CONTENT_BLOCKQUOTE, CONTENT_BLOCKQUOTE_START, CONTENT_BLOCKQUOTE_END, CONTENT_CODE, CONTENT_CODE_START, CONTENT_CODE_END, CONTENT_UNORDERED_LIST_OR_HORIZONTAL_RULE, CONTENT_HORIZONTAL_RULE, CONTENT_UNORDERED_LIST_START, CONTENT_UNORDERED_LIST_END, CONTENT_ORDERED_LIST, CONTENT_ORDERED_LIST_SPACE, CONTENT_ORDERED_LIST_START, CONTENT_ORDERED_LIST_END, CONTENT_PARAGRAPH, CONTENT_PARAGRAPH_END, } blogc_content_parser_state_t; char* blogc_content_parse(const char *src, size_t src_len, blogc_error_t **err) { if (err == NULL || *err != NULL) return NULL; size_t current = 0; size_t start = 0; size_t end = 0; unsigned int header_level = 0; char *prefix = NULL; size_t prefix_len = 0; char *tmp = NULL; char *tmp2 = NULL; char **tmpv = NULL; char d; b_slist_t *lines = NULL; b_string_t *rv = b_string_new(); b_string_t *tmp_str = NULL; blogc_content_parser_state_t state = CONTENT_START_LINE; while (current < src_len) { char c = src[current]; bool is_last = current == src_len - 1; switch (state) { case CONTENT_START_LINE: if (c == '\n' || c == '\r' || is_last) break; if (c == '#') { header_level = 1; state = CONTENT_HEADER; break; } if (c == '*' || c == '+' || c == '-') { state = CONTENT_UNORDERED_LIST_OR_HORIZONTAL_RULE; start = current; d = c; break; } if (c >= '0' && c <= '9') { state = CONTENT_ORDERED_LIST; start = current; break; } if (c == ' ' || c == '\t') { state = CONTENT_CODE; start = current; break; } if (c == '<') { state = CONTENT_HTML; start = current; break; } if (c == '>') { state = CONTENT_BLOCKQUOTE; start = current; break; } start = current; state = CONTENT_PARAGRAPH; break; case CONTENT_HEADER: if (c == '#') { header_level += 1; break; } if (c == ' ' || c == '\t') { state = CONTENT_HEADER_TITLE_START; break; } *err = blogc_error_parser(BLOGC_ERROR_CONTENT_PARSER, src, src_len, current, "Malformed header, no space or tab after '#'"); break; case CONTENT_HEADER_TITLE_START: if (c == ' ' || c == '\t') break; start = current; if (c != '\n' && c != '\r') { state = CONTENT_HEADER_TITLE; break; } case CONTENT_HEADER_TITLE: if (c == '\n' || c == '\r' || is_last) { end = is_last && c != '\n' && c != '\r' ? src_len : current; tmp = b_strndup(src + start, end - start); b_string_append_printf(rv, "<h%d>%s</h%d>\n", header_level, tmp, header_level); free(tmp); tmp = NULL; state = CONTENT_START_LINE; start = current; } break; case CONTENT_HTML: if (c == '\n' || c == '\r' || is_last) { state = CONTENT_HTML_END; end = is_last && c != '\n' && c != '\r' ? src_len : current; } if (!is_last) break; case CONTENT_HTML_END: if (c == '\n' || c == '\r' || is_last) { tmp = b_strndup(src + start, end - start); b_string_append_printf(rv, "%s\n", tmp); free(tmp); tmp = NULL; state = CONTENT_START_LINE; start = current; } else state = CONTENT_HTML; break; case CONTENT_BLOCKQUOTE: if (c == ' ' || c == '\t') break; prefix = b_strndup(src + start, current - start); state = CONTENT_BLOCKQUOTE_START; break; case CONTENT_BLOCKQUOTE_START: if (c == '\n' || c == '\r' || is_last) { end = is_last && c != '\n' && c != '\r' ? src_len : current; tmp = b_strndup(src + start, end - start); if (b_str_starts_with(tmp, prefix)) { lines = b_slist_append(lines, b_strdup(tmp + strlen(prefix))); state = CONTENT_BLOCKQUOTE_END; } else { *err = blogc_error_parser(BLOGC_ERROR_CONTENT_PARSER, src, src_len, current, "Malformed blockquote, must use same prefix " "as previous line(s): %s", prefix); free(prefix); prefix = NULL; b_slist_free_full(lines, free); lines = NULL; } free(tmp); tmp = NULL; } if (!is_last) break; case CONTENT_BLOCKQUOTE_END: if (c == '\n' || c == '\r' || is_last) { tmp_str = b_string_new(); for (b_slist_t *l = lines; l != NULL; l = l->next) { if (l->next == NULL) b_string_append_printf(tmp_str, "%s", l->data); else b_string_append_printf(tmp_str, "%s\n", l->data); } tmp = blogc_content_parse(tmp_str->str, tmp_str->len, err); if (*err == NULL) { b_string_append_printf(rv, "<blockquote>%s</blockquote>\n", tmp); } free(tmp); tmp = NULL; b_string_free(tmp_str, true); tmp_str = NULL; b_slist_free_full(lines, free); lines = NULL; free(prefix); prefix = NULL; state = CONTENT_START_LINE; start = current; } else { start = current; state = CONTENT_BLOCKQUOTE_START; } break; case CONTENT_CODE: if (c == ' ' || c == '\t') break; prefix = b_strndup(src + start, current - start); state = CONTENT_CODE_START; break; case CONTENT_CODE_START: if (c == '\n' || c == '\r' || is_last) { end = is_last && c != '\n' && c != '\r' ? src_len : current; tmp = b_strndup(src + start, end - start); if (b_str_starts_with(tmp, prefix)) { lines = b_slist_append(lines, b_strdup(tmp + strlen(prefix))); state = CONTENT_CODE_END; } else { *err = blogc_error_parser(BLOGC_ERROR_CONTENT_PARSER, src, src_len, current, "Malformed code block, must use same prefix " "as previous line(s): '%s'", prefix); free(prefix); prefix = NULL; b_slist_free_full(lines, free); lines = NULL; } free(tmp); tmp = NULL; } if (!is_last) break; case CONTENT_CODE_END: if (c == '\n' || c == '\r' || is_last) { b_string_append(rv, "<pre><code>"); for (b_slist_t *l = lines; l != NULL; l = l->next) { if (l->next == NULL) b_string_append_printf(rv, "%s", l->data); else b_string_append_printf(rv, "%s\n", l->data); } b_string_append(rv, "</code></pre>\n"); b_slist_free_full(lines, free); lines = NULL; free(prefix); prefix = NULL; state = CONTENT_START_LINE; start = current; } else { start = current; state = CONTENT_CODE_START; } break; case CONTENT_UNORDERED_LIST_OR_HORIZONTAL_RULE: if (c == d) { state = CONTENT_HORIZONTAL_RULE; break; } if (c == ' ' || c == '\t') break; prefix = b_strndup(src + start, current - start); state = CONTENT_UNORDERED_LIST_START; break; case CONTENT_HORIZONTAL_RULE: if (c == d) { break; } if (c == '\n' || c == '\r' || is_last) { b_string_append(rv, "<hr />\n"); state = CONTENT_START_LINE; start = current; d = '\0'; break; } *err = blogc_error_parser(BLOGC_ERROR_CONTENT_PARSER, src, src_len, current, "Malformed horizontal rule, must use only '%c'", d); break; case CONTENT_UNORDERED_LIST_START: if (c == '\n' || c == '\r' || is_last) { end = is_last && c != '\n' && c != '\r' ? src_len : current; tmp = b_strndup(src + start, end - start); if (b_str_starts_with(tmp, prefix)) { lines = b_slist_append(lines, b_strdup(tmp + strlen(prefix))); } else { *err = blogc_error_parser(BLOGC_ERROR_CONTENT_PARSER, src, src_len, current, "Malformed unordered list, must use same prefix " "as previous line(s): %s", prefix); } free(tmp); tmp = NULL; state = CONTENT_UNORDERED_LIST_END; } if (!is_last) break; case CONTENT_UNORDERED_LIST_END: if (c == '\n' || c == '\r' || is_last) { b_string_append(rv, "<ul>\n"); for (b_slist_t *l = lines; l != NULL; l = l->next) b_string_append_printf(rv, "<li>%s</li>\n", l->data); b_string_append(rv, "</ul>\n"); b_slist_free_full(lines, free); lines = NULL; free(prefix); prefix = NULL; state = CONTENT_START_LINE; start = current; } else { start = current; state = CONTENT_UNORDERED_LIST_START; } break; case CONTENT_ORDERED_LIST: if (c >= '0' && c <= '9') break; if (c == '.') { state = CONTENT_ORDERED_LIST_SPACE; break; } state = CONTENT_PARAGRAPH; break; case CONTENT_ORDERED_LIST_SPACE: if (c == ' ' || c == '\t') break; prefix_len = current - start; state = CONTENT_ORDERED_LIST_START; break; case CONTENT_ORDERED_LIST_START: if (c == '\n' || c == '\r' || is_last) { end = is_last && c != '\n' && c != '\r' ? src_len : current; tmp = b_strndup(src + start, end - start); if (strlen(tmp) >= prefix_len) { tmp2 = b_strndup(tmp, prefix_len); tmpv = b_str_split(tmp2, '.', 2); free(tmp2); tmp2 = NULL; if (b_strv_length(tmpv) != 2) { *err = blogc_error_parser(BLOGC_ERROR_CONTENT_PARSER, src, src_len, current, "Malformed ordered list, prefix must be a " "number, followed by a '.', followed by the content. " "Content must be aligned with content from previous line(s)"); goto err_li; } for (unsigned int i = 0; tmpv[0][i] != '\0'; i++) { if (!(tmpv[0][i] >= '0' && tmpv[0][i] <= '9')) { *err = blogc_error_parser(BLOGC_ERROR_CONTENT_PARSER, src, src_len, current, "Malformed ordered list, prefix must be a " "number, followed by a '.', followed by the content. " "Content must be aligned with content from previous line(s)"); goto err_li; } } for (unsigned int i = 0; tmpv[1][i] != '\0'; i++) { if (!(tmpv[1][i] == ' ' || tmpv[1][i] == '\t')) { *err = blogc_error_parser(BLOGC_ERROR_CONTENT_PARSER, src, src_len, current, "Malformed ordered list, prefix must be a " "number, followed by a '.', followed by the content. " "Content must be aligned with content from previous line(s)"); goto err_li; } } lines = b_slist_append(lines, b_strdup(tmp + prefix_len)); state = CONTENT_ORDERED_LIST_END; err_li: b_strv_free(tmpv); tmpv = NULL; } free(tmp); tmp = NULL; } if (!is_last) break; case CONTENT_ORDERED_LIST_END: if (c == '\n' || c == '\r' || is_last) { b_string_append(rv, "<ol>\n"); for (b_slist_t *l = lines; l != NULL; l = l->next) b_string_append_printf(rv, "<li>%s</li>\n", l->data); b_string_append(rv, "</ol>\n"); b_slist_free_full(lines, free); lines = NULL; free(prefix); prefix = NULL; state = CONTENT_START_LINE; start = current; } else { start = current; state = CONTENT_ORDERED_LIST_START; } break; case CONTENT_PARAGRAPH: if (c == '\n' || c == '\r' || is_last) { state = CONTENT_PARAGRAPH_END; end = is_last && c != '\n' && c != '\r' ? src_len : current; } if (!is_last) break; case CONTENT_PARAGRAPH_END: if (c == '\n' || c == '\r' || is_last) { tmp = b_strndup(src + start, end - start); b_string_append_printf(rv, "<p>%s</p>\n", tmp); free(tmp); tmp = NULL; state = CONTENT_START_LINE; start = current; } else state = CONTENT_PARAGRAPH; break; } if (*err != NULL) break; current++; } if (*err != NULL) { b_string_free(rv, true); return NULL; } return b_string_free(rv, false); }