#include "autolink.h"
#include "houdini.h"
#include "qfm.h"
#include "scanners.h"
/* NOTE(review): the operands of the four includes below (and of the one in
 * the #else branch) were missing from the source this file was recovered
 * from.  They were reconstructed from the names this file uses (parser and
 * node internals, mem*/str* functions, cmark_utf8proc_*, cmark_chunk).
 * Confirm against the build before merging. */
#include <parser.h>
#include <string.h>
#include <utf8.h>
#include <chunk.h>

#if defined(_WIN32)
#define strncasecmp _strnicmp
#else
#include <strings.h>
#endif

/* Append `length` bytes of `source` to `dest` through houdini_escape_html0,
 * passing 0 as its final argument. */
static void escape_html(cmark_strbuf *dest, const unsigned char *source,
                        bufsize_t length) {
  houdini_escape_html0(dest, source, length, 0);
}

/* Decode the first code point of `link` (at most `link_len` bytes) and report
 * whether it may appear in a host name.
 *
 * Returns 0 when decoding fails or when the code point is whitespace or
 * punctuation; returns non-zero otherwise. */
static int is_valid_hostchar(const uint8_t *link, size_t link_len) {
  int32_t ch;
  int r = cmark_utf8proc_iterate(link, (bufsize_t)link_len, &ch);
  if (r < 0)
    return 0;
  return !cmark_utf8proc_is_space(ch) && !cmark_utf8proc_is_punctuation(ch);
}

/* Report whether `link` starts (case-insensitively) with one of the accepted
 * schemes and is followed by at least one valid host character.
 *
 * Returns 1 on a match, 0 otherwise. */
static int sd_autolink_issafe(const uint8_t *link, size_t link_len) {
  static const size_t valid_uris_count = 3;
  static const char *valid_uris[] = {"http://", "https://", "ftp://"};
  size_t i;

  for (i = 0; i < valid_uris_count; ++i) {
    size_t len = strlen(valid_uris[i]);

    /* link_len > len: something must follow the scheme. */
    if (link_len > len &&
        strncasecmp((char *)link, valid_uris[i], len) == 0 &&
        is_valid_hostchar(link + len, link_len - len))
      return 1;
  }

  return 0;
}

/* Trim a candidate link so that trailing punctuation, a trailing entity
 * reference and unbalanced closing parentheses are not part of it.
 *
 * data:     start of the candidate link.
 * link_end: length of the candidate, in bytes.
 *
 * Returns the trimmed length, which may be 0. */
static size_t autolink_delim(uint8_t *data, size_t link_end) {
  size_t i;
  size_t opening = 0;
  size_t closing = 0;

  /* Cut the candidate at the first '<' and, in the same pass, count the
   * parentheses that remain.  Only ')' is ever trimmed below, so keeping
   * `closing` up to date there yields the same counts as rescanning the
   * prefix on every iteration, without the quadratic cost. */
  for (i = 0; i < link_end; ++i) {
    const uint8_t c = data[i];

    if (c == '<') {
      link_end = i;
      break;
    }
    if (c == '(')
      opening++;
    else if (c == ')')
      closing++;
  }

  while (link_end > 0) {
    const uint8_t last = data[link_end - 1];

    if (strchr("?!.,:*_~'\"", last) != NULL) {
      /* Trailing punctuation is never part of the link. */
      link_end--;
    } else if (last == ';') {
      if (link_end < 2) {
        /* A lone ';' cannot end an entity; also avoids `link_end - 2`
         * wrapping around. */
        link_end--;
      } else {
        /* Walk back over letters; if they are preceded by '&' the tail looks
         * like an entity reference ("&amp;") and is dropped as a whole. */
        size_t new_end = link_end - 2;

        while (new_end > 0 && cmark_isalpha(data[new_end]))
          new_end--;

        if (new_end < link_end - 2 && data[new_end] == '&')
          link_end = new_end;
        else
          link_end--;
      }
    } else if (last == ')') {
      /* Allow any number of matching brackets at the end of the URL.  If
       * there is a greater number of closing brackets than opening ones, we
       * remove one character from the end of the link.
       *
       * Examples (input text => output linked portion):
       *
       *  http://www.pokemon.com/Pikachu_(Electric)
       *    => http://www.pokemon.com/Pikachu_(Electric)
       *
       *  http://www.pokemon.com/Pikachu_((Electric)
       *    => http://www.pokemon.com/Pikachu_((Electric)
       *
       *  http://www.pokemon.com/Pikachu_(Electric))
       *    => http://www.pokemon.com/Pikachu_(Electric)
       *
       *  http://www.pokemon.com/Pikachu_((Electric))
       *    => http://www.pokemon.com/Pikachu_((Electric))
       */
      if (closing <= opening)
        break;

      closing--;
      link_end--;
    } else {
      break;
    }
  }

  return link_end;
}

/* Measure the run of valid domain characters at the start of `data`.
 *
 * Scanning starts at index 1 and stops before the last byte.  Underscores are
 * tallied per dot-separated segment; an underscore in either of the last two
 * segments seen rejects the domain.
 *
 * allow_short: when non-zero a domain without any dot is accepted.
 *
 * Returns the number of bytes scanned, or 0 when the domain is rejected. */
static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
  size_t i, np = 0, uscore1 = 0, uscore2 = 0;

  /* Nothing to scan; also keeps `size - 1` below from wrapping around. */
  if (size == 0)
    return 0;

  for (i = 1; i < size - 1; i++) {
    if (data[i] == '_')
      uscore2++;
    else if (data[i] == '.') {
      /* New segment: remember the previous segment's underscore count. */
      uscore1 = uscore2;
      uscore2 = 0;
      np++;
    } else if (!is_valid_hostchar(data + i, size - i) && data[i] != '-')
      break;
  }

  if (uscore1 > 0 || uscore2 > 0)
    return 0;

  if (allow_short) {
    /* We don't need a valid domain in the strict sense (with at least one
     * dot); so just make sure it's composed of valid domain characters and
     * return the length of the valid sequence. */
    return i;
  } else {
    /* A valid domain needs to have at least a dot.
     * That's as far as we get. */
    return np ? i : 0;
  }
}

/* Inline matcher for bare "www." links.
 *
 * Called with the inline parser positioned on a 'w'.  On success the parser
 * offset is advanced past the link and a LINK node is returned whose URL is
 * "http://" followed by the matched text and whose only child is a TEXT node
 * holding the matched text.  Returns NULL when nothing matches.
 *
 * `parent` is not used by this function. */
static cmark_node *www_match(cmark_parser *parser, cmark_node *parent,
                             cmark_inline_parser *inline_parser,
                             cmark_syntax_extension *ext) {
  cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
  size_t max_rewind = cmark_inline_parser_get_offset(inline_parser);
  uint8_t *data = chunk->data + max_rewind;
  size_t size = chunk->len - max_rewind;
  int start = cmark_inline_parser_get_column(inline_parser);

  size_t link_end;

  /* Unless at the very start of the chunk, the link must be preceded by
   * whitespace or one of "*_~(". */
  if (max_rewind > 0 && strchr("*_~(", data[-1]) == NULL &&
      !cmark_isspace(data[-1]))
    return NULL;

  if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
    return NULL;

  link_end = check_domain(data, size, 0);

  if (link_end == 0)
    return NULL;

  /* Extend to the next whitespace, then trim trailing delimiters. */
  while (link_end < size && !cmark_isspace(data[link_end]))
    link_end++;

  link_end = autolink_delim(data, link_end);

  if (link_end == 0)
    return NULL;

  cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end));

  cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
  if (parser->options & CMARK_OPT_AUTOLINK_CLASS_NAME) {
    cmark_node_set_syntax_extension(node, ext);
  }

  cmark_strbuf buf;
  cmark_strbuf_init(parser->mem, &buf, 10);
  cmark_strbuf_puts(&buf, "http://");
  cmark_strbuf_put(&buf, data, (bufsize_t)link_end);
  node->as.link.url = cmark_chunk_buf_detach(&buf);

  cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
  text->as.literal =
      cmark_chunk_dup(chunk, (bufsize_t)max_rewind, (bufsize_t)link_end);
  cmark_node_append_child(node, text);

  node->start_line = text->start_line = node->end_line = text->end_line =
      cmark_inline_parser_get_line(inline_parser);

  node->start_column = text->start_column = start - 1;
  node->end_column = text->end_column =
      cmark_inline_parser_get_column(inline_parser) - 1;

  return node;
}

/* Inline matcher for "scheme://..." links.
 *
 * Called with the inline parser positioned on the ':' of "://".  The scheme
 * is found by walking backwards over letters and must be one accepted by
 * sd_autolink_issafe.  On success the parser offset is advanced past the
 * link, cmark_node_unput(parent, rewind) is called for the `rewind` scheme
 * bytes that precede the ':' (presumably to drop them from text already
 * emitted into `parent` -- confirm against cmark_node_unput), and a LINK node
 * is returned whose URL and child TEXT literal are the same chunk.
 * Returns NULL when nothing matches. */
static cmark_node *url_match(cmark_parser *parser, cmark_node *parent,
                             cmark_inline_parser *inline_parser,
                             cmark_syntax_extension *ext) {
  size_t link_end, domain_len;
  int rewind = 0;

  cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
  int max_rewind = cmark_inline_parser_get_offset(inline_parser);
  uint8_t *data = chunk->data + max_rewind;
  size_t size = chunk->len - max_rewind;

  if (size < 4 || data[1] != '/' || data[2] != '/')
    return NULL;

  /* Walk back over the letters of the scheme, never before the chunk start. */
  while (rewind < max_rewind && cmark_isalpha(data[-rewind - 1]))
    rewind++;

  if (!sd_autolink_issafe(data - rewind, size + rewind))
    return NULL;

  link_end = strlen("://");

  domain_len = check_domain(data + link_end, size - link_end, 1);

  if (domain_len == 0)
    return NULL;

  link_end += domain_len;
  while (link_end < size && !cmark_isspace(data[link_end]))
    link_end++;

  link_end = autolink_delim(data, link_end);

  if (link_end == 0)
    return NULL;

  cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end));
  cmark_node_unput(parent, rewind);

  cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
  if (parser->options & CMARK_OPT_AUTOLINK_CLASS_NAME) {
    cmark_node_set_syntax_extension(node, ext);
  }

  cmark_chunk url = cmark_chunk_dup(chunk, max_rewind - rewind,
                                    (bufsize_t)(link_end + rewind));
  node->as.link.url = url;

  cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
  text->as.literal = url;
  cmark_node_append_child(node, text);

  return node;
}

/* Inline-match entry point registered for the special characters ':' and 'w'.
 *
 * Returns NULL without matching when either
 * cmark_inline_parser_in_bracket(inline_parser, false) or
 * cmark_inline_parser_in_bracket(inline_parser, true) reports true;
 * otherwise dispatches to url_match (':') or www_match ('w'). */
static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser,
                         cmark_node *parent, unsigned char c,
                         cmark_inline_parser *inline_parser) {
  if (cmark_inline_parser_in_bracket(inline_parser, false) ||
      cmark_inline_parser_in_bracket(inline_parser, true))
    return NULL;

  if (c == ':')
    return url_match(parser, parent, inline_parser, ext);

  if (c == 'w')
    return www_match(parser, parent, inline_parser, ext);

  return NULL;

  // note that we could end up re-consuming something already a
  // part of an inline, because we don't track when the last
  // inline was finished in inlines.c.
}

/* Find the first e-mail address in `text` at or after byte `offset` and, if
 * one is found, split the node into text / link / text.
 *
 * The link's URL is "mailto:" followed by the address.  The remainder after
 * the address becomes a new TEXT node inserted after the link, and that node
 * is processed recursively.  When the '@' found does not form an acceptable
 * address, the search resumes just past it.
 *
 * depth: current recursion depth; processing stops once it exceeds 1000. */
static void postprocess_text(cmark_parser *parser, cmark_node *text,
                             int offset, int depth,
                             cmark_syntax_extension *ext) {
  // postprocess_text can recurse very deeply if there is a very long line of
  // '@' only.  Stop at a reasonable depth to ensure it cannot crash.
  if (depth > 1000)
    return;

  size_t link_end;
  uint8_t *data = text->as.literal.data, *at;
  size_t size = text->as.literal.len;
  /* nb: '@' count, np: dots followed by an alphanumeric, ns: set when the
   * local part is directly preceded by '/'. */
  int rewind, max_rewind, nb = 0, np = 0, ns = 0;

  if (offset < 0 || (size_t)offset >= size)
    return;

  data += offset;
  size -= offset;

  at = (uint8_t *)memchr(data, '@', size);
  if (!at)
    return;

  /* From here on `data` points at the '@'. */
  max_rewind = (int)(at - data);
  data += max_rewind;
  size -= max_rewind;

  /* Walk backwards over the local part. */
  for (rewind = 0; rewind < max_rewind; ++rewind) {
    uint8_t c = data[-rewind - 1];

    if (cmark_isalnum(c))
      continue;

    if (strchr(".+-_", c) != NULL)
      continue;

    if (c == '/')
      ns++;

    break;
  }

  if (rewind == 0 || ns > 0) {
    postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1, ext);
    return;
  }

  /* Walk forwards over the '@' and the domain part. */
  for (link_end = 0; link_end < size; ++link_end) {
    uint8_t c = data[link_end];

    if (cmark_isalnum(c))
      continue;

    if (c == '@')
      nb++;
    else if (c == '.' && link_end < size - 1 &&
             cmark_isalnum(data[link_end + 1]))
      np++;
    else if (c != '-' && c != '_')
      break;
  }

  /* Require exactly one '@', at least one dot, and a letter or '.' as the
   * final character. */
  if (link_end < 2 || nb != 1 || np == 0 ||
      (!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) {
    postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1, ext);
    return;
  }

  link_end = autolink_delim(data, link_end);

  if (link_end == 0) {
    postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1, ext);
    return;
  }

  cmark_chunk_to_cstr(parser->mem, &text->as.literal);

  cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
  if (parser->options & CMARK_OPT_AUTOLINK_CLASS_NAME) {
    cmark_node_set_syntax_extension(link_node, ext);
  }

  cmark_strbuf buf;
  cmark_strbuf_init(parser->mem, &buf, 10);
  cmark_strbuf_puts(&buf, "mailto:");
  cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind));
  link_node->as.link.url = cmark_chunk_buf_detach(&buf);

  cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
  cmark_chunk email = cmark_chunk_dup(&text->as.literal,
                                      offset + max_rewind - rewind,
                                      (bufsize_t)(link_end + rewind));
  cmark_chunk_to_cstr(parser->mem, &email);
  link_text->as.literal = email;
  cmark_node_append_child(link_node, link_text);

  cmark_node_insert_after(text, link_node);

  /* Everything after the address goes into a new text node. */
  cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
  post->as.literal = cmark_chunk_dup(&text->as.literal,
                                     (bufsize_t)(offset + max_rewind + link_end),
                                     (bufsize_t)(size - link_end));
  cmark_chunk_to_cstr(parser->mem, &post->as.literal);

  cmark_node_insert_after(link_node, post);

  /* Truncate the original node so it ends just before the address. */
  text->as.literal.len = offset + max_rewind - rewind;
  text->as.literal.data[text->as.literal.len] = 0;

  postprocess_text(parser, post, 0, depth + 1, ext);
}

/* Post-processing hook: consolidate text nodes under `root`, then run
 * postprocess_text on every TEXT node that is not inside a LINK node.
 *
 * Returns `root`. */
static cmark_node *postprocess(cmark_syntax_extension *ext,
                               cmark_parser *parser, cmark_node *root) {
  cmark_iter *iter;
  cmark_event_type ev;
  cmark_node *node;
  bool in_link = false;

  cmark_consolidate_text_nodes(root);
  iter = cmark_iter_new(root);

  while ((ev = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
    node = cmark_iter_get_node(iter);
    if (in_link) {
      /* Skip everything until the enclosing link is exited. */
      if (ev == CMARK_EVENT_EXIT && node->type == CMARK_NODE_LINK) {
        in_link = false;
      }
      continue;
    }

    if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_LINK) {
      in_link = true;
      continue;
    }

    if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_TEXT) {
      postprocess_text(parser, node, 0, /*depth*/ 0, ext);
    }
  }

  cmark_iter_free(iter);

  return root;
}

/* HTML renderer for link nodes owned by this extension.
 *
 * On enter, writes the opening anchor tag: the escaped URL as href, an
 * escaped title attribute when the title is non-empty, and
 * class="autolink" when CMARK_OPT_AUTOLINK_CLASS_NAME is set in `options`.
 * On exit, writes the closing tag.
 *
 * NOTE(review): the opening-tag literal, the CMARK_OPT_UNSAFE /
 * scan_dangerous_url guard and the closing-tag literal were destroyed in the
 * source this file was recovered from (only `"as.link.url, 0))) {` and `""`
 * survived).  They were reconstructed here; confirm against the original. */
static void html_render(cmark_syntax_extension *extension,
                        cmark_html_renderer *renderer, cmark_node *node,
                        cmark_event_type ev_type, int options) {
  bool entering = (ev_type == CMARK_EVENT_ENTER);
  cmark_strbuf *html = renderer->html;

  if (entering) {
    cmark_strbuf_puts(html, "<a href=\"");
    /* The URL is written only when unsafe output is allowed or the scanner
     * does not flag it; otherwise href is left empty. */
    if ((options & CMARK_OPT_UNSAFE) ||
        !(scan_dangerous_url(&node->as.link.url, 0))) {
      houdini_escape_href(html, node->as.link.url.data,
                          node->as.link.url.len);
    }
    if (node->as.link.title.len) {
      cmark_strbuf_puts(html, "\" title=\"");
      escape_html(html, node->as.link.title.data, node->as.link.title.len);
    }
    if (options & CMARK_OPT_AUTOLINK_CLASS_NAME) {
      cmark_strbuf_puts(html, "\" class=\"autolink");
    }
    cmark_strbuf_puts(html, "\">");
  } else {
    cmark_strbuf_puts(html, "</a>");
  }
}
cmark_syntax_extension *create_autolink_extension(void) { cmark_syntax_extension *ext = cmark_syntax_extension_new("autolink"); cmark_llist *special_chars = NULL; cmark_syntax_extension_set_match_inline_func(ext, match); cmark_syntax_extension_set_postprocess_func(ext, postprocess); cmark_syntax_extension_set_html_render_func(ext, html_render); cmark_mem *mem = cmark_get_default_mem_allocator(); special_chars = cmark_llist_append(mem, special_chars, (void *)':'); special_chars = cmark_llist_append(mem, special_chars, (void *)'w'); cmark_syntax_extension_set_special_inline_chars(ext, special_chars); return ext; }