ext/rinku/autolink.c in rinku-1.7.3 vs ext/rinku/autolink.c in rinku-2.0.0

- old
+ new

@@ -1,7 +1,7 @@ /* - * Copyright (c) 2011, Vicent Marti + * Copyright (c) 2016, GitHub, Inc * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -11,25 +11,26 @@ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include "buffer.h" -#include "autolink.h" - #include <string.h> +#include <assert.h> #include <stdlib.h> #include <stdio.h> -#include <ctype.h> +#include <stdbool.h> +#include "buffer.h" +#include "autolink.h" +#include "utf8.h" + #if defined(_WIN32) #define strncasecmp _strnicmp #endif -int -sd_autolink_issafe(const uint8_t *link, size_t link_len) +bool +autolink_issafe(const uint8_t *link, size_t link_len) { static const size_t valid_uris_count = 5; static const char *valid_uris[] = { "/", "http://", "https://", "ftp://", "mailto:" }; @@ -39,51 +40,57 @@ for (i = 0; i < valid_uris_count; ++i) { size_t len = strlen(valid_uris[i]); if (link_len > len && strncasecmp((char *)link, valid_uris[i], len) == 0 && - isalnum(link[len])) - return 1; + rinku_isalnum(link[len])) + return true; } - return 0; + return false; } -static size_t -autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size) +static bool +autolink_delim(const uint8_t *data, struct autolink_pos *link) { uint8_t cclose, copen = 0; size_t i; - for (i = 0; i < link_end; ++i) + for (i = link->start; i < link->end; ++i) if (data[i] == '<') { - link_end = i; + link->end = i; break; } - while (link_end > 0) { - if (strchr("?!.,:", data[link_end - 1]) != NULL) - link_end--; + while (link->end > link->start) { + if (strchr("?!.,:", data[link->end - 1]) != NULL) + link->end--; - else if (data[link_end - 1] == ';') { - size_t new_end = link_end - 2; + else if (data[link->end - 1] == ';') { + size_t new_end = link->end - 2; - while (new_end > 0 && isalpha(data[new_end])) + while (new_end > 0 && rinku_isalnum(data[new_end])) new_end--; - if (new_end < link_end - 2 && data[new_end] == '&') - link_end = new_end; - else - link_end--; + if (new_end < link->end - 2) { + if (new_end > 0 && data[new_end] == '#') + new_end--; + + if (data[new_end] == '&') { + link->end = new_end; + continue; + } + } + link->end--; } else break; } - if (link_end == 0) - return 0; + if (link->end == link->start) + return false; - cclose = data[link_end - 1]; + cclose = data[link->end - 1]; switch (cclose) { case '"': copen = '"'; break; case '\'': copen = '\''; break; case ')': copen = '('; break; @@ -92,11 +99,11 @@ } if (copen != 0) { size_t closing = 0; size_t opening = 0; - size_t i = 0; + size_t i = link->start; /* Try to close the final punctuation sign in this same line; * if we managed to close it outside of the URL, that means that it's * not part of the URL. If it closes inside the URL, that means it * is part of the URL. @@ -114,182 +121,162 @@ * * (foo http://www.pokemon.com/Pikachu_(Electric)) bar * => foo http://www.pokemon.com/Pikachu_(Electric) */ - while (i < link_end) { + while (i < link->end) { if (data[i] == copen) opening++; else if (data[i] == cclose) closing++; i++; } if (closing != opening) - link_end--; + link->end--; } - return link_end; + return true; } -static size_t -check_domain(uint8_t *data, size_t size, int allow_short) +static bool +check_domain(const uint8_t *data, size_t size, + struct autolink_pos *link, bool allow_short) { size_t i, np = 0; - if (!isalnum(data[0])) - return 0; + if (!rinku_isalnum(data[link->start])) + return false; - for (i = 1; i < size - 1; ++i) { + for (i = link->start + 1; i < size - 1; ++i) { if (data[i] == '.') np++; - else if (!isalnum(data[i]) && data[i] != '-') break; + else if (!rinku_isalnum(data[i]) && data[i] != '-') break; } + link->end = i; + if (allow_short) { /* We don't need a valid domain in the strict sense (with * least one dot; so just make sure it's composed of valid * domain characters and return the length of the the valid * sequence. */ - return i; + return true; } else { /* a valid domain needs to have at least a dot. * that's as far as we get */ - return np ? i : 0; + return (np > 0); } } -size_t -sd_autolink__www( - size_t *rewind_p, - struct buf *link, - uint8_t *data, - size_t max_rewind, +bool +autolink__www( + struct autolink_pos *link, + const uint8_t *data, + size_t pos, size_t size, unsigned int flags) { - size_t link_end; + int32_t boundary; + assert(data[pos] == 'w' || data[pos] == 'W'); - if (max_rewind > 0 && !ispunct(data[-1]) && !isspace(data[-1])) - return 0; + if ((size - pos) < 4 || + (data[pos + 1] != 'w' && data[pos + 1] != 'W') || + (data[pos + 2] != 'w' && data[pos + 2] != 'W') || + data[pos + 3] != '.') + return false; - if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0) - return 0; + boundary = utf8proc_rewind(data, pos); + if (boundary && + !utf8proc_is_space(boundary) && + !utf8proc_is_punctuation(boundary)) + return false; - link_end = check_domain(data, size, 0); + link->start = pos; + link->end = 0; - if (link_end == 0) - return 0; + if (!check_domain(data, size, link, false)) + return false; - while (link_end < size && !isspace(data[link_end])) - link_end++; - - link_end = autolink_delim(data, link_end, max_rewind, size); - - if (link_end == 0) - return 0; - - bufput(link, data, link_end); - *rewind_p = 0; - - return (int)link_end; + link->end = utf8proc_find_space(data, link->end, size); + return autolink_delim(data, link); } -size_t -sd_autolink__email( - size_t *rewind_p, - struct buf *link, - uint8_t *data, - size_t max_rewind, +bool +autolink__email( + struct autolink_pos *link, + const uint8_t *data, + size_t pos, size_t size, unsigned int flags) { - size_t link_end, rewind; int nb = 0, np = 0; + assert(data[pos] == '@'); - for (rewind = 0; rewind < max_rewind; ++rewind) { - uint8_t c = data[-rewind - 1]; + link->start = pos; + link->end = pos; - if (isalnum(c)) + for (; link->start > 0; link->start--) { + uint8_t c = data[link->start - 1]; + + if (rinku_isalnum(c)) continue; - if (strchr(".+-_", c) != NULL) + if (strchr(".+-_%", c) != NULL) continue; break; } - if (rewind == 0) - return 0; + if (link->start == pos) + return false; - for (link_end = 0; link_end < size; ++link_end) { - uint8_t c = data[link_end]; + for (; link->end < size; link->end++) { + uint8_t c = data[link->end]; - if (isalnum(c)) + if (rinku_isalnum(c)) continue; if (c == '@') nb++; - else if (c == '.' && link_end < size - 1) + else if (c == '.' && link->end < size - 1) np++; else if (c != '-' && c != '_') break; } - if (link_end < 2 || nb != 1 || np == 0) - return 0; + if ((link->end - pos) < 2 || nb != 1 || np == 0) + return false; - link_end = autolink_delim(data, link_end, max_rewind, size); - - if (link_end == 0) - return 0; - - bufput(link, data - rewind, link_end + rewind); - *rewind_p = rewind; - - return link_end; + return autolink_delim(data, link); } -size_t -sd_autolink__url( - size_t *rewind_p, - struct buf *link, - uint8_t *data, - size_t max_rewind, +bool +autolink__url( + struct autolink_pos *link, + const uint8_t *data, + size_t pos, size_t size, unsigned int flags) { - size_t link_end, rewind = 0, domain_len; + assert(data[pos] == ':'); - if (size < 4 || data[1] != '/' || data[2] != '/') - return 0; + if ((size - pos) < 4 || data[pos + 1] != '/' || data[pos + 2] != '/') + return false; - while (rewind < max_rewind && isalpha(data[-rewind - 1])) - rewind++; + link->start = pos + 3; + link->end = 0; - if (!sd_autolink_issafe(data - rewind, size + rewind)) - return 0; + if (!check_domain(data, size, link, flags & AUTOLINK_SHORT_DOMAINS)) + return false; - link_end = strlen("://"); + link->start = pos; + link->end = utf8proc_find_space(data, link->end, size); - domain_len = check_domain( - data + link_end, - size - link_end, - flags & SD_AUTOLINK_SHORT_DOMAINS); + while (link->start && rinku_isalpha(data[link->start - 1])) + link->start--; - if (domain_len == 0) - return 0; + if (!autolink_issafe(data + link->start, size - link->start)) + return false; - link_end += domain_len; - while (link_end < size && !isspace(data[link_end])) - link_end++; - - link_end = autolink_delim(data, link_end, max_rewind, size); - - if (link_end == 0) - return 0; - - bufput(link, data - rewind, link_end + rewind); - *rewind_p = rewind; - - return link_end; + return autolink_delim(data, link); }