lib/regex.rb in twitter-text-1.1.8 vs lib/regex.rb in twitter-text-1.2.0
- old
+ new
@@ -41,20 +41,27 @@
# Latin accented characters. 0xD7 (the multiplication sign, easily confused with "x") and 0xF7 (the division sign) are left out of the ranges.
LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
+ REGEXEN[:end_screen_name_match] = /#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}/o
+
# Characters considered valid inside a hashtag. Latin accents are not valid at the beginning, where only a-z, 0-9 and underscore are accepted.
HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
- REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
- REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/#{REGEXEN[:list_name]})?/o
+ REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
+ REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?($|.)/o
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
# URL related hash regex collection
- REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
+ REGEXEN[:valid_preceding_chars] = /(?:[^-\/"':!=A-Z0-9_]|^|\:)/i
REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
+ # For protocol-less URLs, we'll accept them if they end in one of a handful of likely TLDs
+ REGEXEN[:probable_tld] = /\.(?:com|net|org|gov|edu)$/i
+
+ REGEXEN[:www] = /www\./i
+
REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~]/i
# Allow URL paths to contain balanced parens
# 1. Used in Wikipedia URLs like /Primer_(film)
# 2. Used in IIS sessions like /S(dfd346)/
REGEXEN[:wikipedia_disambiguation] = /(?:\(#{REGEXEN[:valid_general_url_path_chars]}+\))/i
@@ -71,10 +78,10 @@
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
REGEXEN[:valid_url] = %r{
( # $1 total match
(#{REGEXEN[:valid_preceding_chars]}) # $2 Preceding character
( # $3 URL
- (https?:\/\/|www\.) # $4 Protocol or beginning
+ ((?:https?:\/\/|www\.)?) # $4 Protocol or beginning
(#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
(/#{REGEXEN[:valid_url_path_chars]}*
#{REGEXEN[:valid_url_path_ending_chars]}?
)? # $6 URL Path
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String