lib/regex.rb in twitter-text-1.1.2 vs lib/regex.rb in twitter-text-1.1.4
- old
+ new
@@ -1,5 +1,6 @@
+# coding: UTF-8
module Twitter
# A collection of regular expressions for parsing Tweet text. The regular expression
# list is frozen at load time to ensure immutability. These regular expressions are
# used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
@@ -28,38 +29,47 @@
REGEXEN[:at_signs] = /[@@]/
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
- REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
+ REGEXEN[:list_name] = /^[a-zA-Z\u0080-\u00ff].{0,79}$/
# Latin accented characters (excludes 0xD7, the multiplication sign, which is confusable with "x", and 0xF7, the division sign)
LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
# Characters valid inside a hashtag. Latin accented characters are not valid at the beginning, where only a-z, 0-9 and _ are accepted.
HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
- REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\x80-\xff\-]{0,79})?/
+ REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\u0080-\u00ff\-]{0,79})?/
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
# URL related hash regex collection
REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
- REGEXEN[:valid_url_path_chars] = /[\.\,]?[a-z0-9!\*'\(\);:=\+\$\/%#\[\]\-_,~@]/i
+
+ # Allow URL paths to contain balanced parens
+ # 1. Used in Wikipedia URLs like /Primer_(film)
+ # 2. Used in IIS sessions like /S(dfd346)/
+ REGEXEN[:wikipedia_disambiguation] = /(?:\([^\)]+\))/i
+ REGEXEN[:valid_url_path_chars] = /(?:
+ #{REGEXEN[:wikipedia_disambiguation]}|
+ [\.\,]?[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~@]
+ )/ix
# Valid end-of-path characters (so /foo. does not gobble the period).
- # 1. Allow ) for Wikipedia URLs.
- # 2. Allow =&# for empty URL parameters and other URL-join artifacts
- REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
+ # 1. Allow =# for empty URL parameters and other URL-join artifacts
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=#\/]/i
REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
REGEXEN[:valid_url] = %r{
( # $1 total match
(#{REGEXEN[:valid_preceding_chars]}) # $2 Preceding character
( # $3 URL
(https?:\/\/|www\.) # $4 Protocol or beginning
(#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
- (/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)? # $6 URL Path
+ (/#{REGEXEN[:valid_url_path_chars]}*
+ #{REGEXEN[:valid_url_path_ending_chars]}?
+ )? # $6 URL Path
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String
)
)
}iox;