lib/regex.rb in twitter-text-1.4.2 vs lib/regex.rb in twitter-text-1.4.5
- old
+ new
@@ -22,10 +22,11 @@
0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
].flatten.freeze
+ SPACE_CHAR_CLASS_VALUE = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join(''))
REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
REGEXEN[:at_signs] = /[@@]/
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
@@ -40,28 +41,61 @@
# Latin accented characters
# Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
# Also excludes 0xf7, the division sign
LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
+ NON_LATIN_HASHTAG_CHARS = [
+ # Cyrillic (Russian, Ukrainian, etc.)
+ (0x0400..0x04ff).to_a, # Cyrillic
+ (0x0500..0x0527).to_a, # Cyrillic Supplement
+ # Hangul (Korean)
+ (0x1100..0x11ff).to_a, # Hangul Jamo
+ (0x3130..0x3185).to_a, # Hangul Compatibility Jamo
+ (0xA960..0xA97F).to_a, # Hangul Jamo Extended-A
+ (0xAC00..0xD7AF).to_a, # Hangul Syllables
+ (0xD7B0..0xD7FF).to_a # Hangul Jamo Extended-B
+ ].flatten.pack('U*').freeze
REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
+ CJ_HASHTAG_CHARACTERS = [
+ (0x30A1..0x30FA).to_a, # Katakana (full-width)
+ (0xFF66..0xFF9D).to_a, # Katakana (half-width)
+ (0xFF10..0xFF19).to_a, (0xFF21..0xFF3A).to_a, (0xFF41..0xFF5A).to_a, # Latin (full-width)
+ (0x3041..0x3096).to_a, # Hiragana
+ (0x3400..0x4DBF).to_a, # Kanji (CJK Extension A)
+ (0x4E00..0x9FFF).to_a, # Kanji (Unified)
+ (0x20000..0x2A6DF).to_a, # Kanji (CJK Extension B)
+ (0x2A700..0x2B73F).to_a, # Kanji (CJK Extension C)
+ (0x2B740..0x2B81F).to_a, # Kanji (CJK Extension D)
+ (0x2F800..0x2FA1F).to_a # Kanji (CJK supplement)
+ ].flatten.pack('U*').freeze
+
+ CJ_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|「|」|。|\.)/
+
# A hashtag must contain latin characters, numbers and underscores, but not all numbers.
- HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}]/io
- HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}]/io
- REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
+ HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}]/io
+ HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}]/io
+
+ CJ_HASHTAG = /(#{CJ_BOUNDARY})(#|#)([#{CJ_HASHTAG_CHARACTERS}]+)(?=#{CJ_BOUNDARY})/o
+ NON_CJ_HASHTAG = /(^|[^0-9A-Z&\/\?]+)(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
+
+ REGEXEN[:auto_link_hashtags] = /#{NON_CJ_HASHTAG}/io
+ REGEXEN[:auto_link_cj_hashtags] = /#{CJ_HASHTAG}/io
+
REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
# URL related hash regex collection
REGEXEN[:valid_preceding_chars] = /(?:[^-\/"':!=A-Z0-9_@@]|^|\:)/i
- REGEXEN[:valid_subdomain] = /([^[:punct:]\s]([_-]|[^[:punct:]\s])*)?[^[:punct:]\s]\./
- REGEXEN[:valid_domain_name] = /([^[:punct:]\s]([-]|[^[:punct:]\s])*)?[^[:punct:]\s]/
- REGEXEN[:valid_domain] = /#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}\.[a-z]{2,}(?::[0-9]+)?/i
+ DOMAIN_EXCLUDE_PART = "[:punct:][:space:][:blank:]#{[0x00A0].pack('U')}"
+ REGEXEN[:valid_subdomain] = /(?:[^#{DOMAIN_EXCLUDE_PART}](?:[_-]|[^#{DOMAIN_EXCLUDE_PART}])*)?[^#{DOMAIN_EXCLUDE_PART}]\./
+ REGEXEN[:valid_domain_name] = /(?:[^#{DOMAIN_EXCLUDE_PART}](?:[-]|[^#{DOMAIN_EXCLUDE_PART}])*)?[^#{DOMAIN_EXCLUDE_PART}]/
+ REGEXEN[:valid_domain] = /#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}\.(?:xn--[a-z0-9]{2,}|[a-z]{2,})(?::[0-9]+)?/i
- REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~|]/i
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~|\.]/i
# Allow URL paths to contain balanced parens
# 1. Used in Wikipedia URLs like /Primer_(film)
# 2. Used in IIS sessions like /S(dfd346)/
REGEXEN[:wikipedia_disambiguation] = /(?:\(#{REGEXEN[:valid_general_url_path_chars]}+\))/i
# Allow @ in a url, but only in the middle. Catch things like http://example.com/@user