lib/regex.rb in twitter-text-1.4.2 vs lib/regex.rb in twitter-text-1.4.5

- old
+ new

@@ -22,10 +22,11 @@ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE ].flatten.freeze + SPACE_CHAR_CLASS_VALUE = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('')) REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|')) REGEXEN[:at_signs] = /[@＠]/ REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o @@ -40,28 +41,61 @@ # Latin accented characters # Excludes 0xd7 from the range (the multiplication sign, confusable with "x"). # Also excludes 0xf7, the division sign LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze + NON_LATIN_HASHTAG_CHARS = [ + # Cyrillic (Russian, Ukrainian, etc.) + (0x0400..0x04ff).to_a, # Cyrillic + (0x0500..0x0527).to_a, # Cyrillic Supplement + # Hangul (Korean) + (0x1100..0x11ff).to_a, # Hangul Jamo + (0x3130..0x3185).to_a, # Hangul Compatibility Jamo + (0xA960..0xA97F).to_a, # Hangul Jamo Extended-A + (0xAC00..0xD7AF).to_a, # Hangul Syllables + (0xD7B0..0xD7FF).to_a # Hangul Jamo Extended-B + ].flatten.pack('U*').freeze REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o + CJ_HASHTAG_CHARACTERS = [ + (0x30A1..0x30FA).to_a, # Katakana (full-width) + (0xFF66..0xFF9D).to_a, # Katakana (half-width) + (0xFF10..0xFF19).to_a, (0xFF21..0xFF3A).to_a, (0xFF41..0xFF5A).to_a, # Latin (full-width) + (0x3041..0x3096).to_a, # Hiragana + (0x3400..0x4DBF).to_a, # Kanji (CJK Extension A) + (0x4E00..0x9FFF).to_a, # Kanji (Unified) + (0x20000..0x2A6DF).to_a, # Kanji (CJK Extension B) + (0x2A700..0x2B73F).to_a, # Kanji (CJK Extension C) + (0x2B740..0x2B81F).to_a, # Kanji (CJK Extension D) + 
(0x2F800..0x2FA1F).to_a # Kanji (CJK supplement) + ].flatten.pack('U*').freeze + + CJ_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|「|」|。|\.)/ + # A hashtag must contain latin characters, numbers and underscores, but not all numbers. - HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}]/io - HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}]/io - REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io + HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}]/io + HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}]/io + + CJ_HASHTAG = /(#{CJ_BOUNDARY})(#|＃)([#{CJ_HASHTAG_CHARACTERS}]+)(?=#{CJ_BOUNDARY})/o + NON_CJ_HASHTAG = /(^|[^0-9A-Z&\/\?]+)(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io + + REGEXEN[:auto_link_hashtags] = /#{NON_CJ_HASHTAG}/io + REGEXEN[:auto_link_cj_hashtags] = /#{CJ_HASHTAG}/io + REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/ # URL related hash regex collection REGEXEN[:valid_preceding_chars] = /(?:[^-\/"':!=A-Z0-9_@＠]|^|\:)/i - REGEXEN[:valid_subdomain] = /([^[:punct:]\s]([_-]|[^[:punct:]\s])*)?[^[:punct:]\s]\./ - REGEXEN[:valid_domain_name] = /([^[:punct:]\s]([-]|[^[:punct:]\s])*)?[^[:punct:]\s]/ - REGEXEN[:valid_domain] = /#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}\.[a-z]{2,}(?::[0-9]+)?/i + DOMAIN_EXCLUDE_PART = "[:punct:][:space:][:blank:]#{[0x00A0].pack('U')}" + REGEXEN[:valid_subdomain] = /(?:[^#{DOMAIN_EXCLUDE_PART}](?:[_-]|[^#{DOMAIN_EXCLUDE_PART}])*)?[^#{DOMAIN_EXCLUDE_PART}]\./ + REGEXEN[:valid_domain_name] = /(?:[^#{DOMAIN_EXCLUDE_PART}](?:[-]|[^#{DOMAIN_EXCLUDE_PART}])*)?[^#{DOMAIN_EXCLUDE_PART}]/ + REGEXEN[:valid_domain] = 
/#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}\.(?:xn--[a-z0-9]{2,}|[a-z]{2,})(?::[0-9]+)?/i - REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~|]/i + REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~|\.]/i # Allow URL paths to contain balanced parens # 1. Used in Wikipedia URLs like /Primer_(film) # 2. Used in IIS sessions like /S(dfd346)/ REGEXEN[:wikipedia_disambiguation] = /(?:\(#{REGEXEN[:valid_general_url_path_chars]}+\))/i # Allow @ in a url, but only in the middle. Catch things like http://example.com/@user