class String
  # Counts the characters (not bytes) in this string.
  #
  # When the string responds to +codepoints+, +length+ is used directly.
  # Otherwise the characters are counted through +chars+, because +length+
  # may be incorrect for unicode strings on such runtimes.
  def char_length
    if respond_to? :codepoints
      length
    else
      chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
    end
  end

  # Converts this string into an array of unicode characters.
  #
  # The array is rebuilt on every call. It is deliberately not cached in an
  # instance variable: that would raise on frozen strings and would go stale
  # as soon as the string is mutated.
  def to_char_a
    if chars.kind_of?(Enumerable)
      chars.to_a
    else
      char_array = []
      0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
      char_array
    end
  end

  # Finds the character index of +sub_str+ in this string, starting the
  # search at character +position+. This is needed because with unicode
  # strings the return value of +index+ may be incorrect.
  #
  # When the string responds to +codepoints+ the result of +index+ is returned
  # as is, which means +nil+ when nothing is found. The fallback branch
  # returns -1 in that case instead.
  def sub_string_search(sub_str, position = 0)
    if respond_to? :codepoints
      index(sub_str, position)
    else
      # Build the character array once; it is consulted for every candidate.
      characters = to_char_a
      width = sub_str.char_length
      hit = characters[position..-1].each_with_index.find do |e|
        characters.slice(e.last + position, width).map { |ci| ci.first }.join == sub_str
      end
      hit.nil? ? -1 : hit.last + position
    end
  end
end

module Twitter
  # A module for including Tweet parsing in a class. This module provides
  # functions for the extraction and processing of usernames, lists, URLs and
  # hashtags. The patterns are looked up in Twitter::Regex, which is not
  # defined in this section.
  module Extractor
    # Extracts a list of all usernames mentioned in the Tweet text. If the
    # text is nil or contains no username mentions an empty array
    # will be returned.
    #
    # If a block is given then it will be called for each username.
    def extract_mentioned_screen_names(text) # :yields: username
      screen_names_only = extract_mentioned_screen_names_with_indices(text).map { |mention| mention[:screen_name] }
      screen_names_only.each { |mention| yield mention } if block_given?
      screen_names_only
    end

    # Extracts a list of all usernames mentioned in the Tweet text
    # along with the indices for where the mention occurred. If the
    # text is nil or contains no username mentions, an empty array
    # will be returned.
    #
    # Each element is a hash with the keys :screen_name and :indices, where
    # :indices is [start, end] and start is the position of the at-sign.
    #
    # If a block is given, then it will be called with each username, the start
    # index, and the end index in the text.
    def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
      return [] unless text

      possible_screen_names = []
      text.to_s.scan(Twitter::Regex[:extract_mentions]) do |_before, sn, after|
        # Grab the match data first: the =~ below replaces the last match.
        match = Regexp.last_match
        next if after =~ Twitter::Regex[:at_signs]

        # The at-sign sits immediately before the captured screen name.
        start_position = capture_char_begin(match, 2) - 1
        end_position = start_position + sn.char_length + 1
        possible_screen_names << {
          :screen_name => sn,
          :indices => [start_position, end_position]
        }
      end

      if block_given?
        possible_screen_names.each do |mention|
          yield mention[:screen_name], mention[:indices].first, mention[:indices].last
        end
      end
      possible_screen_names
    end

    # Extracts the username replied to in the Tweet text. If the
    # text is nil or is not a reply, nil will be returned.
    #
    # If a block is given then it will be called with the username replied to (if any).
    def extract_reply_screen_name(text) # :yields: username
      return nil unless text

      possible_screen_name = text.to_s.match(Twitter::Regex[:extract_reply])
      return unless possible_screen_name.respond_to?(:captures)

      screen_name = possible_screen_name.captures.first
      yield screen_name if block_given?
      screen_name
    end

    # Extracts a list of all URLs included in the Tweet text. If the
    # text is nil or contains no URLs an empty array
    # will be returned.
    #
    # If a block is given then it will be called for each URL.
    def extract_urls(text) # :yields: url
      urls_only = extract_urls_with_indices(text).map { |url| url[:url] }
      urls_only.each { |url| yield url } if block_given?
      urls_only
    end

    # Extracts a list of all URLs included in the Tweet text along
    # with the indices. If the text is nil or contains no
    # URLs an empty array will be returned.
    #
    # Each element is a hash with the keys :url and :indices. A URL whose
    # protocol capture is "www." is returned with "http://" prepended; the
    # indices always describe the text as it appears in the Tweet.
    #
    # If a block is given then it will be called with each URL, the start
    # index, and the end index in the text.
    def extract_urls_with_indices(text) # :yields: url, start, end
      return [] unless text

      urls = []
      text.to_s.scan(Twitter::Regex[:valid_url]) do |_all, _before, url, protocol, _domain, _path, _query|
        # url is the third capture, as the block parameters above show.
        start_position = capture_char_begin(Regexp.last_match, 3)
        end_position = start_position + url.char_length
        urls << {
          :url => (protocol == "www." ? "http://#{url}" : url),
          :indices => [start_position, end_position]
        }
      end

      urls.each { |url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
      urls
    end

    # Extracts a list of all hashtags included in the Tweet text. If the
    # text is nil or contains no hashtags an empty array
    # will be returned. The array returned will not include the leading #
    # character.
    #
    # If a block is given then it will be called for each hashtag.
    def extract_hashtags(text) # :yields: hashtag_text
      hashtags_only = extract_hashtags_with_indices(text).map { |hash| hash[:hashtag] }
      hashtags_only.each { |hash| yield hash } if block_given?
      hashtags_only
    end

    # Extracts a list of all hashtags included in the Tweet text along with
    # the indices. If the text is nil or contains no hashtags an empty array
    # will be returned.
    #
    # Each element is a hash with the keys :hashtag and :indices. The :hashtag
    # value does not include the leading # character, while the start index
    # points at that character.
    #
    # If a block is given then it will be called with each hashtag, the start
    # index, and the end index in the text.
    def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
      return [] unless text

      tags = []
      text.to_s.scan(Twitter::Regex[:auto_link_hashtags]) do |_before, _hash, hash_text|
        # Capture 2 is the hash character itself; the tag text follows it.
        start_position = capture_char_begin(Regexp.last_match, 2)
        end_position = start_position + hash_text.char_length + 1
        tags << {
          :hashtag => hash_text,
          :indices => [start_position, end_position]
        }
      end

      tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
      tags
    end

    private

    # Returns the character position at which capture +group+ of +match+
    # begins. Using the match itself avoids re-searching the text, which can
    # land on an earlier, unrelated occurrence of the captured text.
    #
    # When the matched string responds to +codepoints+ the value of
    # MatchData#begin is returned unchanged. Otherwise it is treated as a byte
    # offset and converted by counting the characters that precede it.
    def capture_char_begin(match, group)
      offset = match.begin(group)
      subject = match.string
      return offset if subject.respond_to?(:codepoints)

      subject[0, offset].char_length
    end
  end
end