require 'yaml' require 'xmlsimple' class ImdbMovie include Comparable attr_reader :id, :url#, :title def initialize(id, title = nil) @id = id @url = "http://www.imdb.com/title/tt#{@id}/" @title = title end # this is intended to be stubed by rspec where it # should return true. def self.use_html_cache false end # add comparator so Arrays containing ImdbMovie objects # can use uniq() def <=>(other) @id <=> other.id end def title if @title.nil? @title = document.at("div#tn15title h1").innerHTML.split('').first.strip.unescape_html rescue nil end @title end def directors document.search("h5[text()^='Director'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue [] end def poster_url document.at("a[@name='poster']")['href'] rescue nil end def tiny_poster_url document.at("a[@name='poster'] img")['src'] rescue nil end def poster ImdbImage.new(poster_url) rescue nil end def rating document.at("h5[text()='User Rating:'] ~ b").innerHTML.strip.unescape_html.split('/').first.to_f rescue nil end def cast_members # document.search("table.cast td.nm a").map { |link| link.innerHTML.strip.unescape_html } rescue [] document.search("table.cast tr").inject([]) do |result, row| a = row.search("td.nm a").innerHTML.strip.unescape_html c = row.search("td.char a").innerHTML.strip.unescape_html if c.empty? c = row.search("td.char").innerHTML.strip.unescape_html end result << [a,c] end end def writers document.search("h5[text()^='Writer'] ~ a").map { |link| link.innerHTML.strip.unescape_html }.reject { |w| w == 'more' }.uniq rescue [] end def year document.search('a[@href^="/Sections/Years/"]').innerHTML end def release_date date = document.search("//h5[text()^='Release Date']/..").innerHTML[/^\d{1,2} \w+ \d{4}/] Date.parse(Chronic.parse(date).strftime('%Y/%m/%d')) rescue nil end def genres document.search("h5[text()='Genre:'] ~ a[@href*=/Sections/Genres/']").map { |link| link.innerHTML.strip.unescape_html } rescue [] end def plot document.search("//h5[text()^='Plot']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil end def tagline document.search("//h5[text()^='Tagline']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil end def aspect_ratio document.search("//h5[text()^='Aspect Ratio']/..").innerHTML.split("\n")[2].gsub(/<.+>.+<\/.+>/, '').strip.unescape_html rescue nil end def length document.search("//h5[text()^='Runtime']/..").innerHTML[/\d+ min/] rescue nil end def countries document.search("h5[text()='Country:'] ~ a[@href*=/Sections/Countries/']").map { |link| link.innerHTML.strip.unescape_html } rescue [] end def languages document.search("h5[text()='Language:'] ~ a[@href*=/Sections/Languages/']").map { |link| link.innerHTML.strip.unescape_html } rescue [] end def color document.at("h5[text()='Color:'] ~ a[@href*=color-info']").innerHTML.strip.unescape_html rescue nil end def company document.at("h5[text()='Company:'] ~ a[@href*=/company/']").innerHTML.strip.unescape_html rescue nil end def photos document.search(".media_strip_thumb img").map { |img| img['src'] } rescue [] end # return the raw title def raw_title document.at("h1").innerText end # is this a video game as indicated by a '(VG)' in the raw title? def video_game? raw_title =~ /$VG$/ end # find the release year # Note, this is needed because not all entries on IMDB have a full # release date as parsed by release_date. def release_year document.search("//h5[text()^='Release Date']/..").innerHTML[/\d{4}/] end # return an Array of Strings containing AKA titles def also_known_as el = document.search("//h5[text()^='Also Known As:']/..").at('h5') aka = [] while(!el.nil?) aka << el.to_s unless el.elem? el = el.next end aka.collect!{|a| remove_parens(a).strip} aka.uniq! aka.compact! aka.select{|a| !a.empty?} end def remove_parens(str) while str =~ /$.*$/ str.gsub!(/$[^$$]*$/, '') end str end # The MPAA rating, i.e. "PG-13" def mpaa document.search("//h5[text()^='MPAA']/..").text.gsub('MPAA:', '').strip rescue nil end # older films may not have MPAA ratings but usually have a certification. # return a hash with country abbreviations for keys and the certification string for the value # example: {'USA' => 'Approved'} def certifications certs = [] cert_set = document.search("h5[text()='Certification:'] ~ a[@href*=/List?certificates']").map { |link| link.innerHTML.strip } rescue [] cert_set.each do |line| if line =~ /(.*):(.*)/ cert_hash = {} cert_hash['country'] = $1 cert_hash['rating'] = $2 certs << cert_hash end end certs end def to_hash hash = {} [:title, :directors, :poster_url, :tiny_poster_url, :poster, :rating, :cast_members, :writers, :year, :genres, :plot, :tagline, :aspect_ratio, :length, :release_date, :countries, :languages, :color, :company, :photos, :raw_title, :release_year, :also_known_as, :mpaa, :certifications ].each do |sym| begin value = send(sym.to_s) hash[sym.to_s] = value unless value.nil? rescue Exception => e puts "Error getting data for hash for #{sym} - #{e.to_s}" end end hash end def to_xml XmlSimple.xml_out(to_hash, 'NoAttr' => true, 'RootName' => 'movie') end def to_yaml YAML.dump(to_hash) end private # def update_title # @title = document.at("h1").innerHTML.split('').first.unescape_html rescue nil # end # Fetch the document with retry to handle the occasional glitches def document if @document.nil? html = fetch(self.url.escape_unicode) @document = Hpricot(html) end @document end MAX_ATTEMPTS = 3 SECONDS_BETWEEN_RETRIES = 1.0 def fetch(page) doc = nil attempts = 0 begin doc = read_page(page) rescue Exception => e attempts += 1 if attempts > MAX_ATTEMPTS raise else sleep SECONDS_BETWEEN_RETRIES retry end end doc end def read_page(page) open(page).read end end