# frozen_string_literal: true

require 'nokogiri'
require 'open-uri'

# Scrapes movie listings, search results and title details from IMDb pages.
#
# Every public method issues one HTTP GET against BASE_URL and extracts data
# with CSS selectors, so the results depend entirely on the markup served at
# call time. Network and HTTP errors raised by open-uri propagate to the caller.
class Scraper
  BASE_URL = 'http://www.imdb.com'

  # Matches every character that is NOT legal inside a URI query string.
  # Legal characters (including "+", "%", "&" and "=") are left untouched so
  # that callers who already pass an encoded title keep getting the same URL.
  UNSAFE_QUERY_CHARS = %r{[^A-Za-z0-9\-._~!$&'()*+,;=:@/?%]}
  private_constant :UNSAFE_QUERY_CHARS

  # Lists the movies found in the LAST "sub-list" section of the
  # /movies-in-theaters page.
  #
  # @return [Array<Hash>] one hash per movie link with keys :id and :title;
  #   empty when the page contains no such section
  def self.scrape_now_playing
    theater_listing(theater_sections.last)
  end

  # Lists the movies found in the FIRST "sub-list" section of the
  # /movies-in-theaters page.
  #
  # @return [Array<Hash>] one hash per movie link with keys :id and :title;
  #   empty when the page contains no such section
  def self.scrape_opening_this_week
    theater_listing(theater_sections.first)
  end

  # Searches via /find?q=<title> and returns the entries of the first result
  # section on the page.
  #
  # @param title [#to_s] search text, raw ("Iron Man") or already
  #   form-encoded ("Iron+Man"); only URI-illegal characters are escaped
  # @return [Array<Hash>] one hash per result with keys
  #   :id           - id extracted from the first link's href
  #   :title        - concatenated text of all links in the result cell
  #   :release_year - last four-digit number starting with 1 or 2 in the
  #                   cell text (String), or nil
  #   :type         - MatchData for the text inside the trailing parenthesised
  #                   label (call #to_s for the String), or nil when the last
  #                   parenthesised group does not start with a letter
  #   Empty when the page has no result section.
  def self.scrape_movie_by_title(title)
    doc = fetch_document("/find?q=#{escape_query(title)}")
    section = doc.css('div.findSection').first
    return [] unless section

    section.css('td.result_text').map do |result|
      text = result.text
      links = result.css('a')
      label = trailing_label(text)
      {
        :id => extract_id(links.attribute('href').value),
        :title => links.text,
        :release_year => text.scan(/[12]\d{3}/).last,
        # Kept as MatchData (not String) so existing callers see the same type.
        :type => (label.match(/[\w ]+/) if label)
      }
    end
  end

  # Scrapes the detail page /title/tt<id>.
  #
  # @param id [#to_s] title id without the leading "tt"
  # @return [Hash] keys :id (the argument, unchanged), :title, :release_year,
  #   :content_rating (nil when the page has no rating meta tag), :runtime,
  #   :genres (Array<String>), :summary, :director (names joined with ", ")
  #   and :stars (Array<String>)
  def self.scrape_movie_by_id(id)
    doc = fetch_document("/title/tt#{id}")
    overview = doc.css('div#title-overview-widget')
    directors = overview.css('div.plot_summary_wrapper span[itemprop="director"] span')
    rating = overview.css('div.subtext meta').first
    {
      :id => id,
      # Drops the final seven characters of the heading text, i.e. whatever
      # follows the name in the <h1> (presumably the " (YYYY)" suffix).
      :title => overview.css('div.title_wrapper > h1').text.strip[0..-8],
      :release_year => overview.css('span#titleYear a').text.strip,
      :content_rating => rating && rating['content'],
      :runtime => overview.css('div.subtext time').text.strip,
      :genres => overview.css('div.subtext span.itemprop').map(&:text),
      :summary => overview.css('div.plot_summary_wrapper div.summary_text').text.strip,
      :director => directors.map(&:text).join(', '),
      :stars => overview.css('div.plot_summary_wrapper span[itemprop="actors"] span')
                        .map(&:text)
    }
  end

  # Downloads BASE_URL + path and parses it as HTML. URI#open (instead of
  # Kernel#open, which no longer accepts URLs on Ruby 3+) is used in block
  # form so the response IO is closed once parsing is done.
  def self.fetch_document(path)
    URI.parse("#{BASE_URL}#{path}").open { |page| Nokogiri::HTML(page) }
  end

  # Returns the "sub-list" sections of the in-theaters page as a NodeSet.
  def self.theater_sections
    fetch_document('/movies-in-theaters').css('div#main div.list.detail.sub-list')
  end

  # Builds the :id/:title hashes for every poster link inside +section+;
  # returns [] when +section+ is nil.
  def self.theater_listing(section)
    return [] unless section

    section.css('td#img_primary a').map do |link|
      {
        :id => extract_id(link['href']),
        # The image title has its final six characters removed before
        # stripping (presumably a "(YYYY)" suffix).
        :title => link.css('img').attribute('title').value[0..-7].strip
      }
    end
  end

  # Takes the third "/"-separated segment of +href+ and drops its first two
  # characters, e.g. "/title/tt0371746/" => "0371746".
  def self.extract_id(href)
    href.split('/')[2][2..-1]
  end

  # Percent/plus-encodes only the characters that would make the URL invalid.
  def self.escape_query(title)
    title.to_s.gsub(UNSAFE_QUERY_CHARS) { |char| URI.encode_www_form_component(char) }
  end

  # Returns the last "(...)" group of word characters/spaces in +text+ when
  # its first inner character is a letter, otherwise nil.
  def self.trailing_label(text)
    candidate = text.scan(/\([\w ]+\)/).last
    candidate if candidate && candidate[1] =~ /[A-Za-z]/
  end

  private_class_method :fetch_document, :theater_sections, :theater_listing,
                       :extract_id, :escape_query, :trailing_label
end