require 'open-uri'
require 'zlib'
require 'nokogiri'
require 'sindex'

module Sjunkieex

  class Interface

    STANDARD_CONFIG = {
      url: "http://serienjunkies.org",
      german_only: true,
      subbed_allowed: false,
    }

    attr_reader :options

    def initialize(series_index, options = {})
      @options = STANDARD_CONFIG.merge(options)
      @index = series_index
    end

    # Public: Looks for new episodes on the homepage
    #
    # Returns a Hash of links for sites that should be visited, indexed
    # by link and pointing to the series name
    def look_for_new_episodes
      link_freq = {}

      doc = Nokogiri::XML(get_page_data(@options[:url]))
      doc.css("div#content > div.post > div.post-content a").each do |link|
        content = link.content

        # skip links that are not suitable
        next unless is_useful?(content)

        series_name = Sindex::SeriesIndex.extract_seriesname(content)
        language = get_language_from_link_data(content)
        href = link[:href]

        next if @index.episode_existing?(series_name, content, language)

        # count the occurrences of each link so that the most common one
        # can be selected per series
        link_freq[series_name] ||= {}
        link_freq[series_name][href] ||= 0
        link_freq[series_name][href] += 1
      end

      Hash[link_freq.collect { |series, v| [v.key(v.values.max), series] }]
    end
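
    # Illustrative shape of the value returned by #look_for_new_episodes
    # (the URL and series name below are made up):
    #
    #   { "http://serienjunkies.org/some-episode-post/" => "Some Series" }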
    # Public: parses a series page and extracts links
    #
    # series_name - the series name and the key in the index
    # series_link - the link to the page
    # recursive   - whether paginated follow-up pages are parsed as well
    #
    # Returns a hash indexed by episode identifier
    def parse_series_page(series_name, series_link, recursive = true)
      link_data = Hash.new

      doc = Nokogiri::XML(get_page_data(series_link))
      doc.css("div#content > div.post div.post-content p").each do |paragraph|
        next if paragraph[:class]

        episode_data = paragraph.css("strong:first-child").text
        next unless is_useful?(episode_data)

        language = get_language_from_link_data(episode_data)
        next if @index.episode_existing?(series_name, episode_data, language)

        if (id = Sindex::SeriesIndex.extract_episode_identifier(episode_data))

          # classify the episode resolution
          resolution = :sd
          resolution = :hd_720p  if episode_data.match(/720[pi]/i)
          resolution = :hd_1080p if episode_data.match(/1080[pi]/i)

          # extract hoster links
          episode_links = []
          paragraph.css("a").each do |link|
            episode_links << link[:href]
          end

          link_data[id] ||= Hash.new
          link_data[id][resolution] = episode_links
          link_data[id][:episodedata] = episode_data
          link_data[id][:series] = series_name
        end
      end

      # check if this page contains pagination (for example TBBT has more
      # than 3 pages), in which case all pages have to be parsed recursively
      data_from_other_page = {}
      if recursive && (next_page = doc.css('a.next').first)
        data_from_other_page = parse_series_page(
            series_name, next_page[:href], recursive)
      end

      link_data.merge(data_from_other_page)
    end

    private

    # Internal: check the link data against the configured criteria
    #
    # link_data - data for the link
    #
    # Returns true if the link is useful or false if it can be skipped
    def is_useful?(link_data)
      return false unless link_data.match(/S\d+E\d+/i)
      return false unless @index.is_series_in_index?(link_data)

      language = get_language_from_link_data(link_data)
      return false if language.nil?

      series_name = Sindex::SeriesIndex.extract_seriesname(link_data)
      return false unless @index.is_series_in_this_language?(series_name, language)

      unless @options[:subbed_allowed]
        return false if link_data.match(/Subbed/i)
      end

      true
    end

    # Internal: determines the language the link data is in
    #
    # data - link data
    #
    # Returns :de, :en, or nil if the data contains no episode pattern
    def get_language_from_link_data(data)
      return nil unless data.match(/S\d+E\d+/i)

      if data.match(/German/i)
        :de
      else
        :en
      end
    end

    # Internal: gets a page and decompresses it if it is gzip-encoded
    #
    # link - the link that is fetched
    #
    # Returns the page content
    def get_page_data(link)
      body = nil

      # open-uri's Kernel#open handles both local files and web URLs
      stream = open(link)
      if stream.is_a? File
        # the stream is a local file and does not have the methods below
        body = stream.read
      else
        # the stream is a web resource
        if stream.content_encoding.empty?
          body = stream.read
        else
          body = Zlib::GzipReader.new(stream).read
        end
      end

      strip_multiple_doctypes(body)
    end

    # Internal: keeps only the last document if the content contains more
    # than one DOCTYPE declaration
    #
    # content - the page content
    #
    # Returns the stripped content
    def strip_multiple_doctypes(content)
      content.split(/(?=<!DOCTYPE)/).last
    end
  end
end
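
# A minimal usage sketch (not part of the original file). The construction
# of the Sindex::SeriesIndex below is hypothetical; check the sindex gem
# for its actual API. Everything else uses only the methods defined above:
#
#   index     = Sindex::SeriesIndex.new              # hypothetical call
#   interface = Sjunkieex::Interface.new(index, subbed_allowed: true)
#
#   interface.look_for_new_episodes.each do |link, series|
#     episodes = interface.parse_series_page(series, link)
#     episodes.each do |id, data|
#       puts "#{data[:series]} #{id}: #{(data[:hd_720p] || data[:sd]).inspect}"
#     end
#   end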