# encoding: UTF-8
require 'media_wiki'

module Plugin

  # This plugin tries to extract episode titles for a series
  # from Wikipedia.
  #
  # (for now only the German Wikipedia is supported)
  class Wikipedia < Serienrenamer::Pluginbase

    def self.plugin_name; "Wikipedia" end
    def self.usable; true end
    def self.priority; 30 end

    @@WIKIPEDIA_URL = 'http://de.wikipedia.org/w/api.php'

    # patterns used in this class
    @@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m
    @@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/
    @@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m
    @@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m
    @@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/
    # NOTE(review): the opening "<div ...>" part of the next two patterns was
    # lost from the reviewed copy of this file (HTML-like tokens were stripped).
    # They are reconstructed here -- confirm against the upstream plugin.
    @@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
    @@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
    @@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
    @@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m

    # This method is called from the main program with a
    # Serienrenamer::Episode instance as parameter.
    #
    # The parsed episode lists are cached per series name, so Wikipedia
    # is only queried the first time a series is requested.
    #
    # Returns an array with the episode name (zero or one element).
    # Raises ArgumentError if +episode+ is not a Serienrenamer::Episode.
    def self.generate_episode_information(episode)
      raise ArgumentError, "Serienrenamer::Episode instance needed" unless
        episode.is_a?(Serienrenamer::Episode)

      return [] unless episode.series.match(/\w+/)

      @cached_data ||= {}

      unless @cached_data.has_key?(episode.series)
        wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)

        # search for a series site in wikipedia
        series_site = find_series_site(wiki, episode.series)
        return [] unless series_site

        # look for a link to a list of episodes, otherwise for an
        # episode list that is embedded in the series article itself
        pagedata = wiki.get(series_site)
        return [] unless pagedata

        link = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)
        if link
          episodelist_page = wiki.get(link[:main])
          # the linked list article may not exist
          unless episodelist_page
            warn "no episode list found"
            return []
          end
          @cached_data[episode.series] = parse_episodelist_page_data(episodelist_page)

        elsif contains_inarticle_episode_list?(pagedata)
          @cached_data[episode.series] = parse_inarticle_episodelist_page_data(pagedata)

        else
          warn "no episode list found"
          return []
        end
      end

      episode_names = []

      # tries to find an episodename in cached_data
      # otherwise returns empty array
      begin
        series = @cached_data[episode.series]
        episodename = series[episode.season][episode.episode]
        episode_names.push(episodename) if episodename.match(/\w+/)
      rescue NoMethodError, TypeError
        # best effort: unknown season/episode or unusable index,
        # the caller simply gets no suggestion from this plugin
      end

      return episode_names
    end

    # Searches Wikipedia for the main article of a series.
    #
    # First the complete series name is searched. If that yields no
    # series article and the name consists of more than its last word,
    # the search is repeated with the last word only.
    # At most three MediaWiki::APIError failures are tolerated in total.
    #
    # Returns the page title or nil if nothing was found.
    def self.find_series_site(wiki, series_name)
      search_patterns = [series_name]
      last_word = series_name[/(\w+)\s*$/, 1]
      search_patterns << last_word if last_word && last_word != series_name

      tries = 3
      search_patterns.each do |search_pattern|
        begin
          wiki.search(search_pattern, nil, 50).each do |title|
            pagedata = wiki.get(title)
            return title if pagedata && is_series_main_page?(pagedata)
          end
        rescue MediaWiki::APIError
          tries -= 1
          retry if tries > 0
          return nil
        end
      end

      nil
    end
    private_class_method :find_series_site

    # This method extracts season based information from a string
    # that contains a wikipedia episodelist page
    #
    # +debug+ is accepted for compatibility and is not used.
    #
    # returns an Array of Arrays with episode information
    # where season and episode numbers are the indices
    def self.parse_episodelist_page_data(pagedata, debug=false)
      raise ArgumentError, 'String with pagedata expected' unless
        pagedata.is_a?(String)

      series_data = []
      is_season_table_following = false
      season_number = nil

      # split the wikipedia page by headings and process
      # the following paragraph if the heading starts with
      # 'Staffel'; the capture group keeps the headings in
      # the split result
      pagedata.split(/(==.*)==/).each do |paragraph|
        if paragraph.match(/^==.*Staffel/)
          match = paragraph.match(/^==.*Staffel.(?<seasonnr>\d+)/)
          if match
            season_number = match[:seasonnr].to_i
            is_season_table_following = true
          end
        elsif is_season_table_following
          # extract season table from this paragraph
          series_data[season_number] = parse_season_table(paragraph)
          is_season_table_following = false
        end
      end

      return series_data
    end

    # This method is called with a wikipedia season table as
    # parameter, extracts all episodes from it and returns them
    # as an array where the episode number is the index
    #
    # Entries without season episode number ('NR_ST') or without a
    # usable German title ('DT') are skipped.
    def self.parse_season_table(table)
      raise ArgumentError, 'String with seasontable expected' unless
        table.is_a?(String)

      season_data = []

      matched_table = table.match(@@EPISODE_TABLE_PATTERN)
      return season_data unless matched_table

      # extract all episode entries that
      # look like the following
      #
      # {{Episodenlisteneintrag
      # | NR_GES = 107
      # | NR_ST = 1
      # | OT = The Mastodon in the Room
      # | DT = Die Rückkehr der Scheuklappen
      # | ZF =
      # | EA = {{dts|23|09|2010}}
      # | EAD = {{dts|08|09|2011}}
      # }}
      matched_table[:table].split(@@EPISODE_ENTRY_PATTERN).each do |epi|

        # build up a hash from the entry
        infos = {}
        epi.lines.each do |part|
          parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
          infos[parts[:key].strip] = parts[:value].strip if parts
        end

        next unless infos.has_key?('NR_ST')

        # a negative number can not be used as array index
        epi_nr = infos['NR_ST'].to_i
        next if epi_nr < 0

        # TODO make the following variable
        # (to_s: an entry may come without a 'DT' line)
        epi_name = infos['DT'].to_s

        # remove the first html tag together with all following
        # text from the episode name and the link brackets
        # from mediawiki [[text]]
        epi_name = epi_name.gsub(/<\/?[^>]*>.*/, "").gsub(/[\[\[\]\]]/, "")
        next unless epi_name.match(/\w+/)

        season_data[epi_nr] = epi_name
      end

      return season_data
    end

    # This method extracts season based information from a string
    # that contains a series page with an included episodelist
    #
    # +debug+ is accepted for compatibility and is not used.
    #
    # returns an Array of Arrays with episode information
    # where season and episode numbers are the indices
    #
    # Raises ArgumentError if +pagedata+ is no String or contains
    # no episode list.
    def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
      raise ArgumentError, 'String with pagedata expected' unless
        pagedata.is_a?(String)

      series_data = []

      # look for a paragraph with an episodelist
      episodelist_paragraph = pagedata.split(/==.*==/).find do |paragraph|
        contains_inarticle_episode_list?(paragraph)
      end
      raise ArgumentError, 'no episodelist found' unless episodelist_paragraph

      # iterate through all seasons in this episode table
      episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
        season_match = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)
        table_match = season.match(@@WIKITABLE_EXTRACT_PATTERN)
        next unless season_match && table_match

        season_nr = season_match[1].to_i
        wikitable = table_match[1]

        # we have to detect the type of the inarticle season page
        # because there are two different kinds of table structures
        # used in the german wikipedia
        if is_episode_list_with_one_episode_per_line?(wikitable)
          episodes = parse_inarticle_season_table_with_one_line(wikitable)
        else
          episodes = parse_inarticle_season_table(wikitable)
        end

        # HACK if a season is split into different parts
        # e.g. Flashpoint (2.1 and 2.2) then merge them if possible;
        # entries of the current part win over earlier ones
        already_parsed = series_data[season_nr]
        if already_parsed
          already_parsed.each_with_index do |item, index|
            episodes[index] ||= item
          end
        end

        series_data[season_nr] = episodes
      end

      return series_data
    end

    # This method is called with a wikitable for a season as
    # parameter, extracts all episodes from it and returns them
    # as an array where the episode number is the index
    #
    # Every cell of a row is expected on its own line. The header row
    # (recognized by its width="..." attributes) tells on which line of
    # a row the episode number and the German title can be found.
    #
    # Example for a wikitable for episodes:
    #
    # {| class="wikitable" width="100%"
    # |- vertical-align: top; text-align:center; "
    # | width="15" | '''Nummer'''<br />(Gesamt)
    # | width="15" | '''Nummer'''<br />(Staffel)
    # ! width="250" | Originaltitel
    # ! width="250" | Deutscher Titel
    # ! width="180" | Erstausstrahlung<br />(USA Network)
    # ! width="180" | Erstausstrahlung<br />(RTL)
    # ! width="180" | Erstausstrahlung<br />(SF zwei)
    # |-
    # | bgcolor="#DFEEEF"| 01
    # | 01
    # | ''Pilot''
    # | ''Auch Reiche sind nur Menschen''
    # | 4. Mai 2009
    # | 17. Mai 2011
    # | 6. Juni 2011 (Teil 1)<br />13. Juni 2011 (Teil 2)
    # |-
    # |}
    #
    def self.parse_inarticle_season_table(table)
      raise ArgumentError, 'String with seasontable expected' unless
        table.is_a?(String)

      season_data = []
      episode_nr_line_nr = nil
      episode_name_line_nr = nil

      table.split(/^\|\-.*$/).each do |tablerow|

        # skip invalid rows
        lines = tablerow.strip.lines.to_a
        next unless lines.length >= 4

        if tablerow.match(/width=\"\d+\"/)
          # extract line numbers for needed data that
          # are in the table header
          lines.each_with_index do |item, index|
            if item.match(/Nummer.*Staffel/i)
              episode_nr_line_nr = index

            # TODO make the following more variable
            elsif item.match(/Deutscher.*Titel/i)
              episode_name_line_nr = index
            end
          end
        else
          # extract episode information, this needs a header row
          # that has been seen before
          next unless episode_nr_line_nr && episode_name_line_nr

          # a row may have fewer lines than the header row
          nr_line = lines[episode_nr_line_nr]
          name_line = lines[episode_name_line_nr]
          next unless nr_line && name_line

          md_nr = nr_line.strip.match(/(\d+)/)
          md_name = name_line.strip.match(/^\|.(.*)$/)
          next unless md_nr && md_name

          # remove quotes (wiki italic/bold markup) and link brackets
          episode_name = md_name[1].gsub(/[\'\"\[\]]/, "")
          next unless episode_name.match(/\w+/)

          season_data[md_nr[1].to_i] = episode_name.strip
        end
      end

      return season_data
    end

    # This method is called with a wikitable for a season as
    # parameter, extracts all episodes from it and returns them
    # as an array where the episode number is the index
    #
    # this method implements a special format that is used in
    # e.g. 'Prison Break' where an episode is not spread along several
    # lines like in the method above
    #
    # Example for a wikitable for episodes:
    #
    # {| class="wikitable"
    # |- style="color:#black; background-color:#006699"
    # ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
    # |-
    # |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
    # |-
    # |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
    # |-
    # |'''3''' (1-03) || Vertrauenstest || Cell Test || 28. Juni 2007 || 5. September 2005
    # |-
    # |'''4''' (1-04) || Veronica steigt ein || Cute Poison || 28. Juni 2007 || 12. September 2005
    #
    def self.parse_inarticle_season_table_with_one_line(table)
      raise ArgumentError, 'String with seasontable expected' unless
        table.is_a?(String)

      season_data = []
      episode_nr_col = nil
      episode_name_col = nil

      table.split(/^\|\-.*$/).each do |tablerow|

        if tablerow.match(/!!.*!!.*!!/)
          # extract column numbers from table header
          tablerow.split(/!!/).each_with_index do |col, index|
            episode_nr_col = index if col.match(/Episode/i)
            episode_name_col = index if col.match(/Deutsch.*Titel/i)
          end

        elsif tablerow.match(/\|\|.*\w+.*\|\|/)
          # without a header row the columns can not be assigned
          next unless episode_nr_col && episode_name_col

          columns = tablerow.strip.split(/\|\|/)
          nr_column = columns[episode_nr_col]
          name_column = columns[episode_name_col]
          next unless nr_column && name_column

          # the following cleans up the column so that the following occurs
          # " '''7''' (1-07) " => "7 1 07"
          #
          # we can now extract the last bunch of digits and this algorithm is
          # some kind of format independent
          md_nr = nr_column.gsub(/\D/, " ").strip.match(/(\d+)$/)
          next unless md_nr

          # skip rows without a usable title
          episode_name = name_column.strip
          next unless episode_name.match(/\w+/)

          season_data[md_nr[1].to_i] = episode_name
        end
      end

      return season_data
    end

    # this method checks if the page is the main page
    # for a series
    #
    # returns true if page contains the infobox that
    # is typical for series pages in wikipedia
    def self.is_series_main_page?(page)
      !page.match(@@SERIES_SITE_TEST_PATTERN).nil?
    end

    # check the site if it is a disambiguation site
    #
    # returns true if this site links to pages with
    # themes with the same name
    def self.is_disambiguation_site?(page)
      !page.match(@@DISAMBIGUATION_TEST_PATTERN).nil?
    end

    # test if the page contains a link to an article
    # with an episode list
    def self.contains_link_to_episode_list?(page)
      !page.match(@@CONTAINS_LINK_TO_EPISODE_LIST).nil?
    end

    # test if the page contains an episode list
    def self.contains_inarticle_episode_list?(page)
      !page.match(@@CONTAINS_INARTICLE_EPISODE_LIST).nil?
    end

    # tests for the type of in article episode list
    #
    # returns true if every episode is written on a single
    # table line with '||' as column separator
    def self.is_episode_list_with_one_episode_per_line?(page)
      !page.match(@@IS_ONE_LINE_EPISODE_LIST).nil?
    end
  end
end