# encoding: UTF-8
require 'media_wiki'
module Plugin
# This Plugin tries to extract the series
# information from wikipedia
#
# (by now only the german wikipedia)
class Wikipedia < Serienrenamer::Pluginbase
# ---- plugin metadata ---------------------------------------------

# Human readable name of this plugin.
def self.plugin_name
  "Wikipedia"
end

# Indicates that this plugin can be used by the main program.
def self.usable
  true
end

# Ordering weight relative to other information plugins.
def self.priority
  30
end
# API endpoint that is queried (by now only the german wikipedia)
@@WIKIPEDIA_URL = 'http://de.wikipedia.org/w/api.php'

# patterns used in this class
#
# NOTE(review): several regex literals below had been mangled by an
# earlier edit that stripped unescaped "<...>" spans (named capture
# groups and HTML fragments). They are reconstructed here from their
# usage in this file — confirm against the upstream history.

# extracts a {{Episodenlistentabelle ...}} block as named group :table
@@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m

# a single {{Episodenlisteneintrag ...}} (or S-Episode) entry
@@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/

# series main pages carry an "Infobox Fernsehsendung"
@@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m

# disambiguation pages are tagged with {{Begriffsklärung}}
@@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m

# link to a dedicated episode-list article, captured as named group :main
@@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/

# an in-article collapsible season section; capture group 1 is the
# season number (NavHead heading followed by a wikitable)
@@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.class=\"NavHead\"\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m

# separator between the collapsible season sections of an in-article list
@@INPAGE_SEASON_SEPARATOR = /\<div.class=\"NavFrame\"\>/

# extracts a complete {| class="wikitable" ... |} table
@@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m

# heuristic: one-line-per-episode tables have several "||" cell
# separators within a single row
@@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m
# this method will be called from the main program
# with an Serienrenamer::Episode instance as parameter
#
# it searches wikipedia for the series, caches the parsed
# season/episode matrix per series name and returns an array of
# episode names (empty when nothing could be found)
#
# raises ArgumentError unless episode is a Serienrenamer::Episode
def self.generate_episode_information(episode)
  raise ArgumentError, "Serienrenamer::Episode instance needed" unless
    episode.is_a?(Serienrenamer::Episode)

  return [] unless episode.series.match(/\w+/)

  # class-level cache: series name => [season][episode] name matrix
  unless defined? @cached_data
    @cached_data = Hash.new
  end

  wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)

  if ! @cached_data.has_key?(episode.series)
    # search for a series site in wikipedia
    series_site = nil
    tries = 3
    search_pattern = episode.series
    search_pattern_modified = false

    begin
      wiki.search(search_pattern, nil, 50).each do |title|
        pagedata = wiki.get(title)
        if is_series_main_page?(pagedata)
          series_site = title
          break
        end
      end

      # modify the search term pattern so that it contains
      # only the last word if the search_pattern contains
      # more than one words
      if series_site.nil? && ! search_pattern_modified
        search_pattern_modified = true
        # guard against a nil match (e.g. trailing punctuation),
        # which previously crashed with NoMethodError
        last_word = search_pattern.match(/(\w+)\s*$/)
        if last_word
          search_pattern = last_word[1]
          raise EOFError # break out and retry with the shortened pattern
        end
      end
    rescue MediaWiki::APIError => e
      # transient API failures: bounded retry
      tries -= 1
      retry if tries > 0
    rescue EOFError => e
      retry
    end

    return [] unless series_site

    # look for a link to a list of episodes
    pagedata = wiki.get(series_site)

    if contains_link_to_episode_list?(pagedata)
      mainarticle = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)[:main]
      if mainarticle
        episodelist_page = wiki.get(mainarticle)
        series = parse_episodelist_page_data(episodelist_page)

        @cached_data[episode.series] = series
      end
    elsif contains_inarticle_episode_list?(pagedata)
      series = parse_inarticle_episodelist_page_data(pagedata)
      @cached_data[episode.series] = series
    else
      warn "no episode list found"
      return []
    end
  end

  episode_names = []

  # tries to find an episodename in cached_data
  # otherwise returns empty array
  begin
    series = @cached_data[episode.series]
    episodename = series[episode.season][episode.episode]
    if episodename.match(/\w+/)
      episode_names.push(episodename)
    end
  rescue
    # best effort: missing season/episode indices simply yield []
  end

  return episode_names
end
# This method will extract season based information
# from a string that contains a wikipedia episodelist page
#
# returns an Array of Arrays with episode information
# where episode and season numbers are the indizes
#
# raises ArgumentError unless pagedata is a String
def self.parse_episodelist_page_data(pagedata, debug=false)
  raise ArgumentError, 'String with pagedata expected' unless
    pagedata.is_a?(String)

  series_data = []
  is_season_table_following = false
  season_number = nil

  # split the wikipedia page by headings and process
  # the following paragraph if the heading starts with
  # 'Staffel' (the capture group keeps the headings in the list)
  pagedata.split(/(==.*)==/).each do |paragraph|
    if paragraph.match(/^==.*Staffel/)
      # restored named capture <seasonnr> (was garbled to "(?\d+)")
      match = paragraph.match(/^==.*Staffel.(?<seasonnr>\d+)/)
      if match
        season_number = match[:seasonnr].to_i
        is_season_table_following = true
      end
    elsif is_season_table_following
      # extract season table from this paragraph
      season = parse_season_table(paragraph)
      series_data[season_number] = season

      is_season_table_following = false
    end
  end

  return series_data
end
# this method will be called with a wikipedia seasontable
# as parameter and will extract all episodes from this
# and returns that as an array where the episode number is
# the index
#
# raises ArgumentError unless table is a String
def self.parse_season_table(table)
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []

  matched_table = table.match(@@EPISODE_TABLE_PATTERN)
  if matched_table
    # extract all episode entries that
    # looks like the following
    #
    # {{Episodenlisteneintrag
    # | NR_GES = 107
    # | NR_ST = 1
    # | OT = The Mastodon in the Room
    # | DT = Die Rückkehr der Scheuklappen
    # | ZF =
    # | EA = {{dts|23|09|2010}}
    # | EAD = {{dts|08|09|2011}}
    # }}
    episodes = matched_table[:table].split(@@EPISODE_ENTRY_PATTERN)
    if episodes
      episodes.each do |epi|
        # build up a hash from the "| KEY = value" lines
        # (restored named captures <key>/<value>, previously garbled)
        infos = {}
        epi.lines.each do |part|
          parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
          if parts
            infos[parts[:key].strip] = parts[:value].strip
          end
        end

        next unless infos.has_key?('NR_ST')

        # extract useful information and add it to the array;
        # to_i returns 0 for non-numeric values, so skip those
        epi_nr = infos['NR_ST'].to_i
        next unless epi_nr > 0

        # TODO make the following variable
        # guard: entries without a german title (DT) would crash
        # on nil.strip, skip them instead
        next unless infos['DT']
        epi_name = infos['DT'].strip

        # remove all html tags and all following
        # text from the episode name and the bold
        # syntax from mediawiki [[text]]
        epi_name.gsub!(/<\/?[^>]*>.*/, "")
        epi_name.gsub!(/[\[\]]/, "")
        next unless epi_name.match(/\w+/)

        season_data[epi_nr] = epi_name
      end
    end
  end
  return season_data
end
# This method will extract season based information
# from a string that contains a series page with an
# episodelist included
#
# returns an Array of Arrays with episode information
# where episode and season numbers are the indizes
#
# raises ArgumentError unless pagedata is a String or when no
# in-article episodelist can be found
def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
  raise ArgumentError, 'String with pagedata expected' unless
    pagedata.is_a?(String)

  series_data = []

  # look for a paragraph with an episodelist
  episodelist_paragraph = pagedata.split(/==.*==/).select { |p|
    contains_inarticle_episode_list?(p) }[0]

  raise ArgumentError, 'no episodelist found' unless episodelist_paragraph

  # iterate through all seasons in this episode table
  episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
    next unless contains_inarticle_episode_list?(season)

    season_nr = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)[1].to_i

    # guard: skip sections where no wikitable can be extracted
    # instead of crashing with NoMethodError on a nil match
    table_match = season.match(@@WIKITABLE_EXTRACT_PATTERN)
    next unless table_match
    wikitable = table_match[1]

    # we have to detect the type of the inarticle season page
    # because there are two different kinds of table structures
    # used in the german wikipedia
    if self.is_episode_list_with_one_episode_per_line?(wikitable)
      episodes = parse_inarticle_season_table_with_one_line(wikitable)
    else
      episodes = parse_inarticle_season_table(wikitable)
    end

    # HACK if a season is splitted into different parts
    # eg. Flashpoint (2.1 and 2.2) than merge that if possible
    if series_data[season_nr] != nil
      series_data[season_nr].each_with_index do |item, index|
        episodes[index] = item unless episodes[index]
      end
    end

    series_data[season_nr] = episodes
  end

  return series_data
end
# this method will be called with a wikitable for a season
# as parameter and will extract all episodes from this
# and returns that as an array where the episode number is
# the index
#
# (NOTE: the continuation lines of the example below had lost their
# leading '#' and were live code that broke the file at load time)
#
# Example for an wikitable for episodes:
#
#   {| class="wikitable" width="100%"
#   |- vertical-align: top; text-align:center; "
#   | width="15" | '''Nummer''' (Gesamt)
#   | width="15" | '''Nummer''' (Staffel)
#   ! width="250" | Originaltitel
#   ! width="250" | Deutscher Titel
#   ! width="180" | Erstausstrahlung (USA Network)
#   ! width="180" | Erstausstrahlung (RTL)
#   ! width="180" | Erstausstrahlung (SF zwei)
#   |-
#   | bgcolor="#DFEEEF"| 01
#   | 01
#   | ''Pilot''
#   | ''Auch Reiche sind nur Menschen''
#   | 4. Mai 2009
#   | 17. Mai 2011
#   | 6. Juni 2011 (Teil 1) 13. Juni 2011 (Teil 2)
#   |-
#   |}
#
# raises ArgumentError unless table is a String
def self.parse_inarticle_season_table(table)
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []
  episode_nr_line_nr = nil
  episode_name_line_nr = nil

  table.split(/^\|\-.*$/).each do |tablerow|
    tablerow.strip!

    # skip invalid rows
    lines = tablerow.lines.to_a
    next unless lines.length >= 4

    if tablerow.match(/width=\"\d+\"/)
      # extract line numbers for needed data that
      # are in the table header
      lines.each_with_index do |item, index|
        if item.match(/Nummer.*Staffel/i)
          episode_nr_line_nr = index

        # TODO make the following more variable
        elsif item.match(/Deutscher.*Titel/i)
          episode_name_line_nr = index
        end
      end
    else
      # extract episode information
      if episode_nr_line_nr && episode_name_line_nr

        md_nr = lines[episode_nr_line_nr].strip.match(/(\d+)/)
        if md_nr
          episode_nr = md_nr[1].to_i

          md_name = lines[episode_name_line_nr].strip.match(/^\|.(.*)$/)
          if md_name
            episode_name = md_name[1]
            episode_name.gsub!(/[\'\"\[\]]/, "")
            next unless episode_name.match(/\w+/)

            season_data[episode_nr] = episode_name.strip
          end
        end
      end
    end
  end
  return season_data
end
# this method will be called with a wikitable for a season
# as parameter and will extract all episodes from this
# and returns that as an array where the episode number is
# the index
#
# this method implements a special format that takes place in
# e.g. 'Prison Break' where an episode is not spread along several
# lines like in the method above
#
# Example for an wikitable for episodes:
#
# {| class="wikitable"
# |- style="color:#black; background-color:#006699"
# ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
# |-
# |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
# |-
# |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
#
# raises ArgumentError unless table is a String
def self.parse_inarticle_season_table_with_one_line(table)
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []
  episode_nr_col = nil
  episode_name_col = nil

  table.split(/^\|\-.*$/).each do |tablerow|
    if tablerow.match(/!!.*!!.*!!/)
      # extract column numbers from table header
      tablerow.split(/!!/).each_with_index do |col, index|
        episode_nr_col = index if col.match(/Episode/i)
        episode_name_col = index if col.match(/Deutsch.*Titel/i)
      end
    elsif tablerow.match(/\|\|.*\w+.*\|\|/)
      # guard: without a recognized header we cannot index the
      # columns (columns[nil] would raise TypeError)
      next unless episode_nr_col && episode_name_col

      tablerow.strip!
      columns = tablerow.split(/\|\|/)

      # the following cleanes up the column so that the following occurs
      # " '''7''' (1-07) " => "7 1 07"
      #
      # we can now extract the last bunch of digits and this algorithm is
      # some kind of format independent
      dirty_episode_nr = columns[episode_nr_col].gsub(/\D/, " ").strip

      # guard: skip rows without any trailing digits instead of
      # crashing on a nil match
      nr_match = dirty_episode_nr.match(/(\d+)$/)
      next unless nr_match
      episode_nr = nr_match[1]

      episode_name = columns[episode_name_col].strip
      # skip rows with an empty german title (consistent with
      # parse_inarticle_season_table; the old check on episode_nr
      # was dead code because a \d+ capture always matches \w)
      next unless episode_name.match(/\w+/)

      season_data[episode_nr.to_i] = episode_name
    end
  end
  return season_data
end
# this method checks if the page is the main page
# for a series
#
# returns true if page contains the infobox that
# is typical for series pages in wikipedia
def self.is_series_main_page?(page)
  !page.match(@@SERIES_SITE_TEST_PATTERN).nil?
end

# check the site if it is a disambiguation site
#
# returns true if this site links to pages with
# themes with the same name
def self.is_disambiguation_site?(page)
  !page.match(@@DISAMBIGUATION_TEST_PATTERN).nil?
end

# test if the page contains a link to an article
# with an episode list
def self.contains_link_to_episode_list?(page)
  !page.match(@@CONTAINS_LINK_TO_EPISODE_LIST).nil?
end

# test if the page contains a episode list
def self.contains_inarticle_episode_list?(page)
  !page.match(@@CONTAINS_INARTICLE_EPISODE_LIST).nil?
end

# tests for the type of in article episode list
def self.is_episode_list_with_one_episode_per_line?(page)
  !page.match(@@IS_ONE_LINE_EPISODE_LIST).nil?
end
end
end