lib/plugin/wikipedia.rb in serienrenamer-0.0.2 vs lib/plugin/wikipedia.rb in serienrenamer-0.0.3
- old
+ new
@@ -22,11 +22,13 @@
# matches the literal template marker "{{Begriffsklärung}}"
# (Begriffsklärung is the German word for disambiguation)
@@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m
# matches "Hauptartikel" followed on the same line by text starting with
# "Liste"; that text is captured in the named group "main" and ends before
# the closing "]" or "}" characters
@@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/
# matches a <div> containing "Staffel <number>" that is followed (possibly
# many lines later, because of /m) by class="wikitable" and the text
# "titel"; group 1 captures the season number
@@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
# matches the opening NavFrame <div> tag; presumably used to split a page
# into per-season chunks (the usage is not visible in this excerpt)
@@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
# group 1 captures a wikitable from "{| class="wikitable"" up to the last
# "|}" that is directly followed by a newline (greedy, spans lines)
@@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
# matches a "|" that is followed (possibly across lines, because of /m)
# by at least three "||" cell separators
+ @@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m
+
# this method will be called from the main program
# with a Serienrenamer::Episode instance as parameter
#
# it returns an array of episode information
def self.generate_episode_information(episode)
@@ -111,10 +113,11 @@
end
return episode_names
end
+
# This method will extract season based information
# from a string that contains a wikipedia episodelist page
#
# returns an Array of Arrays with episode information
# where episode and season numbers are the indices
@@ -147,10 +150,11 @@
end
return series_data
end
+
# this method will be called with a wikipedia seasontable
# as parameter and will extract all episodes from this
# and returns that as an array where the episode number is
# the index
def self.parse_season_table(table)
@@ -210,10 +214,11 @@
end
end
return season_data
end
+
# This method will extract season based information
# from a string that contains a series page with an
# episodelist included
#
# returns an Array of Arrays with episode information
@@ -235,12 +240,20 @@
next unless contains_inarticle_episode_list?(season)
season_nr = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)[1].to_i
wikitable = season.match(@@WIKITABLE_EXTRACT_PATTERN)[1]
- episodes = parse_inarticle_season_table(wikitable)
+ # we have to detect the type of the inarticle season page
+ # because there are two different kinds of table structures
+ # used in the german wikipedia
+ if self.is_episode_list_with_one_episode_per_line?(wikitable)
+ episodes = parse_inarticle_season_table_with_one_line(wikitable)
+ else
+ episodes = parse_inarticle_season_table(wikitable)
+ end
+
# HACK: if a season is split into different parts
# (e.g. Flashpoint 2.1 and 2.2), then merge them if possible
if series_data[season_nr] != nil
series_data[season_nr].each_with_index do |item, index|
episodes[index] = item unless episodes[index]
@@ -251,10 +264,11 @@
end
return series_data
end
+
# this method will be called with a wikitable for a season
# as parameter and will extract all episodes from this
# and returns that as an array where the episode number is
# the index
#
@@ -329,10 +343,75 @@
end
return season_data
end
+
+ # this method will be called with a wikitable for a season
+ # as parameter and will extract all episodes from this
+ # and returns that as an array where the episode number is
+ # the index
+ #
+ # this method implements a special format that takes place in
+ # e.g. 'Prison Break' where an episode is not spread along several
+ # lines like in the method above
+ #
+ # Example for an wikitable for episodes:
+ #
+ #{| class="wikitable"
+ # |- style="color:#black; background-color:#006699"
+ # ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
+ # |-
+ # |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
+ # |-
+ # |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
+ # |-
+ # |'''3''' (1-03) || Vertrauenstest || Cell Test || 28. Juni 2007 || 5. September 2005
+ # |-
+ # |'''4''' (1-04) || Veronica steigt ein || Cute Poison || 28. Juni 2007 || 12. September 2005
+ #
+ def self.parse_inarticle_season_table_with_one_line(table)
+ raise ArgumentError, 'String with seasontable expected' unless
+ table.is_a?(String)
+
+ season_data = []
+ episode_nr_col = nil
+ episode_name_col = nil
+
+ table.split(/^\|\-.*$/).each do |tablerow|
+
+ if tablerow.match(/!!.*!!.*!!/)
+ # extract column numbers from table header
+ tablerow.split(/!!/).each_with_index do |col,index|
+ episode_nr_col = index if col.match(/Episode/i)
+ episode_name_col = index if col.match(/Deutsch.*Titel/i)
+ end
+
+ elsif tablerow.match(/\|\|.*\w+.*\|\|/)
+ tablerow.strip!
+ columns = tablerow.split(/\|\|/)
+
+ # the following cleanes up the column so that the following occurs
+ # " '''7''' (1-07) " => "7 1 07"
+ #
+ # we can now extract the last bunch of digits and this algorithm is
+ # some kind of format independent
+ dirty_episode_nr = columns[episode_nr_col].gsub(/\D/, " ").strip
+ episode_nr = dirty_episode_nr.match(/(\d+)$/)[1]
+ next unless episode_nr
+
+ episode_name = columns[episode_name_col].strip
+ next unless episode_nr.match(/\w+/)
+
+ season_data[episode_nr.to_i] = episode_name
+ end
+ end
+
+ return season_data
+ end
+
+
# this method checks if the page is the main page
# for a series
#
# returns true if page contains the infobox that
# is typical for series pages in wikipedia
@@ -355,8 +434,13 @@
end
# test if the page contains an episode list
def self.contains_inarticle_episode_list?(page)
page.match(@@CONTAINS_INARTICLE_EPISODE_LIST) != nil
+ end
+
+ # tests for the type of in article episode list
+ def self.is_episode_list_with_one_episode_per_line?(page)
+   # true when the one-episode-per-line table pattern matches somewhere in page
+   !page.match(@@IS_ONE_LINE_EPISODE_LIST).nil?
end
end
end