utils.rb in sportdb-1.0.0

- old
+ new

@@ -1,49 +1,63 @@
+# encoding: utf-8
 
 ### some utils moved to worlddb/utils for reuse
 
 
 module SportDB::FixtureHelpers
 
   def is_round?( line )
-    line =~ /Spieltag|Runde|Achtelfinale|Viertelfinale|Halbfinale|Finale/
+    line =~ SportDB.lang.regex_round
   end
-  
+
   def is_group?( line )
     # NB: check after is_round? (round may contain group reference!)
-    line =~ /Gruppe|Group/
+    line =~ SportDB.lang.regex_group
   end
-  
+
   def is_knockout_round?( line )
-    if line =~ /Achtelfinale|Viertelfinale|Halbfinale|Spiel um Platz 3|Finale|K\.O\.|Knockout/
+    
+    ## todo: consider making the regexes case-insensitive (e.g. to match both 1st leg and 1st Leg)
+    
+    if line =~ SportDB.lang.regex_leg1
+      puts "  two leg knockout; skip knockout flag on first leg"
+      false
+    elsif line =~ SportDB.lang.regex_knockout_round
       puts "   setting knockout flag to true"
       true
+    elsif line =~ /K\.O\.|Knockout/
+        ## NB: also accept two language-independent markers, that is, K.O. and Knockout
+      puts "   setting knockout flag to true (lang independent marker)"
+      true
     else
       false
     end
   end
   
   def find_group_title_and_pos!( line )
-    ## group pos - for now support single digit e.g 1,2,3 or letter e.g. A,B,C
+    ## group pos - for now supports a single digit (e.g. 1,2,3), a letter (e.g. A,B,C) or HEX
     ## nb:  (?:)  = is for non-capturing group(ing)
-    regex = /(?:Group|Gruppe)\s+((?:\d{1}|[A-Z]{1}))\b/
+    regex = /(?:Group|Gruppe|Grupo)\s+((?:\d{1}|[A-Z]{1,3}))\b/
     
     match = regex.match( line )
     
     return [nil,nil] if match.nil?
 
-    pos = case match[1]      
+    pos = case match[1]
           when 'A' then 1
           when 'B' then 2
           when 'C' then 3
           when 'D' then 4
           when 'E' then 5
           when 'F' then 6
           when 'G' then 7
           when 'H' then 8
           when 'I' then 9
           when 'J' then 10
+          when 'K' then 11
+          when 'L' then 12
+          when 'HEX' then 666    # HEX for Hexagonal - todo/check: map to something else ??
           else  match[1].to_i
           end
 
     title = match[0]
 
@@ -54,13 +68,46 @@
 
     return [title,pos]
   end
   
   def find_round_pos!( line )
+    
+    ## todo: let title2 go first to cut off //
+    ## todo: cut off end-of-line comments w/ # ???
+    
     ## fix/todo:
     ##  if no round found assume last_pos+1 ??? why? why not?
 
+    # extract optional round pos from line
+    # e.g.  (1)   - must start line 
+    regex = /^[ \t]*\((\d{1,3})\)[ \t]+/
+    if line =~ regex
+      puts "   pos: >#{$1}<"
+      
+      line.sub!( regex, '[ROUND|POS] ' )  ## NB: add back trailing space that got swallowed w/ regex -> [ \t]+
+      return $1.to_i
+    end
+
+    # continue; try some other options
+
+    # NB: do not search the string after a free-standing / or //
+    #  cut off the optional trailing part that starts w/ a free-standing / or // (the regex accepts up to three slashes)
+    #
+    # e.g.  Viertelfinale   //   Di+Mi 10.+11. April 2012  becomes just
+    #       Viertelfinale
+    
+    cutoff_regex = /^(.+?)[ \t]\/{1,3}[ \t]/
+    
+    if line =~ cutoff_regex
+      line = $1.to_s    # cut off the rest if regex matches
+    end
+
+    ## fix/todo: use a separate variable (e.g. cutoff_line) for the search
+    ## and use line.sub! to change the original string
+    # e.g.  Jornada 3  // 1,2 y 3 febrero
+    #   for now, only replaces the match in the local (cut-off) string w/ [ROUND|POS]
+
     regex = /\b(\d+)\b/
     
     if line =~ regex
       value = $1.to_i
       puts "   pos: >#{value}<"
@@ -68,23 +115,29 @@
       line.sub!( regex, '[ROUND|POS]' )
 
       return value
     else
       return nil
-    end    
+    end
   end
   
   def find_date!( line )
     # extract date from line
     # and return it
     # NB: side effect - replaces the date in the line string w/ a marker (e.g. [DATE.DB])
     
     # e.g. 2012-09-14 20:30   => YYYY-MM-DD HH:MM
     regex_db = /\b(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2})\b/
+    
+    # e.g. 2012-09-14  w/ implied hours (set to 12:00)
+    regex_db2 = /\b(\d{4})-(\d{2})-(\d{2})\b/
 
     # e.g. 14.09. 20:30  => DD.MM. HH:MM
     regex_de = /\b(\d{2})\.(\d{2})\.\s+(\d{2}):(\d{2})\b/
+    
+    # e.g. 14.09.2012 20:30   => DD.MM.YYYY HH:MM
+    regex_de2 = /\b(\d{2})\.(\d{2})\.(\d{4})\s+(\d{2}):(\d{2})\b/
 
     if line =~ regex_db
       value = "#{$1}-#{$2}-#{$3} #{$4}:#{$5}"
       puts "   date: >#{value}<"
 
@@ -92,10 +145,27 @@
       ##  and time zone (e.g. cet, eet, utc, etc.)
       
       line.sub!( regex_db, '[DATE.DB]' )
 
       return DateTime.strptime( value, '%Y-%m-%d %H:%M' )
+    elsif line =~ regex_db2
+      value = "#{$1}-#{$2}-#{$3} 12:00"
+      puts "   date: >#{value}<"
+      
+      line.sub!( regex_db2, '[DATE.DB2]' )
+
+      return DateTime.strptime( value, '%Y-%m-%d %H:%M' )
+    elsif line =~ regex_de2
+      value = "#{$3}-#{$2}-#{$1} #{$4}:#{$5}"
+      puts "   date: >#{value}<"
+
+      ## todo: lets you configure year
+      ##  and time zone (e.g. cet, eet, utc, etc.)
+      
+      line.sub!( regex_de2, '[DATE.DE2]' )
+
+      return DateTime.strptime( value, '%Y-%m-%d %H:%M' )
     elsif line =~ regex_de
       value = "2012-#{$2}-#{$1} #{$3}:#{$4}"
       puts "   date: >#{value}<"
 
       ## todo: lets you configure year
@@ -127,21 +197,28 @@
     end
 
   end
 
   def find_scores!( line )
+
+    ### fix: depending on the language allow either 1:1 or 1-1
+    ##   do NOT allow mix and match
+    ##  e.g. the default (en) is 1-1
+    ##    de is 1:1 etc.
+    
+
     # extract score from line
     # and return it
     # NB: side effect - removes score from line string
     
-    # e.g. 1:2 or 0:2 or 3:3
-    regex = /\b(\d):(\d)\b/
+    # e.g. 1:2 or 0:2 or 3:3 // 1-1 or 0-2 or 3-3
+    regex = /\b(\d)[:\-](\d)\b/
     
     # e.g. 1:2nV  => overtime
-    regex_ot = /\b(\d):(\d)[ \t]?[nN][vV]\b/
+    regex_ot = /\b(\d)[:\-](\d)[ \t]?[nN][vV]\b/
     
     # e.g. 5:4iE  => penalty
-    regex_p = /\b(\d):(\d)[ \t]?[iI][eE]\b/
+    regex_p = /\b(\d)[:\-](\d)[ \t]?[iI][eE]\b/
     
     scores = []
     
     if line =~ regex
       puts "   score: >#{$1}-#{$2}<"