lib/sportdb/utils.rb in sportdb-0.9.7 vs lib/sportdb/utils.rb in sportdb-1.0.0
- old
+ new
@@ -1,49 +1,63 @@
+# encoding: utf-8
### some utils moved to worldbdb/utils for reuse
module SportDB::FixtureHelpers
def is_round?( line )
- line =~ /Spieltag|Runde|Achtelfinale|Viertelfinale|Halbfinale|Finale/
+ line =~ SportDB.lang.regex_round
end
-
+
def is_group?( line )
# NB: check after is_round? (round may contain group reference!)
- line =~ /Gruppe|Group/
+ line =~ SportDB.lang.regex_group
end
-
+
def is_knockout_round?( line )
- if line =~ /Achtelfinale|Viertelfinale|Halbfinale|Spiel um Platz 3|Finale|K\.O\.|Knockout/
+
+ ## todo: check for adding ignore case for regex (e.g. 1st leg/1st Leg)
+
+ if line =~ SportDB.lang.regex_leg1
+ puts " two leg knockout; skip knockout flag on first leg"
+ false
+ elsif line =~ SportDB.lang.regex_knockout_round
puts " setting knockout flag to true"
true
+ elsif line =~ /K\.O\.|Knockout/
+ ## NB: add two language independent markers, that is, K.O. and Knockout
+ puts " setting knockout flag to true (lang independent marker)"
+ true
else
false
end
end
def find_group_title_and_pos!( line )
- ## group pos - for now support single digit e.g 1,2,3 or letter e.g. A,B,C
+ ## group pos - for now support single digit e.g 1,2,3 or letter e.g. A,B,C or HEX
## nb: (?:) = is for non-capturing group(ing)
- regex = /(?:Group|Gruppe)\s+((?:\d{1}|[A-Z]{1}))\b/
+ regex = /(?:Group|Gruppe|Grupo)\s+((?:\d{1}|[A-Z]{1,3}))\b/
match = regex.match( line )
return [nil,nil] if match.nil?
- pos = case match[1]
+ pos = case match[1]
when 'A' then 1
when 'B' then 2
when 'C' then 3
when 'D' then 4
when 'E' then 5
when 'F' then 6
when 'G' then 7
when 'H' then 8
when 'I' then 9
when 'J' then 10
+ when 'K' then 11
+ when 'L' then 12
+ when 'HEX' then 666 # HEX for Hexagonal - todo/check: map to something else ??
else match[1].to_i
end
title = match[0]
@@ -54,13 +68,46 @@
return [title,pos]
end
def find_round_pos!( line )
+
+ ## todo: let title2 go first to cut off //
+ ## todo: cut off end-of-line comments starting w/ # ???
+
## fix/todo:
## if no round found assume last_pos+1 ??? why? why not?
+ # extract optional round pos from line
+ # e.g. (1) - must start line
+ regex = /^[ \t]*\((\d{1,3})\)[ \t]+/
+ if line =~ regex
+ puts " pos: >#{$1}<"
+
+ line.sub!( regex, '[ROUND|POS] ' ) ## NB: add back trailing space that got swallowed w/ regex -> [ \t]+
+ return $1.to_i
+ end
+
+ # continue; try some other options
+
+ # NB: do not search string after free standing / or //
+ # cut off optional trailing part starting w/ / or //
+ #
+ # e.g. Viertelfinale // Di+Mi 10.+11. April 2012 becomes just
+ # Viertelfinale
+
+ cutoff_regex = /^(.+?)[ \t]\/{1,3}[ \t]/
+
+ if line =~ cutoff_regex
+ line = $1.to_s # cut off the rest if regex matches
+ end
+
+ ## fix/todo: use cutoff_line for search
+ ## and use line.sub! to change original string
+ # e.g. Jornada 3 // 1,2 y 3 febrero
+ # only replaces match in local string w/ [ROUND|POS]
+
regex = /\b(\d+)\b/
if line =~ regex
value = $1.to_i
puts " pos: >#{value}<"
@@ -68,23 +115,29 @@
line.sub!( regex, '[ROUND|POS]' )
return value
else
return nil
- end
+ end
end
def find_date!( line )
# extract date from line
# and return it
# NB: side effect - replaces the date in the line string w/ a [DATE.*] marker
# e.g. 2012-09-14 20:30 => YYYY-MM-DD HH:MM
regex_db = /\b(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2})\b/
+
+ # e.g. 2012-09-14 w/ implied hours (set to 12:00)
+ regex_db2 = /\b(\d{4})-(\d{2})-(\d{2})\b/
# e.g. 14.09. 20:30 => DD.MM. HH:MM
regex_de = /\b(\d{2})\.(\d{2})\.\s+(\d{2}):(\d{2})\b/
+
+ # e.g. 14.09.2012 20:30 => DD.MM.YYYY HH:MM
+ regex_de2 = /\b(\d{2})\.(\d{2})\.(\d{4})\s+(\d{2}):(\d{2})\b/
if line =~ regex_db
value = "#{$1}-#{$2}-#{$3} #{$4}:#{$5}"
puts " date: >#{value}<"
@@ -92,10 +145,27 @@
## and time zone (e.g. cet, eet, utc, etc.)
line.sub!( regex_db, '[DATE.DB]' )
return DateTime.strptime( value, '%Y-%m-%d %H:%M' )
+ elsif line =~ regex_db2
+ value = "#{$1}-#{$2}-#{$3} 12:00"
+ puts " date: >#{value}<"
+
+ line.sub!( regex_db2, '[DATE.DB2]' )
+
+ return DateTime.strptime( value, '%Y-%m-%d %H:%M' )
+ elsif line =~ regex_de2
+ value = "#{$3}-#{$2}-#{$1} #{$4}:#{$5}"
+ puts " date: >#{value}<"
+
+ ## todo: lets you configure year
+ ## and time zone (e.g. cet, eet, utc, etc.)
+
+ line.sub!( regex_de2, '[DATE.DE2]' )
+
+ return DateTime.strptime( value, '%Y-%m-%d %H:%M' )
elsif line =~ regex_de
value = "2012-#{$2}-#{$1} #{$3}:#{$4}"
puts " date: >#{value}<"
## todo: lets you configure year
@@ -127,21 +197,28 @@
end
end
def find_scores!( line )
+
+ ### fix: depending on language allow 1:1 or 1-1
+ ## do NOT allow mix and match
+ ## e.g. default to en is 1-1
+ ## de is 1:1 etc.
+
+
# extract score from line
# and return it
# NB: side effect - removes date from line string
- # e.g. 1:2 or 0:2 or 3:3
- regex = /\b(\d):(\d)\b/
+ # e.g. 1:2 or 0:2 or 3:3 // 1-1 or 0-2 or 3-3
+ regex = /\b(\d)[:\-](\d)\b/
# e.g. 1:2nV => overtime
- regex_ot = /\b(\d):(\d)[ \t]?[nN][vV]\b/
+ regex_ot = /\b(\d)[:\-](\d)[ \t]?[nN][vV]\b/
# e.g. 5:4iE => penalty
- regex_p = /\b(\d):(\d)[ \t]?[iI][eE]\b/
+ regex_p = /\b(\d)[:\-](\d)[ \t]?[iI][eE]\b/
scores = []
if line =~ regex
puts " score: >#{$1}-#{$2}<"