module Embulk::Guess module TimeFormatGuess module Parts YEAR = /[1-4][0-9]{3}/ MONTH = /10|11|12|[0 ]?[0-9]/ MONTH_NODELIM = /10|11|12|[0][0-9]/ DAY = /31|30|[1-2][0-9]|[0 ]?[1-9]/ DAY_NODELIM = /31|30|[1-2][0-9]|[0][1-9]/ HOUR = /20|21|22|23|24|1[0-9]|[0 ]?[0-9]/ HOUR_NODELIM = /20|21|22|23|24|1[0-9]|[0][0-9]/ MINUTE = SECOND = /60|[1-5][0-9]|[0 ]?[0-9]/ MINUTE_NODELIM = SECOND_NODELIM = /60|[1-5][0-9]|[0][0-9]/ MONTH_NAME_SHORT = /Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/ MONTH_NAME_FULL = /January|February|March|April|May|June|July|August|September|October|November|December/ WEEKDAY_NAME_SHORT = /Sun|Mon|Tue|Wed|Thu|Fri|Sat/ WEEKDAY_NAME_FULL = /Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday/ ZONE_OFF = /(?:Z|[\-\+]\d\d(?::?\d\d)?)/ ZONE_ABB = /[A-Z]{1,3}/ end class GuessMatch def initialize(delimiters, parts, part_options) @delimiters = delimiters @parts = parts @part_options = part_options end def format format = '' @parts.size.times do |i| format << @delimiters[i-1] if i != 0 option = @part_options[i] case @parts[i] when :year format << '%Y' when :month case option when :zero format << '%m' when :blank #format << '%_m' # not supported format << '%m' when :none #format << '%-m' # not supported format << '%m' else format << '%m' end when :day case option when :zero format << '%d' when :blank format << '%e' when :none format << '%d' # not supported else format << '%d' end when :hour case option when :zero format << '%H' when :blank format << '%k' when :none format << '%k' # not supported else format << '%H' end when :minute # heading options are not supported format << '%M' when :second # heading options are not supported format << '%S' when :frac if option <= 3 format << '%L' #elsif option <= 6 # format << '%6N' #elsif option <= 6 # format << '%6N' #elsif option <= 9 # format << '%9N' #elsif option <= 12 # format << '%12N' #elsif option <= 15 # format << '%15N' #elsif option <= 18 # format << '%18N' #elsif option <= 21 # format << '%21N' #elsif option <= 24 # format << '%24N' else format << '%N' end when :zone_off format << '%z' when :zone_abb format << '%Z' else raise "Unknown part: #{@parts[i]}" end end return format end def mergeable_group # MDY is mergible with DMY if i = array_sequence_find(@parts, [:day, :month, :year]) ps = @parts.dup ps[i, 3] = [:month, :day, :year] [@delimiters, ps] else [@delimiters, @parts] end end attr_reader :parts attr_reader :part_options def merge!(another_in_group) part_options = another_in_group.part_options @part_options.size.times do |i| @part_options[i] ||= part_options[i] if @part_options[i] == nil part_options[i] elsif part_options[i] == nil @part_options[i] else [@part_options[i], part_options[i]].sort.last end end # if DMY matches, MDY is likely false match of DMY. dmy = array_sequence_find(another_in_group.parts, [:day, :month, :year]) mdy = array_sequence_find(@parts, [:month, :day, :year]) if mdy && dmy @parts[mdy, 3] = [:day, :month, :year] end end def array_sequence_find(array, seq) (array.size - seq.size + 1).times {|i| return i if array[i, seq.size] == seq } return nil end end class GuessPattern include Parts date_delims = /[\/\-\.]/ # yyyy-MM-dd YMD = /(?#{YEAR})(?#{date_delims})(?#{MONTH})\k(?#{DAY})/ YMD_NODELIM = /(?#{YEAR})(?#{MONTH_NODELIM})(?#{DAY_NODELIM})/ # MM/dd/yyyy MDY = /(?#{MONTH})(?#{date_delims})(?#{DAY})\k(?#{YEAR})/ MDY_NODELIM = /(?#{MONTH_NODELIM})(?#{DAY_NODELIM})(?#{YEAR})/ # dd.MM.yyyy DMY = /(?#{DAY})(?#{date_delims})(?#{MONTH})\k(?#{YEAR})/ DMY_NODELIM = /(?#{DAY_NODELIM})(?#{MONTH_NODELIM})(?#{YEAR})/ frac = /[0-9]{1,9}/ time_delims = /[\:\-]/ frac_delims = /[\.\,]/ TIME = /(?#{HOUR})(?:(?#{time_delims})(?#{MINUTE})(?:\k(?#{SECOND})(?:(?#{frac_delims})(?#{frac}))?)?)?/ TIME_NODELIM = /(?#{HOUR_NODELIM})(?:(?#{MINUTE_NODELIM})((?#{SECOND_NODELIM})(?:(?#{frac_delims})(?#{frac}))?)?)?/ ZONE = /(? )?(?(?#{ZONE_OFF})|(?#{ZONE_ABB}))/ def match(text) delimiters = [] parts = [] part_options = [] if dm = (/^#{YMD}(?.*?)$/.match(text) or /^#{YMD_NODELIM}(?.*?)$/.match(text)) date_delim = dm["date_delim"] rescue "" parts << :year part_options << nil delimiters << date_delim parts << :month part_options << part_heading_option(dm["month"]) delimiters << date_delim parts << :day part_options << part_heading_option(dm["day"]) elsif dm = (/^#{MDY}(?.*?)$/.match(text) or /^#{MDY_NODELIM}(?.*?)$/.match(text)) date_delim = dm["date_delim"] rescue "" parts << :month part_options << part_heading_option(dm["month"]) delimiters << date_delim parts << :day part_options << part_heading_option(dm["day"]) delimiters << date_delim parts << :year part_options << nil elsif dm = (/^#{DMY}(?.*?)$/.match(text) or /^#{DMY_NODELIM}(?.*?)$/.match(text)) date_delim = dm["date_delim"] rescue "" parts << :day part_options << part_heading_option(dm["day"]) delimiters << date_delim parts << :month part_options << part_heading_option(dm["month"]) delimiters << date_delim parts << :year part_options << nil else date_delim = "" return nil end rest = dm["rest"] date_time_delims = /(:? |_|T|\. ?)/ if tm = ( /^(?#{date_time_delims})#{TIME}(?.*?)?$/.match(rest) or /^(?#{date_time_delims})#{TIME_NODELIM}(?.*?)?$/.match(rest) or (date_delim == "" && /^#{TIME_NODELIM}(?.*?)?$/.match(rest)) ) date_time_delim = tm["date_time_delim"] rescue "" time_delim = tm["time_delim"] rescue "" delimiters << date_time_delim parts << :hour part_options << part_heading_option(tm["hour"]) if tm["minute"] delimiters << time_delim parts << :minute part_options << part_heading_option(tm["minute"]) if tm["second"] delimiters << time_delim parts << :second part_options << part_heading_option(tm["second"]) if tm["frac"] delimiters << tm["frac_delim"] parts << :frac part_options << tm["frac"].size end end end rest = tm["rest"] end if zm = /^#{ZONE}$/.match(rest) delimiters << (zm["zone_space"] || '') if zm["zone_off"] parts << :zone_off else parts << :zone_abb end part_options << nil return GuessMatch.new(delimiters, parts, part_options) elsif rest =~ /^\s*$/ return GuessMatch.new(delimiters, parts, part_options) else return nil end end def part_heading_option(text) if text[0] == '0' :zero elsif text[0] == ' ' :blank elsif text.size == 1 :none else nil end end end class SimpleMatch def initialize(format) @format = format end attr_reader :format def mergeable_group @format end def merge!(another_in_group) end end class Rfc2822Pattern include Parts def initialize @regexp = /^(?#{WEEKDAY_NAME_SHORT}, )?\d\d #{MONTH_NAME_SHORT} \d\d\d\d(?