module ValidatesTimeliness

  # A date and time format regular expression generator. Allows you to
  # construct a date, time or datetime format using predefined tokens in
  # a string. This makes it much easier to catalogue and customize the formats
  # rather than dealing directly with regular expressions. The formats are then
  # compiled into regular expressions for use validating date or time strings.
  #
  # Formats can be added or removed to customize the set of valid date or time
  # string values.
  #
  class Formats
    # Class-level accessors for the format lists, their compiled expressions
    # and the token tables. NOTE(review): cattr_accessor is not defined in this
    # file (presumably ActiveSupport); the readers are expected to return the
    # @@ class variables assigned below.
    cattr_accessor :time_formats
    cattr_accessor :date_formats
    cattr_accessor :datetime_formats

    cattr_accessor :time_expressions
    cattr_accessor :date_expressions
    cattr_accessor :datetime_expressions

    cattr_accessor :format_tokens
    cattr_accessor :format_proc_args

    # Format tokens:
    #       y = year
    #       m = month
    #       d = day
    #       h = hour
    #       n = minute
    #       s = second
    #       u = micro-seconds
    #    ampm = meridian (am or pm) with or without dots (e.g. am, a.m, or a.m.)
    #       _ = optional space
    #      tz = Timezone abbreviation (e.g. UTC, GMT, PST, EST)
    #      zo = Timezone offset (e.g. +10:00, -08:00, +1000)
    #
    #   All other characters are considered literal. You can embed regexp in the
    #   format but no guarantees that it will remain intact. If you avoid the use
    #   of any token characters and regexp dots or backslashes as special characters
    #   in the regexp, it may well work as expected. For special characters use
    #   POSIX character classes for safety.
    #
    # Repeating tokens:
    #       x = 1 or 2 digits for unit (e.g. 'h' means an hour can be '9' or '09')
    #      xx = 2 digits exactly for unit (e.g. 'hh' means an hour can only be '09')
    #
    # Special Cases:
    #      yy = 2 or 4 digit year
    #    yyyy = exactly 4 digit year
    #     mmm = month long name (e.g. 'Jul' or 'July')
    #     ddd = Day name of 3 to 9 letters (e.g. Wed or Wednesday)
    #       u = microseconds matches 1 to 6 digits
    #
    #   Any other invalid combination of repeating tokens will be swallowed up
    #   by the next lowest length valid repeating token (e.g. yyy will be
    #   replaced with yy)

    @@time_formats = [
      'hh:nn:ss',
      'hh-nn-ss',
      'h:nn',
      'h.nn',
      'h nn',
      'h-nn',
      'h:nn_ampm',
      'h.nn_ampm',
      'h nn_ampm',
      'h-nn_ampm',
      'h_ampm'
    ]

    @@date_formats = [
      'yyyy-mm-dd',
      'yyyy/mm/dd',
      'yyyy.mm.dd',
      'm/d/yy',
      'd/m/yy',
      'm\d\yy',
      'd\m\yy',
      'd-m-yy',
      'd.m.yy',
      'd mmm yy'
    ]

    @@datetime_formats = [
      'yyyy-mm-dd hh:nn:ss',
      'yyyy-mm-dd h:nn',
      'yyyy-mm-dd hh:nn:ss.u',
      'm/d/yy h:nn:ss',
      'm/d/yy h:nn_ampm',
      'm/d/yy h:nn',
      'd/m/yy hh:nn:ss',
      'd/m/yy h:nn_ampm',
      'd/m/yy h:nn',
      'ddd, dd mmm yyyy hh:nn:ss (zo|tz)', # RFC 822
      'ddd mmm d hh:nn:ss zo yyyy',        # Ruby time string
      'yyyy-mm-ddThh:nn:ss(?:Z|zo)'        # iso 8601
    ]

    # All tokens available for format construction. The token array is made of
    # token regexp, validation regexp and key for format proc mapping if any.
    # If the token needs no format proc arg then the validation regexp should
    # not have a capturing group, as all captured groups are passed to the
    # format proc.
    #
    # The token regexp should only use a capture group if 'look-behind' anchor
    # is required. The first capture group will be considered a literal and put
    # into the validation regexp string as-is. This is a hack.
    #
    # The tokens are substituted in the order listed here, so the order of the
    # entries matters.
    @@format_tokens = [
      { 'd'    => [ /(\A|[^d])d{1}(?=[^d])/, '(\d{1,2})', :day ] },
      { 'ddd'  => [ /d{3,}/, '(\w{3,9})' ] },
      { 'dd'   => [ /d{2,}/, '(\d{2})', :day ] },
      { 'mmm'  => [ /m{3,}/, '(\w{3,9})', :month ] },
      { 'mm'   => [ /m{2}/, '(\d{2})', :month ] },
      { 'm'    => [ /(\A|[^ap])m{1}/, '(\d{1,2})', :month ] },
      { 'yyyy' => [ /y{4,}/, '(\d{4})', :year ] },
      { 'yy'   => [ /y{2,}/, '(\d{2}|\d{4})', :year ] },
      { 'hh'   => [ /h{2,}/, '(\d{2})', :hour ] },
      { 'h'    => [ /h{1}/, '(\d{1,2})', :hour ] },
      { 'nn'   => [ /n{2,}/, '(\d{2})', :min ] },
      { 'n'    => [ /n{1}/, '(\d{1,2})', :min ] },
      { 'ss'   => [ /s{2,}/, '(\d{2})', :sec ] },
      { 's'    => [ /s{1}/, '(\d{1,2})', :sec ] },
      { 'u'    => [ /u{1,}/, '(\d{1,6})', :usec ] },
      { 'ampm' => [ /ampm/, '((?:[aApP])\.?[mM]\.?)', :meridian ] },
      { 'zo'   => [ /zo/, '(?:[+-]\d{2}:?\d{2})'] },
      { 'tz'   => [ /tz/, '(?:[A-Z]{1,4})' ] },
      { '_'    => [ /_/, '\s?' ] }
    ]

    # Arguments which will be passed to the format proc if matched in the
    # time string. The key must be the key from the format tokens. The array
    # consists of the array position of the arg, the arg name, and the code to
    # place in the time array slot. The position can be nil which means the arg
    # won't be placed in the array.
    #
    # The code can be used to manipulate the arg value if required, otherwise
    # should just be the arg name.
    #
    @@format_proc_args = {
      :year     => [0, 'y', 'unambiguous_year(y)'],
      :month    => [1, 'm', 'month_index(m)'],
      :day      => [2, 'd', 'd'],
      :hour     => [3, 'h', 'full_hour(h,md)'],
      :min      => [4, 'n', 'n'],
      :sec      => [5, 's', 's'],
      :usec     => [6, 'u', 'microseconds(u)'],
      :meridian => [nil, 'md', nil]
    }

    class << self

      # (Re)builds the compiled expression sets from the current format lists.
      # Each compiled entry is a [regexp, format proc] pair. Called by the
      # methods below whenever a format list is changed.
      def compile_format_expressions
        @@time_expressions     = compile_formats(@@time_formats)
        @@date_expressions     = compile_formats(@@date_formats)
        @@datetime_expressions = compile_formats(@@datetime_formats)
      end

      # Loop through format expressions for type and call proc on matches. Allow
      # pre or post match strings to exist if strict is false. Otherwise wrap
      # regexp in start and end anchors.
      #
      # string - value to parse; anything that is not a String is returned as-is.
      # type   - :date, :time or :datetime.
      # strict - when false, a :date only has to match at the start of the
      #          string and a :time only at the end. Datetimes are always
      #          anchored at both ends.
      #
      # Returns 7 part time array from the first matching format, or nil when
      # no format matches.
      def parse(string, type, strict=true)
        return string unless string.is_a?(String)

        # The candidate set is picked using the original string (its length is
        # used as a heuristic); matching is done on the stripped value.
        stripped = string.strip

        expression_set(type, string).each do |(regexp, processor)|
          anchored = if strict || type == :datetime
            /\A#{regexp}\Z/
          elsif type == :date
            /\A#{regexp}/
          else
            /#{regexp}\Z/
          end

          matches = anchored.match(stripped)
          # At most 7 captures are handed to the format proc.
          return processor.call(*matches[1..7]) if matches
        end

        nil
      end

      # Delete formats of specified type. Error raised if format not found.
      # The expressions are recompiled afterwards.
      def remove_formats(type, *remove_formats)
        remove_formats.each do |format|
          unless self.send("#{type}_formats").delete(format)
            raise "Format #{format} not found in #{type} formats"
          end
        end
        compile_format_expressions
      end

      # Adds new formats. Must specify format type and can specify a :before
      # option to nominate which format the new formats should be inserted in
      # front on to take higher precedence. Without :before the formats are
      # appended to the end of the list.
      # Error is raised if format already exists or if :before format is not found.
      def add_formats(type, *add_formats)
        formats = self.send("#{type}_formats")
        options = {}
        options = add_formats.pop if add_formats.last.is_a?(Hash)
        before  = options[:before]

        # Report the missing :before format itself. Interpolating `format` here
        # would call Kernel#format without arguments and raise an unrelated
        # ArgumentError instead of this message.
        raise "Format for :before option #{before} was not found." if before && !formats.include?(before)

        add_formats.each do |format|
          raise "Format #{format} is already included in #{type} formats" if formats.include?(format)

          # Looked up on every pass because each insert shifts the :before
          # format along; this keeps the new formats in the order given.
          index = before ? formats.index(before) : -1
          formats.insert(index, format)
        end
        compile_format_expressions
      end

      # Removes formats where the 1 or 2 digit month comes first, to eliminate
      # formats which are ambiguous with the European style of day then month.
      # The mmm token is ignored as it is not ambiguous.
      def remove_us_formats
        us_format_regexp = /\Am{1,2}[^m]/
        date_formats.reject! { |format| us_format_regexp =~ format }
        datetime_formats.reject! { |format| us_format_regexp =~ format }
        compile_format_expressions
      end

    private

      # Compile a format string into a validation regexp and a format proc.
      #
      # Returns a two element array of [Regexp, proc]. If anything fails while
      # compiling, the offending expression is printed and the error re-raised.
      def format_expression_generator(string_format)
        regexp = string_format.dup
        order  = {}
        regexp.gsub!(/([\.\\])/, '\\\\\1') # escapes dots and backslashes

        format_tokens.each do |token|
          token_regexp, regexp_str, arg_key = *token.values.first

          # hack for lack of look-behinds. If has a capture group then is
          # considered an anchor to put straight back in the regexp string.
          regexp.gsub!(token_regexp) { "#{$1}" + regexp_str }

          # $~ is the last match made by the gsub! above (nil when the token
          # does not occur); its offset is used to order the proc arguments.
          order[arg_key] = $~.begin(0) if $~ && !arg_key.nil?
        end

        return Regexp.new(regexp), format_proc(order)
      rescue
        puts "The following format regular expression failed to compile: #{regexp}\n from format #{string_format}."
        raise
      end

      # Generates a proc which when executed maps the regexp capture groups to a
      # proc argument based on order captured. A time array is built using the proc
      # argument in the position indicated by the first element of the proc arg
      # array.
      #
      # order - hash of format proc arg key => match offset, as built by
      #         format_expression_generator.
      #
      # Examples:
      #
      #   'yyyy-mm-dd hh:nn' => lambda {|y,m,d,h,n| md||=nil; [unambiguous_year(y),month_index(m),d,full_hour(h,md),n,nil,nil].map {|i| i.to_i } }
      #   'dd/mm/yyyy h:nn_ampm' => lambda {|d,m,y,h,n,md| md||=nil; [unambiguous_year(y),month_index(m),d,full_hour(h,md),n,nil,nil].map {|i| i.to_i } }
      #
      def format_proc(order)
        arg_map = format_proc_args

        # Argument names sorted by the offset at which their token matched.
        args = order.invert.sort.map {|p| arg_map[p[1]][1] }

        # Code snippet for each of the 7 time array slots; unused slots stay nil.
        arr = [nil] * 7
        order.keys.each {|k| i = arg_map[k][0]; arr[i] = arg_map[k][2] unless i.nil? }
        slots = arr.map {|i| i.nil? ? 'nil' : i }.join(',')

        # The evaluated string is assembled only from the token tables above,
        # never from the string being parsed.
        proc_string = "lambda {|#{args.join(',')}| md||=nil; [#{slots}].map {|i| i.to_i } }"
        eval proc_string
      end

      # Compiles each format string into its [regexp, format proc] pair.
      def compile_formats(formats)
        formats.collect { |format| format_expression_generator(format) }
      end

      # Pick expression set and combine date and datetimes for
      # datetime attributes to allow date string as datetime.
      # Returns nil for any other type.
      def expression_set(type, string)
        case type
        when :date
          date_expressions
        when :time
          time_expressions
        when :datetime
          # gives a speed-up for date string as datetime attributes
          if string.length < 11
            date_expressions + datetime_expressions
          else
            datetime_expressions + date_expressions
          end
        end
      end

      # Converts an hour to its 24 hour value using the meridian string
      # (e.g. 'am', 'p.m.'). With a nil meridian the hour is returned unchanged.
      def full_hour(hour, meridian)
        hour = hour.to_i
        return hour if meridian.nil?
        if meridian.delete('.').downcase == 'am'
          hour == 12 ? 0 : hour
        else
          hour == 12 ? hour : hour + 12
        end
      end

      # Expands a 2 character year string into a 4 digit year: values below
      # the threshold become 20xx, the rest 19xx. Other lengths are converted
      # as-is. Returns an integer.
      def unambiguous_year(year, threshold=30)
        year = "#{year.to_i < threshold ? '20' : '19'}#{year}" if year.length == 2
        year.to_i
      end

      # Returns the month number for a numeric string or for an abbreviated or
      # full month name. Returns nil when the name is not recognised.
      def month_index(month)
        return month.to_i if month.to_i.nonzero?
        Date::ABBR_MONTHNAMES.index(month.capitalize) || Date::MONTHNAMES.index(month.capitalize)
      end

      # Converts the digits of a fractional second (e.g. '5' from '.5') into
      # whole microseconds.
      #
      # The digits are right-padded to 6 places and read as an integer. Going
      # through a float (".#{usec}".to_f * 1_000_000).to_i truncates the
      # product, which can land one microsecond short of the exact value
      # (e.g. '500002' can come out as 500001).
      def microseconds(usec)
        usec.to_s.ljust(6, '0')[0, 6].to_i
      end

    end
  end
end