package org.embulk.spi.time;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.embulk.spi.time.lexer.StrptimeLexer;
/**
* This is a Java implementation of ext/date/date_strptime.c in Ruby v2.3.1.
* @see date_strptime.c
*
* TODO
* This class is tentatively required for the {@code TimestampParser} class.
* The {@code StrptimeParser} and {@code RubyDateParser} will be merged into JRuby
* (jruby/jruby#4591). embulk-jruby-strptime will be removed when Embulk starts using
* a JRuby release that bundles embulk-jruby-strptime.
*/
public class StrptimeParser
{
// day_names: the seven full weekday names followed by the seven abbreviations.
// The parser reduces a matched index with "% 7", so both halves must stay in
// Sunday-first order.
private static final String[] DAY_NAMES = {
    "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
    "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
};

// month_names: the twelve full month names followed by the twelve abbreviations.
// The parser reduces a matched index with "% 12 + 1", so both halves must stay in
// January-first order.
private static final String[] MONTH_NAMES = {
    "January", "February", "March", "April", "May", "June", "July", "August", "September",
    "October", "November", "December", "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
};

// merid_names: entries at even indexes are the "am" spellings and entries at odd
// indexes the "pm" spellings; the parser relies on that parity.
private static final String[] MERID_NAMES = {
    "am", "pm", "a.m.", "p.m."
};
/**
 * Mutable holder for the date/time elements collected while parsing a string.
 *
 * <p>Ported from Date::Format::Bag in JRuby 9.1.5.0's lib/ruby/stdlib/date/format.rb.
 *
 * <p>An {@code int} element that has not been set holds {@code Integer.MIN_VALUE}
 * (see {@link #has(int)}), {@code seconds} holds {@code Long.MIN_VALUE}
 * (see {@link #hasSeconds()}), and the {@code String} elements hold {@code null}.
 */
public static class FormatBag
{
    // Calendar-date elements.
    private int mDay = Integer.MIN_VALUE;
    private int wDay = Integer.MIN_VALUE;
    private int cWDay = Integer.MIN_VALUE;
    private int yDay = Integer.MIN_VALUE;
    private int cWeek = Integer.MIN_VALUE;
    private int cWYear = Integer.MIN_VALUE;
    private int min = Integer.MIN_VALUE;
    private int mon = Integer.MIN_VALUE;
    private int hour = Integer.MIN_VALUE;
    private int year = Integer.MIN_VALUE;
    private int sec = Integer.MIN_VALUE;
    private int wNum0 = Integer.MIN_VALUE;
    private int wNum1 = Integer.MIN_VALUE;

    private String zone = null;

    // Fraction of a second together with the number of digits it was read from.
    private int secFraction = Integer.MIN_VALUE; // Rational
    private int secFractionSize = Integer.MIN_VALUE;

    // Epoch-based value (from %s or %Q).
    private long seconds = Long.MIN_VALUE; // long or Rational
    private int secondsSize = Integer.MIN_VALUE;

    // Intermediate values; StrptimeParser#parse folds them into hour/year and unsets them.
    private int merid = Integer.MIN_VALUE;
    private int cent = Integer.MIN_VALUE;

    private boolean fail = false;
    private String leftover = null;

    /** @return the day of the month, or {@code Integer.MIN_VALUE} if unset */
    public int getMDay()
    {
        return this.mDay;
    }

    /** @return the day of the week, or {@code Integer.MIN_VALUE} if unset */
    public int getWDay()
    {
        return this.wDay;
    }

    /** @return the {@code cWDay} element, or {@code Integer.MIN_VALUE} if unset */
    public int getCWDay()
    {
        return this.cWDay;
    }

    /** @return the day of the year, or {@code Integer.MIN_VALUE} if unset */
    public int getYDay()
    {
        return this.yDay;
    }

    /** @return the {@code cWeek} element, or {@code Integer.MIN_VALUE} if unset */
    public int getCWeek()
    {
        return this.cWeek;
    }

    /** @return the {@code cWYear} element, or {@code Integer.MIN_VALUE} if unset */
    public int getCWYear()
    {
        return this.cWYear;
    }

    /** @return the minute, or {@code Integer.MIN_VALUE} if unset */
    public int getMin()
    {
        return this.min;
    }

    /** @return the month, or {@code Integer.MIN_VALUE} if unset */
    public int getMon()
    {
        return this.mon;
    }

    /** @return the hour, or {@code Integer.MIN_VALUE} if unset */
    public int getHour()
    {
        return this.hour;
    }

    /** @return the year, or {@code Integer.MIN_VALUE} if unset */
    public int getYear()
    {
        return this.year;
    }

    /** @return the second, or {@code Integer.MIN_VALUE} if unset */
    public int getSec()
    {
        return this.sec;
    }

    /** @return the {@code wNum0} element, or {@code Integer.MIN_VALUE} if unset */
    public int getWNum0()
    {
        return this.wNum0;
    }

    /** @return the {@code wNum1} element, or {@code Integer.MIN_VALUE} if unset */
    public int getWNum1()
    {
        return this.wNum1;
    }

    /** @return the zone text, or {@code null} if unset */
    public String getZone()
    {
        return this.zone;
    }

    /** @return the fraction-of-second digits as a number, or {@code Integer.MIN_VALUE} if unset */
    public int getSecFraction()
    {
        return this.secFraction;
    }

    /** @return the {@code secFractionSize} element, or {@code Integer.MIN_VALUE} if unset */
    public int getSecFractionSize()
    {
        return this.secFractionSize;
    }

    /** @return the epoch-based value, or {@code Long.MIN_VALUE} if unset */
    public long getSeconds()
    {
        return this.seconds;
    }

    /** @return the {@code secondsSize} element, or {@code Integer.MIN_VALUE} if unset */
    public int getSecondsSize()
    {
        return this.secondsSize;
    }

    /** @return the {@code merid} element, or {@code Integer.MIN_VALUE} if unset */
    public int getMerid()
    {
        return this.merid;
    }

    /** @return the {@code cent} element, or {@code Integer.MIN_VALUE} if unset */
    public int getCent()
    {
        return this.cent;
    }

    /** Raises this bag's failure flag. */
    void fail()
    {
        this.fail = true;
    }

    /** @return the leftover text, or {@code null} if there is none */
    public String getLeftover()
    {
        return this.leftover;
    }

    /**
     * Stores {@code v} as the year unless a year is already present.
     *
     * @return {@code true} if the value was stored, {@code false} if a year was already set
     */
    public boolean setYearIfNotSet(int v)
    {
        final boolean vacant = !has(this.year);
        if (vacant) {
            this.year = v;
        }
        return vacant;
    }

    /**
     * Stores {@code v} as the month unless a month is already present.
     *
     * @return {@code true} if the value was stored, {@code false} if a month was already set
     */
    public boolean setMonthIfNotSet(int v)
    {
        final boolean vacant = !has(this.mon);
        if (vacant) {
            this.mon = v;
        }
        return vacant;
    }

    /**
     * Stores {@code v} as the day of the month unless one is already present.
     *
     * @return {@code true} if the value was stored, {@code false} if a day was already set
     */
    public boolean setMdayIfNotSet(int v)
    {
        final boolean vacant = !has(this.mDay);
        if (vacant) {
            this.mDay = v;
        }
        return vacant;
    }

    /** @return {@code true} if the epoch-based {@code seconds} value has been set */
    public boolean hasSeconds()
    {
        return Long.MIN_VALUE != this.seconds;
    }

    /** @return {@code true} unless {@code v} is the "unset" marker {@code Integer.MIN_VALUE} */
    public static boolean has(int v)
    {
        return Integer.MIN_VALUE != v;
    }
}
// Lexer shared by all compilePattern calls on this instance. It is created without
// input; compilePattern attaches the actual Reader through yyreset.
private final StrptimeLexer lexer = new StrptimeLexer((Reader) null);

/** Creates a parser whose lexer has no input attached yet. */
public StrptimeParser()
{
}
/**
 * Appends the expansion of a composite conversion to {@code compiledPattern}.
 *
 * <p>Each ASCII letter in {@code str} is appended as the format token for that
 * conversion character; every other character is appended as a one-character
 * literal string token.
 *
 * <p>Ported from org.jruby.util.RubyDateFormatter#addToPattern in JRuby 9.1.5.0
 * under EPL (RubyDateFormatter.java).
 *
 * @param compiledPattern the token list to append to
 * @param str the expansion, e.g. {@code "Y-m-d"}
 */
private void addToPattern(final List<StrptimeToken> compiledPattern, final String str)
{
    for (final char c : str.toCharArray()) {
        final boolean asciiLetter = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
        if (asciiLetter) {
            compiledPattern.add(StrptimeToken.format(c));
        }
        else {
            compiledPattern.add(StrptimeToken.str(Character.toString(c)));
        }
    }
}
/**
 * Compiles a strptime format string into the token list accepted by {@code parse}.
 *
 * <p>Tokens produced by the lexer are copied through, except FORMAT_SPECIAL tokens,
 * which stand for composite conversions and are expanded here into their
 * constituent tokens.
 *
 * <p>Ported from org.jruby.util.RubyDateFormatter#compilePattern in JRuby 9.1.5.0
 * under EPL (RubyDateFormatter.java).
 *
 * @param pattern the strptime format string
 * @return the compiled tokens, in pattern order
 * @throws IllegalStateException if the lexer reports an I/O error while reading the pattern
 * @throws Error if the lexer emits a FORMAT_SPECIAL token for an unknown character
 */
public List<StrptimeToken> compilePattern(final String pattern)
{
    // The parser reads tokens by index, so use a random-access list.
    final List<StrptimeToken> compiledPattern = new ArrayList<>();

    try (final Reader reader = new StringReader(pattern)) {
        lexer.yyreset(reader);

        StrptimeToken token;
        while ((token = lexer.yylex()) != null) {
            if (token.getFormat() != StrptimeFormat.FORMAT_SPECIAL) {
                compiledPattern.add(token);
                continue;
            }

            final char c = (Character) token.getData();
            switch (c) {
                case 'c':
                    addToPattern(compiledPattern, "a b e H:M:S Y");
                    break;
                case 'D':
                case 'x':
                    addToPattern(compiledPattern, "m/d/y");
                    break;
                case 'F':
                    addToPattern(compiledPattern, "Y-m-d");
                    break;
                case 'n':
                    compiledPattern.add(StrptimeToken.str("\n"));
                    break;
                case 'R':
                    addToPattern(compiledPattern, "H:M");
                    break;
                case 'r':
                    addToPattern(compiledPattern, "I:M:S p");
                    break;
                case 'T':
                case 'X':
                    addToPattern(compiledPattern, "H:M:S");
                    break;
                case 't':
                    compiledPattern.add(StrptimeToken.str("\t"));
                    break;
                case 'v':
                    addToPattern(compiledPattern, "e-b-Y");
                    break;
                case 'Z':
                    // +HH:MM in 'date', never zone name
                    compiledPattern.add(StrptimeToken.zoneOffsetColons(1));
                    break;
                case '+':
                    addToPattern(compiledPattern, "a b e H:M:S ");
                    // %Z: +HH:MM in 'date', never zone name
                    compiledPattern.add(StrptimeToken.zoneOffsetColons(1));
                    addToPattern(compiledPattern, " Y");
                    break;
                default:
                    throw new Error("Unknown special char: " + c);
            }
        }
    }
    catch (IOException e) {
        // Returning a partially compiled pattern would silently mis-parse input later,
        // so surface the failure with its cause instead of printing it and carrying on.
        throw new IllegalStateException("Failed to read strptime pattern: " + pattern, e);
    }

    return compiledPattern;
}
/**
 * Parses {@code text} according to a pattern compiled by {@code compilePattern}.
 *
 * <p>After the tokens have been matched, the intermediate century and meridian
 * values are folded into the year / week-based year and the hour, and are then
 * reset to "unset" in the returned bag.
 *
 * @param compiledPattern the compiled tokens
 * @param text the text to parse
 * @return the parsed elements, or {@code null} if {@code text} does not match the pattern
 */
public FormatBag parse(final List<StrptimeToken> compiledPattern, final String text)
{
    final FormatBag bag = new StringParser(text).parse(compiledPattern);
    if (bag == null) {
        return null;
    }

    if (FormatBag.has(bag.cent)) {
        final int centuryYears = bag.cent * 100;
        if (FormatBag.has(bag.cWYear)) {
            bag.cWYear += centuryYears;
        }
        if (FormatBag.has(bag.year)) {
            bag.year += centuryYears;
        }

        // delete bag._cent
        bag.cent = Integer.MIN_VALUE;
    }

    if (FormatBag.has(bag.merid)) {
        if (FormatBag.has(bag.hour)) {
            // Hour 12 wraps to 0 before the 0 (am) or 12 (pm) offset is added.
            bag.hour %= 12;
            bag.hour += bag.merid;
        }

        // delete bag._merid
        bag.merid = Integer.MIN_VALUE;
    }

    return bag;
}
private static class StringParser
{
// Anchored, case-insensitive matcher for the zone text accepted by %Z / %z. Its
// three alternatives are: an optionally gmt/utc-prefixed signed numeric offset,
// a name ending in "standard time" / "daylight time", and a plain alphabetic
// name with an optional " dst" suffix.
private static final Pattern ZONE_PARSE_REGEX = Pattern.compile("\\A(" +
        "(?:gmt|utc?)?[-+]\\d+(?:[,.:]\\d+(?::\\d+)?)?" +
        "|(?-i:[[\\p{Alpha}].\\s]+)(?:standard|daylight)\\s+time\\b" +
        "|(?-i:[[\\p{Alpha}]]+)(?:\\s+dst)?\\b" +
        ")", Pattern.CASE_INSENSITIVE);

// The text being parsed.
private final String text;
// Collects the elements recognized so far.
private final FormatBag bag = new FormatBag();
// Index of the next unread character of text.
private int pos = 0;
// Raised as soon as any token fails to match.
private boolean fail = false;

/** Creates a single-use parser positioned at the start of {@code text}. */
private StringParser(String text)
{
    this.text = text;
}
/**
 * Matches the text against the compiled tokens, in order, filling the bag.
 *
 * <p>A token that does not match only raises the {@code fail} flag; the remaining
 * tokens are still visited and the failure is reported after the loop.
 *
 * @param compiledPattern the tokens to match, in pattern order
 * @return the filled bag, with any text after the last matched token stored as
 *         leftover, or {@code null} if at least one token did not match
 * @throws Error if the pattern contains a FORMAT_SPECIAL token
 */
private FormatBag parse(final List<StrptimeToken> compiledPattern)
{
    for (int tokenIndex = 0; tokenIndex < compiledPattern.size(); tokenIndex++) {
        final StrptimeToken token = compiledPattern.get(tokenIndex);

        switch (token.getFormat()) {
            case FORMAT_STRING: {
                final String str = token.getData().toString();
                for (int i = 0; i < str.length(); i++) {
                    final char c = str.charAt(i);
                    if (isSpace(c)) {
                        // A whitespace character in the pattern matches any run of
                        // whitespace in the text, including an empty one.
                        while (!isEndOfText(text, pos) && isSpace(text.charAt(pos))) {
                            pos++;
                        }
                    }
                    else {
                        if (isEndOfText(text, pos) || c != text.charAt(pos)) {
                            fail = true;
                        }
                        pos++;
                    }
                }
                break;
            }
            case FORMAT_WEEK_LONG: // %A - The full weekday name (``Sunday'')
            case FORMAT_WEEK_SHORT: { // %a - The abbreviated name (``Sun'')
                final int dayIndex = findIndexInPatterns(DAY_NAMES);
                if (dayIndex >= 0) {
                    bag.wDay = dayIndex % 7;
                    pos += DAY_NAMES[dayIndex].length();
                }
                else {
                    fail = true;
                }
                break;
            }
            case FORMAT_MONTH_LONG: // %B - The full month name (``January'')
            case FORMAT_MONTH_SHORT: { // %b, %h - The abbreviated month name (``Jan'')
                final int monIndex = findIndexInPatterns(MONTH_NAMES);
                if (monIndex >= 0) {
                    bag.mon = monIndex % 12 + 1;
                    pos += MONTH_NAMES[monIndex].length();
                }
                else {
                    fail = true;
                }
                break;
            }
            case FORMAT_CENTURY: { // %C - year / 100 (round down. 20 in 2009)
                final long cent;
                if (isNumberPattern(compiledPattern, tokenIndex)) {
                    cent = readDigits(2);
                }
                else {
                    cent = readDigitsMax();
                }
                bag.cent = (int) cent;
                break;
            }
            case FORMAT_DAY: // %d, %Od - Day of the month, zero-padded (01..31)
            case FORMAT_DAY_S: { // %e, %Oe - Day of the month, blank-padded ( 1..31)
                final long day;
                if (isBlank(text, pos)) {
                    pos += 1; // blank
                    day = readDigits(1);
                }
                else {
                    day = readDigits(2);
                }
                if (!validRange(day, 1, 31)) {
                    fail = true;
                }
                bag.mDay = (int) day;
                break;
            }
            case FORMAT_WEEKYEAR: { // %G - The week-based year
                final long year;
                if (isNumberPattern(compiledPattern, tokenIndex)) {
                    year = readDigits(4);
                }
                else {
                    year = readDigitsMax();
                }
                bag.cWYear = (int) year;
                break;
            }
            case FORMAT_WEEKYEAR_SHORT: { // %g - The last 2 digits of the week-based year (00..99)
                final long v = readDigits(2);
                if (!validRange(v, 0, 99)) {
                    fail = true;
                }
                bag.cWYear = (int) v;
                if (!FormatBag.has(bag.cent)) {
                    // Without an explicit century, 69..99 are taken as 19xx and 00..68 as 20xx.
                    bag.cent = v >= 69 ? 19 : 20;
                }
                break;
            }
            case FORMAT_HOUR: // %H, %OH - Hour of the day, 24-hour clock, zero-padded (00..23; 24 is accepted too)
            case FORMAT_HOUR_BLANK: { // %k - Hour of the day, 24-hour clock, blank-padded ( 0..23; 24 is accepted too)
                final long hour;
                if (isBlank(text, pos)) {
                    pos += 1; // blank
                    hour = readDigits(1);
                }
                else {
                    hour = readDigits(2);
                }
                if (!validRange(hour, 0, 24)) {
                    fail = true;
                }
                bag.hour = (int) hour;
                break;
            }
            case FORMAT_HOUR_M: // %I, %OI - Hour of the day, 12-hour clock, zero-padded (01..12)
            case FORMAT_HOUR_S: { // %l - Hour of the day, 12-hour clock, blank-padded ( 1..12)
                final long hour;
                if (isBlank(text, pos)) {
                    pos += 1; // blank
                    hour = readDigits(1);
                }
                else {
                    hour = readDigits(2);
                }
                if (!validRange(hour, 1, 12)) {
                    fail = true;
                }
                bag.hour = (int) hour;
                break;
            }
            case FORMAT_DAY_YEAR: { // %j - Day of the year (001..366)
                final long day = readDigits(3);
                // The upper bound is 366 so that the last day of a leap year is accepted.
                if (!validRange(day, 1, 366)) {
                    fail = true;
                }
                bag.yDay = (int) day;
                break;
            }
            case FORMAT_MILLISEC: // %L - Millisecond of the second (000..999)
            case FORMAT_NANOSEC: { // %N - Fractional seconds digits, default is 9 digits (nanosecond)
                boolean negative = false;
                if (isSign(text, pos)) {
                    negative = text.charAt(pos) == '-';
                    pos++;
                }

                final long v;
                // Taken after the sign so that only digits count toward the size.
                final int initPos = pos;
                if (isNumberPattern(compiledPattern, tokenIndex)) {
                    if (token.getFormat() == StrptimeFormat.FORMAT_MILLISEC) {
                        v = readDigits(3);
                    }
                    else {
                        v = readDigits(9);
                    }
                }
                else {
                    v = readDigitsMax();
                }

                bag.secFraction = (int) (!negative ? v : -v);
                bag.secFractionSize = pos - initPos;
                break;
            }
            case FORMAT_MINUTES: { // %M, %OM - Minute of the hour (00..59)
                final long min = readDigits(2);
                if (!validRange(min, 0, 59)) {
                    fail = true;
                }
                bag.min = (int) min;
                break;
            }
            case FORMAT_MONTH: { // %m, %Om - Month of the year, zero-padded (01..12)
                final long mon = readDigits(2);
                if (!validRange(mon, 1, 12)) {
                    fail = true;
                }
                bag.mon = (int) mon;
                break;
            }
            case FORMAT_MERIDIAN: // %P - Meridian indicator, lowercase (``am'' or ``pm'')
            case FORMAT_MERIDIAN_LOWER_CASE: { // %p - Meridian indicator, uppercase (``AM'' or ``PM'')
                final int meridIndex = findIndexInPatterns(MERID_NAMES);
                if (meridIndex >= 0) {
                    // Even indexes are the "am" spellings, odd indexes the "pm" spellings.
                    bag.merid = meridIndex % 2 == 0 ? 0 : 12;
                    pos += MERID_NAMES[meridIndex].length();
                }
                else {
                    fail = true;
                }
                break;
            }
            case FORMAT_MILLISEC_EPOCH: { // %Q - Number of milliseconds since 1970-01-01 00:00:00 UTC.
                boolean negative = false;
                if (isMinus(text, pos)) {
                    negative = true;
                    pos++;
                }

                final long sec = readDigitsMax();
                bag.seconds = !negative ? sec : -sec;
                // NOTE(review): presumably tells the consumer that the value carries
                // three sub-second digits (milliseconds) - confirm against the caller.
                bag.secondsSize = 3;
                break;
            }
            case FORMAT_SECONDS: { // %S - Second of the minute (00..59; 60 is accepted too)
                final long sec = readDigits(2);
                if (!validRange(sec, 0, 60)) {
                    fail = true;
                }
                bag.sec = (int) sec;
                break;
            }
            case FORMAT_EPOCH: { // %s - Number of seconds since 1970-01-01 00:00:00 UTC.
                boolean negative = false;
                if (isMinus(text, pos)) {
                    negative = true;
                    pos++;
                }

                final long sec = readDigitsMax();
                // Keep the full long value: narrowing to int would corrupt any epoch
                // value that does not fit in 32 bits.
                bag.seconds = !negative ? sec : -sec;
                break;
            }
            case FORMAT_WEEK_YEAR_S: // %U, %OU - Week number of the year. The week starts with Sunday. (00..53)
            case FORMAT_WEEK_YEAR_M: { // %W, %OW - Week number of the year. The week starts with Monday. (00..53)
                final long week = readDigits(2);
                if (!validRange(week, 0, 53)) {
                    fail = true;
                }

                if (token.getFormat() == StrptimeFormat.FORMAT_WEEK_YEAR_S) {
                    bag.wNum0 = (int) week;
                }
                else {
                    bag.wNum1 = (int) week;
                }
                break;
            }
            case FORMAT_DAY_WEEK2: { // %u, %Ou - Day of the week (Monday is 1, 1..7)
                final long day = readDigits(1);
                if (!validRange(day, 1, 7)) {
                    fail = true;
                }
                bag.cWDay = (int) day;
                break;
            }
            case FORMAT_WEEK_WEEKYEAR: { // %V, %OV - Week number of the week-based year (01..53)
                final long week = readDigits(2);
                if (!validRange(week, 1, 53)) {
                    fail = true;
                }
                bag.cWeek = (int) week;
                break;
            }
            case FORMAT_DAY_WEEK: { // %w - Day of the week (Sunday is 0, 0..6)
                final long day = readDigits(1);
                if (!validRange(day, 0, 6)) {
                    fail = true;
                }
                bag.wDay = (int) day;
                break;
            }
            case FORMAT_YEAR_LONG: {
                // %Y, %EY - Year with century (can be negative, 4 digits at least)
                // -0001, 0000, 1995, 2009, 14292, etc.
                boolean negative = false;
                if (isSign(text, pos)) {
                    negative = text.charAt(pos) == '-';
                    pos++;
                }

                final long year;
                if (isNumberPattern(compiledPattern, tokenIndex)) {
                    year = readDigits(4);
                }
                else {
                    year = readDigitsMax();
                }

                bag.year = (int) (!negative ? year : -year);
                break;
            }
            case FORMAT_YEAR_SHORT: { // %y, %Ey, %Oy - year % 100 (00..99)
                final long y = readDigits(2);
                if (!validRange(y, 0, 99)) {
                    fail = true;
                }
                bag.year = (int) y;
                if (!FormatBag.has(bag.cent)) {
                    // Without an explicit century, 69..99 are taken as 19xx and 00..68 as 20xx.
                    bag.cent = y >= 69 ? 19 : 20;
                }
                break;
            }
            case FORMAT_ZONE_ID: // %Z - Time zone abbreviation name
            case FORMAT_COLON_ZONE_OFF: {
                // %z - Time zone as hour and minute offset from UTC (e.g. +0900)
                // %:z - hour and minute offset from UTC with a colon (e.g. +09:00)
                // %::z - hour, minute and second offset from UTC (e.g. +09:00:00)
                // %:::z - hour, minute and second offset from UTC
                // (e.g. +09, +09:30, +09:30:30)
                if (isEndOfText(text, pos)) {
                    fail = true;
                    break;
                }

                final Matcher m = ZONE_PARSE_REGEX.matcher(text.substring(pos));
                if (m.find()) {
                    // zone
                    final String zone = text.substring(pos, pos + m.end());
                    bag.zone = zone;
                    pos += zone.length();
                }
                else {
                    fail = true;
                }
                break;
            }
            case FORMAT_SPECIAL:
            {
                throw new Error("FORMAT_SPECIAL is a special token only for the lexer.");
            }
        }
    }

    if (fail) {
        return null;
    }

    if (text.length() > pos) {
        bag.leftover = text.substring(pos, text.length());
    }

    return bag;
}
/**
 * Reads up to {@code len} consecutive ASCII digits starting at the current
 * position and advances the position past them.
 *
 * <p>If not even one digit could be read, the {@code fail} flag is raised and 0
 * is returned.
 *
 * <p>Ported from read_digits in MRI 2.3.1's ext/date/date_strptime.c.
 *
 * @param len the maximum number of digits to consume
 * @return the decimal value of the digits that were consumed
 */
private long readDigits(final int len)
{
    final int start = pos;
    long value = 0;

    int consumed = 0;
    while (consumed < len && !isEndOfText(text, pos)) {
        final char current = text.charAt(pos);
        if (!isDigit(current)) {
            break;
        }
        value = value * 10 + toInt(current);
        pos++;
        consumed++;
    }

    if (start == pos) {
        fail = true;
    }
    return value;
}
/**
 * Reads every consecutive digit available at the current position, i.e.
 * {@code readDigits} with the largest possible length limit.
 *
 * <p>Ported from READ_DIGITS_MAX in MRI 2.3.1's ext/date/date_strptime.c under BSDL.
 *
 * @return the decimal value of the digits that were consumed
 */
private long readDigitsMax()
{
    final int noLimit = Integer.MAX_VALUE;
    return readDigits(noLimit);
}
/**
 * Finds the first entry of {@code patterns} that the text starts with at the
 * current position, ignoring case. The position is not advanced.
 *
 * @param patterns the candidates, tried in array order
 * @return the index of the first matching candidate, or -1 if none matches or
 *         the text is already exhausted
 */
private int findIndexInPatterns(final String[] patterns)
{
    if (isEndOfText(text, pos)) {
        return -1;
    }

    int index = 0;
    for (final String candidate : patterns) {
        // strncasecmp; regionMatches is false when the candidate would run past the end of text.
        if (text.regionMatches(true, pos, candidate, 0, candidate.length())) {
            return index;
        }
        index++;
    }
    return -1; // text doesn't match at any patterns.
}
/**
 * Tells whether the token that follows index {@code i} starts with a number,
 * in which case the token at {@code i} must read a fixed number of digits
 * instead of every digit available.
 *
 * <p>Ported from num_pattern_p in MRI 2.3.1's ext/date/date_strptime.c under BSDL.
 *
 * @param compiledPattern the compiled tokens
 * @param i the index of the token currently being parsed
 * @return {@code true} if the next token is a literal beginning with a digit or a
 *         numeric conversion; {@code false} otherwise, including when there is
 *         no next token
 */
private static boolean isNumberPattern(final List<StrptimeToken> compiledPattern, final int i)
{
    final int nextIndex = i + 1;
    if (compiledPattern.size() <= nextIndex) {
        return false;
    }

    final StrptimeToken nextToken = compiledPattern.get(nextIndex);
    final StrptimeFormat f = nextToken.getFormat();
    if (f == StrptimeFormat.FORMAT_STRING) {
        // Guard against an empty literal so that charAt(0) cannot throw.
        final String literal = (String) nextToken.getData();
        return !literal.isEmpty() && isDigit(literal.charAt(0));
    }
    return NUMBER_PATTERNS.contains(f);
}
// Conversions whose text starts with a number; consulted by isNumberPattern.
// The MRI list is: CDdeFGgHIjkLlMmNQRrSsTUuVvWwXxYy
// Letters that appear below only as comments have no entry of their own here.
// NOTE(review): 'Q' (FORMAT_MILLISEC_EPOCH) is in the MRI list but has no entry in
// this set - confirm whether that omission is intended.
private static final EnumSet<StrptimeFormat> NUMBER_PATTERNS =
        EnumSet.of(
                StrptimeFormat.FORMAT_CENTURY, // 'C'
                // D
                StrptimeFormat.FORMAT_DAY, // 'd'
                StrptimeFormat.FORMAT_DAY_S, // 'e'
                // F
                StrptimeFormat.FORMAT_WEEKYEAR, // 'G'
                StrptimeFormat.FORMAT_WEEKYEAR_SHORT, // 'g'
                StrptimeFormat.FORMAT_HOUR, // 'H'
                StrptimeFormat.FORMAT_HOUR_M, // 'I'
                StrptimeFormat.FORMAT_DAY_YEAR, // 'j'
                StrptimeFormat.FORMAT_HOUR_BLANK, // 'k'
                StrptimeFormat.FORMAT_MILLISEC, // 'L'
                StrptimeFormat.FORMAT_HOUR_S, // 'l'
                StrptimeFormat.FORMAT_MINUTES, // 'M'
                StrptimeFormat.FORMAT_MONTH, // 'm'
                StrptimeFormat.FORMAT_NANOSEC, // 'N'
                // Q, R, r
                StrptimeFormat.FORMAT_SECONDS, // 'S'
                StrptimeFormat.FORMAT_EPOCH, // 's'
                // T
                StrptimeFormat.FORMAT_WEEK_YEAR_S, // 'U'
                StrptimeFormat.FORMAT_DAY_WEEK2, // 'u'
                StrptimeFormat.FORMAT_WEEK_WEEKYEAR, // 'V'
                // v
                StrptimeFormat.FORMAT_WEEK_YEAR_M, // 'W'
                StrptimeFormat.FORMAT_DAY_WEEK, // 'w'
                // X, x
                StrptimeFormat.FORMAT_YEAR_LONG, // 'Y'
                StrptimeFormat.FORMAT_YEAR_SHORT // 'y'
        );
/**
 * Checks that {@code v} lies in the closed interval {@code [lower, upper]}.
 *
 * <p>Ported from valid_pattern_p in MRI 2.3.1's ext/date/date_strptime.c under BSDL.
 *
 * @return {@code true} if {@code lower <= v <= upper}
 */
private static boolean validRange(long v, int lower, int upper)
{
    if (v < lower) {
        return false;
    }
    return v <= upper;
}
/**
 * Tells whether {@code c} is one of the six ASCII whitespace characters:
 * space, tab, line feed, vertical tab, form feed or carriage return.
 */
private static boolean isSpace(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\u000b':
        case '\f':
        case '\r':
            return true;
        default:
            return false;
    }
}
/** Tells whether {@code c} is an ASCII decimal digit, '0' through '9'. */
private static boolean isDigit(char c)
{
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
/** Tells whether {@code pos} is at or beyond the end of {@code text}. */
private static boolean isEndOfText(String text, int pos)
{
    final int length = text.length();
    return length <= pos;
}
/** Tells whether the character at {@code pos} exists and is '+' or '-'. */
private static boolean isSign(String text, int pos)
{
    if (isEndOfText(text, pos)) {
        return false;
    }
    final char c = text.charAt(pos);
    return c == '+' || c == '-';
}
/** Tells whether the character at {@code pos} exists and is '-'. */
private static boolean isMinus(String text, int pos)
{
    if (isEndOfText(text, pos)) {
        return false;
    }
    return '-' == text.charAt(pos);
}
/** Tells whether the character at {@code pos} exists and is a space (' '). */
private static boolean isBlank(String text, int pos)
{
    if (isEndOfText(text, pos)) {
        return false;
    }
    return ' ' == text.charAt(pos);
}
/**
 * Returns the distance of {@code c} from '0', which is the numeric value of
 * {@code c} when it is an ASCII digit.
 */
private static int toInt(char c)
{
    final int zero = '0';
    return c - zero;
}
}
}