#encoding: utf-8

require "rubygems"
require "string-strtr"

#A simple library for lazily parsing CSV files through IO objects. It reads
#the IO line by line and tolerates some sloppy formatting, such as whitespace
#between the last column of a row and the row separator. Blank lines are
#skipped. Inside quoted columns the quote char is escaped with a backslash.
class Csv_lazy
  include Enumerable

  #Argument keys accepted by the constructor.
  ACCEPTED_ARGS = [:encode, :quote_char, :row_sep, :col_sep, :io, :debug, :headers, :buffer_length].freeze

  #Internal marker: the buffer did not hold a complete column, more data was requested.
  NEED_MORE_DATA = :need_more_data

  #===Examples
  #  File.open("csvfile.csv", "r") do |fp|
  #    Csv_lazy.new(:io => fp, :quote_char => '"', :col_sep => ";", :row_sep => "\n", :encode => "utf-8") do |row_array|
  #      puts "Row: #{row_array}"
  #    end
  #  end
  #
  #===Arguments
  #  :io            - Object responding to 'gets' (required).
  #  :quote_char    - Char that wraps quoted columns (default '"').
  #  :col_sep       - Column separator (default ";").
  #  :row_sep       - Row separator (default "\n").
  #  :headers       - When true the first row is used as keys and each row is returned as a hash.
  #  :encode        - Encoding that every chunk read from the IO is converted to.
  #  :buffer_length - More data is read whenever the buffer is shorter than this (default 4096).
  #  :debug         - Prints parser traces to stdout when true.
  #
  #Raises a RuntimeError on unknown or missing arguments. If a block is given,
  #every row is yielded to it right away.
  def initialize(args = {}, &blk)
    @args = {
      :quote_char => '"',
      :row_sep => "\n",
      :col_sep => ";",
      :headers => false,
      :buffer_length => 4096
    }.merge(args)

    @args.each_key do |key|
      raise "Unknown argument: '#{key}'." unless ACCEPTED_ARGS.include?(key)
    end

    raise "No ':quote_char' was given." if @args[:quote_char].to_s.strip.empty?
    raise "No ':col_sep' was given." if @args[:col_sep].to_s.empty?
    raise "No ':row_sep' was given." if @args[:row_sep].to_s.empty?
    raise "No ':io' was given." if !@args[:io]

    @io = @args[:io]
    @eof = false
    @buffer = "".dup
    @debug = @args[:debug]
    @encode = @args[:encode]
    @mutex = Mutex.new
    @buffer_length = @args[:buffer_length]
    @escape_char = "\\"
    @escaped_quote = "#{@escape_char}#{@args[:quote_char]}"

    #Escape sequences recognised inside quoted columns. The configured quote
    #char is always part of the table so custom quote chars can be escaped too.
    @unescape_map = {
      "\\\\" => "\\",
      "\\t" => "\t",
      "\\n" => "\n",
      "\\r" => "\r",
      "\\\"" => "\""
    }
    @unescape_map[@escaped_quote] = @args[:quote_char].to_s
    #Longest sequences first so a multi-char quote sequence wins over a shorter prefix.
    @regex_unescape = Regexp.union(@unescape_map.keys.sort_by { |seq| -seq.length })

    quote = Regexp.escape(@args[:quote_char])
    col_sep = Regexp.escape(@args[:col_sep])
    row_sep = Regexp.escape(@args[:row_sep])

    @regex_begin_quote_char = /\A\s*#{quote}/
    @regex_row_end = /\A\s*?#{row_sep}/
    @regex_colsep_next = /\A#{col_sep}/

    #Multiline, so a quoted column may contain line breaks.
    @regex_read_until_quote_char = /\A(.*?)#{quote}/m
    @regex_read_until_col_sep = /\A(.*?)#{col_sep}/
    @regex_read_until_row_sep = /\A(.+?)#{row_sep}/
    @regex_read_until_end = /\A(.+?)\Z/

    if @args[:headers]
      #read_row returns false on empty input - fall back to an empty header list.
      header_row = read_row
      @headers = header_row ? header_row.map { |name| name.to_sym } : []
    end

    self.each(&blk) if blk
  end

  #Yields each row (an array, or a hash when headers are enabled). Returns an
  #Enumerator when no block is given.
  def each
    return to_enum(:each) unless block_given?

    @mutex.synchronize do
      while (row = read_row)
        yield(row)
      end
    end
  end

  #Returns the next non-empty row, or false when the input is exhausted.
  #With headers enabled the row is returned as a hash keyed by header symbol.
  def read_row
    row = nil

    loop do
      @row = []
      while !@eof || !@buffer.empty?
        break unless read_next_col
      end

      row = @row
      @row = nil

      puts "csv_lazy: Row: #{row}\n\n" if @debug

      #An empty row before the real end of input is just a blank line - keep reading.
      break if !row.empty? || (@eof && @buffer.empty?)
    end

    return false if row.empty?
    return row unless @headers

    ret = {}
    row.each_with_index do |value, index|
      ret[@headers[index]] = value
    end

    ret
  end

  private

  #Reads one more line from the IO into the buffer, or flags end-of-file.
  def read_buffer
    read = @io.gets

    if read
      read = read.encode(@encode) if @encode
      @buffer << read
    else
      @eof = true
    end
  end

  #Runs a regex against the buffer. If matched, the match is removed from the
  #buffer and the MatchData is returned, otherwise false is returned.
  #All patterns used by this class are anchored at the start of the buffer (\A).
  def read_remove_regex(regex)
    match = @buffer.match(regex)
    return false unless match

    oldbuffer = @buffer
    @buffer = match.post_match

    if @debug
      print "csv_lazy: Regex: #{regex.to_s}\n"
      print "csv_lazy: Match: #{match.to_a}\n"
      print "csv_lazy: Buffer before: #{oldbuffer}\n"
      print "csv_lazy: Buffer after: #{@buffer}\n"
      print "\n"
    end

    #A zero-length match would never consume anything and loop forever.
    raise "Buffer was the same before regex?" if match[0].empty?
    match
  end

  #Replaces backslash escape sequences in a quoted column with the chars they
  #stand for. Done in a single left-to-right pass, so a backslash produced by
  #one replacement is never interpreted again.
  def unescape(str)
    str.gsub(@regex_unescape, @unescape_map)
  end

  #Adds the next column to the row. Returns true if more columns should be read or false if this was the end of the row.
  def read_next_col
    loop do
      result = parse_next_col
      return result unless result == NEED_MORE_DATA

      puts "csv_lazy: Retry! Probably we ran out of buffer..." if @debug
    end
  end

  #Makes one attempt at parsing a column from the buffer. Returns true/false
  #like read_next_col, or NEED_MORE_DATA when the attempt has to be repeated
  #after more data has been read.
  def parse_next_col
    #Always refill an empty buffer, even when buffer_length is zero.
    read_buffer if @buffer.empty? || @buffer.length < @buffer_length
    return false if @buffer.empty?
    return false if read_remove_regex(@regex_row_end)
    return read_quoted_col if read_remove_regex(@regex_begin_quote_char)

    if (match = read_remove_regex(@regex_read_until_col_sep))
      add_col(match[1])
      return true
    end

    if (match = read_remove_regex(@regex_read_until_row_sep))
      puts "csv_lazy: Row seperator reached." if @debug
      add_col(match[1])
      return false
    end

    match = read_remove_regex(@regex_read_until_end)
    raise "Dont know what to do with buffer: '#{@buffer}'." unless match

    #If the very end of the file has been reached, then add this data and stop parsing.
    if @eof
      add_col(match[1])
      return false
    end

    #The end-of-file hasnt been reached. Put the text back in front of whatever
    #is left in the buffer, read more data and let the caller try again.
    @buffer = "#{match[0]}#{@buffer}"
    read_buffer
    NEED_MORE_DATA
  end

  #Reads a quoted column whose opening quote has already been consumed, then
  #consumes the separator following it. Returns true if more columns follow on
  #this row, false if the row (or the input) ended.
  def read_quoted_col
    col_content = "".dup

    loop do
      match = read_remove_regex(@regex_read_until_quote_char)

      if match
        if match[1][/\\*\z/].length.odd?
          #An odd number of backslashes right before the quote means the quote
          #itself is escaped - keep it and continue reading.
          col_content << match[0]
        else
          col_content << match[1]
          add_col(unescape(col_content))
          break
        end
      elsif @eof
        #Unterminated quoted column: keep what has been read so far plus the rest.
        remaining = col_content + @buffer
        add_col(unescape(remaining)) unless remaining.empty?
        @buffer = "".dup
        break
      else
        read_buffer
      end
    end

    read_buffer if @buffer.length < @buffer_length

    return true if read_remove_regex(@regex_colsep_next)

    if @eof && @buffer.empty?
      puts "csv_lazy: End-of-file and empty buffer." if @debug
      return false
    end

    if read_remove_regex(@regex_row_end)
      puts "csv_lazy: Row-end found." if @debug
      return false
    end

    raise "Dont know what to do (#{@buffer.length}): #{@buffer}"
  end

  #Adds a new column to the current row.
  def add_col(str)
    @row << str
  end
end