require 'csv'

module Tabula
  # Turns a flat list of positioned text elements into table structure.
  #
  # On construction the elements are (optionally) merged into words, in place.
  # The instance can then report row boundaries (#get_line_boundaries,
  # #get_rows) and column groupings (#group_by_columns, #get_columns).
  #
  # Options (see DEFAULT_OPTIONS):
  #   :horizontal_rulings    - objects responding to top/left/width. When given,
  #                            rows are the zones between consecutive rulings.
  #   :vertical_rulings      - objects responding to left. When given, columns
  #                            are the gaps between consecutive rulings and
  #                            words are never merged across a ruling.
  #   :merge_words           - merge adjacent elements into words on construction.
  #   :split_multiline_cells - stored with the other options; not read by this class.
  class TableExtractor
    attr_accessor :text_elements, :options

    DEFAULT_OPTIONS = {
      :horizontal_rulings => [],
      :vertical_rulings => [],
      :merge_words => true,
      :split_multiline_cells => false
    }

    # text_elements - array of text elements; mutated in place when words are merged.
    # options       - hash overriding DEFAULT_OPTIONS.
    def initialize(text_elements, options = {})
      self.text_elements = text_elements
      self.options = DEFAULT_OPTIONS.merge(options)

      # Callers sometimes forward an explicit nil (e.g. options[:missing_key]);
      # treat that as "no rulings" so later `empty?` / `sort_by` calls are safe.
      [:horizontal_rulings, :vertical_rulings].each do |key|
        self.options[key] ||= []
      end

      return unless self.options[:merge_words]

      if self.options[:vertical_rulings].empty?
        merge_words!
      else
        merge_words_in_a_vertical_rulings_aware_manner!(self.options[:vertical_rulings])
      end
    end

    # Returns one hash per detected row, ordered by top:
    # {'top' => ..., 'bottom' => ..., 'text' => [texts collected for the row]}.
    def get_rows
      get_line_boundaries.sort_by(&:top).map do |row|
        {'top' => row.top, 'bottom' => row.bottom, 'text' => row.texts}
      end
    end

    # TODO finish writing this method
    # it should be analogous to get_line_boundaries
    # (ie, take into account vertical ruling lines if available)
    #
    # Groups the text elements into Column objects.
    #
    # Without vertical rulings, elements are visited left to right; each one
    # joins the first existing column it horizontally overlaps, or starts a new
    # column. With vertical rulings, one (initially empty) column is created per
    # pair of consecutive rulings that are more than 10 units apart, and each
    # element is added to the first column it overlaps. Elements overlapping no
    # ruling-defined column are left out.
    #
    # Returns the array of columns. The caller's ruling array is not reordered.
    def group_by_columns
      rulings = (options[:vertical_rulings] || []).sort_by(&:left)
      sorted_elements = text_elements.sort_by(&:left)
      columns = []

      if rulings.empty?
        sorted_elements.each do |te|
          column = columns.detect { |c| te.horizontally_overlaps?(c) }
          if column
            column << te
          else
            columns << Column.new(te.left, te.width, [te])
          end
        end
      else
        rulings.each_cons(2) do |left_ruling, right_ruling|
          gap = right_ruling.left - left_ruling.left
          # rulings closer than this are not treated as delimiting a column
          columns << Column.new(left_ruling.left, gap, []) if gap > 10
        end

        sorted_elements.each do |te|
          column = columns.detect { |c| te.horizontally_overlaps?(c) }
          column << te if column
        end
      end

      columns
    end

    # Returns one hash per column of this extractor (as computed by
    # #group_by_columns, honouring this instance's options):
    # {'left' => ..., 'right' => ..., 'width' => ...}.
    def get_columns
      group_by_columns.map do |c|
        {'left' => c.left, 'right' => c.right, 'width' => c.width}
      end
    end

    # Detects row boundaries and returns them as an array of ZoneEntity.
    #
    # Without horizontal rulings, boundaries are grown iteratively: each text
    # element is merged into the first boundary it vertically overlaps (or
    # starts a new one) and its text is appended to that boundary's texts.
    #
    # With horizontal rulings, one zone is built between each pair of
    # consecutive rulings (ordered by top); zones less than 2 units high are
    # skipped. No text is attached to zones in this branch. The caller's ruling
    # array is not reordered.
    def get_line_boundaries
      rulings = (options[:horizontal_rulings] || []).sort_by(&:top)
      boundaries = []

      if rulings.empty?
        text_elements.each do |te|
          row = boundaries.detect { |b| b.vertically_overlaps?(te) }
          zone = ZoneEntity.new(te.top, te.left, te.width, te.height)
          if row
            row.merge!(zone)
          else
            boundaries << zone
            row = zone
          end
          row.texts << te.text
        end
      else
        rulings.each_cons(2) do |above, below|
          # construct zone between a horizontal ruling and the next
          zone = ZoneEntity.new(above.top,
                                [above.left, below.left].min,
                                [above.width, below.width].max,
                                below.top - above.top)

          # skip areas shorter than some threshold
          # TODO: this should be the height of the shortest character, or something like that
          next if zone.height < 2

          boundaries << zone
        end
      end

      boundaries
    end

    private

    # this is where spaces come from!
    # Merges adjacent text elements into words, ignoring rulings.
    # Returns text_elements.
    def merge_words!
      merge_adjacent_text_elements!([])
    end

    # this is where spaces come from!
    # Merges adjacent text elements into words, but never across one of the
    # given vertical rulings. Returns text_elements.
    def merge_words_in_a_vertical_rulings_aware_manner!(vertical_rulings)
      merge_adjacent_text_elements!((vertical_rulings || []).map(&:left))
    end

    # Shared implementation of the two merge methods above.
    #
    # Walks text_elements in order, folding each element into the current word
    # while `should_merge?` says so. An element that should merge but lies on
    # the other side of a ruling (word.left < ruling < element.left) is kept and
    # becomes the start of a new word instead of being discarded.
    #
    # ruling_locations - array of x positions of vertical rulings (may be empty).
    def merge_adjacent_text_elements!(ruling_locations)
      return text_elements if @merged # only merge once. awful hack.
      @merged = true

      # nil entries are removed at the end anyway; dropping them up front means
      # the loop never has to skip an element.
      text_elements.compact!

      word_index = 0
      previous = text_elements[0]

      (1...text_elements.size).each do |index|
        candidate = text_elements[index]
        word = text_elements[word_index]

        if word.should_merge?(candidate)
          if ruling_locations.any? { |loc| word.left < loc && candidate.left > loc }
            # don't merge words across a ruling: start a new word here
            word_index = index
          else
            word.merge!(candidate)
            text_elements[index] = nil
          end
          previous = candidate
        else
          # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
          # NOTE(review): `previous` is not advanced in this branch, so it can
          # lag behind the element immediately before `candidate`. Kept as-is
          # to preserve output; confirm whether that is intended.
          if previous.text != " " && candidate.text != " " && word.should_add_space?(candidate)
            word.text += " "
          end
          word_index = index
        end
      end

      text_elements.compact!
      text_elements
    end
  end

  ##
  # Deprecated. Kept as a module-level shortcut for
  # TableExtractor#group_by_columns.
  #
  # text_elements - passed straight to TableExtractor.new.
  # merge_words   - forwarded as the :merge_words option (off by default here).
  #
  # Returns whatever TableExtractor#group_by_columns returns.
  ##
  def Tabula.group_by_columns(text_elements, merge_words=false)
    extractor = TableExtractor.new(text_elements, :merge_words => merge_words)
    extractor.group_by_columns
  end

  ##
  # Deprecated. Kept as a module-level shortcut for
  # TableExtractor#get_line_boundaries, using the extractor's default options.
  #
  # Returns whatever TableExtractor#get_line_boundaries returns.
  ##
  def Tabula.get_line_boundaries(text_elements)
    extractor = TableExtractor.new(text_elements)
    extractor.get_line_boundaries
  end

  ##
  # Deprecated. Kept as a module-level shortcut for TableExtractor#get_columns.
  #
  # text_elements - passed straight to TableExtractor.new.
  # merge_words   - forwarded as the :merge_words option (on by default here).
  #
  # Returns whatever TableExtractor#get_columns returns.
  ##
  def Tabula.get_columns(text_elements, merge_words=true)
    extractor = TableExtractor.new(text_elements, :merge_words => merge_words)
    extractor.get_columns
  end

  ##
  # Deprecated. Kept as a module-level shortcut for TableExtractor#get_rows.
  #
  # text_elements - passed straight to TableExtractor.new.
  # merge_words   - forwarded as the :merge_words option (on by default here).
  #
  # Returns whatever TableExtractor#get_rows returns.
  ##
  def Tabula.get_rows(text_elements, merge_words=true)
    extractor = TableExtractor.new(text_elements, :merge_words => merge_words)
    extractor.get_rows
  end

  # Serialises +lines+ to a CSV string.
  #
  # lines - a collection of rows; each row is a collection of cells responding
  #         to +text+.
  #
  # Each row becomes one CSV record, and each field is the cell's text with
  # leading and trailing whitespace stripped.
  def Tabula.lines_to_csv(lines)
    CSV.generate do |out|
      lines.each do |cells|
        stripped = cells.map { |cell| cell.text.strip }
        out << stripped
      end
    end
  end

  # Matches a string made up entirely of whitespace (at least one character).
  # Anchored with \A and \z so the WHOLE string must be whitespace; `^` and `$`
  # match per line and would also accept a multi-line text that merely
  # contains one blank line.
  ONLY_SPACES_RE = /\A\s+\z/

  # Distributes +text_elements+ over Line objects.
  #
  # Elements whose text matches ONLY_SPACES_RE are skipped. Every other element
  # is appended to the first existing line for which
  # line.horizontal_overlap_ratio(element) is at least 0.01. If there is no
  # such line, a new Line is created for it.
  #
  # Returns the array of lines in order of creation.
  def Tabula.group_by_lines(text_elements)
    text_elements.each_with_object([]) do |te, lines|
      next if te.text =~ ONLY_SPACES_RE

      target = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
      unless target
        target = Line.new
        lines << target
      end
      target << te
    end
  end

  # Builds a table from +text_elements+ by inferring column separators from
  # the horizontal gaps between elements.
  #
  # text_elements - array of text elements. It is handed to TableExtractor.new,
  #                 which merges words in place unless options disable it.
  # options       - forwarded to TableExtractor.new (merged over
  #                 {:separators => []}).
  #
  # Returns an array of rows, each row being an array of text elements with
  # empty placeholder elements filling missing cells. Rows are ordered by the
  # largest +top+ among their cells. Returns [] when there are no text
  # elements, or none that contain anything but whitespace.
  def Tabula.make_table(text_elements, options={})
    default_options = {:separators => []}
    options = default_options.merge(options)

    return [] if text_elements.empty?

    # Instantiated only for its side effect: the constructor merges the
    # elements of +text_elements+ into words, in place.
    TableExtractor.new(text_elements, options)

    lines = group_by_lines(text_elements)
    # group_by_lines skips whitespace-only elements, so it can come back empty.
    return [] if lines.empty?

    top = lines[0].text_elements.map(&:top).min
    right = 0
    columns = []

    # Sweep left to right, tracking the right-most edge seen so far. Whenever
    # an element starts beyond that edge, the previous edge is recorded.
    text_elements.sort_by(&:left).each do |te|
      next if te.text =~ ONLY_SPACES_RE
      next unless te.top >= top

      if te.left > right
        columns << right
        right = te.right
      elsif te.right > right
        right = te.right
      end
    end

    # The first recorded edge is discarded. `drop` is used instead of
    # `columns[1..-1]` because the latter is nil when `columns` is empty.
    separators = columns.drop(1).sort.reverse

    table = Table.new(lines.count, separators)
    lines.each_with_index do |line, i|
      line.text_elements.each do |te|
        # separators are in descending order: find the first one to the left of te
        j = separators.find_index { |s| te.left > s } || separators.count
        table.add_text_element(te, i, separators.count - j)
      end
    end

    rows = table.lines.map do |l|
      # replace missing cells with empty text elements
      l.text_elements.map! do |te|
        te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
      end
    end

    rows.sort_by { |cells| cells.map { |te| te.top || 0 }.max }
  end


  # Builds table rows from +text_elements+, using row boundaries detected by a
  # TableExtractor and columns derived from options[:vertical_rulings].
  #
  # text_elements - array of text elements.
  # options       - forwarded to TableExtractor.new for row detection. Keys
  #                 read directly here:
  #                 :vertical_rulings - forwarded to the column-grouping
  #                                     extractor; a missing value is treated
  #                                     as no rulings.
  #                 :merge_words      - forwarded to the column-grouping extractor.
  #                 :dontmerge        - when truthy, elements of a row that share
  #                                     a column are NOT merged together.
  #
  # Returns an array with one entry per surviving row; each entry is that
  # row's text elements sorted by +left+.
  def Tabula.make_table_with_vertical_rulings(text_elements, options={})
    extractor = TableExtractor.new(text_elements, options)

    # group by lines
    lines = []
    line_boundaries = extractor.get_line_boundaries

    # find all the text elements
    # contained within each detected line (table row) boundary
    line_boundaries.each do |lb|
      line = Line.new

      line_members = text_elements.find_all do |te|
        te.vertically_overlaps?(lb)
      end

      # each element is assigned to at most one row
      text_elements -= line_members

      line_members.sort_by(&:left).each do |te|
        # skip text_elements that only contain spaces
        next if te.text =~ ONLY_SPACES_RE
        line << te
      end

      lines << line if line.text_elements.size > 0
    end

    lines.sort_by!(&:top)

    # A missing :vertical_rulings must not be forwarded as an explicit nil: it
    # would override the extractor's default and break column grouping.
    vertical_rulings = options[:vertical_rulings] || []
    columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)

    # insert an empty cell in a given column if there's no text elements within that column's boundaries
    lines.each do |l|
      next if l.text_elements.nil?
      l.text_elements.compact! # TODO WHY do I have to do this?
      l.text_elements.uniq!  # TODO WHY do I have to do this?
      l.text_elements.sort_by!(&:left)

      columns.each_with_index do |c, i|
        if (l.text_elements.select{|te| te && te.left >= c.left && te.right <= (c.left + c.width)}.empty?)
          l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
        end
      end
    end

    # merge elements that are in the same column
    unless options[:dontmerge]
      lines.each do |l|
        next if l.text_elements.nil?

        (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|  #don't remove a string of empty cells
          next if l.text_elements[t1].nil? or l.text_elements[t2].nil?  or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?

          # if same column...
          # NOTE(review): two elements that belong to no column at all also
          # compare equal here (nil == nil) and get merged; confirm that is intended.
          if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
            == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
            # the element with the smaller (or equal) bottom absorbs the other
            if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
              l.text_elements[t1].merge!(l.text_elements[t2])
              l.text_elements[t2] = nil
            else
              l.text_elements[t2].merge!(l.text_elements[t1])
              l.text_elements[t1] = nil
            end
          end
        end

        l.text_elements.compact!
      end
    end

    # remove duplicate lines
    # TODO this shouldn't have happened here, check why we have to do
    # this (maybe duplication is happening in the column merging phase?)
    (0..lines.size - 2).each do |i|
      next if lines[i].nil?
      # if any of the elements on the next line is duplicated, kill
      # the next line
      if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
        lines[i+1] = nil
      end
    end

    lines.compact.map do |line|
      line.text_elements.sort_by(&:left)
    end
  end
end