Sha256: d400ac593274a482413be2bc81668a9052ed535abdb424696e8c033406d51d29

Contents?: true

Size: 1.63 KB

Versions: 1

Compression:

Stored size: 1.63 KB

Contents

# Reads an ORC file into an array of Ruby hashes, one hash per row.
#
# The column order of the ORC file is matched positionally against the keys
# of +table_schema+; the schema values (:integer, :decimal, :float,
# :datetime, :time, :date, :string) select how each raw column value is
# converted to a Ruby object.
class OrcFileReader
  attr_reader :reader, :orc_options, :table_schema

  # @param table_schema [Hash] column name => data type symbol, in the same
  #   order as the columns stored in the ORC file
  # @param path [String] location of the ORC file to open
  def initialize(table_schema, path='orc_file.orc')
    @orc_options = OrcReaderOptions.new
    @table_schema = table_schema
    path = Path.new(path)
    @reader = OrcFile.createReader(path, @orc_options.orc)
  end

  # Converts the row at +row_index+ of +row_batch+ into a Hash.
  #
  # Columns whose schema type is not one of the handled symbols are left out
  # of the resulting hash.
  #
  # @param row_batch [Object] batch exposing +cols+ (one vector per column)
  # @param row_index [Integer] index of the row inside the batch
  # @return [Hash] column name => converted value
  def read_row(row_batch, row_index)
    orc_row = {}
    # Hoisted: the key list does not change while walking the columns.
    column_names = @table_schema.keys
    row_batch.cols.each_with_index do |column, index|
      column_name = column_names[index]
      data_type = @table_schema[column_name]
      case data_type
        when :integer
          orc_row[column_name] = column.vector[row_index]
        when :decimal
          orc_row[column_name] = column.vector[row_index].get_hive_decimal.to_s.to_d
        when :float
          # NOTE: the raw value is returned as stored, so a float such as
          # 0.0005 comes back as 0.0005000000237487257.
          orc_row[column_name] = column.vector[row_index]
        when :datetime
          # column.time holds milliseconds since the Unix epoch ('%Q').
          orc_row[column_name] = DateTime.strptime(column.time[row_index].to_s, '%Q').to_time.to_datetime
        when :time
          orc_row[column_name] = Time.strptime(column.time[row_index].to_s, '%Q')
        when :date
          # The stored value is a day offset from the Unix epoch.
          orc_row[column_name] = Date.new(1970,1,1) + column.vector[row_index]
        when :string
          orc_row[column_name] = column.toString(row_index)
      end
    end
    orc_row
  end

  # Reads every row of the ORC file.
  #
  # The file is consumed batch by batch: a batch only holds a bounded number
  # of rows, so each call to +next_batch+ refills it and only the first
  # +row_batch.size+ entries are valid afterwards. The record reader is
  # closed once all batches are consumed (or if reading fails).
  #
  # @return [Array<Hash>] one hash per row, in file order
  def read_from_orc
    rows = []
    row_batch = @reader.get_schema.createRowBatch
    record_reader = @reader.rows

    begin
      while record_reader.next_batch(row_batch)
        row_batch.size.times do |row_index|
          rows << read_row(row_batch, row_index)
        end
      end
    ensure
      record_reader.close
    end
    rows
  end

end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
orcfile-1.0.0 lib/orc_file_reader.rb