#!/usr/bin/env ruby

# Export a bag-of-words (BoW) representation of Groonga "Entries" documents.
#
# Outputs:
#   * metadata_output_path — one JSON line: {"n_documents": ..., "n_features": ...}
#   * data_output_path     — an Arrow IPC stream; one record batch per document
#                            with columns (term_id: uint32, score: double)
#
# Usage:
#   bow_export.rb DB_PATH METADATA_JSON_PATH ARROW_DATA_PATH [tf] [raw]
#     ARGV[3] == "tf"  -> use raw term frequency (any other value => TF-IDF)
#     ARGV[4] == "raw" -> disable both the "@todo" document filter and the
#                         too-frequent / too-rare term pruning

require "groonga"
require "arrow"
require "json" # explicit: Hash#to_json is used for the metadata file

db_path = ARGV[0]
metadata_output_path = ARGV[1]
data_output_path = ARGV[2]
use_tfidf = (ARGV[3] != "tf")
use_filter = (ARGV[4] != "raw")

Groonga::Database.open(db_path)

# Ensure the lexicon and the full-text index on Entries.document exist.
Groonga::Schema.define do |schema|
  schema.create_table("Words",
                      :type => :patricia_trie,
                      :key_type => "ShortText",
                      :default_tokenizer => "TokenMecab",
                      :normalizer => "NormalizerAuto") do |table|
    table.index("Entries.document")
  end
end

entries = Groonga["Entries"]
total_n_entries = entries.size

# Restrict to version 2.4.0 entries; with filtering enabled, also exclude
# documents containing "@todo" (`-` is Groonga's set-difference / AND-NOT).
target_entries = entries.select do |record|
  if use_filter
    (record.version == "2.4.0") - (record.document =~ "@todo")
  else
    record.version == "2.4.0"
  end
end
n_entries = target_entries.size

# Prune terms that occur in >= 90% or <= 1% of the target entries.
too_many_threshold = n_entries * 0.9
too_less_threshold = n_entries * 0.01

bow = {}
index = Groonga["Words.Entries_document"]
max_term_id = 0
index.table.open_cursor(:order_by => :id) do |table_cursor|
  table_cursor.each do |term|
    n_match_documents = index.estimate_size(term)
    # p [term.key, n_match_documents, (n_match_documents / n_entries.to_f)]
    if use_filter
      if n_match_documents <= too_less_threshold
        p [:skip, :too_less, term.key, n_match_documents]
        next
      end
      if n_match_documents >= too_many_threshold
        p [:skip, :too_many, term.key, n_match_documents]
        next
      end
    end
    max_term_id = [max_term_id, term.id].max
    # Inverse document frequency: log(N / document frequency).
    idf = Math.log(total_n_entries.to_f / n_match_documents)
    index.open_cursor(term.id, :with_position => false) do |index_cursor|
      index_cursor.each(:reuse_posting_object => true) do |posting|
        next if target_entries[posting.record_id].nil?
        bow[posting.record_id] ||= []
        score =
          if use_tfidf
            # TF-IDF weights term frequency BY the IDF (multiplication).
            # The original divided by IDF, which inverted the weighting
            # (rare, informative terms were scored LOWER, not higher).
            posting.term_frequency * idf
          else
            posting.term_frequency
          end
        bow[posting.record_id] << [posting.term_id, score]
      end
    end
  end
end

File.open(metadata_output_path, "w") do |metadata_file|
  metadata_file.puts({
    "n_documents" => bow.size,
    # Term IDs are dense and start at 1, so the max ID is the feature count.
    "n_features" => max_term_id,
  }.to_json)
end

# Stream one record batch of (term_id, score) pairs per document.
Arrow::IO::FileOutputStream.open(data_output_path, false) do |output_stream|
  term_id_field = Arrow::Field.new("term_id", :uint32)
  score_field = Arrow::Field.new("score", :double)
  schema = Arrow::Schema.new([term_id_field, score_field])
  Arrow::IPC::StreamWriter.open(output_stream, schema) do |writer|
    bow.each do |record_id, words|
      term_ids = Arrow::UInt32Array.new(words.collect(&:first))
      scores = Arrow::DoubleArray.new(words.collect(&:last))
      record_batch = Arrow::RecordBatch.new(schema, words.size,
                                            [term_ids, scores])
      writer.write_record_batch(record_batch)
    end
  end
end