Sha256: 297c54b0f9d8c28b82c787afd7364dacca676822a7369ca815295b67e83c7166

Contents?: true

Size: 1.42 KB

Versions: 3

Compression:

Stored size: 1.42 KB

Contents

require "csv"

require_relative "dataset"

module Datasets
  # Dataset of Quora question pairs, each flagged as duplicated or not.
  #
  # Instances enumerate {Record} values built from the rows of a
  # tab-separated file (see #open_data for where that file comes from).
  class QuoraDuplicateQuestionPair < Dataset
    # A single row of the data file.
    #
    # Members are populated positionally from the row's fields, so their
    # order here has to match the column order of the file.
    class Record < Struct.new(
      :id,
      :first_question_id,
      :second_question_id,
      :first_question,
      :second_question,
      :duplicated
    )
      # Predicate-style reader for the +duplicated+ member.
      alias_method :duplicated?, :duplicated
    end

    # Sets the dataset's descriptive metadata (id, name, URL and licenses).
    def initialize
      super()
      terms_of_service = {
        name: "Quora's Terms of Service",
        url: "https://www.quora.com/about/tos",
      }
      @metadata.id = "quora-duplicate-question-pair"
      @metadata.name = "Quora's duplicated question pair dataset"
      @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs"
      @metadata.licenses = [terms_of_service]
    end

    # Iterates over every row of the data file as a {Record}.
    #
    # @yieldparam record [Record] the record built from the current row
    # @return [Enumerator] when called without a block
    def each
      return to_enum(:each) unless block_given?

      open_data do |csv|
        csv.each do |row|
          # Replace the raw flag with a boolean in place, so that the
          # positional splat below carries true/false for +duplicated+.
          # NOTE(review): comparing against Integer 1 presumably relies on
          # the CSV converters configured in #open_data -- confirm.
          duplicated = (row["is_duplicate"] == 1)
          row["is_duplicate"] = duplicated
          yield(Record.new(*row.fields))
        end
      end
    end

    private

    # Calls +download+ with the cache path and the source URL, then opens
    # the file at that path as tab-separated data with a header row and
    # yields the CSV object to the block.
    #
    # NOTE(review): +converters: :all+ is applied to every column, so
    # question text that looks numeric or date-like is presumably converted
    # away from String as well -- confirm this is intended.
    def open_data
      file_name = "quora_duplicate_questions.tsv"
      data_path = cache_dir_path + file_name
      download(data_path, "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv")
      csv_options = {col_sep: "\t", headers: true, converters: :all}
      CSV.open(data_path, **csv_options) { |csv| yield(csv) }
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygem

Version Path
red-datasets-0.1.7 lib/datasets/quora-duplicate-question-pair.rb
red-datasets-0.1.6 lib/datasets/quora-duplicate-question-pair.rb
red-datasets-0.1.5 lib/datasets/quora-duplicate-question-pair.rb