Sha256: 93e6283a97d3450d3f8c961f7a4b513a9b94486aa47b10ee24077ccca734af58
Contents?: true
Size: 1.53 KB
Versions: 3
Compression:
Stored size: 1.53 KB
Contents
require_relative 'dataset'

module Datasets
  # Dataset for the ITA corpus transcripts published at
  # https://github.com/mmorise/ita-corpus.
  #
  # Each line of the selected transcript file is split on its first ':' and
  # yielded as a Record with an id and a sentence.
  class ITACorpus < Dataset
    Record = Struct.new(:id, :sentence)

    # @param type [Symbol] which transcript to read: :emotion or :recitation.
    # @raise [ArgumentError] if type is neither :emotion nor :recitation.
    def initialize(type: :emotion)
      unless [:emotion, :recitation].include?(type)
        raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = 'ita-corpus'
      @metadata.name = 'ITA-corpus'
      @metadata.url = 'https://github.com/mmorise/ita-corpus'
      @metadata.licenses = ['Unlicense']
      # Wrapped in a lambda so the README is only fetched when the lambda
      # is actually called, not at construction time.
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields one Record per line of the "<type>_transcript_utf8.txt" file.
    #
    # The file is passed to the inherited `download` helper first
    # (NOTE(review): presumably a no-op when the file is already cached --
    # confirm against Dataset#download).
    #
    # @yieldparam record [Record]
    # @return [Enumerator] when no block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      data_path = cache_dir_path + "#{@type}_transcript_utf8.txt"
      data_url = "#{download_base_url}/#{@type}_transcript_utf8.txt"
      download(data_path, data_url)

      parse_data(data_path, &block)
    end

    private

    # Downloads README.md and returns the text that precedes the
    # "## ファイル構成" ("file layout") heading, with surrounding whitespace
    # stripped. If that heading is absent the whole README is returned.
    def fetch_readme
      readme_base_name = "README.md"
      readme_path = cache_dir_path + readme_base_name
      readme_url = "#{download_base_url}/#{readme_base_name}"
      download(readme_path, readme_url)
      # Read explicitly as UTF-8: the split pattern below contains non-ASCII
      # characters, so text tagged with another locale-derived encoding
      # would raise Encoding::CompatibilityError on match.
      readme_path.read(encoding: 'UTF-8').split(/^## ファイル構成/, 2)[0].strip
    end

    # Base URL for raw files on the main branch of the upstream repository.
    def download_base_url
      "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
    end

    # Reads data_path line by line and yields a Record for each line.
    #
    # The line is split on the first ':' only, so any further colons stay in
    # the sentence. A line without ':' yields a Record whose sentence is nil.
    def parse_data(data_path)
      # The transcript is UTF-8 (per its file name); do not depend on the
      # process default external encoding.
      File.open(data_path, 'r:UTF-8') do |f|
        f.each_line(chomp: true) do |line|
          id, sentence = line.split(':', 2)
          record = Record.new(id, sentence)
          yield(record)
        end
      end
    end
  end
end
Version data entries
3 entries across 3 versions & 1 rubygems
Version | Path |
---|---|
red-datasets-0.1.7 | lib/datasets/ita-corpus.rb |
red-datasets-0.1.6 | lib/datasets/ita-corpus.rb |
red-datasets-0.1.5 | lib/datasets/ita-corpus.rb |