# encoding: utf-8

module JLDrill

    # Describes how a piece of vocabulary has been used in a sentence.
    # It includes the kanji, the reading, the sense in which the vocabulary
    # was used, the grammatical form that was used and whether
    # the usage has been checked as being accurate.
    # This class also includes input and output routines for
    # the Tanaka "B" lines.
    #
    # A single usage in a "B" line looks like:
    #
    #     kanji(reading)[sense]{usedForm}~
    #
    # where every part after the kanji is optional.
    class VocabularyUsage
        attr_accessor :kanji, :reading, :sense, :usedForm, :checked

        # Parses one usage token. Capture groups:
        #   1 = kanji, 3 = reading, 5 = sense, 7 = used form, 8 = "~" marker.
        B_LINE_RE = /^([^(\[{~]*)(\(([^)]*)\))?(\[([^\]]*)\])?(\{([^}]*)\})?(~)?/u
        # Captures only the leading "kanji" or "kanji(reading)" part of a
        # usage token (group 1). That text is used as the lookup key.
        HASH_RE = /([^{(\[~]*(\([^)]*\))?)/u

        # Map of VocabularyUsages to file positions that can be searched by
        # kanji and reading quickly. This is used to store the location
        # in the example dictionary where each vocabulary is used.
        class Map

            # Result of searching a usage Map.
            # successfulHash is the hash key that was used for the final
            # lookup. positions is an array of positions in a file; it is
            # empty (never nil) when nothing was found.
            class SearchResult
                attr_reader :successfulHash, :positions

                def initialize(successfulHash, positions)
                    @successfulHash = successfulHash
                    @positions = positions.nil? ? [] : positions
                end
            end

            def initialize
                # Hash key ("kanji" or "kanji(reading)") => array of positions
                @usages = {}
            end

            # Add a vocabulary usage corresponding to Tanaka "B" line data
            # and map it to the position, pos. Usage data that yields an
            # empty hash key is ignored.
            def add(usageData, pos)
                hash = VocabularyUsage.hashFrom_B_line(usageData)
                return if hash.empty?
                (@usages[hash] ||= []).push(pos)
            end

            # Take an entire Tanaka "B" line for an example sentence
            # and add each space separated usage in it to the map
            # with position, pos.
            def add_B_line(b_line, pos)
                b_line.split(' ').each do |usageData|
                    add(usageData, pos)
                end
            end

            # Search for VocabularyUsages which have the given kanji and
            # reading. If kanji is nil, reading will be used for the kanji
            # (useful for vocabulary without kanji). This will first search
            # for entries with both kanji and reading specified (to
            # disambiguate entries with the same kanji and different
            # readings). If nothing is found, it will search for entries
            # with just the kanji specified.
            #
            # Returns a SearchResult holding the last hash key tried and
            # the positions stored under it.
            def search(kanji, reading)
                hash = VocabularyUsage.hashFromStrings(kanji, reading)
                positions = @usages[hash]
                if positions.nil?
                    # The corpus only uses readings to disambiguate
                    # kanji. Most usages don't have readings. So
                    # if we don't find anything, search again without
                    # the reading.
                    hash = VocabularyUsage.hashFromStrings(kanji, nil)
                    # Look up by the hash that is reported in the result
                    # so that the two can never disagree.
                    positions = @usages[hash]
                end
                SearchResult.new(hash, positions)
            end
        end

        def initialize
            @kanji = ""
            @reading = ""
            @sense = 0
            @usedForm = ""
            @checked = false
        end

        # Create a VocabularyUsage from data taken from a Tanaka "B" line.
        # Note: This is not the whole line. Just the data for a single
        # vocabulary.
        #
        # Parts that are missing from the data are set to nil (reading,
        # usedForm), 0 (sense) or false (checked). If the data does not
        # match at all, a default constructed VocabularyUsage is returned.
        def self.from_B_line(data)
            usage = VocabularyUsage.new
            match = B_LINE_RE.match(data)
            return usage if match.nil?

            usage.kanji = match[1]
            usage.reading = match[3]
            # nil.to_i is 0, so a missing sense becomes 0
            usage.sense = match[5].to_i
            usage.usedForm = match[7]
            usage.checked = (match[8] == "~")
            usage
        end

        # Create a hash that can be used in a hash table for searching
        # for vocabulary usages. This hash is generated from a Tanaka
        # "B" line. It is composed of the kanji followed by the
        # reading, enclosed in parentheses, if the reading is present
        # in the data. Returns "" if the data does not match.
        def self.hashFrom_B_line(data)
            match = HASH_RE.match(data)
            match.nil? ? "" : match[1]
        end

        # Create a hash that can be used in a hash table for searching
        # for vocabulary usages. This is generated from kanji and reading
        # strings. Either can be nil. If the kanji is nil, then the reading
        # is used for the kanji (for vocabulary without kanji). The reading
        # should normally be nil, unless the reading from the kanji is
        # ambiguous.
        def self.hashFromStrings(kanji, reading)
            return kanji if reading.nil?
            return reading if kanji.nil?
            "#{kanji}(#{reading})"
        end

        # Output the vocabulary usage in the same form as used
        # by the B lines in the Tanaka Corpus:
        #
        #     kanji(reading)[sense]{usedForm}~
        #
        # The reading and used form are omitted when they are nil or
        # empty, the sense is omitted when it is 0 and the "~" is only
        # written when the usage is checked.
        def to_B_line
            # dup so that appending never modifies @kanji itself
            line = @kanji.to_s.dup
            line << "(#{@reading})" unless @reading.to_s.empty?
            line << "[#{@sense}]" if @sense != 0
            # The used form is stored in @usedForm (see from_B_line)
            line << "{#{@usedForm}}" unless @usedForm.to_s.empty?
            line << "~" if @checked
            line
        end

        # Output a string form of the VocabularyUsage.
        # Currently this is just the Tanaka "B" line data
        # for the vocabulary usage.
        def to_s
            to_B_line
        end
    end
end