require 'jldrill/model/DataFile' module JLDrill::Tanaka # Represents one of the words stored in the Tanaka library class Word attr_reader :contents def initialize(contents) @contents = contents end def Word.create(kanji, reading) contents = kanji if !reading.nil? contents += "(#{reading})" end return Word.new(contents) end def to_s @contents end def eql?(word) return @contents.eql?(word.contents) end def hash @contents.hash end end class Sentence RE = /^A: ([^\t]*)\t(.*)#ID=(.*)$/u WORD_RE = /^([^(\[{~]*)(\(([^)]*)\))?(\[([^\]]*)\])?(\{([^}]*)\})?(~)?/u attr_reader :kanji, :reading, :sense, :actual, :checked def initialize(data, wordData) @data = data @wordData = wordData parseWordData end def parseWordData if WORD_RE.match(@wordData) @kanji = $1 @reading = $3 if !$5.nil? @sense = $5.to_i else @sense = 0 end @actual = $7 @checked = $8.eql?("~") end end def english retVal = "" if RE.match(@data) retVal = $2 end return retVal end def japanese retVal = "" if RE.match(@data) retVal = $1 end return retVal end def id retVal = "" if RE.match(@data) retVal = $3.to_i end return retVal end def word_to_s retVal = @kanji.to_s if !@reading.nil? retVal += "(#{@reading})" end if @sense != 0 retVal += "[#{@sense.to_s}]" end if !@actual.nil? retVal += "{#{@actual.to_s}}" end if @checked retVal += "~" end return retVal end def to_s return "#{self.id}: " + word_to_s + "\n\t#{self.japanese}\n\t#{self.english}" end end # Represents the results of searching the Tanaka reference library # It is composed of a list of sentences. class SearchResults attr_reader :sentences, :connections attr_writer :sentences, :connections def initialize(word, connections, sentences) @word = word @sentences = sentences @connections = connections end def getSentences retVal = [] if !@connections.nil? wordData = getWordData @connections.each_with_index do |connection, i| retVal.push(Sentence.new(@sentences[connection], wordData[i])) end end return retVal end def findWord(connection) connection.split(" ").each do |word| if word.start_with?(@word) return word end end return "" end def getWordData wordData = [] @connections.each_with_index do |connection, i| wordData.push(findWord(@sentences[connection + 1])) end return wordData end end # Represents the Tanaka reference library class Reference < JLDrill::DataFile attr_reader :words attr_writer :words A_RE = /^A:/ B_RE = /^B: (.*)/ WORD_RE = /([^{(\[~]*(\([^)]*\))?)/u def initialize() super @sentences = 0 @words = {} @stepSize = 1000 end def numSentences dataSize end def numWords return @words.keys.size end def addWord(word, pos) if WORD_RE.match(word) (@words[$1] ||= []).push(pos) end end def parseLines(aLine, bLine, pos) success = false if A_RE.match(aLine) if B_RE.match(bLine) @sentences += 1 w = $1.split(' ') w.each do |word| addWord(word, pos) end success = true end end return success end def dataSize @sentences end def parseEntry if parseLines(@lines[@parsed], @lines[@parsed + 1], @parsed) @parsed += 2 # As long as a single line gets parsed it is a success else @parsed += 1 end end def search(kanji, reading) word = nil if !kanji.nil? word = Word.create(kanji, reading).to_s connections = @words[word] if connections.nil? # The corpus only uses readings to disambiguate # kanji. Most words don't have readings. So # if we don't find anything, search again without # the reading. word = Word.create(kanji, nil).to_s connections = @words[word] end else # When there is no kanji, use the reading as the kanji word = Word.create(reading, nil).to_s connections = @words[word] end return SearchResults.new(word, connections, @lines).getSentences end # Don't erase @lines because we need them later def finishParsing setLoaded(true) end end end