# encoding: utf-8
require 'jldrill/model/DataFile'
require 'jldrill/model/VocabularyUsage.rb'
require 'jldrill/model/ExampleSentence.rb'

module JLDrill::Tatoeba

    # Index over a tab-separated sentence file.  Every parsed line that
    # starts with a numeric id followed by a tab is remembered by its
    # position in @lines, so the raw line (dataAt) or just its last
    # tab-separated field (sentenceAt) can be fetched by id afterwards.
    class SentenceFile < JLDrill::DataFile
        INDEX_RE = /^(\d*)[\t]/
        SENTENCE_RE = /^(\d*)[\t](.*)[\t](.*)/

        def initialize()
            super
            # Sparse array mapping sentence id => position of that
            # sentence's line in @lines.
            @sentences = []
            # NOTE(review): presumably the number of entries DataFile
            # parses per step -- confirm against JLDrill::DataFile.
            @stepSize = 1000
        end

        # Size of the id table.  Because the table is indexed by sentence
        # id, this is one more than the largest id stored so far.
        def dataSize
            @sentences.length
        end

        # Examine the line at @parsed; when it begins with an id, record
        # where that id lives.  Always advances @parsed by one.
        def parseEntry
            found = INDEX_RE.match(@lines[@parsed])
            @sentences[found[1].to_i] = @parsed unless found.nil?
            @parsed += 1
        end

        # Only mark the file as loaded.  @lines is deliberately kept
        # because sentenceAt and dataAt read from it later.
        def finishParsing
            setLoaded(true)
        end

        # Returns the third captured field (the text after the last tab)
        # of the line stored for the given id, or "" when the id is
        # unknown or the line does not match SENTENCE_RE.
        def sentenceAt(index)
            lineNumber = @sentences[index]
            return "" if lineNumber.nil?
            fields = SENTENCE_RE.match(@lines[lineNumber])
            fields ? fields[3] : ""
        end

        # Returns the complete raw line stored for the given id, or ""
        # when no line was recorded for it.
        def dataAt(index)
            lineNumber = @sentences[index]
            lineNumber.nil? ? "" : @lines[lineNumber]
        end
    end

    # Table of links between ids.  Each parsed line holds two
    # tab-separated numbers; the second is appended to the list kept
    # for the first.
    class LinkFile < JLDrill::DataFile
        LINK_RE = /^(\d*)[\t](\d*)/

        def initialize()
            super
            # Sparse array mapping id => array of linked ids.
            @links = []
            # NOTE(review): presumably the number of entries DataFile
            # parses per step -- confirm against JLDrill::DataFile.
            @stepSize = 1000
        end

        # Size of the link table (indexed by id, so one more than the
        # largest left-hand id stored so far).
        def dataSize
            @links.length
        end

        # Record the link on the line at @parsed, if the line matches
        # LINK_RE.  Always advances @parsed by one.
        def parseEntry
            link = LINK_RE.match(@lines[@parsed])
            unless link.nil?
                source = link[1].to_i
                @links[source] = [] if @links[source].nil?
                @links[source] << link[2].to_i
            end
            @parsed += 1
        end

        # Only mark the file as loaded; @lines is deliberately kept.
        def finishParsing
            setLoaded(true)
        end

        # Returns the array of ids linked from the given id, or a new
        # empty array when none were recorded.
        def getLinksTo(index)
            @links[index] || []
        end
    end

    # An example sentence pair from the Tatoeba database.  Only the two
    # sentence ids are stored; the text is looked up in the sentence
    # store each time it is requested.
    class TatoebaExample < JLDrill::ExampleSentence

        INDEX_RE = /^(\d*)[\t](\d*)[\t](.*)/

        # targetIndex: id of the sentence in the language being studied
        # nativeIndex: id of the translated sentence
        # key:         stored in @key; not used by the methods below
        # sentences:   object answering sentenceAt(id)
        # Note that the superclass initializer is not invoked here.
        def initialize(targetIndex, nativeIndex, key, sentences)
            @key = key
            @targetIndex = targetIndex
            @nativeIndex = nativeIndex
            @sentences = sentences
        end

        # Returns "<native id>: <native sentence text>".
        def nativeLanguage()
            text = @sentences.sentenceAt(@nativeIndex)
            "#{@nativeIndex}: #{text}"
        end

        # Returns "<target id>: <target sentence text>".
        def targetLanguage()
            text = @sentences.sentenceAt(@targetIndex)
            "#{@targetIndex}: #{text}"
        end
    end

    # Index of Chinese sentences that have an English translation.
    # Each parsed line is a link (two tab-separated sentence ids).  The
    # supplied sentence store is consulted to find the language of each
    # side, and matching pairs are recorded in two parallel arrays.
    class ChineseIndexFile < JLDrill::DataFile

        LINK_RE = /^(\d*)[\t](\d*)/
        CHINESE_INDEX_RE = /^(\d*)[\t]cmn/
        ENGLISH_INDEX_RE = /^(\d*)[\t]eng/

        # sentences: object answering dataAt(id) and sentenceAt(id)
        def initialize(sentences)
            super()
            @sentences = sentences
            # Parallel arrays: @englishIndeces[i] is the translation of
            # @chineseIndeces[i].
            @chineseIndeces = []
            @englishIndeces = []
            @stepSize = 10000
            # Left-hand id whose remaining link lines should be skipped.
            @ruledOut = 0
        end

        # Process the link on the line at @parsed, then advance @parsed.
        def parseEntry
            if LINK_RE.match(@lines[@parsed])
                cindex = $1.to_i
                eindex = $2.to_i
                # We are only interested in Chinese sentences.  We
                # first check the index on the left hand side.  If it is
                # not Chinese, we ignore all the rest of the entries
                # with the same index (they are in order of the left hand
                # side so we just have to keep track of the last one).
                # If it is Chinese, we keep checking the right hand entry
                # until we find English.  Then we ignore all the rest
                # of the entries with that index.
                if cindex != @ruledOut
                    if CHINESE_INDEX_RE.match(@sentences.dataAt(cindex))
                        # The right hand side is only looked up once the
                        # left hand side is known to be Chinese.
                        if ENGLISH_INDEX_RE.match(@sentences.dataAt(eindex))
                            @chineseIndeces.push(cindex)
                            @englishIndeces.push(eindex)
                            # We've found the English for this Chinese
                            # sentence, so don't process the following
                            # ones with the same index
                            @ruledOut = cindex
                        end
                    else
                        # It's not a Chinese sentence, so don't process
                        # the following ones with the same index
                        @ruledOut = cindex
                    end
                end
            end
            @parsed += 1
        end

        # Number of Chinese/English pairs recorded so far.
        def dataSize
            @chineseIndeces.size
        end

        # Don't erase @lines because we need them later
        def finishParsing
            setLoaded(true)
        end

        # Return an array of positions in the chineseIndeces for which
        # the respective sentence contains the given kanji.  The kanji
        # is compared as literal text (not as a regular expression), so
        # characters such as "?" or "(" in it are matched verbatim.
        def getPositions(kanji)
            @chineseIndeces.each_index.select do |i|
                @sentences.sentenceAt(@chineseIndeces[i]).include?(kanji)
            end
        end

        # Returns an array of TatoebaExample objects, one for every
        # recorded Chinese sentence containing kanji.  The reading
        # argument is accepted but not used by this index.
        def search(kanji, reading)
            getPositions(kanji).map do |i|
                cindex = @chineseIndeces[i]
                eindex = @englishIndeces[i]
                usage = JLDrill::VocabularyUsage.from_B_line(kanji)
                TatoebaExample.new(cindex, eindex, usage, @sentences)
            end
        end
    end

    # Index of Japanese example sentences.  Each parsed line holds two
    # numeric ids and a trailing "B line" field, separated by tabs.  The
    # B line of every matching line is registered in a
    # VocabularyUsage::Map together with the line's position.
    class JapaneseIndexFile < JLDrill::DataFile

        INDEX_RE = /^(\d*)[\t](\d*)[\t](.*)/

        attr_reader :sentences

        # sentences: sentence store handed on to each TatoebaExample
        def initialize(sentences)
            super()
            @sentences = sentences
            @numSentences = 0
            @usageMap = JLDrill::VocabularyUsage::Map.new
            @stepSize = 1000
        end

        # Register the line at @parsed in the usage map when it matches
        # INDEX_RE.  Always advances @parsed by one.
        def parseEntry
            entry = INDEX_RE.match(@lines[@parsed])
            unless entry.nil?
                @numSentences += 1
                @usageMap.add_B_line(entry[3], @parsed)
            end
            @parsed += 1
        end

        # Number of lines that matched INDEX_RE so far.
        def dataSize
            @numSentences
        end

        # Returns the first space-separated item of b_line that begins
        # with usageHash, or "" when no item does.
        def findUsageData(usageHash, b_line)
            b_line.split(" ").find { |item| item.start_with?(usageHash) } || ""
        end

        # Splits the line at position pos into its three fields: the
        # two ids as integers and the remaining text.  Returns
        # 0, 0, "" when the line does not match INDEX_RE.
        def parseDataOnLine(pos)
            fields = INDEX_RE.match(@lines[pos])
            return [0, 0, ""] if fields.nil?
            [fields[1].to_i, fields[2].to_i, fields[3]]
        end

        # Looks kanji/reading up in the usage map and returns an array
        # with one TatoebaExample per position the map reports.
        def search(kanji, reading)
            examples = []
            found = @usageMap.search(kanji, reading)
            found.positions.each do |linePos|
                targetId, nativeId, b_line = parseDataOnLine(linePos)
                matched = findUsageData(found.successfulHash, b_line)
                examples << TatoebaExample.new(
                    targetId, nativeId,
                    JLDrill::VocabularyUsage.from_B_line(matched),
                    @sentences)
            end
            examples
        end

        # Only mark the file as loaded.  @lines is deliberately kept
        # because parseDataOnLine reads from it during searches.
        def finishParsing
            setLoaded(true)
        end
    end

    # The Tatoeba database: one shared sentence store plus a Japanese
    # and a Chinese index built on top of it.  Which index answers a
    # request is decided by options.language.
    class Database
        attr_reader :sentences, :japaneseIndeces, :chineseIndeces

        def initialize()
            @sentences = SentenceFile.new
            @japaneseIndeces = JapaneseIndexFile.new(@sentences)
            @chineseIndeces = ChineseIndexFile.new(@sentences)
        end

        # Returns the Chinese index when options.language is "Chinese";
        # every other value selects the Japanese index.
        def indeces(options)
            return @chineseIndeces if options.language.eql?("Chinese")
            @japaneseIndeces
        end

        # True when the index selected by options reports it is loaded.
        def loaded?(options)
            indeces(options).loaded?
        end

        # Delegates the search to the index selected by options and
        # returns whatever that index returns.
        def search(kanji, reading, options)
            selected = indeces(options)
            selected.search(kanji, reading)
        end
    end
end