Sha256: b0f6794fd8ec5611baaa7bb47ea99da5780427f3505977c77f351f3461c07670

Contents?: true

Size: 1.33 KB

Versions: 2

Compression:

Stored size: 1.33 KB

Contents

# -*- coding: utf-8 -*-
require 'csv'

# Line-oriented parser for CaboCha-style dependency output, including the
# "#! TYPE ..." annotation lines handled by parse_excab.
#
# Every method is a module function, so it can be called as
# CabochaParser.parse(line) or mixed in with `include CabochaParser`.
module CabochaParser
	module_function

	# Classify one line of input and dispatch it to the matching parser.
	#
	# A leading '#' or '*' only marks an annotation or chunk header when it is
	# NOT directly followed by a tab. "<surface>\t<features>" is the token
	# layout, so "*\t..." and "#\t..." are tokens whose surface form happens to
	# be '*' or '#', and they must reach parse_token.
	#
	# @param line [String] a single line, with or without its trailing newline
	# @return [Hash, nil] a hash whose :type is 'EOS', 'CHUNK', 'TOKEN' or an
	#   annotation type; nil for a blank line or an unrecognised annotation
	def parse(line)
		case line.chomp
		when /\A#(?!\t)/
			parse_excab(line)
		when /\A\*(?!\t)/
			parse_chunk(line)
		when 'EOS'
			{type: 'EOS'}
		when ''
			nil
		else
			parse_token(line)
		end
	end

	# Parse a space-separated annotation line such as
	#   #! SEGMENT_S name 0 3 "comment"
	# Fields may be double-quoted; backslash-escaped quotes inside a field are
	# accepted.
	#
	# @param line [String] the annotation line
	# @return [Hash, nil] the parsed annotation, or nil when the second field is
	#   not one of the recognised types
	def parse_excab(line)
		# CSV escapes a quote by doubling it, so rewrite \" as "" before parsing.
		csv_ready = line.gsub('\"', '""').chomp
		_marker, type, *data = CSV.parse_line(csv_ready, col_sep: ' ')
		case type
		when 'SEGMENT', 'SEGMENT_S', 'LINK', 'LINK_S'
			{type: type, name: data[0], start: data[1].to_i, end: data[2].to_i, comment: data[3]}
		when 'GROUP', 'GROUP_S'
			# Everything between the name and the trailing comment is a member.
			{type: type, name: data[0], member: data[1..-2], comment: data[-1]}
		when 'ATTR'
			{type: type, name: data[0], value: data[1]}
		end
	end

	# Parse a chunk header line of the form "* <id> <link><rel> <head>/<func> <score>",
	# e.g. "* 0 1D 0/1 1.5".
	#
	# All values are returned as strings exactly as they appear in the line.
	# A line with fewer than four whitespace-separated fields raises
	# NoMethodError, because the missing fields are nil.
	#
	# @param line [String] the chunk header line
	# @return [Hash] {type: 'CHUNK', id:, link:, rel:, head:, func:, score:}
	def parse_chunk(line)
		_star, id, dep, part, score = line.chomp.split(' ')
		# The dependency field is the target id followed by a one-character
		# relation label, e.g. "-1D" -> link "-1", rel "D".
		link = dep[0..-2]
		rel = dep[-1]
		head, func = part.split('/')
		{type: 'CHUNK', id: id, link: link, rel: rel, head: head, func: func, score: score}
	end

	# Parse a token line of the form "<surface>\t<features>[\t<ne>]", where
	# <features> is a comma-separated list.
	#
	# The first four features are joined with '-' to form :pos, skipping empty
	# ones; the fifth and sixth become :ctype and :cform. A missing or empty
	# feature column yields an empty :pos and nil :ctype / :cform.
	#
	# @param line [String] the token line
	# @return [Hash] {type: 'TOKEN', text:, ne:, pos:, ctype:, cform:}
	def parse_token(line)
		text, features, ne = line.chomp.split("\t")
		# CSV.parse_line returns nil for an empty string, hence the fallback.
		attrs = CSV.parse_line(features.to_s, col_sep: ',') || []
		# Empty unquoted CSV fields come back as nil and empty quoted ones as "",
		# so both have to be skipped.
		pos = attrs[0, 4].reject { |item| item.nil? || item.empty? }.join('-')
		{type: 'TOKEN', text: text, ne: ne, pos: pos, ctype: attrs[4], cform: attrs[5]}
	end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
kokugo_tagger-0.0.6 lib/kokugo_tagger/parser.rb
kokugo_tagger-0.0.5 lib/kokugo_tagger/parser.rb