#!/usr/bin/ruby
#
# dictionary.rb - this file contains the Dictionary class of the
# Ruby-LinkParser system.  This governs the parsing of the dictionary files
# into their respective data structures.
#
# == Synopsis
#
#   require "linkparser/dictionary"
#
#   dict_opts = Hash::new('')
#   dict_opts['dict'] = "tiny.dict"
#   dict_opts['affix'] = "4.0.affix"
#   dict_opts['datadir'] = "/usr/local/share/linkparser"
#   dict = LinkParser::Dictionary::new(dict_opts)
#   puts dict['word'].to_connectors
#
# == Rcsid
#
# $Id: dictionary.rb,v 1.16 2003/08/28 04:49:34 stillflame Exp $
#
# == Authors
#
# Martin Chase
#
#:include: COPYRIGHT
#
#---
#
# Please see the file COPYRIGHT for licensing details.
#

require "forwardable"
require "rbconfig"
#require "linkparser/log"
require "linkparser/utils"
require "linkparser/definition"

class LinkParser

    # Holds the word => definition tables read from the link-grammar
    # dictionary files (main dictionary, affix, post-processing knowledge and
    # constituent knowledge).
    class Dictionary

        # Directory searched for dictionary files when none is given.  Newer
        # interpreters only provide RbConfig; older ones only provide Config,
        # so use whichever constant is defined.
        rbconfig = defined?(::RbConfig) ? ::RbConfig : ::Config
        DefaultDataDir = rbconfig::CONFIG['datadir']

        # Lots of constants for the handling of the different dictionary files.
        Dict = 1
        Affix = 2
        Post = 3
        Constituent = 4
        DefaultDict = "4.0.dict"
        DefaultAffix = "4.0.affix"
        DefaultPost = "4.0.knowledge"
        DefaultConstituent = "4.0.constituent-knowledge"

        class << self

            # This parses a dictionary string for its words and their
            # definitions, returning a hash keyed by word with values being
            # LinkParser::Definition objects.
            #
            # dict    - the full text of a dictionary file
            # datadir - directory prefixed to word-list file names (entries
            #           whose "word" starts with a slash)
            #
            # Returns nil when the text contains no statements.  A statement
            # that lacks either its word list or its definition is reported
            # on $stderr and skipped.
            def read_dict( dict, datadir = DefaultDataDir )
                wordHash = {}   # Hash#[] is faster than BinarySearchTree#[]
                macros = []     # Array#each is faster than Hash#each

                # read the dictionary file into an array of words:definition
                # "statements", excluding comments.
                statements = dict.gsub(/\n+|\s*%(?!\").*?\n/, " ").
                    split(/\s*;(?!\")\s*/).compact
                return nil if statements.empty?

                statements.each {|statement|
                    words, definition = statement.split(/\s*:(?!\")\s*/)

                    # Validate before touching the definition: macro
                    # expansion below calls gsub! on it, and a macro stored
                    # with a nil body would break every later expansion.
                    unless words and definition
                        $stderr.print "dict error #{statement}"
                        # raise ParseError, "Dictionary outta whack: '#{statement}'"
                        next
                    end

                    # expand every macro defined so far inside this definition
                    macros.each {|macro|
                        definition.gsub!(macro[0], macro[1])
                    }

                    if words =~ /<.*>/
                        # a <name> entry defines a macro rather than a word
                        words.strip!
                        macros << [Regexp::new(words), definition]
                    else
                        if words =~ /^\// #/
                            # then it's a filename, not a word, and the file will
                            # contain a list of words.
                            # Log.info("Reading in words from %s." % datadir + words)
                            $stderr.print "Reading in words \n"
                            words = File.open(datadir + words) {|f| f.read(f.stat.size)}
                        end

                        # so now we have a bunch of words and their shared
                        # definition.  put each word into the hash with a value of
                        # the definition data structure.
                        words.gsub!(/"([^ ]+?)"/, '\1') # punctuation marks are in double-quotes
                        words = words.split(/\s+/)
                        definition = Definition::new(definition)
                        words.each {|word|
                            wordHash[word] = definition unless word.empty?
                        }
                    end
                }

                return wordHash
            end

            # This takes a filename of a dictionary, and reads it into the
            # word-keyed hash.
            #
            # dicttype - one of Dict, Affix, Post or Constituent; selects the
            #            default file name when dictname is nil
            # dictname - file name relative to datadir; nil means "use the
            #            default", an empty string means "do not load"
            # datadir  - directory containing the dictionary files
            #
            # Returns the hash built by read_dict, or nil when dictname is
            # empty.
            def open_read_dict( dicttype, dictname, datadir = DefaultDataDir )
                # an empty name explicitly disables this dictionary
                return nil if dictname and dictname.empty?

                filename = dictname || case dicttype
                                       when Dict
                                           DefaultDict
                                       when Affix
                                           DefaultAffix
                                       when Post
                                           DefaultPost
                                       when Constituent
                                           DefaultConstituent
                                       end

                # block form so the file handle is closed once it has been read
                contents = File.open( File.join(datadir, filename) ) {|f|
                    f.read(f.stat.size)
                }
                return read_dict(contents, datadir)
            end

        end # class << self

        extend Forwardable

        # Initializes a new Dictionary object. takes a hash as its argument.
        # entries:
        #   datadir - the directory where the dictionary files are located
        #   dict - the main dictionary file
        #   affix - the affix dictionary file
        #   knowledge - the post-processing dictionary file
        #   constituent-knowledge - the constituent knowledge dictionary file
        #
        # setting a value to an empty string prevents it from being used, which
        # will work out fine for all but the datadir and the main dict. a value
        # set to nil will mean to use the default setting.
        def initialize( dict_opts )
            @datadir = dict_opts['datadir'] || DefaultDataDir
            @dict = Dictionary::open_read_dict( Dict, dict_opts['dict'], @datadir )
            @affix = Dictionary::open_read_dict( Affix, dict_opts['affix'], @datadir )
            @post = Dictionary::open_read_dict( Post, dict_opts['knowledge'], @datadir )
            @constituent = Dictionary::open_read_dict( Constituent,
                                                       dict_opts['constituent-knowledge'],
                                                       @datadir )
        end

        # the directory that the dictionary files are located
        attr_reader :datadir

        # the main dictionary hash
        attr_reader :dict
        def_delegators :@dict, *(Hash.instance_methods(false))

        # does affix processing on the words, which is just separating
        # conjunctions and punctuation from the words they are next to.
        #
        # words - an array of word strings
        #
        # Returns the array with every affix entry split off into its own
        # element, or the array unchanged when no affix dictionary is loaded.
        def affix( words )
            return words unless @affix

            @affix.each {|punct,move|
                # the inspected definition and both patterns depend only on
                # the affix entry, so build them once per entry
                kind = move.inspect
                strips_right = /RPUNC/.match(kind)
                strips_left = /LPUNC/.match(kind)
                right_re = /(.*)(#{Regexp.escape(punct)}.*)$/
                left_re = /^(#{Regexp.escape(punct)})(.*)/

                words = words.inject([]) {|arr,ele|
                    if strips_right && (found = right_re.match(ele))
                        arr << found[1] << found[2]
                    elsif strips_left && (found = left_re.match(ele))
                        arr << found[1] << found[2]
                    else
                        arr << ele
                    end
                }
            }
            words
        end

        # the post-processing dictionary hash
        attr_reader :post

        # the constituent-knowledge dictionary hash
        attr_reader :constituent

    end # class Dictionary
end # class LinkParser

# Self-test: parse a small inline dictionary and print each entry.
if $0 == __FILE__
    # require 'profile'
    # p LinkParser::Dictionary::read_dict("/home/stillflame/src/system-4.1/link-4.1/data/tiny.dict")["dog"]

    # NOTE(review): the entries below with nothing before the ':' look like
    # macro definitions whose <name> part is missing, and the line breaks
    # after the '%' comments are assumed -- confirm against the upstream file.
    s = <<-DICT
    :(A- and {Bb+ or @C+});
    :(Aa- & E+);
    bar:(A- or A-) and A-;
    foo:( ((A- and E-) and {Bb+ or (@C+ and De-)});% and (E+) );
    word: %comment - miaow
    () | ( & [@Dd- or De-]);
    meow:(A+ and {B- or C-});
    boo:[[[()]]];
    am:((Sp- or (RS- & Bp-) or ({Q-} & SIp+)) & (((O+ or B-) & {@MV+}) or P+ or AF-));
    DICT

    LinkParser::Dictionary::read_dict(s).each {|k,v|
        puts "#{k} = #{(v).to_connectors}"
    }
end