#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

#==========================
# string_parser.rb
#==========================
#
# Parses a phrase into leafs and nodes and store the result in an element list
# (see element_list.rb)
#
# This file is part of RSyntaxTree, which is a ruby port of Andre Eisenbach's
# excellent program phpSyntaxTree.
#
# Copyright (c) 2007-2021 Yoichiro Hasebe
# Copyright (c) 2003-2004 Andre Eisenbach

require 'elementlist'
require 'element'

# def escape_high_ascii(string)
#   html = ""
#   string.length.times do |i|
#     ch = string[i]
#     if(ch < 127)
#       html += ch.chr
#     else
#       html += sprintf("&#%d;", ch)
#     end
#   end
#   html
# end

# Turns a bracketed phrase such as "[S [NP foo] [VP bar]]" into Element
# objects stored in an ElementList.
#
# Typical use: build the parser, check +valid?+, call +parse+, then read the
# result with +get_elementlist+.
class StringParser
  attr_accessor :data, :elist, :pos, :id, :level, :tncnt

  # Normalizes the phrase and resets all parser state.
  #
  # @param str the phrase to parse; anything that cannot be cleaned with
  #   +gsub+ (for example +nil+) is treated as an empty phrase
  def initialize(str)
    # Clean up the data a little to make processing easier.
    # Explicit form of the former `rescue ""` modifier: any StandardError
    # raised while removing tabs results in an empty phrase.
    string = begin
      str.gsub(/\t/, "")
    rescue StandardError
      ""
    end
    string.gsub!(/\s+/, " ")     # collapse every whitespace run to one space
    string.gsub!(/\] \[/, "][")  # drop the space between sibling brackets
    string.gsub!(/ \[/, "[")     # drop the space before an opening bracket

    @data  = string              # Store it for later...
    @elist = ElementList.new     # Initialize internal element list
    @pos   = 0                   # Position in the sentence
    @id    = 1                   # ID for the next element
    @level = 0                   # Level in the diagram
    @tncnt = {}                  # Node type counts
  end

  # Cheap structural check of the phrase (caution: quick and dirty solution).
  #
  # Rejects empty data, an empty pair of brackets, a "[" directly followed
  # by "_", and data whose unescaped "[" and "]" counts differ or where a
  # "]" appears before its matching "[".
  #
  # @return [Boolean]
  def valid?
    return false if @data.empty?
    return false if /\[\s*\]/m =~ @data
    return false if /\[\_/ =~ @data

    open_count = 0
    close_count = 0
    escape = false
    @data.strip.each_char do |chr|
      if chr == "\\"
        escape = true
      elsif chr == "[" && !escape
        open_count += 1
      elsif chr == "]" && !escape
        close_count += 1
        # A closing bracket with nothing left to close: stop scanning, the
        # final comparison below reports the mismatch.
        break if open_count < close_count
      elsif escape
        # Any other character ends the escape (this includes an escaped
        # bracket, which is therefore not counted).
        escape = false
      end
    end
    return false unless open_count == close_count

    # make_tree(0)
    # return false if @tncnt.empty?
    # @tncnt.each do |key, value|
    #   return false if key == ""
    # end
    true
  end

  # Parses the stored phrase, filling the element list. Elements created at
  # the top level get parent id 0.
  def parse
    make_tree(0)
  end

  # @return the element list the parser adds its elements to
  def get_elementlist
    @elist
  end

  # Appends "_1", "_2", ... to the content of every element of type
  # ETYPE_NODE whose content was counted more than once by +count_node+.
  #
  # @return [Hash] the node type counts (+@tncnt+)
  def auto_subscript
    seen = Hash.new(0) # subscripts handed out so far, per content

    @elist.get_elements.each do |element|
      next unless element.type == ETYPE_NODE

      content = element.content
      # Content that was never counted is treated as occurring once.
      next unless (@tncnt[content] || 1) > 1

      seen[content] += 1
      element.content += "_#{seen[content]}"
    end

    @tncnt
  end

  # Records one more occurrence of +name+ (surrounding whitespace ignored).
  #
  # @param name [String]
  # @return [Integer] the updated count for that name
  def count_node(name)
    name = name.strip
    @tncnt[name] = (@tncnt[name] || 0) + 1
  end

  # Reads the next token starting at +@pos+ and advances +@pos+.
  #
  # A token is either a lone "]", or a run of characters that may start with
  # "[" and ends just before the next unescaped "[" or "]". A backslash
  # escapes the following character: an escaped bracket becomes part of the
  # token, and for the characters n, space, +, -, =, ~, #, * the backslash is
  # kept in the token as well.
  #
  # @return [String] the token, or "" when fewer than two characters remain
  def get_next_token
    chars = @data.split(//)
    return "" if (@pos + 1) >= chars.length

    token = ""
    consumed = 0 # characters examined, including a terminating bracket
    escape = false
    done = false

    while (@pos + consumed) < chars.length && !done
      ch = chars[@pos + consumed]
      case ch
      when "["
        if escape
          token << ch
          escape = false
        elsif consumed > 0
          done = true # this bracket starts the next token
        else
          token << ch
        end
      when "]"
        if escape
          token << ch
          escape = false
        else
          token << ch if consumed == 0 # a lone "]" is a token of its own
          done = true
        end
      when "\\"
        escape = true
      when "n", " ", "+", "-", "=", "~", "#", "*"
        if escape
          token << "\\" << ch # keep the escape sequence intact
          escape = false
        else
          token << ch
        end
      # when /[\n\r]/
      #   gottoken = false # same as do nothing
      else
        token << ch
        escape = false
      end
      consumed += 1
    end

    # Step back by one so that a terminating bracket is re-read as the start
    # of the next token, but always advance by at least one character.
    @pos += (consumed > 1 ? consumed - 1 : 1)
    token
  end

  # Consumes tokens until a "]" or the end of the data, adding elements
  # below +parent+, and recurses for every token that opens a bracket.
  #
  # A token "[label rest" yields an element for "label" plus a child element
  # for " rest" (the separating space is kept); a token "[label" yields a
  # single element. Any other non-blank token is added as-is. In labels and
  # children "<>" is replaced by a space.
  #
  # @param parent the id given to elements created at this depth
  def make_tree(parent)
    token = get_next_token.strip

    while token != "" && token != "]"
      if token[0] == "["
        body = token[1..-1]
        space_at = body.index(" ")

        if space_at
          label = body[0, space_at].gsub("<>", " ")
          child = body[space_at..-1].gsub("<>", " ")

          node = add_element(parent, label, @level)
          newparent = node.id
          count_node(label)
          # @id was advanced by add_element, so @id - 1 is the id that was
          # passed to the node created just above.
          add_element(@id - 1, child, @level + 1)
        else
          label = body.gsub("<>", " ")
          node = add_element(parent, label, @level)
          newparent = node.id
          count_node(label)
        end

        @level += 1
        make_tree(newparent)
      elsif token.strip != ""
        add_element(parent, token, @level)
        count_node(token)
      end

      token = get_next_token
    end

    @level -= 1
  end

  private

  # Creates an Element with the next free id, stores it in the element list
  # and advances the id counter.
  #
  # @return the newly created element
  def add_element(parent, content, level)
    element = Element.new(@id, parent, content, level)
    @id += 1
    @elist.add(element)
    element
  end
end