# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

# Parses a token stream describing a single segmentation rule into either a
# BreakRule or a NoBreakRule. Token navigation (@current_token, @next_token)
# is not defined here; presumably it is supplied by TwitterCldr.Parser.
class TwitterCldr.SegmentationParser extends TwitterCldr.Parser
  constructor : ->
    # "^" token that add_anchors prepends to every regex token list.
    @begin_token ||= new TwitterCldr.Token({type : "special_char", value: "^"})
    @regex_parser ||= new TwitterCldr.UnicodeRegexParser

  # Result of a successful rule match: the matched text and the offset of the
  # boundary within the string that was matched against.
  class @.RuleMatchData
    constructor : (@text, @boundary_offset) ->

  # Common base class for the rule types below.
  class @.Rule
    constructor : ->

  # A rule whose boundary symbol is "break". It holds the regex for the text
  # to the left of the boundary and, optionally, the regex for the text to
  # the right of it.
  class @.BreakRule extends @.Rule
    constructor : (@left, @right) ->
      @boundary_symbol = "break"
      super

    # Matches `str` against the left regex and then, if a right regex exists,
    # matches the remainder of `str` against it.
    #
    # Returns a RuleMatchData, or null when there is no left regex, when the
    # left regex does not match, or when a right regex exists but does not
    # match the remainder.
    match : (str) ->
      # Check @left before dereferencing it. Previously @left.match ran first,
      # so a rule without a left regex threw instead of reporting "no match".
      return null unless @left?

      left_match = @left.match(str)
      if left_match?
        # Offset just past the first occurrence of the left-hand match text.
        match_pos = str.indexOf(left_match[0]) + left_match[0].length

        if @right?
          right_match = @right.match(str.slice(match_pos))

          if right_match?
            return new TwitterCldr.SegmentationParser.RuleMatchData((left_match[0] + right_match[0]), match_pos)

        else
          # No right-hand side: report the whole string, boundary at its end.
          return new TwitterCldr.SegmentationParser.RuleMatchData(str, str.length)

      return null

  # A rule whose boundary symbol is "no_break". It holds a single regex.
  class @.NoBreakRule extends @.Rule
    constructor : (@regex) ->
      @boundary_symbol = "no_break"
      super

    # Returns a RuleMatchData whose boundary offset lies just past the first
    # occurrence of the matched text in `str`, or null when the regex does
    # not match.
    match : (str) ->
      match = @regex.match(str)
      if match?
        new TwitterCldr.SegmentationParser.RuleMatchData(match[0], str.indexOf(match[0]) + match[0].length)
      else
        null

  # Consumes every remaining token, splitting the stream into token lists at
  # each "break" / "no_break" token, then builds the rule named by the last
  # boundary token seen.
  #
  # options - passed through unchanged to parse_regex.
  #
  # Returns a BreakRule, a NoBreakRule, or null if no boundary token occurred.
  do_parse: (options = {}) ->
    regex_token_lists = []
    current_regex_tokens = []
    boundary_symbol = null

    while @current_token()?
      switch @current_token().type
        when "break", "no_break"
          # A boundary token closes the list collected so far.
          boundary_symbol = @current_token().type
          regex_token_lists.push(current_regex_tokens)
          current_regex_tokens = []
        else
          current_regex_tokens.push(@current_token())

      @next_token(@current_token().type)

    # Tokens after the last boundary token (possibly none) form the final list.
    regex_token_lists.push(current_regex_tokens)

    result = null
    switch boundary_symbol
      when "break"
        # The first list becomes the left regex and the second the right regex.
        result = new TwitterCldr.SegmentationParser.BreakRule(
          @parse_regex(@add_anchors(regex_token_lists[0]), options),
          @parse_regex(@add_anchors(regex_token_lists[1]), options)
        )

      when "no_break"
        # All lists are concatenated into one regex; the boundary token itself
        # was never added to any list.
        result = new TwitterCldr.SegmentationParser.NoBreakRule(
          @parse_regex(
            @add_anchors(
              [].concat(regex_token_lists...)
            ), options
          )
        )

    result

  # Returns a new token list consisting of @begin_token followed by token_list.
  add_anchors : (token_list) ->
    [@begin_token].concat(token_list)

  # Builds a TwitterCldr.UnicodeRegex from `tokens` using @regex_parser.
  # Returns null when `tokens` is null, undefined, or empty.
  parse_regex : (tokens, options = {}) ->
    if tokens? and tokens.length != 0 then new TwitterCldr.UnicodeRegex(@regex_parser.parse(tokens, options)) else null