test-unit/lib/test/unit/diff.rb in groonga-0.0.7 vs test-unit/lib/test/unit/diff.rb in groonga-0.9.0

- old
+ new

@@ -33,11 +33,11 @@ @operations ||= compute_operations end def grouped_operations(context_size=nil) context_size ||= 3 - _operations = operations + _operations = operations.dup _operations = [[:equal, 0, 0, 0, 0]] if _operations.empty? expand_edge_equal_operations!(_operations, context_size) group_window = context_size * 2 groups = [] @@ -264,33 +264,191 @@ def tag(mark, contents) contents.collect {|content| "#{mark}#{content}"} end end + class UTF8Line + class << self + # from http://unicode.org/reports/tr11/ + WIDE_CHARACTERS = + [0x1100..0x1159, 0x115F..0x115F, 0x2329..0x232A, + 0x2E80..0x2E99, 0x2E9B..0x2EF3, 0x2F00..0x2FD5, + 0x2FF0..0x2FFB, 0x3000..0x303E, 0x3041..0x3096, + 0x3099..0x30FF, 0x3105..0x312D, 0x3131..0x318E, + 0x3190..0x31B7, 0x31C0..0x31E3, 0x31F0..0x321E, + 0x3220..0x3243, 0x3250..0x32FE, 0x3300..0x4DB5, + 0x4E00..0x9FC3, 0xA000..0xA48C, 0xA490..0xA4C6, + 0xAC00..0xD7A3, 0xF900..0xFA2D, 0xFA30..0xFA6A, + 0xFA70..0xFAD9, 0xFE10..0xFE19, 0xFE30..0xFE52, + 0xFE54..0xFE66, 0xFE68..0xFE6B, 0xFF01..0xFF60, + 0xFFE0..0xFFE6, 0x20000..0x2FFFD, 0x30000..0x3FFFD, + ] + + AMBIGUOUS = + [0x00A1..0x00A1, 0x00A4..0x00A4, 0x00A7..0x00A8, + 0x00AA..0x00AA, 0x00AD..0x00AE, 0x00B0..0x00B4, + 0x00B6..0x00BA, 0x00BC..0x00BF, 0x00C6..0x00C6, + 0x00D0..0x00D0, 0x00D7..0x00D8, 0x00DE..0x00E1, + 0x00E6..0x00E6, 0x00E8..0x00EA, 0x00EC..0x00ED, + 0x00F0..0x00F0, 0x00F2..0x00F3, 0x00F7..0x00FA, + 0x00FC..0x00FC, 0x00FE..0x00FE, 0x0101..0x0101, + 0x0111..0x0111, 0x0113..0x0113, 0x011B..0x011B, + 0x0126..0x0127, 0x012B..0x012B, 0x0131..0x0133, + 0x0138..0x0138, 0x013F..0x0142, 0x0144..0x0144, + 0x0148..0x014B, 0x014D..0x014D, 0x0152..0x0153, + 0x0166..0x0167, 0x016B..0x016B, 0x01CE..0x01CE, + 0x01D0..0x01D0, 0x01D2..0x01D2, 0x01D4..0x01D4, + 0x01D6..0x01D6, 0x01D8..0x01D8, 0x01DA..0x01DA, + 0x01DC..0x01DC, 0x0251..0x0251, 0x0261..0x0261, + 0x02C4..0x02C4, 0x02C7..0x02C7, 0x02C9..0x02CB, + 0x02CD..0x02CD, 0x02D0..0x02D0, 0x02D8..0x02DB, + 0x02DD..0x02DD, 0x02DF..0x02DF, 0x0300..0x036F, + 0x0391..0x03A1, 0x03A3..0x03A9, 0x03B1..0x03C1, + 0x03C3..0x03C9, 0x0401..0x0401, 0x0410..0x044F, + 0x0451..0x0451, 0x2010..0x2010, 0x2013..0x2016, + 0x2018..0x2019, 0x201C..0x201D, 0x2020..0x2022, + 0x2024..0x2027, 0x2030..0x2030, 0x2032..0x2033, + 0x2035..0x2035, 0x203B..0x203B, 0x203E..0x203E, + 0x2074..0x2074, 0x207F..0x207F, 0x2081..0x2084, + 0x20AC..0x20AC, 0x2103..0x2103, 0x2105..0x2105, + 0x2109..0x2109, 0x2113..0x2113, 0x2116..0x2116, + 0x2121..0x2122, 0x2126..0x2126, 0x212B..0x212B, + 0x2153..0x2154, 0x215B..0x215E, 0x2160..0x216B, + 0x2170..0x2179, 0x2190..0x2199, 0x21B8..0x21B9, + 0x21D2..0x21D2, 0x21D4..0x21D4, 0x21E7..0x21E7, + 0x2200..0x2200, 0x2202..0x2203, 0x2207..0x2208, + 0x220B..0x220B, 0x220F..0x220F, 0x2211..0x2211, + 0x2215..0x2215, 0x221A..0x221A, 0x221D..0x2220, + 0x2223..0x2223, 0x2225..0x2225, 0x2227..0x222C, + 0x222E..0x222E, 0x2234..0x2237, 0x223C..0x223D, + 0x2248..0x2248, 0x224C..0x224C, 0x2252..0x2252, + 0x2260..0x2261, 0x2264..0x2267, 0x226A..0x226B, + 0x226E..0x226F, 0x2282..0x2283, 0x2286..0x2287, + 0x2295..0x2295, 0x2299..0x2299, 0x22A5..0x22A5, + 0x22BF..0x22BF, 0x2312..0x2312, 0x2460..0x24E9, + 0x24EB..0x254B, 0x2550..0x2573, 0x2580..0x258F, + 0x2592..0x2595, 0x25A0..0x25A1, 0x25A3..0x25A9, + 0x25B2..0x25B3, 0x25B6..0x25B7, 0x25BC..0x25BD, + 0x25C0..0x25C1, 0x25C6..0x25C8, 0x25CB..0x25CB, + 0x25CE..0x25D1, 0x25E2..0x25E5, 0x25EF..0x25EF, + 0x2605..0x2606, 0x2609..0x2609, 0x260E..0x260F, + 0x2614..0x2615, 0x261C..0x261C, 0x261E..0x261E, + 0x2640..0x2640, 0x2642..0x2642, 0x2660..0x2661, + 0x2663..0x2665, 0x2667..0x266A, 0x266C..0x266D, + 0x266F..0x266F, 0x273D..0x273D, 0x2776..0x277F, + 0xE000..0xF8FF, 0xFE00..0xFE0F, 0xFFFD..0xFFFD, + 0xE0100..0xE01EF, 0xF0000..0xFFFFD, 0x100000..0x10FFFD, + ] + + def wide_character?(character) + binary_search_ranges(character, WIDE_CHARACTERS) or + binary_search_ranges(character, AMBIGUOUS) + end + + private + def binary_search_ranges(character, ranges) + if ranges.size.zero? + false + elsif ranges.size == 1 + ranges[0].include?(character) + else + half = ranges.size / 2 + range = ranges[half] + if range.include?(character) + true + elsif character < range.begin + binary_search_ranges(character, ranges[0...half]) + else + binary_search_ranges(character, ranges[(half + 1)..-1]) + end + end + end + end + + def initialize(line) + @line = line + @characters = @line.unpack("U*") + end + + def [](*args) + result = @characters[*args] + if result.respond_to?(:pack) + result.pack("U*") + else + result + end + end + + def each(&block) + @characters.each(&block) + end + + def size + @characters.size + end + + def to_s + @line + end + + def compute_width(start, _end) + width = 0 + start.upto(_end - 1) do |i| + if self.class.wide_character?(@characters[i]) + width += 2 + else + width += 1 + end + end + width + end + end + class ReadableDiffer < Differ def diff(options={}) - result = [] - matcher = SequenceMatcher.new(@from, @to) - matcher.operations.each do |args| - tag, from_start, from_end, to_start, to_end = args + @result = [] + operations.each do |tag, from_start, from_end, to_start, to_end| case tag when :replace - result.concat(diff_lines(from_start, from_end, to_start, to_end)) + diff_lines(from_start, from_end, to_start, to_end) when :delete - result.concat(tag_deleted(@from[from_start...from_end])) + tag_deleted(@from[from_start...from_end]) when :insert - result.concat(tag_inserted(@to[to_start...to_end])) + tag_inserted(@to[to_start...to_end]) when :equal - result.concat(tag_equal(@from[from_start...from_end])) + tag_equal(@from[from_start...from_end]) else raise "unknown tag: #{tag}" end end - result + @result end private + def operations + @operations ||= nil + if @operations.nil? + matcher = SequenceMatcher.new(@from, @to) + @operations = matcher.operations + end + @operations + end + + def default_ratio + 0.74 + end + + def cut_off_ratio + 0.75 + end + + def tag(mark, contents) + contents.each do |content| + @result << "#{mark}#{content}" + end + end + def tag_deleted(contents) tag("- ", contents) end def tag_inserted(contents) @@ -304,11 +462,11 @@ def tag_difference(contents) tag("? ", contents) end def find_diff_line_info(from_start, from_end, to_start, to_end) - best_ratio = 0.74 + best_ratio = default_ratio from_equal_index = to_equal_index = nil from_best_index = to_best_index = nil to_start.upto(to_end - 1) do |to_index| from_start.upto(from_end - 1) do |from_index| @@ -332,34 +490,35 @@ from_equal_index, to_equal_index, from_best_index, to_best_index] end def diff_lines(from_start, from_end, to_start, to_end) - cut_off = 0.75 - info = find_diff_line_info(from_start, from_end, to_start, to_end) best_ratio, from_equal_index, to_equal_index, *info = info from_best_index, to_best_index = info + from_best_index ||= from_start + to_best_index ||= to_start - if best_ratio < cut_off + if best_ratio < cut_off_ratio if from_equal_index.nil? - tagged_from = tag_deleted(@from[from_start...from_end]) - tagged_to = tag_inserted(@to[to_start...to_end]) if to_end - to_start < from_end - from_start - return tagged_to + tagged_from + tag_inserted(@to[to_start...to_end]) + tag_deleted(@from[from_start...from_end]) else - return tagged_from + tagged_to + tag_deleted(@from[from_start...from_end]) + tag_inserted(@to[to_start...to_end]) end + return end from_best_index = from_equal_index to_best_index = to_equal_index best_ratio = 1.0 end - _diff_lines(from_start, from_best_index, to_start, to_best_index) + - diff_line(@from[from_best_index], @to[to_best_index]) + - _diff_lines(from_best_index + 1, from_end, to_best_index + 1, to_end) + _diff_lines(from_start, from_best_index, to_start, to_best_index) + diff_line(@from[from_best_index], @to[to_best_index]) + _diff_lines(from_best_index + 1, from_end, to_best_index + 1, to_end) end def _diff_lines(from_start, from_end, to_start, to_end) if from_start < from_end if to_start < to_end @@ -370,30 +529,58 @@ else tag_inserted(@to[to_start...to_end]) end end + def line_operations(from_line, to_line) + if !from_line.respond_to?(:force_encoding) and $KCODE == "UTF8" + from_line = UTF8Line.new(from_line) + to_line = UTF8Line.new(to_line) + end + matcher = SequenceMatcher.new(from_line, to_line, + &method(:space_character?)) + [from_line, to_line, matcher.operations] + end + + def compute_width(line, start, _end) + if line.respond_to?(:encoding) and + Encoding.compatible?(Encoding::UTF_8, line.encoding) + utf8_line = line[start..._end].encode(Encoding::UTF_8) + width = 0 + utf8_line.each_codepoint do |unicode_codepoint| + if UTF8Line.wide_character?(unicode_codepoint) + width += 2 + else + width += 1 + end + end + width + elsif line.is_a?(UTF8Line) + line.compute_width(start, _end) + else + _end - start + end + end + def diff_line(from_line, to_line) from_tags = "" to_tags = "" - matcher = SequenceMatcher.new(from_line, to_line, - &method(:space_character?)) - operations = matcher.operations - operations.each do |tag, from_start, from_end, to_start, to_end| - from_length = from_end - from_start - to_length = to_end - to_start + from_line, to_line, _operations = line_operations(from_line, to_line) + _operations.each do |tag, from_start, from_end, to_start, to_end| + from_width = compute_width(from_line, from_start, from_end) + to_width = compute_width(to_line, to_start, to_end) case tag when :replace - from_tags << "^" * from_length - to_tags << "^" * to_length + from_tags << "^" * from_width + to_tags << "^" * to_width when :delete - from_tags << "-" * from_length + from_tags << "-" * from_width when :insert - to_tags << "+" * to_length + to_tags << "+" * to_width when :equal - from_tags << " " * from_length - to_tags << " " * to_length + from_tags << " " * from_width + to_tags << " " * to_width else raise "unknown tag: #{tag}" end end format_diff_point(from_line, to_line, from_tags, to_tags) @@ -407,16 +594,15 @@ from_tags = from_tags[common..-1].rstrip to_tags = to_tags[common..-1].rstrip result = tag_deleted([from_line]) unless from_tags.empty? - result.concat(tag_difference(["#{"\t" * common}#{from_tags}"])) + tag_difference(["#{"\t" * common}#{from_tags}"]) end - result.concat(tag_inserted([to_line])) + tag_inserted([to_line]) unless to_tags.empty? - result.concat(tag_difference(["#{"\t" * common}#{to_tags}"])) + tag_difference(["#{"\t" * common}#{to_tags}"]) end - result end def n_leading_characters(string, character) n = 0 while string[n] == character