test-unit/lib/test/unit/diff.rb in groonga-0.0.7 vs test-unit/lib/test/unit/diff.rb in groonga-0.9.0
- old
+ new
@@ -33,11 +33,11 @@
@operations ||= compute_operations
end
def grouped_operations(context_size=nil)
context_size ||= 3
- _operations = operations
+ _operations = operations.dup
_operations = [[:equal, 0, 0, 0, 0]] if _operations.empty?
expand_edge_equal_operations!(_operations, context_size)
group_window = context_size * 2
groups = []
@@ -264,33 +264,191 @@
def tag(mark, contents)
contents.collect {|content| "#{mark}#{content}"}
end
end
+ class UTF8Line
+ class << self
+ # from http://unicode.org/reports/tr11/
+ WIDE_CHARACTERS =
+ [0x1100..0x1159, 0x115F..0x115F, 0x2329..0x232A,
+ 0x2E80..0x2E99, 0x2E9B..0x2EF3, 0x2F00..0x2FD5,
+ 0x2FF0..0x2FFB, 0x3000..0x303E, 0x3041..0x3096,
+ 0x3099..0x30FF, 0x3105..0x312D, 0x3131..0x318E,
+ 0x3190..0x31B7, 0x31C0..0x31E3, 0x31F0..0x321E,
+ 0x3220..0x3243, 0x3250..0x32FE, 0x3300..0x4DB5,
+ 0x4E00..0x9FC3, 0xA000..0xA48C, 0xA490..0xA4C6,
+ 0xAC00..0xD7A3, 0xF900..0xFA2D, 0xFA30..0xFA6A,
+ 0xFA70..0xFAD9, 0xFE10..0xFE19, 0xFE30..0xFE52,
+ 0xFE54..0xFE66, 0xFE68..0xFE6B, 0xFF01..0xFF60,
+ 0xFFE0..0xFFE6, 0x20000..0x2FFFD, 0x30000..0x3FFFD,
+ ]
+
+ AMBIGUOUS =
+ [0x00A1..0x00A1, 0x00A4..0x00A4, 0x00A7..0x00A8,
+ 0x00AA..0x00AA, 0x00AD..0x00AE, 0x00B0..0x00B4,
+ 0x00B6..0x00BA, 0x00BC..0x00BF, 0x00C6..0x00C6,
+ 0x00D0..0x00D0, 0x00D7..0x00D8, 0x00DE..0x00E1,
+ 0x00E6..0x00E6, 0x00E8..0x00EA, 0x00EC..0x00ED,
+ 0x00F0..0x00F0, 0x00F2..0x00F3, 0x00F7..0x00FA,
+ 0x00FC..0x00FC, 0x00FE..0x00FE, 0x0101..0x0101,
+ 0x0111..0x0111, 0x0113..0x0113, 0x011B..0x011B,
+ 0x0126..0x0127, 0x012B..0x012B, 0x0131..0x0133,
+ 0x0138..0x0138, 0x013F..0x0142, 0x0144..0x0144,
+ 0x0148..0x014B, 0x014D..0x014D, 0x0152..0x0153,
+ 0x0166..0x0167, 0x016B..0x016B, 0x01CE..0x01CE,
+ 0x01D0..0x01D0, 0x01D2..0x01D2, 0x01D4..0x01D4,
+ 0x01D6..0x01D6, 0x01D8..0x01D8, 0x01DA..0x01DA,
+ 0x01DC..0x01DC, 0x0251..0x0251, 0x0261..0x0261,
+ 0x02C4..0x02C4, 0x02C7..0x02C7, 0x02C9..0x02CB,
+ 0x02CD..0x02CD, 0x02D0..0x02D0, 0x02D8..0x02DB,
+ 0x02DD..0x02DD, 0x02DF..0x02DF, 0x0300..0x036F,
+ 0x0391..0x03A1, 0x03A3..0x03A9, 0x03B1..0x03C1,
+ 0x03C3..0x03C9, 0x0401..0x0401, 0x0410..0x044F,
+ 0x0451..0x0451, 0x2010..0x2010, 0x2013..0x2016,
+ 0x2018..0x2019, 0x201C..0x201D, 0x2020..0x2022,
+ 0x2024..0x2027, 0x2030..0x2030, 0x2032..0x2033,
+ 0x2035..0x2035, 0x203B..0x203B, 0x203E..0x203E,
+ 0x2074..0x2074, 0x207F..0x207F, 0x2081..0x2084,
+ 0x20AC..0x20AC, 0x2103..0x2103, 0x2105..0x2105,
+ 0x2109..0x2109, 0x2113..0x2113, 0x2116..0x2116,
+ 0x2121..0x2122, 0x2126..0x2126, 0x212B..0x212B,
+ 0x2153..0x2154, 0x215B..0x215E, 0x2160..0x216B,
+ 0x2170..0x2179, 0x2190..0x2199, 0x21B8..0x21B9,
+ 0x21D2..0x21D2, 0x21D4..0x21D4, 0x21E7..0x21E7,
+ 0x2200..0x2200, 0x2202..0x2203, 0x2207..0x2208,
+ 0x220B..0x220B, 0x220F..0x220F, 0x2211..0x2211,
+ 0x2215..0x2215, 0x221A..0x221A, 0x221D..0x2220,
+ 0x2223..0x2223, 0x2225..0x2225, 0x2227..0x222C,
+ 0x222E..0x222E, 0x2234..0x2237, 0x223C..0x223D,
+ 0x2248..0x2248, 0x224C..0x224C, 0x2252..0x2252,
+ 0x2260..0x2261, 0x2264..0x2267, 0x226A..0x226B,
+ 0x226E..0x226F, 0x2282..0x2283, 0x2286..0x2287,
+ 0x2295..0x2295, 0x2299..0x2299, 0x22A5..0x22A5,
+ 0x22BF..0x22BF, 0x2312..0x2312, 0x2460..0x24E9,
+ 0x24EB..0x254B, 0x2550..0x2573, 0x2580..0x258F,
+ 0x2592..0x2595, 0x25A0..0x25A1, 0x25A3..0x25A9,
+ 0x25B2..0x25B3, 0x25B6..0x25B7, 0x25BC..0x25BD,
+ 0x25C0..0x25C1, 0x25C6..0x25C8, 0x25CB..0x25CB,
+ 0x25CE..0x25D1, 0x25E2..0x25E5, 0x25EF..0x25EF,
+ 0x2605..0x2606, 0x2609..0x2609, 0x260E..0x260F,
+ 0x2614..0x2615, 0x261C..0x261C, 0x261E..0x261E,
+ 0x2640..0x2640, 0x2642..0x2642, 0x2660..0x2661,
+ 0x2663..0x2665, 0x2667..0x266A, 0x266C..0x266D,
+ 0x266F..0x266F, 0x273D..0x273D, 0x2776..0x277F,
+ 0xE000..0xF8FF, 0xFE00..0xFE0F, 0xFFFD..0xFFFD,
+ 0xE0100..0xE01EF, 0xF0000..0xFFFFD, 0x100000..0x10FFFD,
+ ]
+
+ def wide_character?(character)
+ binary_search_ranges(character, WIDE_CHARACTERS) or
+ binary_search_ranges(character, AMBIGUOUS)
+ end
+
+ private
+ def binary_search_ranges(character, ranges)
+ if ranges.size.zero?
+ false
+ elsif ranges.size == 1
+ ranges[0].include?(character)
+ else
+ half = ranges.size / 2
+ range = ranges[half]
+ if range.include?(character)
+ true
+ elsif character < range.begin
+ binary_search_ranges(character, ranges[0...half])
+ else
+ binary_search_ranges(character, ranges[(half + 1)..-1])
+ end
+ end
+ end
+ end
+
+ def initialize(line)
+ @line = line
+ @characters = @line.unpack("U*")
+ end
+
+ def [](*args)
+ result = @characters[*args]
+ if result.respond_to?(:pack)
+ result.pack("U*")
+ else
+ result
+ end
+ end
+
+ def each(&block)
+ @characters.each(&block)
+ end
+
+ def size
+ @characters.size
+ end
+
+ def to_s
+ @line
+ end
+
+ def compute_width(start, _end)
+ width = 0
+ start.upto(_end - 1) do |i|
+ if self.class.wide_character?(@characters[i])
+ width += 2
+ else
+ width += 1
+ end
+ end
+ width
+ end
+ end
+
class ReadableDiffer < Differ
def diff(options={})
- result = []
- matcher = SequenceMatcher.new(@from, @to)
- matcher.operations.each do |args|
- tag, from_start, from_end, to_start, to_end = args
+ @result = []
+ operations.each do |tag, from_start, from_end, to_start, to_end|
case tag
when :replace
- result.concat(diff_lines(from_start, from_end, to_start, to_end))
+ diff_lines(from_start, from_end, to_start, to_end)
when :delete
- result.concat(tag_deleted(@from[from_start...from_end]))
+ tag_deleted(@from[from_start...from_end])
when :insert
- result.concat(tag_inserted(@to[to_start...to_end]))
+ tag_inserted(@to[to_start...to_end])
when :equal
- result.concat(tag_equal(@from[from_start...from_end]))
+ tag_equal(@from[from_start...from_end])
else
raise "unknown tag: #{tag}"
end
end
- result
+ @result
end
private
+ def operations
+ @operations ||= nil
+ if @operations.nil?
+ matcher = SequenceMatcher.new(@from, @to)
+ @operations = matcher.operations
+ end
+ @operations
+ end
+
+ def default_ratio
+ 0.74
+ end
+
+ def cut_off_ratio
+ 0.75
+ end
+
+ def tag(mark, contents)
+ contents.each do |content|
+ @result << "#{mark}#{content}"
+ end
+ end
+
def tag_deleted(contents)
tag("- ", contents)
end
def tag_inserted(contents)
@@ -304,11 +462,11 @@
def tag_difference(contents)
tag("? ", contents)
end
def find_diff_line_info(from_start, from_end, to_start, to_end)
- best_ratio = 0.74
+ best_ratio = default_ratio
from_equal_index = to_equal_index = nil
from_best_index = to_best_index = nil
to_start.upto(to_end - 1) do |to_index|
from_start.upto(from_end - 1) do |from_index|
@@ -332,34 +490,35 @@
from_equal_index, to_equal_index,
from_best_index, to_best_index]
end
def diff_lines(from_start, from_end, to_start, to_end)
- cut_off = 0.75
-
info = find_diff_line_info(from_start, from_end, to_start, to_end)
best_ratio, from_equal_index, to_equal_index, *info = info
from_best_index, to_best_index = info
+ from_best_index ||= from_start
+ to_best_index ||= to_start
- if best_ratio < cut_off
+ if best_ratio < cut_off_ratio
if from_equal_index.nil?
- tagged_from = tag_deleted(@from[from_start...from_end])
- tagged_to = tag_inserted(@to[to_start...to_end])
if to_end - to_start < from_end - from_start
- return tagged_to + tagged_from
+ tag_inserted(@to[to_start...to_end])
+ tag_deleted(@from[from_start...from_end])
else
- return tagged_from + tagged_to
+ tag_deleted(@from[from_start...from_end])
+ tag_inserted(@to[to_start...to_end])
end
+ return
end
from_best_index = from_equal_index
to_best_index = to_equal_index
best_ratio = 1.0
end
- _diff_lines(from_start, from_best_index, to_start, to_best_index) +
- diff_line(@from[from_best_index], @to[to_best_index]) +
- _diff_lines(from_best_index + 1, from_end, to_best_index + 1, to_end)
+ _diff_lines(from_start, from_best_index, to_start, to_best_index)
+ diff_line(@from[from_best_index], @to[to_best_index])
+ _diff_lines(from_best_index + 1, from_end, to_best_index + 1, to_end)
end
def _diff_lines(from_start, from_end, to_start, to_end)
if from_start < from_end
if to_start < to_end
@@ -370,30 +529,58 @@
else
tag_inserted(@to[to_start...to_end])
end
end
+ def line_operations(from_line, to_line)
+ if !from_line.respond_to?(:force_encoding) and $KCODE == "UTF8"
+ from_line = UTF8Line.new(from_line)
+ to_line = UTF8Line.new(to_line)
+ end
+ matcher = SequenceMatcher.new(from_line, to_line,
+ &method(:space_character?))
+ [from_line, to_line, matcher.operations]
+ end
+
+ def compute_width(line, start, _end)
+ if line.respond_to?(:encoding) and
+ Encoding.compatible?(Encoding::UTF_8, line.encoding)
+ utf8_line = line[start..._end].encode(Encoding::UTF_8)
+ width = 0
+ utf8_line.each_codepoint do |unicode_codepoint|
+ if UTF8Line.wide_character?(unicode_codepoint)
+ width += 2
+ else
+ width += 1
+ end
+ end
+ width
+ elsif line.is_a?(UTF8Line)
+ line.compute_width(start, _end)
+ else
+ _end - start
+ end
+ end
+
def diff_line(from_line, to_line)
from_tags = ""
to_tags = ""
- matcher = SequenceMatcher.new(from_line, to_line,
- &method(:space_character?))
- operations = matcher.operations
- operations.each do |tag, from_start, from_end, to_start, to_end|
- from_length = from_end - from_start
- to_length = to_end - to_start
+ from_line, to_line, _operations = line_operations(from_line, to_line)
+ _operations.each do |tag, from_start, from_end, to_start, to_end|
+ from_width = compute_width(from_line, from_start, from_end)
+ to_width = compute_width(to_line, to_start, to_end)
case tag
when :replace
- from_tags << "^" * from_length
- to_tags << "^" * to_length
+ from_tags << "^" * from_width
+ to_tags << "^" * to_width
when :delete
- from_tags << "-" * from_length
+ from_tags << "-" * from_width
when :insert
- to_tags << "+" * to_length
+ to_tags << "+" * to_width
when :equal
- from_tags << " " * from_length
- to_tags << " " * to_length
+ from_tags << " " * from_width
+ to_tags << " " * to_width
else
raise "unknown tag: #{tag}"
end
end
format_diff_point(from_line, to_line, from_tags, to_tags)
@@ -407,16 +594,15 @@
from_tags = from_tags[common..-1].rstrip
to_tags = to_tags[common..-1].rstrip
result = tag_deleted([from_line])
unless from_tags.empty?
- result.concat(tag_difference(["#{"\t" * common}#{from_tags}"]))
+ tag_difference(["#{"\t" * common}#{from_tags}"])
end
- result.concat(tag_inserted([to_line]))
+ tag_inserted([to_line])
unless to_tags.empty?
- result.concat(tag_difference(["#{"\t" * common}#{to_tags}"]))
+ tag_difference(["#{"\t" * common}#{to_tags}"])
end
- result
end
def n_leading_characters(string, character)
n = 0
while string[n] == character