lib/diff.rb in instiki-0.10.0 vs lib/diff.rb in instiki-0.10.1
- old
+ new
@@ -1,444 +1,444 @@
-# heavily based off difflib.py - see that file for documentation
-# ported from Python by Bill Atkins
-
-# This does not support all features offered by difflib; it
-# implements only the subset of features necessary
-# to support a Ruby version of HTML Differ. You're welcome to finish this off.
-
-# By default, String#each iterates by line. This isn't really appropriate
-# for diff, so often a string will be split by // to get an array of one-
-# character strings.
-
-# Some methods in Diff are untested and are not guaranteed to work. The
-# methods in HTMLDiff and any methods it calls should work quite well.
-
-# changes by DenisMertz
-# * main change:
-# ** get the tag soup away
-# the tag soup problem was first reported with <p> tags, but it appeared also with
-# <li>, <ul> etc... tags
-# this version should mostly fix these problems
-# ** added a Builder class to manage the creation of the final htmldiff
-# * minor changes:
-# ** use symbols instead of string to represent opcodes
-# ** small fix to html2list
-#
-
-module Enumerable
- def reduce(init)
- result = init
- each { |item| result = yield(result, item) }
- result
- end
-end
-
-module Diff
-
- class SequenceMatcher
- def initialize(a=[''], b=[''], isjunk=nil, byline=false)
- a = (!byline and a.kind_of? String) ? a.split(//) : a
- b = (!byline and b.kind_of? String) ? b.split(//) : b
- @isjunk = isjunk || proc {}
- set_seqs a, b
- end
-
- def set_seqs(a, b)
- set_seq_a a
- set_seq_b b
- end
-
- def set_seq_a(a)
- @a = a
- @matching_blocks = @opcodes = nil
- end
-
- def set_seq_b(b)
- @b = b
- @matching_blocks = @opcodes = nil
- chain_b
- end
-
- def chain_b
- @fullbcount = nil
- @b2j = {}
- pophash = {}
- junkdict = {}
-
- @b.each_with_index do |elt, i|
- if @b2j.has_key? elt
- indices = @b2j[elt]
- if @b.length >= 200 and indices.length * 100 > @b.length
- pophash[elt] = 1
- indices.clear
- else
- indices.push i
- end
- else
- @b2j[elt] = [i]
- end
- end
-
- pophash.each_key { |elt| @b2j.delete elt }
-
- junkdict = {}
-
- unless @isjunk.nil?
- [pophash, @b2j].each do |d|
- d.each_key do |elt|
- if @isjunk.call(elt)
- junkdict[elt] = 1
- d.delete elt
- end
- end
- end
- end
-
- @isbjunk = junkdict.method(:has_key?)
- @isbpopular = junkdict.method(:has_key?)
- end
-
- def find_longest_match(alo, ahi, blo, bhi)
- besti, bestj, bestsize = alo, blo, 0
-
- j2len = {}
-
- (alo..ahi).step do |i|
- newj2len = {}
- (@b2j[@a[i]] || []).each do |j|
- if j < blo
- next
- end
- if j >= bhi
- break
- end
-
- k = newj2len[j] = (j2len[j - 1] || 0) + 1
- if k > bestsize
- besti, bestj, bestsize = i - k + 1, j - k + 1, k
- end
- end
- j2len = newj2len
- end
-
- while besti > alo and bestj > blo and
- not @isbjunk.call(@b[bestj-1]) and
- @a[besti-1] == @b[bestj-1]
- besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
- end
-
- while besti+bestsize < ahi and bestj+bestsize < bhi and
- not @isbjunk.call(@b[bestj+bestsize]) and
- @a[besti+bestsize] == @b[bestj+bestsize]
- bestsize += 1
- end
-
- while besti > alo and bestj > blo and
- @isbjunk.call(@b[bestj-1]) and
- @a[besti-1] == @b[bestj-1]
- besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
- end
-
- while besti+bestsize < ahi and bestj+bestsize < bhi and
- @isbjunk.call(@b[bestj+bestsize]) and
- @a[besti+bestsize] == @b[bestj+bestsize]
- bestsize += 1
- end
-
- [besti, bestj, bestsize]
- end
-
- def get_matching_blocks
- return @matching_blocks unless @matching_blocks.nil? or
- @matching_blocks.empty?
-
- @matching_blocks = []
- la, lb = @a.length, @b.length
- match_block_helper(0, la, 0, lb, @matching_blocks)
- @matching_blocks.push [la, lb, 0]
- end
-
- def match_block_helper(alo, ahi, blo, bhi, answer)
- i, j, k = x = find_longest_match(alo, ahi, blo, bhi)
- if not k.zero?
- if alo < i and blo < j
- match_block_helper(alo, i, blo, j, answer)
- end
- answer.push x
- if i + k < ahi and j + k < bhi
- match_block_helper(i + k, ahi, j + k, bhi, answer)
- end
- end
- end
-
- def get_opcodes
- unless @opcodes.nil? or @opcodes.empty?
- return @opcodes
- end
-
- i = j = 0
- @opcodes = answer = []
- get_matching_blocks.each do |ai, bj, size|
- tag = if i < ai and j < bj
- :replace
- elsif i < ai
- :delete
- elsif j < bj
- :insert
- end
-
- answer.push [tag, i, ai, j, bj] if tag
-
- i, j = ai + size, bj + size
-
- answer.push [:equal, ai, i, bj, j] unless size.zero?
-
- end
- return answer
- end
-
- # XXX: untested
- def get_grouped_opcodes(n=3)
- codes = get_opcodes
- if codes[0][0] == :equal
- tag, i1, i2, j1, j2 = codes[0]
- codes[0] = tag, [i1, i2 - n].max, i2, [j1, j2-n].max, j2
- end
-
- if codes[-1][0] == :equal
- tag, i1, i2, j1, j2 = codes[-1]
- codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
- end
- nn = n + n
- group = []
- codes.each do |tag, i1, i2, j1, j2|
- if tag == :equal and i2-i1 > nn
- group.push [tag, i1, [i2, i1 + n].min, j1, [j2, j1 + n].min]
- yield group
- group = []
- i1, j1 = [i1, i2-n].max, [j1, j2-n].max
- group.push [tag, i1, i2, j1 ,j2]
- end
- end
- if group and group.length != 1 and group[0][0] == :equal
- yield group
- end
- end
-
- def ratio
- matches = get_matching_blocks.reduce(0) do |sum, triple|
- sum + triple[-1]
- end
- Diff.calculate_ratio(matches, @a.length + @b.length)
- end
-
- def quick_ratio
- if @fullbcount.nil? or @fullbcount.empty?
- @fullbcount = {}
- @b.each do |elt|
- @fullbcount[elt] = (@fullbcount[elt] || 0) + 1
- end
- end
-
- avail = {}
- matches = 0
- @a.each do |elt|
- if avail.has_key? elt
- numb = avail[elt]
- else
- numb = @fullbcount[elt] || 0
- end
- avail[elt] = numb - 1
- if numb > 0
- matches += 1
- end
- end
- Diff.calculate_ratio matches, @a.length + @b.length
- end
-
- def real_quick_ratio
- la, lb = @a.length, @b.length
- Diff.calculate_ratio([la, lb].min, la + lb)
- end
-
- protected :chain_b, :match_block_helper
- end # end class SequenceMatcher
-
- def self.calculate_ratio(matches, length)
- return 1.0 if length.zero?
- 2.0 * matches / length
- end
-
- # XXX: untested
- def self.get_close_matches(word, possibilities, n=3, cutoff=0.6)
- unless n > 0
- raise "n must be > 0: #{n}"
- end
- unless 0.0 <= cutoff and cutoff <= 1.0
- raise "cutoff must be in (0.0..1.0): #{cutoff}"
- end
-
- result = []
- s = SequenceMatcher.new
- s.set_seq_b word
- possibilities.each do |x|
- s.set_seq_a x
- if s.real_quick_ratio >= cutoff and
- s.quick_ratio >= cutoff and
- s.ratio >= cutoff
- result.push [s.ratio, x]
- end
- end
-
- unless result.nil? or result.empty?
- result.sort
- result.reverse!
- result = result[-n..-1]
- end
- result.collect { |score, x| x }
- end
-
- def self.count_leading(line, ch)
- i, n = 0, line.length
- while i < n and line[i].chr == ch
- i += 1
- end
- i
- end
-end
-
-
-module HTMLDiff
- include Diff
- class Builder
- VALID_METHODS = [:replace, :insert, :delete, :equal]
- def initialize(a, b)
- @a = a
- @b = b
- @content = []
- end
-
- def do_op(opcode)
- @opcode = opcode
- op = @opcode[0]
- VALID_METHODS.include?(op) or raise(NameError, "Invalid opcode #{op}")
- self.method(op).call
- end
-
- def result
- @content.join('')
- end
-
- #this methods have to be called via do_op(opcode) so that @opcode is set properly
- private
-
- def replace
- delete("diffmod")
- insert("diffmod")
- end
-
- def insert(tagclass="diffins")
- op_helper("ins", tagclass, @b[@opcode[3]...@opcode[4]])
- end
-
- def delete(tagclass="diffdel")
- op_helper("del", tagclass, @a[@opcode[1]...@opcode[2]])
- end
-
- def equal
- @content += @b[@opcode[3]...@opcode[4]]
- end
-
- # using this as op_helper would be equivalent to the first version of diff.rb by Bill Atkins
- def op_helper_simple(tagname, tagclass, to_add)
- @content << "<#{tagname} class=\"#{tagclass}\">"
- @content += to_add
- @content << "</#{tagname}>"
- end
-
- # this tries to put <p> tags or newline chars before the opening diff tags (<ins> or <del>)
- # or after the ending diff tags
- # as a result the diff tags should be the "more inside" possible.
- # this seems to work nice with html containing only paragraphs
- # but not sure it works if there are other tags (div, span ... ? ) around
- def op_helper(tagname, tagclass, to_add)
- @content << to_add.shift while ( HTMLDiff.is_newline(to_add.first) or
- HTMLDiff.is_p_close_tag(to_add.first) or
- HTMLDiff.is_p_open_tag(to_add.first) )
- @content << "<#{tagname} class=\"#{tagclass}\">"
- @content += to_add
- last_tags = []
- last_tags.unshift(@content.pop) while ( HTMLDiff.is_newline(@content.last) or
- HTMLDiff.is_p_close_tag(@content.last) or
- HTMLDiff.is_p_open_tag(@content.last) )
- last_tags.unshift "</#{tagname}>"
- @content += last_tags
- remove_empty_diff(tagname, tagclass)
- end
-
- def remove_empty_diff(tagname, tagclass)
- if @content[-2] == "<#{tagname} class=\"#{tagclass}\">" and @content[-1] == "</#{tagname}>" then
- @content.pop
- @content.pop
- end
- end
-
- end
-
- def self.is_newline(x)
- (x == "\n") or (x == "\r") or (x == "\t")
- end
-
- def self.is_p_open_tag(x)
- x =~ /\A<(p|li|ul|ol|dir|dt|dl)/
- end
-
- def self.is_p_close_tag(x)
- x =~ %r!\A</(p|li|ul|ol|dir|dt|dl)!
- end
-
- def self.diff(a, b)
- a = html2list(a)
- b = html2list(b)
-
- out = Builder.new(a, b)
- s = SequenceMatcher.new(a, b)
-
- s.get_opcodes.each do |opcode|
- out.do_op(opcode)
- end
-
- out.result
- end
-
- def self.html2list(x)
- mode = :char
- cur = ''
- out = []
-
- x.split('').each do |c|
- if mode == :tag
- cur += c
- if c == '>'
- out.push(cur)
- cur = ''
- mode = :char
- end
- elsif mode == :char
- if c == '<'
- out.push cur
- cur = c
- mode = :tag
- elsif c =~ /\s/
- out.push cur + c
- cur = ''
- else
- cur += c
- end
- end
- end
-
- out.push cur
- out.find_all { |x| x != '' }
- end
-
-end
+# heavily based off difflib.py - see that file for documentation
+# ported from Python by Bill Atkins
+
+# This does not support all features offered by difflib; it
+# implements only the subset of features necessary
+# to support a Ruby version of HTML Differ. You're welcome to finish this off.
+
+# By default, String#each iterates by line. This isn't really appropriate
+# for diff, so often a string will be split by // to get an array of one-
+# character strings.
+
+# Some methods in Diff are untested and are not guaranteed to work. The
+# methods in HTMLDiff and any methods it calls should work quite well.
+
+# changes by DenisMertz
+# * main change:
+# ** get the tag soup away
+# the tag soup problem was first reported with <p> tags, but it appeared also with
+# <li>, <ul> etc... tags
+# this version should mostly fix these problems
+# ** added a Builder class to manage the creation of the final htmldiff
+# * minor changes:
+# ** use symbols instead of string to represent opcodes
+# ** small fix to html2list
+#
+
+module Enumerable
+ def reduce(init)
+ result = init
+ each { |item| result = yield(result, item) }
+ result
+ end
+end
+
+module Diff
+
+ class SequenceMatcher
+ def initialize(a=[''], b=[''], isjunk=nil, byline=false)
+ a = (!byline and a.kind_of? String) ? a.split(//) : a
+ b = (!byline and b.kind_of? String) ? b.split(//) : b
+ @isjunk = isjunk || proc {}
+ set_seqs a, b
+ end
+
+ def set_seqs(a, b)
+ set_seq_a a
+ set_seq_b b
+ end
+
+ def set_seq_a(a)
+ @a = a
+ @matching_blocks = @opcodes = nil
+ end
+
+ def set_seq_b(b)
+ @b = b
+ @matching_blocks = @opcodes = nil
+ chain_b
+ end
+
+ def chain_b
+ @fullbcount = nil
+ @b2j = {}
+ pophash = {}
+ junkdict = {}
+
+ @b.each_with_index do |elt, i|
+ if @b2j.has_key? elt
+ indices = @b2j[elt]
+ if @b.length >= 200 and indices.length * 100 > @b.length
+ pophash[elt] = 1
+ indices.clear
+ else
+ indices.push i
+ end
+ else
+ @b2j[elt] = [i]
+ end
+ end
+
+ pophash.each_key { |elt| @b2j.delete elt }
+
+ junkdict = {}
+
+ unless @isjunk.nil?
+ [pophash, @b2j].each do |d|
+ d.each_key do |elt|
+ if @isjunk.call(elt)
+ junkdict[elt] = 1
+ d.delete elt
+ end
+ end
+ end
+ end
+
+ @isbjunk = junkdict.method(:has_key?)
+ @isbpopular = junkdict.method(:has_key?)
+ end
+
+ def find_longest_match(alo, ahi, blo, bhi)
+ besti, bestj, bestsize = alo, blo, 0
+
+ j2len = {}
+
+ (alo..ahi).step do |i|
+ newj2len = {}
+ (@b2j[@a[i]] || []).each do |j|
+ if j < blo
+ next
+ end
+ if j >= bhi
+ break
+ end
+
+ k = newj2len[j] = (j2len[j - 1] || 0) + 1
+ if k > bestsize
+ besti, bestj, bestsize = i - k + 1, j - k + 1, k
+ end
+ end
+ j2len = newj2len
+ end
+
+ while besti > alo and bestj > blo and
+ not @isbjunk.call(@b[bestj-1]) and
+ @a[besti-1] == @b[bestj-1]
+ besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+ end
+
+ while besti+bestsize < ahi and bestj+bestsize < bhi and
+ not @isbjunk.call(@b[bestj+bestsize]) and
+ @a[besti+bestsize] == @b[bestj+bestsize]
+ bestsize += 1
+ end
+
+ while besti > alo and bestj > blo and
+ @isbjunk.call(@b[bestj-1]) and
+ @a[besti-1] == @b[bestj-1]
+ besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+ end
+
+ while besti+bestsize < ahi and bestj+bestsize < bhi and
+ @isbjunk.call(@b[bestj+bestsize]) and
+ @a[besti+bestsize] == @b[bestj+bestsize]
+ bestsize += 1
+ end
+
+ [besti, bestj, bestsize]
+ end
+
+ def get_matching_blocks
+ return @matching_blocks unless @matching_blocks.nil? or
+ @matching_blocks.empty?
+
+ @matching_blocks = []
+ la, lb = @a.length, @b.length
+ match_block_helper(0, la, 0, lb, @matching_blocks)
+ @matching_blocks.push [la, lb, 0]
+ end
+
+ def match_block_helper(alo, ahi, blo, bhi, answer)
+ i, j, k = x = find_longest_match(alo, ahi, blo, bhi)
+ if not k.zero?
+ if alo < i and blo < j
+ match_block_helper(alo, i, blo, j, answer)
+ end
+ answer.push x
+ if i + k < ahi and j + k < bhi
+ match_block_helper(i + k, ahi, j + k, bhi, answer)
+ end
+ end
+ end
+
+ def get_opcodes
+ unless @opcodes.nil? or @opcodes.empty?
+ return @opcodes
+ end
+
+ i = j = 0
+ @opcodes = answer = []
+ get_matching_blocks.each do |ai, bj, size|
+ tag = if i < ai and j < bj
+ :replace
+ elsif i < ai
+ :delete
+ elsif j < bj
+ :insert
+ end
+
+ answer.push [tag, i, ai, j, bj] if tag
+
+ i, j = ai + size, bj + size
+
+ answer.push [:equal, ai, i, bj, j] unless size.zero?
+
+ end
+ return answer
+ end
+
+ # XXX: untested
+ def get_grouped_opcodes(n=3)
+ codes = get_opcodes
+ if codes[0][0] == :equal
+ tag, i1, i2, j1, j2 = codes[0]
+ codes[0] = tag, [i1, i2 - n].max, i2, [j1, j2-n].max, j2
+ end
+
+ if codes[-1][0] == :equal
+ tag, i1, i2, j1, j2 = codes[-1]
+ codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
+ end
+ nn = n + n
+ group = []
+ codes.each do |tag, i1, i2, j1, j2|
+ if tag == :equal and i2-i1 > nn
+ group.push [tag, i1, [i2, i1 + n].min, j1, [j2, j1 + n].min]
+ yield group
+ group = []
+ i1, j1 = [i1, i2-n].max, [j1, j2-n].max
+ group.push [tag, i1, i2, j1 ,j2]
+ end
+ end
+ if group and group.length != 1 and group[0][0] == :equal
+ yield group
+ end
+ end
+
+ def ratio
+ matches = get_matching_blocks.reduce(0) do |sum, triple|
+ sum + triple[-1]
+ end
+ Diff.calculate_ratio(matches, @a.length + @b.length)
+ end
+
+ def quick_ratio
+ if @fullbcount.nil? or @fullbcount.empty?
+ @fullbcount = {}
+ @b.each do |elt|
+ @fullbcount[elt] = (@fullbcount[elt] || 0) + 1
+ end
+ end
+
+ avail = {}
+ matches = 0
+ @a.each do |elt|
+ if avail.has_key? elt
+ numb = avail[elt]
+ else
+ numb = @fullbcount[elt] || 0
+ end
+ avail[elt] = numb - 1
+ if numb > 0
+ matches += 1
+ end
+ end
+ Diff.calculate_ratio matches, @a.length + @b.length
+ end
+
+ def real_quick_ratio
+ la, lb = @a.length, @b.length
+ Diff.calculate_ratio([la, lb].min, la + lb)
+ end
+
+ protected :chain_b, :match_block_helper
+ end # end class SequenceMatcher
+
+ def self.calculate_ratio(matches, length)
+ return 1.0 if length.zero?
+ 2.0 * matches / length
+ end
+
+ # XXX: untested
+ def self.get_close_matches(word, possibilities, n=3, cutoff=0.6)
+ unless n > 0
+ raise "n must be > 0: #{n}"
+ end
+ unless 0.0 <= cutoff and cutoff <= 1.0
+ raise "cutoff must be in (0.0..1.0): #{cutoff}"
+ end
+
+ result = []
+ s = SequenceMatcher.new
+ s.set_seq_b word
+ possibilities.each do |x|
+ s.set_seq_a x
+ if s.real_quick_ratio >= cutoff and
+ s.quick_ratio >= cutoff and
+ s.ratio >= cutoff
+ result.push [s.ratio, x]
+ end
+ end
+
+ unless result.nil? or result.empty?
+ result.sort
+ result.reverse!
+ result = result[-n..-1]
+ end
+ result.collect { |score, x| x }
+ end
+
+ def self.count_leading(line, ch)
+ i, n = 0, line.length
+ while i < n and line[i].chr == ch
+ i += 1
+ end
+ i
+ end
+end
+
+
+module HTMLDiff
+ include Diff
+ class Builder
+ VALID_METHODS = [:replace, :insert, :delete, :equal]
+ def initialize(a, b)
+ @a = a
+ @b = b
+ @content = []
+ end
+
+ def do_op(opcode)
+ @opcode = opcode
+ op = @opcode[0]
+ VALID_METHODS.include?(op) or raise(NameError, "Invalid opcode #{op}")
+ self.method(op).call
+ end
+
+ def result
+ @content.join('')
+ end
+
+ #this methods have to be called via do_op(opcode) so that @opcode is set properly
+ private
+
+ def replace
+ delete("diffmod")
+ insert("diffmod")
+ end
+
+ def insert(tagclass="diffins")
+ op_helper("ins", tagclass, @b[@opcode[3]...@opcode[4]])
+ end
+
+ def delete(tagclass="diffdel")
+ op_helper("del", tagclass, @a[@opcode[1]...@opcode[2]])
+ end
+
+ def equal
+ @content += @b[@opcode[3]...@opcode[4]]
+ end
+
+ # using this as op_helper would be equivalent to the first version of diff.rb by Bill Atkins
+ def op_helper_simple(tagname, tagclass, to_add)
+ @content << "<#{tagname} class=\"#{tagclass}\">"
+ @content += to_add
+ @content << "</#{tagname}>"
+ end
+
+ # this tries to put <p> tags or newline chars before the opening diff tags (<ins> or <del>)
+ # or after the ending diff tags
+ # as a result the diff tags should be the "more inside" possible.
+ # this seems to work nice with html containing only paragraphs
+ # but not sure it works if there are other tags (div, span ... ? ) around
+ def op_helper(tagname, tagclass, to_add)
+ @content << to_add.shift while ( HTMLDiff.is_newline(to_add.first) or
+ HTMLDiff.is_p_close_tag(to_add.first) or
+ HTMLDiff.is_p_open_tag(to_add.first) )
+ @content << "<#{tagname} class=\"#{tagclass}\">"
+ @content += to_add
+ last_tags = []
+ last_tags.unshift(@content.pop) while ( HTMLDiff.is_newline(@content.last) or
+ HTMLDiff.is_p_close_tag(@content.last) or
+ HTMLDiff.is_p_open_tag(@content.last) )
+ last_tags.unshift "</#{tagname}>"
+ @content += last_tags
+ remove_empty_diff(tagname, tagclass)
+ end
+
+ def remove_empty_diff(tagname, tagclass)
+ if @content[-2] == "<#{tagname} class=\"#{tagclass}\">" and @content[-1] == "</#{tagname}>" then
+ @content.pop
+ @content.pop
+ end
+ end
+
+ end
+
+ def self.is_newline(x)
+ (x == "\n") or (x == "\r") or (x == "\t")
+ end
+
+ def self.is_p_open_tag(x)
+ x =~ /\A<(p|li|ul|ol|dir|dt|dl)/
+ end
+
+ def self.is_p_close_tag(x)
+ x =~ %r!\A</(p|li|ul|ol|dir|dt|dl)!
+ end
+
+ def self.diff(a, b)
+ a = html2list(a)
+ b = html2list(b)
+
+ out = Builder.new(a, b)
+ s = SequenceMatcher.new(a, b)
+
+ s.get_opcodes.each do |opcode|
+ out.do_op(opcode)
+ end
+
+ out.result
+ end
+
+ def self.html2list(x)
+ mode = :char
+ cur = ''
+ out = []
+
+ x.split('').each do |c|
+ if mode == :tag
+ cur += c
+ if c == '>'
+ out.push(cur)
+ cur = ''
+ mode = :char
+ end
+ elsif mode == :char
+ if c == '<'
+ out.push cur
+ cur = c
+ mode = :tag
+ elsif c =~ /\s/
+ out.push cur + c
+ cur = ''
+ else
+ cur += c
+ end
+ end
+ end
+
+ out.push cur
+ out.find_all { |x| x != '' }
+ end
+
+end