require "diff_methods" require "patch_obj" # Class containing the diff, match and patch methods. # Also contains the behaviour settings. class DiMaPa include DiffMethods attr_accessor :diff_edit_cost attr_accessor :match_threshold attr_accessor :match_distance attr_accessor :patch_delete_threshold attr_accessor :patch_margin attr_reader :match_max_bits def initialize # Inits a diff_match_patch object with default settings. # Redefine these in your program to override the defaults. # Cost of an empty edit operation in terms of edit characters. @diff_edit_cost = 4 # At what point is no match declared (0.0 = perfection, 1.0 = very loose). @match_threshold = 0.5 # How far to search for a match (0 = exact location, 1000+ = broad match). # A match this many characters away from the expected location will add # 1.0 to the score (0.0 is a perfect match). @match_distance = 1000 # When deleting a large block of text (over ~64 characters), how close does # the contents have to match the expected contents. (0.0 = perfection, # 1.0 = very loose). Note that Match_Threshold controls how closely the # end points of a delete need to match. @patch_delete_threshold = 0.5 # Chunk size for context length. @patch_margin = 4 # The number of bits in an int. # Python has no maximum, thus to disable patch splitting set to 0. # However to avoid long patches in certain pathological cases, use 32. # Multiple short patches (using native ints) are much faster than long ones. @match_max_bits = 32 super end # Do a quick line-level diff on both strings, then rediff the parts for # greater accuracy. # This speedup can produce non-minimal diffs. def diff_line_mode(text1, text2, deadline) # Scan the text on a line-by-line basis first. text1, text2, line_array = diff_lines_to_chars(text1, text2) diffs = diff_main(text1, text2, false, deadline) # Convert the diff back to original text. diff_chars_to_lines(diffs, line_array) # Eliminate freak matches (e.g. blank lines) diff_cleanup_semantic(diffs) # Rediff any replacement blocks, this time character-by-character. # Add a dummy entry at the end. diffs.push([:equal, ""]) pointer = 0 count_delete = 0 count_insert = 0 text_delete = "" text_insert = "" while pointer < diffs.length case diffs[pointer][0] when :insert count_insert += 1 text_insert += diffs[pointer][1] when :delete count_delete += 1 text_delete += diffs[pointer][1] when :equal # Upon reaching an equality, check for prior redundancies. if count_delete >= 1 && count_insert >= 1 # Delete the offending records and add the merged ones. a = diff_main(text_delete, text_insert, false, deadline) diffs[pointer - count_delete - count_insert, count_delete + count_insert] = [] pointer = pointer - count_delete - count_insert diffs[pointer, 0] = a pointer += a.length end count_insert = 0 count_delete = 0 text_delete = "" text_insert = "" end pointer += 1 end diffs.pop # Remove the dummy entry at the end. diffs end # Find the 'middle snake' of a diff, split the problem in two # and return the recursively constructed diff. # See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. def diff_bisect(text1, text2, deadline) # Cache the text lengths to prevent multiple calls. text1_length = text1.length text2_length = text2.length max_d = (text1_length + text2_length + 1) / 2 v_offset = max_d v_length = 2 * max_d v1 = Array.new(v_length, -1) v2 = Array.new(v_length, -1) v1[v_offset + 1] = 0 v2[v_offset + 1] = 0 delta = text1_length - text2_length # If the total number of characters is odd, then the front path will # collide with the reverse path. front = (delta % 2 != 0) # Offsets for start and end of k loop. # Prevents mapping of space beyond the grid. k1start = 0 k1end = 0 k2start = 0 k2end = 0 max_d.times do |d| # Bail out if deadline is reached. break if deadline && Time.now >= deadline # Walk the front path one step. (-d + k1start).step(d - k1end, 2) do |k1| k1_offset = v_offset + k1 x1 = if k1 == -d || k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1] v1[k1_offset + 1] else v1[k1_offset - 1] + 1 end y1 = x1 - k1 while x1 < text1_length && y1 < text2_length && text1[x1] == text2[y1] x1 += 1 y1 += 1 end v1[k1_offset] = x1 if x1 > text1_length # Ran off the right of the graph. k1end += 2 elsif y1 > text2_length # Ran off the bottom of the graph. k1start += 2 elsif front k2_offset = v_offset + delta - k1 if k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1 # Mirror x2 onto top-left coordinate system. x2 = text1_length - v2[k2_offset] if x1 >= x2 # Overlap detected. return diff_bisect_split(text1, text2, x1, y1, deadline) end end end end # Walk the reverse path one step. (-d + k2start).step(d - k2end, 2) do |k2| k2_offset = v_offset + k2 x2 = if k2 == -d || k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1] v2[k2_offset + 1] else v2[k2_offset - 1] + 1 end y2 = x2 - k2 while x2 < text1_length && y2 < text2_length && text1[-x2 - 1] == text2[-y2 - 1] x2 += 1 y2 += 1 end v2[k2_offset] = x2 if x2 > text1_length # Ran off the left of the graph. k2end += 2 elsif y2 > text2_length # Ran off the top of the graph. k2start += 2 elsif !front k1_offset = v_offset + delta - k2 if k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1 x1 = v1[k1_offset] y1 = v_offset + x1 - k1_offset # Mirror x2 onto top-left coordinate system. x2 = text1_length - x2 if x1 >= x2 # Overlap detected. return diff_bisect_split(text1, text2, x1, y1, deadline) end end end end end # Diff took too long and hit the deadline or # number of diffs equals number of characters, no commonality at all. [[:delete, text1], [:insert, text2]] end # Given the location of the 'middle snake', split the diff in two parts # and recurse. def diff_bisect_split(text1, text2, x, y, deadline) text1a = text1[0...x] text2a = text2[0...y] text1b = text1[x..-1] text2b = text2[y..-1] # Compute both diffs serially. diffs = diff_main(text1a, text2a, false, deadline) diffsb = diff_main(text1b, text2b, false, deadline) diffs + diffsb end # Split two texts into an array of strings. Reduce the texts to a string # of hashes where each Unicode character represents one line. def diff_lines_to_chars(text1, text2) line_array = [""] # e.g. line_array[4] == "Hello\n" line_hash = {} # e.g. line_hash["Hello\n"] == 4 [text1, text2].map { |text| # Split text into an array of strings. Reduce the text to a string of # hashes where each Unicode character represents one line. chars = "" text.each_line do |line| if line_hash[line] chars += line_hash[line].chr(Encoding::UTF_8) else chars += line_array.length.chr(Encoding::UTF_8) line_hash[line] = line_array.length line_array.push(line) end end chars }.push(line_array) end # Rehydrate the text in a diff from a string of line hashes to real lines of text. def diff_chars_to_lines(diffs, line_array) diffs.each do |diff| diff[1] = diff[1].chars.map { |c| line_array[c.ord] }.join end end # Determine the common prefix of two strings. def diff_common_prefix(text1, text2) # Quick check for common null cases. return 0 if text1.empty? || text2.empty? || text1[0] != text2[0] # Binary search. # Performance analysis: http://neil.fraser.name/news/2007/10/09/ pointer_min = 0 pointer_max = [text1.length, text2.length].min pointer_mid = pointer_max pointer_start = 0 while pointer_min < pointer_mid if text1[pointer_start...pointer_mid] == text2[pointer_start...pointer_mid] pointer_min = pointer_mid pointer_start = pointer_min else pointer_max = pointer_mid end pointer_mid = (pointer_max - pointer_min) / 2 + pointer_min end pointer_mid end # Determine the common suffix of two strings. def diff_common_suffix(text1, text2) # Quick check for common null cases. return 0 if text1.empty? || text2.empty? || text1[-1] != text2[-1] # Binary search. # Performance analysis: http://neil.fraser.name/news/2007/10/09/ pointer_min = 0 pointer_max = [text1.length, text2.length].min pointer_mid = pointer_max pointer_end = 0 while pointer_min < pointer_mid if text1[-pointer_mid..(-pointer_end - 1)] == text2[-pointer_mid..(-pointer_end - 1)] pointer_min = pointer_mid pointer_end = pointer_min else pointer_max = pointer_mid end pointer_mid = (pointer_max - pointer_min) / 2 + pointer_min end pointer_mid end # Determine if the suffix of one string is the prefix of another. def diff_common_overlap(text1, text2) # Cache the text lengths to prevent multiple calls. text1_length = text1.length text2_length = text2.length # Eliminate the null case. return 0 if text1_length.zero? || text2_length.zero? # Truncate the longer string. if text1_length > text2_length text1 = text1[-text2_length..-1] else text2 = text2[0...text1_length] end text_length = [text1_length, text2_length].min # Quick check for the whole case. return text_length if text1 == text2 # Start by looking for a single character match # and increase length until no match is found. # Performance analysis: http://neil.fraser.name/news/2010/11/04/ best = 0 length = 1 loop do pattern = text1[(text_length - length)..-1] found = text2.index(pattern) return best if found.nil? length += found if found == 0 || text1[(text_length - length)..-1] == text2[0..length] best = length length += 1 end end end # Does a substring of shorttext exist within longtext such that the # substring is at least half the length of longtext? def diff_half_match_i(longtext, shorttext, i) seed = longtext[i, longtext.length / 4] j = -1 best_common = "" while (j = shorttext.index(seed, j + 1)) prefix_length = diff_common_prefix(longtext[i..-1], shorttext[j..-1]) suffix_length = diff_common_suffix(longtext[0...i], shorttext[0...j]) if best_common.length < suffix_length + prefix_length best_common = shorttext[(j - suffix_length)...j] + shorttext[j...(j + prefix_length)] best_longtext_a = longtext[0...(i - suffix_length)] best_longtext_b = longtext[(i + prefix_length)..-1] best_shorttext_a = shorttext[0...(j - suffix_length)] best_shorttext_b = shorttext[(j + prefix_length)..-1] end end if best_common.length * 2 >= longtext.length [best_longtext_a, best_longtext_b, best_shorttext_a, best_shorttext_b, best_common] end end # Do the two texts share a substring which is at least half the length of the # longer text? # This speedup can produce non-minimal diffs. def diff_half_match(text1, text2) # Don't risk returning a non-optimal diff if we have unlimited time return nil if diff_timeout <= 0 shorttext, longtext = [text1, text2].sort_by(&:length) if longtext.length < 4 || shorttext.length * 2 < longtext.length return nil # Pointless. end # First check if the second quarter is the seed for a half-match. hm1 = diff_half_match_i(longtext, shorttext, (longtext.length + 3) / 4) # Check again based on the third quarter. hm2 = diff_half_match_i(longtext, shorttext, (longtext.length + 1) / 2) if hm1.nil? && hm2.nil? return nil elsif hm2.nil? || hm1.nil? hm = hm2.nil? ? hm1 : hm2 else # Both matched. Select the longest. hm = hm1[4].length > hm2[4].length ? hm1 : hm2 end # A half-match was found, sort out the return data. if text1.length > text2.length text1_a, text1_b, text2_a, text2_b, mid_common = hm else text2_a, text2_b, text1_a, text1_b, mid_common = hm end [text1_a, text1_b, text2_a, text2_b, mid_common] end # Reduce the number of edits by eliminating semantically trivial equalities. def diff_cleanup_semantic(diffs) changes = false equalities = [] # Stack of indices where equalities are found. last_equality = nil # Always equal to equalities.last[1] pointer = 0 # Index of current position. # Number of characters that changed prior to the equality. length_insertions1 = 0 length_deletions1 = 0 # Number of characters that changed after the equality. length_insertions2 = 0 length_deletions2 = 0 while pointer < diffs.length if diffs[pointer][0] == :equal # Equality found. equalities.push(pointer) length_insertions1 = length_insertions2 length_deletions1 = length_deletions2 length_insertions2 = 0 length_deletions2 = 0 last_equality = diffs[pointer][1] else # An insertion or deletion. if diffs[pointer][0] == :insert length_insertions2 += diffs[pointer][1].length else length_deletions2 += diffs[pointer][1].length end if last_equality && last_equality.length <= [length_insertions1, length_deletions1].max && last_equality.length <= [length_insertions2, length_deletions2].max # Duplicate record. diffs[equalities.last, 0] = [[:delete, last_equality]] # Change second copy to insert. diffs[equalities.last + 1][0] = :insert # Throw away the equality we just deleted. equalities.pop # Throw away the previous equality (it needs to be reevaluated). equalities.pop pointer = equalities.last || -1 # Reset the counters. length_insertions1 = 0 length_deletions1 = 0 length_insertions2 = 0 length_deletions2 = 0 last_equality = nil changes = true end end pointer += 1 end # Normalize the diff. diff_cleanup_merge(diffs) if changes diff_cleanup_semantic_lossless(diffs) # Find any overlaps between deletions and insertions. # e.g: abcxxxxxxdef # -> abcxxxdef # e.g: xxxabcdefxxx # -> defxxxabc # Only extract an overlap if it is as big as the edit ahead or behind it. pointer = 1 while pointer < diffs.length if diffs[pointer - 1][0] == :delete && diffs[pointer][0] == :insert deletion = diffs[pointer - 1][1] insertion = diffs[pointer][1] overlap_length1 = diff_common_overlap(deletion, insertion) overlap_length2 = diff_common_overlap(insertion, deletion) if overlap_length1 >= overlap_length2 if overlap_length1 >= deletion.length / 2.0 || overlap_length1 >= insertion.length / 2.0 # Overlap found. Insert an equality and trim the surrounding edits. diffs[pointer, 0] = [[:equal, insertion[0...overlap_length1]]] diffs[pointer - 1][0] = :delete diffs[pointer - 1][1] = deletion[0...-overlap_length1] diffs[pointer + 1][0] = :insert diffs[pointer + 1][1] = insertion[overlap_length1..-1] pointer += 1 end elsif overlap_length2 >= deletion.length / 2.0 || overlap_length2 >= insertion.length / 2.0 diffs[pointer, 0] = [[:equal, deletion[0...overlap_length2]]] diffs[pointer - 1][0] = :insert diffs[pointer - 1][1] = insertion[0...-overlap_length2] diffs[pointer + 1][0] = :delete diffs[pointer + 1][1] = deletion[overlap_length2..-1] pointer += 1 end pointer += 1 end pointer += 1 end end # Given two strings, compute a score representing whether the # internal boundary falls on logical boundaries. # Scores range from 5 (best) to 0 (worst). def diff_cleanup_semantic_score(one, two) if one.empty? || two.empty? # Edges are the best. return 5 end # Define some regex patterns for matching boundaries. non_word_character = /[^a-zA-Z0-9]/ whitespace = /\s/ linebreak = /[\r\n]/ line_end = /\n\r?\n$/ line_start = /^\r?\n\r?\n/ # Each port of this function behaves slightly differently due to # subtle differences in each language's definition of things like # 'whitespace'. Since this function's purpose is largely cosmetic, # the choice has been made to use each language's native features # rather than force total conformity. score = 0 # One point for non-alphanumeric. if one[-1] =~ non_word_character || two[0] =~ non_word_character score += 1 # Two points for whitespace. if one[-1] =~ whitespace || two[0] =~ whitespace score += 1 # Three points for line breaks. if one[-1] =~ linebreak || two[0] =~ linebreak score += 1 # Four points for blank lines. if one =~ line_end || two =~ line_start score += 1 end end end end score end # Look for single edits surrounded on both sides by equalities # which can be shifted sideways to align the edit to a word boundary. # e.g: The cat came. -> The cat came. def diff_cleanup_semantic_lossless(diffs) pointer = 1 # Intentionally ignore the first and last element (don't need checking). while pointer < diffs.length - 1 if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal # This is a single edit surrounded by equalities. equality1 = diffs[pointer - 1][1] edit = diffs[pointer][1] equality2 = diffs[pointer + 1][1] # First, shift the edit as far left as possible. common_offset = diff_common_suffix(equality1, edit) if common_offset != 0 common_string = edit[-common_offset..-1] equality1 = equality1[0...-common_offset] edit = common_string + edit[0...-common_offset] equality2 = common_string + equality2 end # Second, step character by character right, looking for the best fit. best_equality1 = equality1 best_edit = edit best_equality2 = equality2 best_score = diff_cleanup_semantic_score(equality1, edit) + diff_cleanup_semantic_score(edit, equality2) while edit[0] == equality2[0] equality1 += edit[0] edit = edit[1..-1] + equality2[0] equality2 = equality2[1..-1] score = diff_cleanup_semantic_score(equality1, edit) + diff_cleanup_semantic_score(edit, equality2) # The >= encourages trailing rather than leading whitespace on edits. if score >= best_score best_score = score best_equality1 = equality1 best_edit = edit best_equality2 = equality2 end end if diffs[pointer - 1][1] != best_equality1 # We have an improvement, save it back to the diff. if best_equality1.empty? diffs[pointer - 1, 1] = [] pointer -= 1 else diffs[pointer - 1][1] = best_equality1 end diffs[pointer][1] = best_edit if best_equality2.empty? diffs[pointer + 1, 1] = [] pointer -= 1 else diffs[pointer + 1][1] = best_equality2 end end end pointer += 1 end end # Reduce the number of edits by eliminating operationally trivial equalities. def diff_cleanup_efficiency(diffs) changes = false equalities = [] # Stack of indices where equalities are found. last_equality = "" # Always equal to equalities.last[1] pointer = 0 # Index of current position. pre_ins = false # Is there an insertion operation before the last equality. pre_del = false # Is there a deletion operation before the last equality. post_ins = false # Is there an insertion operation after the last equality. post_del = false # Is there a deletion operation after the last equality. while pointer < diffs.length if diffs[pointer][0] == :equal # Equality found. if diffs[pointer][1].length < diff_edit_cost && (post_ins || post_del) # Candidate found. equalities.push(pointer) pre_ins = post_ins pre_del = post_del last_equality = diffs[pointer][1] else # Not a candidate, and can never become one. equalities.clear last_equality = "" end post_ins = false post_del = false else # An insertion or deletion. if diffs[pointer][0] == :delete post_del = true else post_ins = true end # Five types to be split: # ABXYCD # AXCD # ABXC # AXCD # ABXC if !last_equality.empty? && ((pre_ins && pre_del && post_ins && post_del) || ((last_equality.length < diff_edit_cost / 2) && [pre_ins, pre_del, post_ins, post_del].count(true) == 3)) # Duplicate record. diffs[equalities.last, 0] = [[:delete, last_equality]] # Change second copy to insert. diffs[equalities.last + 1][0] = :insert equalities.pop # Throw away the equality we just deleted last_equality = "" if pre_ins && pre_del # No changes made which could affect previous entry, keep going. post_ins = true post_del = true equalities.clear else unless equalities.empty? equalities.pop # Throw away the previous equality. pointer = equalities.last || -1 end post_ins = false post_del = false end changes = true end end pointer += 1 end if changes diff_cleanup_merge(diffs) end end # Reorder and merge like edit sections. Merge equalities. # Any edit section can move as long as it doesn't cross an equality. def diff_cleanup_merge(diffs) diffs.push([:equal, ""]) # Add a dummy entry at the end. pointer = 0 count_delete = 0 count_insert = 0 text_delete = "" text_insert = "" while pointer < diffs.length case diffs[pointer][0] when :insert count_insert += 1 text_insert += diffs[pointer][1] pointer += 1 when :delete count_delete += 1 text_delete += diffs[pointer][1] pointer += 1 when :equal # Upon reaching an equality, check for prior redundancies. if count_delete + count_insert > 1 if count_delete != 0 && count_insert != 0 # Factor out any common prefixies. common_length = diff_common_prefix(text_insert, text_delete) if common_length != 0 if (pointer - count_delete - count_insert) > 0 && diffs[pointer - count_delete - count_insert - 1][0] == :equal diffs[pointer - count_delete - count_insert - 1][1] += text_insert[0...common_length] else diffs.unshift([:equal, text_insert[0...common_length]]) pointer += 1 end text_insert = text_insert[common_length..-1] text_delete = text_delete[common_length..-1] end # Factor out any common suffixies. common_length = diff_common_suffix(text_insert, text_delete) if common_length != 0 diffs[pointer][1] = text_insert[-common_length..-1] + diffs[pointer][1] text_insert = text_insert[0...-common_length] text_delete = text_delete[0...-common_length] end end # Delete the offending records and add the merged ones. diffs[pointer - count_delete - count_insert, count_delete + count_insert] = if count_delete.zero? [[:insert, text_insert]] elsif count_insert.zero? [[:delete, text_delete]] else [[:delete, text_delete], [:insert, text_insert]] end pointer = pointer - count_delete - count_insert + (count_delete.zero? ? 0 : 1) + (count_insert.zero? ? 0 : 1) + 1 elsif pointer != 0 && diffs[pointer - 1][0] == :equal # Merge this equality with the previous one. diffs[pointer - 1][1] += diffs[pointer][1] diffs[pointer, 1] = [] else pointer += 1 end count_insert = 0 count_delete = 0 text_delete = "" text_insert = "" end end if diffs.last[1].empty? diffs.pop # Remove the dummy entry at the end. end # Second pass: look for single edits surrounded on both sides by equalities # which can be shifted sideways to eliminate an equality. # e.g: ABAC -> ABAC changes = false pointer = 1 # Intentionally ignore the first and last element (don't need checking). while pointer < diffs.length - 1 if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal # This is a single edit surrounded by equalities. if diffs[pointer][1][-diffs[pointer - 1][1].length..-1] == diffs[pointer - 1][1] # Shift the edit over the previous equality. diffs[pointer][1] = diffs[pointer - 1][1] + diffs[pointer][1][0...-diffs[pointer - 1][1].length] diffs[pointer + 1][1] = diffs[pointer - 1][1] + diffs[pointer + 1][1] diffs[pointer - 1, 1] = [] changes = true elsif diffs[pointer][1][0...diffs[pointer + 1][1].length] == diffs[pointer + 1][1] # Shift the edit over the next equality. diffs[pointer - 1][1] += diffs[pointer + 1][1] diffs[pointer][1] = diffs[pointer][1][diffs[pointer + 1][1].length..-1] + diffs[pointer + 1][1] diffs[pointer + 1, 1] = [] changes = true end end pointer += 1 end # If shifts were made, the diff needs reordering and another shift sweep. if changes diff_cleanup_merge(diffs) end end # loc is a location in text1, compute and return the equivalent location # in text2. e.g. 'The cat' vs 'The big cat', 1->1, 5->8 def diff_x_index(diffs, loc) chars1 = 0 chars2 = 0 last_chars1 = 0 last_chars2 = 0 x = diffs.index { |diff| if diff[0] != :insert chars1 += diff[1].length end if diff[0] != :delete chars2 += diff[1].length end if chars1 > loc true else last_chars1 = chars1 last_chars2 = chars2 false end } if !x.nil? && diffs.length != x && diffs[x][0] == :delete # The location was deleted. last_chars2 else # Add the remaining len(character). last_chars2 + (loc - last_chars1) end end # Convert a diff array into a pretty HTML report. def diff_pretty_html(diffs) diffs.map { |op, data| text = data.gsub("&", "&").gsub("<", "<").gsub(">", ">").gsub('\n', "¶
") case op when :insert "#{text}" when :delete "#{text}" when :equal "#{text}" end }.join end # Compute and return the source text (all equalities and deletions). def diff_text1(diffs) diffs.map { |op, data| if op == :insert "" else data end }.join end # Compute and return the destination text (all equalities and insertions). def diff_text2(diffs) diffs.map { |op, data| if op == :delete "" else data end }.join end # Compute the Levenshtein distance; the number of inserted, deleted or # substituted characters. def diff_levenshtein(diffs) levenshtein = 0 insertions = 0 deletions = 0 diffs.each do |op, data| case op when :insert insertions += data.length when :delete deletions += data.length when :equal # A deletion and an insertion is one substitution. levenshtein += [insertions, deletions].max insertions = 0 deletions = 0 end end levenshtein + [insertions, deletions].max end # Crush the diff into an encoded string which describes the operations # required to transform text1 into text2. # E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. # Operations are tab-separated. Inserted text is escaped using %xx notation. def diff_to_delta(diffs) diffs.map { |op, data| case op when :insert "+" + PatchObj::PATCH_PARSER.escape(data, /[^0-9A-Za-z_.;!~*'(),\/?:@&=+$\#-]/) when :delete "-" + data.length.to_s when :equal "=" + data.length.to_s end }.join("\t").gsub("%20", " ") end # Given the original text1, and an encoded string which describes the # operations required to transform text1 into text2, compute the full diff. def diff_from_delta(text1, delta) # Deltas should be composed of a subset of ascii chars, Unicode not required. delta.encode("ascii") diffs = [] pointer = 0 # Cursor in text1 delta.split("\t").each do |token| # Each token begins with a one character parameter which specifies the # operation of this token (delete, insert, equality). param = token[1..-1] case token[0] when "+" diffs.push([:insert, PatchObj::PATCH_PARSER.unescape(param.force_encoding(Encoding::UTF_8))]) when "-", "=" begin n = Integer(param) raise if n < 0 text = text1[pointer...(pointer + n)] pointer += n if token[0] == "=" diffs.push([:equal, text]) else diffs.push([:delete, text]) end rescue ArgumentError => _ raise ArgumentError.new( "Invalid number in diff_fromDelta: #{param.inspect}" ) end else raise ArgumentError.new( "Invalid diff operation in diff_fromDelta: #{token.inspect}" ) end end if pointer != text1.length raise ArgumentError.new("Delta length (#{pointer}) does not equal " \ "source text length #{text1.length}") end diffs end # Locate the best instance of 'pattern' in 'text' near 'loc'. def match_main(text, pattern, loc) # Check for null inputs. if [text, pattern].any?(&:nil?) raise ArgumentError.new("Null input. (match_main)") end loc = [0, [loc, text.length].min].max if text == pattern # Shortcut (potentially not guaranteed by the algorithm) 0 elsif text.empty? # Nothing to match -1 elsif text[loc, pattern.length] == pattern # Perfect match at the perfect spot! (Includes case of null pattern) loc else # Do a fuzzy compare. match_bitap(text, pattern, loc) end end # Locate the best instance of 'pattern' in 'text' near 'loc' using the # Bitap algorithm. def match_bitap(text, pattern, loc) if pattern.length > match_max_bits throw ArgumentError.new("Pattern too long") end # Initialise the alphabet. s = match_alphabet(pattern) # Compute and return the score for a match with e errors and x location. match_bitap_score = ->(e, x) do accuracy = e.to_f / pattern.length proximity = (loc - x).abs if match_distance == 0 # Dodge divide by zero error. return proximity == 0 ? accuracy : 1.0 end return accuracy + (proximity.to_f / match_distance) end # Highest score beyond which we give up. score_threshold = match_threshold # Is there a nearby exact match? (speedup) best_loc = text.index(pattern, loc) if best_loc score_threshold = [match_bitap_score[0, best_loc], score_threshold].min # What about in the other direction? (speedup) best_loc = text.rindex(pattern, loc + pattern.length) if best_loc score_threshold = [match_bitap_score[0, best_loc], score_threshold].min end end # Initialise the bit arrays. match_mask = 1 << (pattern.length - 1) best_loc = -1 bin_max = pattern.length + text.length # Empty initialization added to appease pychecker. last_rd = nil pattern.length.times do |d| # Scan for the best match; each iteration allows for one more error. # Run a binary search to determine how far from 'loc' we can stray at this # error level. bin_min = 0 bin_mid = bin_max while bin_min < bin_mid if match_bitap_score[d, loc + bin_mid] <= score_threshold bin_min = bin_mid else bin_max = bin_mid end bin_mid = (bin_max - bin_min) / 2 + bin_min end # Use the result from this iteration as the maximum for the next. bin_max = bin_mid start = [1, loc - bin_mid + 1].max finish = [loc + bin_mid, text.length].min + pattern.length rd = Array.new(finish + 2, 0) rd[finish + 1] = (1 << d) - 1 finish.downto(start) do |j| char_match = s[text[j - 1]] || 0 rd[j] = if d == 0 # First pass: exact match. ((rd[j + 1] << 1) | 1) & char_match else # Subsequent passes: fuzzy match. ((rd[j + 1] << 1) | 1) & char_match | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1] end if (rd[j] & match_mask).nonzero? score = match_bitap_score[d, j - 1] # This match will almost certainly be better than any existing match. # But check anyway. if score <= score_threshold # Told you so. score_threshold = score best_loc = j - 1 if best_loc > loc # When passing loc, don't exceed our current distance from loc. start = [1, 2 * loc - best_loc].max else # Already passed loc, downhill from here on in. break end end end end # No hope for a (better) match at greater error levels. if match_bitap_score[d + 1, loc] > score_threshold break end last_rd = rd end best_loc end # Initialise the alphabet for the Bitap algorithm. def match_alphabet(pattern) s = {} pattern.chars.each_with_index do |c, i| s[c] ||= 0 s[c] |= 1 << (pattern.length - i - 1) end s end # Parse a textual representation of patches and return a list of patch # objects. def patch_from_text(textline) return [] if textline.empty? patches = [] text = textline.split("\n") text_pointer = 0 patch_header = /^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/ while text_pointer < text.length m = text[text_pointer].match(patch_header) if m.nil? raise ArgumentError.new("Invalid patch string: #{text[text_pointer]}") end patch = PatchObj.new patches.push(patch) patch.start1 = m[1].to_i if m[2].empty? patch.start1 -= 1 patch.length1 = 1 elsif m[2] == "0" patch.length1 = 0 else patch.start1 -= 1 patch.length1 = m[2].to_i end patch.start2 = m[3].to_i if m[4].empty? patch.start2 -= 1 patch.length2 = 1 elsif m[4] == "0" patch.length2 = 0 else patch.start2 -= 1 patch.length2 = m[4].to_i end text_pointer += 1 while text_pointer < text.length if text[text_pointer].empty? # Blank line? Whatever. text_pointer += 1 next end sign = text[text_pointer][0] line = PatchObj::PATCH_PARSER.unescape(text[text_pointer][1..-1].force_encoding(Encoding::UTF_8)) case sign when "-" # Deletion. patch.diffs.push([:delete, line]) when "+" # Insertion. patch.diffs.push([:insert, line]) when " " # Minor equality patch.diffs.push([:equal, line]) when "@" # Start of next patch. break else # WTF? raise ArgumentError.new("Invalid patch mode \"#{sign}\" in: #{line}") end text_pointer += 1 end end patches end # Take a list of patches and return a textual representation def patch_to_text(patches) patches.join end # Increase the context until it is unique, # but don't let the pattern expand beyond match_max_bits def patch_add_context(patch, text) return if text.empty? pattern = text[patch.start2, patch.length1] padding = 0 # Look for the first and last matches of pattern in text. If two different # matches are found, increase the pattern length. while text.index(pattern) != text.rindex(pattern) && pattern.length < match_max_bits - 2 * patch_margin padding += patch_margin pattern = text[[0, patch.start2 - padding].max...(patch.start2 + patch.length1 + padding)] end # Add one chunk for good luck. padding += patch_margin # Add the prefix. prefix = text[[0, patch.start2 - padding].max...patch.start2] patch.diffs.unshift([:equal, prefix]) unless prefix.to_s.empty? # Add the suffix. suffix = text[patch.start2 + patch.length1, padding] patch.diffs.push([:equal, suffix]) unless suffix.to_s.empty? # Roll back the start points. patch.start1 -= prefix.length patch.start2 -= prefix.length # Extend the lengths. patch.length1 += prefix.length + suffix.length patch.length2 += prefix.length + suffix.length end # Compute a list of patches to turn text1 into text2. # Use diffs if provided, otherwise compute it ourselves. # There are four ways to call this function, depending on what data is # available to the caller: # Method 1: # a = text1, b = text2 # Method 2: # a = diffs # Method 3 (optimal): # a = text1, b = diffs # Method 4 (deprecated, use method 3): # a = text1, b = text2, c = diffs def patch_make(*args) text1 = nil diffs = nil if args.length == 2 && args[0].is_a?(String) && args[1].is_a?(String) # Compute diffs from text1 and text2. text1 = args[0] text2 = args[1] diffs = diff_main(text1, text2, true) if diffs.length > 2 diff_cleanup_semantic(diffs) diff_cleanup_efficiency(diffs) end elsif args.length == 1 && args[0].is_a?(Array) # Compute text1 from diffs. diffs = args[0] text1 = diff_text1(diffs) elsif args.length == 2 && args[0].is_a?(String) && args[1].is_a?(Array) text1 = args[0] diffs = args[1] elsif args.length == 3 && args[0].is_a?(String) && args[1].is_a?(String) && args[2].is_a?(Array) # Method 4: text1, text2, diffs # text2 is not used. text1 = args[0] # text2 = args[1] diffs = args[2] else raise ArgumentError.new("Unknown call format to patch_make.") end return [] if diffs.empty? # Get rid of the null case. patches = [] patch = PatchObj.new char_count1 = 0 # Number of characters into the text1 string. char_count2 = 0 # Number of characters into the text2 string. prepatch_text = text1 # Recreate the patches to determine context info. postpatch_text = text1 diffs.each_with_index do |diff, x| diff_type, diff_text = diffs[x] if patch.diffs.empty? && diff_type != :equal # A new patch starts here. patch.start1 = char_count1 patch.start2 = char_count2 end case diff_type when :insert patch.diffs.push(diff) patch.length2 += diff_text.length postpatch_text = postpatch_text[0...char_count2] + diff_text + postpatch_text[char_count2..-1] when :delete patch.length1 += diff_text.length patch.diffs.push(diff) postpatch_text = postpatch_text[0...char_count2] + postpatch_text[(char_count2 + diff_text.length)..-1] when :equal if diff_text.length <= 2 * patch_margin && !patch.diffs.empty? && diffs.length != x + 1 # Small equality inside a patch. patch.diffs.push(diff) patch.length1 += diff_text.length patch.length2 += diff_text.length elsif diff_text.length >= 2 * patch_margin # Time for a new patch. unless patch.diffs.empty? patch_add_context(patch, prepatch_text) patches.push(patch) patch = PatchObj.new # Unlike Unidiff, our patch lists have a rolling context. # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff # Update prepatch text & pos to reflect the application of the # just completed patch. prepatch_text = postpatch_text char_count1 = char_count2 end end end # Update the current character count. if diff_type != :insert char_count1 += diff_text.length end if diff_type != :delete char_count2 += diff_text.length end end # Pick up the leftover patch if not empty. unless patch.diffs.empty? patch_add_context(patch, prepatch_text) patches.push(patch) end patches end # Merge a set of patches onto the text. Return a patched text, as well # as a list of true/false values indicating which patches were applied. def patch_apply(patches, text) return [text, []] if patches.empty? # Deep copy the patches so that no changes are made to originals. patches = Marshal.load(Marshal.dump(patches)) null_padding = patch_add_padding(patches) text = null_padding + text + null_padding patch_split_max(patches) # delta keeps track of the offset between the expected and actual location # of the previous patch. If there are patches expected at positions 10 and # 20, but the first patch was found at 12, delta is 2 and the second patch # has an effective expected position of 22. delta = 0 results = [] patches.each_with_index do |patch, x| expected_loc = patch.start2 + delta text1 = diff_text1(patch.diffs) end_loc = -1 if text1.length > match_max_bits # patch_splitMax will only provide an oversized pattern in the case of # a monster delete. start_loc = match_main(text, text1[0, match_max_bits], expected_loc) if start_loc != -1 end_loc = match_main(text, text1[(text1.length - match_max_bits)..-1], expected_loc + text1.length - match_max_bits) if end_loc == -1 || start_loc >= end_loc # Can't find valid trailing context. Drop this patch. start_loc = -1 end end else start_loc = match_main(text, text1, expected_loc) end if start_loc == -1 # No match found. :( results[x] = false # Subtract the delta for this failed patch from subsequent patches. delta -= patch.length2 - patch.length1 else # Found a match. :) results[x] = true delta = start_loc - expected_loc text2 = text[start_loc, end_loc == -1 ? text1.length : end_loc + match_max_bits] if text1 == text2 # Perfect match, just shove the replacement text in. text = text[0, start_loc] + diff_text2(patch.diffs) + text[(start_loc + text1.length)..-1] else # Imperfect match. # Run a diff to get a framework of equivalent indices. diffs = diff_main(text1, text2, false) if text1.length > match_max_bits && diff_levenshtein(diffs).to_f / text1.length > patch_delete_threshold # The end points match, but the content is unacceptably bad. results[x] = false else diff_cleanup_semantic_lossless(diffs) index1 = 0 patch.diffs.each do |op, data| if op != :equal index2 = diff_x_index(diffs, index1) end if op == :insert # Insertion text = text[0, start_loc + index2] + data + text[(start_loc + index2)..-1] elsif op == :delete # Deletion text = text[0, start_loc + index2] + text[(start_loc + diff_x_index(diffs, index1 + data.length))..-1] end if op != :delete index1 += data.length end end end end end end # Strip the padding off. text = text[null_padding.length...-null_padding.length] [text, results] end # Add some padding on text start and end so that edges can match # something. Intended to be called only from within patch_apply. def patch_add_padding(patches) padding_length = patch_margin null_padding = (1..padding_length).map { |x| x.chr(Encoding::UTF_8) }.join # Bump all the patches forward. patches.each do |patch| patch.start1 += padding_length patch.start2 += padding_length end # Add some padding on start of first diff. patch = patches.first diffs = patch.diffs if diffs.empty? || diffs.first[0] != :equal # Add nullPadding equality. diffs.unshift([:equal, null_padding]) patch.start1 -= padding_length # Should be 0. patch.start2 -= padding_length # Should be 0. patch.length1 += padding_length patch.length2 += padding_length elsif padding_length > diffs.first[1].length # Grow first equality. extra_length = padding_length - diffs.first[1].length diffs.first[1] = null_padding[diffs.first[1].length..-1] + diffs.first[1] patch.start1 -= extra_length patch.start2 -= extra_length patch.length1 += extra_length patch.length2 += extra_length end # Add some padding on end of last diff. patch = patches.last diffs = patch.diffs if diffs.empty? || diffs.last[0] != :equal # Add nullPadding equality. diffs.push([:equal, null_padding]) patch.length1 += padding_length patch.length2 += padding_length elsif padding_length > diffs.last[1].length # Grow last equality. extra_length = padding_length - diffs.last[1].length diffs.last[1] += null_padding[0, extra_length] patch.length1 += extra_length patch.length2 += extra_length end null_padding end # Look through the patches and break up any which are longer than the # maximum limit of the match algorithm. def patch_split_max(patches) patch_size = match_max_bits x = 0 while x < patches.length if patches[x].length1 > patch_size big_patch = patches[x] # Remove the big old patch patches[x, 1] = [] x -= 1 start1 = big_patch.start1 start2 = big_patch.start2 pre_context = "" until big_patch.diffs.empty? # Create one of several smaller patches. patch = PatchObj.new empty = true patch.start1 = start1 - pre_context.length patch.start2 = start2 - pre_context.length unless pre_context.empty? patch.length1 = patch.length2 = pre_context.length patch.diffs.push([:equal, pre_context]) end while !big_patch.diffs.empty? && patch.length1 < patch_size - patch_margin diff = big_patch.diffs.first if diff[0] == :insert # Insertions are harmless. patch.length2 += diff[1].length start2 += diff[1].length patch.diffs.push(big_patch.diffs.shift) empty = false elsif diff[0] == :delete && patch.diffs.length == 1 && patch.diffs.first[0] == :equal && diff[1].length > 2 * patch_size # This is a large deletion. Let it pass in one chunk. patch.length1 += diff[1].length start1 += diff[1].length empty = false patch.diffs.push(big_patch.diffs.shift) else # Deletion or equality. Only take as much as we can stomach. diff_text = diff[1][0, patch_size - patch.length1 - patch_margin] patch.length1 += diff_text.length start1 += diff_text.length if diff[0] == :equal patch.length2 += diff_text.length start2 += diff_text.length else empty = false end patch.diffs.push([diff[0], diff_text]) if diff_text == big_patch.diffs.first[1] big_patch.diffs.shift else big_patch.diffs.first[1] = big_patch.diffs.first[1][diff_text.length..-1] end end end # Compute the head context for the next patch. pre_context = diff_text2(patch.diffs)[-patch_margin..-1] || "" # Append the end context for this patch. post_context = diff_text1(big_patch.diffs)[0...patch_margin] || "" unless post_context.empty? patch.length1 += post_context.length patch.length2 += post_context.length if !patch.diffs.empty? && patch.diffs.last[0] == :equal patch.diffs.last[1] += post_context else patch.diffs.push([:equal, post_context]) end end unless empty x += 1 patches[x, 0] = [patch] end end end x += 1 end end end DiffMatchPatch = DiMaPa