From 134a4c66b7609e8681eff74feb8a12a4ac957059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Thu, 27 Jun 2019 01:45:52 +0200 Subject: [PATCH] PERF: limit time spent diffing large blobs of text REFACTOR: s/.length/.size/g --- lib/discourse_diff.rb | 26 +++++++++++++------------- lib/onpdiff.rb | 24 ++++++++++++++---------- lib/post_revisor.rb | 2 +- spec/components/onpdiff_spec.rb | 6 ++++++ 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/lib/discourse_diff.rb b/lib/discourse_diff.rb index 041a71306a8..6cac77603ac 100644 --- a/lib/discourse_diff.rb +++ b/lib/discourse_diff.rb @@ -21,7 +21,7 @@ class DiscourseDiff def inline_html i = 0 inline = [] - while i < @block_by_block_diff.length + while i < @block_by_block_diff.size op_code = @block_by_block_diff[i][1] if op_code == :common then inline << @block_by_block_diff[i][0] else @@ -37,7 +37,7 @@ class DiscourseDiff second = i end - if i + 1 < @block_by_block_diff.length && @block_by_block_diff[i + 1][1] == opposite_op_code + if i + 1 < @block_by_block_diff.size && @block_by_block_diff[i + 1][1] == opposite_op_code diff = ONPDiff.new(tokenize_html(@block_by_block_diff[first][0]), tokenize_html(@block_by_block_diff[second][0])).diff inline << generate_inline_html(diff) i += 1 @@ -54,7 +54,7 @@ class DiscourseDiff def side_by_side_html i = 0 left, right = [], [] - while i < @block_by_block_diff.length + while i < @block_by_block_diff.size op_code = @block_by_block_diff[i][1] if op_code == :common left << @block_by_block_diff[i][0] @@ -74,7 +74,7 @@ class DiscourseDiff second = i end - if i + 1 < @block_by_block_diff.length && @block_by_block_diff[i + 1][1] == opposite_op_code + if i + 1 < @block_by_block_diff.size && @block_by_block_diff[i + 1][1] == opposite_op_code diff = ONPDiff.new(tokenize_html(@block_by_block_diff[first][0]), tokenize_html(@block_by_block_diff[second][0])).diff deleted, inserted = generate_side_by_side_html(diff) left << deleted @@ -93,7 +93,7 @@ class DiscourseDiff def side_by_side_markdown i = 0 table = [""] - while i < @line_by_line_diff.length + while i < @line_by_line_diff.size table << "" op_code = @line_by_line_diff[i][1] if op_code == :common @@ -110,9 +110,9 @@ class DiscourseDiff second = i end - if i + 1 < @line_by_line_diff.length && @line_by_line_diff[i + 1][1] == opposite_op_code + if i + 1 < @line_by_line_diff.size && @line_by_line_diff[i + 1][1] == opposite_op_code before_tokens, after_tokens = tokenize_markdown(@line_by_line_diff[first][0]), tokenize_markdown(@line_by_line_diff[second][0]) - if (before_tokens.length - after_tokens.length).abs > MAX_DIFFERENCE + if (before_tokens.size - after_tokens.size).abs > MAX_DIFFERENCE before_tokens, after_tokens = tokenize_line(@line_by_line_diff[first][0]), tokenize_line(@line_by_line_diff[second][0]) end diff = ONPDiff.new(before_tokens, after_tokens).short_diff @@ -147,25 +147,25 @@ class DiscourseDiff def tokenize_markdown(text) t, tokens = [], [] i = 0 - while i < text.length + while i < text.size if text[i] =~ /\w/ t << text[i] elsif text[i] =~ /[ \t]/ && t.join =~ /^\w+$/ begin t << text[i] i += 1 - end while i < text.length && text[i] =~ /[ \t]/ + end while i < text.size && text[i] =~ /[ \t]/ i -= 1 tokens << t.join t = [] else - tokens << t.join if t.length > 0 + tokens << t.join if t.size > 0 tokens << text[i] t = [] end i += 1 end - tokens << t.join if t.length > 0 + tokens << t.join if t.size > 0 tokens end @@ -180,7 +180,7 @@ class DiscourseDiff def add_class_or_wrap_in_tags(html_or_text, klass) result = html_or_text.dup index_of_next_chevron = result.index(">") - if result.length > 0 && result[0] == '<' && index_of_next_chevron + if result.size > 0 && result[0] == '<' && index_of_next_chevron index_of_class = result.index("class=") if index_of_class.nil? || index_of_class > index_of_next_chevron # we do not have a class for the current tag @@ -192,7 +192,7 @@ class DiscourseDiff if classes.include?("diff-#{klass}") result else - result.insert(index_of_class + "class=".length + 1, "diff-#{klass} ") + result.insert(index_of_class + "class=".size + 1, "diff-#{klass} ") end end else diff --git a/lib/onpdiff.rb b/lib/onpdiff.rb index bcca737ba2b..ef4799f2a4a 100644 --- a/lib/onpdiff.rb +++ b/lib/onpdiff.rb @@ -1,12 +1,13 @@ # frozen_string_literal: true # Use "An O(NP) Sequence Comparison Algorithm" as described by Sun Wu, Udi Manber and Gene Myers -# in http://www.itu.dk/stud/speciale/bepjea/xwebtex/litt/an-onp-sequence-comparison-algorithm.pdf +# in https://publications.mpi-cbg.de/Wu_1990_6334.pdf + class ONPDiff def initialize(a, b) @a, @b = a, b - @m, @n = a.length, b.length + @m, @n = a.size, b.size @backtrack = [] if @reverse = @m > @n @a, @b = @b, @a @@ -30,13 +31,15 @@ class ONPDiff return @shortest_path if @shortest_path size = @m + @n + 3 - fp = Array.new(size) { |i| -1 } - @path = Array.new(size) { |i| -1 } + fp = Array.new(size, -1) + @path = Array.new(size, -1) p = -1 begin p += 1 + return (@shortest_path = []) if p >= 1000 + k = -p while k <= @delta - 1 fp[k + @offset] = snake(k, fp[k - 1 + @offset] + 1, fp[k + 1 + @offset]) @@ -56,6 +59,7 @@ class ONPDiff r = @path[@delta + @offset] @shortest_path = [] + while r != -1 @shortest_path << [@backtrack[r][0], @backtrack[r][1]] r = @backtrack[r][2] @@ -74,7 +78,7 @@ class ONPDiff y += 1 end - @path[k + @offset] = @backtrack.length + @path[k + @offset] = @backtrack.size @backtrack << [x, y, r] y @@ -84,7 +88,7 @@ class ONPDiff ses = [] x, y = 1, 1 px, py = 0, 0 - i = shortest_path.length - 1 + i = shortest_path.size - 1 while i >= 0 while px < shortest_path[i][0] || py < shortest_path[i][1] if shortest_path[i][1] - shortest_path[i][0] > py - px @@ -114,12 +118,12 @@ class ONPDiff ses = [] x, y = 1, 1 px, py = 0, 0 - i = shortest_path.length - 1 + i = shortest_path.size - 1 while i >= 0 while px < shortest_path[i][0] || py < shortest_path[i][1] if shortest_path[i][1] - shortest_path[i][0] > py - px t = @reverse ? :delete : :add - if ses.length > 0 && ses[-1][1] == t + if ses.size > 0 && ses[-1][1] == t ses[-1][0] << @b[py] else ses << [@b[py], t] @@ -128,7 +132,7 @@ class ONPDiff py += 1 elsif shortest_path[i][1] - shortest_path[i][0] < py - px t = @reverse ? :add : :delete - if ses.length > 0 && ses[-1][1] == t + if ses.size > 0 && ses[-1][1] == t ses[-1][0] << @a[px] else ses << [@a[px], t] @@ -136,7 +140,7 @@ class ONPDiff x += 1 px += 1 else - if ses.length > 0 && ses[-1][1] == :common + if ses.size > 0 && ses[-1][1] == :common ses[-1][0] << @a[px] else ses << [@a[px], :common] diff --git a/lib/post_revisor.rb b/lib/post_revisor.rb index 8ac7b298775..527440f76f0 100644 --- a/lib/post_revisor.rb +++ b/lib/post_revisor.rb @@ -299,7 +299,7 @@ class PostRevisor max_diff = SiteSetting.editing_grace_period_max_diff_high_trust.to_i end - if (original_raw.length - new_raw.length).abs > max_diff || + if (original_raw.size - new_raw.size).abs > max_diff || diff_size(original_raw, new_raw) > max_diff return false end diff --git a/spec/components/onpdiff_spec.rb b/spec/components/onpdiff_spec.rb index 1d0b708563b..67c5b0beab3 100644 --- a/spec/components/onpdiff_spec.rb +++ b/spec/components/onpdiff_spec.rb @@ -15,6 +15,12 @@ describe ONPDiff do expect(ONPDiff.new("abcd", "abef").diff).to eq([["a", :common], ["b", :common], ["e", :add], ["f", :add], ["c", :delete], ["d", :delete]]) end + it "bails out on large diffs" do + a = SecureRandom.alphanumeric(5_000) + b = SecureRandom.alphanumeric(5_000) + expect(ONPDiff.new(a, b).diff).to eq([]) + end + end describe "short_diff" do