PERF: Limit characters used to generate headline for search blurb.

We determined using the following benchmark script that limiting to 2500 chars would mean a maximum of
25ms spent generating headlines.

```
require 'benchmark/ips'

string = <<~STRING
Far far away, behind the word mountains...
STRING

def sql_excerpt(string, l = 1000000)
  DB.query_single(<<~SQL)
  SELECT TS_HEADLINE('english', left('#{string}', #{l}), PLAINTO_TSQUERY('mountains'))
  SQL
end

def ruby_excerpt(string)
  output = DB.query_single("SELECT '#{string}'")[0]
  Search::GroupedSearchResults::TextHelper.excerpt(output, 'mountains', radius: 100)
end

puts "Ruby Excerpt: #{ruby_excerpt(string)}"
puts "SQL Excerpt: #{sql_excerpt(string)}"
puts

Benchmark.ips do |x|
  x.time = 10

  [1000, 2500, 5000, 10000, 20000, 50000].each do |l|
    short_string = string[0..l]

    x.report("ts_headline excerpt #{l}") do
      sql_excerpt(short_string, l)
    end

    x.report("actionview excerpt #{l}") do
      ruby_excerpt(short_string)
    end
  end

  x.compare!
end
```

```
actionview excerpt 1000:    20570.7 i/s
actionview excerpt 2500:    17863.1 i/s - 1.15x  (± 0.00) slower
actionview excerpt 5000:    14228.9 i/s - 1.45x  (± 0.00) slower
actionview excerpt 10000:    10906.2 i/s - 1.89x  (± 0.00) slower
actionview excerpt 20000:     6255.0 i/s - 3.29x  (± 0.00) slower
ts_headline excerpt 1000:     4337.5 i/s - 4.74x  (± 0.00) slower
actionview excerpt 50000:     3222.7 i/s - 6.38x  (± 0.00) slower
ts_headline excerpt 2500:     2240.4 i/s - 9.18x  (± 0.00) slower
ts_headline excerpt 5000:     1258.7 i/s - 16.34x  (± 0.00) slower
ts_headline excerpt 10000:      667.2 i/s - 30.83x  (± 0.00) slower
ts_headline excerpt 20000:      348.7 i/s - 58.98x  (± 0.00) slower
ts_headline excerpt 50000:      131.9 i/s - 155.91x  (± 0.00) slower
```
This commit is contained in:
Guo Xiang Tan 2020-08-07 14:36:12 +08:00
parent cf2797bf58
commit 053cbe3112
No known key found for this signature in database
GPG Key ID: FBD110179AAC1F20
2 changed files with 19 additions and 1 deletions

View File

@ -1164,6 +1164,10 @@ class Search
query.includes(topic: topic_eager_loads)
end
# Limited for performance reasons since `TS_HEADLINE` is slow when the text
# document is too long.
MAX_LENGTH_FOR_HEADLINE = 2500
def posts_scope(default_scope = Post.all)
if SiteSetting.use_pg_headlines_for_excerpt
search_term = @term.present? ? PG::Connection.escape_string(@term) : nil
@ -1174,7 +1178,7 @@ class Search
.joins("INNER JOIN topics t1 ON t1.id = posts.topic_id")
.select(
"TS_HEADLINE(#{ts_config}, t1.fancy_title, PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS topic_title_headline",
"TS_HEADLINE(#{ts_config}, pd.raw_data, PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS headline",
"TS_HEADLINE(#{ts_config}, LEFT(pd.raw_data, #{MAX_LENGTH_FOR_HEADLINE}), PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS headline",
default_scope.arel.projections
)
else

View File

@ -429,6 +429,20 @@ describe Search do
expect(post.topic_title_headline).to eq(topic.fancy_title)
end
it "it limits the headline to #{Search::MAX_LENGTH_FOR_HEADLINE} characters" do
SiteSetting.use_pg_headlines_for_excerpt = true
reply.update!(raw: "#{'a' * Search::MAX_LENGTH_FOR_HEADLINE} #{reply.raw}")
result = Search.execute('elephant')
expect(result.posts.map(&:id)).to contain_exactly(reply.id)
post = result.posts.first
expect(post.headline.include?('elephant')).to eq(false)
end
it 'returns the right post and blurb for searches with phrase' do
SiteSetting.use_pg_headlines_for_excerpt = true