FIX: Limit PG headline-based search blurb generation to 200 characters.

* Also restores the '...' omission markers in the blurb.
This commit is contained in:
Guo Xiang Tan 2020-08-12 15:33:26 +08:00
parent ec173a72d9
commit 93f8396b4b
No known key found for this signature in database
GPG Key ID: FBD110179AAC1F20
3 changed files with 45 additions and 5 deletions

View File

@ -1177,8 +1177,28 @@ class Search
.joins("INNER JOIN post_search_data pd ON pd.post_id = posts.id")
.joins("INNER JOIN topics t1 ON t1.id = posts.topic_id")
.select(
"TS_HEADLINE(#{ts_config}, t1.fancy_title, PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS topic_title_headline",
"TS_HEADLINE(#{ts_config}, LEFT(pd.raw_data, #{MAX_LENGTH_FOR_HEADLINE}), PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS headline",
"TS_HEADLINE(
#{ts_config},
t1.fancy_title,
PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'),
'StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>'''
) AS topic_title_headline",
"TS_HEADLINE(
#{ts_config},
LEFT(
TS_HEADLINE(
#{ts_config},
LEFT(pd.raw_data, #{MAX_LENGTH_FOR_HEADLINE}),
PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'),
'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel='''', StopSel='''''
),
#{Search::GroupedSearchResults::BLURB_LENGTH}
),
PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'),
'HighlightAll=true, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>'''
) AS headline",
"LEFT(pd.raw_data, 50) AS leading_raw_data",
"RIGHT(pd.raw_data, 50) AS trailing_raw_data",
default_scope.arel.projections
)
else

View File

@ -78,6 +78,9 @@ class Search
end
end
# Marker that `blurb` puts before and/or after a PG headline when the
# headline does not start or end at the boundaries of the post's raw data.
OMISSION = '...'
# Matches a `<span>` whose `class` attribute is Search::HIGHLIGHT_CSS_CLASS
# (optionally surrounded by other `name="value"` attributes) and captures the
# text inside it, so `blurb` can strip the highlight markup via `gsub(..., '\1')`.
SCRUB_HEADLINE_REGEXP = /<span(?: \w+="[^"]+")* class="#{Search::HIGHLIGHT_CSS_CLASS}"(?: \w+="[^"]+")*>([^<]*)<\/span>/
def blurb(post)
opts = {
term: @blurb_term,
@ -86,7 +89,10 @@ class Search
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
if SiteSetting.use_pg_headlines_for_excerpt
return post.headline
scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')
prefix_omission = scrubbed_headline.start_with?(post.leading_raw_data) ? '' : OMISSION
postfix_omission = scrubbed_headline.end_with?(post.trailing_raw_data) ? '' : OMISSION
return "#{prefix_omission}#{post.headline}#{postfix_omission}"
else
opts[:cooked] = post.post_search_data.raw_data
opts[:scrub] = false

View File

@ -410,7 +410,7 @@ describe Search do
end
let(:expected_blurb) do
"hundred characters to satisfy any test conditions that require content longer than the typical test post raw content. It really is some long content, folks. <span class=\"search-highlight\">elephant</span>"
"#{Search::GroupedSearchResults::OMISSION}hundred characters to satisfy any test conditions that require content longer than the typical test post raw content. It really is some long content, folks. <span class=\"#{Search::HIGHLIGHT_CSS_CLASS}\">elephant</span>"
end
it 'returns the post' do
@ -429,7 +429,7 @@ describe Search do
expect(post.topic_title_headline).to eq(topic.fancy_title)
end
it "it limits the headline to #{Search::MAX_LENGTH_FOR_HEADLINE} characters" do
it "only applies highlighting to the first #{Search::MAX_LENGTH_FOR_HEADLINE} characters" do
SiteSetting.use_pg_headlines_for_excerpt = true
reply.update!(raw: "#{'a' * Search::MAX_LENGTH_FOR_HEADLINE} #{reply.raw}")
@ -443,6 +443,20 @@ describe Search do
expect(post.headline.include?('elephant')).to eq(false)
end
# With PG headlines enabled, a post whose raw content is BLURB_LENGTH filler
# characters followed by the search term is expected to yield a blurb made of
# only the filler plus the omission marker.
it "limits the search headline to #{Search::GroupedSearchResults::BLURB_LENGTH} characters" do
  SiteSetting.use_pg_headlines_for_excerpt = true

  filler = 'a' * Search::GroupedSearchResults::BLURB_LENGTH
  reply.update!(raw: "#{filler} elephant")

  search_result = Search.execute('elephant')
  expect(search_result.posts.map(&:id)).to contain_exactly(reply.id)

  matched_post = search_result.posts.first
  truncated_blurb = "#{filler}#{Search::GroupedSearchResults::OMISSION}"
  expect(search_result.blurb(matched_post)).to eq(truncated_blurb)
end
it 'returns the right post and blurb for searches with phrase' do
SiteSetting.use_pg_headlines_for_excerpt = true