PERF: Avoid parsing `Post#cooked` with Nokogiri for every search.

This commit is contained in:
Guo Xiang Tan 2020-07-17 16:27:30 +08:00 committed by Alan Guo Xiang Tan
parent b979579c1b
commit 181c4eb760
7 changed files with 64 additions and 33 deletions

View File

@ -10,7 +10,7 @@ class SimilarTopicsController < ApplicationController
attr_reader :topic
def blurb
Search::GroupedSearchResults.blurb_for(@topic.try(:blurb))
Search::GroupedSearchResults.blurb_for(cooked: @topic.try(:blurb))
end
end

View File

@ -1,7 +1,7 @@
# frozen_string_literal: true
class SearchIndexer
POST_INDEX_VERSION = 3
POST_INDEX_VERSION = 4
MIN_POST_REINDEX_VERSION = 3
TOPIC_INDEX_VERSION = 3
CATEGORY_INDEX_VERSION = 3
@ -39,8 +39,6 @@ class SearchIndexer
setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
SQL
indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
ranked_params = {
a: search_data[0],
b: search_data[1],
@ -48,6 +46,13 @@ class SearchIndexer
d: search_data[3],
}
indexed_data =
if table.to_s == "post"
ranked_params[:d]
else
search_data.select { |d| d.length > 0 }.join(' ')
end
tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
additional_lexemes = []
@ -105,7 +110,7 @@ class SearchIndexer
scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
# a bit inconsistent that we use title as A and body as B when in
# the post index body is C
# the post index body is D
update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])
end
@ -165,9 +170,11 @@ class SearchIndexer
end
category_name = topic.category&.name if topic
if topic
tags = topic.tags.select(:id, :name)
unless tags.empty?
tags = topic.tags.select(:id, :name).to_a
if tags.present?
tag_names = (tags.map(&:name) + Tag.where(target_tag_id: tags.map(&:id)).pluck(:name)).join(' ')
end
end

View File

@ -1128,7 +1128,7 @@ class Search
end
def posts_eager_loads(query)
query = query.includes(:user)
query = query.includes(:user, :post_search_data)
topic_eager_loads = [:category]
if SiteSetting.tagging_enabled

View File

@ -58,7 +58,19 @@ class Search
end
def blurb(post)
GroupedSearchResults.blurb_for(post.cooked, @blurb_term, @blurb_length)
opts = {
term: @blurb_term,
blurb_length: @blurb_length
}
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
opts[:cooked] = post.post_search_data.raw_data
opts[:scrub] = false
else
opts[:cooked] = post.cooked
end
GroupedSearchResults.blurb_for(**opts)
end
def add(object)
@ -73,9 +85,9 @@ class Search
end
end
def self.blurb_for(cooked, term = nil, blurb_length = BLURB_LENGTH)
def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
blurb = nil
cooked = SearchIndexer.scrub_html_for_search(cooked)
cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
urls = Set.new
cooked.scan(URI.regexp(%w{http https})) { urls << $& }

View File

@ -38,7 +38,7 @@ describe Search do
link to a video file: https://somesite.com/content/somethingelse.MOV
RAW
result = Search::GroupedSearchResults.blurb_for(cooked)
result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
expect(result).to eq("link to an external page: https://google.com/?u=bar link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")}")
end
@ -51,7 +51,7 @@ describe Search do
http://localhost/uploads/default/original/1X/90adc0092b30c04b761541bc0322d0dce3d896e7.m4a
RAW
result = Search::GroupedSearchResults.blurb_for(cooked)
result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
expect(result).to eq("Here goes a test cooked with enough characters to hit the blurb limit. Something is very interesting about this audio file. #{I18n.t("search.audio")}")
end
@ -59,7 +59,7 @@ describe Search do
cooked = <<~RAW
invalid URL: http:error] should not trip up blurb generation.
RAW
result = Search::GroupedSearchResults.blurb_for(cooked)
result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
expect(result).to eq("invalid URL: http:error] should not trip up blurb generation.")
end
end

View File

@ -3,10 +3,22 @@
require 'rails_helper'
describe SearchController do
fab!(:awesome_topic) do
topic = Fabricate(:topic)
tag = Fabricate(:tag)
topic.tags << tag
Fabricate(:tag, target_tag_id: tag.id)
topic
end
fab!(:awesome_post) do
SearchIndexer.enable
Fabricate(:post, raw: 'this is my really awesome post')
Fabricate(:post, topic: awesome_topic, raw: 'this is my really awesome post')
end
fab!(:awesome_post_2) do
SearchIndexer.enable
Fabricate(:post, raw: 'this is my really awesome post 2')
end
fab!(:user) do
@ -95,10 +107,14 @@ describe SearchController do
data = response.parsed_body
expect(data['posts'].length).to eq(1)
expect(data['posts'][0]['id']).to eq(awesome_post.id)
expect(data['posts'][0]['blurb']).to eq(awesome_post.raw)
expect(data['topics'][0]['id']).to eq(awesome_post.topic_id)
expect(data['posts'].length).to eq(2)
expect(data['posts'][0]['id']).to eq(awesome_post_2.id)
expect(data['posts'][0]['blurb']).to eq(awesome_post_2.raw)
expect(data['topics'][0]['id']).to eq(awesome_post_2.topic_id)
expect(data['posts'][1]['id']).to eq(awesome_post.id)
expect(data['posts'][1]['blurb']).to eq(awesome_post.raw)
expect(data['topics'][1]['id']).to eq(awesome_post.topic_id)
end
it "can search correctly with advanced search filters" do

View File

@ -20,12 +20,13 @@ describe SearchIndexer do
it 'correctly indexes chinese' do
SiteSetting.default_locale = 'zh_CN'
data = "你好世界"
expect(data.split(" ").length).to eq(1)
SearchIndexer.update_posts_index(post_id, "你好世界", "", "", nil)
SearchIndexer.update_posts_index(post_id, "", "", "", data)
raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0]
expect(raw_data.split(' ').length).to eq(2)
post_search_data = PostSearchData.find_by(post_id: post_id)
expect(post_search_data.raw_data).to eq("你好 世界")
expect(post_search_data.search_data).to eq("'世界':2 '你好':1")
end
it 'extract youtube title' do
@ -104,11 +105,6 @@ describe SearchIndexer do
expect(raw_data).to eq("This is a test")
expect(locale).to eq(SiteSetting.default_locale)
expect(version).to eq(SearchIndexer::POST_INDEX_VERSION)
SearchIndexer.update_posts_index(post_id, "tester", "", nil, nil)
raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0]
expect(raw_data).to eq("tester")
end
describe '.index' do
@ -118,10 +114,10 @@ describe SearchIndexer do
expect { post }.to change { PostSearchData.count }.by(1)
expect { post.update!(raw: "this is new content") }
.to change { post.reload.post_search_data.raw_data }
.to change { post.reload.post_search_data.search_data }
expect { post.update!(topic_id: Fabricate(:topic).id) }
.to change { post.reload.post_search_data.raw_data }
.to change { post.reload.post_search_data.search_data }
end
it 'should not index posts with empty raw' do
@ -141,7 +137,7 @@ describe SearchIndexer do
topic = post.topic
expect(post.post_search_data.raw_data).to eq(
"#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png"
"https://meta.discourse.org/some.png"
)
end
@ -158,7 +154,7 @@ describe SearchIndexer do
topic = post.topic
expect(post.post_search_data.raw_data).to eq(
"#{topic.title} #{category.name} a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1"
"a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1"
)
expect(post.post_search_data.search_data).to eq(
@ -190,7 +186,7 @@ describe SearchIndexer do
)
expect(post.post_search_data.raw_data).to eq(
"#{topic.title} #{topic.category.name} Let me see how I can fix this image white walkers GOT"
"Let me see how I can fix this image white walkers GOT"
)
end
end