PERF: Replace video and audio links in search blurb while indexing.

In the near future, we will be switching to PG headlines to generate the
search blurb. As such, we need to replace audio and video links in the
raw data used for headline generation. This also means that we avoid
replacing links each time we need to generate the blurb.
This commit is contained in:
Guo Xiang Tan 2020-08-06 12:25:03 +08:00
parent 06ef87da51
commit 255b0e9f14
No known key found for this signature in database
GPG Key ID: FBD110179AAC1F20
6 changed files with 68 additions and 22 deletions

View File

@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base
def strip_secure_urls(raw)
urls = Set.new
raw.scan(URI.regexp(%w{http https})) { urls << $& }
raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
urls.each do |url|
if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))

View File

@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base
def strip_secure_urls(raw)
urls = Set.new
raw.scan(URI.regexp(%w{http https})) { urls << $& }
raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
urls.each do |url|
if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))

View File

@ -46,13 +46,6 @@ class SearchIndexer
d: search_data[3],
}
indexed_data =
if table.to_s == "post"
ranked_params[:d]
else
search_data.select { |d| d.length > 0 }.join(' ')
end
tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
additional_lexemes = []
@ -75,6 +68,13 @@ class SearchIndexer
tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"
indexed_data =
if table.to_s == "post"
clean_post_raw_data!(ranked_params[:d])
else
search_data.select { |d| d.length > 0 }.join(' ')
end
params = {
raw_data: indexed_data,
id: id,
@ -216,6 +216,26 @@ class SearchIndexer
end
end
# Swaps every audio or video link found in +raw_data+ for the localized
# "search.audio" / "search.video" placeholder text.
#
# The string is modified in place and the same object is returned. Links
# whose path extension matches neither media regex are left alone, as are
# matches that cannot be parsed as a URI.
def self.clean_post_raw_data!(raw_data)
  links = Set.new
  raw_data.scan(Discourse::Utils::URI_REGEXP) { links.add($&) }

  links.each do |link|
    begin
      extension = File.extname(URI(link).path || "")

      placeholder_key =
        case extension
        when Oneboxer::VIDEO_REGEX then "search.video"
        when Oneboxer::AUDIO_REGEX then "search.audio"
        end

      # Only media links are rewritten; each occurrence of the link is replaced.
      raw_data.gsub!(link, I18n.t(placeholder_key)) if placeholder_key
    rescue URI::InvalidURIError
      # The regex matched something URI() rejects: keep the original text.
    end
  end

  raw_data
end
private_class_method :clean_post_raw_data!
class HtmlScrubber < Nokogiri::XML::SAX::Document
attr_reader :scrubbed

View File

@ -24,6 +24,8 @@ module Discourse
end
class Utils
URI_REGEXP = URI.regexp(%w{http https})
# Usage:
# Discourse::Utils.execute_command("pwd", chdir: 'mydirectory')
# or with a block

View File

@ -106,14 +106,14 @@ class Search
end
end
URI_REGEXP = URI.regexp(%w{http https})
def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
blurb = nil
cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
if scrub
cooked = SearchIndexer.scrub_html_for_search(cooked)
urls = Set.new
cooked.scan(URI_REGEXP) { urls << $& }
cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
urls.each do |url|
begin
case File.extname(URI(url).path || "")
@ -125,6 +125,7 @@ class Search
rescue URI::InvalidURIError
end
end
end
if term
if term =~ Regexp.new(Search::PHRASE_MATCH_REGEXP_PATTERN)

View File

@ -209,6 +209,29 @@ describe SearchIndexer do
"Let me see how I can fix this image white walkers GOT"
)
end
# Checks what gets stored for a post whose raw text contains an ordinary
# link, an audio link, a video link and a malformed URL.
it 'should strips audio and videos URLs from raw data' do
SiteSetting.authorized_extensions = 'mp4'
# NOTE(review): `upload` is not referenced again in this example - confirm
# whether the fabricated upload (and the setting above) is still needed.
upload = Fabricate(:video_upload)
post.update!(raw: <<~RAW)
link to an external page: https://google.com/?u=bar
link to an audio file: https://somesite.com/audio.m4a
link to a video file: https://somesite.com/content/somethingelse.MOV
link to an invalid URL: http:error]
RAW
# In the stored raw_data the audio and video links are expected to be
# replaced by the translated placeholders, while the external link (shown
# here without its query string) and the malformed URL text remain.
expect(post.post_search_data.raw_data).to eq(
"link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]"
)
# The expected lexemes still contain the original media URLs (for example
# 'somesite.com/audio.m4a'), so the replacement is not reflected in search_data.
expect(post.post_search_data.search_data).to eq(
"'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
)
end
end
describe '.queue_post_reindex' do