PERF: Replace video and audio links in search blurb while indexing.
In the near future, we will be switching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb.
This commit is contained in:
parent
06ef87da51
commit
255b0e9f14
|
@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base
|
|||
|
||||
def strip_secure_urls(raw)
|
||||
urls = Set.new
|
||||
raw.scan(URI.regexp(%w{http https})) { urls << $& }
|
||||
raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
|
||||
|
||||
urls.each do |url|
|
||||
if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
|
||||
|
|
|
@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base
|
|||
|
||||
def strip_secure_urls(raw)
|
||||
urls = Set.new
|
||||
raw.scan(URI.regexp(%w{http https})) { urls << $& }
|
||||
raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
|
||||
|
||||
urls.each do |url|
|
||||
if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
|
||||
|
|
|
@ -46,13 +46,6 @@ class SearchIndexer
|
|||
d: search_data[3],
|
||||
}
|
||||
|
||||
indexed_data =
|
||||
if table.to_s == "post"
|
||||
ranked_params[:d]
|
||||
else
|
||||
search_data.select { |d| d.length > 0 }.join(' ')
|
||||
end
|
||||
|
||||
tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
|
||||
additional_lexemes = []
|
||||
|
||||
|
@ -75,6 +68,13 @@ class SearchIndexer
|
|||
|
||||
tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"
|
||||
|
||||
indexed_data =
|
||||
if table.to_s == "post"
|
||||
clean_post_raw_data!(ranked_params[:d])
|
||||
else
|
||||
search_data.select { |d| d.length > 0 }.join(' ')
|
||||
end
|
||||
|
||||
params = {
|
||||
raw_data: indexed_data,
|
||||
id: id,
|
||||
|
@ -216,6 +216,26 @@ class SearchIndexer
|
|||
end
|
||||
end
|
||||
|
||||
# Replaces, in place, every http/https URL in +raw_data+ that points at a
# video or audio file with the translated "search.video" / "search.audio"
# label. Other URLs are left untouched.
#
# A URL is classified by the file extension of its path, tested against
# Oneboxer::VIDEO_REGEX and Oneboxer::AUDIO_REGEX.
#
# The substitution is done in a single pass over the regexp matches, so only
# whole URL matches are rewritten: a media URL that happens to be a prefix of
# a longer URL does not clobber the longer one, and the label text is
# inserted literally (no backreference interpretation).
#
# @param raw_data [String] text to clean; mutated in place
# @return [String] the same +raw_data+ object
def self.clean_post_raw_data!(raw_data)
  raw_data.gsub!(Discourse::Utils::URI_REGEXP) do |url|
    begin
      case File.extname(URI(url).path || "")
      when Oneboxer::VIDEO_REGEX
        I18n.t("search.video")
      when Oneboxer::AUDIO_REGEX
        I18n.t("search.audio")
      else
        url
      end
    rescue URI::InvalidURIError
      # The regexp can match text that URI() refuses to parse; keep it as is.
      url
    end
  end

  # gsub! returns nil when nothing was replaced, so return the string itself.
  raw_data
end
|
||||
private_class_method :clean_post_raw_data!
|
||||
|
||||
class HtmlScrubber < Nokogiri::XML::SAX::Document
|
||||
|
||||
attr_reader :scrubbed
|
||||
|
|
|
@ -24,6 +24,8 @@ module Discourse
|
|||
end
|
||||
|
||||
class Utils
|
||||
URI_REGEXP = URI.regexp(%w{http https})
|
||||
|
||||
# Usage:
|
||||
# Discourse::Utils.execute_command("pwd", chdir: 'mydirectory')
|
||||
# or with a block
|
||||
|
|
|
@ -106,14 +106,14 @@ class Search
|
|||
end
|
||||
end
|
||||
|
||||
URI_REGEXP = URI.regexp(%w{http https})
|
||||
|
||||
def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
|
||||
blurb = nil
|
||||
cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
|
||||
|
||||
if scrub
|
||||
cooked = SearchIndexer.scrub_html_for_search(cooked)
|
||||
|
||||
urls = Set.new
|
||||
cooked.scan(URI_REGEXP) { urls << $& }
|
||||
cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
|
||||
urls.each do |url|
|
||||
begin
|
||||
case File.extname(URI(url).path || "")
|
||||
|
@ -125,6 +125,7 @@ class Search
|
|||
rescue URI::InvalidURIError
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if term
|
||||
if term =~ Regexp.new(Search::PHRASE_MATCH_REGEXP_PATTERN)
|
||||
|
|
|
@ -209,6 +209,29 @@ describe SearchIndexer do
|
|||
"Let me see how I can fix this image white walkers GOT"
|
||||
)
|
||||
end
|
||||
|
||||
it 'should strips audio and videos URLs from raw data' do
|
||||
SiteSetting.authorized_extensions = 'mp4'
|
||||
upload = Fabricate(:video_upload)
|
||||
|
||||
post.update!(raw: <<~RAW)
|
||||
link to an external page: https://google.com/?u=bar
|
||||
|
||||
link to an audio file: https://somesite.com/audio.m4a
|
||||
|
||||
link to a video file: https://somesite.com/content/somethingelse.MOV
|
||||
|
||||
link to an invalid URL: http:error]
|
||||
RAW
|
||||
|
||||
expect(post.post_search_data.raw_data).to eq(
|
||||
"link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]"
|
||||
)
|
||||
|
||||
expect(post.post_search_data.search_data).to eq(
|
||||
"'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
describe '.queue_post_reindex' do
|
||||
|
|
Loading…
Reference in New Issue