FEATURE: Use Postgres unaccent to ignore accents (#16100)
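When the search_ignore_accents site setting is enabled, the search indexer removes accents before indexing the content. Accent removal is now done in the database with PostgreSQL's unaccent function instead of Ruby's unicode_normalize(:nfkd), because unaccent also folds characters that NFKD does not decompose (for example, the ligature œ becomes oe).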
This commit is contained in:
parent
6e7cdc5bc3
commit
34b4b53bac
|
@ -17,10 +17,6 @@ class SearchIndexer
|
||||||
@disabled = false
|
@disabled = false
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
|
|
||||||
HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
|
def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
|
||||||
raw_data = [a_weight, b_weight, c_weight, d_weight]
|
raw_data = [a_weight, b_weight, c_weight, d_weight]
|
||||||
|
|
||||||
|
@ -35,10 +31,10 @@ class SearchIndexer
|
||||||
stemmer = table == "user" ? "simple" : Search.ts_config
|
stemmer = table == "user" ? "simple" : Search.ts_config
|
||||||
|
|
||||||
ranked_index = <<~SQL
|
ranked_index = <<~SQL
|
||||||
setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
|
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
|
||||||
setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
|
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
|
||||||
setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
|
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
|
||||||
setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
|
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
|
||||||
SQL
|
SQL
|
||||||
|
|
||||||
ranked_params = {
|
ranked_params = {
|
||||||
|
@ -109,7 +105,7 @@ class SearchIndexer
|
||||||
table: 'topic',
|
table: 'topic',
|
||||||
id: topic_id,
|
id: topic_id,
|
||||||
a_weight: title,
|
a_weight: title,
|
||||||
b_weight: scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
|
b_weight: HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -124,7 +120,7 @@ class SearchIndexer
|
||||||
# the original string. Since there is no way to estimate the length of
|
# the original string. Since there is no way to estimate the length of
|
||||||
# the expected tsvector, we limit the input to ~50% of the maximum
|
# the expected tsvector, we limit the input to ~50% of the maximum
|
||||||
# length of a tsvector (1_048_576 bytes).
|
# length of a tsvector (1_048_576 bytes).
|
||||||
d_weight: scrub_html_for_search(cooked)[0..600_000]
|
d_weight: HtmlScrubber.scrub(cooked)[0..600_000]
|
||||||
) do |params|
|
) do |params|
|
||||||
params["private_message"] = private_message
|
params["private_message"] = private_message
|
||||||
end
|
end
|
||||||
|
@ -294,12 +290,11 @@ class SearchIndexer
|
||||||
|
|
||||||
attr_reader :scrubbed
|
attr_reader :scrubbed
|
||||||
|
|
||||||
def initialize(strip_diacritics: false)
|
def initialize
|
||||||
@scrubbed = +""
|
@scrubbed = +""
|
||||||
@strip_diacritics = strip_diacritics
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.scrub(html, strip_diacritics: false)
|
def self.scrub(html)
|
||||||
return +"" if html.blank?
|
return +"" if html.blank?
|
||||||
|
|
||||||
begin
|
begin
|
||||||
|
@ -338,9 +333,9 @@ class SearchIndexer
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
me = new(strip_diacritics: strip_diacritics)
|
html_scrubber = new
|
||||||
Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
|
Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
|
||||||
me.scrubbed.squish
|
html_scrubber.scrubbed.squish
|
||||||
end
|
end
|
||||||
|
|
||||||
MENTION_CLASSES ||= %w{mention mention-group}
|
MENTION_CLASSES ||= %w{mention mention-group}
|
||||||
|
@ -362,7 +357,6 @@ class SearchIndexer
|
||||||
end
|
end
|
||||||
|
|
||||||
def characters(str)
|
def characters(str)
|
||||||
str = Search.strip_diacritics(str) if @strip_diacritics
|
|
||||||
scrubbed << " #{str} "
|
scrubbed << " #{str} "
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# Migration that enables the PostgreSQL `unaccent` extension, which provides
# the unaccent() SQL function used to ignore accents when the
# search_ignore_accents site setting is on.
class EnableUnaccentExtension < ActiveRecord::Migration[6.1]
|
||||||
|
def change
|
||||||
|
enable_extension 'unaccent'
|
||||||
|
end
|
||||||
|
end
|
|
@ -21,13 +21,6 @@ class Search
|
||||||
5
|
5
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.strip_diacritics(str)
|
|
||||||
s = str.unicode_normalize(:nfkd)
|
|
||||||
s.gsub!(DIACRITICS, "")
|
|
||||||
s.strip!
|
|
||||||
s
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.per_filter
|
def self.per_filter
|
||||||
50
|
50
|
||||||
end
|
end
|
||||||
|
@ -64,6 +57,10 @@ class Search
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Wraps a SQL fragment in PostgreSQL's unaccent() when the
# search_ignore_accents site setting is enabled; otherwise returns the
# fragment unchanged.
#
# str - String SQL fragment (a bind placeholder, an already quoted/escaped
#       literal, or an expression). It is interpolated verbatim into the
#       result, so it must never be raw, unescaped user input.
#
# Returns a String SQL fragment.
def self.wrap_unaccent(str)
|
||||||
|
SiteSetting.search_ignore_accents ? "unaccent(#{str})" : str
|
||||||
|
end
|
||||||
|
|
||||||
def self.segment_chinese?
|
def self.segment_chinese?
|
||||||
['zh_TW', 'zh_CN'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese
|
['zh_TW', 'zh_CN'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese
|
||||||
end
|
end
|
||||||
|
@ -115,10 +112,6 @@ class Search
|
||||||
else
|
else
|
||||||
data.squish!
|
data.squish!
|
||||||
end
|
end
|
||||||
|
|
||||||
if SiteSetting.search_ignore_accents
|
|
||||||
data = strip_diacritics(data)
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
data.gsub!(/\S+/) do |str|
|
data.gsub!(/\S+/) do |str|
|
||||||
|
@ -704,7 +697,7 @@ class Search
|
||||||
FROM topic_tags tt, tags
|
FROM topic_tags tt, tags
|
||||||
WHERE tt.tag_id = tags.id
|
WHERE tt.tag_id = tags.id
|
||||||
GROUP BY tt.topic_id
|
GROUP BY tt.topic_id
|
||||||
HAVING to_tsvector(#{default_ts_config}, array_to_string(array_agg(lower(tags.name)), ' ')) @@ to_tsquery(#{default_ts_config}, ?)
|
HAVING to_tsvector(#{default_ts_config}, #{Search.wrap_unaccent("array_to_string(array_agg(lower(tags.name)), ' ')")}) @@ to_tsquery(#{default_ts_config}, #{Search.wrap_unaccent('?')})
|
||||||
)", tags.join('&'))
|
)", tags.join('&'))
|
||||||
else
|
else
|
||||||
tags = match.split(",")
|
tags = match.split(",")
|
||||||
|
@ -1151,7 +1144,8 @@ class Search
|
||||||
|
|
||||||
def self.to_tsquery(ts_config: nil, term:, joiner: nil)
|
def self.to_tsquery(ts_config: nil, term:, joiner: nil)
|
||||||
ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config
|
ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config
|
||||||
tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, '#{self.escape_string(term)}')"
|
escaped_term = Search.wrap_unaccent("'#{self.escape_string(term)}'")
|
||||||
|
tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, #{escaped_term})"
|
||||||
tsquery = "REPLACE(#{tsquery}::text, '&', '#{self.escape_string(joiner)}')::tsquery" if joiner
|
tsquery = "REPLACE(#{tsquery}::text, '&', '#{self.escape_string(joiner)}')::tsquery" if joiner
|
||||||
tsquery
|
tsquery
|
||||||
end
|
end
|
||||||
|
|
|
@ -120,7 +120,7 @@ class Search
|
||||||
blurb = nil
|
blurb = nil
|
||||||
|
|
||||||
if scrub
|
if scrub
|
||||||
cooked = SearchIndexer.scrub_html_for_search(cooked)
|
cooked = SearchIndexer::HtmlScrubber.scrub(cooked)
|
||||||
|
|
||||||
urls = Set.new
|
urls = Set.new
|
||||||
cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
|
cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
|
||||||
|
|
|
@ -77,6 +77,35 @@ describe Search do
|
||||||
expect(result.tags).to contain_exactly()
|
expect(result.tags).to contain_exactly()
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context "accents" do
|
||||||
|
fab!(:post_1) { Fabricate(:post, raw: "Cette ****** d'art n'est pas une œuvre") }
|
||||||
|
fab!(:post_2) { Fabricate(:post, raw: "Cette oeuvre d'art n'est pas une *****") }
|
||||||
|
|
||||||
|
before do
|
||||||
|
SearchIndexer.enable
|
||||||
|
end
|
||||||
|
|
||||||
|
after do
|
||||||
|
SearchIndexer.disable
|
||||||
|
end
|
||||||
|
|
||||||
|
it "removes them if search_ignore_accents" do
|
||||||
|
SiteSetting.search_ignore_accents = true
|
||||||
|
[post_1, post_2].each { |post| SearchIndexer.index(post.topic, force: true) }
|
||||||
|
|
||||||
|
expect(Search.execute("oeuvre").posts).to contain_exactly(post_1, post_2)
|
||||||
|
expect(Search.execute("œuvre").posts).to contain_exactly(post_1, post_2)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does not remove them if not search_ignore_accents" do
|
||||||
|
SiteSetting.search_ignore_accents = false
|
||||||
|
[post_1, post_2].each { |post| SearchIndexer.index(post.topic, force: true) }
|
||||||
|
|
||||||
|
expect(Search.execute("œuvre").posts).to contain_exactly(post_1)
|
||||||
|
expect(Search.execute("oeuvre").posts).to contain_exactly(post_2)
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
context "custom_eager_load" do
|
context "custom_eager_load" do
|
||||||
|
|
|
@ -11,10 +11,6 @@ describe SearchIndexer do
|
||||||
SearchIndexer.disable
|
SearchIndexer.disable
|
||||||
end
|
end
|
||||||
|
|
||||||
def scrub(html, strip_diacritics: false)
|
|
||||||
SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
|
|
||||||
end
|
|
||||||
|
|
||||||
it 'correctly indexes chinese' do
|
it 'correctly indexes chinese' do
|
||||||
SiteSetting.default_locale = 'zh_CN'
|
SiteSetting.default_locale = 'zh_CN'
|
||||||
data = "你好世界"
|
data = "你好世界"
|
||||||
|
@ -36,48 +32,36 @@ describe SearchIndexer do
|
||||||
|
|
||||||
it 'extract youtube title' do
|
it 'extract youtube title' do
|
||||||
html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&wmode=opaque\"></div>"
|
html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&wmode=opaque\"></div>"
|
||||||
scrubbed = scrub(html)
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||||
expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]")
|
expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]")
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'extract a link' do
|
it 'extract a link' do
|
||||||
html = "<a href='http://meta.discourse.org/'>link</a>"
|
html = "<a href='http://meta.discourse.org/'>link</a>"
|
||||||
scrubbed = scrub(html)
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||||
expect(scrubbed).to eq("http://meta.discourse.org/ link")
|
expect(scrubbed).to eq("http://meta.discourse.org/ link")
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'extracts @username from mentions' do
|
it 'extracts @username from mentions' do
|
||||||
html = '<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'
|
html = '<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'
|
||||||
scrubbed = scrub(html)
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||||
expect(scrubbed).to eq('@狮子 @foo')
|
expect(scrubbed).to eq('@狮子 @foo')
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'extracts @groupname from group mentions' do
|
it 'extracts @groupname from group mentions' do
|
||||||
html = '<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'
|
html = '<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'
|
||||||
scrubbed = scrub(html)
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||||
expect(scrubbed).to eq('@автомобилист')
|
expect(scrubbed).to eq('@автомобилист')
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'extracts emoji name from emoji image' do
|
it 'extracts emoji name from emoji image' do
|
||||||
emoji = Emoji["wink"]
|
emoji = Emoji["wink"]
|
||||||
html = %Q|<img src=\"#{URI.join(Discourse.base_url_no_prefix, emoji.url)}\" title=\":wink:\" class=\"emoji only-emoji\" alt=\":wink:\" loading=\"lazy\" width=\"20\" height=\"20\">|
|
html = %Q|<img src=\"#{URI.join(Discourse.base_url_no_prefix, emoji.url)}\" title=\":wink:\" class=\"emoji only-emoji\" alt=\":wink:\" loading=\"lazy\" width=\"20\" height=\"20\">|
|
||||||
scrubbed = scrub(html)
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||||
|
|
||||||
expect(scrubbed).to eq(':wink:')
|
expect(scrubbed).to eq(':wink:')
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'uses ignore_accent setting to strip diacritics' do
|
|
||||||
html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
|
|
||||||
|
|
||||||
SiteSetting.search_ignore_accents = true
|
|
||||||
scrubbed = SearchIndexer.scrub_html_for_search(html)
|
|
||||||
expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好")
|
|
||||||
|
|
||||||
SiteSetting.search_ignore_accents = false
|
|
||||||
scrubbed = SearchIndexer.scrub_html_for_search(html)
|
|
||||||
expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好")
|
|
||||||
end
|
|
||||||
|
|
||||||
it "doesn't index local files" do
|
it "doesn't index local files" do
|
||||||
html = <<~HTML
|
html = <<~HTML
|
||||||
<p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
|
<p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
|
||||||
|
@ -95,7 +79,7 @@ describe SearchIndexer do
|
||||||
</div>
|
</div>
|
||||||
HTML
|
HTML
|
||||||
|
|
||||||
scrubbed = scrub(html)
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||||
|
|
||||||
expect(scrubbed).to eq("Discourse 51%20PM Untitled%20design%20(21)")
|
expect(scrubbed).to eq("Discourse 51%20PM Untitled%20design%20(21)")
|
||||||
end
|
end
|
||||||
|
@ -271,6 +255,20 @@ describe SearchIndexer do
|
||||||
"'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
|
"'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'should unaccent indexed content' do
|
||||||
|
SiteSetting.search_ignore_accents = true
|
||||||
|
post.update!(raw: "Cette oeuvre d'art n'est pas une œuvre")
|
||||||
|
post.post_search_data.reload
|
||||||
|
expect(post.post_search_data.search_data).not_to include('œuvr')
|
||||||
|
expect(post.post_search_data.search_data).to include('oeuvr')
|
||||||
|
|
||||||
|
SiteSetting.search_ignore_accents = false
|
||||||
|
SearchIndexer.index(post, force: true)
|
||||||
|
post.post_search_data.reload
|
||||||
|
expect(post.post_search_data.search_data).to include('œuvr')
|
||||||
|
expect(post.post_search_data.search_data).to include('oeuvr')
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
describe '.queue_post_reindex' do
|
describe '.queue_post_reindex' do
|
||||||
|
|
Loading…
Reference in New Issue