FEATURE: Use Postgres unaccent to ignore accents (#16100)

When the search_ignore_accents site setting is enabled, the search
indexer removes accents before indexing the content. The unaccent
function from PostgreSQL is better than Ruby's unicode_normalize(:nfkd);
for example, it also folds ligatures such as "œ" into "oe".
This commit is contained in:
Bianca Nenciu 2022-03-07 23:03:10 +02:00 committed by GitHub
parent 6e7cdc5bc3
commit 34b4b53bac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 75 additions and 53 deletions

View File

@ -17,10 +17,6 @@ class SearchIndexer
@disabled = false
end
# Scrubs +html+ by delegating to HtmlScrubber.scrub.
#
# html             - the HTML string to scrub.
# strip_diacritics - forwarded unchanged to HtmlScrubber.scrub. When the
#                    caller omits it, the search_ignore_accents site
#                    setting is read at call time and used instead.
#
# Returns whatever HtmlScrubber.scrub returns.
def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
  scrubber_options = { strip_diacritics: strip_diacritics }
  HtmlScrubber.scrub(html, **scrubber_options)
end
def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
raw_data = [a_weight, b_weight, c_weight, d_weight]
@ -35,10 +31,10 @@ class SearchIndexer
stemmer = table == "user" ? "simple" : Search.ts_config
ranked_index = <<~SQL
setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
SQL
ranked_params = {
@ -109,7 +105,7 @@ class SearchIndexer
table: 'topic',
id: topic_id,
a_weight: title,
b_weight: scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
b_weight: HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
)
end
@ -124,7 +120,7 @@ class SearchIndexer
# the original string. Since there is no way to estimate the length of
# the expected tsvector, we limit the input to ~50% of the maximum
# length of a tsvector (1_048_576 bytes).
d_weight: scrub_html_for_search(cooked)[0..600_000]
d_weight: HtmlScrubber.scrub(cooked)[0..600_000]
) do |params|
params["private_message"] = private_message
end
@ -294,12 +290,11 @@ class SearchIndexer
attr_reader :scrubbed
def initialize(strip_diacritics: false)
def initialize
@scrubbed = +""
@strip_diacritics = strip_diacritics
end
def self.scrub(html, strip_diacritics: false)
def self.scrub(html)
return +"" if html.blank?
begin
@ -338,9 +333,9 @@ class SearchIndexer
end
end
me = new(strip_diacritics: strip_diacritics)
Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
me.scrubbed.squish
html_scrubber = new
Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
html_scrubber.scrubbed.squish
end
MENTION_CLASSES ||= %w{mention mention-group}
@ -362,7 +357,6 @@ class SearchIndexer
end
# Appends +str+ to the scrubbed buffer, padded with one space on each side.
# When @strip_diacritics is set, the text is first passed through
# Search.strip_diacritics.
def characters(str)
  text = @strip_diacritics ? Search.strip_diacritics(str) : str
  scrubbed << " #{text} "
end
end

View File

@ -0,0 +1,7 @@
# frozen_string_literal: true
# Migration that enables the "unaccent" database extension.
#
# Expressed as a single `change` step around `enable_extension`.
class EnableUnaccentExtension < ActiveRecord::Migration[6.1]
  def change
    enable_extension("unaccent")
  end
end

View File

@ -21,13 +21,6 @@ class Search
5
end
# Removes diacritics from +str+.
#
# The string is NFKD-normalized, every match of the DIACRITICS pattern is
# deleted, and surrounding whitespace is stripped.
#
# Returns the normalized string after cleaning.
def self.strip_diacritics(str)
  str.unicode_normalize(:nfkd).tap do |normalized|
    # The bang methods return nil when nothing changed, so they are run for
    # their side effect only; `tap` hands back the string itself.
    normalized.gsub!(DIACRITICS, "")
    normalized.strip!
  end
end
# Returns the fixed value 50.
# NOTE(review): presumably the maximum number of results per filter —
# confirm against the callers, which are not visible here.
def self.per_filter
  50
end
@ -64,6 +57,10 @@ class Search
end
end
# Wraps a SQL expression in a call to unaccent() when the
# search_ignore_accents site setting is enabled.
#
# str - a SQL fragment (string) to wrap.
#
# Returns "unaccent(<str>)" when the setting is on; otherwise returns +str+
# itself, untouched.
def self.wrap_unaccent(str)
  return str unless SiteSetting.search_ignore_accents

  "unaccent(#{str})"
end
# Tells whether the "segment Chinese" code path applies.
#
# Returns true when the default locale is zh_TW or zh_CN; otherwise returns
# the value of the search_tokenize_chinese site setting.
def self.segment_chinese?
  return true if ['zh_TW', 'zh_CN'].include?(SiteSetting.default_locale)

  SiteSetting.search_tokenize_chinese
end
@ -115,10 +112,6 @@ class Search
else
data.squish!
end
if SiteSetting.search_ignore_accents
data = strip_diacritics(data)
end
end
data.gsub!(/\S+/) do |str|
@ -704,7 +697,7 @@ class Search
FROM topic_tags tt, tags
WHERE tt.tag_id = tags.id
GROUP BY tt.topic_id
HAVING to_tsvector(#{default_ts_config}, array_to_string(array_agg(lower(tags.name)), ' ')) @@ to_tsquery(#{default_ts_config}, ?)
HAVING to_tsvector(#{default_ts_config}, #{Search.wrap_unaccent("array_to_string(array_agg(lower(tags.name)), ' ')")}) @@ to_tsquery(#{default_ts_config}, #{Search.wrap_unaccent('?')})
)", tags.join('&'))
else
tags = match.split(",")
@ -1151,7 +1144,8 @@ class Search
def self.to_tsquery(ts_config: nil, term:, joiner: nil)
ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config
tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, '#{self.escape_string(term)}')"
escaped_term = Search.wrap_unaccent("'#{self.escape_string(term)}'")
tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, #{escaped_term})"
tsquery = "REPLACE(#{tsquery}::text, '&', '#{self.escape_string(joiner)}')::tsquery" if joiner
tsquery
end

View File

@ -120,7 +120,7 @@ class Search
blurb = nil
if scrub
cooked = SearchIndexer.scrub_html_for_search(cooked)
cooked = SearchIndexer::HtmlScrubber.scrub(cooked)
urls = Set.new
cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }

View File

@ -77,6 +77,35 @@ describe Search do
expect(result.tags).to contain_exactly()
end
end
context "accents" do
fab!(:post_1) { Fabricate(:post, raw: "Cette ****** d'art n'est pas une œuvre") }
fab!(:post_2) { Fabricate(:post, raw: "Cette oeuvre d'art n'est pas une *****") }
before do
SearchIndexer.enable
end
after do
SearchIndexer.disable
end
it "removes them if search_ignore_accents" do
SiteSetting.search_ignore_accents = true
[post_1, post_2].each { |post| SearchIndexer.index(post.topic, force: true) }
expect(Search.execute("oeuvre").posts).to contain_exactly(post_1, post_2)
expect(Search.execute("œuvre").posts).to contain_exactly(post_1, post_2)
end
it "does not remove them if not search_ignore_accents" do
SiteSetting.search_ignore_accents = false
[post_1, post_2].each { |post| SearchIndexer.index(post.topic, force: true) }
expect(Search.execute("œuvre").posts).to contain_exactly(post_1)
expect(Search.execute("oeuvre").posts).to contain_exactly(post_2)
end
end
end
context "custom_eager_load" do

View File

@ -11,10 +11,6 @@ describe SearchIndexer do
SearchIndexer.disable
end
# Spec helper: shorthand for SearchIndexer.scrub_html_for_search.
#
# html             - the HTML string to scrub.
# strip_diacritics - forwarded as-is; defaults to false here, so the helper
#                    does not consult the site setting unless told to.
def scrub(html, strip_diacritics: false)
  forwarded = { strip_diacritics: strip_diacritics }
  SearchIndexer.scrub_html_for_search(html, **forwarded)
end
it 'correctly indexes chinese' do
SiteSetting.default_locale = 'zh_CN'
data = "你好世界"
@ -36,48 +32,36 @@ describe SearchIndexer do
it 'extract youtube title' do
html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&amp;wmode=opaque\"></div>"
scrubbed = scrub(html)
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]")
end
it 'extract a link' do
html = "<a href='http://meta.discourse.org/'>link</a>"
scrubbed = scrub(html)
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq("http://meta.discourse.org/ link")
end
it 'extracts @username from mentions' do
html = '<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'
scrubbed = scrub(html)
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq('@狮子 @foo')
end
it 'extracts @groupname from group mentions' do
html = '<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'
scrubbed = scrub(html)
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq('@автомобилист')
end
it 'extracts emoji name from emoji image' do
emoji = Emoji["wink"]
html = %Q|<img src=\"#{URI.join(Discourse.base_url_no_prefix, emoji.url)}\" title=\":wink:\" class=\"emoji only-emoji\" alt=\":wink:\" loading=\"lazy\" width=\"20\" height=\"20\">|
scrubbed = scrub(html)
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq(':wink:')
end
it 'uses ignore_accent setting to strip diacritics' do
html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
SiteSetting.search_ignore_accents = true
scrubbed = SearchIndexer.scrub_html_for_search(html)
expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好")
SiteSetting.search_ignore_accents = false
scrubbed = SearchIndexer.scrub_html_for_search(html)
expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好")
end
it "doesn't index local files" do
html = <<~HTML
<p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
@ -95,7 +79,7 @@ describe SearchIndexer do
</div>
HTML
scrubbed = scrub(html)
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq("Discourse 51%20PM Untitled%20design%20(21)")
end
@ -271,6 +255,20 @@ describe SearchIndexer do
"'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
)
end
it 'should unaccent indexed content' do
SiteSetting.search_ignore_accents = true
post.update!(raw: "Cette oeuvre d'art n'est pas une œuvre")
post.post_search_data.reload
expect(post.post_search_data.search_data).not_to include('œuvr')
expect(post.post_search_data.search_data).to include('oeuvr')
SiteSetting.search_ignore_accents = false
SearchIndexer.index(post, force: true)
post.post_search_data.reload
expect(post.post_search_data.search_data).to include('œuvr')
expect(post.post_search_data.search_data).to include('oeuvr')
end
end
describe '.queue_post_reindex' do