FEATURE: Use Postgres unaccent to ignore accents (#16100)
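The search_ignore_accents site setting makes the search indexer remove accents before indexing content. Accent removal is now done by PostgreSQL's unaccent function rather than Ruby's unicode_normalize(:nfkd), because unaccent gives better results: for example, it also folds ligatures such as "œ" into "oe", which NFKD decomposition leaves untouched.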
This commit is contained in:
parent
6e7cdc5bc3
commit
34b4b53bac
|
@ -17,10 +17,6 @@ class SearchIndexer
|
|||
@disabled = false
|
||||
end
|
||||
|
||||
# Convenience wrapper around HtmlScrubber.scrub.
# `strip_diacritics` defaults to SiteSetting.search_ignore_accents and is
# forwarded to the scrubber unchanged.
def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
|
||||
HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
|
||||
end
|
||||
|
||||
def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
|
||||
raw_data = [a_weight, b_weight, c_weight, d_weight]
|
||||
|
||||
|
@ -35,10 +31,10 @@ class SearchIndexer
|
|||
stemmer = table == "user" ? "simple" : Search.ts_config
|
||||
|
||||
ranked_index = <<~SQL
|
||||
setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
|
||||
setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
|
||||
setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
|
||||
setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
|
||||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
|
||||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
|
||||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
|
||||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
|
||||
SQL
|
||||
|
||||
ranked_params = {
|
||||
|
@ -109,7 +105,7 @@ class SearchIndexer
|
|||
table: 'topic',
|
||||
id: topic_id,
|
||||
a_weight: title,
|
||||
b_weight: scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
|
||||
b_weight: HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
|
||||
)
|
||||
end
|
||||
|
||||
|
@ -124,7 +120,7 @@ class SearchIndexer
|
|||
# the original string. Since there is no way to estimate the length of
|
||||
# the expected tsvector, we limit the input to ~50% of the maximum
|
||||
# length of a tsvector (1_048_576 bytes).
|
||||
d_weight: scrub_html_for_search(cooked)[0..600_000]
|
||||
d_weight: HtmlScrubber.scrub(cooked)[0..600_000]
|
||||
) do |params|
|
||||
params["private_message"] = private_message
|
||||
end
|
||||
|
@ -294,12 +290,11 @@ class SearchIndexer
|
|||
|
||||
attr_reader :scrubbed
|
||||
|
||||
def initialize(strip_diacritics: false)
|
||||
def initialize
|
||||
@scrubbed = +""
|
||||
@strip_diacritics = strip_diacritics
|
||||
end
|
||||
|
||||
def self.scrub(html, strip_diacritics: false)
|
||||
def self.scrub(html)
|
||||
return +"" if html.blank?
|
||||
|
||||
begin
|
||||
|
@ -338,9 +333,9 @@ class SearchIndexer
|
|||
end
|
||||
end
|
||||
|
||||
me = new(strip_diacritics: strip_diacritics)
|
||||
Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
|
||||
me.scrubbed.squish
|
||||
html_scrubber = new
|
||||
Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
|
||||
html_scrubber.scrubbed.squish
|
||||
end
|
||||
|
||||
MENTION_CLASSES ||= %w{mention mention-group}
|
||||
|
@ -362,7 +357,6 @@ class SearchIndexer
|
|||
end
|
||||
|
||||
# Appends a chunk of text to the `scrubbed` buffer, padded with one space on
# each side. When @strip_diacritics is truthy the chunk is first passed
# through Search.strip_diacritics.
# NOTE(review): presumably the text callback invoked by the Nokogiri SAX
# parser this object is handed to — confirm.
def characters(str)
|
||||
str = Search.strip_diacritics(str) if @strip_diacritics
|
||||
scrubbed << " #{str} "
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
# Enables the PostgreSQL `unaccent` extension, which provides the
# unaccent(...) SQL function that Search.wrap_unaccent emits.
class EnableUnaccentExtension < ActiveRecord::Migration[6.1]
|
||||
# Declared in `change` so the migration consists of the single
# enable_extension call.
def change
|
||||
enable_extension 'unaccent'
|
||||
end
|
||||
end
|
|
@ -21,13 +21,6 @@ class Search
|
|||
5
|
||||
end
|
||||
|
||||
# Removes diacritical marks from +str+.
#
# The string is decomposed with Unicode NFKD normalization, every match of
# DIACRITICS is deleted, and leading/trailing whitespace is trimmed.
#
# Only non-mutating calls are used: String#unicode_normalize can hand back
# the receiver itself (e.g. for US-ASCII encoded strings), so in-place
# gsub!/strip! would modify the caller's string or raise FrozenError.
#
# @param str [String] text to clean
# @return [String] a new string; +str+ is never modified
def self.strip_diacritics(str)
  str.unicode_normalize(:nfkd).gsub(DIACRITICS, "").strip
end
|
||||
|
||||
# Returns the fixed value 50.
# NOTE(review): presumably the maximum number of results per filter —
# confirm against callers.
def self.per_filter
|
||||
50
|
||||
end
|
||||
|
@ -64,6 +57,10 @@ class Search
|
|||
end
|
||||
end
|
||||
|
||||
# Wraps an SQL expression fragment in a call to unaccent(...) when the
# search_ignore_accents site setting is on; otherwise the fragment is
# returned untouched.
#
# @param str [String] SQL expression text (not a bound value)
# @return [String] the fragment, optionally wrapped
def self.wrap_unaccent(str)
  return str unless SiteSetting.search_ignore_accents

  "unaccent(#{str})"
end
|
||||
|
||||
# True when the default locale is zh_TW or zh_CN; for any other locale the
# value of the search_tokenize_chinese site setting is returned.
def self.segment_chinese?
  chinese_locale = ['zh_TW', 'zh_CN'].include?(SiteSetting.default_locale)
  return true if chinese_locale

  SiteSetting.search_tokenize_chinese
end
|
||||
|
@ -115,10 +112,6 @@ class Search
|
|||
else
|
||||
data.squish!
|
||||
end
|
||||
|
||||
if SiteSetting.search_ignore_accents
|
||||
data = strip_diacritics(data)
|
||||
end
|
||||
end
|
||||
|
||||
data.gsub!(/\S+/) do |str|
|
||||
|
@ -704,7 +697,7 @@ class Search
|
|||
FROM topic_tags tt, tags
|
||||
WHERE tt.tag_id = tags.id
|
||||
GROUP BY tt.topic_id
|
||||
HAVING to_tsvector(#{default_ts_config}, array_to_string(array_agg(lower(tags.name)), ' ')) @@ to_tsquery(#{default_ts_config}, ?)
|
||||
HAVING to_tsvector(#{default_ts_config}, #{Search.wrap_unaccent("array_to_string(array_agg(lower(tags.name)), ' ')")}) @@ to_tsquery(#{default_ts_config}, #{Search.wrap_unaccent('?')})
|
||||
)", tags.join('&'))
|
||||
else
|
||||
tags = match.split(",")
|
||||
|
@ -1151,7 +1144,8 @@ class Search
|
|||
|
||||
# Builds the SQL text of a TO_TSQUERY(...) expression for +term+.
#
# @param ts_config [String, nil] text search configuration name; quoted via
#   the ActiveRecord connection when given, otherwise default_ts_config is
#   interpolated as-is
# @param term [String] search term; escaped with escape_string, wrapped in
#   single quotes and passed through Search.wrap_unaccent
# @param joiner [String, nil] when given, every '&' in the textual form of
#   the tsquery is replaced by this (escaped) string and the result is cast
#   back to tsquery
# @return [String] SQL expression fragment
def self.to_tsquery(ts_config: nil, term:, joiner: nil)
  ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config
  # The quoted literal is built once and handed to wrap_unaccent, so the
  # unaccent(...) call, when enabled, surrounds the whole literal.
  escaped_term = Search.wrap_unaccent("'#{self.escape_string(term)}'")
  tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, #{escaped_term})"
  tsquery = "REPLACE(#{tsquery}::text, '&', '#{self.escape_string(joiner)}')::tsquery" if joiner
  tsquery
end
|
||||
|
|
|
@ -120,7 +120,7 @@ class Search
|
|||
blurb = nil
|
||||
|
||||
if scrub
|
||||
cooked = SearchIndexer.scrub_html_for_search(cooked)
|
||||
cooked = SearchIndexer::HtmlScrubber.scrub(cooked)
|
||||
|
||||
urls = Set.new
|
||||
cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
|
||||
|
|
|
@ -77,6 +77,35 @@ describe Search do
|
|||
expect(result.tags).to contain_exactly()
|
||||
end
|
||||
end
|
||||
|
||||
# Checks how the search_ignore_accents setting affects matching of the
# ligature spelling "œuvre" versus the plain spelling "oeuvre".
context "accents" do
|
||||
# post_1 contains only the ligature spelling, post_2 only the plain one;
# the asterisks fill the position where the other post has the word.
fab!(:post_1) { Fabricate(:post, raw: "Cette ****** d'art n'est pas une œuvre") }
|
||||
fab!(:post_2) { Fabricate(:post, raw: "Cette oeuvre d'art n'est pas une *****") }
|
||||
|
||||
before do
|
||||
SearchIndexer.enable
|
||||
end
|
||||
|
||||
after do
|
||||
SearchIndexer.disable
|
||||
end
|
||||
|
||||
# With the setting on, either spelling is expected to find both posts.
it "removes them if search_ignore_accents" do
|
||||
SiteSetting.search_ignore_accents = true
|
||||
# Re-index after changing the setting so the stored data reflects it.
[post_1, post_2].each { |post| SearchIndexer.index(post.topic, force: true) }
|
||||
|
||||
expect(Search.execute("oeuvre").posts).to contain_exactly(post_1, post_2)
|
||||
expect(Search.execute("œuvre").posts).to contain_exactly(post_1, post_2)
|
||||
end
|
||||
|
||||
# With the setting off, each spelling is expected to find only the post
# that contains it verbatim.
it "does not remove them if not search_ignore_accents" do
|
||||
SiteSetting.search_ignore_accents = false
|
||||
[post_1, post_2].each { |post| SearchIndexer.index(post.topic, force: true) }
|
||||
|
||||
expect(Search.execute("œuvre").posts).to contain_exactly(post_1)
|
||||
expect(Search.execute("oeuvre").posts).to contain_exactly(post_2)
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
context "custom_eager_load" do
|
||||
|
|
|
@ -11,10 +11,6 @@ describe SearchIndexer do
|
|||
SearchIndexer.disable
|
||||
end
|
||||
|
||||
def scrub(html, strip_diacritics: false)
|
||||
SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
|
||||
end
|
||||
|
||||
it 'correctly indexes chinese' do
|
||||
SiteSetting.default_locale = 'zh_CN'
|
||||
data = "你好世界"
|
||||
|
@ -36,48 +32,36 @@ describe SearchIndexer do
|
|||
|
||||
it 'extract youtube title' do
|
||||
html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&wmode=opaque\"></div>"
|
||||
scrubbed = scrub(html)
|
||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||
expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]")
|
||||
end
|
||||
|
||||
it 'extract a link' do
|
||||
html = "<a href='http://meta.discourse.org/'>link</a>"
|
||||
scrubbed = scrub(html)
|
||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||
expect(scrubbed).to eq("http://meta.discourse.org/ link")
|
||||
end
|
||||
|
||||
it 'extracts @username from mentions' do
|
||||
html = '<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'
|
||||
scrubbed = scrub(html)
|
||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||
expect(scrubbed).to eq('@狮子 @foo')
|
||||
end
|
||||
|
||||
it 'extracts @groupname from group mentions' do
|
||||
html = '<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'
|
||||
scrubbed = scrub(html)
|
||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||
expect(scrubbed).to eq('@автомобилист')
|
||||
end
|
||||
|
||||
it 'extracts emoji name from emoji image' do
|
||||
emoji = Emoji["wink"]
|
||||
html = %Q|<img src=\"#{URI.join(Discourse.base_url_no_prefix, emoji.url)}\" title=\":wink:\" class=\"emoji only-emoji\" alt=\":wink:\" loading=\"lazy\" width=\"20\" height=\"20\">|
|
||||
scrubbed = scrub(html)
|
||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||
|
||||
expect(scrubbed).to eq(':wink:')
|
||||
end
|
||||
|
||||
it 'uses ignore_accent setting to strip diacritics' do
|
||||
html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
|
||||
|
||||
SiteSetting.search_ignore_accents = true
|
||||
scrubbed = SearchIndexer.scrub_html_for_search(html)
|
||||
expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好")
|
||||
|
||||
SiteSetting.search_ignore_accents = false
|
||||
scrubbed = SearchIndexer.scrub_html_for_search(html)
|
||||
expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好")
|
||||
end
|
||||
|
||||
it "doesn't index local files" do
|
||||
html = <<~HTML
|
||||
<p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
|
||||
|
@ -95,7 +79,7 @@ describe SearchIndexer do
|
|||
</div>
|
||||
HTML
|
||||
|
||||
scrubbed = scrub(html)
|
||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||
|
||||
expect(scrubbed).to eq("Discourse 51%20PM Untitled%20design%20(21)")
|
||||
end
|
||||
|
@ -271,6 +255,20 @@ describe SearchIndexer do
|
|||
"'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
|
||||
)
|
||||
end
|
||||
|
||||
# The raw text contains both the plain ("oeuvre") and the ligature ("œuvre")
# spelling; the assertions look for the fragments 'oeuvr' / 'œuvr' in the
# stored search data (presumably the stemmed lexemes — confirm).
it 'should unaccent indexed content' do
|
||||
SiteSetting.search_ignore_accents = true
|
||||
post.update!(raw: "Cette oeuvre d'art n'est pas une œuvre")
|
||||
post.post_search_data.reload
|
||||
# Setting on: only the unaccented form is expected in the index.
expect(post.post_search_data.search_data).not_to include('œuvr')
|
||||
expect(post.post_search_data.search_data).to include('oeuvr')
|
||||
|
||||
SiteSetting.search_ignore_accents = false
|
||||
# Re-index explicitly after flipping the setting.
SearchIndexer.index(post, force: true)
|
||||
post.post_search_data.reload
|
||||
# Setting off: both spellings are expected to be kept as written.
expect(post.post_search_data.search_data).to include('œuvr')
|
||||
expect(post.post_search_data.search_data).to include('oeuvr')
|
||||
end
|
||||
end
|
||||
|
||||
describe '.queue_post_reindex' do
|
||||
|
|
Loading…
Reference in New Issue