FIX: revert diacritic stripping

See more details in the test case and at: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
This commit is contained in:
Sam 2018-08-31 11:46:55 +10:00
parent 81b99efc68
commit 9b7cab589a
4 changed files with 15 additions and 9 deletions

View File

@ -176,14 +176,16 @@ class SearchIndexer
attr_reader :scrubbed attr_reader :scrubbed
def initialize def initialize(strip_diacritics: false)
@scrubbed = +"" @scrubbed = +""
# for now diacritic stripping is disabled by default, per: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
@strip_diacritics = strip_diacritics
end end
def self.scrub(html) def self.scrub(html, strip_diacritics: false)
return +"" if html.blank? return +"" if html.blank?
me = new me = new(strip_diacritics: strip_diacritics)
Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>") Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
me.scrubbed me.scrubbed
end end
@ -201,7 +203,8 @@ class SearchIndexer
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/ DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
def characters(str) def characters(str)
scrubbed << " #{HtmlScrubber.strip_diacritics(str)} " str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics
scrubbed << " #{str} "
end end
end end
end end

View File

@ -828,10 +828,9 @@ class Search
end end
def ts_query(ts_config = nil, weight_filter: nil) def ts_query(ts_config = nil, weight_filter: nil)
# we must strip diacritics otherwise we will get no matches
@ts_query_cache ||= {} @ts_query_cache ||= {}
@ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||= @ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=
Search.ts_query(term: SearchIndexer::HtmlScrubber.strip_diacritics(@term), ts_config: ts_config, weight_filter: weight_filter) Search.ts_query(term: @term, ts_config: ts_config, weight_filter: weight_filter)
end end
def wrap_rows(query) def wrap_rows(query)

View File

@ -1006,11 +1006,15 @@ describe Search do
results = Search.execute('hello', type_filter: 'topic') results = Search.execute('hello', type_filter: 'topic')
expect(results.posts.length).to eq(1) expect(results.posts.length).to eq(1)
# TODO when we add diacritic support we should return 1 here
results = Search.execute('regis', type_filter: 'topic') results = Search.execute('regis', type_filter: 'topic')
expect(results.posts.length).to eq(0)
results = Search.execute('Régis', type_filter: 'topic', include_blurbs: true)
expect(results.posts.length).to eq(1) expect(results.posts.length).to eq(1)
results = Search.execute('Régis', type_filter: 'topic') # this is a test we have to keep working
expect(results.posts.length).to eq(1) expect(results.blurb(results.posts.first)).to include('Régis')
results = Search.execute('สวัสดี', type_filter: 'topic') results = Search.execute('สวัสดี', type_filter: 'topic')
expect(results.posts.length).to eq(1) expect(results.posts.length).to eq(1)

View File

@ -32,7 +32,7 @@ describe SearchIndexer do
it 'removes diacritics' do it 'removes diacritics' do
html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>" html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
scrubbed = SearchIndexer::HtmlScrubber.scrub(html) scrubbed = SearchIndexer::HtmlScrubber.scrub(html, strip_diacritics: true)
expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ") expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ")
end end