FIX: revert diacritic stripping
For more details, see the test case and the discussion at: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
This commit is contained in:
parent
81b99efc68
commit
9b7cab589a
|
@ -176,14 +176,16 @@ class SearchIndexer
|
||||||
|
|
||||||
attr_reader :scrubbed
|
attr_reader :scrubbed
|
||||||
|
|
||||||
def initialize
|
def initialize(strip_diacritics: false)
|
||||||
@scrubbed = +""
|
@scrubbed = +""
|
||||||
|
# for now we are disabling this per: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
|
||||||
|
@strip_diacritics = strip_diacritics
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.scrub(html)
|
def self.scrub(html, strip_diacritics: false)
|
||||||
return +"" if html.blank?
|
return +"" if html.blank?
|
||||||
|
|
||||||
me = new
|
me = new(strip_diacritics: strip_diacritics)
|
||||||
Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
|
Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
|
||||||
me.scrubbed
|
me.scrubbed
|
||||||
end
|
end
|
||||||
|
@ -201,7 +203,8 @@ class SearchIndexer
|
||||||
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
|
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
|
||||||
|
|
||||||
def characters(str)
|
def characters(str)
|
||||||
scrubbed << " #{HtmlScrubber.strip_diacritics(str)} "
|
str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics
|
||||||
|
scrubbed << " #{str} "
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -828,10 +828,9 @@ class Search
|
||||||
end
|
end
|
||||||
|
|
||||||
def ts_query(ts_config = nil, weight_filter: nil)
|
def ts_query(ts_config = nil, weight_filter: nil)
|
||||||
# we must strip diacritics otherwise we will get no matches
|
|
||||||
@ts_query_cache ||= {}
|
@ts_query_cache ||= {}
|
||||||
@ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=
|
@ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=
|
||||||
Search.ts_query(term: SearchIndexer::HtmlScrubber.strip_diacritics(@term), ts_config: ts_config, weight_filter: weight_filter)
|
Search.ts_query(term: @term, ts_config: ts_config, weight_filter: weight_filter)
|
||||||
end
|
end
|
||||||
|
|
||||||
def wrap_rows(query)
|
def wrap_rows(query)
|
||||||
|
|
|
@ -1006,11 +1006,15 @@ describe Search do
|
||||||
results = Search.execute('hello', type_filter: 'topic')
|
results = Search.execute('hello', type_filter: 'topic')
|
||||||
expect(results.posts.length).to eq(1)
|
expect(results.posts.length).to eq(1)
|
||||||
|
|
||||||
|
# TODO when we add diacritic support we should return 1 here
|
||||||
results = Search.execute('regis', type_filter: 'topic')
|
results = Search.execute('regis', type_filter: 'topic')
|
||||||
|
expect(results.posts.length).to eq(0)
|
||||||
|
|
||||||
|
results = Search.execute('Régis', type_filter: 'topic', include_blurbs: true)
|
||||||
expect(results.posts.length).to eq(1)
|
expect(results.posts.length).to eq(1)
|
||||||
|
|
||||||
results = Search.execute('Régis', type_filter: 'topic')
|
# this is a test we have to keep working
|
||||||
expect(results.posts.length).to eq(1)
|
expect(results.blurb(results.posts.first)).to include('Régis')
|
||||||
|
|
||||||
results = Search.execute('สวัสดี', type_filter: 'topic')
|
results = Search.execute('สวัสดี', type_filter: 'topic')
|
||||||
expect(results.posts.length).to eq(1)
|
expect(results.posts.length).to eq(1)
|
||||||
|
|
|
@ -32,7 +32,7 @@ describe SearchIndexer do
|
||||||
it 'removes diacritics' do
|
it 'removes diacritics' do
|
||||||
html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
|
html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
|
||||||
|
|
||||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html, strip_diacritics: true)
|
||||||
|
|
||||||
expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ")
|
expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ")
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue