correct regression searching with diacritics

This commit is contained in:
Sam 2018-08-24 10:00:51 +10:00
parent 29315b73c2
commit ac11f8df52
3 changed files with 36 additions and 6 deletions

View File

@ -166,6 +166,14 @@ class SearchIndexer
end
class HtmlScrubber < Nokogiri::XML::SAX::Document
# Returns a new string with combining diacritical marks removed and
# surrounding whitespace stripped.
#
# The input is first decomposed with Unicode NFKD normalization so that
# accented characters become a base character followed by combining marks;
# every code point matched by DIACRITICS is then deleted.
#
# The argument is never modified. Non-mutating calls are used on purpose:
# String#unicode_normalize hands back the receiver itself for US-ASCII
# strings, so in-place gsub!/strip! would alter (or, when frozen, raise on)
# the caller's string.
#
# @param str [String] text to clean
# @return [String] a new, diacritic-free, stripped string
def self.strip_diacritics(str)
  str.unicode_normalize(:nfkd).gsub(DIACRITICS, "").strip
end
attr_reader :scrubbed
def initialize
@ -192,8 +200,8 @@ class SearchIndexer
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
# Appends the given text to the scrubbed buffer, padded with one space on
# each side, after removing diacritics and surrounding whitespace.
# Presumably invoked by the Nokogiri SAX parser for text content, since the
# enclosing class derives from Nokogiri::XML::SAX::Document -- confirm.
# This diff view shows two forms: an inline normalize/gsub/strip chain and a
# call to the shared HtmlScrubber.strip_diacritics helper.
def characters(string)
scrubbed << " #{string.unicode_normalize(:nfkd).gsub(DIACRITICS, "").strip} "
def characters(str)
scrubbed << " #{HtmlScrubber.strip_diacritics(str)} "
end
end
end

View File

@ -132,11 +132,14 @@ class Search
@valid = true
@page = @opts[:page]
term = term.to_s.dup
# Removes any zero-width characters from search terms
term.to_s.gsub!(/[\u200B-\u200D\uFEFF]/, '')
term.gsub!(/[\u200B-\u200D\uFEFF]/, '')
# Replace curly quotes to regular quotes
term.to_s.gsub!(/[\u201c\u201d]/, '"')
@clean_term = term.to_s.dup
term.gsub!(/[\u201c\u201d]/, '"')
@clean_term = term
term = process_advanced_search!(term)
@ -825,9 +828,10 @@ class Search
end
# Returns the query produced by Search.ts_query for the current @term,
# memoized in @ts_query_cache under a key built from the ts_config (or
# default_ts_config), the raw @term and the weight_filter.
def ts_query(ts_config = nil, weight_filter: nil)
# We must strip diacritics from the term, otherwise we will get no matches.
# HtmlScrubber#characters strips them from scrubbed text with the same
# helper; presumably that scrubbed text is what gets indexed -- confirm.
@ts_query_cache ||= {}
@ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=
Search.ts_query(term: @term, ts_config: ts_config, weight_filter: weight_filter)
Search.ts_query(term: SearchIndexer::HtmlScrubber.strip_diacritics(@term), ts_config: ts_config, weight_filter: weight_filter)
end
def wrap_rows(query)

View File

@ -990,6 +990,24 @@ describe Search do
end
end
context 'diacritics' do
  # A single post mixing Thai text, an accented Latin word and a plain word.
  let!(:post1) { Fabricate(:post, raw: 'สวัสดี Régis hello') }

  it 'allows strips correctly' do
    # Each query -- plain, unaccented, accented and Thai -- must find the post.
    ['hello', 'regis', 'Régis', 'สวัสดี'].each do |query|
      found = Search.execute(query, type_filter: 'topic')
      expect(found.posts.length).to eq(1)
    end
  end
end
context 'pagination' do
let(:number_of_results) { 2 }
let!(:post1) { Fabricate(:post, raw: 'hello hello hello hello hello') }