FIX: revert diacritic stripping

For more details, see the updated test case and the discussion at: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
2018-08-31 11:46:55 +10:00 · 2018-08-31 11:46:55 +10:00 · 9b7cab589a
parent 81b99efc68
commit 9b7cab589a
4 changed files with 15 additions and 9 deletions
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@ -176,14 +176,16 @@ class SearchIndexer

    attr_reader :scrubbed

-    def initialize
+    def initialize(strip_diacritics: false)
      @scrubbed = +""
+      # for now we are disabling this per: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
+      @strip_diacritics = strip_diacritics
    end

-    def self.scrub(html)
+    def self.scrub(html, strip_diacritics: false)
      return +"" if html.blank?

-      me = new
+      me = new(strip_diacritics: strip_diacritics)
      Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
      me.scrubbed
    end
@ -201,7 +203,8 @@ class SearchIndexer
    DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/

    def characters(str)
-      scrubbed << " #{HtmlScrubber.strip_diacritics(str)} "
+      str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics
+      scrubbed << " #{str} "
    end
  end
 end
--- a/lib/search.rb
+++ b/lib/search.rb
@ -828,10 +828,9 @@ class Search
  end

  def ts_query(ts_config = nil, weight_filter: nil)
-    # we must strip diacritics otherwise we will get no matches
    @ts_query_cache ||= {}
    @ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=
-      Search.ts_query(term: SearchIndexer::HtmlScrubber.strip_diacritics(@term), ts_config: ts_config, weight_filter: weight_filter)
+      Search.ts_query(term: @term, ts_config: ts_config, weight_filter: weight_filter)
  end

  def wrap_rows(query)
--- a/spec/components/search_spec.rb
+++ b/spec/components/search_spec.rb
@ -1006,11 +1006,15 @@ describe Search do
      results = Search.execute('hello', type_filter: 'topic')
      expect(results.posts.length).to eq(1)

+      # TODO when we add diacritic support we should return 1 here
      results = Search.execute('regis', type_filter: 'topic')
+      expect(results.posts.length).to eq(0)
+
+      results = Search.execute('Régis', type_filter: 'topic', include_blurbs: true)
      expect(results.posts.length).to eq(1)

-      results = Search.execute('Régis', type_filter: 'topic')
-      expect(results.posts.length).to eq(1)
+      # this is a test we have to keep working: the blurb must retain the accented text
+      expect(results.blurb(results.posts.first)).to include('Régis')

      results = Search.execute('สวัสดี', type_filter: 'topic')
      expect(results.posts.length).to eq(1)
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@ -32,7 +32,7 @@ describe SearchIndexer do
  it 'removes diacritics' do
    html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"

-    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
+    scrubbed = SearchIndexer::HtmlScrubber.scrub(html, strip_diacritics: true)

    expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ")
  end