FIX: Strip query from URLs when indexing for search.

Indexing query strings in URLS produces inconsistent results in PG and pollutes the search data for really little gain. The following seems to work as expected... ``` discourse_development=# SELECT TO_TSVECTOR('https://www.discourse.org?test=2&test2=3'); to_tsvector ------------------------------------------------------ '2':3 '3':5 'test':2 'test2':4 'www.discourse.org':1 ``` However, once a path is present ``` discourse_development=# SELECT TO_TSVECTOR('https://www.discourse.org/latest?test=2&test2=3'); to_tsvector ---------------------------------------------------------------------------------------------- '/latest?test=2&test2=3':3 'www.discourse.org':2 'www.discourse.org/latest?test=2&test2=3':1 ``` The lexeme contains both the path and the query string.
2020-07-09 17:02:02 +08:00 · 2020-07-09 17:02:02 +08:00 · 2196d0b9ae
parent 5c230266d3
commit 2196d0b9ae
2 changed files with 12 additions and 3 deletions
--- a/lib/search.rb
+++ b/lib/search.rb
@ -86,6 +86,15 @@ class Search
        data = strip_diacritics(data)
      end
    end
+
+    data.gsub!(EmailCook.url_regexp) do |url|
+      uri = URI.parse(url)
+      uri.query = nil
+      uri.to_s
+    rescue URI::Error
+      # Don't fail even if URL turns out to be invalid
+    end
+
    data
  end

--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@ -145,7 +145,7 @@ describe SearchIndexer do
      )
    end

-    it 'should tokenize host of a URL' do
+    it 'should tokenize host of a URL and removes query string' do
      category = Fabricate(:category, name: 'awesome category')
      topic = Fabricate(:topic, category: category, title: 'this is a test topic')

@ -158,11 +158,11 @@ describe SearchIndexer do
      topic = post.topic

      expect(post.post_search_data.raw_data).to eq(
-        "#{topic.title} #{category.name} a https://cnn.com?bob=1 , http://stuff.com.au?bill=1 b http://abc.net/xyz=1 abc.net/xyz=1"
+        "#{topic.title} #{category.name} a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1"
      )

      expect(post.post_search_data.search_data).to eq(
-        "'/xyz=1':18,21 '1':11,14 'abc':17,20 'abc.net':17,20 'abc.net/xyz=1':16,19 'au':12 'awesom':6B 'b':15 'bill':13 'bob':10 'categori':7B 'cnn':9 'cnn.com':9 'com':9,12 'com.au':12 'net':17,20 'stuff':12 'stuff.com.au':12 'test':4A 'topic':5A"
+        "'/xyz=1':14,17 'abc':13,16 'abc.net':13,16 'abc.net/xyz=1':12,15 'au':10 'awesom':6B 'b':11 'categori':7B 'cnn':9 'cnn.com':9 'com':9,10 'com.au':10 'net':13,16 'stuff':10 'stuff.com.au':10 'test':4A 'topic':5A"
      )
    end