FIX: Strip query from URLs when indexing for search.
Indexing query strings in URLs produces inconsistent results in PostgreSQL and pollutes the search data for very little gain. The following seems to work as expected:

```
discourse_development=# SELECT TO_TSVECTOR('https://www.discourse.org?test=2&test2=3');
                     to_tsvector
------------------------------------------------------
 '2':3 '3':5 'test':2 'test2':4 'www.discourse.org':1
```

However, once a path is present:

```
discourse_development=# SELECT TO_TSVECTOR('https://www.discourse.org/latest?test=2&test2=3');
                                          to_tsvector
----------------------------------------------------------------------------------------------
 '/latest?test=2&test2=3':3 'www.discourse.org':2 'www.discourse.org/latest?test=2&test2=3':1
```

In this case the lexeme contains both the path and the query string.
This commit is contained in:
parent
5c230266d3
commit
2196d0b9ae
|
@ -86,6 +86,15 @@ class Search
|
|||
data = strip_diacritics(data)
|
||||
end
|
||||
end
|
||||
|
||||
data.gsub!(EmailCook.url_regexp) do |url|
|
||||
uri = URI.parse(url)
|
||||
uri.query = nil
|
||||
uri.to_s
|
||||
rescue URI::Error
|
||||
# Don't fail even if URL turns out to be invalid
|
||||
end
|
||||
|
||||
data
|
||||
end
|
||||
|
||||
|
|
|
@ -145,7 +145,7 @@ describe SearchIndexer do
|
|||
)
|
||||
end
|
||||
|
||||
it 'should tokenize host of a URL' do
|
||||
it 'should tokenize host of a URL and removes query string' do
|
||||
category = Fabricate(:category, name: 'awesome category')
|
||||
topic = Fabricate(:topic, category: category, title: 'this is a test topic')
|
||||
|
||||
|
@ -158,11 +158,11 @@ describe SearchIndexer do
|
|||
topic = post.topic
|
||||
|
||||
expect(post.post_search_data.raw_data).to eq(
|
||||
"#{topic.title} #{category.name} a https://cnn.com?bob=1 , http://stuff.com.au?bill=1 b http://abc.net/xyz=1 abc.net/xyz=1"
|
||||
"#{topic.title} #{category.name} a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1"
|
||||
)
|
||||
|
||||
expect(post.post_search_data.search_data).to eq(
|
||||
"'/xyz=1':18,21 '1':11,14 'abc':17,20 'abc.net':17,20 'abc.net/xyz=1':16,19 'au':12 'awesom':6B 'b':15 'bill':13 'bob':10 'categori':7B 'cnn':9 'cnn.com':9 'com':9,12 'com.au':12 'net':17,20 'stuff':12 'stuff.com.au':12 'test':4A 'topic':5A"
|
||||
"'/xyz=1':14,17 'abc':13,16 'abc.net':13,16 'abc.net/xyz=1':12,15 'au':10 'awesom':6B 'b':11 'categori':7B 'cnn':9 'cnn.com':9 'com':9,10 'com.au':10 'net':13,16 'stuff':10 'stuff.com.au':10 'test':4A 'topic':5A"
|
||||
)
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue