FIX: domain searches not working properly for URLs (#20136)
If a post contains a domain whose words stem to something that is not a prefix of the original word, single-word searches will not match it. For example, in happy.com, `happy` stems to `happi`, so a search for "happy" will not find URLs that include it. To fix this, the indexer now also indexes the individual words of a domain together with their stemmed forms. This bloats the index a tiny bit, but the impact is limited. A full search reindex is required for the change to take effect. Once we are done refining search we can consider a full version bump.
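
The mismatch is easy to reproduce directly against PostgreSQL. The following is a minimal standalone sketch, not part of this commit; it assumes the pg gem and a reachable local database, and the connection details are hypothetical.

require "pg"

conn = PG.connect(dbname: "postgres") # hypothetical connection details

# The English stemmer reduces "happy" to "happi" ...
puts conn.exec_params("SELECT to_tsvector('english', $1)", ["happy"]).getvalue(0, 0)
# => 'happi':1

# ... but a URL is indexed as host/url lexemes, so no 'happi' lexeme exists and
# a search for "happy" cannot match the post.
puts conn.exec_params("SELECT to_tsvector('english', $1)", ["https://happy.sappy.com"]).getvalue(0, 0)
# => 'happy.sappy.com':1

# The fix also runs the individual host words through the stemmer, so the
# stemmed lexemes can be appended to the stored tsvector at the URL's position.
puts conn.exec_params("SELECT to_tsvector('english', $1)", ["happy sappy"]).getvalue(0, 0)
# => 'happi':1 'sappi':2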
parent 24f026c895
commit 651476e89e
@@ -8,6 +8,7 @@ class SearchIndexer
   USER_INDEX_VERSION = 3
   TAG_INDEX_VERSION = 3
   REINDEX_VERSION = 0
+  TS_VECTOR_PARSE_REGEX = /('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/
 
   def self.disable
     @disabled = true
@@ -45,6 +46,10 @@ class SearchIndexer
       tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0]
       additional_lexemes = []
 
+      # we also want to index parts of a domain name
+      # that way stemmed single word searches will match
+      additional_words = []
+
       tsvector
         .scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/)
         .reduce(additional_lexemes) do |array, (lexeme, _, positions)|
@@ -54,8 +59,11 @@ class SearchIndexer
             loop do
               count += 1
               break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
-              _term, _, remaining = lexeme.partition(".")
+              term, _, remaining = lexeme.partition(".")
               break if remaining.blank?
+
+              additional_words << [term, positions]
+
               array << "'#{remaining}':#{positions}"
               lexeme = remaining
             end
@@ -64,12 +72,30 @@ class SearchIndexer
           array
         end
 
-      tsvector = "#{tsvector} #{additional_lexemes.join(" ")}"
+      extra_domain_word_terms =
+        if additional_words.length > 0
+          DB
+            .query_single(
+              "SELECT to_tsvector(?, ?)",
+              stemmer,
+              additional_words.map { |term, _| term }.join(" "),
+            )
+            .first
+            .scan(TS_VECTOR_PARSE_REGEX)
+            .map do |term, _, indexes|
+              new_indexes =
+                indexes.split(",").map { |index| additional_words[index.to_i - 1][1] }.join(",")
+              "#{term}#{new_indexes}"
+            end
+            .join(" ")
+        end
+
+      tsvector = "#{tsvector} #{additional_lexemes.join(" ")} #{extra_domain_word_terms}"
 
       if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
         reduced = []
         tsvector
-          .scan(/('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/)
+          .scan(TS_VECTOR_PARSE_REGEX)
           .each do |term, _, indexes|
             family_counts = Hash.new(0)
             new_index_array = []
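
For readers following the new extra_domain_word_terms block above, here is a self-contained sketch of the position remapping it performs (hypothetical values, no database call): positions returned by to_tsvector for the throwaway string "meta discourse" are translated back to the position the URL lexeme occupies in the post's real tsvector.

TS_VECTOR_PARSE_REGEX = /('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/

# [word stripped from the host, position of the URL lexeme in the real tsvector]
additional_words = [["meta", "11"], ["discourse", "11"]]

# What `SELECT to_tsvector('english', 'meta discourse')` would return:
stemmed = "'discours':2 'meta':1"

extra_terms =
  stemmed
    .scan(TS_VECTOR_PARSE_REGEX)
    .map do |term, _, indexes|
      new_indexes = indexes.split(",").map { |i| additional_words[i.to_i - 1][1] }.join(",")
      "#{term}#{new_indexes}"
    end
    .join(" ")

puts extra_terms # => 'discours':11 'meta':11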
@@ -745,7 +745,7 @@ RSpec.describe Search do
     end
 
     it "returns nothing if user is not a group member" do
-      pm = create_pm(users: [current, participant], group: group)
+      _pm = create_pm(users: [current, participant], group: group)
 
       results =
         Search.execute("group_messages:#{group.name}", guardian: Guardian.new(non_participant))
@@ -757,7 +757,7 @@ RSpec.describe Search do
     end
 
     it "returns nothing if group has messages disabled" do
-      pm = create_pm(users: [current, participant], group: group)
+      _pm = create_pm(users: [current, participant], group: group)
       group.update!(has_messages: false)
 
       results = Search.execute("group_messages:#{group.name}", guardian: Guardian.new(current))
@@ -967,6 +967,18 @@ RSpec.describe Search do
       expect(result.posts.pluck(:id)).to eq([post2.id, post.id])
     end
 
+    it "can find posts by searching for a url prefix" do
+      post = Fabricate(:post, raw: "checkout the amazing domain https://happy.sappy.com")
+
+      results = Search.execute("happy")
+      expect(results.posts.count).to eq(1)
+      expect(results.posts.first.id).to eq(post.id)
+
+      results = Search.execute("sappy")
+      expect(results.posts.count).to eq(1)
+      expect(results.posts.first.id).to eq(post.id)
+    end
+
     it "aggregates searches in a topic by returning the post with the lowest post number" do
       post = Fabricate(:post, topic: topic, raw: "this is a play post")
       _post2 = Fabricate(:post, topic: topic, raw: "play play playing played play")
@@ -1660,7 +1672,7 @@ RSpec.describe Search do
 
     it "can filter by posts in the user's bookmarks" do
       expect(search_with_bookmarks.posts.map(&:id)).to eq([])
-      bm = Fabricate(:bookmark, user: user, bookmarkable: bookmark_post1)
+      Fabricate(:bookmark, user: user, bookmarkable: bookmark_post1)
       expect(search_with_bookmarks.posts.map(&:id)).to match_array([bookmark_post1.id])
     end
   end
@@ -2490,7 +2502,7 @@ RSpec.describe Search do
       expect(results.posts.length).to eq(Search.per_facet)
       expect(results.more_posts).to eq(nil) # not 6 posts yet
 
-      post6 = Fabricate(:post, raw: "hello post #6")
+      _post6 = Fabricate(:post, raw: "hello post #6")
 
       results = Search.execute("hello", search_type: :header)
       expect(results.posts.length).to eq(Search.per_facet)
@@ -169,8 +169,8 @@ RSpec.describe SearchIndexer do
       expect(post.post_search_data.raw_data).to eq("https://meta.discourse.org/some.png")
 
-      expect(post.post_search_data.search_data).to eq(
-        "'/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B",
+      expect(post.post_search_data.search_data).to eq_ts_vector(
+        "'/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B 'meta':11 'discours':11",
       )
     end
@@ -194,8 +194,8 @@ RSpec.describe SearchIndexer do
       topic = Fabricate(:topic, category: category, title: "this is a test topic")
 
       post = Fabricate(:post, topic: topic, raw: <<~RAW)
-        a https://abc.com?bob=1, http://efg.com.au?bill=1 b hij.net/xyz=1
-        www.klm.net/?IGNORE=1 <a href="http://abc.de.nop.co.uk?IGNORE=1&ignore2=2">test</a>
+        a https://car.com?bob=1, http://efg.com.au?bill=1 b hij.net/xyz=1
+        www.klm.net/?IGNORE=1 <a href="http://abc.de.nop.co.uk?IGNORE=1&ignore2=2">test</a> https://cars.com
       RAW
 
       post.rebake!
@@ -208,11 +208,11 @@ RSpec.describe SearchIndexer do
       # more context to say "hey, this part of <a href>...</a> was a guess by autolinker.
       # A blanket treating of non-urls without this logic is risky.
       expect(post.post_search_data.raw_data).to eq(
-        "a https://abc.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test",
+        "a https://car.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test https://cars.com",
       )
 
-      expect(post.post_search_data.search_data).to eq(
-        "'/?ignore=1':21 '/xyz=1':14,17 'abc.com':9 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19",
+      expect(post.post_search_data.search_data).to eq_ts_vector(
+        "'/?ignore=1':21 '/xyz=1':14,17 'car.com':9 'cars.com':24 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9,10,24 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19 'car':9,24 'co':22 'de':22 'efg':10 'hij':13,16 'klm':18,20 'nop':22 'www':18,20 'abc':22",
       )
     end
@@ -260,8 +260,8 @@ RSpec.describe SearchIndexer do
         "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]",
       )
 
-      expect(post.post_search_data.search_data).to eq(
-        "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27",
+      expect(post.post_search_data.search_data).to eq_ts_vector(
+        "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27 'googl':15 'somesit':22,30",
       )
     end
@@ -310,6 +310,7 @@ RSpec.describe SearchIndexer do
       contents = <<~TEXT
         #{"sam " * 10}
         <a href="https://something.com/path:path'path?term='hello'">url</a>
+        <a href="https://somethings.com/path:path'path?term='hello'">url</a>
       TEXT
 
       post.update!(raw: contents)
@@ -318,11 +319,9 @@ RSpec.describe SearchIndexer do
       post_search_data.reload
 
       terms =
-        "'/path:path''path':22 'com':21 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23".split(
-          " ",
-        ).sort
+        "'/path:path''path':22,26 'com':21,25 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23,27 'someth':21,25 'somethings.com':25 'somethings.com/path:path''path':24"
 
-      expect(post_search_data.search_data.split(" ").sort).to contain_exactly(*terms)
+      expect(post_search_data.search_data).to eq_ts_vector(terms)
     end
   end
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+RSpec::Matchers.define :eq_ts_vector do |expected_vector|
+  match do |actual_vector|
+    actual = actual_vector.split(" ").sort
+    expected = expected_vector.split(" ").sort
+
+    (expected - actual == []) && (actual - expected == [])
+  end
+  failure_message do |actual_vector|
+    actual = actual_vector.split(" ").sort
+    expected = expected_vector.split(" ").sort
+
+    message = +"ts_vector does not match!\n\n"
+    message << "Additional elements:\n"
+    message << (expected - actual).join("\n")
+    message << "\nMissing elements:\n"
+    message << (actual - expected).join("\n")
+    message
+  end
+end
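
A quick illustrative use of the new eq_ts_vector matcher (hypothetical values): it compares tsvectors as unordered sets of 'lexeme':positions terms, so spec expectations no longer depend on the order in which lexemes are emitted.

# Terms appear in a different order but the sets are equal, so this passes.
expect("'meta':11 'discours':11").to eq_ts_vector("'discours':11 'meta':11")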