diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index 889c6c1ac0e..96d750a281a 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -8,6 +8,7 @@ class SearchIndexer USER_INDEX_VERSION = 3 TAG_INDEX_VERSION = 3 REINDEX_VERSION = 0 + TS_VECTOR_PARSE_REGEX = /('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/ def self.disable @disabled = true @@ -45,6 +46,10 @@ class SearchIndexer tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0] additional_lexemes = [] + # we also want to index parts of a domain name + # that way stemmed single word searches will match + additional_words = [] + tsvector .scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/) .reduce(additional_lexemes) do |array, (lexeme, _, positions)| @@ -54,8 +59,11 @@ class SearchIndexer loop do count += 1 break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots - _term, _, remaining = lexeme.partition(".") + term, _, remaining = lexeme.partition(".") break if remaining.blank? + + additional_words << [term, positions] + array << "'#{remaining}':#{positions}" lexeme = remaining end @@ -64,12 +72,30 @@ class SearchIndexer array end - tsvector = "#{tsvector} #{additional_lexemes.join(" ")}" + extra_domain_word_terms = + if additional_words.length > 0 + DB + .query_single( + "SELECT to_tsvector(?, ?)", + stemmer, + additional_words.map { |term, _| term }.join(" "), + ) + .first + .scan(TS_VECTOR_PARSE_REGEX) + .map do |term, _, indexes| + new_indexes = + indexes.split(",").map { |index| additional_words[index.to_i - 1][1] }.join(",") + "#{term}#{new_indexes}" + end + .join(" ") + end + + tsvector = "#{tsvector} #{additional_lexemes.join(" ")} #{extra_domain_word_terms}" if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0 reduced = [] tsvector - .scan(/('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/) + .scan(TS_VECTOR_PARSE_REGEX) .each do |term, _, indexes| family_counts = Hash.new(0) new_index_array = [] diff --git a/spec/lib/search_spec.rb b/spec/lib/search_spec.rb index 53e2a6edbf8..515684c7670 100644 --- a/spec/lib/search_spec.rb +++ b/spec/lib/search_spec.rb @@ -745,7 +745,7 @@ RSpec.describe Search do end it "returns nothing if user is not a group member" do - pm = create_pm(users: [current, participant], group: group) + _pm = create_pm(users: [current, participant], group: group) results = Search.execute("group_messages:#{group.name}", guardian: Guardian.new(non_participant)) @@ -757,7 +757,7 @@ RSpec.describe Search do end it "returns nothing if group has messages disabled" do - pm = create_pm(users: [current, participant], group: group) + _pm = create_pm(users: [current, participant], group: group) group.update!(has_messages: false) results = Search.execute("group_messages:#{group.name}", guardian: Guardian.new(current)) @@ -967,6 +967,18 @@ RSpec.describe Search do expect(result.posts.pluck(:id)).to eq([post2.id, post.id]) end + it "can find posts by searching for a url prefix" do + post = Fabricate(:post, raw: "checkout the amazing domain https://happy.sappy.com") + + results = Search.execute("happy") + expect(results.posts.count).to eq(1) + expect(results.posts.first.id).to eq(post.id) + + results = Search.execute("sappy") + expect(results.posts.count).to eq(1) + expect(results.posts.first.id).to eq(post.id) + end + it "aggregates searches in a topic by returning the post with the lowest post number" do post = Fabricate(:post, topic: topic, raw: "this is a play post") _post2 = Fabricate(:post, topic: topic, raw: "play play playing played play") @@ -1660,7 +1672,7 @@ RSpec.describe Search do it "can filter by posts in the user's bookmarks" do expect(search_with_bookmarks.posts.map(&:id)).to eq([]) - bm = Fabricate(:bookmark, user: user, bookmarkable: bookmark_post1) + Fabricate(:bookmark, user: user, bookmarkable: bookmark_post1) expect(search_with_bookmarks.posts.map(&:id)).to match_array([bookmark_post1.id]) end end @@ -2490,7 +2502,7 @@ RSpec.describe Search do expect(results.posts.length).to eq(Search.per_facet) expect(results.more_posts).to eq(nil) # not 6 posts yet - post6 = Fabricate(:post, raw: "hello post #6") + _post6 = Fabricate(:post, raw: "hello post #6") results = Search.execute("hello", search_type: :header) expect(results.posts.length).to eq(Search.per_facet) diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index fa43789e9fb..9e1b9a88701 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -169,8 +169,8 @@ RSpec.describe SearchIndexer do expect(post.post_search_data.raw_data).to eq("https://meta.discourse.org/some.png") - expect(post.post_search_data.search_data).to eq( - "'/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B", + expect(post.post_search_data.search_data).to eq_ts_vector( + "'/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B 'meta':11 'discours':11", ) end @@ -194,8 +194,8 @@ RSpec.describe SearchIndexer do topic = Fabricate(:topic, category: category, title: "this is a test topic") post = Fabricate(:post, topic: topic, raw: <<~RAW) - a https://abc.com?bob=1, http://efg.com.au?bill=1 b hij.net/xyz=1 - www.klm.net/?IGNORE=1 test + a https://car.com?bob=1, http://efg.com.au?bill=1 b hij.net/xyz=1 + www.klm.net/?IGNORE=1 test https://cars.com RAW post.rebake! @@ -208,11 +208,11 @@ RSpec.describe SearchIndexer do # more context to say "hey, this part of ... was a guess by autolinker. # A blanket treating of non-urls without this logic is risky. expect(post.post_search_data.raw_data).to eq( - "a https://abc.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test", + "a https://car.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test https://cars.com", ) - expect(post.post_search_data.search_data).to eq( - "'/?ignore=1':21 '/xyz=1':14,17 'abc.com':9 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19", + expect(post.post_search_data.search_data).to eq_ts_vector( + "'/?ignore=1':21 '/xyz=1':14,17 'car.com':9 'cars.com':24 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9,10,24 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19 'car':9,24 'co':22 'de':22 'efg':10 'hij':13,16 'klm':18,20 'nop':22 'www':18,20 'abc':22", ) end @@ -260,8 +260,8 @@ RSpec.describe SearchIndexer do "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]", ) - expect(post.post_search_data.search_data).to eq( - "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27", + expect(post.post_search_data.search_data).to eq_ts_vector( + "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27 'googl':15 'somesit':22,30", ) end @@ -310,6 +310,7 @@ RSpec.describe SearchIndexer do contents = <<~TEXT #{"sam " * 10} url + url TEXT post.update!(raw: contents) @@ -318,11 +319,9 @@ RSpec.describe SearchIndexer do post_search_data.reload terms = - "'/path:path''path':22 'com':21 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23".split( - " ", - ).sort + "'/path:path''path':22,26 'com':21,25 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23,27 'someth':21,25 'somethings.com':25 'somethings.com/path:path''path':24" - expect(post_search_data.search_data.split(" ").sort).to contain_exactly(*terms) + expect(post_search_data.search_data).to eq_ts_vector(terms) end end diff --git a/spec/support/ts_vector_matcher.rb b/spec/support/ts_vector_matcher.rb new file mode 100644 index 00000000000..49d5d0da343 --- /dev/null +++ b/spec/support/ts_vector_matcher.rb @@ -0,0 +1,21 @@ +# frozen_string_literal: true + +RSpec::Matchers.define :eq_ts_vector do |expected_vector| + match do |actual_vector| + actual = actual_vector.split(" ").sort + expected = expected_vector.split(" ").sort + + (expected - actual == []) && (actual - expected == []) + end + failure_message do |actual_vector| + actual = actual_vector.split(" ").sort + expected = expected_vector.split(" ").sort + + message = +"ts_vector does not match!\n\n" + message << "Additional elements:\n" + message << (expected - actual).join("\n") + message << "\nMissing elements:\n" + message << (actual - expected).join("\n") + message + end +end