From 4a3c13a37baa3c9ae13c1cb8c603a65859cd43b4 Mon Sep 17 00:00:00 2001 From: Sam Date: Mon, 20 Mar 2023 15:43:08 +1100 Subject: [PATCH] FIX: search index failing on certain posts (#20736) During search indexing we "stuff" the index with additional keywords for entities that look like domain names. This allows searches for `cnn` to find URLs for `www.cnn.com`. The search stuffing attempted to keep indexes aligned at the correct positions by remapping the indexed terms. However, under certain edge cases a single word can stem into 2 different lexemes. If this happened, we had an off-by-one error, which caused the entire indexing to fail. We now work around this edge case (and carry incorrect index positions in such cases). It is unlikely to impact search quality at all, given that index position makes almost no difference in the search algorithm. --- app/services/search_indexer.rb | 12 +++++++++++- spec/services/search_indexer_spec.rb | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index 40104c5b13c..530ac26c6d0 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -87,7 +87,17 @@ class SearchIndexer .scan(TS_VECTOR_PARSE_REGEX) .map do |term, _, indexes| new_indexes = - indexes.split(",").map { |index| additional_words[index.to_i - 1][1] }.join(",") + indexes + .split(",") + .map do |index| + existing_positions = additional_words[index.to_i - 1] + if existing_positions + existing_positions[1] + else + index + end + end + .join(",") "#{term}#{new_indexes}" end .join(" ") diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index 13890c884c0..b8c1cc8b9eb 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -139,6 +139,24 @@ RSpec.describe SearchIndexer do } end + it "should work with edge case domain names" do + # 00E5A4 stems to 00e5 and a4, which is odd, but by-design + # 
this may cause internal indexing to fail due to indexes not aligning + # when stuffing terms for domains + post.update!(cooked: <<~HTML) + Test.00E5A4.1 + HTML + + SearchIndexer.update_posts_index( + post_id: post.id, + topic_title: post.topic.title, + category_name: post.topic.category&.name, + topic_tags: post.topic.tags.map(&:name).join(" "), + cooked: post.cooked, + private_message: post.topic.private_message?, + ) + end + it "should work with invalid HTML" do post.update!(cooked: "" * Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)