From 37bf160d26b4ee395f065dd86301be6fdc5f8474 Mon Sep 17 00:00:00 2001 From: Rafael dos Santos Silva Date: Wed, 19 Feb 2025 16:30:01 -0300 Subject: [PATCH] FIX: Add workaround to pgvector HNSW search limitations (#1133) From [pgvector/pgvector](https://github.com/pgvector/pgvector) README > With approximate indexes, filtering is applied after the index is scanned. If a condition matches 10% of rows, with HNSW and the default hnsw.ef_search of 40, only 4 rows will match on average. For more rows, increase hnsw.ef_search. > > Starting with 0.8.0, you can enable [iterative index scans](https://github.com/pgvector/pgvector#iterative-index-scans), which will automatically scan more of the index when needed. Since we are stuck on 0.7.0, we are going with the first option for now. --- lib/ai_helper/semantic_categorizer.rb | 19 +++++---- lib/embeddings/schema.rb | 40 ++++++++++++++----- .../ai_helper/ai_composer_helper_spec.rb | 11 ++++- .../ai_split_topic_suggestion_spec.rb | 11 ++++- 4 files changed, 60 insertions(+), 21 deletions(-) diff --git a/lib/ai_helper/semantic_categorizer.rb b/lib/ai_helper/semantic_categorizer.rb index a0be3dcb..b05c3ece 100644 --- a/lib/ai_helper/semantic_categorizer.rb +++ b/lib/ai_helper/semantic_categorizer.rb @@ -5,13 +5,15 @@ module DiscourseAi def initialize(input, user) @user = user @text = input[:text] + @vector = DiscourseAi::Embeddings::Vector.instance + @schema = DiscourseAi::Embeddings::Schema.for(Topic) end def categories return [] if @text.blank? return [] if !DiscourseAi::Embeddings.enabled? - candidates = nearest_neighbors(limit: 100) + candidates = nearest_neighbors return [] if candidates.empty? 
candidate_ids = candidates.map(&:first) @@ -40,6 +42,9 @@ module DiscourseAi } end .map do |c| + # Note: <#> returns the negative inner product since Postgres only supports ASC order index scans on operators + c[:score] = (c[:score] + 1).abs if @vector.vdef.pg_function == "<#>" + c[:score] = 1 / (c[:score] + 1) # inverse of the distance c end @@ -72,6 +77,9 @@ module DiscourseAi .with_index { |tag_list, index| { tags: tag_list, score: candidates[index].last } } .flat_map { |c| c[:tags].map { |t| { name: t, score: c[:score] } } } .map do |c| + # Note: <#> returns the negative inner product since Postgres only supports ASC order index scans on operators + c[:score] = (c[:score] + 1).abs if @vector.vdef.pg_function == "<#>" + c[:score] = 1 / (c[:score] + 1) # inverse of the distance c end @@ -91,11 +99,8 @@ module DiscourseAi private - def nearest_neighbors(limit: 100) - vector = DiscourseAi::Embeddings::Vector.instance - schema = DiscourseAi::Embeddings::Schema.for(Topic) - - raw_vector = vector.vector_from(@text) + def nearest_neighbors(limit: 50) + raw_vector = @vector.vector_from(@text) muted_category_ids = nil if @user.present? @@ -106,7 +111,7 @@ module DiscourseAi ).pluck(:category_id) end - schema + @schema .asymmetric_similarity_search(raw_vector, limit: limit, offset: 0) do |builder| builder.join("topics t on t.id = topic_id") unless muted_category_ids.empty? 
diff --git a/lib/embeddings/schema.rb b/lib/embeddings/schema.rb index 013c460e..4d540fa7 100644 --- a/lib/embeddings/schema.rb +++ b/lib/embeddings/schema.rb @@ -15,6 +15,8 @@ module DiscourseAi EMBEDDING_TARGETS = %w[topics posts document_fragments] EMBEDDING_TABLES = [TOPICS_TABLE, POSTS_TABLE, RAG_DOCS_TABLE] + DEFAULT_HNSW_EF_SEARCH = 40 + MissingEmbeddingError = Class.new(StandardError) class << self @@ -132,6 +134,8 @@ module DiscourseAi end def asymmetric_similarity_search(embedding, limit:, offset:) + before_query = hnsw_search_workaround(limit) + builder = DB.build(<<~SQL) WITH candidates AS ( SELECT @@ -153,7 +157,7 @@ module DiscourseAi ORDER BY embeddings::halfvec(#{dimensions}) #{pg_function} '[:query_embedding]'::halfvec(#{dimensions}) LIMIT :limit - OFFSET :offset + OFFSET :offset; SQL builder.where( @@ -171,18 +175,24 @@ module DiscourseAi candidates_limit = limit * 2 end - builder.query( - query_embedding: embedding, - candidates_limit: candidates_limit, - limit: limit, - offset: offset, - ) + ActiveRecord::Base.transaction do + DB.exec(before_query) if before_query.present? + builder.query( + query_embedding: embedding, + candidates_limit: candidates_limit, + limit: limit, + offset: offset, + ) + end rescue PG::Error => e Rails.logger.error("Error #{e} querying embeddings for model #{vector_def.display_name}") raise MissingEmbeddingError end def symmetric_similarity_search(record) + limit = 200 + before_query = hnsw_search_workaround(limit) + builder = DB.build(<<~SQL) WITH le_target AS ( SELECT @@ -210,7 +220,7 @@ module DiscourseAi le_target LIMIT 1 ) - LIMIT 200 + LIMIT #{limit} ) AS widenet ORDER BY embeddings::halfvec(#{dimensions}) #{pg_function} ( @@ -220,14 +230,17 @@ module DiscourseAi le_target LIMIT 1 ) - LIMIT 100; + LIMIT #{limit / 2}; SQL builder.where("model_id = :vid AND strategy_id = :vsid") yield(builder) if block_given? 
- builder.query(vid: vector_def.id, vsid: vector_def.strategy_id, target_id: record.id) + ActiveRecord::Base.transaction do + DB.exec(before_query) if before_query.present? + builder.query(vid: vector_def.id, vsid: vector_def.strategy_id, target_id: record.id) + end rescue PG::Error => e Rails.logger.error("Error #{e} querying embeddings for model #{vector_def.display_name}") raise MissingEmbeddingError @@ -259,6 +272,13 @@ module DiscourseAi private + def hnsw_search_workaround(limit) + threshold = limit * 2 + + return "" if threshold < DEFAULT_HNSW_EF_SEARCH + "SET LOCAL hnsw.ef_search = #{threshold};" + end + delegate :dimensions, :pg_function, to: :vector_def end end diff --git a/spec/system/ai_helper/ai_composer_helper_spec.rb b/spec/system/ai_helper/ai_composer_helper_spec.rb index 324d8c4b..c8ac92c5 100644 --- a/spec/system/ai_helper/ai_composer_helper_spec.rb +++ b/spec/system/ai_helper/ai_composer_helper_spec.rb @@ -3,6 +3,7 @@ RSpec.describe "AI Composer helper", type: :system, js: true do fab!(:user) { Fabricate(:admin, refresh_auto_groups: true) } fab!(:non_member_group) { Fabricate(:group) } + fab!(:embedding_definition) before do Group.find_by(id: Group::AUTO_GROUPS[:admins]).add(user) @@ -243,7 +244,10 @@ RSpec.describe "AI Composer helper", type: :system, js: true do end context "when suggesting the category with AI category suggester" do - before { SiteSetting.ai_embeddings_enabled = true } + before do + SiteSetting.ai_embeddings_selected_model = embedding_definition.id + SiteSetting.ai_embeddings_enabled = true + end it "updates the category with the suggested category" do response = @@ -274,7 +278,10 @@ RSpec.describe "AI Composer helper", type: :system, js: true do end context "when suggesting the tags with AI tag suggester" do - before { SiteSetting.ai_embeddings_enabled = true } + before do + SiteSetting.ai_embeddings_selected_model = embedding_definition.id + SiteSetting.ai_embeddings_enabled = true + end it "updates the tag with the 
suggested tag" do response = diff --git a/spec/system/ai_helper/ai_split_topic_suggestion_spec.rb b/spec/system/ai_helper/ai_split_topic_suggestion_spec.rb index 150bf0df..0ae6459d 100644 --- a/spec/system/ai_helper/ai_split_topic_suggestion_spec.rb +++ b/spec/system/ai_helper/ai_split_topic_suggestion_spec.rb @@ -35,6 +35,7 @@ RSpec.describe "AI Post helper", type: :system, js: true do fab!(:cloud) { Fabricate(:tag) } fab!(:feedback) { Fabricate(:tag) } fab!(:review) { Fabricate(:tag) } + fab!(:embedding_definition) before do Group.find_by(id: Group::AUTO_GROUPS[:admins]).add(user) @@ -80,7 +81,10 @@ RSpec.describe "AI Post helper", type: :system, js: true do end context "when suggesting categories with AI category suggester" do - before { SiteSetting.ai_embeddings_enabled = true } + before do + SiteSetting.ai_embeddings_selected_model = embedding_definition.id + SiteSetting.ai_embeddings_enabled = true + end skip "TODO: Category suggester only loading one category in test" do it "updates the category with the suggested category" do @@ -108,7 +112,10 @@ RSpec.describe "AI Post helper", type: :system, js: true do end context "when suggesting tags with AI tag suggester" do - before { SiteSetting.ai_embeddings_enabled = true } + before do + SiteSetting.ai_embeddings_selected_model = embedding_definition.id + SiteSetting.ai_embeddings_enabled = true + end it "update the tag with the suggested tag" do response =