FIX: OpenAI embeddings config migration & seeded indexes cleanup (#1092)

This change fixes two different problems.

First, we add a data migration to repair the configuration of sites using OpenAI's embedding models. There was a window between the embedding config changes and #1087 where sites could end up in a broken state due to an unconfigured selected-model setting, as reported on https://meta.discourse.org/t/-/348964

The second fix drops the pre-seeded search indexes of the models we didn't migrate and rebuilds the ones whose dimensions don't match. Since the index name uses the model ID, new embedding configs could reuse one of these indexes even when the dimensions no longer match.
This commit is contained in:
Roman Rizzi 2025-01-27 15:24:43 -03:00 committed by GitHub
parent fe44d78156
commit a53719ab8e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 377 additions and 0 deletions

View File

@ -0,0 +1,107 @@
# frozen_string_literal: true
# Data migration: repairs sites left without an `ai_embeddings_selected_model`
# setting after the embeddings configuration rework. If the legacy site settings
# still describe one of OpenAI's embedding models, recreate the matching
# `embedding_definitions` row and point the selector setting at it.
class FixBrokenOpenAiEmbeddingsConfig < ActiveRecord::Migration[7.2]
def up
# Already configured under the new scheme — nothing to repair.
return if fetch_setting("ai_embeddings_selected_model").present?
# Any existing definition means the site is not in the broken state this targets.
return if DB.query_single("SELECT COUNT(*) FROM embedding_definitions").first > 0
# Only OpenAI models are migrated here; other providers are left untouched.
open_ai_models = %w[text-embedding-3-large text-embedding-3-small text-embedding-ada-002]
current_model = fetch_setting("ai_embeddings_model")
return if !open_ai_models.include?(current_model)
endpoint = fetch_setting("ai_openai_embeddings_url") || "https://api.openai.com/v1/embeddings"
api_key = fetch_setting("ai_openai_api_key")
# Without an API key the old config was incomplete; skip rather than persist a broken row.
return if api_key.blank?
attrs = {
display_name: current_model,
url: endpoint,
api_key: api_key,
provider: "open_ai",
}.merge(model_attrs(current_model))
persist_config(attrs)
end
# Reads a site setting's raw value straight from the DB (the SiteSetting cache
# isn't safe to use inside migrations), falling back to the corresponding
# DISCOURSE_* environment-variable override.
def fetch_setting(name)
DB.query_single(
"SELECT value FROM site_settings WHERE name = :setting_name",
setting_name: name,
).first || ENV["DISCOURSE_#{name&.upcase}"]
end
# Per-model column values for the new embedding_definitions row. IDs are
# hardcoded to the IDs previously seeded for these models so that embeddings
# already generated under those IDs keep working.
def model_attrs(model_name)
if model_name == "text-embedding-3-large"
{
dimensions: 2000,
max_sequence_length: 8191,
id: 7,
pg_function: "<=>",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
matryoshka_dimensions: true,
provider_params: {
model_name: "text-embedding-3-large",
},
}
elsif model_name == "text-embedding-3-small"
{
dimensions: 1536,
max_sequence_length: 8191,
id: 6,
pg_function: "<=>",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
provider_params: {
model_name: "text-embedding-3-small",
},
}
else
{
dimensions: 1536,
max_sequence_length: 8191,
id: 2,
pg_function: "<=>",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
provider_params: {
model_name: "text-embedding-ada-002",
},
}
end
end
# Inserts the embedding definition and selects it via the site setting.
# NOTE(review): this relies on the DB wrapper (mini_sql-style) inlining :params
# textually — that is how the quoted ':new_value' below and the :new_seq in
# ALTER SEQUENCE get substituted. Confirm against the DB wrapper in use.
def persist_config(attrs)
DB.exec(
<<~SQL,
INSERT INTO embedding_definitions (id, display_name, dimensions, max_sequence_length, version, pg_function, provider, tokenizer_class, url, api_key, provider_params, matryoshka_dimensions, created_at, updated_at)
VALUES (:id, :display_name, :dimensions, :max_sequence_length, 1, :pg_function, :provider, :tokenizer_class, :url, :api_key, :provider_params, :matryoshka_dimensions, :now, :now)
SQL
id: attrs[:id],
display_name: attrs[:display_name],
dimensions: attrs[:dimensions],
max_sequence_length: attrs[:max_sequence_length],
pg_function: attrs[:pg_function],
provider: attrs[:provider],
tokenizer_class: attrs[:tokenizer_class],
url: attrs[:url],
api_key: attrs[:api_key],
provider_params: attrs[:provider_params]&.to_json,
matryoshka_dimensions: !!attrs[:matryoshka_dimensions],
now: Time.zone.now,
)
# We hardcoded the ID to match with already generated embeddings. Let's restart the seq to avoid conflicts.
DB.exec(
"ALTER SEQUENCE embedding_definitions_id_seq RESTART WITH :new_seq",
new_seq: attrs[:id].to_i + 1,
)
DB.exec(<<~SQL, new_value: attrs[:id])
INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
VALUES ('ai_embeddings_selected_model', 3, ':new_value', NOW(), NOW())
SQL
end
# Data migration over raw SQL state; there is nothing sensible to restore.
def down
raise ActiveRecord::IrreversibleMigration
end
end

View File

@ -0,0 +1,61 @@
# frozen_string_literal: true
# Cleans up the pre-seeded binary-quantized search indexes (model IDs 1..8):
# drops indexes for seeded models that have no embedding definition (or whose
# dimensions no longer match), then rebuilds the mismatched ones against the
# definition's current dimensions.
class CleanUnusedEmbeddingSearchIndexes < ActiveRecord::Migration[7.2]
  def up
    seeded_definitions =
      DB.query("SELECT id, dimensions FROM embedding_definitions WHERE id <= 8")

    # Seeded IDs whose index must go: no definition at all, or dimension mismatch.
    stale_ids =
      (1..8).select do |model_id|
        definition = seeded_definitions.find { |d| d&.id == model_id }
        definition.blank? || !correctly_indexed?(definition)
      end

    drop_statements =
      stale_ids
        .flat_map do |model_id|
          embedding_tables.map do |type|
            "DROP INDEX IF EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit;"
          end
        end
        .join("\n")
    DB.exec(drop_statements) if drop_statements.present?

    # Definitions that exist but were indexed with the wrong dimensions get a
    # fresh index built with their current dimensions.
    amend_statements =
      (1..8)
        .filter_map do |model_id|
          definition = seeded_definitions.find { |d| d&.id == model_id }
          amended_idxs(definition) if definition.present? && !correctly_indexed?(definition)
        end
        .join("\n")
    DB.exec(amend_statements) if amend_statements.present?
  end

  # Tables holding per-record embeddings; each gets its own search index.
  def embedding_tables
    %w[topics posts document_fragments]
  end

  # CREATE INDEX statements (one per embeddings table) for the given definition,
  # sized to its current dimensions.
  def amended_idxs(model)
    embedding_tables
      .map do |table|
        <<~SQL
          CREATE INDEX IF NOT EXISTS ai_#{table}_embeddings_#{model.id}_1_search_bit ON ai_#{table}_embeddings
          USING hnsw ((binary_quantize(embeddings)::bit(#{model.dimensions})) bit_hamming_ops)
          WHERE model_id = #{model.id} AND strategy_id = 1;
        SQL
      end
      .join("\n")
  end

  # True when the dimensions the original seed migration indexed for this ID
  # still match the definition's dimensions.
  def correctly_indexed?(edef)
    seeded_dimensions[edef.id] == edef.dimensions
  end

  # Dimensions used by 20241008054440_create_binary_indexes_for_embeddings.
  def seeded_dimensions
    { 1 => 768, 2 => 1536, 3 => 1024, 4 => 1024, 5 => 768, 6 => 1536, 7 => 2000, 8 => 1024 }
  end

  # Dropped indexes cannot be reconstructed reliably; irreversible by design.
  def down
    raise ActiveRecord::IrreversibleMigration
  end
end

View File

@ -0,0 +1,73 @@
# frozen_string_literal: true
require "rails_helper"
require Rails.root.join(
"plugins/discourse-ai/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config",
)
# Specs for the FixBrokenOpenAiEmbeddingsConfig data migration: it should create
# and select an embedding definition only when the legacy OpenAI settings are
# complete and nothing is configured under the new scheme.
RSpec.describe FixBrokenOpenAiEmbeddingsConfig do
let(:connection) { ActiveRecord::Base.connection }
# Writes a site setting row directly via SQL — the migration reads settings the
# same way, bypassing the SiteSetting cache.
def store_setting(name, val)
connection.execute <<~SQL
INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
VALUES ('#{name}', 3, '#{val}', NOW(), NOW())
SQL
end
# Raw (string) value of the setting the migration is expected to write.
def configured_model_id
DB.query_single(
"SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
).first
end
describe "#up" do
context "when embeddings are already configured" do
fab!(:embedding_definition)
before { store_setting("ai_embeddings_selected_model", embedding_definition.id) }
# `subject` is an instance of the described migration class.
it "does nothing" do
subject.up
expect(configured_model_id).to eq(embedding_definition.id.to_s)
end
end
context "when there is no previous config" do
it "does nothing" do
subject.up
expect(configured_model_id).to be_blank
end
end
context "when things are not fully configured" do
# Model set but API key blank — the migration must bail out.
before do
store_setting("ai_embeddings_model", "text-embedding-3-large")
store_setting("ai_openai_api_key", "")
end
it "does nothing" do
subject.up
expect(configured_model_id).to be_blank
end
end
context "when we have a configuration that previously failed to copy" do
before do
store_setting("ai_embeddings_model", "text-embedding-3-large")
store_setting("ai_openai_api_key", "123")
end
it "copies the config" do
subject.up
embedding_def = EmbeddingDefinition.last
expect(configured_model_id).to eq(embedding_def.id.to_s)
end
end
end
end

View File

@ -0,0 +1,136 @@
# frozen_string_literal: true
require "rails_helper"
require Rails.root.join(
"plugins/discourse-ai/db/migrate/20250127145305_clean_unused_embedding_search_indexes",
)
# Specs for CleanUnusedEmbeddingSearchIndexes: the migration should drop the
# pre-seeded binary search indexes for seeded model IDs without a definition,
# rebuild those whose dimensions mismatch, and ignore indexes outside ID 1..8.
RSpec.describe CleanUnusedEmbeddingSearchIndexes do
let(:connection) { ActiveRecord::Base.connection }
describe "#up" do
# Recreates the full set of seeded indexes so each example starts from the
# pre-migration state.
before do
# Copied from 20241008054440_create_binary_indexes_for_embeddings
%w[topics posts document_fragments].each do |type|
# our supported embeddings models IDs and dimensions
[
[1, 768],
[2, 1536],
[3, 1024],
[4, 1024],
[5, 768],
[6, 1536],
[7, 2000],
[8, 1024],
].each { |model_id, dimensions| connection.execute <<-SQL }
CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit ON ai_#{type}_embeddings
USING hnsw ((binary_quantize(embeddings)::bit(#{dimensions})) bit_hamming_ops)
WHERE model_id = #{model_id} AND strategy_id = 1;
SQL
end
end
# Every index name the seed migration creates: 3 tables x model IDs 1..8.
let(:all_idx_names) do
%w[topics posts document_fragments].reduce([]) do |memo, type|
(1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }
memo
end
end
context "when there are no embedding definitions" do
it "removes all indexes" do
subject.up
remaininig_idxs =
DB.query_single(
"SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
names: all_idx_names,
)
expect(remaininig_idxs).to be_empty
end
end
context "when there is an embedding definition with the same dimensions" do
# ID 2 was seeded with 1536 dimensions, so its index already matches.
fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }
it "keeps the matching index and removes the rest" do
expected_model_idxs =
%w[topics posts document_fragments].reduce([]) do |memo, type|
memo << "ai_#{type}_embeddings_2_1_search_bit"
end
subject.up
remaininig_idxs =
DB.query_single(
"SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
names: all_idx_names,
)
expect(remaininig_idxs).to contain_exactly(*expected_model_idxs)
# This method checks dimensions are correct.
expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def)).to eq(true)
end
end
context "when there is an embedding definition with different dimenstions" do
# ID 3 was seeded with 1024 dimensions; 768 forces the index to be rebuilt.
fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }
fab!(:embedding_def_2) { Fabricate(:embedding_definition, id: 3, dimensions: 768) }
it "updates the index to use the right dimensions" do
expected_model_idxs =
%w[topics posts document_fragments].reduce([]) do |memo, type|
memo << "ai_#{type}_embeddings_2_1_search_bit"
memo << "ai_#{type}_embeddings_3_1_search_bit"
end
subject.up
remaininig_idxs =
DB.query_single(
"SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
names: all_idx_names,
)
expect(remaininig_idxs).to contain_exactly(*expected_model_idxs)
# This method checks dimensions are correct.
expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def_2)).to eq(true)
end
end
context "when there are indexes outside the pre-seeded range" do
# Creates an ID-9 index, outside the 1..8 range the migration touches.
before { %w[topics posts document_fragments].each { |type| connection.execute <<-SQL } }
CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_9_1_search_bit ON ai_#{type}_embeddings
USING hnsw ((binary_quantize(embeddings)::bit(556)) bit_hamming_ops)
WHERE model_id = 9 AND strategy_id = 1;
SQL
# NOTE(review): this nested `let` duplicates the outer `all_idx_names`
# definition verbatim — it shadows it without changing behavior.
let(:all_idx_names) do
%w[topics posts document_fragments].reduce([]) do |memo, type|
(1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }
memo
end
end
it "leaves them untouched" do
expected_model_idxs =
%w[topics posts document_fragments].reduce([]) do |memo, type|
memo << "ai_#{type}_embeddings_9_1_search_bit"
end
subject.up
other_idxs =
DB.query_single(
"SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
names: expected_model_idxs,
)
expect(other_idxs).to contain_exactly(*expected_model_idxs)
end
end
end
end