FIX: Fix embeddings to use the old OpenAI tokenizer (#1506)

This commit is contained in:
Roman Rizzi 2025-07-15 14:44:11 -03:00 committed by GitHub
parent 67664029e5
commit 06743d1939
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 18 additions and 4 deletions

View File

@ -84,7 +84,7 @@ class EmbeddingDefinition < ActiveRecord::Base
dimensions: 2000,
max_sequence_length: 8191,
pg_function: "<=>",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiCl100kTokenizer",
url: "https://api.openai.com/v1/embeddings",
provider: OPEN_AI,
matryoshka_dimensions: true,
@ -98,7 +98,7 @@ class EmbeddingDefinition < ActiveRecord::Base
dimensions: 1536,
max_sequence_length: 8191,
pg_function: "<=>",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiCl100kTokenizer",
url: "https://api.openai.com/v1/embeddings",
provider: OPEN_AI,
matryoshka_dimensions: true,
@ -112,7 +112,7 @@ class EmbeddingDefinition < ActiveRecord::Base
dimensions: 1536,
max_sequence_length: 8191,
pg_function: "<=>",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiCl100kTokenizer",
url: "https://api.openai.com/v1/embeddings",
provider: OPEN_AI,
provider_params: {

View File

@ -0,0 +1,14 @@
# frozen_string_literal: true
# Data migration that switches embedding definitions pointing at the OpenAI
# API over to the cl100k tokenizer class.
class UpdateOpenAiEmbeddingsTokenizer < ActiveRecord::Migration[7.2]
  # Rewrites tokenizer_class on every embedding_definitions row whose url
  # contains the OpenAI API host and whose tokenizer_class is not already the
  # cl100k one. Rows that already carry the target value are not touched.
  RETOKENIZE_SQL = <<~SQL
    UPDATE embedding_definitions
    SET tokenizer_class = 'DiscourseAi::Tokenizer::OpenAiCl100kTokenizer'
    WHERE url LIKE '%https://api.openai.com/%' AND tokenizer_class <> 'DiscourseAi::Tokenizer::OpenAiCl100kTokenizer'
  SQL

  # Applies the tokenizer switch as a single raw SQL statement.
  def up
    execute(RETOKENIZE_SQL)
  end

  # The UPDATE overwrites the previous tokenizer_class values without saving
  # them, so rolling back is refused.
  #
  # @raise [ActiveRecord::IrreversibleMigration] always
  def down
    raise ActiveRecord::IrreversibleMigration
  end
end

View File

@ -50,7 +50,7 @@ RSpec.describe "Managing Embeddings configurations", type: :system, js: true do
form.field("provider").select(EmbeddingDefinition::OPEN_AI)
form.field("url").fill_in("https://api.openai.com/v1/embeddings")
form.field("api_key").fill_in(api_key)
form.field("tokenizer_class").select("DiscourseAi::Tokenizer::OpenAiTokenizer")
form.field("tokenizer_class").select("DiscourseAi::Tokenizer::OpenAiCl100kTokenizer")
embed_prefix = "On creation:"
search_prefix = "On search:"