FEATURE: Support for locally inferred embeddings in 100 languages (#115)

* FEATURE: Support for locally inferred embeddings in 100 languages

* add table
This commit is contained in:
Rafael dos Santos Silva 2023-07-27 15:50:03 -03:00 committed by GitHub
parent b25daed60b
commit 3e7c99de89
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 1000290 additions and 14 deletions

View File

@ -46,13 +46,11 @@ en:
ai_embeddings_enabled: "Enable the embeddings module." ai_embeddings_enabled: "Enable the embeddings module."
ai_embeddings_discourse_service_api_endpoint: "URL where the API is running for the embeddings module" ai_embeddings_discourse_service_api_endpoint: "URL where the API is running for the embeddings module"
ai_embeddings_discourse_service_api_key: "API key for the embeddings API" ai_embeddings_discourse_service_api_key: "API key for the embeddings API"
ai_embeddings_models: "Discourse will generate embeddings for each of the models enabled here" ai_embeddings_model: "Use all-mpnet-base-v2 for local and fast inference in english, text-embedding-ada-002 to use OpenAI API (need API key) and multilingual-e5-large for local multilingual embeddings"
ai_embeddings_semantic_related_model: "Model to use for related topics."
ai_embeddings_generate_for_pms: "Generate embeddings for personal messages." ai_embeddings_generate_for_pms: "Generate embeddings for personal messages."
ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics." ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics."
ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section." ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section."
ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info." ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info."
ai_embeddings_semantic_search_model: "Model to use for semantic search."
ai_embeddings_semantic_search_enabled: "Enable full-page semantic search." ai_embeddings_semantic_search_enabled: "Enable full-page semantic search."
ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results" ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results"

View File

@ -154,6 +154,7 @@ plugins:
choices: choices:
- all-mpnet-base-v2 - all-mpnet-base-v2
- text-embedding-ada-002 - text-embedding-ada-002
- multilingual-e5-large
ai_embeddings_generate_for_pms: false ai_embeddings_generate_for_pms: false
ai_embeddings_semantic_related_topics_enabled: false ai_embeddings_semantic_related_topics_enabled: false
ai_embeddings_semantic_related_topics: 5 ai_embeddings_semantic_related_topics: 5

View File

@ -2,6 +2,17 @@
# Enables the pgvector extension required by the Discourse AI embeddings
# module. When the extension cannot be enabled because pgvector is not
# installed on the PostgreSQL server, prints actionable remediation steps
# for the operator before re-raising so the migration still fails loudly.
class EnablePgVectorExtension < ActiveRecord::Migration[7.0]
  def change
    begin
      enable_extension :vector
    rescue StandardError => e
      # `enable_extension` raises a StandardError descendant
      # (ActiveRecord::StatementInvalid) on failure, so rescuing
      # StandardError is sufficient — rescuing Exception would also trap
      # signals and SystemExit, which should never be intercepted here.
      if DB.query_single("SELECT 1 FROM pg_available_extensions WHERE name = 'vector';").empty?
        STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
        STDERR.puts " Discourse AI requires the pgvector extension on the PostgreSQL database."
        STDERR.puts " Run a `./launcher rebuild app` to fix it on a standard install."
        STDERR.puts " Alternatively, you can remove Discourse AI to rebuild."
        STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
      end
      # Re-raise so the failed migration halts the deploy as before.
      raise e
    end
  end
end

View File

@ -0,0 +1,25 @@
# frozen_string_literal: true

# Creates one topic-embeddings table per (model, strategy) combination for
# the multilingual-e5-large model. Table names encode the model id and
# strategy id so embeddings for different configurations can coexist.
class CreateMultilingualTopicEmbeddingsTable < ActiveRecord::Migration[7.0]
  def change
    models = [DiscourseAi::Embeddings::Models::MultilingualE5Large]
    strategies = [DiscourseAi::Embeddings::Strategies::Truncation]

    models.product(strategies).each do |model, strategy|
      create_table :"ai_topic_embeddings_#{model.id}_#{strategy.id}", id: false do |t|
        t.integer :topic_id, null: false
        t.integer :model_version, null: false
        t.integer :strategy_version, null: false
        t.text :digest, null: false
        # pgvector column sized to the model's embedding dimensionality.
        t.column :embeddings, "vector(#{model.dimensions})", null: false
        t.timestamps
        # One embedding row per topic.
        t.index :topic_id, unique: true
      end
    end
  end
end

View File

@ -7,6 +7,7 @@ module DiscourseAi
require_relative "models/base" require_relative "models/base"
require_relative "models/all_mpnet_base_v2" require_relative "models/all_mpnet_base_v2"
require_relative "models/text_embedding_ada_002" require_relative "models/text_embedding_ada_002"
require_relative "models/multilingual_e5_large"
require_relative "strategies/truncation" require_relative "strategies/truncation"
require_relative "manager" require_relative "manager"
require_relative "jobs/regular/generate_embeddings" require_relative "jobs/regular/generate_embeddings"

View File

@ -0,0 +1,52 @@
# frozen_string_literal: true

module DiscourseAi
  module Embeddings
    module Models
      # Model definition for multilingual-e5-large, a locally inferred
      # embeddings model supporting roughly 100 languages.
      class MultilingualE5Large < Base
        # Stable numeric identifier; used in embeddings table names.
        def self.id
          3
        end

        # Bump to force regeneration of stored embeddings.
        def self.version
          1
        end

        def self.name
          "multilingual-e5-large"
        end

        # Embedding vector dimensionality (matches the pgvector column).
        def self.dimensions
          1024
        end

        # Maximum input length, in tokens, the model accepts.
        def self.max_sequence_length
          512
        end

        # pgvector cosine-distance operator.
        def self.pg_function
          "<=>"
        end

        def self.pg_index_type
          "vector_cosine_ops"
        end

        # Requests an embedding for +text+ from the configured Discourse
        # classification service. The "query: " prefix is part of the E5
        # model's expected input format.
        def self.generate_embeddings(text)
          DiscourseAi::Inference::DiscourseClassifier.perform!(
            "#{SiteSetting.ai_embeddings_discourse_service_api_endpoint}/api/v1/classify",
            name,
            "query: #{text}",
            SiteSetting.ai_embeddings_discourse_service_api_key,
          )
        end

        def self.tokenizer
          DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer
        end
      end
    end
  end
end

View File

@ -59,6 +59,13 @@ module DiscourseAi
end end
end end
class MultilingualE5LargeTokenizer < BasicTokenizer
  # Lazily loads and memoizes the tokenizer definition for
  # multilingual-e5-large from the bundled JSON vocabulary file.
  def self.tokenizer
    # Use a class-level instance variable (@) rather than a class variable
    # (@@): class variables are shared across the BasicTokenizer
    # inheritance tree and can collide between tokenizer subclasses.
    @tokenizer ||=
      Tokenizers.from_file("./plugins/discourse-ai/tokenizers/multilingual-e5-large.json")
  end
end
class OpenAiTokenizer < BasicTokenizer class OpenAiTokenizer < BasicTokenizer
class << self class << self
def tokenizer def tokenizer

View File

@ -68,13 +68,4 @@ after_initialize do
on(:reviewable_transitioned_to) do |new_status, reviewable| on(:reviewable_transitioned_to) do |new_status, reviewable|
ModelAccuracy.adjust_model_accuracy(new_status, reviewable) ModelAccuracy.adjust_model_accuracy(new_status, reviewable)
end end
if DB.query_single("SELECT 1 FROM pg_available_extensions WHERE name = 'vector';").empty?
STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
STDERR.puts " Discourse AI requires the pgvector extension on the PostgreSQL database."
STDERR.puts " Run a `./launcher rebuild app` to fix it on a standard install."
STDERR.puts " Alternatively, you can remove Discourse AI to rebuild."
STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
exit 1
end
end end

View File

@ -117,3 +117,20 @@ describe DiscourseAi::Tokenizer::Llama2Tokenizer do
end end
end end
end end
# Specs for the multilingual-e5-large tokenizer wrapper. Expected values
# depend on the bundled multilingual-e5-large vocabulary file.
describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do
describe "#size" do
describe "returns a token count" do
# "Hello, World! 123" tokenizes to 7 tokens under this model's vocabulary
# (subword splits include the punctuation and digits).
it "for a sentence with punctuation and capitalization and numbers" do
expect(described_class.size("Hello, World! 123")).to eq(7)
end
end
end
describe "#truncate" do
# Only "foo" survives a 3-token budget — presumably the limit counts the
# tokenizer's special/subword tokens as well; confirm against BasicTokenizer.
it "truncates a sentence" do
sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
expect(described_class.truncate(sentence, 3)).to eq("foo")
end
end
end

View File

@ -13,3 +13,7 @@ Licensed under Apache License
## llama-2-70b-chat-hf ## llama-2-70b-chat-hf
Licensed under LLAMA 2 COMMUNITY LICENSE AGREEMENT Licensed under LLAMA 2 COMMUNITY LICENSE AGREEMENT
## multilingual-e5-large
Licensed under MIT License

File diff suppressed because one or more lines are too long