FEATURE: Support for locally inferred embeddings in 100 languages (#115)
* FEATURE: Support for locally inferred embeddings in 100 languages * add table
This commit is contained in:
parent
b25daed60b
commit
3e7c99de89
|
@ -46,13 +46,11 @@ en:
|
|||
ai_embeddings_enabled: "Enable the embeddings module."
|
||||
ai_embeddings_discourse_service_api_endpoint: "URL where the API is running for the embeddings module"
|
||||
ai_embeddings_discourse_service_api_key: "API key for the embeddings API"
|
||||
ai_embeddings_models: "Discourse will generate embeddings for each of the models enabled here"
|
||||
ai_embeddings_semantic_related_model: "Model to use for related topics."
|
||||
ai_embeddings_model: "Use all-mpnet-base-v2 for fast local inference in English, text-embedding-ada-002 to use the OpenAI API (requires an API key), and multilingual-e5-large for local multilingual embeddings."
|
||||
ai_embeddings_generate_for_pms: "Generate embeddings for personal messages."
|
||||
ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics."
|
||||
ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section."
|
||||
ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info."
|
||||
ai_embeddings_semantic_search_model: "Model to use for semantic search."
|
||||
ai_embeddings_semantic_search_enabled: "Enable full-page semantic search."
|
||||
ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results"
|
||||
|
||||
|
|
|
@ -154,6 +154,7 @@ plugins:
|
|||
choices:
|
||||
- all-mpnet-base-v2
|
||||
- text-embedding-ada-002
|
||||
- multilingual-e5-large
|
||||
ai_embeddings_generate_for_pms: false
|
||||
ai_embeddings_semantic_related_topics_enabled: false
|
||||
ai_embeddings_semantic_related_topics: 5
|
||||
|
|
|
@ -2,6 +2,17 @@
|
|||
|
||||
# Enables the pgvector (`vector`) PostgreSQL extension.
#
# If enabling fails and the extension is not listed in
# pg_available_extensions, an explanatory banner is written to STDERR.
# The original error is always re-raised, so the migration still aborts.
class EnablePgVectorExtension < ActiveRecord::Migration[7.0]
  def change
    # This must be the only call: an additional unguarded call ahead of the
    # rescue would raise before the diagnostic below could ever be printed.
    enable_extension :vector
  rescue StandardError
    # StandardError (not Exception) so signals and SystemExit pass through
    # untouched instead of triggering a database query on the way out.
    if DB.query_single("SELECT 1 FROM pg_available_extensions WHERE name = 'vector';").empty?
      banner = "------------------------------DISCOURSE AI ERROR----------------------------------"
      STDERR.puts banner
      STDERR.puts " Discourse AI requires the pgvector extension on the PostgreSQL database."
      STDERR.puts " Run a `./launcher rebuild app` to fix it on a standard install."
      STDERR.puts " Alternatively, you can remove Discourse AI to rebuild."
      STDERR.puts banner
    end

    # Re-raise the error currently being handled, whatever its cause.
    raise
  end
end
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
# Creates one topic-embeddings table per (model, strategy) pair listed below.
#
# Each table is named ai_topic_embeddings_<model id>_<strategy id>, has no
# surrogate primary key, and stores a single row per topic (enforced by the
# unique index on topic_id). The embeddings column is a pgvector `vector`
# sized from the model's reported dimensions.
class CreateMultilingualTopicEmbeddingsTable < ActiveRecord::Migration[7.0]
  def change
    model_classes = [DiscourseAi::Embeddings::Models::MultilingualE5Large]
    strategy_classes = [DiscourseAi::Embeddings::Strategies::Truncation]

    # product yields pairs in the same model-major order as nested iteration.
    model_classes.product(strategy_classes).each do |model, strategy|
      create_table :"ai_topic_embeddings_#{model.id}_#{strategy.id}", id: false do |t|
        t.integer :topic_id, null: false
        t.integer :model_version, null: false
        t.integer :strategy_version, null: false
        t.text :digest, null: false
        t.column :embeddings, "vector(#{model.dimensions})", null: false
        t.timestamps

        t.index :topic_id, unique: true
      end
    end
  end
end
|
|
@ -7,6 +7,7 @@ module DiscourseAi
|
|||
require_relative "models/base"
|
||||
require_relative "models/all_mpnet_base_v2"
|
||||
require_relative "models/text_embedding_ada_002"
|
||||
require_relative "models/multilingual_e5_large"
|
||||
require_relative "strategies/truncation"
|
||||
require_relative "manager"
|
||||
require_relative "jobs/regular/generate_embeddings"
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
  module Embeddings
    module Models
      # Descriptor for the "multilingual-e5-large" embeddings model.
      #
      # Everything is exposed as class-level methods: static metadata (id,
      # version, vector size, sequence limit), the PostgreSQL operator and
      # index operator class used for similarity queries, the tokenizer, and
      # the call that requests embeddings from the configured inference
      # service.
      class MultilingualE5Large < Base
        # Numeric identifier of this model (used, for example, in the
        # per-model embeddings table name).
        def self.id
          3
        end

        # Version number of this model definition.
        def self.version
          1
        end

        # Model name as passed to the inference service.
        def self.name
          "multilingual-e5-large"
        end

        # Length of the embedding vectors.
        def self.dimensions
          1024
        end

        # Maximum input length, in tokens.
        def self.max_sequence_length
          512
        end

        # PostgreSQL operator used to compare two embeddings.
        def self.pg_function
          "<=>"
        end

        # pgvector operator class to use when indexing the embeddings column.
        def self.pg_index_type
          "vector_cosine_ops"
        end

        # Requests embeddings for +text+ from the Discourse classification
        # service configured in the site settings.
        #
        # The text is sent with a "query: " prefix.
        # NOTE(review): presumably the input convention this model expects;
        # confirm against the model card.
        def self.generate_embeddings(text)
          endpoint = "#{SiteSetting.ai_embeddings_discourse_service_api_endpoint}/api/v1/classify"
          model_name = name
          prefixed_text = "query: #{text}"
          api_key = SiteSetting.ai_embeddings_discourse_service_api_key

          DiscourseAi::Inference::DiscourseClassifier.perform!(
            endpoint,
            model_name,
            prefixed_text,
            api_key,
          )
        end

        # Tokenizer class matching this model.
        def self.tokenizer
          DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer
        end
      end
    end
  end
end
|
|
@ -59,6 +59,13 @@ module DiscourseAi
|
|||
end
|
||||
end
|
||||
|
||||
# Tokenizer for the multilingual-e5-large model, loaded from the JSON
# definition shipped with the plugin.
class MultilingualE5LargeTokenizer < BasicTokenizer
  # Returns the tokenizer, loading it from disk on first use and reusing the
  # same object afterwards. The path is relative to the process working
  # directory.
  def self.tokenizer
    @@tokenizer ||=
      begin
        definition_path = "./plugins/discourse-ai/tokenizers/multilingual-e5-large.json"
        Tokenizers.from_file(definition_path)
      end
  end
end
|
||||
|
||||
class OpenAiTokenizer < BasicTokenizer
|
||||
class << self
|
||||
def tokenizer
|
||||
|
|
|
@ -68,13 +68,4 @@ after_initialize do
|
|||
on(:reviewable_transitioned_to) do |new_status, reviewable|
|
||||
ModelAccuracy.adjust_model_accuracy(new_status, reviewable)
|
||||
end
|
||||
|
||||
if DB.query_single("SELECT 1 FROM pg_available_extensions WHERE name = 'vector';").empty?
|
||||
STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
|
||||
STDERR.puts " Discourse AI requires the pgvector extension on the PostgreSQL database."
|
||||
STDERR.puts " Run a `./launcher rebuild app` to fix it on a standard install."
|
||||
STDERR.puts " Alternatively, you can remove Discourse AI to rebuild."
|
||||
STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
|
||||
exit 1
|
||||
end
|
||||
end
|
||||
|
|
|
@ -117,3 +117,20 @@ describe DiscourseAi::Tokenizer::Llama2Tokenizer do
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Checks token counting and truncation for the multilingual-e5-large tokenizer.
describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"

      # Truncation is requested with a limit of 3.
      truncated = described_class.truncate(sentence, 3)

      expect(truncated).to eq("foo")
    end
  end
end
|
||||
|
|
|
@ -12,4 +12,8 @@ Licensed under Apache License
|
|||
|
||||
## llama-2-70b-chat-hf
|
||||
|
||||
Licensed under LLAMA 2 COMMUNITY LICENSE AGREEMENT
|
||||
Licensed under LLAMA 2 COMMUNITY LICENSE AGREEMENT
|
||||
|
||||
## multilingual-e5-large
|
||||
|
||||
Licensed under MIT License
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue