FEATURE: Support for locally inferred embeddings in 100 languages (#115)
* FEATURE: Support for locally inferred embeddings in 100 languages * add table
This commit is contained in:
parent
b25daed60b
commit
3e7c99de89
|
@ -46,13 +46,11 @@ en:
|
||||||
ai_embeddings_enabled: "Enable the embeddings module."
|
ai_embeddings_enabled: "Enable the embeddings module."
|
||||||
ai_embeddings_discourse_service_api_endpoint: "URL where the API is running for the embeddings module"
|
ai_embeddings_discourse_service_api_endpoint: "URL where the API is running for the embeddings module"
|
||||||
ai_embeddings_discourse_service_api_key: "API key for the embeddings API"
|
ai_embeddings_discourse_service_api_key: "API key for the embeddings API"
|
||||||
ai_embeddings_models: "Discourse will generate embeddings for each of the models enabled here"
|
ai_embeddings_model: "Use all-mpnet-base-v2 for local and fast inference in English, text-embedding-ada-002 to use OpenAI API (needs an API key) and multilingual-e5-large for local multilingual embeddings"
|
||||||
ai_embeddings_semantic_related_model: "Model to use for related topics."
|
|
||||||
ai_embeddings_generate_for_pms: "Generate embeddings for personal messages."
|
ai_embeddings_generate_for_pms: "Generate embeddings for personal messages."
|
||||||
ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics."
|
ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics."
|
||||||
ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section."
|
ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section."
|
||||||
ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info."
|
ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info."
|
||||||
ai_embeddings_semantic_search_model: "Model to use for semantic search."
|
|
||||||
ai_embeddings_semantic_search_enabled: "Enable full-page semantic search."
|
ai_embeddings_semantic_search_enabled: "Enable full-page semantic search."
|
||||||
ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results"
|
ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results"
|
||||||
|
|
||||||
|
|
|
@ -154,6 +154,7 @@ plugins:
|
||||||
choices:
|
choices:
|
||||||
- all-mpnet-base-v2
|
- all-mpnet-base-v2
|
||||||
- text-embedding-ada-002
|
- text-embedding-ada-002
|
||||||
|
- multilingual-e5-large
|
||||||
ai_embeddings_generate_for_pms: false
|
ai_embeddings_generate_for_pms: false
|
||||||
ai_embeddings_semantic_related_topics_enabled: false
|
ai_embeddings_semantic_related_topics_enabled: false
|
||||||
ai_embeddings_semantic_related_topics: 5
|
ai_embeddings_semantic_related_topics: 5
|
||||||
|
|
|
@ -2,6 +2,17 @@
|
||||||
|
|
||||||
# Enables the pgvector (`vector`) PostgreSQL extension.
#
# If enabling fails and the extension is not listed in
# pg_available_extensions, a human-readable hint is written to STDERR before
# the original error is re-raised, so the migration still fails with the
# underlying exception.
class EnablePgVectorExtension < ActiveRecord::Migration[7.0]
  def change
    enable_extension :vector
  rescue StandardError
    # Only StandardError is rescued: signals, SystemExit and other
    # non-standard exceptions should propagate without running more SQL.
    #
    # NOTE(review): if this migration runs inside a DDL transaction, the failed
    # CREATE EXTENSION may leave that transaction aborted, in which case this
    # follow-up query could itself raise — confirm against a database without
    # pgvector installed.
    if DB.query_single("SELECT 1 FROM pg_available_extensions WHERE name = 'vector';").empty?
      STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
      STDERR.puts "    Discourse AI requires the pgvector extension on the PostgreSQL database."
      STDERR.puts "         Run a `./launcher rebuild app` to fix it on a standard install."
      STDERR.puts "            Alternatively, you can remove Discourse AI to rebuild."
      STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
    end

    # Bare raise re-raises the exception currently being handled, unchanged.
    raise
  end
end
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# Creates one topic-embeddings table for every model/strategy pair listed in
# #change (here: the MultilingualE5Large model with the Truncation strategy).
class CreateMultilingualTopicEmbeddingsTable < ActiveRecord::Migration[7.0]
  def change
    models = [DiscourseAi::Embeddings::Models::MultilingualE5Large]
    strategies = [DiscourseAi::Embeddings::Strategies::Truncation]

    # Walk every (model, strategy) combination; the table name embeds both ids.
    models.product(strategies).each do |model, strategy|
      embeddings_table = "ai_topic_embeddings_#{model.id}_#{strategy.id}".to_sym

      # No surrogate primary key; rows are identified by the unique topic_id index.
      create_table embeddings_table, id: false do |table|
        table.integer :topic_id, null: false
        table.integer :model_version, null: false
        table.integer :strategy_version, null: false
        table.text :digest, null: false
        # Column type is sized from the model's reported dimensions.
        table.column :embeddings, "vector(#{model.dimensions})", null: false
        table.timestamps

        table.index :topic_id, unique: true
      end
    end
  end
end
|
|
@ -7,6 +7,7 @@ module DiscourseAi
|
||||||
require_relative "models/base"
|
require_relative "models/base"
|
||||||
require_relative "models/all_mpnet_base_v2"
|
require_relative "models/all_mpnet_base_v2"
|
||||||
require_relative "models/text_embedding_ada_002"
|
require_relative "models/text_embedding_ada_002"
|
||||||
|
require_relative "models/multilingual_e5_large"
|
||||||
require_relative "strategies/truncation"
|
require_relative "strategies/truncation"
|
||||||
require_relative "manager"
|
require_relative "manager"
|
||||||
require_relative "jobs/regular/generate_embeddings"
|
require_relative "jobs/regular/generate_embeddings"
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
module DiscourseAi
  module Embeddings
    module Models
      # Model descriptor for "multilingual-e5-large". All members are class
      # methods returning fixed metadata, plus a helper that requests
      # embeddings from the configured Discourse inference service.
      class MultilingualE5Large < Base
        # Numeric identifier of this model (used, for example, in table names).
        def self.id
          3
        end

        # Version number of this model definition.
        def self.version
          1
        end

        # Model name as passed to the inference service.
        def self.name
          "multilingual-e5-large"
        end

        # Number of dimensions in the vectors this model produces.
        def self.dimensions
          1024
        end

        # Maximum sequence length accepted by the model.
        def self.max_sequence_length
          512
        end

        # PostgreSQL operator string associated with this model.
        def self.pg_function
          "<=>"
        end

        # PostgreSQL index operator class associated with this model.
        def self.pg_index_type
          "vector_cosine_ops"
        end

        # Requests embeddings for +text+ from the configured service.
        #
        # The text is sent prefixed with "query: ". Returns whatever
        # DiscourseClassifier.perform! returns.
        def self.generate_embeddings(text)
          endpoint = "#{SiteSetting.ai_embeddings_discourse_service_api_endpoint}/api/v1/classify"
          model_name = name
          prefixed_text = "query: #{text}"
          api_key = SiteSetting.ai_embeddings_discourse_service_api_key

          DiscourseAi::Inference::DiscourseClassifier.perform!(
            endpoint,
            model_name,
            prefixed_text,
            api_key,
          )
        end

        # Tokenizer class that matches this model.
        def self.tokenizer
          DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer
        end
      end
    end
  end
end
|
|
@ -59,6 +59,13 @@ module DiscourseAi
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Tokenizer backed by the bundled multilingual-e5-large tokenizer definition.
class MultilingualE5LargeTokenizer < BasicTokenizer
  class << self
    # Returns the tokenizer, loading it from the JSON definition on the first
    # call and reusing the stored instance on later calls.
    def tokenizer
      @@tokenizer ||=
        begin
          definition_file = "./plugins/discourse-ai/tokenizers/multilingual-e5-large.json"
          Tokenizers.from_file(definition_file)
        end
    end
  end
end
|
||||||
|
|
||||||
class OpenAiTokenizer < BasicTokenizer
|
class OpenAiTokenizer < BasicTokenizer
|
||||||
class << self
|
class << self
|
||||||
def tokenizer
|
def tokenizer
|
||||||
|
|
|
@ -68,13 +68,4 @@ after_initialize do
|
||||||
on(:reviewable_transitioned_to) do |new_status, reviewable|
|
on(:reviewable_transitioned_to) do |new_status, reviewable|
|
||||||
ModelAccuracy.adjust_model_accuracy(new_status, reviewable)
|
ModelAccuracy.adjust_model_accuracy(new_status, reviewable)
|
||||||
end
|
end
|
||||||
|
|
||||||
if DB.query_single("SELECT 1 FROM pg_available_extensions WHERE name = 'vector';").empty?
|
|
||||||
STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
|
|
||||||
STDERR.puts " Discourse AI requires the pgvector extension on the PostgreSQL database."
|
|
||||||
STDERR.puts " Run a `./launcher rebuild app` to fix it on a standard install."
|
|
||||||
STDERR.puts " Alternatively, you can remove Discourse AI to rebuild."
|
|
||||||
STDERR.puts "------------------------------DISCOURSE AI ERROR----------------------------------"
|
|
||||||
exit 1
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -117,3 +117,20 @@ describe DiscourseAi::Tokenizer::Llama2Tokenizer do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Covers token counting and truncation for the multilingual-e5-large tokenizer.
describe DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer do
  describe "#size" do
    describe "returns a token count" do
      it "for a sentence with punctuation and capitalization and numbers" do
        token_count = described_class.size("Hello, World! 123")

        expect(token_count).to eq(7)
      end
    end
  end

  describe "#truncate" do
    it "truncates a sentence" do
      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
      truncated = described_class.truncate(sentence, 3)

      expect(truncated).to eq("foo")
    end
  end
end
|
||||||
|
|
|
@ -12,4 +12,8 @@ Licensed under Apache License
|
||||||
|
|
||||||
## llama-2-70b-chat-hf
|
## llama-2-70b-chat-hf
|
||||||
|
|
||||||
Licensed under LLAMA 2 COMMUNITY LICENSE AGREEMENT
|
Licensed under LLAMA 2 COMMUNITY LICENSE AGREEMENT
|
||||||
|
|
||||||
|
## multilingual-e5-large
|
||||||
|
|
||||||
|
Licensed under MIT License
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue