# frozen_string_literal: true

module DiscourseAi
  module Embeddings
    module VectorRepresentations
      # Vector representation settings and inference entry point for the
      # "multilingual-e5-large" embeddings model.
      #
      # Everything except #vector_from is a constant describing the model or
      # how its vectors are compared; #vector_from dispatches to whichever
      # inference backend is configured.
      class MultilingualE5Large < Base
        # Produces the embedding for +text+ using the first available backend.
        #
        # Backends are tried in this order:
        # 1. Hugging Face text embeddings, when it reports itself configured.
        # 2. The Discourse classification service, when an endpoint is present.
        #
        # @param text the text to embed
        # @return the first element of the Hugging Face response, or whatever
        #   DiscourseClassifier.perform! returns for the Discourse service
        # @raise [RuntimeError] when neither backend is configured
        def vector_from(text)
          if DiscourseAi::Inference::HuggingFaceTextEmbeddings.configured?
            # Two tokens are held back from the model's maximum length
            # (presumably for the tokenizer's special tokens -- confirm).
            token_budget = max_sequence_length - 2
            shortened = tokenizer.truncate(text, token_budget)
            embeddings = DiscourseAi::Inference::HuggingFaceTextEmbeddings.perform!(shortened)

            return embeddings.first
          end

          raise "No inference endpoint configured" unless discourse_embeddings_endpoint.present?

          classify_url = "#{discourse_embeddings_endpoint}/api/v1/classify"
          # Only this backend receives the text with a "query: " prefix, and it
          # is sent untruncated.
          prefixed_text = "query: #{text}"

          DiscourseAi::Inference::DiscourseClassifier.perform!(
            classify_url,
            name,
            prefixed_text,
            SiteSetting.ai_embeddings_discourse_service_api_key,
          )
        end

        # Numeric identifier of this representation.
        def id
          3
        end

        # Version number of this representation.
        def version
          1
        end

        # Model name; also passed to the Discourse classification service.
        def name
          "multilingual-e5-large"
        end

        # Number of dimensions reported for this model's vectors.
        def dimensions
          1024
        end

        # Maximum sequence length, in tokens, used when truncating input.
        def max_sequence_length
          512
        end

        # Comparison operator string (presumably pgvector's cosine distance
        # operator -- confirm against the queries that consume it).
        def pg_function
          "<=>"
        end

        # Index operator class name (presumably pgvector's cosine operator
        # class, matching #pg_function -- confirm).
        def pg_index_type
          "vector_cosine_ops"
        end

        # Tokenizer used to truncate text for this model.
        def tokenizer
          DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer
        end
      end
    end
  end
end
|