# frozen_string_literal: true

# A facade that abstracts multiple LLMs behind a single interface.
#
# Internally, it consists of the combination of a dialect and an endpoint.
# After receiving a prompt using our generic format, it translates it to
# the target model and routes the completion request through the correct gateway.
#
# Use the .proxy method to instantiate an object.
# It chooses the correct dialect and endpoint for the model you want to interact with.
#
# Tests of modules that perform LLM calls can use .with_prepared_responses to return canned responses
# instead of relying on WebMock stubs like we did in the past.
#
module DiscourseAi
  module Completions
    class Llm
      # Raised by .proxy when a "custom:<id>" identifier does not resolve to an LlmModel.
      UNKNOWN_MODEL = Class.new(StandardError)

      class << self
        # Pre-configured provider/model combinations (name, token budget,
        # display name, tokenizer and endpoint). Memoized on first call.
        #
        # @returns { Array<Hash> }
        def presets
          # Sam: I am not sure if it makes sense to translate model names at all
          @presets ||= [
            {
              id: "anthropic",
              models: [
                {
                  name: "claude-3-5-sonnet",
                  tokens: 200_000,
                  display_name: "Claude 3.5 Sonnet",
                },
                { name: "claude-3-opus", tokens: 200_000, display_name: "Claude 3 Opus" },
                { name: "claude-3-sonnet", tokens: 200_000, display_name: "Claude 3 Sonnet" },
                { name: "claude-3-haiku", tokens: 200_000, display_name: "Claude 3 Haiku" },
              ],
              tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer,
              endpoint: "https://api.anthropic.com/v1/messages",
              provider: "anthropic",
            },
            {
              id: "google",
              models: [
                {
                  name: "gemini-1.5-pro",
                  tokens: 800_000,
                  endpoint:
                    "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest",
                  display_name: "Gemini 1.5 Pro",
                },
                {
                  name: "gemini-1.5-flash",
                  tokens: 800_000,
                  endpoint:
                    "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest",
                  display_name: "Gemini 1.5 Flash",
                },
              ],
              tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
              provider: "google",
            },
            {
              id: "open_ai",
              models: [
                { name: "gpt-4o", tokens: 131_072, display_name: "GPT-4 Omni" },
                { name: "gpt-4-turbo", tokens: 131_072, display_name: "GPT-4 Turbo" },
                { name: "gpt-3.5-turbo", tokens: 16_385, display_name: "GPT-3.5 Turbo" },
              ],
              tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
              endpoint: "https://api.openai.com/v1/chat/completions",
              provider: "open_ai",
            },
          ]
        end

        # Names of the supported providers. "fake" and "ollama" are only
        # offered outside of production.
        #
        # @returns { Array<String> }
        def provider_names
          providers = %w[aws_bedrock anthropic vllm hugging_face cohere open_ai google azure]
          providers.push("fake", "ollama") unless Rails.env.production?
          providers
        end

        # @returns { Array<String> } - Class names of the available LLM tokenizers.
        def tokenizer_names
          DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
        end

        # Model names listed as vision models, keyed by provider. Memoized.
        #
        # @returns { Hash<Symbol, Array<String>> }
        def vision_models_by_provider
          @vision_models_by_provider ||= {
            aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
            anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
            open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o],
            google: %w[gemini-1.5-pro gemini-1.5-flash],
          }
        end

        # Known model names keyed by provider. Memoized, so the environment
        # specific entries are decided on the first call.
        #
        # @returns { Hash<Symbol, Array<String>> }
        def models_by_provider
          # ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
          # However, since they use the same URL/key settings, there's no reason to duplicate them.
          @models_by_provider ||=
            {
              aws_bedrock: %w[
                claude-instant-1
                claude-2
                claude-3-haiku
                claude-3-sonnet
                claude-3-opus
              ],
              anthropic: %w[claude-instant-1 claude-2 claude-3-haiku claude-3-sonnet claude-3-opus],
              vllm: %w[mistralai/Mixtral-8x7B-Instruct-v0.1 mistralai/Mistral-7B-Instruct-v0.2],
              hugging_face: %w[
                mistralai/Mixtral-8x7B-Instruct-v0.1
                mistralai/Mistral-7B-Instruct-v0.2
              ],
              cohere: %w[command-light command command-r command-r-plus],
              open_ai: %w[
                gpt-3.5-turbo
                gpt-4
                gpt-3.5-turbo-16k
                gpt-4-32k
                gpt-4-turbo
                gpt-4-vision-preview
                gpt-4o
              ],
              google: %w[gemini-pro gemini-1.5-pro gemini-1.5-flash],
            }.tap do |h|
              h[:ollama] = ["mistral"] if Rails.env.development?
              h[:fake] = ["fake"] if Rails.env.test? || Rails.env.development?
            end
        end

        # Every "provider:model" identifier derivable from .models_by_provider. Memoized.
        #
        # @returns { Set<String> }
        def valid_provider_models
          @valid_provider_models ||=
            Set.new(
              models_by_provider.flat_map do |provider, models|
                models.map { |model| "#{provider}:#{model}" }
              end,
            )
        end

        # Test helper: while the block runs, every proxy built through .proxy or
        # .proxy_from_obj uses a canned gateway instead of a real endpoint, and
        # every prompt passed to #generate is recorded.
        #
        # @param responses { Object } - Handed as-is to Endpoints::CannedResponse.new.
        # @param llm { String - Optional } - When given, proxies requested for any other
        #   model identifier raise.
        #
        # @yield the canned response object, the llm argument and the array of recorded prompts.
        #
        # @returns { Object } - Whatever the block returns.
        def with_prepared_responses(responses, llm: nil)
          @canned_response = DiscourseAi::Completions::Endpoints::CannedResponse.new(responses)
          @canned_llm = llm
          @prompts = []

          yield(@canned_response, llm, @prompts)
        ensure
          # Don't leak prepared response if there's an exception.
          @canned_response = nil
          @canned_llm = nil
          @prompts = nil
        end

        # Appends the prompt to the recording list. A no-op outside of
        # .with_prepared_responses.
        def record_prompt(prompt)
          @prompts << prompt if @prompts
        end

        # Builds an Llm for a "provider:model" identifier.
        #
        # @param model_name { String } - "provider:model". The "custom:<id>" form is
        #   resolved through LlmModel.find and delegated to .proxy_from_obj.
        #
        # @returns { DiscourseAi::Completions::Llm }
        def proxy(model_name)
          # Split on the first colon only, so model names that themselves contain
          # a colon (e.g. "llama3:8b") keep it. An identifier without a colon
          # yields an empty model name.
          provider_name, unprefixed_name = model_name.split(":", 2)
          model_name_without_prov = unprefixed_name.to_s

          # We are in the process of transitioning to always use objects here.
          # We'll live with this hack for a while.
          if provider_name == "custom"
            # NOTE(review): if LlmModel is an ActiveRecord model, .find raises on a
            # missing record before the UNKNOWN_MODEL guard can run - confirm.
            llm_model = LlmModel.find(model_name_without_prov)
            raise UNKNOWN_MODEL if !llm_model
            return proxy_from_obj(llm_model)
          end

          dialect_klass =
            DiscourseAi::Completions::Dialects::Dialect.dialect_for(model_name_without_prov)

          if @canned_response
            if @canned_llm && @canned_llm != model_name
              raise "Invalid call LLM call, expected #{@canned_llm} but got #{model_name}"
            end

            return new(dialect_klass, nil, model_name, gateway: @canned_response)
          end

          gateway_klass = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider_name)

          new(dialect_klass, gateway_klass, model_name_without_prov)
        end

        # Builds an Llm from an object exposing #provider and #name.
        #
        # @param llm_model { LlmModel }
        #
        # @returns { DiscourseAi::Completions::Llm }
        def proxy_from_obj(llm_model)
          provider_name = llm_model.provider
          model_name = llm_model.name

          dialect_klass = DiscourseAi::Completions::Dialects::Dialect.dialect_for(model_name)

          if @canned_response
            # Report the same provider-qualified identifier that is compared, so
            # the failure message shows what actually mismatched.
            qualified_name = [provider_name, model_name].join(":")
            if @canned_llm && @canned_llm != qualified_name
              raise "Invalid call LLM call, expected #{@canned_llm} but got #{qualified_name}"
            end

            return new(dialect_klass, nil, model_name, gateway: @canned_response)
          end

          gateway_klass = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider_name)

          new(dialect_klass, gateway_klass, model_name, llm_model: llm_model)
        end
      end

      # Prefer .proxy / .proxy_from_obj over calling this directly.
      #
      # @param dialect_klass { Class } - Dialect used to translate the generic prompt.
      # @param gateway_klass { Class } - Endpoint class instantiated for each completion.
      # @param model_name { String }
      # @param gateway { Object - Optional } - Ready-made gateway used instead of gateway_klass.
      # @param llm_model { LlmModel - Optional }
      def initialize(dialect_klass, gateway_klass, model_name, gateway: nil, llm_model: nil)
        @dialect_klass = dialect_klass
        @gateway_klass = gateway_klass
        @model_name = model_name
        @gateway = gateway
        @llm_model = llm_model
      end

      # @param prompt { DiscourseAi::Completions::Prompt | String | Array } - Our generic prompt
      #   object. A String becomes a single user message with a default system message;
      #   an Array is used as the list of messages.
      # @param temperature, top_p, max_tokens, stop_sequences - Optional model parameters;
      #   the ones left unset are not forwarded.
      # @param user { User } - User requesting the summary.
      # @param feature_name { String - Optional } - Forwarded to the gateway.
      #
      # @param &partial_read_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
      #
      # @returns { String } - Completion result.
      #
      # @raise { ArgumentError } - When prompt is none of the accepted types.
      #
      # When the model invokes a tool, we'll wait until the endpoint finishes replying and feed you a fully-formed tool,
      # even if you passed a partial_read_blk block. Invocations are strings that look like this:
      #
      # NOTE(review): the markup of this example was stripped from the file; the tag
      # names below are reconstructed - confirm against the dialect's tool format.
      #
      #   <function_calls>
      #     <invoke>
      #       <tool_name>get_weather</tool_name>
      #       <tool_id>get_weather</tool_id>
      #       <parameters>
      #         <location>Sydney</location>
      #         <unit>c</unit>
      #       </parameters>
      #     </invoke>
      #   </function_calls>
      #
      def generate(
        prompt,
        temperature: nil,
        top_p: nil,
        max_tokens: nil,
        stop_sequences: nil,
        user:,
        feature_name: nil,
        &partial_read_blk
      )
        # Recorded before any coercion, so tests see exactly what the caller passed.
        self.class.record_prompt(prompt)

        model_params = { max_tokens: max_tokens, stop_sequences: stop_sequences }
        model_params[:temperature] = temperature if temperature
        model_params[:top_p] = top_p if top_p
        # Only forward the parameters the caller actually set.
        model_params.compact!

        if prompt.is_a?(String)
          prompt =
            DiscourseAi::Completions::Prompt.new(
              "You are a helpful bot",
              messages: [{ type: :user, content: prompt }],
            )
        elsif prompt.is_a?(Array)
          prompt = DiscourseAi::Completions::Prompt.new(messages: prompt)
        end

        if !prompt.is_a?(DiscourseAi::Completions::Prompt)
          raise ArgumentError, "Prompt must be either a string, array, of Prompt object"
        end

        dialect = dialect_klass.new(prompt, model_name, opts: model_params, llm_model: llm_model)

        # A gateway injected at construction time (canned responses) wins over
        # building a real endpoint.
        gateway = @gateway || gateway_klass.new(model_name, dialect.tokenizer, llm_model: llm_model)
        gateway.perform_completion!(
          dialect,
          user,
          model_params,
          feature_name: feature_name,
          &partial_read_blk
        )
      end

      # @returns - The llm_model's max_prompt_tokens when it provides one,
      #   otherwise the value reported by the dialect.
      def max_prompt_tokens
        llm_model&.max_prompt_tokens || blank_dialect.max_prompt_tokens
      end

      # @returns - The llm_model's tokenizer_class when it provides one,
      #   otherwise the tokenizer reported by the dialect.
      def tokenizer
        llm_model&.tokenizer_class || blank_dialect.tokenizer
      end

      attr_reader :model_name

      private

      attr_reader :dialect_klass, :gateway_klass, :llm_model

      # Throwaway dialect built around an empty prompt, used only to ask the
      # dialect for its defaults.
      def blank_dialect
        dialect_klass.new(DiscourseAi::Completions::Prompt.new(""), model_name)
      end
    end
  end
end