discourse-ai/lib/completions/endpoints/ollama.rb

# frozen_string_literal: true

module DiscourseAi
  module Completions
    module Endpoints
      # Completion endpoint that posts chat requests to the server configured
      # in the `ai_ollama_endpoint` site setting, under the
      # `/v1/chat/completions` path.
      class Ollama < Base
        class << self
          # @param endpoint_name [String] identifier to test
          # @return [Boolean] true only for the literal name "ollama"
          def can_contact?(endpoint_name)
            endpoint_name == "ollama"
          end

          # @return [Array<String>] names of the site settings this endpoint reads
          def dependant_setting_names
            %w[ai_ollama_endpoint]
          end

          # @param _model_name [String] unused
          # @return [Boolean] whether the `ai_ollama_endpoint` setting is present
          def correctly_configured?(_model_name)
            SiteSetting.ai_ollama_endpoint.present?
          end

          # @param model_name [String]
          # @return [String] label of the form "Ollama - <model_name>"
          def endpoint_name(model_name)
            "Ollama - #{model_name}"
          end
        end

        # Returns a copy of +model_params+ in which a truthy +:stop_sequences+
        # entry is renamed to +:stop+. The hash passed in is not modified.
        #
        # @param model_params [Hash]
        # @return [Hash]
        def normalize_model_params(model_params)
          model_params = model_params.dup

          # max_tokens, temperature are already supported
          if model_params[:stop_sequences]
            model_params[:stop] = model_params.delete(:stop_sequences)
          end

          model_params
        end

        # Base payload values; callers' model params are merged on top of these
        # in #prepare_payload. `model` is resolved outside this class.
        #
        # @return [Hash]
        def default_options
          { max_tokens: 2000, model: model }
        end

        # @return [Object] the `AiApiAuditLog::Provider::Ollama` constant
        def provider_id
          AiApiAuditLog::Provider::Ollama
        end

        # TLS is enabled only when the configured endpoint URL uses the `https`
        # scheme; `http` (or scheme-less) values keep it off, as before.
        #
        # @return [Boolean]
        def use_ssl?
          model_uri.scheme == "https"
        end

        private

        # Chat-completions URL built from the `ai_ollama_endpoint` setting.
        # A single trailing "/" on the setting is dropped so the resulting path
        # does not start with "//".
        #
        # @return [URI]
        def model_uri
          base_url = SiteSetting.ai_ollama_endpoint.to_s.chomp("/")

          URI("#{base_url}/v1/chat/completions")
        end

        # Merges the defaults, the given model params (which win over the
        # defaults) and the prompt under +:messages+. Adds `stream: true` when
        # @streaming_mode is truthy.
        #
        # @param prompt [Object] value sent as `messages`
        # @param model_params [Hash]
        # @param _dialect [Object] unused
        # @return [Hash]
        def prepare_payload(prompt, model_params, _dialect)
          default_options
            .merge(model_params)
            .merge(messages: prompt)
            .tap { |payload| payload[:stream] = true if @streaming_mode }
        end

        # Builds the POST request for #model_uri with a JSON content type.
        #
        # NOTE(review): +payload+ is assigned to the body as-is, so it is
        # presumably already a serialized string; confirm against the caller.
        #
        # @param payload [String]
        # @return [Net::HTTP::Post]
        def prepare_request(payload)
          headers = { "Content-Type" => "application/json" }

          Net::HTTP::Post.new(model_uri, headers).tap { |r| r.body = payload }
        end

        # Splits a decoded chunk into its per-line payloads: for every line the
        # text following the first "data: " marker is kept. Lines without the
        # marker and the "[DONE]" sentinel are dropped.
        #
        # @param decoded_chunk [String]
        # @return [Array<String>]
        def partials_from(decoded_chunk)
          decoded_chunk
            .split("\n")
            .map do |line|
              data = line.split("data: ", 2)[1]
              data == "[DONE]" ? nil : data
            end
            .compact
        end

        # Pulls the text content out of the first choice of a JSON response:
        # from `delta` when @streaming_mode is truthy, from `message` otherwise.
        #
        # @param response_raw [String] JSON document
        # @return [String, nil] nil when there is no first choice, or when the
        #   choice carries no delta/message/content
        # @raise [JSON::ParserError] when +response_raw+ is not valid JSON
        def extract_completion_from(response_raw)
          choice = JSON.parse(response_raw, symbolize_names: true).dig(:choices, 0)
          # nothing to extract without a first choice (original note: half a line sent here)
          return if !choice

          # A single dig stays nil-safe when the delta/message key is absent.
          choice.dig(@streaming_mode ? :delta : :message, :content)
        end
      end
    end
  end
end
REFACTOR: Migrate Vllm/TGI-served models to the OpenAI format. (#588) Both endpoints provide OpenAI-compatible servers. The only difference is that Vllm doesn't support passing tools as a separate parameter. Even if the tool param is supported, it ultimately relies on the model's ability to handle native functions, which is not the case with the models we have today. As a part of this change, we are dropping support for StableBeluga/Llama2 models. They don't have a chat_template, meaning the new API can't translate them. These changes let us remove some of our existing dialects and are a first step in our plan to support any LLM by defining them as data-driven concepts. I rewrote the "translate" method to use a template method and extracted the tool support strategies into their own classes to simplify the code. Finally, these changes bring support for Ollama when running in dev mode. It only works with Mistral for now, but it will change soon. 2024-05-07 09:02:16 -04:00			`# frozen_string_literal: true`

			`module DiscourseAi`
			`module Completions`
			`module Endpoints`
			`class Ollama < Base`
			`class << self`
HACK: Llama3 support for summarization/AI helper. (#616) There are still some limitations to which models we can support with the `LlmModel` class. This will enable support for Llama3 while we sort those out. 2024-05-13 14:54:42 -04:00			`def can_contact?(endpoint_name)`
			`endpoint_name == "ollama"`
REFACTOR: Migrate Vllm/TGI-served models to the OpenAI format. (#588) Both endpoints provide OpenAI-compatible servers. The only difference is that Vllm doesn't support passing tools as a separate parameter. Even if the tool param is supported, it ultimately relies on the model's ability to handle native functions, which is not the case with the models we have today. As a part of this change, we are dropping support for StableBeluga/Llama2 models. They don't have a chat_template, meaning the new API can't translate them. These changes let us remove some of our existing dialects and are a first step in our plan to support any LLM by defining them as data-driven concepts. I rewrote the "translate" method to use a template method and extracted the tool support strategies into their own classes to simplify the code. Finally, these changes bring support for Ollama when running in dev mode. It only works with Mistral for now, but it will change soon. 2024-05-07 09:02:16 -04:00			`end`

			`def dependant_setting_names`
			`%w[ai_ollama_endpoint]`
			`end`

			`def correctly_configured?(_model_name)`
			`SiteSetting.ai_ollama_endpoint.present?`
			`end`

			`def endpoint_name(model_name)`
			`"Ollama - #{model_name}"`
			`end`
			`end`

			`def normalize_model_params(model_params)`
			`model_params = model_params.dup`

			`# max_tokens, temperature are already supported`
			`if model_params[:stop_sequences]`
			`model_params[:stop] = model_params.delete(:stop_sequences)`
			`end`

			`model_params`
			`end`

			`def default_options`
			`{ max_tokens: 2000, model: model }`
			`end`

			`def provider_id`
			`AiApiAuditLog::Provider::Ollama`
			`end`

			`def use_ssl?`
			`false`
			`end`

			`private`

			`def model_uri`
			`URI("#{SiteSetting.ai_ollama_endpoint}/v1/chat/completions")`
			`end`

			`def prepare_payload(prompt, model_params, _dialect)`
			`default_options`
			`.merge(model_params)`
			`.merge(messages: prompt)`
			`.tap { \|payload\| payload[:stream] = true if @streaming_mode }`
			`end`

			`def prepare_request(payload)`
			`headers = { "Content-Type" => "application/json" }`

			`Net::HTTP::Post.new(model_uri, headers).tap { \|r\| r.body = payload }`
			`end`

			`def partials_from(decoded_chunk)`
			`decoded_chunk`
			`.split("\n")`
			`.map do \|line\|`
			`data = line.split("data: ", 2)[1]`
			`data == "[DONE]" ? nil : data`
			`end`
			`.compact`
			`end`

			`def extract_completion_from(response_raw)`
			`parsed = JSON.parse(response_raw, symbolize_names: true).dig(:choices, 0)`
			`# half a line sent here`
			`return if !parsed`

			`response_h = @streaming_mode ? parsed.dig(:delta) : parsed.dig(:message)`

			`response_h.dig(:content)`
			`end`
			`end`
			`end`
			`end`
			`end`