discourse-ai/lib/completions/endpoints/base.rb

# frozen_string_literal: true

module DiscourseAi
  module Completions
    module Endpoints
      class Base
        # Raised by #perform_completion! when the endpoint answers with a non-200 HTTP status.
        CompletionFailed = Class.new(StandardError)
        # Passed as the read, open and write timeout of the HTTP connection
        # (Net::HTTP interprets these values as seconds).
        TIMEOUT = 60

        # Picks the endpoint class able to serve the given model.
        #
        # Candidates are probed in list order and the first one whose
        # .can_contact? returns a truthy value wins.
        #
        # @param model_name [String] name of the model to look up
        # @return [Class] the first endpoint class that accepts model_name
        # @raise [DiscourseAi::Completions::Llm::UNKNOWN_MODEL] when no candidate accepts it
        def self.endpoint_for(model_name)
          # Order is important.
          # Bedrock has priority over Anthropic if credentials are present.
          candidates = [
            DiscourseAi::Completions::Endpoints::AwsBedrock,
            DiscourseAi::Completions::Endpoints::Anthropic,
            DiscourseAi::Completions::Endpoints::OpenAi,
            DiscourseAi::Completions::Endpoints::HuggingFace,
            DiscourseAi::Completions::Endpoints::Gemini,
          ]

          # Invoked by #find only when no candidate matched.
          unknown_model = -> { raise DiscourseAi::Completions::Llm::UNKNOWN_MODEL }

          candidates.find(unknown_model) do |endpoint_klass|
            endpoint_klass.can_contact?(model_name)
          end
        end

        # Abstract hook: this base implementation always raises NotImplementedError.
        # .endpoint_for calls it on each candidate endpoint class and selects the
        # first class for which the result is truthy.
        #
        # @param _model_name [String] model name being looked up (unused here)
        # @raise [NotImplementedError] always, in this base implementation
        def self.can_contact?(_model_name)
          raise NotImplementedError
        end

        # Binds the endpoint to one model and the tokenizer used to count tokens.
        #
        # @param model_name [String] stored as @model (readable through the protected #model)
        # @param tokenizer [Object] stored as @tokenizer; this class calls #size on it
        def initialize(model_name, tokenizer)
          @tokenizer = tokenizer
          @model = model_name
        end

        # Sends the translated prompt to the model endpoint and returns the completion.
        #
        # Without a block the whole response body is read at once. With a block the
        # body is read chunk by chunk and each extracted partial (or each buffered
        # tool invocation) is yielded together with a `cancel` lambda the caller can
        # invoke to stop the stream.
        #
        # Once the 200 check has passed, an AiApiAuditLog row is saved from the
        # `ensure` clause, whether the request completes, is cancelled or raises.
        #
        # @param dialect [Object] must respond to #translate (prompt) and #tools
        # @param user [Object, nil] its id is stored in the audit log; nil is allowed
        # @param model_params [Hash] forwarded untouched to #prepare_payload
        # @yield [partial, cancel] streaming mode only
        # @return [String, nil] the completion text; nil when the stream was cancelled
        #   and the next chunk hit the bare `return`. In non-streaming mode this is
        #   whatever #extract_completion_from returned, unless a tool call was detected.
        # @raise [CompletionFailed] when the HTTP status is not 200
        def perform_completion!(dialect, user, model_params = {})
          @streaming_mode = block_given?

          prompt = dialect.translate

          # NOTE(review): use_ssl is hard-coded to true regardless of model_uri's scheme.
          Net::HTTP.start(
            model_uri.host,
            model_uri.port,
            use_ssl: true,
            read_timeout: TIMEOUT,
            open_timeout: TIMEOUT,
            write_timeout: TIMEOUT,
          ) do |http|
            # What is handed back to the caller (text and/or serialized tool invocations).
            response_data = +""
            # Everything received from the endpoint, stored verbatim in the audit log.
            response_raw = +""

            # Needed for response token calculations. Cannot rely on response_data due to function buffering.
            partials_raw = +""
            request_body = prepare_payload(prompt, model_params, dialect).to_json

            request = prepare_request(request_body)

            http.request(request) do |response|
              # Failures are logged here and raised before `log` is assigned, so the
              # ensure clause below skips the audit log for them.
              if response.code.to_i != 200
                Rails.logger.error(
                  "#{self.class.name}: status: #{response.code.to_i} - body: #{response.body}",
                )
                raise CompletionFailed
              end

              log =
                AiApiAuditLog.new(
                  provider_id: provider_id,
                  user_id: user&.id,
                  raw_request_payload: request_body,
                  request_tokens: prompt_size(prompt),
                )

              # Non-streaming: read the full body, extract the completion and return.
              if !@streaming_mode
                response_raw = response.read_body
                response_data = extract_completion_from(response_raw)
                partials_raw = response_data.to_s

                # NOTE(review): has_tool? concatenates its arguments, so a nil
                # response_data would raise here even though the line above
                # tolerates nil via to_s. Confirm whether nil is possible.
                if has_tool?("", response_data)
                  function_buffer = build_buffer # Nokogiri document
                  function_buffer = add_to_buffer(function_buffer, "", response_data)

                  # Only the serialized <function_calls> node is returned.
                  response_data = +function_buffer.at("function_calls").to_s
                  response_data << "\n"
                end

                return response_data
              end

              begin
                cancelled = false
                # Handed to the caller's block; calling it flips the flag checked below.
                cancel = lambda { cancelled = true }

                # Text received but not yet successfully parsed into a partial.
                leftover = ""
                function_buffer = build_buffer # Nokogiri document

                response.read_body do |chunk|
                  # Cancellation takes effect when the next chunk arrives: the
                  # connection is closed and the method returns nil.
                  if cancelled
                    http.finish
                    return
                  end

                  decoded_chunk = decode(chunk)
                  response_raw << decoded_chunk

                  # Buffering for extremely slow streaming.
                  # Chunks shorter than the "data: [DONE]" marker are accumulated first.
                  # NOTE(review): text still held in leftover when the stream ends is
                  # never processed. Confirm this is acceptable.
                  if (leftover + decoded_chunk).length < "data: [DONE]".length
                    leftover += decoded_chunk
                    next
                  end

                  partials_from(leftover + decoded_chunk).each do |raw_partial|
                    next if cancelled
                    next if raw_partial.blank?

                    begin
                      partial = extract_completion_from(raw_partial)
                      next if partial.nil?
                      # A partial parsed successfully, so pending text is discarded.
                      leftover = ""

                      if has_tool?(response_data, partial)
                        function_buffer = add_to_buffer(function_buffer, response_data, partial)

                        # Tool invocations are yielded only once buffering is finished.
                        # NOTE(review): while buffering is unfinished the partial is
                        # appended to neither response_data nor partials_raw.
                        if buffering_finished?(dialect.tools, function_buffer)
                          invocation = +function_buffer.at("function_calls").to_s
                          invocation << "\n"

                          partials_raw << partial.to_s
                          response_data << invocation

                          yield invocation, cancel
                        end
                      else
                        partials_raw << partial
                        response_data << partial

                        yield partial, cancel if partial
                      end
                    rescue JSON::ParserError
                      # The chunk is kept so it can be retried together with the next one.
                      # NOTE(review): this runs once per failing partial, so the same
                      # chunk may be appended several times. Verify.
                      leftover += decoded_chunk
                    end
                  end
                end
              rescue IOError, StandardError
                # Errors raised after the caller cancelled are swallowed; others propagate.
                raise if !cancelled
              end

              return response_data
            ensure
              # `log` is nil when the request failed before the audit record was built.
              if log
                log.raw_response_payload = response_raw
                log.response_tokens = tokenizer.size(partials_raw)
                log.save!

                if Rails.env.development?
                  puts "#{self.class.name}: request_tokens #{log.request_tokens} response_tokens #{log.response_tokens}"
                end
              end
            end
          end
        end

        # Abstract hook: this base implementation always raises NotImplementedError.
        # It is not called anywhere in this class.
        #
        # @raise [NotImplementedError] always, in this base implementation
        def default_options
          raise NotImplementedError
        end

        # Abstract hook: this base implementation always raises NotImplementedError.
        # #perform_completion! stores the returned value as `provider_id` on the
        # AiApiAuditLog record it creates.
        #
        # @raise [NotImplementedError] always, in this base implementation
        def provider_id
          raise NotImplementedError
        end

        # Measures a prompt with the endpoint's tokenizer.
        #
        # The prompt first goes through #extract_prompt_for_tokenizer, then the
        # result is passed to the tokenizer's #size.
        #
        # @param prompt [Object] prompt as produced by the dialect
        # @return [Object] whatever tokenizer.size returns
        def prompt_size(prompt)
          token_counter = tokenizer
          token_counter.size(extract_prompt_for_tokenizer(prompt))
        end

        # Public reader for the tokenizer given to #initialize.
        attr_reader :tokenizer

        # Methods and readers defined after this point are protected.
        protected

        # Reader for the model name given to #initialize.
        attr_reader :model

        # Abstract hook: this base implementation always raises NotImplementedError.
        # #perform_completion! reads #host and #port from the returned object to
        # open the HTTP connection.
        #
        # @raise [NotImplementedError] always, in this base implementation
        def model_uri
          raise NotImplementedError
        end

        # Abstract hook: this base implementation always raises NotImplementedError.
        # #perform_completion! calls it as `prepare_payload(prompt, model_params, dialect)`
        # and serializes the result with #to_json to form the request body.
        #
        # The third parameter is optional so the base signature accepts that
        # three-argument call (and still accepts two-argument calls). A subclass
        # that forgets to override this therefore gets NotImplementedError rather
        # than a misleading ArgumentError.
        #
        # @param _prompt [Object] translated prompt (unused here)
        # @param _model_params [Hash] caller-supplied model parameters (unused here)
        # @param _dialect [Object, nil] dialect the prompt came from (unused here)
        # @raise [NotImplementedError] always, in this base implementation
        def prepare_payload(_prompt, _model_params, _dialect = nil)
          raise NotImplementedError
        end

        # Abstract hook: this base implementation always raises NotImplementedError.
        # #perform_completion! passes it the JSON-encoded request body and hands
        # the returned object to `http.request`.
        #
        # @param _payload [String] JSON request body (unused here)
        # @raise [NotImplementedError] always, in this base implementation
        def prepare_request(_payload)
          raise NotImplementedError
        end

        # Abstract hook: this base implementation always raises NotImplementedError.
        # #perform_completion! calls it with the full response body when not
        # streaming, and with each raw partial when streaming. In streaming mode
        # a nil result is skipped and a JSON::ParserError raised from here is rescued.
        #
        # @param _response_raw [String] raw body or raw partial (unused here)
        # @raise [NotImplementedError] always, in this base implementation
        def extract_completion_from(_response_raw)
          raise NotImplementedError
        end

        # Hook applied to every streamed chunk before it is processed by
        # #perform_completion!. This base implementation returns the chunk unchanged.
        #
        # @param chunk [String] chunk yielded by `response.read_body`
        # @return [String] the same object that was passed in
        def decode(chunk)
          chunk
        end

        # Abstract hook: this base implementation always raises NotImplementedError.
        # #perform_completion! calls it with the pending leftover text plus the
        # newly decoded chunk and iterates the result with #each; blank entries
        # are skipped by the caller.
        #
        # @param _decoded_chunk [String] text to split into raw partials (unused here)
        # @raise [NotImplementedError] always, in this base implementation
        def partials_from(_decoded_chunk)
          raise NotImplementedError
        end

        # Hook used by #prompt_size to obtain what is passed to the tokenizer.
        # This base implementation returns the prompt unchanged.
        #
        # @param prompt [Object] prompt as produced by the dialect
        # @return [Object] the same object that was passed in
        def extract_prompt_for_tokenizer(prompt)
          prompt
        end

        # Builds the empty tool-invocation skeleton as a parsed HTML5 fragment:
        # a <function_calls> element wrapping one <invoke> with empty <tool_name>,
        # <tool_id> and <parameters> children.
        #
        # @return [Object] result of Nokogiri::HTML5.fragment for the skeleton markup
        def build_buffer
          skeleton = <<~TEXT
            <function_calls>
            <invoke>
            <tool_name></tool_name>
            <tool_id></tool_id>
            <parameters></parameters>
            </invoke>
            </function_calls>
          TEXT

          Nokogiri::HTML5.fragment(skeleton)
        end

        # Tells whether the text received so far plus the new partial contains the
        # opening <function_calls> tag, i.e. whether a tool invocation has started.
        #
        # Both arguments are interpolated rather than concatenated with `+`, so a
        # nil on either side is treated as empty text instead of raising TypeError.
        #
        # @param response [String, nil] completion text accumulated so far
        # @param partial [String, nil] newly extracted completion text
        # @return [Boolean] true when the combined text includes "<function_calls>"
        def has_tool?(response, partial)
          "#{response}#{partial}".include?("<function_calls>")
        end

        # Parses the accumulated completion text into a fresh HTML5 fragment and
        # makes sure the invocation carries a <tool_id> element.
        #
        # When the parsed text has a <tool_name> but no <tool_id>, a <tool_id>
        # holding the tool name is inserted. When <tool_name> is not present yet
        # (for example the tag has not been streamed so far) the fragment is
        # returned untouched instead of raising NoMethodError on nil.
        #
        # @param function_buffer [Object] unused; kept so existing call sites keep working
        # @param response_data [String, nil] completion text accumulated so far
        # @param partial [String, nil] newly extracted completion text
        # @return [Object] fragment parsed from response_data followed by partial
        def add_to_buffer(function_buffer, response_data, partial)
          new_buffer = Nokogiri::HTML5.fragment("#{response_data}#{partial}")

          # Without a tool name there is nothing to derive an id from; with an
          # existing id there is nothing to add.
          tool_name_node = new_buffer.at("tool_name")
          return new_buffer if tool_name_node.nil? || new_buffer.at("tool_id")

          tool_id_node =
            Nokogiri::HTML5::DocumentFragment.parse("\n<tool_id>#{tool_name_node.text}</tool_id>")

          # Keep the historical placement (after the second child of <invoke>) when
          # that node exists; otherwise put the id right after <tool_name>.
          invoke_node = new_buffer.at("invoke")
          anchor = (invoke_node && invoke_node.children[1]) || tool_name_node
          anchor.add_next_sibling(tool_id_node)

          new_buffer
        end

        # Reports whether the buffered tool invocation is complete, judged by the
        # presence of the closing </function_calls> tag in the buffer's string form.
        #
        # NOTE(review): callers pass a parsed fragment; if the parser closes open
        # tags when serializing, this may be true as soon as <function_calls> is
        # seen. Confirm against the parser's behavior.
        #
        # @param _available_functions [Object] unused here
        # @param buffer [#to_s] buffer to inspect
        # @return [Boolean] true when buffer.to_s includes "</function_calls>"
        def buffering_finished?(_available_functions, buffer)
          serialized = buffer.to_s
          serialized.include?("</function_calls>")
        end
      end
    end
  end
end
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`# frozen_string_literal: true`

			`module DiscourseAi`
			`module Completions`
			`module Endpoints`
			`class Base`
			`CompletionFailed = Class.new(StandardError)`
			`TIMEOUT = 60`

			`def self.endpoint_for(model_name)`
			`# Order is important.`
			`# Bedrock has priority over Anthropic if credentials are present.`
			`[`
			`DiscourseAi::Completions::Endpoints::AwsBedrock,`
			`DiscourseAi::Completions::Endpoints::Anthropic,`
DEV: port directory structure to Zeitwerk (#319) Previous to this change we relied on explicit loading for a files in Discourse AI. This had a few downsides: - Busywork whenever you add a file (an extra require relative) - We were not keeping to conventions internally ... some places were OpenAI others are OpenAi - Autoloader did not work which lead to lots of full application broken reloads when developing. This moves all of DiscourseAI into a Zeitwerk compatible structure. It also leaves some minimal amount of manual loading (automation - which is loading into an existing namespace that may or may not be there) To avoid needing /lib/discourse_ai/... we mount a namespace thus we are able to keep /lib pointed at ::DiscourseAi Various files were renamed to get around zeitwerk rules and minimize usage of custom inflections Though we can get custom inflections to work it is not worth it, will require a Discourse core patch which means we create a hard dependency. 2023-11-28 23:17:46 -05:00			`DiscourseAi::Completions::Endpoints::OpenAi,`
			`DiscourseAi::Completions::Endpoints::HuggingFace,`
FEATURE: Support for Gemini in AiHelper / Search / Summarization (#358) 2023-12-15 12:32:01 -05:00			`DiscourseAi::Completions::Endpoints::Gemini,`
DEV: port directory structure to Zeitwerk (#319) Previous to this change we relied on explicit loading for a files in Discourse AI. This had a few downsides: - Busywork whenever you add a file (an extra require relative) - We were not keeping to conventions internally ... some places were OpenAI others are OpenAi - Autoloader did not work which lead to lots of full application broken reloads when developing. This moves all of DiscourseAI into a Zeitwerk compatible structure. It also leaves some minimal amount of manual loading (automation - which is loading into an existing namespace that may or may not be there) To avoid needing /lib/discourse_ai/... we mount a namespace thus we are able to keep /lib pointed at ::DiscourseAi Various files were renamed to get around zeitwerk rules and minimize usage of custom inflections Though we can get custom inflections to work it is not worth it, will require a Discourse core patch which means we create a hard dependency. 2023-11-28 23:17:46 -05:00			`].detect(-> { raise DiscourseAi::Completions::Llm::UNKNOWN_MODEL }) do \|ek\|`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`ek.can_contact?(model_name)`
			`end`
			`end`

			`def self.can_contact?(_model_name)`
			`raise NotImplementedError`
			`end`

			`def initialize(model_name, tokenizer)`
			`@model = model_name`
			`@tokenizer = tokenizer`
			`end`

DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00			`def perform_completion!(dialect, user, model_params = {})`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`@streaming_mode = block_given?`

DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00			`prompt = dialect.translate`

REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`Net::HTTP.start(`
			`model_uri.host,`
			`model_uri.port,`
			`use_ssl: true,`
			`read_timeout: TIMEOUT,`
			`open_timeout: TIMEOUT,`
			`write_timeout: TIMEOUT,`
			`) do \|http\|`
			`response_data = +""`
			`response_raw = +""`
DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00
			`# Needed for response token calculations. Cannot rely on response_data due to function buffering.`
			`partials_raw = +""`
			`request_body = prepare_payload(prompt, model_params, dialect).to_json`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00
			`request = prepare_request(request_body)`

			`http.request(request) do \|response\|`
			`if response.code.to_i != 200`
			`Rails.logger.error(`
			`"#{self.class.name}: status: #{response.code.to_i} - body: #{response.body}",`
			`)`
			`raise CompletionFailed`
			`end`

			`log =`
			`AiApiAuditLog.new(`
			`provider_id: provider_id,`
FIX: Use XML tags in generate_titles prompt. (#322) We must ensure we can isolate titles, and the models sometimes ignore the example we give them. Additionally, anons can generate HyDE posts, so we need to check if user is nil when attempting to log requests. 2023-11-28 10:52:22 -05:00			`user_id: user&.id,`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`raw_request_payload: request_body,`
			`request_tokens: prompt_size(prompt),`
			`)`

			`if !@streaming_mode`
			`response_raw = response.read_body`
			`response_data = extract_completion_from(response_raw)`
DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00			`partials_raw = response_data.to_s`

			`if has_tool?("", response_data)`
			`function_buffer = build_buffer # Nokogiri document`
			`function_buffer = add_to_buffer(function_buffer, "", response_data)`

			`response_data = +function_buffer.at("function_calls").to_s`
			`response_data << "\n"`
			`end`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00
			`return response_data`
			`end`

			`begin`
			`cancelled = false`
			`cancel = lambda { cancelled = true }`

			`leftover = ""`
DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00			`function_buffer = build_buffer # Nokogiri document`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00
			`response.read_body do \|chunk\|`
			`if cancelled`
			`http.finish`
			`return`
			`end`

			`decoded_chunk = decode(chunk)`
			`response_raw << decoded_chunk`

DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00			`# Buffering for extremely slow streaming.`
			`if (leftover + decoded_chunk).length < "data: [DONE]".length`
			`leftover += decoded_chunk`
			`next`
			`end`

REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`partials_from(leftover + decoded_chunk).each do \|raw_partial\|`
			`next if cancelled`
			`next if raw_partial.blank?`

			`begin`
			`partial = extract_completion_from(raw_partial)`
FIX: Many fixes for huggingface and llama2 inference (#335) 2023-12-06 09:22:42 -05:00			`next if partial.nil?`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`leftover = ""`

DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00			`if has_tool?(response_data, partial)`
			`function_buffer = add_to_buffer(function_buffer, response_data, partial)`

			`if buffering_finished?(dialect.tools, function_buffer)`
			`invocation = +function_buffer.at("function_calls").to_s`
			`invocation << "\n"`

			`partials_raw << partial.to_s`
			`response_data << invocation`

			`yield invocation, cancel`
			`end`
			`else`
			`partials_raw << partial`
			`response_data << partial`

			`yield partial, cancel if partial`
			`end`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`rescue JSON::ParserError`
DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00			`leftover += decoded_chunk`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`end`
			`end`
			`end`
			`rescue IOError, StandardError`
			`raise if !cancelled`
			`end`

			`return response_data`
			`ensure`
DEV: Don't attempt to update log if completion request fails. (#321) We already log the request failure when we raise the exception. 2023-11-28 09:15:12 -05:00			`if log`
			`log.raw_response_payload = response_raw`
DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00			`log.response_tokens = tokenizer.size(partials_raw)`
DEV: Don't attempt to update log if completion request fails. (#321) We already log the request failure when we raise the exception. 2023-11-28 09:15:12 -05:00			`log.save!`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00
DEV: Don't attempt to update log if completion request fails. (#321) We already log the request failure when we raise the exception. 2023-11-28 09:15:12 -05:00			`if Rails.env.development?`
			`puts "#{self.class.name}: request_tokens #{log.request_tokens} response_tokens #{log.response_tokens}"`
			`end`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`end`
			`end`
			`end`
			`end`

			`def default_options`
			`raise NotImplementedError`
			`end`

			`def provider_id`
			`raise NotImplementedError`
			`end`

			`def prompt_size(prompt)`
			`tokenizer.size(extract_prompt_for_tokenizer(prompt))`
			`end`

			`attr_reader :tokenizer`

			`protected`

			`attr_reader :model`

			`def model_uri`
			`raise NotImplementedError`
			`end`

			`def prepare_payload(_prompt, _model_params)`
			`raise NotImplementedError`
			`end`

			`def prepare_request(_payload)`
			`raise NotImplementedError`
			`end`

			`def extract_completion_from(_response_raw)`
			`raise NotImplementedError`
			`end`

			`def decode(chunk)`
			`chunk`
			`end`

			`def partials_from(_decoded_chunk)`
			`raise NotImplementedError`
			`end`

			`def extract_prompt_for_tokenizer(prompt)`
			`prompt`
			`end`
DEV: Tool support for the LLM service. (#366) This PR adds tool support to available LLMs. We'll buffer tool invocations and return them instead of making users of this service parse the response. It also adds support for conversation context in the generic prompt. It includes bot messages, user messages, and tool invocations, which we'll trim to make sure it doesn't exceed the prompt limit, then translate them to the correct dialect. Finally, It adds some buffering when reading chunks to handle cases when streaming is extremely slow.:M 2023-12-18 16:06:01 -05:00
			`def build_buffer`
			`Nokogiri::HTML5.fragment(<<~TEXT)`
			`<function_calls>`
			`<invoke>`
			`<tool_name></tool_name>`
			`<tool_id></tool_id>`
			`<parameters></parameters>`
			`</invoke>`
			`</function_calls>`
			`TEXT`
			`end`

			`def has_tool?(response, partial)`
			`(response + partial).include?("<function_calls>")`
			`end`

			`def add_to_buffer(function_buffer, response_data, partial)`
			`new_buffer = Nokogiri::HTML5.fragment(response_data + partial)`
			`if tool_name = new_buffer.at("tool_name").text`
			`if new_buffer.at("tool_id").nil?`
			`tool_id_node =`
			`Nokogiri::HTML5::DocumentFragment.parse("\n<tool_id>#{tool_name}</tool_id>")`

			`new_buffer.at("invoke").children[1].add_next_sibling(tool_id_node)`
			`end`
			`end`

			`new_buffer`
			`end`

			`def buffering_finished?(_available_functions, buffer)`
			`buffer.to_s.include?("</function_calls>")`
			`end`
REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297) * DEV: One LLM abstraction to rule them all * REFACTOR: HyDE search uses new LLM abstraction * REFACTOR: Summarization uses the LLM abstraction * Updated documentation and made small fixes. Remove Bedrock claude-2 restriction 2023-11-23 10:58:54 -05:00			`end`
			`end`
			`end`
			`end`