REFACTOR: Summarization and HyDE now use an LLM abstraction. (#297)

* DEV: One LLM abstraction to rule them all

* REFACTOR: HyDE search uses new LLM abstraction

* REFACTOR: Summarization uses the LLM abstraction

* Updated documentation and made small fixes. Removed the Bedrock claude-2 restriction.
Roman Rizzi 2023-11-23 12:58:54 -03:00 committed by GitHub
parent 53b7f031ba
commit 3064d4c288
47 changed files with 1679 additions and 1040 deletions
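
For reviewers, here is a minimal usage sketch of the facade this commit introduces (API per lib/completions/llm.rb below; the model name, the prompt contents, and the use of Discourse.system_user are illustrative assumptions, not part of the commit):

llm = DiscourseAi::Completions::LLM.proxy("claude-2")

prompt = {
  insts: "You are a summarization bot.",                      # system-style instructions
  input: "Summarize the text inside the tags: <input>...</input>",
  post_insts: "Reply only with the summary.",                  # optional trailing instructions
}

# Blocking call; pass a block to receive streamed partials alongside a cancel proc instead.
summary = llm.completion!(prompt, Discourse.system_user)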

View File

@ -0,0 +1,35 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Dialects
class ChatGPT
def self.can_translate?(model_name)
%w[gpt-3.5-turbo gpt-4 gpt-3.5-turbo-16k gpt-4-32k].include?(model_name)
end
def translate(generic_prompt)
open_ai_prompt = [
{
role: "system",
content: [generic_prompt[:insts], generic_prompt[:post_insts].to_s].join("\n"),
},
]
if generic_prompt[:examples]
generic_prompt[:examples].each do |example_pair|
open_ai_prompt << { role: "user", content: example_pair.first }
open_ai_prompt << { role: "assistant", content: example_pair.second }
end
end
open_ai_prompt << { role: "user", content: generic_prompt[:input] }
end
def tokenizer
DiscourseAi::Tokenizer::OpenAiTokenizer
end
end
end
end
end

View File

@ -0,0 +1,37 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Dialects
class Claude
def self.can_translate?(model_name)
%w[claude-instant-1 claude-2].include?(model_name)
end
def translate(generic_prompt)
claude_prompt = +"Human: #{generic_prompt[:insts]}\n"
claude_prompt << build_examples(generic_prompt[:examples]) if generic_prompt[:examples]
claude_prompt << "#{generic_prompt[:input]}\n"
claude_prompt << "#{generic_prompt[:post_insts]}\n" if generic_prompt[:post_insts]
claude_prompt << "Assistant:\n"
end
def tokenizer
DiscourseAi::Tokenizer::AnthropicTokenizer
end
private
def build_examples(examples_arr)
examples_arr.reduce("") do |memo, example|
memo += "<example>\nH: #{example[0]}\nA: #{example[1]}\n</example>\n"
end
end
end
end
end
end

View File

@ -0,0 +1,31 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Dialects
class Llama2Classic
def self.can_translate?(model_name)
"Llama2-*-chat-hf" == model_name
end
def translate(generic_prompt)
llama2_prompt =
+"[INST]<<SYS>>#{[generic_prompt[:insts], generic_prompt[:post_insts].to_s].join("\n")}<</SYS>>[/INST]\n"
if generic_prompt[:examples]
generic_prompt[:examples].each do |example_pair|
llama2_prompt << "[INST]#{example_pair.first}[/INST]\n"
llama2_prompt << "#{example_pair.second}\n"
end
end
llama2_prompt << "[INST]#{generic_prompt[:input]}[/INST]\n"
end
def tokenizer
DiscourseAi::Tokenizer::Llama2Tokenizer
end
end
end
end
end

View File

@ -0,0 +1,33 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Dialects
class OrcaStyle
def self.can_translate?(model_name)
%w[StableBeluga2 Upstage-Llama-2-*-instruct-v2].include?(model_name)
end
def translate(generic_prompt)
orca_style_prompt =
+"### System:\n#{[generic_prompt[:insts], generic_prompt[:post_insts].to_s].join("\n")}\n"
if generic_prompt[:examples]
generic_prompt[:examples].each do |example_pair|
orca_style_prompt << "### User:\n#{example_pair.first}\n"
orca_style_prompt << "### Assistant:\n#{example_pair.second}\n"
end
end
orca_style_prompt << "### User:\n#{generic_prompt[:input]}\n"
orca_style_prompt << "### Assistant:\n"
end
def tokenizer
DiscourseAi::Tokenizer::Llama2Tokenizer
end
end
end
end
end

View File

@ -0,0 +1,52 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Endpoints
class Anthropic < Base
def self.can_contact?(model_name)
%w[claude-instant-1 claude-2].include?(model_name)
end
def default_options
{ max_tokens_to_sample: 2000, model: model }
end
def provider_id
AiApiAuditLog::Provider::Anthropic
end
private
def model_uri
@uri ||= URI("https://api.anthropic.com/v1/complete")
end
def prepare_payload(prompt, model_params)
default_options
.merge(model_params)
.merge(prompt: prompt)
.tap { |payload| payload[:stream] = true if @streaming_mode }
end
def prepare_request(payload)
headers = {
"anthropic-version" => "2023-06-01",
"x-api-key" => SiteSetting.ai_anthropic_api_key,
"content-type" => "application/json",
}
Net::HTTP::Post.new(model_uri, headers).tap { |r| r.body = payload }
end
def extract_completion_from(response_raw)
JSON.parse(response_raw, symbolize_names: true)[:completion].to_s
end
def partials_from(decoded_chunk)
decoded_chunk.split("\n").map { |line| line.split("data: ", 2)[1] }.compact
end
end
end
end
end

View File

@ -0,0 +1,86 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Endpoints
class AwsBedrock < Base
def self.can_contact?(model_name)
SiteSetting.ai_bedrock_access_key_id.present? &&
SiteSetting.ai_bedrock_secret_access_key.present? &&
SiteSetting.ai_bedrock_region.present?
end
def default_options
{ max_tokens_to_sample: 20_000 }
end
def provider_id
AiApiAuditLog::Provider::Anthropic
end
private
def model_uri
api_url =
"https://bedrock-runtime.#{SiteSetting.ai_bedrock_region}.amazonaws.com/model/anthropic.#{model}/invoke"
api_url = @streaming_mode ? (api_url + "-with-response-stream") : api_url
URI(api_url)
end
def prepare_payload(prompt, model_params)
default_options.merge(prompt: prompt).merge(model_params)
end
def prepare_request(payload)
headers = { "content-type" => "application/json", "Accept" => "*/*" }
signer =
Aws::Sigv4::Signer.new(
access_key_id: SiteSetting.ai_bedrock_access_key_id,
region: SiteSetting.ai_bedrock_region,
secret_access_key: SiteSetting.ai_bedrock_secret_access_key,
service: "bedrock",
)
Net::HTTP::Post
.new(model_uri, headers)
.tap do |r|
r.body = payload
signed_request =
signer.sign_request(req: r, http_method: r.method, url: model_uri, body: r.body)
r.initialize_http_header(headers.merge(signed_request.headers))
end
end
def decode(chunk)
Aws::EventStream::Decoder
.new
.decode_chunk(chunk)
.first
.payload
.string
.then { JSON.parse(_1) }
.dig("bytes")
.then { Base64.decode64(_1) }
rescue JSON::ParserError,
Aws::EventStream::Errors::MessageChecksumError,
Aws::EventStream::Errors::PreludeChecksumError => e
Rails.logger.error("#{self.class.name}: #{e.message}")
nil
end
def extract_completion_from(response_raw)
JSON.parse(response_raw, symbolize_names: true)[:completion].to_s
end
def partials_from(decoded_chunk)
[decoded_chunk]
end
end
end
end
end

View File

@ -0,0 +1,167 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Endpoints
class Base
CompletionFailed = Class.new(StandardError)
TIMEOUT = 60
def self.endpoint_for(model_name)
# Order is important.
# Bedrock has priority over Anthropic if credentials are present.
[
DiscourseAi::Completions::Endpoints::AwsBedrock,
DiscourseAi::Completions::Endpoints::Anthropic,
DiscourseAi::Completions::Endpoints::OpenAI,
DiscourseAi::Completions::Endpoints::Huggingface,
].detect(-> { raise DiscourseAi::Completions::LLM::UNKNOWN_MODEL }) do |ek|
ek.can_contact?(model_name)
end
end
def self.can_contact?(_model_name)
raise NotImplementedError
end
def initialize(model_name, tokenizer)
@model = model_name
@tokenizer = tokenizer
end
def perform_completion!(prompt, user, model_params = {})
@streaming_mode = block_given?
Net::HTTP.start(
model_uri.host,
model_uri.port,
use_ssl: true,
read_timeout: TIMEOUT,
open_timeout: TIMEOUT,
write_timeout: TIMEOUT,
) do |http|
response_data = +""
response_raw = +""
request_body = prepare_payload(prompt, model_params).to_json
request = prepare_request(request_body)
http.request(request) do |response|
if response.code.to_i != 200
Rails.logger.error(
"#{self.class.name}: status: #{response.code.to_i} - body: #{response.body}",
)
raise CompletionFailed
end
log =
AiApiAuditLog.new(
provider_id: provider_id,
user_id: user.id,
raw_request_payload: request_body,
request_tokens: prompt_size(prompt),
)
if !@streaming_mode
response_raw = response.read_body
response_data = extract_completion_from(response_raw)
return response_data
end
begin
cancelled = false
cancel = lambda { cancelled = true }
leftover = ""
response.read_body do |chunk|
if cancelled
http.finish
return
end
decoded_chunk = decode(chunk)
response_raw << decoded_chunk
partials_from(leftover + decoded_chunk).each do |raw_partial|
next if cancelled
next if raw_partial.blank?
begin
partial = extract_completion_from(raw_partial)
leftover = ""
response_data << partial
yield partial, cancel if partial
rescue JSON::ParserError
leftover = raw_partial
end
end
end
rescue IOError, StandardError
raise if !cancelled
end
return response_data
ensure
log.raw_response_payload = response_raw
log.response_tokens = tokenizer.size(response_data)
log.save!
if Rails.env.development? && log
puts "#{self.class.name}: request_tokens #{log.request_tokens} response_tokens #{log.response_tokens}"
end
end
end
end
def default_options
raise NotImplementedError
end
def provider_id
raise NotImplementedError
end
def prompt_size(prompt)
tokenizer.size(extract_prompt_for_tokenizer(prompt))
end
attr_reader :tokenizer
protected
attr_reader :model
def model_uri
raise NotImplementedError
end
def prepare_payload(_prompt, _model_params)
raise NotImplementedError
end
def prepare_request(_payload)
raise NotImplementedError
end
def extract_completion_from(_response_raw)
raise NotImplementedError
end
def decode(chunk)
chunk
end
def partials_from(_decoded_chunk)
raise NotImplementedError
end
def extract_prompt_for_tokenizer(prompt)
prompt
end
end
end
end
end

View File

@ -0,0 +1,47 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Endpoints
class CannedResponse
CANNED_RESPONSE_ERROR = Class.new(StandardError)
def self.can_contact?(_)
Rails.env.test?
end
def initialize(responses)
@responses = responses
@completions = 0
end
attr_reader :responses, :completions
def perform_completion!(_prompt, _user, _model_params)
response = responses[completions]
if response.nil?
raise CANNED_RESPONSE_ERROR,
"The number of completions you requested exceed the number of canned responses"
end
@completions += 1
if block_given?
cancelled = false
cancel_fn = lambda { cancelled = true }
response.each_char do |char|
break if cancelled
yield(char, cancel_fn)
end
else
response
end
end
def tokenizer
DiscourseAi::Tokenizer::OpenAiTokenizer
end
end
end
end
end

View File

@ -0,0 +1,75 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Endpoints
class Huggingface < Base
def self.can_contact?(model_name)
%w[StableBeluga2 Upstage-Llama-2-*-instruct-v2 Llama2-*-chat-hf].include?(model_name)
end
def default_options
{ parameters: { repetition_penalty: 1.1, temperature: 0.7 } }
end
def provider_id
AiApiAuditLog::Provider::HuggingFaceTextGeneration
end
private
def model_uri
URI(SiteSetting.ai_hugging_face_api_url).tap do |uri|
uri.path = @streaming_mode ? "/generate_stream" : "/generate"
end
end
def prepare_payload(prompt, model_params)
default_options
.merge(inputs: prompt)
.tap do |payload|
payload[:parameters].merge!(model_params)
token_limit = 2_000 || SiteSetting.ai_hugging_face_token_limit
payload[:parameters][:max_new_tokens] = token_limit - prompt_size(prompt)
end
end
def prepare_request(payload)
headers =
{ "Content-Type" => "application/json" }.tap do |h|
if SiteSetting.ai_hugging_face_api_key.present?
h["Authorization"] = "Bearer #{SiteSetting.ai_hugging_face_api_key}"
end
end
Net::HTTP::Post.new(model_uri, headers).tap { |r| r.body = payload }
end
def extract_completion_from(response_raw)
parsed = JSON.parse(response_raw, symbolize_names: true)
if @streaming_mode
# The last chunk contains the full response, which we already yielded.
return if parsed.dig(:token, :special)
parsed.dig(:token, :text).to_s
else
parsed[:generated_text].to_s
end
end
def partials_from(decoded_chunk)
decoded_chunk
.split("\n")
.map do |line|
data = line.split("data: ", 2)[1]
data&.squish == "[DONE]" ? nil : data
end
.compact
end
end
end
end
end

View File

@ -0,0 +1,92 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
module Endpoints
class OpenAI < Base
def self.can_contact?(model_name)
%w[gpt-3.5-turbo gpt-4 gpt-3.5-turbo-16k gpt-4-32k].include?(model_name)
end
def default_options
{ model: model }
end
def provider_id
AiApiAuditLog::Provider::OpenAI
end
private
def model_uri
url =
if model.include?("gpt-4")
if model.include?("32k")
SiteSetting.ai_openai_gpt4_32k_url
else
SiteSetting.ai_openai_gpt4_url
end
else
if model.include?("16k")
SiteSetting.ai_openai_gpt35_16k_url
else
SiteSetting.ai_openai_gpt35_url
end
end
URI(url)
end
def prepare_payload(prompt, model_params)
default_options
.merge(model_params)
.merge(messages: prompt)
.tap { |payload| payload[:stream] = true if @streaming_mode }
end
def prepare_request(payload)
headers =
{ "Content-Type" => "application/json" }.tap do |h|
if model_uri.host.include?("azure")
h["api-key"] = SiteSetting.ai_openai_api_key
else
h["Authorization"] = "Bearer #{SiteSetting.ai_openai_api_key}"
end
if SiteSetting.ai_openai_organization.present?
h["OpenAI-Organization"] = SiteSetting.ai_openai_organization
end
end
Net::HTTP::Post.new(model_uri, headers).tap { |r| r.body = payload }
end
def extract_completion_from(response_raw)
parsed = JSON.parse(response_raw, symbolize_names: true)
(
if @streaming_mode
parsed.dig(:choices, 0, :delta, :content)
else
parsed.dig(:choices, 0, :message, :content)
end
).to_s
end
def partials_from(decoded_chunk)
decoded_chunk
.split("\n")
.map do |line|
data = line.split("data: ", 2)[1]
data == "[DONE]" ? nil : data
end
.compact
end
def extract_prompt_for_tokenizer(prompt)
prompt.map { |message| message[:content] || message["content"] || "" }.join("\n")
end
end
end
end
end

View File

@ -0,0 +1,26 @@
# frozen_string_literal: true
module DiscourseAi
module Completions
class EntryPoint
def load_files
require_relative "dialects/chat_gpt"
require_relative "dialects/llama2_classic"
require_relative "dialects/orca_style"
require_relative "dialects/claude"
require_relative "endpoints/canned_response"
require_relative "endpoints/base"
require_relative "endpoints/anthropic"
require_relative "endpoints/aws_bedrock"
require_relative "endpoints/open_ai"
require_relative "endpoints/hugging_face"
require_relative "llm"
end
def inject_into(_)
end
end
end
end

lib/completions/llm.rb
View File

@ -0,0 +1,81 @@
# frozen_string_literal: true
# A facade that abstracts multiple LLMs behind a single interface.
#
# Internally, it consists of the combination of a dialect and an endpoint.
# After receiving a prompt in our generic format, it translates it to
# the target model and routes the completion request through the correct gateway.
#
# Use the .proxy method to instantiate an object.
# It chooses the best dialect and endpoint for the model you want to interact with.
#
# Tests of modules that perform LLM calls can use .with_prepared_responses to return canned responses
# instead of relying on WebMock stubs like we did in the past.
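# An illustrative spec usage (the method under test is hypothetical; the block
# form mirrors the helper defined below and the specs in this commit):
#
#   DiscourseAi::Completions::LLM.with_prepared_responses(["<ai>canned</ai>"]) do
#     subject.summarize(content, user)
#   end
#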
#
module DiscourseAi
module Completions
class LLM
UNKNOWN_MODEL = Class.new(StandardError)
def self.with_prepared_responses(responses)
@canned_response = DiscourseAi::Completions::Endpoints::CannedResponse.new(responses)
yield(@canned_response).tap { @canned_response = nil }
end
def self.proxy(model_name)
dialects = [
DiscourseAi::Completions::Dialects::Claude,
DiscourseAi::Completions::Dialects::Llama2Classic,
DiscourseAi::Completions::Dialects::ChatGPT,
DiscourseAi::Completions::Dialects::OrcaStyle,
]
dialect =
dialects.detect(-> { raise UNKNOWN_MODEL }) { |d| d.can_translate?(model_name) }.new
return new(dialect, @canned_response, model_name) if @canned_response
gateway =
DiscourseAi::Completions::Endpoints::Base.endpoint_for(model_name).new(
model_name,
dialect.tokenizer,
)
new(dialect, gateway, model_name)
end
def initialize(dialect, gateway, model_name)
@dialect = dialect
@gateway = gateway
@model_name = model_name
end
delegate :tokenizer, to: :dialect
# @param generic_prompt { Hash } - Prompt using our generic format.
# We use the following keys from the hash:
# - insts: String with instructions for the LLM.
# - input: String containing user input
# - examples (optional): Array of arrays with examples of input and responses. Each array is an input/response pair like [[example1, response1], [example2, response2]].
# - post_insts (optional): Additional instructions for the LLM. Some dialects like Claude add these at the end of the prompt.
#
# @param user { User } - User requesting the summary.
#
# @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
#
# @returns { String } - Completion result.
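# An illustrative generic_prompt hash (values are hypothetical; the keys follow the
# documentation above and the conventions used by the dialect specs):
#
#   {
#     insts: "You are a summarization bot.",
#     input: "Here is the text, inside <input></input> XML tags: <input>...</input>",
#     examples: [["<input>example text</input>", "<ai>example summary</ai>"]],
#     post_insts: "Reply with the summary inside <ai></ai> tags.",
#   }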
def completion!(generic_prompt, user, &partial_read_blk)
prompt = dialect.translate(generic_prompt)
model_params = generic_prompt.dig(:params, model_name) || {}
gateway.perform_completion!(prompt, user, model_params, &partial_read_blk)
end
private
attr_reader :dialect, :gateway, :model_name
end
end
end

View File

@ -15,11 +15,6 @@ module DiscourseAi
require_relative "semantic_related"
require_relative "semantic_topic_query"
require_relative "hyde_generators/base"
require_relative "hyde_generators/openai"
require_relative "hyde_generators/anthropic"
require_relative "hyde_generators/llama2"
require_relative "hyde_generators/llama2_ftos"
require_relative "semantic_search"
end

View File

@ -1,37 +0,0 @@
# frozen_string_literal: true
module DiscourseAi
module Embeddings
module HydeGenerators
class Anthropic < DiscourseAi::Embeddings::HydeGenerators::Base
def prompt(search_term)
<<~TEXT
Human: Given a search term given between <input> tags, generate a forum post about a given subject.
#{basic_prompt_instruction}
<input>#{search_term}</input>
Respond with the generated post between <ai> tags.
Assistant:\n
TEXT
end
def models
%w[claude-instant-1 claude-2]
end
def hypothetical_post_from(query)
response =
::DiscourseAi::Inference::AnthropicCompletions.perform!(
prompt(query),
SiteSetting.ai_embeddings_semantic_search_hyde_model,
max_tokens: 400,
stop_sequences: ["</ai>"],
).dig(:completion)
Nokogiri::HTML5.fragment(response).at("ai").text
end
end
end
end
end

View File

@ -1,28 +0,0 @@
# frozen_string_literal: true
module DiscourseAi
module Embeddings
module HydeGenerators
class Base
def self.current_hyde_model
DiscourseAi::Embeddings::HydeGenerators::Base.descendants.find do |generator_klass|
generator_klass.new.models.include?(
SiteSetting.ai_embeddings_semantic_search_hyde_model,
)
end
end
def basic_prompt_instruction
<<~TEXT
Act as a content writer for a forum.
The forum description is as follows:
#{SiteSetting.title}
#{SiteSetting.site_description}
Given the forum description write a forum post about the following subject:
TEXT
end
end
end
end
end

View File

@ -1,35 +0,0 @@
# frozen_string_literal: true
module DiscourseAi
module Embeddings
module HydeGenerators
class Llama2 < DiscourseAi::Embeddings::HydeGenerators::Base
def prompt(search_term)
<<~TEXT
[INST] <<SYS>>
You are a helpful bot
You create forum posts about a given subject
<</SYS>>
#{basic_prompt_instruction}
#{search_term}
[/INST]
Here is a forum post about the above subject:
TEXT
end
def models
["Llama2-*-chat-hf"]
end
def hypothetical_post_from(query)
::DiscourseAi::Inference::HuggingFaceTextGeneration.perform!(
prompt(query),
SiteSetting.ai_embeddings_semantic_search_hyde_model,
token_limit: 400,
).dig(:generated_text)
end
end
end
end
end

View File

@ -1,28 +0,0 @@
# frozen_string_literal: true
module DiscourseAi
module Embeddings
module HydeGenerators
class Llama2Ftos < DiscourseAi::Embeddings::HydeGenerators::Llama2
def prompt(search_term)
<<~TEXT
### System:
You are a helpful bot
You create forum posts about a given subject
### User:
#{basic_prompt_instruction}
#{search_term}
### Assistant:
Here is a forum post about the above subject:
TEXT
end
def models
%w[StableBeluga2 Upstage-Llama-2-*-instruct-v2]
end
end
end
end
end

View File

@ -1,31 +0,0 @@
# frozen_string_literal: true
module DiscourseAi
module Embeddings
module HydeGenerators
class OpenAi < DiscourseAi::Embeddings::HydeGenerators::Base
def prompt(search_term)
[
{
role: "system",
content: "You are a helpful bot. You create forum posts about a given subject.",
},
{ role: "user", content: "#{basic_prompt_instruction}\n#{search_term}" },
]
end
def models
%w[gpt-3.5-turbo gpt-4]
end
def hypothetical_post_from(query)
::DiscourseAi::Inference::OpenAiCompletions.perform!(
prompt(query),
SiteSetting.ai_embeddings_semantic_search_hyde_model,
max_tokens: 400,
).dig(:choices, 0, :message, :content)
end
end
end
end
end

View File

@ -55,10 +55,7 @@ module DiscourseAi
hypothetical_post =
Discourse
.cache
.fetch(hyde_key, expires_in: 1.week) do
hyde_generator = DiscourseAi::Embeddings::HydeGenerators::Base.current_hyde_model.new
hyde_generator.hypothetical_post_from(search_term)
end
.fetch(hyde_key, expires_in: 1.week) { hypothetical_post_from(search_term) }
hypothetical_post_embedding =
Discourse
@ -96,6 +93,30 @@ module DiscourseAi
def build_embedding_key(digest, hyde_model, embedding_model)
"#{build_hyde_key(digest, hyde_model)}-#{embedding_model}"
end
def hypothetical_post_from(search_term)
prompt = {
insts: <<~TEXT,
You are a content creator for a forum. The forum description is as follows:
#{SiteSetting.title}
#{SiteSetting.site_description}
Given the forum description write a forum post about the following subject:
TEXT
input: <<~TEXT,
Using this description, write a forum post about the subject inside the <input></input> XML tags:
<input>#{search_term}</input>
TEXT
post_insts: "Put the forum post between <ai></ai> tags.",
}
llm_response =
DiscourseAi::Completions::LLM.proxy(
SiteSetting.ai_embeddings_semantic_search_hyde_model,
).completion!(prompt, @guardian.user)
Nokogiri::HTML5.fragment(llm_response).at("ai").text
end
end
end
end

View File

@ -21,7 +21,6 @@ module DiscourseAi
Models::OpenAi.new("gpt-4-32k", max_tokens: 32_768),
Models::OpenAi.new("gpt-3.5-turbo", max_tokens: 4096),
Models::OpenAi.new("gpt-3.5-turbo-16k", max_tokens: 16_384),
Models::Discourse.new("long-t5-tglobal-base-16384-book-summary", max_tokens: 16_384),
Models::Anthropic.new("claude-2", max_tokens: 100_000),
Models::Anthropic.new("claude-instant-1", max_tokens: 100_000),
Models::Llama2.new("Llama2-chat-hf", max_tokens: SiteSetting.ai_hugging_face_token_limit),
@ -36,6 +35,7 @@ module DiscourseAi
end
truncable_models = [
Models::Discourse.new("long-t5-tglobal-base-16384-book-summary", max_tokens: 16_384),
Models::Discourse.new("bart-large-cnn-samsum", max_tokens: 1024),
Models::Discourse.new("flan-t5-base-samsum", max_tokens: 512),
]

View File

@ -19,109 +19,6 @@ module DiscourseAi
setting: "ai_anthropic_api_key",
)
end
def concatenate_summaries(summaries, &on_partial_blk)
instructions = <<~TEXT
Human: Concatenate the following disjoint summaries inside the given input tags, creating a cohesive narrative.
Include only the summary inside <ai> tags.
TEXT
instructions += summaries.reduce("") { |m, s| m += "<input>#{s}</input>\n" }
instructions += "Assistant:\n"
completion(instructions, &on_partial_blk)
end
def summarize_with_truncation(contents, opts, &on_partial_blk)
instructions = build_base_prompt(opts)
text_to_summarize = contents.map { |c| format_content_item(c) }.join
truncated_content = tokenizer.truncate(text_to_summarize, available_tokens)
instructions += "<input>#{truncated_content}</input>\nAssistant:\n"
completion(instructions, &on_partial_blk)
end
def summarize_single(chunk_text, opts, &on_partial_blk)
summarize_chunk(chunk_text, opts.merge(single_chunk: true), &on_partial_blk)
end
private
def summarize_chunk(chunk_text, opts, &on_partial_blk)
completion(
build_base_prompt(opts) + "<input>#{chunk_text}</input>\nAssistant:\n",
&on_partial_blk
)
end
def build_base_prompt(opts)
initial_instruction =
if opts[:single_chunk]
"Summarize the following forum discussion inside the given <input> tag, creating a cohesive narrative."
else
"Summarize the following forum discussion inside the given <input> tag."
end
base_prompt = <<~TEXT
Human: #{initial_instruction}
Try to keep the summary in the same language as the forum discussion.
Format the response, including links, using markdown.
TEXT
base_prompt += <<~TEXT if opts[:resource_path]
Try generating links as well the format is #{opts[:resource_path]}/POST_ID
For example, a link to the 3rd post in the topic would be [post 3](#{opts[:resource_path]}/3)
TEXT
base_prompt += "Wrap the whole the summary inside <ai> tags.\n"
base_prompt += "The discussion title is: #{opts[:content_title]}.\n" if opts[
:content_title
]
base_prompt += "Don't use more than 400 words.\n" unless opts[:single_chunk]
base_prompt
end
def completion(prompt, &on_partial_blk)
# We need to discard any text that might come before the <ai> tag.
# Instructing the model to reply only with the summary seems impossible.
pre_tag_partial = +""
if on_partial_blk
on_partial_read =
Proc.new do |partial|
if pre_tag_partial.include?("<ai>")
on_partial_blk.call(partial[:completion])
else
pre_tag_partial << partial[:completion]
end
end
response =
::DiscourseAi::Inference::AnthropicCompletions.perform!(
prompt,
model,
&on_partial_read
)
else
response =
::DiscourseAi::Inference::AnthropicCompletions.perform!(prompt, model).dig(
:completion,
)
end
Nokogiri::HTML5.fragment(response).at("ai")&.text.presence || response
end
def tokenizer
DiscourseAi::Tokenizer::AnthropicTokenizer
end
attr_reader :max_tokens
end
end
end

View File

@ -21,29 +21,6 @@ module DiscourseAi
raise NotImplemented
end
def summarize_in_chunks(chunks, opts)
chunks.map do |chunk|
chunk[:summary] = summarize_chunk(chunk[:summary], opts)
chunk
end
end
def concatenate_summaries(_summaries)
raise NotImplemented
end
def summarize_with_truncation(_contents, _opts)
raise NotImplemented
end
def summarize_single(chunk_text, opts)
raise NotImplemented
end
def format_content_item(item)
"(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
end
def available_tokens
max_tokens - reserved_tokens
end
@ -57,16 +34,6 @@ module DiscourseAi
# ~500 words
700
end
def summarize_chunk(_chunk_text, _opts)
raise NotImplemented
end
def tokenizer
raise NotImplemented
end
delegate :can_expand_tokens?, to: :tokenizer
end
end
end

View File

@ -22,44 +22,11 @@ module DiscourseAi
)
end
def concatenate_summaries(summaries)
completion(summaries.join("\n"))
end
def summarize_with_truncation(contents, opts)
text_to_summarize = contents.map { |c| format_content_item(c) }.join
truncated_content =
::DiscourseAi::Tokenizer::BertTokenizer.truncate(text_to_summarize, available_tokens)
completion(truncated_content)
end
def summarize_single(chunk_text, _opts)
completion(chunk_text)
end
private
def summarize_chunk(chunk_text, _opts)
completion(chunk_text)
end
def reserved_tokens
0
end
def completion(prompt)
::DiscourseAi::Inference::DiscourseClassifier.perform!(
"#{SiteSetting.ai_summarization_discourse_service_api_endpoint}/api/v1/classify",
model,
prompt,
SiteSetting.ai_summarization_discourse_service_api_key,
).dig(:summary_text)
end
def tokenizer
DiscourseAi::Tokenizer::BertTokenizer
end
end
end
end

View File

@ -19,104 +19,6 @@ module DiscourseAi
setting: "ai_hugging_face_api_url",
)
end
def concatenate_summaries(summaries, &on_partial_blk)
prompt = <<~TEXT
[INST] <<SYS>>
You are a helpful bot
<</SYS>>
Concatenate these disjoint summaries, creating a cohesive narrative:
#{summaries.join("\n")} [/INST]
TEXT
completion(prompt, &on_partial_blk)
end
def summarize_with_truncation(contents, opts, &on_partial_blk)
text_to_summarize = contents.map { |c| format_content_item(c) }.join
truncated_content = tokenizer.truncate(text_to_summarize, available_tokens)
prompt = <<~TEXT
[INST] <<SYS>>
#{build_base_prompt(opts)}
<</SYS>>
Summarize the following in up to 400 words:
#{truncated_content} [/INST]
Here is a summary of the above topic:
TEXT
completion(prompt, &on_partial_blk)
end
def summarize_single(chunk_text, opts, &on_partial_blk)
summarize_chunk(chunk_text, opts.merge(single_chunk: true), &on_partial_blk)
end
private
def summarize_chunk(chunk_text, opts, &on_partial_blk)
summary_instruction =
if opts[:single_chunk]
"Summarize the following forum discussion, creating a cohesive narrative:"
else
"Summarize the following in up to 400 words:"
end
prompt = <<~TEXT
[INST] <<SYS>>
#{build_base_prompt(opts)}
<</SYS>>
#{summary_instruction}
#{chunk_text} [/INST]
Here is a summary of the above topic:
TEXT
completion(prompt, &on_partial_blk)
end
def build_base_prompt(opts)
base_prompt = <<~TEXT
You are a summarization bot.
You effectively summarise any text and reply ONLY with ONLY the summarized text.
You condense it into a shorter version.
You understand and generate Discourse forum Markdown.
TEXT
if opts[:resource_path]
base_prompt +=
"Try generating links as well the format is #{opts[:resource_path]}. eg: [ref](#{opts[:resource_path]}/77)\n"
end
base_prompt += "The discussion title is: #{opts[:content_title]}.\n" if opts[
:content_title
]
base_prompt
end
def completion(prompt, &on_partial_blk)
if on_partial_blk
on_partial_read =
Proc.new { |partial| on_partial_blk.call(partial.dig(:token, :text).to_s) }
::DiscourseAi::Inference::HuggingFaceTextGeneration.perform!(
prompt,
model,
&on_partial_read
)
else
::DiscourseAi::Inference::HuggingFaceTextGeneration.perform!(prompt, model).dig(
:generated_text,
)
end
end
def tokenizer
DiscourseAi::Tokenizer::Llama2Tokenizer
end
end
end
end

View File

@ -7,65 +7,6 @@ module DiscourseAi
def display_name
"Llama2FineTunedOrcaStyle's #{SiteSetting.ai_hugging_face_model_display_name.presence || model}"
end
def concatenate_summaries(summaries, &on_partial_blk)
prompt = <<~TEXT
### System:
You are a helpful bot
### User:
Concatenate these disjoint summaries, creating a cohesive narrative:
#{summaries.join("\n")}
### Assistant:
TEXT
completion(prompt, &on_partial_blk)
end
def summarize_with_truncation(contents, opts, &on_partial_blk)
text_to_summarize = contents.map { |c| format_content_item(c) }.join
truncated_content = tokenizer.truncate(text_to_summarize, available_tokens)
prompt = <<~TEXT
### System:
#{build_base_prompt(opts)}
### User:
Summarize the following in up to 400 words:
#{truncated_content}
### Assistant:
Here is a summary of the above topic:
TEXT
completion(prompt, &on_partial_blk)
end
private
def summarize_chunk(chunk_text, opts, &on_partial_blk)
summary_instruction =
if opts[:single_chunk]
"Summarize the following forum discussion, creating a cohesive narrative:"
else
"Summarize the following in up to 400 words:"
end
prompt = <<~TEXT
### System:
#{build_base_prompt(opts)}
### User:
#{summary_instruction}
#{chunk_text}
### Assistant:
Here is a summary of the above topic:
TEXT
completion(prompt, &on_partial_blk)
end
end
end
end

View File

@ -19,100 +19,6 @@ module DiscourseAi
setting: "ai_openai_api_key",
)
end
def concatenate_summaries(summaries, &on_partial_blk)
messages = [
{ role: "system", content: "You are a helpful bot" },
{
role: "user",
content:
"Concatenate these disjoint summaries, creating a cohesive narrative. Keep the summary in the same language used in the text below.\n#{summaries.join("\n")}",
},
]
completion(messages, &on_partial_blk)
end
def summarize_with_truncation(contents, opts, &on_partial_blk)
messages = [{ role: "system", content: build_base_prompt(opts) }]
text_to_summarize = contents.map { |c| format_content_item(c) }.join
truncated_content = tokenizer.truncate(text_to_summarize, available_tokens)
messages << {
role: "user",
content:
"Summarize the following in 400 words. Keep the summary in the same language used in the text below.\n#{truncated_content}",
}
completion(messages, &on_partial_blk)
end
def summarize_single(chunk_text, opts, &on_partial_blk)
summarize_chunk(chunk_text, opts.merge(single_chunk: true), &on_partial_blk)
end
private
def summarize_chunk(chunk_text, opts, &on_partial_blk)
summary_instruction =
if opts[:single_chunk]
"Summarize the following forum discussion, creating a cohesive narrative. Keep the summary in the same language used in the text below."
else
"Summarize the following in 400 words. Keep the summary in the same language used in the text below."
end
completion(
[
{ role: "system", content: build_base_prompt(opts) },
{ role: "user", content: "#{summary_instruction}\n#{chunk_text}" },
],
&on_partial_blk
)
end
def build_base_prompt(opts)
base_prompt = <<~TEXT
You are a summarization bot.
You effectively summarise any text and reply ONLY with ONLY the summarized text.
You condense it into a shorter version.
You understand and generate Discourse forum Markdown.
You format the response, including links, using markdown.
TEXT
if opts[:resource_path]
base_prompt +=
"Try generating links as well the format is #{opts[:resource_path]}. eg: [ref](#{opts[:resource_path]}/77)\n"
end
base_prompt += "The discussion title is: #{opts[:content_title]}.\n" if opts[
:content_title
]
base_prompt
end
def completion(prompt, &on_partial_blk)
if on_partial_blk
on_partial_read =
Proc.new do |partial|
on_partial_blk.call(partial.dig(:choices, 0, :delta, :content).to_s)
end
::DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, &on_partial_read)
else
::DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model).dig(
:choices,
0,
:message,
:content,
)
end
end
def tokenizer
DiscourseAi::Tokenizer::OpenAiTokenizer
end
end
end
end

View File

@ -16,22 +16,29 @@ module DiscourseAi
:model,
to: :completion_model
def summarize(content, _user, &on_partial_blk)
def summarize(content, user, &on_partial_blk)
opts = content.except(:contents)
chunks = split_into_chunks(content[:contents])
llm = DiscourseAi::Completions::LLM.proxy(completion_model.model)
chunks = split_into_chunks(llm.tokenizer, content[:contents])
if chunks.length == 1
{
summary:
completion_model.summarize_single(chunks.first[:summary], opts, &on_partial_blk),
summary: summarize_single(llm, chunks.first[:summary], user, opts, &on_partial_blk),
chunks: [],
}
else
summaries = completion_model.summarize_in_chunks(chunks, opts)
summaries = summarize_in_chunks(llm, chunks, user, opts)
{
summary: completion_model.concatenate_summaries(summaries, &on_partial_blk),
summary:
concatenate_summaries(
llm,
summaries.map { |s| s[:summary] },
user,
&on_partial_blk
),
chunks: summaries,
}
end
@ -39,14 +46,18 @@ module DiscourseAi
private
def split_into_chunks(contents)
def format_content_item(item)
"(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
end
def split_into_chunks(tokenizer, contents)
section = { ids: [], summary: "" }
chunks =
contents.reduce([]) do |sections, item|
new_content = completion_model.format_content_item(item)
new_content = format_content_item(item)
if completion_model.can_expand_tokens?(
if tokenizer.can_expand_tokens?(
section[:summary],
new_content,
completion_model.available_tokens,
@ -65,6 +76,71 @@ module DiscourseAi
chunks
end
def summarize_single(llm, text, user, opts, &on_partial_blk)
prompt = summarization_prompt(text, opts)
llm.completion!(prompt, user, &on_partial_blk)
end
def summarize_in_chunks(llm, chunks, user, opts)
chunks.map do |chunk|
prompt = summarization_prompt(chunk[:summary], opts)
prompt[:post_insts] = "Don't use more than 400 words for the summary."
chunk[:summary] = llm.completion!(prompt, user)
chunk
end
end
def concatenate_summaries(llm, summaries, user, &on_partial_blk)
prompt = summarization_prompt(summaries.join("\n"), {})
prompt[:insts] = <<~TEXT
You are a bot that can concatenate disjoint summaries, creating a cohesive narrative.
Keep the resulting summary in the same language used in the text below.
TEXT
llm.completion!(prompt, user, &on_partial_blk)
end
def summarization_prompt(input, opts)
insts = <<~TEXT
You are a summarization bot that effectively summarizes any text, creating a cohesive narrative.
Your replies contain ONLY a summarized version of the text I provided you, using the same language.
You understand and generate Discourse forum Markdown.
You format the response, including links, using Markdown.
TEXT
insts += <<~TEXT if opts[:resource_path]
Each message is formatted as "<POST_NUMBER>) <USERNAME> <MESSAGE> "
Append <POST_NUMBER> to #{opts[:resource_path]} when linking posts.
TEXT
insts += "The discussion title is: #{opts[:content_title]}.\n" if opts[:content_title]
prompt = { insts: insts, input: <<~TEXT }
Here is the text, inside <input></input> XML tags:
<input>
#{input}
</input>
TEXT
if opts[:resource_path]
prompt[:examples] = [
[
"<input>(1 user1 said: I love Mondays 2) user2 said: I hate Mondays</input>",
"Two users are sharing their feelings toward Mondays. [user1](#{opts[:resource_path]}/1) hates them, while [user2](#{opts[:resource_path]}/2) loves them.",
],
[
"<input>3) usuario1: Amo los lunes 6) usuario2: Odio los lunes</input>",
"Dos usuarios charlan sobre los lunes. [usuario1](#{opts[:resource_path]}/3) dice que los ama, mientras que [usuario2](#{opts[:resource_path]}/2) los odia.",
],
]
end
prompt
end
end
end
end

View File

@ -25,6 +25,32 @@ module DiscourseAi
chunks: [],
}
end
private
def format_content_item(item)
"(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
end
def summarize_with_truncation(contents, opts)
text_to_summarize = contents.map { |c| format_content_item(c) }.join
truncated_content =
::DiscourseAi::Tokenizer::BertTokenizer.truncate(
text_to_summarize,
completion_model.available_tokens,
)
completion(truncated_content)
end
def completion(prompt)
::DiscourseAi::Inference::DiscourseClassifier.perform!(
"#{SiteSetting.ai_summarization_discourse_service_api_endpoint}/api/v1/classify",
completion_model.model,
prompt,
SiteSetting.ai_summarization_discourse_service_api_key,
).dig(:summary_text)
end
end
end
end

View File

@ -52,6 +52,8 @@ after_initialize do
require_relative "lib/shared/database/connection"
require_relative "lib/completions/entry_point"
require_relative "lib/modules/nsfw/entry_point"
require_relative "lib/modules/toxicity/entry_point"
require_relative "lib/modules/sentiment/entry_point"
@ -64,6 +66,7 @@ after_initialize do
add_admin_route "discourse_ai.title", "discourse-ai"
[
DiscourseAi::Completions::EntryPoint.new,
DiscourseAi::Embeddings::EntryPoint.new,
DiscourseAi::NSFW::EntryPoint.new,
DiscourseAi::Toxicity::EntryPoint.new,

View File

@ -0,0 +1,63 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::Dialects::ChatGPT do
subject(:dialect) { described_class.new }
let(:prompt) do
{
insts: <<~TEXT,
I want you to act as a title generator for written pieces. I will provide you with a text,
and you will generate five attention-grabbing titles. Please keep the title concise and under 20 words,
and ensure that the meaning is maintained. Replies will utilize the language type of the topic.
TEXT
input: <<~TEXT,
Here is the text, inside <input></input> XML tags:
<input>
To perfect his horror, Caesar, surrounded at the base of the statue by the impatient daggers of his friends,
discovers among the faces and blades that of Marcus Brutus, his protege, perhaps his son, and he no longer
defends himself, but instead exclaims: 'You too, my son!' Shakespeare and Quevedo capture the pathetic cry.
Destiny favors repetitions, variants, symmetries; nineteen centuries later, in the southern province of Buenos Aires,
a gaucho is attacked by other gauchos and, as he falls, recognizes a godson of his and says with gentle rebuke and
slow surprise (these words must be heard, not read): 'But, my friend!' He is killed and does not know that he
dies so that a scene may be repeated.
</input>
TEXT
post_insts:
"Please put the translation between <ai></ai> tags and separate each title with a comma.",
}
end
describe "#translate" do
it "translates a prompt written in our generic format to the ChatGPT format" do
open_ai_version = [
{ role: "system", content: [prompt[:insts], prompt[:post_insts]].join("\n") },
{ role: "user", content: prompt[:input] },
]
translated = dialect.translate(prompt)
expect(translated).to contain_exactly(*open_ai_version)
end
it "include examples in the ChatGPT version" do
prompt[:examples] = [
[
"<input>In the labyrinth of time, a solitary horse, etched in gold by the setting sun, embarked on an infinite journey.</input>",
"<ai>The solitary horse.,The horse etched in gold.,A horse's infinite journey.,A horse lost in time.,A horse's last ride.</ai>",
],
]
open_ai_version = [
{ role: "system", content: [prompt[:insts], prompt[:post_insts]].join("\n") },
{ role: "user", content: prompt[:examples][0][0] },
{ role: "assistant", content: prompt[:examples][0][1] },
{ role: "user", content: prompt[:input] },
]
translated = dialect.translate(prompt)
expect(translated).to contain_exactly(*open_ai_version)
end
end
end

View File

@ -0,0 +1,68 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::Dialects::Claude do
subject(:dialect) { described_class.new }
let(:prompt) do
{
insts: <<~TEXT,
I want you to act as a title generator for written pieces. I will provide you with a text,
and you will generate five attention-grabbing titles. Please keep the title concise and under 20 words,
and ensure that the meaning is maintained. Replies will utilize the language type of the topic.
TEXT
input: <<~TEXT,
Here is the text, inside <input></input> XML tags:
<input>
To perfect his horror, Caesar, surrounded at the base of the statue by the impatient daggers of his friends,
discovers among the faces and blades that of Marcus Brutus, his protege, perhaps his son, and he no longer
defends himself, but instead exclaims: 'You too, my son!' Shakespeare and Quevedo capture the pathetic cry.
Destiny favors repetitions, variants, symmetries; nineteen centuries later, in the southern province of Buenos Aires,
a gaucho is attacked by other gauchos and, as he falls, recognizes a godson of his and says with gentle rebuke and
slow surprise (these words must be heard, not read): 'But, my friend!' He is killed and does not know that he
dies so that a scene may be repeated.
</input>
TEXT
post_insts:
"Please put the translation between <ai></ai> tags and separate each title with a comma.",
}
end
describe "#translate" do
it "translates a prompt written in our generic format to Claude's format" do
anthropic_version = <<~TEXT
Human: #{prompt[:insts]}
#{prompt[:input]}
#{prompt[:post_insts]}
Assistant:
TEXT
translated = dialect.translate(prompt)
expect(translated).to eq(anthropic_version)
end
it "knows how to translate examples to Claude's format" do
prompt[:examples] = [
[
"<input>In the labyrinth of time, a solitary horse, etched in gold by the setting sun, embarked on an infinite journey.</input>",
"<ai>The solitary horse.,The horse etched in gold.,A horse's infinite journey.,A horse lost in time.,A horse's last ride.</ai>",
],
]
anthropic_version = <<~TEXT
Human: #{prompt[:insts]}
<example>
H: #{prompt[:examples][0][0]}
A: #{prompt[:examples][0][1]}
</example>
#{prompt[:input]}
#{prompt[:post_insts]}
Assistant:
TEXT
translated = dialect.translate(prompt)
expect(translated).to eq(anthropic_version)
end
end
end

View File

@ -0,0 +1,63 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::Dialects::Llama2Classic do
subject(:dialect) { described_class.new }
let(:prompt) do
{
insts: <<~TEXT,
I want you to act as a title generator for written pieces. I will provide you with a text,
and you will generate five attention-grabbing titles. Please keep the title concise and under 20 words,
and ensure that the meaning is maintained. Replies will utilize the language type of the topic.
TEXT
input: <<~TEXT,
Here is the text, inside <input></input> XML tags:
<input>
To perfect his horror, Caesar, surrounded at the base of the statue by the impatient daggers of his friends,
discovers among the faces and blades that of Marcus Brutus, his protege, perhaps his son, and he no longer
defends himself, but instead exclaims: 'You too, my son!' Shakespeare and Quevedo capture the pathetic cry.
Destiny favors repetitions, variants, symmetries; nineteen centuries later, in the southern province of Buenos Aires,
a gaucho is attacked by other gauchos and, as he falls, recognizes a godson of his and says with gentle rebuke and
slow surprise (these words must be heard, not read): 'But, my friend!' He is killed and does not know that he
dies so that a scene may be repeated.
</input>
TEXT
post_insts:
"Please put the translation between <ai></ai> tags and separate each title with a comma.",
}
end
describe "#translate" do
it "translates a prompt written in our generic format to the Llama2 format" do
llama2_classic_version = <<~TEXT
[INST]<<SYS>>#{[prompt[:insts], prompt[:post_insts]].join("\n")}<</SYS>>[/INST]
[INST]#{prompt[:input]}[/INST]
TEXT
translated = dialect.translate(prompt)
expect(translated).to eq(llama2_classic_version)
end
it "includes examples in the translation" do
prompt[:examples] = [
[
"<input>In the labyrinth of time, a solitary horse, etched in gold by the setting sun, embarked on an infinite journey.</input>",
"<ai>The solitary horse.,The horse etched in gold.,A horse's infinite journey.,A horse lost in time.,A horse's last ride.</ai>",
],
]
llama2_classic_version = <<~TEXT
[INST]<<SYS>>#{[prompt[:insts], prompt[:post_insts]].join("\n")}<</SYS>>[/INST]
[INST]#{prompt[:examples][0][0]}[/INST]
#{prompt[:examples][0][1]}
[INST]#{prompt[:input]}[/INST]
TEXT
translated = dialect.translate(prompt)
expect(translated).to eq(llama2_classic_version)
end
end
end

View File

@ -0,0 +1,71 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::Dialects::OrcaStyle do
subject(:dialect) { described_class.new }
describe "#translate" do
let(:prompt) do
{
insts: <<~TEXT,
I want you to act as a title generator for written pieces. I will provide you with a text,
and you will generate five attention-grabbing titles. Please keep the title concise and under 20 words,
and ensure that the meaning is maintained. Replies will utilize the language type of the topic.
TEXT
input: <<~TEXT,
Here is the text, inside <input></input> XML tags:
<input>
To perfect his horror, Caesar, surrounded at the base of the statue by the impatient daggers of his friends,
discovers among the faces and blades that of Marcus Brutus, his protege, perhaps his son, and he no longer
defends himself, but instead exclaims: 'You too, my son!' Shakespeare and Quevedo capture the pathetic cry.
Destiny favors repetitions, variants, symmetries; nineteen centuries later, in the southern province of Buenos Aires,
a gaucho is attacked by other gauchos and, as he falls, recognizes a godson of his and says with gentle rebuke and
slow surprise (these words must be heard, not read): 'But, my friend!' He is killed and does not know that he
dies so that a scene may be repeated.
</input>
TEXT
post_insts:
"Please put the translation between <ai></ai> tags and separate each title with a comma.",
}
end
it "translates a prompt written in our generic format to the Open AI format" do
orca_style_version = <<~TEXT
### System:
#{[prompt[:insts], prompt[:post_insts]].join("\n")}
### User:
#{prompt[:input]}
### Assistant:
TEXT
translated = dialect.translate(prompt)
expect(translated).to eq(orca_style_version)
end
it "include examples in the translated prompt" do
prompt[:examples] = [
[
"<input>In the labyrinth of time, a solitary horse, etched in gold by the setting sun, embarked on an infinite journey.</input>",
"<ai>The solitary horse.,The horse etched in gold.,A horse's infinite journey.,A horse lost in time.,A horse's last ride.</ai>",
],
]
orca_style_version = <<~TEXT
### System:
#{[prompt[:insts], prompt[:post_insts]].join("\n")}
### User:
#{prompt[:examples][0][0]}
### Assistant:
#{prompt[:examples][0][1]}
### User:
#{prompt[:input]}
### Assistant:
TEXT
translated = dialect.translate(prompt)
expect(translated).to eq(orca_style_version)
end
end
end

View File

@ -0,0 +1,64 @@
# frozen_string_literal: true
require_relative "endpoint_examples"
RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
subject(:model) { described_class.new(model_name, DiscourseAi::Tokenizer::AnthropicTokenizer) }
let(:model_name) { "claude-2" }
let(:prompt) { "Human: write 3 words\n\n" }
let(:request_body) { model.default_options.merge(prompt: prompt).to_json }
let(:stream_request_body) { model.default_options.merge(prompt: prompt, stream: true).to_json }
def response(content)
{
completion: content,
stop: "\n\nHuman:",
stop_reason: "stop_sequence",
truncated: false,
log_id: "12dcc7feafbee4a394e0de9dffde3ac5",
model: model_name,
exception: nil,
}
end
def stub_response(prompt, response_text)
WebMock
.stub_request(:post, "https://api.anthropic.com/v1/complete")
.with(body: model.default_options.merge(prompt: prompt).to_json)
.to_return(status: 200, body: JSON.dump(response(response_text)))
end
def stream_line(delta, finish_reason: nil)
+"data: " << {
completion: delta,
stop: finish_reason ? "\n\nHuman:" : nil,
stop_reason: finish_reason,
truncated: false,
log_id: "12b029451c6d18094d868bc04ce83f63",
model: "claude-2",
exception: nil,
}.to_json
end
def stub_streamed_response(prompt, deltas)
chunks =
deltas.each_with_index.map do |_, index|
if index == (deltas.length - 1)
stream_line(deltas[index], finish_reason: "stop_sequence")
else
stream_line(deltas[index])
end
end
chunks = chunks.join("\n\n")
WebMock
.stub_request(:post, "https://api.anthropic.com/v1/complete")
.with(body: model.default_options.merge(prompt: prompt, stream: true).to_json)
.to_return(status: 200, body: chunks)
end
it_behaves_like "an endpoint that can communicate with a completion service"
end

View File

@ -0,0 +1,122 @@
# frozen_string_literal: true
require_relative "endpoint_examples"
RSpec.describe DiscourseAi::Completions::Endpoints::AwsBedrock do
subject(:model) { described_class.new(model_name, DiscourseAi::Tokenizer::AnthropicTokenizer) }
let(:model_name) { "claude-2" }
let(:prompt) { "Human: write 3 words\n\n" }
let(:request_body) { model.default_options.merge(prompt: prompt).to_json }
let(:stream_request_body) { model.default_options.merge(prompt: prompt).to_json }
before do
SiteSetting.ai_bedrock_access_key_id = "123456"
SiteSetting.ai_bedrock_secret_access_key = "asd-asd-asd"
SiteSetting.ai_bedrock_region = "us-east-1"
end
# Copied from https://github.com/bblimke/webmock/issues/629
# Workaround for stubbing a streamed response
before do
mocked_http =
Class.new(Net::HTTP) do
def request(*)
super do |response|
response.instance_eval do
def read_body(*, &block)
if block_given?
@body.each(&block)
else
super
end
end
end
yield response if block_given?
response
end
end
end
@original_net_http = Net.send(:remove_const, :HTTP)
Net.send(:const_set, :HTTP, mocked_http)
end
after do
Net.send(:remove_const, :HTTP)
Net.send(:const_set, :HTTP, @original_net_http)
end
def response(content)
{
completion: content,
stop: "\n\nHuman:",
stop_reason: "stop_sequence",
truncated: false,
log_id: "12dcc7feafbee4a394e0de9dffde3ac5",
model: model_name,
exception: nil,
}
end
def stub_response(prompt, response_text)
WebMock
.stub_request(
:post,
"https://bedrock-runtime.#{SiteSetting.ai_bedrock_region}.amazonaws.com/model/anthropic.#{model_name}/invoke",
)
.with(body: request_body)
.to_return(status: 200, body: JSON.dump(response(response_text)))
end
def stream_line(delta, finish_reason: nil)
encoder = Aws::EventStream::Encoder.new
message =
Aws::EventStream::Message.new(
payload:
StringIO.new(
{
bytes:
Base64.encode64(
{
completion: delta,
stop: finish_reason ? "\n\nHuman:" : nil,
stop_reason: finish_reason,
truncated: false,
log_id: "12b029451c6d18094d868bc04ce83f63",
model: "claude-2",
exception: nil,
}.to_json,
),
}.to_json,
),
)
encoder.encode(message)
end
def stub_streamed_response(prompt, deltas)
chunks =
deltas.each_with_index.map do |_, index|
if index == (deltas.length - 1)
stream_line(deltas[index], finish_reason: "stop_sequence")
else
stream_line(deltas[index])
end
end
WebMock
.stub_request(
:post,
"https://bedrock-runtime.#{SiteSetting.ai_bedrock_region}.amazonaws.com/model/anthropic.#{model_name}/invoke-with-response-stream",
)
.with(body: stream_request_body)
.to_return(status: 200, body: chunks)
end
it_behaves_like "an endpoint that can communicate with a completion service"
end

View File

@ -0,0 +1,71 @@
# frozen_string_literal: true
RSpec.shared_examples "an endpoint that can communicate with a completion service" do
describe "#perform_completion!" do
fab!(:user) { Fabricate(:user) }
let(:response_text) { "1. Serenity\\n2. Laughter\\n3. Adventure" }
context "when using regular mode" do
before { stub_response(prompt, response_text) }
it "can complete a trivial prompt" do
completion_response = model.perform_completion!(prompt, user)
expect(completion_response).to eq(response_text)
end
it "creates an audit log for the request" do
model.perform_completion!(prompt, user)
expect(AiApiAuditLog.count).to eq(1)
log = AiApiAuditLog.first
response_body = response(response_text).to_json
expect(log.provider_id).to eq(model.provider_id)
expect(log.user_id).to eq(user.id)
expect(log.raw_request_payload).to eq(request_body)
expect(log.raw_response_payload).to eq(response_body)
expect(log.request_tokens).to eq(model.prompt_size(prompt))
expect(log.response_tokens).to eq(model.tokenizer.size(response_text))
end
end
context "when using stream mode" do
let(:deltas) { ["Mount", "ain", " ", "Tree ", "Frog"] }
before { stub_streamed_response(prompt, deltas) }
it "can complete a trivial prompt" do
completion_response = +""
model.perform_completion!(prompt, user) do |partial, cancel|
completion_response << partial
cancel.call if completion_response.split(" ").length == 2
end
expect(completion_response).to eq(deltas[0...-1].join)
end
it "creates an audit log and updates is on each read." do
completion_response = +""
model.perform_completion!(prompt, user) do |partial, cancel|
completion_response << partial
cancel.call if completion_response.split(" ").length == 2
end
expect(AiApiAuditLog.count).to eq(1)
log = AiApiAuditLog.first
expect(log.provider_id).to eq(model.provider_id)
expect(log.user_id).to eq(user.id)
expect(log.raw_request_payload).to eq(stream_request_body)
expect(log.raw_response_payload).to be_present
expect(log.request_tokens).to eq(model.prompt_size(prompt))
expect(log.response_tokens).to eq(model.tokenizer.size(deltas[0...-1].join))
end
end
end
end

View File

@ -0,0 +1,68 @@
# frozen_string_literal: true
require_relative "endpoint_examples"
RSpec.describe DiscourseAi::Completions::Endpoints::Huggingface do
subject(:model) { described_class.new(model_name, DiscourseAi::Tokenizer::Llama2Tokenizer) }
let(:model_name) { "Llama2-*-chat-hf" }
let(:prompt) { <<~TEXT }
[INST]<<SYS>>You are a helpful bot.<</SYS>>[/INST]
[INST]Write 3 words[/INST]
TEXT
let(:request_body) do
model
.default_options
.merge(inputs: prompt)
.tap { |payload| payload[:parameters][:max_new_tokens] = 2_000 - model.prompt_size(prompt) }
.to_json
end
let(:stream_request_body) { request_body }
before { SiteSetting.ai_hugging_face_api_url = "https://test.dev" }
def response(content)
{ generated_text: content }
end
def stub_response(prompt, response_text)
WebMock
.stub_request(:post, "#{SiteSetting.ai_hugging_face_api_url}/generate")
.with(body: request_body)
.to_return(status: 200, body: JSON.dump(response(response_text)))
end
def stream_line(delta, finish_reason: nil)
+"data: " << {
token: {
id: 29_889,
text: delta,
logprob: -0.08319092,
special: !!finish_reason,
},
generated_text: finish_reason ? response_text : nil,
details: nil,
}.to_json
end
def stub_streamed_response(prompt, deltas)
chunks =
deltas.each_with_index.map do |_, index|
if index == (deltas.length - 1)
stream_line(deltas[index], finish_reason: true)
else
stream_line(deltas[index])
end
end
chunks = chunks.join("\n\n")
WebMock
.stub_request(:post, "#{SiteSetting.ai_hugging_face_api_url}/generate_stream")
.with(body: request_body)
.to_return(status: 200, body: chunks)
end
it_behaves_like "an endpoint that can communicate with a completion service"
end

View File

@ -0,0 +1,74 @@
# frozen_string_literal: true
require_relative "endpoint_examples"
RSpec.describe DiscourseAi::Completions::Endpoints::OpenAI do
subject(:model) { described_class.new(model_name, DiscourseAi::Tokenizer::OpenAiTokenizer) }
let(:model_name) { "gpt-3.5-turbo" }
let(:prompt) do
[
{ role: "system", content: "You are a helpful bot." },
{ role: "user", content: "Write 3 words" },
]
end
let(:request_body) { model.default_options.merge(messages: prompt).to_json }
let(:stream_request_body) { model.default_options.merge(messages: prompt, stream: true).to_json }
def response(content)
{
id: "chatcmpl-6sZfAb30Rnv9Q7ufzFwvQsMpjZh8S",
object: "chat.completion",
created: 1_678_464_820,
model: "gpt-3.5-turbo-0301",
usage: {
prompt_tokens: 337,
completion_tokens: 162,
total_tokens: 499,
},
choices: [
{ message: { role: "assistant", content: content }, finish_reason: "stop", index: 0 },
],
}
end
def stub_response(prompt, response_text)
WebMock
.stub_request(:post, "https://api.openai.com/v1/chat/completions")
.with(body: { model: model_name, messages: prompt })
.to_return(status: 200, body: JSON.dump(response(response_text)))
end
def stream_line(delta, finish_reason: nil)
+"data: " << {
id: "chatcmpl-#{SecureRandom.hex}",
object: "chat.completion.chunk",
created: 1_681_283_881,
model: "gpt-3.5-turbo-0301",
choices: [{ delta: { content: delta } }],
finish_reason: finish_reason,
index: 0,
}.to_json
end
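# Stubs the chat completions endpoint in streaming mode, returning the fake chunks joined by blank lines like OpenAI's SSE payloads.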
def stub_streamed_response(prompt, deltas)
chunks =
deltas.each_with_index.map do |_, index|
if index == (deltas.length - 1)
stream_line(deltas[index], finish_reason: "stop_sequence")
else
stream_line(deltas[index])
end
end
chunks = chunks.join("\n\n")
WebMock
.stub_request(:post, "https://api.openai.com/v1/chat/completions")
.with(body: model.default_options.merge(messages: prompt, stream: true).to_json)
.to_return(status: 200, body: chunks)
end
it_behaves_like "an endpoint that can communicate with a completion service"
end

View File

@ -0,0 +1,71 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::LLM do
subject(:llm) do
described_class.new(
DiscourseAi::Completions::Dialects::OrcaStyle.new,
canned_response,
"Upstage-Llama-2-*-instruct-v2",
)
end
fab!(:user) { Fabricate(:user) }
describe ".proxy" do
it "raises an exception when we can't proxy the model" do
fake_model = "unknown_v2"
expect { described_class.proxy(fake_model) }.to(
raise_error(DiscourseAi::Completions::LLM::UNKNOWN_MODEL),
)
end
end
describe "#completion!" do
let(:prompt) do
{
insts: <<~TEXT,
I want you to act as a title generator for written pieces. I will provide you with a text,
and you will generate five attention-grabbing titles. Please keep the title concise and under 20 words,
and ensure that the meaning is maintained. Replies will utilize the language type of the topic.
TEXT
input: <<~TEXT,
Here is the text, inside <input></input> XML tags:
<input>
To perfect his horror, Caesar, surrounded at the base of the statue by the impatient daggers of his friends,
discovers among the faces and blades that of Marcus Brutus, his protege, perhaps his son, and he no longer
defends himself, but instead exclaims: 'You too, my son!' Shakespeare and Quevedo capture the pathetic cry.
</input>
TEXT
post_insts:
"Please put the translation between <ai></ai> tags and separate each title with a comma.",
}
end
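# CannedResponse is a fake endpoint that replays the supplied responses instead of calling a real provider.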
let(:canned_response) do
DiscourseAi::Completions::Endpoints::CannedResponse.new(
[
"<ai>The solitary horse.,The horse etched in gold.,A horse's infinite journey.,A horse lost in time.,A horse's last ride.</ai>",
],
)
end
context "when getting the full response" do
it "processes the prompt and return the response" do
llm_response = llm.completion!(prompt, user)
expect(llm_response).to eq(canned_response.responses[0])
end
end
context "when getting a streamed response" do
it "processes the prompt and call the given block with the partial response" do
llm_response = +""
llm.completion!(prompt, user) { |partial, cancel_fn| llm_response << partial }
expect(llm_response).to eq(canned_response.responses[0])
end
end
end
end

View File

@ -66,7 +66,10 @@ RSpec.describe DiscourseAi::AiBot::Commands::SearchCommand do
.expects(:asymmetric_topics_similarity_search)
.returns([post1.topic_id])
-results = search.process(search_query: "hello world, sam", status: "public")
+results =
+  DiscourseAi::Completions::LLM.with_prepared_responses(["<ai>#{query}</ai>"]) do
+    search.process(search_query: "hello world, sam", status: "public")
+  end
expect(results[:args]).to eq({ search_query: "hello world, sam", status: "public" })
expect(results[:rows].length).to eq(1)

View File

@ -13,15 +13,6 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
before do
SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com"
-prompt = DiscourseAi::Embeddings::HydeGenerators::OpenAi.new.prompt(query)
-OpenAiCompletionsInferenceStubs.stub_response(
-  prompt,
-  hypothetical_post,
-  req_opts: {
-    max_tokens: 400,
-  },
-)
hyde_embedding = [0.049382, 0.9999]
EmbeddingsGenerationStubs.discourse_service(
SiteSetting.ai_embeddings_model,
@ -39,10 +30,16 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
.returns(candidate_ids)
end
+def trigger_search(query)
+  DiscourseAi::Completions::LLM.with_prepared_responses(["<ai>#{hypothetical_post}</ai>"]) do
+    subject.search_for_topics(query)
+  end
+end
it "returns the first post of a topic included in the asymmetric search results" do
stub_candidate_ids([post.topic_id])
-posts = subject.search_for_topics(query)
+posts = trigger_search(query)
expect(posts).to contain_exactly(post)
end
@ -53,7 +50,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
post.topic.update!(visible: false)
stub_candidate_ids([post.topic_id])
-posts = subject.search_for_topics(query)
+posts = trigger_search(query)
expect(posts).to be_empty
end
@ -64,7 +61,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
pm_post = Fabricate(:private_message_post)
stub_candidate_ids([pm_post.topic_id])
-posts = subject.search_for_topics(query)
+posts = trigger_search(query)
expect(posts).to be_empty
end
@ -75,7 +72,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
post.update!(post_type: Post.types[:whisper])
stub_candidate_ids([post.topic_id])
-posts = subject.search_for_topics(query)
+posts = trigger_search(query)
expect(posts).to be_empty
end
@ -87,7 +84,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
reply.topic.first_post.trash!
stub_candidate_ids([reply.topic_id])
-posts = subject.search_for_topics(query)
+posts = trigger_search(query)
expect(posts).to be_empty
end
@ -98,7 +95,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
post_2 = Fabricate(:post)
stub_candidate_ids([post.topic_id])
-posts = subject.search_for_topics(query)
+posts = trigger_search(query)
expect(posts).not_to include(post_2)
end
@ -114,7 +111,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
end
it "returns an empty list" do
-posts = subject.search_for_topics(query)
+posts = trigger_search(query)
expect(posts).to be_empty
end
@ -122,14 +119,17 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
it "returns the results if the user has access to the category" do
group.add(user)
-posts = subject.search_for_topics(query)
+posts = trigger_search(query)
expect(posts).to contain_exactly(post)
end
context "while searching as anon" do
it "returns an empty list" do
-posts = described_class.new(Guardian.new(nil)).search_for_topics(query)
+posts =
+  DiscourseAi::Completions::LLM.with_prepared_responses(
+    ["<ai>#{hypothetical_post}</ai>"],
+  ) { described_class.new(Guardian.new(nil)).search_for_topics(query) }
expect(posts).to be_empty
end

View File

@ -1,122 +0,0 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Summarization::Models::Anthropic do
subject(:model) { described_class.new(model_name, max_tokens: max_tokens) }
let(:model_name) { "claude-2" }
let(:max_tokens) { 720 }
let(:content) do
{
resource_path: "/t/-/1",
content_title: "This is a title",
contents: [{ poster: "asd", id: 1, text: "This is a text" }],
}
end
def as_chunk(item)
{ ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
end
def expected_messages(contents, opts)
base_prompt = <<~TEXT
Human: Summarize the following forum discussion inside the given <input> tag.
Try to keep the summary in the same language as the forum discussion.
Format the response, including links, using markdown.
Try generating links as well the format is #{opts[:resource_path]}/POST_ID
For example, a link to the 3rd post in the topic would be [post 3](#{opts[:resource_path]}/3)
Wrap the whole the summary inside <ai> tags.
The discussion title is: #{opts[:content_title]}.
Don't use more than 400 words.
TEXT
text =
contents.reduce("") do |memo, item|
memo += "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
end
base_prompt += "<input>#{text}</input>\nAssistant:\n"
end
describe "#summarize_in_chunks" do
context "when the content fits in a single chunk" do
it "performs a request to summarize" do
opts = content.except(:contents)
AnthropicCompletionStubs.stub_response(
expected_messages(content[:contents], opts),
"<ai>This is summary 1</ai>",
)
chunks = content[:contents].map { |c| as_chunk(c) }
summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1")
end
end
context "when the content fits in multiple chunks" do
it "performs a request for each one to summarize" do
content[:contents] << {
poster: "asd2",
id: 2,
text: "This is a different text to summarize",
}
opts = content.except(:contents)
content[:contents].each_with_index do |item, idx|
AnthropicCompletionStubs.stub_response(
expected_messages([item], opts),
"<ai>This is summary #{idx + 1}</ai>",
)
end
chunks = content[:contents].map { |c| as_chunk(c) }
summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
end
end
end
describe "#concatenate_summaries" do
it "combines all the different summaries into a single one" do
messages = <<~TEXT
Human: Concatenate the following disjoint summaries inside the given input tags, creating a cohesive narrative.
Include only the summary inside <ai> tags.
<input>summary 1</input>
<input>summary 2</input>
Assistant:
TEXT
AnthropicCompletionStubs.stub_response(messages, "<ai>concatenated summary</ai>")
expect(model.concatenate_summaries(["summary 1", "summary 2"])).to eq("concatenated summary")
end
end
describe "#summarize_with_truncation" do
let(:max_tokens) { 709 }
it "truncates the context to meet the token limit" do
opts = content.except(:contents)
instructions = <<~TEXT
Human: Summarize the following forum discussion inside the given <input> tag.
Try to keep the summary in the same language as the forum discussion.
Format the response, including links, using markdown.
Try generating links as well the format is #{opts[:resource_path]}/POST_ID
For example, a link to the 3rd post in the topic would be [post 3](#{opts[:resource_path]}/3)
Wrap the whole the summary inside <ai> tags.
The discussion title is: #{opts[:content_title]}.
Don't use more than 400 words.
<input>(1 asd said: This is a</input>
Assistant:
TEXT
AnthropicCompletionStubs.stub_response(instructions, "<ai>truncated summary</ai>")
expect(model.summarize_with_truncation(content[:contents], opts)).to eq("truncated summary")
end
end
end

View File

@ -1,95 +0,0 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Summarization::Models::Discourse do
subject(:model) { described_class.new(model_name, max_tokens: max_tokens) }
let(:model_name) { "bart-large-cnn-samsum" }
let(:max_tokens) { 20 }
let(:content) do
{
resource_path: "/t/1/POST_NUMBER",
content_title: "This is a title",
contents: [{ poster: "asd", id: 1, text: "This is a text" }],
}
end
before { SiteSetting.ai_summarization_discourse_service_api_endpoint = "https://test.com" }
def stub_request(prompt, response)
WebMock
.stub_request(
:post,
"#{SiteSetting.ai_summarization_discourse_service_api_endpoint}/api/v1/classify",
)
.with(body: JSON.dump(model: model_name, content: prompt))
.to_return(status: 200, body: JSON.dump(summary_text: response))
end
def expected_messages(contents, opts)
contents.reduce("") do |memo, item|
memo += "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
end
end
def as_chunk(item)
{ ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
end
describe "#summarize_in_chunks" do
context "when the content fits in a single chunk" do
it "performs a request to summarize" do
opts = content.except(:contents)
stub_request(expected_messages(content[:contents], opts), "This is summary 1")
chunks = content[:contents].map { |c| as_chunk(c) }
summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1")
end
end
context "when the content fits in multiple chunks" do
it "performs a request for each one to summarize" do
content[:contents] << {
poster: "asd2",
id: 2,
text: "This is a different text to summarize",
}
opts = content.except(:contents)
content[:contents].each_with_index do |item, idx|
stub_request(expected_messages([item], opts), "This is summary #{idx + 1}")
end
chunks = content[:contents].map { |c| as_chunk(c) }
summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
end
end
end
describe "#concatenate_summaries" do
it "combines all the different summaries into a single one" do
messages = ["summary 1", "summary 2"].join("\n")
stub_request(messages, "concatenated summary")
expect(model.concatenate_summaries(["summary 1", "summary 2"])).to eq("concatenated summary")
end
end
describe "#summarize_with_truncation" do
let(:max_tokens) { 9 }
it "truncates the context to meet the token limit" do
opts = content.except(:contents)
stub_request("( 1 asd said : this is", "truncated summary")
expect(model.summarize_with_truncation(content[:contents], opts)).to eq("truncated summary")
end
end
end

View File

@ -1,121 +0,0 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Summarization::Models::OpenAi do
subject(:model) { described_class.new(model_name, max_tokens: max_tokens) }
let(:model_name) { "gpt-3.5-turbo" }
let(:max_tokens) { 720 }
let(:content) do
{
resource_path: "/t/1/POST_NUMBER",
content_title: "This is a title",
contents: [{ poster: "asd", id: 1, text: "This is a text" }],
}
end
def as_chunk(item)
{ ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
end
def expected_messages(contents, opts)
base_prompt = <<~TEXT
You are a summarization bot.
You effectively summarise any text and reply ONLY with ONLY the summarized text.
You condense it into a shorter version.
You understand and generate Discourse forum Markdown.
You format the response, including links, using markdown.
Try generating links as well the format is #{opts[:resource_path]}. eg: [ref](#{opts[:resource_path]}/77)
The discussion title is: #{opts[:content_title]}.
TEXT
messages = [{ role: "system", content: base_prompt }]
text =
contents.reduce("") do |memo, item|
memo += "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
end
messages << {
role: "user",
content:
"Summarize the following in 400 words. Keep the summary in the same language used in the text below.\n#{text}",
}
end
describe "#summarize_in_chunks" do
context "when the content fits in a single chunk" do
it "performs a request to summarize" do
opts = content.except(:contents)
OpenAiCompletionsInferenceStubs.stub_response(
expected_messages(content[:contents], opts),
"This is summary 1",
)
chunks = content[:contents].map { |c| as_chunk(c) }
summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1")
end
end
context "when the content fits in multiple chunks" do
it "performs a request for each one to summarize" do
content[:contents] << {
poster: "asd2",
id: 2,
text: "This is a different text to summarize",
}
opts = content.except(:contents)
content[:contents].each_with_index do |item, idx|
OpenAiCompletionsInferenceStubs.stub_response(
expected_messages([item], opts),
"This is summary #{idx + 1}",
)
end
chunks = content[:contents].map { |c| as_chunk(c) }
summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
end
end
end
describe "#concatenate_summaries" do
it "combines all the different summaries into a single one" do
messages = [
{ role: "system", content: "You are a helpful bot" },
{
role: "user",
content:
"Concatenate these disjoint summaries, creating a cohesive narrative. Keep the summary in the same language used in the text below.\nsummary 1\nsummary 2",
},
]
OpenAiCompletionsInferenceStubs.stub_response(messages, "concatenated summary")
expect(model.concatenate_summaries(["summary 1", "summary 2"])).to eq("concatenated summary")
end
end
describe "#summarize_with_truncation" do
let(:max_tokens) { 709 }
it "truncates the context to meet the token limit" do
opts = content.except(:contents)
truncated_version = expected_messages(content[:contents], opts)
truncated_version.last[
:content
] = "Summarize the following in 400 words. Keep the summary in the same language used in the text below.\n(1 asd said: This is a"
OpenAiCompletionsInferenceStubs.stub_response(truncated_version, "truncated summary")
expect(model.summarize_with_truncation(content[:contents], opts)).to eq("truncated summary")
end
end
end

View File

@ -1,28 +1,35 @@
# frozen_string_literal: true
require_relative "../../../../support/summarization/dummy_completion_model"
RSpec.describe DiscourseAi::Summarization::Strategies::FoldContent do
describe "#summarize" do
subject(:strategy) { described_class.new(model) }
let(:summarize_text) { "This is a text" }
-let(:model) { DummyCompletionModel.new(model_tokens) }
let(:model_tokens) do
# Make sure each content fits in a single chunk.
-  DiscourseAi::Tokenizer::BertTokenizer.size("(1 asd said: This is a text ") + 3
+  # 700 is the number of tokens reserved for the prompt.
+  700 + DiscourseAi::Tokenizer::OpenAiTokenizer.size("(1 asd said: This is a text ") + 3
end
-let(:user) { User.new }
+let(:model) do
+  DiscourseAi::Summarization::Models::OpenAi.new("gpt-4", max_tokens: model_tokens)
+end
let(:content) { { contents: [{ poster: "asd", id: 1, text: summarize_text }] } }
+let(:single_summary) { "this is a single summary" }
+let(:concatenated_summary) { "this is a concatenated summary" }
+let(:user) { User.new }
context "when the content to summarize fits in a single call" do
it "does one call to summarize content" do
-result = strategy.summarize(content, user)
+result =
+  DiscourseAi::Completions::LLM.with_prepared_responses([single_summary]) do |spy|
+    strategy.summarize(content, user).tap { expect(spy.completions).to eq(1) }
+  end
-expect(model.summarization_calls).to eq(1)
-expect(result[:summary]).to eq(DummyCompletionModel::SINGLE_SUMMARY)
+expect(result[:summary]).to eq(single_summary)
end
end
@ -30,10 +37,12 @@ RSpec.describe DiscourseAi::Summarization::Strategies::FoldContent do
it "summarizes each chunk and then concatenates them" do
content[:contents] << { poster: "asd2", id: 2, text: summarize_text }
-result = strategy.summarize(content, user)
+result =
+  DiscourseAi::Completions::LLM.with_prepared_responses(
+    [single_summary, single_summary, concatenated_summary],
+  ) { |spy| strategy.summarize(content, user).tap { expect(spy.completions).to eq(3) } }
-expect(model.summarization_calls).to eq(3)
-expect(result[:summary]).to eq(DummyCompletionModel::CONCATENATED_SUMMARIES)
+expect(result[:summary]).to eq(concatenated_summary)
end
end
end

View File

@ -1,28 +0,0 @@
# frozen_string_literal: true
require_relative "../../../../support/summarization/dummy_completion_model"
RSpec.describe DiscourseAi::Summarization::Strategies::TruncateContent do
describe "#summarize" do
subject(:strategy) { described_class.new(model) }
let(:summarize_text) { "This is a text" }
let(:model_tokens) { summarize_text.length }
let(:model) { DummyCompletionModel.new(model_tokens) }
let(:user) { User.new }
let(:content) { { contents: [{ poster: "asd", id: 1, text: summarize_text }] } }
context "when the content to summarize doesn't fit in a single call" do
it "summarizes a truncated version" do
content[:contents] << { poster: "asd2", id: 2, text: summarize_text }
result = strategy.summarize(content, user)
expect(model.summarization_calls).to eq(1)
expect(result[:summary]).to eq(DummyCompletionModel::SINGLE_SUMMARY)
end
end
end
end

View File

@ -1,46 +0,0 @@
# frozen_string_literal: true
class DummyCompletionModel
SINGLE_SUMMARY = "this is a single summary"
CONCATENATED_SUMMARIES = "this is a concatenated summary"
def initialize(max_tokens)
@summarization_calls = 0
@available_tokens = max_tokens
end
attr_reader :max_length, :summarization_calls, :available_tokens
delegate :can_expand_tokens?, to: :tokenizer
def summarize_single(single_chunk, opts)
@summarization_calls += 1
SINGLE_SUMMARY
end
def summarize_in_chunks(chunks, opts)
chunks.map do |chunk|
chunk[:summary] = SINGLE_SUMMARY
@summarization_calls += 1
chunk
end
end
def concatenate_summaries(summaries)
@summarization_calls += 1
CONCATENATED_SUMMARIES
end
def summarize_with_truncation(_contents, _opts)
@summarization_calls += 1
SINGLE_SUMMARY
end
def format_content_item(item)
"(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
end
def tokenizer
DiscourseAi::Tokenizer::BertTokenizer
end
end