Add both Mistral and Mixtral support. Also includes vLLM OpenAI-compatible inference support.

Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com>
Rafael dos Santos Silva 2023-12-26 14:49:55 -03:00 committed by GitHub
parent cb325bb883
commit 5db7bf6e68
14 changed files with 91631 additions and 14 deletions
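For orientation before the per-file diffs: the new Vllm endpoint talks to a vLLM server through its OpenAI-compatible completions API. Below is a minimal sketch of the request it ends up building; the server URL and Referer value are hypothetical, and the payload fields mirror the endpoint's default_options and prepare_payload shown later in this diff.

require "net/http"
require "json"

# Hypothetical vLLM server; in the plugin this comes from ai_vllm_endpoint or
# a DNS SRV lookup of ai_vllm_endpoint_srv.
uri = URI("http://localhost:8000/v1/completions")

payload = {
  model: "mistralai/Mixtral-8x7B-Instruct-v0.1",
  prompt: "<s> [INST]\nYou are a helpful bot.\n[/INST] Ok </s>\n[INST] write 3 words [/INST]\n",
  max_tokens_to_sample: 2000, # matches the endpoint's default_options
}

# The Referer value is illustrative; the endpoint sends Discourse.base_url.
headers = { "Referer" => "https://forum.example.com", "Content-Type" => "application/json" }

response =
  Net::HTTP.start(uri.host, uri.port) do |http|
    request = Net::HTTP::Post.new(uri, headers)
    request.body = payload.to_json
    http.request(request)
  end

puts JSON.parse(response.body).dig("choices", 0, "text")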


@@ -6,6 +6,7 @@ class AiApiAuditLog < ActiveRecord::Base
    Anthropic = 2
    HuggingFaceTextGeneration = 3
    Gemini = 4
+   Vllm = 5
  end
end


@@ -153,6 +153,11 @@ discourse_ai:
  ai_gemini_api_key:
    default: ""
    secret: true
+  ai_vllm_endpoint:
+    default: ""
+  ai_vllm_endpoint_srv:
+    default: ""
+    hidden: true
  composer_ai_helper_enabled:
    default: false
@@ -177,6 +182,8 @@ discourse_ai:
      - stable-beluga-2
      - Llama2-chat-hf
      - gemini-pro
+      - mistralai/Mixtral-8x7B-Instruct-v0.1
+      - mistralai/Mistral-7B-Instruct-v0.2
  ai_helper_custom_prompts_allowed_groups:
    client: true
    type: group_list
@@ -241,6 +248,8 @@ discourse_ai:
      - StableBeluga2
      - Upstage-Llama-2-*-instruct-v2
      - gemini-pro
+      - mistralai/Mixtral-8x7B-Instruct-v0.1
+      - mistralai/Mistral-7B-Instruct-v0.2
  ai_summarization_discourse_service_api_endpoint: ""
  ai_summarization_discourse_service_api_key:
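The new ai_vllm_endpoint setting points directly at a vLLM server, while the hidden ai_vllm_endpoint_srv setting lets the endpoint class resolve the host and port through a DNS SRV lookup; when an SRV record is configured it takes precedence (see model_uri in the new Vllm endpoint below).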


@@ -16,6 +16,7 @@ module DiscourseAi
          DiscourseAi::Completions::Dialects::ChatGpt,
          DiscourseAi::Completions::Dialects::OrcaStyle,
          DiscourseAi::Completions::Dialects::Gemini,
+         DiscourseAi::Completions::Dialects::Mixtral,
        ]
        dialect = dialects.find { |d| d.can_translate?(model_name) }
@@ -87,6 +88,7 @@ module DiscourseAi
      def trim_context(conversation_context)
        prompt_limit = max_prompt_tokens
        current_token_count = calculate_token_count_without_context
+       message_step_size = (max_prompt_tokens / 25).to_i * -1
        conversation_context.reduce([]) do |memo, context|
          break(memo) if current_token_count >= prompt_limit
@@ -98,7 +100,7 @@ module DiscourseAi
            # Trimming content to make sure we respect token limit.
            while dupped_context[:content].present? &&
                    message_tokens + current_token_count + per_message_overhead > prompt_limit
-             dupped_context[:content] = dupped_context[:content][0..-100] || ""
+             dupped_context[:content] = dupped_context[:content][0..message_step_size] || ""
              message_tokens = calculate_message_token(dupped_context)
            end
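With this change the trimming loop scales with the model's context window: message_step_size is -(max_prompt_tokens / 25), so for the Mixtral dialect's 32,000-token limit each pass drops roughly the last 1,280 characters of an oversized message instead of a fixed 100, which converges far faster on long conversation contexts.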


@@ -0,0 +1,76 @@
# frozen_string_literal: true

module DiscourseAi
  module Completions
    module Dialects
      class Mixtral < Dialect
        class << self
          def can_translate?(model_name)
            %w[mistralai/Mixtral-8x7B-Instruct-v0.1 mistralai/Mistral-7B-Instruct-v0.2].include?(
              model_name,
            )
          end

          def tokenizer
            DiscourseAi::Tokenizer::MixtralTokenizer
          end
        end

        def translate
          mixtral_prompt = +<<~TEXT
            <s> [INST]
            #{prompt[:insts]}
            #{build_tools_prompt}#{prompt[:post_insts]}
            [/INST] Ok </s>
          TEXT

          if prompt[:examples]
            prompt[:examples].each do |example_pair|
              mixtral_prompt << "[INST] #{example_pair.first} [/INST]\n"
              mixtral_prompt << "#{example_pair.second}\n"
            end
          end

          mixtral_prompt << conversation_context if prompt[:conversation_context].present?

          mixtral_prompt << "[INST] #{prompt[:input]} [/INST]\n"
        end

        def conversation_context
          return "" if prompt[:conversation_context].blank?

          trimmed_context = trim_context(prompt[:conversation_context])

          trimmed_context
            .reverse
            .reduce(+"") do |memo, context|
              memo << "[INST] " if context[:type] == "user"

              if context[:type] == "tool"
                memo << <<~TEXT
                  <function_results>
                  <result>
                  <tool_name>#{context[:name]}</tool_name>
                  <json>
                  #{context[:content]}
                  </json>
                  </result>
                  </function_results>
                TEXT
              else
                memo << context[:content] << "\n"
                memo << "[/INST]" if context[:type] == "user"
              end

              memo
            end
        end

        def max_prompt_tokens
          32_000
        end
      end
    end
  end
end


@@ -16,6 +16,7 @@ module DiscourseAi
            DiscourseAi::Completions::Endpoints::OpenAi,
            DiscourseAi::Completions::Endpoints::HuggingFace,
            DiscourseAi::Completions::Endpoints::Gemini,
+           DiscourseAi::Completions::Endpoints::Vllm,
          ].detect(-> { raise DiscourseAi::Completions::Llm::UNKNOWN_MODEL }) do |ek|
            ek.can_contact?(model_name)
          end
@@ -228,7 +229,8 @@ module DiscourseAi
          <invoke>
          <tool_name></tool_name>
          <tool_id></tool_id>
-         <parameters></parameters>
+         <parameters>
+         </parameters>
          </invoke>
          </function_calls>
        TEXT
@@ -239,17 +241,28 @@ module DiscourseAi
      end

      def add_to_buffer(function_buffer, response_data, partial)
-       new_buffer = Nokogiri::HTML5.fragment(response_data + partial)
-       if tool_name = new_buffer.at("tool_name").text
-         if new_buffer.at("tool_id").nil?
-           tool_id_node =
-             Nokogiri::HTML5::DocumentFragment.parse("\n<tool_id>#{tool_name}</tool_id>")
-           new_buffer.at("invoke").children[1].add_next_sibling(tool_id_node)
-         end
-       end
-       new_buffer
+       read_function = Nokogiri::HTML5.fragment(response_data + partial)
+
+       if tool_name = read_function.at("tool_name").text
+         function_buffer.at("tool_name").inner_html = tool_name
+         function_buffer.at("tool_id").inner_html = tool_name
+       end
+
+       read_parameters =
+         read_function
+           .at("parameters")
+           .elements
+           .each do |elem|
+             if paramenter = function_buffer.at(elem.name)&.text
+               function_buffer.at(elem.name).inner_html = paramenter
+             else
+               param_node = read_function.at(elem.name)
+               function_buffer.at("parameters").add_child(param_node)
+               function_buffer.at("parameters").add_child("\n")
+             end
+           end
+
+       function_buffer
      end

      def buffering_finished?(_available_functions, buffer)
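Note on the add_to_buffer change: instead of returning a freshly parsed fragment for every partial, the method now fills in the pre-built function_calls template. The tool_name and tool_id nodes are copied from the streamed XML, and each child of <parameters> is merged into the buffer as it arrives, so a partially streamed tool invocation accumulates into a single well-formed buffer.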


@@ -5,9 +5,14 @@ module DiscourseAi
    module Endpoints
      class HuggingFace < Base
        def self.can_contact?(model_name)
-         %w[StableBeluga2 Upstage-Llama-2-*-instruct-v2 Llama2-*-chat-hf Llama2-chat-hf].include?(
-           model_name,
-         )
+         %w[
+           StableBeluga2
+           Upstage-Llama-2-*-instruct-v2
+           Llama2-*-chat-hf
+           Llama2-chat-hf
+           mistralai/Mixtral-8x7B-Instruct-v0.1
+           mistralai/Mistral-7B-Instruct-v0.2
+         ].include?(model_name)
        end

        def default_options


@@ -0,0 +1,67 @@
# frozen_string_literal: true

module DiscourseAi
  module Completions
    module Endpoints
      class Vllm < Base
        def self.can_contact?(model_name)
          %w[mistralai/Mixtral-8x7B-Instruct-v0.1 mistralai/Mistral-7B-Instruct-v0.2].include?(
            model_name,
          )
        end

        def default_options
          { max_tokens_to_sample: 2000, model: model }
        end

        def provider_id
          AiApiAuditLog::Provider::Vllm
        end

        private

        def model_uri
          service = DiscourseAi::Utils::DnsSrv.lookup(SiteSetting.ai_vllm_endpoint_srv)
          if service.present?
            api_endpoint = "https://#{service.target}:#{service.port}/v1/completions"
          else
            api_endpoint = "#{SiteSetting.ai_vllm_endpoint}/v1/completions"
          end
          @uri ||= URI(api_endpoint)
        end

        def prepare_payload(prompt, model_params, _dialect)
          default_options
            .merge(model_params)
            .merge(prompt: prompt)
            .tap { |payload| payload[:stream] = true if @streaming_mode }
        end

        def prepare_request(payload)
          headers = { "Referer" => Discourse.base_url, "Content-Type" => "application/json" }

          Net::HTTP::Post.new(model_uri, headers).tap { |r| r.body = payload }
        end

        def extract_completion_from(response_raw)
          parsed = JSON.parse(response_raw, symbolize_names: true).dig(:choices, 0)
          # half a line sent here
          return if !parsed

          parsed.dig(:text)
        end

        def partials_from(decoded_chunk)
          decoded_chunk
            .split("\n")
            .map do |line|
              data = line.split("data: ", 2)[1]
              data == "[DONE]" ? nil : data
            end
            .compact
        end
      end
    end
  end
end
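To make the streaming path concrete: vLLM streams completions as OpenAI-style server-sent events, one "data: {json}" line per delta, terminated by "data: [DONE]". A small sketch of what partials_from does with one decoded chunk follows; the sample payload is illustrative, not captured from a real server.

require "json"

chunk = <<~CHUNK
  data: {"choices":[{"text":"Hello","index":0}]}

  data: {"choices":[{"text":" world","index":0}]}

  data: [DONE]
CHUNK

# Same splitting logic as Vllm#partials_from: keep what follows each
# "data: " prefix, drop blank lines and the terminating "[DONE]" marker.
partials =
  chunk
    .split("\n")
    .map do |line|
      data = line.split("data: ", 2)[1]
      data == "[DONE]" ? nil : data
    end
    .compact

partials.each { |data| print JSON.parse(data).dig("choices", 0, "text") }
# prints "Hello world"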


@@ -0,0 +1,25 @@
# frozen_string_literal: true

module DiscourseAi
  module Summarization
    module Models
      class Mixtral < Base
        def display_name
          "MistralAI's #{model}"
        end

        def correctly_configured?
          SiteSetting.ai_hugging_face_api_url.present? || SiteSetting.ai_vllm_endpoint_srv.present?
        end

        def configuration_hint
          I18n.t(
            "discourse_ai.summarization.configuration_hint",
            count: 1,
            settings: %w[ai_hugging_face_api_url ai_vllm_endpoint_srv],
          )
        end
      end
    end
  end
end


@@ -0,0 +1,11 @@
# frozen_string_literal: true

module DiscourseAi
  module Tokenizer
    class MixtralTokenizer < BasicTokenizer
      def self.tokenizer
        @@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/mixtral.json")
      end
    end
  end
end
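As a quick sanity check, the vendored tokenizer file can be loaded directly with the tokenizers gem, which is the same call MixtralTokenizer.tokenizer makes; the path below assumes a standard Discourse checkout with the plugin installed.

require "tokenizers"

# Same file the new MixtralTokenizer loads; adjust the path to your checkout.
tokenizer = Tokenizers.from_file("./plugins/discourse-ai/tokenizers/mixtral.json")

encoding = tokenizer.encode("Destiny favors repetitions, variants, symmetries.")
puts encoding.ids.length    # token count
puts encoding.tokens.inspect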


@@ -8,7 +8,7 @@
# url: https://meta.discourse.org/t/discourse-ai/259214
# required_version: 2.7.0
-gem "tokenizers", "0.3.3"
+gem "tokenizers", "0.4.2"
gem "tiktoken_ruby", "0.0.5"

enabled_site_setting :discourse_ai_enabled


@@ -0,0 +1,189 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::Dialects::Mixtral do
subject(:dialect) { described_class.new(prompt, "mistralai/Mixtral-8x7B-Instruct-v0.1") }
let(:tool) do
{
name: "get_weather",
description: "Get the weather in a city",
parameters: [
{ name: "location", type: "string", description: "the city name", required: true },
{
name: "unit",
type: "string",
description: "the unit of measurement celcius c or fahrenheit f",
enum: %w[c f],
required: true,
},
],
}
end
let(:prompt) do
{
insts: <<~TEXT,
I want you to act as a title generator for written pieces. I will provide you with a text,
and you will generate five attention-grabbing titles. Please keep the title concise and under 20 words,
and ensure that the meaning is maintained. Replies will utilize the language type of the topic.
TEXT
input: <<~TEXT,
Here is the text, inside <input></input> XML tags:
<input>
To perfect his horror, Caesar, surrounded at the base of the statue by the impatient daggers of his friends,
discovers among the faces and blades that of Marcus Brutus, his protege, perhaps his son, and he no longer
defends himself, but instead exclaims: 'You too, my son!' Shakespeare and Quevedo capture the pathetic cry.
Destiny favors repetitions, variants, symmetries; nineteen centuries later, in the southern province of Buenos Aires,
a gaucho is attacked by other gauchos and, as he falls, recognizes a godson of his and says with gentle rebuke and
slow surprise (these words must be heard, not read): 'But, my friend!' He is killed and does not know that he
dies so that a scene may be repeated.
</input>
TEXT
post_insts:
"Please put the translation between <ai></ai> tags and separate each title with a comma.",
}
end
describe "#translate" do
it "translates a prompt written in our generic format to the Open AI format" do
orca_style_version = <<~TEXT
<s> [INST]
#{prompt[:insts]}
#{prompt[:post_insts]}
[/INST] Ok </s>
[INST] #{prompt[:input]} [/INST]
TEXT
translated = dialect.translate
expect(translated).to eq(orca_style_version)
end
it "include examples in the translated prompt" do
prompt[:examples] = [
[
"<input>In the labyrinth of time, a solitary horse, etched in gold by the setting sun, embarked on an infinite journey.</input>",
"<ai>The solitary horse.,The horse etched in gold.,A horse's infinite journey.,A horse lost in time.,A horse's last ride.</ai>",
],
]
orca_style_version = <<~TEXT
<s> [INST]
#{prompt[:insts]}
#{prompt[:post_insts]}
[/INST] Ok </s>
[INST] #{prompt[:examples][0][0]} [/INST]
#{prompt[:examples][0][1]}
[INST] #{prompt[:input]} [/INST]
TEXT
translated = dialect.translate
expect(translated).to eq(orca_style_version)
end
it "include tools inside the prompt" do
prompt[:tools] = [tool]
orca_style_version = <<~TEXT
<s> [INST]
#{prompt[:insts]}
In this environment you have access to a set of tools you can use to answer the user's question.
You may call them like this. Only invoke one function at a time and wait for the results before invoking another function:
<function_calls>
<invoke>
<tool_name>$TOOL_NAME</tool_name>
<parameters>
<$PARAMETER_NAME>$PARAMETER_VALUE</$PARAMETER_NAME>
...
</parameters>
</invoke>
</function_calls>
Here are the tools available:
<tools>
#{dialect.tools}</tools>
#{prompt[:post_insts]}
[/INST] Ok </s>
[INST] #{prompt[:input]} [/INST]
TEXT
translated = dialect.translate
expect(translated).to eq(orca_style_version)
end
end
describe "#conversation_context" do
let(:context) do
[
{ type: "user", name: "user1", content: "This is a new message by a user" },
{ type: "assistant", content: "I'm a previous bot reply, that's why there's no user" },
{ type: "tool", name: "tool_id", content: "I'm a tool result" },
]
end
it "adds conversation in reverse order (first == newer)" do
prompt[:conversation_context] = context
expected = <<~TEXT
<function_results>
<result>
<tool_name>tool_id</tool_name>
<json>
#{context.last[:content]}
</json>
</result>
</function_results>
#{context.second[:content]}
[INST] #{context.first[:content]}
[/INST]
TEXT
translated_context = dialect.conversation_context
expect(translated_context.strip).to eq(expected.strip)
end
it "trims content if it's getting too long" do
context.last[:content] = context.last[:content] * 6_000
prompt[:conversation_context] = context
translated_context = dialect.conversation_context
expect(translated_context.length).to be < context.last[:content].length
end
end
describe "#tools" do
it "translates tools to the tool syntax" do
prompt[:tools] = [tool]
translated_tool = <<~TEXT
<tool_description>
<tool_name>get_weather</tool_name>
<description>Get the weather in a city</description>
<parameters>
<parameter>
<name>location</name>
<type>string</type>
<description>the city name</description>
<required>true</required>
</parameter>
<parameter>
<name>unit</name>
<type>string</type>
<description>the unit of measurement celcius c or fahrenheit f</description>
<required>true</required>
<options>c,f</options>
</parameter>
</parameters>
</tool_description>
TEXT
expect(dialect.tools).to eq(translated_tool)
end
end
end


@@ -0,0 +1,93 @@
# frozen_string_literal: true
require_relative "endpoint_examples"
RSpec.describe DiscourseAi::Completions::Endpoints::Vllm do
subject(:model) { described_class.new(model_name, DiscourseAi::Tokenizer::MixtralTokenizer) }
let(:model_name) { "mistralai/Mixtral-8x7B-Instruct-v0.1" }
let(:generic_prompt) { { insts: "You are a helpful bot.", input: "write 3 words" } }
let(:dialect) { DiscourseAi::Completions::Dialects::Mixtral.new(generic_prompt, model_name) }
let(:prompt) { dialect.translate }
let(:request_body) { model.default_options.merge(prompt: prompt).to_json }
let(:stream_request_body) { model.default_options.merge(prompt: prompt, stream: true).to_json }
before { SiteSetting.ai_vllm_endpoint = "https://test.dev" }
def response(content)
{
id: "cmpl-6sZfAb30Rnv9Q7ufzFwvQsMpjZh8S",
object: "text_completion",
created: 1_678_464_820,
model: model_name,
usage: {
prompt_tokens: 337,
completion_tokens: 162,
total_tokens: 499,
},
choices: [{ text: content, finish_reason: "stop", index: 0 }],
}
end
def stub_response(prompt, response_text, tool_call: false)
WebMock
.stub_request(:post, "#{SiteSetting.ai_vllm_endpoint}/v1/completions")
.with(body: request_body)
.to_return(status: 200, body: JSON.dump(response(response_text)))
end
def stream_line(delta, finish_reason: nil)
+"data: " << {
id: "cmpl-#{SecureRandom.hex}",
created: 1_681_283_881,
model: model_name,
choices: [{ text: delta, finish_reason: finish_reason, index: 0 }],
index: 0,
}.to_json
end
def stub_streamed_response(prompt, deltas, tool_call: false)
chunks =
deltas.each_with_index.map do |_, index|
if index == (deltas.length - 1)
stream_line(deltas[index], finish_reason: "stop_sequence")
else
stream_line(deltas[index])
end
end
chunks = (chunks.join("\n\n") << "data: [DONE]").split("")
WebMock
.stub_request(:post, "#{SiteSetting.ai_vllm_endpoint}/v1/completions")
.with(body: stream_request_body)
.to_return(status: 200, body: chunks)
end
let(:tool_deltas) { ["<function", <<~REPLY, <<~REPLY] }
_calls>
<invoke>
<tool_name>get_weather</tool_name>
<parameters>
<location>Sydney</location>
<unit>c</unit>
</parameters>
</invoke>
</function_calls>
REPLY
<function_calls>
<invoke>
<tool_name>get_weather</tool_name>
<parameters>
<location>Sydney</location>
<unit>c</unit>
</parameters>
</invoke>
</function_calls>
REPLY
let(:tool_call) { invocation }
it_behaves_like "an endpoint that can communicate with a completion service"
end


@@ -21,3 +21,7 @@ Licensed under MIT License
## bge-large-en

Licensed under MIT License

+## mixtral
+
+Licensed under Apache 2.0 License

tokenizers/mixtral.json (new file, 91,122 lines)

File diff suppressed because it is too large.