discourse-ai/spec/lib/completions/endpoints/vllm_spec.rb

# frozen_string_literal: true

require_relative "endpoint_compliance"

# WebMock-backed stand-in for a vLLM server exposing the OpenAI-style
# chat completions route. Builds canned regular and streamed responses.
class VllmMock < EndpointMock
  # Builds the body of a non-streamed chat completion.
  #
  # @param content [Object] value placed in the assistant message's content
  # @return [Hash] response payload with fixed id, model and usage figures
  def response(content)
    token_usage = { prompt_tokens: 337, completion_tokens: 162, total_tokens: 499 }
    assistant_message = { role: "assistant", content: content }

    {
      id: "cmpl-6sZfAb30Rnv9Q7ufzFwvQsMpjZh8S",
      object: "chat.completion",
      created: 1_678_464_820,
      model: "mistralai/Mixtral-8x7B-Instruct-v0.1",
      usage: token_usage,
      choices: [{ message: assistant_message, finish_reason: "stop", index: 0 }],
    }
  end

  # Stubs a regular (non-streamed) completion request.
  #
  # @param prompt [Object] messages expected in the request body
  # @param response_text [Object] content returned in the stubbed completion
  # @param tool_call [Boolean] accepted but not used by this mock
  def stub_response(prompt, response_text, tool_call: false)
    # The stub only matches when the request body equals this exact JSON string.
    expected_body = model.default_options.merge(messages: prompt).to_json
    canned_body = JSON.dump(response(response_text))

    stub = WebMock.stub_request(:post, "https://test.dev/v1/chat/completions")
    stub.with(body: expected_body).to_return(status: 200, body: canned_body)
  end

  # Builds one "data: <json>" line of a streamed response.
  #
  # @param delta [Object] partial content carried by this chunk
  # @param finish_reason [String, nil] accepted, but not included in the
  #   generated payload
  # @return [String] unfrozen string holding the event line
  def stream_line(delta, finish_reason: nil)
    payload = {
      id: "cmpl-#{SecureRandom.hex}",
      created: 1_681_283_881,
      model: "mistralai/Mixtral-8x7B-Instruct-v0.1",
      choices: [{ delta: { content: delta } }],
      index: 0,
    }

    line = +"data: "
    line << payload.to_json
    line
  end

  # Stubs a streamed completion request whose body is delivered one
  # character at a time.
  #
  # @param prompt [Object] messages expected in the request body
  # @param deltas [Array] partial contents, one per streamed chunk
  # @param tool_call [Boolean] accepted but not used by this mock
  def stub_streamed_response(prompt, deltas, tool_call: false)
    final_position = deltas.length - 1

    lines =
      deltas.each_with_index.map do |delta, position|
        next stream_line(delta) unless position == final_position

        stream_line(delta, finish_reason: "stop_sequence")
      end

    # Lines are separated by blank lines; the terminator is appended directly
    # after the last line, with no separator in between.
    stream = lines.join("\n\n")
    stream << "data: [DONE]"
    characters = stream.split("")

    expected_body = model.default_options.merge(messages: prompt, stream: true).to_json

    stub = WebMock.stub_request(:post, "https://test.dev/v1/chat/completions")
    stub.with(body: expected_body).to_return(status: 200, body: characters)
  end
end

# Runs the shared EndpointsCompliance scenarios (regular and streaming mode,
# with and without tools) against the vLLM endpoint, using VllmMock to stub
# the HTTP traffic.
RSpec.describe DiscourseAi::Completions::Endpoints::Vllm do
  subject(:endpoint) { described_class.new(llm_model) }

  fab!(:llm_model) { Fabricate(:vllm_model) }

  fab!(:user)

  # Mock server for the endpoint under test.
  let(:vllm_mock) { VllmMock.new(endpoint) }

  let(:compliance) do
    EndpointsCompliance.new(
      self,
      endpoint,
      DiscourseAi::Completions::Dialects::OpenAiCompatible,
      user,
    )
  end

  # NOTE(review): `generic_prompt` and `model` are not defined in this file and
  # none of the examples below reference these helpers directly — confirm they
  # are still needed (and resolvable) or remove them.
  let(:dialect) do
    DiscourseAi::Completions::Dialects::OpenAiCompatible.new(generic_prompt, llm_model)
  end
  let(:prompt) { dialect.translate }

  let(:request_body) { model.default_options.merge(messages: prompt).to_json }
  let(:stream_request_body) { model.default_options.merge(messages: prompt, stream: true).to_json }

  describe "#perform_completion!" do
    context "when using regular mode" do
      context "with simple prompts" do
        it "completes a trivial prompt and logs the response" do
          compliance.regular_mode_simple_prompt(vllm_mock)
        end
      end

      context "with tools" do
        it "returns a function invocation" do
          compliance.regular_mode_tools(vllm_mock)
        end
      end
    end

    context "when using streaming mode" do
      context "with simple prompts" do
        it "completes a trivial prompt and logs the response" do
          compliance.streaming_mode_simple_prompt(vllm_mock)
        end
      end

      context "with tools" do
        it "returns a function invocation" do
          compliance.streaming_mode_tools(vllm_mock)
        end
      end
    end
  end
end
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`# frozen_string_literal: true`

DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00			`require_relative "endpoint_compliance"`
FIX: Correctly translate and read tools for Claude and Chat GPT. (#393) I tested against the live models for the AI bot migration. It ensures Open AI's tool syntax is correct and we can correctly read the replies. : 2024-01-02 09:21:13 -05:00
DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00			`class VllmMock < EndpointMock`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`def response(content)`
			`{`
			`id: "cmpl-6sZfAb30Rnv9Q7ufzFwvQsMpjZh8S",`
REFACTOR: Migrate Vllm/TGI-served models to the OpenAI format. (#588) Both endpoints provide OpenAI-compatible servers. The only difference is that Vllm doesn't support passing tools as a separate parameter. Even if the tool param is supported, it ultimately relies on the model's ability to handle native functions, which is not the case with the models we have today. As a part of this change, we are dropping support for StableBeluga/Llama2 models. They don't have a chat_template, meaning the new API can translate them. These changes let us remove some of our existing dialects and are a first step in our plan to support any LLM by defining them as data-driven concepts. I rewrote the "translate" method to use a template method and extracted the tool support strategies into its classes to simplify the code. Finally, these changes bring support for Ollama when running in dev mode. It only works with Mistral for now, but it will change soon.. 2024-05-07 09:02:16 -04:00			`object: "chat.completion",`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`created: 1_678_464_820,`
DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00			`model: "mistralai/Mixtral-8x7B-Instruct-v0.1",`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`usage: {`
			`prompt_tokens: 337,`
			`completion_tokens: 162,`
			`total_tokens: 499,`
			`},`
REFACTOR: Migrate Vllm/TGI-served models to the OpenAI format. (#588) Both endpoints provide OpenAI-compatible servers. The only difference is that Vllm doesn't support passing tools as a separate parameter. Even if the tool param is supported, it ultimately relies on the model's ability to handle native functions, which is not the case with the models we have today. As a part of this change, we are dropping support for StableBeluga/Llama2 models. They don't have a chat_template, meaning the new API can translate them. These changes let us remove some of our existing dialects and are a first step in our plan to support any LLM by defining them as data-driven concepts. I rewrote the "translate" method to use a template method and extracted the tool support strategies into its classes to simplify the code. Finally, these changes bring support for Ollama when running in dev mode. It only works with Mistral for now, but it will change soon.. 2024-05-07 09:02:16 -04:00			`choices: [`
			`{ message: { role: "assistant", content: content }, finish_reason: "stop", index: 0 },`
			`],`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`}`
			`end`

			`def stub_response(prompt, response_text, tool_call: false)`
			`WebMock`
DEV: Remove old code now that features rely on LlmModels. (#729) * DEV: Remove old code now that features rely on LlmModels. * Hide old settings and migrate persona llm overrides * Remove shadowing special URL + seeding code. Use srv:// prefix instead. 2024-07-30 12:44:57 -04:00			`.stub_request(:post, "https://test.dev/v1/chat/completions")`
REFACTOR: Migrate Vllm/TGI-served models to the OpenAI format. (#588) Both endpoints provide OpenAI-compatible servers. The only difference is that Vllm doesn't support passing tools as a separate parameter. Even if the tool param is supported, it ultimately relies on the model's ability to handle native functions, which is not the case with the models we have today. As a part of this change, we are dropping support for StableBeluga/Llama2 models. They don't have a chat_template, meaning the new API can translate them. These changes let us remove some of our existing dialects and are a first step in our plan to support any LLM by defining them as data-driven concepts. I rewrote the "translate" method to use a template method and extracted the tool support strategies into its classes to simplify the code. Finally, these changes bring support for Ollama when running in dev mode. It only works with Mistral for now, but it will change soon.. 2024-05-07 09:02:16 -04:00			`.with(body: model.default_options.merge(messages: prompt).to_json)`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`.to_return(status: 200, body: JSON.dump(response(response_text)))`
			`end`

			`def stream_line(delta, finish_reason: nil)`
			`+"data: " << {`
			`id: "cmpl-#{SecureRandom.hex}",`
			`created: 1_681_283_881,`
DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00			`model: "mistralai/Mixtral-8x7B-Instruct-v0.1",`
REFACTOR: Migrate Vllm/TGI-served models to the OpenAI format. (#588) Both endpoints provide OpenAI-compatible servers. The only difference is that Vllm doesn't support passing tools as a separate parameter. Even if the tool param is supported, it ultimately relies on the model's ability to handle native functions, which is not the case with the models we have today. As a part of this change, we are dropping support for StableBeluga/Llama2 models. They don't have a chat_template, meaning the new API can translate them. These changes let us remove some of our existing dialects and are a first step in our plan to support any LLM by defining them as data-driven concepts. I rewrote the "translate" method to use a template method and extracted the tool support strategies into its classes to simplify the code. Finally, these changes bring support for Ollama when running in dev mode. It only works with Mistral for now, but it will change soon.. 2024-05-07 09:02:16 -04:00			`choices: [{ delta: { content: delta } }],`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`index: 0,`
			`}.to_json`
			`end`

			`def stub_streamed_response(prompt, deltas, tool_call: false)`
			`chunks =`
			`deltas.each_with_index.map do \|_, index\|`
			`if index == (deltas.length - 1)`
			`stream_line(deltas[index], finish_reason: "stop_sequence")`
			`else`
			`stream_line(deltas[index])`
			`end`
			`end`

			`chunks = (chunks.join("\n\n") << "data: [DONE]").split("")`

			`WebMock`
DEV: Remove old code now that features rely on LlmModels. (#729) * DEV: Remove old code now that features rely on LlmModels. * Hide old settings and migrate persona llm overrides * Remove shadowing special URL + seeding code. Use srv:// prefix instead. 2024-07-30 12:44:57 -04:00			`.stub_request(:post, "https://test.dev/v1/chat/completions")`
REFACTOR: Migrate Vllm/TGI-served models to the OpenAI format. (#588) Both endpoints provide OpenAI-compatible servers. The only difference is that Vllm doesn't support passing tools as a separate parameter. Even if the tool param is supported, it ultimately relies on the model's ability to handle native functions, which is not the case with the models we have today. As a part of this change, we are dropping support for StableBeluga/Llama2 models. They don't have a chat_template, meaning the new API can translate them. These changes let us remove some of our existing dialects and are a first step in our plan to support any LLM by defining them as data-driven concepts. I rewrote the "translate" method to use a template method and extracted the tool support strategies into its classes to simplify the code. Finally, these changes bring support for Ollama when running in dev mode. It only works with Mistral for now, but it will change soon.. 2024-05-07 09:02:16 -04:00			`.with(body: model.default_options.merge(messages: prompt, stream: true).to_json)`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`.to_return(status: 200, body: chunks)`
			`end`
DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00			`end`

			`RSpec.describe DiscourseAi::Completions::Endpoints::Vllm do`
DEV: Remove old code now that features rely on LlmModels. (#729) * DEV: Remove old code now that features rely on LlmModels. * Hide old settings and migrate persona llm overrides * Remove shadowing special URL + seeding code. Use srv:// prefix instead. 2024-07-30 12:44:57 -04:00			`subject(:endpoint) { described_class.new(llm_model) }`

			`fab!(:llm_model) { Fabricate(:vllm_model) }`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00
DEV: Fix new Rubocop offenses 2024-03-05 10:48:28 -05:00			`fab!(:user)`
DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00
			`let(:anthropic_mock) { VllmMock.new(endpoint) }`

			`let(:compliance) do`
FIX: Mixtral models have system role support. (#703) Using assistant role for system produces an error because they expect alternating roles like user/assistant/user and so on. Prompts cannot start with the assistant role. 2024-07-04 12:23:03 -04:00			`EndpointsCompliance.new(`
			`self,`
			`endpoint,`
			`DiscourseAi::Completions::Dialects::OpenAiCompatible,`
			`user,`
			`)`
DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00			`end`

FIX: Mixtral models have system role support. (#703) Using assistant role for system produces an error because they expect alternating roles like user/assistant/user and so on. Prompts cannot start with the assistant role. 2024-07-04 12:23:03 -04:00			`let(:dialect) do`
DEV: Remove old code now that features rely on LlmModels. (#729) * DEV: Remove old code now that features rely on LlmModels. * Hide old settings and migrate persona llm overrides * Remove shadowing special URL + seeding code. Use srv:// prefix instead. 2024-07-30 12:44:57 -04:00			`DiscourseAi::Completions::Dialects::OpenAiCompatible.new(generic_prompt, llm_model)`
FIX: Mixtral models have system role support. (#703) Using assistant role for system produces an error because they expect alternating roles like user/assistant/user and so on. Prompts cannot start with the assistant role. 2024-07-04 12:23:03 -04:00			`end`
DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00			`let(:prompt) { dialect.translate }`

REFACTOR: Migrate Vllm/TGI-served models to the OpenAI format. (#588) Both endpoints provide OpenAI-compatible servers. The only difference is that Vllm doesn't support passing tools as a separate parameter. Even if the tool param is supported, it ultimately relies on the model's ability to handle native functions, which is not the case with the models we have today. As a part of this change, we are dropping support for StableBeluga/Llama2 models. They don't have a chat_template, meaning the new API can translate them. These changes let us remove some of our existing dialects and are a first step in our plan to support any LLM by defining them as data-driven concepts. I rewrote the "translate" method to use a template method and extracted the tool support strategies into its classes to simplify the code. Finally, these changes bring support for Ollama when running in dev mode. It only works with Mistral for now, but it will change soon.. 2024-05-07 09:02:16 -04:00			`let(:request_body) { model.default_options.merge(messages: prompt).to_json }`
			`let(:stream_request_body) { model.default_options.merge(messages: prompt, stream: true).to_json }`
DEV: Stop using shared_examples for endpoint specs (#430) 2024-01-17 13:08:49 -05:00
			`describe "#perform_completion!" do`
			`context "when using regular mode" do`
			`context "with simple prompts" do`
			`it "completes a trivial prompt and logs the response" do`
			`compliance.regular_mode_simple_prompt(anthropic_mock)`
			`end`
			`end`

			`context "with tools" do`
			`it "returns a function invocation" do`
			`compliance.regular_mode_tools(anthropic_mock)`
			`end`
			`end`
			`end`

			`describe "when using streaming mode" do`
			`context "with simple prompts" do`
			`it "completes a trivial prompt and logs the response" do`
			`compliance.streaming_mode_simple_prompt(anthropic_mock)`
			`end`
			`end`

			`context "with tools" do`
			`it "returns a function invoncation" do`
			`compliance.streaming_mode_tools(anthropic_mock)`
			`end`
			`end`
			`end`
			`end`
Mixtral (#376) Add both Mistral and Mixtral support. Also includes vLLM-openAI inference support. Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com> 2023-12-26 12:49:55 -05:00			`end`