discourse-ai/spec/lib/completions/endpoints/vllm_spec.rb

# frozen_string_literal: true
require_relative "endpoint_compliance"
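
# Mocks a vLLM server for these specs: builds canned OpenAI-compatible chat
# completion payloads and stubs https://test.dev/v1/chat/completions via
# WebMock for both regular and streamed requests.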
class VllmMock < EndpointMock
  def response(content)
    {
      id: "cmpl-6sZfAb30Rnv9Q7ufzFwvQsMpjZh8S",
      object: "chat.completion",
      created: 1_678_464_820,
      model: "mistralai/Mixtral-8x7B-Instruct-v0.1",
      usage: {
        prompt_tokens: 337,
        completion_tokens: 162,
        total_tokens: 499,
      },
      choices: [
        { message: { role: "assistant", content: content }, finish_reason: "stop", index: 0 },
      ],
    }
  end
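
  # Stubs the non-streaming /v1/chat/completions call, returning
  # `response_text` wrapped in the canned payload above.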
  def stub_response(prompt, response_text, tool_call: false)
    WebMock
      .stub_request(:post, "https://test.dev/v1/chat/completions")
      .with(body: model.default_options.merge(messages: prompt).to_json)
      .to_return(status: 200, body: JSON.dump(response(response_text)))
  end
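
  # Formats a single SSE chunk ("data: <json>") carrying one content delta.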
  def stream_line(delta, finish_reason: nil)
    +"data: " << {
      id: "cmpl-#{SecureRandom.hex}",
      created: 1_681_283_881,
      model: "mistralai/Mixtral-8x7B-Instruct-v0.1",
      choices: [{ delta: { content: delta } }],
      index: 0,
    }.to_json
  end
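
  # Stubs a streaming completion: each delta becomes its own SSE event,
  # terminated by a "data: [DONE]" marker, and the body is split into single
  # characters so the endpoint's chunked parsing is exercised.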
  def stub_streamed_response(prompt, deltas, tool_call: false)
    chunks =
      deltas.each_with_index.map do |_, index|
        if index == (deltas.length - 1)
          stream_line(deltas[index], finish_reason: "stop_sequence")
        else
          stream_line(deltas[index])
        end
      end

    # SSE events are separated by a blank line, including the final [DONE] marker.
    chunks = (chunks.join("\n\n") << "\n\ndata: [DONE]").split("")

    WebMock
      .stub_request(:post, "https://test.dev/v1/chat/completions")
      .with(body: model.default_options.merge(messages: prompt, stream: true).to_json)
      .to_return(status: 200, body: chunks)
  end
end

RSpec.describe DiscourseAi::Completions::Endpoints::Vllm do
  subject(:endpoint) { described_class.new(llm_model) }

  fab!(:llm_model) { Fabricate(:vllm_model) }
  fab!(:user)

  let(:llm) { DiscourseAi::Completions::Llm.proxy("custom:#{llm_model.id}") }
  let(:vllm_mock) { VllmMock.new(endpoint) }

  let(:compliance) do
    EndpointsCompliance.new(
      self,
      endpoint,
      DiscourseAi::Completions::Dialects::OpenAiCompatible,
      user,
    )
  end

  let(:dialect) do
    DiscourseAi::Completions::Dialects::OpenAiCompatible.new(generic_prompt, llm_model)
  end
  let(:prompt) { dialect.translate }

  let(:request_body) { model.default_options.merge(messages: prompt).to_json }
  let(:stream_request_body) { model.default_options.merge(messages: prompt, stream: true).to_json }
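
  # The mocked completion embeds an XML <function_calls> block; the endpoint
  # should normalize it (adding a tool_id) and drop any trailing text.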
describe "tool support" do
it "is able to invoke XML tools correctly" do
xml = <<~XML
<function_calls>
<invoke>
<tool_name>calculate</tool_name>
<parameters>
<expression>1+1</expression></parameters>
</invoke>
</function_calls>
should be ignored
XML

      body = {
        id: "chatcmpl-6sZfAb30Rnv9Q7ufzFwvQsMpjZh8S",
        object: "chat.completion",
        created: 1_678_464_820,
        model: "gpt-3.5-turbo-0301",
        usage: {
          prompt_tokens: 337,
          completion_tokens: 162,
          total_tokens: 499,
        },
        choices: [
          { message: { role: "assistant", content: xml }, finish_reason: "stop", index: 0 },
        ],
      }

      tool = {
        name: "calculate",
        description: "calculate something",
        parameters: [
          {
            name: "expression",
            type: "string",
            description: "expression to calculate",
            required: true,
          },
        ],
      }

      stub_request(:post, "https://test.dev/v1/chat/completions").to_return(
        status: 200,
        body: body.to_json,
      )

      prompt =
        DiscourseAi::Completions::Prompt.new(
          "You are a calculator",
          messages: [{ type: :user, id: "user1", content: "calculate 2758975 + 21.11" }],
          tools: [tool],
        )

      result = llm.generate(prompt, user: Discourse.system_user)

      expected = <<~TEXT
        <function_calls>
        <invoke>
        <tool_name>calculate</tool_name>
        <parameters>
        <expression>1+1</expression></parameters>
        <tool_id>tool_0</tool_id>
        </invoke>
        </function_calls>
      TEXT

      expect(result.strip).to eq(expected.strip)
    end
  end
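
  # Shared scenarios from endpoint_compliance.rb covering regular and
  # streaming completion.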
describe "#perform_completion!" do
context "when using regular mode" do
context "with simple prompts" do
it "completes a trivial prompt and logs the response" do
compliance.regular_mode_simple_prompt(vllm_mock)
end
end
context "with tools" do
it "returns a function invocation" do
compliance.regular_mode_tools(vllm_mock)
end
end
end
describe "when using streaming mode" do
context "with simple prompts" do
it "completes a trivial prompt and logs the response" do
compliance.streaming_mode_simple_prompt(vllm_mock)
end
end
context "with tools" do
it "returns a function invoncation" do
compliance.streaming_mode_tools(vllm_mock)
end
end
end
end
end