# frozen_string_literal: true

require_relative "endpoint_compliance"
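
# WebMock-backed mock of the Hugging Face inference API, driven by the shared
# EndpointsCompliance helpers below.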
class HuggingFaceMock < EndpointMock
  def response(content)
    [{ generated_text: content }]
  end
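
  # Stubs a single, non-streaming completion response for the given prompt.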
  def stub_response(prompt, response_text, tool_call: false)
    WebMock
      .stub_request(:post, "#{SiteSetting.ai_hugging_face_api_url}")
      .with(body: request_body(prompt))
      .to_return(status: 200, body: JSON.dump(response(response_text)))
  end
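
  # Builds one SSE "data:" line shaped like a Hugging Face TGI streaming token.
  # The unary + dups the frozen string literal so << can append the JSON.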
  def stream_line(delta, deltas, finish_reason: nil)
    +"data: " << {
      token: {
        id: 29_889,
        text: delta,
        logprob: -0.08319092,
        special: !!finish_reason,
      },
      generated_text: finish_reason ? deltas.join : nil,
      details: nil,
    }.to_json
  end
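
  # Stubs a streaming response: one SSE line per delta, with the last line
  # flagged as final and carrying the joined generated_text.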
  def stub_streamed_response(prompt, deltas, tool_call: false)
    chunks =
      deltas.each_with_index.map do |_, index|
        if index == (deltas.length - 1)
          stream_line(deltas[index], deltas, finish_reason: true)
        else
          stream_line(deltas[index], deltas)
        end
      end
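
    # Join the lines, append the [DONE] terminator, then split the body into
    # single characters so the streaming parser is exercised on partial chunks.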
    chunks = (chunks.join("\n\n") << "data: [DONE]").split("")

    WebMock
      .stub_request(:post, "#{SiteSetting.ai_hugging_face_api_url}")
      .with(body: request_body(prompt, stream: true))
      .to_return(status: 200, body: chunks)
  end
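
  # Mirrors the request payload the endpoint is expected to send: the model's
  # default options merged with the prompt, max_new_tokens set to the configured
  # token limit (default 4000) minus the prompt size, and stream: true when streaming.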
  def request_body(prompt, stream: false)
    model
      .default_options
      .merge(inputs: prompt)
      .tap do |payload|
        payload[:parameters][:max_new_tokens] = (SiteSetting.ai_hugging_face_token_limit || 4_000) -
          model.prompt_size(prompt)
        payload[:stream] = true if stream
      end
      .to_json
  end
end

RSpec.describe DiscourseAi::Completions::Endpoints::HuggingFace do
  subject(:endpoint) do
    described_class.new("Llama2-*-chat-hf", DiscourseAi::Tokenizer::Llama2Tokenizer)
  end

  before { SiteSetting.ai_hugging_face_api_url = "https://test.dev" }

  fab!(:user) { Fabricate(:user) }

  let(:hf_mock) { HuggingFaceMock.new(endpoint) }
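
  # Drives the shared endpoint compliance scenarios against this endpoint
  # through the Llama2Classic dialect.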
  let(:compliance) do
    EndpointsCompliance.new(self, endpoint, DiscourseAi::Completions::Dialects::Llama2Classic, user)
  end
describe "#perform_completion!" do
context "when using regular mode" do
context "with simple prompts" do
it "completes a trivial prompt and logs the response" do
compliance.regular_mode_simple_prompt(hf_mock)
end
end
context "with tools" do
it "returns a function invocation" do
compliance.regular_mode_tools(hf_mock)
end
end
end
describe "when using streaming mode" do
context "with simple prompts" do
it "completes a trivial prompt and logs the response" do
compliance.streaming_mode_simple_prompt(hf_mock)
end
end
context "with tools" do
it "returns a function invocation" do
compliance.streaming_mode_tools(hf_mock)
end
end
end
end
end