Sam e817b7dc11
FEATURE: improve tool support (#904)
This re-implements tool support in DiscourseAi::Completions::Llm#generate.

Previously, tool calls were always returned as XML, and it was the caller's responsibility to parse that XML.

The new implementation has the endpoints return ToolCall objects instead.
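
For callers, the difference looks roughly like this (a sketch only; the exact proxy/generate signatures and the ToolCall attributes shown are assumptions based on this commit message, not a documented contract):

    # before: callers had to dig tool invocations out of XML in the raw completion text
    # after: streamed partials can be plain strings or ToolCall objects

    llm = DiscourseAi::Completions::Llm.proxy("custom:#{llm_model.id}") # assumed lookup helper

    llm.generate(prompt, user: Discourse.system_user) do |partial|
      case partial
      when DiscourseAi::Completions::ToolCall
        # structured tool invocation, no XML parsing needed
        invoke_tool(partial.name, partial.parameters) # invoke_tool is a hypothetical caller-side helper
      when String
        append_to_reply(partial) # hypothetical helper that accumulates plain text
      end
    end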

Additionally, this simplifies the Llm endpoint interface and gives it more clarity. Llms must implement:

decode, decode_chunk (for streaming)

It is the implementer's responsibility to figure out how to decode chunks; the base class no longer implements this. To make this easy we ship a flexible JSON stream decoder which is easy to wire up, as sketched below.
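
A minimal endpoint sketch, assuming the base class feeds raw response fragments into decode_chunk and that JsonStreamDecoder yields fully parsed, symbolized hashes (MyEndpoint is a hypothetical name; the vLLM endpoint below follows the same pattern):

    class MyEndpoint < DiscourseAi::Completions::Endpoints::Base
      # non-streaming: parse the full response body in one go
      def decode(response_raw)
        json = JSON.parse(response_raw, symbolize_names: true)
        [json.dig(:choices, 0, :message, :content)]
      end

      # streaming: buffer partial chunks until complete JSON objects are available
      def decode_chunk(chunk)
        @json_decoder ||= JsonStreamDecoder.new
        (@json_decoder << chunk)
          .map { |parsed| parsed.dig(:choices, 0, :delta, :content) }
          .compact
      end
    end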

Also (new)

    Better debugging for PMs: we now have next / previous buttons to see all the Llm messages associated with a PM
    Token accounting is fixed for vLLM (we were not counting tokens correctly)
2024-11-12 08:14:30 +11:00

103 lines
3.0 KiB
Ruby

# frozen_string_literal: true

module DiscourseAi
  module Completions
    module Endpoints
      class Vllm < Base
        def self.can_contact?(model_provider)
          model_provider == "vllm"
        end

        def normalize_model_params(model_params)
          model_params = model_params.dup

          # max_tokens, temperature are already supported
          if model_params[:stop_sequences]
            model_params[:stop] = model_params.delete(:stop_sequences)
          end

          model_params
        end

        def default_options
          { max_tokens: 2000, model: llm_model.name }
        end

        def provider_id
          AiApiAuditLog::Provider::Vllm
        end
        private

        # resolve srv:// URLs via a DNS SRV lookup, otherwise use the configured URL as-is
        def model_uri
          if llm_model.url.to_s.starts_with?("srv://")
            service = DiscourseAi::Utils::DnsSrv.lookup(llm_model.url.sub("srv://", ""))
            api_endpoint = "https://#{service.target}:#{service.port}/v1/chat/completions"
          else
            api_endpoint = llm_model.url
          end
          @uri ||= URI(api_endpoint)
        end
        def prepare_payload(prompt, model_params, dialect)
          payload = default_options.merge(model_params).merge(messages: prompt)

          if @streaming_mode
            payload[:stream] = true
            # ask vLLM to report token usage in the streamed chunks
            payload[:stream_options] = { include_usage: true }
          end

          payload
        end
        def prepare_request(payload)
          headers = { "Referer" => Discourse.base_url, "Content-Type" => "application/json" }

          api_key = llm_model&.api_key || SiteSetting.ai_vllm_api_key
          headers["X-API-KEY"] = api_key if api_key.present?

          Net::HTTP::Post.new(model_uri, headers).tap { |r| r.body = payload }
        end
        def xml_tools_enabled?
          true
        end

        def final_log_update(log)
          log.request_tokens = @prompt_tokens if @prompt_tokens
          log.response_tokens = @completion_tokens if @completion_tokens
        end
        def decode(response_raw)
          json = JSON.parse(response_raw, symbolize_names: true)

          # record usage for the audit log (see final_log_update)
          @prompt_tokens = json.dig(:usage, :prompt_tokens)
          @completion_tokens = json.dig(:usage, :completion_tokens)

          [json.dig(:choices, 0, :message, :content)]
        end

        def decode_chunk(chunk)
          @json_decoder ||= JsonStreamDecoder.new
          (@json_decoder << chunk)
            .map do |parsed|
              # vLLM keeps sending usage over and over again
              prompt_tokens = parsed.dig(:usage, :prompt_tokens)
              completion_tokens = parsed.dig(:usage, :completion_tokens)

              @prompt_tokens = prompt_tokens if prompt_tokens
              @completion_tokens = completion_tokens if completion_tokens

              text = parsed.dig(:choices, 0, :delta, :content)

              # skip chunks with no content delta (e.g. usage-only or role-only chunks)
              if text.to_s.empty?
                nil
              else
                text
              end
            end
            .compact
        end
      end
    end
  end
end