FEATURE: Add periodic problem checks for each LLM in use (#1020)

This feature adds a periodic problem check that tests whether the LLMs currently in use are still operational. If one is not, the affected LLM is surfaced to the admin so they can easily update its configuration.
Keegan George 2024-12-17 08:00:05 +09:00 committed by GitHub
parent 24b107881a
commit 90ce942108
5 changed files with 136 additions and 0 deletions


@@ -13,6 +13,11 @@ class LlmModel < ActiveRecord::Base
  validates_presence_of :name, :api_key
  validates :max_prompt_tokens, numericality: { greater_than: 0 }
  validate :required_provider_params

  scope :in_use,
        -> do
          model_ids = DiscourseAi::Configuration::LlmEnumerator.global_usage.keys
          where(id: model_ids)
        end

  def self.provider_params
    {
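
The new in_use scope limits the check to models that some site feature actually references (via LlmEnumerator.global_usage). A rough sketch of how it could be poked at from a Rails console — the sample output is illustrative, not from the commit:

  # Models referenced by at least one feature (summarization, AI bot, etc.)
  LlmModel.in_use.map { |m| [m.id, m.display_name] }
  # => [[1, "GPT-4 Turbo"], [7, "Claude 3.5 Sonnet"]]  (illustrative)

  # Configured models that nothing references are skipped by the status check.
  LlmModel.where.not(id: LlmModel.in_use.select(:id)).count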


@@ -0,0 +1,58 @@
# frozen_string_literal: true

class ProblemCheck::AiLlmStatus < ProblemCheck
  self.priority = "high"
  self.perform_every = 6.hours

  def call
    llm_errors
  end

  def base_path
    Discourse.base_path
  end

  private

  def llm_errors
    return [] if !SiteSetting.discourse_ai_enabled
    LlmModel.in_use.find_each.filter_map do |model|
      try_validate(model) { validator.run_test(model) }
    end
  end

  def try_validate(model, &blk)
    begin
      blk.call
      nil
    rescue => e
      error_message = parse_error_message(e.message)
      message =
        "#{I18n.t("dashboard.problem.ai_llm_status", { base_path: base_path, model_name: model.display_name, model_id: model.id })}"

      Problem.new(
        message,
        priority: "high",
        identifier: "ai_llm_status",
        target: model.id,
        details: {
          model_id: model.id,
          model_name: model.display_name,
          error: error_message,
        },
      )
    end
  end

  def validator
    @validator ||= DiscourseAi::Configuration::LlmValidator.new
  end

  def parse_error_message(message)
    begin
      JSON.parse(message)["message"]
    rescue JSON::ParserError
      message.to_s
    end
  end
end
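
The scheduler runs this check every six hours (per perform_every), but it can also be exercised directly; a minimal sketch, assuming a Rails console on a site with discourse_ai_enabled:

  check = ProblemCheck::AiLlmStatus.new
  problems = check.call # => [] when every in-use LLM passes the validator's test

  # Each entry is the Problem built in try_validate above, carrying both the
  # admin-facing message and machine-readable details about the failing model.
  problems.each do |problem|
    puts problem.details[:model_name]
    puts problem.details[:error]
  end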


@@ -453,3 +453,6 @@ en:
      no_default_llm: The persona must have a default_llm defined.
      user_not_allowed: The user is not allowed to participate in the topic.
      prompt_message_length: The message %{idx} is over the 1000 character limit.
  dashboard:
    problem:
      ai_llm_status: "The LLM model: %{model_name} is encountering issues. Please check the <a href='%{base_path}/admin/plugins/discourse-ai/ai-llms/%{model_id}'>model's configuration page</a>."
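
The translation interpolates the model's name and a direct link to its admin configuration page; a quick sketch of how it renders, with illustrative values:

  I18n.t(
    "dashboard.problem.ai_llm_status",
    base_path: "",             # a subfolder install would pass e.g. "/forum"
    model_name: "GPT-4 Turbo", # illustrative
    model_id: 1,
  )
  # => "The LLM model: GPT-4 Turbo is encountering issues. Please check the
  #     <a href='/admin/plugins/discourse-ai/ai-llms/1'>model's configuration page</a>."

Because the href is built from base_path, the link stays correct on subfolder installs.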


@@ -75,6 +75,8 @@ after_initialize do
    DiscourseAi::AiModeration::EntryPoint.new,
  ].each { |a_module| a_module.inject_into(self) }

  register_problem_check ProblemCheck::AiLlmStatus

  register_reviewable_type ReviewableAiChatMessage
  register_reviewable_type ReviewableAiPost


@@ -0,0 +1,68 @@
# frozen_string_literal: true

require "rails_helper"

RSpec.describe ProblemCheck::AiLlmStatus do
  subject(:check) { described_class.new }

  fab!(:llm_model)

  let(:post_url) { "https://api.openai.com/v1/chat/completions" }

  let(:success_response) do
    {
      model: "gpt-4-turbo",
      usage: {
        max_prompt_tokens: 131_072,
      },
      choices: [
        { message: { role: "assistant", content: "test" }, finish_reason: "stop", index: 0 },
      ],
    }.to_json
  end

  let(:error_response) do
    { message: "API key error! Please check you have supplied the correct API key." }.to_json
  end

  before do
    stub_request(:post, post_url).to_return(status: 200, body: success_response, headers: {})
    SiteSetting.ai_summarization_model = "custom:#{llm_model.id}"
    SiteSetting.ai_summarization_enabled = true
  end

  describe "#call" do
    it "does nothing if discourse-ai plugin disabled" do
      SiteSetting.discourse_ai_enabled = false
      expect(check).to be_chill_about_it
    end

    context "with discourse-ai plugin enabled for the site" do
      before { SiteSetting.discourse_ai_enabled = true }

      it "returns a problem with an LLM model" do
        stub_request(:post, post_url).to_return(status: 403, body: error_response, headers: {})
        message =
          "#{I18n.t("dashboard.problem.ai_llm_status", { base_path: Discourse.base_path, model_name: llm_model.display_name, model_id: llm_model.id })}"

        expect(described_class.new.call).to contain_exactly(
          have_attributes(
            identifier: "ai_llm_status",
            target: llm_model.id,
            priority: "high",
            message: message,
            details: {
              model_id: llm_model.id,
              model_name: llm_model.display_name,
              error: JSON.parse(error_response)["message"],
            },
          ),
        )
      end

      it "does not return a problem if the LLM models are working" do
        stub_request(:post, post_url).to_return(status: 200, body: success_response, headers: {})
        expect(check).to be_chill_about_it
      end
    end
  end
end