discourse-ai/evals/lib/eval.rb

#frozen_string_literal: true

class DiscourseAi::Evals::Eval
  attr_reader :type,
              :path,
              :name,
              :description,
              :id,
              :args,
              :vision,
              :expected_output,
              :expected_output_regex,
              :expected_tool_call,
              :judge

  class EvalError < StandardError
    attr_reader :context

    def initialize(message, context)
      super(message)
      @context = context
    end
  end

  def initialize(path:)
    @yaml = YAML.load_file(path).symbolize_keys
    @path = path
    @name = @yaml[:name]
    @id = @yaml[:id]
    @description = @yaml[:description]
    @vision = @yaml[:vision]
    @args = @yaml[:args]&.symbolize_keys
    @type = @yaml[:type]
    @expected_output = @yaml[:expected_output]
    @expected_output_regex = @yaml[:expected_output_regex]
    @expected_output_regex =
      Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
    @expected_tool_call = @yaml[:expected_tool_call]
    @expected_tool_call.symbolize_keys! if @expected_tool_call
    @judge = @yaml[:judge]
    @judge.symbolize_keys! if @judge

    @args.each do |key, value|
      if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
        @args[key] = File.expand_path(File.join(File.dirname(path), value))
      end
    end
  end

  def run(llm:)
    result =
      case type
      when "helper"
        helper(llm, **args)
      when "pdf_to_text"
        pdf_to_text(llm, **args)
      when "image_to_text"
        image_to_text(llm, **args)
      when "prompt"
        prompt_call(llm, **args)
      when "edit_artifact"
        edit_artifact(llm, **args)
      end

    if expected_output
      if result == expected_output
        { result: :pass }
      else
        { result: :fail, expected_output: expected_output, actual_output: result }
      end
    elsif expected_output_regex
      if result.to_s.match?(expected_output_regex)
        { result: :pass }
      else
        { result: :fail, expected_output: expected_output_regex, actual_output: result }
      end
    elsif expected_tool_call
      tool_call = result

      if result.is_a?(Array)
        tool_call = result.find { |r| r.is_a?(DiscourseAi::Completions::ToolCall) }
      end
      if !tool_call.is_a?(DiscourseAi::Completions::ToolCall) ||
           (tool_call.name != expected_tool_call[:name]) ||
           (tool_call.parameters != expected_tool_call[:params])
        { result: :fail, expected_output: expected_tool_call, actual_output: result }
      else
        { result: :pass }
      end
    elsif judge
      judge_result(result)
    else
      { result: :pass }
    end
  rescue EvalError => e
    { result: :fail, message: e.message, context: e.context }
  end

  def print
    puts "#{id}: #{description}"
  end

  def to_json
    {
      type: @type,
      path: @path,
      name: @name,
      description: @description,
      id: @id,
      args: @args,
      vision: @vision,
      expected_output: @expected_output,
      expected_output_regex: @expected_output_regex,
    }.compact
  end

  private

  def judge_result(result)
    prompt = judge[:prompt].dup
    prompt.sub!("{{output}}", result)
    prompt.sub!("{{input}}", args[:input])

    prompt += <<~SUFFIX

      Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.

      example output:

      [RATING]10[/RATING] perfect output

      example output:

      [RATING]5[/RATING]

      the following failed to preserve... etc...
    SUFFIX

    judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first

    DiscourseAi::Completions::Prompt.new(
      "You are an expert judge tasked at testing LLM outputs.",
      messages: [{ type: :user, content: prompt }],
    )

    result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)

    if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
      rating = rating[1].to_i
    end

    if rating.to_i >= judge[:pass_rating]
      { result: :pass }
    else
      {
        result: :fail,
        message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
        context: result,
      }
    end
  end

  def helper(llm, input:, name:, locale: nil)
    completion_prompt = CompletionPrompt.find_by(name: name)
    helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
    user = Discourse.system_user
    if locale
      user = User.new
      class << user
        attr_accessor :effective_locale
      end

      user.effective_locale = locale
      user.admin = true
    end
    result =
      helper.generate_and_send_prompt(
        completion_prompt,
        input,
        current_user = user,
        _force_default_locale = false,
      )

    result[:suggestions].first
  end

  def image_to_text(llm, path:)
    upload =
      UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)

    text = +""
    DiscourseAi::Utils::ImageToText
      .new(upload: upload, llm_model: llm.llm_model, user: Discourse.system_user)
      .extract_text do |chunk, error|
        text << chunk if chunk
        text << "\n\n" if chunk
      end
    text
  ensure
    upload.destroy if upload
  end

  def pdf_to_text(llm, path:)
    upload =
      UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)

    text = +""
    DiscourseAi::Utils::PdfToText
      .new(upload: upload, user: Discourse.system_user, llm_model: llm.llm_model)
      .extract_text do |chunk|
        text << chunk if chunk
        text << "\n\n" if chunk
      end

    text
  ensure
    upload.destroy if upload
  end

  def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
    if tools
      tools.each do |tool|
        tool.symbolize_keys!
        tool[:parameters].symbolize_keys! if tool[:parameters]
      end
    end
    prompt =
      DiscourseAi::Completions::Prompt.new(
        system_prompt,
        messages: [{ type: :user, content: message }],
        tools: tools,
      )

    result = nil
    if stream
      result = []
      llm
        .llm_model
        .to_llm
        .generate(prompt, user: Discourse.system_user) { |partial| result << partial }
    else
      result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
    end
    result
  end

  def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
    css = File.read(css_path)
    js = File.read(js_path)
    html = File.read(html_path)
    instructions = File.read(instructions_path)
    artifact =
      AiArtifact.create!(
        css: css,
        js: js,
        html: html,
        user_id: Discourse.system_user.id,
        post_id: 1,
        name: "eval artifact",
      )

    post = Post.new(topic_id: 1, id: 1)
    diff =
      DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
        llm: llm.llm_model.to_llm,
        post: post,
        user: Discourse.system_user,
        artifact: artifact,
        artifact_version: nil,
        instructions: instructions,
      )
    diff.apply

    if diff.failed_searches.present?
      puts "Eval Errors encountered"
      p diff.failed_searches
      raise EvalError.new("Failed to apply all changes", diff.failed_searches)
    end

    version = artifact.versions.last
    raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)

    output = { css: version.css, js: version.js, html: version.html }

    artifact.destroy
    output
  end

  def valid_javascript?(str)
    require "open3"

    # Create a temporary file with the JavaScript code
    Tempfile.create(%w[test .js]) do |f|
      f.write(str)
      f.flush

      File.write("/tmp/test.js", str)

      begin
        Discourse::Utils.execute_command(
          "node",
          "--check",
          f.path,
          failure_message: "Invalid JavaScript syntax",
          timeout: 30, # reasonable timeout in seconds
        )
        true
      rescue Discourse::Utils::CommandError
        false
      end
    end
  rescue StandardError
    false
  end
end