discourse-ai/evals/lib/runner.rb

#frozen_string_literal: true

class DiscourseAi::Evals::Runner
  # In-memory recorder for a tree of timed log entries. Entries are appended
  # to the innermost step that is currently open, and the whole tree can be
  # rendered as Trace Event JSON (a top-level `traceEvents` array made of
  # begin/end event pairs).
  class StructuredLogger
    def initialize
      @log = []
      # List that new entries are appended to: the root list, or the :log
      # array of the innermost step whose block is still running.
      @current_step = @log
    end

    # Appends a leaf entry to the currently open step (or to the root).
    #
    # @param name [Object] label stored under :name
    # @param args [Hash, nil] payload stored under :args; nil becomes {}
    # @param start_time [Time, nil] defaults to the current UTC time
    # @param end_time [Time, nil] defaults to the current UTC time
    # @return [Array] the list the entry was appended to
    def log(name, args: nil, start_time: nil, end_time: nil)
      start_time ||= Time.now.utc
      end_time ||= Time.now.utc
      args ||= {}
      object = { name: name, args: args, start_time: start_time, end_time: end_time }
      @current_step << object
    end

    # Opens a nested step, yields its hash to the block, and closes it again.
    # Everything logged while the block runs is recorded under the step. The
    # block may mutate the yielded hash (for example :args or :cname).
    #
    # @param name [Object] label stored under :name
    # @param args [Hash, nil] payload stored under :args; nil becomes {}
    # @yieldparam new_step [Hash] the step entry being recorded
    # @return [Time] the step's end time
    def step(name, args: nil)
      parent_entries = @current_step
      new_step = { type: :step, name: name, args: args || {}, log: [], start_time: Time.now.utc }

      parent_entries << new_step
      @current_step = new_step[:log]

      begin
        yield new_step
      ensure
        # Always unwind, even when the block raises; otherwise later entries
        # would be nested under this step and it would never get an end time.
        @current_step = parent_entries
        new_step[:end_time] = Time.now.utc
      end

      new_step[:end_time]
    end

    # Renders everything recorded so far.
    #
    # @return [String] pretty-printed JSON of the form { "traceEvents": [...] }
    def to_trace_event_json
      trace_events = []
      process_id = 1
      thread_id = 1

      to_trace_event(@log, process_id, thread_id, trace_events)

      JSON.pretty_generate({ traceEvents: trace_events })
    end

    private

    # Depth-first walk over +log_items+ that appends one begin ("B") and one
    # end ("E") event per entry to +trace_events+. A step's children are
    # emitted between the step's own begin and end events.
    def to_trace_event(log_items, pid, tid, trace_events, parent_start_time = nil)
      log_items.each do |item|
        if item.is_a?(Hash) && item[:type] == :step
          begin_event = {
            name: item[:name],
            cat: "default",
            ph: "B", # Begin event
            pid: pid,
            tid: tid,
            args: item[:args],
            ts: timestamp_in_microseconds(item[:start_time]),
          }
          # Forward the colour name a caller attached to the step, if any.
          begin_event[:cname] = item[:cname] if item[:cname]
          trace_events << begin_event

          to_trace_event(item[:log], pid, tid, trace_events, item[:start_time])

          trace_events << {
            name: item[:name],
            cat: "default",
            ph: "E", # End event
            pid: pid,
            tid: tid,
            # A step that is still open has no end time yet; use "now",
            # matching the fallback used for leaf entries below.
            ts: timestamp_in_microseconds(item[:end_time] || Time.now.utc),
          }
        else
          trace_events << {
            name: item[:name],
            cat: "default",
            ph: "B",
            pid: pid,
            tid: tid,
            args: item[:args],
            ts: timestamp_in_microseconds(item[:start_time] || parent_start_time || Time.now.utc),
            s: "p", # Scope: process
          }
          trace_events << {
            name: item[:name],
            cat: "default",
            ph: "E",
            pid: pid,
            tid: tid,
            ts: timestamp_in_microseconds(item[:end_time] || Time.now.utc),
            s: "p",
          }
        end
      end
    end

    # Converts a Time to integer microseconds since the Unix epoch.
    def timestamp_in_microseconds(time)
      (time.to_f * 1_000_000).to_i
    end
  end

  attr_reader :llms, :cases

  # Lists the YAML files matching `../cases/*/*.yml` relative to this file's
  # directory. The glob runs once; the result is memoized on the class.
  def self.evals_paths
    return @eval_paths if @eval_paths

    cases_root = File.join(__dir__, "../cases")
    @eval_paths = Dir.glob(File.join(cases_root, "*/*.yml"))
  end

  # Builds one DiscourseAi::Evals::Eval per entry of evals_paths, in the same
  # order. The list is created on first use and memoized on the class.
  def self.evals
    @evals ||=
      evals_paths.map do |yaml_path|
        DiscourseAi::Evals::Eval.new(path: yaml_path)
      end
  end

  # Calls #print on every known eval, in order, and returns the list of evals.
  def self.print
    evals.each { |evaluation| evaluation.print }
  end

  def initialize(eval_name:, llms:)
    @llms = llms
    @eval = self.class.evals.find { |c| c.id == eval_name }

    if !@eval
      puts "Error: Unknown evaluation '#{eval_name}'"
      exit 1
    end

    if @llms.empty?
      puts "Error: Unknown model 'model'"
      exit 1
    end
  end

  # Runs the selected eval against each LLM in +llms+, printing a pass/fail
  # line per LLM to stdout.
  #
  # Two files are written to ../log (relative to this file's directory): a
  # plain-text log and a Trace Event JSON file. While the run is in progress
  # the plain logger and the structured logger are exposed through
  # Thread.current[:llm_audit_log] and Thread.current[:llm_audit_structured_log].
  # LLMs without vision support are skipped when the eval requires vision.
  def run!
    puts "Running evaluation '#{@eval.id}'"

    # Format the timestamp once so both files always share the same suffix.
    timestamp = Time.now.strftime("%Y%m%d-%H%M%S")
    logs_dir = File.join(__dir__, "../log")
    FileUtils.mkdir_p(logs_dir)

    log_path = File.expand_path(File.join(logs_dir, "#{@eval.id}-#{timestamp}.log"))
    structured_log_path = File.expand_path(File.join(logs_dir, "#{@eval.id}-#{timestamp}.json"))

    logger = Logger.new(File.open(log_path, "a"))
    previous_audit_log = Thread.current[:llm_audit_log]
    previous_structured_log = Thread.current[:llm_audit_structured_log]

    begin
      logger.info("Starting evaluation '#{@eval.id}'")

      Thread.current[:llm_audit_log] = logger
      structured_logger = Thread.current[:llm_audit_structured_log] = StructuredLogger.new

      structured_logger.step("Evaluating #{@eval.id}", args: @eval.to_json) do
        llms.each do |llm|
          if @eval.vision && !llm.vision?
            logger.info("Skipping LLM: #{llm.name} as it does not support vision")
            next
          end

          structured_logger.step("Evaluating with LLM: #{llm.name}") do |step|
            logger.info("Evaluating with LLM: #{llm.name}")
            print "#{llm.name}: "
            result = @eval.run(llm: llm)

            step[:args] = result
            step[:cname] = result[:result] == :pass ? :good : :bad

            case result[:result]
            when :fail
              puts "Failed 🔴"
              puts "---- Expected ----\n#{result[:expected_output]}"
              puts "---- Actual ----\n#{result[:actual_output]}"
              logger.error("Evaluation failed with LLM: #{llm.name}")
            when :pass
              puts "Passed 🟢"
              logger.info("Evaluation passed with LLM: #{llm.name}")
            else
              # Report the unexpected result itself. A bare `eval` here would
              # call Kernel#eval without arguments and raise ArgumentError.
              STDERR.puts "Error: Unknown result #{result.inspect}"
              logger.error("Unknown result: #{result.inspect}")
            end
          end
        end
      end

      File.write(structured_log_path, structured_logger.to_trace_event_json)

      puts
      puts "Log file: #{log_path}"
      puts "Structured log file (ui.perfetto.dev): #{structured_log_path}"
    ensure
      # Detach the loggers before closing so nothing on this thread keeps
      # writing to a closed file, then release the file handle.
      Thread.current[:llm_audit_log] = previous_audit_log
      Thread.current[:llm_audit_structured_log] = previous_structured_log
      logger.close
    end
  end
end
FEATURE: PDF support for rag pipeline (#1118) This PR introduces several enhancements and refactorings to the AI Persona and RAG (Retrieval-Augmented Generation) functionalities within the discourse-ai plugin. Here's a breakdown of the changes: 1. LLM Model Association for RAG and Personas: - New Database Columns: Adds `rag_llm_model_id` to both `ai_personas` and `ai_tools` tables. This allows specifying a dedicated LLM for RAG indexing, separate from the persona's primary LLM. Adds `default_llm_id` and `question_consolidator_llm_id` to `ai_personas`. - Migration: Includes a migration (`20250210032345_migrate_persona_to_llm_model_id.rb`) to populate the new `default_llm_id` and `question_consolidator_llm_id` columns in `ai_personas` based on the existing `default_llm` and `question_consolidator_llm` string columns, and a post migration to remove the latter. - Model Changes: The `AiPersona` and `AiTool` models now `belong_to` an `LlmModel` via `rag_llm_model_id`. The `LlmModel.proxy` method now accepts an `LlmModel` instance instead of just an identifier. `AiPersona` now has `default_llm_id` and `question_consolidator_llm_id` attributes. - UI Updates: The AI Persona and AI Tool editors in the admin panel now allow selecting an LLM for RAG indexing (if PDF/image support is enabled). The RAG options component displays an LLM selector. - Serialization: The serializers (`AiCustomToolSerializer`, `AiCustomToolListSerializer`, `LocalizedAiPersonaSerializer`) have been updated to include the new `rag_llm_model_id`, `default_llm_id` and `question_consolidator_llm_id` attributes. 2. PDF and Image Support for RAG: - Site Setting: Introduces a new hidden site setting, `ai_rag_pdf_images_enabled`, to control whether PDF and image files can be indexed for RAG. This defaults to `false`. - File Upload Validation: The `RagDocumentFragmentsController` now checks the `ai_rag_pdf_images_enabled` setting and allows PDF, PNG, JPG, and JPEG files if enabled. 
Error handling is included for cases where PDF/image indexing is attempted with the setting disabled. - PDF Processing: Adds a new utility class, `DiscourseAi::Utils::PdfToImages`, which uses ImageMagick (`magick`) to convert PDF pages into individual PNG images. A maximum PDF size and conversion timeout are enforced. - Image Processing: A new utility class, `DiscourseAi::Utils::ImageToText`, is included to handle OCR for the images and PDFs. - RAG Digestion Job: The `DigestRagUpload` job now handles PDF and image uploads. It uses `PdfToImages` and `ImageToText` to extract text and create document fragments. - UI Updates: The RAG uploader component now accepts PDF and image file types if `ai_rag_pdf_images_enabled` is true. The UI text is adjusted to indicate supported file types. 3. Refactoring and Improvements: - LLM Enumeration: The `DiscourseAi::Configuration::LlmEnumerator` now provides a `values_for_serialization` method, which returns a simplified array of LLM data (id, name, vision_enabled) suitable for use in serializers. This avoids exposing unnecessary details to the frontend. - AI Helper: The `AiHelper::Assistant` now takes optional `helper_llm` and `image_caption_llm` parameters in its constructor, allowing for greater flexibility. - Bot and Persona Updates: Several updates were made across the codebase, replacing the string-based association to an LLM with the new model-based one. - Audit Logs: The `DiscourseAi::Completions::Endpoints::Base` now formats raw request payloads as pretty JSON for easier auditing. - Eval Script: An evaluation script is included. 4. Testing: - The PR introduces a new eval system for LLMs, which allows us to test how functionality works across various LLM providers. It lives in `/evals`. 2025-02-14 12:15:07 +11:00			`#frozen_string_literal: true`

			`class DiscourseAi::Evals::Runner`
			`class StructuredLogger`
			`def initialize`
			`@log = []`
			`@current_step = @log`
			`end`

			`def log(name, args: nil, start_time: nil, end_time: nil)`
			`start_time \|\|= Time.now.utc`
			`end_time \|\|= Time.now.utc`
			`args \|\|= {}`
			`object = { name: name, args: args, start_time: start_time, end_time: end_time }`
			`@current_step << object`
			`end`

			`def step(name, args: nil)`
			`start_time = Time.now.utc`
			`start_step = @current_step`

			`new_step = { type: :step, name: name, args: args \|\| {}, log: [], start_time: start_time }`

			`@current_step << new_step`
			`@current_step = new_step[:log]`
			`yield new_step`
			`@current_step = start_step`
			`new_step[:end_time] = Time.now.utc`
			`end`

			`def to_trace_event_json`
			`trace_events = []`
			`process_id = 1`
			`thread_id = 1`

			`to_trace_event(@log, process_id, thread_id, trace_events)`

			`JSON.pretty_generate({ traceEvents: trace_events })`
			`end`

			`private`

			`def to_trace_event(log_items, pid, tid, trace_events, parent_start_time = nil)`
			`log_items.each do \|item\|`
			`if item.is_a?(Hash) && item[:type] == :step`
			`trace_events << {`
			`name: item[:name],`
			`cat: "default",`
			`ph: "B", # Begin event`
			`pid: pid,`
			`tid: tid,`
			`args: item[:args],`
			`ts: timestamp_in_microseconds(item[:start_time]),`
			`}`

			`to_trace_event(item[:log], pid, tid, trace_events, item[:start_time])`

			`trace_events << {`
			`name: item[:name],`
			`cat: "default",`
			`ph: "E", # End event`
			`pid: pid,`
			`tid: tid,`
			`ts: timestamp_in_microseconds(item[:end_time]),`
			`}`
			`else`
			`trace_events << {`
			`name: item[:name],`
			`cat: "default",`
			`ph: "B",`
			`pid: pid,`
			`tid: tid,`
			`args: item[:args],`
			`ts: timestamp_in_microseconds(item[:start_time] \|\| parent_start_time \|\| Time.now.utc),`
			`s: "p", # Scope: process`
			`}`
			`trace_events << {`
			`name: item[:name],`
			`cat: "default",`
			`ph: "E",`
			`pid: pid,`
			`tid: tid,`
			`ts: timestamp_in_microseconds(item[:end_time] \|\| Time.now.utc),`
			`s: "p",`
			`}`
			`end`
			`end`
			`end`

			`def timestamp_in_microseconds(time)`
			`(time.to_f * 1_000_000).to_i`
			`end`
			`end`

			`attr_reader :llms, :cases`

			`def self.evals_paths`
			`@eval_paths \|\|= Dir.glob(File.join(File.join(__dir__, "../cases"), "*/*.yml"))`
			`end`

			`def self.evals`
			`@evals \|\|= evals_paths.map { \|path\| DiscourseAi::Evals::Eval.new(path: path) }`
			`end`

			`def self.print`
			`evals.each(&:print)`
			`end`

			`def initialize(eval_name:, llms:)`
			`@llms = llms`
			`@eval = self.class.evals.find { \|c\| c.id == eval_name }`

			`if !@eval`
			`puts "Error: Unknown evaluation '#{eval_name}'"`
			`exit 1`
			`end`

			`if @llms.empty?`
			`puts "Error: Unknown model 'model'"`
			`exit 1`
			`end`
			`end`

			`def run!`
			`puts "Running evaluation '#{@eval.id}'"`

			`structured_log_filename = "#{@eval.id}-#{Time.now.strftime("%Y%m%d-%H%M%S")}.json"`
			`log_filename = "#{@eval.id}-#{Time.now.strftime("%Y%m%d-%H%M%S")}.log"`
			`logs_dir = File.join(__dir__, "../log")`
			`FileUtils.mkdir_p(logs_dir)`

			`log_path = File.expand_path(File.join(logs_dir, log_filename))`
			`structured_log_path = File.expand_path(File.join(logs_dir, structured_log_filename))`

			`logger = Logger.new(File.open(log_path, "a"))`
			`logger.info("Starting evaluation '#{@eval.id}'")`

			`Thread.current[:llm_audit_log] = logger`
			`structured_logger = Thread.current[:llm_audit_structured_log] = StructuredLogger.new`

			`structured_logger.step("Evaluating #{@eval.id}", args: @eval.to_json) do`
			`llms.each do \|llm\|`
			`if @eval.vision && !llm.vision?`
			`logger.info("Skipping LLM: #{llm.name} as it does not support vision")`
			`next`
			`end`

			`structured_logger.step("Evaluating with LLM: #{llm.name}") do \|step\|`
			`logger.info("Evaluating with LLM: #{llm.name}")`
			`print "#{llm.name}: "`
			`result = @eval.run(llm: llm)`

			`step[:args] = result`
			`step[:cname] = result[:result] == :pass ? :good : :bad`

			`if result[:result] == :fail`
			`puts "Failed 🔴"`
			`puts "---- Expected ----\n#{result[:expected_output]}"`
			`puts "---- Actual ----\n#{result[:actual_output]}"`
			`logger.error("Evaluation failed with LLM: #{llm.name}")`
			`elsif result[:result] == :pass`
			`puts "Passed 🟢"`
			`logger.info("Evaluation passed with LLM: #{llm.name}")`
			`else`
			`STDERR.puts "Error: Unknown result #{eval.inspect}"`
			`logger.error("Unknown result: #{eval.inspect}")`
			`end`
			`end`
			`end`
			`end`

			`#structured_logger.save(structured_log_path)`

			`File.write("#{structured_log_path}", structured_logger.to_trace_event_json)`

			`puts`
			`puts "Log file: #{log_path}"`
			`puts "Structured log file (ui.perfetto.dev): #{structured_log_path}"`

			`# temp code`
			`# puts File.read(structured_log_path)`
			`end`
			`end`