# frozen_string_literal: true
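
# Converts image uploads to text: Tesseract OCR first, then an LLM pass that
# cleans up the output and splits it into <chunk>-delimited sections.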
class DiscourseAi::Utils::ImageToText
  BACKOFF_SECONDS = [5, 30, 60]
  MAX_IMAGE_SIZE = 10.megabytes
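
  # Streams the extracted text for a list of uploads through an IO-like
  # read(length) interface, processing one image at a time.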
  class Reader
    def initialize(uploads:, llm_model:, user:)
      @uploads = uploads
      @llm_model = llm_model
      @user = user
      @buffer = +""

      @to_process = uploads.dup
    end

    # return nil if no more data
    def read(length)
      # for implementation simplicity we will process one image at a time
      if !@buffer.empty?
        part = @buffer.slice!(0, length)
        return part
      end

      return nil if @to_process.empty?

      upload = @to_process.shift
      extractor =
        DiscourseAi::Utils::ImageToText.new(upload: upload, llm_model: @llm_model, user: @user)
      extractor.extract_text do |chunk, error|
        if error
          Discourse.warn_exception(
            error,
            message: "Discourse AI: Failed to extract text from image",
          )
        else
          # this introduces chunk markers so discourse rag ingestion requires no overlaps
          @buffer << "\n[[metadata ]]\n"
          @buffer << chunk
        end
      end

      read(length)
    end
  end

  def self.as_fake_file(uploads:, llm_model:, user:)
    # given that our text extraction implementation expects a file, return a
    # simple object that can simulate read(size) and stream the content
    Reader.new(uploads: uploads, llm_model: llm_model, user: user)
  end
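
  # Illustrative usage sketch (assumed caller, not part of this file):
  #
  #   file = DiscourseAi::Utils::ImageToText.as_fake_file(
  #     uploads: uploads, llm_model: llm_model, user: user
  #   )
  #   while (data = file.read(4096))
  #     ingest(data) # hypothetical consumer
  #   end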

  attr_reader :upload, :llm_model, :user

  def initialize(upload:, llm_model:, user:)
    @upload = upload
    @llm_model = llm_model
    @user = user
  end
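
  # Yields each extracted chunk to the block on success, or (nil, error) once
  # all retries are exhausted; returns the array of chunks ([] on failure).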
  def extract_text(retries: 3)
    raise "must specify a block" if !block_given?

    extracted = nil
    error = nil

    backoff = BACKOFF_SECONDS.dup
    seconds = nil

    retries.times do
      begin
        extracted = extract_text_from_page(upload)
        break
      rescue => e
        error = e
        # reuse the last backoff value if retries outnumber BACKOFF_SECONDS
        seconds = backoff.shift || seconds
        sleep(seconds)
      end
    end
    if extracted
      extracted.each { |chunk| yield(chunk) }
    else
      yield(nil, error)
    end
    extracted || []
  end

  private

  def system_message
    <<~MSG
      OCR the following page into Markdown. Tables should be formatted as GitHub flavored markdown.
      Do not surround your output with triple backticks.

      Chunk the document into sections of roughly 250 - 1000 words. Our goal is to identify parts of the page with the same semantic theme. These chunks will be embedded and used in a RAG pipeline.

      Always prefer returning text in Markdown vs HTML.
      Describe all the images and graphs you encounter.
      Only return text that will assist in the querying of data. Omit text such as "I had trouble recognizing images" and so on.

      Surround the chunks with <chunk> </chunk> html tags.
    MSG
  end
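
  # Two-pass extraction: try Tesseract first; when it yields text, ask the LLM
  # to clean it up with the original image attached, otherwise fall back to
  # asking the LLM to OCR the image directly.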
  def extract_text_from_page(page)
    raw_text = extract_text_with_tesseract(page)

    llm = llm_model.to_llm
    if raw_text.present?
      messages = [
        {
          type: :user,
          content:
            "The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original meaning:\n\n#{raw_text}",
          upload_ids: [page.id],
        },
      ]
    else
      messages = [
        { type: :user, content: "Please OCR the content in the image.", upload_ids: [page.id] },
      ]
    end
    prompt = DiscourseAi::Completions::Prompt.new(system_message, messages: messages)
    result = llm.generate(prompt, user: Discourse.system_user)
    extract_chunks(result)
  end
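
  # Runs the tesseract CLI against the image and returns the raw OCR text;
  # returns "" when the file is unavailable or the command fails.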
  def extract_text_with_tesseract(page)
    upload_path =
      if page.local?
        Discourse.store.path_for(page)
      else
        Discourse.store.download_safe(page, max_file_size_kb: MAX_IMAGE_SIZE)&.path
      end

    return "" if !upload_path || !File.exist?(upload_path)

    # reserve a unique output path, then unlink it so Tesseract can create the file
    tmp_output_file = Tempfile.new(%w[tesseract_output .txt])
    tmp_output = tmp_output_file.path
    tmp_output_file.unlink

    command = [
      "tesseract",
      upload_path,
      tmp_output.sub(/\.txt$/, ""), # Tesseract adds .txt automatically
    ]

    success =
      Discourse::Utils.execute_command(
        *command,
        timeout: 20.seconds,
        failure_message: "Failed to OCR image with Tesseract",
      )

    if success && File.exist?(tmp_output)
      text = File.read(tmp_output)
      begin
        File.delete(tmp_output)
      rescue StandardError
        nil
      end
      text.strip
    else
      Rails.logger.error("Tesseract OCR failed for #{upload_path}")
      ""
    end
  rescue => e
    Rails.logger.error("Error during OCR processing: #{e.message}")
    ""
  end
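
  # Splits LLM output on <chunk> markers. For example (illustrative):
  #
  #   extract_chunks("<chunk>alpha</chunk> stray <chunk>beta</chunk>")
  #   # => ["alpha", "stray", "beta"]
  #   extract_chunks("no markers here")
  #   # => ["no markers here"]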
  def extract_chunks(text)
    return [] if text.nil? || text.empty?

    if text.include?("<chunk>") && text.include?("</chunk>")
      chunks = []
      remaining_text = text.dup

      while remaining_text.length > 0
        if remaining_text.start_with?("<chunk>")
          # Extract chunk content ("<chunk>" is 7 characters, "</chunk>" is 8)
          chunk_end = remaining_text.index("</chunk>")
          if chunk_end
            chunk = remaining_text[7..chunk_end - 1].strip
            chunks << chunk unless chunk.empty?
            remaining_text = remaining_text[chunk_end + 8..-1] || ""
          else
            # Malformed chunk - add remaining text and break
            chunks << remaining_text[7..-1].strip
            break
          end
        else
          # Handle text before next chunk if it exists
          next_chunk = remaining_text.index("<chunk>")
          if next_chunk
            text_before = remaining_text[0...next_chunk].strip
            chunks << text_before unless text_before.empty?
            remaining_text = remaining_text[next_chunk..-1]
          else
            # No more chunks - add remaining text and break
            chunks << remaining_text.strip
            break
          end
        end
      end

      return chunks.reject(&:empty?)
    end

    [text]
  end
end