discourse-ai/app/jobs/regular/digest_rag_upload.rb

# frozen_string_literal: true

module ::Jobs
  class DigestRagUpload < ::Jobs::Base
    CHUNK_SIZE = 1024
    CHUNK_OVERLAP = 64
    MAX_FRAGMENTS = 10_000

    # TODO(roman): Add a way to automatically recover from errors, resulting in unindexed uploads.
    def execute(args)
      return if (upload = Upload.find_by(id: args[:upload_id])).nil?
      return if (ai_persona = AiPersona.find_by(id: args[:ai_persona_id])).nil?

      fragment_ids = RagDocumentFragment.where(ai_persona: ai_persona, upload: upload).pluck(:id)

      # Check if this is the first time we process this upload.
      if fragment_ids.empty?
        document = get_uploaded_file(upload)
        return if document.nil?

        fragment_ids = []
        idx = 0

        ActiveRecord::Base.transaction do
          chunk_document(document) do |chunk, metadata|
            fragment_ids << RagDocumentFragment.create!(
              ai_persona: ai_persona,
              fragment: chunk,
              fragment_number: idx + 1,
              upload: upload,
              metadata: metadata,
            ).id

            idx += 1

            if idx > MAX_FRAGMENTS
              Rails.logger.warn("Upload #{upload.id} has too many fragments, truncating.")
              break
            end
          end
        end
      end

      RagDocumentFragment.publish_status(
        upload,
        { total: fragment_ids.size, indexed: 0, left: fragment_ids.size },
      )

      fragment_ids.each_slice(50) do |slice|
        Jobs.enqueue(:generate_rag_embeddings, fragment_ids: slice)
      end
    end

    private

    def chunk_document(file)
      buffer = +""
      current_metadata = nil
      done = false
      overlap = ""

      while buffer.present? || !done
        if buffer.length < CHUNK_SIZE * 2
          read = file.read(CHUNK_SIZE * 2)
          done = true if read.nil?

          read = Encodings.to_utf8(read) if read

          buffer << (read || "")
        end

        # at this point we unconditionally have 2x CHUNK_SIZE worth of data in the buffer
        metadata_regex = /\[\[metadata (.*?)\]\]/m

        before_metadata, new_metadata, after_metadata = buffer.split(metadata_regex)
        to_chunk = nil

        if before_metadata.present?
          to_chunk = before_metadata
        elsif after_metadata.present?
          current_metadata = new_metadata
          to_chunk = after_metadata
          buffer = buffer.split(metadata_regex, 2).last
          overlap = ""
        end

        chunk, split_char = first_chunk(to_chunk)
        buffer = buffer[chunk.length..-1]

        processed_chunk = overlap + chunk

        processed_chunk.strip!
        processed_chunk.gsub!(/\n[\n]+/, "\n\n")

        yield processed_chunk, current_metadata

        overlap = (chunk[-CHUNK_OVERLAP..-1] || chunk) + split_char

        # remove first word it is probably truncated
        overlap = overlap.split(" ", 2).last
      end
    end

    def first_chunk(text, chunk_size: CHUNK_SIZE, splitters: ["\n\n", "\n", ".", ""])
      return text, " " if text.length <= chunk_size

      splitters = splitters.find_all { |s| text.include?(s) }.compact

      buffer = +""
      split_char = nil

      splitters.each do |splitter|
        split_char = splitter

        text
          .split(split_char)
          .each do |part|
            break if (buffer.length + split_char.length + part.length) > chunk_size
            buffer << split_char
            buffer << part
          end
        break if buffer.length > 0
      end

      [buffer, split_char]
    end

    def get_uploaded_file(upload)
      store = Discourse.store
      @file ||=
        if store.external?
          # Upload#filesize could be approximate.
          # add two extra Mbs to make sure that we'll be able to download the upload.
          max_filesize = upload.filesize + 2.megabytes
          store.download(upload, max_file_size_kb: max_filesize)
        else
          File.open(store.path_for(upload))
        end
    end
  end
end
FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00			`# frozen_string_literal: true`

			`module ::Jobs`
			`class DigestRagUpload < ::Jobs::Base`
FEATURE: Add metadata support for RAG (#553) * FEATURE: Add metadata support for RAG You may include non indexed metadata in the RAG document by using [[metadata ....]] This information is attached to all the text below and provided to the retriever. This allows for RAG to operate within a rich amount of contexts without getting lost Also: - re-implemented chunking algorithm so it streams - moved indexing to background low priority queue * Baran gem no longer required. * tokenizers is on 4.4 ... upgrade it ... 2024-04-04 10:02:16 -04:00			`CHUNK_SIZE = 1024`
			`CHUNK_OVERLAP = 64`
			`MAX_FRAGMENTS = 10_000`

FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00			`# TODO(roman): Add a way to automatically recover from errors, resulting in unindexed uploads.`
			`def execute(args)`
			`return if (upload = Upload.find_by(id: args[:upload_id])).nil?`
			`return if (ai_persona = AiPersona.find_by(id: args[:ai_persona_id])).nil?`

			`fragment_ids = RagDocumentFragment.where(ai_persona: ai_persona, upload: upload).pluck(:id)`

			`# Check if this is the first time we process this upload.`
			`if fragment_ids.empty?`
			`document = get_uploaded_file(upload)`
			`return if document.nil?`

FEATURE: Add metadata support for RAG (#553) * FEATURE: Add metadata support for RAG You may include non indexed metadata in the RAG document by using [[metadata ....]] This information is attached to all the text below and provided to the retriever. This allows for RAG to operate within a rich amount of contexts without getting lost Also: - re-implemented chunking algorithm so it streams - moved indexing to background low priority queue * Baran gem no longer required. * tokenizers is on 4.4 ... upgrade it ... 2024-04-04 10:02:16 -04:00			`fragment_ids = []`
			`idx = 0`
FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00
FEATURE: Add metadata support for RAG (#553) * FEATURE: Add metadata support for RAG You may include non indexed metadata in the RAG document by using [[metadata ....]] This information is attached to all the text below and provided to the retriever. This allows for RAG to operate within a rich amount of contexts without getting lost Also: - re-implemented chunking algorithm so it streams - moved indexing to background low priority queue * Baran gem no longer required. * tokenizers is on 4.4 ... upgrade it ... 2024-04-04 10:02:16 -04:00			`ActiveRecord::Base.transaction do`
			`chunk_document(document) do \|chunk, metadata\|`
			`fragment_ids << RagDocumentFragment.create!(`
			`ai_persona: ai_persona,`
			`fragment: chunk,`
			`fragment_number: idx + 1,`
			`upload: upload,`
			`metadata: metadata,`
			`).id`
FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00
FEATURE: Add metadata support for RAG (#553) * FEATURE: Add metadata support for RAG You may include non indexed metadata in the RAG document by using [[metadata ....]] This information is attached to all the text below and provided to the retriever. This allows for RAG to operate within a rich amount of contexts without getting lost Also: - re-implemented chunking algorithm so it streams - moved indexing to background low priority queue * Baran gem no longer required. * tokenizers is on 4.4 ... upgrade it ... 2024-04-04 10:02:16 -04:00			`idx += 1`
FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00
FEATURE: Add metadata support for RAG (#553) * FEATURE: Add metadata support for RAG You may include non indexed metadata in the RAG document by using [[metadata ....]] This information is attached to all the text below and provided to the retriever. This allows for RAG to operate within a rich amount of contexts without getting lost Also: - re-implemented chunking algorithm so it streams - moved indexing to background low priority queue * Baran gem no longer required. * tokenizers is on 4.4 ... upgrade it ... 2024-04-04 10:02:16 -04:00			`if idx > MAX_FRAGMENTS`
			`Rails.logger.warn("Upload #{upload.id} has too many fragments, truncating.")`
			`break`
FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00			`end`
FEATURE: Add metadata support for RAG (#553) * FEATURE: Add metadata support for RAG You may include non indexed metadata in the RAG document by using [[metadata ....]] This information is attached to all the text below and provided to the retriever. This allows for RAG to operate within a rich amount of contexts without getting lost Also: - re-implemented chunking algorithm so it streams - moved indexing to background low priority queue * Baran gem no longer required. * tokenizers is on 4.4 ... upgrade it ... 2024-04-04 10:02:16 -04:00			`end`
FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00			`end`
			`end`

UX: Display the indexing progress for RAG uploads (#557) 2024-04-09 10:03:07 -04:00			`RagDocumentFragment.publish_status(`
			`upload,`
			`{ total: fragment_ids.size, indexed: 0, left: fragment_ids.size },`
			`)`

FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00			`fragment_ids.each_slice(50) do \|slice\|`
			`Jobs.enqueue(:generate_rag_embeddings, fragment_ids: slice)`
			`end`
			`end`

			`private`

FEATURE: Add metadata support for RAG (#553) * FEATURE: Add metadata support for RAG You may include non indexed metadata in the RAG document by using [[metadata ....]] This information is attached to all the text below and provided to the retriever. This allows for RAG to operate within a rich amount of contexts without getting lost Also: - re-implemented chunking algorithm so it streams - moved indexing to background low priority queue * Baran gem no longer required. * tokenizers is on 4.4 ... upgrade it ... 2024-04-04 10:02:16 -04:00			`def chunk_document(file)`
			`buffer = +""`
			`current_metadata = nil`
			`done = false`
			`overlap = ""`

			`while buffer.present? \|\| !done`
			`if buffer.length < CHUNK_SIZE * 2`
			`read = file.read(CHUNK_SIZE * 2)`
			`done = true if read.nil?`

			`read = Encodings.to_utf8(read) if read`

			`buffer << (read \|\| "")`
			`end`

			`# at this point we unconditionally have 2x CHUNK_SIZE worth of data in the buffer`
			`metadata_regex = /\[\[metadata (.*?)\]\]/m`

			`before_metadata, new_metadata, after_metadata = buffer.split(metadata_regex)`
			`to_chunk = nil`

			`if before_metadata.present?`
			`to_chunk = before_metadata`
			`elsif after_metadata.present?`
			`current_metadata = new_metadata`
			`to_chunk = after_metadata`
			`buffer = buffer.split(metadata_regex, 2).last`
			`overlap = ""`
			`end`

			`chunk, split_char = first_chunk(to_chunk)`
			`buffer = buffer[chunk.length..-1]`

			`processed_chunk = overlap + chunk`

			`processed_chunk.strip!`
			`processed_chunk.gsub!(/\n[\n]+/, "\n\n")`

			`yield processed_chunk, current_metadata`

			`overlap = (chunk[-CHUNK_OVERLAP..-1] \|\| chunk) + split_char`

			`# remove first word it is probably truncated`
			`overlap = overlap.split(" ", 2).last`
			`end`
			`end`

			`def first_chunk(text, chunk_size: CHUNK_SIZE, splitters: ["\n\n", "\n", ".", ""])`
			`return text, " " if text.length <= chunk_size`

			`splitters = splitters.find_all { \|s\| text.include?(s) }.compact`

			`buffer = +""`
			`split_char = nil`

			`splitters.each do \|splitter\|`
			`split_char = splitter`

			`text`
			`.split(split_char)`
			`.each do \|part\|`
			`break if (buffer.length + split_char.length + part.length) > chunk_size`
			`buffer << split_char`
			`buffer << part`
			`end`
			`break if buffer.length > 0`
			`end`

			`[buffer, split_char]`
			`end`

FEATURE: AI Bot RAG support. (#537) This PR lets you associate uploads to an AI persona, which we'll split and generate embeddings from. When building the system prompt to get a bot reply, we'll do a similarity search followed by a re-ranking (if available). This will let us find the most relevant fragments from the body of knowledge you associated with the persona, resulting in better, more informed responses. For now, we'll only allow plain-text files, but this will change in the future. Commits: * FEATURE: RAG embeddings for the AI Bot This first commit introduces a UI where admins can upload text files, which we'll store, split into fragments, and generate embeddings of. In a next commit, we'll use those to give the bot additional information during conversations. * Basic asymmetric similarity search to provide guidance in system prompt * Fix tests and lint * Apply reranker to fragments * Uploads filter, css adjustments and file validations * Add placeholder for rag fragments * Update annotations 2024-04-01 12:43:34 -04:00			`def get_uploaded_file(upload)`
			`store = Discourse.store`
			`@file \|\|=`
			`if store.external?`
			`# Upload#filesize could be approximate.`
			`# add two extra Mbs to make sure that we'll be able to download the upload.`
			`max_filesize = upload.filesize + 2.megabytes`
			`store.download(upload, max_file_size_kb: max_filesize)`
			`else`
			`File.open(store.path_for(upload))`
			`end`
			`end`
			`end`
			`end`