FEATURE: Add metadata support for RAG (#553)

* FEATURE: Add metadata support for RAG

You may include non-indexed metadata in a RAG document by using

[[metadata ....]]

This information is attached to all the text below and provided to
the retriever.

This allows RAG to operate across a rich variety of contexts
without getting lost.

Also:

- re-implemented chunking algorithm so it streams
- moved indexing to background low priority queue

* Baran gem no longer required.

* tokenizers is on 0.4.4 ... upgrade it ...
This commit is contained in:
Sam 2024-04-05 01:02:16 +11:00 committed by GitHub
parent bc561eb332
commit 830cc26075
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 256 additions and 34 deletions

View File

@ -2,6 +2,10 @@
module ::Jobs
class DigestRagUpload < ::Jobs::Base
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 64
MAX_FRAGMENTS = 10_000
# TODO(roman): Add a way to automatically recover from errors, resulting in unindexed uploads.
def execute(args)
return if (upload = Upload.find_by(id: args[:upload_id])).nil?
@ -14,33 +18,25 @@ module ::Jobs
document = get_uploaded_file(upload)
return if document.nil?
chunk_size = 1024
chunk_overlap = 64
chunks = []
overlap = ""
splitter =
Baran::RecursiveCharacterTextSplitter.new(
chunk_size: chunk_size,
chunk_overlap: chunk_overlap,
separators: ["\n\n", "\n", " ", ""],
)
while raw_text = document.read(2048)
splitter.chunks(overlap + raw_text).each { |chunk| chunks << chunk[:text] }
overlap = chunks.last[-chunk_overlap..-1] || chunks.last
end
fragment_ids = []
idx = 0
ActiveRecord::Base.transaction do
fragment_ids =
chunks.each_with_index.map do |fragment_text, idx|
RagDocumentFragment.create!(
chunk_document(document) do |chunk, metadata|
fragment_ids << RagDocumentFragment.create!(
ai_persona: ai_persona,
fragment: Encodings.to_utf8(fragment_text),
fragment: chunk,
fragment_number: idx + 1,
upload: upload,
metadata: metadata,
).id
idx += 1
if idx > MAX_FRAGMENTS
Rails.logger.warn("Upload #{upload.id} has too many fragments, truncating.")
break
end
end
end
end
@ -52,6 +48,78 @@ module ::Jobs
private
def chunk_document(file)
buffer = +""
current_metadata = nil
done = false
overlap = ""
while buffer.present? || !done
if buffer.length < CHUNK_SIZE * 2
read = file.read(CHUNK_SIZE * 2)
done = true if read.nil?
read = Encodings.to_utf8(read) if read
buffer << (read || "")
end
# at this point we unconditionally have 2x CHUNK_SIZE worth of data in the buffer
metadata_regex = /\[\[metadata (.*?)\]\]/m
before_metadata, new_metadata, after_metadata = buffer.split(metadata_regex)
to_chunk = nil
if before_metadata.present?
to_chunk = before_metadata
elsif after_metadata.present?
current_metadata = new_metadata
to_chunk = after_metadata
buffer = buffer.split(metadata_regex, 2).last
overlap = ""
end
chunk, split_char = first_chunk(to_chunk)
buffer = buffer[chunk.length..-1]
processed_chunk = overlap + chunk
processed_chunk.strip!
processed_chunk.gsub!(/\n[\n]+/, "\n\n")
yield processed_chunk, current_metadata
overlap = (chunk[-CHUNK_OVERLAP..-1] || chunk) + split_char
# remove first word it is probably truncated
overlap = overlap.split(" ", 2).last
end
end
def first_chunk(text, chunk_size: CHUNK_SIZE, splitters: ["\n\n", "\n", ".", ""])
return text, " " if text.length <= chunk_size
splitters = splitters.find_all { |s| text.include?(s) }.compact
buffer = +""
split_char = nil
splitters.each do |splitter|
split_char = splitter
text
.split(split_char)
.each do |part|
break if (buffer.length + split_char.length + part.length) > chunk_size
buffer << split_char
buffer << part
end
break if buffer.length > 0
end
[buffer, split_char]
end
def get_uploaded_file(upload)
store = Discourse.store
@file ||=

View File

@ -2,6 +2,8 @@
module ::Jobs
class GenerateRagEmbeddings < ::Jobs::Base
sidekiq_options queue: "low"
def execute(args)
return if (fragments = RagDocumentFragment.where(id: args[:fragment_ids].to_a)).empty?

View File

@ -38,9 +38,10 @@ end
#
# id :bigint not null, primary key
# fragment :text not null
# ai_persona_id :integer not null
# upload_id :integer not null
# ai_persona_id :integer not null
# fragment_number :integer not null
# created_at :datetime not null
# updated_at :datetime not null
# metadata :text
#

View File

@ -0,0 +1,8 @@
# frozen_string_literal: true
class AddMetadataToRagDocumentFrament < ActiveRecord::Migration[7.0]
def change
# limit is purely for safety
add_column :rag_document_fragments, :metadata, :text, null: true, limit: 100_000
end
end

View File

@ -227,12 +227,14 @@ module DiscourseAi
offset: 0,
)
guidance =
fragments =
RagDocumentFragment.where(upload_id: upload_refs, id: candidate_fragment_ids).pluck(
:fragment,
:metadata,
)
if reranker.reranker_configured?
guidance = fragments.map { |fragment, _metadata| fragment }
ranks =
DiscourseAi::Inference::HuggingFaceTextEmbeddings
.rerank(conversation_context.last[:content], guidance)
@ -241,21 +243,30 @@ module DiscourseAi
.map { _1[:index] }
if ranks.empty?
guidance = guidance.take(10)
fragments = fragments.take(10)
else
guidance = ranks.map { |idx| guidance[idx] }
fragments = ranks.map { |idx| fragments[idx] }
end
end
<<~TEXT
<guidance>
The following texts will give you additional guidance to elaborate a response.
The following texts will give you additional guidance for your response.
We included them because we believe they are relevant to this conversation topic.
Take them into account to elaborate a response.
Texts:
#{guidance.join("\n")}
#{
fragments
.map do |fragment, metadata|
if metadata.present?
["# #{metadata}", fragment].join("\n")
else
fragment
end
end
.join("\n")
}
</guidance>
TEXT
end

View File

@ -8,9 +8,8 @@
# url: https://meta.discourse.org/t/discourse-ai/259214
# required_version: 2.7.0
gem "tokenizers", "0.4.3"
gem "tokenizers", "0.4.4"
gem "tiktoken_ruby", "0.0.7"
gem "baran", "0.1.10"
enabled_site_setting :discourse_ai_enabled

38
spec/fixtures/rag/doc_with_metadata.txt vendored Normal file
View File

@ -0,0 +1,38 @@
No metadata yet, first chunk こんにちは
[[metadata Sam's story]]
Once upon a time, in a land far, far away (or maybe just down the street, who knows?), there lived a brilliant AI developer named Sam. Sam had a vision, a dream, nay, a burning desire to create the most impressive discourse AI the world had ever seen. Armed with a keyboard, an endless supply of coffee, and a mildly concerning lack of sleep, Sam embarked on this epic quest.
Day and night, Sam toiled away, crafting lines of code that would make even the most seasoned programmers weep with joy. The AI slowly took shape, like a majestic, digital phoenix rising from the ashes of Sams social life. It was a thing of beauty, a marvel of modern technology, and it had the uncanny ability to generate conversations about anything from the meaning of life to the best way to make a grilled cheese sandwich.
As the project neared completion, Sam realized that there was one crucial element missing: a spec doc. And not just any spec doc, but a spec doc filled with glorious, meaningless dummy text. Because lets face it, nothing screams “professional” quite like a wall of lorem ipsum.
So, Sam set out to create the most impressive dummy text the world had ever seen. It would be a masterpiece, a symphony of nonsensical words that would leave readers in awe of Sams ability to fill space with utter gibberish. And thus, the dummy text was born.
[[METADATE]]
It was a sight to behold, a tapestry of random words woven together in a way that almost made sense, but not quite. It spoke of ancient mysteries, like why hotdogs come in packs of ten, while hotdog buns come in packs of eight. It pondered the great questions of our time, like whether or not pineapple belongs on pizza (spoiler alert: it does). And it even dared to explore the darkest corners of Sams imagination, like the idea of a world without caffeine.
In the end, Sams discourse AI was a resounding success. It could carry on conversations with humans for hours on end, discussing everything from the latest trends in fashion to the intricacies of quantum physics. And whenever anyone asked about the impressive spec doc, Sam would just smile and nod, knowing full well that the real magic lay in the glorious dummy text that started it all.
And so, dear reader, if you ever find yourself in need of some impressive dummy text for your own project, just remember the tale of Sam and their magnificent discourse AI. Because sometimes, all it takes is a little nonsense to make the world a whole lot more interesting.
[[metadata Jane's story]]
Ah, Jane. The name alone conjures up images of brilliance, wit, and a certain je ne sais quoi that can only be described as “Janeesque.” And so, it comes as no surprise that our dear Jane found herself embarking on a journey of epic proportions: the creation of a discourse AI that would put all other discourse AIs to shame.
With a twinkle in her eye and a spring in her step, Jane set forth on this noble quest. She gathered her trusty companions: a laptop, a never-ending supply of tea, and a collection of obscure reference books that would make even the most studious librarian green with envy. Armed with these tools, Jane began her work.
As she typed away at her keyboard, Jane couldnt help but feel a sense of excitement bubbling up inside her. This was no ordinary project; this was a chance to create something truly extraordinary. She poured her heart and soul into every line of code, crafting algorithms that would make even the most advanced AI systems [[look]] like mere calculators.
But Jane knew that a discourse AI was only as good as its training data. And so, she scoured the internet, collecting the most fascinating, hilarious, and downright bizarre conversations she could find. From heated debates about the proper way to make a cup of tea to in-depth discussions on the mating habits of the rare Peruvian flying squirrel, Jane left no stone unturned.
As the weeks turned into months, Janes creation began to take shape. It was a thing of beauty, a masterpiece of artificial intelligence that could engage in witty banter, offer sage advice, and even tell the occasional joke (though its sense of humor was admittedly a bit on the quirky side). Jane beamed with pride as she watched her AI converse with humans, marveling at its ability to understand and respond to even the most complex of queries.
But there was one final hurdle to overcome: the dreaded spec doc. Jane knew that no self-respecting AI could be unleashed upon the world without a proper set of specifications. And so, she set about crafting the most magnificent dummy text the world had ever seen.
It was a masterpiece of nonsense, a symphony of absurdity that would leave even the most seasoned tech writer scratching their head in confusion. From descriptions of the AIs ability to recite Shakespearean sonnets in binary code to detailed explanations of its built-in “tea break” feature, Janes dummy text was a work of art.
And so, with a flourish of her keyboard and a triumphant grin, Jane unleashed her creation upon the world. The response was immediate and overwhelming. People from all walks of life flocked to converse with Janes AI, marveling at its intelligence, its charm, and its uncanny ability to make even the most mundane of topics seem fascinating.
In the end, Janes discourse AI became the stuff of legend, a shining example of what can be achieved when brilliance, determination, and a healthy dose of eccentricity come together. And as for Jane herself? Well, lets just say that shes already hard at work on her next project: a robot that can make the perfect cup of tea. But that, dear reader, is a story for another day.

View File

@ -0,0 +1,61 @@
metadata:
number: 1
No metadata yet, first chunk こんにちは
metadata: Sam's story
number: 2
Once upon a time, in a land far, far away (or maybe just down the street, who knows?), there lived a brilliant AI developer named Sam. Sam had a vision, a dream, nay, a burning desire to create the most impressive discourse AI the world had ever seen. Armed with a keyboard, an endless supply of coffee, and a mildly concerning lack of sleep, Sam embarked on this epic quest.
Day and night, Sam toiled away, crafting lines of code that would make even the most seasoned programmers weep with joy. The AI slowly took shape, like a majestic, digital phoenix rising from the ashes of Sams social life. It was a thing of beauty, a marvel of modern technology, and it had the uncanny ability to generate conversations about anything from the meaning of life to the best way to make a grilled cheese sandwich.
metadata: Sam's story
number: 3
of life to the best way to make a grilled cheese sandwich.
As the project neared completion, Sam realized that there was one crucial element missing: a spec doc. And not just any spec doc, but a spec doc filled with glorious, meaningless dummy text. Because lets face it, nothing screams “professional” quite like a wall of lorem ipsum.
So, Sam set out to create the most impressive dummy text the world had ever seen. It would be a masterpiece, a symphony of nonsensical words that would leave readers in awe of Sams ability to fill space with utter gibberish. And thus, the dummy text was born.
[[METADATE]]
It was a sight to behold, a tapestry of random words woven together in a way that almost made sense, but not quite. It spoke of ancient mysteries, like why hotdogs come in packs of ten, while hotdog buns come in packs of eight. It pondered the great questions of our time, like whether or not pineapple belongs on pizza (spoiler alert: it does). And it even dared to explore the darkest corners of Sams imagination, like the idea of a world without caffeine.
metadata: Sam's story
number: 4
Sams imagination, like the idea of a world without caffeine.
In the end, Sams discourse AI was a resounding success. It could carry on conversations with humans for hours on end, discussing everything from the latest trends in fashion to the intricacies of quantum physics. And whenever anyone asked about the impressive spec doc, Sam would just smile and nod, knowing full well that the real magic lay in the glorious dummy text that started it all.
And so, dear reader, if you ever find yourself in need of some impressive dummy text for your own project, just remember the tale of Sam and their magnificent discourse AI. Because sometimes, all it takes is a little nonsense to make the world a whole lot more interesting.
metadata: Jane's story
number: 5
Ah, Jane. The name alone conjures up images of brilliance, wit, and a certain je ne sais quoi that can only be described as “Janeesque.” And so, it comes as no surprise that our dear Jane found herself embarking on a journey of epic proportions: the creation of a discourse AI that would put all other discourse AIs to shame.
With a twinkle in her eye and a spring in her step, Jane set forth on this noble quest. She gathered her trusty companions: a laptop, a never-ending supply of tea, and a collection of obscure reference books that would make even the most studious librarian green with envy. Armed with these tools, Jane began her work.
As she typed away at her keyboard, Jane couldnt help but feel a sense of excitement bubbling up inside her. This was no ordinary project; this was a chance to create something truly extraordinary. She poured her heart and soul into every line of code, crafting algorithms that would make even the most advanced AI systems [[look]] like mere calculators.
metadata: Jane's story
number: 6
the most advanced AI systems [[look]] like mere calculators.
But Jane knew that a discourse AI was only as good as its training data. And so, she scoured the internet, collecting the most fascinating, hilarious, and downright bizarre conversations she could find. From heated debates about the proper way to make a cup of tea to in-depth discussions on the mating habits of the rare Peruvian flying squirrel, Jane left no stone unturned.
As the weeks turned into months, Janes creation began to take shape. It was a thing of beauty, a masterpiece of artificial intelligence that could engage in witty banter, offer sage advice, and even tell the occasional joke (though its sense of humor was admittedly a bit on the quirky side). Jane beamed with pride as she watched her AI converse with humans, marveling at its ability to understand and respond to even the most complex of queries.
metadata: Jane's story
number: 7
to understand and respond to even the most complex of queries.
But there was one final hurdle to overcome: the dreaded spec doc. Jane knew that no self-respecting AI could be unleashed upon the world without a proper set of specifications. And so, she set about crafting the most magnificent dummy text the world had ever seen.
It was a masterpiece of nonsense, a symphony of absurdity that would leave even the most seasoned tech writer scratching their head in confusion. From descriptions of the AIs ability to recite Shakespearean sonnets in binary code to detailed explanations of its built-in “tea break” feature, Janes dummy text was a work of art.
And so, with a flourish of her keyboard and a triumphant grin, Jane unleashed her creation upon the world. The response was immediate and overwhelming. People from all walks of life flocked to converse with Janes AI, marveling at its intelligence, its charm, and its uncanny ability to make even the most mundane of topics seem fascinating.
metadata: Jane's story
number: 8
to make even the most mundane of topics seem fascinating.
In the end, Janes discourse AI became the stuff of legend, a shining example of what can be achieved when brilliance, determination, and a healthy dose of eccentricity come together. And as for Jane herself? Well, lets just say that shes already hard at work on her next project: a robot that can make the perfect cup of tea. But that, dear reader, is a story for another day.

View File

@ -13,9 +13,20 @@ RSpec.describe Jobs::DigestRagUpload do
let(:expected_embedding) { [0.0038493] * vector_rep.dimensions }
let(:document_with_metadata) { plugin_file_from_fixtures("doc_with_metadata.txt", "rag") }
let(:parsed_document_with_metadata) do
plugin_file_from_fixtures("parsed_doc_with_metadata.txt", "rag")
end
let(:upload_with_metadata) do
UploadCreator.new(document_with_metadata, "document.txt").create_for(Discourse.system_user.id)
end
before do
SiteSetting.ai_embeddings_enabled = true
SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com"
SiteSetting.authorized_extensions = "txt"
WebMock.stub_request(
:post,
@ -24,6 +35,29 @@ RSpec.describe Jobs::DigestRagUpload do
end
describe "#execute" do
context "when processing an upload containing metadata" do
it "correctly splits on metadata boundary" do
described_class.new.execute(upload_id: upload_with_metadata.id, ai_persona_id: persona.id)
parsed = +""
first = true
RagDocumentFragment
.where(upload: upload_with_metadata)
.order(:fragment_number)
.each do |fragment|
parsed << "\n\n" if !first
parsed << "metadata: #{fragment.metadata}\n"
parsed << "number: #{fragment.fragment_number}\n"
parsed << fragment.fragment
first = false
end
# to rebuild parsed
# File.write("/tmp/testing", parsed)
expect(parsed).to eq(parsed_document_with_metadata.read)
end
end
context "when processing an upload for the first time" do
before { File.expects(:open).returns(document_file) }