REFACTOR: Cohesive narrative for single-chunk summaries. (#103)
Single and multi-chunk summaries end up using different prompts for the last summary. This change detects when the summarized content fits in a single chunk and uses a slightly different prompt, which leads to more consistent summary formats. This PR also moves the chunk-splitting step to the `FoldContent` strategy as preparation for implementing streamed summaries.
commit 5f0c617880 (parent 48d880d3c8)
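
In outline, chunk-splitting and single-chunk detection now live in the `FoldContent` strategy, which dispatches to the new `summarize_single` API when everything fits in one chunk. A condensed sketch of the resulting flow, taken from the `FoldContent` diff below (module nesting and spacing trimmed; not the verbatim source):

    # Condensed from the FoldContent diff below.
    def summarize(content)
      opts = content.except(:contents)

      # Splitting now happens here, in the strategy, instead of inside each model.
      chunks = split_into_chunks(content[:contents])

      if chunks.length == 1
        # One chunk: models use the "cohesive narrative" prompt via summarize_single.
        { summary: completion_model.summarize_single(chunks.first[:summary], opts), chunks: [] }
      else
        # Several chunks: summarize each, then fold the partial summaries together.
        summaries = completion_model.summarize_in_chunks(chunks, opts)
        { summary: completion_model.concatenate_summaries(summaries), chunks: summaries }
      end
    end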
@@ -36,13 +36,17 @@ module DiscourseAi
           instructions = build_base_prompt(opts)

           text_to_summarize = contents.map { |c| format_content_item(c) }.join
-          truncated_content = tokenizer.truncate(text_to_summarize, max_tokens - reserved_tokens)
+          truncated_content = tokenizer.truncate(text_to_summarize, available_tokens)

           instructions += "<input>#{truncated_content}</input>\nAssistant:\n"

           completion(instructions)
         end

+        def summarize_single(chunk_text, opts)
+          summarize_chunk(chunk_text, opts.merge(single_chunk: true))
+        end
+
         private

         def summarize_chunk(chunk_text, opts)
@@ -50,8 +54,15 @@ module DiscourseAi
         end

         def build_base_prompt(opts)
+          initial_instruction =
+            if opts[:single_chunk]
+              "Summarize the following forum discussion inside the given <input> tag, creating a cohesive narrative."
+            else
+              "Summarize the following forum discussion inside the given <input> tag."
+            end
+
           base_prompt = <<~TEXT
-            Human: Summarize the following forum discussion inside the given <input> tag.
+            Human: #{initial_instruction}
             Include only the summary inside <ai> tags.
           TEXT

@@ -63,7 +74,7 @@ module DiscourseAi
             :content_title
           ]

-          base_prompt += "Don't use more than 400 words.\n"
+          base_prompt += "Don't use more than 400 words.\n" unless opts[:single_chunk]
         end

         def completion(prompt)
@@ -21,34 +21,13 @@ module DiscourseAi
           raise NotImplemented
         end

-        def summarize_in_chunks(contents, opts)
-          chunks = []
-
-          section = { ids: [], summary: "" }
-
-          contents.each do |item|
-            new_content = format_content_item(item)
-
-            if tokenizer.can_expand_tokens?(
-                 section[:summary],
-                 new_content,
-                 max_tokens - reserved_tokens,
-               )
-              section[:summary] += new_content
-              section[:ids] << item[:id]
-            else
-              chunks << section
-              section = { ids: [item[:id]], summary: new_content }
-            end
-          end
-
-          chunks << section if section[:summary].present?
-
-          chunks.each { |chunk| chunk[:summary] = summarize_chunk(chunk[:summary], opts) }
-
-          chunks
-        end
+        def summarize_in_chunks(chunks, opts)
+          chunks.map do |chunk|
+            chunk[:summary] = summarize_chunk(chunk[:summary], opts)
+            chunk
+          end
+        end

         def concatenate_summaries(_summaries)
           raise NotImplemented
         end
@@ -57,13 +36,7 @@ module DiscourseAi
           raise NotImplemented
         end

-        attr_reader :model
-
-        protected
-
-        attr_reader :max_tokens
-
-        def summarize_chunk(_chunk_text, _opts)
+        def summarize_single(chunk_text, opts)
           raise NotImplemented
         end

@@ -71,11 +44,29 @@ module DiscourseAi
           "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
         end

+        def available_tokens
+          max_tokens - reserved_tokens
+        end
+
+        attr_reader :model, :max_tokens
+
+        protected
+
         def reserved_tokens
           # Reserve tokens for the response and the base prompt
           # ~500 words
           700
         end
+
+        def summarize_chunk(_chunk_text, _opts)
+          raise NotImplemented
+        end
+
+        def tokenizer
+          raise NotImplemented
+        end
+
+        delegate :can_expand_tokens?, to: :tokenizer
       end
     end
   end

@@ -29,11 +29,15 @@ module DiscourseAi
         def summarize_with_truncation(contents, opts)
           text_to_summarize = contents.map { |c| format_content_item(c) }.join
           truncated_content =
-            ::DiscourseAi::Tokenizer::BertTokenizer.truncate(text_to_summarize, max_tokens)
+            ::DiscourseAi::Tokenizer::BertTokenizer.truncate(text_to_summarize, available_tokens)

           completion(truncated_content)
         end

+        def summarize_single(chunk_text, _opts)
+          completion(chunk_text)
+        end
+
         private

         def summarize_chunk(chunk_text, _opts)
@@ -37,7 +37,7 @@ module DiscourseAi
           messages = [{ role: "system", content: build_base_prompt(opts) }]

           text_to_summarize = contents.map { |c| format_content_item(c) }.join
-          truncated_content = tokenizer.truncate(text_to_summarize, max_tokens - reserved_tokens)
+          truncated_content = tokenizer.truncate(text_to_summarize, available_tokens)

           messages << {
             role: "user",
@@ -47,13 +47,24 @@ module DiscourseAi
           completion(messages)
         end

+        def summarize_single(chunk_text, opts)
+          summarize_chunk(chunk_text, opts.merge(single_chunk: true))
+        end
+
        private

         def summarize_chunk(chunk_text, opts)
+          summary_instruction =
+            if opts[:single_chunk]
+              "Summarize the following forum discussion, creating a cohesive narrative:"
+            else
+              "Summarize the following in 400 words:"
+            end
+
           completion(
             [
               { role: "system", content: build_base_prompt(opts) },
-              { role: "user", content: "Summarize the following in 400 words:\n#{chunk_text}" },
+              { role: "user", content: "#{summary_instruction}\n#{chunk_text}" },
             ],
           )
         end
@@ -18,13 +18,47 @@ module DiscourseAi

         def summarize(content)
           opts = content.except(:contents)
-          summaries = completion_model.summarize_in_chunks(content[:contents], opts)

-          return { summary: summaries.first[:summary], chunks: [] } if summaries.length == 1
+          chunks = split_into_chunks(content[:contents])

-          { summary: completion_model.concatenate_summaries(summaries), chunks: summaries }
+          if chunks.length == 1
+            { summary: completion_model.summarize_single(chunks.first[:summary], opts), chunks: [] }
+          else
+            summaries = completion_model.summarize_in_chunks(chunks, opts)
+
+            { summary: completion_model.concatenate_summaries(summaries), chunks: summaries }
+          end
         end
+
+        private
+
+        def split_into_chunks(contents)
+          section = { ids: [], summary: "" }
+
+          chunks =
+            contents.reduce([]) do |sections, item|
+              new_content = completion_model.format_content_item(item)
+
+              if completion_model.can_expand_tokens?(
+                   section[:summary],
+                   new_content,
+                   completion_model.available_tokens,
+                 )
+                section[:summary] += new_content
+                section[:ids] << item[:id]
+              else
+                sections << section
+                section = { ids: [item[:id]], summary: new_content }
+              end
+
+              sections
+            end
+
+          chunks << section if section[:summary].present?
+
+          chunks
+        end
       end
     end
   end
 end

@@ -16,6 +16,10 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do
     }
   end

+  def as_chunk(item)
+    { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
+  end
+
   def expected_messages(contents, opts)
     base_prompt = <<~TEXT
       Human: Summarize the following forum discussion inside the given <input> tag.
@@ -43,8 +47,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do
           "<ai>This is summary 1</ai>",
         )

-        summarized_chunks =
-          model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+        chunks = content[:contents].map { |c| as_chunk(c) }
+        summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }

        expect(summarized_chunks).to contain_exactly("This is summary 1")
       end
@@ -66,8 +70,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do
          )
         end

-        summarized_chunks =
-          model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+        chunks = content[:contents].map { |c| as_chunk(c) }
+        summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }

        expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
       end
@@ -32,6 +32,10 @@ RSpec.describe DiscourseAi::Summarization::Models::Discourse do
     end
   end

+  def as_chunk(item)
+    { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
+  end
+
   describe "#summarize_in_chunks" do
     context "when the content fits in a single chunk" do
       it "performs a request to summarize" do
@@ -39,8 +43,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Discourse do

        stub_request(expected_messages(content[:contents], opts), "This is summary 1")

-        summarized_chunks =
-          model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+        chunks = content[:contents].map { |c| as_chunk(c) }
+        summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }

        expect(summarized_chunks).to contain_exactly("This is summary 1")
       end
@@ -59,8 +63,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Discourse do
          stub_request(expected_messages([item], opts), "This is summary #{idx + 1}")
         end

-        summarized_chunks =
-          model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+        chunks = content[:contents].map { |c| as_chunk(c) }
+        summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }

        expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
       end
@@ -16,6 +16,10 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do
     }
   end

+  def as_chunk(item)
+    { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
+  end
+
   def expected_messages(contents, opts)
     base_prompt = <<~TEXT
       You are a summarization bot.
@@ -46,8 +50,8 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do
           "This is summary 1",
         )

-        summarized_chunks =
-          model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+        chunks = content[:contents].map { |c| as_chunk(c) }
+        summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }

        expect(summarized_chunks).to contain_exactly("This is summary 1")
       end
@@ -69,8 +73,8 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do
          )
         end

-        summarized_chunks =
-          model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+        chunks = content[:contents].map { |c| as_chunk(c) }
+        summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }

        expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
       end
@@ -4,38 +4,26 @@ class DummyCompletionModel
   SINGLE_SUMMARY = "this is a single summary"
   CONCATENATED_SUMMARIES = "this is a concatenated summary"

-  def initialize(prompt_length)
-    @max_length = prompt_length
+  def initialize(max_tokens)
     @summarization_calls = 0
+    @available_tokens = max_tokens
   end

-  attr_reader :max_length, :summarization_calls
+  attr_reader :max_length, :summarization_calls, :available_tokens

-  def summarize_in_chunks(contents, opts)
-    chunks = []
+  delegate :can_expand_tokens?, to: :tokenizer

-    section = { ids: [], summary: "" }
+  def summarize_single(single_chunk, opts)
+    @summarization_calls += 1
+    SINGLE_SUMMARY
+  end

-    contents.each do |item|
-      new_content = "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
-
-      if tokenizer.can_expand_tokens?(section[:summary], new_content, max_length)
-        section[:summary] += new_content
-        section[:ids] << item[:id]
-      else
-        chunks << section
-        section = { id: [item[:id]], summary: new_content }
-      end
-    end
-
-    chunks << section if section[:summary].present?
-
-    chunks.each do |chunk|
+  def summarize_in_chunks(chunks, opts)
+    chunks.map do |chunk|
       chunk[:summary] = SINGLE_SUMMARY
       @summarization_calls += 1
+      chunk
     end
-
-    chunks
   end

   def concatenate_summaries(summaries)
@@ -48,6 +36,10 @@ class DummyCompletionModel
     SINGLE_SUMMARY
   end

+  def format_content_item(item)
+    "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
+  end
+
   def tokenizer
     DiscourseAi::Tokenizer::BertTokenizer
   end
|
Loading…
x
Reference in New Issue
Block a user