diff --git a/lib/modules/summarization/models/anthropic.rb b/lib/modules/summarization/models/anthropic.rb
index d038385e..5c4d37d3 100644
--- a/lib/modules/summarization/models/anthropic.rb
+++ b/lib/modules/summarization/models/anthropic.rb
@@ -36,13 +36,17 @@ module DiscourseAi
instructions = build_base_prompt(opts)
text_to_summarize = contents.map { |c| format_content_item(c) }.join
- truncated_content = tokenizer.truncate(text_to_summarize, max_tokens - reserved_tokens)
+ truncated_content = tokenizer.truncate(text_to_summarize, available_tokens)
instructions += "#{truncated_content}\nAssistant:\n"
completion(instructions)
end
+ def summarize_single(chunk_text, opts)
+ summarize_chunk(chunk_text, opts.merge(single_chunk: true))
+ end
+
private
def summarize_chunk(chunk_text, opts)
@@ -50,8 +54,15 @@ module DiscourseAi
end
def build_base_prompt(opts)
+ initial_instruction =
+ if opts[:single_chunk]
+ "Summarize the following forum discussion inside the given tag, creating a cohesive narrative."
+ else
+ "Summarize the following forum discussion inside the given tag."
+ end
+
base_prompt = <<~TEXT
- Human: Summarize the following forum discussion inside the given tag.
+ Human: #{initial_instruction}
Include only the summary inside tags.
TEXT
@@ -63,7 +74,7 @@ module DiscourseAi
:content_title
]
- base_prompt += "Don't use more than 400 words.\n"
+ opts[:single_chunk] ? base_prompt : base_prompt + "Don't use more than 400 words.\n" # always return the prompt; a skipped modifier-unless would yield nil
end
def completion(prompt)
diff --git a/lib/modules/summarization/models/base.rb b/lib/modules/summarization/models/base.rb
index 769288c9..1ce220fb 100644
--- a/lib/modules/summarization/models/base.rb
+++ b/lib/modules/summarization/models/base.rb
@@ -21,32 +21,11 @@ module DiscourseAi
raise NotImplemented
end
- def summarize_in_chunks(contents, opts)
- chunks = []
-
- section = { ids: [], summary: "" }
-
- contents.each do |item|
- new_content = format_content_item(item)
-
- if tokenizer.can_expand_tokens?(
- section[:summary],
- new_content,
- max_tokens - reserved_tokens,
- )
- section[:summary] += new_content
- section[:ids] << item[:id]
- else
- chunks << section
- section = { ids: [item[:id]], summary: new_content }
- end
+ def summarize_in_chunks(chunks, opts)
+ chunks.map do |chunk|
+ chunk[:summary] = summarize_chunk(chunk[:summary], opts)
+ chunk
end
-
- chunks << section if section[:summary].present?
-
- chunks.each { |chunk| chunk[:summary] = summarize_chunk(chunk[:summary], opts) }
-
- chunks
end
def concatenate_summaries(_summaries)
@@ -57,13 +36,7 @@ module DiscourseAi
raise NotImplemented
end
- attr_reader :model
-
- protected
-
- attr_reader :max_tokens
-
- def summarize_chunk(_chunk_text, _opts)
+ def summarize_single(_chunk_text, _opts)
raise NotImplemented
end
@@ -71,11 +44,29 @@ module DiscourseAi
"(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
end
+ def available_tokens
+ max_tokens - reserved_tokens
+ end
+
+ attr_reader :model, :max_tokens
+
+ protected
+
def reserved_tokens
# Reserve tokens for the response and the base prompt
# ~500 words
700
end
+
+ def summarize_chunk(_chunk_text, _opts)
+ raise NotImplemented
+ end
+
+ def tokenizer
+ raise NotImplemented
+ end
+
+ delegate :can_expand_tokens?, to: :tokenizer
end
end
end
diff --git a/lib/modules/summarization/models/discourse.rb b/lib/modules/summarization/models/discourse.rb
index 240f1fb8..79669aa6 100644
--- a/lib/modules/summarization/models/discourse.rb
+++ b/lib/modules/summarization/models/discourse.rb
@@ -29,11 +29,15 @@ module DiscourseAi
def summarize_with_truncation(contents, opts)
text_to_summarize = contents.map { |c| format_content_item(c) }.join
truncated_content =
- ::DiscourseAi::Tokenizer::BertTokenizer.truncate(text_to_summarize, max_tokens)
+ ::DiscourseAi::Tokenizer::BertTokenizer.truncate(text_to_summarize, available_tokens)
completion(truncated_content)
end
+ def summarize_single(chunk_text, _opts)
+ completion(chunk_text)
+ end
+
private
def summarize_chunk(chunk_text, _opts)
diff --git a/lib/modules/summarization/models/open_ai.rb b/lib/modules/summarization/models/open_ai.rb
index cd91d31e..54ac8c5f 100644
--- a/lib/modules/summarization/models/open_ai.rb
+++ b/lib/modules/summarization/models/open_ai.rb
@@ -37,7 +37,7 @@ module DiscourseAi
messages = [{ role: "system", content: build_base_prompt(opts) }]
text_to_summarize = contents.map { |c| format_content_item(c) }.join
- truncated_content = tokenizer.truncate(text_to_summarize, max_tokens - reserved_tokens)
+ truncated_content = tokenizer.truncate(text_to_summarize, available_tokens)
messages << {
role: "user",
@@ -47,13 +47,24 @@ module DiscourseAi
completion(messages)
end
+ def summarize_single(chunk_text, opts)
+ summarize_chunk(chunk_text, opts.merge(single_chunk: true))
+ end
+
private
def summarize_chunk(chunk_text, opts)
+ summary_instruction =
+ if opts[:single_chunk]
+ "Summarize the following forum discussion, creating a cohesive narrative:"
+ else
+ "Summarize the following in 400 words:"
+ end
+
completion(
[
{ role: "system", content: build_base_prompt(opts) },
- { role: "user", content: "Summarize the following in 400 words:\n#{chunk_text}" },
+ { role: "user", content: "#{summary_instruction}\n#{chunk_text}" },
],
)
end
diff --git a/lib/modules/summarization/strategies/fold_content.rb b/lib/modules/summarization/strategies/fold_content.rb
index 2f4508d6..f6d13c23 100644
--- a/lib/modules/summarization/strategies/fold_content.rb
+++ b/lib/modules/summarization/strategies/fold_content.rb
@@ -18,11 +18,45 @@ module DiscourseAi
def summarize(content)
opts = content.except(:contents)
- summaries = completion_model.summarize_in_chunks(content[:contents], opts)
- return { summary: summaries.first[:summary], chunks: [] } if summaries.length == 1
+ chunks = split_into_chunks(content[:contents])
- { summary: completion_model.concatenate_summaries(summaries), chunks: summaries }
+ if chunks.length == 1
+ { summary: completion_model.summarize_single(chunks.first[:summary], opts), chunks: [] }
+ else
+ summaries = completion_model.summarize_in_chunks(chunks, opts)
+
+ { summary: completion_model.concatenate_summaries(summaries), chunks: summaries }
+ end
+ end
+
+ private
+
+ # Greedily packs formatted items into { ids:, summary: } chunks, in input order,
+ # starting a new chunk whenever the model reports the current one cannot be expanded.
+ def split_into_chunks(contents)
+ section = { ids: [], summary: "" }
+ chunks = []
+
+ contents.each do |item|
+ new_content = completion_model.format_content_item(item)
+
+ if completion_model.can_expand_tokens?(
+ section[:summary],
+ new_content,
+ completion_model.available_tokens,
+ )
+ section[:summary] += new_content
+ section[:ids] << item[:id]
+ else
+ # Do not emit the still-empty initial section when the very first item is oversized.
+ chunks << section if section[:summary].present?
+ section = { ids: [item[:id]], summary: new_content }
+ end
+ end
+
+ chunks << section if section[:summary].present?
+ chunks
end
end
end
diff --git a/spec/lib/modules/summarization/models/anthropic_spec.rb b/spec/lib/modules/summarization/models/anthropic_spec.rb
index f3c985e2..db9af083 100644
--- a/spec/lib/modules/summarization/models/anthropic_spec.rb
+++ b/spec/lib/modules/summarization/models/anthropic_spec.rb
@@ -16,6 +16,10 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do
}
end
+ def as_chunk(item)
+ { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
+ end
+
def expected_messages(contents, opts)
base_prompt = <<~TEXT
Human: Summarize the following forum discussion inside the given tag.
@@ -43,8 +47,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do
"This is summary 1",
)
- summarized_chunks =
- model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+ chunks = content[:contents].map { |c| as_chunk(c) }
+ summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1")
end
@@ -66,8 +70,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do
)
end
- summarized_chunks =
- model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+ chunks = content[:contents].map { |c| as_chunk(c) }
+ summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
end
diff --git a/spec/lib/modules/summarization/models/discourse_spec.rb b/spec/lib/modules/summarization/models/discourse_spec.rb
index 3f0d6bdd..cd1c768a 100644
--- a/spec/lib/modules/summarization/models/discourse_spec.rb
+++ b/spec/lib/modules/summarization/models/discourse_spec.rb
@@ -32,6 +32,10 @@ RSpec.describe DiscourseAi::Summarization::Models::Discourse do
end
end
+ def as_chunk(item)
+ { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
+ end
+
describe "#summarize_in_chunks" do
context "when the content fits in a single chunk" do
it "performs a request to summarize" do
@@ -39,8 +43,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Discourse do
stub_request(expected_messages(content[:contents], opts), "This is summary 1")
- summarized_chunks =
- model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+ chunks = content[:contents].map { |c| as_chunk(c) }
+ summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1")
end
@@ -59,8 +63,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Discourse do
stub_request(expected_messages([item], opts), "This is summary #{idx + 1}")
end
- summarized_chunks =
- model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+ chunks = content[:contents].map { |c| as_chunk(c) }
+ summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
end
diff --git a/spec/lib/modules/summarization/models/open_ai_spec.rb b/spec/lib/modules/summarization/models/open_ai_spec.rb
index 7e1a24f5..d3493d37 100644
--- a/spec/lib/modules/summarization/models/open_ai_spec.rb
+++ b/spec/lib/modules/summarization/models/open_ai_spec.rb
@@ -16,6 +16,10 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do
}
end
+ def as_chunk(item)
+ { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
+ end
+
def expected_messages(contents, opts)
base_prompt = <<~TEXT
You are a summarization bot.
@@ -46,8 +50,8 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do
"This is summary 1",
)
- summarized_chunks =
- model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+ chunks = content[:contents].map { |c| as_chunk(c) }
+ summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1")
end
@@ -69,8 +73,8 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do
)
end
- summarized_chunks =
- model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] }
+ chunks = content[:contents].map { |c| as_chunk(c) }
+ summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] }
expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2")
end
diff --git a/spec/support/summarization/dummy_completion_model.rb b/spec/support/summarization/dummy_completion_model.rb
index 3c4136c0..a83a434a 100644
--- a/spec/support/summarization/dummy_completion_model.rb
+++ b/spec/support/summarization/dummy_completion_model.rb
@@ -4,38 +4,26 @@ class DummyCompletionModel
SINGLE_SUMMARY = "this is a single summary"
CONCATENATED_SUMMARIES = "this is a concatenated summary"
- def initialize(prompt_length)
- @max_length = prompt_length
+ def initialize(max_tokens)
@summarization_calls = 0
+ @available_tokens = max_tokens
end
- attr_reader :max_length, :summarization_calls
+ attr_reader :max_length, :summarization_calls, :available_tokens
- def summarize_in_chunks(contents, opts)
- chunks = []
+ delegate :can_expand_tokens?, to: :tokenizer
- section = { ids: [], summary: "" }
+ def summarize_single(single_chunk, opts)
+ @summarization_calls += 1
+ SINGLE_SUMMARY
+ end
- contents.each do |item|
- new_content = "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
-
- if tokenizer.can_expand_tokens?(section[:summary], new_content, max_length)
- section[:summary] += new_content
- section[:ids] << item[:id]
- else
- chunks << section
- section = { id: [item[:id]], summary: new_content }
- end
- end
-
- chunks << section if section[:summary].present?
-
- chunks.each do |chunk|
+ def summarize_in_chunks(chunks, opts)
+ chunks.map do |chunk|
chunk[:summary] = SINGLE_SUMMARY
@summarization_calls += 1
+ chunk
end
-
- chunks
end
def concatenate_summaries(summaries)
@@ -48,6 +36,10 @@ class DummyCompletionModel
SINGLE_SUMMARY
end
+ def format_content_item(item)
+ "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
+ end
+
def tokenizer
DiscourseAi::Tokenizer::BertTokenizer
end