diff --git a/lib/modules/summarization/models/anthropic.rb b/lib/modules/summarization/models/anthropic.rb index d038385e..5c4d37d3 100644 --- a/lib/modules/summarization/models/anthropic.rb +++ b/lib/modules/summarization/models/anthropic.rb @@ -36,13 +36,17 @@ module DiscourseAi instructions = build_base_prompt(opts) text_to_summarize = contents.map { |c| format_content_item(c) }.join - truncated_content = tokenizer.truncate(text_to_summarize, max_tokens - reserved_tokens) + truncated_content = tokenizer.truncate(text_to_summarize, available_tokens) instructions += "#{truncated_content}\nAssistant:\n" completion(instructions) end + def summarize_single(chunk_text, opts) + summarize_chunk(chunk_text, opts.merge(single_chunk: true)) + end + private def summarize_chunk(chunk_text, opts) @@ -50,8 +54,15 @@ module DiscourseAi end def build_base_prompt(opts) + initial_instruction = + if opts[:single_chunk] + "Summarize the following forum discussion inside the given tag, creating a cohesive narrative." + else + "Summarize the following forum discussion inside the given tag." + end + base_prompt = <<~TEXT - Human: Summarize the following forum discussion inside the given tag. + Human: #{initial_instruction} Include only the summary inside tags. 
TEXT @@ -63,7 +74,7 @@ module DiscourseAi :content_title ] - base_prompt += "Don't use more than 400 words.\n" + base_prompt += "Don't use more than 400 words.\n" unless opts[:single_chunk] end def completion(prompt) diff --git a/lib/modules/summarization/models/base.rb b/lib/modules/summarization/models/base.rb index 769288c9..1ce220fb 100644 --- a/lib/modules/summarization/models/base.rb +++ b/lib/modules/summarization/models/base.rb @@ -21,32 +21,11 @@ module DiscourseAi raise NotImplemented end - def summarize_in_chunks(contents, opts) - chunks = [] - - section = { ids: [], summary: "" } - - contents.each do |item| - new_content = format_content_item(item) - - if tokenizer.can_expand_tokens?( - section[:summary], - new_content, - max_tokens - reserved_tokens, - ) - section[:summary] += new_content - section[:ids] << item[:id] - else - chunks << section - section = { ids: [item[:id]], summary: new_content } - end + def summarize_in_chunks(chunks, opts) + chunks.map do |chunk| + chunk[:summary] = summarize_chunk(chunk[:summary], opts) + chunk end - - chunks << section if section[:summary].present? 
- - chunks.each { |chunk| chunk[:summary] = summarize_chunk(chunk[:summary], opts) } - - chunks end def concatenate_summaries(_summaries) @@ -57,13 +36,7 @@ module DiscourseAi raise NotImplemented end - attr_reader :model - - protected - - attr_reader :max_tokens - - def summarize_chunk(_chunk_text, _opts) + def summarize_single(chunk_text, opts) raise NotImplemented end @@ -71,11 +44,29 @@ module DiscourseAi "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " end + def available_tokens + max_tokens - reserved_tokens + end + + attr_reader :model, :max_tokens + + protected + def reserved_tokens # Reserve tokens for the response and the base prompt # ~500 words 700 end + + def summarize_chunk(_chunk_text, _opts) + raise NotImplemented + end + + def tokenizer + raise NotImplemented + end + + delegate :can_expand_tokens?, to: :tokenizer end end end diff --git a/lib/modules/summarization/models/discourse.rb b/lib/modules/summarization/models/discourse.rb index 240f1fb8..79669aa6 100644 --- a/lib/modules/summarization/models/discourse.rb +++ b/lib/modules/summarization/models/discourse.rb @@ -29,11 +29,15 @@ module DiscourseAi def summarize_with_truncation(contents, opts) text_to_summarize = contents.map { |c| format_content_item(c) }.join truncated_content = - ::DiscourseAi::Tokenizer::BertTokenizer.truncate(text_to_summarize, max_tokens) + ::DiscourseAi::Tokenizer::BertTokenizer.truncate(text_to_summarize, available_tokens) completion(truncated_content) end + def summarize_single(chunk_text, _opts) + completion(chunk_text) + end + private def summarize_chunk(chunk_text, _opts) diff --git a/lib/modules/summarization/models/open_ai.rb b/lib/modules/summarization/models/open_ai.rb index cd91d31e..54ac8c5f 100644 --- a/lib/modules/summarization/models/open_ai.rb +++ b/lib/modules/summarization/models/open_ai.rb @@ -37,7 +37,7 @@ module DiscourseAi messages = [{ role: "system", content: build_base_prompt(opts) }] text_to_summarize = contents.map { |c| 
format_content_item(c) }.join - truncated_content = tokenizer.truncate(text_to_summarize, max_tokens - reserved_tokens) + truncated_content = tokenizer.truncate(text_to_summarize, available_tokens) messages << { role: "user", @@ -47,13 +47,24 @@ module DiscourseAi completion(messages) end + def summarize_single(chunk_text, opts) + summarize_chunk(chunk_text, opts.merge(single_chunk: true)) + end + private def summarize_chunk(chunk_text, opts) + summary_instruction = + if opts[:single_chunk] + "Summarize the following forum discussion, creating a cohesive narrative:" + else + "Summarize the following in 400 words:" + end + completion( [ { role: "system", content: build_base_prompt(opts) }, - { role: "user", content: "Summarize the following in 400 words:\n#{chunk_text}" }, + { role: "user", content: "#{summary_instruction}\n#{chunk_text}" }, ], ) end diff --git a/lib/modules/summarization/strategies/fold_content.rb b/lib/modules/summarization/strategies/fold_content.rb index 2f4508d6..f6d13c23 100644 --- a/lib/modules/summarization/strategies/fold_content.rb +++ b/lib/modules/summarization/strategies/fold_content.rb @@ -18,11 +18,45 @@ module DiscourseAi def summarize(content) opts = content.except(:contents) - summaries = completion_model.summarize_in_chunks(content[:contents], opts) - return { summary: summaries.first[:summary], chunks: [] } if summaries.length == 1 + chunks = split_into_chunks(content[:contents]) - { summary: completion_model.concatenate_summaries(summaries), chunks: summaries } + if chunks.length == 1 + { summary: completion_model.summarize_single(chunks.first[:summary], opts), chunks: [] } + else + summaries = completion_model.summarize_in_chunks(chunks, opts) + + { summary: completion_model.concatenate_summaries(summaries), chunks: summaries } + end + end + + private + + def split_into_chunks(contents) + section = { ids: [], summary: "" } + + chunks = + contents.reduce([]) do |sections, item| + new_content = 
completion_model.format_content_item(item) + + if completion_model.can_expand_tokens?( + section[:summary], + new_content, + completion_model.available_tokens, + ) + section[:summary] += new_content + section[:ids] << item[:id] + else + sections << section + section = { ids: [item[:id]], summary: new_content } + end + + sections + end + + chunks << section if section[:summary].present? + + chunks end end end diff --git a/spec/lib/modules/summarization/models/anthropic_spec.rb b/spec/lib/modules/summarization/models/anthropic_spec.rb index f3c985e2..db9af083 100644 --- a/spec/lib/modules/summarization/models/anthropic_spec.rb +++ b/spec/lib/modules/summarization/models/anthropic_spec.rb @@ -16,6 +16,10 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do } end + def as_chunk(item) + { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " } + end + def expected_messages(contents, opts) base_prompt = <<~TEXT Human: Summarize the following forum discussion inside the given tag. 
@@ -43,8 +47,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do "This is summary 1", ) - summarized_chunks = - model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] } + chunks = content[:contents].map { |c| as_chunk(c) } + summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] } expect(summarized_chunks).to contain_exactly("This is summary 1") end @@ -66,8 +70,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Anthropic do ) end - summarized_chunks = - model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] } + chunks = content[:contents].map { |c| as_chunk(c) } + summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] } expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2") end diff --git a/spec/lib/modules/summarization/models/discourse_spec.rb b/spec/lib/modules/summarization/models/discourse_spec.rb index 3f0d6bdd..cd1c768a 100644 --- a/spec/lib/modules/summarization/models/discourse_spec.rb +++ b/spec/lib/modules/summarization/models/discourse_spec.rb @@ -32,6 +32,10 @@ RSpec.describe DiscourseAi::Summarization::Models::Discourse do end end + def as_chunk(item) + { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " } + end + describe "#summarize_in_chunks" do context "when the content fits in a single chunk" do it "performs a request to summarize" do @@ -39,8 +43,8 @@ RSpec.describe DiscourseAi::Summarization::Models::Discourse do stub_request(expected_messages(content[:contents], opts), "This is summary 1") - summarized_chunks = - model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] } + chunks = content[:contents].map { |c| as_chunk(c) } + summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] } expect(summarized_chunks).to contain_exactly("This is summary 1") end @@ -59,8 +63,8 @@ RSpec.describe 
DiscourseAi::Summarization::Models::Discourse do stub_request(expected_messages([item], opts), "This is summary #{idx + 1}") end - summarized_chunks = - model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] } + chunks = content[:contents].map { |c| as_chunk(c) } + summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] } expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2") end diff --git a/spec/lib/modules/summarization/models/open_ai_spec.rb b/spec/lib/modules/summarization/models/open_ai_spec.rb index 7e1a24f5..d3493d37 100644 --- a/spec/lib/modules/summarization/models/open_ai_spec.rb +++ b/spec/lib/modules/summarization/models/open_ai_spec.rb @@ -16,6 +16,10 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do } end + def as_chunk(item) + { ids: [item[:id]], summary: "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " } + end + def expected_messages(contents, opts) base_prompt = <<~TEXT You are a summarization bot. 
@@ -46,8 +50,8 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do "This is summary 1", ) - summarized_chunks = - model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] } + chunks = content[:contents].map { |c| as_chunk(c) } + summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] } expect(summarized_chunks).to contain_exactly("This is summary 1") end @@ -69,8 +73,8 @@ RSpec.describe DiscourseAi::Summarization::Models::OpenAi do ) end - summarized_chunks = - model.summarize_in_chunks(content[:contents], opts).map { |c| c[:summary] } + chunks = content[:contents].map { |c| as_chunk(c) } + summarized_chunks = model.summarize_in_chunks(chunks, opts).map { |c| c[:summary] } expect(summarized_chunks).to contain_exactly("This is summary 1", "This is summary 2") end diff --git a/spec/support/summarization/dummy_completion_model.rb b/spec/support/summarization/dummy_completion_model.rb index 3c4136c0..a83a434a 100644 --- a/spec/support/summarization/dummy_completion_model.rb +++ b/spec/support/summarization/dummy_completion_model.rb @@ -4,38 +4,26 @@ class DummyCompletionModel SINGLE_SUMMARY = "this is a single summary" CONCATENATED_SUMMARIES = "this is a concatenated summary" - def initialize(prompt_length) - @max_length = prompt_length + def initialize(max_tokens) @summarization_calls = 0 + @available_tokens = max_tokens end - attr_reader :max_length, :summarization_calls + attr_reader :summarization_calls, :available_tokens - def summarize_in_chunks(contents, opts) - chunks = [] + delegate :can_expand_tokens?, to: :tokenizer - section = { ids: [], summary: "" } + def summarize_single(single_chunk, opts) + @summarization_calls += 1 + SINGLE_SUMMARY + end - contents.each do |item| - new_content = "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " - - if tokenizer.can_expand_tokens?(section[:summary], new_content, max_length) - section[:summary] += new_content - section[:ids] << item[:id] - else -
chunks << section - section = { id: [item[:id]], summary: new_content } - end - end - - chunks << section if section[:summary].present? - - chunks.each do |chunk| + def summarize_in_chunks(chunks, opts) + chunks.map do |chunk| chunk[:summary] = SINGLE_SUMMARY @summarization_calls += 1 + chunk end - - chunks end def concatenate_summaries(summaries) @@ -48,6 +36,10 @@ class DummyCompletionModel SINGLE_SUMMARY end + def format_content_item(item) + "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " + end + def tokenizer DiscourseAi::Tokenizer::BertTokenizer end