diff --git a/lib/summarization/fold_content.rb b/lib/summarization/fold_content.rb index 9df6d608..77d7f787 100644 --- a/lib/summarization/fold_content.rb +++ b/lib/summarization/fold_content.rb @@ -25,8 +25,10 @@ module DiscourseAi def summarize(user, &on_partial_blk) base_summary = "" initial_pos = 0 - folded_summary = - fold(content_to_summarize, base_summary, initial_pos, user, &on_partial_blk) + + truncated_content = content_to_summarize.map { |cts| truncate(cts) } + + folded_summary = fold(truncated_content, base_summary, initial_pos, user, &on_partial_blk) clean_summary = Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary @@ -37,7 +39,7 @@ module DiscourseAi strategy.type, llm_model.name, clean_summary, - content_to_summarize.map { |c| c[:id] }, + truncated_content.map { |c| c[:id] }, ) else AiSummary.new(summarized_text: clean_summary) @@ -121,9 +123,9 @@ module DiscourseAi prompt = ( if summary.blank? - strategy.first_summary_prompt(iteration_content, tokenizer) + strategy.first_summary_prompt(iteration_content) else - strategy.summary_extension_prompt(summary, iteration_content, tokenizer) + strategy.summary_extension_prompt(summary, iteration_content) end ) @@ -143,6 +145,22 @@ module DiscourseAi llm_model.max_prompt_tokens - reserved_tokens end + + def truncate(item) + item_content = item[:text].to_s + split_1, split_2 = + [item_content[0, item_content.size / 2], item_content[(item_content.size / 2)..-1]] + + truncation_length = 500 + tokenizer = llm_model.tokenizer_class + + item[:text] = [ + tokenizer.truncate(split_1, truncation_length), + tokenizer.truncate(split_2.reverse, truncation_length).reverse, + ].join(" ") + + item + end end end end diff --git a/lib/summarization/strategies/base.rb b/lib/summarization/strategies/base.rb index 57dfad1d..f9a5e182 100644 --- a/lib/summarization/strategies/base.rb +++ b/lib/summarization/strategies/base.rb @@ -34,12 +34,12 @@ module DiscourseAi end # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary. - def summary_extension_prompt(_summary, _texts_to_summarize, _tokenizer) + def summary_extension_prompt(_summary, _texts_to_summarize) raise NotImplementedError end # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content. - def first_summary_prompt(_input, _tokenizer) + def first_summary_prompt(_input) raise NotImplementedError end diff --git a/lib/summarization/strategies/chat_messages.rb b/lib/summarization/strategies/chat_messages.rb index a50fb108..1f3aad6d 100644 --- a/lib/summarization/strategies/chat_messages.rb +++ b/lib/summarization/strategies/chat_messages.rb @@ -23,7 +23,7 @@ module DiscourseAi .map { { id: _1, poster: _2, text: _3 } } end - def summary_extension_prompt(summary, contents, _tokenizer) + def summary_extension_prompt(summary, contents) input = contents .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " } @@ -63,7 +63,7 @@ module DiscourseAi prompt end - def first_summary_prompt(contents, _tokenizer) + def first_summary_prompt(contents) content_title = target.name input = contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join diff --git a/lib/summarization/strategies/hot_topic_gists.rb b/lib/summarization/strategies/hot_topic_gists.rb index 69e0269f..e0271be6 100644 --- a/lib/summarization/strategies/hot_topic_gists.rb +++ b/lib/summarization/strategies/hot_topic_gists.rb @@ -57,7 +57,7 @@ module DiscourseAi end end - def summary_extension_prompt(summary, contents, _tokenizer) + def summary_extension_prompt(summary, contents) statements = contents .to_a @@ -98,22 +98,11 @@ module DiscourseAi prompt end - def first_summary_prompt(contents, tokenizer) + def first_summary_prompt(contents) content_title = target.title statements = contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " } - op_statement = statements.shift.to_s - split_1, split_2 = - [op_statement[0, op_statement.size / 2], op_statement[(op_statement.size / 2)..-1]] - - truncation_length = 500 - - op_statement = [ - tokenizer.truncate(split_1, truncation_length), - tokenizer.truncate(split_2.reverse, truncation_length).reverse, - ].join(" ") - prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip) You are an advanced summarization bot. Analyze a given conversation and produce a concise, single-sentence summary that conveys the main topic and current developments to someone with no prior context. @@ -138,7 +127,7 @@ module DiscourseAi The conversation began with the following statement: - #{op_statement}\n + #{statements.shift}\n TEXT if statements.present? diff --git a/lib/summarization/strategies/topic_summary.rb b/lib/summarization/strategies/topic_summary.rb index 1cac7ee4..2b126383 100644 --- a/lib/summarization/strategies/topic_summary.rb +++ b/lib/summarization/strategies/topic_summary.rb @@ -27,7 +27,7 @@ module DiscourseAi end end - def summary_extension_prompt(summary, contents, _tokenizer) + def summary_extension_prompt(summary, contents) resource_path = "#{Discourse.base_path}/t/-/#{target.id}" content_title = target.title input = @@ -70,7 +70,7 @@ module DiscourseAi prompt end - def first_summary_prompt(contents, _tokenizer) + def first_summary_prompt(contents) resource_path = "#{Discourse.base_path}/t/-/#{target.id}" content_title = target.title input =