From ec97996905bcf0e90ff8b115506c06bb6bb6ec64 Mon Sep 17 00:00:00 2001 From: Roman Rizzi Date: Fri, 25 Oct 2024 11:51:17 -0300 Subject: [PATCH] FIX/REFACTOR: FoldContent revamp (#866) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * FIX/REFACTOR: FoldContent revamp We hit a snag with our hot topic gist strategy: the regex we used to split the content didn't work, so we cannot send the original post separately. This was important for letting the model focus on what's new in the topic. The algorithm doesn’t give us full control over how prompts are written, and figuring out how to format the content isn't straightforward. This means we're having to use more complicated workarounds, like regex. To tackle this, I'm suggesting we simplify the approach a bit. Let's focus on summarizing as much as we can upfront, then gradually add new content until there's nothing left to summarize. Also, the "extend" part is mostly for models with small context windows, which shouldn't pose a problem 99% of the time with the content volume we're dealing with. 
* Fix fold docs * Use #shift instead of #pop to get the first elem, not the last --- .../summarization/chat_summary_controller.rb | 2 +- lib/summarization/fold_content.rb | 141 ++++++------------ lib/summarization/strategies/base.rb | 25 +--- lib/summarization/strategies/chat_messages.rb | 52 +++++-- .../strategies/hot_topic_gists.rb | 62 +++++--- lib/summarization/strategies/topic_summary.rb | 127 +++++++++------- lib/tokenizer/basic_tokenizer.rb | 8 +- lib/tokenizer/open_ai_tokenizer.rb | 8 +- .../summarization/fold_content_spec.rb | 31 +--- .../strategies/hot_topic_gists_spec.rb | 11 +- .../strategies/topic_summary_spec.rb | 9 +- spec/shared/tokenizer_spec.rb | 10 +- 12 files changed, 227 insertions(+), 259 deletions(-) diff --git a/app/controllers/discourse_ai/summarization/chat_summary_controller.rb b/app/controllers/discourse_ai/summarization/chat_summary_controller.rb index 5354c539..c66b1f2c 100644 --- a/app/controllers/discourse_ai/summarization/chat_summary_controller.rb +++ b/app/controllers/discourse_ai/summarization/chat_summary_controller.rb @@ -26,7 +26,7 @@ module DiscourseAi strategy = DiscourseAi::Summarization::Strategies::ChatMessages.new(channel, since) summarized_text = - if strategy.targets_data[:contents].empty? + if strategy.targets_data.empty? I18n.t("discourse_ai.summarization.chat.no_targets") else summarizer.summarize(current_user)&.summarized_text diff --git a/lib/summarization/fold_content.rb b/lib/summarization/fold_content.rb index b21f2632..a443e04f 100644 --- a/lib/summarization/fold_content.rb +++ b/lib/summarization/fold_content.rb @@ -18,35 +18,18 @@ module DiscourseAi attr_reader :llm, :strategy # @param user { User } - User object used for auditing usage. - # # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function. # Note: The block is only called with results of the final summary, not intermediate summaries. 
# # @returns { AiSummary } - Resulting summary. def summarize(user, &on_partial_blk) - opts = content_to_summarize.except(:contents) - - initial_chunks = - rebalance_chunks( - content_to_summarize[:contents].map do |c| - { ids: [c[:id]], summary: format_content_item(c) } - end, - ) - - # Special case where we can do all the summarization in one pass. - result = - if initial_chunks.length == 1 - { - summary: - summarize_single(initial_chunks.first[:summary], user, opts, &on_partial_blk), - chunks: [], - } - else - summarize_chunks(initial_chunks, user, opts, &on_partial_blk) - end + base_summary = "" + initial_pos = 0 + folded_summary = + fold(content_to_summarize, base_summary, initial_pos, user, &on_partial_blk) clean_summary = - Nokogiri::HTML5.fragment(result[:summary]).css("ai")&.first&.text || result[:summary] + Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary if persist_summaries AiSummary.store!( @@ -54,7 +37,7 @@ module DiscourseAi strategy.type, llm_model.name, clean_summary, - content_to_summarize[:contents].map { |c| c[:id] }, + content_to_summarize.map { |c| c[:id] }, ) else AiSummary.new(summarized_text: clean_summary) @@ -96,90 +79,58 @@ module DiscourseAi end def latest_sha - @latest_sha ||= AiSummary.build_sha(content_to_summarize[:contents].map { |c| c[:id] }.join) + @latest_sha ||= AiSummary.build_sha(content_to_summarize.map { |c| c[:id] }.join) end - def summarize_chunks(chunks, user, opts, &on_partial_blk) - # Safely assume we always have more than one chunk. - summarized_chunks = summarize_in_chunks(chunks, user, opts) - total_summaries_size = - llm_model.tokenizer_class.size(summarized_chunks.map { |s| s[:summary].to_s }.join) + # @param items { Array } - Content to summarize. Structure will be: { poster: who wrote the content, id: a way to order content, text: content } + # @param summary { String } - Intermediate summaries that we'll keep extending as part of our "folding" algorithm. 
+ # @param cursor { Integer } - Idx to know how much we already summarized. + # @param user { User } - User object used for auditing usage. + # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function. + # Note: The block is only called with results of the final summary, not intermediate summaries. + # + # The summarization algorithm. + # The idea is to build an initial summary packing as much content as we can. Once we have the initial summary, we'll keep extending using the leftover + # content until there is nothing left. + # + # @returns { String } - Resulting summary. + def fold(items, summary, cursor, user, &on_partial_blk) + tokenizer = llm_model.tokenizer_class + tokens_left = available_tokens - tokenizer.size(summary) + iteration_content = [] - if total_summaries_size < available_tokens - # Chunks are small enough, we can concatenate them. - { - summary: - concatenate_summaries( - summarized_chunks.map { |s| s[:summary] }, - user, - &on_partial_blk - ), - chunks: summarized_chunks, - } - else - # We have summarized chunks but we can't concatenate them yet. Split them into smaller summaries and summarize again. 
- rebalanced_chunks = rebalance_chunks(summarized_chunks) + items.each_with_index do |item, idx| + next if idx < cursor - summarize_chunks(rebalanced_chunks, user, opts, &on_partial_blk) - end - end + as_text = "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " - def format_content_item(item) - "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " - end - - def rebalance_chunks(chunks) - section = { ids: [], summary: "" } - - chunks = - chunks.reduce([]) do |sections, chunk| - if llm_model.tokenizer_class.can_expand_tokens?( - section[:summary], - chunk[:summary], - available_tokens, - ) - section[:summary] += chunk[:summary] - section[:ids] = section[:ids].concat(chunk[:ids]) - else - sections << section - section = chunk - end - - sections + if tokenizer.below_limit?(as_text, tokens_left) + iteration_content << item + tokens_left -= tokenizer.size(as_text) + cursor += 1 + else + break end + end - chunks << section if section[:summary].present? - - chunks - end - - def summarize_single(text, user, opts, &on_partial_blk) - prompt = strategy.summarize_single_prompt(text, opts) - - llm.generate(prompt, user: user, feature_name: "summarize", &on_partial_blk) - end - - def summarize_in_chunks(chunks, user, opts) - chunks.map do |chunk| - prompt = strategy.summarize_single_prompt(chunk[:summary], opts) - - chunk[:summary] = llm.generate( - prompt, - user: user, - max_tokens: 300, - feature_name: "summarize", + prompt = + ( + if summary.blank? 
+ strategy.first_summary_prompt(iteration_content) + else + strategy.summary_extension_prompt(summary, iteration_content) + end ) - chunk + if cursor == items.length + llm.generate(prompt, user: user, feature_name: "summarize", &on_partial_blk) + else + latest_summary = + llm.generate(prompt, user: user, max_tokens: 600, feature_name: "summarize") + fold(items, latest_summary, cursor, user, &on_partial_blk) end end - def concatenate_summaries(texts_to_summarize, user, &on_partial_blk) - prompt = strategy.concatenation_prompt(texts_to_summarize) - - llm.generate(prompt, user: user, &on_partial_blk) - end - def available_tokens # Reserve tokens for the response and the base prompt # ~500 words diff --git a/lib/summarization/strategies/base.rb b/lib/summarization/strategies/base.rb index 2ca76383..b4b1dbfd 100644 --- a/lib/summarization/strategies/base.rb +++ b/lib/summarization/strategies/base.rb @@ -11,7 +11,7 @@ module DiscourseAi @target = target end - attr_reader :target + attr_reader :target, :opts # The summary type differentiates instances of `AiSummary` pointing to a single target. # See the `summary_type` enum for available options. @@ -19,11 +19,9 @@ module DiscourseAi raise NotImplementedError end - # @returns { Hash } - Content to summarize. + # @returns { Array } - Content to summarize. # - # This method returns a hash with the content to summarize and additional information. - # The only mandatory key is `contents`, which must be an array of hashes with - # the following structure: + # This method returns an array of hashes with the content to summarize using the following structure: # # { # poster: A way to tell who write the content, @@ -31,26 +29,17 @@ module DiscourseAi # text: Text to summarize # } # - # Additionally, you could add more context, which will be available in the prompt. e.g.: - # - # { - # resource_path: "#{Discourse.base_path}/t/-/#{target.id}", - # content_title: target.title, - # contents: [...] 
- # } - # def targets_data raise NotImplementedError end - # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when concatenating multiple chunks. - def contatenation_prompt(_texts_to_summarize) + # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary. + def summary_extension_prompt(_summary, _texts_to_summarize) raise NotImplementedError end - # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM on each chunk, - # and when the whole content fits in one call. - def summarize_single_prompt(_input, _opts) + # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content. + def first_summary_prompt(_input) raise NotImplementedError end end diff --git a/lib/summarization/strategies/chat_messages.rb b/lib/summarization/strategies/chat_messages.rb index 3af267ff..1f3aad6d 100644 --- a/lib/summarization/strategies/chat_messages.rb +++ b/lib/summarization/strategies/chat_messages.rb @@ -14,38 +14,60 @@ module DiscourseAi end def targets_data - content = { content_title: target.name } - - content[:contents] = target + target .chat_messages .where("chat_messages.created_at > ?", since.hours.ago) .includes(:user) .order(created_at: :asc) .pluck(:id, :username_lower, :message) .map { { id: _1, poster: _2, text: _3 } } - - content end - def contatenation_prompt(texts_to_summarize) + def summary_extension_prompt(summary, contents) + input = + contents + .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " } + .join("\n") + prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip) - You are a summarization bot tasked with creating a cohesive narrative by intelligently merging multiple disjointed summaries. - Your response should consist of well-structured paragraphs that combines these summaries into a clear and comprehensive overview. - Avoid adding any additional text or commentary. 
Format your output using Discourse forum Markdown. + You are a summarization bot tasked with expanding on an existing summary by incorporating new chat messages. + Your goal is to seamlessly integrate the additional information into the existing summary, preserving the clarity and insights of the original while reflecting any new developments, themes, or conclusions. + Analyze the new messages to identify key themes, participants' intentions, and any significant decisions or resolutions. + Update the summary to include these aspects in a way that remains concise, comprehensive, and accessible to someone with no prior context of the conversation. + + ### Guidelines: + + - Merge the new information naturally with the existing summary without redundancy. + - Only include the updated summary, WITHOUT additional commentary. + - Don't mention the channel title. Avoid extraneous details or subjective opinions. + - Maintain the original language of the text being summarized. + - The same user could write multiple messages in a row, don't treat them as different persons. + - Aim for summaries to be extended by a reasonable amount, but strive to maintain a total length of 400 words or less, unless absolutely necessary for comprehensiveness. + TEXT prompt.push(type: :user, content: <<~TEXT.strip) - THESE are the summaries, each one separated by a newline, all of them inside XML tags: + ### Context: - - #{texts_to_summarize.join("\n")} - + This is the existing summary: + + #{summary} + + These are the new chat messages: + + #{input} + + Integrate the new messages into the existing summary. 
TEXT prompt end - def summarize_single_prompt(input, opts) + def first_summary_prompt(contents) + content_title = target.name + input = + contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join + prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip) You are a summarization bot designed to generate clear and insightful paragraphs that conveys the main topics and developments from a series of chat messages within a user-selected time window. @@ -62,7 +84,7 @@ module DiscourseAi TEXT prompt.push(type: :user, content: <<~TEXT.strip) - #{opts[:content_title].present? ? "The name of the channel is: " + opts[:content_title] + ".\n" : ""} + #{content_title.present? ? "The name of the channel is: " + content_title + ".\n" : ""} Here are the messages, inside XML tags: diff --git a/lib/summarization/strategies/hot_topic_gists.rb b/lib/summarization/strategies/hot_topic_gists.rb index ad61f2b0..c7954805 100644 --- a/lib/summarization/strategies/hot_topic_gists.rb +++ b/lib/summarization/strategies/hot_topic_gists.rb @@ -9,8 +9,6 @@ module DiscourseAi end def targets_data - content = { content_title: target.title, contents: [] } - op_post_number = 1 hot_topics_recent_cutoff = Time.zone.now - SiteSetting.hot_topics_recent_days.days @@ -44,44 +42,62 @@ module DiscourseAi .order(:post_number) .pluck(:post_number, :raw, :username) - posts_data.each do |(pn, raw, username)| + posts_data.reduce([]) do |memo, (pn, raw, username)| raw_text = raw if pn == 1 && target.topic_embed&.embed_content_cache.present? raw_text = target.topic_embed&.embed_content_cache end - content[:contents] << { poster: username, id: pn, text: raw_text } + memo << { poster: username, id: pn, text: raw_text } end - - content end - def concatenation_prompt(texts_to_summarize) - prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip) - You are a summarization bot tasked with creating a single, concise sentence by merging disjointed summaries into a cohesive statement. 
- Your response should strictly be this single, comprehensive sentence, without any additional text or comments. + def summary_extension_prompt(summary, contents) + statements = + contents + .to_a + .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " } + .join("\n") - - Focus on the central theme or issue being addressed, maintaining an objective and neutral tone. - - Exclude extraneous details or subjective opinions. + prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip) + You are an advanced summarization bot. Your task is to update an existing single-sentence summary by integrating new developments from a conversation. + Analyze the most recent messages to identify key updates or shifts in the main topic and reflect these in the updated summary. + Emphasize new significant information or developments within the context of the initial conversation theme. + + ### Guidelines: + + - Ensure the revised summary remains concise and objective, maintaining a focus on the central theme or issue. + - Omit extraneous details or subjective opinions. - Use the original language of the text. - Begin directly with the main topic or issue, avoiding introductory phrases. - - Limit the summary to a maximum of 20 words. + - Limit the updated summary to a maximum of 20 words. + - Return the 20-word summary inside tags. + TEXT prompt.push(type: :user, content: <<~TEXT.strip) - THESE are the summaries, each one separated by a newline, all of them inside XML tags: + ### Context: - - #{texts_to_summarize.join("\n")} - + This is the existing single-sentence summary: + + #{summary} + + And these are the new developments in the conversation: + + #{statements} + + Your task is to update an existing single-sentence summary by integrating new developments from a conversation. + Return the 20-word summary inside tags. 
TEXT prompt end - def summarize_single_prompt(input, opts) - statements = input.split(/(?=\d+\) \w+ said:)/) + def first_summary_prompt(contents) + content_title = target.title + statements = + contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " } prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip) You are an advanced summarization bot. Analyze a given conversation and produce a concise, @@ -95,25 +111,25 @@ module DiscourseAi - Use the original language of the text. - Begin directly with the main topic or issue, avoiding introductory phrases. - Limit the summary to a maximum of 20 words. + - Return the 20-word summary inside tags. - Return the 20-word summary inside tags. TEXT context = +<<~TEXT ### Context: - #{opts[:content_title].present? ? "The discussion title is: " + opts[:content_title] + ".\n" : ""} + #{content_title.present? ? "The discussion title is: " + content_title + ".\n" : ""} The conversation began with the following statement: - #{statements&.pop}\n + #{statements.shift}\n TEXT if statements.present? context << <<~TEXT Subsequent discussion includes the following: - #{statements&.join("\n")} + #{statements.join("\n")} Your task is to focus on these latest messages, capturing their meaning in the context of the initial statement. TEXT diff --git a/lib/summarization/strategies/topic_summary.rb b/lib/summarization/strategies/topic_summary.rb index 945caace..2b126383 100644 --- a/lib/summarization/strategies/topic_summary.rb +++ b/lib/summarization/strategies/topic_summary.rb @@ -9,12 +9,6 @@ module DiscourseAi end def targets_data - content = { - resource_path: "#{Discourse.base_path}/t/-/#{target.id}", - content_title: target.title, - contents: [], - } - posts_data = (target.has_summary? ? 
best_replies : pick_selection).pluck( :post_number, @@ -22,85 +16,102 @@ module DiscourseAi :username, ) - posts_data.each do |(pn, raw, username)| + posts_data.reduce([]) do |memo, (pn, raw, username)| raw_text = raw if pn == 1 && target.topic_embed&.embed_content_cache.present? raw_text = target.topic_embed&.embed_content_cache end - content[:contents] << { poster: username, id: pn, text: raw_text } + memo << { poster: username, id: pn, text: raw_text } end - - content end - def concatenation_prompt(texts_to_summarize) - prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip) - You are a summarization bot that effectively concatenates disjointed summaries, creating a cohesive narrative. - The narrative you create is in the form of one or multiple paragraphs. - Your reply MUST BE a single concatenated summary using the summaries I'll provide to you. - I'm NOT interested in anything other than the concatenated summary, don't include additional text or comments. - You understand and generate Discourse forum Markdown. - You format the response, including links, using Markdown. + def summary_extension_prompt(summary, contents) + resource_path = "#{Discourse.base_path}/t/-/#{target.id}" + content_title = target.title + input = + contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]})" }.join + + prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT) + You are an advanced summarization bot tasked with enhancing an existing summary by incorporating additional posts. + + ### Guidelines: + - Only include the enhanced summary, without any additional commentary. + - Understand and generate Discourse forum Markdown; including links, _italics_, **bold**. + - Maintain the original language of the text being summarized. + - Aim for summaries to be 400 words or less. 
+ - Each new post is formatted as ") " + - Cite specific noteworthy posts using the format [NAME](#{resource_path}/POST_NUMBER) + - Example: link to the 3rd post by sam: [sam](#{resource_path}/3) + - Example: link to the 6th post by jane: [agreed with](#{resource_path}/6) + - Example: link to the 13th post by joe: [#13](#{resource_path}/13) + - When formatting usernames either use @USERNAME or [USERNAME](#{resource_path}/POST_NUMBER) TEXT prompt.push(type: :user, content: <<~TEXT.strip) - THESE are the summaries, each one separated by a newline, all of them inside XML tags: + ### Context: + + #{content_title.present? ? "The discussion title is: " + content_title + ".\n" : ""} + + Here is the existing summary: + + #{summary} + + Here are the new posts, inside XML tags: - #{texts_to_summarize.join("\n")} + #{input} + + Integrate the new information to generate an enhanced concise and coherent summary. TEXT prompt end - def summarize_single_prompt(input, opts) - insts = +<<~TEXT - You are an advanced summarization bot that generates concise, coherent summaries of provided text. + def first_summary_prompt(contents) + resource_path = "#{Discourse.base_path}/t/-/#{target.id}" + content_title = target.title + input = + contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join - - Only include the summary, without any additional commentary. - - You understand and generate Discourse forum Markdown; including links, _italics_, **bold**. - - Maintain the original language of the text being summarized. - - Aim for summaries to be 400 words or less. + prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip) + You are an advanced summarization bot that generates concise, coherent summaries of provided text. - TEXT + - Only include the summary, without any additional commentary. + - You understand and generate Discourse forum Markdown; including links, _italics_, **bold**. + - Maintain the original language of the text being summarized. 
+ - Aim for summaries to be 400 words or less. + - Each post is formatted as ") " + - Cite specific noteworthy posts using the format [NAME](#{resource_path}/POST_NUMBER) + - Example: link to the 3rd post by sam: [sam](#{resource_path}/3) + - Example: link to the 6th post by jane: [agreed with](#{resource_path}/6) + - Example: link to the 13th post by joe: [#13](#{resource_path}/13) + - When formatting usernames either use @USERNMAE OR [USERNAME](#{resource_path}/POST_NUMBER) + TEXT - insts << <<~TEXT if opts[:resource_path] - - Each post is formatted as ") " - - Cite specific noteworthy posts using the format [NAME](#{opts[:resource_path]}/POST_NUMBER) - - Example: link to the 3rd post by sam: [sam](#{opts[:resource_path]}/3) - - Example: link to the 6th post by jane: [agreed with](#{opts[:resource_path]}/6) - - Example: link to the 13th post by joe: [#13](#{opts[:resource_path]}/13) - - When formatting usernames either use @USERNMAE OR [USERNAME](#{opts[:resource_path]}/POST_NUMBER) - TEXT - - prompt = DiscourseAi::Completions::Prompt.new(insts.strip) - - if opts[:resource_path] - prompt.push( - type: :user, - content: - "Here are the posts inside XML tags:\n\n1) user1 said: I love Mondays 2) user2 said: I hate Mondays\n\nGenerate a concise, coherent summary of the text above maintaining the original language.", - ) - prompt.push( - type: :model, - content: - "Two users are sharing their feelings toward Mondays. [user1](#{opts[:resource_path]}/1) hates them, while [user2](#{opts[:resource_path]}/2) loves them.", - ) - end + prompt.push( + type: :user, + content: + "Here are the posts inside XML tags:\n\n1) user1 said: I love Mondays 2) user2 said: I hate Mondays\n\nGenerate a concise, coherent summary of the text above maintaining the original language.", + ) + prompt.push( + type: :model, + content: + "Two users are sharing their feelings toward Mondays. 
[user1](#{resource_path}/1) hates them, while [user2](#{resource_path}/2) loves them.", + ) prompt.push(type: :user, content: <<~TEXT.strip) - #{opts[:content_title].present? ? "The discussion title is: " + opts[:content_title] + ".\n" : ""} - Here are the posts, inside XML tags: + #{content_title.present? ? "The discussion title is: " + content_title + ".\n" : ""} + Here are the posts, inside XML tags: - - #{input} - + + #{input} + - Generate a concise, coherent summary of the text above maintaining the original language. - TEXT + Generate a concise, coherent summary of the text above maintaining the original language. + TEXT prompt end diff --git a/lib/tokenizer/basic_tokenizer.rb b/lib/tokenizer/basic_tokenizer.rb index 0af218af..0822aeb3 100644 --- a/lib/tokenizer/basic_tokenizer.rb +++ b/lib/tokenizer/basic_tokenizer.rb @@ -40,14 +40,12 @@ module DiscourseAi tokenizer.decode(tokenizer.encode(text).ids.take(max_length)) end - def can_expand_tokens?(text, addition, max_length) + def below_limit?(text, limit) # fast track common case, /2 to handle unicode chars # than can take more than 1 token per char - if !SiteSetting.ai_strict_token_counting && text.size + addition.size < max_length / 2 - return true - end + return true if !SiteSetting.ai_strict_token_counting && text.size < limit / 2 - tokenizer.encode(text).ids.length + tokenizer.encode(addition).ids.length < max_length + tokenizer.encode(text).ids.length < limit end end end diff --git a/lib/tokenizer/open_ai_tokenizer.rb b/lib/tokenizer/open_ai_tokenizer.rb index 0ec9ce41..0fe06225 100644 --- a/lib/tokenizer/open_ai_tokenizer.rb +++ b/lib/tokenizer/open_ai_tokenizer.rb @@ -31,14 +31,12 @@ module DiscourseAi retry end - def can_expand_tokens?(text, addition, max_length) + def below_limit?(text, limit) # fast track common case, /2 to handle unicode chars # than can take more than 1 token per char - if !SiteSetting.ai_strict_token_counting && text.size + addition.size < max_length / 2 - return true - end + 
return true if !SiteSetting.ai_strict_token_counting && text.size < limit / 2 - tokenizer.encode(text).length + tokenizer.encode(addition).length < max_length + tokenizer.encode(text).length < limit end end end diff --git a/spec/lib/modules/summarization/fold_content_spec.rb b/spec/lib/modules/summarization/fold_content_spec.rb index b1f84519..808308d3 100644 --- a/spec/lib/modules/summarization/fold_content_spec.rb +++ b/spec/lib/modules/summarization/fold_content_spec.rb @@ -15,12 +15,15 @@ RSpec.describe DiscourseAi::Summarization::FoldContent do # Make sure each content fits in a single chunk. # 700 is the number of tokens reserved for the prompt. model_tokens = - 700 + DiscourseAi::Tokenizer::OpenAiTokenizer.size("(1 asd said: This is a text ") + 3 + 700 + + DiscourseAi::Tokenizer::OpenAiTokenizer.size( + "(1 #{post_1.user.username_lower} said: This is a text ", + ) + 3 llm_model.update!(max_prompt_tokens: model_tokens) end - let(:single_summary) { "this is a single summary" } + let(:single_summary) { "single" } let(:concatenated_summary) { "this is a concatenated summary" } let(:user) { User.new } @@ -39,29 +42,11 @@ RSpec.describe DiscourseAi::Summarization::FoldContent do context "when the content to summarize doesn't fit in a single call" do fab!(:post_2) { Fabricate(:post, topic: topic, post_number: 2, raw: "This is a text") } - it "summarizes each chunk and then concatenates them" do + it "keeps extending the summary until there is nothing else to process" do result = DiscourseAi::Completions::Llm.with_prepared_responses( - [single_summary, single_summary, concatenated_summary], - ) { |spy| summarizer.summarize(user).tap { expect(spy.completions).to eq(3) } } - - expect(result.summarized_text).to eq(concatenated_summary) - end - - it "keeps splitting into chunks until the content fits into a single call to create a cohesive narrative" do - max_length_response = "(1 asd said: This is a text " - chunk_of_chunks = "I'm smol" - - result = - 
DiscourseAi::Completions::Llm.with_prepared_responses( - [ - max_length_response, - max_length_response, - chunk_of_chunks, - chunk_of_chunks, - concatenated_summary, - ], - ) { |spy| summarizer.summarize(user).tap { expect(spy.completions).to eq(5) } } + [single_summary, concatenated_summary], + ) { |spy| summarizer.summarize(user).tap { expect(spy.completions).to eq(2) } } expect(result.summarized_text).to eq(concatenated_summary) end diff --git a/spec/lib/modules/summarization/strategies/hot_topic_gists_spec.rb b/spec/lib/modules/summarization/strategies/hot_topic_gists_spec.rb index 5eb38e5e..87d8c729 100644 --- a/spec/lib/modules/summarization/strategies/hot_topic_gists_spec.rb +++ b/spec/lib/modules/summarization/strategies/hot_topic_gists_spec.rb @@ -12,7 +12,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do post_2.update(created_at: (SiteSetting.hot_topics_recent_days + 1).days.ago) Fabricate(:post, topic: topic, post_number: 3) - post_numbers = gist.targets_data[:contents].map { |c| c[:id] } + post_numbers = gist.targets_data.map { |c| c[:id] } expect(post_numbers).to contain_exactly(1, 3) end @@ -20,7 +20,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do it "only includes visible posts" do post_2.update!(hidden: true) - post_numbers = gist.targets_data[:contents].map { |c| c[:id] } + post_numbers = gist.targets_data.map { |c| c[:id] } expect(post_numbers).to contain_exactly(1) end @@ -28,7 +28,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do it "doesn't include posts without users" do post_2.update!(user_id: nil) - post_numbers = gist.targets_data[:contents].map { |c| c[:id] } + post_numbers = gist.targets_data.map { |c| c[:id] } expect(post_numbers).to contain_exactly(1) end @@ -36,7 +36,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do it "doesn't include whispers" do post_2.update!(post_type: Post.types[:whisper]) - post_numbers = 
gist.targets_data[:contents].map { |c| c[:id] } + post_numbers = gist.targets_data.map { |c| c[:id] } expect(post_numbers).to contain_exactly(1) end @@ -51,8 +51,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do ) content = gist.targets_data - - op_content = content[:contents].first[:text] + op_content = content.first[:text] expect(op_content).to include(topic_embed.embed_content_cache) end diff --git a/spec/lib/modules/summarization/strategies/topic_summary_spec.rb b/spec/lib/modules/summarization/strategies/topic_summary_spec.rb index 329404a5..93f2c4c2 100644 --- a/spec/lib/modules/summarization/strategies/topic_summary_spec.rb +++ b/spec/lib/modules/summarization/strategies/topic_summary_spec.rb @@ -12,7 +12,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do it "only includes visible posts" do post_2.update!(hidden: true) - post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] } + post_numbers = topic_summary.targets_data.map { |c| c[:id] } expect(post_numbers).to contain_exactly(1) end @@ -20,7 +20,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do it "doesn't include posts without users" do post_2.update!(user_id: nil) - post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] } + post_numbers = topic_summary.targets_data.map { |c| c[:id] } expect(post_numbers).to contain_exactly(1) end @@ -28,7 +28,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do it "doesn't include whispers" do post_2.update!(post_type: Post.types[:whisper]) - post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] } + post_numbers = topic_summary.targets_data.map { |c| c[:id] } expect(post_numbers).to contain_exactly(1) end @@ -56,8 +56,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do ) content = topic_summary.targets_data - - op_content = content[:contents].first[:text] + op_content = content.first[:text] 
expect(op_content).to include(topic_embed.embed_content_cache) end diff --git a/spec/shared/tokenizer_spec.rb b/spec/shared/tokenizer_spec.rb index 92751624..52cec7bf 100644 --- a/spec/shared/tokenizer_spec.rb +++ b/spec/shared/tokenizer_spec.rb @@ -90,21 +90,21 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do end end - describe "#can_expand_tokens?" do + describe "#below_limit?" do it "returns true when the tokens can be expanded" do - expect(described_class.can_expand_tokens?("foo bar", "baz qux", 6)).to eq(true) + expect(described_class.below_limit?("foo bar baz qux", 6)).to eq(true) end it "returns false when the tokens cannot be expanded" do - expect(described_class.can_expand_tokens?("foo bar", "baz qux", 3)).to eq(false) + expect(described_class.below_limit?("foo bar baz qux", 3)).to eq(false) end it "returns false when the tokens cannot be expanded due to multibyte unicode characters" do - expect(described_class.can_expand_tokens?("foo bar πŸ‘¨πŸΏ", "baz qux", 6)).to eq(false) + expect(described_class.below_limit?("foo bar πŸ‘¨πŸΏ baz qux", 6)).to eq(false) end it "handles unicode characters properly when they use more than one token per char" do - expect(described_class.can_expand_tokens?("ζˆ‘ε–œζ¬’εƒζ―”θ¨", "萨", 10)).to eq(false) + expect(described_class.below_limit?("ζˆ‘ε–œζ¬’εƒζ―”θ¨θ¨", 10)).to eq(false) end end end