FIX: Truncate OP for gists to help the model focus on the latest posts (#883)

2024-10-31 10:54:56 -03:00 · 2024-10-31 10:54:56 -03:00 · e8eed710e0
parent 32fb023357
commit e8eed710e0
5 changed files with 22 additions and 11 deletions
--- a/lib/summarization/fold_content.rb
+++ b/lib/summarization/fold_content.rb
@@ -121,9 +121,9 @@ module DiscourseAi
        prompt =
          (
            if summary.blank?
-              strategy.first_summary_prompt(iteration_content)
+              strategy.first_summary_prompt(iteration_content, tokenizer)
            else
-              strategy.summary_extension_prompt(summary, iteration_content)
+              strategy.summary_extension_prompt(summary, iteration_content, tokenizer)
            end
          )

--- a/lib/summarization/strategies/base.rb
+++ b/lib/summarization/strategies/base.rb
@@ -34,12 +34,12 @@ module DiscourseAi
        end

        # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary.
-        def summary_extension_prompt(_summary, _texts_to_summarize)
+        def summary_extension_prompt(_summary, _texts_to_summarize, _tokenizer)
          raise NotImplementedError
        end

        # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content.
-        def first_summary_prompt(_input)
+        def first_summary_prompt(_input, _tokenizer)
          raise NotImplementedError
        end

--- a/lib/summarization/strategies/chat_messages.rb
+++ b/lib/summarization/strategies/chat_messages.rb
@@ -23,7 +23,7 @@ module DiscourseAi
            .map { { id: _1, poster: _2, text: _3 } }
        end

-        def summary_extension_prompt(summary, contents)
+        def summary_extension_prompt(summary, contents, _tokenizer)
          input =
            contents
              .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
@@ -63,7 +63,7 @@ module DiscourseAi
          prompt
        end

-        def first_summary_prompt(contents)
+        def first_summary_prompt(contents, _tokenizer)
          content_title = target.name
          input =
            contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
--- a/lib/summarization/strategies/hot_topic_gists.rb
+++ b/lib/summarization/strategies/hot_topic_gists.rb
@@ -57,7 +57,7 @@ module DiscourseAi
          end
        end

-        def summary_extension_prompt(summary, contents)
+        def summary_extension_prompt(summary, contents, _tokenizer)
          statements =
            contents
              .to_a
@@ -98,11 +98,22 @@ module DiscourseAi
          prompt
        end

-        def first_summary_prompt(contents)
+        def first_summary_prompt(contents, tokenizer)
          content_title = target.title
          statements =
            contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }

+          op_statement = statements.shift.to_s
+          split_1, split_2 =
+            [op_statement[0, op_statement.size / 2], op_statement[(op_statement.size / 2)..-1]]
+
+          truncation_length = 500
+
+          op_statement = [
+            tokenizer.truncate(split_1, truncation_length),
+            tokenizer.truncate(split_2.reverse, truncation_length).reverse,
+          ].join(" ")
+
          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            You are an advanced summarization bot. Analyze a given conversation and produce a concise,
            single-sentence summary that conveys the main topic and current developments to someone with no prior context.
@@ -127,7 +138,7 @@ module DiscourseAi
            
            The conversation began with the following statement:
        
-            #{statements.shift}\n
+            #{op_statement}\n
          TEXT

          if statements.present?
--- a/lib/summarization/strategies/topic_summary.rb
+++ b/lib/summarization/strategies/topic_summary.rb
@@ -27,7 +27,7 @@ module DiscourseAi
          end
        end

-        def summary_extension_prompt(summary, contents)
+        def summary_extension_prompt(summary, contents, _tokenizer)
          resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
          content_title = target.title
          input =
@@ -70,7 +70,7 @@ module DiscourseAi
          prompt
        end

-        def first_summary_prompt(contents)
+        def first_summary_prompt(contents, _tokenizer)
          resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
          content_title = target.title
          input =