iterate commands (#73)

* FEATURE: introduce a more efficient formatter

  The previous formatting style was space-inefficient, given that JSON consumes lots of tokens. The new format is now used consistently across commands.

  Also fixes:
  - search being limited to 10 results
  - search breaking on `limit:`, a non-existent directive

* Slight improvement to the summarizer

  Stop blowing up the context with custom prompts.

* Ensure we include the guiding message
* Correct spec
* LangChain-style summarizer ... much more accurate (albeit more expensive)
* Lint
2023-05-22 12:09:14 +10:00 · 2023-05-22 12:09:14 +10:00 · 92fb84e24d
parent d59ed1091b
commit 92fb84e24d
15 changed files with 328 additions and 92 deletions
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@ -91,6 +91,7 @@ en:

    ai_bot:
      default_pm_prefix: "[Untitled AI bot PM]"
+      topic_not_found: "Summary unavailable, topic not found!"
      command_summary:
        categories: "List categories"
        search: "Search"
--- a/lib/modules/ai_bot/anthropic_bot.rb
+++ b/lib/modules/ai_bot/anthropic_bot.rb
@ -40,7 +40,7 @@ module DiscourseAi
        ).dig(:completion)
      end

-      def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
+      def submit_prompt(prompt, prefer_low_cost: false, &blk)
        DiscourseAi::Inference::AnthropicCompletions.perform!(
          prompt,
          model_for,
--- a/lib/modules/ai_bot/bot.rb
+++ b/lib/modules/ai_bot/bot.rb
@ -55,10 +55,7 @@ module DiscourseAi

        setup_cancel = false

-        submit_prompt_and_stream_reply(
-          prompt,
-          prefer_low_cost: prefer_low_cost,
-        ) do |partial, cancel|
+        submit_prompt(prompt, prefer_low_cost: prefer_low_cost) do |partial, cancel|
          reply = update_with_delta(reply, partial)

          if redis_stream_key && !Discourse.redis.get(redis_stream_key)
@ -139,6 +136,7 @@ module DiscourseAi
        rendered_system_prompt = system_prompt(post)

        total_prompt_tokens = tokenize(rendered_system_prompt).length
+
        messages =
          conversation.reduce([]) do |memo, (raw, username)|
            break(memo) if total_prompt_tokens >= prompt_limit
@ -227,6 +225,14 @@ module DiscourseAi
        TEXT
      end

+      def tokenize(text)
+        raise NotImplemented
+      end
+
+      def submit_prompt(prompt, prefer_low_cost: false, &blk)
+        raise NotImplemented
+      end
+
      protected

      attr_reader :bot_user
@ -243,10 +249,6 @@ module DiscourseAi
        raise NotImplemented
      end

-      def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
-        raise NotImplemented
-      end
-
      def conversation_context(post)
        context =
          post
@ -262,9 +264,15 @@ module DiscourseAi

        result = []

+        first = true
        context.each do |raw, username, custom_prompt|
          if custom_prompt.present?
+            if first
              custom_prompt.reverse_each { |message| result << message }
+              first = false
+            else
+              result << custom_prompt.first
+            end
          else
            result << [raw, username]
          end
@ -280,10 +288,6 @@ module DiscourseAi
          user_ids: bot_reply_post.topic.allowed_user_ids,
        )
      end
-
-      def tokenize(text)
-        raise NotImplemented
-      end
    end
  end
 end
--- a/lib/modules/ai_bot/commands/categories_command.rb
+++ b/lib/modules/ai_bot/commands/categories_command.rb
@ -21,29 +21,21 @@ module DiscourseAi::AiBot::Commands
    end

    def process(_args)
-      info =
-        +"Name, Slug, Description, Posts Year, Posts Month, Posts Week, id, parent_category_id\n"
+      columns = {
+        name: "Name",
+        slug: "Slug",
+        description: "Description",
+        posts_year: "Posts Year",
+        posts_month: "Posts Month",
+        posts_week: "Posts Week",
+        id: "id",
+        parent_category_id: "parent_category_id",
+      }

-      @count = 0
-      Category
-        .where(read_restricted: false)
-        .limit(100)
-        .pluck(
-          :id,
-          :parent_category_id,
-          :slug,
-          :name,
-          :description,
-          :posts_year,
-          :posts_month,
-          :posts_week,
-        )
-        .map do |id, parent_category_id, slug, name, description, posts_year, posts_month, posts_week|
-          @count += 1
-          info << "#{name}, #{slug}, #{(description || "").gsub(",", "")}, #{posts_year || 0}, #{posts_month || 0}, #{posts_week || 0},#{id}, #{parent_category_id} \n"
-        end
+      rows = Category.where(read_restricted: false).limit(100).pluck(*columns.keys)
+      @count = rows.length

-      info
+      format_results(rows, columns.values)
    end
  end
 end
--- a/lib/modules/ai_bot/commands/command.rb
+++ b/lib/modules/ai_bot/commands/command.rb
@ -22,6 +22,8 @@ module DiscourseAi
          end
        end

+        attr_reader :bot_user, :args
+
        def initialize(bot_user, args)
          @bot_user = bot_user
          @args = args
@ -89,6 +91,41 @@ module DiscourseAi
          chain_next_response
        end

+        # Renders rows of data as a compact, delimiter-separated text table: one
+        # header line of column names followed by one line per row.
+        #
+        # rows         - collection of rows. Each row is an Array of cell values
+        #                or, when column_names is omitted, a Hash of
+        #                column name => value. When a block is given, every row
+        #                is first replaced by the block's return value.
+        # column_names - optional Array of header names. When omitted, the
+        #                headers are the Hash keys in order of first appearance.
+        #
+        # Returns the formatted String. Rows passed in by the caller are not
+        # modified.
+        def format_results(rows, column_names = nil)
+          rows = rows.map { |row| yield row } if block_given?
+
+          if !column_names
+            index = -1
+            column_indexes = {}
+
+            rows =
+              rows.map do |data|
+                new_row = []
+                data.each do |key, value|
+                  # a column's position is fixed the first time its key is seen
+                  found_index = column_indexes[key.to_s] ||= (index += 1)
+                  new_row[found_index] = value
+                end
+                new_row
+              end
+            column_names = column_indexes.keys
+          end
+          # two tokens per delimiter is a reasonable balance
+          # there may be a single delimiter solution but GPT has
+          # a hard time dealing with escaped characters
+          delimiter = "¦"
+          formatted = +""
+          formatted << column_names.join(delimiter)
+          formatted << "\n"
+
+          rows.each do |row|
+            # build a new array (map, not map!) so the caller's rows stay intact;
+            # delimiters and newlines inside a cell would break the table layout
+            cells = row.map { |item| item.to_s.gsub(delimiter, "|").gsub(/\n/, " ") }
+            # pad rows that are missing trailing columns so every line carries
+            # as many delimiters as the header
+            cells << "" while cells.length < column_names.length
+            formatted << cells.join(delimiter)
+            formatted << "\n"
+          end
+
+          formatted
+        end
+
        protected

        attr_reader :bot_user, :args
--- a/lib/modules/ai_bot/commands/google_command.rb
+++ b/lib/modules/ai_bot/commands/google_command.rb
@ -42,20 +42,15 @@ module DiscourseAi::AiBot::Commands

      @last_num_results = parsed.dig("searchInformation", "totalResults").to_i

-      formatted_results = []
-
-      results.each do |result|
-        formatted_result = {
+      format_results(results) do |result|
+        {
          title: result["title"],
          link: result["link"],
          snippet: result["snippet"],
          displayLink: result["displayLink"],
          formattedUrl: result["formattedUrl"],
        }
-        formatted_results << formatted_result
      end
-
-      formatted_results
    end
  end
 end
--- a/lib/modules/ai_bot/commands/search_command.rb
+++ b/lib/modules/ai_bot/commands/search_command.rb
@ -74,23 +74,44 @@ module DiscourseAi::AiBot::Commands
    end

    def process(search_string)
+      # Search has no built in `limit:` directive, so any `limit:N` term is
+      # removed from the query here and applied to the returned posts instead.
+      limit = nil
+
+      search_string =
+        search_string
+          .to_s
+          .strip
+          .split(/\s+/)
+          .map do |term|
+            if term =~ /limit:(\d+)/
+              limit = $1.to_i
+              nil
+            else
+              term
+            end
+          end
+          .compact
+          .join(" ")
+
      @last_query = search_string
      results =
        Search.execute(search_string.to_s, search_type: :full_page, guardian: Guardian.new())

+      posts = results.posts
+      # first(limit) rather than [0..limit - 1]: with limit:0 that range becomes
+      # 0..-1 and would select every post instead of none
+      posts = posts.first(limit) if limit
+
      @last_num_results = results.posts.length

-      results.posts[0..10]
-        .map do |p|
+      if posts.blank?
+        "No results found"
+      else
+        format_results(posts) do |post|
          {
-            title: p.topic.title,
-            url: p.url,
-            raw_truncated: p.raw[0..250],
-            excerpt: p.excerpt,
-            created: p.created_at,
+            title: post.topic.title,
+            url: post.url,
+            excerpt: post.excerpt,
+            created: post.created_at,
          }
        end
-        .to_json
+      end
    end
  end
 end
--- a/lib/modules/ai_bot/commands/summarize_command.rb
+++ b/lib/modules/ai_bot/commands/summarize_command.rb
@ -40,30 +40,135 @@ module DiscourseAi::AiBot::Commands
        topic = nil if !topic || !Guardian.new.can_see?(topic)
      end

-      rows = []
+      @last_summary = nil

      if topic
        @last_topic_title = topic.title
-        if guidance.present?
-          rows << ["Given: #{guidance}"]
-          rows << ["Summarise: #{topic.title}"]
+
+        posts =
          Post
-            .joins(:user)
            .where(topic_id: topic.id)
-            .order(:post_number)
            .where("post_type in (?)", [Post.types[:regular], Post.types[:small_action]])
            .where("not hidden")
-            .limit(50)
-            .pluck(:raw, :username)
-            .each { |raw, username| rows << ["#{username} said: #{raw}"] }
+            .order(:post_number)
+
+        columns = ["posts.id", :post_number, :raw, :username]
+
+        current_post_numbers = posts.limit(5).pluck(:post_number)
+        current_post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
+        current_post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
+
+        data =
+          Post
+            .where(topic_id: topic.id)
+            .joins(:user)
+            .where("post_number in (?)", current_post_numbers)
+            .order(:post_number)
+            .pluck(*columns)
+
+        @last_summary = summarize(data, guidance, topic)
+      end
+
+      if !@last_summary
+        "Say: No topic found!"
+      else
+        "Topic summarized"
      end
    end

-      if rows.blank?
-        "Say: No topic found!"
-      else
-        "#{rows.join("\n")}"[0..2000]
+    def custom_raw
+      @last_summary || I18n.t("discourse_ai.ai_bot.topic_not_found")
    end
+
+    def chain_next_response
+      false
+    end
+
+    def bot
+      @bot ||= DiscourseAi::AiBot::Bot.as(bot_user)
+    end
+
+    # Summarizes a topic's posts: the post text is cut into sections, each
+    # section is summarized on its own and, when there is more than one
+    # section, the model is asked to merge the partial summaries.
+    #
+    # data     - rows of [id, post_number, raw, username]
+    # guidance - free-form text placed in the context of every section prompt
+    # topic    - the topic being summarized (its title is used here)
+    #
+    # Returns the summary text, or nil when data is empty.
+    def summarize(data, guidance, topic)
+      text = +""
+      data.each do |_id, post_number, raw, username|
+        # every post is wrapped in parentheses and followed by a newline so it
+        # does not run into the next post
+        text << "(#{post_number} #{username} said: #{raw})\n"
+      end
+
+      sections = []
+      current_section = +""
+
+      text
+        .split(/\s+/)
+        .each_slice(20) do |slice|
+          current_section << " "
+          current_section << slice.join(" ")
+
+          # somehow any more will get closer to limits
+          if bot.tokenize(current_section).length > 2500
+            sections << current_section
+            current_section = +""
+          end
+        end
+
+      sections << current_section if current_section.present?
+
+      # long topics keep only their first 4 and last 3 sections; with 7 or
+      # fewer sections those two slices would overlap or cover everything, so
+      # trimming only starts above 7
+      sections = sections[0..3] + sections[-3..-1] if sections.length > 7
+
+      summaries =
+        sections.map do |section|
+          # TODO progress meter
+          generate_gpt_summary(
+            section,
+            topic: topic,
+            context: "Guidance: #{guidance}\nYou are summarizing the topic: #{topic.title}",
+          )
+        end
+
+      if summaries.length > 1
+        messages = []
+        messages << { role: "system", content: "You are a helpful bot" }
+        messages << {
+          role: "user",
+          content:
+            "concatenated the disjoint summaries, creating a cohesive narrative:\n#{summaries.join("\n")}",
+        }
+        bot.submit_prompt(messages, temperature: 0.6, max_tokens: 500, prefer_low_cost: true).dig(
+          :choices,
+          0,
+          :message,
+          :content,
+        )
+      else
+        summaries.first
+      end
+    end
+
+    # Asks the model for a summary of the given text.
+    #
+    # text    - the text to summarize
+    # topic   - topic whose url is used to show the model how to link to posts
+    # context - optional text placed ahead of the summarize instruction
+    # length  - requested summary length in words, also sent as max_tokens;
+    #           400 when not given
+    #
+    # Returns the content of the first choice of the completion response.
+    def generate_gpt_summary(text, topic:, context: nil, length: nil)
+      word_budget = length || 400
+
+      system_prompt = <<~TEXT
+        You are a summarization bot.
+        You effectively summarise any text.
+        You condense it into a shorter version.
+        You understand and generate Discourse forum markdown.
+        Try generating links as well the format is #{topic.url}/POST_NUMBER. eg: [ref](#{topic.url}/77)
+      TEXT
+
+      prompt = <<~TEXT
+        #{context}
+        Summarize the following in #{word_budget} words:
+
+        #{text}
+      TEXT
+
+      messages = [
+        { role: "system", content: system_prompt },
+        { role: "user", content: prompt },
+      ]
+
+      bot
+        .submit_prompt(messages, temperature: 0.6, max_tokens: word_budget, prefer_low_cost: true)
+        .dig(:choices, 0, :message, :content)
    end
  end
 end
--- a/lib/modules/ai_bot/commands/tags_command.rb
+++ b/lib/modules/ai_bot/commands/tags_command.rb
@ -21,18 +21,18 @@ module DiscourseAi::AiBot::Commands
    end

    def process(_args)
-      info = +"Name, Topic Count\n"
-      @last_count = 0
+      column_names = { name: "Name", public_topic_count: "Topic Count" }
+
+      tags =
        Tag
          .where("public_topic_count > 0")
          .order(public_topic_count: :desc)
          .limit(100)
-        .pluck(:name, :public_topic_count)
-        .each do |name, count|
-          @last_count += 1
-          info << "#{name}, #{count}\n"
-        end
-      info
+          .pluck(*column_names.keys)
+
+      @last_count = tags.length
+
+      format_results(tags, column_names.values)
    end
  end
 end
--- a/lib/modules/ai_bot/open_ai_bot.rb
+++ b/lib/modules/ai_bot/open_ai_bot.rb
@ -33,6 +33,29 @@ module DiscourseAi
        { temperature: 0.4, top_p: 0.9, max_tokens: max_tokens }
      end

+      # Submits a prompt through DiscourseAi::Inference::OpenAiCompletions.
+      #
+      # prompt          - the prompt handed to perform!
+      # prefer_low_cost - when true "gpt-3.5-turbo" is used instead of model_for
+      # temperature, top_p, max_tokens
+      #                 - optional overrides for the matching reply_params
+      #                   entries; nil keeps the reply_params value
+      # blk             - optional block forwarded to perform!
+      #
+      # Returns whatever perform! returns.
+      def submit_prompt(
+        prompt,
+        prefer_low_cost: false,
+        temperature: nil,
+        top_p: nil,
+        max_tokens: nil,
+        &blk
+      )
+        overrides = { temperature: temperature, top_p: top_p, max_tokens: max_tokens }
+        params =
+          reply_params.merge(overrides) do |_key, default_value, override|
+            override.nil? ? default_value : override
+          end
+
+        model =
+          if prefer_low_cost
+            "gpt-3.5-turbo"
+          else
+            model_for
+          end
+
+        DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **params, &blk)
+      end
+
+      def tokenize(text)
+        DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text)
+      end
+
      private

      def build_message(poster_username, content, system: false)
@ -65,15 +88,6 @@ module DiscourseAi
          max_tokens: 40,
        ).dig(:choices, 0, :message, :content)
      end
-
-      def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
-        model = prefer_low_cost ? "gpt-3.5-turbo" : model_for
-        DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **reply_params, &blk)
-      end
-
-      def tokenize(text)
-        DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text)
-      end
    end
  end
 end
--- a/spec/lib/modules/ai_bot/bot_spec.rb
+++ b/spec/lib/modules/ai_bot/bot_spec.rb
@ -54,7 +54,7 @@ RSpec.describe DiscourseAi::AiBot::Bot do
      )

      prompt << { role: "assistant", content: "!search test search" }
-      prompt << { role: "user", content: "results: []" }
+      prompt << { role: "user", content: "results: No results found" }

      OpenAiCompletionsInferenceStubs.stub_streamed_response(
        prompt,
--- a/spec/lib/modules/ai_bot/commands/command_spec.rb
+++ b/spec/lib/modules/ai_bot/commands/command_spec.rb
@ -0,0 +1,36 @@
+#frozen_string_literal: true
+
+require_relative "../../../../support/openai_completions_inference_stubs"
+
+RSpec.describe DiscourseAi::AiBot::Commands::Command do
+  fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
+  let(:command) { DiscourseAi::AiBot::Commands::Command.new(bot_user, nil) }
+
+  describe "#format_results" do
+    it "can generate efficient tables of data" do
+      rows = [1, 2, 3, 4, 5]
+      column_names = %w[first second third]
+
+      formatted =
+        command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] }
+
+      expect(formatted.split("\n").length).to eq(6)
+      expect(formatted).to include("a|b, c")
+    end
+
+    it "can also generate results by returning hash per row" do
+      rows = [1, 2, 3, 4, 5]
+      column_names = %w[first second third]
+
+      formatted =
+        command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] }
+
+      formatted2 =
+        command.format_results(rows) do |row|
+          { first: "row ¦ 1", second: row + 1, third: "a|b,\nc" }
+        end
+
+      expect(formatted).to eq(formatted2)
+    end
+  end
+end
--- a/spec/lib/modules/ai_bot/commands/google_command_spec.rb
+++ b/spec/lib/modules/ai_bot/commands/google_command_spec.rb
@ -4,7 +4,6 @@ require_relative "../../../../support/openai_completions_inference_stubs"

 RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do
  fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
-  fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) }

  describe "#process" do
    it "can generate correct info" do
@ -33,7 +32,7 @@ RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do
        "https://www.googleapis.com/customsearch/v1?cx=cx&key=abc&num=10&q=some%20search%20term",
      ).to_return(status: 200, body: json_text, headers: {})

-      google = described_class.new(bot, post)
+      google = described_class.new(bot_user, post)
      info = google.process("some search term")

      expect(google.description_args[:count]).to eq(1)
--- a/spec/lib/modules/ai_bot/commands/search_command_spec.rb
+++ b/spec/lib/modules/ai_bot/commands/search_command_spec.rb
@ -0,0 +1,26 @@
+#frozen_string_literal: true
+
+require_relative "../../../../support/openai_completions_inference_stubs"
+
+RSpec.describe DiscourseAi::AiBot::Commands::SearchCommand do
+  fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
+
+  before { SearchIndexer.enable }
+  after { SearchIndexer.disable }
+
+  describe "#process" do
+    it "can handle limits" do
+      post1 = Fabricate(:post)
+      _post2 = Fabricate(:post, user: post1.user)
+      _post3 = Fabricate(:post, user: post1.user)
+
+      # search has no built in support for limit: so handle it from the outside
+      search = described_class.new(bot_user, post1)
+
+      results = search.process("@#{post1.user.username} limit:2")
+
+      # title + 2 rows
+      expect(results.split("\n").length).to eq(3)
+    end
+  end
+end
--- a/spec/lib/modules/ai_bot/commands/summarize_command_spec.rb
+++ b/spec/lib/modules/ai_bot/commands/summarize_command_spec.rb
@ -4,18 +4,22 @@ require_relative "../../../../support/openai_completions_inference_stubs"

 RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do
  fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
-  fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) }

  describe "#process" do
    it "can generate correct info" do
      post = Fabricate(:post)

-      summarizer = described_class.new(bot, post)
+      WebMock.stub_request(:post, "https://api.openai.com/v1/chat/completions").to_return(
+        status: 200,
+        body: JSON.dump({ choices: [{ message: { content: "summary stuff" } }] }),
+      )
+
+      summarizer = described_class.new(bot_user, post)
      info = summarizer.process("#{post.topic_id} why did it happen?")

-      expect(info).to include("why did it happen?")
-      expect(info).to include(post.raw)
-      expect(info).to include(post.user.username)
+      expect(info).to include("Topic summarized")
+      expect(summarizer.custom_raw).to include("summary stuff")
+      expect(summarizer.chain_next_response).to eq(false)
    end

    it "protects hidden data" do
@ -26,10 +30,12 @@ RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do
      topic = Fabricate(:topic, category_id: category.id)
      post = Fabricate(:post, topic: topic)

-      summarizer = described_class.new(bot, post)
+      summarizer = described_class.new(bot_user, post)
      info = summarizer.process("#{post.topic_id} why did it happen?")

      expect(info).not_to include(post.raw)
+
+      expect(summarizer.custom_raw).to eq(I18n.t("discourse_ai.ai_bot.topic_not_found"))
    end
  end
 end