iterate commands (#73)

* FEATURE: introduce a more efficient formatter

The previous formatting style was space-inefficient, given that JSON consumes lots
of tokens; the new format is now used consistently across commands.

Also fixes

- search was limited to 10 results
- search breaking when given the `limit:` directive, which core Search does not support

* Slight improvement to summarizer
Stop blowing up context with custom prompts

* ensure we include the guiding message

* correct spec

* langchain-style summarizer ...

much more accurate (albeit more expensive)

* lint
This commit is contained in:
Sam 2023-05-22 12:09:14 +10:00 committed by GitHub
parent d59ed1091b
commit 92fb84e24d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 328 additions and 92 deletions

View File

@ -91,6 +91,7 @@ en:
ai_bot: ai_bot:
default_pm_prefix: "[Untitled AI bot PM]" default_pm_prefix: "[Untitled AI bot PM]"
topic_not_found: "Summary unavailable, topic not found!"
command_summary: command_summary:
categories: "List categories" categories: "List categories"
search: "Search" search: "Search"

View File

@ -40,7 +40,7 @@ module DiscourseAi
).dig(:completion) ).dig(:completion)
end end
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk) def submit_prompt(prompt, prefer_low_cost: false, &blk)
DiscourseAi::Inference::AnthropicCompletions.perform!( DiscourseAi::Inference::AnthropicCompletions.perform!(
prompt, prompt,
model_for, model_for,

View File

@ -55,10 +55,7 @@ module DiscourseAi
setup_cancel = false setup_cancel = false
submit_prompt_and_stream_reply( submit_prompt(prompt, prefer_low_cost: prefer_low_cost) do |partial, cancel|
prompt,
prefer_low_cost: prefer_low_cost,
) do |partial, cancel|
reply = update_with_delta(reply, partial) reply = update_with_delta(reply, partial)
if redis_stream_key && !Discourse.redis.get(redis_stream_key) if redis_stream_key && !Discourse.redis.get(redis_stream_key)
@ -139,6 +136,7 @@ module DiscourseAi
rendered_system_prompt = system_prompt(post) rendered_system_prompt = system_prompt(post)
total_prompt_tokens = tokenize(rendered_system_prompt).length total_prompt_tokens = tokenize(rendered_system_prompt).length
messages = messages =
conversation.reduce([]) do |memo, (raw, username)| conversation.reduce([]) do |memo, (raw, username)|
break(memo) if total_prompt_tokens >= prompt_limit break(memo) if total_prompt_tokens >= prompt_limit
@ -227,6 +225,14 @@ module DiscourseAi
TEXT TEXT
end end
def tokenize(text)
raise NotImplemented
end
def submit_prompt(prompt, prefer_low_cost: false, &blk)
raise NotImplemented
end
protected protected
attr_reader :bot_user attr_reader :bot_user
@ -243,10 +249,6 @@ module DiscourseAi
raise NotImplemented raise NotImplemented
end end
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
raise NotImplemented
end
def conversation_context(post) def conversation_context(post)
context = context =
post post
@ -262,9 +264,15 @@ module DiscourseAi
result = [] result = []
first = true
context.each do |raw, username, custom_prompt| context.each do |raw, username, custom_prompt|
if custom_prompt.present? if custom_prompt.present?
if first
custom_prompt.reverse_each { |message| result << message } custom_prompt.reverse_each { |message| result << message }
first = false
else
result << custom_prompt.first
end
else else
result << [raw, username] result << [raw, username]
end end
@ -280,10 +288,6 @@ module DiscourseAi
user_ids: bot_reply_post.topic.allowed_user_ids, user_ids: bot_reply_post.topic.allowed_user_ids,
) )
end end
def tokenize(text)
raise NotImplemented
end
end end
end end
end end

View File

@ -21,29 +21,21 @@ module DiscourseAi::AiBot::Commands
end end
def process(_args) def process(_args)
info = columns = {
+"Name, Slug, Description, Posts Year, Posts Month, Posts Week, id, parent_category_id\n" name: "Name",
slug: "Slug",
description: "Description",
posts_year: "Posts Year",
posts_month: "Posts Month",
posts_week: "Posts Week",
id: "id",
parent_category_id: "parent_category_id",
}
@count = 0 rows = Category.where(read_restricted: false).limit(100).pluck(*columns.keys)
Category @count = rows.length
.where(read_restricted: false)
.limit(100)
.pluck(
:id,
:parent_category_id,
:slug,
:name,
:description,
:posts_year,
:posts_month,
:posts_week,
)
.map do |id, parent_category_id, slug, name, description, posts_year, posts_month, posts_week|
@count += 1
info << "#{name}, #{slug}, #{(description || "").gsub(",", "")}, #{posts_year || 0}, #{posts_month || 0}, #{posts_week || 0},#{id}, #{parent_category_id} \n"
end
info format_results(rows, columns.values)
end end
end end
end end

View File

@ -22,6 +22,8 @@ module DiscourseAi
end end
end end
attr_reader :bot_user, :args
def initialize(bot_user, args) def initialize(bot_user, args)
@bot_user = bot_user @bot_user = bot_user
@args = args @args = args
@ -89,6 +91,41 @@ module DiscourseAi
chain_next_response chain_next_response
end end
def format_results(rows, column_names = nil)
rows = rows.map { |row| yield row } if block_given?
if !column_names
index = -1
column_indexes = {}
rows =
rows.map do |data|
new_row = []
data.each do |key, value|
found_index = column_indexes[key.to_s] ||= (index += 1)
new_row[found_index] = value
end
new_row
end
column_names = column_indexes.keys
end
# two tokens per delimiter is a reasonable balance
# there may be a single delimiter solution but GPT has
# a hard time dealing with escaped characters
delimiter = "¦"
formatted = +""
formatted << column_names.join(delimiter)
formatted << "\n"
rows.each do |array|
array.map! { |item| item.to_s.gsub(delimiter, "|").gsub(/\n/, " ") }
formatted << array.join(delimiter)
formatted << "\n"
end
formatted
end
protected protected
attr_reader :bot_user, :args attr_reader :bot_user, :args

View File

@ -42,20 +42,15 @@ module DiscourseAi::AiBot::Commands
@last_num_results = parsed.dig("searchInformation", "totalResults").to_i @last_num_results = parsed.dig("searchInformation", "totalResults").to_i
formatted_results = [] format_results(results) do |result|
{
results.each do |result|
formatted_result = {
title: result["title"], title: result["title"],
link: result["link"], link: result["link"],
snippet: result["snippet"], snippet: result["snippet"],
displayLink: result["displayLink"], displayLink: result["displayLink"],
formattedUrl: result["formattedUrl"], formattedUrl: result["formattedUrl"],
} }
formatted_results << formatted_result end
end
formatted_results
end end
end end
end end

View File

@ -74,23 +74,44 @@ module DiscourseAi::AiBot::Commands
end end
def process(search_string) def process(search_string)
limit = nil
search_string =
search_string
.strip
.split(/\s+/)
.map do |term|
if term =~ /limit:(\d+)/
limit = $1.to_i
nil
else
term
end
end
.compact
.join(" ")
@last_query = search_string @last_query = search_string
results = results =
Search.execute(search_string.to_s, search_type: :full_page, guardian: Guardian.new()) Search.execute(search_string.to_s, search_type: :full_page, guardian: Guardian.new())
posts = results.posts
posts = posts[0..limit - 1] if limit
@last_num_results = results.posts.length @last_num_results = results.posts.length
results.posts[0..10] if posts.blank?
.map do |p| "No results found"
else
format_results(posts) do |post|
{ {
title: p.topic.title, title: post.topic.title,
url: p.url, url: post.url,
raw_truncated: p.raw[0..250], excerpt: post.excerpt,
excerpt: p.excerpt, created: post.created_at,
created: p.created_at,
} }
end end
.to_json end
end end
end end
end end

View File

@ -40,30 +40,135 @@ module DiscourseAi::AiBot::Commands
topic = nil if !topic || !Guardian.new.can_see?(topic) topic = nil if !topic || !Guardian.new.can_see?(topic)
end end
rows = [] @last_summary = nil
if topic if topic
@last_topic_title = topic.title @last_topic_title = topic.title
if guidance.present?
rows << ["Given: #{guidance}"] posts =
rows << ["Summarise: #{topic.title}"]
Post Post
.joins(:user)
.where(topic_id: topic.id) .where(topic_id: topic.id)
.order(:post_number)
.where("post_type in (?)", [Post.types[:regular], Post.types[:small_action]]) .where("post_type in (?)", [Post.types[:regular], Post.types[:small_action]])
.where("not hidden") .where("not hidden")
.limit(50) .order(:post_number)
.pluck(:raw, :username)
.each { |raw, username| rows << ["#{username} said: #{raw}"] } columns = ["posts.id", :post_number, :raw, :username]
current_post_numbers = posts.limit(5).pluck(:post_number)
current_post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
current_post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
data =
Post
.where(topic_id: topic.id)
.joins(:user)
.where("post_number in (?)", current_post_numbers)
.order(:post_number)
.pluck(*columns)
@last_summary = summarize(data, guidance, topic)
end
if !@last_summary
"Say: No topic found!"
else
"Topic summarized"
end end
end end
if rows.blank? def custom_raw
"Say: No topic found!" @last_summary || I18n.t("discourse_ai.ai_bot.topic_not_found")
end
def chain_next_response
false
end
def bot
@bot ||= DiscourseAi::AiBot::Bot.as(bot_user)
end
def summarize(data, guidance, topic)
text = +""
data.each do |id, post_number, raw, username|
text << "(#{post_number} #{username} said: #{raw}"
end
summaries = []
current_section = +""
split = []
text
.split(/\s+/)
.each_slice(20) do |slice|
current_section << " "
current_section << slice.join(" ")
# somehow any more will get closer to limits
if bot.tokenize(current_section).length > 2500
split << current_section
current_section = +""
end
end
split << current_section if current_section.present?
split = split[0..3] + split[-3..-1] if split.length > 5
split.each do |section|
# TODO progress meter
summary =
generate_gpt_summary(
section,
topic: topic,
context: "Guidance: #{guidance}\nYou are summarizing the topic: #{topic.title}",
)
summaries << summary
end
if summaries.length > 1
messages = []
messages << { role: "system", content: "You are a helpful bot" }
messages << {
role: "user",
content:
"concatenated the disjoint summaries, creating a cohesive narrative:\n#{summaries.join("\n")}}",
}
bot.submit_prompt(messages, temperature: 0.6, max_tokens: 500, prefer_low_cost: true).dig(
:choices,
0,
:message,
:content,
)
else else
"#{rows.join("\n")}"[0..2000] summaries.first
end end
end
def generate_gpt_summary(text, topic:, context: nil, length: nil)
length ||= 400
prompt = <<~TEXT
#{context}
Summarize the following in #{length} words:
#{text}
TEXT
system_prompt = <<~TEXT
You are a summarization bot.
You effectively summarise any text.
You condense it into a shorter version.
You understand and generate Discourse forum markdown.
Try generating links as well the format is #{topic.url}/POST_NUMBER. eg: [ref](#{topic.url}/77)
TEXT
messages = [{ role: "system", content: system_prompt }]
messages << { role: "user", content: prompt }
result =
bot.submit_prompt(messages, temperature: 0.6, max_tokens: length, prefer_low_cost: true)
result.dig(:choices, 0, :message, :content)
end end
end end
end end

View File

@ -21,18 +21,18 @@ module DiscourseAi::AiBot::Commands
end end
def process(_args) def process(_args)
info = +"Name, Topic Count\n" column_names = { name: "Name", public_topic_count: "Topic Count" }
@last_count = 0
tags =
Tag Tag
.where("public_topic_count > 0") .where("public_topic_count > 0")
.order(public_topic_count: :desc) .order(public_topic_count: :desc)
.limit(100) .limit(100)
.pluck(:name, :public_topic_count) .pluck(*column_names.keys)
.each do |name, count|
@last_count += 1 @last_count = tags.length
info << "#{name}, #{count}\n"
end format_results(tags, column_names.values)
info
end end
end end
end end

View File

@ -33,6 +33,29 @@ module DiscourseAi
{ temperature: 0.4, top_p: 0.9, max_tokens: max_tokens } { temperature: 0.4, top_p: 0.9, max_tokens: max_tokens }
end end
def submit_prompt(
prompt,
prefer_low_cost: false,
temperature: nil,
top_p: nil,
max_tokens: nil,
&blk
)
params =
reply_params.merge(
temperature: temperature,
top_p: top_p,
max_tokens: max_tokens,
) { |key, old_value, new_value| new_value.nil? ? old_value : new_value }
model = prefer_low_cost ? "gpt-3.5-turbo" : model_for
DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **params, &blk)
end
def tokenize(text)
DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text)
end
private private
def build_message(poster_username, content, system: false) def build_message(poster_username, content, system: false)
@ -65,15 +88,6 @@ module DiscourseAi
max_tokens: 40, max_tokens: 40,
).dig(:choices, 0, :message, :content) ).dig(:choices, 0, :message, :content)
end end
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
model = prefer_low_cost ? "gpt-3.5-turbo" : model_for
DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **reply_params, &blk)
end
def tokenize(text)
DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text)
end
end end
end end
end end

View File

@ -54,7 +54,7 @@ RSpec.describe DiscourseAi::AiBot::Bot do
) )
prompt << { role: "assistant", content: "!search test search" } prompt << { role: "assistant", content: "!search test search" }
prompt << { role: "user", content: "results: []" } prompt << { role: "user", content: "results: No results found" }
OpenAiCompletionsInferenceStubs.stub_streamed_response( OpenAiCompletionsInferenceStubs.stub_streamed_response(
prompt, prompt,

View File

@ -0,0 +1,36 @@
# frozen_string_literal: true
require_relative "../../../../support/openai_completions_inference_stubs"
# Specs for the shared Command#format_results helper, which renders row data
# into a compact, token-efficient table delimited by "¦" (one header line,
# then one line per row, with newlines inside cells flattened to spaces).
RSpec.describe DiscourseAi::AiBot::Commands::Command do
  fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
  # Commands take (bot_user, post); no post is needed to exercise formatting.
  let(:command) { DiscourseAi::AiBot::Commands::Command.new(bot_user, nil) }
  describe "#format_results" do
    it "can generate efficient tables of data" do
      rows = [1, 2, 3, 4, 5]
      column_names = %w[first second third]
      formatted =
        command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] }
      # 1 header line + 5 data rows
      expect(formatted.split("\n").length).to eq(6)
      # embedded "\n" in a cell must be replaced with a space so rows stay one-per-line
      expect(formatted).to include("a|b, c")
    end
    it "can also generate results by returning hash per row" do
      rows = [1, 2, 3, 4, 5]
      column_names = %w[first second third]
      # explicit column names + array rows ...
      formatted =
        command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] }
      # ... must produce the same table as hash rows with no column names,
      # where the columns are derived from the hash keys
      formatted2 =
        command.format_results(rows) do |row|
          { first: "row ¦ 1", second: row + 1, third: "a|b,\nc" }
        end
      expect(formatted).to eq(formatted2)
    end
  end
end

View File

@ -4,7 +4,6 @@ require_relative "../../../../support/openai_completions_inference_stubs"
RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do
fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) }
describe "#process" do describe "#process" do
it "can generate correct info" do it "can generate correct info" do
@ -33,7 +32,7 @@ RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do
"https://www.googleapis.com/customsearch/v1?cx=cx&key=abc&num=10&q=some%20search%20term", "https://www.googleapis.com/customsearch/v1?cx=cx&key=abc&num=10&q=some%20search%20term",
).to_return(status: 200, body: json_text, headers: {}) ).to_return(status: 200, body: json_text, headers: {})
google = described_class.new(bot, post) google = described_class.new(bot_user, post)
info = google.process("some search term") info = google.process("some search term")
expect(google.description_args[:count]).to eq(1) expect(google.description_args[:count]).to eq(1)

View File

@ -0,0 +1,26 @@
# frozen_string_literal: true
require_relative "../../../../support/openai_completions_inference_stubs"
RSpec.describe DiscourseAi::AiBot::Commands::SearchCommand do
  fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
  # Full-text search requires the indexer, which is disabled by default in specs.
  before { SearchIndexer.enable }
  after { SearchIndexer.disable }
  describe "#process" do
    it "can handle limits" do
      post1 = Fabricate(:post)
      _post2 = Fabricate(:post, user: post1.user)
      _post3 = Fabricate(:post, user: post1.user)
      # search has no built in support for limit: so handle it from the outside
      search = described_class.new(bot_user, post1)
      results = search.process("@#{post1.user.username} limit:2")
      # title + 2 rows
      expect(results.split("\n").length).to eq(3)
    end
  end
end

View File

@ -4,18 +4,22 @@ require_relative "../../../../support/openai_completions_inference_stubs"
RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do
fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) }
describe "#process" do describe "#process" do
it "can generate correct info" do it "can generate correct info" do
post = Fabricate(:post) post = Fabricate(:post)
summarizer = described_class.new(bot, post) WebMock.stub_request(:post, "https://api.openai.com/v1/chat/completions").to_return(
status: 200,
body: JSON.dump({ choices: [{ message: { content: "summary stuff" } }] }),
)
summarizer = described_class.new(bot_user, post)
info = summarizer.process("#{post.topic_id} why did it happen?") info = summarizer.process("#{post.topic_id} why did it happen?")
expect(info).to include("why did it happen?") expect(info).to include("Topic summarized")
expect(info).to include(post.raw) expect(summarizer.custom_raw).to include("summary stuff")
expect(info).to include(post.user.username) expect(summarizer.chain_next_response).to eq(false)
end end
it "protects hidden data" do it "protects hidden data" do
@ -26,10 +30,12 @@ RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do
topic = Fabricate(:topic, category_id: category.id) topic = Fabricate(:topic, category_id: category.id)
post = Fabricate(:post, topic: topic) post = Fabricate(:post, topic: topic)
summarizer = described_class.new(bot, post) summarizer = described_class.new(bot_user, post)
info = summarizer.process("#{post.topic_id} why did it happen?") info = summarizer.process("#{post.topic_id} why did it happen?")
expect(info).not_to include(post.raw) expect(info).not_to include(post.raw)
expect(summarizer.custom_raw).to eq(I18n.t("discourse_ai.ai_bot.topic_not_found"))
end end
end end
end end