From 92fb84e24d62d4586c165e609ddc32200ae87897 Mon Sep 17 00:00:00 2001 From: Sam Date: Mon, 22 May 2023 12:09:14 +1000 Subject: [PATCH] iterate commands (#73) * FEATURE: introduce a more efficient formatter Previous formatting style was space inefficient given JSON consumes lots of tokens, the new format is now used consistently across commands Also fixes - search limited to 10 - search breaking on limit: non existent directive * Slight improvement to summarizer Stop blowing up context with custom prompts * ensure we include the guiding message * correct spec * langchain style summarizer ... much more accurate (albeit more expensive) * lint --- config/locales/server.en.yml | 1 + lib/modules/ai_bot/anthropic_bot.rb | 2 +- lib/modules/ai_bot/bot.rb | 30 ++-- .../ai_bot/commands/categories_command.rb | 34 ++--- lib/modules/ai_bot/commands/command.rb | 37 +++++ lib/modules/ai_bot/commands/google_command.rb | 9 +- lib/modules/ai_bot/commands/search_command.rb | 37 +++-- .../ai_bot/commands/summarize_command.rb | 129 ++++++++++++++++-- lib/modules/ai_bot/commands/tags_command.rb | 24 ++-- lib/modules/ai_bot/open_ai_bot.rb | 32 +++-- spec/lib/modules/ai_bot/bot_spec.rb | 2 +- .../modules/ai_bot/commands/command_spec.rb | 36 +++++ .../ai_bot/commands/google_command_spec.rb | 3 +- .../ai_bot/commands/search_command_spec.rb | 26 ++++ .../ai_bot/commands/summarize_command_spec.rb | 18 ++- 15 files changed, 328 insertions(+), 92 deletions(-) create mode 100644 spec/lib/modules/ai_bot/commands/command_spec.rb create mode 100644 spec/lib/modules/ai_bot/commands/search_command_spec.rb diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 79491584..c344a7fc 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -91,6 +91,7 @@ en: ai_bot: default_pm_prefix: "[Untitled AI bot PM]" + topic_not_found: "Summary unavailable, topic not found!" command_summary: categories: "List categories" search: "Search" diff --git a/lib/modules/ai_bot/anthropic_bot.rb b/lib/modules/ai_bot/anthropic_bot.rb index bdf75998..d42e1eb6 100644 --- a/lib/modules/ai_bot/anthropic_bot.rb +++ b/lib/modules/ai_bot/anthropic_bot.rb @@ -40,7 +40,7 @@ module DiscourseAi ).dig(:completion) end - def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk) + def submit_prompt(prompt, prefer_low_cost: false, &blk) DiscourseAi::Inference::AnthropicCompletions.perform!( prompt, model_for, diff --git a/lib/modules/ai_bot/bot.rb b/lib/modules/ai_bot/bot.rb index d439d3e3..0de5ab43 100644 --- a/lib/modules/ai_bot/bot.rb +++ b/lib/modules/ai_bot/bot.rb @@ -55,10 +55,7 @@ module DiscourseAi setup_cancel = false - submit_prompt_and_stream_reply( - prompt, - prefer_low_cost: prefer_low_cost, - ) do |partial, cancel| + submit_prompt(prompt, prefer_low_cost: prefer_low_cost) do |partial, cancel| reply = update_with_delta(reply, partial) if redis_stream_key && !Discourse.redis.get(redis_stream_key) @@ -139,6 +136,7 @@ module DiscourseAi rendered_system_prompt = system_prompt(post) total_prompt_tokens = tokenize(rendered_system_prompt).length + messages = conversation.reduce([]) do |memo, (raw, username)| break(memo) if total_prompt_tokens >= prompt_limit @@ -227,6 +225,14 @@ module DiscourseAi TEXT end + def tokenize(text) + raise NotImplemented + end + + def submit_prompt(prompt, prefer_low_cost: false, &blk) + raise NotImplemented + end + protected attr_reader :bot_user @@ -243,10 +249,6 @@ module DiscourseAi raise NotImplemented end - def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk) - raise NotImplemented - end - def conversation_context(post) context = post @@ -262,9 +264,15 @@ module DiscourseAi result = [] + first = true context.each do |raw, username, custom_prompt| if custom_prompt.present? - custom_prompt.reverse_each { |message| result << message } + if first + custom_prompt.reverse_each { |message| result << message } + first = false + else + result << custom_prompt.first + end else result << [raw, username] end @@ -280,10 +288,6 @@ module DiscourseAi user_ids: bot_reply_post.topic.allowed_user_ids, ) end - - def tokenize(text) - raise NotImplemented - end end end end diff --git a/lib/modules/ai_bot/commands/categories_command.rb b/lib/modules/ai_bot/commands/categories_command.rb index d5433bf4..edbce74a 100644 --- a/lib/modules/ai_bot/commands/categories_command.rb +++ b/lib/modules/ai_bot/commands/categories_command.rb @@ -21,29 +21,21 @@ module DiscourseAi::AiBot::Commands end def process(_args) - info = - +"Name, Slug, Description, Posts Year, Posts Month, Posts Week, id, parent_category_id\n" + columns = { + name: "Name", + slug: "Slug", + description: "Description", + posts_year: "Posts Year", + posts_month: "Posts Month", + posts_week: "Posts Week", + id: "id", + parent_category_id: "parent_category_id", + } - @count = 0 - Category - .where(read_restricted: false) - .limit(100) - .pluck( - :id, - :parent_category_id, - :slug, - :name, - :description, - :posts_year, - :posts_month, - :posts_week, - ) - .map do |id, parent_category_id, slug, name, description, posts_year, posts_month, posts_week| - @count += 1 - info << "#{name}, #{slug}, #{(description || "").gsub(",", "")}, #{posts_year || 0}, #{posts_month || 0}, #{posts_week || 0},#{id}, #{parent_category_id} \n" - end + rows = Category.where(read_restricted: false).limit(100).pluck(*columns.keys) + @count = rows.length - info + format_results(rows, columns.values) end end end diff --git a/lib/modules/ai_bot/commands/command.rb b/lib/modules/ai_bot/commands/command.rb index e3255857..505bea26 100644 --- a/lib/modules/ai_bot/commands/command.rb +++ b/lib/modules/ai_bot/commands/command.rb @@ -22,6 +22,8 @@ module DiscourseAi end end + attr_reader :bot_user, :args + def initialize(bot_user, args) @bot_user = bot_user @args = args @@ -89,6 +91,41 @@ module DiscourseAi chain_next_response end + def format_results(rows, column_names = nil) + rows = rows.map { |row| yield row } if block_given? + + if !column_names + index = -1 + column_indexes = {} + + rows = + rows.map do |data| + new_row = [] + data.each do |key, value| + found_index = column_indexes[key.to_s] ||= (index += 1) + new_row[found_index] = value + end + new_row + end + column_names = column_indexes.keys + end + # two tokens per delimiter is a reasonable balance + # there may be a single delimiter solution but GPT has + # a hard time dealing with escaped characters + delimiter = "¦" + formatted = +"" + formatted << column_names.join(delimiter) + formatted << "\n" + + rows.each do |array| + array.map! { |item| item.to_s.gsub(delimiter, "|").gsub(/\n/, " ") } + formatted << array.join(delimiter) + formatted << "\n" + end + + formatted + end + protected attr_reader :bot_user, :args diff --git a/lib/modules/ai_bot/commands/google_command.rb b/lib/modules/ai_bot/commands/google_command.rb index 5fb36fda..94afca93 100644 --- a/lib/modules/ai_bot/commands/google_command.rb +++ b/lib/modules/ai_bot/commands/google_command.rb @@ -42,20 +42,15 @@ module DiscourseAi::AiBot::Commands @last_num_results = parsed.dig("searchInformation", "totalResults").to_i - formatted_results = [] - - results.each do |result| - formatted_result = { + format_results(results) do |result| + { title: result["title"], link: result["link"], snippet: result["snippet"], displayLink: result["displayLink"], formattedUrl: result["formattedUrl"], } - formatted_results << formatted_result end - - formatted_results end end end diff --git a/lib/modules/ai_bot/commands/search_command.rb b/lib/modules/ai_bot/commands/search_command.rb index 0f71bf59..04b38901 100644 --- a/lib/modules/ai_bot/commands/search_command.rb +++ b/lib/modules/ai_bot/commands/search_command.rb @@ -74,23 +74,44 @@ module DiscourseAi::AiBot::Commands end def process(search_string) + limit = nil + + search_string = + search_string + .strip + .split(/\s+/) + .map do |term| + if term =~ /limit:(\d+)/ + limit = $1.to_i + nil + else + term + end + end + .compact + .join(" ") + @last_query = search_string results = Search.execute(search_string.to_s, search_type: :full_page, guardian: Guardian.new()) + posts = results.posts + posts = posts[0..limit - 1] if limit + @last_num_results = results.posts.length - results.posts[0..10] - .map do |p| + if posts.blank? + "No results found" + else + format_results(posts) do |post| { - title: p.topic.title, - url: p.url, - raw_truncated: p.raw[0..250], - excerpt: p.excerpt, - created: p.created_at, + title: post.topic.title, + url: post.url, + excerpt: post.excerpt, + created: post.created_at, } end - .to_json + end end end end diff --git a/lib/modules/ai_bot/commands/summarize_command.rb b/lib/modules/ai_bot/commands/summarize_command.rb index 6b86ee00..b11008fb 100644 --- a/lib/modules/ai_bot/commands/summarize_command.rb +++ b/lib/modules/ai_bot/commands/summarize_command.rb @@ -40,30 +40,135 @@ module DiscourseAi::AiBot::Commands topic = nil if !topic || !Guardian.new.can_see?(topic) end - rows = [] + @last_summary = nil if topic @last_topic_title = topic.title - if guidance.present? - rows << ["Given: #{guidance}"] - rows << ["Summarise: #{topic.title}"] + + posts = Post - .joins(:user) .where(topic_id: topic.id) - .order(:post_number) .where("post_type in (?)", [Post.types[:regular], Post.types[:small_action]]) .where("not hidden") - .limit(50) - .pluck(:raw, :username) - .each { |raw, username| rows << ["#{username} said: #{raw}"] } - end + .order(:post_number) + + columns = ["posts.id", :post_number, :raw, :username] + + current_post_numbers = posts.limit(5).pluck(:post_number) + current_post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number) + current_post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number) + + data = + Post + .where(topic_id: topic.id) + .joins(:user) + .where("post_number in (?)", current_post_numbers) + .order(:post_number) + .pluck(*columns) + + @last_summary = summarize(data, guidance, topic) end - if rows.blank? + if !@last_summary "Say: No topic found!" else - "#{rows.join("\n")}"[0..2000] + "Topic summarized" end end + + def custom_raw + @last_summary || I18n.t("discourse_ai.ai_bot.topic_not_found") + end + + def chain_next_response + false + end + + def bot + @bot ||= DiscourseAi::AiBot::Bot.as(bot_user) + end + + def summarize(data, guidance, topic) + text = +"" + data.each do |id, post_number, raw, username| + text << "(#{post_number} #{username} said: #{raw}" + end + + summaries = [] + current_section = +"" + split = [] + + text + .split(/\s+/) + .each_slice(20) do |slice| + current_section << " " + current_section << slice.join(" ") + + # somehow any more will get closer to limits + if bot.tokenize(current_section).length > 2500 + split << current_section + current_section = +"" + end + end + + split << current_section if current_section.present? + + split = split[0..3] + split[-3..-1] if split.length > 5 + + split.each do |section| + # TODO progress meter + summary = + generate_gpt_summary( + section, + topic: topic, + context: "Guidance: #{guidance}\nYou are summarizing the topic: #{topic.title}", + ) + summaries << summary + end + + if summaries.length > 1 + messages = [] + messages << { role: "system", content: "You are a helpful bot" } + messages << { + role: "user", + content: + "concatenated the disjoint summaries, creating a cohesive narrative:\n#{summaries.join("\n")}}", + } + bot.submit_prompt(messages, temperature: 0.6, max_tokens: 500, prefer_low_cost: true).dig( + :choices, + 0, + :message, + :content, + ) + else + summaries.first + end + end + + def generate_gpt_summary(text, topic:, context: nil, length: nil) + length ||= 400 + + prompt = <<~TEXT + #{context} + Summarize the following in #{length} words: + + #{text} + TEXT + + system_prompt = <<~TEXT + You are a summarization bot. + You effectively summarise any text. + You condense it into a shorter version. + You understand and generate Discourse forum markdown. + Try generating links as well the format is #{topic.url}/POST_NUMBER. eg: [ref](#{topic.url}/77) + TEXT + + messages = [{ role: "system", content: system_prompt }] + messages << { role: "user", content: prompt } + + result = + bot.submit_prompt(messages, temperature: 0.6, max_tokens: length, prefer_low_cost: true) + result.dig(:choices, 0, :message, :content) + end end end diff --git a/lib/modules/ai_bot/commands/tags_command.rb b/lib/modules/ai_bot/commands/tags_command.rb index 5ab26c17..501159f8 100644 --- a/lib/modules/ai_bot/commands/tags_command.rb +++ b/lib/modules/ai_bot/commands/tags_command.rb @@ -21,18 +21,18 @@ module DiscourseAi::AiBot::Commands end def process(_args) - info = +"Name, Topic Count\n" - @last_count = 0 - Tag - .where("public_topic_count > 0") - .order(public_topic_count: :desc) - .limit(100) - .pluck(:name, :public_topic_count) - .each do |name, count| - @last_count += 1 - info << "#{name}, #{count}\n" - end - info + column_names = { name: "Name", public_topic_count: "Topic Count" } + + tags = + Tag + .where("public_topic_count > 0") + .order(public_topic_count: :desc) + .limit(100) + .pluck(*column_names.keys) + + @last_count = tags.length + + format_results(tags, column_names.values) end end end diff --git a/lib/modules/ai_bot/open_ai_bot.rb b/lib/modules/ai_bot/open_ai_bot.rb index 33c5080e..dea47379 100644 --- a/lib/modules/ai_bot/open_ai_bot.rb +++ b/lib/modules/ai_bot/open_ai_bot.rb @@ -33,6 +33,29 @@ module DiscourseAi { temperature: 0.4, top_p: 0.9, max_tokens: max_tokens } end + def submit_prompt( + prompt, + prefer_low_cost: false, + temperature: nil, + top_p: nil, + max_tokens: nil, + &blk + ) + params = + reply_params.merge( + temperature: temperature, + top_p: top_p, + max_tokens: max_tokens, + ) { |key, old_value, new_value| new_value.nil? ? old_value : new_value } + + model = prefer_low_cost ? "gpt-3.5-turbo" : model_for + DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **params, &blk) + end + + def tokenize(text) + DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text) + end + private def build_message(poster_username, content, system: false) @@ -65,15 +88,6 @@ module DiscourseAi max_tokens: 40, ).dig(:choices, 0, :message, :content) end - - def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk) - model = prefer_low_cost ? "gpt-3.5-turbo" : model_for - DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **reply_params, &blk) - end - - def tokenize(text) - DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text) - end end end end diff --git a/spec/lib/modules/ai_bot/bot_spec.rb b/spec/lib/modules/ai_bot/bot_spec.rb index dbea6d2a..836cc800 100644 --- a/spec/lib/modules/ai_bot/bot_spec.rb +++ b/spec/lib/modules/ai_bot/bot_spec.rb @@ -54,7 +54,7 @@ RSpec.describe DiscourseAi::AiBot::Bot do ) prompt << { role: "assistant", content: "!search test search" } - prompt << { role: "user", content: "results: []" } + prompt << { role: "user", content: "results: No results found" } OpenAiCompletionsInferenceStubs.stub_streamed_response( prompt, diff --git a/spec/lib/modules/ai_bot/commands/command_spec.rb b/spec/lib/modules/ai_bot/commands/command_spec.rb new file mode 100644 index 00000000..f5a407bf --- /dev/null +++ b/spec/lib/modules/ai_bot/commands/command_spec.rb @@ -0,0 +1,36 @@ +#frozen_string_literal: true + +require_relative "../../../../support/openai_completions_inference_stubs" + +RSpec.describe DiscourseAi::AiBot::Commands::Command do + fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } + let(:command) { DiscourseAi::AiBot::Commands::Command.new(bot_user, nil) } + + describe "#format_results" do + it "can generate efficient tables of data" do + rows = [1, 2, 3, 4, 5] + column_names = %w[first second third] + + formatted = + command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] } + + expect(formatted.split("\n").length).to eq(6) + expect(formatted).to include("a|b, c") + end + + it "can also generate results by returning hash per row" do + rows = [1, 2, 3, 4, 5] + column_names = %w[first second third] + + formatted = + command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] } + + formatted2 = + command.format_results(rows) do |row| + { first: "row ¦ 1", second: row + 1, third: "a|b,\nc" } + end + + expect(formatted).to eq(formatted2) + end + end +end diff --git a/spec/lib/modules/ai_bot/commands/google_command_spec.rb b/spec/lib/modules/ai_bot/commands/google_command_spec.rb index 6717499e..1fac77be 100644 --- a/spec/lib/modules/ai_bot/commands/google_command_spec.rb +++ b/spec/lib/modules/ai_bot/commands/google_command_spec.rb @@ -4,7 +4,6 @@ require_relative "../../../../support/openai_completions_inference_stubs" RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } - fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) } describe "#process" do it "can generate correct info" do @@ -33,7 +32,7 @@ RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do "https://www.googleapis.com/customsearch/v1?cx=cx&key=abc&num=10&q=some%20search%20term", ).to_return(status: 200, body: json_text, headers: {}) - google = described_class.new(bot, post) + google = described_class.new(bot_user, post) info = google.process("some search term") expect(google.description_args[:count]).to eq(1) diff --git a/spec/lib/modules/ai_bot/commands/search_command_spec.rb b/spec/lib/modules/ai_bot/commands/search_command_spec.rb new file mode 100644 index 00000000..37717400 --- /dev/null +++ b/spec/lib/modules/ai_bot/commands/search_command_spec.rb @@ -0,0 +1,26 @@ +#frozen_string_literal: true + +require_relative "../../../../support/openai_completions_inference_stubs" + +RSpec.describe DiscourseAi::AiBot::Commands::SearchCommand do + fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } + + before { SearchIndexer.enable } + after { SearchIndexer.disable } + + describe "#process" do + it "can handle limits" do + post1 = Fabricate(:post) + _post2 = Fabricate(:post, user: post1.user) + _post3 = Fabricate(:post, user: post1.user) + + # search has no built in support for limit: so handle it from the outside + search = described_class.new(bot_user, post1) + + results = search.process("@#{post1.user.username} limit:2") + + # title + 2 rows + expect(results.split("\n").length).to eq(3) + end + end +end diff --git a/spec/lib/modules/ai_bot/commands/summarize_command_spec.rb b/spec/lib/modules/ai_bot/commands/summarize_command_spec.rb index 7a51b3cb..d61f9a22 100644 --- a/spec/lib/modules/ai_bot/commands/summarize_command_spec.rb +++ b/spec/lib/modules/ai_bot/commands/summarize_command_spec.rb @@ -4,18 +4,22 @@ require_relative "../../../../support/openai_completions_inference_stubs" RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } - fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) } describe "#process" do it "can generate correct info" do post = Fabricate(:post) - summarizer = described_class.new(bot, post) + WebMock.stub_request(:post, "https://api.openai.com/v1/chat/completions").to_return( + status: 200, + body: JSON.dump({ choices: [{ message: { content: "summary stuff" } }] }), + ) + + summarizer = described_class.new(bot_user, post) info = summarizer.process("#{post.topic_id} why did it happen?") - expect(info).to include("why did it happen?") - expect(info).to include(post.raw) - expect(info).to include(post.user.username) + expect(info).to include("Topic summarized") + expect(summarizer.custom_raw).to include("summary stuff") + expect(summarizer.chain_next_response).to eq(false) end it "protects hidden data" do @@ -26,10 +30,12 @@ RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do topic = Fabricate(:topic, category_id: category.id) post = Fabricate(:post, topic: topic) - summarizer = described_class.new(bot, post) + summarizer = described_class.new(bot_user, post) info = summarizer.process("#{post.topic_id} why did it happen?") expect(info).not_to include(post.raw) + + expect(summarizer.custom_raw).to eq(I18n.t("discourse_ai.ai_bot.topic_not_found")) end end end