iterate commands (#73)

* FEATURE: introduce a more efficient formatter

The previous formatting style was space-inefficient, given that JSON consumes lots
of tokens; the new format is now used consistently across commands.

Also fixes

- search was limited to 10 results
- search breaking when given the `limit:` directive, which core Search does not support

* Slight improvement to summarizer
Stop blowing up context with custom prompts

* ensure we include the guiding message

* correct spec

* langchain-style summarizer ...

much more accurate (albeit more expensive)

* lint
This commit is contained in:
Sam 2023-05-22 12:09:14 +10:00 committed by GitHub
parent d59ed1091b
commit 92fb84e24d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 328 additions and 92 deletions

View File

@ -91,6 +91,7 @@ en:
ai_bot: ai_bot:
default_pm_prefix: "[Untitled AI bot PM]" default_pm_prefix: "[Untitled AI bot PM]"
topic_not_found: "Summary unavailable, topic not found!"
command_summary: command_summary:
categories: "List categories" categories: "List categories"
search: "Search" search: "Search"

View File

@ -40,7 +40,7 @@ module DiscourseAi
).dig(:completion) ).dig(:completion)
end end
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk) def submit_prompt(prompt, prefer_low_cost: false, &blk)
DiscourseAi::Inference::AnthropicCompletions.perform!( DiscourseAi::Inference::AnthropicCompletions.perform!(
prompt, prompt,
model_for, model_for,

View File

@ -55,10 +55,7 @@ module DiscourseAi
setup_cancel = false setup_cancel = false
submit_prompt_and_stream_reply( submit_prompt(prompt, prefer_low_cost: prefer_low_cost) do |partial, cancel|
prompt,
prefer_low_cost: prefer_low_cost,
) do |partial, cancel|
reply = update_with_delta(reply, partial) reply = update_with_delta(reply, partial)
if redis_stream_key && !Discourse.redis.get(redis_stream_key) if redis_stream_key && !Discourse.redis.get(redis_stream_key)
@ -139,6 +136,7 @@ module DiscourseAi
rendered_system_prompt = system_prompt(post) rendered_system_prompt = system_prompt(post)
total_prompt_tokens = tokenize(rendered_system_prompt).length total_prompt_tokens = tokenize(rendered_system_prompt).length
messages = messages =
conversation.reduce([]) do |memo, (raw, username)| conversation.reduce([]) do |memo, (raw, username)|
break(memo) if total_prompt_tokens >= prompt_limit break(memo) if total_prompt_tokens >= prompt_limit
@ -227,6 +225,14 @@ module DiscourseAi
TEXT TEXT
end end
def tokenize(text)
raise NotImplemented
end
def submit_prompt(prompt, prefer_low_cost: false, &blk)
raise NotImplemented
end
protected protected
attr_reader :bot_user attr_reader :bot_user
@ -243,10 +249,6 @@ module DiscourseAi
raise NotImplemented raise NotImplemented
end end
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
raise NotImplemented
end
def conversation_context(post) def conversation_context(post)
context = context =
post post
@ -262,9 +264,15 @@ module DiscourseAi
result = [] result = []
first = true
context.each do |raw, username, custom_prompt| context.each do |raw, username, custom_prompt|
if custom_prompt.present? if custom_prompt.present?
if first
custom_prompt.reverse_each { |message| result << message } custom_prompt.reverse_each { |message| result << message }
first = false
else
result << custom_prompt.first
end
else else
result << [raw, username] result << [raw, username]
end end
@ -280,10 +288,6 @@ module DiscourseAi
user_ids: bot_reply_post.topic.allowed_user_ids, user_ids: bot_reply_post.topic.allowed_user_ids,
) )
end end
def tokenize(text)
raise NotImplemented
end
end end
end end
end end

View File

@ -21,29 +21,21 @@ module DiscourseAi::AiBot::Commands
end end
def process(_args) def process(_args)
info = columns = {
+"Name, Slug, Description, Posts Year, Posts Month, Posts Week, id, parent_category_id\n" name: "Name",
slug: "Slug",
description: "Description",
posts_year: "Posts Year",
posts_month: "Posts Month",
posts_week: "Posts Week",
id: "id",
parent_category_id: "parent_category_id",
}
@count = 0 rows = Category.where(read_restricted: false).limit(100).pluck(*columns.keys)
Category @count = rows.length
.where(read_restricted: false)
.limit(100)
.pluck(
:id,
:parent_category_id,
:slug,
:name,
:description,
:posts_year,
:posts_month,
:posts_week,
)
.map do |id, parent_category_id, slug, name, description, posts_year, posts_month, posts_week|
@count += 1
info << "#{name}, #{slug}, #{(description || "").gsub(",", "")}, #{posts_year || 0}, #{posts_month || 0}, #{posts_week || 0},#{id}, #{parent_category_id} \n"
end
info format_results(rows, columns.values)
end end
end end
end end

View File

@ -22,6 +22,8 @@ module DiscourseAi
end end
end end
attr_reader :bot_user, :args
def initialize(bot_user, args) def initialize(bot_user, args)
@bot_user = bot_user @bot_user = bot_user
@args = args @args = args
@ -89,6 +91,41 @@ module DiscourseAi
chain_next_response chain_next_response
end end
def format_results(rows, column_names = nil)
rows = rows.map { |row| yield row } if block_given?
if !column_names
index = -1
column_indexes = {}
rows =
rows.map do |data|
new_row = []
data.each do |key, value|
found_index = column_indexes[key.to_s] ||= (index += 1)
new_row[found_index] = value
end
new_row
end
column_names = column_indexes.keys
end
# two tokens per delimiter is a reasonable balance
# there may be a single delimiter solution but GPT has
# a hard time dealing with escaped characters
delimiter = "¦"
formatted = +""
formatted << column_names.join(delimiter)
formatted << "\n"
rows.each do |array|
array.map! { |item| item.to_s.gsub(delimiter, "|").gsub(/\n/, " ") }
formatted << array.join(delimiter)
formatted << "\n"
end
formatted
end
protected protected
attr_reader :bot_user, :args attr_reader :bot_user, :args

View File

@ -42,20 +42,15 @@ module DiscourseAi::AiBot::Commands
@last_num_results = parsed.dig("searchInformation", "totalResults").to_i @last_num_results = parsed.dig("searchInformation", "totalResults").to_i
formatted_results = [] format_results(results) do |result|
{
results.each do |result|
formatted_result = {
title: result["title"], title: result["title"],
link: result["link"], link: result["link"],
snippet: result["snippet"], snippet: result["snippet"],
displayLink: result["displayLink"], displayLink: result["displayLink"],
formattedUrl: result["formattedUrl"], formattedUrl: result["formattedUrl"],
} }
formatted_results << formatted_result end
end
formatted_results
end end
end end
end end

View File

@ -74,23 +74,44 @@ module DiscourseAi::AiBot::Commands
end end
def process(search_string) def process(search_string)
limit = nil
search_string =
search_string
.strip
.split(/\s+/)
.map do |term|
if term =~ /limit:(\d+)/
limit = $1.to_i
nil
else
term
end
end
.compact
.join(" ")
@last_query = search_string @last_query = search_string
results = results =
Search.execute(search_string.to_s, search_type: :full_page, guardian: Guardian.new()) Search.execute(search_string.to_s, search_type: :full_page, guardian: Guardian.new())
posts = results.posts
posts = posts[0..limit - 1] if limit
@last_num_results = results.posts.length @last_num_results = results.posts.length
results.posts[0..10] if posts.blank?
.map do |p| "No results found"
else
format_results(posts) do |post|
{ {
title: p.topic.title, title: post.topic.title,
url: p.url, url: post.url,
raw_truncated: p.raw[0..250], excerpt: post.excerpt,
excerpt: p.excerpt, created: post.created_at,
created: p.created_at,
} }
end end
.to_json end
end end
end end
end end

View File

@ -40,30 +40,135 @@ module DiscourseAi::AiBot::Commands
topic = nil if !topic || !Guardian.new.can_see?(topic) topic = nil if !topic || !Guardian.new.can_see?(topic)
end end
rows = [] @last_summary = nil
if topic if topic
@last_topic_title = topic.title @last_topic_title = topic.title
if guidance.present?
rows << ["Given: #{guidance}"] posts =
rows << ["Summarise: #{topic.title}"]
Post Post
.joins(:user)
.where(topic_id: topic.id) .where(topic_id: topic.id)
.order(:post_number)
.where("post_type in (?)", [Post.types[:regular], Post.types[:small_action]]) .where("post_type in (?)", [Post.types[:regular], Post.types[:small_action]])
.where("not hidden") .where("not hidden")
.limit(50) .order(:post_number)
.pluck(:raw, :username)
.each { |raw, username| rows << ["#{username} said: #{raw}"] } columns = ["posts.id", :post_number, :raw, :username]
current_post_numbers = posts.limit(5).pluck(:post_number)
current_post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
current_post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
data =
Post
.where(topic_id: topic.id)
.joins(:user)
.where("post_number in (?)", current_post_numbers)
.order(:post_number)
.pluck(*columns)
@last_summary = summarize(data, guidance, topic)
end
if !@last_summary
"Say: No topic found!"
else
"Topic summarized"
end end
end end
if rows.blank? def custom_raw
"Say: No topic found!" @last_summary || I18n.t("discourse_ai.ai_bot.topic_not_found")
end
def chain_next_response
false
end
def bot
@bot ||= DiscourseAi::AiBot::Bot.as(bot_user)
end
def summarize(data, guidance, topic)
text = +""
data.each do |id, post_number, raw, username|
text << "(#{post_number} #{username} said: #{raw}"
end
summaries = []
current_section = +""
split = []
text
.split(/\s+/)
.each_slice(20) do |slice|
current_section << " "
current_section << slice.join(" ")
# somehow any more will get closer to limits
if bot.tokenize(current_section).length > 2500
split << current_section
current_section = +""
end
end
split << current_section if current_section.present?
split = split[0..3] + split[-3..-1] if split.length > 5
split.each do |section|
# TODO progress meter
summary =
generate_gpt_summary(
section,
topic: topic,
context: "Guidance: #{guidance}\nYou are summarizing the topic: #{topic.title}",
)
summaries << summary
end
if summaries.length > 1
messages = []
messages << { role: "system", content: "You are a helpful bot" }
messages << {
role: "user",
content:
"concatenated the disjoint summaries, creating a cohesive narrative:\n#{summaries.join("\n")}}",
}
bot.submit_prompt(messages, temperature: 0.6, max_tokens: 500, prefer_low_cost: true).dig(
:choices,
0,
:message,
:content,
)
else else
"#{rows.join("\n")}"[0..2000] summaries.first
end end
end
def generate_gpt_summary(text, topic:, context: nil, length: nil)
length ||= 400
prompt = <<~TEXT
#{context}
Summarize the following in #{length} words:
#{text}
TEXT
system_prompt = <<~TEXT
You are a summarization bot.
You effectively summarise any text.
You condense it into a shorter version.
You understand and generate Discourse forum markdown.
Try generating links as well the format is #{topic.url}/POST_NUMBER. eg: [ref](#{topic.url}/77)
TEXT
messages = [{ role: "system", content: system_prompt }]
messages << { role: "user", content: prompt }
result =
bot.submit_prompt(messages, temperature: 0.6, max_tokens: length, prefer_low_cost: true)
result.dig(:choices, 0, :message, :content)
end end
end end
end end

View File

@ -21,18 +21,18 @@ module DiscourseAi::AiBot::Commands
end end
def process(_args) def process(_args)
info = +"Name, Topic Count\n" column_names = { name: "Name", public_topic_count: "Topic Count" }
@last_count = 0
tags =
Tag Tag
.where("public_topic_count > 0") .where("public_topic_count > 0")
.order(public_topic_count: :desc) .order(public_topic_count: :desc)
.limit(100) .limit(100)
.pluck(:name, :public_topic_count) .pluck(*column_names.keys)
.each do |name, count|
@last_count += 1 @last_count = tags.length
info << "#{name}, #{count}\n"
end format_results(tags, column_names.values)
info
end end
end end
end end

View File

@ -33,6 +33,29 @@ module DiscourseAi
{ temperature: 0.4, top_p: 0.9, max_tokens: max_tokens } { temperature: 0.4, top_p: 0.9, max_tokens: max_tokens }
end end
def submit_prompt(
prompt,
prefer_low_cost: false,
temperature: nil,
top_p: nil,
max_tokens: nil,
&blk
)
params =
reply_params.merge(
temperature: temperature,
top_p: top_p,
max_tokens: max_tokens,
) { |key, old_value, new_value| new_value.nil? ? old_value : new_value }
model = prefer_low_cost ? "gpt-3.5-turbo" : model_for
DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **params, &blk)
end
def tokenize(text)
DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text)
end
private private
def build_message(poster_username, content, system: false) def build_message(poster_username, content, system: false)
@ -65,15 +88,6 @@ module DiscourseAi
max_tokens: 40, max_tokens: 40,
).dig(:choices, 0, :message, :content) ).dig(:choices, 0, :message, :content)
end end
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
model = prefer_low_cost ? "gpt-3.5-turbo" : model_for
DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **reply_params, &blk)
end
def tokenize(text)
DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text)
end
end end
end end
end end

View File

@ -54,7 +54,7 @@ RSpec.describe DiscourseAi::AiBot::Bot do
) )
prompt << { role: "assistant", content: "!search test search" } prompt << { role: "assistant", content: "!search test search" }
prompt << { role: "user", content: "results: []" } prompt << { role: "user", content: "results: No results found" }
OpenAiCompletionsInferenceStubs.stub_streamed_response( OpenAiCompletionsInferenceStubs.stub_streamed_response(
prompt, prompt,

View File

@ -0,0 +1,36 @@
# frozen_string_literal: true
require_relative "../../../../support/openai_completions_inference_stubs"
# Specs for the shared Command#format_results helper, which renders row data
# into a compact, token-efficient table delimited by "¦" (one header line,
# then one line per row, with newlines inside cells flattened to spaces).
RSpec.describe DiscourseAi::AiBot::Commands::Command do
  fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
  # Commands take (bot_user, post); no post is needed to exercise formatting.
  let(:command) { DiscourseAi::AiBot::Commands::Command.new(bot_user, nil) }
  describe "#format_results" do
    it "can generate efficient tables of data" do
      rows = [1, 2, 3, 4, 5]
      column_names = %w[first second third]
      formatted =
        command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] }
      # 1 header line + 5 data rows
      expect(formatted.split("\n").length).to eq(6)
      # embedded "\n" in a cell must be replaced with a space so rows stay one-per-line
      expect(formatted).to include("a|b, c")
    end
    it "can also generate results by returning hash per row" do
      rows = [1, 2, 3, 4, 5]
      column_names = %w[first second third]
      # explicit column names + array rows ...
      formatted =
        command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] }
      # ... must produce the same table as hash rows with no column names,
      # where the columns are derived from the hash keys
      formatted2 =
        command.format_results(rows) do |row|
          { first: "row ¦ 1", second: row + 1, third: "a|b,\nc" }
        end
      expect(formatted).to eq(formatted2)
    end
  end
end

View File

@ -4,7 +4,6 @@ require_relative "../../../../support/openai_completions_inference_stubs"
RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do
fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) }
describe "#process" do describe "#process" do
it "can generate correct info" do it "can generate correct info" do
@ -33,7 +32,7 @@ RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do
"https://www.googleapis.com/customsearch/v1?cx=cx&key=abc&num=10&q=some%20search%20term", "https://www.googleapis.com/customsearch/v1?cx=cx&key=abc&num=10&q=some%20search%20term",
).to_return(status: 200, body: json_text, headers: {}) ).to_return(status: 200, body: json_text, headers: {})
google = described_class.new(bot, post) google = described_class.new(bot_user, post)
info = google.process("some search term") info = google.process("some search term")
expect(google.description_args[:count]).to eq(1) expect(google.description_args[:count]).to eq(1)

View File

@ -0,0 +1,26 @@
# frozen_string_literal: true
require_relative "../../../../support/openai_completions_inference_stubs"
RSpec.describe DiscourseAi::AiBot::Commands::SearchCommand do
  fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
  # Full-text search requires the indexer, which is disabled by default in specs.
  before { SearchIndexer.enable }
  after { SearchIndexer.disable }
  describe "#process" do
    it "can handle limits" do
      post1 = Fabricate(:post)
      _post2 = Fabricate(:post, user: post1.user)
      _post3 = Fabricate(:post, user: post1.user)
      # search has no built in support for limit: so handle it from the outside
      search = described_class.new(bot_user, post1)
      results = search.process("@#{post1.user.username} limit:2")
      # title + 2 rows
      expect(results.split("\n").length).to eq(3)
    end
  end
end

View File

@ -4,18 +4,22 @@ require_relative "../../../../support/openai_completions_inference_stubs"
RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do
fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) }
describe "#process" do describe "#process" do
it "can generate correct info" do it "can generate correct info" do
post = Fabricate(:post) post = Fabricate(:post)
summarizer = described_class.new(bot, post) WebMock.stub_request(:post, "https://api.openai.com/v1/chat/completions").to_return(
status: 200,
body: JSON.dump({ choices: [{ message: { content: "summary stuff" } }] }),
)
summarizer = described_class.new(bot_user, post)
info = summarizer.process("#{post.topic_id} why did it happen?") info = summarizer.process("#{post.topic_id} why did it happen?")
expect(info).to include("why did it happen?") expect(info).to include("Topic summarized")
expect(info).to include(post.raw) expect(summarizer.custom_raw).to include("summary stuff")
expect(info).to include(post.user.username) expect(summarizer.chain_next_response).to eq(false)
end end
it "protects hidden data" do it "protects hidden data" do
@ -26,10 +30,12 @@ RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do
topic = Fabricate(:topic, category_id: category.id) topic = Fabricate(:topic, category_id: category.id)
post = Fabricate(:post, topic: topic) post = Fabricate(:post, topic: topic)
summarizer = described_class.new(bot, post) summarizer = described_class.new(bot_user, post)
info = summarizer.process("#{post.topic_id} why did it happen?") info = summarizer.process("#{post.topic_id} why did it happen?")
expect(info).not_to include(post.raw) expect(info).not_to include(post.raw)
expect(summarizer.custom_raw).to eq(I18n.t("discourse_ai.ai_bot.topic_not_found"))
end end
end end
end end