iterate commands (#73)
* FEATURE: introduce a more efficient formatter

  The previous formatting style was space-inefficient because JSON consumes a lot of tokens. The new format is now used consistently across commands.

  Also fixes:
  - search was limited to 10 results
  - search broke on `limit:`, which is a non-existent search directive

* Slight improvement to summarizer: stop blowing up context with custom prompts
* ensure we include the guiding message
* correct spec
* langchain style summarizer ... much more accurate (albeit more expensive)
* lint
This commit is contained in:
parent
d59ed1091b
commit
92fb84e24d
|
@ -91,6 +91,7 @@ en:
|
|||
|
||||
ai_bot:
|
||||
default_pm_prefix: "[Untitled AI bot PM]"
|
||||
topic_not_found: "Summary unavailable, topic not found!"
|
||||
command_summary:
|
||||
categories: "List categories"
|
||||
search: "Search"
|
||||
|
|
|
@ -40,7 +40,7 @@ module DiscourseAi
|
|||
).dig(:completion)
|
||||
end
|
||||
|
||||
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
|
||||
def submit_prompt(prompt, prefer_low_cost: false, &blk)
|
||||
DiscourseAi::Inference::AnthropicCompletions.perform!(
|
||||
prompt,
|
||||
model_for,
|
||||
|
|
|
@ -55,10 +55,7 @@ module DiscourseAi
|
|||
|
||||
setup_cancel = false
|
||||
|
||||
submit_prompt_and_stream_reply(
|
||||
prompt,
|
||||
prefer_low_cost: prefer_low_cost,
|
||||
) do |partial, cancel|
|
||||
submit_prompt(prompt, prefer_low_cost: prefer_low_cost) do |partial, cancel|
|
||||
reply = update_with_delta(reply, partial)
|
||||
|
||||
if redis_stream_key && !Discourse.redis.get(redis_stream_key)
|
||||
|
@ -139,6 +136,7 @@ module DiscourseAi
|
|||
rendered_system_prompt = system_prompt(post)
|
||||
|
||||
total_prompt_tokens = tokenize(rendered_system_prompt).length
|
||||
|
||||
messages =
|
||||
conversation.reduce([]) do |memo, (raw, username)|
|
||||
break(memo) if total_prompt_tokens >= prompt_limit
|
||||
|
@ -227,6 +225,14 @@ module DiscourseAi
|
|||
TEXT
|
||||
end
|
||||
|
||||
def tokenize(text)
|
||||
raise NotImplemented
|
||||
end
|
||||
|
||||
def submit_prompt(prompt, prefer_low_cost: false, &blk)
|
||||
raise NotImplemented
|
||||
end
|
||||
|
||||
protected
|
||||
|
||||
attr_reader :bot_user
|
||||
|
@ -243,10 +249,6 @@ module DiscourseAi
|
|||
raise NotImplemented
|
||||
end
|
||||
|
||||
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
|
||||
raise NotImplemented
|
||||
end
|
||||
|
||||
def conversation_context(post)
|
||||
context =
|
||||
post
|
||||
|
@ -262,9 +264,15 @@ module DiscourseAi
|
|||
|
||||
result = []
|
||||
|
||||
first = true
|
||||
context.each do |raw, username, custom_prompt|
|
||||
if custom_prompt.present?
|
||||
if first
|
||||
custom_prompt.reverse_each { |message| result << message }
|
||||
first = false
|
||||
else
|
||||
result << custom_prompt.first
|
||||
end
|
||||
else
|
||||
result << [raw, username]
|
||||
end
|
||||
|
@ -280,10 +288,6 @@ module DiscourseAi
|
|||
user_ids: bot_reply_post.topic.allowed_user_ids,
|
||||
)
|
||||
end
|
||||
|
||||
def tokenize(text)
|
||||
raise NotImplemented
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -21,29 +21,21 @@ module DiscourseAi::AiBot::Commands
|
|||
end
|
||||
|
||||
def process(_args)
|
||||
info =
|
||||
+"Name, Slug, Description, Posts Year, Posts Month, Posts Week, id, parent_category_id\n"
|
||||
columns = {
|
||||
name: "Name",
|
||||
slug: "Slug",
|
||||
description: "Description",
|
||||
posts_year: "Posts Year",
|
||||
posts_month: "Posts Month",
|
||||
posts_week: "Posts Week",
|
||||
id: "id",
|
||||
parent_category_id: "parent_category_id",
|
||||
}
|
||||
|
||||
@count = 0
|
||||
Category
|
||||
.where(read_restricted: false)
|
||||
.limit(100)
|
||||
.pluck(
|
||||
:id,
|
||||
:parent_category_id,
|
||||
:slug,
|
||||
:name,
|
||||
:description,
|
||||
:posts_year,
|
||||
:posts_month,
|
||||
:posts_week,
|
||||
)
|
||||
.map do |id, parent_category_id, slug, name, description, posts_year, posts_month, posts_week|
|
||||
@count += 1
|
||||
info << "#{name}, #{slug}, #{(description || "").gsub(",", "")}, #{posts_year || 0}, #{posts_month || 0}, #{posts_week || 0},#{id}, #{parent_category_id} \n"
|
||||
end
|
||||
rows = Category.where(read_restricted: false).limit(100).pluck(*columns.keys)
|
||||
@count = rows.length
|
||||
|
||||
info
|
||||
format_results(rows, columns.values)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -22,6 +22,8 @@ module DiscourseAi
|
|||
end
|
||||
end
|
||||
|
||||
attr_reader :bot_user, :args
|
||||
|
||||
def initialize(bot_user, args)
|
||||
@bot_user = bot_user
|
||||
@args = args
|
||||
|
@ -89,6 +91,41 @@ module DiscourseAi
|
|||
chain_next_response
|
||||
end
|
||||
|
||||
# Renders rows as a compact text table: one header line followed by one line
# per row, with cells separated by "¦".
#
# Rows can be supplied in two shapes:
# * arrays of cells, in which case +column_names+ must be given; or
# * hashes of column => value, in which case +column_names+ is omitted and
#   the header is derived from the hash keys in first-seen order.
#
# When a block is given, every element of +rows+ is first passed through it
# and the block's return value (array or hash) is used as the row.
#
# The caller's row arrays are never modified.
#
# @param rows [Enumerable] raw rows, or inputs for the block
# @param column_names [Array<String>, nil] header cells; nil for hash rows
# @return [String] the formatted table, each line terminated by "\n"
def format_results(rows, column_names = nil)
  rows = rows.map { |row| yield row } if block_given?

  if !column_names
    index = -1
    column_indexes = {}

    rows =
      rows.map do |data|
        new_row = []
        data.each do |key, value|
          # first time a key is seen it is assigned the next free column
          found_index = column_indexes[key.to_s] ||= (index += 1)
          new_row[found_index] = value
        end
        new_row
      end

    # Rows that lack some keys would otherwise come out shorter than the
    # header; pad them so every line has the same number of cells.
    width = column_indexes.length
    rows.each { |row| row.fill(nil, row.length...width) }

    column_names = column_indexes.keys
  end

  # two tokens per delimiter is a reasonable balance
  # there may be a single delimiter solution but GPT has
  # a hard time dealing with escaped characters
  delimiter = "¦"
  formatted = +""
  formatted << column_names.join(delimiter)
  formatted << "\n"

  rows.each do |array|
    # Build a new array of cells (no map!) so the caller's data is left
    # intact. A delimiter inside a cell is replaced with "|" and newlines
    # with spaces so that each row stays on a single line.
    cells = array.map { |item| item.to_s.gsub(delimiter, "|").gsub(/\n/, " ") }
    formatted << cells.join(delimiter)
    formatted << "\n"
  end

  formatted
end
|
||||
|
||||
protected
|
||||
|
||||
attr_reader :bot_user, :args
|
||||
|
|
|
@ -42,20 +42,15 @@ module DiscourseAi::AiBot::Commands
|
|||
|
||||
@last_num_results = parsed.dig("searchInformation", "totalResults").to_i
|
||||
|
||||
formatted_results = []
|
||||
|
||||
results.each do |result|
|
||||
formatted_result = {
|
||||
format_results(results) do |result|
|
||||
{
|
||||
title: result["title"],
|
||||
link: result["link"],
|
||||
snippet: result["snippet"],
|
||||
displayLink: result["displayLink"],
|
||||
formattedUrl: result["formattedUrl"],
|
||||
}
|
||||
formatted_results << formatted_result
|
||||
end
|
||||
|
||||
formatted_results
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -74,23 +74,44 @@ module DiscourseAi::AiBot::Commands
|
|||
end
|
||||
|
||||
def process(search_string)
|
||||
limit = nil
|
||||
|
||||
search_string =
|
||||
search_string
|
||||
.strip
|
||||
.split(/\s+/)
|
||||
.map do |term|
|
||||
if term =~ /limit:(\d+)/
|
||||
limit = $1.to_i
|
||||
nil
|
||||
else
|
||||
term
|
||||
end
|
||||
end
|
||||
.compact
|
||||
.join(" ")
|
||||
|
||||
@last_query = search_string
|
||||
results =
|
||||
Search.execute(search_string.to_s, search_type: :full_page, guardian: Guardian.new())
|
||||
|
||||
posts = results.posts
|
||||
posts = posts[0..limit - 1] if limit
|
||||
|
||||
@last_num_results = results.posts.length
|
||||
|
||||
results.posts[0..10]
|
||||
.map do |p|
|
||||
if posts.blank?
|
||||
"No results found"
|
||||
else
|
||||
format_results(posts) do |post|
|
||||
{
|
||||
title: p.topic.title,
|
||||
url: p.url,
|
||||
raw_truncated: p.raw[0..250],
|
||||
excerpt: p.excerpt,
|
||||
created: p.created_at,
|
||||
title: post.topic.title,
|
||||
url: post.url,
|
||||
excerpt: post.excerpt,
|
||||
created: post.created_at,
|
||||
}
|
||||
end
|
||||
.to_json
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -40,30 +40,135 @@ module DiscourseAi::AiBot::Commands
|
|||
topic = nil if !topic || !Guardian.new.can_see?(topic)
|
||||
end
|
||||
|
||||
rows = []
|
||||
@last_summary = nil
|
||||
|
||||
if topic
|
||||
@last_topic_title = topic.title
|
||||
if guidance.present?
|
||||
rows << ["Given: #{guidance}"]
|
||||
rows << ["Summarise: #{topic.title}"]
|
||||
|
||||
posts =
|
||||
Post
|
||||
.joins(:user)
|
||||
.where(topic_id: topic.id)
|
||||
.order(:post_number)
|
||||
.where("post_type in (?)", [Post.types[:regular], Post.types[:small_action]])
|
||||
.where("not hidden")
|
||||
.limit(50)
|
||||
.pluck(:raw, :username)
|
||||
.each { |raw, username| rows << ["#{username} said: #{raw}"] }
|
||||
.order(:post_number)
|
||||
|
||||
columns = ["posts.id", :post_number, :raw, :username]
|
||||
|
||||
current_post_numbers = posts.limit(5).pluck(:post_number)
|
||||
current_post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
|
||||
current_post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
|
||||
|
||||
data =
|
||||
Post
|
||||
.where(topic_id: topic.id)
|
||||
.joins(:user)
|
||||
.where("post_number in (?)", current_post_numbers)
|
||||
.order(:post_number)
|
||||
.pluck(*columns)
|
||||
|
||||
@last_summary = summarize(data, guidance, topic)
|
||||
end
|
||||
|
||||
if !@last_summary
|
||||
"Say: No topic found!"
|
||||
else
|
||||
"Topic summarized"
|
||||
end
|
||||
end
|
||||
|
||||
if rows.blank?
|
||||
"Say: No topic found!"
|
||||
else
|
||||
"#{rows.join("\n")}"[0..2000]
|
||||
def custom_raw
|
||||
@last_summary || I18n.t("discourse_ai.ai_bot.topic_not_found")
|
||||
end
|
||||
|
||||
def chain_next_response
|
||||
false
|
||||
end
|
||||
|
||||
def bot
|
||||
@bot ||= DiscourseAi::AiBot::Bot.as(bot_user)
|
||||
end
|
||||
|
||||
# Produces a single summary for a topic by summarizing it section by section
# and then merging the partial summaries.
#
# Steps:
# 1. All posts are concatenated into one text.
# 2. The text is cut into sections, growing each section 20 words at a time
#    until the bot's tokenizer reports more than 2500 tokens.
# 3. If there are more than 7 sections, only the first 4 and the last 3 are
#    kept, which bounds the number of completion requests.
# 4. Each kept section is summarized through generate_gpt_summary.
# 5. If more than one partial summary exists, the bot is asked to merge them
#    into one narrative; otherwise the single summary is returned as is.
#
# @param data [Array<Array>] rows of [id, post_number, raw, username]
# @param guidance [String, nil] guidance text included in each section's context
# @param topic [#title] the topic being summarized (also passed on to
#   generate_gpt_summary)
# @return [String, nil] the summary, or nil when +data+ yields no text
def summarize(data, guidance, topic)
  text = +""
  data.each do |_id, post_number, raw, username|
    text << "(#{post_number} #{username} said: #{raw}"
  end

  current_section = +""
  split = []

  text
    .split(/\s+/)
    .each_slice(20) do |slice|
      current_section << " "
      current_section << slice.join(" ")

      # sections are capped at roughly 2500 tokens; anything larger gets
      # too close to the model limits
      if bot.tokenize(current_section).length > 2500
        split << current_section
        current_section = +""
      end
    end

  split << current_section if current_section.present?

  # Keep the head and the tail of long topics. The previous form
  # (split[0..3] + split[-3..-1] when length > 5) summarized the fourth
  # section twice when there were exactly 6 sections, so only trim once the
  # two slices can no longer overlap.
  split = split.first(4) + split.last(3) if split.length > 7

  summaries =
    split.map do |section|
      # TODO progress meter
      generate_gpt_summary(
        section,
        topic: topic,
        context: "Guidance: #{guidance}\nYou are summarizing the topic: #{topic.title}",
      )
    end

  if summaries.length > 1
    messages = []
    messages << { role: "system", content: "You are a helpful bot" }
    messages << {
      role: "user",
      content:
        "concatenated the disjoint summaries, creating a cohesive narrative:\n#{summaries.join("\n")}",
    }
    bot.submit_prompt(messages, temperature: 0.6, max_tokens: 500, prefer_low_cost: true).dig(
      :choices,
      0,
      :message,
      :content,
    )
  else
    summaries.first
  end
end
|
||||
|
||||
def generate_gpt_summary(text, topic:, context: nil, length: nil)
|
||||
length ||= 400
|
||||
|
||||
prompt = <<~TEXT
|
||||
#{context}
|
||||
Summarize the following in #{length} words:
|
||||
|
||||
#{text}
|
||||
TEXT
|
||||
|
||||
system_prompt = <<~TEXT
|
||||
You are a summarization bot.
|
||||
You effectively summarise any text.
|
||||
You condense it into a shorter version.
|
||||
You understand and generate Discourse forum markdown.
|
||||
Try generating links as well the format is #{topic.url}/POST_NUMBER. eg: [ref](#{topic.url}/77)
|
||||
TEXT
|
||||
|
||||
messages = [{ role: "system", content: system_prompt }]
|
||||
messages << { role: "user", content: prompt }
|
||||
|
||||
result =
|
||||
bot.submit_prompt(messages, temperature: 0.6, max_tokens: length, prefer_low_cost: true)
|
||||
result.dig(:choices, 0, :message, :content)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -21,18 +21,18 @@ module DiscourseAi::AiBot::Commands
|
|||
end
|
||||
|
||||
def process(_args)
|
||||
info = +"Name, Topic Count\n"
|
||||
@last_count = 0
|
||||
column_names = { name: "Name", public_topic_count: "Topic Count" }
|
||||
|
||||
tags =
|
||||
Tag
|
||||
.where("public_topic_count > 0")
|
||||
.order(public_topic_count: :desc)
|
||||
.limit(100)
|
||||
.pluck(:name, :public_topic_count)
|
||||
.each do |name, count|
|
||||
@last_count += 1
|
||||
info << "#{name}, #{count}\n"
|
||||
end
|
||||
info
|
||||
.pluck(*column_names.keys)
|
||||
|
||||
@last_count = tags.length
|
||||
|
||||
format_results(tags, column_names.values)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -33,6 +33,29 @@ module DiscourseAi
|
|||
{ temperature: 0.4, top_p: 0.9, max_tokens: max_tokens }
|
||||
end
|
||||
|
||||
def submit_prompt(
|
||||
prompt,
|
||||
prefer_low_cost: false,
|
||||
temperature: nil,
|
||||
top_p: nil,
|
||||
max_tokens: nil,
|
||||
&blk
|
||||
)
|
||||
params =
|
||||
reply_params.merge(
|
||||
temperature: temperature,
|
||||
top_p: top_p,
|
||||
max_tokens: max_tokens,
|
||||
) { |key, old_value, new_value| new_value.nil? ? old_value : new_value }
|
||||
|
||||
model = prefer_low_cost ? "gpt-3.5-turbo" : model_for
|
||||
DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **params, &blk)
|
||||
end
|
||||
|
||||
def tokenize(text)
|
||||
DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def build_message(poster_username, content, system: false)
|
||||
|
@ -65,15 +88,6 @@ module DiscourseAi
|
|||
max_tokens: 40,
|
||||
).dig(:choices, 0, :message, :content)
|
||||
end
|
||||
|
||||
def submit_prompt_and_stream_reply(prompt, prefer_low_cost: false, &blk)
|
||||
model = prefer_low_cost ? "gpt-3.5-turbo" : model_for
|
||||
DiscourseAi::Inference::OpenAiCompletions.perform!(prompt, model, **reply_params, &blk)
|
||||
end
|
||||
|
||||
def tokenize(text)
|
||||
DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize(text)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -54,7 +54,7 @@ RSpec.describe DiscourseAi::AiBot::Bot do
|
|||
)
|
||||
|
||||
prompt << { role: "assistant", content: "!search test search" }
|
||||
prompt << { role: "user", content: "results: []" }
|
||||
prompt << { role: "user", content: "results: No results found" }
|
||||
|
||||
OpenAiCompletionsInferenceStubs.stub_streamed_response(
|
||||
prompt,
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
#frozen_string_literal: true
|
||||
|
||||
require_relative "../../../../support/openai_completions_inference_stubs"
|
||||
|
||||
RSpec.describe DiscourseAi::AiBot::Commands::Command do
|
||||
fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
|
||||
let(:command) { DiscourseAi::AiBot::Commands::Command.new(bot_user, nil) }
|
||||
|
||||
describe "#format_results" do
|
||||
it "can generate efficient tables of data" do
|
||||
rows = [1, 2, 3, 4, 5]
|
||||
column_names = %w[first second third]
|
||||
|
||||
formatted =
|
||||
command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] }
|
||||
|
||||
expect(formatted.split("\n").length).to eq(6)
|
||||
expect(formatted).to include("a|b, c")
|
||||
end
|
||||
|
||||
it "can also generate results by returning hash per row" do
|
||||
rows = [1, 2, 3, 4, 5]
|
||||
column_names = %w[first second third]
|
||||
|
||||
formatted =
|
||||
command.format_results(rows, column_names) { |row| ["row ¦ 1", row + 1, "a|b,\nc"] }
|
||||
|
||||
formatted2 =
|
||||
command.format_results(rows) do |row|
|
||||
{ first: "row ¦ 1", second: row + 1, third: "a|b,\nc" }
|
||||
end
|
||||
|
||||
expect(formatted).to eq(formatted2)
|
||||
end
|
||||
end
|
||||
end
|
|
@ -4,7 +4,6 @@ require_relative "../../../../support/openai_completions_inference_stubs"
|
|||
|
||||
RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do
|
||||
fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
|
||||
fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) }
|
||||
|
||||
describe "#process" do
|
||||
it "can generate correct info" do
|
||||
|
@ -33,7 +32,7 @@ RSpec.describe DiscourseAi::AiBot::Commands::GoogleCommand do
|
|||
"https://www.googleapis.com/customsearch/v1?cx=cx&key=abc&num=10&q=some%20search%20term",
|
||||
).to_return(status: 200, body: json_text, headers: {})
|
||||
|
||||
google = described_class.new(bot, post)
|
||||
google = described_class.new(bot_user, post)
|
||||
info = google.process("some search term")
|
||||
|
||||
expect(google.description_args[:count]).to eq(1)
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
#frozen_string_literal: true
|
||||
|
||||
require_relative "../../../../support/openai_completions_inference_stubs"
|
||||
|
||||
RSpec.describe DiscourseAi::AiBot::Commands::SearchCommand do
|
||||
fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
|
||||
|
||||
before { SearchIndexer.enable }
|
||||
after { SearchIndexer.disable }
|
||||
|
||||
describe "#process" do
|
||||
it "can handle limits" do
|
||||
post1 = Fabricate(:post)
|
||||
_post2 = Fabricate(:post, user: post1.user)
|
||||
_post3 = Fabricate(:post, user: post1.user)
|
||||
|
||||
# search has no built in support for limit: so handle it from the outside
|
||||
search = described_class.new(bot_user, post1)
|
||||
|
||||
results = search.process("@#{post1.user.username} limit:2")
|
||||
|
||||
# title + 2 rows
|
||||
expect(results.split("\n").length).to eq(3)
|
||||
end
|
||||
end
|
||||
end
|
|
@ -4,18 +4,22 @@ require_relative "../../../../support/openai_completions_inference_stubs"
|
|||
|
||||
RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do
|
||||
fab!(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
|
||||
fab!(:bot) { DiscourseAi::AiBot::Bot.as(bot_user) }
|
||||
|
||||
describe "#process" do
|
||||
it "can generate correct info" do
|
||||
post = Fabricate(:post)
|
||||
|
||||
summarizer = described_class.new(bot, post)
|
||||
WebMock.stub_request(:post, "https://api.openai.com/v1/chat/completions").to_return(
|
||||
status: 200,
|
||||
body: JSON.dump({ choices: [{ message: { content: "summary stuff" } }] }),
|
||||
)
|
||||
|
||||
summarizer = described_class.new(bot_user, post)
|
||||
info = summarizer.process("#{post.topic_id} why did it happen?")
|
||||
|
||||
expect(info).to include("why did it happen?")
|
||||
expect(info).to include(post.raw)
|
||||
expect(info).to include(post.user.username)
|
||||
expect(info).to include("Topic summarized")
|
||||
expect(summarizer.custom_raw).to include("summary stuff")
|
||||
expect(summarizer.chain_next_response).to eq(false)
|
||||
end
|
||||
|
||||
it "protects hidden data" do
|
||||
|
@ -26,10 +30,12 @@ RSpec.describe DiscourseAi::AiBot::Commands::SummarizeCommand do
|
|||
topic = Fabricate(:topic, category_id: category.id)
|
||||
post = Fabricate(:post, topic: topic)
|
||||
|
||||
summarizer = described_class.new(bot, post)
|
||||
summarizer = described_class.new(bot_user, post)
|
||||
info = summarizer.process("#{post.topic_id} why did it happen?")
|
||||
|
||||
expect(info).not_to include(post.raw)
|
||||
|
||||
expect(summarizer.custom_raw).to eq(I18n.t("discourse_ai.ai_bot.topic_not_found"))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue