DEV: Cache summarization strategy results. (#22230)

Updates the interface for implementing summarization strategies and adds a cache layer to summarize topics once.

The cache stores the final summary and each chunk used to build it, which will be useful when we have to extend or rebuild it.
This commit is contained in:
Roman Rizzi 2023-06-27 11:44:34 -03:00 committed by GitHub
parent a909dffe8f
commit f4e7a80600
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 332 additions and 41 deletions

View File

@ -4,6 +4,7 @@ import { ajax } from "discourse/lib/ajax";
import { popupAjaxError } from "discourse/lib/ajax-error";
import { action } from "@ember/object";
import { schedule } from "@ember/runloop";
import { cookAsync } from "discourse/lib/text";
export default class TopicSummary extends Component {
@tracked loading = false;
@ -16,7 +17,9 @@ export default class TopicSummary extends Component {
ajax(`/t/${this.args.topicId}/strategy-summary`)
.then((data) => {
this.summary = data.summary;
cookAsync(data.summary).then((cooked) => {
this.summary = cooked;
});
})
.catch(popupAjaxError)
.finally(() => (this.loading = false));

View File

@ -1178,18 +1178,7 @@ class TopicsController < ApplicationController
RateLimiter.new(current_user, "summary", 6, 5.minutes).performed!
hijack do
summary_opts = {
filter: "summary",
exclude_deleted_users: true,
exclude_hidden: true,
show_deleted: false,
}
content = TopicView.new(topic, current_user, summary_opts).posts.pluck(:raw).join("\n")
render json: { summary: strategy.summarize(content) }
end
hijack { render json: { summary: TopicSummarization.new(strategy).summarize(topic) } }
end
private

View File

@ -437,7 +437,7 @@ class Post < ActiveRecord::Base
# percent rank has tons of ties
where(topic_id: topic_id).where(
[
"id = ANY(
"posts.id = ANY(
(
SELECT posts.id
FROM posts

View File

@ -0,0 +1,21 @@
# frozen_string_literal: true
class SummarySection < ActiveRecord::Base
belongs_to :target, polymorphic: true
end
# == Schema Information
#
# Table name: summary_sections
#
# id :bigint not null, primary key
# target_id :integer not null
# target_type :string not null
# content_range :int4range
# summarized_text :string not null
# meta_section_id :integer
# original_content_sha :string not null
# algorithm :string not null
# created_at :datetime not null
# updated_at :datetime not null
#

View File

@ -0,0 +1,90 @@
# frozen_string_literal: true
class TopicSummarization
def initialize(strategy)
@strategy = strategy
end
def summarize(topic)
DistributedMutex.synchronize("toppic_summarization_#{topic.id}") do
existing_summary = SummarySection.find_by(target: topic, meta_section_id: nil)
return existing_summary.summarized_text if existing_summary
content = {
resource_path: "#{Discourse.base_path}/t/-/#{topic.id}",
content_title: topic.title,
contents: [],
}
targets_data = summary_targets(topic).pluck(:post_number, :raw, :username)
targets_data.map do |(pn, raw, username)|
content[:contents] << { poster: username, id: pn, text: raw }
end
summarization_result = strategy.summarize(content)
cached_summary(summarization_result, targets_data.map(&:first), topic)
summarization_result[:summary]
end
end
def summary_targets(topic)
topic.has_summary? ? best_replies(topic) : pick_selection(topic)
end
private
attr_reader :strategy
def best_replies(topic)
Post
.summary(topic.id)
.where("post_type = ?", Post.types[:regular])
.where("NOT hidden")
.joins(:user)
.order(:post_number)
end
def pick_selection(topic)
posts =
Post
.where(topic_id: topic.id)
.where("post_type = ?", Post.types[:regular])
.where("NOT hidden")
.order(:post_number)
post_numbers = posts.limit(5).pluck(:post_number)
post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
Post
.where(topic_id: topic.id)
.joins(:user)
.where("post_number in (?)", post_numbers)
.order(:post_number)
end
def cached_summary(result, post_numbers, topic)
main_summary =
SummarySection.create!(
target: topic,
algorithm: strategy.model,
content_range: (post_numbers.first..post_numbers.last),
summarized_text: result[:summary],
original_content_sha: Digest::SHA256.hexdigest(post_numbers.join),
)
result[:chunks].each do |chunk|
SummarySection.create!(
target: topic,
algorithm: strategy.model,
content_range: chunk[:ids].min..chunk[:ids].max,
summarized_text: chunk[:summary],
original_content_sha: Digest::SHA256.hexdigest(chunk[:ids].join),
meta_section_id: main_summary.id,
)
end
end
end

View File

@ -0,0 +1,16 @@
# frozen_string_literal: true
class CreateSummarySectionsTable < ActiveRecord::Migration[7.0]
def change
create_table :summary_sections do |t|
t.integer :target_id, null: false
t.string :target_type, null: false
t.int4range :content_range
t.string :summarized_text, null: false
t.integer :meta_section_id
t.string :original_content_sha, null: false
t.string :algorithm, null: false
t.timestamps
end
end
end

View File

@ -1,27 +1,28 @@
# frozen_string_literal: true
# Base class that defines the interface that every summarization
# strategy must implement.
# Above each method, you'll find an explanation of what
# it does and what it should return.
module Summarization
class Base
def self.available_strategies
DiscoursePluginRegistry.summarization_strategies
class << self
def available_strategies
DiscoursePluginRegistry.summarization_strategies
end
def find_strategy(strategy_model)
available_strategies.detect { |s| s.model == strategy_model }
end
def selected_strategy
return if SiteSetting.summarization_strategy.blank?
find_strategy(SiteSetting.summarization_strategy)
end
end
def self.find_strategy(strategy_model)
available_strategies.detect { |s| s.model == strategy_model }
end
def self.selected_strategy
return if SiteSetting.summarization_strategy.blank?
find_strategy(SiteSetting.summarization_strategy)
end
def initialize(model)
@model = model
end
attr_reader :model
def can_request_summaries?(user)
user_group_ids = user.group_ids
@ -30,20 +31,52 @@ module Summarization
end
end
# Some strategies could require other conditions to work correctly,
# like site settings.
# This method gets called when admins attempt to select it,
# checking if we met those conditions.
def correctly_configured?
raise NotImplemented
end
# Strategy name to display to admins in the available strategies dropdown.
def display_name
raise NotImplemented
end
# If we don't meet the conditions to enable this strategy,
# we'll display this hint as an error to admins.
def configuration_hint
raise NotImplemented
end
# The idea behind this method is "give me a collection of texts,
# and I'll handle the summarization to the best of my capabilities.".
# It's important to emphasize the "collection of texts" part, which implies
# it's not tied to any model and expects the "content" to be a hash instead.
#
# @param content { Hash } - Includes the content to summarize, plus additional
# context to help the strategy produce a better result. Keys present in the content hash:
# - resource_path (optional): Helps the strategy build links to the content in the summary (e.g. "/t/-/:topic_id/POST_NUMBER")
# - content_title (optional): Provides guidance about what the content is about.
# - contents (required): Array of hashes with content to summarize (e.g. [{ poster: "asd", id: 1, text: "This is a text" }])
# All keys are required.
#
# @returns { Hash } - The summarized content, plus chunks if the content couldn't be summarized in one pass. Example:
# {
# summary: "This is the final summary",
# chunks: [
# { ids: [topic.first_post.post_number], summary: "this is the first chunk" },
# { ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" },
# ],
# }
def summarize(content)
raise NotImplemented
end
# Returns the string we'll store in the selected strategy site setting.
def model
raise NotImplemented
end
end
end

View File

@ -27,7 +27,7 @@ class Chat::Api::SummariesController < Chat::ApiController
.map { "#{_1}: #{_2}" }
.join("\n")
render json: { summary: strategy.summarize(content) }
render json: { summary: strategy.summarize(content).dig(:summary) }
end
end
end

View File

@ -11,10 +11,12 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t
let(:chat) { PageObjects::Pages::Chat.new }
let(:summarization_result) { { summary: "This is a summary", chunks: [] } }
before do
group.add(current_user)
strategy = DummyCustomSummarization.new("dummy")
strategy = DummyCustomSummarization.new(summarization_result)
plugin.register_summarization_strategy(strategy)
SiteSetting.summarization_strategy = strategy.model
SiteSetting.custom_summarization_allowed_groups = group.id.to_s
@ -36,6 +38,6 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t
find(".summarization-since").click
find(".select-kit-row[data-value=\"3\"]").click
expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE)
expect(find(".summary-area").text).to eq(summarization_result[:summary])
end
end

View File

@ -10,13 +10,13 @@ describe Summarization::Base do
it "returns true if the user group is present in the custom_summarization_allowed_groups_map setting" do
SiteSetting.custom_summarization_allowed_groups = group.id
expect(described_class.new(nil).can_request_summaries?(user)).to eq(true)
expect(subject.can_request_summaries?(user)).to eq(true)
end
it "returns false if the user group is not present in the custom_summarization_allowed_groups_map setting" do
SiteSetting.custom_summarization_allowed_groups = ""
expect(described_class.new(nil).can_request_summaries?(user)).to eq(false)
expect(subject.can_request_summaries?(user)).to eq(false)
end
end
end

View File

@ -0,0 +1,129 @@
# frozen_string_literal: true
describe TopicSummarization do
fab!(:topic) { Fabricate(:topic) }
fab!(:post_1) { Fabricate(:post, topic: topic) }
fab!(:post_2) { Fabricate(:post, topic: topic) }
shared_examples "includes only public-visible topics" do
subject { described_class.new(DummyCustomSummarization.new({})) }
it "only includes visible posts" do
topic.first_post.update!(hidden: true)
posts = subject.summary_targets(topic)
expect(posts.none?(&:hidden?)).to eq(true)
end
it "doesn't include posts without users" do
topic.first_post.user.destroy!
posts = subject.summary_targets(topic)
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
end
it "doesn't include deleted posts" do
topic.first_post.update!(user_id: nil)
posts = subject.summary_targets(topic)
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
end
end
describe "#summary_targets" do
context "when the topic has a best replies summary" do
before { topic.has_summary = true }
it_behaves_like "includes only public-visible topics"
end
context "when the topic doesn't have a best replies summary" do
before { topic.has_summary = false }
it_behaves_like "includes only public-visible topics"
end
end
describe "#summarize" do
let(:strategy) { DummyCustomSummarization.new(summary) }
subject { described_class.new(strategy) }
def assert_summary_is_cached(topic, summary_response)
cached_summary = SummarySection.find_by(target: topic, meta_section_id: nil)
expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number))
expect(cached_summary.summarized_text).to eq(summary_response[:summary])
expect(cached_summary.original_content_sha).to eq(
Digest::SHA256.hexdigest(topic.posts.map(&:post_number).join),
)
expect(cached_summary.algorithm).to eq(strategy.model)
end
def assert_chunk_is_cached(topic, chunk_response)
cached_chunk =
SummarySection
.where.not(meta_section_id: nil)
.find_by(
target: topic,
content_range: (chunk_response[:ids].min..chunk_response[:ids].max),
)
expect(cached_chunk.summarized_text).to eq(chunk_response[:summary])
expect(cached_chunk.original_content_sha).to eq(
Digest::SHA256.hexdigest(chunk_response[:ids].join),
)
expect(cached_chunk.algorithm).to eq(strategy.model)
end
context "when the content was summarized in a single chunk" do
let(:summary) { { summary: "This is the final summary", chunks: [] } }
it "caches the summary" do
summarized_text = subject.summarize(topic)
expect(summarized_text).to eq(summary[:summary])
assert_summary_is_cached(topic, summary)
end
it "returns the cached version in subsequent calls" do
subject.summarize(topic)
cached_summary_text = "This is a cached summary"
cached_summary =
SummarySection.find_by(target: topic, meta_section_id: nil).update!(
summarized_text: cached_summary_text,
)
summarized_text = subject.summarize(topic)
expect(summarized_text).to eq(cached_summary_text)
end
end
context "when the content was summarized in multiple chunks" do
let(:summary) do
{
summary: "This is the final summary",
chunks: [
{ ids: [topic.first_post.post_number], summary: "this is the first chunk" },
{ ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" },
],
}
end
it "caches the summary and each chunk" do
summarized_text = subject.summarize(topic)
expect(summarized_text).to eq(summary[:summary])
assert_summary_is_cached(topic, summary)
summary[:chunks].each { |c| assert_chunk_is_cached(topic, c) }
end
end
end
end

View File

@ -1,7 +1,9 @@
# frozen_string_literal: true
class DummyCustomSummarization < Summarization::Base
RESPONSE = "This is a summary of the content you gave me"
def initialize(summarization_result)
@summarization_result = summarization_result
end
def display_name
"dummy"
@ -15,7 +17,11 @@ class DummyCustomSummarization < Summarization::Base
"hint"
end
def model
"dummy"
end
def summarize(_content)
RESPONSE
@summarization_result
end
end

View File

@ -10,9 +10,11 @@ RSpec.describe "Topic summarization", type: :system, js: true do
let(:plugin) { Plugin::Instance.new }
let(:summarization_result) { { summary: "This is a summary", chunks: [] } }
before do
sign_in(user)
strategy = DummyCustomSummarization.new("dummy")
strategy = DummyCustomSummarization.new(summarization_result)
plugin.register_summarization_strategy(strategy)
SiteSetting.summarization_strategy = strategy.model
end
@ -24,6 +26,6 @@ RSpec.describe "Topic summarization", type: :system, js: true do
expect(page.has_css?(".topic-summary-modal", wait: 5)).to eq(true)
expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE)
expect(find(".summary-area").text).to eq(summarization_result[:summary])
end
end