DEV: Cache summarization strategy results. (#22230)
Updates the interface for implementing summarization strategies and adds a cache layer to summarize topics once. The cache stores the final summary and each chunk used to build it, which will be useful when we have to extend or rebuild it.
This commit is contained in:
parent
a909dffe8f
commit
f4e7a80600
|
@ -4,6 +4,7 @@ import { ajax } from "discourse/lib/ajax";
|
|||
import { popupAjaxError } from "discourse/lib/ajax-error";
|
||||
import { action } from "@ember/object";
|
||||
import { schedule } from "@ember/runloop";
|
||||
import { cookAsync } from "discourse/lib/text";
|
||||
|
||||
export default class TopicSummary extends Component {
|
||||
@tracked loading = false;
|
||||
|
@ -16,7 +17,9 @@ export default class TopicSummary extends Component {
|
|||
|
||||
ajax(`/t/${this.args.topicId}/strategy-summary`)
|
||||
.then((data) => {
|
||||
this.summary = data.summary;
|
||||
cookAsync(data.summary).then((cooked) => {
|
||||
this.summary = cooked;
|
||||
});
|
||||
})
|
||||
.catch(popupAjaxError)
|
||||
.finally(() => (this.loading = false));
|
||||
|
|
|
@ -1178,18 +1178,7 @@ class TopicsController < ApplicationController
|
|||
|
||||
RateLimiter.new(current_user, "summary", 6, 5.minutes).performed!
|
||||
|
||||
hijack do
|
||||
summary_opts = {
|
||||
filter: "summary",
|
||||
exclude_deleted_users: true,
|
||||
exclude_hidden: true,
|
||||
show_deleted: false,
|
||||
}
|
||||
|
||||
content = TopicView.new(topic, current_user, summary_opts).posts.pluck(:raw).join("\n")
|
||||
|
||||
render json: { summary: strategy.summarize(content) }
|
||||
end
|
||||
hijack { render json: { summary: TopicSummarization.new(strategy).summarize(topic) } }
|
||||
end
|
||||
|
||||
private
|
||||
|
|
|
@ -437,7 +437,7 @@ class Post < ActiveRecord::Base
|
|||
# percent rank has tons of ties
|
||||
where(topic_id: topic_id).where(
|
||||
[
|
||||
"id = ANY(
|
||||
"posts.id = ANY(
|
||||
(
|
||||
SELECT posts.id
|
||||
FROM posts
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
class SummarySection < ActiveRecord::Base
|
||||
belongs_to :target, polymorphic: true
|
||||
end
|
||||
|
||||
# == Schema Information
|
||||
#
|
||||
# Table name: summary_sections
|
||||
#
|
||||
# id :bigint not null, primary key
|
||||
# target_id :integer not null
|
||||
# target_type :string not null
|
||||
# content_range :int4range
|
||||
# summarized_text :string not null
|
||||
# meta_section_id :integer
|
||||
# original_content_sha :string not null
|
||||
# algorithm :string not null
|
||||
# created_at :datetime not null
|
||||
# updated_at :datetime not null
|
||||
#
|
|
@ -0,0 +1,90 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
class TopicSummarization
|
||||
def initialize(strategy)
|
||||
@strategy = strategy
|
||||
end
|
||||
|
||||
def summarize(topic)
|
||||
DistributedMutex.synchronize("toppic_summarization_#{topic.id}") do
|
||||
existing_summary = SummarySection.find_by(target: topic, meta_section_id: nil)
|
||||
return existing_summary.summarized_text if existing_summary
|
||||
|
||||
content = {
|
||||
resource_path: "#{Discourse.base_path}/t/-/#{topic.id}",
|
||||
content_title: topic.title,
|
||||
contents: [],
|
||||
}
|
||||
|
||||
targets_data = summary_targets(topic).pluck(:post_number, :raw, :username)
|
||||
|
||||
targets_data.map do |(pn, raw, username)|
|
||||
content[:contents] << { poster: username, id: pn, text: raw }
|
||||
end
|
||||
|
||||
summarization_result = strategy.summarize(content)
|
||||
|
||||
cached_summary(summarization_result, targets_data.map(&:first), topic)
|
||||
|
||||
summarization_result[:summary]
|
||||
end
|
||||
end
|
||||
|
||||
def summary_targets(topic)
|
||||
topic.has_summary? ? best_replies(topic) : pick_selection(topic)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
attr_reader :strategy
|
||||
|
||||
def best_replies(topic)
|
||||
Post
|
||||
.summary(topic.id)
|
||||
.where("post_type = ?", Post.types[:regular])
|
||||
.where("NOT hidden")
|
||||
.joins(:user)
|
||||
.order(:post_number)
|
||||
end
|
||||
|
||||
def pick_selection(topic)
|
||||
posts =
|
||||
Post
|
||||
.where(topic_id: topic.id)
|
||||
.where("post_type = ?", Post.types[:regular])
|
||||
.where("NOT hidden")
|
||||
.order(:post_number)
|
||||
|
||||
post_numbers = posts.limit(5).pluck(:post_number)
|
||||
post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
|
||||
post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
|
||||
|
||||
Post
|
||||
.where(topic_id: topic.id)
|
||||
.joins(:user)
|
||||
.where("post_number in (?)", post_numbers)
|
||||
.order(:post_number)
|
||||
end
|
||||
|
||||
def cached_summary(result, post_numbers, topic)
|
||||
main_summary =
|
||||
SummarySection.create!(
|
||||
target: topic,
|
||||
algorithm: strategy.model,
|
||||
content_range: (post_numbers.first..post_numbers.last),
|
||||
summarized_text: result[:summary],
|
||||
original_content_sha: Digest::SHA256.hexdigest(post_numbers.join),
|
||||
)
|
||||
|
||||
result[:chunks].each do |chunk|
|
||||
SummarySection.create!(
|
||||
target: topic,
|
||||
algorithm: strategy.model,
|
||||
content_range: chunk[:ids].min..chunk[:ids].max,
|
||||
summarized_text: chunk[:summary],
|
||||
original_content_sha: Digest::SHA256.hexdigest(chunk[:ids].join),
|
||||
meta_section_id: main_summary.id,
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,16 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
class CreateSummarySectionsTable < ActiveRecord::Migration[7.0]
|
||||
def change
|
||||
create_table :summary_sections do |t|
|
||||
t.integer :target_id, null: false
|
||||
t.string :target_type, null: false
|
||||
t.int4range :content_range
|
||||
t.string :summarized_text, null: false
|
||||
t.integer :meta_section_id
|
||||
t.string :original_content_sha, null: false
|
||||
t.string :algorithm, null: false
|
||||
t.timestamps
|
||||
end
|
||||
end
|
||||
end
|
|
@ -1,27 +1,28 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
# Base class that defines the interface that every summarization
|
||||
# strategy must implement.
|
||||
# Above each method, you'll find an explanation of what
|
||||
# it does and what it should return.
|
||||
|
||||
module Summarization
|
||||
class Base
|
||||
def self.available_strategies
|
||||
DiscoursePluginRegistry.summarization_strategies
|
||||
class << self
|
||||
def available_strategies
|
||||
DiscoursePluginRegistry.summarization_strategies
|
||||
end
|
||||
|
||||
def find_strategy(strategy_model)
|
||||
available_strategies.detect { |s| s.model == strategy_model }
|
||||
end
|
||||
|
||||
def selected_strategy
|
||||
return if SiteSetting.summarization_strategy.blank?
|
||||
|
||||
find_strategy(SiteSetting.summarization_strategy)
|
||||
end
|
||||
end
|
||||
|
||||
def self.find_strategy(strategy_model)
|
||||
available_strategies.detect { |s| s.model == strategy_model }
|
||||
end
|
||||
|
||||
def self.selected_strategy
|
||||
return if SiteSetting.summarization_strategy.blank?
|
||||
|
||||
find_strategy(SiteSetting.summarization_strategy)
|
||||
end
|
||||
|
||||
def initialize(model)
|
||||
@model = model
|
||||
end
|
||||
|
||||
attr_reader :model
|
||||
|
||||
def can_request_summaries?(user)
|
||||
user_group_ids = user.group_ids
|
||||
|
||||
|
@ -30,20 +31,52 @@ module Summarization
|
|||
end
|
||||
end
|
||||
|
||||
# Some strategies could require other conditions to work correctly,
|
||||
# like site settings.
|
||||
# This method gets called when admins attempt to select it,
|
||||
# checking if we met those conditions.
|
||||
def correctly_configured?
|
||||
raise NotImplemented
|
||||
end
|
||||
|
||||
# Strategy name to display to admins in the available strategies dropdown.
|
||||
def display_name
|
||||
raise NotImplemented
|
||||
end
|
||||
|
||||
# If we don't meet the conditions to enable this strategy,
|
||||
# we'll display this hint as an error to admins.
|
||||
def configuration_hint
|
||||
raise NotImplemented
|
||||
end
|
||||
|
||||
# The idea behind this method is "give me a collection of texts,
|
||||
# and I'll handle the summarization to the best of my capabilities.".
|
||||
# It's important to emphasize the "collection of texts" part, which implies
|
||||
# it's not tied to any model and expects the "content" to be a hash instead.
|
||||
#
|
||||
# @param content { Hash } - Includes the content to summarize, plus additional
|
||||
# context to help the strategy produce a better result. Keys present in the content hash:
|
||||
# - resource_path (optional): Helps the strategy build links to the content in the summary (e.g. "/t/-/:topic_id/POST_NUMBER")
|
||||
# - content_title (optional): Provides guidance about what the content is about.
|
||||
# - contents (required): Array of hashes with content to summarize (e.g. [{ poster: "asd", id: 1, text: "This is a text" }])
|
||||
# All keys are required.
|
||||
#
|
||||
# @returns { Hash } - The summarized content, plus chunks if the content couldn't be summarized in one pass. Example:
|
||||
# {
|
||||
# summary: "This is the final summary",
|
||||
# chunks: [
|
||||
# { ids: [topic.first_post.post_number], summary: "this is the first chunk" },
|
||||
# { ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" },
|
||||
# ],
|
||||
# }
|
||||
def summarize(content)
|
||||
raise NotImplemented
|
||||
end
|
||||
|
||||
# Returns the string we'll store in the selected strategy site setting.
|
||||
def model
|
||||
raise NotImplemented
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -27,7 +27,7 @@ class Chat::Api::SummariesController < Chat::ApiController
|
|||
.map { "#{_1}: #{_2}" }
|
||||
.join("\n")
|
||||
|
||||
render json: { summary: strategy.summarize(content) }
|
||||
render json: { summary: strategy.summarize(content).dig(:summary) }
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -11,10 +11,12 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t
|
|||
|
||||
let(:chat) { PageObjects::Pages::Chat.new }
|
||||
|
||||
let(:summarization_result) { { summary: "This is a summary", chunks: [] } }
|
||||
|
||||
before do
|
||||
group.add(current_user)
|
||||
|
||||
strategy = DummyCustomSummarization.new("dummy")
|
||||
strategy = DummyCustomSummarization.new(summarization_result)
|
||||
plugin.register_summarization_strategy(strategy)
|
||||
SiteSetting.summarization_strategy = strategy.model
|
||||
SiteSetting.custom_summarization_allowed_groups = group.id.to_s
|
||||
|
@ -36,6 +38,6 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t
|
|||
find(".summarization-since").click
|
||||
find(".select-kit-row[data-value=\"3\"]").click
|
||||
|
||||
expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE)
|
||||
expect(find(".summary-area").text).to eq(summarization_result[:summary])
|
||||
end
|
||||
end
|
||||
|
|
|
@ -10,13 +10,13 @@ describe Summarization::Base do
|
|||
it "returns true if the user group is present in the custom_summarization_allowed_groups_map setting" do
|
||||
SiteSetting.custom_summarization_allowed_groups = group.id
|
||||
|
||||
expect(described_class.new(nil).can_request_summaries?(user)).to eq(true)
|
||||
expect(subject.can_request_summaries?(user)).to eq(true)
|
||||
end
|
||||
|
||||
it "returns false if the user group is not present in the custom_summarization_allowed_groups_map setting" do
|
||||
SiteSetting.custom_summarization_allowed_groups = ""
|
||||
|
||||
expect(described_class.new(nil).can_request_summaries?(user)).to eq(false)
|
||||
expect(subject.can_request_summaries?(user)).to eq(false)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
describe TopicSummarization do
|
||||
fab!(:topic) { Fabricate(:topic) }
|
||||
fab!(:post_1) { Fabricate(:post, topic: topic) }
|
||||
fab!(:post_2) { Fabricate(:post, topic: topic) }
|
||||
|
||||
shared_examples "includes only public-visible topics" do
|
||||
subject { described_class.new(DummyCustomSummarization.new({})) }
|
||||
|
||||
it "only includes visible posts" do
|
||||
topic.first_post.update!(hidden: true)
|
||||
|
||||
posts = subject.summary_targets(topic)
|
||||
|
||||
expect(posts.none?(&:hidden?)).to eq(true)
|
||||
end
|
||||
|
||||
it "doesn't include posts without users" do
|
||||
topic.first_post.user.destroy!
|
||||
|
||||
posts = subject.summary_targets(topic)
|
||||
|
||||
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
|
||||
end
|
||||
|
||||
it "doesn't include deleted posts" do
|
||||
topic.first_post.update!(user_id: nil)
|
||||
|
||||
posts = subject.summary_targets(topic)
|
||||
|
||||
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
|
||||
end
|
||||
end
|
||||
|
||||
describe "#summary_targets" do
|
||||
context "when the topic has a best replies summary" do
|
||||
before { topic.has_summary = true }
|
||||
|
||||
it_behaves_like "includes only public-visible topics"
|
||||
end
|
||||
|
||||
context "when the topic doesn't have a best replies summary" do
|
||||
before { topic.has_summary = false }
|
||||
|
||||
it_behaves_like "includes only public-visible topics"
|
||||
end
|
||||
end
|
||||
|
||||
describe "#summarize" do
|
||||
let(:strategy) { DummyCustomSummarization.new(summary) }
|
||||
|
||||
subject { described_class.new(strategy) }
|
||||
|
||||
def assert_summary_is_cached(topic, summary_response)
|
||||
cached_summary = SummarySection.find_by(target: topic, meta_section_id: nil)
|
||||
|
||||
expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number))
|
||||
expect(cached_summary.summarized_text).to eq(summary_response[:summary])
|
||||
expect(cached_summary.original_content_sha).to eq(
|
||||
Digest::SHA256.hexdigest(topic.posts.map(&:post_number).join),
|
||||
)
|
||||
expect(cached_summary.algorithm).to eq(strategy.model)
|
||||
end
|
||||
|
||||
def assert_chunk_is_cached(topic, chunk_response)
|
||||
cached_chunk =
|
||||
SummarySection
|
||||
.where.not(meta_section_id: nil)
|
||||
.find_by(
|
||||
target: topic,
|
||||
content_range: (chunk_response[:ids].min..chunk_response[:ids].max),
|
||||
)
|
||||
|
||||
expect(cached_chunk.summarized_text).to eq(chunk_response[:summary])
|
||||
expect(cached_chunk.original_content_sha).to eq(
|
||||
Digest::SHA256.hexdigest(chunk_response[:ids].join),
|
||||
)
|
||||
expect(cached_chunk.algorithm).to eq(strategy.model)
|
||||
end
|
||||
|
||||
context "when the content was summarized in a single chunk" do
|
||||
let(:summary) { { summary: "This is the final summary", chunks: [] } }
|
||||
|
||||
it "caches the summary" do
|
||||
summarized_text = subject.summarize(topic)
|
||||
|
||||
expect(summarized_text).to eq(summary[:summary])
|
||||
|
||||
assert_summary_is_cached(topic, summary)
|
||||
end
|
||||
|
||||
it "returns the cached version in subsequent calls" do
|
||||
subject.summarize(topic)
|
||||
|
||||
cached_summary_text = "This is a cached summary"
|
||||
cached_summary =
|
||||
SummarySection.find_by(target: topic, meta_section_id: nil).update!(
|
||||
summarized_text: cached_summary_text,
|
||||
)
|
||||
|
||||
summarized_text = subject.summarize(topic)
|
||||
expect(summarized_text).to eq(cached_summary_text)
|
||||
end
|
||||
end
|
||||
|
||||
context "when the content was summarized in multiple chunks" do
|
||||
let(:summary) do
|
||||
{
|
||||
summary: "This is the final summary",
|
||||
chunks: [
|
||||
{ ids: [topic.first_post.post_number], summary: "this is the first chunk" },
|
||||
{ ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" },
|
||||
],
|
||||
}
|
||||
end
|
||||
|
||||
it "caches the summary and each chunk" do
|
||||
summarized_text = subject.summarize(topic)
|
||||
|
||||
expect(summarized_text).to eq(summary[:summary])
|
||||
|
||||
assert_summary_is_cached(topic, summary)
|
||||
|
||||
summary[:chunks].each { |c| assert_chunk_is_cached(topic, c) }
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -1,7 +1,9 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
class DummyCustomSummarization < Summarization::Base
|
||||
RESPONSE = "This is a summary of the content you gave me"
|
||||
def initialize(summarization_result)
|
||||
@summarization_result = summarization_result
|
||||
end
|
||||
|
||||
def display_name
|
||||
"dummy"
|
||||
|
@ -15,7 +17,11 @@ class DummyCustomSummarization < Summarization::Base
|
|||
"hint"
|
||||
end
|
||||
|
||||
def model
|
||||
"dummy"
|
||||
end
|
||||
|
||||
def summarize(_content)
|
||||
RESPONSE
|
||||
@summarization_result
|
||||
end
|
||||
end
|
||||
|
|
|
@ -10,9 +10,11 @@ RSpec.describe "Topic summarization", type: :system, js: true do
|
|||
|
||||
let(:plugin) { Plugin::Instance.new }
|
||||
|
||||
let(:summarization_result) { { summary: "This is a summary", chunks: [] } }
|
||||
|
||||
before do
|
||||
sign_in(user)
|
||||
strategy = DummyCustomSummarization.new("dummy")
|
||||
strategy = DummyCustomSummarization.new(summarization_result)
|
||||
plugin.register_summarization_strategy(strategy)
|
||||
SiteSetting.summarization_strategy = strategy.model
|
||||
end
|
||||
|
@ -24,6 +26,6 @@ RSpec.describe "Topic summarization", type: :system, js: true do
|
|||
|
||||
expect(page.has_css?(".topic-summary-modal", wait: 5)).to eq(true)
|
||||
|
||||
expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE)
|
||||
expect(find(".summary-area").text).to eq(summarization_result[:summary])
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue