discourse-ai/lib/utils/research/llm_formatter.rb
Sam c34fcc8a95
FEATURE: forum researcher persona for deep research (#1313)
This commit introduces a new Forum Researcher persona specialized in deep forum content analysis, along with comprehensive improvements to our AI infrastructure.

Key additions:

    New Forum Researcher persona with advanced filtering and analysis capabilities
    Robust filtering system supporting tags, categories, dates, users, and keywords
    LLM formatter to efficiently process and chunk research results

Infrastructure improvements:

    Implemented CancelManager class to centrally manage AI completion cancellations
    Replaced callback-based cancellation with a more robust pattern
    Added systematic cancellation monitoring with callbacks

Other improvements:

    Added configurable default_enabled flag to control which personas are enabled by default
    Updated translation strings for the new researcher functionality
    Added comprehensive specs for the new components
    Renamed the existing Researcher persona to Web Researcher

This change makes our AI platform more stable while adding powerful research capabilities that can analyze forum trends and surface relevant content.
2025-05-14 12:36:16 +10:00

206 lines
6.8 KiB
Ruby
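
Before the source itself, a minimal sketch of how the formatter might be driven from a Rails console in a Discourse install. The Filter constructor, the tokenizer choice, and process_with_llm below are illustrative assumptions, not verified against this commit; what the formatter actually requires, per the code, is a filter whose #search relation can pluck (topic_id, id, post_number) tuples and a tokenizer responding to tokenize, encode, and decode.

# Illustrative sketch only; names marked below are assumptions
filter = DiscourseAi::Utils::Research::Filter.new("tag:performance after:2025-01-01") # assumed API

formatter =
  DiscourseAi::Utils::Research::LlmFormatter.new(
    filter, # any object whose #search relation can pluck(:topic_id, :id, :post_number)
    max_tokens_per_batch: 30_000, # token budget for each yielded chunk
    tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer, # assumed; needs tokenize/encode/decode
    max_tokens_per_post: 2_000, # longer posts are elided in the middle
  )

# Each yielded chunk is { text:, post_count:, topic_count: }, sized to the batch budget
formatter.each_chunk { |chunk| process_with_llm(chunk[:text]) } # process_with_llm is hypothetical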

# frozen_string_literal: true

module DiscourseAi
  module Utils
    module Research
      class LlmFormatter
        def initialize(filter, max_tokens_per_batch:, tokenizer:, max_tokens_per_post:)
          @filter = filter
          @max_tokens_per_batch = max_tokens_per_batch
          @tokenizer = tokenizer
          @max_tokens_per_post = max_tokens_per_post
          @to_process = filter_to_hash
        end
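
        # Yields hashes of { text:, post_count:, topic_count: }, each kept within
        # @max_tokens_per_batch. A single topic that exceeds the budget on its own
        # is emitted as partial content, split line by line across several chunks.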
        def each_chunk
          return nil if @to_process.empty?

          result = { post_count: 0, topic_count: 0, text: +"" }
          estimated_tokens = 0

          @to_process.each do |topic_id, topic_data|
            topic = Topic.find_by(id: topic_id)
            next unless topic

            topic_text, topic_tokens, post_count = format_topic(topic, topic_data[:posts])

            # If adding this topic would exceed our token limit and we already have
            # content, flush the current batch first so the topic is not dropped
            if estimated_tokens > 0 && estimated_tokens + topic_tokens > @max_tokens_per_batch
              yield result if result[:text].present?
              estimated_tokens = 0
              result = { post_count: 0, topic_count: 0, text: +"" }
            end

            # A topic that exceeds the token limit on its own is still included,
            # as partial content split across as many chunks as needed
            if topic_tokens > @max_tokens_per_batch
              offset = 0
              while offset < topic_text.length
                chunk = +""
                chunk_tokens = 0
                topic_text[offset..].lines.each do |line|
                  line_tokens = estimate_tokens(line)
                  break if chunk_tokens + line_tokens > @max_tokens_per_batch
                  chunk << line
                  chunk_tokens += line_tokens
                end
                break if chunk.empty?
                yield(
                  {
                    text: chunk,
                    post_count: post_count, # may overcount when a topic is split mid-chunk
                    topic_count: 1,
                  }
                )
                offset += chunk.length
              end
              next
            end

            # Otherwise accumulate this topic into the current batch
            result[:text] << topic_text
            result[:post_count] += post_count
            result[:topic_count] += 1
            estimated_tokens += topic_tokens
          end

          yield result if result[:text].present?
          @to_process.clear
        end

        private
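
        # Builds { topic_id => { posts: [[post_id, post_number], ...] } } from the
        # filter's search relation, with each topic's posts sorted by post number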
        def filter_to_hash
          hash = {}
          @filter
            .search
            .pluck(:topic_id, :id, :post_number)
            .each do |topic_id, post_id, post_number|
              hash[topic_id] ||= { posts: [] }
              hash[topic_id][:posts] << [post_id, post_number]
            end

          hash.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
          hash
        end
        def format_topic(topic, posts_data)
          text = +""
          total_tokens = 0
          post_count = 0

          # Topic header first
          text << format_topic_header(topic)
          total_tokens += estimate_tokens(text)

          # All post numbers in this topic, used to report omitted posts
          all_post_numbers = topic.posts.pluck(:post_number).sort

          first_post_number = posts_data.first[1]
          last_post_number = posts_data.last[1]

          # Note posts omitted before our selection
          if first_post_number > 1
            omitted = format_omitted_posts(first_post_number - 1, "before")
            text << omitted
            total_tokens += estimate_tokens(omitted)
          end

          # Format each selected post
          posts_data.each do |post_id, _post_number|
            post = Post.find_by(id: post_id)
            next unless post

            formatted = format_post(post)
            text << formatted
            total_tokens += estimate_tokens(formatted)
            post_count += 1
          end

          # Note posts omitted after our selection
          if last_post_number < all_post_numbers.last
            omitted = format_omitted_posts(all_post_numbers.last - last_post_number, "after")
            text << omitted
            total_tokens += estimate_tokens(omitted)
          end

          [text, total_tokens, post_count]
        end

        def format_topic_header(topic)
          header = +"# #{topic.title}\n"
          header << "Category: #{topic.category.name}\n" if topic.category
          header << "Tags: #{topic.tags.map(&:name).join(", ")}\n" if topic.tags.present?
          header << "Created: #{format_date(topic.created_at)}\n"
          header << "Topic url: /t/#{topic.id}\n"
          header << "Status: #{format_topic_status(topic)}\n\n"
          header
        end
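
        # Reports Open/Closed/Archived, adding a solved marker when a plugin such
        # as discourse-solved defines #solved on topics; the respond_to? guard
        # keeps this working when no such plugin is installed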
        def format_topic_status(topic)
          solved = topic.respond_to?(:solved) && topic.solved.present?
          solved_text = solved ? " (solved)" : ""
          if topic.archived?
            "Archived#{solved_text}"
          elsif topic.closed?
            "Closed#{solved_text}"
          else
            "Open#{solved_text}"
          end
        end

        def format_post(post)
          text = +"---\n"
          text << "## Post by #{post.user&.username} - #{format_date(post.created_at)}\n\n"
          text << "#{truncate_if_needed(post.raw)}\n"
          text << "Likes: #{post.like_count}\n" if post.like_count.to_i > 0
          text << "Post url: /t/-/#{post.topic_id}/#{post.post_number}\n\n"
          text
        end
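
        # Overlong posts keep their first and last halves (by token count) and
        # elide the middle, preserving both the opening context and the conclusion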
        def truncate_if_needed(content)
          tokens_count = estimate_tokens(content)
          return content if tokens_count <= @max_tokens_per_post

          half_limit = @max_tokens_per_post / 2
          token_ids = @tokenizer.encode(content)
          first_half_ids = token_ids[0...half_limit]
          last_half_ids = token_ids[-half_limit..-1]

          first_text = @tokenizer.decode(first_half_ids)
          last_text = @tokenizer.decode(last_half_ids)

          "#{first_text}\n\n... elided #{tokens_count - @max_tokens_per_post} tokens ...\n\n#{last_text}"
        end

        def format_omitted_posts(count, position)
          if position == "before"
            "#{count} earlier #{count == 1 ? "post" : "posts"} omitted\n\n"
          else
            "#{count} later #{count == 1 ? "post" : "posts"} omitted\n\n"
          end
        end

        def format_date(date)
          date.strftime("%Y-%m-%d %H:%M")
        end

        def estimate_tokens(text)
          @tokenizer.tokenize(text).length
        end
      end
    end
  end
end