Mirror of https://github.com/discourse/discourse-ai.git, synced 2025-11-11 03:39:47 +00:00
This commit introduces a new Forum Researcher persona, specialized in deep forum content analysis, along with improvements to our AI infrastructure.
Key additions:
New Forum Researcher persona with advanced filtering and analysis capabilities
Robust filtering system supporting tags, categories, dates, users, and keywords
LLM formatter to efficiently process and chunk research results
Infrastructure improvements:
Implemented a CancelManager class to centrally manage AI completion cancellations
Replaced callback-based cancellation with a more robust, centralized pattern
Added systematic cancellation monitoring via callbacks (see the sketch after this list)
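
The CancelManager source is not included in this excerpt, so the following is only a minimal sketch of the pattern described above: one object owns the cancelled flag and notifies registered monitors. Every class and method name here is an assumption for illustration, not the plugin's actual API.

# Hypothetical illustration only: names and signatures are assumptions,
# not the plugin's actual CancelManager API.
class CancelManager
  def initialize
    @cancelled = false
    @callbacks = []
  end

  def cancelled?
    @cancelled
  end

  # Register a monitoring callback to run when cancellation happens.
  def add_callback(&block)
    @callbacks << block
  end

  # Flip the flag once and notify every registered monitor.
  def cancel!
    return if @cancelled
    @cancelled = true
    @callbacks.each(&:call)
  end
end

# A long-running completion loop polls the manager instead of owning its
# own cancellation callback.
manager = CancelManager.new
manager.add_callback { puts "completion cancelled" }

10.times do |i|
  break if manager.cancelled?
  manager.cancel! if i == 3 # e.g. triggered from elsewhere in the app
end
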
Other improvements:
Added a configurable default_enabled flag to control which personas are enabled by default
Updated translation strings for the new researcher functionality
Added comprehensive specs for the new components
Renamed the existing Researcher persona to Web Researcher
This change makes our AI platform more stable while adding powerful research capabilities that can analyze forum trends and surface relevant content.
206 lines · 6.8 KiB · Ruby
# frozen_string_literal: true

module DiscourseAi
  module Utils
    module Research
      class LlmFormatter
        def initialize(filter, max_tokens_per_batch:, tokenizer:, max_tokens_per_post:)
          @filter = filter
          @max_tokens_per_batch = max_tokens_per_batch
          @tokenizer = tokenizer
          @max_tokens_per_post = max_tokens_per_post
          @to_process = filter_to_hash
        end

        def each_chunk
          return nil if @to_process.empty?

          result = { post_count: 0, topic_count: 0, text: +"" }
          estimated_tokens = 0

          @to_process.each do |topic_id, topic_data|
            topic = Topic.find_by(id: topic_id)
            next unless topic

            topic_text, topic_tokens, post_count = format_topic(topic, topic_data[:posts])

            # If this single topic exceeds our token limit and we haven't added anything yet,
            # we need to include at least this one topic (partial content)
            if estimated_tokens == 0 && topic_tokens > @max_tokens_per_batch
              offset = 0
              while offset < topic_text.length
                chunk = +""
                chunk_tokens = 0
                lines = topic_text[offset..].lines
                lines.each do |line|
                  line_tokens = estimate_tokens(line)
                  break if chunk_tokens + line_tokens > @max_tokens_per_batch
                  chunk << line
                  chunk_tokens += line_tokens
                end
                break if chunk.empty?
                yield(
                  {
                    text: chunk,
                    post_count: post_count, # This may overcount if split mid-topic, but preserves original logic
                    topic_count: 1,
                  }
                )
                offset += chunk.length
              end

              next
            end

            # If adding this topic would exceed our token limit and we already have content, skip it
            if estimated_tokens > 0 && estimated_tokens + topic_tokens > @max_tokens_per_batch
              yield result if result[:text].present?
              estimated_tokens = 0
              result = { post_count: 0, topic_count: 0, text: +"" }
            else
              # Add this topic to the result
              result[:text] << topic_text
              result[:post_count] += post_count
              result[:topic_count] += 1
              estimated_tokens += topic_tokens
            end
          end

          yield result if result[:text].present?

          @to_process.clear
        end

        private

        def filter_to_hash
          hash = {}
          @filter
            .search
            .pluck(:topic_id, :id, :post_number)
            .each do |topic_id, post_id, post_number|
              hash[topic_id] ||= { posts: [] }
              hash[topic_id][:posts] << [post_id, post_number]
            end

          hash.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
          hash
        end

        def format_topic(topic, posts_data)
          text = ""
          total_tokens = 0
          post_count = 0

          # Add topic header
          text += format_topic_header(topic)
          total_tokens += estimate_tokens(text)

          # Get all post numbers in this topic
          all_post_numbers = topic.posts.pluck(:post_number).sort

          # Format posts with omitted information
          first_post_number = posts_data.first[1]
          last_post_number = posts_data.last[1]

          # Handle posts before our selection
          if first_post_number > 1
            omitted_before = first_post_number - 1
            text += format_omitted_posts(omitted_before, "before")
            total_tokens += estimate_tokens(format_omitted_posts(omitted_before, "before"))
          end

          # Format each post
          posts_data.each do |post_id, post_number|
            post = Post.find_by(id: post_id)
            next unless post

            text += format_post(post)
            total_tokens += estimate_tokens(format_post(post))
            post_count += 1
          end

          # Handle posts after our selection
          if last_post_number < all_post_numbers.last
            omitted_after = all_post_numbers.last - last_post_number
            text += format_omitted_posts(omitted_after, "after")
            total_tokens += estimate_tokens(format_omitted_posts(omitted_after, "after"))
          end

          [text, total_tokens, post_count]
        end

        def format_topic_header(topic)
          header = +"# #{topic.title}\n"

          # Add category
          header << "Category: #{topic.category.name}\n" if topic.category

          # Add tags
          header << "Tags: #{topic.tags.map(&:name).join(", ")}\n" if topic.tags.present?

          # Add creation date
          header << "Created: #{format_date(topic.created_at)}\n"
          header << "Topic url: /t/#{topic.id}\n"
          header << "Status: #{format_topic_status(topic)}\n\n"

          header
        end

        def format_topic_status(topic)
          solved = topic.respond_to?(:solved) && topic.solved.present?
          solved_text = solved ? " (solved)" : ""
          if topic.archived?
            "Archived#{solved_text}"
          elsif topic.closed?
            "Closed#{solved_text}"
          else
            "Open#{solved_text}"
          end
        end

        def format_post(post)
          text = +"---\n"
          text << "## Post by #{post.user&.username} - #{format_date(post.created_at)}\n\n"
          text << "#{truncate_if_needed(post.raw)}\n"
          text << "Likes: #{post.like_count}\n" if post.like_count.to_i > 0
          text << "Post url: /t/-/#{post.topic_id}/#{post.post_number}\n\n"
          text
        end

        def truncate_if_needed(content)
          tokens_count = estimate_tokens(content)

          return content if tokens_count <= @max_tokens_per_post

          half_limit = @max_tokens_per_post / 2
          token_ids = @tokenizer.encode(content)

          first_half_ids = token_ids[0...half_limit]
          last_half_ids = token_ids[-half_limit..-1]

          first_text = @tokenizer.decode(first_half_ids)
          last_text = @tokenizer.decode(last_half_ids)

          "#{first_text}\n\n... elided #{tokens_count - @max_tokens_per_post} tokens ...\n\n#{last_text}"
        end

        def format_omitted_posts(count, position)
          if position == "before"
            "#{count} earlier #{count == 1 ? "post" : "posts"} omitted\n\n"
          else
            "#{count} later #{count == 1 ? "post" : "posts"} omitted\n\n"
          end
        end

        def format_date(date)
          date.strftime("%Y-%m-%d %H:%M")
        end

        def estimate_tokens(text)
          @tokenizer.tokenize(text).length
        end
      end
    end
  end
end
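
For reference, a minimal sketch of how the formatter above might be driven, based only on the constructor and each_chunk shown in this file. The filter and tokenizer objects are stand-ins chosen so the sketch is self-contained, not the plugin's real filter or tokenizer classes; in practice the plugin would pass its own research filter and tokenizer here.

# Stand-in filter: LlmFormatter only calls filter.search.pluck(:topic_id, :id, :post_number),
# so anything responding to #search with a Post relation works for this sketch.
filter = Struct.new(:relation) { def search = relation }.new(Post.limit(200))

# Naive word-level stand-in tokenizer implementing the three methods the
# formatter uses (#tokenize, #encode, #decode); a real tokenizer would be used instead.
naive_tokenizer =
  Class.new do
    def tokenize(text) = text.split
    def encode(text) = text.split
    def decode(tokens) = tokens.join(" ")
  end.new

formatter =
  DiscourseAi::Utils::Research::LlmFormatter.new(
    filter,
    max_tokens_per_batch: 20_000,
    tokenizer: naive_tokenizer,
    max_tokens_per_post: 2_000,
  )

# each_chunk yields hashes shaped like { text:, post_count:, topic_count: },
# each kept within max_tokens_per_batch where possible.
formatter.each_chunk do |chunk|
  puts "#{chunk[:topic_count]} topics / #{chunk[:post_count]} posts / #{chunk[:text].length} chars"
  # chunk[:text] is the batch that would be handed to the LLM
end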