Mirror of https://github.com/discourse/discourse-ai.git, synced 2025-07-01 20:12:15 +00:00
This commit introduces a new Forum Researcher persona specialized in deep forum content analysis, along with comprehensive improvements to our AI infrastructure.

Key additions:
- New Forum Researcher persona with advanced filtering and analysis capabilities
- Robust filtering system supporting tags, categories, dates, users, and keywords
- LLM formatter to efficiently process and chunk research results

Infrastructure improvements:
- Implemented CancelManager class to centrally manage AI completion cancellations
- Replaced callback-based cancellation with a more robust pattern
- Added systematic cancellation monitoring with callbacks

Other improvements:
- Added configurable default_enabled flag to control which personas are enabled by default
- Updated translation strings for the new researcher functionality
- Added comprehensive specs for the new components
- Renames Researcher -> Web Researcher

This change makes our AI platform more stable while adding powerful research capabilities that can analyze forum trends and surface relevant content.
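As a rough illustration of the formatter in the file below, it could be driven along these lines. This is a sketch only: research_filter, tokenizer, and process_with_llm are placeholders rather than part of this commit; the filter just needs a #search method returning the posts to analyze (with topic_id, id and post_number), and the tokenizer must respond to #tokenize, #encode and #decode, as the class expects.

formatter =
  DiscourseAi::Utils::Research::LlmFormatter.new(
    research_filter, # placeholder: any object whose #search returns the posts to analyze
    max_tokens_per_batch: 20_000,
    tokenizer: tokenizer, # placeholder: must implement #tokenize, #encode, #decode
    max_tokens_per_post: 2_000,
  )

formatter.each_chunk do |chunk|
  # Each yielded chunk stays within max_tokens_per_batch and reports its coverage.
  process_with_llm(chunk[:text], posts: chunk[:post_count], topics: chunk[:topic_count])
end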
206 lines · 6.8 KiB · Ruby
# frozen_string_literal: true

module DiscourseAi
  module Utils
    module Research
      class LlmFormatter
        def initialize(filter, max_tokens_per_batch:, tokenizer:, max_tokens_per_post:)
          @filter = filter
          @max_tokens_per_batch = max_tokens_per_batch
          @tokenizer = tokenizer
          @max_tokens_per_post = max_tokens_per_post
          @to_process = filter_to_hash
        end

        def each_chunk
          return nil if @to_process.empty?

          result = { post_count: 0, topic_count: 0, text: +"" }
          estimated_tokens = 0

          @to_process.each do |topic_id, topic_data|
            topic = Topic.find_by(id: topic_id)
            next unless topic

            topic_text, topic_tokens, post_count = format_topic(topic, topic_data[:posts])

            # If this single topic exceeds our token limit and we haven't added anything yet,
            # we need to include at least this one topic (partial content)
            if estimated_tokens == 0 && topic_tokens > @max_tokens_per_batch
              offset = 0
              while offset < topic_text.length
                chunk = +""
                chunk_tokens = 0
                lines = topic_text[offset..].lines
                lines.each do |line|
                  line_tokens = estimate_tokens(line)
                  break if chunk_tokens + line_tokens > @max_tokens_per_batch
                  chunk << line
                  chunk_tokens += line_tokens
                end
                break if chunk.empty?
                yield(
                  {
                    text: chunk,
                    post_count: post_count, # This may overcount if split mid-topic, but preserves original logic
                    topic_count: 1,
                  }
                )
                offset += chunk.length
              end

              next
            end

            # If adding this topic would exceed our token limit and we already have content, skip it
            if estimated_tokens > 0 && estimated_tokens + topic_tokens > @max_tokens_per_batch
              yield result if result[:text].present?
              estimated_tokens = 0
              result = { post_count: 0, topic_count: 0, text: +"" }
            else
              # Add this topic to the result
              result[:text] << topic_text
              result[:post_count] += post_count
              result[:topic_count] += 1
              estimated_tokens += topic_tokens
            end
          end

          yield result if result[:text].present?

          @to_process.clear
        end

        private

        def filter_to_hash
          hash = {}
          @filter
            .search
            .pluck(:topic_id, :id, :post_number)
            .each do |topic_id, post_id, post_number|
              hash[topic_id] ||= { posts: [] }
              hash[topic_id][:posts] << [post_id, post_number]
            end

          hash.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
          hash
        end

        def format_topic(topic, posts_data)
          text = ""
          total_tokens = 0
          post_count = 0

          # Add topic header
          text += format_topic_header(topic)
          total_tokens += estimate_tokens(text)

          # Get all post numbers in this topic
          all_post_numbers = topic.posts.pluck(:post_number).sort

          # Format posts with omitted information
          first_post_number = posts_data.first[1]
          last_post_number = posts_data.last[1]

          # Handle posts before our selection
          if first_post_number > 1
            omitted_before = first_post_number - 1
            text += format_omitted_posts(omitted_before, "before")
            total_tokens += estimate_tokens(format_omitted_posts(omitted_before, "before"))
          end

          # Format each post
          posts_data.each do |post_id, post_number|
            post = Post.find_by(id: post_id)
            next unless post

            text += format_post(post)
            total_tokens += estimate_tokens(format_post(post))
            post_count += 1
          end

          # Handle posts after our selection
          if last_post_number < all_post_numbers.last
            omitted_after = all_post_numbers.last - last_post_number
            text += format_omitted_posts(omitted_after, "after")
            total_tokens += estimate_tokens(format_omitted_posts(omitted_after, "after"))
          end

          [text, total_tokens, post_count]
        end

        def format_topic_header(topic)
          header = +"# #{topic.title}\n"

          # Add category
          header << "Category: #{topic.category.name}\n" if topic.category

          # Add tags
          header << "Tags: #{topic.tags.map(&:name).join(", ")}\n" if topic.tags.present?

          # Add creation date
          header << "Created: #{format_date(topic.created_at)}\n"
          header << "Topic url: /t/#{topic.id}\n"
          header << "Status: #{format_topic_status(topic)}\n\n"

          header
        end

        def format_topic_status(topic)
          solved = topic.respond_to?(:solved) && topic.solved.present?
          solved_text = solved ? " (solved)" : ""
          if topic.archived?
            "Archived#{solved_text}"
          elsif topic.closed?
            "Closed#{solved_text}"
          else
            "Open#{solved_text}"
          end
        end

        def format_post(post)
          text = +"---\n"
          text << "## Post by #{post.user&.username} - #{format_date(post.created_at)}\n\n"
          text << "#{truncate_if_needed(post.raw)}\n"
          text << "Likes: #{post.like_count}\n" if post.like_count.to_i > 0
          text << "Post url: /t/-/#{post.topic_id}/#{post.post_number}\n\n"
          text
        end

        def truncate_if_needed(content)
          tokens_count = estimate_tokens(content)

          return content if tokens_count <= @max_tokens_per_post

          half_limit = @max_tokens_per_post / 2
          token_ids = @tokenizer.encode(content)

          first_half_ids = token_ids[0...half_limit]
          last_half_ids = token_ids[-half_limit..-1]

          first_text = @tokenizer.decode(first_half_ids)
          last_text = @tokenizer.decode(last_half_ids)

          "#{first_text}\n\n... elided #{tokens_count - @max_tokens_per_post} tokens ...\n\n#{last_text}"
        end

        def format_omitted_posts(count, position)
          if position == "before"
            "#{count} earlier #{count == 1 ? "post" : "posts"} omitted\n\n"
          else
            "#{count} later #{count == 1 ? "post" : "posts"} omitted\n\n"
          end
        end

        def format_date(date)
          date.strftime("%Y-%m-%d %H:%M")
        end

        def estimate_tokens(text)
          @tokenizer.tokenize(text).length
        end
      end
    end
  end
end