discourse-ai/lib/utils/diff_utils/safety_checker.rb
Keegan George dfea784fc4
DEV: Improve diff streaming accuracy with safety checker (#1338)
This update adds a safety checker that scans streamed updates. It ensures that incomplete segments of text are not yet sent over the message bus, since sending them would break the diff streamer. It also updates the diff streamer to handle a thinking state while it waits for message bus updates.
2025-05-15 11:38:46 -07:00

92 lines
2.5 KiB
Ruby

# frozen_string_literal: true
require "cgi"
module DiscourseAi
  module Utils
    module DiffUtils
      # Heuristic gate that decides whether a partially received chunk of
      # text stops at a point where no Markdown/HTML construct is left
      # half-written.
      #
      # The input has its HTML tags stripped and its entities unescaped; the
      # resulting plain text is then run through a list of independent checks.
      # The text is considered safe only when none of the checks fires.
      #
      # @example
      #   SafetyChecker.safe_to_stream?("Some **bold** text.") # => true
      #   SafetyChecker.safe_to_stream?("Some **bold")         # => false
      class SafetyChecker
        # Predicates evaluated by {#safe?}, in order. Evaluation stops at the
        # first one that returns a truthy value.
        CHECKS = %i[
          unclosed_markdown_links?
          unclosed_raw_html_tag?
          trailing_incomplete_url?
          unclosed_backticks?
          unbalanced_bold_or_italic?
          incomplete_image_markdown?
          unbalanced_quote_blocks?
          unclosed_triple_backticks?
          partial_emoji?
        ].freeze

        # Convenience wrapper around `new(html_text).safe?`.
        #
        # @param html_text [String, nil] text (possibly containing HTML tags)
        # @return [Boolean] true when none of the checks fires
        def self.safe_to_stream?(html_text)
          new(html_text).safe?
        end

        # @param html_text [String, nil] text (possibly containing HTML tags);
        #   nil is treated like an empty string
        def initialize(html_text)
          @original_html = html_text
          @text = sanitize(html_text)
        end

        # @return [Boolean] true when no check in CHECKS reports a problem
        def safe?
          CHECKS.none? { |check| send(check) }
        end

        private

        # Strips anything that looks like an HTML tag and unescapes entities.
        # `to_s` keeps a nil input from raising; it behaves like "".
        def sanitize(html)
          text = html.to_s.gsub(%r{</?[^>]+>}, "") # remove tags like <span>, <del>, etc.
          CGI.unescapeHTML(text)
        end

        # True when the number of "[" differs from "]" or "(" differs from ")".
        # Every bracket/paren in the text is counted, not only those that are
        # part of a Markdown link.
        def unclosed_markdown_links?
          @text.count("[") != @text.count("]") || @text.count("(") != @text.count(")")
        end

        # True when the last "<" in the text has no ">" after it.
        def unclosed_raw_html_tag?
          last_lt = @text.rindex("<")
          return false unless last_lt

          last_gt = @text.rindex(">")
          last_gt.nil? || last_gt < last_lt
        end

        # True when the text ends in an http(s) URL that is not yet terminated
        # by closing punctuation.
        #
        # Only the run of non-whitespace characters touching the very end of
        # the text is inspected: once whitespace follows a URL the URL is
        # complete and must not be reported.
        def trailing_incomplete_url?
          tail = @text[/\S+\z/]
          return false unless tail

          tail.match?(%r{\Ahttps?://}) && !tail.match?(/[)\].,!?:;'"]\z/)
        end

        # True when the total number of backticks is odd.
        def unclosed_backticks?
          @text.count("`").odd?
        end

        # True when "**" pairs, single "*" markers, or underscores occur an
        # odd number of times.
        #
        # NOTE(review): every underscore is counted, including ones inside
        # words (snake_case identifiers, emoji names, URLs) — confirm this is
        # intended.
        def unbalanced_bold_or_italic?
          @text.scan(/\*\*/).count.odd? || @text.scan(/\*(?!\*)/).count.odd? ||
            @text.scan(/_/).count.odd?
        end

        # True when some line contains an image opener `![alt](` whose
        # parenthesis is not closed before the end of that line.
        #
        # A finished image followed by more text on the same line is NOT
        # reported; only a missing ")" after the opener counts.
        def incomplete_image_markdown?
          @text.match?(/!\[.*?\]\([^)\n]*$/)
        end

        # True when there are more `[quote]` / `[quote=...]` openers than
        # `[/quote]` closers (case-insensitive).
        def unbalanced_quote_blocks?
          opens = @text.scan(/\[quote(=.*?)?\]/i).count
          closes = @text.scan(%r{\[/quote\]}i).count
          opens > closes
        end

        # True when "```" occurs an odd number of times.
        def unclosed_triple_backticks?
          @text.scan(/```/).count.odd?
        end

        # True when a `:name` token is missing its closing colon. Complete
        # image markdown and http(s) URLs are removed first so the colons they
        # contain are not inspected.
        def partial_emoji?
          text = @text.gsub(/!\[.*?\]\(.*?\)/, "").gsub(%r{https?://[^\s]+}, "")
          text.scan(/:[a-z0-9_+\-\.]+:?/i).any? { |token| !token.end_with?(":") }
        end
      end
    end
  end
end