discourse-ai/lib/ai_bot/tools/web_browser.rb

107 lines
2.7 KiB
Ruby

# frozen_string_literal: true
module DiscourseAi
module AiBot
module Tools
class WebBrowser < Tool
def self.signature
{
name: name,
description:
"Visits a web page, retrieves the HTML content, extracts the main content, converts it to plain text, and returns the result.",
parameters: [
{
name: "url",
description: "The URL of the web page to visit.",
required: true,
type: "string",
},
],
}
end
def self.name
"web_browser"
end
def url
return @url if defined?(@url)
@url = parameters[:url]
@url = "https://#{@url}" if !@url.start_with?("http")
@url
end
def invoke(_bot_user, llm)
send_http_request(url, follow_redirects: true) do |response|
if response.code == "200"
html = read_response_body(response)
text = extract_main_content(html)
text = truncate(text, max_length: 50_000, percent_length: 0.3, llm: llm)
return { url: response.uri.to_s, text: text.strip }
else
return { url: url, error: "Failed to retrieve the web page: #{response.code}" }
end
end
{ url: url, error: "Failed to retrieve the web page" }
end
def description_args
{ url: url }
end
private
def extract_main_content(html)
doc = Nokogiri.HTML(html)
doc.search("script, style, comment").remove
main_content = find_main_content(doc)
main_content ||= doc.at("body")
buffer = +""
nodes_to_text(main_content, buffer)
buffer.gsub(/\s+/, " ")
end
def nodes_to_text(nodes, buffer)
if nodes.text?
buffer << nodes.text
buffer << " "
return
end
nodes.children.each do |node|
case node.name
when "text"
buffer << node.text
buffer << " "
when "br"
buffer << "\n"
when "a"
nodes_to_text(node, buffer)
buffer << " [#{node["href"]}] "
else
nodes_to_text(node, buffer)
end
end
end
def find_main_content(doc)
[
doc.at("article"),
doc.at("main"),
doc.at("[role='main']"),
doc.at("#main"),
doc.at(".main"),
doc.at("#content"),
doc.at(".content"),
].find(&:present?)
end
end
end
end
end