Mirror of https://github.com/discourse/discourse-ai.git, synced 2025-02-06 19:48:15 +00:00
Commit fb81307c59
This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository:

1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text.
2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly by introducing size limits.
3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified and to read the response body in chunks, avoiding memory issues with large responses (currently only used by `WebBrowser`); a sketch of this pattern is shown below.
4. It updates the system prompt for the `Researcher` persona with more detailed guidance on when to use Google search versus web browsing, and on how to optimize tool usage and reduce redundant requests.
5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering scenarios such as handling different HTML structures and following redirects; a sketch of such a spec appears after the file listing below.
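For item 3, the `Tool#send_http_request` implementation itself does not appear on this page. The snippet below is a minimal, self-contained sketch of the redirect-following, chunk-reading pattern it describes, built on plain `Net::HTTP`; the method names mirror the ones in the description, but the size cap and redirect limit are illustrative assumptions rather than values taken from the repository.

require "net/http"
require "uri"

MAX_RESPONSE_BYTES = 4_000_000 # assumed cap, not taken from the repository
MAX_REDIRECTS = 10             # assumed cap, not taken from the repository

def send_http_request(url, follow_redirects: false, redirects_left: MAX_REDIRECTS, &blk)
  uri = URI.parse(url)

  Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
    http.request(Net::HTTP::Get.new(uri)) do |response|
      # Follow 3xx responses by re-issuing the request against the Location
      # header (a production version would also resolve relative locations).
      if follow_redirects && response.is_a?(Net::HTTPRedirection) && redirects_left > 0
        return send_http_request(
          response["location"],
          follow_redirects: true,
          redirects_left: redirects_left - 1,
          &blk
        )
      end

      # Hand the live response to the caller while the connection is still
      # open, so the body can be streamed in chunks.
      return blk.call(response)
    end
  end
end

def read_response_body(response)
  body = +""

  # Stream the body instead of loading it in one go, so a huge page cannot
  # exhaust memory; stop reading once the size limit is exceeded.
  response.read_body do |chunk|
    body << chunk
    break if body.bytesize > MAX_RESPONSE_BYTES
  end

  body
end

A caller would use it as `send_http_request(url, follow_redirects: true) { |response| read_response_body(response) }`, which is the shape `WebBrowser#invoke` in the listing below follows.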
107 lines · 2.7 KiB · Ruby
# frozen_string_literal: true

module DiscourseAi
  module AiBot
    module Tools
      class WebBrowser < Tool
        def self.signature
          {
            name: name,
            description:
              "Visits a web page, retrieves the HTML content, extracts the main content, converts it to plain text, and returns the result.",
            parameters: [
              {
                name: "url",
                description: "The URL of the web page to visit.",
                required: true,
                type: "string",
              },
            ],
          }
        end

        def self.name
          "web_browser"
        end

        def url
          return @url if defined?(@url)

          @url = parameters[:url]
          # Default to HTTPS when the model supplies a URL without a scheme.
          @url = "https://#{@url}" if !@url.start_with?("http")

          @url
        end

        def invoke(_bot_user, llm)
          send_http_request(url, follow_redirects: true) do |response|
            if response.code == "200"
              html = read_response_body(response)
              text = extract_main_content(html)
              # Trim the extracted text so it fits the LLM's context budget.
              text = truncate(text, max_length: 50_000, percent_length: 0.3, llm: llm)
              return { url: response.uri.to_s, text: text.strip }
            else
              return { url: url, error: "Failed to retrieve the web page: #{response.code}" }
            end
          end

          # Fallback if the request never yielded a response.
          { url: url, error: "Failed to retrieve the web page" }
        end
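        # Illustrative result shapes (example values, not from the source):
        #   success => { url: "https://example.com/", text: "Extracted page text..." }
        #   failure => { url: "https://example.com/", error: "Failed to retrieve the web page: 404" }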

        def description_args
          { url: url }
        end

        private

        def extract_main_content(html)
          doc = Nokogiri.HTML(html)
          # Remove markup that carries no readable text.
          doc.search("script, style, comment").remove

          main_content = find_main_content(doc)
          main_content ||= doc.at("body")

          buffer = +""
          nodes_to_text(main_content, buffer)

          # Collapse whitespace runs into single spaces.
          buffer.gsub(/\s+/, " ")
        end

        def nodes_to_text(nodes, buffer)
          if nodes.text?
            buffer << nodes.text
            buffer << " "
            return
          end

          nodes.children.each do |node|
            case node.name
            when "text"
              buffer << node.text
              buffer << " "
            when "br"
              buffer << "\n"
            when "a"
              # Keep the link target inline so the model can follow or cite it.
              nodes_to_text(node, buffer)
              buffer << " [#{node["href"]}] "
            else
              nodes_to_text(node, buffer)
            end
          end
        end

        def find_main_content(doc)
          # Prefer semantic containers, then fall back to common id/class conventions.
          [
            doc.at("article"),
            doc.at("main"),
            doc.at("[role='main']"),
            doc.at("#main"),
            doc.at(".main"),
            doc.at("#content"),
            doc.at(".content"),
          ].find(&:present?)
        end
      end
    end
  end
end
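The `web_browser_spec.rb` tests mentioned in item 5 are not reproduced on this page. Below is a sketch of what such a spec could look like, assuming the Discourse test environment with RSpec and WebMock; the `bot_user` and `llm` setup is simplified and illustrative rather than copied from the repository.

# frozen_string_literal: true

RSpec.describe DiscourseAi::AiBot::Tools::WebBrowser do
  # Simplified wiring; the real spec uses the repository's own helpers.
  let(:bot_user) { Fabricate(:user) }
  let(:llm) { DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4") } # illustrative model
  let(:url) { "https://example.com/article" }

  it "extracts the main content and drops scripts and styles" do
    html = <<~HTML
      <html><body>
        <script>ignored()</script>
        <article>Hello <a href="/x">world</a></article>
      </body></html>
    HTML
    stub_request(:get, url).to_return(status: 200, body: html)

    result = described_class.new({ url: url }).invoke(bot_user, llm)

    expect(result[:text]).to include("Hello")
    expect(result[:text]).not_to include("ignored")
  end

  it "follows redirects to the final page" do
    final = "https://example.com/final"
    stub_request(:get, url).to_return(status: 302, headers: { "Location" => final })
    stub_request(:get, final).to_return(status: 200, body: "<body>Final content</body>")

    result = described_class.new({ url: url }).invoke(bot_user, llm)

    expect(result[:text]).to include("Final content")
  end
end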