diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index b10702e2..c47cdd5a 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -209,6 +209,7 @@ en:
             name: "Base Search Query"
             description: "Base query to use when searching. Example: '#urgent' will prepend '#urgent' to the search query and only include topics with the urgent category or tag."
       command_summary:
+        web_browser: "Browse Web"
         github_search_code: "GitHub code search"
         github_file_content: "GitHub file content"
         github_pull_request_diff: "GitHub pull request diff"
@@ -227,6 +228,7 @@ en:
         dall_e: "Generate image"
         search_meta_discourse: "Search Meta Discourse"
       command_help:
+        web_browser: "Browse web page using the AI Bot"
         github_search_code: "Search for code in a GitHub repository"
         github_file_content: "Retrieve content of files from a GitHub repository"
         github_pull_request_diff: "Retrieve a GitHub pull request diff"
@@ -245,6 +247,7 @@ en:
         dall_e: "Generate image using DALL-E 3"
         search_meta_discourse: "Search Meta Discourse"
       command_description:
+        web_browser: "Reading %{url}"
         github_search_code: "Searched for '%{query}' in %{repo}"
         github_pull_request_diff: "%{repo} %{pull_id}"
         github_file_content: "Retrieved content of %{file_paths} from %{repo_name}@%{branch}"
diff --git a/lib/ai_bot/personas/persona.rb b/lib/ai_bot/personas/persona.rb
index ab9f44bf..b4c6060f 100644
--- a/lib/ai_bot/personas/persona.rb
+++ b/lib/ai_bot/personas/persona.rb
@@ -75,6 +75,7 @@ module DiscourseAi
           Tools::DiscourseMetaSearch,
           Tools::GithubFileContent,
           Tools::GithubPullRequestDiff,
+          Tools::WebBrowser,
         ]
 
         tools << Tools::GithubSearchCode if SiteSetting.ai_bot_github_access_token.present?
diff --git a/lib/ai_bot/personas/researcher.rb b/lib/ai_bot/personas/researcher.rb
index 6b7e7eaa..983d4b22 100644
--- a/lib/ai_bot/personas/researcher.rb
+++ b/lib/ai_bot/personas/researcher.rb
@@ -5,7 +5,7 @@ module DiscourseAi
     module Personas
       class Researcher < Persona
         def tools
-          [Tools::Google]
+          [Tools::Google, Tools::WebBrowser]
         end
 
         def required_tools
@@ -14,19 +14,36 @@ module DiscourseAi
 
       def system_prompt
         <<~PROMPT
-          You are research bot. With access to Google you can find information for users.
+          You are a research assistant with access to two powerful tools:
 
-          - You are conversing with: {participants}
-          - You understand **Discourse Markdown** and generate it.
-          - When generating responses you always cite your sources using Markdown footnotes.
-          - When possible you also quote the sources.
+          1. Google search - for finding relevant information across the internet.
+          2. Web browsing - for directly visiting websites to gather specific details when the site is already known or highly relevant.
 
-          Example:
+          When responding to a question, consider which tool would be most effective while aiming to minimize unnecessary or duplicate inquiries:
+          - Use Google search to quickly identify the most relevant sources. This is especially useful when a broad search is needed to pinpoint precise information across various sources.
+          - Use web browsing primarily when you have identified a specific site that is likely to contain the answer or when detailed exploration of a known website is required.
 
-          **This** is a content[^1] with two footnotes[^2].
+          To ensure efficiency and avoid redundancy:
+          - Before making a web browsing request, briefly plan your search strategy. Consider whether the information is likely to be frequently updated and how recent the data needs to be.
+          - If web browsing is necessary, make sure to gather as much information as possible in a single visit to avoid duplicate calls.
 
-          [^1]: https://www.example.com
-          [^2]: https://www.example2.com
+          Always aim to:
+          - Optimize tool use by selecting the most appropriate method based on the information need and the likely source of the answer.
+          - Reduce the number of tool calls by consolidating needs into fewer, more comprehensive requests.
+
+          Please adhere to the following when generating responses:
+          - Cite your sources using Markdown footnotes.
+          - When possible, include brief quotes from the sources.
+          - Use **Discourse Markdown** syntax for formatting.
+
+          Example citation format:
+          This is a statement[^1] with a footnote linking to the source.
+
+          [^1]: https://www.example.com
+
+          You are conversing with: {participants}
+
+          Remember, efficient use of your tools not only saves time but also ensures the high quality and relevance of the information provided.
         PROMPT
       end
     end
diff --git a/lib/ai_bot/tools/github_file_content.rb b/lib/ai_bot/tools/github_file_content.rb
index e5ed76ff..91fe9ab5 100644
--- a/lib/ai_bot/tools/github_file_content.rb
+++ b/lib/ai_bot/tools/github_file_content.rb
@@ -61,17 +61,22 @@ module DiscourseAi
             api_url =
               "https://api.github.com/repos/#{owner}/#{repo}/contents/#{file_path}?ref=#{branch}"
 
-            response =
-              send_http_request(
-                api_url,
-                headers: {
-                  "Accept" => "application/vnd.github.v3+json",
-                },
-                authenticate_github: true,
-              )
+            response_code = "-1 unknown"
+            body = nil
 
-            if response.code == "200"
-              file_data = JSON.parse(response.body)
+            send_http_request(
+              api_url,
+              headers: {
+                "Accept" => "application/vnd.github.v3+json",
+              },
+              authenticate_github: true,
+            ) do |response|
+              response_code = response.code
+              body = read_response_body(response)
+            end
+
+            if response_code == "200"
+              file_data = JSON.parse(body)
               content = Base64.decode64(file_data["content"])
               file_contents[file_path] = content
             else
diff --git a/lib/ai_bot/tools/github_pull_request_diff.rb b/lib/ai_bot/tools/github_pull_request_diff.rb
index a2448474..e5973cdc 100644
--- a/lib/ai_bot/tools/github_pull_request_diff.rb
+++ b/lib/ai_bot/tools/github_pull_request_diff.rb
@@ -47,22 +47,27 @@ module DiscourseAi
           api_url = "https://api.github.com/repos/#{repo}/pulls/#{pull_id}"
           @url = "https://github.com/#{repo}/pull/#{pull_id}"
 
-          response =
-            send_http_request(
-              api_url,
-              headers: {
-                "Accept" => "application/vnd.github.v3.diff",
-              },
-              authenticate_github: true,
-            )
+          body = nil
+          response_code = "unknown error"
 
-          if response.code == "200"
-            diff = response.body
+          send_http_request(
+            api_url,
+            headers: {
+              "Accept" => "application/vnd.github.v3.diff",
+            },
+            authenticate_github: true,
+          ) do |response|
+            response_code = response.code
+            body = read_response_body(response)
+          end
+
+          if response_code == "200"
+            diff = body
             diff = sort_and_shorten_diff(diff)
             diff = truncate(diff, max_length: 20_000, percent_length: 0.3, llm: llm)
 
             { diff: diff }
           else
-            { error: "Failed to retrieve the diff. Status code: #{response.code}" }
+            { error: "Failed to retrieve the diff. Status code: #{response_code}" }
           end
         end
diff --git a/lib/ai_bot/tools/github_search_code.rb b/lib/ai_bot/tools/github_search_code.rb
index 6133fa45..77d1f005 100644
--- a/lib/ai_bot/tools/github_search_code.rb
+++ b/lib/ai_bot/tools/github_search_code.rb
@@ -44,17 +44,27 @@ module DiscourseAi
         def invoke(_bot_user, llm)
           api_url = "https://api.github.com/search/code?q=#{query}+repo:#{repo}"
 
-          response =
-            send_http_request(
-              api_url,
-              headers: {
-                "Accept" => "application/vnd.github.v3.text-match+json",
-              },
-              authenticate_github: true,
-            )
+          response_code = "unknown error"
+          search_data = nil
 
-          if response.code == "200"
-            search_data = JSON.parse(response.body)
+          send_http_request(
+            api_url,
+            headers: {
+              "Accept" => "application/vnd.github.v3.text-match+json",
+            },
+            authenticate_github: true,
+          ) do |response|
+            response_code = response.code
+            if response_code == "200"
+              begin
+                search_data = JSON.parse(read_response_body(response))
+              rescue JSON::ParserError
+                response_code = "500 - JSON parse error"
+              end
+            end
+          end
+
+          if response_code == "200"
             results =
               search_data["items"]
                 .map { |item| "#{item["path"]}:\n#{item["text_matches"][0]["fragment"]}" }
@@ -63,7 +73,7 @@ module DiscourseAi
             results = truncate(results, max_length: 20_000, percent_length: 0.3, llm: llm)
 
             { search_results: results }
           else
-            { error: "Failed to perform code search. Status code: #{response.code}" }
+            { error: "Failed to perform code search. Status code: #{response_code}" }
           end
         end
       end
diff --git a/lib/ai_bot/tools/tool.rb b/lib/ai_bot/tools/tool.rb
index 9b90edcf..e9c3bd49 100644
--- a/lib/ai_bot/tools/tool.rb
+++ b/lib/ai_bot/tools/tool.rb
@@ -77,8 +77,34 @@ module DiscourseAi
 
       protected
 
-      def send_http_request(url, headers: {}, authenticate_github: false)
-        uri = URI(url)
+      def send_http_request(url, headers: {}, authenticate_github: false, follow_redirects: false)
+        raise "Expecting caller to use a block" if !block_given?
+
+        uri = nil
+        url = UrlHelper.normalized_encode(url)
+        uri =
+          begin
+            URI.parse(url)
+          rescue StandardError
+            nil
+          end
+
+        return if !uri
+
+        if follow_redirects
+          fd =
+            FinalDestination.new(
+              url,
+              validate_uri: true,
+              max_redirects: 5,
+              follow_canonical: true,
+            )
+
+          uri = fd.resolve
+        end
+
+        return if uri.blank?
+
         request = FinalDestination::HTTP::Get.new(uri)
         request["User-Agent"] = DiscourseAi::AiBot::USER_AGENT
         headers.each { |k, v| request[k] = v }
@@ -87,10 +113,20 @@ module DiscourseAi
         end
 
         FinalDestination::HTTP.start(uri.hostname, uri.port, use_ssl: uri.port != 80) do |http|
-          http.request(request)
+          http.request(request) { |response| yield response }
         end
       end
 
+      def read_response_body(response, max_length: 4.megabyte)
+        body = +""
+        response.read_body do |chunk|
+          body << chunk
+          break if body.bytesize > max_length
+        end
+
+        body[0..max_length]
+      end
+
       def truncate(text, llm:, percent_length: nil, max_length: nil)
         if !percent_length && !max_length
           raise ArgumentError, "You must provide either percent_length or max_length"
diff --git a/lib/ai_bot/tools/web_browser.rb b/lib/ai_bot/tools/web_browser.rb
new file mode 100644
index 00000000..86f9d9aa
--- /dev/null
+++ b/lib/ai_bot/tools/web_browser.rb
@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module AiBot
+    module Tools
+      class WebBrowser < Tool
+        def self.signature
+          {
+            name: name,
+            description:
+              "Visits a web page, retrieves the HTML content, extracts the main content, converts it to plain text, and returns the result.",
+            parameters: [
+              {
+                name: "url",
+                description: "The URL of the web page to visit.",
+                required: true,
+                type: "string",
+              },
+            ],
+          }
+        end
+
+        def self.name
+          "web_browser"
+        end
+
+        def url
+          return @url if defined?(@url)
+          @url = parameters[:url]
+          @url = "https://#{@url}" if !@url.start_with?("http")
+
+          @url
+        end
+
+        def invoke(_bot_user, llm)
+          send_http_request(url, follow_redirects: true) do |response|
+            if response.code == "200"
+              html = read_response_body(response)
+              text = extract_main_content(html)
+              text = truncate(text, max_length: 50_000, percent_length: 0.3, llm: llm)
+              return { url: response.uri.to_s, text: text.strip }
+            else
+              return { url: url, error: "Failed to retrieve the web page: #{response.code}" }
+            end
+          end
+
+          { url: url, error: "Failed to retrieve the web page" }
+        end
+
+        def description_args
+          { url: url }
+        end
+
+        private
+
+        def extract_main_content(html)
+          doc = Nokogiri.HTML(html)
+          doc.search("script, style, comment").remove
+
+          main_content = find_main_content(doc)
+          main_content ||= doc.at("body")
+
+          buffer = +""
+          nodes_to_text(main_content, buffer)
+
+          buffer.gsub(/\s+/, " ")
+        end
+
+        def nodes_to_text(nodes, buffer)
+          if nodes.text?
+            buffer << nodes.text
+            buffer << " "
+            return
+          end
+
+          nodes.children.each do |node|
+            case node.name
+            when "text"
+              buffer << node.text
+              buffer << " "
+            when "br"
+              buffer << "\n"
+            when "a"
+              nodes_to_text(node, buffer)
+              buffer << " [#{node["href"]}] "
+            else
+              nodes_to_text(node, buffer)
+            end
+          end
+        end
+
+        def find_main_content(doc)
+          [
+            doc.at("article"),
+            doc.at("main"),
+            doc.at("[role='main']"),
+            doc.at("#main"),
+            doc.at(".main"),
+            doc.at("#content"),
+            doc.at(".content"),
+          ].find(&:present?)
+        end
+      end
+    end
+  end
+end
diff --git a/spec/lib/modules/ai_bot/personas/researcher_spec.rb b/spec/lib/modules/ai_bot/personas/researcher_spec.rb
index e9cebc4e..c3c65888 100644
--- a/spec/lib/modules/ai_bot/personas/researcher_spec.rb
+++ b/spec/lib/modules/ai_bot/personas/researcher_spec.rb
@@ -6,6 +6,8 @@ RSpec.describe DiscourseAi::AiBot::Personas::Researcher do
   end
 
   it "renders schema" do
-    expect(researcher.tools).to eq([DiscourseAi::AiBot::Tools::Google])
+    expect(researcher.tools).to eq(
+      [DiscourseAi::AiBot::Tools::Google, DiscourseAi::AiBot::Tools::WebBrowser],
+    )
   end
 end
diff --git a/spec/lib/modules/ai_bot/tools/web_browser_spec.rb b/spec/lib/modules/ai_bot/tools/web_browser_spec.rb
new file mode 100644
index 00000000..85d3af9e
--- /dev/null
+++ b/spec/lib/modules/ai_bot/tools/web_browser_spec.rb
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+RSpec.describe DiscourseAi::AiBot::Tools::WebBrowser do
+  let(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) }
+  let(:llm) { DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo") }
+
+  before do
+    SiteSetting.ai_openai_api_key = "asd"
+    SiteSetting.ai_bot_enabled = true
+  end
+
+  describe "#invoke" do
+    it "can retrieve the content of a webpage and returns the processed text" do
+      url = "https://arxiv.org/html/2403.17011v1"
+      processed_text = "This is a simplified version of the webpage content."
+
+      # Mocking the web request to return a specific HTML structure
+      stub_request(:get, url).to_return(
+        status: 200,
+        body:
+          "<html><body><p>This is a simplified version of the webpage content.</p></body></html>",
", + ) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result).to have_key(:text) + expect(result[:text]).to eq(processed_text) + expect(result[:url]).to eq(url) + end + + it "returns an error if the webpage cannot be retrieved" do + url = "https://arxiv.org/html/2403.17011v1" + + # Simulating a failed request + stub_request(:get, url).to_return(status: [500, "Internal Server Error"]) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result).to have_key(:error) + expect(result[:error]).to include("Failed to retrieve the web page") + end + end + + describe "#invoke with various HTML structures" do + let(:url) { "http://example.com" } + + it "extracts main content from a simple HTML structure" do + simple_html = "Simple content.
" + stub_request(:get, url).to_return(status: 200, body: simple_html) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result[:text]).to eq("Simple content.") + end + + it "correctly ignores script and style tags" do + complex_html = + "Only relevant content here.
" + stub_request(:get, url).to_return(status: 200, body: complex_html) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result[:text]).to eq("Only relevant content here.") + end + + it "extracts content from nested structures" do + nested_html = + "Nested paragraph 1.
+      stub_request(:get, url).to_return(status: 200, body: nested_html)
+
+      tool = described_class.new({ url: url })
+      result = tool.invoke(bot_user, llm)
+
+      expect(result[:text]).to eq("Nested paragraph 1. Nested paragraph 2.")
+    end
+  end
+
+  describe "#invoke with redirects" do
+    let(:initial_url) { "http://initial-example.com" }
+    let(:final_url) { "http://final-example.com" }
+    let(:redirect_html) { "<html><body><p>Redirected content.</p></body></html>" }
" } + + it "follows redirects and retrieves content from the final destination" do + stub_request(:get, initial_url).to_return(status: 302, headers: { "Location" => final_url }) + stub_request(:get, final_url).to_return(status: 200, body: redirect_html) + + tool = described_class.new({ url: initial_url }) + result = tool.invoke(bot_user, llm) + + expect(result[:url]).to eq(final_url) + expect(result[:text]).to eq("Redirected content.") + end + end +end