From fb81307c59fb5ae430a33e574c9465a53d3b8cad Mon Sep 17 00:00:00 2001 From: Sam Date: Thu, 28 Mar 2024 16:01:58 +1100 Subject: [PATCH] FEATURE: web browsing tool (#548) This pull request makes several improvements and additions to the GitHub-related tools and personas in the `discourse-ai` repository: 1. It adds the `WebBrowser` tool to the `Researcher` persona, allowing the AI to visit web pages, retrieve HTML content, extract the main content, and convert it to plain text. 2. It updates the `GithubFileContent`, `GithubPullRequestDiff`, and `GithubSearchCode` tools to handle HTTP responses more robustly (introducing size limits). 3. It refactors the `send_http_request` method in the `Tool` class to follow redirects when specified (currently only the `WebBrowser` tool enables redirect following), and to read the response body in chunks to avoid memory issues with large responses. 4. It updates the system prompt for the `Researcher` persona to provide more detailed guidance on when to use Google search vs web browsing, and how to optimize tool usage and reduce redundant requests. 5. It adds a new `web_browser_spec.rb` file with tests for the `WebBrowser` tool, covering various scenarios like handling different HTML structures and following redirects. 
--- config/locales/server.en.yml | 3 + lib/ai_bot/personas/persona.rb | 1 + lib/ai_bot/personas/researcher.rb | 37 ++++-- lib/ai_bot/tools/github_file_content.rb | 25 +++-- lib/ai_bot/tools/github_pull_request_diff.rb | 27 +++-- lib/ai_bot/tools/github_search_code.rb | 32 ++++-- lib/ai_bot/tools/tool.rb | 42 ++++++- lib/ai_bot/tools/web_browser.rb | 106 ++++++++++++++++++ .../ai_bot/personas/researcher_spec.rb | 4 +- .../modules/ai_bot/tools/web_browser_spec.rb | 98 ++++++++++++++++ 10 files changed, 329 insertions(+), 46 deletions(-) create mode 100644 lib/ai_bot/tools/web_browser.rb create mode 100644 spec/lib/modules/ai_bot/tools/web_browser_spec.rb diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index b10702e2..c47cdd5a 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -209,6 +209,7 @@ en: name: "Base Search Query" description: "Base query to use when searching. Example: '#urgent' will prepend '#urgent' to the search query and only include topics with the urgent category or tag." 
command_summary: + web_browser: "Browse Web" github_search_code: "GitHub code search" github_file_content: "GitHub file content" github_pull_request_diff: "GitHub pull request diff" @@ -227,6 +228,7 @@ en: dall_e: "Generate image" search_meta_discourse: "Search Meta Discourse" command_help: + web_browser: "Browse web page using the AI Bot" github_search_code: "Search for code in a GitHub repository" github_file_content: "Retrieve content of files from a GitHub repository" github_pull_request_diff: "Retrieve a GitHub pull request diff" @@ -245,6 +247,7 @@ en: dall_e: "Generate image using DALL-E 3" search_meta_discourse: "Search Meta Discourse" command_description: + web_browser: "Reading %{url}" github_search_code: "Searched for '%{query}' in %{repo}" github_pull_request_diff: "%{repo} %{pull_id}" github_file_content: "Retrieved content of %{file_paths} from %{repo_name}@%{branch}" diff --git a/lib/ai_bot/personas/persona.rb b/lib/ai_bot/personas/persona.rb index ab9f44bf..b4c6060f 100644 --- a/lib/ai_bot/personas/persona.rb +++ b/lib/ai_bot/personas/persona.rb @@ -75,6 +75,7 @@ module DiscourseAi Tools::DiscourseMetaSearch, Tools::GithubFileContent, Tools::GithubPullRequestDiff, + Tools::WebBrowser, ] tools << Tools::GithubSearchCode if SiteSetting.ai_bot_github_access_token.present? diff --git a/lib/ai_bot/personas/researcher.rb b/lib/ai_bot/personas/researcher.rb index 6b7e7eaa..983d4b22 100644 --- a/lib/ai_bot/personas/researcher.rb +++ b/lib/ai_bot/personas/researcher.rb @@ -5,7 +5,7 @@ module DiscourseAi module Personas class Researcher < Persona def tools - [Tools::Google] + [Tools::Google, Tools::WebBrowser] end def required_tools @@ -14,19 +14,36 @@ module DiscourseAi def system_prompt <<~PROMPT - You are research bot. With access to Google you can find information for users. + You are a research assistant with access to two powerful tools: - - You are conversing with: {participants} - - You understand **Discourse Markdown** and generate it. 
- - When generating responses you always cite your sources using Markdown footnotes. - - When possible you also quote the sources. + 1. Google search - for finding relevant information across the internet. + 2. Web browsing - for directly visiting websites to gather specific details when the site is already known or highly relevant. - Example: + When responding to a question, consider which tool would be most effective while aiming to minimize unnecessary or duplicate inquiries: + - Use Google search to quickly identify the most relevant sources. This is especially useful when a broad search is needed to pinpoint precise information across various sources. + - Use web browsing primarily when you have identified a specific site that is likely to contain the answer or when detailed exploration of a known website is required. - **This** is a content[^1] with two footnotes[^2]. + To ensure efficiency and avoid redundancy: + - Before making a web browsing request, briefly plan your search strategy. Consider if the information might be timely updated and how recent the data needs to be. + - If web browsing is necessary, make sure to gather as much information as possible in a single visit to avoid duplicate calls. - [^1]: https://www.example.com - [^2]: https://www.example2.com + Always aim to: + - Optimize tool use by selecting the most appropriate method based on the information need and the likely source of the answer. + - Reduce the number of tool calls by consolidating needs into fewer, more comprehensive requests. + + Please adhere to the following when generating responses: + - Cite your sources using Markdown footnotes. + - When possible, include brief quotes from the sources. + - Use **Discourse Markdown** syntax for formatting. + + Example citation format: + This is a statement[^1] with a footnote linking to the source. 
+ + [^1]: https://www.example.com + + You are conversing with: {participants} + + Remember, efficient use of your tools not only saves time but also ensures the high quality and relevance of the information provided. PROMPT end end diff --git a/lib/ai_bot/tools/github_file_content.rb b/lib/ai_bot/tools/github_file_content.rb index e5ed76ff..91fe9ab5 100644 --- a/lib/ai_bot/tools/github_file_content.rb +++ b/lib/ai_bot/tools/github_file_content.rb @@ -61,17 +61,22 @@ module DiscourseAi api_url = "https://api.github.com/repos/#{owner}/#{repo}/contents/#{file_path}?ref=#{branch}" - response = - send_http_request( - api_url, - headers: { - "Accept" => "application/vnd.github.v3+json", - }, - authenticate_github: true, - ) + response_code = "-1 unknown" + body = nil - if response.code == "200" - file_data = JSON.parse(response.body) + send_http_request( + api_url, + headers: { + "Accept" => "application/vnd.github.v3+json", + }, + authenticate_github: true, + ) do |response| + response_code = response.code + body = read_response_body(response) + end + + if response_code == "200" + file_data = JSON.parse(body) content = Base64.decode64(file_data["content"]) file_contents[file_path] = content else diff --git a/lib/ai_bot/tools/github_pull_request_diff.rb b/lib/ai_bot/tools/github_pull_request_diff.rb index a2448474..e5973cdc 100644 --- a/lib/ai_bot/tools/github_pull_request_diff.rb +++ b/lib/ai_bot/tools/github_pull_request_diff.rb @@ -47,22 +47,27 @@ module DiscourseAi api_url = "https://api.github.com/repos/#{repo}/pulls/#{pull_id}" @url = "https://github.com/#{repo}/pull/#{pull_id}" - response = - send_http_request( - api_url, - headers: { - "Accept" => "application/vnd.github.v3.diff", - }, - authenticate_github: true, - ) + body = nil + response_code = "unknown error" - if response.code == "200" - diff = response.body + send_http_request( + api_url, + headers: { + "Accept" => "application/vnd.github.v3.diff", + }, + authenticate_github: true, + ) do |response| + 
response_code = response.code + body = read_response_body(response) + end + + if response_code == "200" + diff = body diff = sort_and_shorten_diff(diff) diff = truncate(diff, max_length: 20_000, percent_length: 0.3, llm: llm) { diff: diff } else - { error: "Failed to retrieve the diff. Status code: #{response.code}" } + { error: "Failed to retrieve the diff. Status code: #{response_code}" } end end diff --git a/lib/ai_bot/tools/github_search_code.rb b/lib/ai_bot/tools/github_search_code.rb index 6133fa45..77d1f005 100644 --- a/lib/ai_bot/tools/github_search_code.rb +++ b/lib/ai_bot/tools/github_search_code.rb @@ -44,17 +44,27 @@ module DiscourseAi def invoke(_bot_user, llm) api_url = "https://api.github.com/search/code?q=#{query}+repo:#{repo}" - response = - send_http_request( - api_url, - headers: { - "Accept" => "application/vnd.github.v3.text-match+json", - }, - authenticate_github: true, - ) + response_code = "unknown error" + search_data = nil - if response.code == "200" - search_data = JSON.parse(response.body) + send_http_request( + api_url, + headers: { + "Accept" => "application/vnd.github.v3.text-match+json", + }, + authenticate_github: true, + ) do |response| + response_code = response.code + if response_code == "200" + begin + search_data = JSON.parse(read_response_body(response)) + rescue JSON::ParserError + response_code = "500 - JSON parse error" + end + end + end + + if response_code == "200" results = search_data["items"] .map { |item| "#{item["path"]}:\n#{item["text_matches"][0]["fragment"]}" } @@ -63,7 +73,7 @@ module DiscourseAi results = truncate(results, max_length: 20_000, percent_length: 0.3, llm: llm) { search_results: results } else - { error: "Failed to perform code search. Status code: #{response.code}" } + { error: "Failed to perform code search. 
Status code: #{response_code}" } end end end diff --git a/lib/ai_bot/tools/tool.rb b/lib/ai_bot/tools/tool.rb index 9b90edcf..e9c3bd49 100644 --- a/lib/ai_bot/tools/tool.rb +++ b/lib/ai_bot/tools/tool.rb @@ -77,8 +77,34 @@ module DiscourseAi protected - def send_http_request(url, headers: {}, authenticate_github: false) - uri = URI(url) + def send_http_request(url, headers: {}, authenticate_github: false, follow_redirects: false) + raise "Expecting caller to use a block" if !block_given? + + uri = nil + url = UrlHelper.normalized_encode(url) + uri = + begin + URI.parse(url) + rescue StandardError + nil + end + + return if !uri + + if follow_redirects + fd = + FinalDestination.new( + url, + validate_uri: true, + max_redirects: 5, + follow_canonical: true, + ) + + uri = fd.resolve + end + + return if uri.blank? + request = FinalDestination::HTTP::Get.new(uri) request["User-Agent"] = DiscourseAi::AiBot::USER_AGENT headers.each { |k, v| request[k] = v } @@ -87,10 +113,20 @@ module DiscourseAi end FinalDestination::HTTP.start(uri.hostname, uri.port, use_ssl: uri.port != 80) do |http| - http.request(request) + http.request(request) { |response| yield response } end end + def read_response_body(response, max_length: 4.megabyte) + body = +"" + response.read_body do |chunk| + body << chunk + break if body.bytesize > max_length + end + + body[0..max_length] + end + def truncate(text, llm:, percent_length: nil, max_length: nil) if !percent_length && !max_length raise ArgumentError, "You must provide either percent_length or max_length" diff --git a/lib/ai_bot/tools/web_browser.rb b/lib/ai_bot/tools/web_browser.rb new file mode 100644 index 00000000..86f9d9aa --- /dev/null +++ b/lib/ai_bot/tools/web_browser.rb @@ -0,0 +1,106 @@ +# frozen_string_literal: true + +module DiscourseAi + module AiBot + module Tools + class WebBrowser < Tool + def self.signature + { + name: name, + description: + "Visits a web page, retrieves the HTML content, extracts the main content, converts it 
to plain text, and returns the result.", + parameters: [ + { + name: "url", + description: "The URL of the web page to visit.", + required: true, + type: "string", + }, + ], + } + end + + def self.name + "web_browser" + end + + def url + return @url if defined?(@url) + @url = parameters[:url] + @url = "https://#{@url}" if !@url.start_with?("http") + + @url + end + + def invoke(_bot_user, llm) + send_http_request(url, follow_redirects: true) do |response| + if response.code == "200" + html = read_response_body(response) + text = extract_main_content(html) + text = truncate(text, max_length: 50_000, percent_length: 0.3, llm: llm) + return { url: response.uri.to_s, text: text.strip } + else + return { url: url, error: "Failed to retrieve the web page: #{response.code}" } + end + end + + { url: url, error: "Failed to retrieve the web page" } + end + + def description_args + { url: url } + end + + private + + def extract_main_content(html) + doc = Nokogiri.HTML(html) + doc.search("script, style, comment").remove + + main_content = find_main_content(doc) + main_content ||= doc.at("body") + + buffer = +"" + nodes_to_text(main_content, buffer) + + buffer.gsub(/\s+/, " ") + end + + def nodes_to_text(nodes, buffer) + if nodes.text? + buffer << nodes.text + buffer << " " + return + end + + nodes.children.each do |node| + case node.name + when "text" + buffer << node.text + buffer << " " + when "br" + buffer << "\n" + when "a" + nodes_to_text(node, buffer) + buffer << " [#{node["href"]}] " + else + nodes_to_text(node, buffer) + end + end + end + + def find_main_content(doc) + [ + doc.at("article"), + doc.at("main"), + doc.at("[role='main']"), + doc.at("#main"), + doc.at(".main"), + doc.at("#content"), + doc.at(".content"), + ].find(&:present?) 
+ end + end + end + end +end diff --git a/spec/lib/modules/ai_bot/personas/researcher_spec.rb b/spec/lib/modules/ai_bot/personas/researcher_spec.rb index e9cebc4e..c3c65888 100644 --- a/spec/lib/modules/ai_bot/personas/researcher_spec.rb +++ b/spec/lib/modules/ai_bot/personas/researcher_spec.rb @@ -6,6 +6,8 @@ RSpec.describe DiscourseAi::AiBot::Personas::Researcher do end it "renders schema" do - expect(researcher.tools).to eq([DiscourseAi::AiBot::Tools::Google]) + expect(researcher.tools).to eq( + [DiscourseAi::AiBot::Tools::Google, DiscourseAi::AiBot::Tools::WebBrowser], + ) end end diff --git a/spec/lib/modules/ai_bot/tools/web_browser_spec.rb b/spec/lib/modules/ai_bot/tools/web_browser_spec.rb new file mode 100644 index 00000000..85d3af9e --- /dev/null +++ b/spec/lib/modules/ai_bot/tools/web_browser_spec.rb @@ -0,0 +1,98 @@ +# frozen_string_literal: true + +RSpec.describe DiscourseAi::AiBot::Tools::WebBrowser do + let(:bot_user) { User.find(DiscourseAi::AiBot::EntryPoint::GPT3_5_TURBO_ID) } + let(:llm) { DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo") } + + before do + SiteSetting.ai_openai_api_key = "asd" + SiteSetting.ai_bot_enabled = true + end + + describe "#invoke" do + it "can retrieve the content of a webpage and returns the processed text" do + url = "https://arxiv.org/html/2403.17011v1" + processed_text = "This is a simplified version of the webpage content." + + # Mocking the web request to return a specific HTML structure + stub_request(:get, url).to_return( + status: 200, + body: + "Test

This is a simplified version of the webpage content.

", + ) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result).to have_key(:text) + expect(result[:text]).to eq(processed_text) + expect(result[:url]).to eq(url) + end + + it "returns an error if the webpage cannot be retrieved" do + url = "https://arxiv.org/html/2403.17011v1" + + # Simulating a failed request + stub_request(:get, url).to_return(status: [500, "Internal Server Error"]) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result).to have_key(:error) + expect(result[:error]).to include("Failed to retrieve the web page") + end + end + + describe "#invoke with various HTML structures" do + let(:url) { "http://example.com" } + + it "extracts main content from a simple HTML structure" do + simple_html = "

Simple content.

" + stub_request(:get, url).to_return(status: 200, body: simple_html) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result[:text]).to eq("Simple content.") + end + + it "correctly ignores script and style tags" do + complex_html = + "

Only relevant content here.

" + stub_request(:get, url).to_return(status: 200, body: complex_html) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result[:text]).to eq("Only relevant content here.") + end + + it "extracts content from nested structures" do + nested_html = + "

Nested paragraph 1.

Nested paragraph 2.

" + stub_request(:get, url).to_return(status: 200, body: nested_html) + + tool = described_class.new({ url: url }) + result = tool.invoke(bot_user, llm) + + expect(result[:text]).to eq("Nested paragraph 1. Nested paragraph 2.") + end + end + + describe "#invoke with redirects" do + let(:initial_url) { "http://initial-example.com" } + let(:final_url) { "http://final-example.com" } + let(:redirect_html) { "

Redirected content.

" } + + it "follows redirects and retrieves content from the final destination" do + stub_request(:get, initial_url).to_return(status: 302, headers: { "Location" => final_url }) + stub_request(:get, final_url).to_return(status: 200, body: redirect_html) + + tool = described_class.new({ url: initial_url }) + result = tool.invoke(bot_user, llm) + + expect(result[:url]).to eq(final_url) + expect(result[:text]).to eq("Redirected content.") + end + end +end