FIX: crawler requests not tracked for non-UTF-8 user agents
Requests with non-UTF-8 user_agent strings were bypassing logging because PG always wants UTF-8 strings. This adds a conversion step to ensure we are always dealing with UTF-8.
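In outline, the fix transcodes the incoming header to UTF-8 before it reaches the database. A minimal standalone sketch of that normalization (variable names are illustrative; the actual change is in the diff below):

    # Illustration only; the real code lives in Middleware::RequestTracker.
    user_agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")

    if user_agent.encoding != Encoding::UTF_8
      # Transcode to UTF-8. Note this assumes the bytes are mappable; encode
      # can raise Encoding::UndefinedConversionError for bytes with no UTF-8
      # equivalent.
      user_agent = user_agent.encode("utf-8")
      user_agent.scrub! # replace any remaining invalid sequences with U+FFFD
    end

    user_agent.encoding        # => #<Encoding:UTF-8>
    user_agent.valid_encoding? # => true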
parent 9b30922109
commit e440ec2519
@@ -117,7 +117,12 @@ class Middleware::RequestTracker
     }
 
     if h[:is_crawler]
-      h[:user_agent] = env['HTTP_USER_AGENT']
+      user_agent = env['HTTP_USER_AGENT']
+      if user_agent.encoding != Encoding::UTF_8
+        user_agent = user_agent.encode("utf-8")
+        user_agent.scrub!
+      end
+      h[:user_agent] = user_agent
     end
 
     if cache = headers["X-Discourse-Cached"]
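Note the two-step dance above: String#encode transcodes between encodings and only helps when the source encoding is labelled correctly, while String#scrub! replaces byte sequences that are invalid for the string's current encoding. A small illustration of the difference (not part of the commit):

    # Transcoding: Windows-1252 0xE9 ("é") becomes a two-byte UTF-8 sequence.
    latin = (+"caf\xe9").force_encoding("Windows-1252")
    latin.encode("utf-8").bytes.last(2) # => [195, 169]

    # Scrubbing: a byte that is invalid UTF-8 is replaced, not converted.
    bad = (+"abc\xc3").force_encoding("UTF-8")
    bad.valid_encoding? # => false
    bad.scrub!          # invalid byte replaced with "\uFFFD"
    bad.valid_encoding? # => true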
@@ -15,6 +15,26 @@ describe Middleware::RequestTracker do
     }.merge(opts)
   end
 
+  context "full request" do
+    before do
+      @orig = WebCrawlerRequest.autoflush
+      WebCrawlerRequest.autoflush = 1
+    end
+    after do
+      WebCrawlerRequest.autoflush = @orig
+    end
+
+    it "can handle rogue user agents" do
+      agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")
+
+      middleware = Middleware::RequestTracker.new(->(env) { ["200", { "Content-Type" => "text/html" }, [""]] })
+      middleware.call(env("HTTP_USER_AGENT" => agent))
+
+      expect(WebCrawlerRequest.where(user_agent: agent.encode('utf-8')).count).to eq(1)
+    end
+
+  end
+
   context "log_request" do
     before do
       freeze_time Time.now
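A note on the spec's fixture: "\xc3\x28" is a UTF-8 lead byte followed by a plain "(", i.e. an invalid UTF-8 sequence, and the unary + makes the frozen string literal mutable so force_encoding can relabel it in place. Roughly what the assertion relies on (illustrative, not part of the diff):

    agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")

    # In Windows-1252, 0xC3 is "Ã" and 0x28 is "(", so transcoding succeeds
    # and yields a valid UTF-8 string that Postgres will accept.
    agent.encode("utf-8") # => "Evil Googlebot String Ã("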