More entropy for foreign titles

* Treat strings with non-ASCII characters as having more entropy
This commit is contained in:
Michael Brown 2013-06-07 14:47:07 -04:00
parent 5217602ec3
commit bb77d2c38b
2 changed files with 12 additions and 2 deletions

View File

@ -21,8 +21,10 @@ class TextSentinel
end end
# Entropy is a number of how many unique characters the string needs. # Entropy is a number of how many unique characters the string needs.
# Non-ASCII characters are weighted more heavily since they carry more "information".
def entropy def entropy
@entropy ||= @text.to_s.strip.split('').uniq.size chars = @text.to_s.strip.split('')
@entropy ||= chars.pack('M*'*chars.size).gsub("\n",'').split('=').uniq.size
end end
def valid? def valid?

View File

@ -32,7 +32,15 @@ describe TextSentinel do
end end
it "Works on foreign characters" do it "Works on foreign characters" do
TextSentinel.new("去年十社會警告").entropy.should == 7 TextSentinel.new("去年十社會警告").entropy.should == 19
end
it "generates enough entropy for short foreign strings" do
TextSentinel.new("又一个测").entropy.should == 11
end
it "handles repeated foreign characters" do
TextSentinel.new("又一个测试话题" * 3).entropy.should == 18
end end
end end