2019-04-29 20:27:42 -04:00
|
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
2013-04-10 05:00:50 -04:00
|
|
|
|
require 'text_cleaner'
|
|
|
|
|
|
2022-07-27 22:27:38 -04:00
|
|
|
|
RSpec.describe TextCleaner do
|
2013-04-10 05:00:50 -04:00
|
|
|
|
|
|
|
|
|
context "exclamation marks" do
  # A phrase ending in a run of "!" and the same phrase ending in a single "!".
  let(:shouted) { "my precious!!!!" }
  let(:single_bang) { "my precious!" }

  # Without any option the text must come back unchanged.
  it "ignores multiple ! by default" do
    cleaned = TextCleaner.clean(shouted)

    expect(cleaned).to eq(shouted)
  end

  # With the option on, the run of "!" is reduced to one.
  it "deduplicates ! when enabled" do
    cleaned = TextCleaner.clean(shouted, deduplicate_exclamation_marks: true)

    expect(cleaned).to eq(single_bang)
  end
end
|
|
|
|
|
|
|
|
|
|
context "question marks" do
  # A sentence ending in a run of "?" and the same sentence ending in just one.
  let(:pleading) { "please help me????" }
  let(:single_mark) { "please help me?" }

  # Without any option the text must come back unchanged.
  it "ignores multiple ? by default" do
    cleaned = TextCleaner.clean(pleading)

    expect(cleaned).to eq(pleading)
  end

  # With the option on, the run of "?" is reduced to one.
  it "deduplicates ? when enabled" do
    cleaned = TextCleaner.clean(pleading, deduplicate_question_marks: true)

    expect(cleaned).to eq(single_mark)
  end
end
|
|
|
|
|
|
|
|
|
|
context "all upper case text" do
  # Fully upper-case input.
  let(:shouting) { "ENTIRE TEXT IS ALL CAPS" }
  # Same text with a single lower-case letter ("iS"), so it is not *all* caps.
  let(:nearly_shouting) { "ENTIRE TEXT iS ALL CAPS" }
  # Expected result of replacing the fully upper-case input.
  let(:downcased) { "entire text is all caps" }

  it "ignores all upper case text by default" do
    cleaned = TextCleaner.clean(shouting)

    expect(cleaned).to eq(shouting)
  end

  it "replaces all upper case text with regular case letters when enabled" do
    cleaned = TextCleaner.clean(shouting, replace_all_upper_case: true)

    expect(cleaned).to eq(downcased)
  end

  # One lower-case letter is enough to leave the text untouched.
  it "ignores almost all upper case text when enabled" do
    cleaned = TextCleaner.clean(nearly_shouting, replace_all_upper_case: true)

    expect(cleaned).to eq(nearly_shouting)
  end
end
|
|
|
|
|
|
|
|
|
|
context "first letter" do
  let(:lower_start) { "this is awesome" }
  let(:upper_start) { "This is awesome" }
  # Word that deliberately starts with a lower-case letter followed by a capital.
  let(:camel_start) { "iLetter" }

  # With no option, every sample must come back exactly as given.
  it "ignores first letter case by default" do
    [lower_start, upper_start, camel_start].each do |sample|
      expect(TextCleaner.clean(sample)).to eq(sample)
    end
  end

  # Pairs of [input, expected output] with capitalization switched on;
  # the "iLetter" style word is expected to stay as it is.
  it "capitalizes first letter when enabled" do
    [
      [lower_start, upper_start],
      [upper_start, upper_start],
      [camel_start, camel_start],
    ].each do |input, expected|
      expect(TextCleaner.clean(input, capitalize_first_letter: true)).to eq(expected)
    end
  end
end
|
|
|
|
|
|
2013-04-17 18:19:42 -04:00
|
|
|
|
context "periods at the end" do
  let(:single_dot) { "oops." }
  let(:ellipsis) { "oops..." }
  # Expected result once all trailing periods are removed.
  let(:bare) { "oops" }

  it "ignores unnecessary periods at the end by default" do
    [single_dot, ellipsis].each do |sample|
      expect(TextCleaner.clean(sample)).to eq(sample)
    end
  end

  # One period or several: both inputs end up with none.
  it "removes unnecessary periods at the end when enabled" do
    [single_dot, ellipsis].each do |sample|
      cleaned = TextCleaner.clean(sample, remove_all_periods_from_the_end: true)

      expect(cleaned).to eq(bare)
    end
  end

  # A space after the periods must survive the removal of the periods.
  it "keeps trailing whitespaces when enabled" do
    padded_input = ellipsis + " "
    padded_output = bare + " "

    cleaned = TextCleaner.clean(padded_input, remove_all_periods_from_the_end: true)

    expect(cleaned).to eq(padded_output)
  end
end
|
|
|
|
|
|
|
|
|
|
context "extraneous space" do
  # Each punctuation mark has a "space before it" form and a tight form.
  let(:spaced_bang) { "oops !" }
  let(:tight_bang) { "oops!" }
  let(:spaced_query) { "oops ?" }
  let(:tight_query) { "oops?" }

  it "ignores extraneous space before the end punctuation by default" do
    [spaced_bang, spaced_query].each do |sample|
      expect(TextCleaner.clean(sample)).to eq(sample)
    end
  end

  it "removes extraneous space before the end punctuation when enabled" do
    [
      [spaced_bang, tight_bang],
      [spaced_query, tight_query],
    ].each do |input, expected|
      expect(TextCleaner.clean(input, remove_extraneous_space: true)).to eq(expected)
    end
  end

  # A space after the punctuation must be preserved while the one before it goes.
  it "keep trailing whitespaces when enabled" do
    [
      [spaced_bang + " ", tight_bang + " "],
      [spaced_query + " ", tight_query + " "],
    ].each do |input, expected|
      expect(TextCleaner.clean(input, remove_extraneous_space: true)).to eq(expected)
    end
  end
end
|
|
|
|
|
|
|
|
|
|
context "interior spaces" do
  # Input with runs of several spaces between words. The two fixtures must
  # differ, otherwise the "when enabled" example passes even if the option
  # does nothing.
  let(:spacey_string) { "hello     there's weird     spaces here." }
  # The same sentence with every run of spaces reduced to a single space.
  let(:unspacey_string) { "hello there's weird spaces here." }

  # Without the option the repeated spaces are left in place.
  it "ignores interior spaces by default" do
    expect(TextCleaner.clean(spacey_string)).to eq(spacey_string)
  end

  # With the option on, each run of spaces becomes one space.
  it "fixes interior spaces when enabled" do
    expect(TextCleaner.clean(spacey_string, fixes_interior_spaces: true)).to eq(unspacey_string)
  end
end
|
|
|
|
|
|
|
|
|
|
context "leading and trailing whitespaces" do
  # "test" wrapped in spaces, a tab and a newline.
  let(:padded) { " \t test \n " }
  let(:trimmed) { "test" }

  it "ignores leading and trailing whitespaces by default" do
    cleaned = TextCleaner.clean(padded)

    expect(cleaned).to eq(padded)
  end

  it "strips leading and trailing whitespaces when enabled" do
    cleaned = TextCleaner.clean(padded, strip_whitespaces: true)

    expect(cleaned).to eq(trimmed)
  end
end
|
|
|
|
|
|
|
|
|
|
# Examples for TextCleaner.clean_title. The first three examples run with
# whatever site settings are in effect; the nested context turns on
# SiteSetting.title_prettify before each example.
context "title" do
  # The input must contain a run of spaces, otherwise this example cannot
  # tell whether interior spaces were actually collapsed.
  it "fixes interior spaces" do
    expect(TextCleaner.clean_title("Hello     there")).to eq("Hello there")
  end

  it "strips leading and trailing whitespaces" do
    expect(TextCleaner.clean_title(" \t Hello there \n ")).to eq("Hello there")
  end

  # U+200B (zero width space) is written as an escape so the invisible
  # characters are visible in review and cannot be lost in copy/paste.
  # The cleaned title is the 11 visible characters only.
  it "strips zero width spaces" do
    title_with_zwsp = "Hello\u200B \u200Bthere"

    expect(TextCleaner.clean_title(title_with_zwsp)).to eq("Hello there")
    expect(TextCleaner.clean_title(title_with_zwsp).length).to eq(11)
  end

  context "title_prettify site setting is enabled" do
    before { SiteSetting.title_prettify = true }

    it "deduplicates !" do
      expect(TextCleaner.clean_title("Hello there!!!!")).to eq("Hello there!")
    end

    it "deduplicates ?" do
      expect(TextCleaner.clean_title("Hello there????")).to eq("Hello there?")
    end

    it "replaces all upper case text with regular case letters" do
      expect(TextCleaner.clean_title("HELLO THERE")).to eq("Hello there")
    end

    # allow_uppercase_posts overrides the upper-case replacement above.
    it "doesn't replace all upper case text when uppercase posts are allowed" do
      SiteSetting.allow_uppercase_posts = true
      expect(TextCleaner.clean_title("HELLO THERE")).to eq("HELLO THERE")
    end

    it "capitalizes first letter" do
      expect(TextCleaner.clean_title("hello there")).to eq("Hello there")
    end

    it "removes unnecessary period at the end" do
      expect(TextCleaner.clean_title("Hello there.")).to eq("Hello there")
    end

    # Checks both values of title_remove_extraneous_space in one example:
    # on, the space before "?" is removed; off, the title is left as is.
    it "removes extraneous space before the end punctuation" do
      SiteSetting.title_remove_extraneous_space = true
      expect(TextCleaner.clean_title("Hello there ?")).to eq("Hello there?")

      SiteSetting.title_remove_extraneous_space = false
      expect(TextCleaner.clean_title("Hello there ?")).to eq("Hello there ?")
    end

    it "replaces all upper case unicode text with regular unicode case letters" do
      expect(TextCleaner.clean_title("INVESTIGAÇÃO POLÍTICA NA CÂMARA")).to eq("Investigação política na câmara")
    end

    # Only "EVE" is upper case here; the rest of the title is mixed-case Cyrillic.
    it "doesn't downcase text if only one word is upcase in a non-ascii alphabet" do
      expect(TextCleaner.clean_title("«Эта неделя в EVE»")).to eq("«Эта неделя в EVE»")
    end

    it "capitalizes first unicode letter" do
      expect(TextCleaner.clean_title("épico encontro")).to eq("Épico encontro")
    end

    # Same inputs before and after switching the default locale to Turkish:
    # afterwards "I" downcases to dotless "ı" and "i" upcases to dotted "İ".
    it "correctly cleans Turkish characters" do
      expect(TextCleaner.clean_title("GIDA")).to eq("Gida")
      expect(TextCleaner.clean_title("istanbul")).to eq("Istanbul")

      SiteSetting.default_locale = "tr_TR"

      expect(TextCleaner.clean_title("GIDA")).to eq("Gıda")
      expect(TextCleaner.clean_title("istanbul")).to eq("İstanbul")
    end
  end
end
|
|
|
|
|
|
2014-08-11 18:01:58 -04:00
|
|
|
|
describe "#normalize_whitespaces" do
  it "normalize whitespaces" do
    # A string built only from Unicode space and separator code points.
    whitespaces = "\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"

    # Sanity check: String#strip alone does not empty this string.
    expect(whitespaces.strip).not_to eq("")

    # After normalization, strip is able to remove everything.
    normalized = TextCleaner.normalize_whitespaces(whitespaces)
    expect(normalized.strip).to eq("")

    # nil goes in, nil comes out.
    expect(TextCleaner.normalize_whitespaces(nil)).to be_nil
  end

  it "does not muck with zero width white space" do
    # Zero-width characters are used when writing Khmer, so they must
    # pass through normalization untouched.
    ["hello\u200Bworld", "hello\uFEFFworld"].each do |sample|
      expect(TextCleaner.normalize_whitespaces(sample).strip).to eq(sample)
    end
  end
end
|
|
|
|
|
|
2019-02-25 18:12:34 -05:00
|
|
|
|
context "invalid byte sequence" do
  # Text that ends with a stray \x81 byte, and the same text without it.
  let(:dirty) { "abc\u3042\x81" }
  let(:scrubbed) { "abc\u3042" }

  it "removes invalid bytes" do
    cleaned = TextCleaner.clean(dirty)

    expect(cleaned).to eq(scrubbed)
  end
end
|
2013-04-10 05:00:50 -04:00
|
|
|
|
end
|