discourse/lib/text_cleaner.rb

# frozen_string_literal: true

#
# Clean up a text
#

# We use ActiveSupport mb_chars from here to properly support non ascii downcase
require 'active_support/core_ext/string/multibyte'

# Cleans up user-supplied text such as topic titles: collapses repeated
# punctuation, fixes casing, drops trailing periods and tidies whitespace.
#
# Scrubbing of invalid bytes and normalization of Unicode whitespace always
# run; every other transformation is opt-in through the options hash given
# to .clean.
class TextCleaner

  # Builds the options used for topic titles from the current site settings.
  #
  # Returns a Hash suitable as the second argument of .clean.
  def self.title_options
    # cf. http://meta.discourse.org/t/should-we-have-auto-replace-rules-in-titles/5687
    {
      deduplicate_exclamation_marks: SiteSetting.title_prettify,
      deduplicate_question_marks: SiteSetting.title_prettify,
      replace_all_upper_case: SiteSetting.title_prettify && !SiteSetting.allow_uppercase_posts,
      capitalize_first_letter: SiteSetting.title_prettify,
      remove_all_periods_from_the_end: SiteSetting.title_prettify,
      remove_extraneous_space: SiteSetting.title_prettify && SiteSetting.title_remove_extraneous_space,
      fixes_interior_spaces: true,
      strip_whitespaces: true,
      strip_zero_width_spaces: true,
      # Turkish needs locale-specific case mapping for the dotted/dotless "i"
      case_option: SiteSetting.default_locale == "tr_TR" ? :turkic : nil
    }
  end

  # Cleans a topic title using .title_options.
  def self.clean_title(title)
    TextCleaner.clean(title, TextCleaner.title_options)
  end

  # Applies the clean-up steps enabled in +opts+ to a copy of +text+.
  #
  # text - the String to clean; it is duplicated first and never mutated.
  # opts - Hash of flags (all optional, falsy when absent):
  #        :deduplicate_exclamation_marks   - "!!!" becomes "!"
  #        :deduplicate_question_marks      - "???" becomes "?"
  #        :replace_all_upper_case          - downcase text that is entirely upper case
  #        :capitalize_first_letter         - capitalize an all-lowercase first word
  #        :remove_all_periods_from_the_end - drop trailing periods
  #        :remove_extraneous_space         - drop whitespace before a final "!" or "?"
  #        :fixes_interior_spaces           - collapse runs of spaces into one
  #        :strip_whitespaces               - strip leading/trailing whitespace
  #        :strip_zero_width_spaces         - remove U+200B characters
  #        :case_option                     - option forwarded to the case
  #                                           conversion methods (e.g. :turkic)
  #
  # Returns the cleaned String.
  def self.clean(text, opts = {})
    text = text.dup

    # Remove invalid byte sequences
    text.scrub!("")

    # Replace !!!!! with a single !
    text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]

    # Replace ????? with a single ?
    text.gsub!(/\?+/, '?') if opts[:deduplicate_question_marks]

    # Replace all-caps text with regular case letters
    text = downcase(text.mb_chars, opts).to_s if opts[:replace_all_upper_case] && (text == upcase(text.mb_chars, opts))

    # Capitalize first letter, but only when entire first word is lowercase
    # (leaves special cases such as "iPhone" alone). The text is rebuilt from
    # the two parts returned by split, joined with a single space.
    first, rest = text.split(' ', 2)
    if first && opts[:capitalize_first_letter] && first == downcase(first.mb_chars, opts)
      text = +"#{capitalize(first.mb_chars, opts)}#{rest ? ' ' + rest : ''}"
    end

    # Remove unnecessary periods at the end
    text.sub!(/([^.])\.+(\s*)\z/, '\1\2') if opts[:remove_all_periods_from_the_end]

    # Remove extraneous space before the end punctuation
    text.sub!(/\s+([!?]\s*)\z/, '\1') if opts[:remove_extraneous_space]

    # The whitespace steps below are order-sensitive: characters must be
    # converted or removed *before* runs of spaces are collapsed and the ends
    # are stripped, otherwise the conversion/removal re-introduces doubled or
    # leading/trailing spaces (e.g. "a \u00A0b" or "\u200b hello").

    # Normalize whitespaces (turn Unicode space characters into plain spaces)
    text = normalize_whitespaces(text)

    # Strip zero width spaces
    text.gsub!(/\u200b/, '') if opts[:strip_zero_width_spaces]

    # Fixes interior spaces
    text.gsub!(/ +/, ' ') if opts[:fixes_interior_spaces]

    # Strip whitespaces
    text.strip! if opts[:strip_whitespaces]

    text
  end

  # Unicode space and line/paragraph separator characters that are replaced
  # by a plain ASCII space in .normalize_whitespaces.
  @@whitespaces_regexp = Regexp.new("(\u00A0|\u1680|\u180E|[\u2000-\u200A]|\u2028|\u2029|\u202F|\u205F|\u3000)", Regexp::IGNORECASE).freeze

  # Replaces every character matched by the whitespace regexp with a plain
  # space. Returns a new String, or nil when +text+ is nil.
  def self.normalize_whitespaces(text)
    text&.gsub(@@whitespaces_regexp, ' ')
  end

  # Downcases +text+, forwarding opts[:case_option] when it is set.
  def self.downcase(text, opts)
    opts[:case_option] ? text.downcase(opts[:case_option]) : text.downcase
  end

  # Upcases +text+, forwarding opts[:case_option] when it is set.
  def self.upcase(text, opts)
    opts[:case_option] ? text.upcase(opts[:case_option]) : text.upcase
  end

  # Capitalizes +text+, forwarding opts[:case_option] when it is set.
  def self.capitalize(text, opts)
    opts[:case_option] ? text.capitalize(opts[:case_option]) : text.capitalize
  end
end
DEV: stop mutating inputs as a side effect We had quite a few cases in core where inputs are being mutated as a side effect of calling a method. This handles all the cases where specs caught this. Mutating inputs makes code harder to reason about. Eg: ``` frog = "frog" jump(frog) puts frog "fly" # ????? ``` This commit is part of a followup commit that adds # frozen_string_literal to all our specs. 2019-04-29 20:25:53 -04:00			`# frozen_string_literal: true`

auto replace rules in titles 2013-04-10 05:00:50 -04:00			`#`
			`# Clean up a text`
			`#`
FIX: Properly downcase unicode chars Fix to https://meta.discourse.org/t/title-prettify-does-not-correctly-lowercase-non-english-characters-when-removing-all-caps/16645 This adds a dependency on Active Support Multibyte to downcase on other languages. 2016-03-07 15:07:36 -05:00
DEV: Correct typos and spelling mistakes (#12812) Over the years we accrued many spelling mistakes in the code base. This PR attempts to fix spelling mistakes and typos in all areas of the code that are extremely safe to change - comments - test descriptions - other low risk areas 2021-05-20 21:43:47 -04:00			`# We use ActiveSupport mb_chars from here to properly support non ascii downcase`
FIX: Properly downcase unicode chars Fix to https://meta.discourse.org/t/title-prettify-does-not-correctly-lowercase-non-english-characters-when-removing-all-caps/16645 This adds a dependency on Active Support Multibyte to downcase on other languages. 2016-03-07 15:07:36 -05:00			`require 'active_support/core_ext/string/multibyte'`

auto replace rules in titles 2013-04-10 05:00:50 -04:00			`class TextCleaner`

			`def self.title_options`
			`# cf. http://meta.discourse.org/t/should-we-have-auto-replace-rules-in-titles/5687`
			`{`
			`deduplicate_exclamation_marks: SiteSetting.title_prettify,`
			`deduplicate_question_marks: SiteSetting.title_prettify,`
FIX: allow_uppercase_posts didn't work for topic titles 2018-10-08 17:50:06 -04:00			`replace_all_upper_case: SiteSetting.title_prettify && !SiteSetting.allow_uppercase_posts,`
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`capitalize_first_letter: SiteSetting.title_prettify,`
Text Cleaner now removes all periods from the end of the title 2013-04-17 18:19:42 -04:00			`remove_all_periods_from_the_end: SiteSetting.title_prettify,`
FEATURE: English locale with international date formats Makes en_US the new default locale 2019-05-15 17:43:00 -04:00			`remove_extraneous_space: SiteSetting.title_prettify && SiteSetting.title_remove_extraneous_space,`
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`fixes_interior_spaces: true,`
FIX: strip zero width spaces from topic title 2018-02-20 13:22:36 -05:00			`strip_whitespaces: true,`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00			`strip_zero_width_spaces: true,`
			`case_option: SiteSetting.default_locale == "tr_TR" ? :turkic : nil`
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`}`
			`end`

			`def self.clean_title(title)`
			`TextCleaner.clean(title, TextCleaner.title_options)`
			`end`

			`def self.clean(text, opts = {})`
DEV: stop mutating inputs as a side effect We had quite a few cases in core where inputs are being mutated as a side effect of calling a method. This handles all the cases where specs caught this. Mutating inputs makes code harder to reason about. Eg: ``` frog = "frog" jump(frog) puts frog "fly" # ????? ``` This commit is part of a followup commit that adds # frozen_string_literal to all our specs. 2019-04-29 20:25:53 -04:00			`text = text.dup`

FIX: strip invalid byte sequences 2019-02-25 18:12:34 -05:00			`# Remove invalid byte sequences`
			`text.scrub!("")`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`# Replace !!!!! with a single !`
			`text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`# Replace ????? with a single ?`
			`text.gsub!(/\?+/, '?') if opts[:deduplicate_question_marks]`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`# Replace all-caps text with regular case letters`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00			`text = downcase(text.mb_chars, opts).to_s if opts[:replace_all_upper_case] && (text == upcase(text.mb_chars, opts))`

Thread title fixer should ignore special cases like iLetter 2013-05-23 15:31:08 -04:00			`# Capitalize first letter, but only when entire first word is lowercase`
Rewrite capitalize for readability Makes capitalize logic more clear with @sam suggestion 2016-03-20 08:35:48 -04:00			`first, rest = text.split(' ', 2)`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00			`if first && opts[:capitalize_first_letter] && first == downcase(first.mb_chars, opts)`
			`text = +"#{capitalize(first.mb_chars, opts)}#{rest ? ' ' + rest : ''}"`
Rewrite capitalize for readability Makes capitalize logic more clear with @sam suggestion 2016-03-20 08:35:48 -04:00			`end`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
Text Cleaner now removes all periods from the end of the title 2013-04-17 18:19:42 -04:00			`# Remove unnecessary periods at the end`
			`text.sub!(/([^.])\.+(\s*)\z/, '\1\2') if opts[:remove_all_periods_from_the_end]`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`# Remove extraneous space before the end punctuation`
			`text.sub!(/\s+([!?]\s*)\z/, '\1') if opts[:remove_extraneous_space]`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`# Fixes interior spaces`
			`text.gsub!(/ +/, ' ') if opts[:fixes_interior_spaces]`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
FEATURE: normalize whitespaces in topic title/post content 2014-08-11 18:01:58 -04:00			`# Normalize whitespaces`
			`text = normalize_whitespaces(text)`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`# Strip whitespaces`
			`text.strip! if opts[:strip_whitespaces]`
FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00
FIX: strip zero width spaces from topic title 2018-02-20 13:22:36 -05:00			`# Strip zero width spaces`
			`text.gsub!(/\u200b/, '') if opts[:strip_zero_width_spaces]`
auto replace rules in titles 2013-04-10 05:00:50 -04:00
			`text`
			`end`

FIX: Use correct Regexp flag to ignore case (#19184) Ruby 3.2 started enforcing valid string flags in Regexp constructor. 2022-11-25 08:56:59 -05:00			`@@whitespaces_regexp = Regexp.new("(\u00A0\|\u1680\|\u180E\|[\u2000-\u200A]\|\u2028\|\u2029\|\u202F\|\u205F\|\u3000)", Regexp::IGNORECASE).freeze`
FEATURE: normalize whitespaces in topic title/post content 2014-08-11 18:01:58 -04:00
			`def self.normalize_whitespaces(text)`
FIX: Skip gsub for normalizing whitespaces when text is nil (#6631) 2018-11-20 03:12:32 -05:00			`text&.gsub(@@whitespaces_regexp, ' ')`
FEATURE: normalize whitespaces in topic title/post content 2014-08-11 18:01:58 -04:00			`end`

FEATURE: Correctly convert topic title to uppercase and lowercase for Turkish default locale (#13115) 2021-05-24 04:13:30 -04:00			`def self.downcase(text, opts)`
			`opts[:case_option] ? text.downcase(opts[:case_option]) : text.downcase`
			`end`

			`def self.upcase(text, opts)`
			`opts[:case_option] ? text.upcase(opts[:case_option]) : text.upcase`
			`end`

			`def self.capitalize(text, opts)`
			`opts[:case_option] ? text.capitalize(opts[:case_option]) : text.capitalize`
			`end`
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`end`