Use rchardet instead of charlock_holmes gem

This commit is contained in:
Gerhard Schlager 2018-07-31 12:02:18 +02:00
parent 5d421fb946
commit a115aae45f
5 changed files with 20 additions and 20 deletions

View File

@ -180,7 +180,7 @@ gem 'rqrcode'
gem 'sshkey', require: false
gem 'charlock_holmes', require: false
gem 'rchardet', require: false
if ENV["IMPORT"] == "1"
gem 'mysql2'

View File

@ -75,7 +75,6 @@ GEM
uniform_notifier (~> 1.11.0)
byebug (10.0.2)
certified (1.0.0)
charlock_holmes (0.7.6)
chunky_png (1.3.10)
claide (1.0.2)
claide-plugins (0.9.2)
@ -321,6 +320,7 @@ GEM
ffi (>= 1.0.6)
msgpack (>= 0.4.3)
trollop (>= 1.16.2)
rchardet (1.8.0)
redis (4.0.1)
redis-namespace (1.6.0)
redis (>= 3.0.4)
@ -457,7 +457,6 @@ DEPENDENCIES
bullet
byebug
certified
charlock_holmes
cppjieba_rb
danger
discourse_image_optim
@ -523,6 +522,7 @@ DEPENDENCIES
rb-fsevent
rb-inotify (~> 0.9)
rbtrace
rchardet
redis
redis-namespace
rinku

View File

@ -90,7 +90,7 @@ module Jobs
def parsed_feed
raw_feed, encoding = fetch_rss
encoded_feed = Encodings.try_utf8(raw_feed, encoding) if encoding
encoded_feed = Encodings.to_utf8(raw_feed, encoding_hint: encoding) unless encoded_feed
encoded_feed = Encodings.to_utf8(raw_feed) unless encoded_feed
return nil if encoded_feed.blank?

View File

@ -1,20 +1,12 @@
require 'charlock_holmes'
require 'rchardet'
module Encodings
BINARY_SCAN_LENGTH = 0
def self.to_utf8(string)
result = CharDet.detect(string)
def self.to_utf8(string, encoding_hint: nil, delete_bom: true)
detector = CharlockHolmes::EncodingDetector.new(BINARY_SCAN_LENGTH)
result = detector.detect(string, encoding_hint&.to_s)
if result && result[:encoding]
string = CharlockHolmes::Converter.convert(string, result[:encoding], Encoding::UTF_8.name)
else
string = string.encode(Encoding::UTF_8, undef: :replace, invalid: :replace, replace: '')
end
delete_bom!(string) if delete_bom
string
encoded_string = try_utf8(string, result['encoding']) if result && result['encoding']
encoded_string = force_utf8(string) if encoded_string.nil?
encoded_string
end
def self.try_utf8(string, source_encoding)
@ -26,6 +18,14 @@ module Encodings
nil
end
# Converts +string+ to UTF-8 in a lossy way: characters that are undefined in
# the target encoding, and byte sequences that are invalid in the source
# encoding, are replaced with an empty string (i.e. dropped).
#
# @param string [String] the text to convert; String#encode returns a copy
# @return the value returned by delete_bom! for the converted copy
def self.force_utf8(string)
# The empty `replace:` value removes offending characters instead of
# substituting a placeholder character.
encoded_string = string.encode(Encoding::UTF_8,
undef: :replace,
invalid: :replace,
replace: '')
# delete_bom! is the last expression, so its return value is this method's result.
delete_bom!(encoded_string)
end
def self.delete_bom!(string)
string.sub!(/\A\xEF\xBB\xBF/, '') unless string.blank?
string

View File

@ -1,9 +1,9 @@
require 'rails_helper'
describe Encodings do
def to_utf8(filename, encoding_hint = nil)
def to_utf8(filename)
string = File.read("#{Rails.root}/spec/fixtures/encodings/#{filename}").chomp
Encodings.to_utf8(string, encoding_hint: encoding_hint)
Encodings.to_utf8(string)
end
context "unicode" do