Commit e17bd11d authored by Byron Jones's avatar Byron Jones

Bug 633776: Automatic charset detection for text attachments

r=mkanat, a=mkanat
parent fd4f9fad
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -291,6 +291,19 @@ sub OPTIONAL_MODULES { ...@@ -291,6 +291,19 @@ sub OPTIONAL_MODULES {
version => 0, version => 0,
feature => ['html_desc'], feature => ['html_desc'],
}, },
{
# we need version 2.21 of Encode for mime_name
package => 'Encode',
module => 'Encode',
version => 2.21,
feature => ['detect_charset'],
},
{
package => 'Encode-Detect',
module => 'Encode::Detect',
version => 0,
feature => ['detect_charset'],
},
# Inbound Email # Inbound Email
{ {
......
...@@ -43,7 +43,8 @@ use base qw(Exporter); ...@@ -43,7 +43,8 @@ use base qw(Exporter);
file_mod_time is_7bit_clean file_mod_time is_7bit_clean
bz_crypt generate_random_password bz_crypt generate_random_password
validate_email_syntax clean_text validate_email_syntax clean_text
get_text template_var disable_utf8); get_text template_var disable_utf8
detect_encoding);
use Bugzilla::Constants; use Bugzilla::Constants;
...@@ -58,6 +59,8 @@ use Math::Random::Secure qw(irand); ...@@ -58,6 +59,8 @@ use Math::Random::Secure qw(irand);
use Scalar::Util qw(tainted blessed); use Scalar::Util qw(tainted blessed);
use Template::Filters; use Template::Filters;
use Text::Wrap; use Text::Wrap;
use Encode qw(encode decode resolve_alias);
use Encode::Guess;
sub trick_taint { sub trick_taint {
require Carp; require Carp;
...@@ -673,6 +676,63 @@ sub disable_utf8 { ...@@ -673,6 +676,63 @@ sub disable_utf8 {
} }
} }
use constant UTF8_ACCIDENTAL => qw(shiftjis big5-eten euc-kr euc-jp);
# Guesses the character encoding of a string of raw bytes.
#
# Params:  $data - the data whose encoding should be detected.
# Returns: the canonical (Encode) name of the detected encoding, or undef
#          when no encoding could be determined.
# Throws:  a 'feature_disabled' code error when the optional
#          'detect_charset' feature is not available.
sub detect_encoding {
    my $data = shift;

    if (!Bugzilla->feature('detect_charset')) {
        require Bugzilla::Error;
        Bugzilla::Error::ThrowCodeError('feature_disabled',
            { feature => 'detect_charset' });
    }

    # There is nothing to detect in missing or empty data, and passing
    # undef on to the detector would only produce warnings.
    return undef if !defined $data || $data eq '';

    # Encode::Detect is an optional module, so it is loaded only once we
    # know the feature is enabled. The import is written as an explicit
    # method call rather than with indirect-object syntax.
    require Encode::Detect::Detector;
    Encode::Detect::Detector->import('detect');

    my $encoding = detect($data);
    $encoding = resolve_alias($encoding) if $encoding;

    # Encode::Detect is bad at detecting certain charsets, but Encode::Guess
    # is better at them. Here's the details:

    # shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect
    # tends to accidentally mis-detect UTF-8 strings as being
    # these encodings.)
    if ($encoding && grep($_ eq $encoding, UTF8_ACCIDENTAL)) {
        # Discard the suspicious answer; keep a result only if
        # Encode::Guess settles on a single encoding object.
        $encoding = undef;
        my $decoder = guess_encoding($data, UTF8_ACCIDENTAL);
        $encoding = $decoder->name if ref $decoder;
    }

    # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
    # but Encode::Guess can usually tell which one it is.
    if ($encoding && $encoding eq 'iso-8859-8') {
        my $decoded_as = _guess_iso($data, 'iso-8859-8',
            # These are ordered this way because it gives the most
            # accurate results.
            qw(iso-8859-7 iso-8859-2));
        $encoding = $decoded_as if $decoded_as;
    }

    return $encoding;
}
# A helper for detect_encoding.
#
# Tries each candidate encoding in @isos, in the order given, against
# $versus and returns the name of the first unambiguous guess.
#
# Params:  $data   - the data to examine.
#          $versus - the encoding that the detector originally reported.
#          @isos   - one or more alternative encodings to try.
# Returns: the name of the guessed encoding, or undef if no candidate
#          produced an unambiguous guess.
sub _guess_iso {
    # Unpack the whole argument list. Taking the arguments with three
    # shifts left every candidate after the first one unread.
    my ($data, $versus, @isos) = @_;

    my $encoding;
    foreach my $iso (@isos) {
        my $decoder = guess_encoding($data, $iso, $versus);
        # guess_encoding hands back an encoding object on success and a
        # plain (non-reference) message otherwise.
        if (ref $decoder) {
            $encoding = $decoder->name;
            last;
        }
    }
    return $encoding;
}
1; 1;
__END__ __END__
...@@ -903,6 +963,12 @@ ASCII 10 (LineFeed) and ASCII 13 (Carriage Return). ...@@ -903,6 +963,12 @@ ASCII 10 (LineFeed) and ASCII 13 (Carriage Return).
Disable utf8 on STDOUT (and display raw data instead). Disable utf8 on STDOUT (and display raw data instead).
=item C<detect_encoding($str)>
Guesses the character encoding of the given data and returns the canonical
name of the detected encoding (which may be different from the MIME charset
name), or C<undef> if no encoding could be detected.
=item C<clean_text($str)> =item C<clean_text($str)>
Returns the parameter "cleaned" by exchanging non-printable characters with spaces. Returns the parameter "cleaned" by exchanging non-printable characters with spaces.
Specifically characters (ASCII 0 through 31) and (ASCII 127) will become ASCII 32 (Space). Specifically characters (ASCII 0 through 31) and (ASCII 127) will become ASCII 32 (Space).
......
...@@ -53,7 +53,7 @@ use Bugzilla::Attachment::PatchReader; ...@@ -53,7 +53,7 @@ use Bugzilla::Attachment::PatchReader;
use Bugzilla::Token; use Bugzilla::Token;
use Bugzilla::Keyword; use Bugzilla::Keyword;
use Encode qw(encode); use Encode qw(encode find_encoding);
# For most scripts we don't make $cgi and $template global variables. But # For most scripts we don't make $cgi and $template global variables. But
# when preparing Bugzilla for mod_perl, this script used these # when preparing Bugzilla for mod_perl, this script used these
...@@ -335,6 +335,12 @@ sub view { ...@@ -335,6 +335,12 @@ sub view {
# In order to prevent Apache from adding a charset, we have to send a # In order to prevent Apache from adding a charset, we have to send a
# charset that's a single space. # charset that's a single space.
$cgi->charset(' '); $cgi->charset(' ');
if (Bugzilla->feature('detect_charset') && $contenttype =~ /^text\//) {
my $encoding = detect_encoding($attachment->data);
if ($encoding) {
$cgi->charset(find_encoding($encoding)->mime_name);
}
}
} }
print $cgi->header(-type=>"$contenttype; name=\"$filename\"", print $cgi->header(-type=>"$contenttype; name=\"$filename\"",
-content_disposition=> "$disposition; filename=\"$filename\"", -content_disposition=> "$disposition; filename=\"$filename\"",
......
...@@ -24,10 +24,10 @@ use lib qw(. lib); ...@@ -24,10 +24,10 @@ use lib qw(. lib);
use Bugzilla; use Bugzilla;
use Bugzilla::Constants; use Bugzilla::Constants;
use Bugzilla::Util qw(detect_encoding);
use Digest::MD5 qw(md5_base64); use Digest::MD5 qw(md5_base64);
use Encode qw(encode decode resolve_alias is_utf8); use Encode qw(encode decode resolve_alias is_utf8);
use Encode::Guess;
use Getopt::Long; use Getopt::Long;
use Pod::Usage; use Pod::Usage;
...@@ -71,53 +71,6 @@ sub trunc { ...@@ -71,53 +71,6 @@ sub trunc {
return $truncated; return $truncated;
} }
# Detects the encoding of $data with Encode::Detect and double-checks the
# answers it is known to get wrong using Encode::Guess. Returns the name
# of the encoding, or a false value if none could be determined.
sub do_guess {
    my $bytes = shift;

    my $charset = detect($bytes);
    if ($charset) {
        $charset = resolve_alias($charset);
    }

    # Encode::Detect tends to accidentally report UTF-8 strings as one of
    # these encodings; Encode::Guess is better at telling them apart.
    my @suspects = qw(shiftjis big5-eten euc-kr euc-jp);
    if ($charset) {
        my $is_suspect = grep { $_ eq $charset } @suspects;
        if ($is_suspect) {
            # Keep a result only when the guess is an encoding object.
            my $guess = guess_encoding($bytes, @suspects);
            $charset = ref($guess) ? $guess->name : undef;
        }
    }

    # Various ISO encodings are sometimes reported as iso-8859-8, but
    # Encode::Guess can usually tell which one it really is.
    if ($charset && $charset eq 'iso-8859-8') {
        # The candidates are listed in the order that gives the most
        # accurate results.
        my $better = guess_iso($bytes, 'iso-8859-8',
                               'iso-8859-7', 'iso-8859-2');
        $charset = $better if $better;
    }

    return $charset;
}
# A helper for do_guess.
#
# Walks the candidate encodings in order, pitting each one against
# $versus, and reports the name of the first guess that comes back as an
# encoding object. Yields undef when no candidate gives such a guess.
sub guess_iso {
    my ($bytes, $versus, @candidates) = @_;

    my $winner;
    CANDIDATE:
    for my $candidate (@candidates) {
        my $guess = guess_encoding($bytes, $candidate, $versus);
        # Anything that is not a reference is not a usable guess.
        next CANDIDATE unless ref $guess;
        $winner = $guess->name;
        last CANDIDATE;
    }

    return $winner;
}
sub is_valid_utf8 { sub is_valid_utf8 {
my ($str) = @_; my ($str) = @_;
Encode::_utf8_on($str); Encode::_utf8_on($str);
...@@ -143,8 +96,6 @@ if (exists $switch{'charset'}) { ...@@ -143,8 +96,6 @@ if (exists $switch{'charset'}) {
} }
if ($switch{'guess'}) { if ($switch{'guess'}) {
# Encode::Detect::Detector doesn't seem to return a true value.
# So we have to check if we can run detect.
if (!eval { require Encode::Detect::Detector }) { if (!eval { require Encode::Detect::Detector }) {
my $root = ROOT_USER; my $root = ROOT_USER;
print STDERR <<EOT; print STDERR <<EOT;
...@@ -156,8 +107,6 @@ Encode::Detect, run the following command: ...@@ -156,8 +107,6 @@ Encode::Detect, run the following command:
EOT EOT
exit; exit;
} }
import Encode::Detect::Detector qw(detect);
} }
my %overrides; my %overrides;
...@@ -255,7 +204,7 @@ foreach my $table ($dbh->bz_table_list_real) { ...@@ -255,7 +204,7 @@ foreach my $table ($dbh->bz_table_list_real) {
my $encoding; my $encoding;
if ($switch{'guess'}) { if ($switch{'guess'}) {
$encoding = do_guess($data); $encoding = detect_encoding($data);
# We only show failures if they don't appear to be # We only show failures if they don't appear to be
# ASCII. # ASCII.
......
File mode changed from 100644 to 100755
...@@ -108,6 +108,7 @@ END ...@@ -108,6 +108,7 @@ END
feature_smtp_auth => 'SMTP Authentication', feature_smtp_auth => 'SMTP Authentication',
feature_updates => 'Automatic Update Notifications', feature_updates => 'Automatic Update Notifications',
feature_xmlrpc => 'XML-RPC Interface', feature_xmlrpc => 'XML-RPC Interface',
feature_detect_charset => 'Automatic charset detection for text attachments',
file_remove => 'Removing ##name##...', file_remove => 'Removing ##name##...',
file_rename => 'Renaming ##from## to ##to##...', file_rename => 'Renaming ##from## to ##to##...',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment