#!/usr/bin/perl
## -*- Encoding:utf-8; Mode:perl; -*-
# kate: space-indent on; indent-width 4; replace-tabs on;
#

package SO::Mail::Message::CharEnt;

use strict;
use SO::Mail::Message::CharMaps;
use SO::Mail::Message::Char;

##---------------------------------------------------------------------------
##      Charset specification to mapping
##---------------------------------------------------------------------------
##  NOTE: The mapping uses a single name for a charset.
##	  The CHARSETALIASES resource can be used to map aka names (aliases)
##	  to the names used here.
##  NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
##	  to &#xHHHH; entity references.
##  NOTE: iso-2022-{jp,kr} are translated to euc-{jp,kr} first before
##	  conversion.

my %CharsetMaps = (
    'iso-8859-1'     =>	'Message/CharEnt/ISO8859_1.pm',
    'iso-8859-2'     =>	'Message/CharEnt/ISO8859_2.pm',
    'iso-8859-3'     =>	'Message/CharEnt/ISO8859_3.pm',
    'iso-8859-4'     =>	'Message/CharEnt/ISO8859_4.pm',
    'iso-8859-5'     =>	'Message/CharEnt/ISO8859_5.pm',
    'iso-8859-6'     =>	'Message/CharEnt/ISO8859_6.pm',
    'iso-8859-7'     =>	'Message/CharEnt/ISO8859_7.pm',
    'iso-8859-8'     =>	'Message/CharEnt/ISO8859_8.pm',
    'iso-8859-9'     =>	'Message/CharEnt/ISO8859_9.pm',
    'iso-8859-10'    =>	'Message/CharEnt/ISO8859_10.pm',
    'iso-8859-11'    =>	'Message/CharEnt/ISO8859_11.pm',
    'iso-8859-13'    =>	'Message/CharEnt/ISO8859_13.pm',
    'iso-8859-14'    =>	'Message/CharEnt/ISO8859_14.pm',
    'iso-8859-15'    =>	'Message/CharEnt/ISO8859_15.pm',
    'iso-8859-16'    =>	'Message/CharEnt/ISO8859_16.pm',
    'cp866'	     =>	'Message/CharEnt/CP866.pm',
    'cp949'	     =>	'Message/CharEnt/CP949.pm', # euc-kr
    'cp932'	     =>	'Message/CharEnt/CP932.pm', # shiftjis
    'cp936'	     =>	'Message/CharEnt/CP936.pm', # GBK
    'cp950'	     =>	'Message/CharEnt/CP950.pm',
    'cp1250'	     =>	'Message/CharEnt/CP1250.pm',
    'cp1251'	     =>	'Message/CharEnt/CP1251.pm',
    'cp1252'	     =>	'Message/CharEnt/CP1252.pm',
    'cp1253'	     =>	'Message/CharEnt/CP1253.pm',
    'cp1254'	     =>	'Message/CharEnt/CP1254.pm',
    'cp1255'	     =>	'Message/CharEnt/CP1255.pm',
    'cp1256'	     =>	'Message/CharEnt/CP1256.pm',
    'cp1257'	     =>	'Message/CharEnt/CP1257.pm',
    'cp1258'	     =>	'Message/CharEnt/CP1258.pm',
    'koi-0'	     =>	'Message/CharEnt/KOI_0.pm',
    'koi-7'	     =>	'Message/CharEnt/KOI_7.pm',
    'koi8-a'	     =>	'Message/CharEnt/KOI8_A.pm',
    'koi8-b'	     =>	'Message/CharEnt/KOI8_B.pm',
    'koi8-e'	     =>	'Message/CharEnt/KOI8_E.pm',
    'koi8-f'	     =>	'Message/CharEnt/KOI8_F.pm',
    'koi8-r'	     =>	'Message/CharEnt/KOI8_R.pm',
    'koi8-u'	     =>	'Message/CharEnt/KOI8_U.pm',
    'gost19768-87'   =>	'Message/CharEnt/GOST19768_87.pm',
    'viscii'	     =>	'Message/CharEnt/VISCII.pm',
    'macarabic'	     =>	'Message/CharEnt/AppleArabic.pm',
    'maccentraleurroman' => 'Message/CharEnt/AppleCenteuro.pm',
    'maccroatian'    =>	'Message/CharEnt/AppleCroatian.pm',
    'maccyrillic'    =>	'Message/CharEnt/AppleCyrillic.pm',
    'macgreek'	     =>	'Message/CharEnt/AppleGreek.pm',
    'machebrew'	     =>	'Message/CharEnt/AppleHebrew.pm',
    'macicelandic'   =>	'Message/CharEnt/AppleIceland.pm',
    'macromanian'    =>	'Message/CharEnt/AppleRomanian.pm',
    'macroman'	     =>	'Message/CharEnt/AppleRoman.pm',
    'macthai'	     =>	'Message/CharEnt/AppleThai.pm',
    'macturkish'     =>	'Message/CharEnt/AppleTurkish.pm',
    'big5-eten'      =>	'Message/CharEnt/BIG5_ETEN.pm',
    'big5-hkscs'     =>	'Message/CharEnt/BIG5_HKSCS.pm',
    'gb2312'         =>	'Message/CharEnt/GB2312.pm',
    'euc-jp'         =>	'Message/CharEnt/EUC_JP.pm',
    'hp-roman8'      =>	'Message/CharEnt/HP_ROMAN8.pm',
);

my $char_maps = SO::Mail::Message::CharMaps->new(\%CharsetMaps);

###############################################################################
##	Routines
###############################################################################

sub str2sgml {
    my $data 	 =    shift;
    my $charset  = lc shift;

    my $data_r  = ref($data) ? $data : \$data;
    $charset =~ tr/_/-/;
    # UTF-8 can be converted algorithmically.
    if ($charset eq 'us-ascii') { # If us-ascii, use simple s/// operation.
        $$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
        return $$data_r;
    }
    if ($charset eq 'utf-8') {
        _utf8_to_sgml($data_r);
        return $$data_r;
    }
    SO::Mail::Message::Char::map_conv($data_r, $charset, $char_maps, \%HTMLSpecials);
}

##---------------------------------------------------------------------------##
##  Private Routines.

# Array of masks for lead byte in UTF-8 (for Perl <5.6)
# This could be computed on-the-fly, but using an array is faster
my @utf8_lb_mask = (
    0x3F, 0x1F, 0xF, 0x7, 0x3, 0x1  # 1, 2, 3, 4, 5, 6 bytes, respectively
);
# Regex pattern for UTF-8 data
my $utf8_re = q/([\x00-\x7F]|
		 [\xC0-\xDF][\x80-\xBF]|
		  \xE0      [\xA0-\xBF][\x80-\xBF]|
		 [\xE1-\xEF][\x80-\xBF]{2}|
		  \xF0      [\x90-\xBF][\x80-\xBF]{2}|
		 [\xF1-\xF7][\x80-\xBF]{3}|
		  \xF8      [\x88-\xBF][\x80-\xBF]{3}|
		 [\xF9-\xFB][\x80-\xBF]{4}|
		  \xFC      [\x84-\xBF][\x80-\xBF]{4}|
		  \xFD      [\x80-\xBF]{5}|
		 .)/;
# A lax regex for UTF-8 data.  Used for utf-8-aware perl since perl
# will validate sequences
my $utf8_re_lax =
	      q/([\x00-\x7F]|
		 [\xC0-\xDF][\x00-\xFF]|
		 [\xE0-\xEF][\x00-\xFF]{2}|
		 [\xF0-\xF7][\x00-\xFF]{3}|
		 [\xF8-\xFB][\x00-\xFF]{4}|
		 [\xFC-\xFD][\x00-\xFF]{5}|
		 .)/;

sub _utf8_to_sgml {
    my $data_r = shift;

    if ($] >= 5.006) {
	# UTF-8-aware perl
	# Have to enable warnings to get stricter utf-8 checks for Perl 5.8
	use warnings;
	my($char, $ord, $malformed);

	# Define local warn handle to suppress malformed utf-8 warning
	# messages and to flag when such occurrences happen.
	my $cur_sig_warn = $SIG{__WARN__};
	local $SIG{__WARN__} = sub {
	    $malformed = 1;
	    #warn @_;
	    # invoke current warn handler, if defined
	    &$cur_sig_warn  if defined($cur_sig_warn) && ref($cur_sig_warn);
	};
	$$data_r =~ s{
	    $utf8_re_lax
	}{
            # Bug #26577: Perl 5.10 changed unpack behavior
	    $char = ($] >= 5.010)? unpack('C0U*',$1): unpack('U0U*',$1);
	    if ($malformed ||
		  (($char & 0xFFFE) == 0xFFFE) ||
		  (($char & 0xFFFF) == 0xFFFF) ||
		  ($char >= 0xFDD0 && $char <= 0xFDEF) ||
		  ($char >= 0xD800 && $char <= 0xDFFF)
	       ) {
		# Some of the if() checks may be handled by perl directly,
		# but such checks can be disabled when perl is built.
		$malformed = 0;
		'&#xFFFD;';
	    } else {
		($char <= 0x7F)
			? $HTMLSpecials{$1} || sprintf('%c',$char)
			: sprintf('&#x%X;',$char);
	    }
	}gxeso;

    } else {
	# non-UTF-8-aware perl
	my($i, $n, $char);
	$$data_r =~ s{
	    $utf8_re
	}{
	    if (($n = length($1)) == 1) {
		my $ord = ord($1);
		if ($ord > 0x7F) {
		    # Malformed sequence
		    '&#xFFFD;';
		} else {
		    # 7-bit ASCII
		    $HTMLSpecials{$1} || $1;
		}
	    } else {
		# Multi-byte sequence
		$char = (unpack('C',substr($1,0,1)) &
			 $utf8_lb_mask[$n-1]) << ($n-1)*6;
		for ($i=1; $i < $n; ++$i) {
		    $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
			     (($n-$i-1)*6));
		}
		if ($char <= 0x7F    ||	# should only be single byte sequence
		    (($char & 0xFFFE) == 0xFFFE) ||	    # not a char
		    (($char & 0xFFFF) == 0xFFFF) ||	    # not a char
		    ($char >= 0xFDD0 && $char <= 0xFDEF) || # not a char
		    ($char >= 0xD800 && $char <= 0xDFFF)    # surrogates
		   ) {
		    '&#xFFFD;';
		} else {
		    sprintf('&#x%X;',$char);
		}
	    }
       }gxseo;
    }
}

1;
__END__
