#!/usr/bin/perl
## -*- Encoding:utf-8; Mode:perl; -*-
# kate: space-indent on; indent-width 4; replace-tabs on;
#

package SO::Mail::Message::UTF8::MhaEncode;

use strict;
use SO::Mail::Message::CharMaps;
use SO::Mail::Message::Char;

my %CharsetMaps = (
    'iso-8859-1'     =>	'Message/UTF8/ISO8859_1.pm',
    'iso-8859-2'     =>	'Message/UTF8/ISO8859_2.pm',
    'iso-8859-3'     =>	'Message/UTF8/ISO8859_3.pm',
    'iso-8859-4'     =>	'Message/UTF8/ISO8859_4.pm',
    'iso-8859-5'     =>	'Message/UTF8/ISO8859_5.pm',
    'iso-8859-6'     =>	'Message/UTF8/ISO8859_6.pm',
    'iso-8859-7'     =>	'Message/UTF8/ISO8859_7.pm',
    'iso-8859-8'     =>	'Message/UTF8/ISO8859_8.pm',
    'iso-8859-9'     =>	'Message/UTF8/ISO8859_9.pm',
    'iso-8859-10'    =>	'Message/UTF8/ISO8859_10.pm',
    'iso-8859-11'    =>	'Message/UTF8/ISO8859_11.pm',
    'iso-8859-13'    =>	'Message/UTF8/ISO8859_13.pm',
    'iso-8859-14'    =>	'Message/UTF8/ISO8859_14.pm',
    'iso-8859-15'    =>	'Message/UTF8/ISO8859_15.pm',
    'iso-8859-16'    =>	'Message/UTF8/ISO8859_16.pm',
    'cp866'	     =>	'Message/UTF8/CP866.pm',
    'cp949'	     =>	'Message/UTF8/CP949.pm', # euc-kr
    'cp932'	     =>	'Message/UTF8/CP932.pm', # shiftjis
    'cp936'	     =>	'Message/UTF8/CP936.pm', # GBK
    'cp950'	     =>	'Message/UTF8/CP950.pm',
    'cp1250'	     =>	'Message/UTF8/CP1250.pm',
    'cp1251'	     =>	'Message/UTF8/CP1251.pm',
    'cp1252'	     =>	'Message/UTF8/CP1252.pm',
    'cp1253'	     =>	'Message/UTF8/CP1253.pm',
    'cp1254'	     =>	'Message/UTF8/CP1254.pm',
    'cp1255'	     =>	'Message/UTF8/CP1255.pm',
    'cp1256'	     =>	'Message/UTF8/CP1256.pm',
    'cp1257'	     =>	'Message/UTF8/CP1257.pm',
    'cp1258'	     =>	'Message/UTF8/CP1258.pm',
    'koi-0'	     =>	'Message/UTF8/KOI_0.pm',
    'koi-7'	     =>	'Message/UTF8/KOI_7.pm',
    'koi8-a'	     =>	'Message/UTF8/KOI8_A.pm',
    'koi8-b'	     =>	'Message/UTF8/KOI8_B.pm',
    'koi8-e'	     =>	'Message/UTF8/KOI8_E.pm',
    'koi8-f'	     =>	'Message/UTF8/KOI8_F.pm',
    'koi8-r'	     =>	'Message/UTF8/KOI8_R.pm',
    'koi8-u'	     =>	'Message/UTF8/KOI8_U.pm',
    'gost19768-87'   =>	'Message/UTF8/GOST19768_87.pm',
    'viscii'	     =>	'Message/UTF8/VISCII.pm',
    'macarabic'	     =>	'Message/UTF8/AppleArabic.pm',
    'maccentraleurroman' => 'Message/UTF8/AppleCenteuro.pm',
    'maccroatian'    =>	'Message/UTF8/AppleCroatian.pm',
    'maccyrillic'    =>	'Message/UTF8/AppleCyrillic.pm',
    'macgreek'	     =>	'Message/UTF8/AppleGreek.pm',
    'machebrew'	     =>	'Message/UTF8/AppleHebrew.pm',
    'macicelandic'   =>	'Message/UTF8/AppleIceland.pm',
    'macromanian'    =>	'Message/UTF8/AppleRomanian.pm',
    'macroman'	     =>	'Message/UTF8/AppleRoman.pm',
    'macthai'	     =>	'Message/UTF8/AppleThai.pm',
    'macturkish'     =>	'Message/UTF8/AppleTurkish.pm',
    'big5-eten'      =>	'Message/UTF8/BIG5_ETEN.pm',
    'big5-hkscs'     =>	'Message/UTF8/BIG5_HKSCS.pm',
    'gb2312'         =>	'Message/UTF8/GB2312.pm',
    'euc-jp'         =>	'Message/UTF8/EUC_JP.pm',
    'hp-roman8'      =>	'Message/UTF8/HP_ROMAN8.pm',
);

my $char_maps = SO::Mail::Message::CharMaps->new(\%CharsetMaps);

##---------------------------------------------------------------------------##

# We do not care for valid sequences, just that we catch everything
my $utf8_re = q/[\x00-\x7F]|
		[\xC0-\xDF][\x00-\xFF]|
		[\xE0-\xEF][\x00-\xFF]{2}|
		[\xF0-\xF7][\x00-\xFF]{3}|
		[\xF8-\xFB][\x00-\xFF]{4}|
		[\xFC\xFD][\x00-\xFF]{5}|
		[\x80-\xFF]/;

# Return the length of an utf-8 string
sub utf8_length {
    my $n = 0;
    while ($_[0] =~ m/($utf8_re)/gox) { ++$n; };
    $n;
}

##---------------------------------------------------------------------------##

## Version of TEXTCLIPFUNC for utf8 strings for versions of Perl without
## decent utf8 support (Perl <= 5.6.x).
sub clip {
    my $str      = shift;   # Unfortunately, it is much easier to make a copy
    my $len      = shift;   # Clip length
    my $is_html  = shift;   # If entity references should be considered
    my $has_tags = shift;   # If html tags should be stripped

    # If not HTML text, things are alot easier
    if (!$is_html) {
	# do nothing if we know for sure there is nothing to do
	return $str
	    if length($str) <= $len;

	# Get $len utf8 chars
	$str =~ m/^((?:$utf8_re){1,$len})/x;
	return $1;
    }

    $str =~ s/<[^>]*>//g  if $has_tags;
    return $str  if length($str) <= $len; # nothing to do

    my($utf8_len, $er_len);
    my $text = "";
    my $subtext = "";
    my $sub_len = $len;
    my $real_len = 0;
    
    while ($str ne '') {
	if (!($str =~ s/^((?:$utf8_re){1,$sub_len})//x)) {
	    # pattern should always match, but just in-case...
	    warn qq/Warning: SO::Mail::Message::UTF8::MhaEncode::clip:/,
			 qq/ Internal error/;
	    return $text . $str;
	}
	$subtext = $1;

	# check for clipped entity reference
	if (($str ne '') && ($subtext =~ /\&[^;]*\Z/)) {
	    if ($str =~ s/^([^;]*;)//) {
		$subtext .= $1;
	    } else {
		warn qq/Warning: SO::Mail::Message::UTF8::MhaEncode::clip: malformed/,
			     qq/ entity reference detected\n/;
		$subtext .= $str;
		$str = '';
	    }
	}

	# compute entity reference lengths to determine "real" character
	# count and not raw character count.
	$er_len = 0;
	while ($subtext =~ /(\&[^;]+);/g) {
	    $er_len += length($1);
	}

	# done if we have enough
	$utf8_len  = utf8_length($subtext);
	$real_len += $utf8_len - $er_len;
	$text     .= $subtext;
	last       if ($real_len >= $len);
	$sub_len   = $len - $real_len;
    }
    $text;
}

sub to_utf8 {
    my $data    = shift;
    my $charset = lc shift;
    my $data_r  = ref($data) ? $data : \$data;

    return $$data_r  if ($charset eq 'us-ascii' ||
			 $charset eq 'utf-8' ||
			 $charset eq 'utf8');
    SO::Mail::Message::Char::map_conv($data_r, $charset, $char_maps);
}

sub str2sgml {
    my $data    = shift;
    my $charset = lc shift;
    my $data_r  = ref($data) ? $data : \$data;

    if ($charset eq 'us-ascii') {
	if ($$data_r =~ /[\x80-\xFF]/) {
	    $charset = 'iso-8859-1';
	} else {
	    $$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
	    return $$data_r;
	}
    }
    if ($charset eq 'utf-8' || $charset eq 'utf8') {
	$$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
	return $$data_r;
    }
    SO::Mail::Message::Char::map_conv($data_r, $charset, $char_maps);
    $$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
    $$data_r;
}

1;
__END__
