//
//  main.cpp
//  utf7imap
//
//  Created by ЕГОР Прохоренко on 18.06.12.
//  Copyright (c) 2012 ЯНДЕКС. All rights reserved.
//

#include "utf7imap.h"

#include <string>
#include <vector>
#include <sstream>
#include <stdexcept>
#include <boost/algorithm/string.hpp>

using namespace std;

namespace yimap {

/*

 * Parts of this file are copyright (C) 2002-2004 Weldon Whipple and
 * his employers. Significant portions of the code are based on
 * similar code from the GNU LIBICONV Library, which is
 * Copyright (C) 1999-2001 Free Software Foundation, Inc.
 *
 * Conditions for use and distribution are the same as for the GNU
 * LIBICONV Library.
 *
 * The GNU LIBICONV Library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * The GNU LIBICONV Library is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
 * If not, write to the Free Software Foundation, Inc., 59 Temple Place -
 * Suite 330, Boston, MA 02111-1307, USA.  */

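/*
 * UTF-7 IMAP variant ("modified UTF-7") described in RFC 2060 (section 5.1.3).
 * RFC 2060 has been obsoleted by RFC 3501, which keeps the definition in its
 * own section 5.1.3.
 */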

/* Specification: RFC 2152 (and old RFC 1641, RFC 1642) */
/* The original Base64 encoding is defined in RFC 2045. */

/* Set of direct characters (every printable US-ASCII character except '&'):
 *   A-Z a-z 0-9 ' ( ) , - . / : ? space
 *   ! " # $ % + * ; < = > @ [ ] ^ _ ` { | } \ ~
 *
 * Bitmap indexed by character code: bit (ch & 7) of byte (ch >> 3) is set
 * when ch may appear literally in a modified UTF-7 string.
 */
static const unsigned char directimap_tab[128 / 8] = {
    0x00, 0x00, 0x00, 0x00, 0xbf, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f,
};

/* Non-zero iff ch is a direct character.
 * ch is evaluated more than once, so it must have no side effects, and it
 * must not be negative (a negative value would index before the table).
 * Every use of the parameter is parenthesized so that expression arguments
 * such as (a | b) are evaluated as a whole. */
#define isdirectimap(ch) ((ch) < 128 && ((directimap_tab[(ch) >> 3] >> ((ch) & 7)) & 1))

/* Wide character as used throughout this file: a UCS-4 code point
 * (ISO-10646-1). */
using ucs4_t = unsigned int;

/* Shift state of one conversion direction; the value 0 is the initial
 * state. */
using state_t = unsigned int;

/* State carried between successive conversion calls. */
struct conv_t
{
    state_t istate; /* input side: read by utf7imap_mbtowc */
    state_t ostate; /* output side: read by utf7imap_wctomb and utf7imap_reset */
};

/*
 * Return codes of the xxx_mbtowc decoders. Every code is negative; the odd
 * values mean "invalid sequence", the even ones "input too short", and the
 * number n of shift-sequence bytes already read is folded into the value.
 */
/* Invalid input after a shift sequence of n bytes was read. */
#define RET_SHIFT_ILSEQ(n) (-1 - 2 * (n))
/* Invalid input (no shift sequence read). */
#define RET_ILSEQ RET_SHIFT_ILSEQ(0)
/* Only a shift sequence of n bytes was read; more input is needed. */
#define RET_TOOFEW(n) (-2 - 2 * (n))
/* Recover n from a RET_SHIFT_ILSEQ(n) / RET_TOOFEW(n) value. The difference
 * is converted to unsigned int first and then halved. */
#define DECODE_SHIFT_ILSEQ(r) (((unsigned int)(RET_SHIFT_ILSEQ(0) - (r))) / 2)
#define DECODE_TOOFEW(r) (((unsigned int)(RET_TOOFEW(0) - (r))) / 2)

/*
 * Return codes of the xxx_wctomb encoders and of xxx_reset. The replacement
 * lists are parenthesized so the negative literals stay a single operand in
 * whatever expression the macro is used.
 */
/* The character cannot be encoded. */
#define RET_ILUNI (-1)
/* The output buffer is too small. */
#define RET_TOOSMALL (-2)

/*
 * The decoder state (conv->istate) is structured as follows:
 * bit 1..0: shift
 * bit 7..2: data
 * Precise meaning:
 *   shift      data
 *     0         0           not inside base64 encoding
 *     1         0           inside base64, no pending bits
 *     2      XXXX00         inside base64, 4 bits remain from 2nd byte
 *     3      XX0000         inside base64, 2 bits remain from 3rd byte
 *
 * While decoding, the working copy (base64state) additionally uses shift 0
 * with the data field holding the 6 bits of the 1st base64 byte. That value
 * is never stored into conv->istate: the function aborts if the decoding
 * loop finishes with shift 0.
 */

/*
 * Decodes one character from a modified UTF-7 (IMAP mailbox name) byte
 * sequence.
 *
 *   conv  conversion state; only conv->istate is used.
 *   pwc   receives the decoded code point on success.
 *   s     input bytes.
 *   n     number of bytes available at s.
 *
 * Returns
 *   > 0            the number of input bytes consumed; *pwc has been written
 *                  and conv->istate updated.
 *   0              a base64 section was closed and no input remains; *pwc and
 *                  conv->istate are left untouched.
 *   RET_ILSEQ      the input is invalid; conv->istate is left untouched.
 *   RET_TOOFEW(c)  the input ended before a whole character was read; c is
 *                  the number of shift bytes ('&' or a closing '-') consumed
 *                  and conv->istate holds the state after those bytes.
 *
 * NOTE(review): the first byte of a call is read before n is examined when
 * the call starts inside base64, so the caller must pass n >= 1.
 */
static int utf7imap_mbtowc(conv_t* conv, ucs4_t* pwc, const unsigned char* s, int n)
{
    state_t state = conv->istate;
    int count = 0; /* number of input bytes already read */
    /* A non-zero shift means the previous call stopped inside base64. */
    if (state & 3) goto active;
    else
        goto inactive;

inactive:
{
    /* Here (state & 3) == 0 */
    if (n < count + 1) goto none;
    {
        unsigned char c = *s;
        /* Direct characters stand for themselves. */
        if (isdirectimap(c))
        {
            *pwc = (ucs4_t)c;
            conv->istate = state;
            return count + 1;
        }
        if (c == '&')
        {
            /* Need the byte after '&' to tell "&-" from a base64 section. */
            if (n < count + 2) goto none;
            if (s[1] == '-')
            {
                /* "&-" is the escape for a literal '&'. */
                *pwc = (ucs4_t)'&';
                conv->istate = state;
                return count + 2;
            }
            /* '&' opens a base64 section: consume it and switch state. */
            s++;
            count++;
            state = 1;
            goto active;
        }
        /* Neither a direct character nor '&'. */
        return RET_ILSEQ;
    }
}

active:
{
    /* base64 encoding active */
    unsigned int wc = 0; /* payload bytes collected so far, most significant first */
    state_t base64state = state;  /* working copy; see the state table above */
    unsigned int kmax = 2;        /* number of payload bytes to read */
    unsigned int k = 0;           /* number of payload bytes already read */
    unsigned int base64count = 0; /* number of base64 bytes already read */
    for (;;)
    {
        unsigned char c = *s;
        unsigned int i;
        /* Map c to its 6-bit value; the modified alphabet uses ',' for 63. */
        if (c >= 'A' && c <= 'Z') i = c - 'A';
        else if (c >= 'a' && c <= 'z')
            i = c - 'a' + 26;
        else if (c >= '0' && c <= '9')
            i = c - '0' + 52;
        else if (c == '+')
            i = 62;
        else if (c == ',')
            i = 63;
        else
        {
            /* c terminates base64 encoding */
            if (base64state & -4) return RET_ILSEQ; /* data must be 0, otherwise illegal */
            if (base64count) return RET_ILSEQ;      /* partial UTF-16 characters are invalid */
            /* A closing '-' is consumed; any other terminator is left for
             the inactive branch to handle. */
            if (c == '-')
            {
                s++;
                count++;
            }
            state = 0;
            if (n > count) goto inactive;
            else
                return 0; /* no more characters available */
        }
        s++;
        base64count++;
        /* read 6 bits: 0 <= i < 64 */
        switch (base64state & 3)
        {
        case 1: /* inside base64, no pending bits */
            base64state = (i << 2) | 0;
            break;
        case 0: /* inside base64, 6 bits remain from 1st byte */
            wc = (wc << 8) | (base64state & -4) | (i >> 4);
            k++;
            base64state = ((i & 15) << 4) | 2;
            break;
        case 2: /* inside base64, 4 bits remain from 2nd byte */
            wc = (wc << 8) | (base64state & -4) | (i >> 2);
            k++;
            base64state = ((i & 3) << 6) | 3;
            break;
        case 3: /* inside base64, 2 bits remain from 3rd byte */
            wc = (wc << 8) | (base64state & -4) | i;
            k++;
            base64state = 1;
            break;
        }
        if (k == kmax)
        {
            /* UTF-16: When we see a High Surrogate, we must also decode
             the following Low Surrogate. */
            if (kmax == 2 && (wc >= 0xd800 && wc < 0xdc00)) kmax = 4;
            else
                break;
        }
        /* Another base64 byte is required; stop if the input is exhausted. */
        if ((size_t)n < count + base64count + 1) goto none;
    }
    /* Here k = kmax > 0, hence base64count > 0. */
    if ((base64state & 3) == 0) abort();
    if (kmax == 4)
    {
        /* Combine the surrogate pair held in wc into one code point. */
        ucs4_t wc1 = wc >> 16;
        ucs4_t wc2 = wc & 0xffff;
        if (!(wc1 >= 0xd800 && wc1 < 0xdc00)) abort();
        if (!(wc2 >= 0xdc00 && wc2 < 0xe000)) return RET_ILSEQ;
        *pwc = 0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00);
    }
    else
    {
        /* NOTE(review): a 16-bit unit in 0xdc00..0xdfff that is not preceded
         by a high surrogate reaches this branch and is returned as is. */
        *pwc = wc;
    }
    conv->istate = base64state;
    return count + base64count;
}

none:
    /* Out of input: keep the state reached after the shift bytes and report
     how many of them were consumed. */
    conv->istate = state;
    return RET_TOOFEW(count);
}

/*
 * The encoder state (conv->ostate) is structured as follows:
 * bit 1..0: shift
 * bit 7..2: data
 * Precise meaning:
 *   shift      data
 *     0         0           not inside base64 encoding
 *     1         0           inside base64, no pending bits
 *     2       XX00          inside base64, 2 bits known for 2nd byte
 *     3       XXXX          inside base64, 4 bits known for 3rd byte
 *
 * Inside the encoding loop the local state also takes shift 0 with six
 * pending bits in the data field; the loop always emits those bits before it
 * ends, so that value is never stored into conv->ostate.
 */

/*
 * Encodes one code point as modified UTF-7 (IMAP mailbox name encoding).
 *
 *   conv  conversion state; only conv->ostate is used.
 *   r     output buffer.
 *   iwc   code point to encode.
 *   n     capacity of r in bytes.
 *
 * Returns the number of bytes written to r, RET_ILUNI when iwc is not a
 * direct character and is 0x110000 or larger, or RET_TOOSMALL when n is too
 * small for the bytes that have to be written.
 *
 * A base64 section that this call leaves open is closed either by a later
 * call with a direct character or by utf7imap_reset.
 *
 * NOTE(review): when the call starts outside base64 the first output byte is
 * stored before n is examined, so the caller must pass n >= 1.
 */
static int utf7imap_wctomb(conv_t* conv, unsigned char* r, ucs4_t iwc, int n)
{
    state_t state = conv->ostate;
    unsigned int wc = iwc;
    int count = 0; /* number of output bytes this call produces */
    if (state & 3) goto active;

    /*inactive:*/
    {
        if (isdirectimap(wc))
        {
            /* Direct characters are copied unchanged; the state stays 0. */
            r[0] = (unsigned char)wc;
            /*conv->ostate = state;*/
            return 1;
        }
        else
        {
            /* Everything else starts with '&'. */
            *r++ = '&';
            if (wc == '&')
            {
                /* A literal '&' is written as "&-"; the state stays 0. */
                if (n < 2) return RET_TOOSMALL;
                *r = '-';
                /*conv->ostate = state;*/
                return 2;
            }
            /* Open a base64 section; the '&' already written counts as 1. */
            count = 1;
            state = 1;
            goto active;
        }
    }

active:
{
    /* base64 encoding active */
    /* If the next character is one we can encode directly (and if
     the previous character--during the last call--was encoded in
     base64, then ... */
    if (isdirectimap(wc))
    {
        /* deactivate base64 encoding */
        /* Bytes needed: one for the pending bits (if any), one for '-',
         one for the character itself. */
        count += ((state & 3) >= 2 ? 1 : 0) + 1 + 1;
        if (n < count) return RET_TOOSMALL;
        if ((state & 3) >= 2)
        {
            /* Flush the pending bits; the missing low bits are zero. */
            unsigned int i = state & -4;
            unsigned char c;
            if (i < 26) c = i + 'A';
            else if (i < 52)
                c = i - 26 + 'a';
            else if (i < 62)
                c = i - 52 + '0';
            else if (i == 62)
                c = '+';
            else if (i == 63)
                c = ',';
            else
                abort();
            *r++ = c;
        }
        *r++ = '-';
        state = 0;
        *r++ = (unsigned char)wc;
        conv->ostate = state;
        return count;
    }
    else
    {
        unsigned int k; /* number of payload bytes to write */
        if (wc < 0x10000)
        {
            /* One UTF-16 unit: 2 payload bytes, which yield 3 base64 bytes
             when bits are already pending and 2 otherwise. */
            k = 2;
            count += ((state & 3) >= 2 ? 3 : 2);
        }
        else if (wc < 0x110000)
        {
            /* Surrogate pair: 4 payload bytes packed into wc, which yield 6
             base64 bytes when 4 bits are pending and 5 otherwise. */
            unsigned int wc1 = 0xd800 + ((wc - 0x10000) >> 10);
            unsigned int wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
            wc = (wc1 << 16) | wc2;
            k = 4;
            count += ((state & 3) >= 3 ? 6 : 5);
        }
        else
            return RET_ILUNI;
        if (n < count) return RET_TOOSMALL;
        for (;;)
        {
            unsigned int i; /* next payload byte */
            unsigned char c; /* 6-bit value to emit, then its base64 character */
            switch (state & 3)
            {
            case 0: /* inside base64, 6 bits known for 4th byte */
                c = (state & -4) >> 2;
                state = 1;
                break;
            case 1: /* inside base64, no pending bits */
                i = (wc >> (8 * --k)) & 0xff;
                c = i >> 2;
                state = ((i & 3) << 4) | 2;
                break;
            case 2: /* inside base64, 2 bits known for 2nd byte */
                i = (wc >> (8 * --k)) & 0xff;
                c = (state & -4) | (i >> 4);
                state = ((i & 15) << 2) | 3;
                break;
            case 3: /* inside base64, 4 bits known for 3rd byte */
                i = (wc >> (8 * --k)) & 0xff;
                c = (state & -4) | (i >> 6);
                state = ((i & 63) << 2) | 0;
                break;
            default:
                abort(); /* stupid gcc */
            }
            /* Translate the 6-bit value with the modified base64 alphabet
             (',' stands for 63). */
            if (c < 26) c = c + 'A';
            else if (c < 52)
                c = c - 26 + 'a';
            else if (c < 62)
                c = c - 52 + '0';
            else if (c == 62)
                c = '+';
            else if (c == 63)
                c = ',';
            else
                abort();
            *r++ = c;
            /* Stop once every payload byte is consumed, except in shift 0,
             where six pending bits still have to be emitted. */
            if ((state & 3) && (k == 0)) break;
        }
        conv->ostate = state;
        return count;
    }
}
}

/*
 * Writes the bytes that close a base64 section left open by
 * utf7imap_wctomb.
 *
 *   conv  conversion state; conv->ostate is read but not modified (the
 *         caller is expected to clear it).
 *   r     output buffer.
 *   n     capacity of r in bytes.
 *
 * Returns the number of bytes written: 0 when no base64 section is open,
 * 1 ('-') when no bits are pending, 2 (the pending bits as one base64
 * character, then '-') otherwise. Returns RET_TOOSMALL when n is too small.
 */
static int utf7imap_reset(conv_t* conv, unsigned char* r, int n)
{
    const state_t pending = conv->ostate;
    const unsigned int shift = pending & 3;

    /* Not inside base64: nothing to flush. */
    if (shift == 0) return 0;

    /* Shift 2 and 3 carry bits that still need a base64 character. */
    const bool hasBits = shift >= 2;
    const unsigned int needed = hasBits ? 2 : 1;
    if ((size_t)n < needed) return RET_TOOSMALL;

    unsigned char* out = r;
    if (hasBits)
    {
        /* Modified base64 alphabet: ',' takes the place of '/'. */
        static const char alphabet[] =
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
        const unsigned int index = pending & -4; /* high-order 6 bits, low bits zero */
        if (index >= 64) abort();
        *out++ = (unsigned char)alphabet[index];
    }
    *out = '-';
    /* conv->ostate = 0; will be done by the caller */
    return needed;
}

//------------------------------------------------------------------------------

/*
 * Copyright (C) 1999-2001, 2004 Free Software Foundation, Inc.
 * This file is part of the GNU LIBICONV Library.
 *
 * The GNU LIBICONV Library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * The GNU LIBICONV Library is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
 * Fifth Floor, Boston, MA 02110-1301, USA.
 */

/*
 * UTF-8
 */

/* Specification: RFC 3629 */

/*
 * Decodes one character from a UTF-8 byte sequence.
 *
 *   pwc  receives the decoded value on success.
 *   s    input bytes; s[0] is always read.
 *   n    number of bytes available at s.
 *
 * Returns the length of the sequence (1..6), RET_TOOFEW(0) when the lead
 * byte announces more bytes than n provides, or RET_ILSEQ for an invalid
 * lead byte, a bad continuation byte or an overlong form.
 *
 * Sequences of up to 6 bytes (values up to 0x7fffffff) are accepted; forms
 * longer than 3 bytes require ucs4_t to have at least 32 bits.
 */
static int utf8_mbtowc(ucs4_t* pwc, const unsigned char* s, int n)
{
    const unsigned char lead = s[0];

    /* One-byte form. */
    if (lead < 0x80)
    {
        *pwc = lead;
        return 1;
    }

    /* 0x80..0xbf are continuation bytes, 0xc0/0xc1 only start overlong forms. */
    if (lead < 0xc2) return RET_ILSEQ;

    const bool wide = sizeof(ucs4_t) * 8 >= 32;

    int length;                /* total number of bytes in the sequence */
    unsigned int payloadMask;  /* value bits carried by the lead byte */
    unsigned char plainLead;   /* leads >= this need no overlong check */
    unsigned char minSecond;   /* otherwise the 2nd byte must be >= this */
    if (lead < 0xe0)
    {
        length = 2;
        payloadMask = 0x1f;
        plainLead = 0xc2;
        minSecond = 0x80;
    }
    else if (lead < 0xf0)
    {
        length = 3;
        payloadMask = 0x0f;
        plainLead = 0xe1;
        minSecond = 0xa0;
    }
    else if (!wide || lead >= 0xfe)
    {
        return RET_ILSEQ;
    }
    else if (lead < 0xf8)
    {
        length = 4;
        payloadMask = 0x07;
        plainLead = 0xf1;
        minSecond = 0x90;
    }
    else if (lead < 0xfc)
    {
        length = 5;
        payloadMask = 0x03;
        plainLead = 0xf9;
        minSecond = 0x88;
    }
    else
    {
        length = 6;
        payloadMask = 0x01;
        plainLead = 0xfd;
        minSecond = 0x84;
    }

    if (n < length) return RET_TOOFEW(0);

    /* Fold in the continuation bytes, each of which must be 0x80..0xbf. */
    ucs4_t value = lead & payloadMask;
    for (int i = 1; i < length; ++i)
    {
        const unsigned int bits = s[i] ^ 0x80;
        if (bits >= 0x40) return RET_ILSEQ;
        value = (value << 6) | bits;
    }

    /* Reject overlong encodings of the smallest lead byte of each length. */
    if (lead < plainLead && s[1] < minSecond) return RET_ILSEQ;

    *pwc = value;
    return length;
}

static int utf8_wctomb(unsigned char* r, ucs4_t wc, int n) /* n == 0 is acceptable */
{
    int count;
    if (wc < 0x80) count = 1;
    else if (wc < 0x800)
        count = 2;
    else if (wc < 0x10000)
        count = 3;
    else if (wc < 0x200000)
        count = 4;
    else if (wc < 0x4000000)
        count = 5;
    else if (wc <= 0x7fffffff)
        count = 6;
    else
        return RET_ILUNI;
    if (n < count) return RET_TOOSMALL;
    switch (count)
    { /* note: code falls through cases! */
    case 6:
        r[5] = 0x80 | (wc & 0x3f);
        wc = wc >> 6;
        wc |= 0x4000000;
    case 5:
        r[4] = 0x80 | (wc & 0x3f);
        wc = wc >> 6;
        wc |= 0x200000;
    case 4:
        r[3] = 0x80 | (wc & 0x3f);
        wc = wc >> 6;
        wc |= 0x10000;
    case 3:
        r[2] = 0x80 | (wc & 0x3f);
        wc = wc >> 6;
        wc |= 0x800;
    case 2:
        r[1] = 0x80 | (wc & 0x3f);
        wc = wc >> 6;
        wc |= 0xc0;
    case 1:
        r[0] = wc;
    }
    return count;
}

//------------------------------------------------------------------------------

// Converts a modified UTF-7 (IMAP) string to UTF-8.
//
// The input is processed one character at a time until its first NUL byte,
// or until the decoder reports that nothing is left after a closed base64
// section. Throws Utf7EncodingError when either conversion step reports an
// error.
string utf7imap_to_utf8(const string& utf7imap_str)
{
    conv_t state = { 0, 0 };
    string decoded;

    const unsigned char* cursor = reinterpret_cast<const unsigned char*>(utf7imap_str.c_str());
    size_t remaining = utf7imap_str.length();
    while (*cursor != 0)
    {
        ucs4_t codePoint = 0;
        const int consumed = utf7imap_mbtowc(&state, &codePoint, cursor, remaining);
        if (consumed < 0)
            throw Utf7EncodingError("iconv: can not convert from utf-7-imap to utf-8");
        if (consumed == 0) break;
        cursor += consumed;
        remaining -= consumed;

        // Re-encode the decoded code point as UTF-8 and append it.
        unsigned char encoded[8];
        const int produced = utf8_wctomb(encoded, codePoint, 8);
        if (produced < 0)
            throw Utf7EncodingError("iconv: can not convert from utf-7-imap to utf-8");
        decoded.append(reinterpret_cast<const char*>(encoded), produced);
    }
    return decoded;
}

// Converts a UTF-8 string to modified UTF-7 (IMAP).
//
// The input is processed one character at a time until its first NUL byte;
// afterwards any base64 section that is still open is closed. Throws
// Utf7EncodingError when a conversion step reports an error.
string utf8_to_utf7imap(const string& utf8str)
{
    conv_t state = { 0, 0 };
    unsigned char chunk[8];
    string encoded;

    const unsigned char* cursor = reinterpret_cast<const unsigned char*>(utf8str.c_str());
    size_t remaining = utf8str.length();
    while (*cursor != 0)
    {
        ucs4_t codePoint;
        const int consumed = utf8_mbtowc(&codePoint, cursor, remaining);
        if (consumed < 0)
            throw Utf7EncodingError("iconv: can not convert from utf-8 to utf-7-imap");
        cursor += consumed;
        remaining -= consumed;

        const int produced = utf7imap_wctomb(&state, chunk, codePoint, 8);
        if (produced < 0)
            throw Utf7EncodingError("iconv: can not convert from utf-8 to utf-7-imap");
        encoded.append(reinterpret_cast<const char*>(chunk), produced);
    }

    // Flush pending bits and the closing '-' of an open base64 section.
    const int tail = utf7imap_reset(&state, chunk, 8);
    if (tail < 0) throw Utf7EncodingError("iconv: can not convert from utf-8 to utf-7-imap");
    encoded.append(reinterpret_cast<const char*>(chunk), tail);

    return encoded;
}

// Convert folder name, part by part, to utf-7-imap from utf-8.
// The name is split at every occurrence of delim, each part is encoded on its
// own with utf8_to_utf7imap, and the parts are joined again with delim.
string folderNameToUtf7Imap(const string& folderName, char delim)
{
    vector<string> pieces;
    boost::split(pieces, folderName, boost::is_any_of(string(1, delim)));

    string joined;
    for (vector<string>::const_iterator piece = pieces.begin(); piece != pieces.end(); ++piece)
    {
        if (piece != pieces.begin()) joined += delim;
        joined += utf8_to_utf7imap(*piece);
    }
    return joined;
}

// Convert folder name, part by part, from utf-7-imap to utf-8.
// The name is split at every occurrence of delim, each part is decoded on its
// own with utf7imap_to_utf8, and the parts are joined again with delim.
string folderNameFromUtf7Imap(const string& folderName, char delim)
{
    vector<string> pieces;
    boost::split(pieces, folderName, boost::is_any_of(string(1, delim)));

    string joined;
    for (vector<string>::const_iterator piece = pieces.begin(); piece != pieces.end(); ++piece)
    {
        if (piece != pieces.begin()) joined += delim;
        joined += utf7imap_to_utf8(*piece);
    }
    return joined;
}

} // namespace yimap
