#include "text_utils.h"
#include <util/charset/utf8.h>
#include <library/cpp/charset/wide.h>
#include <util/charset/utf8.h>
#include <util/generic/vector.h>
#include <util/string/cast.h>

namespace {
    // Two-byte byte-order marks that may prefix UTF-16 text (bytes FE FF and FF FE).
    const char UTF16_BOM_BIG_ENDIAN[] = {static_cast<char>(0xFE), static_cast<char>(0xFF)};
    const char UTF16_BOM_LITTLE_ENDIAN[] = {static_cast<char>(0xFF), static_cast<char>(0xFE)};

    // Converted texts whose length() exceeds this limit are stored as separate
    // request parts when the JSON_REF format is used (see InsertText).
    const size_t MAX_TEXT_LENGTH_NO_REF = 512;

    // Tells whether body begins with the two bytes pointed to by bom.
    // Only bom[0] and bom[1] are read; a body shorter than two bytes never matches.
    bool CheckBom(TStringBuf body, const char bom[]) {
        const size_t bomLength = 2;
        if (body.size() < bomLength) {
            return false;
        }
        return body[0] == bom[0] && body[1] == bom[1];
    }

    // Decodes a UTF-16 little-endian byte sequence into UTF-8.
    // The first two bytes of body are skipped (the caller passes the text together
    // with its byte-order mark); every following pair of bytes forms one UTF-16
    // code unit with the low byte first. A trailing unpaired byte is ignored.
    TString ConvertLittleEndian(TStringBuf body) {
        const size_t bomSize = 2;
        TVector<wchar16> units;
        if (body.size() > bomSize) {
            units.reserve((body.size() - bomSize) / 2);
        }
        // "i + 1 < size" keeps both bytes of the pair inside the buffer.
        for (size_t i = bomSize; i + 1 < body.size(); i += 2) {
            // Go through unsigned char so bytes >= 0x80 are not sign-extended, and
            // combine with an explicitly parenthesized shift: '+' binds tighter
            // than '<<', so "hi << 8 + lo" would shift by (8 + lo).
            const unsigned low = static_cast<unsigned char>(body[i]);
            const unsigned high = static_cast<unsigned char>(body[i + 1]);
            units.push_back(static_cast<wchar16>((high << 8) | low));
        }
        return WideToUTF8(units.data(), units.size());
    }

    // Returns a copy of text that keeps only the byte runs SafeReadUTF8Char accepts
    // (RECODE_OK). Whenever reading at the current position fails, exactly one byte
    // is dropped and scanning resumes at the next byte.
    TString CutNotUtf8Chars(TStringBuf text) {
        TString result;
        result.reserve(text.size()); // the output is never longer than the input

        const char* const textEnd = text.end();
        const unsigned char* const readEnd = reinterpret_cast<const unsigned char*>(textEnd);

        size_t runeLen = 0;
        for (const char* b = text.begin(); b != textEnd; b += runeLen) {
            wchar32 rune;
            const RECODE_RESULT rr = SafeReadUTF8Char(rune, runeLen, reinterpret_cast<const unsigned char*>(b), readEnd);
            if (rr == RECODE_OK) {
                result.append(b, runeLen);
            } else {
                // Skip a single offending byte and retry from the following one.
                runeLen = 1;
            }
        }
        return result;
    }
}

namespace NSaas {
    // Converts text to UTF-8 and strips any bytes that are still not valid UTF-8.
    //
    // The source encoding is chosen in this order:
    //   1. text starts with the UTF-16 big-endian BOM    -> decoded as UTF-16BE;
    //   2. text starts with the UTF-16 little-endian BOM -> decoded as UTF-16LE;
    //   3. encoding is non-negative and not CODES_UTF8   -> recoded from that charset;
    //   4. otherwise the bytes are taken as they are.
    // The result of any branch is passed through CutNotUtf8Chars.
    TString ConvertTextToUTF8(TStringBuf text, ECharset encoding) {
        TString textUtf8;

        if (CheckBom(text, UTF16_BOM_BIG_ENDIAN)) {
            // Assemble code units from byte pairs explicitly. Reinterpreting
            // text.data() as wchar16* would read the units in host byte order
            // (byte-swapped on a little-endian host) and may be a misaligned access.
            // CheckBom succeeded, so text.size() >= 2 here.
            TVector<wchar16> units;
            units.reserve((text.size() - 2) / 2);
            for (size_t i = 2; i + 1 < text.size(); i += 2) {
                const unsigned high = static_cast<unsigned char>(text[i]);
                const unsigned low = static_cast<unsigned char>(text[i + 1]);
                units.push_back(static_cast<wchar16>((high << 8) | low));
            }
            textUtf8 = WideToUTF8(units.data(), units.size());
        } else if (CheckBom(text, UTF16_BOM_LITTLE_ENDIAN)) {
            textUtf8 = ConvertLittleEndian(text);
        } else if (encoding >= 0 && encoding != CODES_UTF8) {
            textUtf8 = WideToUTF8(CharToWide(text, encoding));
        } else {
            textUtf8 = text;
        }

        return CutNotUtf8Chars(textUtf8);
    }

    // Appends text to context.Parts under a generated name "$attr_<N>", where N is
    // the number of parts present before the call, and returns that name.
    static TString InsertAsRequestPart(TStringBuf text, TToJsonContext& context) {
        const auto partIndex = context.Parts.size();
        TString partName = "$attr_" + ToString(partIndex);
        context.Parts.push_back(std::make_pair(partName, TString(text)));
        return partName;
    }

    // Converts text to UTF-8 (using context.Encoding) and produces the JSON value for it.
    // In JSON_REF format a converted text longer than MAX_TEXT_LENGTH_NO_REF is stored
    // as a request part and the part name is returned; in every other case the
    // converted text itself is returned.
    NJson::TJsonValue InsertText(TStringBuf text, TToJsonContext& context) {
        TString utf8Text = ConvertTextToUTF8(text, context.Encoding);

        const bool storeAsPart = context.Format == TToJsonContext::JSON_REF
            && utf8Text.length() > MAX_TEXT_LENGTH_NO_REF;
        if (!storeAsPart) {
            return utf8Text;
        }

        return InsertAsRequestPart(utf8Text, context);
    }

    // Stores text as a request part and returns the generated part name as a JSON value.
    // The bytes are passed on unchanged: unlike InsertText, no UTF-8 conversion is applied.
    // The call is only accepted when context.Format is TToJsonContext::JSON_REF;
    // this precondition is enforced with Y_ENSURE.
    NJson::TJsonValue InsertBinaryData(TStringBuf text, TToJsonContext& context) {
        Y_ENSURE(context.Format == TToJsonContext::JSON_REF, "binary data is supported for json_ref format only");
        return InsertAsRequestPart(text, context);
    }
}
