#include "ampersand_parser.h"
#include "common.h"
#include "utf8_parser.h"
#include "xml_parser.h"

#include <library/cpp/charset/codepage.h>
#include <library/cpp/iterator/functools.h>

#include <util/charset/utf8.h>
#include <util/generic/maybe.h>
#include <util/string/join.h>

#include <cstring>
#include <iomanip>
#include <sstream>
#include <stdexcept>
#include <string>

// Closing this tag triggers the category cycle check (see ProcessCloseTagEvent).
const std::string CATEGORIES_TAG = "categories";
// Duplicate-tag detection is active between the opening and the closing of this tag.
const std::string OFFER_TAG = "offer";

// These tags are allowed to be repeated inside an offer; they are excluded from duplicate detection.
const std::string PARAM_TAG = "param"; 
const std::string IMAGE_TAG = "image"; 
const std::string PICTURE_TAG = "picture";
const std::string BARCODE_TAG = "barcode";

// Feed type for which the offer-specific checks (duplicates, path to `offer`) are enabled.
const std::string YML_FEED = "YandexMarket";
// The exact tag stack an `offer` tag must have in a YML feed.
const std::vector<std::string> YML_TAGS_ORDER_TO_OFFER = {"yml_catalog", "shop", "offers", "offer"};

// Creates an error (not a warning) whose English and Russian texts wrap the given
// messages into an "internal error, please contact support" sentence.
TErrorOrWarning TErrorOrWarning::InternalError(const TStringBuf message_en, const TStringBuf message_ru) {
    TErrorMessage wrapped {
        .en = TStringBuilder{} << "Internal error: `" << message_en << "`. Please contact support.",
        .ru = TStringBuilder{} << "Внутренняя ошибка парсера: `" << message_ru << "`. Пожалуйста свяжитесь с поддержкой."
    };

    return TErrorOrWarning {
        .is_warning = false,
        .message = std::move(wrapped)
    };
}

// Creates an error (is_warning == false) carrying copies of the two message texts.
TErrorOrWarning TErrorOrWarning::Error(const TStringBuf message_en, const TStringBuf message_ru) {
    TErrorMessage text {
        .en = std::string(message_en),
        .ru = std::string(message_ru)
    };

    return TErrorOrWarning {
        .is_warning = false,
        .message = std::move(text)
    };
}

// Creates a warning (is_warning == true) carrying copies of the two message texts.
TErrorOrWarning TErrorOrWarning::Warning(const TStringBuf message_en, const TStringBuf message_ru) {
    TErrorMessage text {
        .en = std::string(message_en),
        .ru = std::string(message_ru)
    };

    return TErrorOrWarning {
        .is_warning = true,
        .message = std::move(text)
    };
}

// Sets up a parser reading from `source_stream` and writing extracted data to `target_stream`.
//
// is_strong_mode - when true, problems reported through ThrowOrMemoizeNewError are thrown;
//                  otherwise the first one is stored as a warning (see GetWarning).
// feed_type      - compared against YML_FEED to enable the offer-specific checks.
XMLParser::XMLParser(std::istream* source_stream, std::ostream* target_stream, bool is_strong_mode, const char* feed_type):
        is_EOF(false),
        is_strong_mode(is_strong_mode),
        // Cleared by ThrowOrMemoizeNewError once a warning has been memoized.
        is_search_for_warnings(true),
        buffer_pos(0),
        // Parsing starts outside of any CDATA/comment block.
        buf_state(BufferState::in_xml_code),
        source_stream_(source_stream),
        target_stream_(target_stream),
        // Position used in error messages: line counter starts at 1, column counter at 0.
        line_index_(1),
        column_index_(0),
        search_for_duplicates_(false),
        feed_type_(feed_type) {
}

// Creates an error from the given texts; when `add_line_column_number` is set,
// the current line and column counters are appended to both language variants.
TErrorOrWarning XMLParser::ParserError(
    const TStringBuf message_en,
    const TStringBuf message_ru,
    bool add_line_column_number
) {
    TStringBuilder text_en;
    text_en << message_en;

    TStringBuilder text_ru;
    text_ru << message_ru;

    if (!add_line_column_number) {
        return TErrorOrWarning::Error(text_en, text_ru);
    }

    text_en << " Line " << line_index_ << ", column " << column_index_ << ".";
    text_ru << " Строка " << line_index_ << ", столбец " << column_index_ << ".";
    return TErrorOrWarning::Error(text_en, text_ru);
}

// Wraps a UTF-8 violation description into a positioned parser error.
TErrorOrWarning XMLParser::Utf8Error(const TErrorMessage& message) {
    TStringBuilder text_en;
    text_en << "Feed's file must be UTF-8 encoded, but violation of that encoding found: " << message.en << ".";

    TStringBuilder text_ru;
    text_ru << "Файл фида должен быть закодирован в UTF-8, но обнаружено нарушение этой кодировки: " << message.ru << ".";

    constexpr bool with_position = true;
    return ParserError(text_en, text_ru, with_position);
}

// Creates a positioned "expected char sequence ..." error; the caller supplies the
// already formatted description of the sequence for each language.
TErrorOrWarning XMLParser::ExpectedSymbolsError(const TStringBuf symbols_en, const TStringBuf symbols_ru) {
    TStringBuilder text_en;
    text_en << "Expected char sequence " << symbols_en << ".";

    TStringBuilder text_ru;
    text_ru << "Ожидалась последовательность символов " << symbols_ru << ".";

    constexpr bool with_position = true;
    return ParserError(text_en, text_ru, with_position);
}

// Creates a positioned "expected char `X`" error for a single character.
TErrorOrWarning XMLParser::ExpectedSymbolError(const char symbol) {
    TStringBuilder text_en;
    text_en << "Expected char `" << symbol << "`.";

    TStringBuilder text_ru;
    text_ru << "Ожидался символ `" << symbol << "`.";

    constexpr bool with_position = true;
    return ParserError(text_en, text_ru, with_position);
}

// Turns a message produced by the ampersand parser into a positioned parser error.
TErrorOrWarning XMLParser::AmpersandError(const TErrorMessage& message) {
    constexpr bool with_position = true;
    return ParserError(message.en, message.ru, with_position);
}

// Builds the error reported when the input ends while more characters are expected.
//
// If some tags are still open, the message lists their closing tags (innermost first)
// inside single quotes and names three possible causes. If no tag is open, only the
// connection problem is named.
//
// The line/column position is intentionally not appended.
TErrorOrWarning XMLParser::EofError() {
    TStringBuilder error_en;
    TStringBuilder error_ru;

    error_en << "Reached end of file, but expected characters.";
    error_ru << "Достигнут неожиданный конец файла, ожидались символы.";

    constexpr TStringBuf problemConnectionEn = "unstable internet connection between Yandex server and server from where attempt was made to download feed";
    constexpr TStringBuf problemCloseTagsEn = "you have forgotten to close those or they closed incorrectly";
    constexpr TStringBuf problemCdataEn = "you forgotten to close CDATA or it closed incorrectly";
    constexpr TStringBuf problemConnectionRu = "нестабильное интернет-соединение между сервером Яндекса и сервером где находится фид";
    constexpr TStringBuf problemCloseTagsRu = "вы забыли закрыть эти теги или они закрыты некорректно";
    constexpr TStringBuf problemCdataRu = "вы забыли закрыть CDATA, или оно закрыто некорректно";

    if (!tags_stack_.empty()) {
        error_en << " Like closed tags at least: '";
        error_ru << " Например, хотя бы закрывающие теги: '";
        for (const auto& tag_to_close : NFuncTools::Reversed(tags_stack_)) {
            error_en << "</" << tag_to_close << ">";
            error_ru << "</" << tag_to_close << ">";
        }
        // The leading `'.` closes the quoted list of tags opened above.
        error_en << "'. Possible problems: " << problemConnectionEn << "; " << problemCloseTagsEn << "; " << problemCdataEn << ".";
        error_ru << "'. Возможные проблемы: " << problemConnectionRu << "; " << problemCloseTagsRu << "; " << problemCdataRu << ".";
    } else {
        // No quoted list was opened in this branch, so the sentence must not start
        // with a closing quote and a period.
        error_en << " Possible problems: " << problemConnectionEn << ".";
        error_ru << " Возможные проблемы: " << problemConnectionRu << ".";
    }

    return ParserError(
        error_en, 
        error_ru, 
        false /* add_line_column_number */
    );
}

// Creates a positioned error showing the current tag stack next to the path
// that an `offer` tag is required to have (both joined with `/`).
TErrorOrWarning XMLParser::WrongPathError() {
    auto actual = MakeRangeJoiner("/", tags_stack_);
    auto expected = MakeRangeJoiner("/", YML_TAGS_ORDER_TO_OFFER);

    TStringBuilder text_en;
    text_en << "Wrong path to tag `offer`: `" << actual << "`, it must be: `" << expected << "`.";

    TStringBuilder text_ru;
    text_ru << "Неверный путь к тегу `offer`: `" << actual << "`, должен быть: `" << expected << "`.";

    constexpr bool with_position = true;
    return ParserError(text_en, text_ru, with_position);
}

// In strong mode the error is thrown. Otherwise it is downgraded to a warning,
// stored in `warning`, and further warning search is switched off.
void XMLParser::ThrowOrMemoizeNewError(TErrorOrWarning error) {
    if (is_strong_mode) {
        throw error;
    }

    is_search_for_warnings = false;
    error.is_warning = true;
    warning = error;
}

// Maps every character for which IsSpace() holds to a plain space; other characters pass through.
char XMLParser::Normalize(char character) {
    if (IsSpace(character)) {
        return ' ';
    }
    return character;
}

// Reads the next portion of the input stream into `buffer`, removing comment blocks
// and unwrapping CDATA sections (CDATA content is kept, with `&`, `<` and `>`
// replaced by the corresponding XML entities).
//
// The last 16 bytes of every chunk are held back in `end_buffer` and prepended to the
// next chunk, so a `<![CDATA[`, `<!--`, `]]>` or `-->` marker that was cut by a read
// boundary is scanned again together with the following data. `buf_state` carries the
// kind of block we are inside across calls.
//
// Once the stream reports EOF: the held-back tail (if any) is handed out as-is; the
// next call sets `is_EOF`; any call after that throws EofError().
void XMLParser::ReadBuffer() {
    if (source_stream_->eof()) {
        if (end_buffer.length() == 0) {
            if (!is_EOF) {
                is_EOF = true;
                return;
            }
            throw EofError();
        }
        buffer = end_buffer;
        end_buffer.clear();
        buffer_pos = 0;
        return;
    }


    auto end_buffer_size = end_buffer.length();

    // Chunk layout: [held-back tail of the previous chunk][BUF_SIZE freshly read bytes].
    std::string tmp_buffer;
    tmp_buffer = std::string(BUF_SIZE, ' ');
    tmp_buffer.replace(std::size_t(0), std::size_t(0), end_buffer);
    // NOTE(review): the number of bytes actually read is not checked; after a short
    // read the remainder of the chunk keeps its space padding - confirm this is intended.
    source_stream_->read(&tmp_buffer[end_buffer_size], BUF_SIZE);

    // Positions are std::size_t so that comparisons with std::string::npos are exact
    // regardless of the width of `unsigned long` on the target platform.
    std::size_t cm_end_pos = 0;
    std::size_t cd_end_pos = 0;
    std::size_t start_pos = 0;
    const std::string cdata_start  ("<![CDATA[");
    const std::string comment_start("<!--");
    const std::string cdata_end    ("]]>");
    const std::string comment_end  ("-->");
    std::string good_buffer;
    auto buff_size = tmp_buffer.size();

    // Appends tmp_buffer[from, to) to the result according to the block we are in:
    // XML code is copied verbatim, CDATA content is entity-escaped, comments are dropped.
    auto move_to_good_buffer = [&] (std::size_t from, std::size_t to)
    {
        if (buf_state==BufferState::in_xml_code) {
            good_buffer += tmp_buffer.substr(from, to-from);
        } else if (buf_state==BufferState::in_cdata) {
            const auto chunk = tmp_buffer.substr(from, to-from);

            // Single linear pass: every source character is inspected exactly once,
            // so the `&` of an inserted entity is never escaped again.
            for (const char symbol : chunk) {
                switch (symbol) {
                    case '&':
                        good_buffer += "&amp;";
                        break;
                    case '<':
                        good_buffer += "&lt;";
                        break;
                    case '>':
                        good_buffer += "&gt;";
                        break;
                    default:
                        good_buffer.push_back(symbol);
                        break;
                }
            }
        }
    };

    while (true)
    {
        if (buf_state==BufferState::in_xml_code) {
            cd_end_pos = tmp_buffer.find(cdata_start, start_pos);
            cm_end_pos = tmp_buffer.find(comment_start, start_pos);

            if (cd_end_pos==cm_end_pos) { // both are npos: no more block starts in this chunk
                if (start_pos < buff_size-16) move_to_good_buffer(start_pos, buff_size-16);
                break;
            } else if (cd_end_pos!=std::string::npos &&
                       (cm_end_pos > cd_end_pos || cm_end_pos==std::string::npos)) { // a CDATA start comes first
                move_to_good_buffer(start_pos, cd_end_pos);

                start_pos = cd_end_pos + cdata_start.length();
                buf_state = BufferState::in_cdata;
            } else { // a comment start comes first
                move_to_good_buffer(start_pos, cm_end_pos);

                start_pos = cm_end_pos + comment_start.length();
                buf_state = BufferState::in_comment;
            }

        } else if (buf_state==BufferState::in_cdata) {
            cd_end_pos = tmp_buffer.find(cdata_end, start_pos);
            if (cd_end_pos==std::string::npos) { // CDATA continues past the end of this chunk
                if (start_pos < buff_size-16) move_to_good_buffer(start_pos, buff_size-16);
                break;
            }
            move_to_good_buffer(start_pos, cd_end_pos);
            start_pos = cd_end_pos + cdata_end.length();

            buf_state = BufferState::in_xml_code;

        } else if (buf_state==BufferState::in_comment) {
            cm_end_pos = tmp_buffer.find(comment_end, start_pos);
            if (cm_end_pos==std::string::npos) { // comment continues past the end of this chunk
                break;
            }
            start_pos = cm_end_pos + comment_end.length();

            buf_state = BufferState::in_xml_code;
        }
    }

    // Hold back everything that has not been emitted yet: at most the last 16 bytes,
    // fewer if a block boundary was consumed inside that tail.
    auto last_index_elem = std::max(start_pos, buff_size-16);
    auto needed_size = buff_size - last_index_elem;

    end_buffer = tmp_buffer.substr(last_index_elem, needed_size);
    buffer = good_buffer;
    buffer_pos = 0;
}

// Advances to the next input character and stores it in `current_char_`
// (EOF once the input is exhausted), updating line/column counters and feeding
// the character to the UTF-8 and ampersand validators.
//
// Problems found by the validators go through ThrowOrMemoizeNewError, i.e. they are
// thrown in strong mode and memoized as a warning otherwise.
void XMLParser::GetChar() {
    // Refill the buffer when it is exhausted; ReadBuffer may legitimately produce
    // an empty buffer (e.g. a chunk that is entirely inside a comment), so retry.
    if (buffer_pos == buffer.length()) {
        do {
            ReadBuffer();
        } while (buffer.empty() && !is_EOF);
    }

    if (!is_EOF) {
        current_char_ = buffer[buffer_pos++];

        // The UTF-8 parser returns either a column offset for this byte or an error message.
        auto result = utf8_parser.ProcessChar(static_cast<unsigned char>(current_char_));
        if (auto charOffset = result.GetResult(); charOffset.Defined()) {
            column_index_ += *charOffset;
        } else if (auto errorMessage = result.GetError(); errorMessage.Defined()) {
            if (is_search_for_warnings) {
                ThrowOrMemoizeNewError(Utf8Error(*errorMessage));
            }
        } else {
            // Neither a result nor an error: reported regardless of is_search_for_warnings.
            ThrowOrMemoizeNewError(ParserError(
                "Internal parser error while processing UTF-8.", 
                "Внутренняя ошибка парсера при обработке UTF-8.", 
                true
            ));
        }

        if (current_char_ == '\n') {
            ++line_index_;
            column_index_ = 0;
        }
    } else {
        current_char_ = EOF;
        // End of input: let both validators report anything left unfinished.
        if (is_search_for_warnings) {
            auto result = ampersand_parser.Finish();
            if (result.Defined()) {
                ThrowOrMemoizeNewError(AmpersandError(*result));
            }
        }

        auto result = utf8_parser.Finish();
        if (auto errorMessage = result.GetError(); errorMessage.Defined() && is_search_for_warnings) {
            ThrowOrMemoizeNewError(Utf8Error(*errorMessage));
        }
    }
    // The ampersand validator sees every character, including the EOF marker,
    // unless a memoized warning has already switched the search off.
    if (is_search_for_warnings) {
        auto result = ampersand_parser.ProcessChar(current_char_);
        if (result.Defined()) {
            ThrowOrMemoizeNewError(AmpersandError(*result));
        }
    }
}

// Skips whitespace: advances until `current_char_` is not a space character.
// Does nothing if the current character is already a non-space.
void XMLParser::GetNextChar() {
    for (; IsSpace(current_char_); GetChar()) {
    }
}

void XMLParser::GetSyntacticLexeme() {
    current_lexeme_.clear();
    enum class State  {kS, kA, kB};
    State current_state = State::kS;
    while (true) {
        switch (current_state) {
            case State::kS:
                if (IsSpace(current_char_)) {
                    current_state = State::kS;
                    GetChar();
                } else if (current_char_ == '<') {
                    current_state = State::kA;
                    current_lexeme_.push_back(current_char_);
                    GetChar();
                } else if (current_char_ == '>' || current_char_ == '=' ||
                           current_char_ == '\"' || current_char_ == '\'') {
                    current_lexeme_.push_back(current_char_);
                    GetChar();
                    return;
                } else if (current_char_ == '/') {
                    current_state = State::kB;
                    current_lexeme_.push_back(current_char_);
                    GetChar();
                } else if (source_stream_->eof()) {
                    return;
                } else {
                    throw ParserError(
                        "Expected one of these XML tags symbols: `<>=\"'/`.", 
                        "Ожидался один из следующих символов XML-тегов: `<>=\"'/`.",
                        true
                    );
                }
                break;
            case State::kA:
                if (current_char_ == '/' || current_char_ == '?') {
                    current_lexeme_.push_back(current_char_);
                    GetChar();
                    return;
                } else if (current_char_ == '!') {
                    for (size_t char_index = 0; char_index < 3; ++char_index) {
                        current_lexeme_.push_back(current_char_);
                        GetChar();
                    }
                    for (size_t char_index = 3; char_index < 8; ++char_index) {
                        current_lexeme_.push_back(current_char_);
                        GetChar();
                    }
                    if (current_lexeme_ != "<!DOCTYPE") {
                        throw ParserError(
                            TStringBuilder{} << "Expected `<!DOCTYPE`, but found `" << current_lexeme_ << "`.",
                            TStringBuilder{} << "Ожидалось `<!DOCTYPE`, но обнаружено `" << current_lexeme_ << "`.",
                            true
                        );
                    }
                    return;
                } else {
                    return;
                }
                break;
            case State::kB:
                if (current_char_ == '>') {
                    current_lexeme_.push_back(current_char_);
                    GetChar();
                    return;
                } else {
                    return;
                }
                break;
        }
    }
}

// Reads a tag name into `current_lexeme_`, skipping leading whitespace.
// A name is a non-empty run of ASCII letters, digits and the characters `_`, `-`, `:`.
// Throws a parser error if the first non-space character cannot start a name.
void XMLParser::GetTagLexeme() {
    // <cctype> classifiers have undefined behaviour for negative `char` values
    // (any non-ASCII UTF-8 byte), hence the cast to unsigned char. The punctuation
    // is compared explicitly because strchr() would also match the terminating '\0'
    // and thus accept a NUL byte as part of a name.
    const auto is_name_char = [](char character) {
        const auto byte = static_cast<unsigned char>(character);
        return isalpha(byte) || isdigit(byte) ||
               character == '_' || character == '-' || character == ':';
    };

    current_lexeme_.clear();
    GetNextChar();
    if (!is_name_char(current_char_)) {
        throw ParserError(
            "An incorrect symbol has been found in XML tag name, only these symbols are allowed: latin letters, numbers and `_-:`.",
            "Обнаружен некорректный символ в названии XML тега, только следующие символы разрешены: латинские буквы, цифры и символы `_-:`.",
            true
        );
    }
    while (is_name_char(current_char_)) {
        current_lexeme_.push_back(current_char_);
        GetChar();
    }
}

// Reads an attribute name into `current_lexeme_`: the (possibly empty) run of
// ASCII letters, digits and the characters `_`, `-`, `:` starting at the current character.
void XMLParser::GetAttributeKeyLexeme() {
    // <cctype> classifiers have undefined behaviour for negative `char` values
    // (any non-ASCII UTF-8 byte), hence the cast to unsigned char. The punctuation
    // is compared explicitly because strchr() would also match the terminating '\0'
    // and thus accept a NUL byte as part of a name.
    const auto is_name_char = [](char character) {
        const auto byte = static_cast<unsigned char>(character);
        return isalpha(byte) || isdigit(byte) ||
               character == '_' || character == '-' || character == ':';
    };

    current_lexeme_.clear();
    while (is_name_char(current_char_)) {
        current_lexeme_.push_back(current_char_);
        GetChar();
    }
}

// Collects an attribute value into `current_lexeme_`: every character up to (not
// including) the quote that opened the value, with whitespace normalised to ' '.
void XMLParser::GetAttributeValueLexeme(char opened_quot) {
    current_lexeme_.clear();
    for (; current_char_ != opened_quot; GetChar()) {
        current_lexeme_.push_back(Normalize(current_char_));
    }
}

// Collects the text body of a tag, with whitespace normalised to ' ', and returns it.
// Reading stops as soon as a `</` lexeme has been consumed; on return
// `current_lexeme_` holds that `</`.
std::string XMLParser::GetBodyLexeme() {
    std::string text;
    for (;;) {
        if (current_char_ == '<') {
            GetSyntacticLexeme();
            if (current_lexeme_ == "</") {
                return text;
            }
        }
        text.push_back(Normalize(current_char_));
        GetChar();
    }
}

void XMLParser::ParseHeader() {
    while (current_lexeme_ == "<?" || current_lexeme_ == "<!DOCTYPE") {
        while (current_char_ != '>') {
            GetChar();
        }
        GetChar();
        GetSyntacticLexeme();
    }
}

// Handles the opening of the tag named by `current_lexeme_`:
//  1. for YML feeds, while warnings are searched for, detects tags duplicated directly inside an offer;
//  2. pushes the tag and its write flag (a tag is written if it is a search tag
//     or its parent is written) and emits the tag name when a written subtree starts;
//  3. for YML feeds with `offer` among the search tags, verifies the path to an `offer` tag.
void XMLParser::ProcessOpenTagEvent() {
    const bool is_yml_feed = feed_type_ == YML_FEED;

    if (is_search_for_warnings && is_yml_feed) {
        if (current_lexeme_ == OFFER_TAG) {
            search_for_duplicates_ = true;
        } else if (search_for_duplicates_ && tags_stack_.back() == OFFER_TAG) {
            const bool is_repeatable =
                current_lexeme_ == PARAM_TAG ||
                current_lexeme_ == IMAGE_TAG ||
                current_lexeme_ == PICTURE_TAG ||
                current_lexeme_ == BARCODE_TAG;

            if (!is_repeatable) {
                if (offer_tags_.contains(current_lexeme_)) {
                    ThrowOrMemoizeNewError(ParserError(
                        TStringBuilder{} << "Found duplicate tag `" << current_lexeme_ << "`, there should be only one usage of that tag.", 
                        TStringBuilder{} << "Обнаружен дубликат тега `" << current_lexeme_ << "`, должно быть только одно использование этого тега.", 
                        true
                    ));
                } else {
                    offer_tags_.insert(current_lexeme_);
                }
            }
        }
    }

    tags_stack_.push_back(current_lexeme_);

    const bool parent_is_written = !write_flags_stack_.empty() && write_flags_stack_.back();
    const bool tag_is_written = search_tags_.count(current_lexeme_) == 1 || parent_is_written;
    write_flags_stack_.push_back(tag_is_written);

    // The tag name is emitted only for the root of a written subtree.
    if (tag_is_written && !parent_is_written) {
        *target_stream_ << tags_stack_.back() << '\t';
    }

    const bool offer_is_searched = search_tags_.count(OFFER_TAG) == 1;
    if (is_yml_feed && offer_is_searched && current_lexeme_ == OFFER_TAG &&
        tags_stack_ != YML_TAGS_ORDER_TO_OFFER) {
        ThrowOrMemoizeNewError(WrongPathError());
    }
}

void XMLParser::ProcessCloseTagEvent() {
    if (is_search_for_warnings) {
        if (current_lexeme_ == CATEGORIES_TAG) {
            if (auto error = cycle_detector.Finish()) {
                ThrowOrMemoizeNewError(ParserError(
                    error->en, 
                    error->ru, 
                    false
                ));
            }
        }
        if (current_lexeme_ == OFFER_TAG && feed_type_ == YML_FEED) {
            search_for_duplicates_ = false;
            offer_tags_.clear();
        }
    }

    tags_stack_.pop_back();
    bool last_write_flag = write_flags_stack_.back();
    write_flags_stack_.pop_back();
    if (last_write_flag && (write_flags_stack_.empty() || !write_flags_stack_.back())) {
        *target_stream_ << '\n';
    }
}

// Recursive-descent parser for a sequence of sibling elements.
//
// Precondition: `current_lexeme_` holds the lexeme preceding the first element; the
// loop runs while that lexeme is `<`. For each element it reads the tag name and the
// attributes, then either closes it immediately (`/>`) or parses its content - a text
// body or nested elements (via recursion) - followed by the matching closing tag.
//
// Throws a parser error when `>`/`/>` or `</` is missing where expected, or when the
// closing tag name differs from the tag on top of the stack.
void XMLParser::ParseXML() {
    while (current_lexeme_ == "<") {
        GetTagLexeme();
        ProcessOpenTagEvent();
        GetNextChar();
        ParseAttributes();
        GetSyntacticLexeme();
        if (current_lexeme_ == "/>") {
            // Self-closing element: no content.
            ProcessCloseTagEvent();
            GetSyntacticLexeme();
            continue;
        } else if (current_lexeme_ == ">") {
            // Skip whitespace after the opening tag; a following `<` starts either a
            // child element or the closing tag and replaces the `>` lexeme.
            GetNextChar();
            if (current_char_ == '<') {
                GetSyntacticLexeme();
            }
            if (current_lexeme_ == ">") {
                // The lexeme is still `>`: the content starts with text, read it as a body.
                std::string body_lexeme = GetBodyLexeme();
                if (write_flags_stack_.back()) {
                    WriteFullTagsStack();
                    *target_stream_ << '=' << body_lexeme << '\t';
                }
            } else {
//                GetSyntacticLexeme();
                // Nested elements (the recursive loop is a no-op if the lexeme is already `</`).
                ParseXML();
            }
            if (current_lexeme_ != "</") {
                throw ExpectedSymbolsError("`</`", "`</`");
            }
            GetTagLexeme();
            if (current_lexeme_ != tags_stack_.back()) {
                throw ParserError(
                    TStringBuilder{} << "Expected tag `" << tags_stack_.back() << "` to be closed, but found closing of the tag `" << current_lexeme_ << "`.",
                    TStringBuilder{} << "Ожидалось закрытие тега `" << tags_stack_.back() << "`, но обнаружено закрытие тега `" << current_lexeme_ << "`.",
                    true
                );
            }
            ProcessCloseTagEvent();
            GetSyntacticLexeme();
            if (current_lexeme_ != ">") {
                throw ExpectedSymbolError('>');
            }
            // Lexeme following the closed element: `<` continues the loop with a sibling.
            GetSyntacticLexeme();
        } else {
            throw ExpectedSymbolsError("`>` or `/>`", "`>` или `/>`");
        }
    }
}

// Parses the attributes of the tag on top of `tags_stack_`: zero or more
// `key="value"` / `key='value'` pairs, each starting with an ASCII letter or digit.
//
// For written tags every pair is emitted as `<tag path>:<key>=<value>\t`.
// For a `category` tag the `id` / `parentId` values are collected and, while warnings
// are searched for, passed to the category cycle detector.
//
// Throws a parser error if `=` or the opening/closing quote is missing.
void XMLParser::ParseAttributes() {
    // <cctype> classifiers have undefined behaviour for negative `char` values
    // (any non-ASCII UTF-8 byte), hence the cast to unsigned char.
    const auto starts_attribute = [](char character) {
        const auto byte = static_cast<unsigned char>(character);
        return isalpha(byte) || isdigit(byte);
    };

    TMaybe<TString> id = Nothing();
    TMaybe<TString> parentId = Nothing();
    while (starts_attribute(current_char_)) {
        GetAttributeKeyLexeme();
        auto key = current_lexeme_;
        if (write_flags_stack_.back()) {
            WriteFullTagsStack();
            *target_stream_ << ':' << current_lexeme_ << '=';
        }
        GetSyntacticLexeme();
        if (current_lexeme_ != "=") {
            throw ExpectedSymbolError('=');
        }
        GetSyntacticLexeme();
        if (current_lexeme_ != "\"" && current_lexeme_ != "'") {
            throw ExpectedSymbolError('"');
        }
        char opened_quot = current_lexeme_[0];
        GetAttributeValueLexeme(opened_quot);
        auto value = current_lexeme_;
        if (write_flags_stack_.back()) {
            *target_stream_ << current_lexeme_ << '\t';
        }
        GetSyntacticLexeme();
        if (current_lexeme_[0] != opened_quot) {
            throw ExpectedSymbolError('"');
        }
        GetNextChar();

        // The tag stack is not modified while attributes are parsed, so the enclosing
        // tag is compared in place instead of being copied for every attribute.
        if (tags_stack_.back() == "category") {
            if (key == "id") { id = value; }
            if (key == "parentId") { parentId = value; }
        }
    }
    if (is_search_for_warnings && id.Defined()) {
        cycle_detector.AddCategory(*id, parentId);
    }
}

// Registers `tag` as a search tag: such tags, together with everything nested
// in them, are written to the target stream (see ProcessOpenTagEvent).
void XMLParser::PushSearchTag(const std::string& tag) {
    search_tags_.insert(tag);
}

// Writes the `:`-joined path of open tags, starting from the outermost written tag.
// Writes nothing if no tag is open or the current tag is not written.
void XMLParser::WriteFullTagsStack() {
    if (tags_stack_.empty() || !write_flags_stack_.back()) {
        return;
    }

    // The top flag is set, so the search stops at the last index at the latest.
    size_t first_written = 0;
    for (; !write_flags_stack_[first_written]; ++first_written) {
    }

    *target_stream_ << tags_stack_[first_written];
    for (size_t index = first_written + 1; index < tags_stack_.size(); ++index) {
        *target_stream_ << ':' << tags_stack_[index];
    }
}

// Returns the warning memoized by ThrowOrMemoizeNewError in non-strong mode.
const TMaybe<TErrorOrWarning>& XMLParser::GetWarning() const {
    return warning;
}
