#include "static_map_engine.h"

#include <fstream>
#include <sys/mman.h>
#include <fcntl.h>
#include <unordered_set>
#include <vector>

#include "../../cpp-source/common/norm/Tokens.cpp"

#include <library/cpp/logger/global/global.h>

#include <util/string/cast.h>
#include <util/system/env.h>

// Number of slots in the binary-search probe cache (`deepcache`) used by
// StaticMapEngine::GetValue. It must stay a power of two: GetValue tests
// `deep & map_engine_cache_size` to detect that the search-tree index has
// grown past the cached range.
const size_t map_engine_cache_size = 1048576;

// Creates an engine with 1000-byte scratch buffers, a zeroed stop-word
// prefilter table and an empty probe cache, then loads the dictionary if a
// non-empty path is given.
//
// data_file_directory: path handed to LoadData() (despite the name it is
//     opened as a file). NULL or "" creates an engine with no data loaded.
StaticMapEngine::StaticMapEngine(const char* data_file_directory)
    : data_file_descriptor(-1)
    , storage_size_(0)
    , storage_(NULL)
    , buffer_size_(1000)
    , buffer_(new char[buffer_size_])
    , buffer_size_nrm(1000)
    , buffer_nrm(new char[buffer_size_nrm])
    , buffer_size_subphrases(1000)
    , buffer_subphrases(new char[buffer_size_subphrases])
{
    for (int i = 0; i < 1000; i++) {
        stop_chk[i] = 0;
    }
    default_search_range = 0;
    // Value-initialisation zeroes every slot, including the spare last one
    // that the previous explicit loop left uninitialised.
    deepcache = new unsigned char*[map_engine_cache_size + 1]();

    tokenizer.init_uc2lc();
    // Guard against NULL before looking at the first character.
    if (data_file_directory != NULL && *data_file_directory) {
        LoadData(data_file_directory);
    }
}

// Releases the mapping/descriptor via Clear() and frees every array the
// constructor allocated with new[].
StaticMapEngine::~StaticMapEngine() {
    Clear();

    delete[] buffer_;
    // These three were allocated in the constructor but never released.
    delete[] buffer_nrm;
    delete[] buffer_subphrases;
    delete[] deepcache;
}

void StaticMapEngine::LoadData(const char* data_file_directory) {
    Clear();
    FILE* data_file = fopen(data_file_directory, "rb");
    if (data_file == NULL) {
        ERROR_LOG << "Can't read \"" << data_file_directory << "\"";
        return;
    }
    fseek(data_file, 0, SEEK_END);
    storage_size_ = ftell(data_file);
    fclose(data_file);

    if (data_file_descriptor != -1 && fcntl(data_file_descriptor, F_GETFD) != -1) {
        close(data_file_descriptor);
    }

    data_file_descriptor = open(data_file_directory, O_RDONLY);
    storage_ = (unsigned char*)mmap(NULL, storage_size_, PROT_READ,
                                    MAP_SHARED, data_file_descriptor, 0);
    if (storage_ == MAP_FAILED) {
        ERROR_LOG << "Can't mmap \"" << data_file_descriptor << "\"";
        close(data_file_descriptor);
        return;
    }

    TString env_mr_broadmatch = GetEnv("MR_BROADMATCH");

    if (env_mr_broadmatch && env_mr_broadmatch != "0") {
        int mlock_result = mlock((const void*)storage_, storage_size_);
        if (mlock_result != 0) {
            INFO_LOG << "Can't mlock \"" << data_file_descriptor
                     << "\""
                     << ", message \"" << strerror(errno) << "\"";
        }
    }
}

unsigned char* left_bound;
size_t search_range;
size_t shift;
unsigned char* middle_bound;
unsigned char* middle_key;
void StaticMapEngine::GetValue(const unsigned char* key,
                               size_t key_size,
                               const unsigned char*& value,
                               size_t& value_size) {
    left_bound = storage_ - 1;
    if (default_search_range) {
        search_range = default_search_range;
    } else {
        search_range = storage_size_;
        if (*(left_bound + search_range) == '\n') {
            --search_range;
        }
        while (search_range && *(left_bound + search_range) != '\n') {
            --search_range;
        }
        default_search_range = search_range;
    }
    int deep = 1;
    while (search_range) {
        if (deep & map_engine_cache_size) {
            shift = (search_range + 1) >> 1;
            middle_bound = left_bound + shift;
            middle_key = middle_bound;
            while (*middle_key != '\n') {
                ++middle_key;
            }
            ++middle_key;
            if (memcmp(middle_key, key, key_size) > 0) {
                search_range = --shift;
            } else {
                left_bound = middle_bound;
                search_range -= shift;
            }
        } else {
            shift = (search_range + 1) >> 1;
            middle_bound = left_bound + shift;
            if (deepcache[deep]) {
                middle_key = deepcache[deep];
            } else {
                middle_key = middle_bound;
                while (*middle_key != '\n') {
                    ++middle_key;
                }
                ++middle_key;
                deepcache[deep] = middle_key;
            }
            if (memcmp(middle_key, key, key_size) > 0) {
                search_range = --shift;
                deep = deep << 1;
            } else {
                left_bound = middle_bound;
                search_range -= shift;
                deep = deep << 1;
                deep++;
            }
        }
    }
    if (memcmp(left_bound + 1, key, key_size) != 0) {
        value = NULL;
        value_size = 0;
        return;
    }
    left_bound += key_size + 1;
    value = left_bound;
    while (left_bound < storage_ + storage_size_ && *left_bound != '\n') {
        ++left_bound;
    }
    value_size = left_bound - value;
}

// Parses the lemma for the current normalization language out of a
// multi-language dictionary value.
//
// Layout of p[0..key_size) as handled here:
//     <1 marker byte><default lemma>[,<lang>:<lemma>]...
//
// p, key_size: the raw value; byte 0 is skipped.
// prx, prx_size: language code to look for (not required to be
//     NUL-terminated).
// value, value_size: receive the lemma of the first entry whose text before
//     ':' equals prx exactly; if there is none, the default lemma.
//
// Every read stays inside p[0..key_size) and prx[0..prx_size).
void StaticMapEngine::GetSubstrValue(const unsigned char* p, size_t key_size, const unsigned char* prx, size_t prx_size, const unsigned char*& value, size_t& value_size) {
    const size_t default_begin = 1;
    size_t default_end = default_begin;
    while ((default_end < key_size) && (p[default_end] != ',')) {
        ++default_end;
    }

    // Walk the comma-separated "<lang>:<lemma>" entries after the default.
    size_t entry_begin = default_end;
    while (entry_begin < key_size) {
        ++entry_begin; // step over the ','
        size_t entry_end = entry_begin;
        while ((entry_end < key_size) && (p[entry_end] != ',')) {
            ++entry_end;
        }

        const size_t entry_size = entry_end - entry_begin;
        // The entry must be long enough to hold "<prx>:" before comparing.
        if ((entry_size > prx_size) &&
            (p[entry_begin + prx_size] == ':') &&
            (prx_size == 0 || memcmp(p + entry_begin, prx, prx_size) == 0)) {
            const size_t lemma_begin = entry_begin + prx_size + 1;
            value = p + lemma_begin;
            value_size = entry_end - lemma_begin;
            return;
        }
        entry_begin = entry_end;
    }

    // Only one lemma, or no entry for this language: use the default.
    value = p + default_begin;
    value_size = default_end - default_begin;
}

// Legacy file-scope scratch variables. LangValue used to keep its lookup
// state here, shared by every engine and thread. LangValue now uses locals;
// the declarations are kept only so that any external reference still links.
// TODO(review): remove once it is confirmed nothing else refers to them.
size_t lang_size;
size_t value_size;
const unsigned char* value;
size_t key_size;

// Returns the dictionary lemma for the token `key` in language `lang`.
//
// key: NUL-terminated, writable token. Its terminator is temporarily
//     replaced by '\t' to form the lookup key and restored before returning.
// lang: NUL-terminated language code, used when the stored value starts with
//     '*' (multi-language value, see GetSubstrValue).
//
// Returns `key` itself when the lookup yields an empty value; otherwise a
// pointer to the engine-owned buffer_, which is overwritten (and possibly
// reallocated) by the next LangValue call.
const char* StaticMapEngine::LangValue(char* key, const char* lang) {
    const size_t lookup_size = strlen(key) + 1;
    const unsigned char* found = NULL;
    size_t found_size = 0;

    key[lookup_size - 1] = '\t';
    GetValue((const unsigned char*)key, lookup_size, found, found_size);
    key[lookup_size - 1] = 0;

    if (found_size == 0) {
        return key;
    }
    if (*found == '*') {
        // Input pointer/size are passed by value, so reusing the same
        // variables for the outputs is safe.
        GetSubstrValue(found, found_size, (const unsigned char*)lang, strlen(lang), found, found_size);
    }

    if (buffer_size_ < found_size + 1) {
        while (buffer_size_ < found_size + 1) {
            buffer_size_ <<= 1;
        }
        delete[] buffer_;
        buffer_ = new char[buffer_size_];
    }
    memcpy(buffer_, found, found_size);
    buffer_[found_size] = 0;
    return buffer_;
}

// Normalizes a phrase: tokenizes `text`, replaces each token by its lemma for
// `lang` (LangValue), drops stop words registered for `lang`, and joins the
// result with single spaces.
//
// text: NUL-terminated phrase.
// lang: language code; also the key into `stops`.
// uniq: when true, keep only the first occurrence of each lemma.
// sort: when true, sort the lemmas (std::sort on std::string) before the
//     uniq step.
//
// Tokens whose first character is '-' are skipped.
// Returns a pointer to the engine-owned buffer_nrm, which is overwritten (and
// possibly reallocated) by the next GetNorm/GetNormList call.
// NOTE(review): ownership of the token buffers produced by GetTokensLite is
// not visible here; confirm against the tokenizer.
const char* StaticMapEngine::GetNorm(const char* text, const char* lang, bool uniq, bool sort) {
    std::vector<char*> tokens;
    tokenizer.GetTokensLite(text, tokens);

    std::unordered_set<std::string>& langstops = stops[lang];
    std::vector<std::string> words;
    words.reserve(tokens.size());

    // Upper bound for the joined output (stop words are counted too).
    size_t value_size = 0;
    for (size_t i = 0; i < tokens.size(); i++) {
        char* tkn = tokens[i];
        if (tkn[0] == '-') {
            continue;
        }

        std::string word = LangValue(tkn, lang);
        value_size += word.size() + 1;

        // Check whether any stop word shares the same first 3 characters
        // before paying for the hash lookup. ch[2] is only examined when
        // ch[1] is not the terminator, so a one-character lemma never reads
        // past the end of the string.
        const char* ch = word.c_str();
        bool maybe_stop = ch[0] && stop_chk[(unsigned char)ch[0]];
        if (maybe_stop && ch[1]) {
            maybe_stop = stop_chk[(unsigned char)ch[1] + 256] &&
                         (!ch[2] || stop_chk[(unsigned char)ch[2] + 512]);
        }
        if (maybe_stop && (langstops.find(word) != langstops.end())) {
            continue;
        }
        words.push_back(word);
    }

    if (sort) {
        std::sort(words.begin(), words.end());
    }
    if (uniq) {
        std::unordered_set<std::string> seen;
        std::vector<std::string> uniq_words;
        uniq_words.reserve(words.size());
        for (size_t i = 0; i < words.size(); i++) {
            if (seen.insert(words[i]).second) {
                uniq_words.push_back(words[i]);
            }
        }
        words.swap(uniq_words);
    }

    if (buffer_size_nrm < value_size + 1) {
        while (buffer_size_nrm < value_size + 1) {
            buffer_size_nrm <<= 1;
        }
        delete[] buffer_nrm;
        buffer_nrm = new char[buffer_size_nrm];
    }

    // Copy each word with its terminator, then turn every terminator except
    // the last into a separating space.
    buffer_nrm[0] = 0;
    size_t cursz = 0;
    const size_t sz = words.size();
    for (size_t i = 0; i < sz; i++) {
        const std::string& word = words[i];
        memcpy(buffer_nrm + cursz, word.c_str(), word.size() + 1);
        cursz += word.size() + 1;
        if (i + 1 < sz) {
            buffer_nrm[cursz - 1] = ' ';
        }
    }

    return buffer_nrm;
}

// Normalizes a comma-separated list of phrases.
//
// text: NUL-terminated, writable list; every ',' in it is overwritten with
//     '\0' so each phrase can be handed to GetNorm in place.
// lang: language code forwarded to GetNorm.
//
// Empty phrases are dropped. Returns the normalized phrases joined with ','
// in the engine-owned buffer_nrm, which the next GetNorm/GetNormList call
// overwrites.
const char* StaticMapEngine::GetNormList(char* text, const char* lang) {
    const size_t text_len = strlen(text);
    std::vector<std::string> normalized;
    size_t needed = 0;

    // One pass over the text; the terminating NUL closes the last phrase the
    // same way a comma closes the others.
    size_t phrase_start = 0;
    for (size_t pos = 0; pos <= text_len; ++pos) {
        const bool at_end = (pos == text_len);
        if (!at_end && text[pos] != ',') {
            continue;
        }
        if (!at_end) {
            text[pos] = 0;
        }
        if (phrase_start < pos) {
            // Copy immediately: GetNorm returns the shared buffer_nrm.
            std::string phrase = GetNorm(text + phrase_start, lang);
            needed += phrase.size() + 1;
            normalized.push_back(phrase);
        }
        phrase_start = pos + 1;
    }

    if (buffer_size_nrm < needed + 1) {
        do {
            buffer_size_nrm <<= 1;
        } while (buffer_size_nrm < needed + 1);
        delete[] buffer_nrm;
        buffer_nrm = new char[buffer_size_nrm];
    }

    // Each phrase is copied with its terminator; for all but the last the
    // terminator is then replaced by the separator.
    char* out = buffer_nrm;
    *out = 0;
    const size_t count = normalized.size();
    for (size_t idx = 0; idx < count; ++idx) {
        const std::string& phrase = normalized[idx];
        memcpy(out, phrase.c_str(), phrase.size() + 1);
        out += phrase.size();
        if (idx + 1 < count) {
            *out = ',';
            ++out;
        }
    }

    return buffer_nrm;
}

// Appends to `phrases` every non-empty, order-preserving combination of the
// first words of `text`.
//
// text: NUL-terminated phrase with words separated by ' '; one trailing '\n'
//     is ignored. The text itself is not modified.
// cnt: maximum number of leading words to combine. Values <= 0 produce no
//     phrases; the effective count is additionally capped at 30 so that the
//     2^count combinations stay representable.
// phrases: receives 2^count - 1 pointers to NUL-terminated strings, in
//     ascending bit-mask order (bit j selects word j). The strings live in
//     the engine-owned buffer_subphrases and are invalidated by the next
//     GetSubphrases call.
void StaticMapEngine::GetSubphrases(char* text, int cnt, std::vector<char*>& phrases) {
    static const size_t kMaxSubphraseWords = 30;

    size_t sz = strlen(text);
    if ((sz > 0) && (text[sz - 1] == '\n')) {
        sz--;
    }

    // Split on spaces; the end of the text closes the last word.
    std::vector<char*> words;
    std::vector<size_t> lths;
    size_t wb = 0;
    for (size_t i = 0; i <= sz; i++) {
        if (i == sz || text[i] == ' ') {
            if (wb < i) {
                words.push_back(text + wb);
                lths.push_back(i - wb);
            }
            wb = i + 1;
        }
    }

    size_t wcnt = words.size();
    const size_t limit = cnt > 0 ? static_cast<size_t>(cnt) : 0;
    if (wcnt > limit) {
        wcnt = limit;
    }
    if (wcnt > kMaxSubphraseWords) {
        wcnt = kMaxSubphraseWords;
    }
    if (wcnt == 0) {
        return;
    }

    // Every word appears in exactly half of the 2^wcnt masks and costs its
    // length plus one byte (separator or terminator) each time.
    size_t bytes_per_round = 0;
    for (size_t j = 0; j < wcnt; j++) {
        bytes_per_round += lths[j] + 1;
    }
    const size_t max_mask = static_cast<size_t>(1) << wcnt;
    const size_t value_size = (max_mask >> 1) * bytes_per_round;

    if (buffer_size_subphrases < value_size + 1) {
        while (buffer_size_subphrases < value_size + 1) {
            buffer_size_subphrases <<= 1;
        }
        delete[] buffer_subphrases;
        buffer_subphrases = new char[buffer_size_subphrases];
    }

    size_t curps = 0;
    for (size_t mask = 1; mask < max_mask; mask++) {
        phrases.push_back(buffer_subphrases + curps);
        for (size_t j = 0; j < wcnt; j++) {
            if (mask & (static_cast<size_t>(1) << j)) {
                memcpy(buffer_subphrases + curps, words[j], lths[j]);
                curps += lths[j];
                buffer_subphrases[curps] = ' ';
                curps++;
            }
        }
        // mask != 0, so at least one word was written: turn its trailing
        // space into the terminator.
        buffer_subphrases[curps - 1] = 0;
    }
}

// Registers every line of a stop-word dictionary file for a language.
//
// stop_words_dict: path of a text file with one stop word per line.
// lang: language the words are registered for (forwarded to AddStop).
//
// Throws std::runtime_error when the file cannot be opened.
void StaticMapEngine::LoadStopWords(const char* stop_words_dict, const char* lang) {
    std::ifstream dict(stop_words_dict);
    if (dict.is_open()) {
        for (std::string line; getline(dict, line);) {
            AddStop(line.c_str(), lang);
        }
        dict.close();
        return;
    }
    throw std::runtime_error("ERROR: can't load stop words dict");
}

void StaticMapEngine::Clear() {
    if (storage_ != NULL) {
        munmap(storage_, storage_size_);
        // снимаем блокировку с адресов ram
        munlock((const void*)storage_, storage_size_);
    }

    // закрываем дескриптор, если все еще открыт
    if (data_file_descriptor != -1 && fcntl(data_file_descriptor, F_GETFD) != -1) {
        close(data_file_descriptor);
    }

    storage_size_ = 0;
}
