#include "regexp_to_trigram.h"
#include <util/string/split.h>
#include <util/generic/set.h>

// Extracts the trigrams that are collected from the mandatory literal parts of `regexp`.
//
// The expression is first cut into pieces that lie outside of brackets and
// escape sequences; every piece then contributes its guaranteed substrings to
// GarantedStrings, and finally all trigrams of those substrings are returned.
//
// NOTE(review): GarantedStrings is not reset here, so results of earlier
// calls on the same object appear to be carried over — confirm this is intended.
TVector<TString> TRegexpToTrigrams::Parse(const TStringBuf& regexp) {
    TVector<TStringBuf> topLevelPieces = RemoveBracketsAndSpecSymbols(regexp);
    for (const TStringBuf& piece : topLevelPieces) {
        AddGarantedStringsWithoutBrackets(piece);
    }
    return GetTrigramsFromGarantedStrings();
}

// Registers the guaranteed substrings of a bracket-free piece of a regexp.
//
// A piece that contains the alternation operator '|' is ignored entirely:
// nothing is added to GarantedStrings for it.
void TRegexpToTrigrams::AddGarantedStringsWithoutBrackets(const TStringBuf& regexp) {
    const bool hasAlternation = regexp.Contains('|');
    if (hasAlternation) {
        return;
    }
    AddGarantedStringsWithoutBracketsAndOr(regexp);
}

void TRegexpToTrigrams::AddGarantedStringsWithoutBracketsAndOr(const TStringBuf& regexp) {
    TVector<TStringBuf> strings = StringSplitter(regexp).SplitBySet("*?").SkipEmpty();
    TVector<TStringBuf> preGarantedStrings;
    for (size_t i = 0; i < strings.size(); ++i) {
        TStringBuf& str = strings[i];
        if (str[str.size() - 1] == '.' || i != strings.size() - 1) {
            preGarantedStrings.push_back(str.substr(0, str.size() - 1));
        }
        if (i == strings.size() - 1) {
            preGarantedStrings.push_back(str);
        }
    }
    for (TStringBuf& str : preGarantedStrings) {
        StringSplitter(str).SplitBySet(".+").SkipEmpty().AddTo(&GarantedStrings);
    }
}

// Builds the list of distinct trigrams (substrings of length 3) occurring in
// the strings stored in GarantedStrings.
//
// Strings shorter than three characters contribute nothing. The trigrams are
// gathered in a TSet, so the returned vector holds each trigram once, in the
// iteration order of that set.
TVector<TString> TRegexpToTrigrams::GetTrigramsFromGarantedStrings() {
    constexpr size_t trigramLength = 3;

    TSet<TString> uniqueTrigrams;
    for (const TStringBuf& guaranteed : GarantedStrings) {
        if (guaranteed.size() < trigramLength) {
            continue;
        }
        const size_t windowCount = guaranteed.size() - trigramLength + 1;
        for (size_t start = 0; start < windowCount; ++start) {
            uniqueTrigrams.insert(TString(guaranteed.substr(start, trigramLength)));
        }
    }
    return TVector<TString>(uniqueTrigrams.begin(), uniqueTrigrams.end());
}

// Cuts `regexp` into the pieces of text that lie outside of any brackets and
// outside of escape sequences.
//
// Rules applied while scanning:
//   * '{', '[' and '(' open a bracket group, '}', ']' and ')' close one; the
//     three kinds are not distinguished and may nest. Text inside a group is
//     never emitted.
//   * A backslash starts an escape sequence, which is dropped as a whole:
//     "\x" followed by up to two hex digits, a numeric code of up to three
//     decimal digits, or a single escaped character.
//   * '^', '$' and '/' act as plain separators.
//
// Returns views into `regexp`; empty pieces may be present in the result.
TVector<TStringBuf> TRegexpToTrigrams::RemoveBracketsAndSpecSymbols(TStringBuf regexp) {
    const auto isDecDigit = [](char c) {
        return '0' <= c && c <= '9';
    };
    const auto isHexDigit = [](char c) {
        return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
    };

    TVector<TStringBuf> ans;
    size_t last = 0;       // start of the piece that is currently being collected
    size_t inBrackets = 0; // nesting depth of open bracket groups

    for (size_t current = 0; current < regexp.size(); ++current) {
        switch (regexp[current]) {
            case '{':
            case '[':
            case '(':
                if (inBrackets == 0) {
                    ans.push_back(regexp.substr(last, current - last));
                }
                ++inBrackets;
                break;

            case '}':
            case ']':
            case ')':
                // An unmatched closing bracket must not wrap the unsigned depth
                // counter around; the text in front of it is simply discarded.
                if (inBrackets > 0) {
                    --inBrackets;
                }
                if (inBrackets == 0) {
                    last = current + 1;
                }
                break;

            case '\\': { // escape sequence
                if (inBrackets == 0) {
                    ans.push_back(regexp.substr(last, current - last));
                }
                // Index of the last character that belongs to the escape sequence.
                // A trailing backslash has nothing after it and ends the sequence itself.
                size_t escapeEnd = current;
                if (current + 1 < regexp.size()) {
                    escapeEnd = current + 1;
                    const char escaped = regexp[escapeEnd];
                    if (escaped == 'x') {
                        // hex character code: consume up to two hex digits after 'x'
                        for (int digits = 0; digits < 2 && escapeEnd + 1 < regexp.size() && isHexDigit(regexp[escapeEnd + 1]); ++digits) {
                            ++escapeEnd;
                        }
                    } else if (isDecDigit(escaped)) {
                        // numeric character code: up to three digits in total
                        for (int digits = 1; digits < 3 && escapeEnd + 1 < regexp.size() && isDecDigit(regexp[escapeEnd + 1]); ++digits) {
                            ++escapeEnd;
                        }
                    }
                }
                // Stop ON the last consumed character so that the loop increment moves
                // to the first character after the escape and it is processed normally.
                current = escapeEnd;
                last = current + 1;
                break;
            }

            case '^':
            case '$':
            case '/':
                if (inBrackets == 0) {
                    ans.push_back(regexp.substr(last, current - last));
                }
                last = current + 1;
                break;

            default:
                break;
        }
    }

    // Flush the trailing piece. Text that is still inside an unclosed bracket
    // group is not emitted, because it is not known to be mandatory.
    if (inBrackets == 0 && last < regexp.size()) {
        ans.push_back(regexp.substr(last, regexp.size() - last));
    }
    return ans;
}
