package ru.yandex.search.mail.yt.consumer.alice;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;

import ru.yandex.parser.config.ConfigException;
import ru.yandex.search.mail.yt.consumer.config.ImmutableAliceConfig;
import ru.yandex.search.request.util.SearchRequestText;

/**
 * Extracts probable human display names from free text.
 *
 * <p>A dictionary of first names is loaded from the file referenced by the
 * config. Every dictionary entry is stored lower-cased together with its
 * Cyrillic-to-Latin transliterations produced by each scheme in
 * {@link #RAW_SCHEMES}. Surnames and patronymics are recognised
 * heuristically by their endings.
 */
public class AliceDislayNameExtractor {
    private static final int PREFIX_LENGTH = 3;
    // CSOFF: MultipleStringLiterals
    /** Matches every character that is not alphanumeric, space or '-'. */
    private static final Pattern CLEANER = Pattern.compile(
        "[^\\p{Alnum}\\s\\-]",
        Pattern.UNICODE_CHARACTER_CLASS);
    /**
     * Matches a lower-cased token that ends like a patronymic. Compiled
     * once instead of on every {@link #probablePatronymic(String)} call.
     */
    private static final Pattern PATRONYMIC =
        Pattern.compile(".*(ich|tch|вич|вна|vna)");
    // NOTE(review): "западе", "шииты" and "ипа" do not look like surname
    // endings and may be spell-checker artifacts - confirm against the
    // source this list was taken from.
    private static final String[] SURNAME_SUFFIXES = {
        "джи", "дзе", "дзки", "ев", "ева", "ер", "заде",
        "западе", "иа", "ий", "ик", "ин", "ина", "ини", "ипа", "ис",
        "ишин", "ка", "кин", "ко", "кос", "кызы", "ли", "ман", "не",
        "ни", "ний", "ный", "ов", "ова", "огло", "оглу", "пулос", "си", "ски",
        "ских", "ску", "те", "ти", "уа", "ук", "ун", "уни", "ури", "цки",
        "чай", "швили", "шииты", "ын", "юк", "ян", "янц", "ыы"};
    /**
     * Transliteration schemes: each scheme is a list of
     * {lower-case Cyrillic letter, Latin replacement} pairs.
     */
    private static final String[][][] RAW_SCHEMES = {
        {
            {"а", "a"},
            {"б", "b"},
            {"в", "v"},
            {"г", "g"},
            {"д", "d"},
            {"е", "e"},
            {"ё", "jo"},
            {"ж", "zh"},
            {"з", "z"},
            {"и", "i"},
            {"й", "jj"},
            {"к", "k"},
            {"л", "l"},
            {"м", "m"},
            {"н", "n"},
            {"о", "o"},
            {"п", "p"},
            {"р", "r"},
            {"с", "s"},
            {"т", "t"},
            {"у", "u"},
            {"ф", "f"},
            {"х", "kh"},
            {"ц", "c"},
            {"ч", "ch"},
            {"ш", "sh"},
            {"щ", "shh"},
            {"ъ", "''"},
            {"ы", "y"},
            {"ь", "'"},
            {"э", "eh"},
            {"ю", "ju"},
            {"я", "ja"}
        },
        {
            {"а", "a"},
            {"б", "b"},
            {"в", "v"},
            {"г", "g"},
            {"д", "d"},
            {"е", "e"},
            {"ё", "e"},
            {"ж", "zh"},
            {"з", "z"},
            {"и", "i"},
            {"й", "i"},
            {"к", "k"},
            {"л", "l"},
            {"м", "m"},
            {"н", "n"},
            {"о", "o"},
            {"п", "p"},
            {"р", "r"},
            {"с", "s"},
            {"т", "t"},
            {"у", "u"},
            {"ф", "f"},
            {"х", "kh"},
            {"ц", "tc"},
            {"ч", "ch"},
            {"ш", "sh"},
            {"щ", "shch"},
            {"ъ", ""},
            {"ы", "y"},
            {"ь", ""},
            {"э", "e"},
            {"ю", "iu"},
            {"я", "ia"}
        },
        {
            {"а", "a"},
            {"б", "b"},
            {"в", "v"},
            {"г", "g"},
            {"д", "d"},
            {"е", "e"},
            {"ё", "e"},
            {"ж", "zh"},
            {"з", "z"},
            {"и", "i"},
            {"й", "i"},
            {"к", "k"},
            {"л", "l"},
            {"м", "m"},
            {"н", "n"},
            {"о", "o"},
            {"п", "p"},
            {"р", "r"},
            {"с", "s"},
            {"т", "t"},
            {"у", "u"},
            {"ф", "f"},
            {"х", "kh"},
            {"ц", "ts"},
            {"ч", "ch"},
            {"ш", "sh"},
            {"щ", "shch"},
            {"ъ", "ie"},
            {"ы", "y"},
            {"ь", ""},
            {"э", "e"},
            {"ю", "iu"},
            {"я", "ia"}
        },
        {
            {"а", "a"},
            {"б", "b"},
            {"в", "v"},
            {"г", "g"},
            {"д", "d"},
            {"е", "e"},
            {"ё", "e"},
            {"ж", "j"},
            {"з", "z"},
            {"и", "i"},
            {"й", "i"},
            {"к", "k"},
            {"л", "l"},
            {"м", "m"},
            {"н", "n"},
            {"о", "o"},
            {"п", "p"},
            {"р", "r"},
            {"с", "s"},
            {"т", "t"},
            {"у", "u"},
            {"ф", "f"},
            {"х", "h"},
            {"ц", "c"},
            {"ч", "ch"},
            {"ш", "sh"},
            {"щ", "sc"},
            {"ъ", ""},
            {"ы", "y"},
            {"ь", ""},
            {"э", "e"},
            {"ю", "iu"},
            {"я", "ia"}
        }
    };

    // CSON: MultipleStringLiterals
    /** Lookup tables built from {@link #RAW_SCHEMES}, in the same order. */
    private static final List<Map<Character, String>> SCHEMES =
        new ArrayList<>(RAW_SCHEMES.length);
    /**
     * {@link #SURNAME_SUFFIXES} transliterated with the first scheme,
     * index-aligned with the Cyrillic array. Computed once so that
     * {@link #probableSurname(String)} does not transliterate per call.
     */
    private static final String[] LATIN_SURNAME_SUFFIXES =
        new String[SURNAME_SUFFIXES.length];

    static {
        for (String[][] rawScheme : RAW_SCHEMES) {
            Map<Character, String> ruToEnTable = new HashMap<>();
            for (String[] pair : rawScheme) {
                ruToEnTable.put(pair[0].charAt(0), pair[1]);
            }
            SCHEMES.add(ruToEnTable);
        }
        for (int i = 0; i < SURNAME_SUFFIXES.length; ++i) {
            LATIN_SURNAME_SUFFIXES[i] =
                ruToEnForScheme(SURNAME_SUFFIXES[i], SCHEMES.get(0));
        }
    }

    /** Lower-cased dictionary names and their transliterations. */
    private final Set<String> names = new HashSet<>();
    /**
     * First {@link #PREFIX_LENGTH} characters of dictionary names longer
     * than that, with transliterations. Only populated, never read, inside
     * this class.
     */
    private final Set<String> shortenNames = new HashSet<>();

    /**
     * Loads the names dictionary, one name per line, from the UTF-8 file
     * returned by {@code config.humanNamesFile()}. Lines are trimmed and
     * blank lines are ignored.
     *
     * @param config configuration providing the names file
     * @throws ConfigException if the file cannot be opened or read
     */
    public AliceDislayNameExtractor(
        final ImmutableAliceConfig config)
        throws ConfigException
    {
        try (Stream<String> lines = Files.lines(
                config.humanNamesFile().toPath(),
                StandardCharsets.UTF_8))
        {
            lines.map(String::trim)
                .filter(line -> !line.isEmpty())
                .forEach(this::initName);
        } catch (IOException e) {
            throw new ConfigException("Failed to load names file", e);
        } catch (UncheckedIOException e) {
            // Files.lines reads lazily: errors that happen while the stream
            // is being consumed arrive wrapped in UncheckedIOException.
            throw new ConfigException(
                "Failed to load names file",
                e.getCause());
        }
    }

    /**
     * Finds runs of consecutive tokens that look like a person's name.
     *
     * <p>The text is normalized, every character matched by
     * {@link #CLEANER} is replaced with a space and the result is split on
     * whitespace. A run starts at a token found in the names dictionary and
     * is extended by following tokens that are dictionary names, probable
     * surnames, probable patronymics or numeric; any other token ends it.
     *
     * @param text text to scan
     * @return the runs found, in order of appearance, each with its tokens
     *     joined by a single space
     */
    public List<String> parse(final String text) {
        String normalized = SearchRequestText.normalize(text);
        // The replacement has to be a literal space: in a replacement
        // string "\\s" is an escaped letter 's', not a whitespace class.
        String cleaned = CLEANER.matcher(normalized).replaceAll(" ");

        List<String> found = new ArrayList<>();
        StringBuilder sb = new StringBuilder();
        for (String token: cleaned.split("\\s+")) {
            String lowToken = token.toLowerCase(Locale.ROOT);

            if (probableName(lowToken)) {
                sb.append(token).append(' ');
            } else if (sb.length() > 0) {
                if (probableSurname(token)
                    || probablePatronymic(token)
                    || StringUtils.isNumeric(lowToken))
                {
                    sb.append(token).append(' ');
                } else {
                    flush(sb, found);
                }
            }
        }

        if (sb.length() > 0) {
            flush(sb, found);
        }

        return found;
    }

    /**
     * Moves the accumulated run, without its trailing space, into
     * {@code found} and clears the buffer.
     */
    private static void flush(
        final StringBuilder sb,
        final List<String> found)
    {
        sb.setLength(sb.length() - 1);
        found.add(sb.toString());
        sb.setLength(0);
    }

    /**
     * Checks whether the word is in the names dictionary.
     *
     * @param name word to check, in any case
     * @return {@code true} if the lower-cased word equals a dictionary name
     *     or one of its transliterations
     */
    public boolean probableName(final String name) {
        return names.contains(name.toLowerCase(Locale.ROOT));
    }

    /**
     * Checks whether the word ends with one of the known surname suffixes,
     * either in Cyrillic or transliterated with the first scheme.
     *
     * @param name word to check, in any case
     * @return {@code true} if the lower-cased word has a surname-like ending
     */
    public boolean probableSurname(final String name) {
        String lowerName = name.toLowerCase(Locale.ROOT);
        for (int i = 0; i < SURNAME_SUFFIXES.length; ++i) {
            if (lowerName.endsWith(SURNAME_SUFFIXES[i])
                || lowerName.endsWith(LATIN_SURNAME_SUFFIXES[i]))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether the word ends like a patronymic.
     *
     * @param pat word to check, in any case
     * @return {@code true} if the lower-cased word matches
     *     {@link #PATRONYMIC}
     */
    public boolean probablePatronymic(final String pat) {
        return PATRONYMIC.matcher(pat.toLowerCase(Locale.ROOT)).matches();
    }

    /**
     * Registers one dictionary name: the lower-cased name, all its
     * transliterations, and the same for its {@link #PREFIX_LENGTH}
     * character prefix when the name is longer than that.
     */
    private void initName(final String name) {
        String rname = name.toLowerCase(Locale.ROOT);
        names.add(rname);
        addNonEmpty(names, ruToEnAllOption(rname));
        if (rname.length() > PREFIX_LENGTH) {
            String sub = rname.substring(0, PREFIX_LENGTH);
            shortenNames.add(sub);
            addNonEmpty(shortenNames, ruToEnAllOption(sub));
        }
    }

    /**
     * Adds all non-empty values to the target set. A word without any
     * letters known to a scheme transliterates to an empty string, which
     * must not become a dictionary entry: it would make the empty token
     * look like a name.
     */
    private static void addNonEmpty(
        final Set<String> target,
        final List<String> values)
    {
        for (String value : values) {
            if (!value.isEmpty()) {
                target.add(value);
            }
        }
    }

    /**
     * Transliterates the word with the first scheme of
     * {@link #RAW_SCHEMES}.
     *
     * @param w word to transliterate
     * @return transliteration; characters absent from the scheme are dropped
     */
    public String ruToEn(final String w) {
        return ruToEnForScheme(w, SCHEMES.get(0));
    }

    /**
     * Transliterates the word with every available scheme.
     *
     * @param w word to transliterate
     * @return one transliteration per scheme, in scheme order; may contain
     *     duplicates and empty strings
     */
    public ArrayList<String> ruToEnAllOption(final String w) {
        ArrayList<String> res = new ArrayList<>(SCHEMES.size());
        for (Map<Character, String> sch : SCHEMES) {
            res.add(ruToEnForScheme(w, sch));
        }
        return res;
    }

    /**
     * Transliterates the word character by character using the given table.
     * Characters without a mapping are dropped. For an upper-case source
     * character the first character of the replacement is upper-cased.
     */
    private static String ruToEnForScheme(
        final String w,
        final Map<Character, String> ruToEnTable)
    {
        StringBuilder sb = new StringBuilder(w.length() * 2);
        for (int i = 0; i < w.length(); i++) {
            char c = w.charAt(i);
            String res = ruToEnTable.get(Character.toLowerCase(c));
            // Some schemes map the hard/soft sign to "": nothing to append,
            // and charAt(0) below would throw for an upper-case sign.
            if (res == null || res.isEmpty()) {
                continue;
            }
            if (Character.isUpperCase(c)) {
                sb.append(Character.toUpperCase(res.charAt(0)));
                sb.append(res, 1, res.length());
            } else {
                sb.append(res);
            }
        }
        return sb.toString();
    }
}
