package ru.yandex.msearch.collector.docprocessor;

import java.text.ParseException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.util.StringHelper;

import ru.yandex.msearch.ProcessorRequestContext;
import ru.yandex.msearch.collector.YaDoc3;
import ru.yandex.msearch.collector.YaField;
import ru.yandex.util.string.StringUtils;

/**
 * DocProcessor for filtering docs by contacts.
 * Arguments are expected to be in the form:
 * field1(,field2)* domain1(,domain2)* request1-1(,request1-2)* request2-1(,request2-2)* ...
 * The domains argument may also be {@code all} (cut every email domain)
 * or {@code none} (keep email domains).
 * Email domains and TLDs are cut from the contact fields before matching.
 * A doc is kept if at least one of the fields contains
 * (request1-1 AND request1-2 AND ...) OR (request2-1 AND request2-2 AND ...) OR ...
 */
public class FilterByContactsDocProcessor implements DocProcessor {
    private static final String ARGUMENTS_EXPECTED_MSG =
        "Arguments expected to be in form: field1(,field2)* domain1(,domain2)* "
        + "request1-1(,request1-2)* request2-1(,request2-2)* ... Was: ";

    /** Top-level domains that are cut from field values when preceded by '.'. */
    private static final String[] TLDS = new String[]{
        "ru", "com", "net", "org", "info", "int", "edu", "ua", "by",
        "kz", "de", "cn", "uk", "nl", "su", "eu", "tk", "us", "tr",
        "рф"};
    /** Characters that (besides space and end of input) may terminate a domain. */
    private static final String PUNCT_MARKS = "./\\:;!#$%&()*+=[]^`{|}~";

    private static final String EXACT_HITS_SUFFIX = "_exact_hits";
    private static final String NON_EXACT_HITS_SUFFIX = "_non_exact_hits";
    /** Scanner state: regular text, characters are copied to the output. */
    private static final int STRING_TOKEN = 0;
    /** Scanner state: inside an email domain, characters are dropped. */
    private static final int EMAIL_DOMAIN = 1;
    private static final YaField ZERO_YA_FIELD = new YaField.IntegerYaField(0);
    private static final YaField ONE_YA_FIELD = new YaField.IntegerYaField(1);

    /** Interned field name -> field index obtained from the request context. */
    private final Map<String, Integer> fieldIndexes;
    /** Interned field name -> name of the "#<field>_exact_hits" marker field. */
    private final Map<String, String> hitsFields;
    /** Interned field name -> name of the "#<field>_non_exact_hits" marker field. */
    private final Map<String, String> nonExactHitsFields;
    /**
     * OR-list of AND-lists of request tokens. Every token is lower-cased and
     * prefixed with a space, so it only matches at the start of a word.
     */
    private final List<List<String>> requests;
    /** Lower-cased email domains to cut, or null for the "all"/"none" modes. */
    private final String[] domains;
    /** False only for the "none" mode, in which email domains are kept. */
    private final boolean stripEmailDomains;

    /**
     * Parses the processor arguments.
     *
     * @param args string of the form
     *     {@code fields domains request1 request2 ...}, where every part is a
     *     comma separated list and {@code domains} may also be {@code all} or
     *     {@code none}
     * @param context request context used to resolve field names to indexes
     * @throws ParseException if the fields, domains or requests part is
     *     missing, or if the requests part contains no tokens
     */
    public FilterByContactsDocProcessor(
        final String args,
        final ProcessorRequestContext context)
        throws ParseException
    {
        if (args == null || args.isEmpty()) {
            throw new ParseException("Arguments required", 0);
        }
        int space = args.indexOf(' ');
        if (space <= 0 || space + 1 == args.length()) {
            throw new ParseException(
                "No domains specified. " + ARGUMENTS_EXPECTED_MSG + args,
                0);
        }

        int nextSpace = args.indexOf(' ', space + 1);
        if (nextSpace <= 0 || nextSpace + 1 == args.length()) {
            throw new ParseException(
                "No requests specified. " + ARGUMENTS_EXPECTED_MSG + args,
                0);
        }

        String[] fields = args.substring(0, space).split(",");
        fieldIndexes = new HashMap<>(fields.length);
        hitsFields = new HashMap<>(fields.length);
        nonExactHitsFields = new HashMap<>(fields.length);
        for (String field: fields) {
            // All three maps share the same interned key.
            String internField = StringHelper.intern(field);
            fieldIndexes.put(
                internField,
                context.fieldToIndex().indexFor(internField));
            hitsFields.put(
                internField,
                StringUtils.intern('#' + field + EXACT_HITS_SUFFIX));
            nonExactHitsFields.put(
                internField,
                StringUtils.intern('#' + field + NON_EXACT_HITS_SUFFIX));
        }

        String domainsStr = args.substring(space + 1, nextSpace);
        switch (domainsStr) {
            case "none":
                stripEmailDomains = false;
                domains = null;
                break;
            case "all":
                stripEmailDomains = true;
                domains = null;
                break;
            default:
                stripEmailDomains = true;
                // Field values are lower-cased before the domains are
                // compared, so the configured domains must be lower-cased
                // as well, otherwise they could never match.
                domains = lowerCase(domainsStr).split(",");
                break;
        }

        requests = new ArrayList<>();
        final StringBuilder token = new StringBuilder();
        List<String> andRequests = new ArrayList<>();
        for (int i = nextSpace + 1; i < args.length(); i++) {
            final char c = args.charAt(i);
            if (c == ',' || c == ' ') {
                flushToken(token, andRequests);
                // A space closes the current AND-group, a comma extends it.
                if (c == ' ' && !andRequests.isEmpty()) {
                    requests.add(andRequests);
                    andRequests = new ArrayList<>();
                }
            } else {
                // Same normalization as applied to the field values.
                token.append(Character.toLowerCase(c));
            }
        }
        flushToken(token, andRequests);
        if (!andRequests.isEmpty()) {
            requests.add(andRequests);
        }
        if (requests.isEmpty()) {
            // Only separators were given: nothing could ever match.
            throw new ParseException(
                "No requests specified. " + ARGUMENTS_EXPECTED_MSG + args,
                0);
        }
    }

    /**
     * Moves the accumulated token, if any, into {@code target} and resets
     * the accumulator. The stored token is prefixed with a space.
     */
    private static void flushToken(
        final StringBuilder token,
        final List<String> target)
    {
        if (token.length() != 0) {
            target.add(StringHelper.intern(' ' + token.toString()));
            token.setLength(0);
        }
    }

    /** Lower-cases the string char by char with {@link Character#toLowerCase(char)}. */
    private static String lowerCase(final String s) {
        final char[] chars = s.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    /** Copies the builder content into a new char array. */
    private static char[] toChars(final StringBuilder sb) {
        final char[] out = new char[sb.length()];
        sb.getChars(0, out.length, out, 0);
        return out;
    }

    /**
     * Checks every configured field of the doc against the requests and
     * stores the outcome (1 or 0) in the corresponding exact and non-exact
     * hits marker fields; both markers receive the same value. Fields absent
     * from the doc are skipped and get no markers.
     *
     * @param doc document to check and annotate
     * @return true if at least one field matched at least one request group
     */
    @Override
    public boolean processWithFilter(final YaDoc3 doc) {
        boolean result = false;
        for (Map.Entry<String, Integer> entry: fieldIndexes.entrySet()) {
            YaField field = doc.getField(entry.getValue());
            if (field == null) {
                continue;
            }
            // Leading space lets space-prefixed tokens match the first word.
            final String value = ' ' + removeDomains(field.toString());
            final boolean matched = containsAny(value);
            if (matched) {
                result = true;
            }
            final YaField hit = matched ? ONE_YA_FIELD : ZERO_YA_FIELD;
            final String key = entry.getKey();
            doc.setField(hitsFields.get(key), hit);
            doc.setField(nonExactHitsFields.get(key), hit);
        }
        return result;
    }

    /** Returns true if the value satisfies at least one AND-group. */
    private boolean containsAny(final String value) {
        for (List<String> andRequest: requests) {
            if (containsAll(value, andRequest)) {
                return true;
            }
        }
        return false;
    }

    /** Returns true if {@code s} contains every keyword as a substring. */
    private boolean containsAll(final String s, final List<String> keywords) {
        for (String keyword: keywords) {
            if (!s.contains(keyword)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Normalizes a field value: lower-cases it, replaces separators with
     * spaces, cuts email domains according to the configured mode and
     * finally cuts the known TLDs.
     */
    private String removeDomains(final String hdr) {
        char[] result;
        if (stripEmailDomains) {
            if (domains == null) {
                result = removeAllEmailDomainsAndPunct(hdr.toCharArray());
            } else {
                result = removeEmailDomains(removePunct(hdr.toCharArray()));
            }
        } else {
            // "none" mode: keep the domain, only split it from the login.
            result = removePunct(hdr.replace('@', ' ').toCharArray());
        }
        return new String(removeTLD(result));
    }

    /**
     * Returns a copy of the input where separator characters are replaced
     * with spaces and all other characters are lower-cased. '.' and '@'
     * are kept as they are.
     */
    private char[] removePunct(final char[] hdr) {
        final char[] out = new char[hdr.length];
        for (int i = 0; i < hdr.length; i++) {
            final char current = hdr[i];
            switch (current) {
                case '_':
                case '-':
                case ',':
                case '>':
                case '<':
                case '"':
                case '\'':
                case ' ':
                case '\n':
                case '\t':
                case '\f':
                case '\r':
                    out[i] = ' ';
                    break;
                default:
                    out[i] = Character.toLowerCase(current);
                    break;
            }
        }
        return out;
    }

    /**
     * Single pass normalization for the "all" mode: everything between '@'
     * and the next separator is dropped, the remaining text is lower-cased
     * and separators are replaced with spaces.
     */
    private char[] removeAllEmailDomainsAndPunct(final char[] hdr) {
        StringBuilder sb = new StringBuilder(hdr.length);
        int tokenState = STRING_TOKEN;
        for (char current: hdr) {
            if (Character.isLetterOrDigit(current)) {
                if (tokenState == STRING_TOKEN) {
                    sb.append(Character.toLowerCase(current));
                }
                continue;
            }
            switch (current) {
                case '@':
                    // Start of a domain: everything up to a separator is cut.
                    tokenState = EMAIL_DOMAIN;
                    sb.append(' ');
                    break;
                case '_':
                case '-':
                    // These do not terminate a domain.
                    if (tokenState == STRING_TOKEN) {
                        sb.append(' ');
                    }
                    break;
                case '.':
                    // Kept outside of domains so that TLDs can be cut later.
                    if (tokenState == STRING_TOKEN) {
                        sb.append('.');
                    }
                    break;
                case ',':
                case '>':
                case '<':
                case '"':
                case '\'':
                case ' ':
                case '\n':
                case '\t':
                case '\f':
                case '\r':
                    tokenState = STRING_TOKEN;
                    sb.append(' ');
                    break;
                default:
                    tokenState = STRING_TOKEN;
                    sb.append(Character.toLowerCase(current));
                    break;
            }
        }
        return toChars(sb);
    }

    /** Cuts the configured email domains that follow an '@'. */
    private char[] removeEmailDomains(final char[] hdr) {
        return removeCustomDomains(hdr, '@', domains);
    }

    /** Cuts the known TLDs that follow a '.'. */
    private char[] removeTLD(final char[] hdr) {
        return removeCustomDomains(hdr, '.', TLDS);
    }

    /**
     * Replaces every {@code startSymbol} with a space and drops the text
     * right after it if that text equals one of the candidates and is
     * followed by the end of input, a space or one of PUNCT_MARKS.
     *
     * @param hdr text to process
     * @param startSymbol character that introduces a domain
     * @param candidates domains to cut
     * @return processed copy of the text
     */
    private char[] removeCustomDomains(
        final char[] hdr,
        final char startSymbol,
        final String[] candidates)
    {
        StringBuilder sb = new StringBuilder(hdr.length);
        for (int i = 0; i < hdr.length; i++) {
            if (hdr[i] != startSymbol) {
                sb.append(hdr[i]);
                continue;
            }
            sb.append(' ');
            final int pos = i + 1;
            for (String candidate: candidates) {
                final int nextPos = pos + candidate.length();
                if (nextPos > hdr.length) {
                    // Not enough characters left for this candidate.
                    continue;
                }
                boolean containsDomain = true;
                for (int j = 0; j < candidate.length(); j++) {
                    if (hdr[pos + j] != candidate.charAt(j)) {
                        containsDomain = false;
                        break;
                    }
                }
                if (containsDomain
                    && (nextPos == hdr.length || hdr[nextPos] == ' '
                        || PUNCT_MARKS.indexOf(hdr[nextPos]) != -1))
                {
                    // Skip the domain; the loop increment moves to nextPos.
                    i = nextPos - 1;
                    break;
                }
            }
        }
        return toChars(sb);
    }

    /** Registers the configured fields with the aggregator. */
    @Override
    public void apply(final ModuleFieldsAggregator aggregator) {
        aggregator.add(fieldIndexes.keySet(), Collections.emptySet());
    }
}
