package ru.yandex.msearch.collector.outergroup;

import java.io.IOException;
import java.io.FileInputStream;
import java.io.BufferedInputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.File;

import java.nio.charset.CharacterCodingException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

import ru.yandex.function.StringBuilderProcessorAdapter;
import ru.yandex.function.StringVoidProcessor;

import ru.yandex.msearch.Config;
import ru.yandex.msearch.collector.FieldToIndex;
import ru.yandex.msearch.collector.YaDoc3;
import ru.yandex.msearch.collector.YaField;

import ru.yandex.parser.uri.PctDecoder;
import ru.yandex.parser.uri.PctEncoder;
import ru.yandex.parser.uri.PctEncodingRule;

import ru.yandex.util.string.StringUtils;

/**
 * Outer-group function that normalizes and deduplicates the lines of a single
 * document field.
 *
 * <p>The field value is split on {@code '\n'}. Each non-empty segment is
 * decoded with {@link PctDecoder}, optionally rewritten by the first matching
 * {@link Rule}, dropped if it is empty or already present in the internal
 * "seen" set, and otherwise re-encoded with the
 * {@link PctEncodingRule#FRAGMENT} rule and appended (followed by
 * {@code '\n'}) to the new field value.
 *
 * <p>The "seen" set is an instance field that is never cleared in this class,
 * so lines are deduplicated across all documents passed to the same instance.
 * It is capped at {@link #DEDUPLICATOR_MAX_SIZE} entries to bound memory use.
 *
 * <p>NOTE(review): the instance keeps mutable scratch state (a
 * {@code StringBuilder}, the "seen" set, the encoder) without any
 * synchronization, so it looks intended for single-threaded use — confirm
 * against callers.
 */
public class XURLSOuterDeduplicator implements OuterGroupFunction
{
    /** Upper bound on the number of distinct lines remembered. */
    private static final int DEDUPLICATOR_MAX_SIZE = 5000;

    private final PctDecoder decoder = new PctDecoder(false);
    private final StringVoidProcessor<char[], CharacterCodingException> encoder =
        new StringVoidProcessor<>(new PctEncoder(PctEncodingRule.FRAGMENT));
    /** Scratch buffer reused for building the rewritten field value. */
    private final StringBuilder sb = new StringBuilder();
    /** Lines already emitted by this instance. */
    private final Set<String> deduplicator = new HashSet<>();
    private final List<Rule> rules;
    private final String fieldName;
    private final int fieldIndex;
    private final boolean useRegex;
    private final Logger logger;

    /**
     * A single {@code s/find/replace/} substitution: a compiled search
     * pattern and the replacement string passed to
     * {@link Matcher#replaceAll(String)}.
     */
    public static class Rule {
        public final Pattern pattern;
        public final String replace;

        public Rule(final Pattern p, final String r) {
            pattern = p;
            replace = r;
        }
    }

    /** Returns the human-readable syntax hint appended to parse errors. */
    private static String syntax() {
        return "Syntax should be of the following form: s/find/replace/. Where '/' is any character delimiter like '#' and so on.";
    }

    /**
     * Finds the first occurrence of {@code delimiter} in {@code text} that is
     * not escaped, i.e. that is preceded by an even number (possibly zero) of
     * consecutive backslashes.
     *
     * @return the index of the first unescaped delimiter, or {@code -1}
     */
    private static int findUnescapedDelimiter(
        final String text,
        final char delimiter)
    {
        int pos = text.indexOf(delimiter);
        while (pos != -1) {
            int backslashes = 0;
            for (int i = pos - 1; i >= 0 && text.charAt(i) == '\\'; --i) {
                ++backslashes;
            }
            if ((backslashes & 1) == 0) {
                return pos;
            }
            // Escaped delimiter: continue searching AFTER it. Restarting
            // from index 0 would find the same position forever.
            pos = text.indexOf(delimiter, pos + 1);
        }
        return -1;
    }

    /**
     * Parses one line of the rules file.
     *
     * <p>Blank lines and lines starting with {@code ;}, {@code //} or
     * {@code #} are ignored. Any other line must have the form
     * {@code s<d>find<d>replace<d>} where {@code <d>} is the character that
     * follows the leading {@code s}. A delimiter inside {@code find} can be
     * escaped with a backslash; the backslash is kept in the pattern text.
     *
     * @param line   raw line as read from the file
     * @param lineNo 1-based line number, used only in log messages
     * @param logger destination for syntax error reports
     * @return the parsed rule, or {@code null} if the line is a comment,
     *     blank, or syntactically invalid (invalid lines are logged at
     *     SEVERE level)
     * @throws java.util.regex.PatternSyntaxException if the search part is
     *     not a valid regular expression
     */
    private static Rule parsePatternLine(
        final String line,
        final int lineNo,
        final Logger logger)
    {
        final String trimmed = line.trim();
        if (trimmed.isEmpty()
            || trimmed.startsWith(";")
            || trimmed.startsWith("//")
            || trimmed.startsWith("#"))
        {
            return null;
        }
        if (!trimmed.startsWith("s")) {
            logger.severe( "Invalid regex modifier: '" + trimmed.charAt(0) + "'. The only supported is 's'. Error at line<" + lineNo + ">: " + line );
            return null;
        }
        if (trimmed.length() < 4) {
            logger.severe( "Invalid regex syntax: " + syntax() + "Error at line<" + lineNo + ">: " + line );
            return null;
        }
        final char delimiter = trimmed.charAt(1);
        if (trimmed.charAt(trimmed.length() - 1) != delimiter) {
            logger.severe( "Invalid regex syntax: unclosed regex. " + syntax() + "Error at line<" + lineNo + ">: " + line );
            return null;
        }
        // Strip the leading "s<d>" and the trailing "<d>".
        final String keyVal = trimmed.substring(2, trimmed.length() - 1);
        final int sep = findUnescapedDelimiter(keyVal, delimiter);
        if (sep == -1) {
            logger.severe( "Invalid regex syntax: unclosed regex. " + syntax() + "Error at line<" + lineNo + ">: " + line );
            return null;
        }
        if (sep == 0) {
            // "s//x/": an empty search pattern would match every line and
            // shadow all following rules; report it like other syntax errors.
            logger.severe( "Invalid regex syntax: empty search pattern. " + syntax() + "Error at line<" + lineNo + ">: " + line );
            return null;
        }
        final String search = keyVal.substring(0, sep);
        final String replace = keyVal.substring(sep + 1);
        return new Rule(Pattern.compile(search), replace);
    }

    /**
     * Loads substitution rules from a text file, one rule per line (see
     * {@code parsePatternLine} for the accepted syntax). Invalid lines are
     * logged and skipped.
     *
     * @param regexFile file to read, or {@code null} for "no rules"
     * @param logger    destination for progress and error messages
     * @return the rules in file order; an empty list if {@code regexFile} is
     *     {@code null}
     * @throws IOException if the file cannot be read or any other exception
     *     is raised while parsing; the original exception is the cause
     */
    public static List<Rule> loadPatterns(
        final File regexFile,
        final Logger logger)
        throws IOException
    {
        if (regexFile == null) {
            return Collections.emptyList();
        }
        List<Rule> rules = new ArrayList<>();
        logger.info("X_URLS regexp loader: Loading regex pattens for X_URLS "
            + "from file: " + regexFile);
        // NOTE(review): no charset is passed to InputStreamReader, so the
        // file is decoded with the platform default charset.
        try (BufferedReader reader =
                new BufferedReader(
                    new InputStreamReader(
                        new FileInputStream(regexFile))))
        {
            String line;
            int lineNo = 0;
            while ((line = reader.readLine()) != null) {
                ++lineNo;
                Rule rule = parsePatternLine(line, lineNo, logger);
                if (rule != null) {
                    rules.add(rule);
                }
            }
        } catch (Exception e) {
            logger.log(
                Level.SEVERE,
                "X_URLS regexp loader: can't load patterns from file: "
                    + regexFile,
                e);
            throw new IOException(e);
        }
        logger.info("X_URLS regexp loader: loaded total of " + rules.size()
            + " patterns.");
        return rules;
    }

    /**
     * Creates a deduplicator with regex rewriting enabled.
     *
     * @param rules        substitution rules, tried in order
     * @param fieldName    name of the document field to process
     * @param fieldToIndex resolver from field name to field index
     * @param logger       destination for diagnostics
     */
    public XURLSOuterDeduplicator(
        final List<Rule> rules,
        final String fieldName,
        final FieldToIndex fieldToIndex,
        final Logger logger)
    {
        this(rules, fieldName, fieldToIndex, true, logger);
    }

    /**
     * Creates a deduplicator.
     *
     * @param rules        substitution rules, tried in order
     * @param fieldName    name of the document field to process
     * @param fieldToIndex resolver from field name to field index
     * @param useRegex     whether {@code rules} are applied to each line
     * @param logger       destination for diagnostics
     */
    public XURLSOuterDeduplicator(
        final List<Rule> rules,
        final String fieldName,
        final FieldToIndex fieldToIndex,
        final boolean useRegex,
        final Logger logger)
    {
        this.rules = rules;
        this.fieldName = StringHelper.intern(fieldName);
        this.useRegex = useRegex;
        this.logger = logger;
        fieldIndex = fieldToIndex.indexFor(this.fieldName);
    }

    /**
     * Applies the first rule whose pattern matches {@code line}.
     *
     * @return the result of replacing all matches of the first matching
     *     rule, or {@code line} itself if regex rewriting is disabled or no
     *     rule matches
     */
    private String modifyLine(final String line) {
        if (useRegex) {
            for (Rule rule : rules) {
                Matcher matcher = rule.pattern.matcher(line);
                // Explicit match test instead of comparing the replaceAll
                // result with the input by reference. replaceAll resets the
                // matcher itself, so the preceding find() does not affect it.
                if (matcher.find()) {
                    return matcher.replaceAll(rule.replace);
                }
            }
        }
        return line;
    }

    /**
     * Processes one segment {@code cbuf[off, off + len)} of the field value:
     * decodes it, rewrites it, and, unless it is empty or already seen,
     * appends its encoded form plus {@code '\n'} to {@code sb}.
     *
     * <p>Coding errors are logged at INFO level and the segment is skipped.
     */
    private void check(final char[] cbuf, final int off, final int len,
        final StringBuilder sb)
    {
        if (len <= 0) {
            return;
        }
        try {
            String line = modifyLine(decoder.decode(cbuf, off, len));
            if ("".equals(line) || deduplicator.contains(line)) {
                return;
            }
            // Once the cap is reached new lines are still emitted but no
            // longer remembered.
            if (deduplicator.size() < DEDUPLICATOR_MAX_SIZE) {
                deduplicator.add(line);
            }
            encoder.process(line);
            encoder.processWith(new StringBuilderProcessorAdapter(sb));
            sb.append('\n');
        } catch (CharacterCodingException e) {
            if (logger.isLoggable(Level.INFO)) {
                logger.log(
                    Level.INFO,
                    "Error when processing " + new String(cbuf) + '['
                        + off + ',' + len + "]",
                    e);
            }
        }
    }

    /**
     * Rewrites the configured field of {@code document} so that it contains
     * only lines not seen before by this instance.
     *
     * @return {@code true} if the "seen" set is already full, if the field
     *     is absent or has a {@code null} string value (in these cases the
     *     document is left untouched), or if no line survived
     *     deduplication; {@code false} otherwise
     */
    @Override
    public boolean modifyAndCheckDuplicated(final YaDoc3 document) {
        // Prevent from huge memory usage
        if (deduplicator.size() >= DEDUPLICATOR_MAX_SIZE) {
            return true;
        }
        YaField field = document.getField(fieldIndex);
        String value = field == null ? null : field.toString();
        if (value == null) {
            return true;
        }

        // TODO: use persistent buffer
        char[] cbuf = value.toCharArray();
        sb.setLength(0);
        int prev = 0;
        for (int i = 0; i < cbuf.length; ++i) {
            if (cbuf[i] == '\n') {
                check(cbuf, prev, i - prev, sb);
                prev = i + 1;
            }
        }
        // Trailing segment after the last '\n' (or the whole value).
        check(cbuf, prev, cbuf.length - prev, sb);
        value = sb.toString();
        document.setField(
            fieldIndex,
            new YaField.StringYaField(StringUtils.getUtf8Bytes(value)));
        return value.isEmpty();
    }

    /**
     * Diagnostic helper: applies every rule to {@code in} independently and
     * logs each result at FINE level. Does not modify any state.
     */
    public void checkString(final String in) {
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("XURLSOuterDeduplicator.checkString: checking string matching "
                + "for string <" + in +">");
        }
        for (Rule rule : rules) {
            Matcher matcher = rule.pattern.matcher(in);
            String out = matcher.replaceAll(rule.replace);
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("XURLSOuterDeduplicator.checkString using pattern: "
                    + rule.pattern.toString() + "/" + rule.replace
                    + ". Result: " + out);
            }
        }
    }

    /** Returns the single field name this function needs loaded. */
    public Set<String> loadFields() {
        return Collections.singleton(fieldName);
    }
}
