package ru.yandex.tikaite.util;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.epub.EpubParser;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.gdal.GDALParser;
import org.apache.tika.parser.journal.JournalParser;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.pot.PooledTimeSeriesParser;
import org.apache.tika.parser.strings.StringsParser;

import ru.yandex.tikaite.detect.CharsetDetector;
import ru.yandex.tikaite.parser.HtmlParser;

/**
 * Holds the detectors and parsers used for text extraction.
 *
 * <p>The context flattens the supplied root parsers into leaf parsers,
 * builds a lookup table from media type to the first leaf parser which
 * declared support for it, and exposes composite detectors together with
 * an {@link AutoDetectParser} assembled from the parsers that ended up in
 * that table.
 */
public class TextExtractorContext implements Serializable {
    private static final long serialVersionUID = 0L;
    // Always appended after the caller supplied detectors.
    private static final List<Detector> DETECTORS_SUFFIX =
        Collections.singletonList(CharsetDetector.INSTANCE);

    // Registry-normalized media type -> first discovered parser claiming it.
    private final Map<MediaType, Parser> types = new HashMap<>();
    private final MediaTypeRegistry registry;
    private final Detector fastDetector;
    // null when the slow detectors list equals the fast one.
    private final Detector slowDetector;
    private final AutoDetectParser autoDetectParser;

    // CSOFF: ParameterNumber
    /**
     * Creates a context from the given registry, detectors and parsers.
     *
     * <p>Any {@link EpubParser} found among the leaf parsers is reconfigured
     * in place to use {@link HtmlParser#INSTANCE} as its content parser.
     *
     * @param registry registry used to normalize media types and to look
     *     up their supertypes
     * @param detectorsPrefix detectors placed before both the fast and the
     *     slow detectors
     * @param fastDetectors detectors used by {@link #fastDetector()}
     * @param slowDetectors detectors used by {@link #slowDetector()}; when
     *     this list equals {@code fastDetectors} no separate slow detector
     *     is created
     * @param rootParsers parsers to flatten and register, in priority
     *     order: for each media type the first parser claiming it wins
     */
    public TextExtractorContext(
        final MediaTypeRegistry registry,
        final List<Detector> detectorsPrefix,
        final List<Detector> fastDetectors,
        final List<Detector> slowDetectors,
        final List<Parser> rootParsers)
    {
        this.registry = registry;

        // LinkedHashSet keeps discovery order and drops duplicates.
        Set<Parser> parsers = new LinkedHashSet<>();
        for (Parser parser: rootParsers) {
            listParsers(parser, parsers);
        }

        ParseContext context = new ParseContext();
        for (Parser parser: parsers) {
            if (parser instanceof EpubParser) {
                ((EpubParser) parser).setContentParser(HtmlParser.INSTANCE);
            }
            context.set(Parser.class, parser);
            for (MediaType type: parser.getSupportedTypes(context)) {
                // First discovered parser keeps the type.
                types.putIfAbsent(registry.normalize(type), parser);
            }
        }

        fastDetector =
            new CompositeDetector(
                registry,
                concat(
                    detectorsPrefix,
                    fastDetectors,
                    DETECTORS_SUFFIX));
        Detector autoDetectDetector;
        if (slowDetectors.equals(fastDetectors)) {
            slowDetector = null;
            autoDetectDetector = fastDetector;
        } else {
            slowDetector =
                new CompositeDetector(
                    registry,
                    concat(
                        detectorsPrefix,
                        slowDetectors,
                        DETECTORS_SUFFIX));
            autoDetectDetector = slowDetector;
        }

        // Only parsers which own at least one media type are handed over to
        // the auto detect parser. They are listed in a stable order derived
        // from the discovery order instead of hash order, so that the result
        // does not vary between JVM runs.
        Set<Parser> winners = new HashSet<>(types.values());
        List<Parser> ordered = new ArrayList<>(winners.size());
        for (Parser parser: parsers) {
            if (winners.contains(parser)) {
                ordered.add(parser);
            }
        }
        // NOTE(review): CompositeParser is expected to let parsers listed
        // later override earlier ones for the same media type, so the list is
        // reversed to keep the "first discovered wins" rule used for the
        // types map above. Confirm against the Tika version in use.
        Collections.reverse(ordered);
        autoDetectParser = new AutoDetectParser(
            autoDetectDetector,
            ordered.toArray(new Parser[0]));
    }
    // CSON: ParameterNumber

    /**
     * Concatenates three lists into a new list.
     *
     * @param prefix elements placed first
     * @param body elements placed after the prefix
     * @param suffix elements placed last
     * @param <T> element type of the resulting list
     * @return new mutable list with all the elements in the given order
     */
    private static <T> List<T> concat(
        final List<? extends T> prefix,
        final List<? extends T> body,
        final List<? extends T> suffix)
    {
        List<T> result =
            new ArrayList<>(prefix.size() + body.size() + suffix.size());
        result.addAll(prefix);
        result.addAll(body);
        result.addAll(suffix);
        return result;
    }

    /**
     * Recursively collects leaf parsers reachable from the given parser.
     *
     * <p>Composite parsers are expanded into their children. Instances of
     * GDALParser, ExternalParser, StringsParser, TesseractOCRParser,
     * JournalParser, PooledTimeSeriesParser and ForkParser are skipped.
     *
     * @param parser parser to inspect
     * @param parsers collection receiving the accepted leaf parsers
     */
    private static void listParsers(
        final Parser parser,
        final Collection<Parser> parsers)
    {
        if (parser instanceof CompositeParser) {
            for (Parser p: ((CompositeParser) parser).getParsers().values()) {
                listParsers(p, parsers);
            }
        } else if (!(parser instanceof GDALParser
            || parser instanceof ExternalParser
            || parser instanceof StringsParser
            || parser instanceof TesseractOCRParser
            || parser instanceof JournalParser
            || parser instanceof PooledTimeSeriesParser
            || parser instanceof ForkParser))
        {
            parsers.add(parser);
        }
    }

    /**
     * @return composite detector built from the prefix, the fast detectors
     *     and the charset detector
     */
    public Detector fastDetector() {
        return fastDetector;
    }

    /**
     * @return composite detector built from the prefix, the slow detectors
     *     and the charset detector, or {@code null} if the slow detectors
     *     list was equal to the fast detectors list
     */
    public Detector slowDetector() {
        return slowDetector;
    }

    /**
     * @return auto detect parser which uses the slow detector when one
     *     exists and the fast detector otherwise
     */
    public AutoDetectParser autoDetectParser() {
        return autoDetectParser;
    }

    /**
     * Looks up the parser registered for exactly the given media type.
     * No supertype fallback is performed here.
     *
     * @param mediaType media type to look up
     * @return registered parser, or {@code null} if there is none
     */
    public Parser parserFor(final MediaType mediaType) {
        return types.get(mediaType);
    }

    /**
     * Finds the closest type having a registered parser by walking up the
     * registry supertype chain, starting from the given type itself.
     *
     * @param originalType media type to start from
     * @return the first type in the chain with a registered parser, or
     *     {@code originalType} if the chain ends or reaches
     *     {@code application/octet-stream} before such type is found
     */
    public MediaType normalize(final MediaType originalType) {
        MediaType type = originalType;
        while (type != null && !type.equals(MediaType.OCTET_STREAM)) {
            Parser parser = types.get(type);
            if (parser == null) {
                type = registry.getSupertype(type);
            } else {
                return type;
            }
        }
        return originalType;
    }
}

