package ru.yandex.tikaite.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.TimeZone;
import java.util.zip.ZipException;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.poi.util.IOUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TaggedIOException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.epub.EpubParser;
import org.apache.tika.parser.geoinfo.GeographicInformationParser;
import org.apache.tika.parser.grib.GribParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.isatab.ISArchiveParser;
import org.apache.tika.parser.mat.MatParser;
import org.apache.tika.parser.mbox.OutlookPSTParser;
import org.apache.tika.parser.microsoft.JackcessParser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.mp4.MP4Parser;
import org.apache.tika.parser.netcdf.NetCDFParser;
import org.apache.tika.parser.odf.OpenDocumentParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.TaggedContentHandler;
import org.gagravarr.tika.OggDetector;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import ru.yandex.detect.rfc822.Rfc822Detector;
import ru.yandex.function.GenericAutoCloseableHolder;
import ru.yandex.io.ByteArrayInputStreamFactory;
import ru.yandex.io.DecodableByteArrayOutputStream;
import ru.yandex.io.GenericCloseableAdapter;
import ru.yandex.io.IOStreamUtils;
import ru.yandex.io.LimitedIOException;
import ru.yandex.io.LimitedWriter;
import ru.yandex.io.TrimmingWriter;
import ru.yandex.parser.html.BodyContentHandler;
import ru.yandex.parser.html.TextContentHandler;
import ru.yandex.tikaite.detect.DmarcDetector;
import ru.yandex.tikaite.detect.MetadataExtractorDetector;
import ru.yandex.tikaite.detect.TarDetector;
import ru.yandex.tikaite.detect.UngzippingDetector;
import ru.yandex.tikaite.detect.UnzippingDetector;
import ru.yandex.tikaite.parser.DmarcParser;
import ru.yandex.tikaite.parser.Fb2Parser;
import ru.yandex.tikaite.parser.HeifParser;
import ru.yandex.tikaite.parser.HtmlParser;
import ru.yandex.tikaite.parser.PdfParser;
import ru.yandex.tikaite.parser.PkpassParser;
import ru.yandex.tikaite.parser.PngParser;
import ru.yandex.tikaite.parser.RarParser;
import ru.yandex.tikaite.parser.RtfParser;
import ru.yandex.tikaite.parser.TxtParser;
import ru.yandex.tikaite.parser.mp4.JcToolsMp4Parser;
import ru.yandex.util.string.StringUtils;

public enum TextExtractor {
    INSTANCE;

    /**
     * Charset returned by {@link #normalizeCharset(String)} when the
     * requested charset name cannot be resolved.
     */
    public static final Charset DEFAULT_EMAIL_CHARSET =
        Charset.forName("KOI8-R");

    // PDF parser configuration; adjusted in the constructor and placed into
    // the ParseContext created by extractText
    @SuppressWarnings("ImmutableEnumChecker")
    private final PDFParserConfig pdfParserConfig = new PDFParserConfig();
    // Mime type database created in the constructor from tika-mimetypes.xml
    // and tikaite-mimetypes.xml
    @SuppressWarnings("ImmutableEnumChecker")
    private final MimeTypes mimeTypes;
    // Media type registry obtained from mimeTypes
    @SuppressWarnings("ImmutableEnumChecker")
    private final MediaTypeRegistry registry;
    // Extraction contexts, listed in the order they gain parsers in the
    // constructor (so2Context is built from its own separate parser list)
    @SuppressWarnings("ImmutableEnumChecker")
    private final TextExtractorContext ultraFastContext;
    @SuppressWarnings("ImmutableEnumChecker")
    private final TextExtractorContext fastContext;
    @SuppressWarnings("ImmutableEnumChecker")
    private final TextExtractorContext so2Context;
    @SuppressWarnings("ImmutableEnumChecker")
    private final TextExtractorContext slowContext;

    /**
     * Loads the mime type definitions and builds the four extraction
     * contexts.
     *
     * <p>The {@code detectorsPrefix} and {@code parsers} lists are handed to
     * one context and then extended before being handed to the next one, so
     * the order of the statements below determines what each context
     * receives. NOTE(review): this presumably relies on
     * {@code TextExtractorContext} copying the lists it is given - confirm.
     *
     * <p>Side effects: replaces the JVM default time zone and sets the POI
     * byte array max override.
     *
     * @throws RuntimeException wrapping the {@link IOException} or
     *     {@link MimeTypeException} raised while loading the mime type
     *     definitions
     */
    TextExtractor() {
        pdfParserConfig.setCatchIntermediateIOExceptions(false);
        // tika-mimetypes.xml is looked up relative to MimeTypesFactory,
        // tikaite-mimetypes.xml relative to this class
        try (InputStream tikaMimetypes =
                MimeTypesFactory.class.getResourceAsStream(
                    "tika-mimetypes.xml");
            InputStream tikaiteMimetypes =
                TextExtractor.class.getResourceAsStream(
                    "tikaite-mimetypes.xml"))
        {
            mimeTypes =
                MimeTypesFactory.create(tikaMimetypes, tikaiteMimetypes);
        } catch (IOException | MimeTypeException e) {
            throw new RuntimeException(e);
        }
        registry = mimeTypes.getMediaTypeRegistry();

        // Global settings: applied once, when the enum constant is created
        TimeZone.setDefault(TimeZone.getTimeZone("Europe/Moscow"));
        IOUtils.setByteArrayMaxOverride(300000000);

        // Detectors placed in front of the mime types detector; this list
        // is extended further down, before slowContext is created
        List<Detector> detectorsPrefix =
            new ArrayList<>(
                Arrays.asList(
                    MetadataExtractorDetector.INSTANCE,
                    Rfc822Detector.INSTANCE,
                    TarDetector.INSTANCE));
        List<Detector> mimeTypesDetector =
            Collections.singletonList(mimeTypes);
        // Base parser set; grows step by step for the later contexts
        List<Parser> parsers =
            new ArrayList<>(
                Arrays.asList(
                    HtmlParser.INSTANCE,
                    TxtParser.INSTANCE,
                    RtfParser.INSTANCE));
        ultraFastContext = new TextExtractorContext(
            registry,
            detectorsPrefix,
            mimeTypesDetector,
            mimeTypesDetector,
            parsers);

        // Additional parsers for fastContext (and for slowContext below)
        parsers.add(RarParser.INSTANCE);
        parsers.add(Fb2Parser.INSTANCE);
        parsers.add(PngParser.INSTANCE);
        parsers.add(PkpassParser.INSTANCE);
        parsers.add(DmarcParser.INSTANCE);
        parsers.add(HeifParser.INSTANCE);
        parsers.add(new EpubParser());
        fastContext = new TextExtractorContext(
            registry,
            detectorsPrefix,
            mimeTypesDetector,
            mimeTypesDetector,
            parsers);

        // so2Context has no detector prefix and its own fixed parser list
        so2Context = new TextExtractorContext(
            registry,
            Collections.emptyList(),
            mimeTypesDetector,
            mimeTypesDetector,
            Arrays.asList(
                HtmlParser.INSTANCE,
                TxtParser.INSTANCE,
                RtfParser.INSTANCE,
                new OpenDocumentParser(),
                new OfficeParser(),
                new OOXMLParser()));

        // Extra detectors and parsers that are only passed to slowContext
        detectorsPrefix.addAll(
            Arrays.asList(
                PkpassParser.INSTANCE,
                DmarcDetector.INSTANCE,
                new UngzippingDetector(DmarcDetector.INSTANCE),
                new UnzippingDetector(DmarcDetector.INSTANCE)));
        parsers.add(PdfParser.INSTANCE);
        parsers.add(JcToolsMp4Parser.INSTANCE);
        parsers.add(TikaConfig.getDefaultConfig().getParser());

        // Flatten the default composite detector into an ordered set of
        // leaf detectors
        Set<Detector> detectorsSet = new LinkedHashSet<>();
        listDetectors(new DefaultDetector(mimeTypes), detectorsSet);
        // OggDetector removed because it doesn't reset input stream mark
        // (removeType drops only the first matching element)
        removeType(detectorsSet.iterator(), OggDetector.class);
        slowContext = new TextExtractorContext(
            registry,
            detectorsPrefix,
            mimeTypesDetector,
            new ArrayList<>(detectorsSet),
            parsers);
    }

    /**
     * Flattens a detector tree into {@code detectors}: every
     * {@link CompositeDetector} is expanded recursively, any other detector
     * is added as is.
     *
     * @param detector root of the detector tree to walk
     * @param detectors collection receiving the non-composite detectors in
     *     traversal order
     */
    private static void listDetectors(
        final Detector detector,
        final Collection<Detector> detectors)
    {
        if (!(detector instanceof CompositeDetector)) {
            // Leaf detector, nothing to expand
            detectors.add(detector);
            return;
        }
        CompositeDetector composite = (CompositeDetector) detector;
        for (Detector child : composite.getDetectors()) {
            listDetectors(child, detectors);
        }
    }

    /**
     * Removes the first element that is an instance of {@code clazz} from
     * the collection backing {@code iter}.
     *
     * @param iter iterator to scan; it is advanced up to and including the
     *     removed element, or exhausted if nothing matches
     * @param clazz type of the element to remove
     * @param <T> type of the element to remove
     * @return the removed element, or {@code null} if no element matched
     */
    private <T> T removeType(
        final Iterator<? super T> iter,
        final Class<T> clazz)
    {
        T removed = null;
        // isInstance() is false for null elements, so a non-null result
        // always means that a match has been found and removed
        while (removed == null && iter.hasNext()) {
            Object candidate = iter.next();
            if (clazz.isInstance(candidate)) {
                iter.remove();
                removed = clazz.cast(candidate);
            }
        }
        return removed;
    }

    /**
     * Returns the context created first in the constructor, at the point
     * where only the HTML, plain text and RTF parsers had been listed.
     *
     * @return the {@code ultraFastContext} instance
     */
    public TextExtractorContext ultraFastContext() {
        return ultraFastContext;
    }

    /**
     * Returns the context created in the constructor after the RAR, FB2,
     * PNG, pkpass, DMARC, HEIF and EPUB parsers were appended to the parser
     * list.
     *
     * @return the {@code fastContext} instance
     */
    public TextExtractorContext fastContext() {
        return fastContext;
    }

    /**
     * Returns the context created in the constructor with an empty detector
     * prefix and the HTML, plain text, RTF, OpenDocument, Office and OOXML
     * parsers.
     *
     * @return the {@code so2Context} instance
     */
    public TextExtractorContext so2Context() {
        return so2Context;
    }

    /**
     * Returns the context created last in the constructor, after the PDF,
     * MP4 and Tika default parsers and the extra detectors were appended,
     * and the only one given the flattened default detector list.
     *
     * @return the {@code slowContext} instance
     */
    public TextExtractorContext slowContext() {
        return slowContext;
    }

    /**
     * Looks up the file extension of the mime type registered for the
     * given media type.
     *
     * @param mediaType media type to look up, must not be {@code null}
     * @return the extension reported by the registered mime type, or an
     *     empty string if no mime type is registered for
     *     {@code mediaType}
     */
    public String getExtension(final MediaType mediaType) {
        MimeType registered;
        try {
            registered =
                mimeTypes.getRegisteredMimeType(mediaType.toString());
        } catch (MimeTypeException e) {
            // Not expected to happen: the name passed to the lookup was
            // produced by MediaType.toString()
            registered = null;
        }
        String extension = "";
        if (registered != null) {
            extension = registered.getExtension();
        }
        return extension;
    }

    /**
     * Resolves a charset name, falling back to
     * {@link #DEFAULT_EMAIL_CHARSET} when {@link Charset#forName(String)}
     * fails with any runtime exception (unknown, illegal or {@code null}
     * name).
     *
     * @param charset charset name to resolve
     * @return the resolved charset or {@link #DEFAULT_EMAIL_CHARSET}
     */
    public static Charset normalizeCharset(final String charset) {
        Charset resolved;
        try {
            resolved = Charset.forName(charset);
        } catch (RuntimeException e) {
            resolved = DEFAULT_EMAIL_CHARSET;
        }
        return resolved;
    }

    /**
     * Builds a media type from a mime type string and an optional charset.
     *
     * @param mimetype mime type string, may be {@code null}
     * @param charset charset to append as the {@code charset} parameter,
     *     may be {@code null}
     * @return {@code null} if {@code mimetype} is {@code null}, otherwise
     *     the result of {@link MediaType#parse(String)} for the mime type,
     *     with {@code "; charset=<name>"} appended when a charset is given
     */
    private static MediaType extractMediaType(
        final String mimetype,
        final Charset charset)
    {
        if (mimetype == null) {
            return null;
        }
        String spec = mimetype;
        if (charset != null) {
            spec =
                StringUtils.concat(mimetype, "; charset=", charset.name());
        }
        return MediaType.parse(spec);
    }

    /**
     * Creates the detection result for the stream kept in {@code holder}
     * and then releases the holder.
     *
     * <p>The holder is released only after the result has been constructed,
     * so if the parser lookup throws the holder is still in charge of the
     * stream.
     *
     * @param context context used to look up the parser
     * @param holder holder of the stream being detected
     * @param normalizedType media type passed to the parser lookup
     * @param type media type stored in the result
     * @return detection result with the stream, the parser found for
     *     {@code normalizedType} and {@code type}
     */
    private DetectionResult detectionResult(
        final TextExtractorContext context,
        final GenericAutoCloseableHolder<
            IOException,
            GenericCloseableAdapter<TikaInputStream>> holder,
        final MediaType normalizedType,
        final MediaType type)
    {
        TikaInputStream stream = holder.get().get();
        Parser parser = context.parserFor(normalizedType);
        DetectionResult result = new DetectionResult(stream, parser, type);
        holder.release();
        return result;
    }

    /**
     * Detects the stream type without a charset; delegates to
     * {@link #detectStreamType(TikaInputStream, TextExtractOptions, Charset)}
     * with a {@code null} charset.
     *
     * @param tis stream to detect
     * @param options extraction options
     * @return detection result
     * @throws IOException if detection fails with an I/O error
     */
    public DetectionResult detectStreamType(
        final TikaInputStream tis,
        final TextExtractOptions options)
        throws IOException
    {
        return detectStreamType(tis, options, null);
    }

    // CSOFF: ReturnCount
    /**
     * Detects the media type of the stream and selects a parser for it,
     * taking the mime type hint from {@code options} into account.
     *
     * <p>Decision order:
     * <ol>
     * <li>If there is a hint other than octet-stream, the fast detector is
     * run. When its normalized base type equals the normalized hint, the
     * hint is used. When both share the same top-level type and one is a
     * specialization of the other, the more specific one is used.</li>
     * <li>Otherwise the slow detector is run if the context has one; if it
     * has none, the fast detector result is reused, or the fast detector is
     * run when it was not run in step 1.</li>
     * <li>If the result normalizes to octet-stream and a hint exists, the
     * hint is used instead.</li>
     * </ol>
     *
     * <p>The stream is kept in a holder inside a try-with-resources block;
     * every successful path goes through {@code detectionResult}, which
     * releases the holder. NOTE(review): presumably a released holder does
     * not close the stream, so it is closed only on failure - confirm.
     *
     * @param tis stream to detect
     * @param options extraction options providing the mode and the mime
     *     type hint
     * @param charset charset to attach to the hinted media type, may be
     *     {@code null}
     * @return detection result with the stream, the selected parser and the
     *     chosen media type
     * @throws IOException if detection fails with an I/O error; a
     *     {@link TaggedIOException} is unwrapped to its cause
     */
    public DetectionResult detectStreamType(
        final TikaInputStream tis,
        final TextExtractOptions options,
        final Charset charset)
        throws IOException
    {
        TextExtractorContext context = options.mode().textExtractorContext();
        try (GenericAutoCloseableHolder<
                IOException,
                GenericCloseableAdapter<TikaInputStream>> holder =
                new GenericAutoCloseableHolder<>(
                    new GenericCloseableAdapter<>(tis)))
        {
            // Media type requested by the caller, null if there is no hint
            MediaType userType =
                extractMediaType(options.mimetypeHint(), charset);
            MediaType normalizedUserType;
            if (userType == null) {
                normalizedUserType = null;
            } else {
                normalizedUserType =
                    context.normalize(registry.normalize(userType));
            }
            // Fast detector result, null if the fast detector was not run
            MediaType fastDetectedType;
            if (normalizedUserType != null
                && !MediaType.OCTET_STREAM.equals(normalizedUserType))
            {
                MediaType type = registry.normalize(
                    context.fastDetector().detect(tis, new Metadata()));
                fastDetectedType = type;
                MediaType normalized = context.normalize(type);
                if (normalized.getBaseType().equals(normalizedUserType)) {
                    // Detection agrees with the hint, keep the hint
                    return detectionResult(
                        context,
                        holder,
                        normalizedUserType,
                        userType);
                } else if (!MediaType.OCTET_STREAM.equals(normalized)
                    && normalized.getType().equals(
                        normalizedUserType.getType()))
                {
                    // Same top-level type: prefer whichever of the two is
                    // a specialization of the other
                    if (registry.isSpecializationOf(
                        normalized,
                        normalizedUserType))
                    {
                        return detectionResult(
                            context,
                            holder,
                            normalized,
                            type);
                    } else if (registry.isSpecializationOf(
                        normalizedUserType,
                        normalized))
                    {
                        return detectionResult(
                            context,
                            holder,
                            normalizedUserType,
                            userType);
                    }
                }
            } else {
                fastDetectedType = null;
            }
            // No decision yet: use the slow detector when available
            Detector detector = context.slowDetector();
            MediaType type;
            if (fastDetectedType == null) {
                if (detector == null) {
                    detector = context.fastDetector();
                }
                type =
                    registry.normalize(detector.detect(tis, new Metadata()));
            } else if (detector == null) {
                // No slow detector, reuse the fast detector result
                type = fastDetectedType;
            } else {
                type =
                    registry.normalize(detector.detect(tis, new Metadata()));
            }
            MediaType normalized = context.normalize(type);
            if (MediaType.OCTET_STREAM.equals(normalized)
                && normalizedUserType != null)
            {
                // Nothing specific detected, trust the hint
                type = userType;
                normalized = normalizedUserType;
            }
            return detectionResult(context, holder, normalized, type);
        } catch (TaggedIOException e) {
            // Rethrow the original exception instead of the tagged wrapper
            throw e.getCause();
        }
    }
    // CSON: ReturnCount

    /**
     * Tells whether the parser may be used with the given options.
     *
     * <p>The DMARC parser is controlled solely by
     * {@code options.parseDmarc()}. Any other parser is allowed unless the
     * parsing type is {@code STREAM} and the parser is an instance of one
     * of the parser classes listed below.
     *
     * @param options extraction options
     * @param parser parser to check
     * @return {@code true} if the parser may be used
     */
    public boolean checkConstraints(
        final TextExtractOptions options,
        final Parser parser)
    {
        boolean allowed;
        if (parser == DmarcParser.INSTANCE) {
            allowed = options.parseDmarc();
        } else if (options.parsingType()
            != TextExtractOptions.ParsingType.STREAM)
        {
            allowed = true;
        } else {
            // Parsers rejected for STREAM parsing
            boolean rejected =
                parser instanceof MP4Parser
                || parser instanceof TiffParser
                || parser instanceof GribParser
                || parser instanceof ISArchiveParser
                || parser instanceof JackcessParser
                || parser instanceof GeographicInformationParser
                || parser instanceof OutlookPSTParser
                || parser instanceof NetCDFParser
                || parser instanceof MatParser;
            allowed = !rejected;
        }
        return allowed;
    }

    // CSOFF: FinalParameters
    /**
     * Wraps the writer into a {@link TrimmingWriter}, inserting a
     * {@link LimitedWriter} in between unless {@code writeLimit} is
     * {@code -1}.
     *
     * @param writer writer to wrap
     * @param writeLimit limit passed to {@link LimitedWriter}, or
     *     {@code -1} for no limit
     * @return the wrapping writer
     * @throws IOException declared for callers; see the wrapped writer
     *     constructors
     */
    private Writer wrapWriter(Writer writer, final int writeLimit)
        throws IOException
    {
        Writer target;
        if (writeLimit == -1) {
            target = writer;
        } else {
            target = new LimitedWriter(writer, writeLimit);
        }
        return new TrimmingWriter(target);
    }
    // CSON: FinalParameters

    /**
     * Parses the detected stream with the detected parser and writes the
     * extracted text to {@code writer}.
     *
     * <p>If {@code options.pureBodyWriter()} is set, {@code text/plain}
     * output is additionally routed through a
     * {@code QuotesSeparatingWriter} and {@code text/html} output through a
     * {@code QuotesSeparatingContentHandler}, both targeting the pure body
     * writer.
     *
     * <p>Most parse failures are not thrown but reported as the cause in
     * the returned result.
     *
     * @param dr detection result providing the input, parser and media type
     * @param writer destination for the extracted text
     * @param options extraction options
     * @return result holding the metadata, the truncation marker
     *     ({@code -1}, or the write limit when a limit exception was
     *     detected) and the recorded failure cause ({@code null} if none);
     *     when {@code dr} has no parser the result is returned right away
     *     with {@code -1} and a {@code null} cause
     * @throws IOException from {@code dr.throwIfCauseOf}, or when a
     *     {@link SAXException} was caused by the tagged output handler
     */
    @SuppressWarnings("try")
    public TextExtractResult extractText(
        final DetectionResult dr,
        final Writer writer,
        final TextExtractOptions options)
        throws IOException
    {
        TextExtractorContext extractorContext =
            options.mode().textExtractorContext();
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, dr.mediaType().toString());
        Parser parser = dr.parser();
        if (parser == null) {
            // Nothing to parse with, only the content type is reported
            return new TextExtractResult(metadata, -1, null);
        } else {
            int truncated = -1;
            Throwable cause = null;
            int writeLimit = options.writeLimit();
            Writer outputWriter = wrapWriter(writer, writeLimit);
            // Plain text: quotes are separated at the writer level
            if (options.pureBodyWriter() != null
                && dr.mediaType().getBaseType().equals(MediaType.TEXT_PLAIN))
            {
                outputWriter = new QuotesSeparatingWriter(
                    outputWriter,
                    wrapWriter(options.pureBodyWriter(), writeLimit));
            }
            TextContentHandler handler = new TextContentHandler(outputWriter);
            ContentHandler outputHandler;
            // HTML: quotes are separated at the content handler level
            if (options.pureBodyWriter() != null
                && dr.mediaType().getBaseType().equals(MediaType.TEXT_HTML))
            {
                outputHandler = new QuotesSeparatingContentHandler(
                    handler,
                    new TextContentHandler(
                        wrapWriter(options.pureBodyWriter(), writeLimit)));
            } else {
                outputHandler = handler;
            }
            // Tagging lets the SAXException handler below tell output
            // failures apart from parser failures
            TaggedContentHandler taggedHandler =
                new TaggedContentHandler(outputHandler);
            try {
                DecodableByteArrayOutputStream htmlBody = options.htmlBody();
                TemporaryDirectory tmpDir;
                InputStream input;
                if (outputHandler == handler || htmlBody == null) {
                    // No HTML quotes separation or no buffer requested:
                    // parse the detected input directly
                    tmpDir = null;
                    input = dr.input();
                } else {
                    // Copy the whole input into htmlBody and parse from
                    // that buffer instead
                    IOStreamUtils.copy(dr.input(), htmlBody);
                    tmpDir =
                        new TemporaryDirectory(options.mode().allowTmpFiles());
                    input =
                        TikaInputStream.get(
                            htmlBody.processWith(
                                ByteArrayInputStreamFactory.INSTANCE),
                            tmpDir);
                }
                // guard exists only to close tmpDir; try-with-resources
                // skips a null resource
                try (TemporaryDirectory guard = tmpDir) {
                    ParseContext context = new ParseContext();
                    context.set(
                        Parser.class,
                        extractorContext.autoDetectParser());
                    context.set(TextExtractOptions.class, options);
                    context.set(PDFParserConfig.class, pdfParserConfig);
                    context.set(MimeTypes.class, mimeTypes);
                    ContentHandler bodyHandler;
                    if (options.urlProcessor() == null) {
                        bodyHandler = new BodyContentHandler(taggedHandler);
                    } else {
                        bodyHandler = new UrlHandler(
                            taggedHandler,
                            options.urlProcessor());
                    }
                    parser.parse(
                        input,
                        bodyHandler,
                        metadata,
                        context);
                } catch (Throwable t) {
                    // A limit exception is reported as truncation rather
                    // than as a failure; anything else goes to the outer
                    // handlers
                    if (LimitedIOException.isLimitedIOException(t)) {
                        truncated = writeLimit;
                    } else {
                        throw t;
                    }
                }
            } catch (IOException e) {
                // NOTE(review): presumably rethrows e when it originates
                // from the input stream of dr - confirm
                dr.throwIfCauseOf(e);
                cause = e;
            } catch (SAXException e) {
                if (taggedHandler.isCauseOf(e)) {
                    // Failure came from the output side, propagate it
                    throw new IOException(e.getCause());
                } else {
                    cause = e;
                }
            } catch (RuntimeException | TikaException | Error e) {
                // Report the nested archive/zip exception when there is
                // one, otherwise the exception itself
                cause = e.getCause();
                if (!(cause instanceof ArchiveException)
                    && !(cause instanceof ZipException))
                {
                    cause = e;
                }
            }
            return new TextExtractResult(metadata, truncated, cause);
        }
    }

    /**
     * Chooses between the detected media type and the one reported by the
     * parser.
     *
     * <p>The detected type is kept when it is a specialization of the
     * parsed type, and also when {@code parsed} is {@code null} or cannot
     * be parsed as a media type ({@link MediaType#parse(String)} returns
     * {@code null} in that case), so that a malformed content type never
     * replaces a valid detected one. Otherwise the normalized parsed type
     * is returned.
     *
     * @param detected media type produced by detection
     * @param parsed content type string reported by the parser, may be
     *     {@code null} or malformed
     * @return {@code detected} as passed in, or the normalized parsed type
     */
    public MediaType improveMimetype(
        final MediaType detected,
        final String parsed)
    {
        MediaType result = detected;
        MediaType parsedRaw = MediaType.parse(parsed);
        // An unparseable content type carries no information, keep detected
        if (parsedRaw != null) {
            MediaType detectedType = registry.normalize(detected);
            MediaType parsedType = registry.normalize(parsedRaw);
            if (!registry.isSpecializationOf(detectedType, parsedType)) {
                result = parsedType;
            }
        }
        return result;
    }
}

