package ru.yandex.tikaite.mimeparser;

import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.Parser;

import ru.yandex.charset.Decoder;
import ru.yandex.io.CountingWriter;
import ru.yandex.io.DecodableByteArrayOutputStream;
import ru.yandex.io.StringBuilderWriter;
import ru.yandex.sanitizer2.HtmlNode;
import ru.yandex.sanitizer2.HtmlPrinter;
import ru.yandex.sanitizer2.IdentityAttrPostProcessor;
import ru.yandex.sanitizer2.NullUrlCollector;
import ru.yandex.sanitizer2.SanitizingHandler;
import ru.yandex.sanitizer2.StringBuilderHtmlCollector;
import ru.yandex.search.document.Document;
import ru.yandex.search.document.mail.MailMetaInfo;
import ru.yandex.tikaite.util.BackgroundEraser;
import ru.yandex.tikaite.util.CommonFields;
import ru.yandex.tikaite.util.DetectionResult;
import ru.yandex.tikaite.util.TemporaryDirectory;
import ru.yandex.tikaite.util.TextExtractOptions;
import ru.yandex.tikaite.util.TextExtractResult;
import ru.yandex.tikaite.util.TextExtractor;
import ru.yandex.tikaite.util.UrlCollector;
import ru.yandex.url.processor.UrlProcessor;

/**
 * {@link FilterDocument} that runs text extraction over an input stream
 * described by a {@link MailMetaInfo} and stores the results as fields:
 * body text, parse status, pure body, collected URLs, sanitized HTML body,
 * mimetype, content type and, for attachments, the attach type.
 */
public class MailDocument extends FilterDocument {
    /** Meta information consulted for content type, charset and names. */
    protected final MailMetaInfo meta;
    /** Base extraction options; {@link #process} works on a derived copy. */
    protected final TextExtractOptions options;
    /** Parse status computed by the last successful extraction. */
    protected boolean parsed = false;
    /** Set once an error field has been written, so it is written once. */
    private boolean hasError = false;

    /**
     * Creates a mail document on top of the given target document.
     *
     * @param document target document passed to {@link FilterDocument}
     * @param meta meta information of the processed content
     * @param options base text extraction options
     */
    public MailDocument(
        final Document document,
        final MailMetaInfo meta,
        final TextExtractOptions options)
    {
        super(document);
        this.meta = meta;
        this.options = options;
    }

    /**
     * Records the given error in the {@link CommonFields#ERROR} field.
     * Only the first non-null cause is recorded; later calls are no-ops.
     *
     * @param cause error to record, {@code null} is ignored
     * @throws IOException if the field cannot be written
     */
    public void addError(final Throwable cause) throws IOException {
        if (!hasError && cause != null) {
            hasError = true;
            addField(CommonFields.ERROR, cause);
        }
    }

    /**
     * Derives the attach type from the extension that
     * {@link TextExtractor} reports for the media type.
     *
     * @param mediaType detected media type
     * @return normalized extension group, or {@code null} if no usable
     *     extension is known for the media type
     */
    private static String selectAttachType(final MediaType mediaType) {
        String ext = TextExtractor.INSTANCE.getExtension(mediaType);
        if (ext == null || ext.isEmpty()) {
            return null;
        }
        if (ext.charAt(0) == '.') {
            ext = ext.substring(1);
            if (ext.isEmpty()) {
                // A bare "." carries no extension; returning "" here would
                // produce an empty or space-prefixed attach type later on.
                return null;
            }
        }
        return normalizeExtension(ext);
    }

    /**
     * Maps an extension to the space-separated group of equivalent
     * extensions it belongs to. {@code eml} and {@code mime} map to
     * {@link NestedMessageHandler#ATTACHTYPE}. Extensions outside the
     * known groups are returned unchanged.
     *
     * @param ext extension without the leading dot; matching is exact, so
     *     callers are expected to pass it already lower-cased
     * @return the extension group, or {@code ext} itself
     */
    private static String normalizeExtension(final String ext) {
        switch (ext) {
            case "asc":
            case "sig":
                return "asc sig";

            case "bash":
            case "sh":
                return "bash sh";

            case "cer":
            case "crt":
            case "der":
                return "cer crt der";

            case "djv":
            case "djvu":
                return "djv djvu";

            case "doc":
            case "docx":
                return "doc docx";

            case "dot":
            case "dotx":
                return "dot dotx";

            case "eml":
            case "mime":
                return NestedMessageHandler.ATTACHTYPE;

            case "eps":
            case "ps":
                return "eps ps";

            case "htm":
            case "html":
                return "htm html";

            case "jpe":
            case "jpeg":
            case "jpg":
                return "jpe jpeg jpg";

            case "mid":
            case "midi":
                return "mid midi";

            case "mp3":
            case "mpga":
                return "mp3 mpga";

            case "mpe":
            case "mpeg":
            case "mpg":
                return "mpe mpeg mpg";

            case "oga":
            case "ogg":
                return "oga ogg";

            case "p12":
            case "pfx":
                return "p12 pfx";

            case "p7b":
            case "spc":
                return "p7b spc";

            case "ppt":
            case "pptx":
                return "ppt pptx";

            case "texi":
            case "texinfo":
                return "texi texinfo";

            case "text":
            case "txt":
                return "txt text";

            case "tif":
            case "tiff":
                return "tif tiff";

            case "xht":
            case "xhtml":
                return "xht xhtml";

            case "xls":
            case "xlsx":
                return "xls xlsx";

            default:
                return ext;
        }
    }

    /**
     * Builds the per-call extraction options as a copy of the base options
     * with in-memory parsing and an optional URL processor.
     *
     * @param urlCollector collector that receives extracted URLs
     * @param isAttachment whether the content is an attachment; pure body
     *     and HTML body sinks are attached only for non-attachments
     * @param pureBody buffer that receives the pure body text
     * @param htmlBody sink for the HTML body, may be {@code null}
     * @return options to use for detection and extraction
     */
    private TextExtractOptions prepareOptions(
        final UrlCollector urlCollector,
        final boolean isAttachment,
        final StringBuilder pureBody,
        final DecodableByteArrayOutputStream htmlBody)
    {
        // URL extraction is switched off as soon as one of the message
        // types is listed in noXurlsTypes().
        boolean extractUrls = true;
        for (Integer type: meta.messageTypes()) {
            if (options.noXurlsTypes().contains(type)) {
                extractUrls = false;
                break;
            }
        }
        UrlProcessor processor = null;
        if (extractUrls) {
            processor = new UrlProcessor(urlCollector);
        }
        TextExtractOptions extractOptions =
            new TextExtractOptions(options)
                .parsingType(
                    TextExtractOptions.ParsingType.MEMORY)
                .urlProcessor(processor);
        if (!isAttachment) {
            extractOptions.pureBodyWriter(
                new StringBuilderWriter(pureBody));
            extractOptions.htmlBody(htmlBody);
        }
        return extractOptions;
    }

    /**
     * Writes the URLs, one per line, into the given field. No field is
     * added when the set is empty.
     *
     * @param document document to add the field to
     * @param field name of the field
     * @param urls URLs to write
     * @throws IOException if writing fails
     */
    private static void writeUrls(
        final Document document,
        final String field,
        final Set<String> urls)
        throws IOException
    {
        if (urls.isEmpty()) {
            return;
        }
        try (Writer writer = document.addField(field)) {
            for (String url: urls) {
                writer.write(url);
                writer.write('\n');
            }
        }
    }

    /**
     * Adds the mimetype, content type and, for attachments, the attach
     * type fields.
     *
     * <p>The mimetype field is skipped for {@code application/octet-stream}.
     * The content type from meta is preferred; the detected mimetype is
     * used as a fallback. The attach type combines the extension group
     * known for the media type with the one taken from the attachment
     * name, separated by a space when they differ.
     *
     * @param mediaType detected media type
     * @param isAttachment whether the content is an attachment
     * @throws IOException if a field cannot be written
     */
    protected void addType(
        final MediaType mediaType,
        final boolean isAttachment)
        throws IOException
    {
        String contentType = meta.contentType();
        boolean octetStream = mediaType.equals(MediaType.OCTET_STREAM);
        if (!octetStream) {
            String mimetype = mediaType.getBaseType().toString();
            if (contentType == null) {
                contentType = mimetype;
            }
            document.addField(CommonFields.MIMETYPE, mimetype);
        }
        if (contentType != null) {
            document.addField(MailMetaInfo.CONTENT_TYPE, contentType);
        }
        if (!isAttachment) {
            return;
        }
        String attachType = null;
        if (!octetStream) {
            attachType = selectAttachType(mediaType);
        }
        String attachname = meta.get(MailMetaInfo.ATTACHNAME);
        if (attachname != null) {
            int idx = attachname.lastIndexOf('.');
            if (idx != -1) {
                String nameExtension = normalizeExtension(
                    attachname.substring(idx + 1).trim()
                        .toLowerCase(Locale.ENGLISH));
                if (!nameExtension.isEmpty()) {
                    if (attachType == null) {
                        attachType = nameExtension;
                    } else if (!nameExtension.equals(attachType)) {
                        attachType = attachType + ' ' + nameExtension;
                    }
                }
            }
        }
        if (attachType != null) {
            document.addField(MailMetaInfo.ATTACHTYPE, attachType);
        }
    }

    /**
     * Detects the type of the stream, extracts its text and stores the
     * results as fields, finishing with {@link #addType}.
     *
     * <p>When no parser is available, or the parser does not satisfy the
     * option constraints, only {@code PARSED=false} and the type fields
     * are written.
     *
     * @param in content to process
     * @param charsetHint charset name that overrides the charset from
     *     meta, may be {@code null}
     * @param isAttachment whether the content is an attachment
     * @throws IOException if reading the input or writing a field fails
     */
    public void process(
        final InputStream in,
        final String charsetHint,
        final boolean isAttachment)
        throws IOException
    {
        MediaType mediaType = MediaType.OCTET_STREAM;
        UrlCollector urlCollector = new UrlCollector();
        StringBuilder pureBody = new StringBuilder();
        // The raw HTML body is only captured when a sanitizer is configured.
        DecodableByteArrayOutputStream htmlBody = null;
        if (options.sanitizer() != null) {
            htmlBody = new DecodableByteArrayOutputStream();
        }
        TextExtractOptions extractOptions =
            prepareOptions(urlCollector, isAttachment, pureBody, htmlBody);
        String contentType = meta.contentType();
        if (contentType != null) {
            extractOptions.mimetypeHint(contentType);
        }
        // NOTE(review): charsetHint takes precedence over the charset from
        // meta, yet it is consulted only when meta declares a charset.
        // Confirm that ignoring the hint otherwise is intended.
        String charsetString = meta.charset();
        Charset bodyCharset = null;
        if (charsetString != null) {
            bodyCharset = TextExtractor.normalizeCharset(
                Objects.toString(charsetHint, charsetString));
            extractOptions.charsetHint(bodyCharset);
        }
        try (DetectionResult dr = TextExtractor.INSTANCE.detectStreamType(
                TikaInputStream.get(
                    in,
                    new TemporaryDirectory(
                        extractOptions.mode().allowTmpFiles())),
                extractOptions,
                bodyCharset))
        {
            mediaType = dr.mediaType();
            Parser parser = dr.parser();
            if (parser == null
                || !TextExtractor.INSTANCE.checkConstraints(
                    extractOptions,
                    parser))
            {
                addField(CommonFields.PARSED, false);
            } else {
                TextExtractResult result;
                Throwable error;
                boolean empty;
                try (CountingWriter writer = new CountingWriter(
                        addField(CommonFields.BODY_TEXT)))
                {
                    result = TextExtractor.INSTANCE.extractText(
                        dr,
                        writer,
                        extractOptions);
                    error = result.cause();
                    empty = writer.pos() == 0L;
                }
                // The error field is added only after the body text writer
                // has been closed.
                addError(error);
                Set<String> urls = urlCollector.urls();
                // Counted as parsed if anything was produced (text, URLs,
                // more than one result entry) or no error was reported.
                parsed =
                    !empty
                    || !urls.isEmpty()
                    || error == null
                    || result.size() > 1;
                document.addField(CommonFields.PARSED, parsed);
                if (pureBody.length() > 0) {
                    document.addField(
                        MailMetaInfo.PURE_BODY,
                        pureBody.toString());
                }
                result.metadata(document);
                writeUrls(document, MailMetaInfo.X_URLS, urls);
                mediaType = TextExtractor.INSTANCE.improveMimetype(
                    mediaType,
                    result.mimetype());
                if (htmlBody != null && !htmlBody.isEmpty()) {
                    // Charset from the media type parameters wins over the
                    // body charset; without any charset the HTML body is
                    // not sanitized at all.
                    Charset charset;
                    String charsetName =
                        mediaType.getParameters().get("charset");
                    if (charsetName == null) {
                        charset = bodyCharset;
                    } else {
                        charset = TextExtractor.normalizeCharset(
                            charsetName);
                    }
                    if (charset != null) {
                        int maxSanitizingLength =
                            extractOptions.maxSanitizingLength();
                        // A negative limit disables truncation.
                        if (maxSanitizingLength >= 0) {
                            htmlBody.truncate(
                                maxSanitizingLength,
                                (byte) ' ');
                        }
                        // Undecodable input is replaced rather than
                        // reported as an error.
                        Decoder decoder = new Decoder(
                            charset.newDecoder()
                                .onMalformedInput(
                                    CodingErrorAction.REPLACE)
                                .onUnmappableCharacter(
                                    CodingErrorAction.REPLACE));
                        htmlBody.processWith(decoder);
                        SanitizingHandler sanitizer =
                            extractOptions.sanitizer();
                        // References are dropped as soon as each stage is
                        // done with them, presumably so that intermediate
                        // buffers can be collected early; keep this order.
                        extractOptions = null;
                        htmlBody = null;
                        int len = decoder.length();
                        HtmlNode root =
                            sanitizer.sanitize(decoder, false);
                        decoder = null;
                        StringBuilderHtmlCollector htmlCollector =
                            new StringBuilderHtmlCollector(len);
                        root.accept(
                            new HtmlPrinter<>(
                                sanitizer.config(),
                                htmlCollector,
                                NullUrlCollector.INSTANCE,
                                IdentityAttrPostProcessor.INSTANCE,
                                BackgroundEraser.INSTANCE));
                        root = null;
                        document.addField(
                            MailMetaInfo.HTML_BODY,
                            new String(htmlCollector.sb()));
                    }
                }
            }
        }
        addType(mediaType, isAttachment);
    }
}

