package ru.yandex.tikaite.parser;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.util.List;

import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.Parser;

import ru.yandex.tikaite.util.TextExtractor;

/**
 * Base class for parsers that need a character {@link Reader} over their
 * input and therefore have to decide which charset to decode it with.
 *
 * <p>The charset is taken from the {@code charset} parameter of the
 * {@link Metadata#CONTENT_TYPE} metadata entry when that entry is present
 * and usable; otherwise the detectors supplied by {@link #detectors()} are
 * consulted, and {@link TextExtractor#DEFAULT_EMAIL_CHARSET} is the final
 * fallback.
 */
public abstract class AutoDetectionParser implements Parser {
    public static final long serialVersionUID = 0;

    /**
     * Returns the encoding detectors to consult when the charset cannot be
     * taken from the metadata. They are tried in iteration order and the
     * first non-null result wins.
     *
     * @return the detectors to try
     */
    public abstract List<EncodingDetector> detectors();

    /**
     * Returns the media type to assume when the metadata carries no usable
     * content type.
     *
     * @return the fallback media type
     */
    public abstract MediaType mediaType();

    /**
     * Creates a reader that decodes {@code is} using the declared or
     * detected charset, and records the final content type (including the
     * chosen charset) back into {@code metadata}.
     *
     * <p>Malformed and unmappable byte sequences are silently dropped by
     * the returned reader rather than reported as errors.
     *
     * @param is input stream to decode; wrapped in a
     *     {@link BufferedInputStream} before detection if it does not
     *     support mark/reset
     * @param metadata metadata whose content type is read and then
     *     overwritten with the resolved media type and charset
     * @return a reader over the (possibly wrapped) input stream
     * @throws IOException if an encoding detector fails to read the stream
     * @throws TikaException declared for callers and subclasses; nothing in
     *     this implementation throws it directly
     */
    @SuppressWarnings("ReferenceEquality")
    public Reader getReader(InputStream is, final Metadata metadata)
        throws IOException, TikaException
    {
        Charset charset = null;
        MediaType mediaType = null;
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        if (contentType != null) {
            // MediaType.parse yields null for a value it cannot parse,
            // so the result must be checked before it is used.
            mediaType = MediaType.parse(contentType);
        }
        if (mediaType == null) {
            // No content type, or an unparseable one: use the parser's own.
            mediaType = mediaType();
        } else {
            charset = declaredCharset(mediaType);
        }

        // A declared charset that is the default one is treated the same
        // as no declaration at all, so detection still runs for it.
        // NOTE(review): the reference comparison is intentional (warning
        // suppressed above); presumably it relies on Charset.forName
        // returning the shared instance -- confirm.
        if (charset == null
            || charset == TextExtractor.DEFAULT_EMAIL_CHARSET)
        {
            if (!is.markSupported()) {
                is = new BufferedInputStream(is);
            }
            for (EncodingDetector detector: detectors()) {
                charset = detector.detect(is, metadata);
                if (charset != null) {
                    break;
                }
            }
        }

        if (charset == null) {
            charset = TextExtractor.DEFAULT_EMAIL_CHARSET;
        }

        metadata.set(
            Metadata.CONTENT_TYPE,
            new MediaType(mediaType, charset).toString());
        return new InputStreamReader(is, charset.newDecoder()
            .onMalformedInput(CodingErrorAction.IGNORE)
            .onUnmappableCharacter(CodingErrorAction.IGNORE));
    }

    /**
     * Resolves the {@code charset} parameter of a media type.
     *
     * @param mediaType parsed media type, never null
     * @return the declared charset, or {@code null} if the parameter is
     *     missing, syntactically illegal, or names an unsupported charset
     */
    private static Charset declaredCharset(final MediaType mediaType) {
        String charsetName = mediaType.getParameters().get("charset");
        if (charsetName == null) {
            return null;
        }
        try {
            if (Charset.isSupported(charsetName)) {
                return Charset.forName(charsetName);
            }
        } catch (IllegalArgumentException ignored) {
            // Charset.isSupported throws IllegalCharsetNameException for
            // syntactically illegal names instead of returning false; an
            // unusable declaration is treated as absent so that detection
            // can take over.
        }
        return null;
    }
}

