package ru.yandex.tikaite.detect;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.txt.UniversalEncodingDetector;

import ru.yandex.io.InputStreamResetter;

/**
 * Detector that classifies a stream either as plain text in a detected
 * charset or as an opaque binary stream.
 *
 * <p>The result is {@link MediaType#OCTET_STREAM} when no stream is
 * supplied, when the head of the stream contains a long run of zero bytes,
 * or when the underlying encoding detector cannot name a charset.
 * Otherwise the result is built from {@link MediaType#TEXT_PLAIN} and the
 * detected charset.
 */
public enum CharsetDetector implements Detector {
    INSTANCE;

    /** Maximum number of leading bytes inspected by the binary check. */
    private static final int MAX_LOOKAHEAD = 2048;

    /**
     * Longest run of consecutive zero bytes still tolerated as text; a
     * longer run marks the stream as binary.
     * NOTE(review): presumably chosen so that UTF-16/UTF-32 encoded text,
     * which contains short runs of zero bytes, is not rejected - confirm.
     */
    private static final int MAX_ZEROS = 4;

    /** Encoding detector consulted once the stream is not deemed binary. */
    private static final EncodingDetector DETECTOR =
        new UniversalEncodingDetector();

    /**
     * Scans up to {@link #MAX_LOOKAHEAD} bytes from the head of the stream
     * looking for a run of more than {@link #MAX_ZEROS} consecutive zero
     * bytes.
     *
     * <p>The scan runs inside an {@code InputStreamResetter} scope.
     * NOTE(review): the resetter is assumed to restore the stream position
     * when closed, so that the bytes read here are still available to the
     * caller - confirm against {@code InputStreamResetter}.
     *
     * @param in stream to inspect, must not be {@code null}
     * @return {@code true} if such a run of zero bytes was found,
     *     {@code false} if the look-ahead window or the stream ended first
     * @throws IOException if reading from the stream fails
     */
    // The resetter is never referenced in the body of the try block, it is
    // declared for its close() only, hence the suppressed "try" warning.
    @SuppressWarnings("try")
    private static boolean isBinary(final InputStream in) throws IOException {
        try (InputStreamResetter resetter =
                new InputStreamResetter(in, MAX_LOOKAHEAD))
        {
            int zeros = 0;
            for (int i = 0; i < MAX_LOOKAHEAD; ++i) {
                int b = in.read();
                if (b == -1) {
                    // End of stream reached before the window was filled
                    break;
                }
                if (b == 0) {
                    if (++zeros > MAX_ZEROS) {
                        return true;
                    }
                } else {
                    // Only consecutive zeros count, any other byte
                    // restarts the run
                    zeros = 0;
                }
            }
        }
        return false;
    }

    /**
     * Detects whether the stream is plain text and, if so, in which
     * charset.
     *
     * @param in document stream, or {@code null} if not available
     * @param metadata document metadata, handed to the encoding detector
     * @return {@link MediaType#OCTET_STREAM} if the stream is absent, looks
     *     binary or has no detectable charset, otherwise a media type
     *     built from {@link MediaType#TEXT_PLAIN} and the detected charset
     * @throws IOException if reading from the stream fails
     */
    @Override
    public MediaType detect(final InputStream in, final Metadata metadata)
        throws IOException
    {
        MediaType result;
        // The Detector contract allows a null stream; without this guard
        // the binary check would fail with a NullPointerException
        if (in == null || isBinary(in)) {
            result = MediaType.OCTET_STREAM;
        } else {
            Charset charset = DETECTOR.detect(in, metadata);
            if (charset == null) {
                result = MediaType.OCTET_STREAM;
            } else {
                result = new MediaType(MediaType.TEXT_PLAIN, charset);
            }
        }
        return result;
    }
}

