package ru.yandex.tikaite.parser;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
import org.owasp.html.HtmlStreamEventReceiver;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import ru.yandex.function.CharArrayProcessable;
import ru.yandex.function.EmptyRunnable;
import ru.yandex.function.NullConsumer;
import ru.yandex.io.IOStreamUtils;
import ru.yandex.parser.config.ConfigException;
import ru.yandex.parser.config.IniConfig;
import ru.yandex.sanitizer2.PageHeaderException;
import ru.yandex.sanitizer2.SanitizingHandler;
import ru.yandex.sanitizer2.config.SanitizingConfigBuilder;

/**
 * HTML parser which runs the document through the sanitizer and replays the
 * sanitized event stream to the supplied SAX {@link ContentHandler}.
 *
 * <p>The input is read as text via {@code getReader(...)}, wrapped into a
 * single {@code <div>} element and passed to a {@link SanitizingHandler}
 * configured from the {@code sanitizer.conf} classpath resource. The text of
 * the first {@code <title>} element is not forwarded to the handler; it is
 * stored as the document title in the metadata if no title is set there yet.
 */
public class HtmlParser extends AutoDetectionParser {
    public static final HtmlParser INSTANCE = new HtmlParser();

    /** Line break emitted before every opened and after every closed tag. */
    private static final char[] IGNORABLE_WHITESPACE = new char[] {'\n'};

    private static final long serialVersionUID = 0;

    /** Name of the sanitizer config, resolved relative to this class. */
    private static final String SANITIZER_CONFIG = "sanitizer.conf";

    /** Encoding detectors returned by {@link #detectors()}, in this order. */
    private static final List<EncodingDetector> DETECTORS =
        Collections.unmodifiableList(
            Arrays.asList(
                new HtmlEncodingDetector(),
                new UniversalEncodingDetector()));

    /** Stock Tika parser, used only as the source of supported types. */
    private static final org.apache.tika.parser.html.HtmlParser OLD_PARSER =
        new org.apache.tika.parser.html.HtmlParser();
    private final SanitizingHandler sanitizer;

    /**
     * Builds the sanitizer from the {@code sanitizer.conf} resource.
     *
     * @throws RuntimeException if the resource is missing, cannot be read
     *     or contains an invalid configuration
     */
    protected HtmlParser() {
        // try-with-resources guarantees that the classpath stream is released
        // even if config parsing or sanitizer construction fails.
        try (InputStream config =
                HtmlParser.class.getResourceAsStream(SANITIZER_CONFIG))
        {
            if (config == null) {
                // Fail with a descriptive error rather than a bare NPE from
                // the InputStreamReader constructor.
                throw new IOException(
                    "Resource not found: " + SANITIZER_CONFIG);
            }
            IniConfig ini =
                new IniConfig(
                    new InputStreamReader(config, StandardCharsets.UTF_8));
            sanitizer =
                new SanitizingHandler(
                    new SanitizingConfigBuilder(ini).build(),
                    NullConsumer.instance(),
                    EmptyRunnable.INSTANCE);
        } catch (ConfigException | IOException | PageHeaderException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the media types supported by the stock Tika HTML parser.
     *
     * @param context parse context, passed through to the stock parser
     * @return supported media types
     */
    @Override
    public Set<MediaType> getSupportedTypes(final ParseContext context) {
        return OLD_PARSER.getSupportedTypes(context);
    }

    /**
     * Returns the encoding detectors for HTML input: the HTML-specific
     * detector first, then the universal one.
     *
     * @return unmodifiable list of detectors
     */
    @Override
    public List<EncodingDetector> detectors() {
        return DETECTORS;
    }

    /**
     * Returns the media type handled by this parser.
     *
     * @return {@link MediaType#TEXT_HTML}
     */
    @Override
    public MediaType mediaType() {
        return MediaType.TEXT_HTML;
    }

    /**
     * Sanitizes the document and reports the result to {@code handler}.
     *
     * @param is document body
     * @param handler receiver of the SAX events for the sanitized markup
     * @param metadata document metadata; receives the title extracted from
     *     the markup unless a title is already present
     * @param context parse context, not used by this implementation
     * @throws IOException if the input cannot be read
     * @throws SAXException if {@code handler} rejects an event
     * @throws TikaException declared by the overridden method
     */
    @Override
    public void parse(
        final InputStream is,
        final ContentHandler handler,
        final Metadata metadata,
        final ParseContext context)
        throws IOException, SAXException, TikaException
    {
        // The whole document is wrapped into a single <div> element before
        // being handed to the sanitizer.
        StringBuilder sb = new StringBuilder("<div>");
        IOStreamUtils.consume(getReader(is, metadata), sb);
        sb.append("</div>");
        int len = sb.length();
        char[] buf = new char[len];
        sb.getChars(0, len, buf, 0);
        // Drop the builder so that only one copy of the text stays reachable
        // while sanitizing.
        sb = null;
        EventsTranslator translator = new EventsTranslator(handler);
        try {
            sanitizer.sanitize(
                new CharArrayProcessable(buf),
                sanitizer.policy().apply(translator));
        } catch (RuntimeSAXException e) {
            // Unwrap the checked exception tunnelled through the receiver.
            throw e.exception();
        }
        String title = translator.title();
        if (title != null
            && metadata.get(TikaCoreProperties.TITLE.getName()) == null)
        {
            metadata.add(TikaCoreProperties.TITLE, title);
        }
    }

    /**
     * Adapter which converts sanitizer stream events into SAX events and
     * collects the text of the first {@code <title>} element.
     */
    private static class EventsTranslator implements HtmlStreamEventReceiver {
        private static final char[] EMPTY_BUF = new char[0];
        /** Namespace used for the synthetic html and body elements. */
        private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";

        private final AttributesImpl attrs = new AttributesImpl();
        private final ContentHandler handler;
        /** Reusable buffer for passing text to the handler. */
        private char[] buf = EMPTY_BUF;
        /** Set once the outermost title element has been closed. */
        private boolean alreadySeenTitle = false;
        /** Whether the current text belongs to a title element. */
        private boolean isTitle = false;
        /** Number of currently open title elements. */
        private int titleCount = 0;
        /** Accumulated title text, created on the first title text event. */
        private StringBuilder sb = null;

        EventsTranslator(final ContentHandler handler) {
            this.handler = handler;
        }

        /**
         * Returns the collected title.
         *
         * @return trimmed title text, or {@code null} if no title text was
         *     seen or it consists of whitespace only
         */
        public String title() {
            if (sb == null) {
                return null;
            }
            String title = sb.toString().trim();
            if (title.isEmpty()) {
                return null;
            }
            return title;
        }

        /**
         * Starts the SAX document and opens the synthetic {@code html} and
         * {@code body} elements.
         */
        @Override
        public void openDocument() {
            try {
                handler.startDocument();
                attrs.clear();
                handler.startElement(XHTML_NS, "html", "html", attrs);
                handler.startElement(XHTML_NS, "body", "body", attrs);
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
        }

        /**
         * Closes the synthetic {@code body} and {@code html} elements and
         * ends the SAX document.
         */
        @Override
        public void closeDocument() {
            try {
                handler.endElement(XHTML_NS, "body", "body");
                handler.endElement(XHTML_NS, "html", "html");
                handler.endDocument();
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
        }

        /**
         * Handles an opening tag. A {@code title} tag only switches the
         * translator into title collecting mode; any other tag is reported
         * to the handler, preceded by a line break.
         *
         * @param tagName element name
         * @param attrsList flat list of alternating attribute names and
         *     values
         */
        @Override
        public void openTag(
            final String tagName,
            final List<String> attrsList)
        {
            if ("title".equals(tagName)) {
                ++titleCount;
                isTitle = true;
            } else {
                attrs.clear();
                int size = attrsList.size();
                // Each iteration consumes a name and the value following it.
                for (int i = 0; i < size;) {
                    String name = attrsList.get(i++);
                    String value = attrsList.get(i++);
                    attrs.addAttribute("", name, name, "", value);
                }
                try {
                    handler.ignorableWhitespace(IGNORABLE_WHITESPACE, 0, 1);
                    handler.startElement("", tagName, tagName, attrs);
                } catch (SAXException e) {
                    throw new RuntimeSAXException(e);
                }
            }
        }

        /**
         * Handles a closing tag. Closing the outermost {@code title} ends
         * title collecting for the rest of the document; any other tag is
         * reported to the handler, followed by a line break.
         *
         * @param tagName element name
         */
        @Override
        public void closeTag(final String tagName) {
            if ("title".equals(tagName)) {
                if (--titleCount == 0) {
                    alreadySeenTitle = true;
                    isTitle = false;
                }
            } else {
                try {
                    handler.endElement("", tagName, tagName);
                    handler.ignorableWhitespace(IGNORABLE_WHITESPACE, 0, 1);
                } catch (SAXException e) {
                    throw new RuntimeSAXException(e);
                }
            }
        }

        /**
         * Handles a text run. Text inside a title is appended to the title
         * buffer (only for the first title) and is never sent to the
         * handler; all other text is reported as character data.
         *
         * @param text text content
         */
        @Override
        public void text(final String text) {
            if (isTitle) {
                if (!alreadySeenTitle) {
                    if (sb == null) {
                        sb = new StringBuilder();
                    }
                    sb.append(text);
                }
            } else {
                int len = text.length();
                if (len > buf.length) {
                    // Grow at least twofold so that repeated small increases
                    // do not reallocate on every call.
                    buf = new char[Math.max(len, buf.length << 1)];
                }
                text.getChars(0, len, buf, 0);
                try {
                    handler.characters(buf, 0, len);
                } catch (SAXException e) {
                    throw new RuntimeSAXException(e);
                }
            }
        }
    }

    /**
     * Unchecked carrier for a {@link SAXException} thrown by the content
     * handler inside receiver callbacks, which cannot declare it.
     */
    private static class RuntimeSAXException extends RuntimeException {
        private static final long serialVersionUID = 0L;

        private final SAXException exception;

        public RuntimeSAXException(final SAXException exception) {
            // Keep the original exception as the cause, so that the stack
            // trace is not lost if this wrapper is ever reported as is.
            super(exception);
            this.exception = exception;
        }

        /**
         * Returns the wrapped exception.
         *
         * @return exception passed to the constructor
         */
        public SAXException exception() {
            return exception;
        }
    }
}

