package ru.yandex.tikaite.parser;

import java.awt.geom.Rectangle2D;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Date;
import java.util.Set;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.pdfclown.bytes.Buffer;
import org.pdfclown.documents.Document;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ContentScanner;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.TextStyle;
import org.pdfclown.documents.contents.fonts.Font;
import org.pdfclown.documents.contents.objects.ContainerObject;
import org.pdfclown.documents.contents.objects.ContentObject;
import org.pdfclown.documents.contents.objects.Text;
import org.pdfclown.documents.contents.objects.XObject;
import org.pdfclown.documents.interchange.metadata.Information;
import org.pdfclown.files.File;
import org.pdfclown.objects.PdfName;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import ru.yandex.tikaite.util.TextExtractOptions;

public enum PdfParser implements Parser {
    INSTANCE;

    // Lower bound of free space in MemoryProvider's buffer: once less than
    // this many bytes remain, the buffer is doubled before the next read.
    private static final int MIN_READ = 8192;
    // Initial size (bytes) of the buffer MemoryProvider reads the input into.
    private static final int INITIAL_CAPACITY = 262144;
    // Initial size (chars) of CharHandler's output buffer.
    private static final int INITIAL_CBUF_SIZE = 256;
    // Fallback: when the font gives no usable space width, the gap threshold
    // is font size multiplied by this factor.
    // NOTE(review): the literal is a float widened to double.
    private static final double SPACE_WIDTH_SCALE = .25f;
    // Fraction of the font's space glyph width used as the gap threshold
    // above which a space is inserted between glyphs.
    // NOTE(review): the literal is a float widened to double, so the value
    // is not exactly 0.33.
    private static final double SPACE_WIDTH_TOLERANCE = .33f;
    // Name of the XHTML element wrapping all extracted text.
    private static final String P = "p";
    // Metadata key telling which backend produced the result.
    private static final String PARSER = "parser";
    // Value stored under PARSER when Tika's PDFParser was used.
    private static final String PDFBOX = "pdfbox";
    // Tika's stock PDF parser: used for STREAM mode and as a fallback.
    private static final PDFParser OLD_PARSER = new PDFParser();

    /**
     * Reports the media types handled by this parser, which are exactly
     * the ones reported by the wrapped Tika {@link PDFParser}.
     *
     * @param context parse context forwarded to the wrapped parser
     * @return media types supported by the wrapped parser
     */
    @Override
    public Set<MediaType> getSupportedTypes(final ParseContext context) {
        final Set<MediaType> supported = OLD_PARSER.getSupportedTypes(context);
        return supported;
    }

    /**
     * Stores a single document information value into the metadata.
     * {@code null} values are skipped; {@link Date} and {@link Integer}
     * values are stored with their native type; anything else is stored
     * as its trimmed string form unless that form is empty.
     *
     * @param metadata metadata to update
     * @param property property to set
     * @param value raw value, may be {@code null}
     */
    private void setMetadata(
        final Metadata metadata,
        final Property property,
        final Object value)
    {
        if (value == null) {
            return;
        }
        if (value instanceof Date) {
            metadata.set(property, (Date) value);
            return;
        }
        if (value instanceof Integer) {
            metadata.set(property, ((Integer) value).intValue());
            return;
        }
        final String text = value.toString().trim();
        if (text.length() > 0) {
            metadata.set(property, text);
        }
    }

    /**
     * Selects the data provider matching the requested parsing type:
     * {@code MEMORY} reads the whole stream into a byte array, any other
     * type works on the file backing the stream.
     *
     * @param tis input stream with the document
     * @param type requested parsing type
     * @return provider giving access to the document data
     * @throws IOException if reading the stream into memory fails
     */
    private static DataProvider getData(
        final TikaInputStream tis,
        final TextExtractOptions.ParsingType type)
        throws IOException
    {
        if (type != TextExtractOptions.ParsingType.MEMORY) {
            return new FileProvider(tis);
        }
        return new MemoryProvider(tis);
    }

    //CSOFF: ParameterNumber
    /**
     * Extracts text and metadata from a PDF document.
     *
     * <p>The parsing type is taken from the {@link TextExtractOptions} found
     * in the parse context (a default instance is used when none is set).
     * For {@code STREAM} the document is handed straight to the Tika
     * {@link PDFParser}. Otherwise the document is parsed with pdfclown and
     * the text of all pages is emitted inside a single {@code p} element.
     * If pdfclown yields no non-whitespace text (including the case when it
     * fails with a {@link RuntimeException} before yielding any), the Tika
     * {@link PDFParser} is run as a fallback on the same data.
     *
     * <p>In the non-{@code STREAM} modes the input stream is closed by this
     * method.
     *
     * @param is document stream; may be any {@link InputStream}
     * @param handler receiver of the extracted XHTML events
     * @param metadata metadata to fill; the {@code parser} key tells which
     *     backend produced the result
     * @param context parse context
     * @throws IOException if the document can't be read
     * @throws SAXException if the handler fails, or if pdfclown fails with
     *     a runtime exception after some text has already been emitted
     * @throws TikaException if the fallback parser fails
     */
    @Override
    public void parse(
        final InputStream is,
        final ContentHandler handler,
        final Metadata metadata,
        final ParseContext context)
        throws IOException, SAXException, TikaException
    {
        TextExtractOptions.ParsingType type =
            context.get(TextExtractOptions.class, new TextExtractOptions())
                .parsingType();
        if (type == TextExtractOptions.ParsingType.STREAM) {
            parseWithPdfBox(is, handler, metadata, context);
            return;
        }
        // TikaInputStream.get() returns the argument itself when it already
        // is a TikaInputStream and wraps any other stream, whereas cast()
        // yields null for plain streams and caused a NullPointerException.
        try (DataProvider data = getData(TikaInputStream.get(is), type)) {
            boolean empty = true;
            try (File file = data.getFile()) {
                Document document = file.getDocument();
                // Failure to report only after metadata has been filled in
                SAXException ex = null;
                try {
                    XHTMLContentHandler xhtml =
                        new XHTMLContentHandler(handler, metadata);
                    xhtml.startDocument();
                    xhtml.startElement(P);
                    try (CharHandler out = new CharHandler(xhtml)) {
                        try {
                            for (Page page: document.getPages()) {
                                process(
                                    new ContentScanner(page),
                                    out,
                                    new PageContext());
                            }
                        } finally {
                            // Remember whether anything was extracted even
                            // if page processing has failed
                            empty = out.isEmpty();
                        }
                    }
                    xhtml.endElement(P);
                    xhtml.endDocument();
                } catch (RuntimeException e) {
                    if (empty) {
                        throw e;
                    } else {
                        ex = new SAXException(e);
                    }
                } catch (SAXException e) {
                    ex = e;
                }
                if (!empty) {
                    fillMetadata(metadata, document);
                }
                if (ex != null) {
                    throw ex;
                }
            } catch (RuntimeException e) {
                if (!empty) {
                    // Looks like bad pdf. Let the caller know about this
                    throw e;
                }
                // else let fallback parser process this file
            }
            if (empty) {
                parseWithPdfBox(
                    data.getInputStream(),
                    handler,
                    metadata,
                    context);
            }
        }
    }

    /**
     * Parses the document with the Tika {@link PDFParser} and marks the
     * metadata with the {@code pdfbox} parser name. The mark is set both on
     * success and when the parser fails with a {@link SAXException}, which
     * is then rethrown.
     */
    private static void parseWithPdfBox(
        final InputStream is,
        final ContentHandler handler,
        final Metadata metadata,
        final ParseContext context)
        throws IOException, SAXException, TikaException
    {
        SAXException failure = null;
        try {
            OLD_PARSER.parse(is, handler, metadata, context);
        } catch (SAXException e) {
            failure = e;
        }
        metadata.set(PARSER, PDFBOX);
        if (failure != null) {
            throw failure;
        }
    }

    /**
     * Marks the metadata as produced by pdfclown and copies the page count
     * and the document information entries into it.
     */
    private void fillMetadata(
        final Metadata metadata,
        final Document document)
    {
        metadata.set(PARSER, "pdfclown");
        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
        Information information = document.getInformation();
        // NOTE(review): presumably pdfclown returns null here for documents
        // without an Info dictionary; skip the entries instead of failing
        // after the text has already been extracted.
        if (information == null) {
            return;
        }
        setMetadata(
            metadata,
            TikaCoreProperties.CREATED,
            information.get(PdfName.CreationDate));
        setMetadata(
            metadata,
            TikaCoreProperties.MODIFIED,
            information.get(PdfName.ModDate));
        setMetadata(
            metadata,
            TikaCoreProperties.CREATOR,
            information.get(PdfName.Author));
        setMetadata(
            metadata,
            TikaCoreProperties.CREATOR_TOOL,
            information.get(PdfName.Creator));
        setMetadata(
            metadata,
            TikaCoreProperties.KEYWORDS,
            information.get(PdfName.Keywords));
        setMetadata(
            metadata,
            Property.internalText("producer"),
            information.get(PdfName.Producer));
        setMetadata(
            metadata,
            DublinCore.SUBJECT,
            information.get(PdfName.Subject));
        setMetadata(
            metadata,
            TikaCoreProperties.TITLE,
            information.get(PdfName.Title));
    }
    //CSON: ParameterNumber

    /**
     * Walks all content objects available through the scanner and emits
     * the text found in them. Text objects are emitted directly; external
     * objects and container objects are descended into recursively.
     *
     * @param scanner scanner to walk, {@code null} is silently ignored
     * @param out receiver of the extracted characters
     * @param context text position state shared across the whole page
     * @throws SAXException if the output handler fails
     */
    private void process(
        final ContentScanner scanner,
        final CharHandler out,
        final PageContext context)
        throws SAXException
    {
        if (scanner != null) {
            while (scanner.moveNext()) {
                final ContentObject current = scanner.getCurrent();
                // The order of the checks is significant: the first
                // matching type wins
                if (current instanceof Text) {
                    final ContentScanner.TextWrapper wrapper =
                        (ContentScanner.TextWrapper)
                            scanner.getCurrentWrapper();
                    process(wrapper, out, context);
                    continue;
                }
                if (current instanceof XObject) {
                    process(
                        ((XObject) current).getScanner(scanner),
                        out,
                        context);
                    continue;
                }
                if (current instanceof ContainerObject) {
                    process(scanner.getChildLevel(), out, context);
                }
            }
        }
    }

    /**
     * Emits the characters of a text object, inserting spaces and line
     * breaks derived from glyph geometry.
     *
     * <p>For every text string a gap threshold is computed from the font's
     * space glyph width (or from the font size when that width is not
     * available or is zero). A string whose vertical position differs from
     * the previous one by at least half of the smaller of the two heights
     * starts a new line; otherwise a space is emitted when the horizontal
     * gap to the previous glyph reaches the threshold. Strings and glyphs
     * without a bounding box are skipped.
     *
     * @param text text object to emit
     * @param out receiver of the extracted characters
     * @param context position of the previously emitted text
     * @throws SAXException if the output handler fails
     */
    private void process(
        final ContentScanner.TextWrapper text,
        final CharHandler out,
        final PageContext context)
        throws SAXException
    {
        for (ContentScanner.TextStringWrapper run: text.getTextStrings()) {
            final TextStyle style = run.getStyle();
            final double fontSize = style.getFontSize();
            final Font font = style.getFont();
            double spaceWidth = 0;
            if (font != null) {
                spaceWidth =
                    font.getWidth(' ', fontSize) * SPACE_WIDTH_TOLERANCE;
            }
            if (spaceWidth == 0) {
                // No usable space glyph width, estimate from font size
                spaceWidth = fontSize * SPACE_WIDTH_SCALE;
            }
            spaceWidth = Math.abs(spaceWidth);

            final Rectangle2D runBox = run.getBox();
            if (runBox == null) {
                continue;
            }
            final double y = runBox.getY();
            final double height = runBox.getHeight();
            final double threshold =
                Math.min(height, context.prevHeight()) / 2;
            final boolean sameLine = Math.abs(y - context.prevY()) < threshold;
            if (!sameLine) {
                // Reset horizontal position so no space precedes the
                // first glyph of the new line
                context.prevMaxX(Double.MAX_VALUE);
                out.newline();
            } else if (runBox.getX() - context.prevMaxX() >= spaceWidth) {
                out.put(' ');
            }
            context.prevY(y);
            context.prevHeight(height);

            for (TextChar glyph: run.getTextChars()) {
                final char value = glyph.getValue();
                final Rectangle2D glyphBox = glyph.getBox();
                if (glyphBox == null) {
                    continue;
                }
                final double gap = glyphBox.getX() - context.prevMaxX();
                if (value != ' ' && gap >= spaceWidth) {
                    out.put(' ');
                }
                out.put(value);
                context.prevMaxX(glyphBox.getMaxX());
            }
        }
    }

    /**
     * Mutable holder of the geometry of the most recently emitted text:
     * vertical position and height of the last text string and the right
     * edge of the last glyph. All values start at {@link Double#MAX_VALUE}.
     */
    private static class PageContext {
        private double prevY;
        private double prevHeight;
        private double prevMaxX;

        PageContext() {
            prevY = Double.MAX_VALUE;
            prevHeight = Double.MAX_VALUE;
            prevMaxX = Double.MAX_VALUE;
        }

        /** @return vertical position of the previous text string */
        public double prevY() {
            return this.prevY;
        }

        /** @param prevY vertical position of the text string just seen */
        public void prevY(final double prevY) {
            this.prevY = prevY;
        }

        /** @return height of the previous text string */
        public double prevHeight() {
            return this.prevHeight;
        }

        /** @param prevHeight height of the text string just seen */
        public void prevHeight(final double prevHeight) {
            this.prevHeight = prevHeight;
        }

        /** @return right edge of the previously emitted glyph */
        public double prevMaxX() {
            return this.prevMaxX;
        }

        /** @param prevMaxX right edge of the glyph just emitted */
        public void prevMaxX(final double prevMaxX) {
            this.prevMaxX = prevMaxX;
        }
    }

    /**
     * Buffers extracted characters and forwards them to the XHTML handler
     * line by line, joining words hyphenated across line breaks.
     *
     * <p>When a line break is requested right after a letter followed by a
     * hyphen-like character, the break is deferred ("carry"). If the next
     * character is a letter, the hyphen-like character is dropped and the
     * word continues on the same output line; otherwise the deferred break
     * is emitted before that character.
     */
    private static class CharHandler implements AutoCloseable {
        private final XHTMLContentHandler handler;
        // Pending output; the first len chars are valid
        private char[] cbuf = new char[INITIAL_CBUF_SIZE];
        private int len = 0;
        // True until the first non-whitespace character is put
        private boolean empty = true;
        // True while a line break is deferred because of a trailing hyphen
        private boolean carry = false;
        // Offset in cbuf where the current source line begins
        private int lineStart = 0;

        CharHandler(final XHTMLContentHandler handler) {
            this.handler = handler;
        }

        /** @return true if no non-whitespace character has been put yet */
        public boolean isEmpty() {
            return empty;
        }

        /**
         * Sends buffered characters, if any, to the handler and resets
         * the buffer.
         *
         * @throws SAXException if the handler fails
         */
        public void flush() throws SAXException {
            if (len <= 0) {
                return;
            }
            handler.characters(cbuf, 0, len);
            len = 0;
            lineStart = 0;
        }

        /** Flushes whatever is still buffered. */
        @Override
        public void close() throws SAXException {
            flush();
        }

        /**
         * Appends a character, first resolving a deferred line break.
         *
         * @param c character to append
         * @throws SAXException if the handler fails
         */
        public void put(final char c) throws SAXException {
            if (!Character.isWhitespace(c)) {
                empty = false;
            }
            if (carry) {
                if (!Character.isLetter(c)) {
                    // Not a word continuation: emit the deferred break
                    newline();
                } else {
                    // Word continues: drop the trailing hyphen
                    carry = false;
                    len -= 1;
                    lineStart = len;
                }
            }
            append(c);
        }

        // Stores the character, doubling the buffer when it is full
        private void append(final char c) {
            if (cbuf.length == len) {
                cbuf = Arrays.copyOf(cbuf, cbuf.length << 1);
            }
            cbuf[len] = c;
            ++len;
        }

        // Decides whether the requested line break must be deferred.
        // An already deferred break is never deferred a second time.
        private boolean carry() {
            if (carry) {
                carry = false;
                return false;
            }
            if (len - lineStart > 2 && Character.isLetter(cbuf[len - 2])) {
                final char last = cbuf[len - 1];
                // NOTE(review): the second literal is presumably a soft
                // hyphen (U+00AD) — verify
                if (last == '-' || last == '­' || last == '•') {
                    carry = true;
                }
            }
            return carry;
        }

        /**
         * Ends the current line: trailing spaces are removed, then the
         * line is either flushed followed by a handler newline, or kept
         * in the buffer when the break is deferred.
         *
         * @throws SAXException if the handler fails
         */
        public void newline() throws SAXException {
            int end = len;
            while (end > 0 && cbuf[end - 1] == ' ') {
                --end;
            }
            len = end;
            if (!carry()) {
                flush();
                handler.newline();
            } else {
                lineStart = len;
            }
        }
    }

    /**
     * Source of the document data, available both as a pdfclown
     * {@link File} and as a plain input stream (the latter is what the
     * fallback parser reads from).
     */
    // XXX: It is assumed that getFile() will be called before getInputStream()
    private interface DataProvider extends Closeable {
        /**
         * @return the document opened as a pdfclown file
         * @throws IOException if the document can't be accessed
         */
        File getFile() throws IOException;

        /** @return stream with the document data */
        InputStream getInputStream();
    }

    /**
     * Data provider working on the file that backs a
     * {@link TikaInputStream}. Closing the provider closes the stream.
     */
    private static class FileProvider implements DataProvider {
        private final TikaInputStream tis;

        FileProvider(final TikaInputStream tis) {
            this.tis = tis;
        }

        /**
         * Opens the file backing the stream with pdfclown.
         *
         * @return pdfclown file opened by absolute path
         * @throws IOException if the backing file can't be obtained or read
         */
        @Override
        public File getFile() throws IOException {
            final String path = tis.getFile().getAbsolutePath();
            return new File(path);
        }

        /** @return the underlying stream itself */
        @Override
        public InputStream getInputStream() {
            return this.tis;
        }

        /** Closes the underlying stream. */
        @Override
        public void close() throws IOException {
            this.tis.close();
        }
    }

    /**
     * Data provider that reads the whole input stream into a byte array
     * on construction and serves both the pdfclown file and the input
     * stream from that array.
     */
    private static class MemoryProvider implements DataProvider {
        // Document bytes; only the first pos bytes are valid
        private byte[] buf = new byte[INITIAL_CAPACITY];
        private int pos = 0;

        /**
         * Reads the stream until end of data and closes it.
         *
         * @param in stream to read
         * @throws IOException if reading fails
         */
        MemoryProvider(final InputStream in) throws IOException {
            try (InputStream is = in) {
                int room = INITIAL_CAPACITY;
                for (int n = is.read(buf, pos, room);
                    n != -1;
                    n = is.read(buf, pos, room))
                {
                    pos += n;
                    room = buf.length - pos;
                    if (room < MIN_READ) {
                        // Running out of space: double the buffer so the
                        // next read has at least MIN_READ bytes available
                        buf = Arrays.copyOf(buf, buf.length << 1);
                        room = buf.length - pos;
                    }
                }
            }
        }

        /** @return pdfclown file over the bytes read */
        @Override
        public File getFile() {
            final Buffer data = new Buffer(buf, pos);
            return new File(data);
        }

        /** @return new stream over the bytes read */
        @Override
        public InputStream getInputStream() {
            return new ByteArrayInputStream(buf, 0, pos);
        }

        /** Nothing to release: the source stream is closed on construction. */
        @Override
        public void close() {
        }
    }
}

