from __future__ import absolute_import

import os
import collections

from . import encoding
from . import itertools as citertools

import whoosh.index
import whoosh.query
import whoosh.fields
import whoosh.scoring
import whoosh.qparser
import whoosh.analysis
import whoosh.highlight


class SimpleHTMLTokenizer(whoosh.analysis.Tokenizer):
    """Tokenizer that skips ``<...>`` tags and tokenizes the text around them.

    The input is scanned once.  Every run of text outside a tag is handed to
    a ``whoosh.analysis.RegexTokenizer`` with ``start_char`` set to the
    run's offset inside ``value``, so the character offsets of the emitted
    tokens refer to the original (markup included) string.

    Inside a tag, single- and double-quoted sections are tracked so that a
    ``>`` inside a quoted attribute value does not terminate the tag.
    """

    def __call__(self, value, *args, **kwargs):
        """Yield tokens for the text portions of ``value``.

        :param value: the string to tokenize, possibly containing markup.
        :param args: positional arguments forwarded to the inner tokenizer.
        :param kwargs: keyword arguments forwarded to the inner tokenizer;
            a ``start_char`` entry, if present, is used as the base offset
            for all emitted character positions.
        """
        length = len(value)
        # ``tag`` holds the offset of the currently open "<" (None when we
        # are outside of a tag), ``quote`` the active quote character.
        prev, tag, quote, escape = 0, None, None, False
        # Respect an offset supplied by the caller instead of discarding it.
        base = kwargs.get("start_char", 0)
        tz = whoosh.analysis.RegexTokenizer()
        # NOTE(review): every segment is tokenized independently, so token
        # positions (``pos``) presumably restart for each segment -- confirm
        # whether phrase matching across tags matters here.
        while prev < length:
            if tag is None:
                pos = value.find("<", prev)
                if pos < 0:
                    # No more tags: the rest is handled after the loop.
                    break
                tag = pos
                kws = kwargs.copy()
                kws["start_char"] = base + prev
                # Text between the previous position and the tag opener.
                for t in tz(value[prev:pos], *args, **kws):
                    yield t
            else:
                # Advance to the next character that can change the state.
                pos = prev
                while pos < length:
                    char = value[pos]
                    if char in "\"'\\>":
                        break
                    pos += 1
                else:
                    # Unterminated tag: stop scanning; whatever follows
                    # ``prev`` is tokenized by the tail handling below.
                    break
                # NOTE(review): ``escape`` is cleared by the next *special*
                # character, not necessarily the one directly following the
                # backslash -- left unchanged, confirm intended semantics.
                if escape:
                    escape = False
                elif char in ('"', "'"):
                    if quote == char and not escape:
                        quote = None
                    else:
                        quote = char
                elif char == "\\" and quote:
                    escape = True
                elif not quote and char == ">":
                    tag = None
            prev = pos + 1

        # Trailing text after the last tag (or the whole value if it
        # contains no tag at all), with its real offset.
        kws = kwargs.copy()
        kws["start_char"] = base + prev
        for t in tz(value[prev:], *args, **kws):
            yield t


class Index(object):
    """Wrapper around a whoosh index of ``Document`` records.

    The schema has four fields: ``id`` (stored), ``title`` (stored, boost
    3.0), ``content`` (stored, boost 1.0, analyzed with
    ``SimpleHTMLTokenizer``) and ``tags`` (not stored, boost 5.0), which is
    derived from the title when documents are indexed.
    """

    Document = collections.namedtuple("Document", ("id", "title", "content"))

    class FuzzyTerm(whoosh.query.FuzzyTerm):
        """``whoosh.query.FuzzyTerm`` with this project's defaults
        (``boost=0.5``, ``maxdist=2``)."""

        def __init__(self, fieldname, text, boost=0.5, maxdist=2, prefixlength=1, constantscore=True):
            super(Index.FuzzyTerm, self).__init__(fieldname, text, boost, maxdist, prefixlength, constantscore)

    class Formatter(whoosh.highlight.Formatter):
        """Highlight formatter that reports match offsets instead of markup."""

        def format(self, fragments, replace=False):
            """Return ``(startchar, endchar - 1)`` pairs for every match in
            ``fragments``, or ``None`` when there are no matches."""
            positions = []
            for fragment in fragments:
                positions.extend((t.startchar, t.endchar - 1) for t in fragment.matches)
            return positions or None

        def format_token(self, text, token, replace=False):
            """Intentionally a no-op; ``format`` does not render tokens."""
            pass

    def __init__(self, name, location):
        """Remember where the index lives and build its schema.

        :param name: index name handed to the whoosh index functions.
        :param location: directory that holds (or will hold) the index.
        """
        self.idx = None
        self.name = name
        self.location = location
        analyzer = SimpleHTMLTokenizer() | whoosh.analysis.LowercaseFilter() | whoosh.analysis.StopFilter()
        self.schema = whoosh.fields.Schema(
            id=whoosh.fields.ID(stored=True),
            tags=whoosh.fields.TEXT(field_boost=5.0),
            title=whoosh.fields.TEXT(stored=True, field_boost=3.0),
            content=whoosh.fields.TEXT(stored=True, field_boost=1.0, analyzer=analyzer),
        )

    @staticmethod
    def _title_tags(title):
        """Build the ``tags`` value: 1-, 2- and 3-character prefixes of
        every word of ``title``, grouped by prefix length.

        ``"TEST BASESEARCH PERFORMANCE"`` becomes
        ``"T B P TE BA PE TES BAS PER"``.
        """
        words = title.split()
        return " ".join(word[:size] for size in (1, 2, 3) for word in words)

    def create(self, docs):
        """Create the index at ``self.location`` and add ``docs`` to it.

        :param docs: iterable of ``Document``-like records (iterable as
            ``id, title, content`` and exposing a ``title`` attribute).
        """
        if not os.path.exists(self.location):
            # makedirs also creates missing parent directories.
            os.makedirs(self.location)
        self.idx = whoosh.index.create_in(self.location, self.schema, self.name)
        field_names = ("id", "title", "content", "tags")
        with self.idx.writer() as writer:
            for doc in docs:
                # Append the tags as ONE value; chaining the string onto the
                # record would splice it in character by character.
                values = tuple(doc) + (self._title_tags(doc.title),)
                fields = dict(zip(
                    field_names,
                    (encoding.force_unicode_safe(value) for value in values),
                ))
                writer.add_document(**fields)

    @property
    def exists(self):
        """Whether an index named ``self.name`` exists in ``self.location``."""
        return whoosh.index.exists_in(self.location, self.name)

    def load(self):
        """Open the existing index and keep it in ``self.idx``."""
        # The schema must be passed by keyword: the third positional
        # parameter of ``open_dir`` is ``readonly``, not ``schema``.
        self.idx = whoosh.index.open_dir(self.location, self.name, schema=self.schema)

    def search(self, query, limit=5):
        """Search the index and yield one tuple per hit.

        The user query is combined (OR) from three interpretations: a fuzzy
        match on title/content, a prefix match of every word on
        title/content, and a phrase match on ``tags``.

        :param query: user query string.
        :param limit: maximum number of hits requested from the searcher.
        :returns: generator of ``(id, score, title_highlights,
            content_highlights)``; the highlight values are whatever
            ``Index.Formatter`` produces for the hit.
        """
        with self.idx.searcher() as searcher:
            fuzzy_parser = whoosh.qparser.MultifieldParser(
                ["title", "content"],
                self.idx.schema,
                termclass=self.FuzzyTerm,
                plugins=[whoosh.qparser.PhrasePlugin()],
            )
            prefix_parser = whoosh.qparser.MultifieldParser(
                ["title", "content"],
                self.idx.schema,
                plugins=[whoosh.qparser.PrefixPlugin(), whoosh.qparser.PhrasePlugin()],
            )
            tags_parser = whoosh.qparser.QueryParser(
                "tags",
                self.idx.schema,
                plugins=[whoosh.qparser.PhrasePlugin()],
            )
            results = searcher.search(
                whoosh.query.Or([
                    fuzzy_parser.parse(query),
                    # Turn every word into a prefix query: "foo bar" -> "foo* bar*".
                    prefix_parser.parse(" ".join(word + "*" for word in query.split())),
                    # The whole query as one phrase against the tags field.
                    tags_parser.parse('"' + query + '"'),
                ]),
                limit=limit
            )
            results.formatter = self.Formatter()
            for r in results:
                yield r["id"], r.score, r.highlights("title"), r.highlights("content")
