import datetime
import sys
import getopt
import re
from collections import defaultdict
import requests


def query_norm(query):
    """Return *query* lower-cased, with each run of two or more spaces squeezed to one.

    Only the space character is affected; tabs and other whitespace are left as-is,
    and leading/trailing single spaces are kept.
    """
    lowered = query.lower()
    space_runs = re.compile("  +")
    return space_runs.sub(" ", lowered)


def combineGrepPossibleAnswer(recs):
    """Combine step: collect the maximum weight seen for every answer of every query.

    A record contributes only when its ``value`` contains both a
    ``PossibleAnswer=...`` field and a ``norm=...`` field.  The answer field is
    parsed as a sequence of ``ontoid(weight)`` items; items without a
    parenthesised weight, or whose weight is not a valid float, are ignored.

    Weights are aggregated with ``max`` starting from a floor of -20.0, so any
    smaller weight is reported as -20.0.

    Args:
        recs: iterable of records exposing a string ``value`` attribute.

    Yields:
        ``Record`` objects whose key is the normalized query and the ontoid
        joined by a tab, whose subkey is ``uil=<lang>`` and ``misspell=<msp>``
        joined by a tab, and whose value is the maximum weight as a string.
    """
    from mapreducelib import Record

    # (query, uil, msp) => ontoid => max weight
    max_weights = defaultdict(lambda: defaultdict(lambda: -20.0))

    for rec in recs:
        ans_m = re.search("PossibleAnswer=([a-z0-9(). -]+)", rec.value)
        if not ans_m:
            continue

        req_m = re.search("[@;]norm=(.*?)[@;]", rec.value)
        if not req_m:
            continue
        query = query_norm(req_m.group(1))

        # Two-letter interface language; empty when the field is absent.
        uil_m = re.search("@@uil=([a-z]{2})@@", rec.value)
        uil = uil_m.group(1) if uil_m else ""

        # Misspell extraction is currently disabled, so msp is always empty.
        # The previous implementation is kept for reference:
        # msp_m = re.search("@@msp=1:Misspell:\d+:([^:@]+)(?:@@|:)", rec.value)
        # if msp_m:
        #    msp = msp_m.group(1)
        msp = ""

        group_key = (query, uil, msp)
        for match in re.finditer(r"([^ (]+)(?:\(([^)]+)\))?", ans_m.group(1)):
            raw_weight = match.group(2)
            if not raw_weight:
                continue
            try:
                weight = float(raw_weight)
            except ValueError:
                # The PossibleAnswer character class admits text such as
                # "(1-2)" or "(abc)"; skip it instead of aborting the job.
                continue
            ontoid = match.group(1)
            answers = max_weights[group_key]
            answers[ontoid] = max(answers[ontoid], weight)

    for key, answers in max_weights.iteritems():
        query, uil, msp = key
        for ontoid, weight in answers.iteritems():
            yield Record("\t".join([query, ontoid]), "uil={}\tmisspell={}".format(uil, msp), str(weight))


def reduceMaxWeight(key, recs):
    """Reduce step: emit one record per subkey carrying the largest weight seen.

    Each record's ``value`` is parsed as a float.  The running maximum for a
    subkey starts at -20.0, so smaller weights are reported as -20.0.

    Args:
        key: the reduce key shared by all *recs*; copied into every output record.
        recs: iterable of records exposing ``subkey`` and ``value`` attributes.

    Yields:
        ``Record(key, subkey, str(max_weight))`` for every distinct subkey.
    """
    from mapreducelib import Record

    floor = -20.0
    # subkey => best weight so far
    best = {}
    for rec in recs:
        sub = rec.subkey
        best[sub] = max(best.get(sub, floor), float(rec.value))

    for sub, top_weight in best.iteritems():
        yield Record(key, sub, str(top_weight))


def canonizeFolder(path):
    """Return *path* with a trailing slash and every run of slashes collapsed to one."""
    slash_terminated = path + "/"
    slash_runs = re.compile("//+")
    return slash_runs.sub("/", slash_terminated)


def getLogDays(mrFolder):
    """Yield ``(table, date)`` for tables under *mrFolder* named with an 8-digit date.

    Tables are listed via ``MapReduce.getTables(prefix=mrFolder)``; a table is
    reported when its name is *mrFolder* immediately followed by eight digits,
    and those digits are returned as the date string.

    NOTE(review): the pattern is anchored only at the start of the name, so
    names with extra text after the eight digits match as well -- confirm this
    is intended.
    """
    from mapreducelib import MapReduce

    tables = MapReduce.getTables(prefix=mrFolder)
    dated_name = re.compile(re.escape(mrFolder) + r"(\d{8})")
    for table_name in tables:
        found = dated_name.match(table_name)
        if found:
            yield (table_name, found.group(1))


def getAnswerTitles(mrTable, entitySearchPort, objectTitleCache):
    """Attach entity titles in Russian, English and Turkish to every record of a table.

    Records are read with ``MapReduce.getSample(srcTable=mrTable, count=None)``.
    Each record key is expected to be ``query<TAB>ontoid``.  For an ontoid not
    yet present in *objectTitleCache*, the entity search service on
    ``localhost:<entitySearchPort>`` is queried once per language and the
    ``base_info.name`` field of the JSON response (empty string when missing)
    is stored UTF-8 encoded.

    Args:
        mrTable: source table to read.
        entitySearchPort: port of the local entity search HTTP service.
        objectTitleCache: dict mapping ontoid to its list of three titles; it
            is filled in place so lookups are shared across calls.

    Yields:
        ``Record(query, subkey, value)`` where value is the tab-joined ontoid,
        original value and the three titles.
    """
    from mapreducelib import MapReduce
    from mapreducelib import Record

    for rec in MapReduce.getSample(srcTable=mrTable, count=None):
        # Split on the last tab only: the ontoid never contains a tab, while
        # the query part is free-form text.
        query, ontoid = rec.key.rsplit("\t", 1)
        if ontoid not in objectTitleCache:
            titles = []
            for lang in ["ru", "en", "tr"]:
                r = requests.get('http://localhost:{}/search?text=1&wizextra=entnext={};entlang={}'.format(entitySearchPort, ontoid, lang))
                o = r.json()
                titles.append(o.get('base_info', {}).get('name', u'').encode("utf-8"))
            # Publish to the shared cache only once all languages succeeded, so
            # a failure midway never leaves a short title list behind.
            objectTitleCache[ontoid] = titles
        yield Record(query, rec.subkey, "\t".join([ontoid, rec.value] + objectTitleCache[ontoid]))


def main():
    """Command-line entry point: build per-day answer tables for days not yet processed.

    Options:
        -m  mapreduce binary (default ``mapreduce``)
        -s  MapReduce server
        -u  MapReduce user, passed as a schedule attribute when given
        -d  how many days back to look (required, integer)
        -f  source folder with per-day log tables (required)
        -t  target folder for per-day result tables (required)
        -p  port of the local entity search service (required)

    For every date that has a source table not older than ``-d`` days and no
    table in the target folder yet, the source table is combined with
    ``combineGrepPossibleAnswer``, reduced with ``reduceMaxWeight`` and written
    to ``<target><date>`` with entity titles attached by ``getAnswerTitles``.

    Invalid or missing options print the usage text to stderr and exit with
    status 1.
    """
    from mapreducelib import MapReduce
    from mapreducelib import TemporaryTable

    usage = "Usage: %s [-m mapreduce_binary] [-s server] [-u mapreduce_user] -d #days -f source_folder -t target_folder -p #port" % sys.argv[0]

    try:
        (opts, _) = getopt.getopt(sys.argv[1:], "m:s:u:d:f:t:p:")
    except getopt.GetoptError as err:
        # Unknown option or missing argument: report it with the usage text
        # instead of a traceback.
        sys.stderr.write("%s\n%s\n" % (err, usage))
        sys.exit(1)
    opts = dict(opts)
    mrExec = opts.get('-m', 'mapreduce')
    mrServer = opts.get('-s', 'sakura00.search.yandex.net:8013')
    mrUser = opts.get('-u')
    scheduleAttrs = {'user': mrUser} if mrUser else None
    daysBack = opts.get('-d')
    mrSourceFolder = opts.get('-f')
    mrTargetFolder = opts.get('-t')
    entitySearchPort = opts.get("-p")

    if daysBack is None or mrSourceFolder is None or mrTargetFolder is None or entitySearchPort is None:
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    try:
        daysBack = int(daysBack)
    except ValueError:
        sys.stderr.write("-d expects an integer number of days, got %r\n%s\n" % (daysBack, usage))
        sys.exit(1)
    mrSourceFolder = canonizeFolder(mrSourceFolder)
    mrTargetFolder = canonizeFolder(mrTargetFolder)

    MapReduce.useDefaults(verbose=True, usingSubkey=True, server=mrServer, mrExec=mrExec, workDir=".", scheduleAttrs=scheduleAttrs)

    # Dates are YYYYMMDD strings, so plain string comparison orders them chronologically.
    startDate = (datetime.datetime.today() - datetime.timedelta(days=daysBack)).strftime("%Y%m%d")

    sourceLogs = list(getLogDays(mrSourceFolder))
    sourceDates = set([date for table, date in sourceLogs if date >= startDate])

    completeLogs = list(getLogDays(mrTargetFolder))
    completeDates = set([date for table, date in completeLogs])

    # Shared across days so each ontoid is looked up at most once per run.
    objectTitleCache = {}

    for date in sorted(sourceDates - completeDates):
        dstFolder = mrTargetFolder + date + '_folder'
        dstTable = mrTargetFolder + date
        with TemporaryTable(project=dstFolder) as tmp2:
            # tmp1 only feeds the reduce step, so its context ends before the
            # final table is written.
            with TemporaryTable(project=dstFolder) as tmp1:
                MapReduce.runCombine(combineGrepPossibleAnswer, srcTable=mrSourceFolder + date, dstTable=tmp1)
                MapReduce.runReduce(reduceMaxWeight, srcTable=tmp1, dstTable=tmp2)
            MapReduce.updateTable(getAnswerTitles(tmp2, entitySearchPort, objectTitleCache), dstTable=dstTable)


# Run the pipeline only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
