import datetime
import sys
import getopt
import re
from collections import defaultdict
import json
import base64


def query_norm(query):
    """Return *query* lower-cased, with every run of 2+ spaces squeezed to one.

    Only the space character is collapsed; tabs, newlines and leading or
    trailing single spaces are left as they are.
    """
    lowered = query.lower()
    return re.sub("  +", " ", lowered)


def combineGrepPossibleAnswer(recs):
    """Combiner: pick the top-scoring candidate fact from each record.

    For every record whose ``value`` contains both a
    ``SerpFact.CandidateFacts=<base64>`` field and a ``norm=<query>`` field
    delimited by ``@`` or ``;``, the base64 payload is decoded and parsed as
    JSON.  The payload is iterated and each element is indexed by ``"score"``
    and ``"value"``; the element with the highest score wins.  Elements whose
    score is not greater than -2 are never selected, and an empty winning
    value is ignored.

    Results are accumulated in memory as query -> answer -> max score and
    emitted only after all input records have been consumed.

    Args:
        recs: iterable of objects exposing a string ``value`` attribute.

    Yields:
        ``Record(query, answer encoded as UTF-8, str(score))`` for every
        distinct (normalized query, best answer) pair.

    Raises:
        Whatever ``base64.b64decode`` / ``json.loads`` / ``float`` raise on a
        malformed payload; such errors are not caught here.
    """
    from mapreducelib import Record

    # Raw strings keep the regex syntax out of Python's own string-escape
    # handling; the dot in the field name is escaped so it matches literally.
    # Compiled once per call instead of once per record.
    facts_re = re.compile(r"SerpFact\.CandidateFacts=([0-9a-zA-Z=+/]+)")
    norm_re = re.compile(r"[@;]norm=(.*?)[@;]")

    # query => value => weight
    max_weights = defaultdict(lambda: defaultdict(lambda: -20.0))

    for rec in recs:
        ans_m = facts_re.search(rec.value)
        if not ans_m:
            continue
        req_m = norm_re.search(rec.value)
        if not req_m:
            continue

        query = query_norm(req_m.group(1))
        answers = json.loads(base64.b64decode(ans_m.group(1)))

        # Select the single best candidate of this record; -2 acts as the
        # minimum score a candidate must exceed to be considered at all.
        best_answer = ""
        best_score = -2
        for answer in answers:
            score = float(answer["score"])
            if score > best_score:
                best_answer = answer["value"]
                best_score = score

        if best_answer != "":
            per_query = max_weights[query]
            per_query[best_answer] = max(per_query[best_answer], best_score)

    for key, answers in max_weights.iteritems():
        for value, weight in answers.iteritems():
            yield Record(key, value.encode("utf-8"), str(weight))


def reduceMaxWeight(key, recs):
    """Reducer: emit the largest weight observed for each subkey of *key*.

    Every record's ``value`` is parsed as a float and compared against the
    running maximum for its ``subkey``.  The running maximum starts at -20.0,
    so a subkey whose weights are all below -20.0 is reported as -20.0.

    Args:
        key: the reduce key shared by all records in *recs*.
        recs: iterable of objects exposing ``subkey`` and ``value``.

    Yields:
        ``Record(key, subkey, str(max_weight))`` once per distinct subkey,
        after all input records have been read.
    """
    from mapreducelib import Record

    floor = -20.0
    # subkey => weight
    best = {}
    for rec in recs:
        name = rec.subkey
        best[name] = max(best.get(name, floor), float(rec.value))

    for name, weight in best.iteritems():
        yield Record(key, name, str(weight))


def canonizeFolder(path):
    """Return *path* with a trailing slash and no repeated slashes.

    A slash is appended first, then every run of two or more slashes anywhere
    in the result is squeezed into a single one.
    """
    with_slash = path + "/"
    return re.sub("//+", "/", with_slash)


def getLogDays(mrFolder):
    """Yield ``(table, date)`` for tables under *mrFolder* named by a date.

    Tables are listed with ``MapReduce.getTables(prefix=mrFolder)``.  A table
    is reported when its name starts with *mrFolder* immediately followed by
    eight digits; those digits are returned as the date string.  The match is
    anchored only at the start, so any suffix after the digits is accepted.
    """
    from mapreducelib import MapReduce

    dated_table = re.compile(re.escape(mrFolder) + r"(\d{8})")
    for table in MapReduce.getTables(prefix=mrFolder):
        found = dated_table.match(table)
        if found:
            yield (table, found.group(1))


def main():
    """Command-line entry point: process every recent, not-yet-done log day.

    Options (parsed with getopt):
        -m  mapreduce binary (default ``mapreduce``)
        -s  MapReduce server (default ``sakura00.search.yandex.net:8013``)
        -u  MapReduce user, passed on as ``scheduleAttrs`` when given
        -d  number of days to look back (required, integer)
        -f  source folder holding per-day tables (required)
        -t  target folder for per-day results (required)
        -p  accepted with an argument but not used by this function

    For each date that has a source table, is not older than ``-d`` days and
    has no matching table in the target folder yet, the combiner
    ``combineGrepPossibleAnswer`` is run into a temporary table and
    ``reduceMaxWeight`` then writes the final ``<target>/<date>`` table.

    Invalid options, a missing required option or a non-integer ``-d`` print
    the usage message to stderr and exit with status 1.
    """
    from mapreducelib import MapReduce
    from mapreducelib import TemporaryTable

    usage = "Usage: %s [-m mapreduce_binary] [-s server] [-u mapreduce_user] -d #days -f source_folder -t target_folder" % sys.argv[0]

    try:
        (opts, _) = getopt.getopt(sys.argv[1:], "m:s:u:d:f:t:p:")
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: report it with the usage
        # text instead of an unhandled traceback.
        sys.stderr.write("%s\n%s\n" % (err, usage))
        sys.exit(1)

    opts = dict(opts)
    mrExec = opts.get('-m', 'mapreduce')
    mrServer = opts.get('-s', 'sakura00.search.yandex.net:8013')
    mrUser = opts.get('-u')
    scheduleAttrs = {'user': mrUser} if mrUser else None
    daysBack = opts.get('-d')
    mrSourceFolder = opts.get('-f')
    mrTargetFolder = opts.get('-t')

    if daysBack is None or mrSourceFolder is None or mrTargetFolder is None:
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    try:
        daysBack = int(daysBack)
    except ValueError:
        sys.stderr.write("-d expects an integer number of days, got %r\n%s\n" % (daysBack, usage))
        sys.exit(1)

    mrSourceFolder = canonizeFolder(mrSourceFolder)
    mrTargetFolder = canonizeFolder(mrTargetFolder)

    MapReduce.useDefaults(verbose=True, usingSubkey=True, server=mrServer, mrExec=mrExec, workDir=".", scheduleAttrs=scheduleAttrs)

    # Dates are YYYYMMDD strings, so plain string comparison orders them
    # chronologically.
    startDate = (datetime.datetime.today() - datetime.timedelta(days=daysBack)).strftime("%Y%m%d")

    sourceLogs = list(getLogDays(mrSourceFolder))
    sourceDates = set(date for table, date in sourceLogs if date >= startDate)

    # NOTE(review): getLogDays matches any table whose name starts with eight
    # digits after the folder, so a leftover table under "<date>_folder" would
    # presumably also mark <date> as complete -- confirm how TemporaryTable
    # names and cleans up its tables.
    completeLogs = list(getLogDays(mrTargetFolder))
    completeDates = set(date for table, date in completeLogs)

    for date in sorted(sourceDates - completeDates):
        dstFolder = mrTargetFolder + date + '_folder'
        dstTable = mrTargetFolder + date
        with TemporaryTable(project=dstFolder) as tmp:
            MapReduce.runCombine(combineGrepPossibleAnswer, srcTable=mrSourceFolder + date, dstTable=tmp)
            MapReduce.runReduce(reduceMaxWeight, srcTable=tmp, dstTable=dstTable)


# Run the job only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
