#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
from operator import itemgetter
from math import exp
import sys
import os
import time
import json
import ftplib
import re
from mapreducelib import *
import argparse
import subprocess
import codecs
import json
import gzip
from tqdm import tqdm
from datetime import date, datetime, timedelta as td
import time
import re
import os.path

from mrdef import defaults


def getvalue(string, val, d="\t"):
    """Return the value of the ``val=...`` field inside a delimited string.

    ``string`` is split on the delimiter ``d`` and every piece that starts
    with ``val`` followed by ``=`` is considered a match.  When several
    pieces match, the value of the last one is returned; when none match,
    the empty string is returned.
    """
    prefix = val + "="
    matches = [piece[len(prefix):]
               for piece in string.split(d)
               if piece.startswith(prefix)]
    # Later occurrences override earlier ones.
    return matches[-1] if matches else ""


class GetReqid(object):
    """Map callable that keeps records whose test ids match a pattern.

    An instance is called with one record at a time.  The record value is
    treated as a ``@@``-delimited list of ``name=value`` fields.
    """

    def __init__(self, testids):
        # Regular expression (as accepted by ``re.search``) applied to the
        # record's ``test-ids`` field.
        self.testids = testids

    def __call__(self, rec):
        """Yield ``Record(reqid, "buckets", test-ids)`` for matching records.

        Nothing is yielded when the ``test-ids`` field does not match
        ``self.testids``.
        """
        raw = rec.value
        request_id = getvalue(raw, "reqid", d="@@")
        test_ids = getvalue(raw, "test-ids", d="@@")
        if not re.search(self.testids, test_ids):
            return
        yield Record(request_id, "buckets", test_ids)


def reqid_atom_reqid(rec):
    """Map a JSON log record to ``Record(reqid, "atom", requestId)``.

    Only records whose value contains the substring ``score":`` are parsed.
    The value is decoded as JSON; ``requestId`` is read from the top level
    and the request id is extracted from
    ``params.access_entry.access`` with the ``wprid`` regular expression.

    Records that are not valid JSON, lack one of the expected keys, or do
    not contain the ``wprid`` marker are skipped silently (best effort).
    Any other exception -- including ``KeyboardInterrupt`` and errors raised
    while the consumer handles the yielded record -- propagates.
    """
    try:
        value = rec.value
        # Cheap substring pre-filter before paying for JSON parsing.
        if "score\":" not in value:
            return
        log = json.loads(value.rstrip())
        areqid = log["requestId"]
        m = re.search('wprid%22%3A%([^%]+)%',
                      log["params"]["access_entry"]["access"])
    except (ValueError, KeyError, TypeError):
        # Malformed JSON or an unexpected structure: skip this record.
        return
    if m is None:
        return
    # The captured group starts with two characters that are dropped --
    # presumably the "22" of a URL-encoded quote (%22); confirm against
    # real log data.
    yield Record(m.group(1)[2:], "atom", areqid)


def atom_reqid_bucket(key, recs):
    """Reduce step joining ``atom`` and ``buckets`` records of one key.

    Walks ``recs`` in order, remembering the most recent value seen for the
    ``"atom"`` subkey and for the ``"buckets"`` subkey.  After each record,
    once both values are non-empty, a
    ``Record(atom value, "joined", buckets value)`` is yielded -- so several
    joined records may be produced for a single key.  ``key`` itself is not
    used.
    """
    latest = {"atom": "", "buckets": ""}
    for rec in recs:
        kind = rec.subkey
        if kind in latest:
            latest[kind] = rec.value
        if latest["atom"] == "" or latest["buckets"] == "":
            continue
        yield Record(latest["atom"], "joined", latest["buckets"])


def table_exists(name):
    """Return True when the table ``name`` reports a size greater than zero."""
    info = MapReduce.getTableInfo(name)
    return info.size > 0


def make_fn(fn):
    """Return ``fn`` with its last extension replaced by ``.json``.

    A name without an extension simply gets ``.json`` appended.
    """
    stem = os.path.splitext(fn)[0]
    return stem + '.json'


def bucketeer(fn, ti):
    """Build a ``{key: bucket}`` JSON file from a gzipped, tab-separated dump.

    Each line of ``fn`` is decoded as UTF-8 and split on tabs; lines with
    fewer than three columns are ignored.  Column 0 is the key and column 2
    is a comma-separated list of buckets.  A key is stored only when exactly
    one of its buckets matches the regular expression ``ti``.

    A key that shows up again after it has already been stored is treated as
    ambiguous: it is removed from the result and banned, and every later
    line for that key is skipped.

    The mapping is written as indented JSON to ``make_fn(fn)``.
    """
    matcher = re.compile(ti)  # compiled once instead of once per bucket
    result = {}
    banned = set()
    with gzip.open(fn, 'r') as f:
        for line in tqdm(f):
            tabs = line.decode('utf8').rstrip().split('\t')
            if len(tabs) < 3:
                continue
            key = tabs[0]
            if key in banned:
                # Already known to be ambiguous; it must not be in `result`
                # any more, so there is nothing to remove.
                continue
            if key in result:
                # Second sighting of a stored key: drop it for good and do
                # not let this line re-insert it.
                banned.add(key)
                del result[key]
                continue
            blist = [bucket for bucket in tabs[2].split(',')
                     if matcher.search(bucket)]
            if len(blist) == 1:
                result[key] = blist[0]
    with codecs.open(make_fn(fn), 'w', 'utf8') as f:
        f.write(json.dumps(result, indent=4))


def main():
    """Run the whole pipeline for one day and one test-id pattern.

    Command line: ``argv[1]`` is the date as ``YYYYMMDD``, ``argv[2]`` is the
    regular expression for test ids.

    Steps, each skipped when its destination table already exists:
      1. fetch the mobilesearch log into ``pecheny/atomlog<date>``
         (exits with status 1 when the source log table is missing);
      2. map both logs into ``pecheny/reqid_testid_<date>`` and sort it;
      3. reduce that table into ``pecheny/reqid_testid_finish_<date>``.
    Finally the finished table is downloaded as a gzip file and converted
    to JSON by ``bucketeer``.
    """
    defaults()
    # MapReduce.useDefaults(server='sakura:8013',verbose=False,username='tmp',usingSubkey=True)

    startdate = int(sys.argv[1])
    testids = sys.argv[2]
    # strptime raises ValueError for a malformed date; the timestamp itself
    # is not used anywhere else in this function.
    starttime = int(time.mktime(datetime.strptime(
        str(startdate), "%Y%m%d").timetuple()))

    get_reqid = GetReqid(testids)

    # Child processes run with MR_USER=tmp on top of the current environment.
    env = dict(os.environ, MR_USER='tmp')

    day = str(startdate)
    atomlog_table = 'pecheny/atomlog' + day
    merged_table = 'pecheny/reqid_testid_' + day
    finish_table = 'pecheny/reqid_testid_finish_' + day

    if not table_exists(atomlog_table):
        if not table_exists('mobilesearch_answer/' + day):
            print('no mobilesearch log for that date')
            sys.exit(1)
        print("fetch logs")
        subprocess.call('fetchlogs -s sakura00:8013 -i mobilesearch_answer/{s} -o pecheny/atomlog{s}'
                        .format(s=startdate), shell=True, env=env)

    if not table_exists(merged_table):
        print("get atom reqid")
        MapReduce.runMap(reqid_atom_reqid,
                         srcTable=atomlog_table,
                         dstTable=merged_table,
                         appendMode=False, sortMode=False)
        print("get testid")
        MapReduce.runMap(get_reqid,
                         srcTable='reqans_log/' + day,
                         dstTable=merged_table,
                         appendMode=True, sortMode=True)
        print("sort")
        os.system(
            "MR_USER=tmp /Berkanavt/bin/mapreduce27-dev -server sakura00:8013 -sort pecheny/reqid_testid_" + day)

    if not table_exists(finish_table):
        print("join")
        MapReduce.runReduce(atom_reqid_bucket,
                            srcTable=merged_table,
                            dstTable=finish_table,
                            appendMode=True, sortMode=True)

    print("download")
    # NOTE(review): this command addresses the server as "sakura", the
    # earlier ones as "sakura00:8013" -- confirm that is intentional.
    subprocess.call('mapreduce-dev -server sakura -subkey -read '
                    'pecheny/reqid_testid_finish_{s} '
                    '| gzip  > reqid_testid_finish_{s}.gz'.format(s=startdate),
                    shell=True, env=env)
    bucketeer('reqid_testid_finish_{s}.gz'.format(s=startdate), testids)


if __name__ == '__main__':
    main()
