#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import sys
import libra
import argparse
import yt.wrapper as yt
import json
import numpy
import datetime
import copy

# Upper bound on a result's "FreshAge" marker for it to count as fresh.
# NOTE(review): presumably seconds, i.e. three days -- confirm the marker's unit.
fage = 3 * 24 * 60 * 60

# Template of the per-slice accumulators; it is deep-copied for every slice:
#   ip       -- running sum of the fresh intent probabilities
#   surplus  -- running sum of (win - loss) long clicks
#   freshage -- every fresh-result age seen in the slice
#   count    -- number of requests that fell into the slice
tracking = dict(ip=0.0, surplus=0, freshage=[], count=0)

def HandleOption():
    """Build the command-line parser of the script.

    Options:
        --server  mapreduce server (optional, has a default)
        --bs      path to blockstat.dict (optional, has a default)
        --file    file with queries (required)
        --ts      event timestamp (required)

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is expected
        to invoke ``parse_args()`` on it.
    """
    option_table = (
        ("--server", dict(dest="server", help="mapreduce server",
                          default='hahn.yt.yandex.net:80', required=False)),
        ("--bs", dict(dest="blockstat", help="path to blockstat.dict",
                      default='/home/itajn/serploader/blockstat.dict', required=False)),
        ("--file", dict(dest="file", help="file with queries", required=True)),
        ("--ts", dict(dest="timestamp", help="event timestamp", required=True)),
    )

    cli = argparse.ArgumentParser()
    # Options are registered in table order so the generated help text keeps
    # the same ordering.
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli


def generate():
    """Create a fresh set of accumulators, one per time slice.

    Returns:
        A dict keyed by slice label ('before', '0h', '1h', ..., '12h') whose
        values are independent deep copies of the module-level ``tracking``
        template, so mutating one slice never affects another.
    """
    labels = ['before'] + [str(hour) + 'h' for hour in range(13)]
    return dict((label, copy.deepcopy(tracking)) for label in labels)

class Filter(object):
    """Reducer that extracts fresh-result click statistics for chosen queries.

    An instance is called once per reduce key with that key's records.  For
    every web request whose lowercased query is in the whitelist it yields
    one output row (see ``__call__``).
    """

    def __init__(self, list, blockstat='./blockstat.dict'):
        """Remember the query whitelist and the blockstat dictionary path.

        Args:
            list: iterable of lowercased query strings to keep.  The
                parameter name shadows the builtin but is kept so existing
                callers are not broken.
            blockstat: path of the blockstat dictionary handed to
                ``libra.ParseSession``; defaults to the previously
                hard-coded location.
        """
        # Membership is tested for every web request, so keep the whitelist
        # in a hash set rather than scanning a list each time.
        self._queries = frozenset(list)
        self._blockstat = blockstat

    def __call__(self, key, recs):
        """Yield one statistics row per matching request of a session.

        Args:
            key: reduce key of the session (not used by the computation).
            recs: the records of the session, passed to ``libra.ParseSession``.

        Yields:
            dict with keys:
                'query'     -- lowercased query, truncated to 1000 characters
                'win'       -- long clicks (dwell time >= 15) on fresh results
                'loss'      -- long clicks on the first non-fresh web result
                               met after a fresh one
                'ip'        -- 'UPPER.Fresh.IntentProbability' search prop as
                               float, 0.0 when absent
                'ages'      -- list of FreshAge values of the fresh results
                'timestamp' -- the request timestamp

        Sessions that cannot be parsed are skipped silently (best effort).
        """
        try:
            session = libra.ParseSession(recs, self._blockstat)
        except Exception:
            # Deliberate best effort: an unparsable session yields nothing.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            return

        for request in session:
            is_web = (request.IsA("TYandexWebRequest")
                      or request.IsA("TMobileYandexWebRequest")
                      or request.IsA("TTouchYandexWebRequest"))
            if not is_web:
                continue

            query = request.Query.lower()[:1000]
            if query not in self._queries:
                continue

            # Requests without search properties are not reported at all.
            if not request.IsA('TMiscRequestProperties'):
                continue

            sprops = request.SearchPropsValues
            if 'UPPER.Fresh.IntentProbability' in sprops:
                ip = float(sprops['UPPER.Fresh.IntentProbability'])
            else:
                ip = 0.0

            win = 0
            loss = 0
            freshage = []
            # True while the most recent web result seen was a fresh one;
            # the next non-fresh web result is then charged as a "loss".
            after_fresh = False

            for block in request.GetMainBlocks():
                res = block.GetMainResult()
                if res.Position > 12 or not res.IsA("TWebResult"):
                    continue

                markers = res.Markers
                age = int(markers['FreshAge']) if 'FreshAge' in markers else None
                is_fresh = age is not None and age <= fage
                if not is_fresh and not after_fresh:
                    # Neither a fresh result nor the one right after it:
                    # its clicks do not affect the statistics.
                    continue

                long_clicks = sum(1 for click in block.GetClicks()
                                  if int(click.DwellTime) >= 15)
                if is_fresh:
                    win += long_clicks
                    freshage.append(age)
                    after_fresh = True
                else:
                    loss += long_clicks
                    after_fresh = False

            yield {'query': query,
                   'win': win,
                   'loss': loss,
                   'ip': ip,
                   'ages': freshage,
                   'timestamp': request.Timestamp}


def _zone_label(row_ts, start_ts, hours=13):
    """Return the time-slice label of a request, or None if it is out of range.

    Requests before ``start_ts`` map to 'before'; requests inside the
    half-open window [start_ts, start_ts + hours * 3600) map to '<n>h' where
    n is the number of whole hours elapsed; anything later maps to None.
    """
    offset = row_ts - start_ts
    if offset < 0:
        return 'before'
    hour = int(offset // 3600)
    if hour >= hours:
        return None
    return str(hour) + 'h'


def _add_row(bucket, row):
    """Fold one reducer output row into a single slice accumulator."""
    bucket['count'] += 1
    bucket['surplus'] += row['win'] - row['loss']
    bucket['ip'] += row['ip']
    bucket['freshage'].extend(row['ages'])


def _report_line(label, metric, cells):
    """Format one report line: tab-separated cells with a trailing tab."""
    return '\t'.join([label, metric] + cells) + '\t'


def main():
    """Run the session reduce for each day and print the per-slice report.

    Reads the command line (see ``HandleOption``), loads the lowercased
    queries from ``--file``, runs ``Filter`` over the daily user sessions
    table, then aggregates the output rows into time slices relative to the
    ``--ts`` timestamp: 'before' plus thirteen hourly slices '0h'..'12h'.
    For the pseudo-query 'all' and for every query it prints tab-separated
    lines with the metrics count, ip_avg, fage_median, fage_avg,
    surplus_total and suplus_per_query.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    days = ["2017-03-26"]
    timestamp = int(args.timestamp)
    slices = ['before'] + [str(hour) + 'h' for hour in range(13)]

    # generate() already returns independent copies, no extra deepcopy needed.
    data = {'all': generate()}
    queries = []
    with open(args.file) as query_file:
        for line in query_file:
            q = line.lower().strip()
            if not q:
                # A blank line would otherwise become an empty-string query.
                continue
            queries.append(q)
            data[q] = generate()

    for day in days:
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = '//home/freshness/staff/itajn/FR-2589/' + day
        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)

        # NOTE(review): Filter opens './blockstat.dict' by default, so the
        # file passed via --bs presumably has to keep that base name -- confirm.
        yt.run_reduce(Filter(queries),
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      spec={'data_size_per_job': 16000000000},
                      reduce_by='key')

        for row in yt.read_table(output):
            zone = _zone_label(row['timestamp'], timestamp)
            if zone is None:
                # At or past the end of the 13-hour window.  The end is
                # exclusive so a row exactly on the boundary is skipped
                # rather than counted under a stale slice.
                continue
            _add_row(data['all'][zone], row)
            _add_row(data[row['query']][zone], row)

        print(_report_line('query', 'metric', slices))
        for label in data:
            buckets = [data[label][z] for z in slices]

            print(_report_line(label, 'count',
                               [str(b['count']) for b in buckets]))

            print(_report_line(label, 'ip_avg',
                               [str(b['ip'] / b['count']) if b['count'] else 'nan'
                                for b in buckets]))

            print(_report_line(label, 'fage_median',
                               [str(numpy.median(b['freshage'])) if b['freshage'] else 'nan'
                                for b in buckets]))

            print(_report_line(label, 'fage_avg',
                               [str(numpy.average(b['freshage'])) if b['freshage'] else 'nan'
                                for b in buckets]))

            print(_report_line(label, 'surplus_total',
                               [str(b['surplus']) for b in buckets]))

            # The misspelled metric name is kept as is: consumers of the
            # report may match on it.
            print(_report_line(label, 'suplus_per_query',
                               [str(b['surplus'] / b['count']) if b['count'] else 'nan'
                                for b in buckets]))


# Run the report only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
