#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import libra
import argparse
import yt.wrapper as yt
import json
import datetime

# Experiment test IDs of interest: extract() skips every request that does
# not carry at least one of these (checked via request.HasTestID).
testid = ["75707", "75708", "75709", "75710"]

def clean(url):
    """Normalize a URL so it can be compared by host prefix.

    Removes, in this order: one leading ``https://`` or ``http://`` scheme,
    one leading ``www.``, and a single trailing ``/``.  Only *leading*
    prefixes are stripped, so occurrences of these substrings inside the
    path or query string are left untouched.

    :param url: URL string (may be empty).
    :return: the normalized URL; ``''`` for an empty input.
    """
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith('www.'):
        url = url[len('www.'):]
    # endswith() is safe on the empty string, so no separate empty check.
    if url.endswith('/'):
        url = url[:-1]
    return url


def HandleOption():
    """Build the command-line parser for this script.

    Two optional flags are registered, each with a default value:
    ``--server`` (stored as ``server``) and ``--bs`` (stored as
    ``blockstat``).

    :return: the configured ``argparse.ArgumentParser`` (not yet parsed).
    """
    option_table = (
        ("--server", {
            "dest": "server",
            "help": "mapreduce server",
            "default": 'hahn.yt.yandex.net:80',
            "required": False,
        }),
        ("--bs", {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": '/home/itajn/serploader/blockstat.dict',
            "required": False,
        }),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli

def extract(key, recs):
    """Reducer: emit sports.ru show/click counters for each qualifying request.

    The records of one reduce key are parsed with ``libra.ParseSession``
    (using ``./blockstat.dict`` from the job's working directory).  A
    request is considered only if it is a ``TYandexWebRequest``, carries one
    of the IDs in ``testid`` and is a ``TMiscRequestProperties``.

    For such a request the main blocks are scanned; every ``TWebResult``
    whose cleaned URL starts with ``sports.ru`` counts as one show, each of
    its clicks as one click, and each click with ``DwellTime > 15`` as one
    long click.

    :param key: reduce key; required by the reducer interface, unused here.
    :param recs: records belonging to ``key``, passed to libra as-is.
    :return: generator of dicts with keys ``query``, ``boost``, ``test``,
        ``show``, ``clicks``, ``long``; a row is produced only when there
        was at least one show or the request was flagged as boosted.
        Nothing is yielded if the session cannot be parsed.
    """
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: skip sessions that fail to parse.  ``Exception``
        # (rather than a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        return

    boost_prop = "UPPER.HostBoostUpper.host_boosted"

    for request in session:
        if not request.IsA('TYandexWebRequest'):
            continue

        # No ``break``: when several IDs match, the last one in ``testid``
        # wins, as before.
        test = False
        for t in testid:
            if request.HasTestID(t):
                test = t
        if not test:
            continue

        if not request.IsA("TMiscRequestProperties"):
            continue

        sprops = request.SearchPropsValues
        # NOTE(review): if SearchPropsValues holds strings, Python 2 orders
        # any str above any int, so this test is true whenever the key is
        # present (even for "0") - confirm the value type.
        boost = bool(boost_prop in sprops and sprops[boost_prop] > 0)

        show = 0
        clicks = 0
        longclicks = 0
        for block in request.GetMainBlocks():
            res = block.GetMainResult()
            if not res.IsA("TWebResult"):
                continue
            if not clean(res.Url).startswith("sports.ru"):
                continue
            show += 1
            for click in res.GetClicks():
                clicks += 1
                if click.DwellTime > 15:
                    longclicks += 1

        if show or boost:
            yield {"query" : request.Query, "boost" : boost, "test" : test, "show": show, "clicks" : clicks, "long": longclicks}



def _accumulate(counters, row):
    """Add one row's show/clicks/long values onto ``counters`` in place."""
    for field in ("show", "clicks", "long"):
        counters[field] += row[field]


def main():
    """Run the extract reduce for each day and print per-test totals.

    For every day from 2018-04-12 through 2018-04-17 (inclusive) whose
    user-sessions table exists:

    1. run ``extract`` as a reduce (by ``key``) into a per-day output table;
    2. read that table back and sum ``show``/``clicks``/``long`` per test ID
       twice: over rows with a non-zero ``show`` (totals) and over rows
       flagged ``boost`` (boosted slice);
    3. write ``<day> <totals> <boosted>`` to stdout.

    A ``<day> <row number>`` progress line goes to stderr for every row read.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    current = datetime.datetime(2018, 4, 12)
    last = datetime.datetime(2018, 4, 17)
    one_day = datetime.timedelta(1)

    while current <= last:
        day = current.strftime("%Y-%m-%d")
        # Advance first so the ``continue`` below cannot loop forever.
        current += one_day

        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = '//home/freshness/staff/itajn/EXP-20363/' + day
        if not yt.exists(usersessions):
            continue
        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)

        # NOTE(review): extract() opens './blockstat.dict'; presumably
        # local_files ships the file under its basename, so --bs should point
        # at a file named blockstat.dict - confirm.
        yt.run_reduce(extract,
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      spec={'data_size_per_job': 16000000000},
                      reduce_by='key')

        totals = {}
        boosted = {}
        rows_read = 0
        for row in yt.read_table(output):
            test = row["test"]
            if test not in totals:
                totals[test] = {"show" : 0, "clicks" : 0, "long" : 0}
                boosted[test] = {"show" : 0, "clicks" : 0, "long" : 0}
            if row["show"]:
                _accumulate(totals[test], row)
            if row["boost"]:
                _accumulate(boosted[test], row)
            rows_read += 1
            # write() instead of the print statement: same text, and valid
            # syntax on both Python 2 and Python 3.
            sys.stderr.write("%s %s\n" % (day, rows_read))
        sys.stdout.write("%s %s %s\n" % (day, totals, boosted))


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
