#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import libra
import argparse
import datetime

# Test ids that extract() checks with request.HasTestID(): requests carrying
# `trends` are labelled "trend", requests carrying `notrends` are labelled
# "notrend", and requests with neither id are skipped.
trends = "51617"
notrends = "51614"

def HandleOption():
    """Build the command-line parser for this script.

    Options (both optional):
      --server  YT proxy address, stored as ``server``.
      --bs      path to blockstat.dict, stored as ``blockstat``.

    Returns the configured ``argparse.ArgumentParser``; the caller is
    responsible for invoking ``parse_args()`` on it.
    """
    option_table = (
        ("--server", {
            "dest": "server",
            "help": "yt server",
            "default": "hahn.yt.yandex.net",
            "required": False,
        }),
        ("--bs", {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": "/home/itajn/serploader/blockstat.dict",
            "required": False,
        }),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli

def extract(key, recs):
    """Reducer: yield one row per qualifying request of a parsed session.

    ``recs`` is handed to ``libra.ParseSession`` together with the local
    "./blockstat.dict" file.  ``key`` is the reduce key; it is not used here.

    A request produces a row only if all of the following hold:
      * it is a "TMobileAppYandexWebRequest";
      * it carries the ``trends`` or the ``notrends`` test id;
      * its FullRequest contains "clid=2218565";
      * its FullRequest contains one of the known "query_source=" values.

    Each yielded dict has the keys:
      "type"   -- "trend" or "notrend", depending on the matched test id;
      "query"  -- lower-cased request query, truncated to 1024 characters;
      "clicks" -- list of DwellTime values of clicks on the main blocks;
      "src"    -- the matched query source name.
    """
    try:
        session = libra.ParseSession(recs, "./blockstat.dict")
    except Exception:
        # Best effort: sessions that libra cannot parse are skipped.  Only
        # Exception is caught so SystemExit/KeyboardInterrupt still propagate.
        return

    # Checked in this order; the first match wins (same order as before).
    known_sources = ("deeplink", "history", "type", "suggest")

    for request in session:
        if not request.IsA("TMobileAppYandexWebRequest"):
            continue

        if request.HasTestID(trends):
            experiment = "trend"
        elif request.HasTestID(notrends):
            experiment = "notrend"
        else:
            continue

        full_request = request.FullRequest
        if "clid=2218565" not in full_request:
            continue

        for candidate in known_sources:
            if "query_source=" + candidate in full_request:
                src = candidate
                break
        else:
            # No recognised query source: skip this request entirely.
            continue

        query = request.Query.lower()[:1024]
        clicks = [click.DwellTime
                  for block in request.GetMainBlocks()
                  for click in block.GetClicks()]

        yield {"type": experiment,
               "query": query,
               "clicks": clicks,
               "src": src,
               }


def main():
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    trendlist = {}
    with open("querylist.txt", "r") as ql:
        for line in ql:
            tmp = line.split("\t")
            if not tmp[0].lower() in trendlist:
                trendlist[tmp[0].lower()] = datetime.datetime(int(tmp[1][:4]),int(tmp[1][5:][:2]), int(int(tmp[1][8:])))

    startdate = datetime.datetime(2017,9,5)
    enddate = datetime.datetime(2017,10,3)
    outpath = "//home/freshness/staff/itajn/FR-2847/"
    while startdate <= enddate:
        day = startdate.strftime("%Y-%m-%d")
        startdate += datetime.timedelta(1)
        usersessions= "//user_sessions/pub/search/daily/" + day + "/clean"
        output = outpath + day
        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)
        yt.run_reduce(extract,
                      source_table = usersessions,
                      destination_table = output,
                      local_files = [args.blockstat],
                      reduce_by = "key",
                      spec = {"data_size_per_job": 16000000000} #~16GB
                      )
        yt.run_sort(source_table = output, destination_table = output, sort_by = "query")
        queries = {"trend" : {}, "notrend" : {}}
        queries_trend = {"trend" : {}, "notrend" : {}}
        clicks = {"trend" : {}, "notrend" : {}}
        long_clicks = queries = {"trend" : {}, "notrend" : {}}
        trend_clicks = queries = {"trend" : {}, "notrend" : {}}
        trend_long_clicks = queries = {"trend" : {}, "notrend" : {}}
        for row in yt.read_table(output):
            exp = row["type"]
            src = row["src"]
            if src == "deeplink" and exp == "notrend":
                continue
            if not src in queries[exp]:
                queries[exp][src] = 0
                queries_trend[exp][src] = 0
                clicks[exp][src] = 0
                long_clicks[exp][src] = 0
                trend_clicks[exp][src] = 0
                trend_long_clicks[exp][src] = 0
            queries[exp][src] += 1
            for c in row["clicks"]:
                clicks[exp][src] += 1
                if c > 15:
                    long_clicks[exp][src] += 1
            if row["query"] in trendlist.keys():
                queries_trend[exp][src] += 1
                for c in row["clicks"]:
                    trend_clicks[exp][src] += 1
                    if c > 15:
                        trend_long_clicks[exp][src] += 1
            elif src == "deeplink":
                print >> sys.stderr, row["query"], day

        print day, "queries", queries, "queries_trend", queries_trend, "clicks", clicks, "long_clicks", long_clicks, "trend_clicks", trend_clicks, "trend_long_clicks", trend_long_clicks


# Run the job only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
