#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import yt.wrapper as yt
import json
import datetime
import time
import random
from collections import defaultdict

def HandleOption():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: a parser that accepts ``--server``
        (optional, defaults to the hahn proxy) plus the required
        ``--table``, ``--ts``, ``--dev`` and ``--out`` options.  The value
        of ``--out`` is stored under the ``output`` attribute.
    """
    # (flag, add_argument keyword settings), in the order they should
    # appear in the generated help text.
    option_specs = (
        ("--server", {"dest": "server", "help": "mapreduce server",
                      "default": 'hahn.yt.yandex.net:80', "required": False}),
        ("--table", {"dest": "table", "help": "input table",
                     "required": True}),
        ("--ts", {"dest": "ts", "help": "timestamp", "required": True}),
        ("--dev", {"dest": "dev", "help": "device: desktop|touch",
                   "required": True}),
        ("--out", {"dest": "output", "help": "output json",
                   "required": True}),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli

def clean(url):
    """Normalize a host/URL string so it can be used as a lookup key.

    The value is lowercased, a leading ``http://`` or ``https://`` scheme is
    dropped, a leading ``www.`` is dropped, and a single trailing ``/`` is
    removed.

    Args:
        url: host or URL string.

    Returns:
        The normalized string; ``''`` when nothing is left.
    """
    # Lowercase first so that prefixes such as "HTTP://" or "WWW." are
    # recognised regardless of how the source spelled them.
    url = url.lower()

    # Only strip prefixes at the very start of the string: removing every
    # occurrence would corrupt values that merely contain "www." or a
    # scheme somewhere in the middle (e.g. "awww.example.com").
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith('www.'):
        url = url[len('www.'):]

    # Drop one trailing slash; endswith() is safe on the empty string.
    if url.endswith('/'):
        url = url[:-1]
    return url

def _load_desktop_hosts(timestamp):
    """Return the set of cleaned hosts from the newest desktop host table.

    Starts at the date of ``timestamp`` (converted with
    ``datetime.fromtimestamp``) and steps back one day at a time until a
    table named after that date exists, then reads its ``Host`` column.
    """
    table_dir = "//home/turbo_analitycs/stimofeev/desktop/"
    day = datetime.datetime.fromtimestamp(timestamp)
    table = table_dir + day.strftime("%Y-%m-%d")
    # NOTE(review): the walk back is unbounded; if no dated table exists at
    # all this keeps probing ever older dates.
    while not yt.exists(table):
        day -= datetime.timedelta(1)
        table = table_dir + day.strftime("%Y-%m-%d")
    return {clean(row["Host"]) for row in yt.read_table(table)}


def _build_turbo_url(left_url, dev):
    """Return the turbo URL derived from ``left_url``, or None if malformed.

    Everything between the first ``=`` and the first ``http`` that follows
    it is discarded, the ``&exp_flags=adv-disabled`` flag is removed, and
    the ``&d=1`` marker is ensured for ``dev == "desktop"`` and removed
    for any other device.
    """
    parts = left_url.split("=")
    if len(parts) < 2:
        # No "=" at all: there is no parameter value to repair.
        return None
    start = parts[1].find("http")
    if start < 0:
        # Stripping characters one by one until the value starts with
        # "http" would never terminate here.
        return None
    parts[1] = parts[1][start:]

    turbo = "=".join(parts).replace("&exp_flags=adv-disabled", "")
    if dev == "desktop":
        if "&d=1" not in turbo:
            turbo += "&d=1"
    else:
        turbo = turbo.replace("&d=1", "")
    return turbo


def main():
    """Convert a YT table of URL pairs into a JSON task list.

    Reads the table given by ``--table`` and writes to ``--out`` a JSON
    array of objects with ``turbo_url``, ``query`` and ``just_url`` keys.
    For ``--dev desktop`` only rows whose host is present in the desktop
    host table are kept.  Rows whose ``left_url`` cannot be parsed are
    skipped with a warning on stderr.  Per-host counts of emitted rows are
    written to stderr.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    # Only the first 10 characters of --ts are used (presumably to cut a
    # millisecond timestamp down to seconds -- confirm with the caller).
    timestamp = int(args.ts[:10])
    is_desktop = args.dev == "desktop"
    hostlist = _load_desktop_hosts(timestamp) if is_desktop else None

    out = []
    hs = defaultdict(int)
    for row in yt.read_table(args.table):
        host = row["host"]
        # All kp.ru subdomains are counted and filtered as "kp.ru".
        if host.endswith(".kp.ru"):
            host = "kp.ru"
        if is_desktop and host not in hostlist:
            continue

        tturbo = _build_turbo_url(row["left_url"], args.dev)
        if tturbo is None:
            sys.stderr.write(
                "skipping row with malformed left_url: %r\n" % (row["left_url"],)
            )
            continue

        out.append({
            "turbo_url": tturbo,
            "query": u"Какая страница вам больше нравится?",
            "just_url": row["right_url"],
        })
        hs[host] += 1

    # sys.stderr.write works under both Python 2 and Python 3, unlike the
    # "print >> stream" statement form.
    for h in hs:
        sys.stderr.write("%s %s\n" % (h, hs[h]))

    with open(args.output, "w") as outfile:
        json.dump(out, outfile, indent=4)

# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
