#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import libra
import argparse
import yt.wrapper as yt
import json
import datetime
import time
from urlparse import urlparse
import random
from collections import defaultdict

def HandleOption():
    """Build the command-line parser for this script.

    Recognised options (all optional):
      --server  YT proxy address (default: hahn.yt.yandex.net:80)
      --bs      path to blockstat.dict, stored as ``blockstat``
      --pool    pool name (default: empty string)

    Returns the configured ``argparse.ArgumentParser``; the caller is
    responsible for invoking ``parse_args``.
    """
    option_table = (
        ("--server", {"dest": "server",
                      "help": "mapreduce server",
                      "default": 'hahn.yt.yandex.net:80',
                      "required": False}),
        ("--bs", {"dest": "blockstat",
                  "help": "path to blockstat.dict",
                  "default": '/home/itajn/serploader/blockstat.dict',
                  "required": False}),
        ("--pool", {"dest": "pool",
                    "default": "",
                    "required": False}),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli

# Sport category -> set of lowercase keyword stems. A category is counted
# when any of its stems occurs as a substring of a lowercased result title
# or snippet.
sportlist = {
    u"футбол": {u"футбол"},
    u"хокке": {u"хокке", u"кхл"},
    u"биатлон": {u"биатлон"},
    u"волейбол": {u"волейбол", u"воллейбол"},
    u"баскетбол": {u"баскетбол"},
    u"бокс": {u"бокс", u"борец", u"бой", u"мма"},
    u"фигурн": {u"фигурн", u"фигурист"},
    u"теннис": {u"теннис", u"тенис"},
    u"лыжи": {u"лыжн", u"трамплин"},
    u"формула 1": {u"формула 1", u"формула один"},
}

def clean(url):
    """Normalise a URL for comparison.

    Drops one leading ``https://`` or ``http://`` scheme, then a leading
    ``www.``, then a single trailing slash.

    Only *leading* occurrences are removed. Replacing the substrings anywhere
    in the string would corrupt hosts such as ``awww.example.com`` and URLs
    that carry another URL in their path or query string.

    :param url: URL string; may be empty.
    :return: the normalised URL, or ``''`` when nothing is left.
    """
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith('www.'):
        url = url[len('www.'):]
    # endswith() is False for the empty string, so no separate empty check
    # is needed before slicing.
    if url.endswith('/'):
        url = url[:-1]
    return url


class filter(object):
    """Reducer that extracts sport-related web requests from user sessions.

    The instance is called with a reduce key and the records for that key.
    The records are parsed with ``libra.ParseSession``; for every desktop or
    touch web request one row is yielded when the sport keywords from
    ``sportlist`` were matched at least three times in result titles or at
    least three times in result snippets.

    NOTE: the class name and the constructor argument shadow the ``filter``
    and ``list`` builtins; both are kept because they are part of the
    interface used by callers.
    """

    def __init__(self, list):
        """Remember the collection of hosts counted as sport hosts.

        :param list: container of host names (without a ``www.`` prefix)
            that supports the ``in`` operator.
        """
        self._list = list

    @staticmethod
    def _dominant_sport(counts):
        """Return the sport with the strictly highest count.

        :param counts: mapping sport -> number of keyword hits.
        :return: the winning sport, ``"Misc"`` when the highest count is
            shared by several sports, or ``None`` for an empty mapping.
        """
        best = None
        best_count = 0
        tied = False
        for sport, count in counts.items():
            if count > best_count:
                # A new leader invalidates any tie seen among lower counts;
                # without this reset the answer depends on iteration order.
                best, best_count, tied = sport, count, False
            elif count == best_count:
                tied = True
        return "Misc" if tied else best

    def __call__(self, key, recs):
        """Yield one row per qualifying request of the session.

        :param key: reduce key (not used by the reducer itself).
        :param recs: records of the session, passed to ``libra.ParseSession``.
        :return: generator of dicts with the keys ``query``, ``title``,
            ``snippet``, ``hostnum``, ``sport`` and ``device``.
        """
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Best effort: a session that cannot be parsed is dropped.
            return

        for request in session:
            if request.IsA('TYandexWebRequest'):
                platform = "desktop"
            elif request.IsA("TMobileYandexWebRequest") or request.IsA("TTouchYandexWebRequest"):
                platform = "touch"
            else:
                continue
            query = request.Query

            olymptitle = 0   # keyword hits in result titles
            olympsni = 0     # keyword hits in result snippets
            olymphost = 0    # results located on a known sport host
            tsport = defaultdict(int)  # sport category -> keyword hits

            for block in request.GetMainBlocks():
                res = block.GetMainResult()
                if not res.IsA("TWebResult"):
                    continue

                host = urlparse(res.Url).netloc
                if host.startswith("www."):
                    host = host[4:]
                if host in self._list:
                    olymphost += 1

                if "RandomLog" not in res.Markers:
                    continue
                try:
                    random_log_string = res.Markers["RandomLog"].decode("base64")
                    random_log = json.loads(random_log_string)
                except Exception:
                    sys.stderr.write("Randomlog decoding error\n")
                    continue
                if random_log is None:
                    continue

                # Lowercase the texts once instead of once per keyword.
                title = None
                if "Title" in random_log:
                    title = random_log["Title"].lower()
                passage = None
                if "Passage" in random_log:
                    # Only the first ten passages are taken into account.
                    passage = "\t".join(random_log["Passage"][:10]).lower()

                for sportw, keywords in sportlist.items():
                    for sport in keywords:
                        if title is not None and sport in title:
                            olymptitle += 1
                            tsport[sportw] += 1
                        if passage is not None and sport in passage:
                            olympsni += 1
                            tsport[sportw] += 1

            if olymptitle >= 3 or olympsni >= 3:
                yield {'query': query,
                       'title': olymptitle,
                       "snippet": olympsni,
                       "hostnum": olymphost,
                       "sport": self._dominant_sport(tsport),
                       "device": platform}


def main():
    """Re-weight side-by-side queries using the daily sport query tables.

    Steps:
      1. Configure the YT client from the command-line options.
      2. Load the sport host list (``sporthost.txt``), the query -> type
         mapping (``outsbs_weight.txt``, columns 1 and 3) and the queries
         grouped by date (``sbs_data.txt``, columns 1 and 3). Every query
         starts with weight 1.
      3. For each day of the hard-coded date range read the table
         ``//home/freshness/staff/itajn/FUA-297/<day>`` (days without a table
         are skipped). Rows with ``hostnum >= 3`` are counted per sport, and
         a random sample of at most 10000 of their queries increases the
         weight of the matching queries of that day.
      4. Write ``<day>\\t<query>\\t<weight>`` lines to ``real_weight.txt`` and
         print per-type totals to stdout.
    """
    args = HandleOption().parse_args()

    config = {'proxy': {'url': args.server}}
    if args.pool:
        # Run in the pool requested with --pool.
        config['pool'] = args.pool
    yt.update_config(config)
    #
    # yt.config["mount_tmpfs_in_sandbox"] = True

    # Hosts treated as sport sites. Only the (currently disabled) reduce step
    # described in the day loop below consumes this set; it is still loaded
    # so that the step can be re-enabled without other changes.
    hoststat = set(["allboxing.ru", "allhockey.ru", "allsportinfo.ru", "biathlonrus.com", "championat.com", "euro-football.ru", "football.kulichki.net", "khl.ru",
                    "livesport.ru", "matchtv.ru", "news.sportbox.ru", "rsport.ria.ru", "rusfootball.info", "soccernews.ru", "soccer.ru", "sovsport.ru", "sport.business-gazeta.ru",
                    "sport-express.ru", "sportfm.ru", "sport.ru", "sportsdaily.ru", "sports.ru", "stadium.ru", "vseprosport.ru"
                    ])
    with open("sporthost.txt", "r") as hostf:
        for line in hostf:
            hoststat.add(line.strip())

    # query (unicode) -> type, taken from the third column.
    querytype = {}
    with open("outsbs_weight.txt", "r") as findres:
        for line in findres:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line has no columns to read.
                continue
            tmp = line.split("\t")
            querytype[tmp[0].strip().decode('utf-8')] = tmp[2]

    # date -> query (unicode) -> {"type": ..., "w": weight}
    bydate = {}
    with open("sbs_data.txt", "r") as sbsdata:
        for line in sbsdata:
            line = line.strip()
            if not line:
                continue
            tmp = line.split("\t")
            query = tmp[0].strip().decode('utf-8')
            # Queries without a (non-empty) known type get the literal "None".
            tp = querytype.get(query) or "None"
            bydate.setdefault(tmp[2], {})[query] = {"type": tp, "w": 1}

    allsport = {}   # sport/type -> number of table rows with hostnum >= 3
    sbssport = {}   # sport/type -> summed weights of the SBS queries
    current = datetime.datetime(2018, 11, 24)
    enddate = datetime.datetime(2019, 2, 23)
    one_day = datetime.timedelta(1)
    with open("real_weight.txt", "w") as rw:
        while current <= enddate:
            day = current.strftime("%Y-%m-%d")
            current += one_day
            output = '//home/freshness/staff/itajn/FUA-297/' + day
            if not yt.exists(output):
                # The reduce step that builds this table is disabled:
                #   yt.run_reduce(filter(hoststat),
                #                 source_table='//user_sessions/pub/search/daily/' + day + '/clean',
                #                 destination_table=output,
                #                 local_files=[args.blockstat],
                #                 reduce_by='key')
                # so a day without a precomputed table is skipped.
                continue

            sys.stderr.write("reading table %s\n" % day)
            queries = []
            for i, row in enumerate(yt.read_table(output)):
                if not i % 100000:
                    sys.stderr.write("reading line %s, date %s\n" % (i, day))
                if row["hostnum"] >= 3:
                    queries.append(row["query"])
                    sport = str(row["sport"])
                    if sport not in allsport:
                        # Both counters always share the same key set.
                        allsport[sport] = 0
                        sbssport[sport] = 0
                    allsport[sport] += 1

            day_queries = bydate.get(day)
            if day_queries is None:
                # No SBS queries for this day: nothing to weight or write.
                continue

            # Random sample of at most 10000 queries of the day.
            random.shuffle(queries)
            for q in queries[:10000]:
                uq = q.decode('utf-8')
                if uq in day_queries:
                    day_queries[uq]["w"] += 1

            for q, entry in day_queries.items():
                rw.write(day + "\t" + q.encode('utf-8') + "\t" + str(entry["w"]) + "\n")
                tp = entry["type"]
                if tp not in sbssport:
                    allsport[tp] = 0
                    sbssport[tp] = 0
                sbssport[tp] += entry["w"]

    for t in allsport:
        # Same bytes as the former Python 2 print statement, which put a
        # space after the name/count but none after the tab characters.
        sys.stdout.write("%s \t%s \t%s\n" % (t.replace(" ", "_"), allsport[t], sbssport[t]))

# Script entry point: run the pipeline only when the file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
