#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import sys
import libra
import argparse
import yt.wrapper as yt
import json
import datetime
import time
import os
import random

def HandleOption():
    """Build the command-line parser for the olympiad wizard monitor.

    Options:
        --server  YT proxy address (optional, has a default).
        --bs      path to the blockstat.dict file (optional, has a default).
        -out      file the side-by-side query list is written to (required).
        -ts       start timestamp (required).

    Returns:
        argparse.ArgumentParser: the configured, not yet evaluated parser.
    """
    # One (flags, keyword settings) entry per option, registered in order.
    option_specs = (
        (("--server",), dict(dest="server", help="mapreduce server",
                             default="hahn.yt.yandex.net:80", required=False)),
        (("--bs",), dict(dest="blockstat", help="path to blockstat.dict",
                         default="/home/itajn/serploader/blockstat.dict", required=False)),
        (("-out",), dict(dest="output", help="file with sbs queries", required=True)),
        (("-ts",), dict(dest="timestamp", help="start timestamp", required=True)),
    )
    cli = argparse.ArgumentParser()
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli

# Substring markers used by extract(): a request whose lowercased query
# contains any of these is flagged with "filter": True. Matching is plain
# substring containment, so trailing spaces (e.g. u"олимп ", u"жк ") are
# significant. A few markers are listed more than once (u"истори", u"проект",
# u"биолог"); the duplicates are harmless for matching.
blacklist = [u"2017", u"2016", u"2014", u"2012", u"2010", u"2008", u"2006", u"2004", u"2002", u"2022", u"1972", u"истори", u"купить", u"скачать", u"школьн", u"хими", u"физи", u"олимп ", u"математик", u"литератур", u"язык", u"информатик", u"гагарин", u"эйлер", u"ломоносов", u"кхл", u"нхл", u"класс", u"школ", u"студент", u"проект", u"медвеж", u"истори", u"окруж", u"конспект", u"проект", u"доклад", u"презент", u"урок", u"резерв", u"философ", u"реферат", u"летни", u"греки", u"древн", u"1956", u"1980", u"колледж", u"лицей", u"академи", u"промокод", u"жк ", u"чтению", u"стихотвор", u"я профессионал", u"сочи", u"программирова", u"детей", u"биолог", u"заочн", u"природовед", u"физтех", u"политех", u"биолог", u"эколог", u"бассейн", u"ссср", u"обществозн", u"1992", u"1998", u"географ", u"рггу", u"мюнхен", u"комплекс", u"кроссворд", u"сканворд", u"1936", u"1986", u"экономик", u"интеллект", u"астроном", u"баскетбол", u"всероссийск", u"концерт", u"такси", u"сауна", u"мфти", u"недвижимость", u"продажа", u"объявлен", u"проспект", u"падение", u"партер", u"билет", u"предметн", u"учител", u"фильм", u"парки музеи усадьбы", u"olymp trade", u"smartolimp", u"турин", u"спбгу", u"античн", u"рсош", u"ванкувер", u"кафе", u"адлер", u"фитнес", u"поликлиника", u"мгу", u"olimp.kz", u"skyeng", u"olimp kz", u"2020", u"роза хутор", u"лингвист", u"сбербанк", u"урфоду", u"высшая проба", u"порно", u"секс", u"шахмат", u"вош", u"мебель", u"олимпокс", u"максвел", u"мгту", u"музеи парки усадьбы", u"филолог", u"мгсу", u"мифи", u"отель", u"микрорайон", u"кинотеатр", u"эрудит", u"зевс", u"астерикс", u"бульвар", u"образован", u"маи"]


def extract(key,recs):
    """Reducer: turn one user's session records into per-request olympiad signals.

    The records are parsed with libra; only desktop, mobile and touch web
    requests are considered. For each of them one row is yielded if the
    olympiad wizard was shown, or if at least one web result mentions
    u"олимп" in its RandomLog title or passages.

    Args:
        key: reduce key of the session (not used by the computation).
        recs: raw session records passed straight to libra.ParseSession.

    Yields:
        dict with keys:
            "query"   - lowercased query, built from the first 1024 bytes of
                        the raw query,
            "show"    - True if a wizard result whose Path contains
                        "olympiad" is among the main blocks,
            "title"   - number of web results with u"олимп" in the title,
            "snippet" - number of web results with u"олимп" in the passages,
            "filter"  - True if the query contains a blacklist marker or
                        ends with u"олимп".

    NOTE(review): the blockstat dictionary path is hard-coded to
    "./blockstat.dict"; main() ships args.blockstat via local_files, so this
    presumably only works when that file's basename is "blockstat.dict" -
    confirm against the YT job sandbox layout.
    """
    try:
        session = libra.ParseSession(recs, "./blockstat.dict")
    except Exception:
        # Best effort: a session libra cannot parse is skipped entirely.
        return

    for request in session:
        is_web_request = (
            request.IsA("TYandexWebRequest")
            or request.IsA("TMobileYandexWebRequest")
            or request.IsA("TTouchYandexWebRequest")
        )
        if not is_web_request:
            continue

        # Truncating the raw bytes may cut a multi-byte character in half;
        # errors="ignore" drops such a broken tail instead of raising.
        query = request.Query[:1024].strip().decode("utf-8", errors="ignore").lower()
        is_filtered = query.endswith(u"олимп") or any(marker in query for marker in blacklist)

        show = False
        olymptitle = 0
        olympsni = 0
        for block in request.GetMainBlocks():
            res = block.GetMainResult()
            if (res.IsA("TBlenderWizardResult") or res.IsA("TWizardResult")) and "olympiad" in res.Path:
                show = True
            if not (res.IsA("TWebResult") and "RandomLog" in res.Markers):
                continue

            try:
                random_log_string = res.Markers["RandomLog"].decode("base64")
                random_log = json.loads(random_log_string)
            except Exception:
                print >> sys.stderr, "Randomlog decoding error"
                continue
            # A payload that is not a JSON object (null, list, ...) carries
            # no title/passages; skip it rather than crash the whole job.
            if not isinstance(random_log, dict):
                continue

            # A missing or null title counts as "no match".
            title = random_log.get("Title") or u""
            if u"олимп" in title.lower():
                olymptitle += 1
            passage = random_log.get("Passage")
            if passage and u"олимп" in "\t".join(passage[:10]).lower():
                olympsni += 1

        if olymptitle or olympsni or show:
            yield {"query" : query, "show" : show, "title": olymptitle, "snippet": olympsni, "filter": is_filtered}


def glue(key, recs):
    """Reducer: aggregate the per-request rows of one query.

    Rows with a truthy "show" count as wizard hits. The remaining rows are
    counted only when they are not flagged "filter"; for those the "snippet"
    and "title" values are summed and averaged.

    Args:
        key: mapping holding the reduce key; key["query"] is the query text.
        recs: iterable of rows with "show" and, for rows without a wizard,
            "filter", "snippet" and "title".

    Yields:
        At most one dict: the full statistics ("query", "count", "snippet",
        "title", "wiz") when at least one unfiltered non-wizard row exists,
        otherwise only "query" and "wiz" when the wizard was shown at least
        once, otherwise nothing.
    """
    query_text = key["query"]
    wizard_hits = 0
    organic_hits = 0
    snippet_total = 0
    title_total = 0

    for rec in recs:
        if rec["show"]:
            wizard_hits += 1
            continue
        if rec["filter"]:
            continue
        organic_hits += 1
        snippet_total += rec["snippet"]
        title_total += rec["title"]

    if organic_hits > 0:
        yield {
            "query": query_text,
            "count": organic_hits,
            "snippet": snippet_total / organic_hits,
            "title": title_total / organic_hits,
            "wiz": wizard_hits,
        }
        return
    if wizard_hits > 0:
        yield {"query": query_text, "wiz": wizard_hits}


def mail_report(text, title):
    """Send the report as a plain-text e-mail.

    The message is addressed to the sender, with the report mailing list in
    CC; both receive it. The SMTP host is taken from the
    LOCAL_SMTP_SERVER_HOST environment variable, falling back to "localhost".

    Args:
        text: UTF-8 encoded message body.
        title: message subject.

    Raises:
        UnicodeDecodeError: if text is not valid UTF-8.
        smtplib.SMTPException: if the message cannot be delivered.
    """
    import smtplib
    from email.mime.text import MIMEText
    me = "itajn@yandex-team.ru"
    you = [
        "olympic-wizard-reports@yandex-team.ru"
    ]
    # The round trip leaves valid UTF-8 untouched and fails early on a
    # malformed body.
    text = text.decode("utf-8").encode("utf-8")
    # Declare the charset explicitly: the report body is Russian text, and
    # MIMEText would otherwise label it with its default us-ascii charset.
    msg = MIMEText(text, "plain", "utf-8")
    msg["Subject"] = title
    msg["From"] = me
    msg["To"] = me
    msg["CC"] = ", ".join(you)
    s = smtplib.SMTP(os.environ.get("LOCAL_SMTP_SERVER_HOST", "localhost"))
    try:
        s.sendmail(me, you + [me], msg.as_string())
        s.quit()
    finally:
        # Release the socket even if sending failed.
        s.close()

def main():
    """Collect olympiad wizard statistics, mail the report, write SBS queries.

    Steps:
      1. Find the existing fast user-session tables in the 12 hours before
         the requested timestamp, probing in 30-minute steps.
      2. Run extract() over them, then sort by query and run glue() to get
         per-query aggregates (all in one temporary table).
      3. Build five query lists (frequent without wizard, frequent with
         partial wizard coverage, random without wizard, top wizard hits,
         random wizard hits) and mail them via mail_report().
      4. Write up to 1500 distinct, randomly chosen queries to args.output.

    Exits with an error message if no session table exists in the window.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})
    # Only the first 10 characters are used, so a longer (e.g. millisecond)
    # timestamp is truncated to seconds.
    timestamp = int(args.timestamp[:10])

    threshold = 100
    numqueries = 100

    tables = []
    startdate = datetime.datetime.fromtimestamp(timestamp) - datetime.timedelta(hours = 12)
    enddate = datetime.datetime.fromtimestamp(timestamp)
    firsttable = datetime.datetime.fromtimestamp(timestamp)
    lasttable = datetime.datetime(1,1,1)
    while startdate <= enddate:
        ts = str(int(time.mktime(startdate.timetuple())))
        usersessions = "//user_sessions/pub/search/fast/" + ts + "/clean"
        if yt.exists(usersessions):
            tables.append(usersessions)
            firsttable = min(firsttable, startdate)
            lasttable = max(lasttable, startdate)
        startdate += datetime.timedelta(minutes = 30)
    print >> sys.stderr, "Number of tables used: ", len(tables)
    if not tables:
        # Without this guard the run fails later with an unrelated error:
        # lasttable is still the year-1 sentinel, which Python 2 strftime
        # rejects, and there is nothing to reduce anyway.
        sys.exit("No user session tables found in the 12 hours before the requested timestamp")
    print >> sys.stderr, "Actual calculation from ", firsttable.strftime("%Y-%m-%d %H-%M"), " to ", lasttable.strftime("%Y-%m-%d %H-%M")
    with yt.TempTable(prefix = "olymp_monitor") as tmptable:
        yt.run_reduce(extract,
                      source_table = tables,
                      destination_table = tmptable,
                      local_files = [args.blockstat],
                      reduce_by = "key"
                      )
        # First pass over the raw per-request rows: candidates for the
        # random samples and for the side-by-side (SBS) query pool.
        uniq_list = set()
        wiz_list = set()
        sbs_set = []
        for row in yt.read_table(tmptable):
            if not row["show"] and not row["filter"] and row["snippet"] > 3:
                uniq_list.add(row["query"])
            if (not row["filter"] and row["snippet"] > 3) or row["show"]:
                sbs_set.append(row["query"])
            if row["show"]:
                wiz_list.add(row["query"])
        random100 = list(uniq_list)
        random.shuffle(random100)
        randomwiz = list(wiz_list)
        random.shuffle(randomwiz)
        # The temporary table is sorted and reduced in place.
        yt.run_sort(source_table = tmptable,
                    destination_table = tmptable,
                    sort_by = "query"
                    )
        yt.run_reduce(glue,
                      source_table = tmptable,
                      destination_table = tmptable,
                      reduce_by = "query"
                      )
        # Second pass over the per-query aggregates produced by glue().
        best100 = []
        bestwiz = []
        unanswer = []
        for row in yt.read_table(tmptable):
            if row["wiz"] > 0:
                bestwiz.append({"query" : row["query"], "count" : row["wiz"]})
            # Rows without "count" had wizard hits only.
            if "count" in row:
                if row["wiz"] == 0 and row["count"] >= threshold and (row["title"] > 3 or row["snippet"] > 3):
                    best100.append(row["query"])
                if row["wiz"] > 0 and row["count"] + row["wiz"] >= threshold:
                    unanswer.append(row["query"])
        random.shuffle(best100)
        random.shuffle(unanswer)

    ranked_wiz = sorted(bestwiz, key = lambda k: k["count"], reverse = True)
    topwiz = [q["query"] for q in ranked_wiz[:numqueries]]
    # Keep the random samples from repeating queries already reported in the
    # corresponding "frequent" sections.
    for q in best100[:numqueries]:
        if q in random100[:numqueries]:
            random100.remove(q)
    for q in topwiz:
        if q in randomwiz[:numqueries]:
            randomwiz.remove(q)

    report = (
        "Данные собраны за " + firsttable.strftime("%Y-%m-%d %H-%M") + " ~ " + lasttable.strftime("%Y-%m-%d %H-%M")
        + "\n\n__Частотные запросы совсем без колдунщика:__\n\n" + "\n".join(best100[:numqueries])
        + "\n\n\n__Частотные запросы с неответами:__\n\n" + "\n".join(unanswer[:numqueries])
        + "\n\n\n__Случайные запросы без колдунщика:__\n\n" + "\n".join(random100[:numqueries])
        + "\n\n\n__Частотные срабатывания колдунщика:__\n\n" + "\n".join(topwiz)
        + "\n\n\n__Случайные срабатывания колдунщика:__\n\n" + "\n".join(randomwiz[:numqueries])
    )
    mail_report(
        report,
        "Olympic wizard report " + datetime.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H-%M"),
    )

    # Take the first 1500 distinct queries of the shuffled pool, preserving
    # the shuffled order; the set makes the duplicate check O(1).
    random.shuffle(sbs_set)
    sbs_list = []
    seen = set()
    for query in sbs_set:
        if len(sbs_list) >= 1500:
            break
        if query not in seen:
            seen.add(query)
            sbs_list.append(query)
    with open(args.output, "w") as output:
        output.write("\n".join(sbs_list))

# Run the monitor only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
