#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import requests
import sys
import argparse
import datetime
import time
import numpy
import yt.wrapper as yt
import json
from collections import defaultdict

def HandleOption():
    """Build the command-line parser for this script.

    Returns the configured ``argparse.ArgumentParser`` itself (not the parsed
    arguments); the caller is expected to invoke ``parse_args()`` on it.

    Options:
        --file    (required) input with CH data
        --server  mapreduce server, defaults to ``hahn.yt.yandex.net:80``
        --date    (required) date
        --out     (required) file for calculated output
    """
    option_specs = (
        ("--file", {"dest": "file", "help": "input with CH data", "required": True}),
        ("--server", {"dest": "server", "help": "mapreduce server",
                      "default": "hahn.yt.yandex.net:80", "required": False}),
        ("--date", {"dest": "date", "help": "date", "required": True}),
        ("--out", {"dest": "out", "help": "file for calculated output", "required": True}),
    )

    parser = argparse.ArgumentParser()
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser


def make_subject(url, params, variants, tags, problems, egenumbers):
    """Resolve the subject id of a hit, or return ``"unknown"``.

    Resolution order:

    1. ``params``: every dict value that carries a ``"by_subject_id"`` (or,
       failing that, a ``"results_by_subject_id"``) entry is scanned.  The
       last id seen wins, except that an explicit ``"unknown"`` id is
       returned immediately.  Ids found here are returned as-is.
    2. ``url``: one of the known tutor URL shapes is matched.  The id is
       either read directly from ``subject_id=`` or translated to a subject
       through ``variants`` / ``problems`` / ``tags`` / ``egenumbers``.
       A result that is not purely digits is reported as ``"unknown"``.

    Args:
        url: page URL of the hit.
        params: decoded JSON params of the hit.  Anything that is not a dict
            (and any non-dict value inside it) is ignored rather than raising.
        variants: mapping ``variant_id -> subject_id`` (string keys).
        tags: mapping ``tag_id -> subject_id`` (string keys).
        problems: mapping ``problem_id -> subject_id`` (string keys, without
            the leading ``T`` used in URLs).
        egenumbers: mapping ``ege_number_id -> subject_id`` (string keys).

    Returns:
        The subject id, or ``"unknown"`` when it cannot be determined.
    """
    subj = "unknown"

    # Malformed params (null, list, scalar values, ...) must not abort the
    # caller's whole run, so only dict-shaped data is inspected.
    if isinstance(params, dict):
        for value in params.values():
            if not isinstance(value, dict):
                continue
            if "by_subject_id" in value:
                reported = value["by_subject_id"]
            elif "results_by_subject_id" in value:
                reported = value["results_by_subject_id"]
            else:
                continue
            for s in reported:
                subj = s
                if s == "unknown":
                    return subj
    if subj != "unknown":
        return subj

    # Drop the print-view flag so it cannot sit in front of the id parameter.
    if "?print=1" in url:
        url = url.replace("print=1&", "")
    elif "&print=1" in url:
        url = url.replace("&print=1", "")

    # Only the first query parameter matters; a "#fragment" is never part of
    # an id, so it is removed for every URL shape.
    head = url.split("&")[0].split("#")[0]

    # URL shapes whose first parameter is the subject id itself.
    direct_prefixes = (
        "https://yandex.ru/tutor/subject/?subject_id=",
        "https://yandex.ru/tutor/?subject_id=",
        "https://yandex.ru/tutor/subject/variants/?subject_id=",
        "https://yandex.ru/tutor/subject/variant/?subject_id=",
    )
    # URL shapes whose first parameter has to be translated to a subject:
    # (prefix to match, prefix to strip, lookup table).
    # Problem ids carry a leading "T" in URLs that is absent from the table.
    lookup_routes = (
        ("https://yandex.ru/tutor/subject/variant/?variant_id=",
         "https://yandex.ru/tutor/subject/variant/?variant_id=", variants),
        ("https://yandex.ru/tutor/?variant_id=",
         "https://yandex.ru/tutor/?variant_id=", variants),
        ("https://yandex.ru/tutor/subject/problem/?problem_id=",
         "https://yandex.ru/tutor/subject/problem/?problem_id=T", problems),
        ("https://yandex.ru/tutor/?tag_id=",
         "https://yandex.ru/tutor/?tag_id=", tags),
        ("https://yandex.ru/tutor/subject/tag/problems/?ege_number_id=",
         "https://yandex.ru/tutor/subject/tag/problems/?ege_number_id=", egenumbers),
    )

    for prefix in direct_prefixes:
        if url.startswith(prefix):
            subj = head.replace(prefix, "")
            break
    else:
        for match_prefix, strip_prefix, mapping in lookup_routes:
            if url.startswith(match_prefix):
                # Keep "unknown" when the id is missing from the table.
                subj = mapping.get(head.replace(strip_prefix, "")) or subj
                break

    if not subj.isdigit():
        subj = "unknown"
    return subj

def _read_subject_mapping(table_path, id_column):
    """Load a YT mapping table into a ``{str(id): str(subject_id)}`` dict."""
    mapping = {}
    for row in yt.read_table(table_path):
        mapping[str(row[id_column])] = str(row["subject_id"])
    return mapping


def _subject_metrics(subj, users_by_day, visits_by_day, fielddate, this):
    """Compute the output record for one subject bucket.

    Args:
        subj: bucket name (a subject id or ``"all"``).
        users_by_day: ``{date: set(user)}`` for this bucket.
        visits_by_day: ``defaultdict(int)`` of ``{date: visit count}``.
        fielddate: the report date string, copied into the record.
        this: the report date as ``datetime``, used for the daily figures.

    Returns:
        A JSON-serialisable dict with the metrics that could be computed.
    """
    dt = sorted(users_by_day)
    print >> sys.stderr, "dates with at least 1 user", subj, len(dt)

    allusers = set()
    daily = []
    visits_m = 0
    for day in dt:
        allusers |= users_by_day[day]
        daily.append(len(users_by_day[day]))
        visits_m += visits_by_day[day]
    visits_w = sum(visits_by_day[day] for day in dt[-7:])

    # Split the dates into full 7-date windows aligned to the END of the
    # range; the oldest ``len(dt) % 7`` dates do not belong to any window.
    shift = len(dt) % 7
    weekly = []
    for week in range(len(dt) // 7):
        members = set()
        for day in dt[week * 7 + shift:(week + 1) * 7 + shift]:
            members |= users_by_day[day]
        weekly.append(members)

    print >> sys.stderr, "mau", subj, len(allusers)
    # A bucket without any dates (e.g. "all" on empty input) has no daily
    # audience; avoid dividing by zero.
    dau = sum(daily) / len(daily) if daily else 0
    wau = sum(len(members) for members in weekly)
    if weekly:
        wau = wau / len(weekly)
    mau = len(allusers)

    out = {"fielddate" : fielddate, "subj" : subj,
            "mau" : mau,  "visits_monthly" : visits_m, "visits_weekly" : visits_w}
    if users_by_day.get(this):
        out["dau"] = len(users_by_day[this])
        out["visits_daily"] = visits_by_day[this]
    if weekly:
        # Reported wau is the audience of the most recent full window.
        out["wau"] = len(weekly[-1])

    if mau:
        out["dau_mau"] = dau / mau
        out["wau_mau"] = wau / mau
    if weekly and len(weekly[-1]):
        out["dau_wau"] = numpy.average(daily[-7:]) / len(weekly[-1])
    return out


def main():
    """Aggregate per-subject audience metrics from a TSV dump into JSON.

    Steps:
      1. Parse the command line and point the YT client at ``--server``.
      2. Load the variant / tag / problem / ege-number -> subject mappings
         from YT.
      3. Read ``--file`` line by line (tab-separated, at least 6 columns:
         user, JSON params, date ``%Y-%m-%d``, timestamp, url, referer),
         resolve each row's subject and collect unique users and visit
         counts per subject and date, plus an ``"all"`` bucket.  Only rows
         with empty params are counted as visits.
      4. Compute the metrics for every bucket and dump the list of records
         to ``--out`` as JSON.

    Progress and diagnostics go to stderr using the Python 2 ``print >>``
    statement form.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    print >> sys.stderr, "Reading variants"
    variants = _read_subject_mapping("//home/freshness/tutor/subject_mappings/variants", "variant_id")

    print >> sys.stderr, "Reading tags"
    tags = _read_subject_mapping("//home/freshness/tutor/subject_mappings/subject_tags", "tag_id")

    print >> sys.stderr, "Reading problems"
    problems = _read_subject_mapping("//home/freshness/tutor/subject_mappings/problems", "problem_id")

    print >> sys.stderr, "Reading ege numbers"
    egenumbers = _read_subject_mapping("//home/freshness/tutor/subject_mappings/ege_numbers", "ege_number_id")

    userdata = {"all" : {}}
    visitsdata = {"all" : defaultdict(int)}
    this = datetime.datetime.strptime(args.date, "%Y-%m-%d")
    with open(args.file) as source:
        for lineno, line in enumerate(source):
            if lineno % 10000 == 241:
                print >> sys.stderr, "Processing line %s" % lineno
            data = line.strip().split("\t")
            if len(data) < 6:
                # Malformed row: report it and move on.
                print >> sys.stderr, data
                continue
            user = data[0]
            try:
                params = json.loads(data[1])
            except ValueError:
                # Unparseable params are treated as "no params".
                params = {}
            date = datetime.datetime.strptime(data[2], "%Y-%m-%d")
            url = data[4]
            subject = make_subject(url, params, variants, tags, problems, egenumbers)
            if not subject:
                continue
            if subject not in userdata:
                userdata[subject] = {}
                visitsdata[subject] = defaultdict(int)
            userdata[subject].setdefault(date, set()).add(user)
            userdata["all"].setdefault(date, set()).add(user)
            if not params:
                visitsdata[subject][date] += 1
                visitsdata["all"][date] += 1

    output_metrics = [
        _subject_metrics(subj, userdata[subj], visitsdata[subj], args.date, this)
        for subj in userdata
    ]

    with open(args.out, "w") as output:
        json.dump(output_metrics, output, indent = 4)

# Run the report only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
