#!/usr/bin/env python
# -*- coding: utf-8 -*-

from nile.api.v1 import (
    Record,
    clusters,
    cli
)

import os #obligatory for Statface
import sys #obligatory for Statface
import re #obligatory for Statface
import argparse #obligatory for Statface
import getpass #obligatory for Statface
import json
import datetime

import pandas as pd
import numpy as np
from math import sqrt
from scipy.stats import ttest_ind

def collect_testids(records):
    """Build the list of test ids with the control id first.

    The control id is the ``"testid"`` of the first record. Experiment ids
    are the distinct ``"testid"`` values found in ``records[100:]``.

    Args:
        records: Sequence of mappings, each having a ``"testid"`` key.

    Returns:
        ``[control, exp_1, exp_2, ...]``. Experiment ids are unique, never
        equal to the control id, and listed in order of first appearance.

    Raises:
        IndexError: If ``records`` is empty.
    """
    # NOTE(review): only records from index 100 onward are scanned for
    # experiment ids; the reason for skipping the first 100 is not visible
    # in this file -- confirm that it is intended.
    tail = records[100:]
    control = records[0]["testid"]

    # Seeding ``seen`` with the control keeps it out of the experiment list,
    # so downstream code never compares the control against itself.
    seen = {control}
    experiments = []
    for rec in tail:
        testid = rec["testid"]
        if testid not in seen:
            seen.add(testid)
            experiments.append(testid)
    return [control] + experiments


def calc_Tstat(dfm, test_ids):
    """Compare every experiment against the control with a two-sample t-test.

    Args:
        dfm: Mapping from test id to a sequence of numeric values.
        test_ids: Test ids; the first one is the control, the rest are
            experiments.

    Returns:
        A list with one row per experiment:
        ``[exp_id, exp_total, percent_diff, pvalue, verdict]`` where

        * ``exp_total`` is the sum of the experiment's values,
        * ``percent_diff`` is the relative difference of the totals in
          percent, or ``None`` when the control total is not positive,
        * ``pvalue`` is the t-test p-value rounded to 4 digits (a NaN
          p-value is reported as ``0.0``),
        * ``verdict`` is a human-readable significance label.

        An empty list is returned when there are no experiments.
    """
    experiments = test_ids[1:]
    if not experiments:
        return []

    # The control sample and its total do not depend on the experiment,
    # so compute them once instead of on every iteration.
    control_values = dfm[test_ids[0]]
    control_total = sum(control_values)

    res = []
    for exp in experiments:
        exp_values = dfm[exp]
        exp_total = sum(exp_values)
        _, pvalue = ttest_ind(exp_values, control_values)

        # Too little data in the control overrides any significance verdict.
        if control_total < 10:
            verdict = "not enough data in control, less than 10 RUR"
        elif pvalue < 0.01:
            verdict = "significant"
        else:
            # Also reached for a NaN p-value, since NaN < 0.01 is False.
            verdict = "not significant"

        percent = None
        if control_total > 0:
            percent = 100.0 * (exp_total - control_total) / control_total

        res.append([exp, exp_total, percent, round(np.nan_to_num(pvalue), 4), verdict])
    return res


def precomp(recs, test_ids):
    """Group the ``'money'`` values of records by test id.

    Args:
        recs: Iterable of mappings with ``'testid'`` and ``'money'`` keys.
            It is traversed exactly once, so one-shot iterators work too.
        test_ids: Test ids to collect values for. Records whose
            ``'testid'`` is not listed here are ignored.

    Returns:
        A dict mapping every id in ``test_ids`` to the list of ``'money'``
        values of its records, in record order. Ids without any matching
        record map to an empty list. An empty ``test_ids`` gives ``{}``.
    """
    stats = {testid: [] for testid in test_ids}
    # Single pass over the records instead of one full scan per test id.
    for rec in recs:
        bucket = stats.get(rec['testid'])
        if bucket is not None:
            bucket.append(rec['money'])
    return stats


def parse_from_path(s):
    """Split a ``/``-separated path into its parent part and last component.

    Args:
        s: Path string.

    Returns:
        A two-element list ``[parent, last]``: everything before the final
        ``/`` and everything after it. When ``s`` contains no ``/`` the
        parent is an empty string.
    """
    parent, _, last = s.rpartition('/')
    return [parent, last]


##################

@cli.statinfra_job(options=[cli.Option('test_ids', default='?')])
def make_job(job, nirvana, options):
    """Compute significance rows for the first input table and store them.

    Reads the bucket table ``<folder>/temporary/<table>_buckets`` from the
    Hahn YT cluster, runs ``calc_Tstat`` on the values grouped by test id,
    and writes one record per experiment to ``<input_table>_significance``.

    Args:
        job: Job object; its environment gets a ``job_root`` template.
        nirvana: Object whose ``input_tables[0]`` is the input table path.
        options: Parsed options; ``test_ids`` is a comma-separated list of
            test ids with the control first (spaces are ignored).

    Returns:
        The job with the updated environment.
    """
    source_path = nirvana.input_tables[0]
    result_path = source_path + "_significance"
    result_dir = parse_from_path(result_path)[0]
    source_name = parse_from_path(source_path)[1]
    yt_cluster = clusters.yt.Hahn()

    job = job.env(templates=dict(job_root=result_dir))

    # Test ids come from the job option; they are not inferred from the data.
    requested_ids = options.test_ids.replace(" ", "").split(",")

    buckets_path = result_dir + "/temporary/" + source_name + "_buckets"
    bucket_rows = [
        row.to_dict()
        for row in yt_cluster.read(buckets_path, mode="records")
    ]

    grouped = precomp(bucket_rows, requested_ids)
    out_records = []
    for testid, money, percent, pvalue, verdict in calc_Tstat(grouped, requested_ids):
        out_records.append(
            Record(
                exp_testid=testid,
                exp_money=money,
                percent_diff=percent,
                pvalue=pvalue,
                verdict=verdict,
            )
        )
    yt_cluster.write(result_path, out_records)

    return job


# Script entry point: hand control to the nile CLI runner when executed directly.
if __name__ == '__main__':
    cli.run()

