#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import ast
import collections
import datetime
from itertools import repeat, izip
import multiprocessing
import urlparse
import random

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record,
)
import nile
import pandas as pd
import yt.wrapper as yt
import logging
# Root logger setup: messages below ERROR are dropped; each emitted message
# carries a timestamp, the source file name, the line number and the level.
logging.basicConfig(
    level=logging.ERROR,
    format='[%(asctime)s] %(filename)s[LINE:%(lineno)d] %(levelname)-8s %(message)s',
)

# Entries of ``b_list`` (one per line) are compared against ``path`` in
# ``matrixnet_prepare``. Every line is kept as is, including empty ones.
with open('b_list', 'r') as b_list:
    BLAND_BLOCKS = b_list.read().split('\n')

# Features expanded into 0/1 indicator features by ``matrixnet_prepare``.
# Each entry is [feature name, list of the values it is compared against].
ONE_HOT_LIST = [
    [
        'day_time',
        ['night', 'evening', 'afternoon', 'morning', 'early_morning'],
    ],
    [
        'week_day',
        list(map(str, range(1, 8))),
    ],
]


def matrixnet_prepare(features_dict, path, label):
    """Serialize one sample into a tab-separated line of feature values.

    The line starts with ``label``, ``path`` and a constant ``1``, followed by
    one value per name in the module-level ``FEATURES`` list, in that order.

    Before the values are looked up, derived indicator features are added:
    ``block_<name>`` for every entry of ``BLAND_BLOCKS`` (1 when ``path``
    equals that entry) and ``<key>_<value>`` for every key/value pair listed
    in ``ONE_HOT_LIST`` (1 when ``features_dict[key]`` equals that value).

    Args:
        features_dict: mapping from feature name to value. It is copied, so
            the caller's mapping is left untouched.
        path: value compared against the entries of ``BLAND_BLOCKS``; also
            written as the second column.
        label: written as the first column.

    Returns:
        The tab-separated string. A feature absent from the mapping is
        written as ``0``; a feature whose value is ``None`` as ``-100``.

    Raises:
        KeyError: if a key named in ``ONE_HOT_LIST`` is missing from
            ``features_dict``.
    """
    # Work on a copy: the derived indicator features are an implementation
    # detail and must not leak into the caller's dictionary.
    features = dict(features_dict)
    for block in BLAND_BLOCKS:
        features['block_' + block] = int(path == block)
    for one_key, value_list in ONE_HOT_LIST:
        for one_value in value_list:
            features[one_key + '_' + str(one_value)] = int(features[one_key] == one_value)

    columns = [label, path, 1]
    for feature in FEATURES:
        feat_value = features.get(feature, 0)
        if feat_value is None:
            # Explicit "no value" marker, distinct from the 0 used for
            # features that are not present at all.
            feat_value = -100
        columns.append(feat_value)
    return '\t'.join('{}'.format(column) for column in columns)


def label_threshold(r, probability):
    """Tell whether a value passes the probability threshold.

    Args:
        r: value to test (``main`` passes a hash-derived number in [0, 1)).
        probability: inclusive upper bound for ``r``.

    Returns:
        ``True`` when ``r <= probability``, ``False`` otherwise.
    """
    # bool() keeps the return type a plain bool whatever the operands are.
    return bool(r <= probability)


def group_reducer(groups):
    """Turn every record of every group into a matrixnet-ready record.

    Records of a group are walked in the order they arrive (``main`` sorts
    them by ``position``). A record counts as clicked when the column named
    by its ``label_name`` field holds a value greater than zero.

    For every input record one output record is produced with:
        key: the input ``reqid``.
        subkey: the input ``subkey``.
        superior_relative: for a clicked record, the distances back to every
            earlier record of the group that was not clicked; otherwise
            an empty list.
        inferior_relative: for a non-clicked record, the distances back to
            earlier clicked records, each kept at random with probability
            ``0.5 ** (distance - 1)``; otherwise an empty list.
        value: the line built by ``matrixnet_prepare``.

    Args:
        groups: iterable of ``(key, records)`` pairs.

    Yields:
        ``Record`` objects as described above.
    """
    for _group_key, group_records in groups:
        clicked = set()
        for idx, row in enumerate(group_records):
            label_field = row.label_name
            if row[label_field] > 0:
                above = [idx - prev for prev in range(idx) if prev not in clicked]
                below = []
                clicked.add(idx)
            else:
                above = []
                # The closer the earlier click, the more likely it is kept.
                below = [
                    idx - hit for hit in clicked
                    if random.random() < 0.5 ** abs(idx - hit - 1)
                ]

            yield Record(
                key=row.reqid,
                subkey=row.subkey,
                superior_relative=above,
                inferior_relative=below,
                value=matrixnet_prepare(row.features_dict, row.path, row[label_field])
            )

def numerator(records):
    """Attach the stream's current row index to every record.

    Args:
        records: iterable of records that also exposes a ``row_index``
            attribute, read anew for each record (presumably maintained by
            nile while iterating -- verify against the job configuration).

    Yields:
        A ``Record`` built from the input record plus a ``row_ind`` field.
    """
    for row in records:
        current_index = records.row_index
        yield Record(row, row_ind=current_index)

def argument_parser(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default) the process command line is parsed, which is what
            a call without arguments has always done.

    Returns:
        ``argparse.Namespace`` with the attributes ``pool``, ``itable``,
        ``rtable``, ``label``, ``randomhash`` (parsed as ``str``) and
        ``prob`` (parsed as ``float``). An option that is not supplied is
        ``None``.
    """
    parser = argparse.ArgumentParser(description='Get parameters')
    # (flag, destination attribute, value type, help text)
    options = (
        ('-p', 'pool', str, 'you hahn pool'),
        ('-i', 'itable', str, 'input table'),
        ('-r', 'rtable', str, 'result table'),
        ('-l', 'label', str, 'label name'),
        ('-rh', 'randomhash', str, 'column for random hash'),
        ('-pr', 'prob', float, 'probabilistic threshold'),
    )
    for flag, dest, value_type, help_text in options:
        parser.add_argument(flag, dest=dest, type=value_type, help=help_text)
    return parser.parse_args(argv)


# ``f_list`` holds one feature name per line; lines starting with '#' are
# skipped. Blank lines (including the empty string produced by a trailing
# newline) are skipped too: indexing them with ``x[0]`` raises IndexError.
with open('f_list', 'r') as f_list:
    FEATURES = [
        x for x in f_list.read().split('\n')
        if x and not x.startswith('#')
    ]


def main():
    """Run the job that converts the input table into matrixnet lines.

    Steps, all driven by the command-line options:
        1. Add three columns to every row: ``random_p`` (hash of the ``-rh``
           column mapped onto [0, 1)), ``p_tresh`` (the ``-pr`` value) and
           ``label_name`` (the ``-l`` value).
        2. Keep the rows for which ``label_threshold(random_p, p_tresh)``
           holds.
        3. Group by ``uid`` and ``reqid``, sort each group by ``position``
           and reduce it with ``group_reducer``.
        4. Write the result to the ``-r`` table.
    """
    args = argument_parser()

    cluster = clusters.Hahn(pool=args.pool)
    job = cluster.job('HOME MATRIXNET PREPARE: {} feat'.format(len(FEATURES)))

    # Step 1: sampling and label columns next to all original columns.
    with_sampling_columns = job.table(args.itable).project(
        ne.all(),
        random_p=ne.custom(lambda x: (hash(x) % 1000000) / 1000000., args.randomhash),
        p_tresh=ne.const(args.prob),
        label_name=ne.const(args.label)
    )

    # Step 2: keep only the rows that pass the threshold.
    sampled = with_sampling_columns.filter(
        nf.custom(
            lambda r, p: label_threshold(r, p),
            'random_p',
            'p_tresh'
        )
    )

    # Steps 3 and 4: per-request reduction, then store the result.
    grouped = sampled.groupby('uid', 'reqid').sort('position')
    grouped.reduce(group_reducer).put(args.rtable)

    job.run()

# Run the job only when the file is executed as a script.
if __name__ == "__main__":
    main()

