#!/usr/bin/python
# coding=utf-8

import luigi
import yt.wrapper as yt
from utils import utils
from lib.luigi import yt_luigi
from functools import partial
from rtcconf import config


# Cut-off for `is_org_score`: an id is flagged as an organization only when
# its score is strictly greater than this value (see
# reduce_classify_org_emails).
THRESHOLD = 0.919


def email_to_login(rec):
    """Mapper: re-key a classified record by its login.

    Emits a single row whose ``id_value`` is the record's ``login`` and
    which carries over the record's ``organization`` value; every other
    column of the input is dropped.
    """
    login_row = dict(id_value=rec['login'], organization=rec['organization'])
    yield login_row


def set_orgs_for_logins(_, recs):
    """Reducer: copy the ``organization`` value onto the other rows of a key.

    Rows with ``@table_index`` 0 are consumed and only remembered (the most
    recently seen ``organization`` wins). Every other row gets that value
    written into its ``organization`` column, has ``@table_index`` reset
    to 0 and is emitted. If no table-0 row was seen yet, 0 is used.

    The reduce key (first argument) is not used.
    """
    org_flag = 0
    for row in recs:
        if row['@table_index'] != 0:
            # Row to be labelled: stamp it in place and route it to output 0.
            row['organization'] = org_flag
            row['@table_index'] = 0
            yield row
        else:
            org_flag = row['organization']


def login_to_email(rec):
    """Mapper: turn a login record into an email-keyed record.

    The record's ``<login type>_<fp source type>_orig`` column is indexed
    by the record's ``id_value`` to obtain the original login, which is
    passed through ``utils.login_to_email``. The emitted row has the
    resulting email as ``id_value`` and the input ``id_value`` as ``login``.
    """
    originals_column = '_'.join((config.ID_TYPE_LOGIN, config.ID_SOURCE_TYPE_FP, 'orig'))
    originals = rec[originals_column]
    login = rec['id_value']
    yield {'id_value': utils.login_to_email(originals[login]), 'login': login}


def reduce_classify_org_emails(_, recs, threshold):
    """Reducer: label the rows of a key with an ``organization`` flag.

    Rows with ``@table_index`` 0 carry an ``is_org_score``; once any of them
    is strictly greater than ``threshold`` the flag becomes 1 and stays 1
    for the rest of the key. All other rows receive the current flag in
    their ``organization`` column, have ``@table_index`` reset to 0 and are
    emitted. Without a qualifying score the flag is 0.

    The reduce key (first argument) is not used.
    """
    org_flag = 0
    for row in recs:
        if row['@table_index'] != 0:
            # Row to be labelled: stamp it in place and route it to output 0.
            row['organization'] = org_flag
            row['@table_index'] = 0
            yield row
        elif row['is_org_score'] > threshold:
            org_flag = 1


def classify_org_emails(source_folder, classification_result, target_folder, from_login):
    """Add an ``organization`` column to one of the ``yuid_with_id_*`` tables.

    :param source_folder: folder prefix holding the input ``yuid_with_id_*`` table
    :param classification_result: table with ``is_org_score`` rows, reduced
        together with the input by ``id_value``
    :param target_folder: folder prefix for the temporary and resulting tables
    :param from_login: when false, the email table is joined with the
        classification directly; when true, the login (fp source) table is
        first mapped to emails, classified, mapped back to logins and then
        joined with the original login table
    """
    if from_login:
        source = config.ID_TYPE_LOGIN + '_' + config.ID_SOURCE_TYPE_FP
    else:
        source = config.ID_TYPE_EMAIL
    dict_table_name = 'yuid_with_id_' + source
    input_dict_table = source_folder + dict_table_name

    login_email_table = None
    if from_login:
        # Logins are classified through the emails derived from them.
        login_email_table = target_folder + "tmp_login_to_email"
        yt.run_map(login_to_email, input_dict_table, login_email_table)
        yt.run_sort(login_email_table, sort_by='id_value')
        reduce_input = login_email_table
    else:
        reduce_input = input_dict_table

    with yt.Transaction():
        # can't map directly to yuid_with_id_X, it breaks sorting in schema
        staging_table = target_folder + dict_table_name + '_tmp'
        classifier = partial(reduce_classify_org_emails, threshold=THRESHOLD)
        yt.run_reduce(classifier,
                      [classification_result, reduce_input],
                      staging_table,
                      reduce_by='id_value')
        if from_login:
            # Re-key the classified emails by login, then spread the flag
            # over the rows of the original login table.
            yt.run_map(email_to_login, staging_table, staging_table)
            yt.run_sort(staging_table, sort_by='id_value')
            yt.run_reduce(set_orgs_for_logins,
                          [staging_table, input_dict_table],
                          staging_table,
                          reduce_by='id_value')
            yt.remove(login_email_table)
        yt.run_sort(staging_table,
                    target_folder + dict_table_name,
                    sort_by='id_value')
        yt.remove(staging_table)


class OrgEmailsClassifyTask(yt_luigi.BaseYtTask):
    """Luigi task that runs ``classify_org_emails`` for the email dict table
    and then for the login (fp source) dict table.

    Input and output both live in ``config.GRAPH_YT_DICTS_FOLDER``.
    """
    # TODO: make this task transactional so that it doesn't break yuid_with_all dict sorting
    date = luigi.Parameter()

    def input_folders(self):
        """Return the folders/tables read by the task, keyed by alias."""
        folders = {}
        folders['dict_f'] = config.GRAPH_YT_DICTS_FOLDER
        folders['classification_result'] = config.CRYPTA_IDS_STORAGE + "email/email_org_classification"
        return folders

    def output_folders(self):
        """Return the folders written by the task, keyed by alias."""
        return dict(dict_f=config.GRAPH_YT_DICTS_FOLDER)

    def requires(self):
        """Depend on ``YuidAllIdDictsTask`` for the same date."""
        # Imported at call time rather than at module level
        # (presumably to avoid a circular import - TODO confirm).
        from matching.yuid_matching import graph_dict
        upstream = graph_dict.YuidAllIdDictsTask(self.date)
        return [upstream]

    def run(self):
        """Classify the email table first, then the login table."""
        for from_login in (False, True):
            classify_org_emails(self.in_f('dict_f'),
                                self.in_f('classification_result'),
                                self.out_f('dict_f'),
                                from_login=from_login)

    def output(self):
        """Return one ``YtDateColumnTarget`` on the ``organization`` column
        (built with the task date) per produced table: email, then login.
        """
        table_prefix = self.out_f('dict_f') + 'yuid_with_id_'
        sources = (config.ID_TYPE_EMAIL,
                   config.ID_TYPE_LOGIN + '_' + config.ID_SOURCE_TYPE_FP)
        tables = [table_prefix + source for source in sources]
        targets = []
        for table in tables:
            targets.append(yt_luigi.YtDateColumnTarget(table, 'organization', self.date))
        return targets


if __name__ == '__main__':
    # Ad-hoc run against a personal sandbox folder, bypassing luigi.
    # NOTE(review): `MR_SAERVER` looks like a misspelling of `MR_SERVER` -
    # confirm against rtcconf.config before changing.
    yt.config.set_proxy(config.MR_SAERVER)

    # Input dicts, classification result and output all share one folder here.
    in_f = in_class = out_f = '//home/crypta/team/gengo/CRYPTR-169/'
    for from_login in (False, True):
        classify_org_emails(in_f, in_class, out_f, from_login=from_login)

    # To run through luigi instead (with dt = sys.argv[1]):
    # luigi.run(['OrgEmailsClassifyTask', '--date', dt,  '--workers', '1', '--local-scheduler', '--no-lock'])
