from functools import partial

import yt.wrapper as yt

from rtcconf import config
from utils import mr_utils as mr
import re

def str_to_bool(s):
    """Convert the literal text 'true' / 'false' to the matching bool.

    Only these two exact lowercase spellings are accepted; any other value
    raises ValueError carrying the offending value as its argument.
    """
    if s not in ('true', 'false'):
        raise ValueError(s)
    return s == 'true'


# Column names of the workers dump, in positional order.  read_toloka_dump
# zips this list with the '|'-separated fields of every line, so the order
# here must match the order of the fields in the file.
toloka_workers_headers = [
    'toloka_id',
    'deleted',
    'default_email',
    'first_name',
    'last_name',
    'gender',
    'birth_day',
    'country',
    'city_id',
    'region_by_ip',
    'subscribe',
    'languages',
    'interface_lang',
    'balance',
    'education',
    'adult_allowed',
    'citizenship',
    'system_ban',
    'system_ban_date',
    'system_ban_expiration_date',
    'system_ban_reason',
    'paypal_email',
    'yamoney_account',
    'privatbank_last_name',
    'privatbank_first_name',
    'privatbank_phone_number',
    'privatbank_last_four_digits',
    'current_verification',
    'sanctions_status',
]


def read_toloka_dump(file_name):
    """Yield one record dict per data line of a '|'-separated workers dump.

    The first line of the file is skipped as a header.  Every other line is
    stripped and split on '|', and the fields are paired positionally with
    ``toloka_workers_headers`` (surplus fields are ignored, missing trailing
    fields are simply absent from the record).

    For every record:
      * the name, balance and privatbank name / last-four-digits columns are
        removed;
      * 'deleted', 'subscribe', 'adult_allowed' and 'system_ban' are
        converted from 'true' / 'false' to bool;
      * 'region_by_ip' is parsed from text like '[1, 2]' into a list of ints.

    Conversion is best-effort: when a column is missing or holds an
    unexpected value, the error and the raw line are printed and the record
    is still yielded with the remaining columns left unconverted.

    :param file_name: path of the dump file to read.
    :return: generator of dicts keyed by column name.
    """
    dropped_columns = (
        'first_name', 'last_name',
        'privatbank_first_name', 'privatbank_last_four_digits',
        'privatbank_last_name', 'balance',
    )
    bool_columns = ('deleted', 'subscribe', 'adult_allowed', 'system_ban')

    with open(file_name) as f:
        # Iterate the file object itself so the dump is streamed line by line
        # instead of being loaded into memory by readlines().
        for rec_num, line in enumerate(f):
            if rec_num == 0:
                continue
            if rec_num % 1000 == 0:
                # Single-argument parenthesised form prints the same text
                # under both the print statement and the print function.
                print(rec_num)

            line = line.strip()
            rec = dict(zip(toloka_workers_headers, line.split('|')))

            # Remove these columns unconditionally and before any conversion:
            # with `del` inside the try block, a short or malformed line
            # aborted the removal half-way and the remaining columns ended up
            # in the yielded record.
            for column in dropped_columns:
                rec.pop(column, None)

            try:
                for column in bool_columns:
                    rec[column] = str_to_bool(rec[column])
                rec['region_by_ip'] = [
                    int(reg)
                    for reg in rec['region_by_ip'].strip('[]').split(', ')
                    if reg
                ]
            except (KeyError, ValueError) as e:
                # KeyError: column absent on a short line; ValueError: value
                # is not 'true'/'false' or not an integer.
                print('%s %s' % (e, line))

            yield rec


def read_toloka_mapping(file_name):
    """Yield ``{'toloka_id': ..., 'puid': ...}`` dicts from a tab-separated file.

    The first line of the file is skipped as a header.  For every other line
    the first field, with surrounding double quotes removed, becomes
    'toloka_id'; the second field, with every non-digit character removed,
    becomes 'puid' (kept as a string).

    Blank lines are skipped silently.  Lines with fewer than two
    tab-separated fields are reported on stdout and skipped, so a stray
    trailing newline or a truncated row no longer aborts the whole read
    with IndexError.

    :param file_name: path of the mapping file to read.
    :return: generator of two-key dicts.
    """
    # Compiled once; the pattern is applied to every line.
    non_digits = re.compile(r"\D")

    with open(file_name) as f:
        # Stream the file instead of materialising it with readlines().
        for rec_num, line in enumerate(f):
            if rec_num == 0:
                continue
            if rec_num % 1000 == 0:
                print(rec_num)

            line = line.strip()
            if not line:
                continue

            fields = line.split('\t')
            if len(fields) < 2:
                print('malformed mapping line %d: %r' % (rec_num, line))
                continue

            toloka_id = fields[0].strip('"')
            puid = non_digits.sub("", fields[1])
            yield {'toloka_id': toloka_id, 'puid': puid}


if __name__ == '__main__':
    # One-off upload: write the workers dump and the toloka_id -> puid mapping
    # to YT, sort both by toloka_id, reduce them into a "-puids" table and
    # sort that result by puid.
    yt.config.set_proxy(config.MR_SERVER)

    toloka_dir = config.GRAPH_YT_DICTS_FOLDER + 'toloka'
    workers_table = config.GRAPH_YT_DICTS_FOLDER + 'toloka/workers_2016-10-26'
    uids_table = config.GRAPH_YT_DICTS_FOLDER + 'toloka/uids'
    workers_puids_table = config.GRAPH_YT_DICTS_FOLDER + 'toloka/workers_2016-10-26-puids'

    mr.mkdir(toloka_dir)

    # Upload both local files as structured (non-raw) rows.
    workers_rows = read_toloka_dump('/home/artembelov/workers_2016-10-26.csv')
    yt.write_table(workers_table, workers_rows, raw=False)

    mapping_rows = read_toloka_mapping('/home/artembelov/uids.csv')
    yt.write_table(uids_table, mapping_rows, raw=False)

    # Both inputs are sorted by the reduce key before the reduce runs.
    mr.sort_all([uids_table, workers_table], sort_by='toloka_id')

    # NOTE(review): presumably keeps worker rows that have a match in the uids
    # table and copies 'puid' onto them -- confirm against
    # mr_utils.filter_left_by_right.
    join_puid = partial(mr.filter_left_by_right, right_columns_to_join=['puid'])
    yt.run_reduce(join_puid,
                  [workers_table, uids_table],
                  workers_puids_table,
                  reduce_by='toloka_id')

    yt.run_sort(workers_puids_table, sort_by='puid')

