#-*- coding: UTF-8 -*-
import nile
import argparse
from time import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from copy import deepcopy
import uatraits
import urllib
from datetime import datetime as dt, timedelta
import os
import sys
import codecs
from random import random
import hashlib

# Split configuration consumed by calc_split().
#
# Each top-level entry is a "chain"; a key is selected by a chain when it
# passes every link listed under the chain's "split_path".  For one link,
# calc_split_link() hashes the key with the link's "salt" into one of
# "num_slots" slots and accepts it when that slot is in "child_slots".
# The remaining fields ("testid", "restrictions_flow", "path_hash",
# "granularities") are not read by the code in this file.
# NOTE(review): presumably exported from the experiments configuration --
# confirm the source before editing values by hand.
SPLIT_PATH = [
    {
        "testid": "153585",
        "restrictions_flow": {
            "services": "searchapp,video,web",
            "regions": "225",
            "percent": 10.000000000000002
        },
        "split_path": [
            {
                "salt": "7fe99848711919f2f3d1c1f4fd445742",
                "path_hash": "0cfdde01c4f0ab01947bd1ace2e01b27",
                "num_slots": 500,
                "hash_type": 4,
                "granularities": [],
                # Slots 0..474 inclusive (475 of 500).
                "child_slots": list(range(475))
            },
            {
                "salt": "6cc4d12688c954f38a63f9bd28744d03",
                "path_hash": "4bed5b6453526297b0ccc4bc277acc91",
                "num_slots": 95,
                "hash_type": 4,
                "granularities": [],
                "child_slots": [
                    0, 2, 3, 5, 6, 7, 8, 9, 10, 11,
                    13, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                    25, 28, 29, 30, 31, 33, 35, 36, 37, 38,
                    40, 42, 43, 44, 46, 47, 50, 51, 52, 53,
                    54, 55, 56, 57, 59, 60, 61, 62, 63, 64,
                    66, 67, 69, 71, 73, 74, 75, 76, 77, 79,
                    82, 83, 84, 85, 86, 88, 90, 91, 92, 93
                ]
            },
            {
                "salt": "284ef124ad3ff490333ee7e7dd5f3387",
                "path_hash": "b4e710471a293f91a83d027b226214d5",
                "num_slots": 140,
                "hash_type": 4,
                "granularities": [],
                "child_slots": [
                    0, 4, 5, 6, 7, 9, 10, 11, 12, 13,
                    14, 15, 16, 18, 20, 21, 22, 23, 24, 25,
                    26, 27, 28, 29, 30, 31, 32, 33, 35, 36,
                    37, 38, 39, 40, 41, 43, 45, 46, 47, 48,
                    49, 50, 51, 52, 54, 59, 62, 63, 64, 65,
                    66, 67, 68, 70, 71, 72, 73, 75, 76, 77,
                    79, 80, 81, 82, 83, 84, 85, 88, 90, 91,
                    92, 93, 94, 95, 96, 97, 98, 99, 100, 101,
                    102, 103, 104, 105, 107, 108, 110, 111, 112, 113,
                    114, 115, 117, 118, 120, 121, 122, 125, 128, 129,
                    130, 131, 132, 133, 134, 135, 136, 137, 138, 139
                ]
            },
            {
                "salt": "32a939a4ef3e571183659825a7cf70b7",
                "path_hash": "cad057bb52c0fa46da98778a4b52b697",
                "num_slots": 110,
                "hash_type": 4,
                "granularities": [],
                "child_slots": [
                    62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
                    72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
                    82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
                    92, 93, 94, 95, 98, 99, 106, 107, 108, 109
                ]
            },
            {
                "salt": "472699131f55ad9ef44af7aef8099bfc",
                "path_hash": "09bad6d4b2427d61cc3f3d6d6d060174",
                "num_slots": 2,
                "hash_type": 4,
                "granularities": [],
                # Must be a list of ints like every other link: an int slot
                # is tested for membership with `in`.
                "child_slots": [1]
            }
        ]
    }
]

def _to_bytes(value):
    """Return ``value`` unchanged if it is bytes, else its UTF-8 encoding."""
    if isinstance(value, bytes):
        return value
    return value.encode("utf-8")


def calc_slot(data, salt, num_slots):
    """Hash ``data`` together with ``salt`` into a slot number.

    The MD5 digest of ``data + salt`` is taken, its first eight bytes are
    read as a little-endian unsigned integer, and the result is reduced
    modulo ``num_slots``.

    Args:
        data: identifier to hash, as bytes or text.
        salt: salt appended to ``data`` before hashing, as bytes or text.
        num_slots: number of slots; must be non-zero.

    Returns:
        Integer slot in ``range(num_slots)``.

    Raises:
        ZeroDivisionError: if ``num_slots`` is 0.
    """
    # hashlib only accepts bytes; normalising both parts also lets callers
    # mix a bytes key with a text salt (or vice versa).
    hex_digest = hashlib.md5(_to_bytes(data) + _to_bytes(salt)).hexdigest()
    # Reverse the digest byte-by-byte (two hex chars per byte); the tail of
    # the reversed string is then the first eight digest bytes, little-endian.
    byte_pairs = [hex_digest[i:i + 2] for i in range(0, 32, 2)]
    reversed_hex = ''.join(reversed(byte_pairs))
    return int(reversed_hex[16:], 16) % num_slots


def calc_split_link(data, hash_type, link):
    """Tell whether ``data`` passes a single split ``link``.

    A link whose ``"hash_type"`` differs from ``hash_type`` never matches.
    Otherwise ``data`` is hashed with the link's ``"salt"`` into one of
    ``"num_slots"`` slots and matches when that slot is listed in the
    link's ``"child_slots"``.

    Args:
        data: key to test.
        hash_type: hash type the caller is evaluating.
        link: dict with ``hash_type``, ``salt``, ``num_slots`` and
            ``child_slots`` entries.

    Returns:
        True if the link accepts ``data``, False otherwise.
    """
    if link["hash_type"] == hash_type:
        bucket = calc_slot(data, link["salt"], link["num_slots"])
        return bucket in link["child_slots"]
    return False


def calc_split(split_path, hash_type, uid):
    """Tell whether ``uid`` is selected by at least one chain.

    A chain selects ``uid`` when every link in its ``"split_path"`` accepts
    it (a chain with no links therefore selects everything).  Chains are
    tried in order and evaluation stops at the first one that matches.

    Args:
        split_path: iterable of chain dicts, each with a ``"split_path"``
            list of links.
        hash_type: hash type passed through to ``calc_split_link``.
        uid: key to test.

    Returns:
        True if some chain selects ``uid``, False otherwise.
    """
    return any(
        all(calc_split_link(uid, hash_type, link) for link in chain["split_path"])
        for chain in split_path
    )

def calc_key(key):
    """Return True when ``key`` is NOT selected by ``SPLIT_PATH``.

    Keys that parse as an integer (presumably yandexuids) are checked with
    hash type ``1 << 1``; every other key is checked with ``1 << 6``.

    Args:
        key: identifier to test.

    Returns:
        ``not calc_split(SPLIT_PATH, hash_type, key)``.
    """
    try:
        int(key)
    except (TypeError, ValueError, OverflowError):
        # Not integer-like: use the hash type for non-numeric identifiers.
        hash_type = 1 << 6
    else:
        hash_type = 1 << 1
    # NOTE(review): every link in SPLIT_PATH declares "hash_type": 4, which
    # equals neither 1 << 1 nor 1 << 6, so calc_split_link() rejects every
    # link and this function returns True for all keys -- confirm whether
    # the comparison or the constants are the intended ones.
    return not calc_split(SPLIT_PATH, hash_type, key)

# Table that main() inner-joins with the input (input `uid` == table `id`);
# its `target_id` column is projected as `puid`.
CRYPTA_YUID_PUID_TABLE = "//home/crypta/public/matching/by_id/yandexuid/direct/puid"

def main():
    """Build the puid table for the given uid table and optionally split it.

    Command-line arguments:
        --input_table: table with a ``uid`` column (required).
        --testers_table: optional table concatenated with the joined result.
        --output_table: destination table (required).
        --need_salt: non-zero to keep only uids accepted by ``calc_key`` and
            store them in ``<input_table>_salted`` (required).
        --need_split: non-zero to additionally write
            ``<output_table>_first_part`` and ``<output_table>_second_part``
            (required).

    The input is inner-joined with ``CRYPTA_YUID_PUID_TABLE`` on
    ``uid == id``, ``target_id`` is exposed as ``puid``, records with a
    falsy ``puid`` are dropped and the result is sorted by ``puid``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--input_table', type=str, required=True)
    arg_parser.add_argument('--testers_table', type=str, default='')
    arg_parser.add_argument('--output_table', type=str, required=True)
    arg_parser.add_argument('--need_salt', type=int, required=True)
    arg_parser.add_argument('--need_split', type=int, required=True)
    args = arg_parser.parse_args()

    env_options = dict(
        parallel_operations_limit=10,
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            tentative_pool_trees=["cloud"]
        ),
        templates=dict(
            tmp_root='//tmp',
            title='PrepareUidsForBell'
        )
    )
    cluster = clusters.yt.Hahn().env(**env_options)

    job = cluster.job()
    uids = job.table(args.input_table)
    if args.need_salt:
        # Keep only the uids accepted by calc_key and persist them.
        salted_path = args.input_table + "_salted"
        uids = uids.filter(sf.custom(calc_key, 'uid')).put(salted_path)

    crypta = job.table(CRYPTA_YUID_PUID_TABLE)
    with_puid = uids.join(crypta, by_left='uid', by_right='id', type='inner')
    with_puid = with_puid.project(ne.all(), puid='target_id')

    # Testers, when supplied, go through the same filter/sort/put tail.
    if args.testers_table:
        candidates = job.concat(with_puid, job.table(args.testers_table))
    else:
        candidates = with_puid

    has_puid = sf.custom(lambda x: x, 'puid')
    candidates.filter(has_puid).sort('puid').put(args.output_table)

    job.run()

    if not args.need_split:
        return

    # Second job: the output table must already exist before it is split.
    split_job = cluster.job()
    coin_flip = sf.custom(lambda x: random() > 0.5, 'puid')
    first_part, second_part = split_job.table(args.output_table).split(coin_flip)
    first_part.put(args.output_table + "_first_part")
    second_part.put(args.output_table + "_second_part")
    split_job.run()

if __name__ == "__main__":
    main()
