import os
import shutil
import re
from datetime import datetime, timedelta

import yt.wrapper as yt

from rtcconf import config
from utils import mr_utils as mr

#from graph_all import GraphAllTask
from lib.luigi.yt_luigi import ExternalInput
import json

from random import randint, random
import functools
import collections


def reliable(times):
    """Build a decorator that retries the wrapped callable up to ``times`` times.

    Every attempt calls the function with the original arguments; the result
    of the first successful attempt is returned. A failed attempt is reported
    on stdout (together with the error) and the call is retried.

    Only ``Exception`` subclasses trigger a retry. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate immediately so the retry loop can be interrupted.

    :param times: maximum number of attempts.
    :returns: a decorator; the decorated function keeps the name and
        docstring of the original.
    :raises Exception: when no attempt succeeded (including ``times <= 0``).
    """
    def wrapper(f):
        @functools.wraps(f)
        def func(*args, **kwargs):
            for i in range(times):
                try:
                    return f(*args, **kwargs)
                except Exception as err:
                    # Report the error instead of dropping it silently.
                    print('%s failed %s: %r' % (f.__name__, str(i), err))
            raise Exception('Failed %s after trying %s times' % (f.__name__, str(times)))
        return func
    return wrapper


def get_task_ext_deps(task):
    """Collect the tables of every ExternalInput reachable from ``task``.

    The result of ``task.requires()`` is walked recursively. A requirement
    that is an ``ExternalInput`` contributes its ``table`` attribute; any
    other requirement is descended into through its own ``requires()``.

    :param task: object exposing ``requires()``, which returns either a
        single requirement or an iterable of requirements.
    :returns: list of tables in traversal order (duplicates are kept).
    """
    # Evaluate requires() once per task instead of once per use.
    required = task.requires()
    # A single (non-iterable) requirement is handled like a one-element list.
    children = required if hasattr(required, '__iter__') else [required]

    deps = []
    for dep in children:
        if isinstance(dep, ExternalInput):
            deps.append(dep.table)
        else:
            deps.extend(get_task_ext_deps(dep))
    return deps


def identity_mapper(rec):
    """Mapper that emits every record unchanged, tagged for output table 0.

    The record is modified in place: its ``'@table_index'`` key is set to 0.
    """
    rec.update({'@table_index': 0})
    yield rec


def mk_table_filter(key, vals, yamr_field=''):
    """Build a mapper keeping records whose ``key`` value is one of ``vals``.

    :param key: name of the field to test.
    :param vals: iterable of accepted values.
    :param yamr_field: when non-empty, the value is extracted with
        ``mr.get_field_value(key, rec[yamr_field])`` instead of being read
        directly from the record.
    :returns: generator function; matching records are emitted with
        ``'@table_index'`` set to 0.
    """
    accepted = set(vals)

    def mapper(rec):
        if not yamr_field:
            found = rec.get(key)
        else:
            found = mr.get_field_value(key, rec[yamr_field])
        # A missing value never matches, even if None is among the values.
        if found is None or found not in accepted:
            return
        rec['@table_index'] = 0
        yield rec

    return mapper


def mk_passport_filter(logins, puids):
    """Build a mapper keeping records that match by ``uid`` or by ``login``.

    A record is kept when its ``'uid'`` is in ``puids`` or its ``'login'`` is
    in ``logins``; a missing field is treated as the empty string. Kept
    records get ``'@table_index'`` set to 0.
    """
    known_logins = set(logins)
    known_puids = set(puids)

    def mapper(rec):
        login = rec.get('login', '')
        puid = rec.get('uid', '')
        if puid not in known_puids and login not in known_logins:
            return
        rec['@table_index'] = 0
        yield rec

    return mapper


def mk_dev_info_filter(devids):
    """Build a mapper keeping records that mention one of ``devids``.

    Three fields are extracted from ``rec['value']`` with
    ``mr.get_field_value``: ``mmetric_devids``, ``google_adv_id`` and
    ``android_id``. The record is emitted once if any of them is in
    ``devids``.

    :param devids: iterable of device ids; it is consumed once, up front, so
        one-shot iterables work and each record costs constant-time lookups.
    :returns: generator function over records (records are not modified).
    """
    devids_set = set(devids)

    def mapper(rec):
        value = rec['value']
        # presumably get_field_value returns hashable values (strings or
        # None) - verify against mr_utils.
        candidates = (
            mr.get_field_value('mmetric_devids', value),
            mr.get_field_value('google_adv_id', value),
            mr.get_field_value('android_id', value),
        )
        if any(candidate in devids_set for candidate in candidates):
            yield rec

    return mapper


def mk_eal_filter(yuids):
    """Build a mapper keeping records whose cookie id is one of ``yuids``.

    The id is the run of digits captured from ``rec['cookies']`` by the
    pattern below. Matching records are emitted with ``'@table_index'`` set
    to 0; records without the cookie are dropped.

    :param yuids: iterable of ids given as strings of digits.
    """
    yuids_set = set(yuids)
    # Raw string: '\s' and '\d' are invalid escapes in a plain literal.
    # NOTE(review): the cookie name is spelled "yandexyuid" here, while other
    # filters in this file look for "yandexuid" - confirm which is intended.
    cookie_pattern = r'\s*yandexyuid=(\d+)'

    def mapper(rec):
        m = re.search(cookie_pattern, rec['cookies'])
        if m and m.group(1) in yuids_set:
            rec['@table_index'] = 0
            yield rec

    return mapper


def mk_passport_profiles_filter(yuids, puids):
    """Build a mapper keeping profile records matched by yuid or by puid.

    The yuid is read from ``rec['key']`` and the puid from the ``ext_id``
    field of ``rec['value']`` (via ``mr.get_field_value``). A record is kept,
    with ``'@table_index'`` set to 0, when either id is in the given sets.
    """
    wanted_yuids = set(yuids)
    wanted_puids = set(puids)

    def mapper(rec):
        yuid = rec['key']
        puid = mr.get_field_value('ext_id', rec['value'])
        if yuid not in wanted_yuids and puid not in wanted_puids:
            return
        rec['@table_index'] = 0
        yield rec

    return mapper


def dump_table(table, filename, append=False):
    """Write every record of a YT table to a local file, one per line.

    Records are read raw in JSON format and written as-is, each followed by
    a newline character.

    :param table: YT table path to read.
    :param filename: local destination; missing parent directories are
        created.
    :param append: append to an existing file instead of truncating it.
    """
    dirpath = os.path.dirname(filename)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if dirpath and not os.path.exists(dirpath):
        os.makedirs(dirpath)

    mode = 'ab' if append else 'wb'

    with open(filename, mode) as f:
        for rec in yt.read_table(table, format=yt.JsonFormat(), raw=True):
            f.write(rec)
            f.write('\n')


#@reliable(10)
def save_test_data(table, tmp_map_dir, data_dir, rec_filter):
    """Filter one YT table with ``rec_filter`` and dump the result locally.

    :param table: source YT table path.
    :param tmp_map_dir: prefix for the intermediate YT table; the source
        table's base name is appended to it.
    :param data_dir: local root directory for the dump.
    :param rec_filter: mapper passed to ``yt.run_map``.
    :returns: path of the written local file.
    """
    # The first two characters of the table path are dropped to form the
    # local relative path.
    local_path = os.path.join(data_dir, table[2:])
    filtered_tbl = tmp_map_dir + os.path.basename(table)
    yt.run_map(rec_filter, table, filtered_tbl)
    dump_table(filtered_tbl, local_path)
    return local_path


#@reliable(10)
def save_test_data_daterange(table_folder, start_date, end_date, tmp_map_dir, data_dir, rec_filter, append=False):
    """Filter and dump one daily table per day in ``[start_date, end_date)``.

    For every day the table ``<table_folder>/<YYYY-MM-DD>`` is run through
    ``rec_filter`` into a numbered intermediate YT table and then dumped to
    ``<data_dir>/<folder path without its first two characters>/<YYYY-MM-DD>``.

    :param table_folder: YT folder with daily tables (trailing '/' optional).
    :param start_date: first day, inclusive (datetime).
    :param end_date: last day, exclusive (datetime).
    :param tmp_map_dir: prefix for the intermediate YT tables.
    :param data_dir: local root directory for the dumps.
    :param rec_filter: mapper passed to ``yt.run_map``.
    :param append: append to existing local files instead of overwriting.
    :returns: list of written local file paths, in date order.
    """
    folder = table_folder if table_folder.endswith('/') else table_folder + '/'

    basename = os.path.basename(folder[:-1])
    dirpath = os.path.join(data_dir, folder[2:-1])
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    files = []
    day = start_date
    while day < end_date:
        day_name = day.strftime('%Y-%m-%d')
        filepath = os.path.join(dirpath, day_name)
        # Intermediate tables are numbered by the count of days done so far.
        tbl_to_dump = '%s%s_%02d' % (tmp_map_dir, basename, len(files))
        yt.run_map(rec_filter, folder + day_name, tbl_to_dump)
        dump_table(tbl_to_dump, filepath, append)
        files.append(filepath)
        day = day + timedelta(days=1)

    return files


def mk_hh_filter(ids):
    """Build a mapper keeping records that contain one of ``ids``.

    ``rec['value']`` is split on ',' and the second '/'-separated piece of
    every item is taken as an id. The record is emitted (unmodified) when at
    least one such id is in ``ids``.
    """
    wanted = set(ids)

    def mapper(rec):
        hhids = {item.split('/')[1] for item in rec['value'].split(',')}
        if not hhids.isdisjoint(wanted):
            yield rec

    return mapper


def mk_fp_filter(yuids, deviceids):
    """Build a mapper keeping records matched by device id or by yuid.

    Both ids are extracted from ``rec['key']`` with ``mr.get_field_value``
    (fields ``deviceid`` and ``yandexuid``). A record is emitted, unmodified,
    when either id is in the corresponding set.
    """
    wanted_yuids = set(yuids)
    wanted_deviceids = set(deviceids)

    def mapper(rec):
        key = rec['key']
        deviceid = mr.get_field_value('deviceid', key)
        yuid = mr.get_field_value('yandexuid', key)
        if deviceid not in wanted_deviceids and yuid not in wanted_yuids:
            return
        yield rec

    return mapper


def mk_iscrypta_filter(yuids, deviceids, yuid_field='yuid', deviceid_field='deviceid'):
    """Build a mapper keeping records matched by device id or by yuid.

    :param yuids: iterable of accepted yuid values.
    :param deviceids: iterable of accepted device id values.
    :param yuid_field: record key holding the yuid.
    :param deviceid_field: record key holding the device id.
    :returns: generator function; a record is emitted (unmodified) when
        either field is present, not None, and in the accepted values.
    """
    # Build the lookup sets once: list membership would be linear per record,
    # and a one-shot iterable would be exhausted by the first record.
    yuids_set = set(yuids)
    deviceids_set = set(deviceids)

    def mapper(rec):
        deviceid = rec.get(deviceid_field)
        yuid = rec.get(yuid_field)
        if (deviceid is not None and deviceid in deviceids_set) or (yuid is not None and yuid in yuids_set):
            yield rec

    return mapper


def mk_sbapi_filter(yuids):
    """Build a mapper keeping records whose request carries a known yuid.

    The first ``yandexuid=<digits>`` occurrence in ``rec['request']`` is
    compared against ``yuids``; matching records are emitted unmodified.
    """
    wanted = set(yuids)

    def mapper(rec):
        found = re.search(r'yandexuid=(\d+)', rec['request'])
        if found is None:
            return
        if found.group(1) in wanted:
            yield rec

    return mapper


def mk_random_sampler(param):
    """Build a mapper that keeps each record with probability ``param``.

    A fresh ``random()`` value is drawn for every record; the record is
    emitted, with ``'@table_index'`` set to 0, when the value is below
    ``param``.
    """
    def mapper(rec):
        if not (random() < param):
            return
        rec['@table_index'] = 0
        yield rec

    return mapper


def mk_access_log_filter():
    """Build a mapper that randomly samples two kinds of access-log records.

    Records for which ``extract_yuid`` finds nothing in the cookies are
    dropped. For the rest a single random number is drawn and the record is
    emitted when it is at most 0.00005 and either

    * the request starts with ``/searchapp/jl`` and ``extract_uuid`` finds a
      uuid in it, or
    * ``canonized_vhost`` ends with ``yandex.com.tr`` and ``extract_yu``
      finds a value in the result of ``extract_yp`` over the cookies.

    NOTE(review): the two checks are independent, so a record that satisfies
    both is emitted twice - confirm this is intended.
    """
    import random
    from graph_access import extract_uuid, extract_yuid, extract_yu, extract_yp

    def mapper(rec):
        cookies = rec.get('cookies', '')
        if not extract_yuid(cookies):
            return

        request = rec.get('request', '')
        # One draw per record, shared by both checks below.
        rnd = random.random()

        if request.startswith('/searchapp/jl') and extract_uuid(request) and rnd <= 0.00005:
            yield rec

        vhost = rec.get('canonized_vhost', '')
        if vhost and vhost.endswith('yandex.com.tr'):
            if extract_yu(extract_yp(cookies)) and rnd <= 0.00005:
                yield rec

    return mapper


def convert_iscrypta_log():
    """Rewrite device ids in the local iscrypta-matching-log dumps.

    Works on three fixed dates under the module-level ``data_dir``:

    1. Builds a ``DeviceID`` -> replacement map from the metrika-mobile-log
       dumps; ``ADVID`` wins over ``OriginalDeviceID`` when a record has both.
    2. Writes every iscrypta-matching-log record to ``<date>_2`` with its
       ``deviceid`` replaced when it is in the map. Records without a
       ``deviceid`` key are copied unchanged.
    3. Moves every ``<date>_2`` file over the original ``<date>`` file.

    Blank input lines are skipped; output records are separated by an empty
    line. All files are closed before the move, so the moved files are
    complete.
    """
    dates = ['2016-04-09', '2016-04-10', '2016-04-11']

    dmap = {}
    for dt in dates:
        with open(data_dir + '/statbox/metrika-mobile-log/' + dt) as mm_log:
            for l in mm_log:
                l = l.strip()
                if not l:
                    continue
                rec = json.loads(l)
                deviceid = rec.get('DeviceID', '')
                idfa = rec.get('OriginalDeviceID', '')
                google_adv_id = rec.get('ADVID', '')
                if idfa:
                    dmap[deviceid] = idfa
                if google_adv_id:
                    dmap[deviceid] = google_adv_id

    for dt in dates:
        # The output file is opened first, as before, so it exists even when
        # the input turns out to be missing.
        with open(data_dir + '/statbox/iscrypta-matching-log/' + dt + '_2', 'w') as dst, \
                open(data_dir + '/statbox/iscrypta-matching-log/' + dt) as src:
            for l in src:
                l = l.strip()
                if not l:
                    continue
                rec = json.loads(l)
                deviceid = rec.get('deviceid')
                if deviceid in dmap:
                    rec['deviceid'] = dmap[deviceid]
                json.dump(rec, dst)
                dst.write('\n\n')

    # Originals are replaced only after every date was converted.
    for dt in dates:
        shutil.move(data_dir + '/statbox/iscrypta-matching-log/' + dt + '_2',
                    data_dir + '/statbox/iscrypta-matching-log/' + dt)

def get_mm_field(data_dir, field_name):
    """Collect the distinct non-empty values of one field from local dumps.

    Every file in ``<data_dir>/statbox/metrika-mobile-log`` is read line by
    line; each non-blank line is parsed as a JSON object.

    :param data_dir: local root directory of the test data.
    :param field_name: key to look up in every record.
    :returns: set of truthy values found under ``field_name``.
    """
    mmetrica_dir = os.path.join(data_dir, *('statbox/metrika-mobile-log'.split('/')))
    vals = set()
    for fname in os.listdir(mmetrica_dir):
        with open(os.path.join(mmetrica_dir, fname)) as f:
            for raw in f:
                line = raw.rstrip()
                if not line:
                    continue
                val = json.loads(line).get(field_name, '')
                if val:
                    vals.add(val)
    return vals

# Convenience readers over get_mm_field, each bound to one field name of the
# local metrika-mobile-log dumps. Call them as reader(data_dir); each returns
# the set of distinct non-empty values of its field.
read_uuids = functools.partial(get_mm_field, field_name='UUID')
# NOTE(review): 'OriginalDeviceID' is exposed as "idfa" here - confirm the
# field only ever carries an IDFA.
read_idfa = functools.partial(get_mm_field, field_name='OriginalDeviceID')
read_google_adv_id = functools.partial(get_mm_field, field_name='ADVID')
def read_devids(data_dir):
    """Return the ids present in both the IDFA and the Google adv id sets.

    NOTE(review): this is a set intersection, so only values that occur under
    both 'OriginalDeviceID' and 'ADVID' are returned - confirm a union was
    not intended.
    """
    idfas = read_idfa(data_dir)
    google_adv_ids = read_google_adv_id(data_dir)
    return idfas & google_adv_ids

class map_rtb_log(object):
    """Callable mapper keeping records whose query argument '337' is known.

    ``rec['queryargs']`` is parsed as '&'-separated ``name=value`` pairs;
    items without '=' are ignored and a repeated name keeps its last value.
    A record is emitted, with ``'@table_index'`` set to 0, when the value of
    argument '337' is one of the ids given to the constructor.
    """

    def __init__(self, ids):
        self.ids = set(ids)

    def __call__(self, rec):
        params = {}
        for arg in rec['queryargs'].split('&'):
            pieces = arg.split('=', 1)
            if len(pieces) == 2:
                params[pieces[0]] = pieces[1]

        if '337' not in params or params['337'] not in self.ids:
            return
        rec['@table_index'] = 0
        yield rec

def remove_all_non_ascii(s):
    """Recursively reduce the strings inside ``s`` to ASCII.

    * string: decoded with the ``unicode_escape`` codec, then encoded to
      ASCII with non-encodable characters dropped;
    * list: a new list with every element processed;
    * dict: values are processed and written back in place; the same dict
      object is returned (keys are left untouched);
    * anything else: returned unchanged.

    Python 2 only: relies on ``basestring`` and on ``.decode`` being
    available on string objects.
    """
    if isinstance(s, basestring):
        # NOTE(review): for a ``unicode`` input, ``.decode`` presumably goes
        # through an implicit ASCII encode first and may raise on non-ASCII
        # text - confirm inputs are byte strings.
        return s.decode('unicode_escape').encode('ascii', 'ignore')
    elif isinstance(s, list):
        return [remove_all_non_ascii(x) for x in s]
    elif isinstance(s, dict):
        # Only existing keys are reassigned, so the dict size never changes
        # during the loop.
        for k, v in s.items():
            s[k] = remove_all_non_ascii(v)
        return s
    else:
        return s

def add_ts_to_mm():
    """Write copies of the metrika-mobile-log dumps with a fake timestamp.

    For each of three fixed dates under the module-level ``data_dir`` the
    file ``statbox/metrika-mobile-log/<date>`` is read and every non-blank
    line is parsed as JSON. Its ``timestamp`` is set to the string
    ``start_ts + <zero-based line index>`` (blank lines count towards the
    index) and the record is written to ``<date>_2``, followed by an empty
    line.

    Both files are closed as soon as a date is done, so the output is fully
    flushed. The originals are left untouched; nothing moves ``<date>_2``
    over them.
    """
    start_ts = 1460281085
    for dt in ['2016-04-09', '2016-04-10', '2016-04-11']:
        # The output file is opened first, as before, so it exists even when
        # the input turns out to be missing.
        with open(data_dir + '/statbox/metrika-mobile-log/' + dt + '_2', 'w') as dst, \
                open(data_dir + '/statbox/metrika-mobile-log/' + dt) as src:
            for idx, l in enumerate(src):
                l = l.strip()
                if not l:
                    continue
                rec = json.loads(l)
                rec['timestamp'] = str(start_ts + idx)
                json.dump(rec, dst)
                dst.write('\n\n')


if __name__ == '__main__':
    yt.config.set_proxy('hahn.yt.yandex.net')

    fake_date = datetime.now().strftime('%Y-%m-%d')
    cids = ['1226973685373732040', '1974206055442420431', '2459305771421764167']
    #all_task = GraphAllTask(date=fake_date)

    #tables = get_task_ext_deps(all_task)
    #tables = [re.sub(r'\d{4}-\d{2}-\d{2}$', fake_date, t) for t in tables]
    #tables = sorted(set(tables))
    #
    #for t in tables:
    #    print t

    junk_tbl = '//home/crypta/team/artembelov/tmp_'

    #yt.run_map(mk_table_filter('c', cids, 'value'), '//crypta/production/state/graph/dicts/fuzzy_vertices', junk_tbl + 'ids')
    #cid_recs = [json.loads(x) for x in yt.read_table(junk_tbl + 'ids', format=yt.JsonFormat(), raw=True)]
    #yuids = [x['key'] for x in cid_recs if mr.get_field_value('id_type', x['value']) != 'deviceid']
    #deviceids = [x['key'] for x in cid_recs if mr.get_field_value('id_type', x['value']) == 'deviceid']
    #ids = yuids + deviceids
    #print yuids 
    #print deviceids

    data_dir = 'test_data'

    #if not os.path.exists(data_dir):
    #    os.makedirs('test_data')

    #end_date = datetime.now()
    end_date = datetime(2016, 4, 12)
    start_date = end_date - timedelta(days=3)

    #start_date = datetime(2016, 1, 19)
    #end_date = datetime(2016, 2, 18)
    #start_date = datetime(2016, 2, 17)
    #start_date = end_date - timedelta(days=30)

    #print ids

    # TODO: dirty hack: get some random yuids to make inhousehold classifier happy 
    #inhh_yuids = set()
    #for inhh_type in ['dd', 'dm', 'mm']:
    #    subkey_counts = {"0": 0, "1": 0, "2": 0}
    #    tbl = '//crypta/production/state/inhousehold/match/current/sample_pairs_%s' % inhh_type
    #    for rec in yt.read_table(tbl, format=yt.JsonFormat(), raw=True):
    #        r = json.loads(rec)
    #        sk = r['subkey']
    #        if subkey_counts[sk] < 400:
    #            parts = r['key'].split('_')
    #            inhh_yuids.add(parts[0])
    #            inhh_yuids.add(parts[1])
    #            subkey_counts[sk] += 1
    #        if all([x >= 400 for x in subkey_counts.values()]):
    #            break

    #inhh_yuids = list(inhh_yuids)

    #mobile_classify_devids = []
    #mobile_classify_yuids = []

    #seg_counts = {'sample_top_age': 5, 'sample_top_income': 3, 'sample_top_sex': 2}
    #dtf = (end_date - timedelta(days=1)).strftime('%Y-%m-%d')
    #for table in ['sample_top_age', 'sample_top_income', 'sample_top_sex']:
    #    counters = [0 for i in range(seg_counts[table])]
    #    tbl = '//crypta/production/state/mobile/%s/%s' % (dtf, table)
    #    for rec in yt.read_table(tbl, format=yt.JsonFormat(), raw=True):
    #        rec = json.loads(rec)
    #        yuid = mr.get_field_value('yuid', rec['value'])
    #        devid = mr.get_field_value('devid', rec['value'])
    #        segment = int(mr.get_field_value('s', rec['value']))

    #        if counters[segment] < 30:
    #            if yuid:
    #                mobile_classify_yuids.append(yuid)
    #            if devid:
    #                mobile_classify_devids.append(devid)
    #            counters[segment] += 1

    #        if all([x >= 30 for x in counters]):
    #            break

    # ernest_android_ids = []
    # counter = 0
    # for rec in yt.read_table('//crypta/production/state/graph/dicts/income_data_with_dev_info', format=yt.JsonFormat(),
    #                          raw=True):
    #     rec = json.loads(rec)
    #     android_id = rec['device_id']
    #     ernest_android_ids.append(android_id)
    #     counter += 1
    #     if counter > 200:
    #         break
    
    # These are needed so that mobile socdem samples aren't empty
    #oauth_devids_with_uuids = set()
    #dt = end_date - timedelta(days=1)
    #for rec in yt.read_table(dt.strftime('//statbox/oauth-log/%Y-%m-%d'), format=yt.JsonFormat(), raw=True):
    #    rec = json.loads(rec)
    #    if rec.get('uuid'):
    #        oauth_devids_with_uuids.add(rec['device_id'])
    #    if len(oauth_devids_with_uuids) >= 10:
    #        break

    #print oauth_devids_with_uuids

    ### Use this if you have cached yuids locally
    # yuids = eval(open('yuids').read())
    # deviceids = eval(open('deviceids').read())
    ###

    #yuids = yuids + mobile_classify_yuids + inhh_yuids + turkey_yuids
    #deviceids = deviceids + mobile_classify_devids
    #deviceids = list(deviceids) + list(oauth_devids_with_uuids)
    #yuids = list(set(yuids))
    #deviceids = list(set(deviceids))

    # ids = yuids + deviceids

    #with open('yuids', 'wb') as f:
    #    f.write(str(yuids))

    #with open('deviceids', 'wb') as f:
    #    f.write(str(deviceids))
    #print mobile_classify_yuids
    #print mobile_classify_devids
    #print yuids
    #print deviceids
    
    # save_test_data('//crypta/production/bb_storage/storage', junk_tbl, data_dir, mk_table_filter('yandexuid', yuids))
    # save_test_data('//crypta/production/profiles/fb', junk_tbl, data_dir, mk_table_filter('key', yuids))
    # save_test_data('//crypta/production/profiles/gg', junk_tbl, data_dir, mk_table_filter('key', yuids))
    # save_test_data('//crypta/production/profiles/lj', junk_tbl, data_dir, mk_table_filter('key', yuids))
    # save_test_data('//crypta/production/profiles/mr', junk_tbl, data_dir, mk_table_filter('key', yuids))
    # save_test_data('//crypta/production/profiles/vk', junk_tbl, data_dir, mk_table_filter('key', yuids))
    # save_test_data('//crypta/production/state/household/merged_households', junk_tbl, data_dir, mk_hh_filter(ids))
    #save_test_data_daterange('//crypta/production/storage/storage/fp', start_date, end_date, junk_tbl, data_dir, mk_fp_filter(yuids, deviceids))
    #save_test_data_daterange('//statbox/bar-navig-log', start_date, end_date, junk_tbl, data_dir, mk_table_filter('yandexuid', yuids))
    #save_test_data_daterange('//statbox/iscrypta-matching-log', start_date, end_date, junk_tbl, data_dir, mk_iscrypta_filter(yuids, deviceids))
    # save_test_data_daterange('//statbox/metrika-mobile-log', start_date, end_date, junk_tbl, data_dir,
    #                          mk_table_filter('OriginalDeviceID', deviceids))
    # save_test_data_daterange('//statbox/metrika-mobile-log', start_date, end_date, junk_tbl, data_dir,
    #                          mk_table_filter('ADVID', deviceids), append=True)
    # save_test_data_daterange('//statbox/metrika-mobile-log', start_date, end_date, junk_tbl, data_dir,
    #                          mk_table_filter('AndroidID', ernest_android_ids), append=True)

    #save_test_data_daterange('//statbox/crypta-rt-geo-log', start_date, end_date, junk_tbl, data_dir, mk_table_filter('yauid', yuids))
    #oauth_log_files = save_test_data_daterange('//statbox/oauth-log', start_date, end_date, junk_tbl, data_dir, mk_iscrypta_filter(yuids, deviceids, 'yandexuid', 'device_id'))
    # passport_phone_log_files = save_test_data_daterange('//statbox/passport-phone-log', start_date, end_date, junk_tbl, data_dir, mk_table_filter('yandexuid', yuids))
    # passport_phone_log_files = save_test_data_daterange('//statbox/passport-log', start_date, end_date, junk_tbl, data_dir, mk_table_filter('yandexuid', yuids))
    #save_test_data_daterange('//statbox/redir-log', start_date, end_date, junk_tbl, data_dir, mk_table_filter('yandexuid', yuids, 'value'))
    #save_test_data_daterange('//statbox/sbapi-lookup-access-log', start_date, end_date, junk_tbl, data_dir, mk_sbapi_filter(yuids))
    #metrica_files = save_test_data_daterange('//statbox/watch-log', start_date, end_date, junk_tbl, data_dir, mk_table_filter('yandexuid', yuids))

    # save_test_data_daterange('//statbox/access-log', start_date, end_date, junk_tbl, data_dir, mk_access_log_filter())

    ## TODO: These are for inhousehold_train_model.py, make it depend on appropriate tasks producing these instead (if possible)
    #save_test_data('//crypta/production/state/graph/dicts/yuid_ids', junk_tbl, data_dir, mk_table_filter('key', yuids))
    #save_test_data('//crypta/production/state/graph/dicts/yuid_ua', junk_tbl, data_dir, mk_table_filter('key', yuids))
    #save_test_data('//crypta/production/lal_manager/data_to_classify', junk_tbl, data_dir, mk_table_filter('key', yuids))

    ## TODO: Fix this (by making a task that will create this dict)
    #save_test_data('//crypta/production/state/graph/dicts/puid_yuid', junk_tbl, data_dir, mk_table_filter('value', yuids))
    #save_test_data('//crypta/production/state/graph/dicts/puid_yuid_yt', junk_tbl, data_dir, mk_table_filter('yuid', yuids))

    ## TODO: GraphCidTask: create empty dir //crypta/production/state/iscrypta
    #iscrypta_folder = os.path.join(data_dir, 'crypta/production/state/iscrypta')
    #if not os.path.exists(iscrypta_folder):
    #    os.makedirs(iscrypta_folder)

    ## TODO: FuzzyGraphTask
    #save_test_data('//crypta/production/state/graph/dicts/yuid_with_all', junk_tbl, data_dir, mk_table_filter('yuid', yuids))
    ## Empty yesterday's exact pairs
    #prev_date = end_date - timedelta(days=2)
    #fake_exact_pairs_dir = os.path.join(data_dir, 'crypta', 'production', 'state', 'graph', prev_date.strftime('%Y-%m-%d'), 'exact')
    #if not os.path.exists(fake_exact_pairs_dir):
    #    os.makedirs(fake_exact_pairs_dir)
    #open(os.path.join(fake_exact_pairs_dir, 'yuid_pairs'), 'w').close()

    ## geodata
    #test_geodata = 'geodata4.bin'
    #geodata_dir = os.path.join(data_dir, 'statbox/statbox-dict-by-name', 'geodata4.bin')
    #geodata_path = os.path.join(geodata_dir, start_date.strftime('%Y-%m-%d'))
    #if not os.path.exists(geodata_dir):
    #    os.makedirs(geodata_dir)
    #shutil.copyfile(test_geodata, geodata_path)
    #
    ### TODO: ImportFPGraphDictsDayTask, need to grep config.WEBVISOR_FOLDER for yuids, just random stuff for now
    #tbls = [config.FRESH_WEBVISOR_FOLDER + x for x in yt.list(config.FRESH_WEBVISOR_FOLDER[:-1])][:1]
    #for t in tbls:
    #    dump_table(t, os.path.join(data_dir, *(t[2:].split('/'))))

    #src_file = os.path.join(data_dir, *(tbls[0][2:].split('/')))
    #dt = start_date
    #while dt < end_date:
    #    dtts = int((dt - datetime(1970, 1, 1)).total_seconds() + 3600)
    #    dst_file = os.path.join(data_dir, *((config.FRESH_WEBVISOR_FOLDER[2:] + str(dtts)).split('/')))
    #    shutil.copyfile(src_file, dst_file)
    #    dt = dt + timedelta(days=1)

    # TODO: ImportFPGraphDictsDayTask: add fake radius ips so that 'watch_log_filtered_by_radius' is created
    #fp_dir = os.path.join(data_dir, *('crypta/production/storage/storage/fp'.split('/')))
    #fp_files = [os.path.join(fp_dir, x) for x in os.listdir(fp_dir)]
    #fp_ips = set()
    #for path in fp_files:
    #    with open(path) as f:
    #        for line in f:
    #            line = line.rstrip()
    #            if line:
    #                jobj = json.loads(line)
    #                ip = mr.get_field_value('ip', jobj.get('key'))
    #                if ip:
    #                    fp_ips.add(ip)
    #
    #radius_ips_tbl = (end_date - timedelta(days=1)).strftime('crypta/production/state/radius/log/%Y-%m-%d/all_radius_ips')
    #all_radius_ips_path = os.path.join(data_dir, *(radius_ips_tbl.split('/')))
    #if not os.path.exists(os.path.dirname(all_radius_ips_path)):
    #    os.makedirs(os.path.dirname(all_radius_ips_path))
    #
    #with open(all_radius_ips_path, 'w') as f:
    #    for ip in fp_ips:
    #        f.write(json.dumps(dict(ip=ip)))
    #        f.write('\n')

    ## TODO: Generate fake radius records for the day we're running for
    #dtf = (end_date - timedelta(days=1)).strftime('%Y-%m-%d')
    #radius_log_path = os.path.join(data_dir, *(('crypta/production/state/radius/log/%s/radius_log' % dtf).split('/')))
    #radius_log_dir = os.path.dirname(radius_log_path)
    #if not os.path.exists(radius_log_dir):
    #    os.makedirs(radius_log_dir)
    #
    #ts = ((end_date - timedelta(days=1)) - datetime(1970, 1, 1)).total_seconds() + 100

    #radius_logins_count = 40
    #logins = ['login-%d' % x for x in range(radius_logins_count)]
    #
    #with open(radius_log_path, 'w') as f:
    #    for ip in fp_ips:
    #        login = logins[randint(0, radius_logins_count - 1)]
    #        rec = dict(ip=ip, login=login, rec_type='radius', timestamp=int(ts))
    #        f.write(json.dumps(rec))
    #        f.write('\n')

    ## TODO: IncomePrepreSampleTask: Ernest's home dir is used, have to create it
    #dd = os.path.join(data_dir, *('crypta/team/ernest/CRYPTAIS-527'.split('/')))
    #if not os.path.exists(dd):
    #    os.makedirs(dd)

    ## TODO: DeviceYuidsFuzzyIpMatchingDayTask: fake bad ips with too many devids behind single ip, so that bad_ips is not empty
    #mm_tmpl = '{"ADVID":"dummy","ClientPort":"56983","LocationTimestamp":"1454507871","StartTimeZone":"18000","APIKey128":"dummy","AppBuildNumber":"255","SessionType":"SESSION_BACKGROUND","ScreenHeight":"540","LocationPrecision":"55","timezone":"+0000","Latitude":"51.7934041","UUIDHash":"dummy","SendTimestamp":"1454507884","Wifi_Ssids":"[\'WiFi-DOM.ru-8308\',\'WiFi-DOM.ru-16D0\',\'kv17\',\'TP-LINK_15\',\'Rostelecom_9\']","StartDate":"2016-02-03","APIKey":"135855","CountryCode":"250","AppPlatform":"android","EventType":"EVENT_FIRST","StartTimestamp":"1454507884","EventOffset":null,"DeviceIDHash":"dummy","AppFramework":"NATIVE","subkey":"","ReceiveTimestamp":"1454507899","DeviceIDSessionIDHash":"dummy","ClientIP":"%s","ScreenWidth":"960","Manufacturer":"Samsung","ReceiveDate":"2016-02-03","_stbx":"dummy","RegionID":"48","OperatorID":"99","SignalStrength":"-83","timestamp":"2016-02-03 13:58:19","source_uri":"prt://mobmetrika@mtcalclog02g.yandex.ru/opt/statbox_export_mobile/mobile-events.log","Longitude":"55.0337256","EventName":"First activation of metrica","Wifi_Macs":"[\'9CD36D0C75B0\',\'04A1514DD6F0\',\'C4E9844025BA\',\'90F652C92506\',\'C4A81D454B80\']","DeviceType":"PHONE","DeviceID":"%s","AndroidID":"dummy","LocationSource":"GPS","tskv_format":"metrika-mobile-log","Model":"Galaxy Mega 5.8","OSVersion":"4.2.2","AppVersionName":"2.3.3","CellID":"7033","ConnectionType":"CONN_WIFI","iso_eventtime":"2016-02-03 16:58:19","UUID":"%s","Locale":"ru-RU","ClientKitVersion":"230","SendTimeZone":"18000","AppID":"ru.yandex.yandextraffic","LAC":"55601","ScreenDPI":"240","ReceiveTimeZone":"10800","KitVersion":"230","StartTime":"2016-02-03 16:58:04","EventValue":"{\'preloadInfo\':{}}","NetworkType":"Wifi","ScaleFactor":"1.5","Wifi_SignalsStrengths":"[-47,-70,-73,-101,-96]"}' 
    #json.loads(mm_tmpl)
    #fake_ip = '::ffff:1.1.1.1'
    #mmet_dir = os.path.join(data_dir, 'statbox', 'metrika-mobile-log')
    #mmet_files = [os.path.join(mmet_dir, x) for x in os.listdir(mmet_dir)]
    #for mmf in mmet_files:
    #    with open(mmf, 'ab') as f:
    #        for fake_devid_num in range(40):
    #            fake_devid = 'dummy' + str(fake_devid_num)
    #            mm_rec = mm_tmpl % (fake_ip, fake_devid, fake_devid)
    #            f.write(mm_rec)
    #            f.write('\n')

    #save_test_data('//crypta/production/state/graph/dicts/income_data_with_dev_info', junk_tbl, data_dir, identity_mapper)
    #
    ## TODO: YuidAllIdDictsTask: 
    #save_test_data('//crypta/production/state/graph/dicts/yuid_regs', junk_tbl, data_dir, mk_table_filter('key', yuids))

    ## TODO: PrepareClassificationDataTask
    #save_test_data('//crypta/production/state/graph/dicts/dev_info', junk_tbl, data_dir, mk_dev_info_filter(deviceids))

    # logins = set()
    # metrica_files = []
    # metrica_dir = os.path.join(data_dir, *('statbox/watch-log'.split('/')))
    # metrica_files = os.listdir(metrica_dir)
    # for fname in metrica_files:
    #     with open(os.path.join(metrica_dir, fname)) as f:
    #         for line in f:
    #             line = line.rstrip()
    #             if line:
    #                 js = json.loads(line)
    #                 ha = js.get('headerargs', '')
    #                 m = re.search(r'yandex_login=([^;]+)', ha)
    #                 if m:
    #                     logins.add(m.group(1))
    #
    puids = set()
    oauth_log_dir = os.path.join(data_dir, *('statbox/oauth-log'.split('/')))
    passport_phone_log_dir = os.path.join(data_dir, *('statbox/passport-phone-log'.split('/')))
    oauth_log_files = [os.path.join(oauth_log_dir, f) for f in os.listdir(oauth_log_dir)]
    passport_phone_log_files = [os.path.join(passport_phone_log_dir, f) for f in os.listdir(passport_phone_log_dir)]
    files_with_puid = oauth_log_files + passport_phone_log_files
    for fname in files_with_puid:
        with open(fname) as f:
            for line in f:
                line = line.strip()
                if line:
                    js = json.loads(line)
                    puid = js.get('uid', None)
                    if puid is not None:
                        puids.add(str(puid))

    # save_test_data('//statbox/heavy-dict/passport_userdata/2016-05-31', junk_tbl, data_dir,
    #                mk_passport_filter(logins, puids))
    #save_test_data('//crypta/production/profiles/passport', junk_tbl, data_dir, mk_passport_profiles_filter(yuids, puids))

    # save_test_data('//crypta/production/profiles_base/fb', junk_tbl, data_dir, mk_table_filter('key', puids))
    # save_test_data('//crypta/production/profiles_base/gg', junk_tbl, data_dir, mk_table_filter('key', puids))
    # save_test_data('//crypta/production/profiles_base/lj', junk_tbl, data_dir, mk_table_filter('key', puids))
    # save_test_data('//crypta/production/profiles_base/mr', junk_tbl, data_dir, mk_table_filter('key', puids))
    # save_test_data('//crypta/production/profiles_base/vk', junk_tbl, data_dir, mk_table_filter('key', puids))

    ## TODO: GraphStatTask
    ##dt = end_date - timedelta(days=1)
    #save_test_data(dt.strftime('//statbox/yabs-event-log/%Y-%m-%d'), junk_tbl, data_dir, mk_table_filter('yandexuid', yuids, 'value'))

    ## TODO: Prettify stuff
    #dtf = (end_date - timedelta(days=1)).strftime('%Y-%m-%d')
    #save_test_data('//crypta/production/state/graph/dicts/yuid_with_all_good', junk_tbl, data_dir, mk_table_filter('yuid', yuids))
    ##save_test_data('//crypta/production/state/indevice/' + dtf + '/perfect/devid_yuid_all', junk_tbl, data_dir, mk_table_filter('yuid', yuids)) # Not needed

    ## TODO: Whatever
    #save_test_data_daterange('//statbox/export-access-log', start_date, end_date, junk_tbl, data_dir, mk_random_sampler(1./3500000.))
    #save_test_data_daterange('//statbox/mobile-tracking-log', start_date, end_date, junk_tbl, data_dir, mk_table_filter('YandexUidRu', yuids))

    # save_test_data('//crypta/production/state/graph/dicts/puid_phone_md5', junk_tbl, data_dir,
    #                mk_table_filter('puid', puids))

    ## Sender log
    # save_test_data_daterange('//statbox/sendr-click-log', start_date, end_date,
    #                          junk_tbl, data_dir, mk_random_sampler(1. / 100.))

    # TODO: FOR FUTURE TEST DATA CORRECTIONS, let's try yuids from yuid_with_all
    yuids = set()
    yuid_with_all = os.path.join(data_dir, *('crypta/production/state/graph/dicts/yuid_with_all'.split('/')))
    with open(yuid_with_all) as f:
        for line in f:
            line = line.strip()
            if line:
                js = json.loads(line)
                yuid = js.get('yuid', None)
                if yuid is not None:
                    yuids.add(str(yuid))


    # yuid_with_all = os.path.join(data_dir,
    #                              *('crypta/production/state/graph/dicts/yuid_with_all'.split('/')))
    # yuid_with_all1 = os.path.join(data_dir,
    #                              *('crypta/production/state/graph/dicts/yuid_with_all1'.split('/')))
    # with open(yuid_with_all) as f:
    #     with open(yuid_with_all1, 'w') as f1:
    #         for line in f:
    #             line = line.strip()
    #             if line:
    #                 js = json.loads(line)
    #                 all_dates = set()
    #                 for column in js.keys():
    #                     if '_dates' in column:
    #                         dates_per_val = js[column].values()
    #                         for date in dates_per_val:
    #                             all_dates.update(date)
    #                 js['all_dates'] = list(all_dates)
    #                 json.dump(js, f1)
    #                 f1.write('\n')
    #                 f1.write('\n')


    # print(yuids)

    # yuids = [int(yuid) for yuid in yuids]
    #
    # bb_storage = os.path.join(data_dir, *('crypta/production/bb_storage/storage'.split('/')))
    # with open(bb_storage, 'w') as f:
    #     for yuid in yuids:
    #         rec = {"yandexuid": yuid,
    #                "classification_time": 1468555437,
    #                "keyword_id": 217,
    #                "last_activity_time": 1468555437,
    #                "value": "8:1:500000;316:0:8768;364:0:1000000;343:0:1000000;" +
    #                         "258:0:323807;424:0:363841;363:0:1000000;8:0:500000;" +
    #                         "408:0:768059"}
    #         json.dump(rec, f)
    #         f.write('\n')
    #         f.write('\n')
    #
    #         rec = {"yandexuid": yuid,
    #                "classification_time": 1468555437,
    #                "keyword_id": 174,
    #                "last_activity_time": 1468555437,
    #                "value": "1:360695;0:639304"}
    #
    #         json.dump(rec, f)
    #         f.write('\n')
    #         f.write('\n')

    # save_test_data_daterange('//statbox/passport-log', start_date, end_date, junk_tbl, data_dir, mk_table_filter('yandexuid', yuids))

    # uuids = read_uuids(data_dir)
    # save_test_data_daterange('//statbox/mobile-redirect-bind-id-log', start_date, end_date, junk_tbl, data_dir, ,mk_table_filter('uuid', uuids))

    # Generate fake data for appsflyer, as properly grepping it will require too much time (bs-chevent-log is huge)
    # appsflyer_template = '//statbox/extdata-apps-flyer-log/2016-06-20'
    # bschevent_template = '//statbox/bs-chevent-log/2016-06-20'
    # af_rec = json.loads(next(yt.read_table(appsflyer_template, raw=True, format=yt.JsonFormat())))
    # bs_rec = json.loads(next(yt.read_table(bschevent_template, raw=True, format=yt.JsonFormat())))
    #
    # print af_rec['click_url']
    #
    # af_folder = os.path.join(data_dir, 'statbox', 'extdata-apps-flyer-log')
    # bs_folder = os.path.join(data_dir, 'statbox', 'bs-chevent-log')
    # if not os.path.exists(af_folder):
    #     os.makedirs(af_folder)
    # if not os.path.exists(bs_folder):
    #     os.makedirs(bs_folder)
    #
    # unixtime = int((datetime.now() - datetime(1970, 1, 1)).total_seconds())
    #
    # dt = start_date
    # appsfl_days = (end_date - start_date).days
    # max_logid = config.BS_CHEVENT_SEARCH_DAYS + appsfl_days
    # appfl_day = 0
    # while dt < end_date:
    #     appsfl = os.path.join(af_folder, dt.strftime('%Y-%m-%d'))
    #     with open(appsfl, 'wb') as fappsfl:
    #         for i in [x for x in range(max_logid) if (x % appsfl_days) == appfl_day]:
    #             af_rec['click_url'] = 'http://blahblah/?logid=' + str(i)
    #             af_rec['advertising_id'] = 'appsflyer-adv-id-' + str(i)
    #             fappsfl.write(json.dumps(af_rec))
    #             fappsfl.write('\n')
    #     appfl_day += 1
    #     dt += timedelta(days=1)
    #
    # for i in range(max_logid):
    #     dt = end_date - timedelta(days=(i+1))
    #     bschev = os.path.join(bs_folder, dt.strftime('%Y-%m-%d'))
    #     with open(bschev, 'wb') as fbschev:
    #         bs_rec['logid'] = str(i)
    #         bs_rec['yuid'] = str(i % 5) + str(unixtime)
    #         fbschev.write(json.dumps(bs_rec))
    #         fbschev.write('\n')

    # # YAMONEY
    # # replace puids with ones existing in puid_yuid_yt
    # puid_iter = iter(puids)
    #
    # bb_storage = os.path.join(data_dir, *('crypta/production/state/graph/dicts/yamoney_in'.split('/')))
    # with open(bb_storage, 'w') as f:
    #
    #     for rec in yt.read_table('//home/crypta/team/artembelov/yamoney/grep', raw=False):
    #         if rec['PAYER_ENTITY_UID'] != '0':
    #             rec['PAYER_ENTITY_UID'] = next(puid_iter)
    #             # print((rec['PAYER_ENTITY_UID'], next(puid_iter)))
    #             json.dump(rec, f)
    #             f.write('\n')
    #             f.write('\n')
    #

    # # TICKETS
    # replace puids with ones existing in puid_yuid_yt
    # puid_iter = iter(puids)
    # yuid_iter = iter(yuids)
    # for x in range(50):
    #     print((x, next(puid_iter)))
    #     print((x, next(yuid_iter)))
    #
    # tickets_table = os.path.join(data_dir, *('home/tickets/production/table'.split('/')))
    # with open(tickets_table, 'w') as f:
    #     for idx, rec in enumerate(yt.read_table('//home/tickets/production/table', raw=False)):
    #         print idx, rec
    #         rec = remove_all_non_ascii(rec)
    #
    #         print idx, rec
    #
    #         if idx > 100:
    #             break
    #         if random() > 0.1:
    #             rec['yandexUid'] = next(yuid_iter)
    #         if random() > 0.2:
    #             rec['uid'] = next(puid_iter)
    #
    #         json.dump(rec, f)
    #         f.write('\n')
    #         f.write('\n')

    # KINOPOISK
    # kinopoisk_access = os.path.join(data_dir, *('crypta/production/state/graph/dicts/yamoney_in'.split('/')))
    # with open(kinopoisk_access, 'a') as f:
    #     for idx, rec in enumerate(yt.read_table('//home/crypta/team/artembelov/kinopoisk/access-log',
    #                                             raw=True, format='json')):
    #         if idx < 1000:
    #             continue
    #         if idx > 1100:
    #             break
    #         f.write(rec)
    #         f.write('\n')

    # kinopoisk = os.path.join(data_dir, *('crypta/production/state/graph/dicts/kinopoisk'.split('/')))
    # with open(kinopoisk, 'w') as f:
    #     for kp_uid in ['5470947', '600251', '3204703']:
    #         rec = {'kp_uid': int(kp_uid), 'id_value': kp_uid, 'email': 'aaa@bbb.ru',
    #                'country': 'X', 'city': 'city', 'sex': 'male', 'age': '20'}
    #
    #         json.dump(rec, f)
    #         f.write('\n')
    #         f.write('\n')

    # watchlog_vk
    # import log_watch
    # watch_log1 = os.path.join(data_dir, *('statbox/bs-watch-log/2016-04-09'.split('/')))
    # watch_log2 = os.path.join(data_dir, *('statbox/bs-watch-log/2016-04-10'.split('/')))
    # watch_log3 = os.path.join(data_dir, *('statbox/bs-watch-log/2016-04-11'.split('/')))
    #
    # fetched_count = 0
    # with open(watch_log1, 'a') as f1, open(watch_log2, 'a') as f2, open(watch_log3, 'a') as f3:
    #     for fetched_countidx, rec in enumerate(yt.read_table('//statbox/bs-watch-log/2016-07-29',
    #                                                          raw=True, format='json')):
    #         # for vk
    #         filtered = list(log_watch.vk_hidden_param_mapper(json.loads(rec)))
    #         # for mailru
    #         # filtered = list(log_watch.mailru_hidden_param_mapper(json.loads(rec)))
    #
    #         if not filtered:
    #             continue
    #
    #         if fetched_count <= 10:
    #             f = f1
    #         elif 10 < fetched_count <= 20:
    #             f = f2
    #         else:
    #             f = f3
    #
    #         if fetched_count > 30:
    #             break
    #
    #         f.write(rec)
    #         f.write('\n')
    #         fetched_count += 1
    #
    #         print fetched_count

    # vk_phone_dict
    # vk_ids_from_watch_log = ['144684396', '113527834', '186979761']
    # some_phones_from_webvisor_to_match = ['+79038699988', '+79648909037', '+79878055194']
    #
    # vk_dict = os.path.join(data_dir, *('crypta/production/state/graph/dicts/profiles'.split('/')))
    # with open(vk_dict, 'w') as f:
    #
    #     for vk_id, phone in zip(vk_ids_from_watch_log, some_phones_from_webvisor_to_match):
    #         rec = {'id_value': vk_id, 'birth': None, 'phone': phone, 'region_code': 'RU'}
    #
    #         json.dump(rec, f)
    #         f.write('\n')
    #         f.write('\n')

    ### bs-rtb-log (with SSP apps). Get mobile-metrika devids -> grep bs-rtb-log
    # save_test_data_daterange('//statbox/bs-rtb-log', start_date, end_date, junk_tbl+'bs_rtb_log', data_dir, map_rtb_log(read_devids(data_dir)))


