#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import nile
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import getpass
import datetime
import itertools
from collections import Counter


import json
import copy


def get_geobase(path='geobase.json'):
    """Load the geobase JSON file and index its entries by integer id.

    The file is expected to contain a JSON list of region dicts, each with
    at least an ``id`` key; every entry is normalised with
    ``preprocess_geobase`` before being stored.

    :param path: path to the geobase JSON file.
    :return: dict mapping ``int(entry['id'])`` to the preprocessed entry.
    """
    # The context manager closes the file even if JSON parsing fails; the
    # previous bare open() left the handle to the garbage collector.
    with open(path) as geobase_file:
        raw_regions = json.load(geobase_file)
    return {
        int(region['id']): preprocess_geobase(region)
        for region in raw_regions
    }

def preprocess_geobase(z):
    """Return a normalised deep copy of one geobase entry.

    In the copy, ``id`` and ``type`` are converted to ``int`` and ``path``
    is parsed from a ``', '``-separated string into a list of ints (empty
    chunks are dropped). The input dict ``z`` is left untouched.
    """
    region = copy.deepcopy(z)
    region['id'] = int(region['id'])
    region['type'] = int(region['type'])

    path_ids = []
    for chunk in region['path'].split(', '):
        if chunk:
            path_ids.append(int(chunk))
    region['path'] = path_ids
    return region


# Moscow 213
# Kyiv 143
# Istanbul 11508
# Minsk 157
# Tashkent 10335
# Astana 163


def get_country(x, jgb):
    """Resolve region id ``x`` to a lower-cased country code.

    Looks up ``jgb[x]`` and walks its ``path``; the first path entry whose
    ``type`` equals 3 (presumably the country level -- confirm against the
    geobase schema) supplies the result: the first whitespace-separated
    word of its ``iso_name``, lower-cased.

    Returns ``None`` when the region's own ``type`` is 3 or lower, or when
    no entry on the path has ``type`` 3. Raises ``KeyError`` if ``x`` or
    any visited path id is missing from ``jgb``.
    """
    region = jgb[x]
    if region['type'] <= 3:
        return None

    for ancestor_id in region['path']:
        ancestor = jgb[ancestor_id]
        if ancestor['type'] == 3:
            iso_words = ancestor['iso_name'].split()
            return iso_words[0].lower()
    return None


# Default country code and region id. Neither name is referenced in the
# visible part of this file; 213 is the id listed for Moscow in the city
# list above.
default_country = 'ru'
default_lr = 213


def check_country(record, geobase):
    """Return True when the record's region resolves to country ``'ru'``.

    ``record['region']`` is converted to ``int`` and resolved through
    ``get_country``. A ``KeyError`` during that lookup is treated as
    "country unknown", which yields ``False``.
    """
    try:
        country = get_country(int(record['region']), geobase)
    except KeyError:
        # No usable country for this record; fall through to False.
        country = None
    return (country or '').lower() == 'ru'


def youtube_map(records):
    """Mapper that keeps only ``www.youtube.com`` records located in 'ru'.

    The geobase is read from ``geobase20170915.json`` once, when the
    generator is first advanced. Each record passes through unchanged if
    its ``domain`` is exactly ``'www.youtube.com'`` and ``check_country``
    accepts it.
    """
    geobase = get_geobase(path='geobase20170915.json')
    for rec in records:
        # The domain test comes first so the geobase lookup is skipped for
        # records from other hosts.
        if rec.domain == 'www.youtube.com' and check_country(rec, geobase):
            yield rec


def get_2n_category(n):
    """Round ``n`` up to a power of two, clamped to the range 2 .. 2**25.

    Returns the smallest ``2 ** x`` with ``1 <= x <= 24`` such that
    ``n <= 2 ** x``; anything larger than ``2 ** 24`` maps to ``2 ** 25``.
    ``n`` must be an ``int`` (checked with ``assert``).
    """
    assert isinstance(n, int)
    ceiling = 2 ** 25
    bucket = 2
    while bucket < ceiling:
        if n <= bucket:
            return bucket
        bucket *= 2
    return ceiling


def aggregate_queries(groups):
    """Collapse each group of records into one summary record.

    For every ``(key, records)`` pair whose ``key.query`` is truthy, one
    ``Record`` is yielded carrying the key's attributes plus:

    * ``uid``, ``ts``, ``lr`` -- the first truthy value of that field seen
      in the group (``""`` if there is none);
    * ``reqs`` -- the sum of ``rec.reqs`` over the group;
    * ``cat2n`` -- ``get_2n_category(reqs)``.

    Groups whose key has a falsy ``query`` are skipped.
    """
    for key, records in groups:
        if not key.query:
            continue
        uid = ""
        ts = ""
        lr = ""
        reqs = 0
        for rec in records:
            if not lr and rec.lr:
                lr = rec.lr
            if not uid and rec.uid:
                uid = rec.uid
            if not ts and rec.ts:
                ts = rec.ts
            reqs += rec.reqs
        # vars() hands back the key object's own attribute dict, so writing
        # into it directly would modify the group key itself. Work on a copy.
        result = dict(vars(key))
        result.update(
            uid=uid,
            ts=ts,
            lr=lr,
            cat2n=get_2n_category(reqs),
            reqs=reqs,
        )
        yield Record(**result)


def aggregate_queries_google(groups):
    """Collapse each group of records into one summary record.

    For every ``(key, records)`` pair whose ``key.query`` is truthy, one
    ``Record`` is yielded carrying the key's attributes plus:

    * ``lr`` -- the first truthy ``rec.region`` seen in the group (``""``
      if there is none);
    * ``reqs`` -- the sum of ``rec.paircount`` over the group;
    * ``cat2n`` -- ``get_2n_category(reqs)``.

    Groups whose key has a falsy ``query`` are skipped.
    """
    for key, records in groups:
        if not key.query:
            continue
        lr = ""
        reqs = 0
        for rec in records:
            if not lr and rec.region:
                lr = rec.region
            reqs += rec.paircount
        # vars() hands back the key object's own attribute dict, so writing
        # into it directly would modify the group key itself. Work on a copy.
        result = dict(vars(key))
        result.update(
            lr=lr,
            cat2n=get_2n_category(reqs),
            reqs=reqs,
        )
        yield Record(**result)


def main():
    """Build and run the Nile job that writes the 10k sample tables.

    Three pipelines are registered on one job and then executed:

    * ``nano_frequency_agg_sgmob2`` -> ``youtube_map`` -> ``unique("query")``
      -> ``random(10000)`` -> ``$job_root/10k_youtube_touch``
    * ``filtered_desktop_vid`` -> ``random(10000)``
      -> ``$job_root/10k_yavideo_desktop``
    * ``filtered_touch_vid`` -> ``random(10000)``
      -> ``$job_root/10k_yavideo_touch``
    """
    sample_size = 10000

    cluster = clusters.yt.Hahn(
        pool='search-research_{}'.format(getpass.getuser())
    )
    cluster = cluster.env(
        templates={
            'job_root': 'home/videolog/2017-11-07_baskets_10k_youtube',
        }
    )
    job = cluster.job().env(parallel_operations_limit=10)

    # Disabled earlier stages, kept for reference:
    #   //home/goda/zyko/classes/nano_frequency_agg_sg2
    #       -> map(youtube_map, files=[LocalFile('geobase20170915.json')])
    #       -> unique("query") -> random(10000) -> $job_root/10k_youtube
    #   //home/videolog/2017-08-21_new_baskets/RU/filtered_desktop_google
    #       -> random(10000) -> $job_root/10k_google

    youtube_touch = job.table(
        '//home/goda/zyko/classes/nano_frequency_agg_sgmob2'
    )
    youtube_touch = youtube_touch.map(
        youtube_map,
        files=[nile.files.LocalFile('geobase20170915.json')],
    )
    youtube_touch = youtube_touch.unique("query").random(sample_size)
    youtube_touch.put('$job_root/10k_youtube_touch')

    # The two video baskets only need a plain random sample.
    yavideo_samples = (
        (
            '//home/videolog/2017-08-21_new_baskets/RU/filtered_desktop_vid',
            '$job_root/10k_yavideo_desktop',
        ),
        (
            '//home/videolog/2017-08-21_new_baskets/RU/filtered_touch_vid',
            '$job_root/10k_yavideo_touch',
        ),
    )
    for source_table, target_table in yavideo_samples:
        job.table(source_table).random(sample_size).put(target_table)

    job.run()


# Run the job only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
