# coding: utf-8
import urllib, urllib2, time, datetime, pandas as pd, re, requests, numpy as np, random, hashlib
from StringIO import StringIO
from nile.api.v1 import (
    aggregators as na,
    filters as nf,
    extractors as ne,
    grouping as ng,
    files as nfile,
    clusters,
    files,
    statface,
    Record,
    Template,
    Path
)
from qb2.api.v1 import (
    filters as sf,
    QB2,
    extractors as se,
    resources as sr
)

def get_country_oblast_city(geo_id, kr_region, kr_region_dop, geobase6):
    """Return the name of the second region on the filtered path of ``geo_id``.

    The geobase path of ``geo_id`` is reduced to regions whose type is in
    ``REGION_TYPE_LIST``; at most four ids are kept and the list is padded
    with ``OTHER_REGION`` up to four entries. The name of the entry at
    index 1 is returned.
    NOTE(review): presumably the path is ordered Earth -> country -> ...,
    so index 1 is the country -- confirm against the geobase resource.

    Args:
        geo_id: region id to resolve; ``None`` yields ``'Other'``.
        kr_region: mapping whose keys are ids of accepted countries.
        kr_region_dop: additional mapping of accepted country ids.
        geobase6: object exposing ``region_by_id(id)`` whose result has
            ``path`` and ``name`` attributes.

    Returns:
        The region name, or ``'Other'`` when ``geo_id`` is ``None`` or a
        COUNTRY-typed region on the path is not among the accepted ids.
    """
    # Guard first: nothing else is needed when there is no region at all.
    if geo_id is None:
        return 'Other'

    # Set union works both with Python 2 key lists and Python 3 key views.
    country = set(kr_region_dop.keys()) | set(kr_region.keys())

    list_numbers = []
    for region in geobase6.region_by_id(geo_id).path:
        if region.type not in REGION_TYPE_LIST:
            continue
        if region.type == sr.geo.RegionTypes.COUNTRY and region.id not in country:
            return 'Other'
        list_numbers.append(region.id)
        if len(list_numbers) == 4:
            break

    # Pad short paths so that index 1 always exists.
    list_numbers.extend([OTHER_REGION] * (4 - len(list_numbers)))
    return geobase6.region_by_id(list_numbers[1]).name

def get_channel(channel_list):
    """Join the leading channel levels of ``channel_list`` with ``'>>'``.

    Element 0 is always skipped. When ``'store_install'`` occurs in the
    list, elements 1 and 2 are joined; otherwise only element 1 is used.
    """
    stop = 3 if 'store_install' in channel_list else 2
    return '>>'.join(channel_list[1:stop])

def sum_metric(metric_list, record, result_dict):
    """Add the positive metrics of ``record`` onto ``result_dict`` in place.

    For every name in ``metric_list`` the value ``record[name]`` is added
    to ``result_dict[name]`` only when it is greater than zero. The same
    (mutated) ``result_dict`` object is returned.
    """
    for name in metric_list:
        value = record[name]
        if value > 0:
            result_dict[name] += value
    return result_dict

def d_delta(x, y):
    """Return the number of whole days between two ``YYYY-MM-DD`` dates.

    Both arguments are converted with ``str()`` and parsed with the
    ``'%Y-%m-%d'`` format; the result is ``x - y`` in days (negative when
    ``x`` is earlier than ``y``).

    Returns:
        int day difference, or ``None`` when either value cannot be parsed.
    """
    date_format = '%Y-%m-%d'
    try:
        later = datetime.datetime.strptime(str(x), date_format)
        earlier = datetime.datetime.strptime(str(y), date_format)
    except (ValueError, TypeError):
        # Only parsing problems are treated as "no delta"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None
    return (later - earlier).days

def metric_to_zero(result_dict, metric_list):
    """Reset every metric named in ``metric_list`` to 0 in ``result_dict``.

    The dictionary is modified in place and returned; other keys are left
    untouched, and metrics not yet present are created with value 0.
    """
    result_dict.update(dict.fromkeys(metric_list, 0))
    return result_dict

def check_record(key, record, dimention_list):
    """Build the initial output row for an activation record.

    The row takes ``browser`` and ``device_id`` from the group ``key``
    (``key.browser`` / ``key.user_id``), uses ``record.date`` for both
    ``activation_date`` and ``date``, and starts all four metrics at 0.
    Every name in ``dimention_list`` that is present in ``record`` is
    copied over as well.

    Returns:
        A new plain ``dict`` describing the row.
    """
    row = dict(
        browser=key.browser,
        device_id=key.user_id,
        activation_date=record.date,
        date=record.date,
        requests_count=0,
        visits_count=0,
        yandex_visits_count=0,
        direct_cost=0,
    )
    # Optional dimensions: copy only those the record actually carries.
    row.update(
        (name, record[name]) for name in dimention_list if name in record
    )
    return row

def check_metric(record, metric_list):
    """Return ``record`` with every listed metric present and not ``None``.

    A metric that is absent from ``record`` or whose value is ``None`` is
    set to 0 through ``record.update(**{metric: 0})``; the value returned
    by ``update`` replaces ``record``, so ``record`` must provide an
    ``update`` that returns the updated record (a plain ``dict`` does not).

    Args:
        record: object supporting ``in``, item access and ``update(**kw)``.
        metric_list: iterable of metric field names.

    Returns:
        The (possibly replaced) record.
    """
    for metric in metric_list:
        # Identity check: None is a singleton and must not go through __eq__.
        if metric not in record or record[metric] is None:
            record = record.update(**{metric: 0})
    return record

def prepare_data_reduce(groups):
    """Reducer: turn one user's event stream into one row per active day.

    ``groups`` yields ``(key, records)`` pairs. The job that uses this
    reducer groups by ``browser``/``user_id`` and sorts by ``date``/``log``.

    For each group:
      * records before the first record with ``type == 'activation'`` are
        skipped;
      * the activation record starts the first row (``is_install = 1``,
        ``days_delta = 0``, metrics at 0 -- the activation record's own
        metrics are not added);
      * following records with the same ``date`` are accumulated into the
        current row; a record with a different ``date`` flushes the current
        row and starts a new one with ``is_install = 0``;
      * the last row is flushed when the group ends.

    Groups without any activation record produce no output.

    Yields:
        ``Record`` objects built from the accumulated row dictionaries.
    """
    for key, records in groups:
        # Reset per group: state must never leak from the previous user,
        # otherwise a group without activation would re-emit a stale row.
        result_dict = None
        current_date = '2000-01-01'
        for record in records:
            record = check_metric(record, metric_list)

            if result_dict is None:
                # Still waiting for the activation event of this user.
                if record.type == 'activation':
                    result_dict = check_record(key, record, dimention_list)
                    current_date = result_dict['date']
                    result_dict['days_delta'] = 0
                    result_dict['is_install'] = 1
                    result_dict['day_use'] = 1
                continue

            if current_date != record.date:
                # A new day starts: flush the finished day and reset metrics.
                yield Record(**result_dict)
                current_date = record.date
                result_dict['date'] = record.date
                result_dict = metric_to_zero(result_dict, metric_list)
                result_dict['is_install'] = 0

            result_dict = sum_metric(metric_list, record, result_dict)
            result_dict['days_delta'] = d_delta(
                result_dict['date'], result_dict['activation_date']
            )
            result_dict['day_use'] = 1

        # Explicit check instead of a bare ``except`` around ``yield``, which
        # would also swallow GeneratorExit raised at the yield point.
        if result_dict is not None:
            yield Record(**result_dict)

# Tag name for activations; not referenced in the visible part of the file.
ACTIVATIONS_TAG = 'activations'

# Region types kept when get_country_oblast_city walks a geobase path.
REGION_TYPE_LIST = set([
    sr.geo.RegionTypes.EARTH,
    sr.geo.RegionTypes.COUNTRY,
    sr.geo.RegionTypes.REGION,
    sr.geo.RegionTypes.CITY,
])

# Region ids. OTHER_REGION pads region paths that are shorter than four
# entries. NOTE(review): EARTH is presumably the geobase id of the root
# region -- confirm.
EARTH = 10000
OTHER_REGION = 957
# Fallback four-entry path; not referenced in the visible part of the file.
OTHER_REGION_LIST = [EARTH] + [OTHER_REGION] * 3

# Numeric fields accumulated per user and day by the reducer.
metric_list = ['requests_count', 'visits_count', 'yandex_visits_count', 'direct_cost']

# Optional fields copied from the activation record into every output row.
dimention_list = ['distr_path', 'geo_id', 'platform']

# Cluster handle shared by both jobs below; ``$job_root`` in table paths is
# expanded from the template defined here.
# NOTE(review): the access token is hard-coded in source. It should be
# rotated and supplied from the environment or a secret store instead.
cluster = clusters.Hahn(
    pool='mobile-research',
    token='a6575e1e15b0475fb8d8564beaa60f23',
).env(
    templates={'job_root': 'home/turkey-analytics/ktereshin'},
    # Statbox dictionaries attached to the jobs.
    files=list(map(nfile.StatboxDict, (
        'distr_report.json',
        'appsflyer_tracking_campaigns.yaml',
        'tracking_groups.json',
        'tracking_groups.yaml',
        'apps_projects_to_browser.yaml',
    ))),
)

# Stage 1: build per-user, per-day rows starting from each user's activation.
job = cluster.job()
installs = (
    job.table(
        # max() over the directory listing picks the greatest table name.
        # NOTE(review): presumably tables are date-named, so this is the
        # most recent one -- confirm.
        'statbox/hypercube/data/mobile_user_history/%s'
        % max(job.driver.list('statbox/hypercube/data/mobile_user_history'))
    )
    # Drop rows whose ``log`` field is 'birthday'.
    .filter(nf.custom(lambda x: x != 'birthday', 'log'))
    # Keep all fields, renaming browser 'SearchApp' to 'YandexSearch'.
    .project(
        ne.all(),
        browser=ne.custom(lambda x: x if x != 'SearchApp' else 'YandexSearch', 'browser'),
    )
    .groupby('browser', 'user_id')
    # The reducer relies on this order to detect day boundaries.
    .sort('date', 'log')
    .reduce(prepare_data_reduce)
    .put('$job_root/mobile_installs_cohort_data/hypercube_data')
)
job.run()

# Stage 2: enrich the per-user rows with country and channel dimensions and
# aggregate them into cohort totals.
job = cluster.job()
(
    job.table('$job_root/mobile_installs_cohort_data/hypercube_data')
    .project(
        'activation_date',
        'browser',
        'date',
        'day_use',
        'days_delta',
        'device_id',
        'direct_cost',
        'is_install',
        'platform',
        'requests_count',
        'visits_count',
        'yandex_visits_count',
        country=ne.custom(
            get_country_oblast_city,
            'geo_id',
            sr.yaml('key_report_region.yaml'),
            sr.yaml('key_report_region_dop.yaml'),
            sr.resource('Geobase')
        ),
        channel_path=ne.custom(lambda x: '>>'.join([str(ch) for ch in x]) if x else 'None', 'distr_path'),
        # distr_path is optional in stage-1 rows (it is copied only when the
        # activation record has it), so each level guards against an
        # empty/missing value the same way channel_path does.
        channel_level_1=ne.custom(lambda x: x[1] if x and len(x) > 1 else 'Other', 'distr_path'),
        channel_level_2=ne.custom(lambda x: x[2] if x and len(x) > 2 else 'Other', 'distr_path'),
        channel_level_3=ne.custom(lambda x: x[3] if x and len(x) > 3 else 'Other', 'distr_path'),
        channel_level_4=ne.custom(lambda x: x[4] if x and len(x) > 4 else 'Other', 'distr_path'),
        channel_level_5=ne.custom(lambda x: x[5] if x and len(x) > 5 else 'Other', 'distr_path'),
        # Per-row 0/1 flags; summed below into user counts.
        is_search=ne.custom(lambda x: 1 if x > 0 else 0, 'requests_count'),
        is_yandex_visit=ne.custom(lambda x: 1 if x > 0 else 0, 'yandex_visits_count')
    )
    .groupby(
        'activation_date',
        'browser',
        'date',
        'days_delta',
        'platform',
        'country',
        'channel_path',
        'channel_level_1',
        'channel_level_2',
        'channel_level_3',
        'channel_level_4',
        'channel_level_5'
    )
    .aggregate(
        installs=na.sum('is_install', missing=0),
        users=na.sum('day_use', missing=0),
        search_users=na.sum('is_search', missing=0),
        visit_users=na.sum('is_yandex_visit', missing=0),
        searches=na.sum('requests_count', missing=0),
        visits=na.sum('yandex_visits_count', missing=0),
        revenue=na.sum('direct_cost', missing=0),
    )
    .put('$job_root/mobile_installs_cohort_data/hypercube_data_aggr')
)
job.run()

