
# coding: utf-8
from nile.api.v1 import (
    aggregators as na,
    filters as nf,
    extractors as ne,
    grouping as ng,
    clusters,
    files,
    statface,
    Record,
    Template,
    Path
)

from qb2.api.v1 import filters as sf
from qb2.api.v1 import QB2, extractors as se
import pandas as pd
import datetime
import requests
import numpy as np
# Reporting window: the last complete day is two days back, the default
# start is eight days back. Both are kept as 'YYYY-MM-DD' strings.
enddate = (datetime.datetime.now() - datetime.timedelta(days=2)).date().isoformat()
startdate = (datetime.datetime.now() - datetime.timedelta(days=8)).date().isoformat()
# NOTE(review): the computed start date is immediately replaced by a fixed one.
startdate = '2016-02-11'

# NOTE(review): the access token is hard-coded in the source; consider moving
# it to configuration or the environment.
cluster = clusters.Hahn(
    pool='mobile-research',
    token='a6575e1e15b0475fb8d8564beaa60f23',
).env(
    templates={
        'job_root': 'home/turkey-analytics/ktereshin',
        'dates': '{' + startdate + '..' + enddate + '}',
    }
)
def other_channel_type(x):
    """Return ``'Other'`` when ``x`` is ``None`` or an empty string, else ``x`` itself."""
    return 'Other' if x in ('', None) else x

def install_uuid(x, y, z):
    """Pick the uuid attributed to the install.

    Sources are tried in priority order: the first record of ``x``, then the
    first record of ``y``, then the record of ``z`` with the smallest
    ``'datetime'`` value. A source that is ``None`` or empty is skipped.

    :param x: preferred records (the pipeline passes ``first_client_event``);
        presumably a list of dicts with a ``'uuid'`` key.
    :param y: second-choice records (``first_start_event``), same shape.
    :param z: fallback records (``uuids_information``); each needs
        ``'datetime'`` and ``'uuid'`` keys.
    :return: the selected uuid, or ``None`` when every source is missing.
    """
    if x:
        return x[0]['uuid']
    if y:
        return y[0]['uuid']
    if not z:
        # Nothing to choose from; previously this path raised on an unbound index.
        return None
    # min() keeps the first of several equally old records, like a strict "<" scan.
    return min(z, key=lambda rec: rec['datetime'])['uuid']
def install_geo_id(x, y, z):
    """Pick the geo id attributed to the install.

    Sources are tried in priority order: the first record of ``x``, then the
    first record of ``y``, then the record of ``z`` with the smallest
    ``'datetime'`` value. A source that is ``None`` or empty is skipped.

    :param x: preferred records (the pipeline passes ``first_client_event``);
        presumably a list of dicts with a ``'geo_id'`` key.
    :param y: second-choice records (``first_start_event``), same shape.
    :param z: fallback records (``uuids_information``); each needs
        ``'datetime'`` and ``'geo_id'`` keys.
    :return: the selected geo id, or ``None`` when every source is missing.
    """
    if x:
        return x[0]['geo_id']
    if y:
        return y[0]['geo_id']
    if not z:
        return None
    # The fallback used to find the earliest record but never returned it.
    return min(z, key=lambda rec: rec['datetime'])['geo_id']
def install_date(x, y, z):
    """Return the install date, i.e. the part of the timestamp before the first space.

    Sources are tried in priority order: the first record of ``x``, then the
    first record of ``y``, then the smallest ``'datetime'`` found in ``z``.
    A source that is ``None`` or empty is skipped.

    :param x: preferred records (the pipeline passes ``first_client_event``);
        presumably a list of dicts whose ``'datetime'`` is a
        ``'YYYY-MM-DD HH:MM:SS'`` string.
    :param y: second-choice records (``first_start_event``), same shape.
    :param z: fallback records (``uuids_information``) with a ``'datetime'`` key.
    :return: the date part of the chosen timestamp, or ``None`` when every
        source is missing.
    """
    if x:
        return x[0]['datetime'].split(' ')[0]
    if y:
        return y[0]['datetime'].split(' ')[0]
    if not z:
        # Nothing to choose from; previously this path raised on an unbound index.
        return None
    earliest = min(rec['datetime'] for rec in z)
    return str(earliest).split(' ')[0]
def install_version(x, y, z):
    """Pick the application version attributed to the install.

    Sources are tried in priority order: the first record of ``x``, then the
    first record of ``y``, then the record of ``z`` with the smallest
    ``'datetime'`` value. A source that is ``None`` or empty is skipped.

    :param x: preferred records (the pipeline passes ``first_client_event``);
        presumably a list of dicts with a ``'version'`` key.
    :param y: second-choice records (``first_start_event``), same shape.
    :param z: fallback records (``uuids_information``); each needs
        ``'datetime'`` and ``'version'`` keys.
    :return: the selected version, or ``None`` when every source is missing.
    """
    if x:
        return x[0]['version']
    if y:
        return y[0]['version']
    if not z:
        # Nothing to choose from; previously this path raised on an unbound index.
        return None
    # min() keeps the first of several equally old records, like a strict "<" scan.
    return min(z, key=lambda rec: rec['datetime'])['version']
def clid(x, y):
    """Resolve the install clid from two alternative sources.

    When ``x`` has records, the one with the greatest ``'datetime'`` is used:
    its ``'clids'['clid1']`` is returned, or the whole ``'clids'`` value when
    that lookup is not possible. Otherwise, when ``y`` has records, the
    ``'clid'`` of the one with the smallest ``'datetime'`` is returned.
    A source that is ``None`` or empty is skipped.

    :param x: the pipeline passes ``dynamic_clids_information``; presumably a
        list of dicts with ``'datetime'`` and ``'clids'`` keys.
    :param y: the pipeline passes ``startup_information``; presumably a list of
        dicts with ``'datetime'`` and ``'clid'`` keys.
    :return: the resolved clid, or ``'Other'`` when both sources are missing.
    """
    if x:
        # max() keeps the first of several equally recent records.
        clids = max(x, key=lambda rec: rec['datetime'])['clids']
        try:
            return clids['clid1']
        except (KeyError, TypeError):
            # 'clids' is not a mapping with a 'clid1' entry: hand it back as is.
            return clids
    if y:
        return min(y, key=lambda rec: rec['datetime'])['clid']
    return 'Other'
def apps_flyer_media_source(x, y):
    """Derive the media source of an install from AppsFlyer and tracking records.

    Only the record with the smallest ``'datetime'`` of each source is used.
    A source that is ``None`` or empty counts as missing.

    * only ``x``: its ``'media_source'``;
    * only ``y``: its ``'campaign'`` if ``'tracking_id'`` is neither ``None``
      nor empty, otherwise ``None``;
    * both: the ``'campaign'`` of ``y`` when it has a tracking id and the
      campaign starts with ``'ya_'`` (case-insensitive); otherwise the
      ``'media_source'`` of ``x``;
    * neither: ``None``.

    :param x: the pipeline passes ``appsflyer_information``; presumably a list
        of dicts with ``'datetime'`` and ``'media_source'`` keys.
    :param y: the pipeline passes ``tracking_information``; presumably a list
        of dicts with ``'datetime'``, ``'tracking_id'`` and ``'campaign'`` keys.
    :return: the media source / campaign as described above, or ``None``.
    """
    def earliest(records):
        # First record with the smallest timestamp.
        return min(records, key=lambda rec: rec['datetime'])

    if y:
        tracked = earliest(y)
        has_tracking_id = tracked['tracking_id'] not in (None, '')
        if not x:
            return tracked['campaign'] if has_tracking_id else None
        if has_tracking_id and str(tracked['campaign']).lower().startswith('ya_'):
            return tracked['campaign']
    if x:
        return earliest(x)['media_source']
    return None
def apps_flyer_date_time(x):
    """Return the smallest ``'install_datetime'`` among the given records.

    :param x: presumably a list of dicts with an ``'install_datetime'`` key
        holding sortable timestamp strings; may be ``None`` or empty.
    :return: the earliest ``'install_datetime'`` value, or ``None`` when there
        are no records.
    """
    if not x:
        return None
    return min(rec['install_datetime'] for rec in x)
def apps_flyer_campaign(x):
    """Return the ``'campaign'`` of the record with the smallest ``'datetime'``.

    :param x: the pipeline passes ``appsflyer_information``; presumably a list
        of dicts with ``'datetime'`` and ``'campaign'`` keys; may be ``None``
        or empty.
    :return: the campaign of the earliest record, or ``None`` when there are
        no records.
    """
    if not x:
        return None
    # min() keeps the first of several equally old records.
    return min(x, key=lambda rec: rec['datetime'])['campaign']
import urllib2
import datetime
import re
currentdate = datetime.datetime.now()
currentdatestr = str(currentdate).split(' ')[0]
# Parse the clid dictionary: a tab-separated dump where column 1 is the clid,
# column 3 the campaign, column 5 the channel code and column 8 the channel.
# Channel code -> (channel type, fixed channel name or None to use column 8).
_clid_channel_rules = {
    '101': ('Pre-Installs', None),
    '206': ('Pre-Installs', None),
    '105': ('Store-Installs', None),
    '147': ('Store-Installs', 'Custom partner builds'),
}
# If the dump for the current date cannot be downloaded or parsed, step back
# one day at a time until one works.
# NOTE(review): there is no limit on the number of attempts.
while True:
    try:
        url_dict = 'https://hahn.yt.yandex-team.ru/api/v2/download?path=//statbox/statbox-dict/'+currentdatestr+'/distr_report&disposition=attachment'
        req = urllib2.Request(url_dict)
        dict_all_to_parse = urllib2.urlopen(req)
        try:
            dict_all = dict_all_to_parse.read()
        finally:
            # Release the connection whether or not the read succeeded.
            dict_all_to_parse.close()
        rr = dict_all.split('\n')
        clid_dict = {}
        for line in rr:
            par = line.split('\t')
            channel_type, fixed_channel = _clid_channel_rules.get(par[5], ('Other', None))
            clid_dict[par[1]] = {
                'channel_type': channel_type,
                'channel': par[8] if fixed_channel is None else fixed_channel,
                'campaign': par[3],
                'channel_code': par[5],
            }
        break
    except Exception:
        # "except Exception" (not a bare except) lets Ctrl-C / SystemExit stop
        # the otherwise endless retry loop.
        print('attempt')
        currentdate = currentdate - datetime.timedelta(days = 1)
        currentdatestr = str(currentdate).split(' ')[0]
import operator
import re
from functools import partial
def dict_clid(x, cdict):
    """Classify a clid by the channel code stored for it in ``cdict``.

    :param x: the clid to look up.
    :param cdict: mapping ``clid -> {'channel_code': ...}``.
    :return: ``'Pre_Installs'`` for codes 101/206, ``'Store_Installs'`` for
        codes 105/147, and ``'Other'`` for any other code or when the clid
        cannot be looked up (unknown, unhashable, or malformed entry).
    """
    try:
        code = cdict[x]['channel_code']
    except (KeyError, TypeError):
        return 'Other'
    if code in ('101', '206'):
        return 'Pre_Installs'
    if code in ('105', '147'):
        return 'Store_Installs'
    return 'Other'
# Single-argument version of dict_clid with the parsed clid dictionary bound,
# so it can be applied to one column value at a time.
pdict_clid = partial(dict_clid, cdict = clid_dict)
def media(x):
    """Bucket a media source name into Organic / Portal / Paid / Other.

    The value is stringified and lower-cased first; the checks run in the
    order listed, so the first matching bucket wins.
    """
    source = str(x).lower()
    if source in ('null', 'organic'):
        return 'Organic'
    if any(marker in source for marker in ('ya_', 'yandex', 'switch')):
        return 'Portal'
    if any(marker in source for marker in ('_int', 'cpa', 'cpi', 'cpc')):
        return 'Paid'
    return 'Other'
def media1(x):
    """Normalise a media source name for reporting.

    ``None`` becomes ``'Unknown'``; ``'null'`` / ``'organic'`` in any letter
    case become ``'Organic'``; anything else is stringified and lower-cased.
    """
    if x is None:
        return 'Unknown'
    lowered = str(x).lower()
    return 'Organic' if lowered in ('null', 'organic') else lowered
import json
# Download the geobase dictionary (JSON). The search starts from the date the
# previous dictionary download ended on and steps back one day per failure.
# NOTE(review): there is no limit on the number of attempts.
while True:
    try:
        url_dict = 'https://hahn.yt.yandex-team.ru/api/v2/download?path=//statbox/statbox-dict/'+currentdatestr+'/geobase.json&disposition=attachment'
        req = urllib2.Request(url_dict)
        dict_all_to_parse = urllib2.urlopen(req)
        try:
            dict_all = dict_all_to_parse.read()
        finally:
            # Release the connection whether or not the read succeeded.
            dict_all_to_parse.close()
        g_dict = json.loads(dict_all)
        break
    except Exception:
        # "except Exception" (not a bare except) lets Ctrl-C / SystemExit stop
        # the otherwise endless retry loop.
        print('attempt')
        currentdate = currentdate - datetime.timedelta(days = 1)
        currentdatestr = str(currentdate).split(' ')[0]
# Keep only the two fields the lookups need, then drop the full geobase.
geo_dict = {}
for kk in g_dict:
    region = g_dict[kk]
    geo_dict[kk] = {
        'path': region['path'],
        'official_languages': region['official_languages'],
    }
del g_dict
def dict_geo(x, cdict):
    """Pick one region id out of the ``'path'`` stored for region ``x``.

    The path is a comma-separated list of region ids (spaces are ignored).
    The third element is returned when the second element is one of
    10002, 10003, 241, 138, 245, or when the second is 10001 and the third is
    225; in every other case the fourth element is returned.

    :param x: region id; it is converted with ``str`` before the lookup.
    :param cdict: mapping ``str(region id) -> {'path': 'id, id, ...'}``.
    :return: the selected id as a string, or ``None`` when the region is
        unknown or its path is too short / malformed.
    """
    try:
        parts = cdict[str(x)]['path'].replace(' ', '').split(',')
        if parts[1] in ('10002', '10003', '241', '138', '245'):
            return parts[2]
        if parts[1] == '10001' and parts[2] == '225':
            return parts[2]
        return parts[3]
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        # Best-effort lookup: any missing key, short path or odd value means "unknown".
        return None
def dict_geo_leng(x, cdict):
    """Return the ``'official_languages'`` value stored for region ``x``.

    :param x: region id; it is converted with ``str`` before the lookup.
    :param cdict: mapping ``str(region id) -> {'official_languages': ...}``.
    :return: the stored value, or ``None`` when it cannot be looked up.
    """
    try:
        return cdict[str(x)]['official_languages']
    except (KeyError, TypeError, ValueError):
        return None
# Single-argument versions of the geo lookups with the trimmed geobase bound,
# so they can be applied to one column value at a time.
pdict_geo = partial(dict_geo, cdict = geo_dict)
pdict_geo_leng = partial(dict_geo_leng, cdict = geo_dict)
def country(x):
    """Translate a region id string into a two-letter country code.

    Ids without an entry in the table below map to ``'Other'``.
    """
    known = (
        ('225', 'RU'),
        ('187', 'UA'),
        ('159', 'KZ'),
        ('149', 'BY'),
        ('983', 'TR'),
    )
    for region_id, code in known:
        if x == region_id:
            return code
    return 'Other'
# Lower bound applied to 'fielddate' when the final report rows are selected.
startdate = '2016-03-01'


def _has_value(value):
    """Tell whether a cube field is neither None nor an empty string."""
    return not (value is None or value == '')


# Columns every install_* extractor reads, in the order they expect them.
_install_sources = ('first_client_event', 'first_start_event', 'uuids_information')

# Job 1: derive one row of install attributes per cube record and store it.
job = cluster.job()
cube_install = (
    job.table('statbox/cube/daily/installs/mobile/beta/' + enddate)
    .filter(
        nf.custom(_has_value, 'first_client_event'),
        nf.custom(_has_value, 'first_start_event'),
    )
    .project(
        'device_id',
        'platform',
        'project',
        uuid=ne.custom(install_uuid, *_install_sources),
        fielddate=ne.custom(install_date, *_install_sources),
        app_version=ne.custom(install_version, *_install_sources),
        geo_id=ne.custom(install_geo_id, *_install_sources),
        install_clid=ne.custom(clid, 'dynamic_clids_information', 'startup_information'),
        media_source=ne.custom(apps_flyer_media_source, 'appsflyer_information', 'tracking_information'),
        campaign=ne.custom(apps_flyer_campaign, 'appsflyer_information'),
    )
    # Each step below reads a column produced by an earlier one.
    .project(ne.all(), channel_type=ne.custom(pdict_clid, 'install_clid'))
    .project(ne.all(), media_source_type=ne.custom(media, 'media_source'))
    .project(
        ne.all(),
        media_source=ne.custom(media1, 'media_source'),
        country_code=ne.custom(pdict_geo, 'geo_id'),
        country_lang=ne.custom(pdict_geo_leng, 'geo_id'),
    )
    .project(ne.all(), country=ne.custom(country, 'country_code'))
    .put('$job_root/installs/installs_uuid')
)
job.run()

# Job 2: collapse the per-record table to one row per device inside every
# dimension combination, then count those rows per combination.
job = cluster.job()
_cube_dims = [
    'fielddate', 'channel_type', 'media_source_type', 'media_source',
    'platform', 'project', 'country',
]
cube_install = (
    job.table('$job_root/installs/installs_uuid')
    .groupby(*(_cube_dims + ['device_id']))
    .aggregate(installs=na.count())
    .groupby(*_cube_dims)
    .aggregate(installs=na.count())
    .put('$job_root/installs/installs_cube')
)
job.run()
import itertools

# Pull the aggregated cube into pandas; missing values become 0.
result_data = cube_install.read().as_dataframe().fillna(0)

# Dimensions that are always kept, and those that may be rolled up to 'total'.
_fixed_dims = ['fielddate', 'project', 'platform']
_rollup_dims = ['channel_type', 'country', 'media_source_type', 'media_source']
_report_dims = ['fielddate', 'platform', 'project', 'channel_type', 'country',
                'media_source_type', 'media_source']

# One sub-total frame for every subset of the roll-up dimensions (16 in all,
# from "all four kept" down to "none kept").
_subtotals = []
for _kept_count in range(len(_rollup_dims), -1, -1):
    for _kept in itertools.combinations(_rollup_dims, _kept_count):
        _subtotals.append(
            result_data.groupby(_fixed_dims + list(_kept))['installs'].sum().reset_index()
        )

# A single concat replaces the chain of DataFrame.append calls (append was
# removed in pandas 2.0 and copied the growing frame on every call).
# Dimensions absent from a sub-total come out as NaN and are labelled 'total'.
res_data = pd.concat(_subtotals).fillna('total')
res_data = res_data.groupby(_report_dims)['installs'].sum().reset_index()

# Keep only rows between the report start date and today (string comparison
# of 'YYYY-MM-DD' dates).
_today = str(datetime.datetime.now()).split(' ')[0]
res_data = res_data[(res_data['fielddate'] >= startdate) & (res_data['fielddate'] <= _today)]
# Report configuration posted to the Stat upload API as 'cube_config':
# it declares the report's dimensions, its single measure and the default
# selector values.
yaml_config = u'''
---
allow_recalculate: "0"
autovalues_enabled: "0"
dimensions:
  - fielddate: date
  - project: string
  - platform: string
  - country: string
  - channel_type: string
  - media_source_type: string
  - media_source: string
measures:
  - installs: number
view_types:
  project:
    type: Selector
    default: Search_Mobile_App
  platform:
    type: Selector
    default: Android
  country:
    type: Selector
    default: total
  channel_type:
    type: Selector
    default: total
  media_source_type:
    type: Selector
    default: total
  media_source:
    type: Selector
    default: total
aggregate_scales:
  - w_by_d_sum
  - m_by_d_sum
aggregate_uncomplete_period: "0"
'''
import re

# NOTE(review): the robot credentials are hard-coded in the source; consider
# moving them to configuration or the environment.
_stat_headers = {'StatRobotUser': 'robot_ktereshin', 'StatRobotPassword': '3en5cayAppeif6l'}
_stat_report = 'Distribution/ktereshin/installs/installs_cube_channels'

# Step 1: create / update the report definition.
# NOTE(review): neither response is inspected, so a failed upload goes unnoticed.
_config_payload = {
    'cube_config': yaml_config,
    'title': 'Installs Cube Data',
    'name': _stat_report,
}
resp = requests.post(
    'https://upload.stat.yandex-team.ru/_api/report/config',
    headers=_stat_headers,
    data=_config_payload,
)

# Step 2: upload the daily data as ';'-separated CSV without the trailing newline.
csv_data = re.sub('\n$', '', res_data.to_csv(sep=';', index=False))
_data_payload = {
    'name': _stat_report,
    'scale': 'd',
    'data': csv_data,
}
r = requests.post(
    'https://upload.stat.yandex-team.ru/_api/report/data',
    headers=_stat_headers,
    data=_data_payload,
)
