from nile.api.v1 import (
    aggregators as na,
    filters as nf,
    extractors as ne,
    grouping as ng,
    clusters,
    files,
    statface,
    Record,
    Template,
    Path
)
from qb2.api.v1 import filters as sf
from qb2.api.v1 import QB2, extractors as se
import pandas as pd
import datetime
import requests
import numpy as np
import time
import json
from scipy.spatial.distance import pdist, cdist, squareform
import matplotlib.pyplot as plt
import warnings
import random
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist
import ast
from functools import partial
warnings.filterwarnings('ignore')
from sklearn.feature_extraction import DictVectorizer
from scipy.spatial.distance import pdist
from sklearn.cluster import DBSCAN
def get_cam_df(df, vectorize_data, cam, max_samples=2000):
    """Select the rows of one campaign together with their feature vectors.

    Args:
        df: DataFrame with 'campaign_extended_name' and 'device_id' columns.
            Row i of ``df`` must correspond to row i of ``vectorize_data``.
        vectorize_data: 2-D matrix (dense or scipy sparse) with one row per
            row of ``df``.
        cam: campaign name to select.
        max_samples: upper bound on the number of returned rows; larger
            campaigns are randomly down-sampled without replacement.

    Returns:
        Tuple ``(cam_df, vectorize_data_sample)`` where ``cam_df`` holds the
        'device_id' column of the selected rows and ``vectorize_data_sample``
        holds the matching matrix rows, in the same order.
    """
    mask = (df['campaign_extended_name'] == cam).values
    # Positional row numbers, so the lookup does not depend on df's index labels.
    row_positions = np.flatnonzero(mask)
    cam_df = df.loc[mask, ['device_id']]
    vectorize_data_sample = vectorize_data[row_positions, :]
    if vectorize_data_sample.shape[0] > max_samples:
        sample_index = np.random.choice(cam_df.shape[0], max_samples, replace=False)
        cam_df = cam_df.iloc[sample_index, :]
        # sample_index is relative to the campaign subset, so it must be applied
        # to the subset matrix, not to the full vectorize_data.
        vectorize_data_sample = vectorize_data_sample[sample_index, :]
    return cam_df, vectorize_data_sample
def get_eps(vectorize_data_sample, part):
    """Derive a strictly positive radius from the mean pairwise distance.

    Args:
        vectorize_data_sample: matrix exposing ``toarray()`` (e.g. scipy sparse),
            one observation per row.
        part: divisor applied to the mean pairwise (Euclidean, the ``pdist``
            default) distance.

    Returns:
        ``mean(pdist(rows)) / part``, or 0.001 when that value would be zero
        or undefined (fewer than two rows, so there are no pairs).
    """
    distances = pdist(vectorize_data_sample.toarray())
    if distances.size == 0:
        # No pairs to average: the mean would be NaN and slip past the zero guard.
        return .001
    eps = np.mean(distances) / part
    if eps == 0:
        eps += .001
    return eps
def get_DV(df):
    """Vectorize the 'event_aggregation' dicts of ``df`` with a DictVectorizer.

    Returns:
        Tuple ``(vectorize_data, DV)``: the transformed matrix (one row per
        row of ``df``) and the fitted DictVectorizer.
    """
    event_dicts = list(df['event_aggregation'])
    DV = DictVectorizer()
    DV.fit(event_dicts)
    return DV.transform(event_dicts), DV
def sec_delta(x, y):
    """Return ``x - y`` in seconds for two 'YYYY-MM-DD HH:MM:SS' timestamps.

    Both arguments are converted with ``str()`` before parsing.

    Returns:
        The signed difference as a float, or None when either value does not
        match the expected format.
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    try:
        delta = datetime.datetime.strptime(str(x), fmt) - datetime.datetime.strptime(str(y), fmt)
    except (TypeError, ValueError):
        # Unparseable input is expected; anything else should propagate.
        return None
    return delta.total_seconds()
def sec_delta_install(x, y):
    """Return ``y - x`` in seconds, where ``y`` is a date and ``x`` a timestamp.

    Args:
        x: value whose ``str()`` has the form 'YYYY-MM-DD HH:MM:SS'.
        y: value whose ``str()`` has the form 'YYYY-MM-DD' (taken as midnight).

    Returns:
        The signed difference as a float, or None when either value does not
        match its expected format.
    """
    try:
        day = datetime.datetime.strptime(str(y), '%Y-%m-%d')
        moment = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError):
        # Unparseable input is expected; anything else should propagate.
        return None
    return (day - moment).total_seconds()
def get_start_of_previous_month(x):
    """Return the first day of the month preceding ``x`` as a string.

    ``x`` must provide ``year``/``month`` attributes and ``replace()``
    (e.g. ``datetime.date`` or ``datetime.datetime``). The result is the part
    of ``str()`` of the shifted value that precedes the first space.
    """
    if x.month == 1:
        # January rolls back to December of the previous year.
        target_year, target_month = x.year - 1, 12
    else:
        target_year, target_month = x.year, x.month - 1
    first_day = x.replace(year=target_year, month=target_month, day=1)
    return str(first_day).split(' ')[0]
def get_long_lat(x):
    """Convert ``x`` to float, returning None for unparseable or NaN values.

    Returns:
        ``float(x)`` when the conversion succeeds and the result is not NaN,
        otherwise None.
    """
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        return None
    # NaN never compares equal to anything (itself included), so a
    # `!= np.nan` test is always true; isnan is the reliable check.
    if np.isnan(value):
        return None
    return value
def get_network_mask(x):
    """Extract the first three dotted components of the 4th ':'-separated field.

    For example '::ffff:192.168.1.10' yields '192.168.1'.

    Returns:
        The 'a.b.c' prefix, or 'unknown' when ``x`` is not a string, has fewer
        than four ':'-separated fields, or the field has fewer than three
        '.'-separated parts.
    """
    try:
        octets = x.split(':')[3].split('.')
        return octets[0] + '.' + octets[1] + '.' + octets[2]
    except (AttributeError, IndexError, TypeError):
        # Malformed or missing input is expected; anything else should propagate.
        return 'unknown'
def get_date_range(dates, num_of_parts):
    """Split ``dates`` into consecutive chunks and map chunk start -> chunk end.

    The chunk length is ``len(dates) // num_of_parts``. For each chunk start
    ``dates[i]``:
      * if the chunk would run past the end, the end is the last date;
      * if the chunk's last element equals the last date, the end is taken two
        positions earlier (NOTE(review): intent of this offset is not visible
        here, and for chunk lengths below 3 the index can go negative and wrap
        around; confirm with the author);
      * otherwise the end is the chunk's last element.

    Raises:
        ValueError: when ``len(dates) < num_of_parts`` (chunk length of zero).
        ZeroDivisionError: when ``num_of_parts`` is zero.
    """
    # Floor division keeps the step an int on Python 3 as well as Python 2.
    step = len(dates) // num_of_parts
    if step == 0:
        raise ValueError('dates must contain at least num_of_parts elements')
    last = len(dates) - 1
    result = {}
    for i in range(0, len(dates), step):
        chunk_end = i + step - 1
        if chunk_end >= len(dates):
            result[dates[i]] = dates[last]
        elif dates[chunk_end] != dates[last]:
            result[dates[i]] = dates[chunk_end]
        else:
            result[dates[i]] = dates[chunk_end - 2]
    return result
def aggregate_events(groups):
    """Reducer: collapse each group's records into one Record per key.

    For every ``(key, records)`` pair, yields a Record carrying the key's
    ``device_id``, ``APIKey`` and ``campaign_extended_name`` plus an
    ``event_aggregation`` dict that maps ``str(record.sec_delta_30)`` to
    ``record.events`` (a later record with the same bucket overwrites an
    earlier one).
    """
    for key, records in groups:
        per_bucket = dict(
            (str(record.sec_delta_30), record.events) for record in records
        )
        yield Record(
            event_aggregation=per_bucket,
            device_id=key.device_id,
            APIKey=key.APIKey,
            campaign_extended_name=key.campaign_extended_name
        )
def get_biggest_cluster_installs(_dict):
    """Return the 'size' of the largest cluster, ignoring the label -1.

    Args:
        _dict: mapping of cluster label -> {'size': ..., 'dist': ...}.

    Returns:
        The largest 'size' among entries whose key is not -1, or 0 when there
        are none.
    """
    max_cluster_installs = 0
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for key, value in _dict.items():
        if key == -1:
            continue
        if value['size'] >= max_cluster_installs:
            max_cluster_installs = value['size']
    return max_cluster_installs
def get_biggest_cluster_dist(_dict):
    """Return the 'dist' of the largest cluster, ignoring the label -1.

    Args:
        _dict: mapping of cluster label -> {'size': ..., 'dist': ...}.

    Returns:
        The 'dist' value of the entry with the largest 'size' among keys other
        than -1 (on ties the entry visited last wins), or 0 when there are none.
    """
    max_cluster_installs = 0
    max_cluster_dist = 0
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for key, value in _dict.items():
        if key == -1:
            continue
        if value['size'] >= max_cluster_installs:
            max_cluster_installs = value['size']
            max_cluster_dist = value['dist']
    return max_cluster_dist

# Reporting window: the most recent Monday..Sunday week that ended before today.
# The clock is read once so every boundary is derived from the same instant
# (separate now() calls could straddle midnight and disagree).
_now = datetime.datetime.now()
_last_sunday = _now - datetime.timedelta(days=_now.weekday() + 1)
_week_monday = _last_sunday - datetime.timedelta(days=_last_sunday.weekday())

# startdate/enddate cover Monday..Saturday; the metrika_* pair covers Monday..Sunday.
startdate = str(_week_monday).split(' ')[0]
enddate = str(_last_sunday - datetime.timedelta(days=1)).split(' ')[0]
metrika_startdate = str(_week_monday).split(' ')[0]
metrika_enddate = str(_last_sunday).split(' ')[0]

# Every day of the metrika window as 'YYYY-MM-DD' strings.
dates = [str(day).split(' ')[0] for day in pd.date_range(metrika_startdate, metrika_enddate)]
# ---- Job 1: join installs with foreground-session events of the window ----
# NOTE(review): the access token is hard-coded in source; it should be loaded
# from a secret store / environment and rotated. Left unchanged here so the
# script keeps running as-is.
cluster = clusters.Hahn(pool='mobile-research', token='a6575e1e15b0475fb8d8564beaa60f23').env(
    templates=dict(
        job_root='home/turkey-analytics/ktereshin',
        dates='{%s..%s}' % (metrika_startdate, metrika_enddate)
    )
)
# APIKey -> application name; only events with these keys are kept.
api_key_dict = {
    '106400':'YaBro Android',
    '10321': 'Search App Android',
    '42989': 'Search App IOS',
    '19531': 'YaBro iPhone',
    '19534': 'YaBro iPad'
}
job = cluster.job()

# Installs with a non-empty uuid, keyed by the lower-cased uuid.
apps = job.table('$job_root/antifraud/weeks/installs') \
    .filter(
        nf.custom(lambda x: x is not None and x != '', 'uuid'),
    ) \
    .project(
        'install_date',
        'campaign_extended_name',
        device_id = ne.custom(lambda x: str(x).lower(), 'uuid')
    )

# Foreground-session events of the tracked applications.
logs = job.table('statbox/metrika-mobile-log/@dates') \
    .filter(
        nf.custom(lambda x: x is not None and x != '', 'DeviceID'),
        nf.custom(lambda x: x == 'SESSION_FOREGROUND', 'SessionType'),
        nf.custom(lambda x: x is not None and x != '', 'EventName'),
        # Dict membership is a hash lookup; `.keys()` would be rebuilt per record.
        nf.custom(lambda x: x in api_key_dict, 'APIKey'),
    ) \
    .project(
        'APIKey',
        EventDateTime = ne.custom(lambda x: str(x).lower(), 'EventDateTime'),
        # Event names of 200+ characters are collapsed into one placeholder.
        EventName = ne.custom(lambda x: str(x).lower() if len(str(x)) < 200 else 'to_long_name', 'EventName'),
        device_id = ne.custom(lambda x: str(x).lower(),'DeviceID'),
    )

# sec_delta = event time - install time, in seconds (None when unparseable).
# Only events from 30 days (2592000 s) before to 1 day (86400 s) after the
# install are stored; rows with a None sec_delta are dropped explicitly rather
# than through implicit None ordering.
apps = apps.join(logs, by='device_id', type='inner') \
    .project(
        ne.all(),
        sec_delta = ne.custom(sec_delta, 'EventDateTime', 'install_date')
    ) \
    .project(
        ne.all(),
        is_action_after_install = ne.custom(lambda x: 1 if x is not None and x >= 0 else 0, 'sec_delta')
    ) \
    .filter(
        nf.custom(lambda x: x is not None and x >= -2592000 and x <= 86400, 'sec_delta'),
    ) \
    .put('$job_root/antifraud/weeks/app_metrika_data/install_metrika_event')
job.run()

# ---- Job 2: per-device event counts in 30-second buckets after install ----
# NOTE(review): the access token is hard-coded in source; it should be loaded
# from a secret store / environment and rotated. Left unchanged here so the
# script keeps running as-is.
# The 'dates' template is fixed here; no table path in this section uses it.
cluster = clusters.Hahn(pool='mobile-research', token='a6575e1e15b0475fb8d8564beaa60f23').env(
    templates=dict(
        job_root='home/turkey-analytics/ktereshin',
        dates='{%s..%s}' % ('2017-01-01', '2017-01-02')
    )
)
job = cluster.job()
apps = job.table('$job_root/antifraud/weeks/app_metrika_data/install_metrika_event') \
    .filter(
        # Keep only events within the first day (86400 s) after the install.
        nf.custom(lambda x: x is not None and x >= 0 and x <= 86400, 'sec_delta'),
    ) \
    .project('device_id', 'APIKey', 'EventName', 'sec_delta', 'install_date', 'campaign_extended_name',
            # Floor division keeps the bucket an integer on Python 2 and 3 alike.
            sec_delta_30 = ne.custom(lambda x: int(x) // 30, 'sec_delta')) \
    .groupby(
        'device_id', 'APIKey', 'sec_delta_30', 'campaign_extended_name'
    ) \
    .aggregate(events = na.count()) \
    .groupby('APIKey', 'device_id', 'campaign_extended_name') \
    .reduce(aggregate_events) \
    .sort('APIKey', 'device_id') \
    .put('$job_root/antifraud/weeks/metrika_event_clustering/event_log')
job.run()

# ---- Cluster per-campaign event profiles and flag anomalous campaigns ----
job = cluster.job()
apps = job.table('$job_root/antifraud/weeks/metrika_event_clustering/event_log')
data = apps.read().as_dataframe()
api_key_dict = {
    '106400':'YaBro Android',
    '10321': 'Search App Android',
    '42989': 'Search App IOS',
    '19531': 'YaBro iPhone',
    '19534': 'YaBro iPad'
}
# Keep rows whose APIKey belongs to the application named in the campaign
# (the part of campaign_extended_name before the first '__').
data['app_name'] = data['campaign_extended_name'].apply(lambda x: x.split('__')[0])
data = data[data['APIKey'].apply(lambda x, y: y[x], args=(api_key_dict,)) == data['app_name']]
# Default 0..n-1 index so that frame rows line up with the matrix rows.
data = data.reset_index()
vectorize_data, DV = get_DV(data)
campaign = list(data['campaign_extended_name'].value_counts().index)
# Drop features whose column sum divided by the row count is <= 0.005.
feature_mean = np.array(np.sum(vectorize_data, axis=0))[0] / float(vectorize_data.shape[0])
vectorize_data = vectorize_data[:, np.where(feature_mean > .005)[0]]

cluster_list = []
for cam in campaign:
    cam_df, vectorize_data_sample = get_cam_df(data, vectorize_data, cam)
    if vectorize_data_sample.shape[0] == 0:
        continue
    dense_sample = vectorize_data_sample.toarray()
    dbscan = DBSCAN(eps = 1, min_samples=7, algorithm ='brute', metric = 'canberra', n_jobs = -1)
    dbscan.fit(dense_sample)
    cluster_dict = {'installs': vectorize_data_sample.shape[0], 'campaign_extended_name':cam, 'cluster_info': {}}
    for cl in np.unique(dbscan.labels_):
        # Rows of this label, sliced once and reused for size and distance.
        members = dense_sample[dbscan.labels_ == cl, :]
        # NOTE(review): 'dist' uses pdist's default (Euclidean) metric while
        # DBSCAN above uses canberra; confirm this is intended.
        cluster_dict['cluster_info'][cl] = {'size': members.shape[0], 'dist': np.mean(pdist(members))}
    cluster_list.append(cluster_dict)

data_df = pd.DataFrame(cluster_list)
data_df['biggest_cluster_installs'] = data_df['cluster_info'].apply(get_biggest_cluster_installs)
data_df['biggest_cluster_dist'] = data_df['cluster_info'].apply(get_biggest_cluster_dist)
data_df['biggest_cluster_share'] = data_df['biggest_cluster_installs']/data_df['installs']
# A campaign is flagged when it has at least one non-noise cluster and that
# cluster either holds more than 35% of the installs or has mean distance < 10.
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of data_df.
data_df_filtered = data_df[((data_df['biggest_cluster_share'] > .35) | (data_df['biggest_cluster_dist'] < 10)) & (data_df['biggest_cluster_share'] != 0) ].copy()
data_df_filtered['is_fraud_metrika_actions'] = 1

record_list = []
for i in range(data_df_filtered.shape[0]):
    _dict = data_df_filtered.iloc[i,:].to_dict()
    # Every value is stored as its string representation.
    _dict_copy = dict((key, str(value)) for key, value in _dict.items())
    # campaign_extended_name is '<app_name>__<media_source>__<campaign>...'.
    name_parts = _dict_copy['campaign_extended_name'].split('__')
    _dict_copy['app_name'] = name_parts[0]
    _dict_copy['media_source'] = name_parts[1]
    _dict_copy['campaign'] = name_parts[2]
    record_list.append(Record(**_dict_copy))
cluster.write('$job_root/antifraud/weeks/anomaly_campaign/anomaly_campaign_by_metrika_actions', record_list)
