from nile.api.v1 import (
    aggregators as na,
    filters as nf,
    extractors as ne,
    grouping as ng,
    clusters,
    files,
    statface,
    Record,
    Template,
    Path
)

from qb2.api.v1 import filters as sf
from qb2.api.v1 import QB2, extractors as se
import pandas as pd
import datetime
import requests
import numpy as np
import time
import json
from scipy.spatial.distance import pdist, cdist, squareform
import matplotlib.pyplot as plt
import warnings
import random
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist
# Silence every warning category for the rest of the process. This also hides
# pandas/numpy/sklearn deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')
def get_start_of_previous_month(x):
    """Return the first day of the month preceding ``x`` as a date string.

    ``x`` is a ``date``/``datetime``-like object exposing ``year``, ``month``
    and ``replace``. The result is the part of ``str(...)`` before the first
    space, i.e. ``'YYYY-MM-DD'`` for standard date and datetime objects.
    """
    # January rolls back to December of the previous year.
    if x.month == 1:
        target_year, target_month = x.year - 1, 12
    else:
        target_year, target_month = x.year, x.month - 1
    first_day = x.replace(year=target_year, month=target_month, day=1)
    return str(first_day).split(' ')[0]

def sec_delta(x, y):
    """Return the signed number of seconds elapsed from ``y`` to ``x``.

    Both arguments are converted with ``str()`` and parsed as
    ``'%Y-%m-%d %H:%M:%S'`` timestamps.

    Returns:
        ``(x - y)`` in seconds as a float, or ``None`` when either value does
        not parse as a timestamp in that format (for example ``None`` or an
        empty string).
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    try:
        later = datetime.datetime.strptime(str(x), timestamp_format)
        earlier = datetime.datetime.strptime(str(y), timestamp_format)
    except (TypeError, ValueError):
        # Best-effort by design: malformed timestamps yield None. Only parsing
        # errors are caught so that KeyboardInterrupt/SystemExit still
        # propagate (the previous bare ``except`` swallowed them).
        return None
    return (later - earlier).total_seconds()

def get_appsflyer_search_event(input_path, output_path):
    """Build the per-device search-event table and store it at ``output_path``.

    Reads AppsFlyer log rows from ``input_path`` using the module-level
    ``cluster``, keeps events whose name contains ``'search'`` for a fixed
    list of app ids, with ``install_time`` not earlier than the module-level
    ``startdate``, a non-empty device id, and either a non-organic media
    source or an agency different from ``'null'``.

    The events are left-joined with ``$job_root/antifraud/weeks/installs`` on
    ``appsflyer_device_id``; the number of seconds between install and search
    is computed with ``sec_delta`` and only rows with a non-negative delta are
    kept. The result is sorted and written to ``output_path``.

    Previously both arguments were ignored in favour of hard-coded paths;
    they are now honoured. The caller in this file passes the same paths that
    used to be hard-coded.

    Args:
        input_path: table path (templates allowed) of the AppsFlyer log.
        output_path: table path (templates allowed) the result is written to.
    """
    job = cluster.job()
    search_events = job.table(input_path) \
        .filter(
            nf.custom(lambda x: 'search' in x.lower(), 'event_name'),
            nf.custom(lambda x: x >= startdate, 'install_time'),
            nf.custom(lambda x: x != '' and x is not None, 'appsflyer_device_id'),
            nf.custom(lambda x: x in ['ru.yandex.searchplugin', 'id1050704155', 'com.yandex.browser', 'id574939428', 'id483693909'], 'app_id'),
            nf.or_(
                nf.custom(lambda x: x not in ['organic'], 'media_source'),
                nf.custom(lambda x: x != 'null', 'agency')
            )
        ) \
        .project(
            'appsflyer_device_id',
            'event_time',
            # Every matching event name is normalised to the literal 'search'.
            event_name=ne.custom(lambda x: 'search', 'event_name')
        )

    installs = job.table('$job_root/antifraud/weeks/installs').project(
        'appsflyer_device_id', 'install_date', 'campaign_extended_name', 'app_name'
    )

    # sec_delta returns None when a timestamp cannot be parsed (e.g. no
    # matching install row after the left join). Under Python 2 ordering
    # ``None >= 0`` is False, so such rows are dropped by the filter below.
    search_events = search_events \
        .join(installs, by='appsflyer_device_id', type='left') \
        .project(
            ne.all(),
            sec_between_install_and_search=ne.custom(sec_delta, 'event_time', 'install_date')
        ) \
        .filter(
            nf.custom(lambda x: x >= 0, 'sec_between_install_and_search')
        ) \
        .sort('app_name', 'appsflyer_device_id', 'event_time')

    search_events.put(output_path)
    job.run()

def load_search_data_from_hahn(input_path):
    """Read the table at ``input_path`` and return it as a pandas DataFrame.

    Uses the module-level ``cluster``. The argument was previously ignored in
    favour of a hard-coded path; the caller in this file passes that same
    path, so its behaviour is unchanged.
    """
    job = cluster.job()
    return job.table(input_path).read().as_dataframe()

def code_seconds(second, code_matrix, code_list):
    """Return the position in ``code_list`` of the value ``second`` maps to.

    ``code_matrix`` is a mapping looked up with ``second``; the mapped value
    is then located in ``code_list`` with ``list.index`` (raising
    ``KeyError``/``ValueError`` if either lookup fails).
    """
    mapped_value = code_matrix[second]
    return code_list.index(mapped_value)

def make_column_with_coded_seconds(df, threshold=3500):
    """Add a ``code_sec`` column that bins ``sec_between_install_and_search``.

    Rows are counted per distinct second value (non-null ``app_name`` count)
    and consecutive second values, in ascending order, are merged into bins:

    * a second value with more than ``threshold`` rows forms its own bin;
    * otherwise following second values are appended until the running total
      exceeds ``threshold`` (the value that crosses the limit is still part
      of the bin) or the data ends. Such a bin always tries to take at least
      the next second value as well.

    ``code_sec`` is the zero-based ordinal of the bin, bins being ordered by
    their first second value.

    Fixes relative to the previous implementation:

    * the look-ahead counter is reset for every bin. It used to be
      initialised once and never reset, so each bin was forced to span at
      least as many second values as the previous one regardless of counts;
    * ``DataFrame.as_matrix`` (removed in pandas 1.0) and ``xrange`` are no
      longer used;
    * the bin ordinal is looked up in a dict instead of ``list.index``.

    Args:
        df: DataFrame with ``sec_between_install_and_search`` and
            ``app_name`` columns. It is modified in place.
        threshold: row-count limit per bin (default 3500, the value that used
            to be hard-coded).

    Returns:
        The same ``df`` object with the ``code_sec`` column added.
    """
    per_second = df.groupby(['sec_between_install_and_search'])['app_name'].count()
    seconds = per_second.index.values  # groupby sorts keys ascending
    events = per_second.values
    total = len(seconds)

    bin_start_of = {}
    i = 0
    while i < total:
        if events[i] > threshold:
            last = i
        else:
            j = 1  # look-ahead width, reset for every new bin
            while events[i:i + j + 1].sum() <= threshold and i + j + 1 < total:
                j += 1
            last = min(i + j, total - 1)
        for row in range(i, last + 1):
            bin_start_of[seconds[row]] = seconds[i]
        i = last + 1

    bin_starts = sorted(set(bin_start_of.values()))
    ordinal_of = {start: position for position, start in enumerate(bin_starts)}
    df['code_sec'] = [
        ordinal_of[bin_start_of[value]]
        for value in df['sec_between_install_and_search']
    ]
    return df

def check_campaign_size(df, cam):
    """Return the number of distinct ``appsflyer_device_id`` values in ``df``.

    ``cam`` is accepted for signature compatibility and is not used.
    """
    distinct_ids = df['appsflyer_device_id'].drop_duplicates()
    return len(distinct_ids)

def get_sample_id(df, cam, sample_size=5000):
    """Pivot per-device search counts into one row per device.

    When ``df`` holds more than ``sample_size`` distinct device ids, a random
    sample of ``sample_size`` ids is drawn (``random.sample``) and only their
    rows are pivoted; otherwise all rows are used.

    The pivot has ``app_name``, ``appsflyer_device_id`` and
    ``campaign_extended_name`` as leading columns, followed by one column per
    ``code_sec`` value holding ``searches`` (pivot_table's default
    aggregation), with missing combinations filled with 0.

    Changes relative to the previous implementation: the duplicated pivot
    code is shared, rows are selected with ``Series.isin`` instead of a
    per-row membership test against a 5000-element list, and the population
    handed to ``random.sample`` is a list (sampling from a set is rejected by
    Python 3.11+).

    Args:
        df: DataFrame with the columns named above plus ``code_sec`` and
            ``searches``.
        cam: campaign name; accepted for signature compatibility, not used.
        sample_size: maximum number of distinct devices kept (default 5000,
            the value that used to be hard-coded).

    Returns:
        The pivoted DataFrame.
    """
    device_ids = df['appsflyer_device_id'].unique()
    if len(device_ids) > sample_size:
        sampled_ids = random.sample(list(device_ids), sample_size)
        df = df[df['appsflyer_device_id'].isin(sampled_ids)]
    pivot = pd.pivot_table(
        df,
        index=['app_name', 'appsflyer_device_id', 'campaign_extended_name'],
        columns='code_sec',
        values='searches'
    )
    return pivot.reset_index().fillna(0)

def make_clustering(df):
    """Cluster the rows of ``df`` with DBSCAN and store labels in ``cluster``.

    The feature matrix is every column from positional index 4 onwards.
    DBSCAN runs with cosine distance, ``eps=0.9``, ``min_samples=7``, the
    brute-force neighbour search and all available cores. ``df`` is modified
    in place and also returned.
    """
    # NOTE(review): the caller in this file passes the pivot produced by
    # get_sample_id, which has three identifier columns; offset 4 therefore
    # also skips the first code_sec column - confirm this is intended.
    features = df.iloc[:, 4:]
    model = DBSCAN(
        eps=0.9,
        min_samples=7,
        algorithm='brute',
        metric='cosine',
        n_jobs=-1,
    )
    model.fit(features)
    df['cluster'] = model.labels_
    return df

def get_cluster_for_analyse(df):
    """Keep only the rows that belong to sufficiently large clusters.

    A cluster label qualifies when it covers strictly more than 5% of the
    rows of ``df``.

    Returns:
        A tuple ``(filtered_df, labels)`` where ``labels`` is the list of
        qualifying cluster labels (most frequent first) and ``filtered_df``
        contains the rows of ``df`` carrying one of those labels.
    """
    cluster_share = df['cluster'].value_counts() / df.shape[0]
    clusters_for_analyse = list(cluster_share[cluster_share > 0.05].index)
    # Vectorised membership test instead of a per-row Python lambda.
    return df[df['cluster'].isin(clusters_for_analyse)], clusters_for_analyse

def find_dence_cluster(df, clusters_for_analyse, bad_campaign_list, cam, threhold):
    """Append one report entry per analysed cluster of campaign ``cam``.

    For every label in ``clusters_for_analyse`` the mean pairwise cosine
    distance (NaNs ignored) is computed over the cluster's rows, using the
    columns from positional index 4 up to, but excluding, the last one. A
    cluster whose mean distance is below ``threhold`` is reported as an
    anomaly (``af_searches_anomaly`` = 1) together with its distinct device
    ids and a line is printed; otherwise it is reported with flag 0 and an
    empty device list.

    The print uses the parenthesised single-argument form, which emits the
    same text under Python 2 and is also valid Python 3. The two previously
    duplicated dict literals are built in one place.

    Args:
        df: clustered pivot with ``cluster`` and ``appsflyer_device_id``
            columns.
        clusters_for_analyse: iterable of cluster labels to inspect.
        bad_campaign_list: list the entries are appended to (mutated).
        cam: campaign name of the form ``app__media_source__campaign``; it is
            split on ``'__'`` and the first three parts are reported.
        threhold: distance limit below which a cluster counts as dense.

    Returns:
        ``bad_campaign_list`` (the same object that was passed in).
    """
    name_parts = cam.split('__')
    for cluster_label in clusters_for_analyse:
        cluster_df = df[df['cluster'] == cluster_label]
        dist = np.nanmean(pdist(cluster_df.iloc[:, 4:-1], 'cosine'))
        is_dense = dist < threhold
        if is_dense:
            bad_device_ids = list(cluster_df['appsflyer_device_id'].unique())
        else:
            bad_device_ids = []
        bad_campaign_list.append(
            {
                'app name': name_parts[0],
                'media_source': name_parts[1],
                'campaign': name_parts[2],
                'cluster': cluster_label,
                'bad_device_id': bad_device_ids,
                'distance': dist,
                'af_searches_anomaly': 1 if is_dense else 0
            }
        )
        if is_dense:
            print('Campaign: %s, Cluster: %s, installs: %s, dist: %s' % (cam, cluster_label, cluster_df.shape[0], dist))

    return bad_campaign_list

def get_bad_campaign_and_device_id(searches, app_name, bad_campaign_list):
    """Run the dense-cluster check for every campaign of every app.

    For each app in ``app_name`` the campaigns found in ``searches`` are
    visited in descending row-count order. Campaigns with fewer than 50
    distinct devices are skipped; the others are pivoted, clustered, reduced
    to clusters worth analysing and checked with a distance threshold of 0.9.

    Returns:
        ``bad_campaign_list`` as returned by the last ``find_dence_cluster``
        call (or unchanged when no campaign qualified).
    """
    for current_app in app_name:
        app_rows = searches[searches['app_name'] == current_app]
        campaign_names = app_rows['campaign_extended_name'].value_counts().index
        for cam in campaign_names:
            campaign_rows = app_rows[app_rows['campaign_extended_name'] == cam]
            if check_campaign_size(campaign_rows, cam) >= 50:
                pivot = get_sample_id(campaign_rows, cam)
                pivot = make_clustering(pivot)
                pivot, candidate_clusters = get_cluster_for_analyse(pivot)
                bad_campaign_list = find_dence_cluster(
                    pivot, candidate_clusters, bad_campaign_list, cam, 0.9
                )
    return bad_campaign_list

def aggregate_searches_by_coded_seconds(df):
    """Count search events per app, device, campaign and ``code_sec`` bin.

    Returns a DataFrame with the four grouping columns followed by a
    ``searches`` column holding the number of non-null
    ``sec_between_install_and_search`` values in each group.
    """
    group_keys = ['app_name', 'appsflyer_device_id', 'campaign_extended_name', 'code_sec']
    per_group = df.groupby(group_keys)['sec_between_install_and_search'].count()
    flat = per_group.reset_index()
    return flat.rename(columns={'sec_between_install_and_search': 'searches'})
def prepare_data_to_loading(bad_campaign_list):
    """Turn the per-cluster report into a frame plus a per-campaign matrix.

    Args:
        bad_campaign_list: list of dicts with the keys ``'app name'``,
            ``'media_source'``, ``'campaign'``, ``'cluster'``,
            ``'bad_device_id'``, ``'distance'`` and
            ``'af_searches_anomaly'``.

    Returns:
        A tuple ``(bad_campaign_df, grouped)``: the DataFrame built from the
        list, and a 2-D array with one row per (app name, media_source,
        campaign) whose fourth column is 1 when any cluster of the campaign
        was flagged and 0 otherwise.

    Changes relative to the previous implementation: ``.values`` replaces
    ``as_matrix()`` (removed in pandas 1.0), a rename that mapped a column to
    its own name is dropped, and an empty input now yields an empty frame and
    a ``(0, 4)`` array instead of raising ``KeyError``.
    """
    key_columns = ['app name', 'media_source', 'campaign']
    bad_campaign_df = pd.DataFrame(bad_campaign_list)
    if bad_campaign_df.empty:
        all_columns = key_columns + ['cluster', 'bad_device_id', 'distance', 'af_searches_anomaly']
        return pd.DataFrame(columns=all_columns), np.empty((0, 4), dtype=object)

    grouped = bad_campaign_df.groupby(key_columns)['af_searches_anomaly'].sum().reset_index()
    # Collapse the per-campaign anomaly count into a 0/1 flag.
    grouped['af_searches_anomaly'] = (grouped['af_searches_anomaly'] > 0).astype(int)
    return bad_campaign_df, grouped.values
def make_records(bad_campaign_df, bad_campaign_df_grouped):
    """Build one ``Record`` per campaign row of ``bad_campaign_df_grouped``.

    Args:
        bad_campaign_df: per-cluster DataFrame with ``'app name'``,
            ``'media_source'``, ``'campaign'``, ``'cluster'``, ``'distance'``
            and ``'bad_device_id'`` (a list per row) columns.
        bad_campaign_df_grouped: 2-D array whose columns are app name, media
            source, campaign and the 0/1 anomaly flag.

    Returns:
        A list of ``Record`` objects. ``bad_af_did_values_dict`` is the
        ``str()`` of the concatenated device-id lists of the campaign's
        clusters and ``cluster_dist_value`` is the ``str()`` of the
        ``cluster``/``distance`` columns converted with ``to_dict()``.

    Changes relative to the previous implementation: ``.values`` replaces
    ``as_matrix()`` (removed in pandas 1.0), rows are iterated directly
    instead of through ``xrange`` indices, and device ids are accumulated
    with ``list.extend`` rather than repeated list concatenation.
    """
    result_list = []
    for grouped_row in bad_campaign_df_grouped:
        app, media_source, campaign, anomaly_flag = (
            grouped_row[0], grouped_row[1], grouped_row[2], grouped_row[3]
        )
        campaign_rows = bad_campaign_df[
            (bad_campaign_df['app name'] == app)
            & (bad_campaign_df['media_source'] == media_source)
            & (bad_campaign_df['campaign'] == campaign)
        ]
        device_ids = []
        for cluster_ids in campaign_rows['bad_device_id'].values:
            device_ids.extend(cluster_ids)
        result_list.append(
            Record(
                app_name=app,
                media_source=media_source,
                campaign=campaign,
                af_searches_anomaly=anomaly_flag,
                bad_af_did_values_dict=str(device_ids),
                cluster_dist_value=str(campaign_rows[['cluster', 'distance']].to_dict())
            )
        )
    return result_list

def upload_data_on_hahn(path, result_list):
    """Write ``result_list`` to ``path`` through the module-level ``cluster``.

    In this file ``result_list`` is the list of ``Record`` objects produced
    by ``make_records`` and ``path`` is a templated table path.
    """
    cluster.write(path, result_list)
def get_bad_campaign_log(df, df_campaign):
    """Dump the rows of flagged and "portal" campaigns to per-app CSV files.

    A campaign is selected when it is flagged in ``df_campaign`` (column 3
    equals 1; its name is rebuilt as ``col0__col1__col2``) or when its
    ``campaign_extended_name`` in ``df`` contains ``'portal'``
    (case-insensitive). For every app present in the selection a
    tab-separated file ``<app name with spaces replaced by '_'>_log.csv`` is
    written to the current working directory.

    Changes relative to the previous implementation: the campaign names are
    computed without assigning a column onto a filtered slice (which relied
    on chained assignment), and rows are selected with ``Series.isin``
    instead of a per-row lambda.

    Args:
        df: DataFrame with ``campaign_extended_name`` and ``app_name``.
        df_campaign: DataFrame with integer column labels 0..3 holding app
            name, media source, campaign and the 0/1 anomaly flag.
    """
    flagged = df_campaign[df_campaign[3] == 1]
    flagged_names = (flagged[0] + '__' + flagged[1] + '__' + flagged[2]).unique()
    portal_names = [
        name for name in df['campaign_extended_name'].unique()
        if 'portal' in name.lower()
    ]
    wanted_campaigns = list(flagged_names) + portal_names

    selected = df[df['campaign_extended_name'].isin(wanted_campaigns)]
    for name in selected['app_name'].unique():
        selected[selected['app_name'] == name].to_csv(
            name.replace(' ', '_') + '_log.csv', sep='\t', index=False
        )

import os

# ---------------------------------------------------------------------------
# Script entry: analyse the most recent complete Monday..Sunday week.
# ---------------------------------------------------------------------------
# enddate is the latest Sunday strictly before today; startdate is the Monday
# of that same week. The current time is read once so both dates are derived
# from the same instant.
enddate = datetime.datetime.now()
enddate = enddate - datetime.timedelta(days=enddate.weekday() + 1)
startdate = enddate - datetime.timedelta(days=enddate.weekday())
startdate = str(startdate).split(' ')[0]
enddate = str(enddate).split(' ')[0]

# SECURITY NOTE(review): an access token is hard-coded in this file. It can
# now be overridden through the YT_TOKEN environment variable; the literal is
# kept only as a fallback so existing deployments keep working.
# TODO: rotate this token and remove the literal from source control.
cluster = clusters.Hahn(
    pool='mobile-research',
    token=os.environ.get('YT_TOKEN', 'a6575e1e15b0475fb8d8564beaa60f23')
).env(
    templates=dict(
        job_root='home/turkey-analytics/ktereshin',
        dates='{%s..%s}' % (startdate, enddate)
    )
)

# Single-argument parenthesised prints emit the same text under Python 2 and
# are also valid Python 3.
print('%s %s' % (startdate, enddate))
get_appsflyer_search_event('statbox/extdata-apps-flyer-log/@dates', '$job_root/antifraud/weeks/af_search')
print('Done get_appsflyer_search_event function \n')
searches = load_search_data_from_hahn('$job_root/antifraud/weeks/af_search')
print('Done load_search_data_from_hahn function \n')
searches = make_column_with_coded_seconds(searches)
print('Done make_column_with_coded_seconds function \n')
searches = aggregate_searches_by_coded_seconds(searches)
print('Done aggregate_searches_by_coded_seconds function \n======================================\n\n')
app_name = list(searches['app_name'].unique())
bad_campaign_list = []
print('Start get_bad_campaign_and_device_id function\n')
bad_campaign_list = get_bad_campaign_and_device_id(searches, app_name, bad_campaign_list)
print('Done get_bad_campaign_and_device_id function \n')
bad_campaign_df, bad_campaign_df_grouped = prepare_data_to_loading(bad_campaign_list)
result_list = make_records(bad_campaign_df, bad_campaign_df_grouped)
upload_data_on_hahn('$job_root/antifraud/weeks/anomaly_campaign/anomaly_campaign_by_as_searches', result_list)
get_bad_campaign_log(searches, pd.DataFrame(bad_campaign_df_grouped))
