from nile.api.v1 import (
    aggregators as na,
    filters as nf,
    extractors as ne,
    grouping as ng,
    clusters,
    files,
    statface,
    Record,
    Template,
    Path
)
from qb2.api.v1 import filters as sf
from qb2.api.v1 import QB2, extractors as se
import pandas as pd
import datetime
import requests
import numpy as np
import time
import json
from scipy.spatial.distance import pdist, cdist, squareform
import matplotlib.pyplot as plt
import warnings
import random
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist
from collections import namedtuple
warnings.filterwarnings('ignore')

def get_start_of_previous_month(x):
    """Return the first day of the month before ``x`` as a string.

    ``x`` must expose ``year``/``month`` and a ``replace`` method (for example
    ``datetime.date`` or ``datetime.datetime``). The returned value is the part
    of ``str(shifted_value)`` that precedes the first space, i.e. ``YYYY-MM-DD``
    for the standard date/datetime types.
    """
    if x.month == 1:
        # January rolls back to December of the previous year.
        target_year, target_month = x.year - 1, 12
    else:
        target_year, target_month = x.year, x.month - 1
    first_day = x.replace(year=target_year, month=target_month, day=1)
    return str(first_day).split(' ')[0]
def load_data_from_hahn(input_path):
    """Read the table at ``input_path`` and return ``.read().as_dataframe()`` of it.

    The table is opened through a fresh job created on the module-level
    ``cluster`` object, so ``cluster`` must be defined before this is called.
    """
    return cluster.job().table(input_path).read().as_dataframe()
def get_campaign_for_analyse(df, min_devices=100):
    """Keep only rows of campaigns that have enough distinct devices.

    A campaign (``campaign_extended_name``) is kept when its number of distinct
    ``appsflyer_device_id`` values is strictly greater than ``min_devices``.

    :param df: frame with ``campaign_extended_name`` and ``appsflyer_device_id`` columns.
    :param min_devices: exclusive lower bound on distinct devices per campaign.
    :return: the rows of ``df`` that belong to the retained campaigns.
    """
    devices_per_campaign = df.groupby('campaign_extended_name')['appsflyer_device_id'].nunique()
    big_campaigns = devices_per_campaign[devices_per_campaign > min_devices].index
    # Vectorised membership test instead of scanning a Python list once per row.
    return df[df['campaign_extended_name'].isin(big_campaigns)]
def decode_column(df, column):
    """Collapse infrequent values of ``df[column]`` into the label ``'other'``.

    A value is kept only when its occurrence count is strictly greater than the
    80th percentile of all per-value occurrence counts. Every other value,
    including missing ones, is replaced by ``'other'``.

    The column is overwritten on ``df`` itself and the same frame is returned.
    """
    value_counts = df[column].value_counts()
    if value_counts.empty:
        # Empty or all-missing column: there is nothing to rank, so no value is "frequent".
        frequent_values = set()
    else:
        threshold = np.percentile(value_counts, 80)
        # A set gives O(1) membership checks in the per-row lambda below.
        frequent_values = set(value_counts[value_counts > threshold].index)
    df[column] = df[column].apply(
        lambda value: value if value in frequent_values else 'other'
    )
    return df
def decode_string_column(df, str_columns):
    """Apply ``decode_column`` to every column in ``str_columns`` except the skipped ones.

    ``'code_sec'`` and ``'day_hour'`` are left untouched. A progress line is
    printed after each column that was actually decoded. Returns the frame
    produced by the last ``decode_column`` call (or ``df`` itself if none ran).
    """
    untouched = ('code_sec', 'day_hour')
    for column in str_columns:
        if column not in untouched:
            df = decode_column(df, column)
            print('Decode done for columns %s' % (column))
    return df

def cumpute_column_share_by_campaign(df, column):
    """Return, per campaign, the share of distinct devices for each value of ``column``.

    The result is indexed by ``campaign_extended_name`` and has one column per
    distinct value of ``column``. Each cell is the number of distinct
    ``appsflyer_device_id`` for that (campaign, value) pair divided by the row
    total. Missing combinations count as 0.

    :param df: frame with ``campaign_extended_name``, ``appsflyer_device_id`` and ``column``.
    :param column: name of the categorical column to profile.
    """
    # Use the ``df`` argument: the previous version silently read the module-level
    # ``installs`` frame and ignored whatever the caller passed in.
    devices_per_value = (
        df.groupby(['campaign_extended_name', column])['appsflyer_device_id']
        .agg('nunique')
        .reset_index()
    )
    share_pivot = pd.pivot_table(
        devices_per_value,
        index='campaign_extended_name',
        columns=column,
        values='appsflyer_device_id',
    ).fillna(0)
    devices_per_campaign = share_pivot.sum(axis=1)
    # Row-wise normalisation: every campaign row is divided by its own total.
    return share_pivot.div(devices_per_campaign, axis=0)

def make_cluster_for_string_columns(df, eps):
    """Cluster the rows of ``df`` with DBSCAN and store the labels in ``df['cluster']``.

    DBSCAN is run with cosine distance, the brute-force algorithm, the given
    ``eps`` and ``n_jobs=-1``. ``df`` is modified in place (a ``cluster`` column
    is added or overwritten) and returned.
    """
    model = DBSCAN(metric='cosine', algorithm='brute', eps=eps, n_jobs=-1)
    # ``fit`` runs on the frame before the label column is attached to it.
    df['cluster'] = model.fit(df).labels_
    return df

def find_bad_clusters(df, column):
    """Return the cluster labels treated as anomalous.

    Only the label ``-1`` is reported, which is the label scikit-learn's DBSCAN
    assigns to noise points. The result is ``[-1]`` when that label occurs in
    ``df['cluster']`` and ``[]`` otherwise.

    :param df: frame with a ``cluster`` column.
    :param column: unused; kept so existing callers keep working.

    NOTE(review): the previous version also computed the mean pairwise cosine
    distance overall and per cluster but never used either value. Those O(n^2)
    computations are removed here. If a distance-based criterion was intended,
    it still has to be specified.
    """
    return [label for label in df['cluster'].unique() if label == -1]

def get_bad_campaign_name(df, bad_clusters, bad_campaign_list, column):
    """Append one entry per campaign whose cluster label is in ``bad_clusters``.

    :param df: frame indexed by campaign name (``app__media_source__campaign``)
        whose last column is ``cluster``; all preceding columns are the
        per-value shares.
    :param bad_clusters: cluster labels to report.
    :param bad_campaign_list: list to extend; it is modified in place and returned.
    :param column: name of the profiled column, stored in each entry.
    :return: ``bad_campaign_list`` with the new entries appended.

    Each entry's ``strange_values`` is a Series with the (up to) ten share
    columns that deviate most, in absolute value, from the column means.
    """
    # Every column except the trailing ``cluster`` label, selected by position.
    # The removed ``.ix`` indexer mixed label and positional lookup here.
    shares = df.iloc[:, :-1]
    mean_share = shares.mean()
    flagged = df.index[df['cluster'].isin(bad_clusters).values]
    for cam in flagged:
        name_parts = cam.split('__')
        deviation = (mean_share - shares.loc[cam]).abs().sort_values(ascending=False)
        bad_campaign_list.append(
            {
                'app_name': name_parts[0],
                'media_source': name_parts[1],
                'campaign': name_parts[2],
                'column': column,
                'strange_values': deviation.head(10),
            }
        )
    return bad_campaign_list

def get_bad_campaign_name_for_all_str_columns(df,str_columns, bad_campaign_list):
    """Run the share -> cluster -> report pipeline for every column in ``str_columns``.

    For each column the per-campaign share table is built, ``eps`` is taken as
    a percentile of the pairwise cosine distances between campaigns (30th for
    ``'code_sec'``, 50th otherwise), the campaigns are clustered, and campaigns
    in the reported clusters are appended to ``bad_campaign_list``, which is
    returned.
    """
    for column in str_columns:
        print('Start for column %s' % (column))
        shares = cumpute_column_share_by_campaign(df, column)
        percentile = 30 if column == 'code_sec' else 50
        eps = np.percentile(pdist(shares, 'cosine'), percentile)
        if not eps > 0:
            # A non-positive eps is nudged up by a small constant before clustering.
            eps = eps + 0.001
        clustered = make_cluster_for_string_columns(shares, eps)
        bad_clusters = find_bad_clusters(clustered, column)
        bad_campaign_list = get_bad_campaign_name(clustered, bad_clusters, bad_campaign_list, column)
    return bad_campaign_list

def code_seconds(second,code_matrix,code_list):
    """Return the position in ``code_list`` of the value that ``code_matrix`` maps ``second`` to.

    ``code_matrix`` is indexed with ``second``; ``code_list.index`` raises
    ``ValueError`` if the looked-up value is not present.
    """
    bin_value = code_matrix[second]
    return code_list.index(bin_value)

def make_column_with_coded_seconds(df, max_bin_size=3500):
    """Add a ``code_sec`` column that bins ``sec_between_adv_click_and_app_launch``.

    Missing delays are replaced by 0 on ``df`` itself. The distinct delay
    values, in ascending order, are then merged into consecutive bins:

    * a value whose row count exceeds ``max_bin_size`` forms its own bin;
    * otherwise the bin starts at that value and is widened while the rows
      covered so far do not exceed ``max_bin_size``.

    Each bin is identified by its first delay value, and ``code_sec`` is the
    rank of that identifier among all bin identifiers.

    :param df: frame with ``sec_between_adv_click_and_app_launch`` (numeric)
        and ``app_name`` columns; it is modified in place and returned.
    :param max_bin_size: row-count threshold used by the binning above.

    NOTE(review): ``width`` is deliberately NOT reset between bins, so a bin is
    never narrower than the widest one built before it. This mirrors the
    previous implementation exactly, which keeps existing codes stable.
    Confirm whether a reset was intended.
    """
    delay_column = 'sec_between_adv_click_and_app_launch'
    # Assign back instead of a chained in-place fillna, which does not reach the
    # frame under pandas copy-on-write.
    df[delay_column] = df[delay_column].fillna(0)

    rows_per_delay = df.groupby(delay_column)['app_name'].count()
    delays = np.asarray(rows_per_delay.index.values, dtype=float)
    counts = np.asarray(rows_per_delay.values, dtype=float)
    n_delays = len(delays)

    bin_starts = np.ones(n_delays)
    i = 0
    width = 1
    while i < n_delays:
        if counts[i] > max_bin_size:
            bin_starts[i] = delays[i]
            i += 1
        else:
            while (np.sum(counts[i:i + width + 1]) <= max_bin_size
                   and i + width + 1 < n_delays):
                width += 1
            # The slice is clipped at the end of the array automatically.
            bin_starts[i:i + width + 1] = delays[i]
            i = i + width + 1

    # Rank of each bin identifier; a dict lookup replaces list.index() per row.
    bin_code = dict((start, code) for code, start in enumerate(np.unique(bin_starts)))
    delay_to_code = dict(
        (delay, bin_code[start]) for delay, start in zip(delays, bin_starts)
    )
    df['code_sec'] = [delay_to_code[value] for value in df[delay_column]]
    return df

def get_bad_campaign_by_ip(df, n_sigma=3):
    """Return campaigns with an unusually high devices-per-IP ratio.

    For every ``campaign_extended_name`` the number of distinct
    ``appsflyer_device_id`` is divided by the number of distinct ``ip``. A
    campaign is reported when that ratio is strictly greater than
    ``mean + n_sigma * std`` of the ratio over all campaigns (population
    standard deviation, ``ddof=0``).

    :param df: frame with ``campaign_extended_name``, ``appsflyer_device_id``, ``ip``.
    :param n_sigma: number of standard deviations above the mean used as the cut-off.
    :return: 2-D object array with the columns
        ``[campaign_extended_name, distinct devices, distinct ips, did_by_ip]``.
    """
    key = 'campaign_extended_name'
    per_campaign = df.groupby(key)
    stats = pd.merge(
        per_campaign['appsflyer_device_id'].nunique().reset_index(),
        per_campaign['ip'].nunique().reset_index(),
        on=key,
    )
    stats['did_by_ip'] = stats['appsflyer_device_id'] / stats['ip']
    ratio = stats['did_by_ip']
    threshold = ratio.mean() + n_sigma * ratio.std(ddof=0)
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    return stats[ratio > threshold].values

def add_bad_ip_campaing_to_list(matrix, bad_campaign_list):
    """Append one ``did_by_ip`` entry per row of ``matrix`` to ``bad_campaign_list``.

    :param matrix: 2-D array whose rows hold the campaign name
        (``app__media_source__campaign``) at position 0 and the devices-per-IP
        ratio at position 3.
    :param bad_campaign_list: list to extend; modified in place and returned.
    """
    # Iterate rows directly (no Python-2-only ``xrange``) and split each name once.
    for row in matrix:
        name_parts = row[0].split('__')
        bad_campaign_list.append(
            {
                'app_name': name_parts[0],
                'media_source': name_parts[1],
                'campaign': name_parts[2],
                'column': 'did_by_ip',
                'strange_values': row[3],
            }
        )
    return bad_campaign_list
def prepare_data_to_loading(bad_campaign_list):
    """Turn the list of anomaly entries into a detail frame and a per-campaign flag table.

    Returns a pair ``(details, flags)``:

    * ``details`` - one row per entry, with an extra ``anomaly`` column set to 1;
    * ``flags`` - one row per (``app_name``, ``media_source``, ``campaign``),
      one column per distinct ``column`` value, holding the pivoted ``anomaly``
      value, with missing combinations filled with 0.
    """
    details = pd.DataFrame(bad_campaign_list)
    details['anomaly'] = 1
    flags = details.pivot_table(
        values='anomaly',
        index=['app_name', 'media_source', 'campaign'],
        columns=['column'],
    )
    flags = flags.reset_index()
    flags = flags.fillna(0)
    return details, flags
def make_records(bad_campaign_df, bad_campaign_df_pivot):
    """Build one ``Record`` per row of the per-campaign flag table.

    Each record carries every column of the pivot row plus
    ``anomaly_values_dict``: the ``str()`` of a dict that maps each anomalous
    ``column`` of that campaign to its ``strange_values`` (Series values are
    converted with ``to_dict()``, anything else is stored as is).

    :param bad_campaign_df: detail frame with ``app_name``, ``media_source``,
        ``campaign``, ``column`` and ``strange_values`` columns.
    :param bad_campaign_df_pivot: flag table with ``app_name``,
        ``media_source`` and ``campaign`` columns plus one column per anomaly type.
    :return: list of ``Record`` objects, in pivot row order.
    """
    key_columns = ['app_name', 'media_source', 'campaign']

    # Group the detail rows by campaign in a single pass instead of re-filtering
    # the whole detail frame for every pivot row.
    details_by_campaign = {}
    detail_rows = zip(*[
        bad_campaign_df[name] for name in key_columns + ['column', 'strange_values']
    ])
    for app_name, media_source, campaign, column, value in detail_rows:
        if isinstance(value, pd.Series):
            value = value.to_dict()
        campaign_details = details_by_campaign.setdefault(
            (app_name, media_source, campaign), {}
        )
        campaign_details[column] = value

    result_list = []
    for _, pivot_row in bad_campaign_df_pivot.iterrows():
        # Work on a plain dict so the pivot frame itself is never written to.
        fields = pivot_row.to_dict()
        key = tuple(fields[name] for name in key_columns)
        fields['anomaly_values_dict'] = str(details_by_campaign.get(key, {}))
        result_list.append(Record(**fields))
    return result_list

def upload_data_on_hahn(path, result_list):
    """Write ``result_list`` to ``path`` through the module-level ``cluster`` object.

    ``cluster`` must be defined before this function is called.
    """
    cluster.write(path, result_list)

# Reporting window: the most recent complete Monday..Sunday week.
# ``now`` is read once so that both bounds derive from the same instant.
now = datetime.datetime.now()
enddate = now - datetime.timedelta(days=now.weekday() + 1)  # last Sunday strictly before today
startdate = enddate - datetime.timedelta(days=enddate.weekday())  # Monday of that same week
startdate = str(startdate).split(' ')[0]
enddate = str(enddate).split(' ')[0]
# SECURITY(review): the access token below is committed in source. It should be
# rotated and loaded from the environment or a secrets store instead.
# NOTE(review): the ``dates`` template is not referenced by any path visible in
# this file.
cluster = clusters.Hahn(pool='mobile-research', token='a6575e1e15b0475fb8d8564beaa60f23').env(
    templates=dict(
        job_root='home/turkey-analytics/ktereshin',
        dates='{%s..%s}' % (startdate, enddate)
    )
)
print('%s %s' % (startdate, enddate))

# 1. Load installs and keep only campaigns with enough distinct devices.
installs = load_data_from_hahn('$job_root/antifraud/weeks/installs')
installs = get_campaign_for_analyse(installs)
# 2. Derive the binned click-to-launch delay and collapse rare categorical values.
installs = make_column_with_coded_seconds(installs)
str_columns = ['wifi', 'region_by_ip', 'os_version', 'operator', 'device_model', 'device_brand', 'country_code', 'code_sec', 'day_hour']
installs = decode_string_column(installs, str_columns)
# 3. Collect anomalous campaigns: per-column clustering first, then devices-per-IP outliers.
bad_campaign_list = []
bad_campaign_list = get_bad_campaign_name_for_all_str_columns(installs,str_columns, bad_campaign_list)
bad_campaign_by_ip = get_bad_campaign_by_ip(installs)
bad_campaign_list = add_bad_ip_campaing_to_list(bad_campaign_by_ip, bad_campaign_list)
# 4. Reshape the findings into records and upload them.
bad_campaign_df, bad_campaign_df_pivot = prepare_data_to_loading(bad_campaign_list)
result_list = make_records(bad_campaign_df, bad_campaign_df_pivot)
upload_data_on_hahn('$job_root/antifraud/weeks/anomaly_campaign/anomaly_campaign_by_meta_data', result_list)
