from nile.api.v1 import (
    aggregators as na,
    filters as nf,
    extractors as ne,
    grouping as ng,
    clusters,
    files,
    statface,
    Record,
    Template,
    Path
)
from qb2.api.v1 import filters as sf
from qb2.api.v1 import QB2, extractors as se
import pandas as pd
import datetime
import numpy as np
import urllib2
import datetime
import re
import operator
import re
from functools import partial
def get_start_of_previous_month(x):
    """Return the first day of the month before ``x`` as a string.

    ``x`` must expose ``month``, ``year`` and ``replace`` the way
    ``datetime.date`` / ``datetime.datetime`` objects do. The result is
    whatever precedes the first space in ``str()`` of the shifted value,
    which is ``YYYY-MM-DD`` for standard date and datetime objects.
    """
    if x.month == 1:
        # January rolls back to December of the previous year.
        target_year, target_month = x.year - 1, 12
    else:
        target_year, target_month = x.year, x.month - 1
    month_start = x.replace(year=target_year, month=target_month, day=1)
    return str(month_start).split(' ')[0]
def get_browser(x):
    """Translate an app name string into a browser label.

    Returns ``'YandexBrowser'`` when the name contains ``yabro`` in any
    letter case, ``'SearchApp'`` when it contains the exact (case-sensitive)
    text ``Search App``, and ``None`` otherwise. The ``yabro`` check wins
    when both match.
    """
    if 'yabro' in x.lower():
        return 'YandexBrowser'
    if 'Search App' in x:
        return 'SearchApp'
    return None
def get_fraud_share(groups):
    """Reducer: yield the share of fraudulent searches per group.

    ``groups`` yields ``(key, records)`` pairs. ``key`` carries
    ``app_name``, ``campaign`` and ``media_source``; every record carries
    ``is_good_search`` and ``searches``. Searches of records whose
    ``is_good_search`` equals ``False`` count as fraud, all others as
    non-fraud. A ``Record`` is emitted only for groups with more than 100
    searches in total.
    """
    for key, records in groups:
        dims = dict(
            app_name=key.app_name,
            campaign=key.campaign,
            media_source=key.media_source,
        )
        # Slot 0 accumulates fraud searches, slot 1 everything else.
        totals = [0, 0]
        for rec in records:
            # ``== False`` is intentional: a missing (None) flag is not fraud.
            totals[0 if rec.is_good_search == False else 1] += rec.searches
        fraud = totals[0]
        total = totals[0] + totals[1]
        if total > 100:
            yield Record(fraud_search_share=float(fraud) / total, **dims)
# Reporting window: the most recent fully completed Monday..Sunday week.
# Every value below is derived from one "now" snapshot, so the boundaries
# cannot disagree when the script happens to run across midnight.
currentdate = datetime.datetime.now()
currentdatestr = currentdate.date().isoformat()
# Stepping back weekday() + 1 days lands on the latest Sunday strictly
# before today (a full week back when today itself is a Sunday).
_week_end = currentdate - datetime.timedelta(days=currentdate.weekday() + 1)
# That Sunday has weekday() == 6, so this is the Monday of the same week.
_week_start = _week_end - datetime.timedelta(days=_week_end.weekday())
# Both boundaries are exposed as 'YYYY-MM-DD' strings.
startdate = _week_start.date().isoformat()
enddate = _week_end.date().isoformat()
# NOTE(review): the access token is embedded in source; consider loading it
# from the environment or a secrets store instead.
_hahn_args = dict(pool='mobile-research', token='a6575e1e15b0475fb8d8564beaa60f23')
_job_root = 'home/turkey-analytics/ktereshin'

# A first cluster handle is needed only to list the request_money directory
# and take the maximum entry name as the end of the date range.
cluster = clusters.Hahn(**_hahn_args).env(templates=dict(job_root=_job_root))
cube_end_date = max(list(cluster.driver.list('statbox/cube/data/request_money')))

# Rebuild the cluster with a `dates` template spanning startdate..cube_end_date.
cluster = clusters.Hahn(**_hahn_args).env(
    templates=dict(
        job_root=_job_root,
        dates='{%s..%s}' % (startdate, cube_end_date),
    )
)
# Single pre-formatted argument: prints "startdate enddate" on one line.
print('%s %s' % (startdate, enddate))
# Job 1: count request-cube rows per (uuid, browser, os_family, is_good_search)
# over the `dates` range and store the result under $job_root.
job = cluster.job()
searches = (
    job.table('statbox/cube/data/request_money/@dates')
    .qb2(
        log='request-cube',
        fields=['browser', 'os_family', 'uuid', 'is_good_search'],
        filters=[
            # Predicate on `browser`: lower-cased value must be one of the two apps.
            sf.custom(lambda x: str(x).lower() in ['searchapp', 'yandexbrowser'], 'browser'),
            # Predicate on `uuid`: the value itself must be truthy.
            sf.custom(lambda x: x, 'uuid'),
        ],
    )
    .groupby('uuid', 'browser', 'os_family', 'is_good_search')
    .aggregate(searches=na.count())
    .put('$job_root/antifraud/weeks/fraud_searches_share/searches_uuid')
)
job.run()

# Job 2: attach a uuid to every install by inner-joining the weekly installs
# table with the install -> uuid mapping on (app_name, appsflyer_device_id).
job = cluster.job()

appsfluer = job.table('$job_root/antifraud/weeks/installs').project(
    'app_name', 'appsflyer_device_id', 'campaign', 'media_source'
)
install_uuid = job.table('$job_root/appsflyer_dash/appsflyer_installs_uuid').project(
    'app_name', 'appsflyer_device_id', 'uuid'
)

appsfluer = (
    appsfluer
    .join(install_uuid, by=['app_name', 'appsflyer_device_id'], type='inner')
    .put('$job_root/antifraud/weeks/fraud_searches_share/installs_uuid')
)
job.run()

# Job 3: label each install with a browser derived from app_name, inner-join
# it with the per-uuid search counts on (uuid, browser), then sum searches per
# (is_good_search, campaign, media_source, app_name).
job = cluster.job()

installs = job.table('$job_root/antifraud/weeks/fraud_searches_share/installs_uuid').project(
    ne.all(),
    browser=ne.custom(get_browser, 'app_name'),
)
searches = job.table('$job_root/antifraud/weeks/fraud_searches_share/searches_uuid')

installs = (
    installs
    .join(searches, by=['uuid', 'browser'], type='inner')
    .groupby('is_good_search', 'campaign', 'media_source', 'app_name')
    .aggregate(searches=na.sum('searches'))
    .put('$job_root/antifraud/weeks/fraud_searches_share/installs_searches_uuid')
)
job.run()

def get_fraud_share(groups):
    """Reducer computing the fraud search share for each group.

    Each item of ``groups`` is a ``(key, records)`` pair where ``key`` has
    ``app_name``, ``campaign`` and ``media_source`` and each record has
    ``is_good_search`` and ``searches``. Records with ``is_good_search``
    equal to ``False`` add to the fraud total; every other record adds to
    the non-fraud total. Groups with 100 searches or fewer yield nothing;
    larger groups yield one ``Record`` with ``fraud_search_share`` set to
    fraud / (fraud + non-fraud).
    """
    for key, records in groups:
        app_name, campaign, media_source = key.app_name, key.campaign, key.media_source
        flagged = 0
        clean = 0
        for row in records:
            # Equality test (not truthiness) so that None is not treated as fraud.
            if row.is_good_search == False:
                flagged += row.searches
                continue
            clean += row.searches
        volume = flagged + clean
        if volume > 100:
            yield Record(
                app_name=app_name,
                media_source=media_source,
                campaign=campaign,
                fraud_search_share=float(flagged) / volume,
            )
# Job 4: reduce the per-campaign search totals to one fraud-share row per
# (app_name, campaign, media_source) using get_fraud_share.
job = cluster.job()
installs = (
    job.table('$job_root/antifraud/weeks/fraud_searches_share/installs_searches_uuid')
    .groupby('app_name', 'campaign', 'media_source')
    .reduce(get_fraud_share)
    .put('$job_root/antifraud/weeks/fraud_searches_share/installs_searches_uuid_fraud_searches_share')
)
job.run()

# Read the result into a DataFrame and set the anomaly threshold at
# mean + 3 standard deviations of fraud_search_share.
data = installs.read().as_dataframe()
from functools import partial
_fraud_shares = data['fraud_search_share']
anomaly_threhold = np.mean(_fraud_shares) + 3 * np.std(_fraud_shares)
def get_anomaly(x, y):
    """Return 1 when ``x`` is strictly greater than ``y``, otherwise 0."""
    return 1 if x > y else 0
# Fix the threshold so the extractor only needs the fraud_search_share value.
pget_anomaly = partial(get_anomaly, y=anomaly_threhold)

# Job 5: add the 0/1 anomaly flag to every fraud-share row, keep only the
# rows where the flag equals 1 and store them as the anomalous campaigns.
job = cluster.job()
installs = (
    job.table('$job_root/antifraud/weeks/fraud_searches_share/installs_searches_uuid_fraud_searches_share')
    .project(
        ne.all(),
        fraud_search_share_anomaly=ne.custom(pget_anomaly, 'fraud_search_share'),
    )
    .filter(nf.custom(lambda x: x == 1, 'fraud_search_share_anomaly'))
    .put('$job_root/antifraud/weeks/anomaly_campaign/anomaly_campaign_by_fraud_searches_share')
)
job.run()
