import datetime
import os
import re
import StringIO
import urllib
import urllib2

import numpy as np
import pandas as pd
import requests
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
def get_start_of_previous_month(x):
    """Return the first day of the month before ``x`` as a string.

    ``x`` must expose ``year``, ``month`` and ``replace`` (as
    ``datetime.date`` and ``datetime.datetime`` do). The shifted value is
    converted with ``str()`` and cut at the first space, which yields
    ``YYYY-MM-DD`` for those two types.
    """
    # January has no preceding month in the same year: roll back to
    # December of the previous year.
    if x.month == 1:
        year, month = x.year - 1, 12
    else:
        year, month = x.year, x.month - 1
    first_day = x.replace(year=year, month=month, day=1)
    return str(first_day).split(' ')[0]
# Reporting window: the most recent complete Monday..Sunday week.
# The clock is read once so that every derived date is based on the same
# instant (separate now() calls could straddle midnight and disagree).
now = datetime.datetime.now()
# weekday() is 0 for Monday, so stepping back weekday() + 1 days lands on
# the latest Sunday strictly before today.
week_end = now - datetime.timedelta(days=now.weekday() + 1)
# From that Sunday, step back to the Monday of the same week.
week_start = week_end - datetime.timedelta(days=week_end.weekday())
startdate = week_start.strftime('%Y-%m-%d')
enddate = week_end.strftime('%Y-%m-%d')
# Date 95 days before now.
# NOTE(review): not referenced anywhere else in the visible script.
metrika_startdate = (now - datetime.timedelta(days=95)).strftime('%Y-%m-%d')

# A single pre-formatted argument prints "start end" identically whether
# print is the Python 2 statement or the print() function.
print('%s %s' % (startdate, enddate))

# Hahn cluster handle used to create every job in this script.
# The token may be supplied through the YT_TOKEN environment variable so it
# does not have to live in the source.
# NOTE(review): the literal fallback only keeps existing runs working; a
# credential committed to source should be rotated and the fallback removed.
cluster = clusters.Hahn(
    pool='mobile-research',
    token=os.environ.get('YT_TOKEN', 'a6575e1e15b0475fb8d8564beaa60f23'),
).env(
    templates=dict(
        # Referenced as '$job_root' in the table paths of the jobs.
        job_root='home/turkey-analytics/ktereshin',
        # Brace range "{start..end}" referenced as '@dates' in the log path.
        dates='{%s..%s}' % (startdate, enddate)
    )
)

# Stage 1: earliest Metrika event date per device within the '@dates' window.
job = cluster.job()
logs = job.table('statbox/metrika-mobile-log/@dates')
# Extract only the two fields that are needed and persist the raw extract.
hits = logs.qb2(
    log='metrika-mobile-log',
    fields=['device_id', 'event_date'],
).put('$job_root/antifraud/weeks/new_device_share/metrika_installs')

# Lower-case device_id BEFORE grouping. Grouping on the raw id and
# lower-casing afterwards can emit several rows for one lower-cased id
# (ids differing only in case), and the later left join on device_id would
# then duplicate install rows.
hits.project(
    'event_date',
    device_id=ne.custom(lambda raw_id: str(raw_id).lower(), 'device_id'),
).groupby('device_id').aggregate(
    min_date=na.min('event_date'),
).put('$job_root/antifraud/weeks/new_device_share/metrika_lower')
job.run()

# Stage 2: share of "new" devices per campaign / media source / app.
job = cluster.job()

# Installs keyed by the lower-cased uuid; install_date is cut at the first
# space of its string form.
apps = job.table('$job_root/antifraud/weeks/installs').project(
    ne.all(),
    device_id=ne.custom(lambda raw_uuid: str(raw_uuid).lower(), 'uuid'),
    install_date=ne.custom(lambda stamp: str(stamp).split(' ')[0], 'install_date'),
)
metrika = job.table('$job_root/antifraud/weeks/new_device_share/metrika_lower')
joi = apps.join(metrika, by='device_id', type='left')

# is_new is 0 only when a min_date exists and min_date + 1 day is still
# earlier than the install date; in every other case (including a missing
# min_date) it is 1.
flag = joi.project(
    'campaign', 'media_source', 'device_id', 'app_name',
    is_new=ne.custom(
        lambda first_seen, installed: 0 if (
            first_seen
            and datetime.datetime.strptime(first_seen, '%Y-%m-%d')
            + datetime.timedelta(days=1)
            < datetime.datetime.strptime(installed, '%Y-%m-%d')
        ) else 1,
        'min_date', 'install_date',
    ),
)

# Per group: number of new installs and total number of installs.
per_campaign = flag.groupby('campaign', 'media_source', 'app_name').aggregate(
    new=na.sum('is_new'),
    all=na.count(),
)
# share = percentage of the group's installs that were flagged as new.
per_campaign.project(
    ne.all(),
    share=ne.custom(lambda new_cnt, total: new_cnt * 100. / total, 'new', 'all'),
).sort('share', 'all').put('$job_root/antifraud/weeks/new_device_share/fin')
job.run()

# Stage 3: keep only the rows whose install count ('all') is above the
# 95th percentile of all install counts.
job = cluster.job()

fin = job.table('$job_root/antifraud/weeks/new_device_share/fin')
# Install count of every row in the table.
amount = [row.all for row in fin.read()]

# Series.quantile yields the scalar directly, which avoids a positional [0]
# lookup on a label-indexed result (deprecated in recent pandas). The input
# does not need to be sorted beforehand.
q = pd.Series(amount).quantile(.95)

fin.filter(
    nf.custom(lambda total: total > q, 'all')
).sort('share', 'all').put('$job_root/antifraud/weeks/new_device_share/fin_filter')
job.run()

# Stage 4: keep the rows whose new-device share exceeds the mean share by
# more than one standard deviation.
job = cluster.job()

filtered = job.table('$job_root/antifraud/weeks/new_device_share/fin_filter')
# 'share' value of every row in the filtered table.
perc = [row.share for row in filtered.read()]

mean = np.mean(perc)
std = np.std(perc)

# Mark rows above mean + std with 1, everything else with 0 ...
flagged = filtered.project(
    ne.all(),
    new_users_share=ne.custom(lambda share: 1 if share > mean + std else 0, 'share'),
)
# ... and store only the marked rows.
flagged.filter(
    nf.custom(lambda marker: marker == 1, 'new_users_share')
).put('$job_root/antifraud/weeks/anomaly_campaign/anomaly_campaign_by_new_users_share')
job.run()
