from nile.api.v1 import (
    aggregators as na,
    filters as nf,
    extractors as ne,
    grouping as ng,
    clusters,
    files,
    statface,
    Record,
    Template,
    Path
)
from qb2.api.v1 import filters as sf
from qb2.api.v1 import QB2, extractors as se
import pandas as pd
import datetime
import requests
import numpy as np
import time
import warnings
import urllib
import urllib2
from StringIO import StringIO
warnings.filterwarnings('ignore')
def get_start_of_previous_month(x):
    """Return the first day of the month before ``x`` as a string.

    ``x`` must expose ``year`` and ``month`` and a ``replace`` method that
    accepts ``year``, ``month`` and ``day`` (for example a
    ``datetime.datetime`` or ``datetime.date``). The returned value is
    ``str()`` of the shifted object cut at the first space, i.e. the
    ``YYYY-MM-DD`` part for the standard datetime types.
    """
    if x.month == 1:
        # January rolls back to December of the previous year.
        target_year, target_month = x.year - 1, 12
    else:
        target_year, target_month = x.year, x.month - 1
    first_day = x.replace(year=target_year, month=target_month, day=1)
    return str(first_day).split(' ')[0]
# Reporting window: the most recently completed Monday..Sunday week.
# The clock is read once so that both offsets below are derived from the
# same instant (two separate now() calls could fall on different days when
# the script starts right around midnight).
_now = datetime.datetime.now()
# Latest Sunday strictly before today (weekday(): Monday == 0).
enddate = _now - datetime.timedelta(days=_now.weekday() + 1)
# Monday of the week that ends on that Sunday.
startdate = enddate - datetime.timedelta(days=enddate.weekday())
# Keep only the YYYY-MM-DD part; the rest of the script works with strings.
startdate = str(startdate).split(' ')[0]
enddate = str(enddate).split(' ')[0]
# Timeout handed to urllib2.urlopen for every ClickHouse request.
connection_timeout = 1500
# ClickHouse HTTP endpoints; get_clickhouse_data only uses host[0].
host = ['http://mtmega.yandex.ru:8123', 'http://mtmega.yandex.ru:8123']
# NOTE(review): the access token is hard-coded in source; consider loading
# it from the environment or a secrets store instead.
cluster = clusters.Hahn(pool='mobile-research', token='a6575e1e15b0475fb8d8564beaa60f23').env(
    templates=dict(
        job_root='home/turkey-analytics/ktereshin',
        dates='{%s..%s}' % (startdate, enddate)
    )
)
def get_app_name(x):
    """Translate an application identifier into its display name.

    The identifier is compared by its ``str()`` form. Any value other than
    the four identifiers listed below yields ``'YaBro iPhone'``.
    """
    known_apps = {
        'ru.yandex.searchplugin': 'Search App Android',
        'id1050704155': 'Search App IOS',
        'com.yandex.browser': 'YaBro Android',
        'id574939428': 'YaBro iPad',
    }
    return known_apps.get(str(x), 'YaBro iPhone')
def merge_installs_and_cripta(cluster):
    """Build the installs table joined with Crypta yandexuids.

    Reads the AppsFlyer log for the ``@dates`` range, keeps the install
    rows that pass the filters below, normalises the device id, media
    source and campaign, left-joins the rows with the Crypta devid/yuid
    dictionary on ``advertising_id`` and writes the result to
    ``$job_root/antifraud/weeks/share_of_bad_sites/installs_advertising_id``.

    Uses the module-level ``startdate`` string and ``get_app_name``.
    """
    job = cluster.job()

    def is_tracked_app(x):
        # Only these application ids are kept.
        return x in (
            'ru.yandex.searchplugin', 'id1050704155', 'com.yandex.browser',
            'id574939428', 'id483693909',
        )

    def pick_device_id(x, y):
        # Prefer advertising_id (x); fall back to idfa (y) when it is empty.
        return str(x or y).lower()

    def pick_media_source(x, y):
        # Use the agency (y) when media_source (x) is the literal 'null'.
        if x == 'null':
            return y.lower()
        return x.lower()

    installs = (
        job.table('statbox/extdata-apps-flyer-log/@dates')
        .filter(
            nf.custom(lambda x: x != 'null', 'campaign'),
            nf.custom(lambda x: x == 'install', 'event_type'),
            nf.custom(lambda x: x >= startdate, 'install_time'),
            nf.custom(lambda x: x not in ('', None), 'appsflyer_device_id'),
            nf.custom(is_tracked_app, 'app_id'),
            # Keep a row when it is non-organic or has an agency set.
            nf.or_(
                nf.custom(lambda x: str(x).lower() != 'organic', 'media_source'),
                nf.custom(lambda x: x != 'null', 'agency')
            ),
            # At least one of the two device identifiers must be present.
            nf.custom(lambda x, y: bool(x or y), 'advertising_id', 'idfa')
        )
        .project(
            advertising_id=ne.custom(pick_device_id, 'advertising_id', 'idfa'),
            media_source=ne.custom(pick_media_source, 'media_source', 'agency'),
            campaign=ne.custom(lambda x: str(x).lower(), 'campaign'),
            install_date='install_time',
            app_name=ne.custom(get_app_name, 'app_id')
        )
    )

    crypta = (
        job.table('home/crypta/production/state/graph/dicts/dev_yuid_indevice_perfect_no_limit')
        .project(
            advertising_id=ne.custom(lambda x: str(x).lower(), 'devid'),
            yandexuid='yuid'
        )
    )

    # NOTE(review): neither projection above produces an is_visit_bad_site
    # column, so the extractor below presumably only ever sees an
    # absent/empty value - confirm how nile treats the missing field.
    (
        installs
        .join(crypta, by='advertising_id', type='left')
        .project(
            ne.all(),
            is_visit_bad_site=ne.custom(lambda x: int(x) if x else 0, 'is_visit_bad_site')
        )
        .put('$job_root/antifraud/weeks/share_of_bad_sites/installs_advertising_id')
    )
    job.run()
def get_clickhouse_data(params, host, connection_timeout, date, at, max_attempts=15):
    """Run a ClickHouse query over HTTP, retrying on failures.

    Args:
        params: dict of URL query parameters; must contain ``'query'``.
        host: sequence of endpoint URLs; only ``host[0]`` is used.
        connection_timeout: timeout passed to ``urllib2.urlopen``.
        date: label that only appears in the progress messages.
        at: "sample" label that only appears in the progress messages.
        max_attempts: upper bound on attempts for every retryable failure,
            including an error reported inside the response body and a
            connection error without an HTTP body.

    Returns:
        A ``pandas.DataFrame`` parsed from the tab-separated response
        (no header row, integer column labels) when the query starts with
        ``select`` and data came back. ``None`` when the response body is
        empty, the query is not a select, the server reports a syntax
        error / missing object, or the attempts are exhausted.
    """
    query_get = urllib.urlencode(params)
    url = host[0] + '?' + query_get
    req = urllib2.Request(url)
    i = 0
    while True:
        i = i + 1
        print('Attempt: %s, query time:%s, date: %s, sample: %s, url: %s' % (
            i, str(datetime.datetime.now()), date, at, str(url).split('?')[0]))
        try:
            if req.get_method() == 'POST':
                res = urllib2.urlopen(req, timeout=connection_timeout, data='')
            else:
                res = urllib2.urlopen(req, timeout=connection_timeout)
            # The pauses are kept from the original flow (presumably to
            # throttle requests - confirm before removing).
            time.sleep(10)
            print('Success for attempt %s: date %s, sample %s\n' % (i, date, at))
            if not params['query'].lower().startswith('select'):
                break
            result = res.read()
            if result == '':
                break
            if 'DB::Exception' not in result:
                time.sleep(10)
                return pd.read_csv(StringIO(result), sep='\t', header=None, index_col=None)
            # The server answered 200 but reported an error in the body:
            # fall through to the shared attempt check and retry.
            print('Have Error in Data \n===================================\n\n')
        except urllib2.URLError as e:
            try:
                err = e.read()
            except Exception:
                # A plain URLError (no HTTP response) has no body to read.
                err = None
            if err is None:
                print('%s\n' % (e,))
                time.sleep(15)
            elif '1 hour has been exceeded' in err:
                print(err.split('\n')[0])
                # Wait until the start of the next hour; a single clock
                # reading keeps the difference positive.
                now = datetime.datetime.now()
                next_hour = now + datetime.timedelta(hours=1)
                t = next_hour.replace(minute=0, second=0, microsecond=0) - now
                url = host[0] + '?' + query_get
                req = urllib2.Request(url)
                print('Wait for %s minutes\n' % (t.seconds // 60))
                time.sleep(t.seconds)
            elif '24 hours has been exceeded' in err:
                print(err.split('\n')[0])
                now = datetime.datetime.now()
                next_day = now + datetime.timedelta(days=1)
                t = next_day.replace(hour=9, minute=0, second=0, microsecond=0) - now
                # NOTE(review): ``t.seconds`` drops whole days, so this is
                # effectively the wait until the next 09:00 rather than
                # always 09:00 tomorrow - confirm that is intended.
                print('Wait for %s minutes \n' % (t.seconds // 60))
                time.sleep(t.seconds)
            elif 'Syntax error:' in err or "doesn't exist" in err:
                # Retrying cannot fix a broken query or a missing object.
                print(err.split('\n')[0] + '\n')
                break
            else:
                print(err.split('\n')[0] + '\n')
                time.sleep(15)
        # Shared attempt cap for every retryable path above.
        if i >= max_attempts:
            break
def make_string_column(df):
    """Cast every column of ``df`` to ``str`` in place and return ``df``."""
    column_names = list(df.columns)
    for name in column_names:
        as_text = df[name].astype(str)
        df[name] = as_text
    return df
def load_data_to_hahn(df, path):
    """Append every row of ``df`` to the cluster table at ``path``.

    Each row is turned into a ``Record`` whose field names are the frame's
    column labels; the records are written through the module-level
    ``cluster`` with ``append=True``.
    """
    row_count = df.shape[0]
    records = [
        Record(**df.iloc[position, :].to_dict())
        for position in xrange(row_count)
    ]
    cluster.write(path, records, append=True)
def load_installs_for_one_date(hosts, startdate, enddate, query, path):
    """Fetch one date range from ClickHouse and append it to ``path``.

    ``query`` is a ``str.format`` template that receives ``startdate``,
    ``enddate`` and ``hosts`` as the positional fields ``{0}``, ``{1}`` and
    ``{2}``. The fetched frame is renamed with the module-level ``columns``
    mapping, cast to strings and appended to the table at ``path``.

    Reads the module-level ``host``, ``connection_timeout`` and ``columns``.
    When ``get_clickhouse_data`` returns ``None`` (empty result or failed
    request) nothing is written and a message is printed instead of
    failing on the missing frame.
    """
    query_ = query.format(startdate, enddate, hosts)
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to the environment or a secrets store.
    params = {'query': query_, 'user': 'ktereshin', 'password': 'nGScvTPj'}
    result_data = get_clickhouse_data(params, host, connection_timeout, startdate, '')
    if result_data is None:
        print('No data for %s..%s, nothing to load\n==============================\n\n'
              % (startdate, enddate))
        return
    result_data = result_data.rename(columns=columns)
    result_data = make_string_column(result_data)
    load_data_to_hahn(result_data, path)
    print('Load Data Done\n==============================\n\n')
def get_bad_hosts(file_name):
    """Read host names from ``file_name`` into a quoted, comma-separated list.

    Every line is stripped of surrounding whitespace and wrapped in single
    quotes; the quoted names are joined with commas so the result can be
    placed inside a SQL ``in (...)`` clause. Blank lines are skipped so
    that they do not turn into an empty-string host. An empty file yields
    ``''``.

    The file is opened with a context manager, so the handle is closed
    even if reading fails.
    """
    with open(file_name, 'r') as f:
        names = (line.strip() for line in f)
        return ','.join("'" + name + "'" for name in names if name)
def get_aggr_data(groups):
    """Reducer: summarise the install records of each group.

    For every ``(key, records)`` pair one ``Record`` is yielded with:

    * ``installs_advertising_id`` - number of distinct ``advertising_id``
      values in the group;
    * ``installs_visited_bad_site`` - number of distinct ``advertising_id``
      values among records whose ``is_visit_bad_site`` equals 1;
    * ``hosts`` - dict mapping each entry of the records' ``hosts`` lists
      to the number of times it occurs;
    * ``app_name``, ``media_source``, ``campaign`` - copied from the key.

    Distinct ids are tracked in sets, so membership checks stay constant
    time instead of rescanning a list for every record.
    """
    for key, records in groups:
        install_ids = set()
        bad_site_ids = set()
        host_counts = {}
        for record in records:
            install_ids.add(record.advertising_id)
            if record.is_visit_bad_site == 1:
                bad_site_ids.add(record.advertising_id)
            for site in record.hosts:
                host_counts[site] = host_counts.get(site, 0) + 1
        yield Record(
            installs_advertising_id=len(install_ids),
            hosts=host_counts,
            installs_visited_bad_site=len(bad_site_ids),
            app_name=key.app_name,
            media_source=key.media_source,
            campaign=key.campaign
        )
def anomaly_campaign_by_bad_site_share(cluster):
    """Aggregate installs per campaign and flag bad-site anomalies.

    Left-joins the installs table with the per-yandexuid bad-site table,
    reduces the rows per (app_name, media_source, campaign) with
    ``get_aggr_data``, adds the share of installs that visited a bad site,
    keeps groups with more than 50 installs, adds the anomaly flag, sorts
    by the share and writes the result to
    ``$job_root/antifraud/weeks/anomaly_campaign/anomaly_campaign_by_bad_site_share``.
    """
    job = cluster.job()
    installs = job.table('$job_root/antifraud/weeks/share_of_bad_sites/installs_advertising_id')
    bad_yuid = job.table('$job_root/antifraud/weeks/share_of_bad_sites/yandexuid_visited_bad_site')

    def as_flag(x):
        # Any truthy value counts as "visited"; missing values become 0.
        return 1 if x else 0

    def hosts_or_empty(x):
        # Rows without a match in the bad-site table get an empty list.
        return x if x else []

    def bad_share(x, y):
        # x: installs that visited a bad site, y: all installs.
        return float(x) / y

    joined = installs.join(bad_yuid, by='yandexuid', type='left')
    flagged = joined.project(
        ne.all(),
        is_visit_bad_site=ne.custom(as_flag, 'is_visit_bad_site'),
        hosts=ne.custom(hosts_or_empty, 'hosts')
    )
    per_campaign = flagged.groupby('app_name', 'media_source', 'campaign').reduce(get_aggr_data)
    with_share = per_campaign.project(
        ne.all(),
        installs_visited_bad_site_share=ne.custom(
            bad_share, 'installs_visited_bad_site', 'installs_advertising_id'
        )
    )
    large_enough = with_share.filter(nf.custom(lambda x: x > 50, 'installs_advertising_id'))
    # NOTE(review): despite its name, the flag is derived from the absolute
    # count installs_visited_bad_site (> 5), not from the share - confirm
    # that this is intended.
    scored = large_enough.project(
        ne.all(),
        bad_site_share_anomaly=ne.custom(lambda x: 1 if x > 5 else 0, 'installs_visited_bad_site')
    )
    scored.sort('installs_visited_bad_site_share').put(
        '$job_root/antifraud/weeks/anomaly_campaign/anomaly_campaign_by_bad_site_share'
    )
    job.run()
def get_date_range(dates, num_of_parts):
    """Split ``dates`` into consecutive chunks and map chunk start to chunk end.

    The chunk length is ``len(dates) // num_of_parts`` (at least 1). When
    the length of ``dates`` is not a multiple of the chunk length, a
    shorter trailing chunk covers the remaining items, so the result may
    contain more than ``num_of_parts`` entries.

    Args:
        dates: sequence of hashable items (date strings in this script).
        num_of_parts: desired number of chunks; expected to be positive.

    Returns:
        dict mapping the first item of every chunk to its last item.
        Empty when ``dates`` is empty.
    """
    total = len(dates)
    # Floor division keeps the step an integer; the lower bound of 1 avoids
    # a zero range step when there are fewer dates than requested parts.
    step = max(1, total // num_of_parts)
    result = {}
    for first in range(0, total, step):
        last = min(first + step - 1, total - 1)
        result[dates[first]] = dates[last]
    return result
# Step 1: build the installs table joined with Crypta yandexuids.
merge_installs_and_cripta(cluster)

# Step 2: load the visits to the listed hosts from ClickHouse, one date
# chunk at a time, into the yandexuid_visited_bad_site table.
hosts = get_bad_hosts('sar_hosts')
dates = [str(_day).split(' ')[0] for _day in pd.date_range(startdate, enddate)]
# Names for the positional columns of the ClickHouse response; read as a
# module-level global by load_installs_for_one_date.
columns = {0: 'yandexuid',
           1: 'StartURLDomain',
           2: 'is_visit_bad_site'
          }
query = '''select UserID, StartURLDomain, '1' from visits_all  where StartDate >= toDate('{0}') and  StartDate <= toDate('{1}')
        and (RefererDomain in ({2}) or StartURLDomain in ({2})) group by UserID, StartURLDomain'''
path = '$job_root/antifraud/weeks/share_of_bad_sites/yandexuid_visited_bad_site'
for start, end in get_date_range(dates, 2).items():
    load_installs_for_one_date(hosts, start, end, query, path)

# Step 3: collapse the loaded rows to one row per yandexuid with the list of
# distinct visited domains.
# NOTE(review): this job reads and overwrites the same table path that
# step 2 appends to - confirm this is the intended behaviour across runs.
job = cluster.job()
appsflyer_install = (
    job.table('$job_root/antifraud/weeks/share_of_bad_sites/yandexuid_visited_bad_site')
    .groupby('yandexuid', 'is_visit_bad_site')
    .aggregate(hosts=na.distinct('StartURLDomain'))
    .put('$job_root/antifraud/weeks/share_of_bad_sites/yandexuid_visited_bad_site')
)
job.run()

# Step 4: aggregate per campaign and flag anomalies.
anomaly_campaign_by_bad_site_share(cluster)
