import re
import adwords
#import metrika
import pandas as pd
import numpy as np
import sys
#import direct
#import statface
import json

from datetime import datetime, timedelta

# -------------------------------------------
# AdWords data: impressions, clicks and cost
# -------------------------------------------

def group_adwords_data_by_utm_tags(data):
    """Aggregate an ad report in TSV format by UTM tags.

    Args:
        data: Report text. The first line is treated as a header and skipped;
            every other line is expected to hold four tab-separated fields:
            URL, impressions, clicks, cost.

    Returns:
        A dict keyed by '<utm_source>:<utm_medium>:<utm_campaign>' whose
        values are dicts with the summed 'impressions' (int), 'clicks' (int)
        and 'cost' (float, the report value divided by 10**6).

    Rows whose URL lacks at least one of utm_source, utm_medium or
    utm_campaign are omitted. Non-blank rows that do not have exactly four
    fields are printed and skipped; blank lines (e.g. the one produced by a
    trailing newline) are skipped silently.

    Raises:
        ValueError: If a numeric field of a four-column row cannot be parsed.
    """
    tag_names = ('utm_source', 'utm_medium', 'utm_campaign')
    tags_dict = {}

    for entry in data.split('\n')[1:]:
        if not entry.strip():
            # Blank line: nothing to report, nothing to aggregate.
            continue

        fields = entry.split('\t')
        if len(fields) != 4:
            # Surface malformed rows on stdout, then ignore them.
            print(entry)
            continue

        url, raw_impressions, raw_clicks, raw_cost = fields
        impressions = int(raw_impressions)
        clicks = int(raw_clicks)
        # NOTE(review): cost is presumably reported in micro-units of the
        # account currency, hence the division by 10**6 -- confirm.
        cost = float(raw_cost) / 10**6

        # A tag value runs until the next '&', '#' or '"'.
        matches = [re.search(tag + r'=([^&#"]+)', url) for tag in tag_names]
        if any(match is None for match in matches):
            continue

        key = ':'.join(match.group(1) for match in matches)
        if key not in tags_dict:
            tags_dict[key] = {'impressions': 0, 'clicks': 0, 'cost': 0.0}

        totals = tags_dict[key]
        totals['impressions'] += impressions
        totals['clicks'] += clicks
        totals['cost'] += cost

    return tags_dict

def get_adwords_data(customer_id, date):
    """Fetch the ad performance report for a customer and group it by UTM tags.

    The report is requested through ``adwords.ad_performance_report`` with the
    given customer id and date, then passed to
    ``group_adwords_data_by_utm_tags``.

    Returns a dictionary of the form
    {
        '<utm_source_tag>:<utm_medium_tag>:<utm_campaign_tag>': {
            'impressions': int,
            'clicks': int,
            'cost': float
        }
    }
    """
    return group_adwords_data_by_utm_tags(
        adwords.ad_performance_report(customer_id, date)
    )

# -----------------------------------------------------------------------
# Direct data
# -----------------------------------------------------------------------

def extract_tags_from_banners(banners_report):
    """Collect the tracking tags found in the links of each campaign's banners.

    Args:
        banners_report: Iterable of banner dicts. Each must contain
            'CampaignId'; the link is read from banner['TextAd']['Href'].

    Returns:
        A dict mapping campaign id to
        {'from': set, 'utm_source': set, 'utm_medium': set, 'utm_campaign': set},
        where each set holds every distinct value of that query parameter
        seen across the campaign's banner links.

    A banner with no 'TextAd' entry or no 'Href' is treated as a banner whose
    link carries no tags (its campaign still appears in the result) instead of
    raising KeyError.
    """
    tag_names = ('from', 'utm_source', 'utm_medium', 'utm_campaign')
    result = {}

    for banner in banners_report:
        campaign_id = banner['CampaignId']
        # Tolerate banners that are not text ads or that have no link.
        href = (banner.get('TextAd') or {}).get('Href') or ''

        if campaign_id not in result:
            result[campaign_id] = {name: set() for name in tag_names}

        campaign_tags = result[campaign_id]
        for name in tag_names:
            # A tag value runs until the next '&' or '#'.
            match = re.search(name + r'=([^&#]+)', href)
            if match is not None:
                campaign_tags[name].add(match.group(1))

    return result

def get_direct_data(login, token, date):
    """Fetch one day of Direct campaign statistics grouped by UTM tags.

    Args:
        login: Client login whose campaigns and banners are requested.
        token: API token passed to both ``direct.Version5`` and
            ``direct.Version4``.
        date: Day to report on; used as both the start and the end date of
            the summary statistics request.

    Returns:
        A dict keyed by '<utm_source>:<utm_medium>:<utm_campaign>' with the
        summed 'impressions', 'clicks' and 'cost' (search + context) of all
        campaigns sharing that key.

    Campaigns with no banner data, or whose banner links lack any of the
    three UTM tags, are skipped (previously the latter raised IndexError).
    When a campaign's banners carry several values for one tag, the smallest
    value in string order is used so the result is deterministic.
    """
    direct_v5 = direct.Version5(token)  # v5 to get all banner types
    direct_v4 = direct.Version4(token)  # v4 to get stats

    campaigns = direct_v5.get_campaigns_list(login)
    campaigns_ids = [campaign['Id'] for campaign in campaigns]
    campaigns_stats = direct_v4.get_summary_stat(campaigns_ids, date, date)

    campaigns_tags = extract_tags_from_banners(direct_v5.get_banners(login, campaigns_ids))

    key_tags = ('utm_source', 'utm_medium', 'utm_campaign')
    res = {}
    for entry in campaigns_stats:
        tags = campaigns_tags.get(entry['CampaignID'])
        if tags is None:
            continue
        if not all(tags[name] for name in key_tags):
            # Without all three tags the campaign cannot be attributed.
            continue

        key = ':'.join(min(tags[name]) for name in key_tags)
        if key not in res:
            res[key] = {'impressions': 0, 'clicks': 0, 'cost': 0}

        totals = res[key]
        totals['impressions'] += int(entry['ShowsSearch']) + int(entry['ShowsContext'])
        totals['clicks'] += int(entry['ClicksSearch']) + int(entry['ClicksContext'])
        # NOTE(review): the factor 30 presumably converts Direct currency
        # units into roubles -- confirm before changing.
        totals['cost'] += 30 * (float(entry['SumSearch']) + float(entry['SumContext']))
    return res


#-------------------------------------------
# Metrika data: goals
#-------------------------------------------

def get_goals(token, counter_id, date, goal_ids, goal_names):
    """Fetch visit-level goal reaches for one day, grouped by UTM tags.

    Args:
        token: Token passed to ``metrika.Request``.
        counter_id: Counter id passed to ``metrika.Request``.
        date: Day to report on; used as both date1 and date2.
        goal_ids: Goal ids; one 'ym:s:goal<ID>visits' metric is requested
            per id.
        goal_names: Names to give the goal columns, in the same order as
            ``goal_ids``.

    Returns:
        A dict keyed by '<utm_source>:<utm_medium>:<utm_campaign>' whose
        values are dicts mapping each goal name to its summed value.
    """
    request = metrika.Request(token, counter_id)
    request.set_metrics(','.join('ym:s:goal{0}visits'.format(goal_id) for goal_id in goal_ids))
    request.set_date1(date)
    request.set_date2(date)
    request.set_dimensions('ym:s:UTMMedium,ym:s:UTMSource,ym:s:UTMCampaign')
    request.get_all()
    data = request.pandas_data()

    # Assumes pandas_data() returns the dimension columns first, in the
    # requested order, followed by one column per metric -- TODO confirm.
    data.columns = ['utm_medium', 'utm_source', 'utm_campaign'] + list(goal_names)
    data['key'] = data['utm_source'] + ':' + data['utm_medium'] + ':' + data['utm_campaign']
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    rows = data.drop(['utm_medium', 'utm_source', 'utm_campaign'], axis=1).values

    res_dict = {}
    goals_len = len(goal_names)
    for row in rows:
        # After dropping the tag columns the row is: one value per goal, then the key.
        key = row[goals_len]

        if key not in res_dict:
            res_dict[key] = {goal_name: 0 for goal_name in goal_names}

        totals = res_dict[key]
        for position, goal_name in enumerate(goal_names):
            totals[goal_name] += row[position]

    return res_dict


#-------------------------------------------
# Other functions
#-------------------------------------------


def merge_adwords_metrika_data(adwords_data, metrika_data):
    """Fold goal reaches into the ad statistics and tabulate the result.

    ``adwords_data`` is updated in place: for every key also present in
    ``metrika_data``, each metric from ``metrika_data`` is added to the
    existing value, or stored as-is when the metric is new. Keys that exist
    only in ``metrika_data`` are ignored.

    Returns a DataFrame with one row per key of ``adwords_data``, the key
    itself in a 'key' column, a fresh integer index, and missing values and
    positive infinities replaced by 0.
    """
    for tag_key, ad_metrics in adwords_data.items():
        if tag_key not in metrika_data:
            continue
        for metric_name, reached in metrika_data[tag_key].items():
            if metric_name in ad_metrics:
                ad_metrics[metric_name] += reached
            else:
                ad_metrics[metric_name] = reached

    # Outer keys become columns first; transposing turns them into rows.
    table = pd.DataFrame(adwords_data).transpose()
    table['key'] = table.index
    table = table.reset_index(drop=True)

    without_gaps = table.fillna(0)
    return without_gaps.replace(np.inf, 0)

#-------------------------------------------
# Functions to transform dataframe into statface tree
#-------------------------------------------


def get_leafs(data, key_transformation_func):
    """Aggregate ``data`` at the tree level described by a key transformation.

    Args:
        data: DataFrame with a 'key' column; it is not modified.
        key_transformation_func: Callable applied to every 'key' value to
            produce the leaf path that rows are grouped by.

    Returns:
        A DataFrame with one row per distinct transformed key (kept as the
        'key' column) and the remaining columns summed within each group.
    """
    temp = data.copy()
    temp['key'] = temp['key'].apply(key_transformation_func)
    # The string 'sum' replaces np.sum: passing the NumPy callable to
    # groupby.agg is deprecated in recent pandas versions.
    return temp.groupby('key', as_index=False).agg('sum')

def get_total_for_leafs(data):
    """Sum every row of ``data`` into the single root 'All' leaf."""
    def to_root_leaf(_key):
        # Every key maps to the same path, so get_leafs yields one total row.
        return '\tAll\t'

    return get_leafs(data, to_root_leaf)

def get_source_leafs(data):
    """Aggregate ``data`` per source, i.e. the first ':'-separated key part."""
    def to_source_leaf(key):
        source = key.split(':')[0]
        return '\tAll\t{0}\t'.format(source)

    return get_leafs(data, to_source_leaf)

def get_medium_leafs(data):
    """Aggregate ``data`` per source/medium pair (first two key parts)."""
    def to_medium_leaf(key):
        parts = key.split(':')
        return '\tAll\t{0}\t{1}\t'.format(parts[0], parts[1])

    return get_leafs(data, to_medium_leaf)

def get_campaign_leafs(data):
    """Aggregate ``data`` per source, medium and campaign prefix.

    The campaign prefix is the part of the third key component that precedes
    its first underscore.
    """
    def to_campaign_leaf(key):
        parts = key.split(':')
        campaign_prefix = parts[2].split('_')[0]
        return '\tAll\t{0}\t{1}\t{2}\t'.format(parts[0], parts[1], campaign_prefix)

    return get_leafs(data, to_campaign_leaf)

def transform_key_to_leaf(key):
    """Convert a 'source:medium:campaign' key into its full tree path.

    The path is tab-delimited and consists of 'All', the source, the medium,
    the campaign prefix (the text before the campaign's first underscore) and
    finally the complete campaign name.
    """
    parts = key.split(':')
    full_campaign = parts[2]
    campaign_prefix = full_campaign.split('_')[0]
    return '\tAll\t{0}\t{1}\t{2}\t{3}\t'.format(
        parts[0], parts[1], campaign_prefix, full_campaign
    )


if __name__ == '__main__':
    import os  # needed only by the script entry point, to read credentials

    # NOTE(review): the calls below use the `direct`, `metrika` and `statface`
    # modules, whose imports are commented out at the top of this file; they
    # have to be importable for the script to run.

    # Report date: the single command-line argument if given, otherwise today.
    date = sys.argv[1] if len(sys.argv) == 2 else datetime.today().strftime('%Y-%m-%d')

    bro_adwords_id = 6022164922
    # The tokens were redacted in the source (leaving it unparseable); they
    # are read from the environment instead of being hard-coded.
    metrika_token = os.environ['METRIKA_TOKEN']
    direct_token = os.environ['DIRECT_TOKEN']
    metrika_counter_id = 36708630
    metrika_goal_ids = [19117040, 19117035]
    metrika_goal_names = ['downloads', 'installs']
    report_url = '/Adhoc/alex-wa/switch_bro/funnel'
#    report_url = '/Adhoc/alex-wa/funnels/bro_switch_funnel'

    # NOTE(review): these derived metrics are defined but never applied in
    # this script.
    funnel_metrics = {
        'ctr': lambda x: x['clicks'] / x['impressions'],
        'cr_download': lambda x: x['downloads'] / x['clicks'],
        'cr_install': lambda x: x['installs'] / x['clicks'],
        'cr_download_install': lambda x: x['installs'] / x['downloads'],
        'cpa_install': lambda x: x['cost'] / x['installs']
    }

    direct_data = get_direct_data('stat-for-bro', direct_token, date)
    adwords_data = get_adwords_data(bro_adwords_id, date)
    # Merging two dicts (needs python >= 3.5). On a key present in both,
    # the AdWords entry replaces the Direct one rather than being summed.
    dir_ad_data = {**direct_data, **adwords_data}
    metrika_data = get_goals(metrika_token, metrika_counter_id, date, metrika_goal_ids, metrika_goal_names)
    merged_data = merge_adwords_metrika_data(dir_ad_data, metrika_data)

    # Aggregated tree levels must be computed while 'key' still holds the
    # raw 'source:medium:campaign' form, i.e. before the transformation below.
    aggregated_levels = [
        get_source_leafs(merged_data),
        get_medium_leafs(merged_data),
        get_campaign_leafs(merged_data),
        get_total_for_leafs(merged_data),
    ]

    merged_data['key'] = merged_data['key'].apply(transform_key_to_leaf)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    leafs_df = pd.concat(aggregated_levels + [merged_data], ignore_index=True)
    leafs_df['fielddate'] = date
    #up = statface.StatfaceUpload2(url='https://stat-beta.yandex-team.ru/_api/report/data')
    #up.upload(report_url, json.dumps({'values': leafs_df.to_dict(orient='records')}), 'd')

    up = statface.StatfaceUpload(
        json.dumps({'values': leafs_df.to_dict(orient='records')}),
        'https://stat.yandex-team.ru/_api/report/data',
        #'https://stat-beta.yandex-team.ru/_api/report/data',
        report_url,
        'd'
    )
    up.execute()
    print('Successful upload')

