__author__ = 'burakonal'

import pandas as pd
import numpy as np
import datetime
import re
from clickhouse import *
import pickle
import json
import time

# Widen pandas' console output so wide frames are not wrapped when printed.
desired_width = 320
pd.options.display.width = desired_width

# Joins ClickHouse and comScore data in order to project the comScore data, based on Metrika results for Yandex Browser.

'''
Table order is:
    1) search_fact
    2) machine_id (without user_agent)
    3) person_demo
    4) machine_demo
    5) browser_map
    6) web_entity_map
'''
# Column names passed to ``pd.read_csv`` (see ``read_data``), in file order.
# NOTE(review): the split into segments below is inferred from the
# "Table order" note above and from where the key columns repeat; it is
# only a reading aid -- confirm against the export that produces the file.
# The concatenated list is what matters and must keep this exact order.

# presumably the search_fact columns
_COLUMNS_PART_1 = [
    'machine_id',
    'status',
    'country',
    'reportable_region',
    'search_id',
    'search_phrase',
    'toolbar_flag',
    'auto_flag',
    'domain_name',
    'ref_domain_name',
    'url_host',
    'url_dir',
    'url_page',
    'event_time',
    'time_id',
    'reply_code',
    'result_pages',
    'affiliate_flag',
    'widget_flag',
    'instant_flag',
    'secure_flag',
    'useragent',
    'url',
]

# presumably the machine_id table (without user_agent)
_COLUMNS_PART_2 = [
    'search_id',
    'person_id',
    'machine_id',
]

# presumably person_demo
_COLUMNS_PART_3 = [
    'person_id',
    'person_gender',
    'person_age',
    'person_projected_weight',
]

# presumably machine_demo
_COLUMNS_PART_4 = [
    'machine_id',
    'machine_primary_user_gender',
    'machine_primary_user_age',
    'machine_hh_income',
    'machine_projected_weight',
    'country',
]

# presumably browser_map
_COLUMNS_PART_5 = [
    'search_id',
    'browser_general',
    'browser_version',
]

# presumably web_entity_map
_COLUMNS_PART_6 = [
    'search_id',
    'web_name',
]

file_names = (
    _COLUMNS_PART_1
    + _COLUMNS_PART_2
    + _COLUMNS_PART_3
    + _COLUMNS_PART_4
    + _COLUMNS_PART_5
    + _COLUMNS_PART_6
)

def update_statface_report(rows, datefield):
    """POST ``rows`` to the stat report API.

    Args:
        rows: JSON-serialisable list of row dicts; sent as ``{'values': rows}``
            in the ``data`` form field.
        datefield: value sent as the ``scale`` form field (the caller in this
            file passes ``"d"``).

    Raises:
        Whatever ``urllib2.urlopen`` raises when the request fails; errors are
        not caught here.
    """
    # Imported locally because this file has no top-level import of either
    # module; without this the function only works if the star import at the
    # top of the file happens to provide them.
    import urllib
    import urllib2

    url = 'https://stat.yandex-team.ru/_api/report/data'
    # NOTE(security): robot credentials are hard-coded in source. They are
    # left untouched so behaviour does not change, but they should be moved
    # to a secret store / environment and rotated.
    headers = {'StatRobotUser': 'robot_burakonal', 'StatRobotPassword': 'moh8shaeRo'}
    values = {
        'name': "https://stat.yandex-team.ru/yandex.com.tr/Special/Metrics/burakonal/1293",
        'scale': datefield,
        'data': json.dumps({'values': rows}),
    }
    request = urllib2.Request(url, urllib.urlencode(values), headers)
    response = urllib2.urlopen(request)
    # The response body is not used; close it so the connection is released.
    response.close()

def read_data(path, status):
    """Load a tab-separated file and select the rows with a given status.

    Args:
        path: location of the headerless, tab-separated input file.
        status: value the ``status`` column must equal for a row to be kept.

    Returns:
        A ``(full_frame, matching_rows)`` tuple: everything that was read, and
        the subset whose ``status`` equals ``status``.
    """
    # NOTE(review): ``file_names`` repeats several names (e.g. 'machine_id',
    # 'search_id'); how pandas treats duplicate ``names`` depends on its
    # version -- confirm against the pandas version in use.
    full_frame = pd.read_csv(
        path,
        sep='\t',
        names=file_names,
        parse_dates=True,
        low_memory=False,
    )
    status_matches = full_frame['status'] == status
    matching_rows = full_frame[status_matches]
    return full_frame, matching_rows

def include(x, y):
    """Return the first element of ``y`` that is a substring of ``str(x)``.

    Elements are tried in the order ``y`` yields them. The string ``"0"`` is
    returned when none of them occurs in ``str(x)``.
    """
    return next((needle for needle in y if needle in str(x)), "0")

def exact_match(x, y):
    """Return 1 if ``x`` equals any element of ``y``, otherwise 0.

    Args:
        x: value to look for.
        y: iterable of candidate values.

    Returns:
        ``1`` on a match, ``0`` otherwise. If the comparison itself fails
        (``y`` is not iterable, or ``==`` yields something without a single
        truth value), ``x`` is printed for diagnosis and ``0`` is returned, so
        callers always get an integer flag rather than ``None``.
    """
    try:
        return 1 if any(candidate == x for candidate in y) else 0
    except (TypeError, ValueError):
        # Only comparison/iteration problems are handled; anything else
        # (including KeyboardInterrupt) propagates instead of being swallowed.
        print(x)
        return 0

def getting_comscore_data(status, path, search_engines, search_engine_tags, browsers):
    """Aggregate projected weights per (browser, search-engine tag) and derive shares.

    Rows are read with ``read_data(path, status)``. A row is kept when its
    lower-cased ``web_name`` exactly matches one of ``search_engines``, its
    lower-cased ``browser_general`` exactly matches one of ``browsers``, and
    its ``web_name`` contains one of ``search_engine_tags``.

    Args:
        status: value of the ``status`` column to keep.
        path: input file path, passed on to ``read_data``.
        search_engines: accepted (lower-case) ``web_name`` values.
        search_engine_tags: substrings looked up in ``web_name``; the first
            one found becomes the row's ``is_search_engine_tag``.
        browsers: accepted (lower-case) ``browser_general`` values.

    Returns:
        DataFrame with one row per (``browser_general``,
        ``is_search_engine_tag``) and the columns
        ``machine_projected_weight`` (sum over the group),
        ``search_engine_share_in_browser`` (weight / total weight of that
        browser), ``browser_share`` (browser total / grand total) and
        ``total_share_of_search_engine`` (product of the two shares).
    """
    _, data = read_data(path, status)
    # Work on an explicit copy: the frame from read_data is a filtered
    # selection, and adding columns to it is a chained-assignment hazard.
    data = data.copy()

    data['browser_general'] = data['browser_general'].str.lower()
    data['web_name'] = data['web_name'].str.lower()
    data['is_search_engine'] = data['web_name'].apply(lambda name: exact_match(name, search_engines))
    data['is_browser'] = data['browser_general'].apply(lambda name: exact_match(name, browsers))
    data['is_search_engine_tag'] = data['web_name'].apply(lambda name: include(name, search_engine_tags))

    keep = (
        (data['is_search_engine'] == 1)
        & (data['is_browser'] == 1)
        & (data['is_search_engine_tag'] != "0")
    )
    grouped = (
        data[keep]
        .groupby(['browser_general', 'is_search_engine_tag'])['machine_projected_weight']
        .sum()
        .reset_index()
    )

    weights = grouped['machine_projected_weight']
    # transform() returns a result aligned with ``grouped``'s own index, so
    # the per-browser totals can be combined row by row.
    browser_weight = grouped.groupby('browser_general')['machine_projected_weight'].transform('sum')
    grouped['search_engine_share_in_browser'] = weights / browser_weight
    grouped['browser_share'] = browser_weight / weights.sum()
    grouped['total_share_of_search_engine'] = (
        grouped['browser_share'] * grouped['search_engine_share_in_browser']
    )
    # Disabled debug dump (note: ``date`` is not defined in this function):
    # with open('/home/burakonal/tasks/1293/reports/' + date + '_' + '_comscore_raw', 'w') as f:
    #     grouped.to_csv(path_or_buf=f, sep='\t', header=True, float_format='%.5f', decimal=',', index=False)
    return grouped

# ``browser_general`` values that are reported under a different key name.
_BROWSER_KEY_ALIASES = {'yandex': 'yabrowser', 'internet explorer': 'msie'}

# Browsers for which a per-search-engine breakdown is written to the result.
_BREAKDOWN_BROWSERS = ('chrome', 'firefox', 'internet explorer', 'yandex')


def _browser_key(browser):
    """Return the result-dict key fragment used for a ``browser_general`` value."""
    return _BROWSER_KEY_ALIASES.get(browser, browser)


def _add_share_columns(frame):
    """Add the three share columns derived from ``machine_projected_weight`` in place."""
    weights = frame['machine_projected_weight']
    # transform() keeps the frame's own index, so the division is row-aligned.
    browser_weight = frame.groupby('browser_general')['machine_projected_weight'].transform('sum')
    frame['search_engine_share_in_browser'] = weights / browser_weight
    frame['browser_share'] = browser_weight / weights.sum()
    frame['total_share_of_search_engine'] = (
        frame['search_engine_share_in_browser'] * frame['browser_share']
    )


def _project_yandex_browser(comscore_data, search_engine_tags, metrika_ratios):
    """Return a copy of the weights with the Yandex Browser rows replaced by projections."""
    projected = comscore_data[
        ['browser_general', 'is_search_engine_tag', 'machine_projected_weight']
    ].copy()

    in_yandex_browser = projected['browser_general'] == 'yandex'
    base_rows = projected.loc[
        in_yandex_browser & (projected['is_search_engine_tag'] == 'yandex'),
        'machine_projected_weight',
    ]
    if base_rows.empty:
        raise ValueError("no row for the 'yandex' search engine in the 'yandex' browser")
    # Every Yandex Browser weight is rebuilt from this one measured value.
    base_weight = float(base_rows.iloc[0])

    ratios = metrika_ratios['yabrowser']
    present_tags = projected.loc[in_yandex_browser, 'is_search_engine_tag'].tolist()
    for tag in present_tags:
        row_mask = in_yandex_browser & (projected['is_search_engine_tag'] == tag)
        projected.loc[row_mask, 'machine_projected_weight'] = ratios[tag] * base_weight

    # Tags with no Yandex Browser row get a row of their own.
    missing_rows = [
        pd.DataFrame({
            'browser_general': ['yandex'],
            'is_search_engine_tag': [tag],
            'machine_projected_weight': [base_weight * ratios[tag]],
        })
        for tag in search_engine_tags
        if tag not in present_tags
    ]
    if missing_rows:
        projected = pd.concat([projected] + missing_rows, ignore_index=True)
    # TODO change chrome data too!

    _add_share_columns(projected)
    return projected


def _collect_metrics(frame, search_engine_tags, browsers, prefix, result):
    """Write the share / absolute metrics of ``frame`` into ``result`` under ``prefix``."""
    # Search-engine totals, looked up by tag name. Pairing the tag list
    # with the group sums by position would mislabel values as soon as a
    # tag has no rows; a tag with no rows is reported as 0.
    by_tag = frame.groupby('is_search_engine_tag')
    tag_shares = by_tag['total_share_of_search_engine'].sum()
    tag_weights = by_tag['machine_projected_weight'].sum()
    for tag in search_engine_tags:
        result[prefix + tag] = tag_shares.get(tag, 0)
        result[prefix + tag + '_abs'] = tag_weights.get(tag, 0)

    # Browser shares: ``browser_share`` is constant within a browser, so the
    # first row is representative. Raises IndexError if a browser has no rows.
    for browser in browsers:
        rows = frame[frame['browser_general'] == browser]
        result[prefix + _browser_key(browser)] = rows['browser_share'].iloc[0]

    # Absolute weight of every browser present in the frame.
    browser_weights = frame.groupby('browser_general')['machine_projected_weight'].sum()
    for browser in browser_weights.index:
        result[prefix + _browser_key(browser) + '_abs'] = browser_weights[browser]

    # Search engines inside each reported browser; the first row wins if a
    # (browser, tag) pair occurs more than once.
    for browser in _BREAKDOWN_BROWSERS:
        rows = frame[frame['browser_general'] == browser]
        key_prefix = prefix + _browser_key(browser) + '_'
        seen = set()
        for tag, share, weight in zip(rows['is_search_engine_tag'],
                                      rows['search_engine_share_in_browser'],
                                      rows['machine_projected_weight']):
            if tag in seen:
                continue
            seen.add(tag)
            result[key_prefix + tag] = share
            result[key_prefix + tag + '_abs'] = weight


def processing_comscore_data(search_engine_tags, comscore_data, metrika_ratios, browsers, date):
    """Compute raw and projected metrics and upload one report row for each.

    The raw metrics are read from ``comscore_data``. The projected metrics
    come from a copy in which every Yandex Browser weight is replaced by
    ``metrika_ratios['yabrowser'][tag]`` times the comScore weight of the
    'yandex' search engine in the 'yandex' browser (tags without a Yandex
    Browser row are added), after which the shares are recomputed.

    Args:
        search_engine_tags: tags to report; each must be a key of
            ``metrika_ratios['yabrowser']``.
        comscore_data: DataFrame with the columns ``browser_general``,
            ``is_search_engine_tag``, ``machine_projected_weight``,
            ``search_engine_share_in_browser``, ``browser_share`` and
            ``total_share_of_search_engine``.
        metrika_ratios: mapping whose ``'yabrowser'`` entry maps a tag to a
            multiplier.
        browsers: ``browser_general`` values whose share is reported.
        date: value stored as ``fielddate`` in the uploaded rows.

    Returns:
        None. One "Raw" and one "Projected" row are printed and sent through
        ``update_statface_report``.

    Raises:
        ValueError: if there is no ('yandex', 'yandex') row in ``comscore_data``.
        IndexError: if one of ``browsers`` has no rows.
        KeyError: if a metric needed for the report row was not produced.
    """
    result = {}

    #########################COMSCORE DATA#####################
    _collect_metrics(comscore_data, search_engine_tags, browsers, '', result)

    #######################PROJECTED COMSCORE DATA######################
    projected_comscore_data = _project_yandex_browser(comscore_data, search_engine_tags, metrika_ratios)
    # with open('/home/burakonal/tasks/1293/reports/' + date + '_projected', 'w') as f:
    #     projected_comscore_data.to_csv(path_or_buf=f, sep='\t', header=True, float_format='%.5f', decimal=',', index=False)
    # The '_abs' keys written here are summed weights, not shares.
    _collect_metrics(projected_comscore_data, search_engine_tags, browsers, 'projected_', result)

    # addition of a secondary dictionary for a new stat report.
    report_browsers = ["chrome", "firefox", "msie", "yabrowser"]
    report_tags = ['ask', 'bing', 'google', 'yandex']

    for stat in ("Raw", "Projected"):
        prefix = "projected_" if stat == "Projected" else ""
        result2 = {"status": stat, "fielddate": date}
        for bro in report_browsers:
            result2[bro] = result[prefix + bro]
            result2[bro + "_abs"] = result[prefix + bro + "_abs"]
            for sengine in report_tags:
                result2[sengine] = result[prefix + sengine]
                result2[sengine + "_abs"] = result[prefix + sengine + "_abs"]
                try:
                    result2[bro + "_" + sengine] = result[prefix + bro + "_" + sengine]
                    result2[bro + "_" + sengine + "_abs"] = result[prefix + bro + "_" + sengine + "_abs"]
                except KeyError:
                    # This search engine has no rows in this browser.
                    result2[bro + "_" + sengine] = 0
                    result2[bro + "_" + sengine + "_abs"] = 0
        print(result2)
        update_statface_report([result2], "d")

    # NOTE: the disabled code below was designed for a 3-dimensional graph but
    # needs updates; it is kept for possible future use.
    # for stat in status:
    #     prefix = ""
    #     if stat == "Projected":
    #         prefix+="projected_"
    #     for bro in browsers:
    #         result2 ["status"] = stat
    #         result2["fielddate"] = date_formatted
    #         result2["browser"] = bro
    #         result2["sengine"] = "all"
    #         result2[bro] = result[prefix+bro]
    #         result2[bro+"_abs"] = result[prefix+bro+"_abs"]
    #         print result2
    #         update_statface_report([result2], "d")
    #         result2 = dict()
    #         for sengine in search_engine_tags:
    #             result2 ["status"] = stat
    #             result2["fielddate"] = date_formatted
    #             result2["browser"] = bro
    #             result2["sengine"] = sengine
    #             try:
    #                 result2[bro+"_"+sengine] = result[prefix+bro+"_"+sengine]
    #                 result2[bro+"_"+sengine+"_abs"] = result[prefix+bro+"_"+sengine+"_abs"]
    #             except:
    #                 # print "Not Found!"
    #                 result2[bro+"_"+sengine] = 0
    #                 result2[bro+"_"+sengine+"_abs"] = 0
    #             print result2
    #             update_statface_report([result2], "d")
    #             result2 = dict()
    #     for sengine in search_engine_tags:
    #         result2 ["status"] = stat
    #         result2["fielddate"] = date_formatted
    #         result2["browser"] = "all"
    #         result2["sengine"] = sengine
    #         result2[sengine] = result[prefix+sengine]
    #         result2[sengine+"_abs"] = result[prefix+sengine+"_abs"]
    #         print result2
    #         update_statface_report([result2], "d")
    #         result2 = dict()


    # return result, result2

if __name__ == '__main__':
    # parameters
    date = '2015-07-22'       # day BEFORE the first processed day
    last_date = '2015-08-01'  # last processed day (inclusive)
    status = 'SEARCH_JOINED'
    search_engines = ['google web search', 'yandex web search', 'google', 'yandex', 'ask reply page', 'bing web', 'bing']
    search_engine_tags = ['ask', 'bing', 'google', 'yandex']
    browsers = ['chrome', 'firefox', 'internet explorer', 'yandex']

    # Step with calendar arithmetic instead of adding 86400 seconds to a
    # local timestamp, which can skip or repeat a day across a DST change.
    one_day = datetime.timedelta(days=1)
    day = datetime.datetime.strptime(date, '%Y-%m-%d').date()
    last_day = datetime.datetime.strptime(last_date, '%Y-%m-%d').date()

    # getting data
    while day < last_day:
        day += one_day
        date = day.strftime('%Y-%m-%d')
        print(date)
        path = 'tasks/1293/debug/comscore/' + day.strftime('%Y%m%d')
        print(path)
        # getting_comscore_data takes (status, path, ...); it has no date
        # parameter, so passing one raises TypeError.
        comscore_data = getting_comscore_data(status, path, search_engines, search_engine_tags, browsers)
        result = processing_comscore_data(search_engine_tags, comscore_data, get_query_result('dummy', 'dummy', date), browsers, date)

    # for key, val in result:
    #     print key, '\t', val

    # for key, val in result:
    #     print key, '\t', val

