from __future__ import division
#from geobase3 import Lookup

#REGION_LOOKUP = Lookup("/home/cansucullu/geodata3.bin")

__author__ = 'cansucullu'

import sys
import re
import datetime
import requests
import urllib
import urllib2
import urlparse
import json
import time

import yt.wrapper as yt

def module_filter(module):
    """Decide whether ``module`` passes the pickling module filter.

    Returns ``True`` for a falsy ``module`` and for any object whose
    ``__name__`` is neither exactly ``'uatraits'`` nor starts with
    ``'statbox'``; returns ``False`` otherwise.  An object without a
    ``__name__`` attribute is treated as having an empty name.
    """
    if module:
        module_name = getattr(module, '__name__', '')
        if module_name == 'uatraits' or module_name.startswith('statbox'):
            return False
    return True

# Module-level yt.wrapper configuration; executed once, when this file is
# imported or run, before any RetentionCalculator method issues a yt call.
yt.config["auto_merge_output"]["action"] = "merge"
# Register module_filter (defined above) as the pickling module filter.
# NOTE(review): presumably this keeps 'uatraits' and 'statbox*' modules out of
# the pickled job payload -- confirm against the yt.wrapper documentation.
yt.config["pickling"]["module_filter"] = module_filter
yt.config.set_proxy('plato.yt.yandex.net')
yt.config.CREATE_RECURSIVE = True
yt.config.TREAT_UNEXISTING_AS_EMPTY = True


class RetentionCalculator:
    def __init__(self):
        self.clids = ["2189882", "2219050", "2219052", "2224320"]

        self.all_clids = dict([
            ('2189882', 'Search Extension dayuse'),
            ('2196600', 'Search Extension omnibox'),
            ('2187647', 'Search Extension na'),
            ('2220368', 'Search Extension na'),
            ('2219050', 'Search Pop-up Screen Extension dayuse'),
            ('2219051', 'Search Pop-up Screen Extension omnibox'),
            ('2224775', 'Search Pop-up Screen Extension na'),
            ('2219052', 'New Tab Extension dayuse'),
            ('2219053', 'New Tab Extension omnibox'),
            ('2219054', 'New Tab Extension Yandex logo on new tab'),
            ('2219055', 'New Tab Extension new tab'),
            ('2224771', 'New Tab Extension na'),
            ('2224320', 'Homepage Extension dayuse'),
            ('2224321', 'Homepage Extension omnibox'),
            ('2224322', 'Homepage Extension homepage & startpage'),
            ('2224323', 'Homepage Extension context menu search'),
        ])

        self.export_access_log_table_raw = '//statbox/export-access-log/'

        self.install_table_raw = '//home/tr-analysts/cansucullu/ANSEARCH-474/install/'
        self.unique_install_table_raw = '//home/tr-analysts/cansucullu/ANSEARCH-474/unique-install/'

        self.dayuse_table_raw = '//home/tr-analysts/cansucullu/ANSEARCH-474/dayuse/'
        self.unique_dayuse_table_raw = '//home/tr-analysts/cansucullu/ANSEARCH-474/unique-dayuse/'

        self.retention_pivot_table = '//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable'

        self.statface_names = dict([
            ('immediate retention', 'yandex.com.tr/Special/Metrics/Switch/AudienceMetrics/ImmediateRetention'),
            ('real retention', 'yandex.com.tr/Special/Metrics/Switch/RetentionAnalysis/RetentionSummary'),
        ])

        self.date_format = '%Y-%m-%d'
        self.time_format = '%Y-%m-%d-%H-%M-%S'

        self.stat_data = dict([
            ('StatRobotUser', 'robot_cansucullu'),
            ('StatRobotPassword', 'Tai0epood8'),
        ])

    def prepare_dates(self, start_date, stop_date):
        days_list = []
        start = datetime.datetime.strptime(start_date, self.date_format)
        stop = datetime.datetime.strptime(stop_date, self.date_format)
        delta = (stop - start).days + 1

        for i in xrange(delta):
            new_date_datetime = start + datetime.timedelta(days=i)
            days_list.append(new_date_datetime.strftime(self.date_format))

        return days_list

    def get_ndays_difference(self, input_date, n):
        input_date_datetime = datetime.datetime.strptime(input_date, self.date_format)

        if n > 0:
            output_date_datetime = input_date_datetime + datetime.timedelta(days=n)
        else:
            output_date_datetime = input_date_datetime - datetime.timedelta(days=abs(n))

        output_date = output_date_datetime.strftime(self.date_format)

        return output_date

    def update_daily_statface_report(self, name, data):
        """POST ``data`` to the Statface report registered under ``name``.

        ``name`` must be a key of ``self.statface_names``.  ``data`` is
        wrapped as ``{'values': data}``, JSON-encoded and sent form-encoded
        together with the report path and scale ``'d'``.  The robot
        credentials from ``self.stat_data`` are sent as request headers.
        The server response is not inspected.

        Raises ``KeyError`` for an unknown ``name``; errors raised by
        ``urllib2.urlopen`` propagate to the caller.
        """
        endpoint = 'https://stat.yandex-team.ru/_api/report/data'
        auth_headers = {
            'StatRobotUser': self.stat_data['StatRobotUser'],
            'StatRobotPassword': self.stat_data['StatRobotPassword'],
        }
        form_fields = {
            'name': self.statface_names[name],
            'scale': 'd',
            'data': json.dumps({'values': data})
        }

        # Passing a body to urllib2.Request makes this a POST.
        post_request = urllib2.Request(endpoint, urllib.urlencode(form_fields), auth_headers)
        urllib2.urlopen(post_request)

    def read_statface_data(self, name, date):
        """Fetch one day of the ImmediateRetention Statface report.

        Sends a GET with scale ``'d'`` and ``date`` as both ``date_min`` and
        ``date_max``, then returns the ``'values'`` entry of the JSON reply.

        NOTE(review): ``name`` is accepted but never used -- the report URL is
        fixed to ImmediateRetention.  Confirm whether it should be derived
        from ``self.statface_names[name]``.
        """
        endpoint = 'https://stat.yandex-team.ru/yandex.com.tr/Special/Metrics/Switch/AudienceMetrics/ImmediateRetention'
        auth_headers = {
            'StatRobotUser': self.stat_data['StatRobotUser'],
            'StatRobotPassword': self.stat_data['StatRobotPassword'],
        }
        query = {
            'scale': 'd',
            'date_min': '{0}'.format(date),
            'date_max': '{0}'.format(date),
            'type': 'json'
        }
        reply = requests.get(endpoint, params=query, headers=auth_headers)
        return json.loads(reply.content)['values']

    def mainloop(self):

        while True:
            missing_dates = self.check_export_access_log()
            if len(missing_dates) == 0:  # Tables are up to date
                print time.strftime(self.time_format), "There is no new export-access-log. Now sleeping"
                time.sleep(3600)
            else:
                for date in missing_dates:
                    try:
                        print time.strftime(self.time_format), "Updating for export-access-log", date
                        self.update_daily_stat_tables(date)

                        previous_day = self.get_ndays_difference(date, -1)
                        print time.strftime(self.time_format), "Running immediate retention report update", previous_day
                        self.run_immediate_retention_report(previous_day)

                        print time.strftime(self.time_format), "Updating for real retention pivot table", date
                        self.update_retention_pivot_table(date)

                        print time.strftime(self.time_format), "Running real retention report update", date
                        self.run_real_retention_report(date)

                    except:
                        pass


    def check_export_access_log(self):
        """Return the dates present in the source log but not yet processed.

        Lists the export-access-log directory and the unique-dayuse directory,
        takes the greatest name in each (names are parsed with
        ``self.date_format``), and returns every date after the newest
        processed day up to and including the newest source day.

        Returns an empty list when the newest source table exists but is
        still empty.  ``max`` raises ``ValueError`` if either listing is
        empty.
        """
        source_names = yt.list(self.export_access_log_table_raw[:-1])
        processed_names = yt.list(self.unique_dayuse_table_raw[:-1])

        newest_source_name = max(source_names)
        newest_source = datetime.datetime.strptime(newest_source_name, self.date_format)
        newest_processed = datetime.datetime.strptime(max(processed_names), self.date_format)

        first_missing = newest_processed + datetime.timedelta(days=1)
        missing_dates = self.prepare_dates(
            first_missing.strftime(self.date_format),
            newest_source.strftime(self.date_format),
        )

        # The newest table can exist while still being empty (sometimes it
        # happens); in that case nothing is reported as ready yet.
        if yt.is_empty(self.export_access_log_table_raw + newest_source_name):
            return []

        return missing_dates

    def update_daily_stat_tables(self, date):
        """Build the install and dayuse tables for ``date``.

        For each status (``'install'`` first, then ``'dayuse'``) this runs
        three YT operations over that day's export-access-log table:

        1. map with ``ExportAccessLogMapper`` into the per-status table,
        2. sort that table in place by ``clid``,
        3. reduce by ``clid`` with ``UniqueReducer`` into the unique table.
        """
        source_table = self.export_access_log_table_raw + date

        # (status, filtered table, de-duplicated table), in execution order.
        pipelines = (
            ('install', self.install_table_raw + date, self.unique_install_table_raw + date),
            ('dayuse', self.dayuse_table_raw + date, self.unique_dayuse_table_raw + date),
        )

        for status, filtered_table, unique_table in pipelines:
            yt.run_map(
                ExportAccessLogMapper(self.clids, date, status=status),
                source_table=source_table,
                destination_table=filtered_table,
                format=yt.DsvFormat(),
            )
            yt.run_sort(
                source_table=filtered_table,
                destination_table=filtered_table,
                sort_by=['clid'],
            )
            yt.run_reduce(
                UniqueReducer(),
                source_table=filtered_table,
                destination_table=unique_table,
                reduce_by=['clid'],
                format=yt.DsvFormat(),
            )

    def run_immediate_retention_report(self, date):
        """Upload the immediate retention figures for installs made on ``date``.

        Does nothing when the unique-dayuse table for the day after ``date``
        is empty.  Otherwise it computes the single-day and 7-day ("smooth")
        figures and sends one Statface row per clid in ``self.clids`` to the
        ``'immediate retention'`` report.
        """
        next_day = self.get_ndays_difference(input_date=date, n=1)
        if yt.is_empty(self.unique_dayuse_table_raw + next_day):
            return

        installs, retained, retention = self.calculate_immediate_retention(date)
        week_installs, week_retained, week_retention = self.calculate_smooth_immediate_retention(date)

        for clid in self.clids:
            row = {}
            row['fielddate'] = date
            row['clid'] = clid
            row['install'] = len(installs[clid])
            row['next_day_active'] = retained[clid]
            row['immediate_retention'] = retention[clid]
            row['weekspan_installs'] = week_installs[clid]
            row['weekspan_next_day_active'] = week_retained[clid]
            row['smooth_retention'] = week_retention[clid]

            self.update_daily_statface_report(name='immediate retention', data=[row])

    def calculate_immediate_retention(self, date):
        print "immediate retention", date
        current_day = date
        next_day = self.get_ndays_difference(input_date=date, n=1)

        install_table_final = self.unique_install_table_raw + current_day
        dayuse_table_final = self.unique_dayuse_table_raw + next_day

        installation_data = {}
        immediate_retention_numbers = {}
        immediate_retention_stats = {}

        for clid in self.clids:
            installation_data[clid] = []
            immediate_retention_numbers[clid] = 0
            immediate_retention_stats[clid] = 0

        #if not yt.is_empty(dayuse_table_final):
        for line in yt.read_table(install_table_final, format='dsv'):
            items = line.strip().split('\t')

            # Prepare record dictionary
            record = {}
            for item in items:
                key, value = item.split('=', 1)
                record[key] = value

            installation_data[record['clid']].append(record['ui'])

        for line in yt.read_table(dayuse_table_final, format='dsv'):
            items = line.strip().split('\t')

            # Prepare record dictionary
            record = {}
            for item in items:
                key, value = item.split('=', 1)
                record[key] = value

            if record['ui'] in installation_data[record['clid']]:
                immediate_retention_numbers[record['clid']] += 1

        for clid in self.clids:
            if len(installation_data[clid]) != 0:
                immediate_retention_stats[clid] = immediate_retention_numbers[clid] / len(installation_data[clid])

        return installation_data, immediate_retention_numbers, immediate_retention_stats

    def calculate_smooth_immediate_retention(self, date):
        """Compute immediate retention pooled over the 7 days ending at ``date``.

        Calls ``calculate_immediate_retention`` for ``date`` and each of the
        six preceding days and sums the installs and next-day-active counts
        per clid.

        Returns a 3-tuple of dicts keyed by every clid in ``self.clids``:
        ``(weekly_installs, weekly_next_day_active, smooth_retention)``.
        ``smooth_retention`` is the ratio of the two sums, or ``0`` when the
        clid had no installs in the window (the same convention as
        ``calculate_immediate_retention``) instead of raising
        ``ZeroDivisionError``.
        """
        initial_date = self.get_ndays_difference(date, -6)
        weekdays = self.prepare_dates(initial_date, date)

        weekly_installs = dict((clid, 0) for clid in self.clids)
        weekly_next_day_active = dict((clid, 0) for clid in self.clids)
        smooth_retention = {}

        for weekday in weekdays:
            # The per-day ratio (third element) is not needed for pooling.
            data, immediate_retention_numbers, _ = self.calculate_immediate_retention(weekday)

            for clid in self.clids:
                weekly_installs[clid] += len(data[clid])
                weekly_next_day_active[clid] += immediate_retention_numbers[clid]

        for clid in self.clids:
            if weekly_installs[clid] != 0:
                smooth_retention[clid] = weekly_next_day_active[clid] / weekly_installs[clid]
            else:
                # No installs in the whole window: report 0 rather than crash.
                smooth_retention[clid] = 0

        return weekly_installs, weekly_next_day_active, smooth_retention

    def run_real_retention_report(self, date):
        #self.update_retention_pivot_table(date)
        self.execute_retention_pivot_table_operations()
        results = self.calculate_real_retention()

        one_week = self.get_ndays_difference(date, -6)
        two_weeks = self.get_ndays_difference(date, -13)

        print two_weeks


        for day in results:
            print day
            if day < one_week:
            #if day < one_week and day > '2015-07-19':
            #if day == '2015-08-29':
                stat_dict = {}
                stat_dict['fielddate'] = day
                for clid in results[day].keys():
                    stat_dict['clid'] = clid
                    stat_dict['install'] = results[day][clid]['install']
                    stat_dict['0day_active'] = results[day][clid]['zeroactive']
                    stat_dict['0day_retention'] = results[day][clid]['zeroretention']


                    if day < two_weeks:
                        stat_dict['7day_active'] = results[day][clid]['sevenactive']
                        stat_dict['7day_retention'] = results[day][clid]['sevenretention']
                    else:
                        pass
                    #    stat_dict['7day_active'] = ''
                    #    stat_dict['7day_retention'] = ' '

                    #print clid, day < two_weeks, stat_dict['7day_active'], stat_dict['7day_retention']

                    print stat_dict
                    self.update_daily_statface_report('real retention', data=[stat_dict])


    def update_retention_pivot_table(self, date):
        """Append the events of ``date`` to the retention pivot table.

        Runs ``RetentionPivotTableMapper`` over that day's export-access-log
        table and appends its output to ``self.retention_pivot_table``.
        """
        mapper = RetentionPivotTableMapper(self.clids, date)
        day_log_table = self.export_access_log_table_raw + date
        # append=True adds to the existing table instead of overwriting it.
        pivot_target = yt.TablePath(self.retention_pivot_table, append=True)

        yt.run_map(
            mapper,
            source_table=day_log_table,
            destination_table=pivot_target,
            format=yt.DsvFormat(),
        )

    def execute_retention_pivot_table_operations(self):
        # Sort by ui
        print "Sort by ui"
        yt.run_sort(
            source_table='//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable',
            destination_table='//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable-ui-sorted',
            sort_by=['ui']
        )

        # Reduce by ui to get activeness per ui
        print "Reduce by ui to get activeness per ui"
        yt.run_reduce(
            UiActivenessReducer(self.clids, self.date_format),
            source_table='//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable-ui-sorted',
            destination_table='//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable-ui-reduce',
            reduce_by=['ui'],
            format=yt.DsvFormat(),
        )

        # Sort by first_install i.e. install date
        print "Sort by first_install i.e. install date"
        yt.run_sort(
            source_table='//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable-ui-reduce',
            destination_table='//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable-ui-reduce-installdate-sorted',
            sort_by=['first_install']
        )

        # Reduce by install_date to get activeness per ui
        print "Reduce by install_date to get activeness per ui"
        yt.run_reduce(
            ClidActivenessReducer(self.clids),
            source_table='//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable-ui-reduce-installdate-sorted',
            destination_table='//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable-activeness-frequency',
            reduce_by=['first_install'],
            format=yt.DsvFormat(),
        )

    def calculate_real_retention(self):
        """Read the activeness-frequency table and derive retention figures.

        Downloads ``PivotTable-activeness-frequency`` over the YT HTTP API as
        one JSON object per line.  Each row holds ``install_date``, ``clid``,
        ``activeness`` (a number of days, as text) and ``count``.

        Returns ``results[install_date][clid]``, a dict with:

        * ``'install'`` -- sum of all counts,
        * ``'zeroactive'`` -- installs whose activeness is not ``"0"``,
        * ``'zeroretention'`` -- ``zeroactive / install``,
        * ``'sevenactive'`` -- installs whose activeness is greater than 6,
        * ``'sevenretention'`` -- ``sevenactive / install``.

        Raises ``requests.HTTPError`` on a non-success HTTP status and
        ``ValueError`` on a line that is not valid JSON.
        """
        url = 'http://plato.yt.yandex.net/api/v2/read?path=//home/tr-analysts/cansucullu/ANSEARCH-443/PivotTable-activeness-frequency&output_format[$value]=json'
        r = requests.get(url)
        # Fail with a clear error instead of trying to parse an error body.
        r.raise_for_status()

        # data[install_date][clid][activeness] -> number of uis
        data = {}
        for line in r.text.split('\n'):
            line = line.strip()
            if not line:
                # Tolerates the trailing newline and an empty response.
                continue

            # The body is JSON, so parse it as JSON; eval() would execute
            # whatever text the server returned.
            row = json.loads(line)

            # json.loads yields unicode text on Python 2; str() keeps the
            # plain-string keys the rest of this class works with.
            install_date = str(row["install_date"])
            clid = str(row["clid"])
            activeness = str(row["activeness"])

            data.setdefault(install_date, {}).setdefault(clid, {})[activeness] = int(row["count"])

        results = {}
        for install_date in data:
            results[install_date] = {}
            for clid in data[install_date]:
                buckets = data[install_date][clid]

                install = sum(buckets.values())
                # A clid may have no "0" bucket when every install was active.
                zero_active = install - buckets.get("0", 0)
                seven_active = sum(
                    count for activeness, count in buckets.items()
                    if int(activeness) > 6
                )

                results[install_date][clid] = {
                    'install': install,
                    'zeroactive': zero_active,
                    'zeroretention': zero_active / install if install else 0,
                    'sevenactive': seven_active,
                    'sevenretention': seven_active / install if install else 0,
                }

        return results


class ExportAccessLogMapper:
    """YT mapper that extracts extension events of one status from access logs.

    For each input record it parses ``rec['value']`` (tab-separated
    ``key=value`` fields) and yields at most one row with ``clid``, ``ui``,
    ``yandexuid`` and ``timestamp``.  A row is yielded only when:

    * the record has ``request``, ``cookies`` and ``timestamp`` fields,
    * the request query string has ``clid`` (or, failing that, ``clid1``),
      ``stat`` and ``ui`` parameters,
    * the clid is in ``clids_to_check`` and ``stat`` equals ``status``.
    """

    def __init__(self, clids_to_check, date, status):
        self.clids_to_check = clids_to_check
        self.date = date  # stored for callers; not used by __call__
        self.status = status

    def __call__(self, rec):
        data = {}
        for item in rec['value'].split('\t'):
            key, separator, value = item.partition('=')
            if not separator:
                # Skip a malformed field instead of failing the whole job.
                continue
            data[key] = value

        if not ('request' in data and 'cookies' in data and 'timestamp' in data):
            return

        # request part
        decoded = urllib.unquote(data['request'])
        params = urlparse.parse_qs(urlparse.urlparse(decoded).query)

        # cookies part: value of the yandexuid cookie, '-' when absent or empty
        match = re.search('(yandexuid=)([^&]*)', data['cookies'])
        cookie_final = match.group(2).split(';')[0] if match else ''
        if cookie_final == '':
            cookie_final = '-'

        # clid part: 'clid' takes precedence over 'clid1'
        if 'clid' in params:
            clid_final = params['clid'][0]
        elif 'clid1' in params:
            clid_final = params['clid1'][0]
        else:
            return

        # yield part
        if 'stat' not in params or 'ui' not in params:
            return
        if clid_final not in self.clids_to_check:
            return
        if params['stat'][0] != self.status:
            return

        yield {
            'clid': clid_final,
            'ui': params['ui'][0][1:-1],  # eliminate brackets in ui
            'yandexuid': "y" + cookie_final,
            'timestamp': str(data['timestamp']).replace('/', '-').replace(':', '-'),
        }


class UniqueReducer():
    """YT reducer that emits each distinct ``ui`` once per ``clid`` key."""

    def __call__(self, key, recs):
        """Yield ``{'clid': ..., 'ui': ...}`` for every distinct ``ui`` in ``recs``.

        ``ui`` is converted with ``str``.  Output order is unspecified.
        """
        for distinct_ui in set(rec['ui'] for rec in recs):
            yield {'clid': key['clid'], 'ui': str(distinct_ui)}


class RetentionPivotTableMapper:
    """YT mapper that extracts every tracked extension event for the pivot table.

    For each input record it parses ``rec['value']`` (tab-separated
    ``key=value`` fields) and yields at most one row with ``day``, ``clid``,
    ``ui``, ``stat``, ``yandexuid`` and ``timestamp``.  A row is yielded only
    when:

    * the record has ``request``, ``cookies`` and ``timestamp`` fields,
    * the request query string has ``clid`` (or, failing that, ``clid1``),
      ``stat`` and ``ui`` parameters,
    * the clid is in ``clids_to_check``.

    ``day`` is the ``date`` given to the constructor, not a value taken from
    the record.
    """

    def __init__(self, clids_to_check, date):
        self.clids_to_check = clids_to_check
        self.date = date

    def __call__(self, rec):
        data = {}
        for item in rec['value'].split('\t'):
            key, separator, value = item.partition('=')
            if not separator:
                # Skip a malformed field instead of failing the whole job.
                continue
            data[key] = value

        if not ('request' in data and 'cookies' in data and 'timestamp' in data):
            return

        # request part
        decoded = urllib.unquote(data['request'])
        params = urlparse.parse_qs(urlparse.urlparse(decoded).query)

        # cookies part: value of the yandexuid cookie, '-' when the cookie is
        # absent (an empty cookie value is kept as an empty string)
        match = re.search('(yandexuid=)([^&]*)', data['cookies'])
        cookie_final = match.group(2).split(';')[0] if match else '-'

        # clid part: 'clid' takes precedence over 'clid1'
        if 'clid' in params:
            clid_final = params['clid'][0]
        elif 'clid1' in params:
            clid_final = params['clid1'][0]
        else:
            return

        # yield part
        if 'stat' not in params or 'ui' not in params:
            return
        if clid_final not in self.clids_to_check:
            return

        yield {
            'day': self.date,
            'clid': clid_final,
            'ui': params['ui'][0][1:-1],  # eliminate brackets in ui
            'stat': params['stat'][0],
            'yandexuid': "y" + cookie_final,
            'timestamp': str(data['timestamp']).replace('/', '-').replace(':', '-'),
        }


class UiActivenessReducer():
    """YT reducer that turns one ui's events into an activeness row per clid.

    For each clid in ``clids`` that has at least one ``install`` event for the
    ui, it yields a row with:

    * ``first_install`` -- the earliest install day,
    * ``last_dayuse`` -- the latest ``dayuse`` day, or ``'0'`` when there is
      none,
    * ``activeness`` -- days from ``first_install`` to ``last_dayuse`` as
      text, or ``'0'`` when there is no dayuse event.

    Events whose clid is not in ``clids``, or whose ``stat`` is neither
    ``install`` nor ``dayuse``, are ignored.
    """

    def __init__(self, clids, date_format):
        self.clids = clids
        self.date_format = date_format

    def __call__(self, key, recs):
        # data[clid][stat] -> list of datetimes on which the event occurred
        data = {}
        for clid in self.clids:
            data[clid] = {'install': [], 'dayuse': []}

        for rec in recs:
            clid = rec['clid']
            stat = rec['stat']
            day = rec['day']

            if clid in data and stat in data[clid]:
                data[clid][stat].append(datetime.datetime.strptime(day, self.date_format))

        for clid in data:
            installs = data[clid]['install']
            if not installs:
                # Without an install record there is nothing to anchor on.
                continue

            first_install = min(installs)
            dayuses = data[clid]['dayuse']

            if dayuses:
                last_dayuse = max(dayuses)
                activeness = str((last_dayuse - first_install).days)
                last_dayuse_text = last_dayuse.strftime(self.date_format)
            else:
                activeness = str(0)
                last_dayuse_text = '0'

            yield {
                'ui': key['ui'],
                'clid': clid,
                'activeness': activeness,
                'first_install': first_install.strftime(self.date_format),
                'last_dayuse': last_dayuse_text,
            }


class ClidActivenessReducer():
    """YT reducer that counts uis per (clid, activeness) for one install date.

    Input records are grouped by ``first_install``.  For every clid in
    ``clids`` and every distinct ``activeness`` value seen for it, one row is
    yielded with ``install_date``, ``clid``, ``activeness`` and ``count``
    (as text).  Records whose clid is not in ``clids`` are ignored.  Only the
    ``clid`` and ``activeness`` fields of a record are read.
    """

    def __init__(self, clids):
        self.clids = clids

    def __call__(self, key, recs):
        # counts[clid][activeness] -> number of records
        counts = {}
        for clid in self.clids:
            counts[clid] = {}

        for rec in recs:
            clid = rec['clid']
            if clid in counts:
                activeness = rec['activeness']
                counts[clid][activeness] = counts[clid].get(activeness, 0) + 1

        for clid in counts:
            for activeness, count in counts[clid].items():
                yield {
                    'install_date': key['first_install'],
                    'clid': clid,
                    'activeness': activeness,
                    'count': str(count),
                }


# Script entry point: build a calculator and run its endless polling loop.
# The commented-out lines below are earlier one-off invocations (backfills and
# manual checks) kept for reference; none of them run.
if __name__ == '__main__':
    api = RetentionCalculator()
    #print api.prepare_dates('2015-06-20', '2015-09-27')
    #days = api.prepare_dates('2015-08-31', '2015-09-03')
    #api.update_daily_stat_tables(date='2015-09-08')
    #api.run_immediate_retention_report('2015-09-07')
    #api.execute_retention_pivot_table_operations()
    #for i in range(len(days))[:-1]:
    #    api.update_daily_stat_tables(date=days[i+1])
    #    api.run_immediate_retention_report(days[i])

    # NOTE(review): yasoft_checker below is not defined on RetentionCalculator
    # in this file.
    #days = api.prepare_dates('2015-09-08', '2015-09-12')
    #print days
    #for day in days:
    #    api.update_retention_pivot_table(day)
    #    api.run_immediate_retention_report(day)
    #    api.yasoft_checker(day)

    #api.check_export_access_log()
    #api.run_real_retention_report('2015-09-14')
    #print api.get_ndays_difference('2015-09-12',-14)

    # Blocks forever: mainloop() only returns by raising.
    api.mainloop()
    #date = '2015-08-30'
    #begin_date = api.get_ndays_difference(date, -6)
    #days = api.prepare_dates(begin_date, date)
    #print days





