#!/usr/bin/env python2.7
# coding=utf-8
'''
Counts installs and audience from redir_log (plus export-access-log for the
browser) and groups them by 'country', 'productname', 'medium', 'placement'.
Link to stat report:
https://stat.yandex-team.ru/Yandex_RU/Special/Metrics/Switch/portal

Usage:
    %prog '2015-01-01' '2015-12-31' 'YT/RESULT/TABLEPATH'
'''
from datetime import datetime, timedelta
from itertools import groupby, combinations, chain
from operator import itemgetter
import sys
import re
import json
from helpers import update_stat, daterange, country_by_clid

try:
    import yt.wrapper as yt
except ImportError:
    # Keep the module importable without the YT client (e.g. a local run):
    # bind the name so the configuration below can be skipped instead of
    # failing with a NameError right after this message.
    yt = None
    print('yt.wrapper not imported (probably script started locally)')
else:
    # SECURITY(review): the access token is hard-coded in the source file.
    # It should be moved out of the repository (environment / secret
    # storage) and rotated; the value is left unchanged here on purpose.
    yt.config['token'] = 'AVImKQMAAAO3VDAg54V4QjeI8eaNZJSQfA'
    yt.config.set_proxy('hahn.yt.yandex.net')
    yt.config["pickling"]["enable_tmpfs_archive"] = False

# Local copy of channelsgrid.json, the CLID -> country mapping. Origin:
# svn+ssh://arcadia.yandex.ru/arc/trunk/arcadia/analytics/chikachoff/distribution/channelsgrid.json
channelsgrid_path = "./channelsgrid.json"

# Dictionary file that maps banner id (bnrd) codes to placement/medium names
# resources_dir = '../../../serp/mobile/ws-atom/src/main/resources'
context_dict_path = './context_dict.txt'

# Location of the stat report this script updates
stat_host = 'https://stat.yandex-team.ru/'
stat_path = 'Yandex_RU/Special/Metrics/Switch/portal'
stat_url = stat_host + stat_path


def vars_to_dict(vars):
    '''
    Parse a comma-separated string of "key=value" items into a dict.

    Items without '=' are ignored. Only the first '=' separates key from
    value, so a value may itself contain '='. One leading '-' is removed
    from a key. When a key repeats, the last occurrence wins.
    '''
    parsed = {}
    for chunk in vars.split(','):
        name, sep, val = chunk.partition('=')
        if not sep:
            # no '=' in this item: nothing to store
            continue
        parsed[name[1:] if name.startswith('-') else name] = val
    return parsed


def bnrd_to_human(bnrd, context_dict):
    '''
    Decode a portal banner id (bnrd) into human-readable names.

    For portal bnrds only.

    bnrd         -- banner id; converted with str(). Characters [0:2] are
                    looked up as the placement code, [2:4] as the medium code.
    context_dict -- iterable of tab-separated lines
                    "<table>\\t<code>\\t<name>[\\t...]"; table '0' holds
                    placements, table '1' holds mediums. Lines with fewer
                    than three columns or another table number are ignored.

    Returns a dict with keys 'distype' (always 'nonswitch'), 'medium' and
    'placement'. A code missing from the dictionary gives
    'not_in_contextdict'.
    '''
    bnrd = str(bnrd)

    placements = {}
    mediums = {}
    tables = {'0': placements, '1': mediums}
    for line in context_dict:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            # Blank or truncated line: skip it rather than fail with an
            # IndexError on the missing column.
            continue
        table = tables.get(fields[0])
        if table is not None:
            table[fields[1]] = fields[2]

    medium = mediums.get(bnrd[2:4], 'not_in_contextdict')
    placement = placements.get(bnrd[0:2], 'not_in_contextdict')

    # Normalise placement names so that each medium uses a single name for
    # the footer/link placement.
    if medium in ('distr_serp', 'images'):
        if placement in ('soft_link', 'promofooter_mobile'):
            placement = 'promofooter'
    if medium in ('distr_portal', 'distr_mail', 'avia'):
        if placement in ('promofooter', 'promofooter_mobile'):
            placement = 'soft_link'

    return {
        'distype': 'nonswitch',
        'medium': medium,
        'placement': placement,
    }


def _filter(rec):
    # products = [
    #     'searchextchrome',
    #     'startextchrome',
    #     'homesearchextchrome',
    #     'vbch',
    #     'altsearchchrome',
    #     'winsearchbar',
    #     'vb',
    #     'searchline'
    # ]
    splitstrs = (item.split('=', 1) for item in rec['value'].split('\t'))
    value = {name: value for name, value in splitstrs}
    try:
        assert 'vars' in value
        assert 'yandexuid' in value
        assert 'unixtime' in value
        assert ',' in value['vars']
        vars = vars_to_dict(value['vars'])
        assert 'dayuse' in vars
        assert 'clid1' in vars
        assert 'bnrd' in vars
        assert vars['bnrd'][2:4] not in ['97', '98', '99']
        # assert vars['productname'] in products
    except AssertionError:
        return
    yield vars


class Mapper():
    '''
    Map function object: turns a filtered redir-log record into a row with
    the report dimensions.

    context_dict -- lines of the bnrd dictionary, passed to bnrd_to_human
    channelsgrid -- parsed channelsgrid.json, passed to country_by_clid
    '''
    def __init__(self, context_dict, channelsgrid):
        self.context_dict = context_dict
        self.channelsgrid = channelsgrid
        # Raw string, so that \d is a regex digit class and not an (invalid)
        # string escape sequence.
        self.clid_pattern = re.compile(r'^clid\d+')

    def __call__(self, rec):
        '''
        Yield one row with keys 'dayuse', 'medium', 'placement',
        'productname' and 'country' for the record.
        '''
        bnrd = bnrd_to_human(rec.get('bnrd', '----------'), self.context_dict)

        country = country_by_clid(rec['clid1'], self.channelsgrid)
        if country:
            country = country.upper()
        elif rec.get('productname') in (
            'vbch',
            'altsearchchrome',
            'winsearchbar',
            'vb',
        ):
            # No country found for the clid: these products default to RU.
            country = 'RU'
        # NOTE(review): for any other product the falsy value returned by
        # country_by_clid is emitted unchanged - confirm how it is serialised.

        yield {
            'dayuse': rec['dayuse'],
            'medium': bnrd.get('medium', 'undefined'),
            'placement': bnrd.get('placement', 'undefined'),
            'productname': rec.get('productname'),
            'country': country,
        }


class Reducer():
    '''
    Reduce function object: counts audience and installs for one key.

    Every record of the group adds one to 'audience'; a record whose
    'dayuse' equals '0' also adds one to 'installs'.
    '''
    def __call__(self, key, recs):
        audience = installs = 0
        # enumerate from 1 leaves the number of records seen in `audience`
        for audience, rec in enumerate(recs, 1):
            if rec['dayuse'] == '0':
                installs += 1

        summary = {name: key[name]
                   for name in ('medium', 'placement', 'productname')}
        # an empty or missing country is reported as 'unknown'
        summary['country'] = key.get('country') or 'unknown'
        summary['audience'] = audience
        summary['installs'] = installs
        yield summary


# Special for browser #########################################################
class ExportAccessMapper():
    '''
    Map function object for export-access-log: keeps browser install/dayuse
    requests and turns them into rows with 'country', 'event', 'placement',
    'medium' and the original 'request'.

    context_dict -- lines of the bnrd dictionary, passed to bnrd_to_human
    '''
    def __init__(self, context_dict):
        self.context_dict = context_dict

    def __call__(self, rec):
        request = rec['request']

        # filter: explicit checks instead of assert (asserts are removed
        # under "python -O")
        if 'stat=dayuse' not in request and 'stat=install' not in request:
            return
        required = ('yasoft=yabrowser', 'brandID=', 'banerid=')
        if not all(marker in request for marker in required):
            return

        # map: parse the query string that follows the first 'xml?'
        parts = request.split('xml?')
        query = parts[1] if len(parts) > 1 else ''
        params = {}
        for elem in query.split('&'):
            pieces = elem.split('=')
            if len(pieces) > 1:
                # An element without '=' is skipped; it no longer stops the
                # parsing of the parameters that follow it.
                params[pieces[0]] = pieces[1]

        bnrd = bnrd_to_human(params.get('banerid'), self.context_dict)
        for name in (bnrd['medium'], bnrd['placement']):
            # drop banner ids that could not be decoded into real names
            if name.isdigit() or name == 'not_in_contextdict':
                return

        brand_id = params.get('brandID')
        if brand_id is None:
            # 'brandID=' occurred in the request but not as a parsed
            # parameter: skip the record instead of failing on None[:2].
            return

        yield {
            'country': brand_id[:2].replace('ya', 'ru').upper(),
            'event': params.get('stat'),
            'placement': bnrd['placement'],
            'medium': bnrd['medium'],
            'request': request
        }


class ExportAccessReducer():
    '''
    Reduce function object for the browser rows: counts 'install' events as
    installs and 'dayuse' events as audience; other events are ignored.
    The product name of the resulting row is always 'browser'.
    '''
    def __call__(self, key, recs):
        tally = {'install': 0, 'dayuse': 0}
        for rec in recs:
            event = rec['event']
            if event in tally:
                tally[event] += 1

        summary = {name: key[name] for name in ('medium', 'placement')}
        summary['productname'] = 'browser'
        # an empty or missing country is reported as 'unknown'
        summary['country'] = key.get('country') or 'unknown'
        summary['audience'] = tally['dayuse']
        summary['installs'] = tally['install']
        yield summary
# End special for browser #####################################################


def powerset(iterable):
    '''
    Return an iterator over all subsets of iterable as tuples, ordered by
    size: the empty tuple first, the full tuple last.
    '''
    pool = list(iterable)
    by_size = [combinations(pool, size) for size in range(len(pool) + 1)]
    return chain(*by_size)


def _aggregate(data, dimensions, dim_combinations):
    '''
    Build roll-up rows: for every combination in dim_combinations, sum
    'audience' and 'installs' over the remaining dimensions and set the
    rolled-up dimensions to 'all'.

    data             -- list of dict rows holding every dimension plus
                        'audience' and 'installs' (values accepted by int())
    dimensions       -- set of all dimension names
    dim_combinations -- iterable of tuples of dimensions to roll up; each
                        must leave at least one dimension to group by

    Returns a list of new rows; data itself is not modified.
    '''
    aggregations = []
    for dim_comb in dim_combinations:
        current_dimensions = list(dimensions.difference(dim_comb))
        grouper = itemgetter(*current_dimensions)
        for key, grp in groupby(sorted(data, key=grouper), grouper):
            if len(current_dimensions) == 1:
                # itemgetter returns a bare value (not a tuple) for a single
                # name; decide by the number of names, not by the value type.
                key = (key,)
            row = dict(zip(current_dimensions, key))
            row['audience'] = 0
            row['installs'] = 0
            for item in grp:
                row['audience'] += int(item['audience'])
                row['installs'] += int(item['installs'])
            for dimension in dim_comb:
                row[dimension] = 'all'
            aggregations.append(row)
    return aggregations


def main():
    '''
    For every day from sys.argv[1] to sys.argv[2] (each defaults to
    yesterday): run the redir-log and export-access-log map/reduce jobs on
    YT, add the roll-up rows, print the rows and pass them to update_stat.

    The intermediate YT tables are erased before and after each day.
    '''
    filtered = '//home/ranking/chikachoff/portal/filtered'
    mapped = '//home/ranking/chikachoff/portal/mapped'
    reduced = '//home/ranking/chikachoff/portal/reduced'
    softexportmap = '//home/ranking/chikachoff/portal/softexportmapped'
    scratch_tables = (filtered, mapped, softexportmap, reduced)

    source_table_prefix = '//statbox/redir-log/'
    exportaccess_table_prefix = '//statbox/export-access-log/'
    with open(context_dict_path) as fp:
        context_dict = fp.readlines()
    with open(channelsgrid_path) as fp:
        channelsgrid = json.load(fp)

    yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    # Only a missing argument falls back to yesterday; nothing else is hidden.
    datefrom = sys.argv[1] if len(sys.argv) > 1 else yesterday
    dateto = sys.argv[2] if len(sys.argv) > 2 else yesterday

    dimensions = {'country', 'productname', 'medium', 'placement'}
    # Every proper, non-empty subset of the dimensions: the first (empty) and
    # the last (full) combinations are dropped.
    dim_combinations = list(powerset(dimensions))[1:-1]
    export_dimensions = ['country', 'medium', 'placement']
    medium_titles = {'distr_portal': 'Morda', 'distr_serp': 'Serp'}

    # NOTE(review): daterange is assumed to yield day strings that match the
    # log table names - confirm against helpers.
    for day in daterange(datefrom, dateto):
        redirlog_table = source_table_prefix + day
        export_access_log_table = exportaccess_table_prefix + day

        for table in scratch_tables:
            yt.run_erase(table)

        # redir-log: filter -> map to dimensions -> count per dimension set
        yt.run_map(
            _filter,
            source_table=redirlog_table,
            destination_table=filtered,
            format=yt.DsvFormat()
        )
        yt.run_map(
            Mapper(context_dict, channelsgrid),
            source_table=filtered,
            destination_table=mapped,
            format=yt.DsvFormat(),
            local_files=[context_dict_path, channelsgrid_path]
        )
        yt.run_sort(mapped, mapped, sort_by=list(dimensions))
        yt.run_reduce(
            Reducer(),
            source_table=mapped,
            destination_table=yt.TablePath(reduced, append=True),
            format=yt.DsvFormat(),
            reduce_by=list(dimensions),
        )

        # export-access-log: browser rows, appended to the same result table
        yt.run_map(
            ExportAccessMapper(context_dict),
            source_table=export_access_log_table,
            destination_table=softexportmap,
            format=yt.DsvFormat(),
            local_files=[context_dict_path]
        )
        yt.run_sort(softexportmap, softexportmap, sort_by=export_dimensions)
        yt.run_reduce(
            ExportAccessReducer(),
            source_table=softexportmap,
            destination_table=yt.TablePath(reduced, append=True),
            format=yt.DsvFormat(),
            reduce_by=export_dimensions,
        )
        yt.run_sort(reduced, reduced,
                    sort_by=list(dimensions) + ['installs', 'audience'])
        data = list(yt.read_table(reduced, format='dsv', raw=False))

        data = data + _aggregate(data, dimensions, dim_combinations)

        for rec in data:
            rec['fielddate'] = day
            rec['medium'] = medium_titles.get(rec['medium'], rec['medium'])

        for row in data:
            # single-argument print(...) behaves the same on Python 2 and 3
            print(row)
        update_stat(data, stat_path)
        print('Portal updated date: {}. Url: {}'.format(day, stat_url))

        for table in scratch_tables:
            yt.run_erase(table)
        print('Tables in YT erased')


if __name__ == '__main__':
    main()
