#!/usr/bin/env python2.7
# coding=utf-8
'''
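Counts installs and audience from redir_log. Records are grouped by
'country', 'productname' and 'bnrd'; 'distype' and 'dayuse' are used to
filter records and to tell installs apart from the rest of the audience.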
Link to stat report:
https://stat.yandex-team.ru/Yandex_RU/Special/Metrics/Switch/portal

Usage:
python productname.py '2015-01-01' '2015-12-31' '//home/ranking/chikachoff/distribution/tmp'
'''
from datetime import datetime, timedelta
from itertools import groupby, combinations, chain
from operator import itemgetter
import ClickhouseConnector as ClHouse
import sys
import os
import json
import urllib2
import urllib

def module_filter(module):
    """Predicate installed below as yt's pickling module filter.

    Returns False for the module named 'uatraits' and for any module whose
    name starts with 'statbox'; returns True otherwise, including for falsy
    module objects.
    """
    if not module:
        return True
    name = getattr(module, '__name__', '')
    return not (name == 'uatraits' or name.startswith('statbox'))


# yt.wrapper is optional: when it cannot be imported or configured the script
# only reports that fact, so the pure helpers stay usable locally.
try:
    import yt.wrapper as yt
    yt.config["auto_merge_output"]["action"] = "merge"
    yt.config["pickling"]["module_filter"] = module_filter
    yt.config.set_proxy('hahn.yt.yandex.net')
    yt.config.CREATE_RECURSIVE = True
    yt.config.TREAT_UNEXISTING_AS_EMPTY = True
    # This line used to sit at column 0, which closed the ``try`` body before
    # its ``except`` clause and made the whole file a SyntaxError.
    yt.config["pickling"]["enable_tmpfs_archive"] = False
except Exception:
    print('yt.wrapper not imported (probably script started locally)')

# Auxiliary resource files, given relative to the working directory.
channelsgrid_path = "channelsgrid.json"
# context_dict_path = '../../../serp/mobile/ws-atom/src/main/resources/context_dict.txt'
context_dict_path = './context_dict.txt'
# Stat host and report path; stat_url is simply the two glued together.
stat_host = 'https://stat.yandex-team.ru/'
stat_path = 'Yandex_RU/Special/Metrics/Switch/portal'
stat_url = stat_host + stat_path


def update_stat(rows, stat_path):
    """Upload *rows* to the Stat report named *stat_path*.

    The rows are JSON-encoded under the 'values' key and sent as url-encoded
    form data to the Stat report-data endpoint.

    Args:
        rows: JSON-serializable collection of report rows.
        stat_path: report name, sent as the 'name' form field.

    Raises:
        urllib2.HTTPError: re-raised after the response body is printed.
    """
    url = 'https://stat.yandex-team.ru/_api/report/data'
    # NOTE(review): robot credentials are hard-coded in source; consider
    # moving them to a secret store or environment variables.
    headers = {'StatRobotUser': 'robot_aydogank',
               'StatRobotPassword': 'Aish3ohy9u'}
    values = {'name': stat_path,
              'scale': 'd',
              'data': json.dumps({'values': rows})}
    req = urllib2.Request(url, urllib.urlencode(values), headers)
    try:
        response = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        # Show the server's explanation before propagating the failure.
        print(e.read())
        raise
    # Release the connection explicitly instead of relying on garbage
    # collection of the unused response object.
    response.close()


def in_ru_clids(rec):
    """Return True if any clid field of *rec* holds an id listed for RU.

    *rec* only needs a ``get`` method; absent fields never match.
    """
    known_ids = (
        ('clid1', ('2163765', '2189879', '2219044', '2241975', '2224312',
                   '2241982', '2189880', '2241993', '2228989', '2244045',
                   '2063708', '2063717')),
        ('clid5', ('2163430', '2187268', '2219045', '2241976', '2224313',
                   '2242347', '2063709', '2063718')),
        ('clid6', ('2224314', '2242348', '2196598', '2241994', '2228990',
                   '2063710', '2063719')),
        ('clid7', ('2063711', '2063720')),
        ('clid8', ('2063712', '2063721')),
        ('clid9', ('2187644', '2241995', '2063713', '2063722')),
        ('clid10', ('2164776', '2224772', '2241977', '2224315', '2241983',
                    '2220366', '2241996', '2228991', '2244046')),
        ('clid14', ('2164454', '2063723')),
        ('clid15', ('2244047', '2063714', '2063724')),
        ('clid17', ('2063715', '2063725')),
        ('clid18', ('2244048', '2063716', '2063726')),
        ('clid20', ('2164455', '2164456')),
        ('clid23', ('2241984',)),
    )
    return any(rec.get(field) in ids for field, ids in known_ids)


def in_ua_clids(rec):
    """Return True if any clid field of *rec* holds an id listed for UA.

    *rec* only needs a ``get`` method; absent fields never match.
    """
    known_ids = (
        ('clid1', ('2167025', '2189881', '2224316', '2242623', '2242626',
                   '2242630', '2065991', '2066001')),
        ('clid5', ('2167026', '2224317', '2242624', '2242627', '2065992',
                   '2066002')),
        ('clid6', ('2196599', '2224318', '2242628', '2242631', '2065993',
                   '2066003')),
        ('clid7', ('2065994', '2066004')),
        ('clid8', ('2065995', '2066005')),
        ('clid9', ('2187646', '2242632', '2065996', '2066006')),
        ('clid10', ('2167027', '2220367', '2224319', '2242625', '2242629',
                    '2242633')),
        ('clid14', ('2065997', '2066007')),
        ('clid15', ('2065998', '2066008')),
        ('clid17', ('2065999', '2066009')),
        ('clid18', ('2066000', '2066010')),
        ('clid20', ('2164459', '2164457')),
    )
    return any(rec.get(field) in ids for field, ids in known_ids)


def in_tr_clids(rec):
    """Return True if any clid field of *rec* holds an id listed for TR.

    *rec* only needs a ``get`` method; absent fields never match.
    """
    known_ids = (
        ('clid1', ('2167028', '2189882', '2219050', '2219052', '2224320',
                   '2235099', '2236657', '2238709', '2238712', '2238715',
                   '2241979', '2242002', '2242008', '2066011', '2066021')),
        ('clid10', ('2167030', '2220368', '2224323', '2224771', '2224775',
                    '2235102', '2236660', '2238711', '2238714', '2238717',
                    '2241981', '2242003', '2242011')),
        ('clid23', ('2242004',)),
        ('clid5', ('2167029', '2224321', '2235100', '2236658', '2238710',
                   '2238713', '2238716', '2241980', '2242349', '2066012',
                   '2066022')),
        ('clid6', ('2196600', '2219051', '2219053', '2224322', '2235101',
                   '2236659', '2242009', '2242350', '2066013', '2066023')),
        ('clid7', ('2219054', '2066014', '2066024')),
        ('clid8', ('2219055', '2066015', '2066025')),
        ('clid9', ('2187647', '2242010', '2066016', '2066026')),
        ('clid14', ('2066017', '2066027')),
        ('clid15', ('2066018', '2066028')),
        ('clid17', ('2066019', '2066029')),
        ('clid18', ('2066020', '2066030')),
        ('clid20', ('2164460', '2164458')),
    )
    return any(rec.get(field) in ids for field, ids in known_ids)


def in_kz_clids(rec):
    """Return True if any clid field of *rec* holds an id listed for KZ."""
    known_ids = (
        ('clid1', ('2246653',)),
        ('clid6', ('2246654',)),
        ('clid9', ('2246655',)),
        ('clid10', ('2246656',)),
    )
    for field, ids in known_ids:
        if rec.get(field) in ids:
            return True
    return False


def in_by_clids(rec):
    """Return True if any clid field of *rec* holds an id listed for BY."""
    known_ids = (
        ('clid1', ('2246649',)),
        ('clid6', ('2246650',)),
        ('clid9', ('2246651',)),
        ('clid10', ('2246652',)),
    )
    for field, ids in known_ids:
        if rec.get(field) in ids:
            return True
    return False


def update_banerids():
    """Run ``svn up`` on the local ``resources/`` directory via the shell.

    The command's exit status is ignored.
    """
    command = 'svn up resources/'
    os.system(command)


def daterange(start, end):
    """Yield every day from *start* to *end* inclusive as 'YYYY-MM-DD'.

    Both bounds are 'YYYY-MM-DD' strings; nothing is yielded when *start*
    is later than *end*.
    """
    fmt = '%Y-%m-%d'
    one_day = timedelta(days=1)
    day = datetime.strptime(start, fmt)
    last = datetime.strptime(end, fmt)
    while not day > last:
        yield day.strftime(fmt)
        day = day + one_day


def vars_to_dict(vars_value):
    """Parse a comma-separated 'key=value' string into a dict.

    Items without '=' are dropped, a single leading '-' is removed from each
    key, values keep any further '=' characters, and a repeated key keeps
    its last value.
    """
    parsed = {}
    for chunk in vars_value.split(','):
        name, separator, value = chunk.partition('=')
        if not separator:
            continue
        if name.startswith('-'):
            name = name[1:]
        parsed[name] = value
    return parsed


def _parse_context_dict(context_dict):
    """Build the placement and medium lookups from context dictionary lines.

    Each line is stripped and split on tabs into ``kind, code, label``.
    Kind '0' rows fill the placement table and kind '1' rows the medium
    table; lines with fewer than three fields are skipped. The input is
    walked exactly once, so one-shot iterables work too.
    """
    placements = {}
    mediums = {}
    for line in context_dict:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            continue
        kind, code, label = fields[:3]
        if kind == '0':
            placements[code] = label
        elif kind == '1':
            mediums[code] = label
    return placements, mediums


def bnrd_to_human(bnrd, context_dict):
    """Decode a bnrd code into human-readable attributes.

    Args:
        bnrd: the code; converted with ``str`` before slicing.
        context_dict: iterable of tab-separated dictionary lines, consulted
            only for non-switch codes.

    Returns:
        A dict with 'distype' ('switch' or 'nonswitch') and 'medium', plus
        'campaign' for switch codes or 'placement' for non-switch ones.
        Unknown codes map to 'undefined'.
    """
    bnrd = str(bnrd)
    human = {}
    # For switch bnrds: the third character is '9'.
    if bnrd[2:3] == '9':
        mediums = {'10': 'gdn',
                   '08': 'gsearch',
                   '59': 'facebook',
                   '12': 'yandex',
                   '00': 'default'}
        # targeting = {'04': 'chrome',
        #              '05': 'opera',
        #              '06': 'ff',
        #              '07': 'ie',
        #              '00': 'default',
        #              }
        campaigns = {"00": "default",
                     "01": "brand",
                     "02": "cartoons",
                     "03": "currency",
                     "04": "fines",
                     "05": "football",
                     "06": "panorama",
                     "07": "recipe",
                     "08": "series",
                     "09": "translate",
                     "10": "vz",
                     "11": "timetable",
                     "12": "mobile",
                     "13": "address",
                     "14": "auto",
                     "15": "benzin",
                     "16": "buy",
                     "17": "calculator",
                     "18": "games",
                     "19": "index",
                     "20": "khl",
                     "21": "maps",
                     "22": "metro",
                     "23": "mobile",
                     "24": "rabota",
                     "25": "spikes",
                     "26": "switch",
                     "27": "traffic",
                     "28": "weather",
                     "29": "tv",
                     "30": "music",
                     "31": "brand_start",
                     "32": "pay_tax"
                     }
        human['distype'] = 'switch'
        human['medium'] = mediums.get(bnrd[:2], 'undefined')
        human['campaign'] = campaigns.get(bnrd[6:8], 'undefined')
    # For portal:
    else:
        placements, mediums = _parse_context_dict(context_dict)
        human['distype'] = 'nonswitch'
        human['medium'] = mediums.get(bnrd[2:4], 'undefined')
        human['placement'] = placements.get(bnrd[0:2], 'undefined')
    return human


class Mapper(object):
    """Map step: turn raw log rows into per-install records.

    A row is emitted only when its 'value' carries 'vars', 'yandexuid' and
    'unixtime' fields, the vars contain 'dayuse' and a known distribution
    clid, and the bnrd decodes to the 'nonswitch' distribution type.
    """

    def __init__(self, context_dict):
        # Lines of the context dictionary, handed to bnrd_to_human per row.
        self.context_dict = context_dict

    def _detect_country(self, vars_data):
        """Return the country whose clid list matches, or None.

        The order fixes the priority when several lists match.
        """
        checks = (('RU', in_ru_clids),
                  ('TR', in_tr_clids),
                  ('UA', in_ua_clids),
                  ('BY', in_by_clids),
                  ('KZ', in_kz_clids))
        for country, matches in checks:
            if matches(vars_data):
                return country
        return None

    def __call__(self, rec):
        """Yield at most one record for the raw row *rec*.

        *rec['value']* is a tab-separated list of 'key=value' fields.
        Fields without '=' are skipped rather than raising, so a single
        malformed row cannot abort the job.
        """
        data = {}
        for item in rec['value'].split('\t'):
            key, separator, value = item.partition('=')
            if separator:
                data[key] = value
        if not ('vars' in data and 'yandexuid' in data
                and 'unixtime' in data):
            return
        if ',' not in data['vars']:
            return

        vars_data = vars_to_dict(data['vars'])
        if 'dayuse' not in vars_data:
            return
        country = self._detect_country(vars_data)
        if country is None:
            # No known distribution clid in this row.
            return

        bnrd = vars_data.get('bnrd', '----------')
        # Only the distribution type of the decoded bnrd is needed here.
        bnrd_hum = bnrd_to_human(bnrd, self.context_dict)
        if bnrd_hum['distype'] != 'nonswitch':
            return
        yield {
            'dayuse': vars_data['dayuse'],
            # 'distype': bnrd['distype'],
            # 'medium': bnrd.get('medium', 'undefined'),
            # 'campaign': bnrd.get('campaign', 'undefined'),
            # 'placement': bnrd.get('placement', 'undefined'),
            'productname': vars_data.get('productname'),
            'country': country,
            'bnrd': bnrd
            # 'yandexuid': data['yandexuid']
        }


class Reducer(object):
    """Reduce step: count audience and installs for one key."""

    def __call__(self, key, recs):
        """Yield one summary row for *key*.

        Every record counts towards 'audience'; records whose 'dayuse' is
        the string '0' also count towards 'installs'.
        """
        audience = 0
        installs = 0
        for rec in recs:
            audience += 1
            if rec['dayuse'] == '0':
                installs += 1
            # yandexuids.append(rec['yandexuid'])
        yield {'bnrd': key['bnrd'],
               # 'campaign': key['campaign'],
               # 'placement': key['placement'],
               'productname': key['productname'],
               'country': key['country'],
               'audience': audience,
               'installs': installs,
               # 'yandexuids': ','.join(set(yandexuids))
               }


def powerset(iterable):
    """Return an iterator over all subsets of *iterable* as tuples.

    Subsets come out by increasing size, starting with the empty tuple.
    """
    items = list(iterable)
    sizes = range(len(items) + 1)
    subsets_by_size = (combinations(items, size) for size in sizes)
    return chain.from_iterable(subsets_by_size)


def to_chunks(lst, chunksize=100000):
    """Yield consecutive slices of *lst*, each at most *chunksize* long.

    Args:
        lst: a sequence supporting ``len`` and slicing.
        chunksize: maximum length of each slice.
    """
    # ``range`` instead of ``xrange``: same offsets, and the helper no
    # longer depends on a Python-2-only builtin.
    for offset in range(0, len(lst), chunksize):
        yield lst[offset:offset + chunksize]


def _find_searches_with_retry(uids, start, end, attempts=3):
    """Call ClHouse.findSearches until it returns a non-empty result.

    Gives up after *attempts* calls and returns whatever the last call
    produced; 0, '' and '0' are treated as empty results worth retrying.
    """
    result = 0
    for _ in range(attempts):
        result = ClHouse.findSearches(uids, start, end)
        if result not in (0, '', '0'):
            break
    return result


def get_hits(data, start, end):
    """Attach a 'hits' value to every element of *data*, in place.

    Each element must carry a comma-separated 'yandexuids' string; it is
    removed once 'hits' has been stored. Uid lists longer than 100000 are
    queried in chunks and summed. On an HTTP error the failure is printed
    and the hits gathered so far (0 if none) are kept.

    Args:
        data: iterable of dicts to update.
        start, end: passed through to ClHouse.findSearches.
    """
    for elem in data:
        uids = elem['yandexuids'].split(',')
        hits = 0
        try:
            if len(uids) <= 100000:
                hits = _find_searches_with_retry(uids, start, end)
            else:
                for bucket in to_chunks(uids):
                    found = _find_searches_with_retry(bucket, start, end)
                    # An empty answer counts as zero instead of crashing
                    # int('').
                    hits += int(found or 0)
        except urllib2.HTTPError as error:
            print('failed to get searches from Clickhouse')
            tmpl = 'Country: {}, prodname: {}, medium: {}, placement: {}'
            # 'medium' and 'placement' may be absent from the row, so they
            # are looked up defensively.
            print(tmpl.format(elem.get('country'),
                              elem.get('productname'),
                              elem.get('medium', 'undefined'),
                              elem.get('placement', 'undefined')))
            print('Error details:\n{}'.format(error.read()))
        elem['hits'] = hits
        elem.pop('yandexuids')


def _argv_or_default(index, default):
    """Return sys.argv[index], or *default* when that argument is missing."""
    try:
        return sys.argv[index]
    except IndexError:
        return default


def main():
    """Run the map-reduce for every requested day and print the rows.

    Command-line arguments, all optional:
        1: first day, 'YYYY-MM-DD' (default: yesterday)
        2: last day, 'YYYY-MM-DD' (default: yesterday)
        3: destination table (default: a fixed table path)

    For each day the destination table is erased, filled by the map-reduce
    over that day's redir-log table, sorted and read back; every row gets a
    'fielddate' and is printed.
    """
    # update_banerids()
    with open(context_dict_path) as fp:
        context_dict = fp.readlines()
    source_table_prefix = '//statbox/redir-log/'
    yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')

    datefrom = _argv_or_default(1, yesterday)
    dateto = _argv_or_default(2, yesterday)
    # The destination does not depend on the day, so resolve it once.
    dst = _argv_or_default(
        3, '//home/ranking/chikachoff/switch/check_bnrd_reduce')

    dimensions = {'country', 'productname', 'bnrd'}
    reduce_keys = list(dimensions)
    # dim_combinations = list(powerset(dimensions))[1:-1]

    for day in daterange(datefrom, dateto):
        redirlog_table = source_table_prefix + day

        yt.run_erase(dst)

        yt.run_map_reduce(mapper=Mapper(context_dict),
                          reducer=Reducer(),
                          source_table=redirlog_table,
                          destination_table=dst,
                          format=yt.DsvFormat(),
                          map_local_files=context_dict_path,
                          reduce_local_files=context_dict_path,
                          reduce_by=reduce_keys)
        yt.run_sort(dst, dst, sort_by=reduce_keys + ['installs',
                                                     'audience'])
        data = list(yt.read_table(dst, format='dsv', raw=False))
        ###
        # Calc hits by clickhouse
        # get_hits(data, day, day)
        ###

        # aggregations = []
        # for dim_comb in dim_combinations:
        #     current_dims = dimensions.difference(set(dim_comb))
        #     grouper = itemgetter(*current_dims)
        #     for key, grp in groupby(sorted(data, key=grouper), grouper):
        #         if type(key) == str:
        #             temp_dict = dict(zip(list(current_dims), [key]))
        #         else:
        #             temp_dict = dict(zip(list(current_dims), key))
        #         temp_dict['audience'] = 0
        #         temp_dict['installs'] = 0
        #         temp_dict['hits'] = 0
        #         for item in grp:
        #             temp_dict['audience'] += int(item['audience'])
        #             temp_dict['installs'] += int(item['installs'])
        #             try:
        #                 temp_dict['hits'] += int(item['hits'])
        #             except ValueError:
        #                 pass
        #         for dim in dim_comb:
        #             temp_dict[dim] = 'all'
        #         aggregations.append(temp_dict)
        # data = data + aggregations

        for rec in data:
            rec['fielddate'] = day
            # if rec['medium'] == 'distr_portal':
            #     rec['medium'] = 'Morda'
            # if rec['medium'] == 'distr_serp':
            #     rec['medium'] = 'Serp'
            # if rec['medium'] == 'undefined':
            #     rec['medium'] = 'Undefined'
            print(rec)

        # while True:
        #     try:
        #         update_stat(data, stat_path)
        #     except:
        #         time.sleep(10)
        #     break
        # print 'Bnrds RU updated date: {}. Url: {}'.format(day, stat_url)

# Run the report only when this file is executed as a script.
if __name__ == '__main__':
    main()
