#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function
import os
import re
import sys
import codecs
import copy
import shutil
import getpass
import itertools
import pdb
import argparse
import subprocess
import traceback
import time
import requests
import StringIO
import json
import urlparse
import collections
import datetime
import yaml
import base64
import concurrent.futures
from Crypto.Cipher import Blowfish

import yt.wrapper as yt

from pytils import (parseparams, parsevars, tabulate, make_logger,
                    get_yt_exists, yt_get_date_from_table, SPEC10k,
                    from_bytes, DummyLogger)
from monytoring import Monitoring
try:
    from nile.api.v1 import (
        clusters,
        filters as nf,
        Record
    )
except:
    pass

# Existence-check helper built by pytils.get_yt_exists for the imported yt
# client (presumably a callable taking a table path -- TODO confirm).
yt_exists = get_yt_exists(yt)

# Module-level mapping; a missing key is created as a fresh empty dict.
statdict = collections.defaultdict(lambda: {})

# Shared DummyLogger instance (presumably a no-op stand-in for a real
# logger -- TODO confirm against pytils).
dummy_logger = DummyLogger()


def blow_decrypt(source):
    """Decrypt a URL-safe base64 payload protected with two Blowfish-CBC passes.

    The decoded payload is split into a leading block of
    ``Blowfish.block_size`` bytes and the remaining data.  The leading block
    is decrypted with the IV key (using the fixed IV ``"arcadia+"``) and the
    result is used as the IV for decrypting the data with the main key.
    Trailing NUL padding bytes are removed before decoding as UTF-8.

    :param source: base64 text in which ``-``/``_`` stand for ``+``/``/``.
    :returns: the decrypted text.
    :raises Exception: ``'broken source: ...'`` when the data part cannot be
        decrypted.  Errors from base64 decoding or the final UTF-8 decoding
        propagate unchanged.
    """
    # NOTE(review): key material is hard-coded in the source; consider moving
    # it to a secret store.
    iv_key = "pWAnOKnfLKdjE2Ej16jQcw=="
    key = "yTbJ8bAI5qQrWEmwah2xYw=="
    bs = Blowfish.block_size
    s = base64.b64decode(
        source.replace('-', '+').replace('_', '/'))
    iv = s[:bs]
    iv_cipher = Blowfish.new(
        base64.b64decode(iv_key), Blowfish.MODE_CBC, "arcadia+")
    iv_decrypted = iv_cipher.decrypt(iv)
    data = s[bs:]
    try:
        data_decrypted = Blowfish.new(
            base64.b64decode(key),
            Blowfish.MODE_CBC,
            iv_decrypted
        ).decrypt(data)
    except Exception:
        raise Exception('broken source: {}'.format(source))
    # rstrip() is safe on empty / all-NUL plaintext, where indexing [-1] in a
    # loop would raise IndexError.
    return data_decrypted.rstrip(b'\x00').decode('utf8')


def get_atom_token():
    """Return the contents of ``.atom_token`` located next to this module.

    Trailing whitespace (including the final newline) is stripped.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    token_path = '{}/.atom_token'.format(module_dir)
    with codecs.open(token_path, 'r') as token_file:
        contents = token_file.read()
    return contents.rstrip()


def get_atom_headers():
    """Build the ``Authorization: Token ...`` header dict from the atom token."""
    token = get_atom_token()
    return {'Authorization': 'Token {}'.format(token)}


def get_candidates(logger=None, remove=False):
    """Download the production candidates lists and map them to products.

    Steps, as performed below:

    1. ask the atom admin API for the production version and take its
       ``task_id``;
    2. ask sandbox for the ``PERS_ATOM_CANDIDATES`` resource of that task;
    3. fetch the rsync routes of the resource and try them in order until
       one ``rsync -r <url> .`` succeeds;
    4. walk the local ``atom_candidates`` directory and read every
       ``*.json`` file found there.

    :param logger: optional logger; receives an info message on a successful
        download and an exception message for every failed route.
    :param remove: when true, the ``atom_candidates`` directory is removed
        afterwards (removal errors are ignored).
    :returns: dict mapping the last path segment of each candidate's
        ``internal-url`` to its ``__product`` (both default to ``''``).
    """
    ATOM_URL = 'http://atom-admin.n.yandex-team.ru/atom/api/v1/'
    task_id_request = requests.get(
        ATOM_URL + 'version/production', headers=get_atom_headers()
    )
    task_id = json.loads(task_id_request.content.decode('utf8'))['task_id']
    resource_id_request = requests.get(
        'https://sandbox.yandex-team.ru:443/api/v1.0/resource?'
        'type=PERS_ATOM_CANDIDATES&task_id={}&limit=1'
        .format(task_id)
    )
    resource_id = json.loads(resource_id_request.content.decode('utf8'))[
        'items'][0]['id']
    rsync_routes_request = requests.get(
        'https://sandbox.yandex-team.ru:443/api/v1.0/resource/'
        '{}/data/rsync'
        .format(resource_id)
    )
    rsync_routes = json.loads(rsync_routes_request.content.decode('utf8'))
    for rsync_route in rsync_routes:
        rslink = rsync_route['url']
        try:
            subprocess.check_call(['rsync', '-r', format(rslink), '.'])
            if logger is not None:
                logger.info(
                    'Successfully downloaded candidates lists {}'.format(rslink))
            break
        except (subprocess.CalledProcessError, IOError) as error:
            message = 'Failed to load data from {}. Error:  {}'.format(
                rslink, error)
            # Terminate the line so consecutive failures do not run together.
            sys.stderr.write(message + '\n')
            if logger is not None:
                logger.exception(message)
    result = {}
    for root, _dirs, files in os.walk('atom_candidates'):
        for name in files:
            if not name.endswith('.json'):
                continue
            # Close every file deterministically instead of leaking handles.
            with open(os.path.join(root, name)) as candidates_file:
                candidates_list = json.load(candidates_file)
            for candidate in candidates_list:
                result[candidate.get('internal-url', '')
                       .split('/')[-1]] = candidate.get('__product', '')
    # Make the downloaded tree writable so that it can be removed.
    subprocess.call(['chmod', '-R', '+w', 'atom_candidates'])
    if remove:
        # Best-effort cleanup: removal errors are deliberately ignored.
        shutil.rmtree('atom_candidates', ignore_errors=True)
    return result


def reqid_to_key(reqid):
    """Return the first 22 digit characters of *reqid*, joined together.

    Every non-digit character is dropped.  The per-character ``isdigit()``
    call works for both text and byte strings, whereas the unbound
    ``unicode.isdigit`` accepted only ``unicode`` objects.
    """
    return ''.join(ch for ch in reqid if ch.isdigit())[:22]


def showid_to_key(s):
    """Return up to 22 characters of *s* starting at offset 8."""
    return s[8:30]


def append(path):
    """Wrap *path* in a ``yt.TablePath`` created with ``append=True``."""
    table_path = yt.TablePath(path, append=True)
    return table_path


def normalize_host(url):
    """
    Reduce *url* to a short host label used for grouping.

    Scheme and ``www.``/``m.`` prefixes, the query string and surrounding
    slashes are removed.  For hosts whose first label is ``yandex`` the first
    path segment is kept; for all other hosts only the host remains.
    ``yandsearch`` is rewritten to ``search`` and colons are dropped.  Anything
    that does not mention a known property becomes ``'other'``.  The result is
    cut to 25 characters.

    >>> print(normalize_host('https://yandex.ru/search/?text=Skoda'))
    yandex.ru/search
    >>> print(normalize_host('https://yandex.ru/yandsearch/?text=Skoda'))
    yandex.ru/search
    >>> print(normalize_host('https://yandex.ru/?utm_source=blabla'))
    yandex.ru
    >>> print(normalize_host('http://maps.yandex.ru/213/moscow/?text=1234&sll=37.620393%2C55.753960&sspn=1.128845%2C0.535158&ll=37.718247%2C55.732813&z=10'))
    maps.yandex.ru
    """
    known_prefixes = ('http://', 'https://', 'www.', 'm.')
    host = url
    # Keep sweeping over the prefixes until a full pass strips nothing.
    stripped = True
    while stripped:
        stripped = False
        for prefix in known_prefixes:
            if host.startswith(prefix):
                host = host[len(prefix):]
                stripped = True
    host = host.split('?', 1)[0].strip('/')
    parts = host.split('/')
    if len(parts) > 1:
        if parts[0].split('.')[0] == 'yandex':
            host = '/'.join(parts[:2])
        else:
            host = parts[0]
    host = host.replace('yandsearch', 'search').replace(':', '')
    known_markers = ('yandex', 'kinopoisk', 'auto.ru', 'yadi.sk')
    if not any(marker in host for marker in known_markers):
        host = 'other'
    return host[:25]


def preprocess_geobase(z):
    """Return a deep copy of geobase entry *z* with numeric fields converted.

    ``id`` and ``type`` become ints; ``path`` (a ``', '``-separated string)
    becomes a list of ints with empty items skipped.  *z* is not modified.
    """
    node = copy.deepcopy(z)
    node['id'] = int(node['id'])
    node['type'] = int(node['type'])
    node['path'] = [int(part) for part in node['path'].split(', ') if part]
    return node


def get_country(x, jgb):
    """Return the lower-cased first word of the ``iso_name`` of *x*'s country.

    *jgb* maps region ids to entries with ``type``, ``path`` and ``iso_name``.
    Entries whose own type is 3 or lower yield ``None``; otherwise the first
    region on the entry's ``path`` with type 3 is used.  ``None`` is returned
    when the path holds no such region.
    """
    node = jgb[x]
    if node['type'] <= 3:
        return None
    country = next(
        (jgb[parent] for parent in node['path'] if jgb[parent]['type'] == 3),
        None
    )
    if country is None:
        return None
    return country['iso_name'].split()[0].lower()


def get_lr(referer):
    """Return the first ``lr`` query parameter of *referer*, or ``''``."""
    query = urlparse.urlparse(referer).query
    return urlparse.parse_qs(query).get('lr', [''])[0]


class RedirMap(object):
    """Mapper selecting ``tech.portal-ads.*`` records that carry ATOMS vars.

    Each input record's ``value`` is parsed with ``parseparams``; matching
    records are emitted with the user id, time, referer, distribution object
    (the path without the prefix), raw vars and region.
    """

    def __init__(self):
        self.is_serp_regex = re.compile(
            r'^yandex\.[a-zA-Z]*/(search|touchsearch)$')
        self.prefix = 'tech.portal-ads.'

    def __call__(self, rec):
        rec = from_bytes(rec)
        params = parseparams(rec['value'])
        referer = self.__retrieve_referer(params)
        is_relevant = (
            params['yandexuid'] and
            params['unixtime'] and
            params['path'].startswith(self.prefix) and
            'ATOMS' in params['vars']
        )
        if not is_relevant:
            return
        yield {
            'yandexuid': params['yandexuid'],
            'unixtime': str(params['unixtime']),
            'referer': referer,
            'distr_obj': params['path'][len(self.prefix):],
            'vars': params['vars'],
            # Fall back to the referer's ``lr`` when ``reg`` is empty.
            'lr': params['reg'] or get_lr(referer)
        }

    def __retrieve_referer(self, params):
        """Prefer the ``url`` param when it decodes to a SERP address."""
        candidate = params['url']
        if not candidate:
            return params['HTTP_REFERER']
        # The value may be percent-encoded several times over.
        for _ in range(3):
            candidate = urlparse.unquote(candidate)
        marker = 'data=url='
        if candidate.startswith(marker):
            candidate = candidate[len(marker):]
        if self.is_serp_regex.match(normalize_host(candidate)) is not None:
            return candidate
        return params['HTTP_REFERER']


class RedirReduce(object):
    """Reducer converting grouped redir-log records into per-event rows.

    Shows seen within the group are remembered per ``reqid`` so that later
    events of the same group can be attached to them.  Rows are produced by
    :meth:`make_rec`.
    """

    def __init__(self, testids, geobase):
        """
        :param testids: iterable of test ids to keep in the output rows.
        :param geobase: mapping of region id to geobase entry, as consumed by
            ``get_country``.
        """
        self.testids = set(testids)
        self.geobase = geobase
        # distr_obj values for which browser_zombie events are re-labelled.
        self.zombie_objs = {'distr_stripe', 'portal_popup', 'teaser'}

    def get_country_from_referer(self, pvars):
        """Resolve the country from ``lr``; fall back to the referer's TLD.

        Returns ``'unknown'`` when the referer cannot be used either.  Note
        that ``get_country`` itself may return ``None``.
        """
        try:
            return get_country(int(pvars['lr']), self.geobase)
        except (KeyError, ValueError):
            try:
                referer = urlparse.urlparse(pvars['referer'])
                return referer.netloc.split('.')[-1] or 'unknown'
            except Exception:
                return 'unknown'

    def distr_obj(self, obj, pvars):
        """Append ``.training`` to *obj* when ``collectpoolmode`` is ``'1'``."""
        if pvars['collectpoolmode'] == '1':
            return obj + '.training'
        return obj

    def parse_referer(self, referer):
        """Extract ``showid``/``distr_obj``/``reqid`` from the referer query.

        They are read from the ``banerid``, ``host`` and ``reqid`` query
        parameters respectively; absent parameters are omitted.
        """
        parsed = urlparse.urlparse(referer)
        qs = urlparse.parse_qs(parsed.query)
        result = {}
        if 'banerid' in qs:
            result['showid'] = qs['banerid'][0]
        if 'host' in qs:
            result['distr_obj'] = qs['host'][0]
        if 'reqid' in qs:
            result['reqid'] = qs['reqid'][0]
        return result

    def make_rec(self, pvars):
        """Build the output row for one event described by *pvars*."""
        result = {
            'key': reqid_to_key(pvars['reqid']),
            'unixtime': str(pvars['unixtime']),
            'distr_obj': self.distr_obj(pvars['distr_obj'], pvars),
            'eventtype': pvars['eventtype'],
            'country': self.get_country_from_referer(pvars),
            'bannerid': pvars['bannerid'],
            'score': pvars['score'],
            'showid': pvars['showid'],
            'device': pvars['device'] or 'unknown',
            'referer': normalize_host(pvars['referer']),
            'product': pvars['product'],
            'yandexuid': pvars['yandexuid'],
            'reqid': pvars['reqid'],
            'testids': pvars['testids']
        }
        if 'type' in pvars:
            result['type'] = pvars['type']
        return result

    def __call__(self, key, recs):
        # Shows seen so far in this group, keyed by reqid.
        shows = collections.defaultdict(list)
        i = 0
        for rec in recs:
            i += 1
            # Hard cap: the group is abandoned once the 200th record is hit.
            if i >= 200:
                return
            rec = from_bytes(rec)
            if rec.get('product') == 'default_search_mobilesafari_ios':
                # For every reqid, credit the latest matching show with a
                # trueinstall stamped with this record's time.
                for reqid in shows:
                    for show in sorted(
                        shows[reqid], key=lambda x: x['unixtime'], reverse=True
                    ):
                        if (
                            show['product'] ==
                            'default_search_mobilesafari_ios'
                        ):
                            pvars_ = copy.deepcopy(show)
                            pvars_['unixtime'] = rec['unixtime']
                            pvars_['eventtype'] = 'trueinstall'
                            yield self.make_rec(pvars_)
                            break
                continue
            pvars = parsevars(rec['vars'])
            if 'reqid' not in pvars or 'showid' not in pvars:
                pvars.update(self.parse_referer(rec['referer']))
            if (
                (
                    rec['distr_obj'].startswith('promolib') and
                    pvars['eventtype'] == 'install'
                ) or (
                    pvars['product'] in {'set_opera', 'browser_zombie'} and
                    pvars['eventtype'] == 'install'
                )
            ):
                pvars['unixtime'] = rec.get('unixtime')
                pvars['yandexuid'] = rec.get('yandexuid')
                pvars['eventtype'] = 'trueinstall'
                yield self.make_rec(pvars)
                continue
            if (
                pvars['product'] == 'browser_zombie' and
                pvars['eventtype'] == 'showlanding'
            ):
                pvars['eventtype'] = 'install'
                yield self.make_rec(pvars)
                continue
            if pvars['eventtype'] == 'show':
                pvars['referer'] = rec['referer']
                pvars['unixtime'] = rec['unixtime']
                pvars['yandexuid'] = rec['yandexuid']
                pvars['distr_obj'] = rec['distr_obj']
                if 'lr' not in pvars:
                    pvars['lr'] = rec['lr']
                pvars['testids'] = pack_testids(
                    extract_testids(pvars['adata']) & self.testids
                )
                shows[pvars['reqid']].append(pvars)
                yield self.make_rec(pvars)
            elif pvars['reqid'] in shows:
                for show in shows[pvars['reqid']]:
                    # The membership test must run before the subscript:
                    # subscripting first either raises KeyError or (for a
                    # defaulting mapping) inserts the key, which made the
                    # "no bannerid" alternative unreachable.
                    if (
                        'bannerid' not in pvars or
                        pvars['bannerid'] == show['bannerid']
                    ):
                        show['eventtype'] = pvars['eventtype']
                        if (
                            show['product'] == 'browser_zombie' and
                            show['distr_obj'] in self.zombie_objs and
                            show['eventtype'] == 'install'
                        ):
                            show['eventtype'] = 'trueinstall'
                        if (
                            show['product'] == 'browser_zombie' and
                            show['distr_obj'] in self.zombie_objs and
                            show['eventtype'] == 'showlanding'
                        ):
                            show['eventtype'] = 'install'
                        yield self.make_rec(show)
                        break
            elif (
                'external-promo-metrics' in rec['referer'] or
                'welcome_z' in rec['referer']
            ):
                if pvars['eventtype'] == 'install':
                    pvars['eventtype'] = 'trueinstall'
                if pvars['eventtype'] == 'showlanding':
                    pvars['eventtype'] = 'install'
                pvars['referer'] = rec['referer']
                pvars['unixtime'] = rec['unixtime']
                pvars['yandexuid'] = rec['yandexuid']
                yield self.make_rec(pvars)


class WatchLogMap(object):
    """Mapper emitting ``trueinstall`` events for Safari default-search hits.

    A record qualifies when its ``url`` points at a ``/search`` path on one
    of the listed Yandex hosts and carries a ``clid`` from ``APPLE_CLIDS``.
    """

    def __init__(self):
        # Duplicates present in the original literal were dropped; the
        # resulting set is the same.
        self.APPLE_CLIDS = {
            '1871396',
            '1871397',
            '1906591',
            '1906597',
            '1906722',
            '1906723',
            '1906724',
            '1906725',
            '1906726',
            '1906727',
            '1906728',
            '1906729',
            '1906730',
            '1906731',
            '1906732',
            '1906733',
            '2144780',
            '2192578',
            '2192579',
            '2192593',
            '2192594',
        }

    def __call__(self, rec):
        rec = from_bytes(rec)
        if 'url' not in rec or 'unixtime' not in rec or 'uniqid' not in rec:
            return
        # Unparseable URLs are skipped; ``except Exception`` keeps that
        # best-effort behaviour without swallowing SystemExit or
        # KeyboardInterrupt.
        try:
            parsed = urlparse.urlparse(rec['url'])
        except Exception:
            return
        if parsed.netloc not in {
            'yandex.ru', 'yandex.ua', 'yandex.by', 'yandex.kz', 'yandex.com.tr'
        }:
            return
        if '/search' not in parsed.path:
            return
        try:
            qs = urlparse.parse_qs(parsed.query)
        except Exception:
            return
        if ('clid' in qs) and (set(qs['clid']) & self.APPLE_CLIDS):
            yield {
                'unixtime': rec['unixtime'],
                'eventtype': 'trueinstall',
                'product': 'default_search_mobilesafari_ios',
                'source': 'bs-watch-log',
                'referer': normalize_host(rec['url']),
                'yandexuid': rec['uniqid']
            }


class ExportAccessMap(object):
    """Mapper turning export-access-log requests into ``trueinstall`` rows.

    Only requests whose ``stat`` parameter is ``dayuse``, ``install`` or
    ``reinstall`` and that carry a ``banerid``/``bnrd`` parameter are used.
    """

    def __init__(self, date_):
        # An unparseable date string disables the date filter.
        try:
            wanted = datetime.datetime.strptime(date_, '%Y-%m-%d').date()
        except ValueError:
            wanted = None
        self.date_ = wanted

    def __call__(self, rec):
        """columns=['request', 'iso_eventtime']"""
        rec = from_bytes(rec)
        query = urlparse.parse_qs(urlparse.urlparse(rec['request']).query)
        try:
            timestamp = datetime.datetime.strptime(
                rec['iso_eventtime'], '%Y-%m-%d %H:%M:%S')
        except ValueError:
            raise Exception(format(rec))
        if 'stat' not in query:
            return
        if query['stat'][0] not in {'dayuse', 'install', 'reinstall'}:
            return
        # ``banerid`` takes precedence over ``bnrd``.
        for bnrd in ('banerid', 'bnrd'):
            if bnrd in query:
                break
        else:
            return
        banner = query[bnrd][0]
        # Characters 8..17 of the banner id are read as a unix timestamp.
        try:
            shown_on = datetime.datetime.fromtimestamp(
                int(banner[8:18])).date()
        except ValueError:
            return
        if self.date_ and shown_on != self.date_:
            return
        yield {
            'key': banner[8:30].encode('utf8'),
            'unixtime': timestamp.strftime('%s').encode('utf8'),
            'eventtype': 'trueinstall'.encode('utf8'),
            'source': 'export-access-log'.encode('utf8'),
            'yasoft': query['yasoft'][0].encode('utf8'),
            'yandexuid_new': query.get('yandexuid', [''])[0].encode('utf8'),
            'stat': query['stat'][0].encode('utf8'),
            'showid': banner.split(':')[0].encode('utf8')
        }


class MobileTrackingMap(object):
    """Mapper turning ``EVENT_AD_INSTALL`` tracking rows into trueinstalls.

    The join key is derived from the show id when one is present in the URL
    parameters, otherwise from the request id.  Rows with neither are
    dropped.
    """

    def __init__(self):
        self.showid_params = ['showid', 'banerid']
        self.reqid_params = ['reqid', 'atom_reqid', 'amp;atom_reqid']

    def get_param(self, allparams, params, qs=False):
        """Return the value of the first name from *params* found in *allparams*.

        With ``qs=True`` the value is treated as a list and its first item is
        returned.  ``None`` is returned when no name is present.
        """
        for param in params:
            if param in allparams:
                if qs:
                    return allparams[param][0]
                else:
                    return allparams[param]

    def preprocess_string(self, s):
        """Turn single quotes into double quotes and drop backslashes."""
        return s.replace("'", '"').replace('\\', '')

    def __call__(self, rec):
        """columns=['AppID', 'EventType', 'UrlParameters_Keys', 'UrlParameters_Values', 'SendTimestamp']"""
        if rec.get('EventType', '') != 'EVENT_AD_INSTALL':
            return
        urlparams = {}
        # Best effort: missing or malformed parameter columns leave
        # ``urlparams`` empty.  ``except Exception`` keeps that behaviour
        # without swallowing SystemExit or KeyboardInterrupt.
        try:
            urlparams = dict(zip(
                json.loads(self.preprocess_string(rec['UrlParameters_Keys'])),
                json.loads(self.preprocess_string(rec['UrlParameters_Values'])),
            ))
        except Exception:
            pass
        showid = self.get_param(urlparams, self.showid_params + ['banner_id'])
        reqid = self.get_param(urlparams, self.reqid_params)
        if showid:
            key = showid_to_key(showid)
        elif reqid:
            key = reqid_to_key(reqid)
        else:
            return
        yield {
            'key': key,
            'unixtime': str(rec['SendTimestamp']),
            'eventtype': 'trueinstall',
            'source': 'mobile-tracking-log',
            'app_id': rec['AppID'],
            'showid': showid or '',
            'reqid': reqid or ''
        }


def extract_testids(adata):
    """Return the set of integer test ids stored in encrypted *adata*.

    *adata* is decrypted with ``blow_decrypt``, parsed as JSON and its
    ``test-ids`` entry is converted to ints.  Any failure along the way
    (undecryptable data, invalid JSON, missing entry, non-numeric id) yields
    an empty set.
    """
    try:
        testids = json.loads(blow_decrypt(adata)).get('test-ids')
        return set(int(testid) for testid in testids)
    except Exception:
        # Best effort by design, but do not swallow SystemExit or
        # KeyboardInterrupt as a bare ``except`` would.
        return set()


def pack_testids(testids):
    """Serialize *testids* as a comma-separated string in sorted order."""
    ordered = sorted(testids)
    return ','.join(format(testid) for testid in ordered)


class MetrikaMobileMap(object):
    """aka PromolibaMap"""
    # Mapper over mobile metrika events whose EventValue mentions "-ATOMS-".
    # It emits show/click/close rows for the promo library.

    def __init__(self, bannerids, app_id_dict, testids, geobase):
        # bannerids: mapping banner id -> product, used when the event itself
        #            carries no product.
        # app_id_dict: mapping AppID -> service name.
        # testids: set of test ids to keep in the output rows.
        # geobase: mapping region id -> geobase entry, as used by get_country.
        self.bannerids = bannerids
        self.app_id_dict = app_id_dict
        self.testids = testids
        self.geobase = geobase

    def get_country_from_referer(self, pvars):
        """Resolve the country from ``lr``; fall back to the referer's TLD."""
        try:
            return get_country(int(pvars['lr']), self.geobase)
        except (KeyError, ValueError):
            try:
                referer = urlparse.urlparse(pvars['referer'])
                return referer.netloc.split('.')[-1] or 'unknown'
            except Exception:
                return 'unknown'

    def __call__(self, rec):
        """columns['EventValue', 'EventName', 'AppID', 'AppPlatform', 'ClientIP', 'DeviceID', 'UUID', 'SendTimestamp']"""
        rec = from_bytes(rec)
        if '-ATOMS-' not in rec.get('EventValue', ''):
            return
        # An undecodable EventValue is treated as an empty object.
        try:
            obj = json.loads(rec['EventValue'])
        except Exception:
            obj = {}
        if not (isinstance(obj, dict) and
                'campaign_id' in obj and 'EventName' in rec):
            return

        campaign_id = obj['campaign_id']

        if campaign_id.startswith('VARS_'):
            # "VARS_" form: the remainder is a ';'-separated vars string.
            campaign_id = campaign_id[len('VARS_'):]
            pvars = parsevars(campaign_id, sep=';')
            pvars['testids'] = extract_testids(pvars['adata'])
        else:
            # Positional form: reqid;<ignored>;bannerid;adata
            pvars = collections.defaultdict(lambda: '')
            try:
                pvars['reqid'] = campaign_id.split(';')[0]
                pvars['bannerid'] = campaign_id.split(';')[2]
                pvars['testids'] = extract_testids(campaign_id.split(';')[3])
            except IndexError:
                pvars['reqid'] = ''
                pvars['bannerid'] = ''
                pvars['testids'] = set()
        if 'extras' in obj and 'geo_test_buckets' in obj['extras']:
            # Each ';'-separated bucket starts with the test id.
            pvars['testids'] |= set(
                int(testid_info.split(',')[0])
                for testid_info in obj['extras']['geo_test_buckets'].split(';')
                if testid_info != ''
            )
        distr_obj = 'promolib'
        if pvars['collectpoolmode']:
            distr_obj = 'promolib.training'
        # Named ``app_platform`` so the imported ``os`` module is not shadowed.
        try:
            app_platform = rec['AppPlatform']
        except KeyError:
            return
        if app_platform == 'android':
            app_platform = 'Android'
        if rec['AppID'] in self.app_id_dict:
            service = self.app_id_dict[rec['AppID']].strip()
        else:
            service = rec['AppID'] or 'empty'
        if not (pvars['reqid'] and pvars['bannerid']):
            return
        reaction = obj.get('reaction')
        if reaction is not None and isinstance(reaction, dict):
            reaction = list(reaction.keys())[0]
        reaction = format(reaction)
        is_display = rec['EventName'] == 'display'
        is_reaction = (rec['EventName'] == 'report' and
                       reaction in {'click', 'close'})
        if not (is_display or is_reaction):
            return
        yield {
            'key': reqid_to_key(pvars['reqid']),
            'unixtime': str(rec['SendTimestamp']),
            'distr_obj': distr_obj,
            'eventtype': ('show' if rec['EventName'] == 'display' else reaction),
            'bannerid': pvars['bannerid'],
            'country': self.get_country_from_referer(pvars) or 'unknown',
            'reqid': pvars['reqid'],
            'showid': pvars['showid'],
            'score': pvars['score'],
            'device': pvars.get('device') or 'unknown',
            'device_id': (rec.get('DeviceID') or '').upper(),
            'uuid': rec.get('UUID', ''),
            'testids': pack_testids(pvars['testids'] & self.testids),
            'referer': service,
            'os': app_platform,
            'product': (
                pvars['product'] or
                self.bannerids.get(pvars['bannerid'], 'unknown')
            ),
        }


class AtomBannersReduce(object):
    """Reducer joining non-show events to the shows of the same key.

    Shows are emitted as they arrive (unless marked ``yesterday``).  All
    other known events are buffered and afterwards merged with a matching
    show, found by show id or, failing that, by request id.
    """

    def __init__(self, bd, od, event_dict, fielddate):
        """
        :param bd: mapping of the two characters at ``showid[4:6]`` to a
            browser name.
        :param od: mapping of the two characters at ``showid[6:8]`` to an OS
            name.
        :param event_dict: container of the event types to process.
        :param fielddate: date string in ``%Y-%m-%d`` format.
        """
        self.bd = bd
        self.od = od
        self.event_dict = event_dict
        self.fielddate = fielddate
        self.date = datetime.datetime.strptime(
            self.fielddate, '%Y-%m-%d'
        ).date()

    def get_browser(self, showid):
        """Return the browser encoded in *showid*, or ``'unknown'``."""
        if showid[4:6] in self.bd and self.bd[showid[4:6]]:
            return self.bd[showid[4:6]]
        return 'unknown'

    def get_os(self, showid):
        """Return the OS encoded in *showid*, or ``'unknown'``."""
        if showid[6:8] in self.od and self.od[showid[6:8]]:
            return self.od[showid[6:8]]
        return 'unknown'

    def __call__(self, key, recs):
        # Only well-formed 22-character keys are processed.
        if not key.get("key") or len(key.get("key", "")) != 22:
            return
        showids = collections.defaultdict(list)
        reqids = collections.defaultdict(list)
        events_for_showid = collections.defaultdict(set)
        not_shows = []
        banned_showids = set()
        banned_reqids = set()
        i = 0
        for rec in recs:
            # Hard cap on the group size.  Hitting it ends the generator, so
            # the buffered non-show events are not emitted either.
            if i > 200:
                return
            i += 1
            rec = from_bytes(rec)
            if rec['eventtype'] not in self.event_dict:
                continue
            elif rec['eventtype'] != 'show':
                # A non-show event already seen yesterday bans its ids, so
                # today's events for the same ids are not counted again.
                if 'yesterday' in rec:
                    if rec.get('showid'):
                        banned_showids.add(rec['showid'])
                    if rec.get('reqid'):
                        banned_reqids.add(rec['reqid'])
                not_shows.append(rec)
            else:  # eventtype == show
                if not rec.get('showid') and not rec.get('reqid'):
                    continue
                if 'os' not in rec and rec.get('showid'):
                    rec['os'] = self.get_os(rec['showid'])
                rec['browser'] = self.get_browser(rec['showid'])
                showids[rec['showid']].append(rec)
                reqids[rec['reqid']].append(rec)
                if 'yesterday' not in rec:
                    yield rec
        for rec in not_shows:
            if (
                rec.get('showid') in banned_showids or
                rec.get('reqid') in banned_reqids or
                'yesterday' in rec
            ):
                continue
            if rec.get('showid'):
                if (
                    rec.get('product') == 'default_search_mobilesafari_ios' and
                    rec['eventtype'] == 'trueinstall'
                ):
                    # Safari trueinstalls are attached to merged click
                    # records rather than to shows directly.
                    merged_clicks = [
                        self.process_event(
                            x, showids[x['showid']],
                            events_for_showid, check=False
                        ) for x in not_shows
                        if x.get('showid') == rec['showid'] and
                        x['eventtype'] == 'click'
                    ]
                    # process_event() returns None for a click without a
                    # matching show; such entries would crash merge_recs().
                    shows = [x for x in merged_clicks if x]
                else:
                    shows = showids[rec['showid']]
                out = self.process_event(rec, shows, events_for_showid)
                if out:
                    yield out
            elif rec.get('reqid'):
                if (
                    rec.get('product') == 'default_search_mobilesafari_ios' and
                    rec['eventtype'] == 'trueinstall'
                ):
                    shows = [
                        x for x in not_shows
                        if x.get('reqid') == rec['reqid'] and
                        x['eventtype'] == 'click'
                    ]
                else:
                    shows = reqids[rec['reqid']]
                out = self.process_event(rec, shows, events_for_showid)
                if out:
                    yield out

    def process_event(self, rec, shows, events_for_showid, check=True):
        """Pick the show that *rec* belongs to and merge the two.

        With a single candidate it is used as is.  Otherwise the latest show
        (by ``unixtime``) with the same ``bannerid`` is chosen, or with the
        same ``distr_obj`` when *rec* has no banner id, or simply the latest
        one when it has neither.  Returns ``None`` when nothing matches.
        """
        if not shows:
            return
        if len(shows) == 1:
            return self.merge_recs(rec, shows[0], events_for_showid, check=check)
        else:
            if rec.get('bannerid'):
                shows_b = sorted(
                    [x for x in shows if x['bannerid'] == rec['bannerid']],
                    key=lambda x: x['unixtime']
                )
                if shows_b:
                    return self.merge_recs(
                        rec, shows_b[-1], events_for_showid, check=check
                    )
            elif rec.get('distr_obj'):
                shows_b = sorted(
                    [x for x in shows if x['distr_obj'] == rec['distr_obj']],
                    key=lambda x: x['unixtime']
                )
                if shows_b:
                    return self.merge_recs(
                        rec, shows_b[-1], events_for_showid, check=check
                    )
            else:
                shows = sorted(shows, key=lambda x: x['unixtime'])
                return self.merge_recs(
                    rec, shows[-1], events_for_showid, check=check
                )

    def merge_recs(self, rec, show, events_for_showid, check=True):
        """Fill the empty fields of *rec* from *show* and return *rec*.

        With ``check=True`` an event type already recorded for the show's id
        is rejected (``None`` is returned), so each event type is counted at
        most once per show.  ``eventtype``, ``yandexuid``, ``device_id`` and
        ``uuid`` are never copied from the show.
        """
        if check and rec['eventtype'] in events_for_showid[show['showid']]:
            return
        events_for_showid[show['showid']].add(rec['eventtype'])
        for x in set(show) - {'eventtype', 'yandexuid', 'device_id', 'uuid'}:
            if not rec.get(x):
                rec[x] = show[x]
        return rec


class AtomBannersStatReduce(object):
    """Reducer counting events per statistics key.

    Every input record is expanded by :meth:`make_stat_recs` into all
    combinations of its dimensions with their roll-up values.  One output
    row per key carries the per-event-type counters.
    """

    def __init__(self, fielddate, event_dict):
        """
        :param fielddate: date value placed first in every statistics key.
        :param event_dict: mapping of raw event type to the counter name used
            in the output.
        """
        self.fielddate = fielddate
        self.event_dict = event_dict
        # Characters outside this whitelist are removed from the keys.
        self.normalize_key_regex = re.compile(r'[^a-zA-ZА-Яа-яЁё_\-\. \t0-9/]')

    def __call__(self, key, recs):
        counter = collections.defaultdict(collections.Counter)
        for rec in recs:
            # Records carried over from yesterday are not counted.
            if 'yesterday' in rec:
                continue
            rec = from_bytes(rec)
            testids = ['empty']
            if rec.get('testids'):
                testids = rec['testids'].split(',')
            testids.append('_total_')
            type_ = rec.get('type')
            device = rec.get('device')
            eventtype = self.event_dict[rec['eventtype']]
            try:
                args = [
                    rec['product'],
                    rec['referer'],
                    rec['country'],
                    rec['browser'],
                    rec['os'],
                    tuple(testids),
                    rec['distr_obj'],
                    rec['bannerid'],
                    type_,
                    device,
                    eventtype,
                    self.fielddate
                ]
            except KeyError:
                # A record lacking any required dimension is skipped.
                continue
            for rec_ in self.make_stat_recs(*args):
                counter[rec_['key']][rec_['eventtype']] += 1
        # Loop variable named so that it does not shadow the ``key`` argument.
        for stat_key, counts in counter.items():
            rec_2 = dict(counts)
            rec_2['key'] = stat_key
            yield rec_2

    def make_stat_recs(
        self,
        product, service, lang, browser, os_,
        testids, element, candidate, type_, device,
        eventtype, fielddate
    ):
        """Yield one ``{'key', 'eventtype'}`` dict per dimension combination.

        Every dimension is combined with its ``_total_`` roll-up.  The OS
        additionally gets an ``<os>_<device>`` value when *device* is set.
        Extension products get ``_allextensions_``, and ``browser`` gets
        ``browser_<type>`` when *type_* is set.
        """
        os_comb = (os_, '_total_')
        if device:
            os_comb = (os_, '{}_{}'.format(os_, device), '_total_')
        product_comb = (product, '_total_')
        if product in {'set', 'home', 'sethome', 'vb', 'vbch'}:
            product_comb = (product, '_allextensions_', '_total_')
        if product == 'browser' and type_:
            product_comb = (product, '{}_{}'.format(product, type_), '_total_')
        for comb in itertools.product(
            product_comb,
            (service, '_total_'),
            (lang, '_total_'),
            (browser, '_total_'),
            os_comb,
            testids,
            (element, '_total_'),
            (candidate, '_total_'),
        ):
            yield {
                'key': self.normalize_key_regex.sub('', tabulate(fielddate, *comb)),
                'eventtype': eventtype
            }


class ShowsFromYesterdayMap(object):
    """Mapper re-emitting selected records tagged with ``yesterday``.

    Records that already carry the ``yesterday`` field are dropped.  The
    others are tagged and passed through when their event type is one of
    ``show``, ``trueinstall`` or ``click``.
    """

    def __init__(self):
        self.eventtypes = {'show', 'trueinstall', 'click'}

    def __call__(self, rec):
        rec = from_bytes(rec)
        if 'yesterday' not in rec:
            rec['yesterday'] = "True"
            if rec.get('eventtype', '') in self.eventtypes:
                yield rec


@yt.aggregator
class StatMap(object):
    """Pre-aggregating mapper: sums the integer fields of records per ``key``.

    It is called with an iterable of records and yields one row per distinct
    key.  Field values that ``int()`` rejects with ValueError are skipped.
    """

    def __init__(self):
        pass

    def __call__(self, recs):
        per_key = collections.defaultdict(collections.Counter)
        for rec in recs:
            rec = from_bytes(rec)
            for field in set(rec) - {'key'}:
                bucket = per_key[rec['key']]
                try:
                    bucket[field] += int(rec[field])
                except ValueError:
                    continue
        for key, bucket in per_key.items():
            result = dict(bucket)
            result['key'] = key
            yield result


class StatReduce(object):
    """Reducer (also used as combiner): sums integer fields for one key.

    Fields whose value cannot be converted with ``int`` (ValueError) are
    skipped. Exactly one row is emitted per call, carrying the sums and
    the reduce key's 'key' value.
    """

    def __init__(self):
        pass

    def __call__(self, key, recs):
        totals = collections.Counter()
        for raw in recs:
            row = from_bytes(raw)
            for field in set(row) - {'key'}:
                try:
                    totals[field] += int(row[field])
                except ValueError:
                    continue
        out = dict(totals.items())
        out['key'] = key['key']
        yield out


def get_geobase(path='geobase.json'):
    """Load the geobase JSON file and index its entries by integer id.

    :param path: location of the JSON file (a list of objects, each with
        an 'id' field). Previously this argument was ignored and
        'geobase.json' was always opened.
    :return: dict mapping ``int(entry['id'])`` to
        ``preprocess_geobase(entry)``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(path) as f:
        gb = json.load(f)
    return {int(x['id']): preprocess_geobase(x) for x in gb}


def get_arcanum_token(path='.arcanum_token'):
    """Return the contents of the UTF-8 token file, whitespace-stripped."""
    token_file = codecs.open(path, 'r', 'utf8')
    try:
        contents = token_file.read()
    finally:
        token_file.close()
    return contents.strip()


def get_arcanum_headers():
    """Build the OAuth 'Authorization' header from the arcanum token."""
    token = get_arcanum_token()
    headers = {'Authorization': 'OAuth {}'.format(token)}
    return headers


def get_arcadia_testids():
    """Download testids.txt from arcadia and parse it into a set of ints.

    The file is fetched with the arcanum OAuth headers, decoded as UTF-8
    and split on newlines; non-integer lines are ignored by
    ``parse_testids``. Note that certificate verification is disabled
    (``verify=False``) for this request.
    """
    url = ('https://a.yandex-team.ru/api/tree/blob/trunk/'
           'arcadia/analytics/atom/monitorings/testids.txt')
    response = requests.get(
        url, verify=False, headers=get_arcanum_headers()
    )
    text = response.content.decode('utf8')
    return parse_testids(text, sep='\n')


def parse_testids(testids, sep=','):
    """Split ``testids`` on ``sep`` and return the integer tokens as a set.

    Tokens that ``int`` rejects with ValueError (empty strings, words)
    are silently skipped.
    """
    parsed = set()
    for token in testids.split(sep):
        try:
            number = int(token)
        except ValueError:
            continue
        parsed.add(number)
    return parsed


def main():
    """Command-line entry point: parse arguments and start the monitoring.

    Changes the working directory to the script's directory, loads the
    geobase, builds a ``Monitoring`` around ``process_table`` and starts
    it with the options derived from the command line.

    ``--redo`` is validated by argparse (``choices``), so an invalid value
    produces a usage error instead of relying on an ``assert`` that would
    be stripped when Python runs with ``-O``.
    """
    global __file__                         # to fix stupid
    __file__ = os.path.abspath(__file__)    # __file__ handling

    # Relative paths used elsewhere in the script (token files, geobase,
    # csv dictionaries) are resolved against the script directory.
    os.chdir(os.path.dirname(__file__))

    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--from', default=None)
    parser.add_argument('--to', default=None)
    parser.add_argument('--max_workers', '-mw', type=int, default=10)
    parser.add_argument('--testids', default='')
    parser.add_argument(
        '--redo', default='',
        choices=['', 'atom_banners', 'stat', 'all']
    )
    parser.add_argument('--oldtestids', action='store_true')
    parser.add_argument('--nolock', action='store_true')
    parser.add_argument('--nopush', action='store_true')
    args = parser.parse_args()

    logger = make_logger(__file__, debug=args.debug)

    geobase = get_geobase()

    monitoring = Monitoring(
        prefix='',
        filepath=__file__,
        default_id='2016-06-19',
        str_to_table_id=yt_get_date_from_table,
        process_table=process_table,
        logger=logger,
        today=False,
        mode='dates',
        confirmation=True
    )

    # De-duplicate the user-supplied testids and drop empty tokens, then
    # pass them on as a comma-separated string again.
    user_testids = (
        set(args.testids.split(','))
    )

    user_testids.discard('')
    user_testids = ','.join(user_testids)

    monitoring.start(args, {
        'geobase': geobase,
        'redo': args.redo,
        'debug': args.debug,
        'dicts': not args.nolock,
        'user_testids': user_testids,
        'old_testids': args.oldtestids,
        'nopush': args.nopush,
        'max_workers': args.max_workers
    })


def get_stat_headers():
    """Return the authentication headers for the Stat API.

    The robot login and password are taken from the ``STAT_ROBOT_USER``
    and ``STAT_ROBOT_PASSWORD`` environment variables when they are set,
    so the secret does not have to live in the repository.

    NOTE(review): the hard-coded fallbacks are kept only for backward
    compatibility; they should be rotated and removed from the source.
    """
    return {
        'StatRobotUser': os.environ.get(
            'STAT_ROBOT_USER', 'robot_norberrt'
        ),
        'StatRobotPassword': os.environ.get(
            'STAT_ROBOT_PASSWORD', 'pat57lesSenom0t'
        )
    }


def get_stat_dict_api():
    """Return the URL of the Stat dictionary API endpoint."""
    endpoint = 'https://api.stat.yandex-team.ru/_api/dictionary'
    return endpoint


def push_to_stat(
    chunk, logger, redo=False, ch=-1,
    name="Distribution/Others/AtomBanners/v4_daily"
):
    """Upload one TSV chunk to a Stat report, retrying on failure.

    :param chunk: file-like object whose ``getvalue()`` is the TSV payload.
    :param logger: logger used for progress and error messages.
    :param redo: when truthy and ``ch == 1``, the upload carries
        ``replace_mask='fielddate'`` in addition to append mode.
    :param ch: chunk number, used in log messages and for the redo check.
    :param name: Stat report name.
    :return: True when an attempt got HTTP 200, False when all attempts
        failed (previously True was returned unconditionally).
    """
    UPLOAD_URL = 'https://upload.stat.yandex-team.ru/_api/report/data'
    MAX_ATTEMPTS = 10
    RETRY_DELAY = 60
    data_counters = {
        "name": name,
        "scale": "d",
        "_append_mode": 1,
        "parallel_upload": 1,
        "tsv_data": chunk.getvalue(),
    }
    if redo and ch == 1:
        data_counters['replace_mask'] = 'fielddate'
    for attempt in range(MAX_ATTEMPTS):
        logger.info(
            '[CHUNK {}] Posting chunk to stat...'.format(ch)
        )
        try:
            req = requests.post(
                UPLOAD_URL, headers=get_stat_headers(),
                data=data_counters, timeout=300
            )
        except Exception as e:
            # Network-level failure: there is no response to report.
            logger.error('[CHUNK {}] {}'.format(ch, e))
            logger.info('[CHUNK {}] request has no content'.format(ch))
        else:
            logger.info('[CHUNK {}] {}'.format(ch, req.text))
            if req.status_code == 200:
                return True
        # Do not sleep after the final attempt; nothing follows it.
        if attempt + 1 < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY)
    logger.error(
        '[CHUNK {}] giving up after {} attempts'.format(ch, MAX_ATTEMPTS)
    )
    return False


def yaml_dump(dictionary):
    """Serialise ``dictionary`` to block-style YAML with a '---' marker."""
    return yaml.safe_dump(
        dictionary,
        explicit_start=True,
        default_flow_style=False,
    )


def reverse_dict(dictionary):
    """Return a new dict that maps each value of ``dictionary`` to its key.

    When several keys share a value, the one iterated last wins.
    """
    return dict((value, key) for key, value in dictionary.items())


def yaml_dump_all():
    """Write every dictionary in ``statdict`` to '<name>.yaml'.

    Each dictionary is reversed first, then dumped as YAML into a UTF-8
    file in the current working directory.
    """
    for dict_name in statdict:
        filename = '{}.yaml'.format(dict_name)
        with codecs.open(filename, 'w', 'utf8') as out:
            reversed_mapping = reverse_dict(statdict[dict_name])
            out.write(yaml_dump(reversed_mapping))


def get_set_dictionary(dictionary, value, logger=None, name=None):
    """Return the id stored for ``value``, allocating a new one if absent.

    A new id is one greater than the largest existing id (``zmax`` gives
    -1 for an empty dictionary, so the first id is 0). The allocation is
    stored in ``dictionary`` and, when a logger is given, logged under
    ``name``.
    """
    if value not in dictionary:
        next_id = zmax(set(dictionary.values())) + 1
        dictionary[value] = next_id
        if logger:
            logger.info('[{}]: Set {} to {}'.format(
                name, value, next_id
            ))
    return dictionary[value]


def get_set_dictionary_testid(dictionary, value):
    """Return the id stored for a testid, registering it when missing.

    On a miss the sentinel entry '1000000' -> 1000000 is (re)written
    first. Then '_total_' maps to 0, a numeric string maps to its integer
    value, and anything else gets the largest existing id plus one.
    """
    if value not in dictionary:
        dictionary['1000000'] = 1000000
        if value == '_total_':
            assigned = 0
        else:
            try:
                assigned = int(value)
            except ValueError:
                assigned = zmax(set(dictionary.values())) + 1
        dictionary[value] = assigned
    return dictionary[value]


def zmax(seq):
    """Return ``max(seq)``, or -1 when the sized collection is empty."""
    return max(seq) if len(seq) > 0 else -1


def process_ident_4(ident, logger=None):
    """Replace the textual dimensions of a stat key with dictionary ids.

    ``ident`` must consist of exactly nine tab-separated fields:
    fielddate, product, service, lang, browser, os, testid, element and
    candidate. Every field except the date is looked up in (and, when
    new, added to) the matching ``statdict`` dictionary; the result is
    tab-joined again in the same order.
    """
    (fd, product, service, lang, browser,
     _os, testid, element, candidate) = ident.split('\t')

    def encode(dict_name, raw):
        # Dictionary name and log label are the same for every dimension.
        return get_set_dictionary(
            statdict[dict_name], raw, logger=logger, name=dict_name
        )

    product = encode('product', product)
    service = encode('service', service)
    lang = encode('lang', lang)
    browser = encode('browser', browser)
    _os = encode('os', _os)
    testid = get_set_dictionary_testid(statdict['testid'], testid)
    element = encode('element', element)
    candidate = encode('candidate', candidate)

    return tabulate(fd, product, service,
                    lang, browser, _os, testid,
                    element, candidate)


def tryjson(s):
    """Return True when ``s`` can be parsed by ``json.loads``, else False.

    Only ordinary exceptions are treated as "not JSON"; KeyboardInterrupt
    and SystemExit are no longer swallowed.
    """
    try:
        json.loads(s)
    except Exception:
        return False
    return True


def get_dictionary_from_stat(name, logger=None, debug=False, ch=-1):
    """Fetch a dictionary from the Stat dictionary API, retrying forever.

    :param name: dictionary name passed as the ``name`` query parameter.
    :param logger: optional logger; ``dummy_logger`` is used when omitted.
    :param debug: when truthy, drop into ``pdb`` after a failed request.
    :param ch: chunk number, used only in log messages.
    :return: dict with integer keys built from the JSON response.
    :raises Exception: when the service answers with an empty dictionary.

    Every unsuccessful attempt (request exception, non-200 status or a
    body that is not JSON) is followed by a 5 second pause; previously
    bad responses were retried immediately in a tight loop.
    """
    if not logger:
        logger = dummy_logger
    while True:
        logger.info('[CHUNK {}] getting {} from stat'.format(ch, name))
        try:
            req = requests.get(
                '{}?name={}'.format(get_stat_dict_api(), name),
                headers=get_stat_headers(), timeout=60
            )
        except Exception:
            logger.error('[CHUNK {}] {}'.format(ch, traceback.format_exc()))
            if debug:
                pdb.set_trace()
            time.sleep(5)
            continue
        if req.status_code == 200 and tryjson(req.content):
            break
        logger.error(
            '[CHUNK {}] bad response for {} (status {}), retrying'.format(
                ch, name, req.status_code
            )
        )
        time.sleep(5)
    result = {int(k): v for k, v in json.loads(req.content).items()}
    if not result:
        raise Exception(
            'EXTREMELY WRONG SHIT HAPPENED '
            'PLS PING ASNYTIN@ ASAP: {}'.format(name)
        )
    return result


def post_dictionary_to_stat(dictionary, name):
    """Upload ``dictionary`` to the Stat dictionary API, retrying forever.

    :param dictionary: JSON-serialisable mapping to store.
    :param name: dictionary name in Stat.
    :return: the ``requests`` response of the first attempt that got
        HTTP 200.

    Both request exceptions and non-200 answers are followed by a 60
    second pause; previously a non-200 answer was retried immediately in
    a tight loop.
    """
    payload = {
        'name': name,
        'language': '',
        'dictionary': json.dumps(dictionary),
        'editors': ['norberrt', 'riddle'],
    }
    while True:
        try:
            req = requests.post(
                get_stat_dict_api(),
                headers=get_stat_headers(),
                data=payload,
                timeout=120
            )
        except Exception:
            req = None
        if req is not None and req.status_code == 200:
            return req
        time.sleep(60)


def remove(path):
    """Delete the YT node at ``path``; do nothing when it does not exist."""
    if not yt_exists(path):
        return
    yt.remove(path)


def not_finished(path, truedate):
    """Tell whether the table at ``path`` still counts as unfinished.

    A table is unfinished when it does not exist, or when the date part
    of its 'modification_time' attribute is not later than ``truedate``.
    """
    if not yt_exists(path):
        return True
    modification_time = yt.get_attribute(path, 'modification_time')
    modified_on = datetime.datetime.strptime(
        modification_time, "%Y-%m-%dT%H:%M:%S.%fZ"
    ).date()
    return True if modified_on <= truedate else False


def testids_to_cgi(lst):
    return '&'.join('id={}'.format(x) for x in lst)


def get_app_id_dict():
    """Read 'APP_ID_DICT.csv' into a dict keyed by its second column.

    The file is ';'-separated UTF-8. Lines with fewer than four columns
    are skipped. The value is the fourth column, or the second column
    itself when the fourth one is empty.
    """
    mapping = {}
    with codecs.open('APP_ID_DICT.csv', 'r', 'utf8') as source:
        for raw_line in source:
            columns = raw_line.rstrip().split(';')
            if len(columns) >= 4:
                app_id = columns[1]
                mapping[app_id] = columns[3] or app_id
    return mapping


def get_testids(user_testids='', old_testids=False, logger=dummy_logger):
    """Return the union of testids from arcadia, the user and the AB API.

    ``user_testids`` is a comma-separated string. ``old_testids`` is
    accepted but not used by this function.
    """
    sources = (
        get_arcadia_testids(),
        parse_testids(user_testids),
        get_testids_ab(aspects=('atom', 'promolib'), logger=logger),
    )
    return set().union(*sources)


def get_testids_ab(aspects, logger):
    """Collect testids of active AB tasks for the given aspects.

    :param aspects: iterable of aspect names queried one by one.
    :param logger: logger used to report aspects that could not be
        fetched; such aspects contribute no testids.
    :return: set of testids of tasks whose state is 'RUNNING' or
        'IN_CONFIG'.

    The request now has a timeout (so a stalled connection cannot hang
    the whole run) and HTTP error statuses are treated as failures; only
    ordinary exceptions are caught instead of a bare ``except``.
    """
    url = 'http://ab.yandex-team.ru/api/task?form=full&aspect={}'
    active_states = {'RUNNING', 'IN_CONFIG'}
    testids = set()
    for aspect in aspects:
        try:
            response = requests.get(url.format(aspect), timeout=60)
            response.raise_for_status()
            aspect_tasks = response.json()
        except Exception:
            aspect_tasks = []
            logger.error(
                'Failed to retrieve testids for aspect={}'.format(aspect)
            )
        for task in aspect_tasks:
            if task['state'] in active_states:
                testids.update(task['testids'])
    return testids


class ChunkReader(object):
    """Iterator that turns a stat table into TSV chunks ready for upload.

    Each call to ``next()`` reads up to 100000 rows of the table, starting
    where the previous call stopped, converts every row key with
    ``process_ident_4`` (which replaces dimension values with dictionary
    ids) and returns ``(chunk_number, StringIO_with_tsv)``.

    Before reading, the module-level ``statdict`` dictionaries are
    refreshed from Stat; after reading, they are posted back if the chunk
    introduced new values.
    """

    def __init__(self, stat_reduce2_table, logger):
        # Number of the chunk produced last (chunks are numbered from 1).
        self.ch = 0
        # Number of table rows consumed so far; also the next start index.
        self.read_counter = 0
        self.records_count = yt.table_commands.row_count(
            stat_reduce2_table
        )
        self.table = stat_reduce2_table
        self.logger = logger
        # Set to True once all rows have been read.
        self.empty = False
        # False while next() is running; not read anywhere in this class.
        self.free = True

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 style iterator protocol; delegates to next().
        return self.next()

    def next(self):
        """Build and return the next ``(chunk_number, chunk)`` pair.

        Raises StopIteration (and sets ``self.empty``) once every row of
        the table has been read.
        """
        if self.free:
            self.free = False
        if self.read_counter >= self.records_count:
            self.empty = True
            raise StopIteration()
        chunk = StringIO.StringIO()
        header_row = tabulate(
            'fielddate', 'product', 'service',
            'lang', 'browser', 'os', 'testid',
            'element', 'candidate', 'shows', 'installs',
            'cancels', 'clicks', 'trueinstalls'
        )
        print(header_row, file=chunk)
        self.ch += 1
        # Refresh the local dictionaries from Stat. The remote dictionaries
        # are keyed by id, so they are reversed to map value -> id.
        for x in ['product', 'service', 'lang', 'browser',
                  'description', 'os', 'testid', 'element']:
            statdict[x] = reverse_dict(get_dictionary_from_stat(
                'vcfs::atombanners_{}'.format(x),
                logger=self.logger, ch=self.ch
            ))
        candidates_from_stat = get_dictionary_from_stat(
            'vcfs::atombanners_candidate', logger=self.logger, ch=self.ch
        )
        # Candidate values are reduced to the part before the first ' | ';
        # the full string is remembered so it can be restored when the
        # dictionary is posted back.
        candidates_to_title_snippets = {}
        for x in candidates_from_stat:
            candidates_to_title_snippets[
                unicode(candidates_from_stat[x]).split(' | ')[0]
            ] = unicode(candidates_from_stat[x])
            candidates_from_stat[x] = unicode(
                candidates_from_stat[x]).split(' | ')[0]
        statdict['candidate'] = reverse_dict(candidates_from_stat)
        # Snapshot used below to detect whether this chunk added entries.
        ex_statdict = copy.deepcopy(statdict)
        self.logger.info(
            '[CHUNK {}] Getting records from table {}'
            ' starting with id {}...'.format(
                self.ch, self.table, self.read_counter
            )
        )
        # Read at most 100000 rows, never past the end of the table.
        for rec in yt.read_table(yt.TablePath(
            self.table,
            start_index=self.read_counter,
            end_index=min(self.read_counter + 100000, self.records_count),
        ), raw=False
        ):
            rec = from_bytes(rec)
            key = rec['key']
            # Missing counters are written as '0'.
            value = tabulate(
                rec.get('shows', '0'),
                rec.get('installs', '0'),
                rec.get('cancels', '0'),
                rec.get('clicks', '0'),
                rec.get('trueinstalls', '0')
            )
            print(tabulate(process_ident_4(
                key,
                logger=self.logger
            ), value), file=chunk)
            self.read_counter += 1
        if statdict != ex_statdict:
            self.logger.info(
                '[CHUNK {}] Posting updated dictionaries...'.format(
                    self.ch
                )
            )
            # NOTE(review): 'description' is fetched above but is not in
            # this list, so it is never posted back - confirm intended.
            for x in ['product', 'service', 'lang', 'browser',
                      'os', 'testid', 'element']:
                post_dictionary_to_stat(
                    reverse_dict(statdict[x]),
                    'vcfs::atombanners_{}'.format(x)
                )
            # Restore the full candidate strings before posting.
            revcand = reverse_dict(statdict['candidate'])
            for x in revcand:
                if revcand[x] in candidates_to_title_snippets:
                    revcand[x] = candidates_to_title_snippets[revcand[x]]
            post_dictionary_to_stat(
                revcand,
                'vcfs::atombanners_candidate'
            )
        else:
            self.logger.info(
                '[CHUNK {}] Dicts are identical, no need to post'.format(
                    self.ch
                )
            )
        self.free = True
        return (self.ch, chunk)


def push_to_stat_parallel(
    stat_reduce2_table, logger, max_workers=10, redo=False
):
    """Upload the stat table to Stat chunk by chunk using a thread pool.

    Chunks are produced sequentially by ``ChunkReader`` and handed to
    ``push_to_stat`` on worker threads. No new chunk is read while more
    than 10 uploads are waiting in the executor queue.

    :param stat_reduce2_table: path of the table to upload.
    :param logger: logger shared with the reader and the uploads.
    :param max_workers: size of the thread pool.
    :param redo: forwarded to ``push_to_stat``.
    """
    chunk_reader = ChunkReader(stat_reduce2_table, logger=logger)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as e:
        while True:
            # Back-pressure: wait (instead of spinning at full CPU) until
            # the backlog shrinks. The executor exposes no public queue
            # size, hence the private attribute.
            while e._work_queue.qsize() > 10:
                time.sleep(0.1)
            try:
                id_, chunk = next(chunk_reader)
            except StopIteration:
                logger.info('Finished reading chunks')
                break
            e.submit(push_to_stat, chunk, logger, redo=redo, ch=id_)


class ReqidReMap(object):
    """Mapper producing (yandexuid, device_id) pairs from key/value rows.

    The row key is stripped of every non-alphanumeric character and
    lower-cased to form the yandexuid; rows whose cleaned key is empty or
    whose value is not valid JSON are skipped. One record is emitted per
    entry of the JSON object's 'd' list, with the entry upper-cased as
    device_id.
    """

    def __init__(self):
        self.id_regex = re.compile(r'[^0-9a-zA-Z]')

    def __call__(self, records):
        for record in records:
            yandexuid = self.id_regex.sub('', record.key).lower()
            if yandexuid:
                try:
                    payload = json.loads(record.value)
                except:
                    # Unparseable value: skip the row.
                    continue
                device_ids = payload.get('d') or []
                for device_id in device_ids:
                    yield Record(
                        yandexuid=yandexuid,
                        device_id=device_id.upper()
                    )


def process_table(
    srctable, redo=False,
    geobase={}, logger=dummy_logger, dicts=True,
    user_testids='', old_testids=False, nopush=False, debug=False,
    max_workers=10
):
    """Build the daily atom-banners tables for one date and push the stats.

    The pipeline is resumable: every stage checks which intermediate table
    exists under ``//home/personalization/v4_daily/<date>/`` and continues
    from there.

    1. redir log -> ``atom_banners_after_redir``
    2. + metrika mobile log -> ``..._after_metrika``
    3. + export access log -> ``..._after_export``
    4. + mobile tracking log -> ``..._after_tracking``
    5. + yesterday's shows/clicks/trueinstalls -> ``..._after_yesterday``
    6. + watch-log joins (Nile job) -> ``..._after_crypta``
    7. sort -> ``atom_banners_raw``; reduce + sort -> ``atom_banners``
    8. stat reduce / map-reduce -> ``stat``
    9. optional upload of ``stat`` to Stat and completion bookkeeping.

    :param srctable: the date being processed (a ``datetime.date``; it is
        compared with ``datetime.date.today()`` and formatted with
        ``strftime``).
    :param redo: '' / False, or one of 'atom_banners', 'stat', 'all' to
        force recomputation of the corresponding part.
    :param geobase: geobase mapping passed to the reducers/mappers. The
        default is a shared dict; it is not mutated in this function.
    :param logger: logger for progress messages.
    :param dicts: accepted for interface compatibility; unused here.
    :param user_testids: comma-separated extra testids.
    :param old_testids: forwarded to ``get_testids``.
    :param nopush: when truthy, skip the Stat upload and the final
        ``time_finished`` bookkeeping.
    :param debug: accepted for interface compatibility; unused here.
    :param max_workers: thread pool size for the Stat upload.
    :return: False when the raw table is still unfinished (its tables are
        removed), True otherwise.

    Fixes relative to the previous revision: the final merge compacts
    ``time_finished`` in place instead of overwriting
    ``time_finished_stat`` with it; context_dict lines are decoded only
    when they are byte strings (the local-file fallback already yields
    text); the fallback no longer uses a bare ``except``; and the
    duplicated 30min/1d log table discovery is shared.
    """
    logger.info('Source date is {}'.format(srctable))
    candidates = get_candidates()
    truedate = srctable
    date = truedate.strftime('%Y-%m-%d')

    # Maps raw event names to the counter column they contribute to.
    event_dict = {
        'close_stripe': 'cancels',
        'closestripe': 'cancels',
        'close-stripe': 'cancels',
        'close': 'cancels',
        'cancel': 'cancels',
        'show': 'shows',
        'download': 'installs',
        'install': 'installs',
        'click': 'clicks',
        'trueinstall': 'trueinstalls'
    }
    schema = [
        {"type": "string", "name": "key"},
        {"type": "string", "name": "unixtime"},
        {"type": "string", "name": "yandexuid"},
        {"type": "string", "name": "bannerid"},
        {"type": "string", "name": "browser"},
        {"type": "string", "name": "country"},
        {"type": "string", "name": "distr_obj"},
        {"type": "string", "name": "eventtype"},
        {"type": "string", "name": "product"},
        {"type": "string", "name": "referer"},
        {"type": "string", "name": "reqid"},
        {"type": "string", "name": "score"},
        {"type": "string", "name": "showid"},
        {"type": "string", "name": "testids"}
    ]
    logger.info('Getting testids...')
    testids = get_testids(
        old_testids=old_testids, user_testids=user_testids, logger=logger
    )
    logger.info('Using testids: {}'.format(
        ', '.join(str(x) for x in sorted(testids))
    ))

    # context_dict: fetched from arcadia, with a local file as fallback.
    try:
        context_dict = StringIO.StringIO(requests.get(
            "https://a.yandex-team.ru/api/tree/blob/trunk/arcadia_tests_data/"
            "pers/rerank_service/front/data/context_dict.txt", verify=False,
            headers=get_arcanum_headers()
        ).content)
        if 'Guest is not allowed' in context_dict.getvalue():
            raise Exception('API kicked our ass')
        logger.info('Fetched context_dict from arcadia')
    except Exception:
        logger.error(traceback.format_exc())
        context_dict = codecs.open('context_dict.txt', 'r', 'utf8')
        logger.info('Fetched context_dict from local file')
    browser_dict = {}
    os_dict = {}
    for line in context_dict:
        # The downloaded body yields byte strings, the codecs fallback
        # yields text; decode only the former.
        if isinstance(line, bytes):
            line = line.decode('utf8')
        tabs = line.rstrip().split()
        if len(tabs) == 3 and tabs[0] == '2':
            browser_dict[tabs[1]] = tabs[2]
        if len(tabs) == 3 and tabs[0] == '3':
            os_dict[tabs[1]] = tabs[2]
    context_dict.close()

    app_id_dict = get_app_id_dict()

    out_prefix = '//home/personalization/v4_daily/{}/'.format(date)
    redir_map_table = out_prefix + 'redir_tmp'
    atom_banners_table_a = out_prefix + 'atom_banners_after_redir'
    atom_banners_table_b = out_prefix + 'atom_banners_after_metrika'
    atom_banners_table_c = out_prefix + 'atom_banners_after_export'
    atom_banners_table_e = out_prefix + 'atom_banners_after_tracking'
    atom_banners_table_f = out_prefix + 'atom_banners_after_yesterday'
    atom_banners_table_g = out_prefix + 'atom_banners_after_crypta'
    atom_banners_table = out_prefix + 'atom_banners_raw'
    atom_banners_table_cube = out_prefix + 'atom_banners'
    stat_reduce1_table = out_prefix + 'stat_tmp'
    stat_reduce2_table = out_prefix + 'stat'
    atom_banners_yesterday = (
        '//home/personalization/v4_daily/{}/atom_banners'.format(
            truedate - datetime.timedelta(days=1)
        )
    )
    watch_log_tmp_1 = out_prefix + 'watch_log_tmp'

    # When processing today's date the '30min' log directories are
    # searched, otherwise the '1d' ones; everything else is identical.
    log_scale = '30min' if datetime.date.today() == truedate else '1d'

    def find_log_tables(log_name):
        # All tables under //logs/<log_name>/<scale> whose path has the date.
        return yt.search(
            root='//logs/{}/{}'.format(log_name, log_scale),
            path_filter=lambda x: date in x
        )

    export_access_tables = [
        yt.TablePath(x, columns=['request', 'iso_eventtime'])
        for x in find_log_tables('export-access-log')
    ]
    redir_tables = [x for x in find_log_tables('redir-log')]
    metrika_mobile_tables = [
        yt.TablePath(
            x,
            columns=[
                'EventValue', 'EventName', 'AppID',
                'AppPlatform', 'ClientIP', 'SendTimestamp',
                'UUID', 'DeviceID'
            ]
        ) for x in find_log_tables('metrika-mobile-log')
    ]
    watch_log_tables = [
        yt.TablePath(
            x, columns=['uniqid', 'url', 'unixtime']
        ) for x in find_log_tables('bs-watch-log')
    ]
    mobile_tracking_table = yt.TablePath(
        '//statbox/mobile-tracking-log/{}'.format(date),
        columns=['AppID', 'EventType', 'UrlParameters_Keys',
                 'UrlParameters_Values', 'SendTimestamp']
    )
    time_finished_stat = '//home/personalization/v4_daily/time_finished_stat'
    time_finished = '//home/personalization/v4_daily/time_finished'
    # NOTE(review): unlike every other path here this one has no leading
    # '//' - confirm that the Nile job resolves it as intended.
    crypta_extract_table = (
        'home/personalization/production/crypta_extract/id2generalized_id'
    )

    redo1 = redo in {'atom_banners', 'all'}
    redo2 = redo in {'stat', 'all'}

    if not redo and not_finished(atom_banners_table_cube, truedate):
        logger.info('atom_banners table is not finished, redoing...')
        if yt_exists(atom_banners_table):
            yt.remove(atom_banners_table)
        if yt_exists(atom_banners_table_cube):
            yt.remove(atom_banners_table_cube)
    if not yt_exists(atom_banners_table_cube) or redo1:
        # Start from the redir log only when no intermediate stage exists
        # (or a redo was requested); otherwise resume from that stage.
        if ((not yt_exists(atom_banners_table_a) and
             not yt_exists(atom_banners_table_b) and
             not yt_exists(atom_banners_table_c) and
             not yt_exists(atom_banners_table_e) and
             not yt_exists(atom_banners_table_f) and
             not yt_exists(atom_banners_table_g)) or redo1):
            logger.info('{} -> {}'.format(redir_tables, redir_map_table))
            yt.run_map(RedirMap(),
                       source_table=redir_tables,
                       destination_table=redir_map_table,
                       spec=SPEC10k)
        if yt_exists(redir_map_table):
            logger.info('{} -> {}'.format(
                redir_map_table, atom_banners_table_a
            ))
            yt.run_sort(source_table=redir_map_table,
                        destination_table=redir_map_table,
                        sort_by=['yandexuid', 'unixtime'])
            yt.run_reduce(RedirReduce(testids, geobase),
                          source_table=redir_map_table,
                          destination_table=atom_banners_table_a,
                          reduce_by=['yandexuid'])
            yt.remove(redir_map_table)
        if yt_exists(atom_banners_table_a):
            logger.info('{} -> {}'.format(metrika_mobile_tables,
                                          atom_banners_table_a))
            yt.run_map(
                MetrikaMobileMap(candidates, app_id_dict, testids, geobase),
                source_table=metrika_mobile_tables,
                destination_table=append(atom_banners_table_a),
                spec=SPEC10k,
            )
            # Renaming the table marks the stage as completed.
            yt.move(
                source_path=atom_banners_table_a,
                destination_path=atom_banners_table_b,
                force=True
            )
        if yt_exists(atom_banners_table_b):
            logger.info('{} -> {}'.format(export_access_tables,
                                          atom_banners_table_b))
            yt.run_map(ExportAccessMap(date),
                       source_table=export_access_tables,
                       destination_table=append(atom_banners_table_b),
                       spec=SPEC10k)
            yt.move(
                source_path=atom_banners_table_b,
                destination_path=atom_banners_table_c,
                force=True
            )
        if yt_exists(atom_banners_table_c):
            logger.info('{} -> {}'.format(mobile_tracking_table,
                                          atom_banners_table_c))
            yt.run_map(MobileTrackingMap(),
                       source_table=mobile_tracking_table,
                       destination_table=append(atom_banners_table_c),
                       spec=SPEC10k)
            yt.move(
                source_path=atom_banners_table_c,
                destination_path=atom_banners_table_e,
                force=True
            )
        if yt_exists(atom_banners_table_e):
            if yt_exists(atom_banners_yesterday):
                logger.info('{} -> {}'.format(atom_banners_yesterday,
                                              atom_banners_table_e))
                yt.run_map(
                    ShowsFromYesterdayMap(),
                    source_table=atom_banners_yesterday,
                    destination_table=append(atom_banners_table_e),
                )
            else:
                logger.info('{} does not exist, moving on...'.format(
                    atom_banners_yesterday
                ))
            yt.move(
                source_path=atom_banners_table_e,
                destination_path=atom_banners_table_f,
                force=True
            )
        if yt_exists(atom_banners_table_f):
            logger.info('{} -> {}'.format(watch_log_tables, watch_log_tmp_1))
            yt.run_map(
                WatchLogMap(),
                source_table=watch_log_tables,
                destination_table=watch_log_tmp_1,
                spec=SPEC10k
            )

            # Nile job: append watch-log matches for today's mobile Safari
            # clicks, joined directly by yandexuid and via device_id.
            hahn = clusters.Hahn(
                pool='search-research_{}'.format(getpass.getuser())
            ).env()
            job = hahn.job()

            clicks_extracted = job.table(
                atom_banners_table_f.encode('utf8')
            ).filter(
                nf.and_(
                    nf.equals(b'eventtype', b'click'),
                    nf.equals(b'product', b'default_search_mobilesafari_ios'),
                    nf.not_(nf.equals(b'yesterday', b'True'))
                )
            )

            reqid_remapped = job.table(
                crypta_extract_table.encode('utf8')
            ).map(
                ReqidReMap()
            )

            watch_log_remapped = job.table(
                watch_log_tmp_1.encode('utf8')
            )

            jnd = reqid_remapped.join(
                watch_log_remapped, by=b'yandexuid', type=b'inner'
            )

            clicks_extracted.join(
                watch_log_remapped, by=b'yandexuid', type=b'inner'
            ).put(
                atom_banners_table_f.encode('utf8'), append=True
            )

            clicks_extracted.join(
                jnd, by=b'device_id', type=b'inner'
            ).put(
                atom_banners_table_f.encode('utf8'), append=True
            )

            job.run()

            remove(watch_log_tmp_1)

            yt.move(
                source_path=atom_banners_table_f,
                destination_path=atom_banners_table_g,
                force=True
            )
        if yt_exists(atom_banners_table_g):
            yt.run_sort(
                source_table=atom_banners_table_g,
                destination_table=atom_banners_table_g,
                sort_by=['key', 'unixtime', 'yandexuid'],
                spec=SPEC10k
            )
            yt.move(
                source_path=atom_banners_table_g,
                destination_path=atom_banners_table,
                force=True
            )
            yt.set_attribute(
                atom_banners_table,
                '_read_schema',
                schema
            )
        if yt_exists(atom_banners_table):
            yt.run_reduce(
                AtomBannersReduce(browser_dict, os_dict, event_dict, date),
                source_table=atom_banners_table,
                destination_table=atom_banners_table_cube,
                reduce_by=['key'],
                spec=SPEC10k
            )
            yt.run_sort(
                source_table=atom_banners_table_cube,
                destination_table=atom_banners_table_cube,
                sort_by=['key', 'unixtime', 'yandexuid'],
                spec=SPEC10k
            )
            yt.set_attribute(
                atom_banners_table_cube,
                '_read_schema',
                schema
            )
    if (not yt_exists(stat_reduce2_table) or redo2):
        if not yt_exists(stat_reduce1_table) or redo2:
            logger.info('{} -> {}'.format(
                atom_banners_table_cube, stat_reduce1_table
            ))
            yt.run_reduce(AtomBannersStatReduce(date, event_dict),
                          source_table=atom_banners_table_cube,
                          destination_table=stat_reduce1_table,
                          reduce_by=['key'],
                          spec=SPEC10k,
                          )
            logger.info('sorting {}...'.format(stat_reduce1_table))
            yt.run_sort(
                source_table=stat_reduce1_table,
                destination_table=stat_reduce1_table,
                sort_by=['key'],
            )
        # A stat_tmp table left over from an interrupted run may still be
        # unsorted.
        if not yt.get_attribute(stat_reduce1_table, 'sorted'):
            logger.info('sorting {}...'.format(stat_reduce1_table))
            yt.run_sort(
                source_table=stat_reduce1_table,
                destination_table=stat_reduce1_table,
                sort_by=['key'],
            )
        logger.info('{} -> {}'.format(
            stat_reduce1_table, stat_reduce2_table
        ))
        yt.run_map_reduce(
            mapper=StatMap(),
            reduce_combiner=StatReduce(),
            reducer=StatReduce(),
            source_table=stat_reduce1_table,
            destination_table=stat_reduce2_table,
            reduce_by=['key'],
            # spec=SPEC10k
            spec={
                "job_count": 20000,
                "map_job_count": 20000,
                "reduce_job_count": 20000,
                "data_size_per_sort_job": 1024 * 1024 * 256,
                "pool": "search-research_{}".format(getpass.getuser())
            }
        )
        yt.run_sort(
            source_table=stat_reduce2_table,
            destination_table=stat_reduce2_table,
            sort_by=['key']
        )
        yt.remove(stat_reduce1_table)

    if yt_exists(stat_reduce2_table):
        # Record when the stat table became available, then compact the
        # bookkeeping table in place.
        yt.write_table(
            append(time_finished_stat),
            [{
                "date": format(truedate),
                "time_finished": datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S'
                )
            }]
        )
        yt.run_merge(
            source_table=time_finished_stat,
            destination_table=time_finished_stat,
            spec={'combine_chunks': True}
        )
    if yt_exists(stat_reduce2_table) and not nopush:
        push_to_stat_parallel(
            stat_reduce2_table, logger, max_workers=max_workers, redo=redo
        )
    yaml_dump_all()
    if not_finished(atom_banners_table, truedate):
        logger.info('Removing everything as table is not finished')
        # NOTE(review): the '_after_yesterday' / '_after_crypta' stages and
        # the watch-log temp table are not removed here - confirm whether
        # that is intentional.
        remove(redir_map_table)
        remove(atom_banners_table_a)
        remove(atom_banners_table_b)
        remove(atom_banners_table_c)
        remove(atom_banners_table_e)
        remove(atom_banners_table)
        remove(atom_banners_table_cube)
        remove(stat_reduce1_table)
        remove(stat_reduce2_table)
        return False
    else:
        if not nopush:
            logger.info('Table finished')
            yt.write_table(
                append(time_finished),
                [{
                    "date": format(truedate),
                    "time_finished": datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S'
                    )
                }]
            )
            # Compact time_finished in place. The previous revision wrote
            # the merge result into time_finished_stat, overwriting that
            # table with the contents of time_finished.
            yt.run_merge(
                source_table=time_finished,
                destination_table=time_finished,
                spec={'combine_chunks': True}
            )
        return True


if __name__ == "__main__":
    main()
