#!/usr/bin/env python
#! -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function
import sys
import os
import re
# from tqdm import tqdm
import codecs
import copy
import shutil
import contextlib
import itertools
import logging
import toml
import pdb
import argparse
import tempfile
import traceback
import subprocess
import mapreducelib
import threading
import urllib
import time
import psutil
import requests
from StringIO import StringIO
import json
from time import sleep
try:
    import thread
except ImportError:
    import _thread as thread
from mapreducelib import MapReduce, Record
import urlparse
from collections import defaultdict, Counter, namedtuple
import datetime
from pecheny.mrdef import defaults
from pecheny.moncommons import push_to_razladki
import yaml
import base64
from Crypto.Cipher import Blowfish

# Showid code -> browser name. Filled by main() from showid.txt and used as
# the fallback lookup table in get_browser().
browser_dict = {}
# Showid code -> OS name. Filled by main() from showid.txt and used as the
# fallback lookup table in get_os().
os_dict = {}
# Per-field dictionaries that push_to_stat() reverses and uploads; a field
# that was never written to yields an empty dict.
statdict = defaultdict(lambda: {})


@contextlib.contextmanager
def make_temp_file(**kwargs):
    """Create a temporary file and delete it when the ``with`` block exits.

    Keyword arguments are forwarded to ``tempfile.mkstemp``.

    Yields:
        The ``(fd, path)`` tuple returned by ``tempfile.mkstemp``. The
        descriptor is not closed here; the caller owns it.

    The file is removed even when the body of the ``with`` block raises.
    """
    temp_file = tempfile.mkstemp(**kwargs)
    try:
        yield temp_file
    finally:
        # Runs on both normal exit and exceptions, so the file never leaks.
        os.remove(temp_file[1])


def deutf8ify(rec):
    """Return ``rec`` with UTF-8 byte strings decoded to unicode.

    A ``mapreducelib.SubkeyedRecord`` is converted into a new ``Record``
    whose key, subkey and value are unicode. A plain byte string is
    decoded. Anything else is returned untouched. Undecodable bytes are
    replaced rather than raising.
    """
    def _as_unicode(raw):
        if isinstance(raw, unicode):
            return raw
        return raw.decode('utf8', errors='replace')

    if isinstance(rec, mapreducelib.SubkeyedRecord):
        fields = (rec.key, rec.subkey, rec.value)
        return Record(*[_as_unicode(field) for field in fields])
    if isinstance(rec, str):
        return rec.decode('utf8', errors='replace')
    return rec


def utf8ify(rec):
    """Return ``rec`` with unicode strings encoded to UTF-8 bytes.

    A ``mapreducelib.SubkeyedRecord`` is modified in place (only the
    fields that are unicode are reassigned) and returned. A bare unicode
    string is encoded. Anything else is returned untouched.
    """
    if isinstance(rec, mapreducelib.SubkeyedRecord):
        for field in ('key', 'subkey', 'value'):
            current = getattr(rec, field)
            if isinstance(current, unicode):
                setattr(rec, field, current.encode('utf8'))
        return rec
    if isinstance(rec, unicode):
        return rec.encode('utf8')
    return rec


def tryint(string):
    """Convert ``string`` to an int, returning -1 when conversion fails.

    Only ordinary errors are treated as "not a number"; interpreter-level
    signals such as ``KeyboardInterrupt`` and ``SystemExit`` propagate
    instead of being silently turned into -1.
    """
    try:
        return int(string)
    except Exception:
        return -1


# Base URL of the Atom admin HTTP API; get_candidates() appends the
# endpoint path to it.
ATOM_URL = 'http://atom-admin.n.yandex-team.ru/atom/api/v1/'
# NOTE(review): module-level integer with no use in the visible part of
# the file -- confirm whether it is still needed.
i = 0


def get_token():
    """Read the Atom API token stored next to this script.

    The token lives in a file named ``.atom_token`` in the directory that
    contains this source file; trailing whitespace is stripped.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    token_path = '{}/.atom_token'.format(script_dir)
    with codecs.open(token_path, 'r') as token_file:
        contents = token_file.read()
    return contents.rstrip()


def safediv(x, y):
    """Divide ``x`` by ``y``, returning 0 when the divisor is zero."""
    try:
        quotient = x / y
    except ZeroDivisionError:
        quotient = 0
    return quotient


# HTTP headers that get_candidates() sends with its Atom admin API request.
ATOM_HEADERS = {}


def get_candidates():
    """Download the production candidate set and map candidates to products.

    Steps, as performed below:
      1. Ask the Atom admin API for the production version's ``task_id``.
      2. Ask Sandbox for the ``PERS_ATOM_CANDIDATES`` resource of that task.
      3. Turn the resource's first HTTP link into an rsync URL and copy it
         into the current directory.
      4. Read every ``*.json`` file under ``atom_candidates`` and collect
         ``{last path segment of 'internal-url': '__product'}``.
      5. Remove the downloaded ``atom_candidates`` directory (best effort).

    Returns:
        dict mapping candidate id to product name; missing fields fall
        back to ``''``.
    """
    # req = requests.get(
    #     'http://sas1-5350.search.yandex.net:10260/_golovan')
    # obj = json.loads(req.content.decode('utf8'))
    # cands = [x for x in obj if 'CandidatesVersion' in x[0]]
    # task_id = cands[0][1]
    req = requests.get(
        ATOM_URL + 'version/production', headers=ATOM_HEADERS)
    task_id = json.loads(req.content.decode('utf8'))['task_id']
    sb_req = requests.get(
        'https://sandbox.yandex-team.ru:443/api/v1.0/resource?'
        'type=PERS_ATOM_CANDIDATES&task_id={}&limit=1'
        .format(task_id))
    sb_obj = json.loads(sb_req.content.decode('utf8'))
    rslink = sb_obj['items'][0]['http']['links'][0].replace(
        'http://', 'rsync://')
    # Replace the ":<port>" part of the link with the rsync module path.
    rslink = re.sub(r':[0-9]+', '/sandbox-tasks', rslink)
    subprocess.call(['rsync', '-r', '{}'.format(rslink), '.'])
    result = {}
    for dirpath, _dirnames, filenames in os.walk('atom_candidates'):
        for filename in filenames:
            if not filename.endswith('.json'):
                continue
            # Close each file promptly instead of leaking the handle.
            with open(os.path.join(dirpath, filename)) as json_file:
                obj = json.load(json_file)
            for cand in obj:
                candidate_id = cand.get('internal-url', '').split('/')[-1]
                result[candidate_id] = cand.get('__product', '')
    # The rsync'ed tree may be read-only; make it writable before removal.
    subprocess.call(['chmod', '-R', '+w', 'atom_candidates'])
    # Cleanup stays best-effort: a failed removal must not lose the result.
    shutil.rmtree('atom_candidates', ignore_errors=True)
    return result


def reqid_to_key(reqid):
    """Build a record key from a request id: its first 22 digit characters."""
    digits = [char for char in reqid if unicode.isdigit(char)]
    return ''.join(digits)[:22]


def dttots(dt_):
    """Convert a naive datetime into whole seconds since 1970-01-01."""
    epoch = datetime.datetime(1970, 1, 1)
    elapsed = dt_ - epoch
    return int(elapsed.total_seconds())


def parsevars(_vars):
    """Parse a comma-separated ``key=value`` list into a dictionary.

    Rules:
      * a leading ``-`` on a key is dropped;
      * values of keys starting with ``clid`` are collected, in order,
        into the list stored under ``'clids'``;
      * an item without ``=`` is stored with the value ``'SINGLE'``;
      * only the first ``=`` separates key from value.

    Returns:
        defaultdict that yields ``''`` for unknown keys.
    """
    parsed = {'clids': []}
    for chunk in _vars.split(','):
        name, separator, payload = chunk.partition('=')
        if not separator:
            parsed[chunk] = 'SINGLE'
            continue
        if name.startswith('-'):
            name = name[1:]
        if name.startswith('clid'):
            parsed['clids'].append(payload)
        else:
            parsed[name] = payload
    return defaultdict(lambda: '', parsed)


def parseparams(value):
    """Parse a tab-separated ``key=value`` record into a dictionary.

    Only the first ``=`` splits key from value; a field without ``=`` is
    stored with the value ``'SINGLE'``.

    Returns:
        defaultdict that yields ``''`` for unknown keys.
    """
    parsed = {}
    for field in value.split('\t'):
        name, separator, payload = field.partition('=')
        if separator:
            parsed[name] = payload
        else:
            parsed[field] = 'SINGLE'
    return defaultdict(lambda: '', parsed)


def tabulate(*args):
    """Join the ``format()``-ed arguments with tab characters."""
    cells = [format(arg) for arg in args]
    return '\t'.join(cells)


def ntabulate(*args):
    """Like ``tabulate`` but with a trailing newline appended."""
    line = tabulate(*args)
    return line + '\n'


def gettld(url):
    """Return the last dot-separated label of the URL's host, port removed.

    A URL without ``//`` gets an ``http://`` scheme prepended so that the
    host ends up in the parsed network location.
    """
    if '//' not in url:
        url = 'http://' + url
    host_port = urlparse.urlparse(url).netloc
    last_label = host_port.rsplit('.', 1)[-1]
    return last_label.split(':', 1)[0]


def sort_a_file(filename):
    """Sort the non-empty lines of a UTF-8 text file in place.

    Empty lines are dropped and the result is written without a trailing
    newline. The sorted text is first written to a temporary file in the
    current directory and then copied over ``filename``.

    The temporary file is always removed, including when reading or
    writing fails; the original exception still propagates.
    """
    handle, t_f = tempfile.mkstemp(dir='.')
    # Only the path is needed; release the descriptor right away so it
    # cannot leak if anything below raises.
    os.close(handle)
    try:
        with codecs.open(filename, 'r', 'utf8') as f:
            lines = [line for line in f.read().split('\n') if line != '']
        lines.sort()
        with codecs.open(t_f, 'w', 'utf8') as tf:
            tf.write('\n'.join(lines))
        shutil.copy(t_f, filename)
    finally:
        os.remove(t_f)


def reqlist(*listnames):
    """Build the URL-quoted JSON selector for the given candidate lists.

    The JSON document has the shape
    ``{"atom-candidates": {<listname>: null, ...}}`` and is percent-encoded
    so it can be substituted into the query string in ``skeleton``.
    """
    selector = {'atom-candidates': dict.fromkeys(listnames)}
    serialized = json.dumps(selector).encode('utf8')
    return urllib.quote(serialized).decode('utf8')

# URL template for querysearch requests; the single ``{}`` placeholder
# receives the quoted selector produced by reqlist().
skeleton = ('http://querysearch-atom.search.yandex.net/yandsearch'
            '?ms=querysearch:json:3'
            '&rearr=qd_struct_keys={}'
            '&waitall=da'
            '&timeout=1000000')
# Candidate list names that get_unique_candidates() skips entirely.
badlists = {'news', 'service_block_ru'}


def get_unique_candidates(logger=None, debug=False):
    """Fetch every candidate list and summarise candidates by id.

    The list of list names is requested first (selector ``all_keys``),
    then each list is requested individually and the first ``Data`` entry
    of each response is kept. Lists named in ``badlists`` are ignored.

    Args:
        logger: accepted for interface compatibility; not used here.
        debug: when true, the raw collected data is dumped to
            ``candidates_<timestamp>.json`` in the current directory.

    Returns:
        ``(result, descriptions)`` where ``result`` maps candidate id to
        its title, with snippets appended after ``' | '`` (or the snippet
        alone when there is no title), and ``descriptions`` maps candidate
        id to ``aux-data._description`` or ``'empty'``.
    """
    import requests
    import arrow

    def _fetch(listname):
        # One querysearch round-trip for a single list selector.
        response = requests.get(skeleton.format(reqlist(listname)))
        return json.loads(response.content)

    all_keys = _fetch('all_keys')['Data'][0]['Value']
    data = []
    for list_key in all_keys:
        data.extend(_fetch(list_key)['Data'][:1])

    if debug:
        dump_name = 'candidates_{}.json'.format(arrow.now().timestamp)
        with open(dump_name, 'w') as dump_file:
            dump_file.write(
                json.dumps(data, indent=4, ensure_ascii=False)
                .encode('utf8', errors='replace'))

    lists_by_candidate = defaultdict(lambda: set())
    result = {}
    descriptions = {}
    identifier = 'internal-url'
    for entry in data:
        if 'Key' not in entry:
            continue
        listname = entry['Key'][0]
        if listname in badlists:
            continue
        for elem in entry['Value']:
            try:
                candidate_id = elem[identifier].split('/')[-1]
            except (IndexError, KeyError):
                continue
            lists_by_candidate[candidate_id].add(listname)
            title = elem.get('title')
            if title:
                result[candidate_id] = title
            snippet = elem.get('snippet')
            if snippet:
                if candidate_id in result:
                    result[candidate_id] += ' | {}'.format(snippet)
                else:
                    result[candidate_id] = snippet
            descriptions.setdefault(candidate_id, 'empty')
            try:
                descriptions[candidate_id] = elem['aux-data']['_description']
            except KeyError:
                pass
    return result, descriptions


def remove_slashes(s1):
    """Strip every leading and trailing ``/`` from the string."""
    return s1.strip('/')

# Leading fragments that normalize_host() strips repeatedly from a URL.
prefices = ['http://', 'https://', 'www.', 'm.']
# NOTE(review): not referenced anywhere in the visible part of the file --
# confirm whether this list is still used.
banned_hosts = ['kokoc.com', 'nblu.ru', 'cmle.ru',
                'omg5.ru', 'xorod.ru', 'sofro.ru', 'dresk.ru', 'dd34.ru']


def normalize_host(s1):
    """Reduce a URL to a short host label used for grouping.

    Scheme and ``www.``/``m.`` prefixes, the query string and surrounding
    slashes are removed. Hosts whose first label is ``yandex`` keep their
    first path segment; other hosts keep only the host. ``yandsearch`` is
    rewritten to ``search``, colons are dropped, anything that does not
    contain ``yandex`` becomes ``'other'``, and the result is cut to 25
    characters.

    >>> print(normalize_host('https://yandex.ru/search/?text=Skoda'))
    yandex.ru/search
    >>> print(normalize_host('https://yandex.ru/yandsearch/?text=Skoda'))
    yandex.ru/search
    >>> print(normalize_host('https://yandex.ru/?utm_source=blabla'))
    yandex.ru
    >>> print(normalize_host('http://maps.yandex.ru/213/moscow/?text=1234&sll=37.620393%2C55.753960&sspn=1.128845%2C0.535158&ll=37.718247%2C55.732813&z=10'))
    maps.yandex.ru
    """
    result = s1
    # Keep making passes over the prefixes until a pass strips nothing.
    stripped = True
    while stripped:
        stripped = False
        for prefix in prefices:
            if result.startswith(prefix):
                result = result[len(prefix):]
                stripped = True
    result = result.partition('?')[0]
    result = remove_slashes(result)
    pieces = result.split('/')
    if len(pieces) > 1:
        if pieces[0].split('.')[0] == 'yandex':
            result = pieces[0] + '/' + pieces[1]
        else:
            result = pieces[0]
    result = result.replace('yandsearch', 'search').replace(':', '')
    if 'yandex' not in result:
        result = 'other'
    return result[:25]

# Base64-encoded Blowfish keys used by blow_decrypt(): ``iv_key`` decrypts
# the per-message IV block, ``key`` decrypts the payload.
# NOTE(review): key material is hard-coded in source; consider loading it
# from a secret store or an untracked file.
iv_key = "pWAnOKnfLKdjE2Ej16jQcw=="
key = "yTbJ8bAI5qQrWEmwah2xYw=="


def blow_decrypt(source):
    """Decrypt a URL-safe base64, Blowfish-CBC encrypted string.

    Layout of the decoded input, as handled below: the first cipher block
    is an encrypted IV (decrypted with ``iv_key`` and the fixed IV
    ``"arcadia+"``); the remainder is the payload, decrypted with ``key``
    and the recovered IV. Trailing NUL padding is removed.

    Args:
        source: text using ``-``/``_`` in place of ``+``/``/``.

    Returns:
        The decrypted payload decoded as UTF-8. An empty or all-padding
        payload yields ``''`` instead of raising ``IndexError``.
    """
    bs = Blowfish.block_size
    # Map the URL-safe alphabet back to the standard base64 alphabet.
    s = base64.b64decode(
        source.replace('-', '+').replace('_', '/'))
    iv = s[:bs]
    iv_cipher = Blowfish.new(
        base64.b64decode(iv_key), Blowfish.MODE_CBC, "arcadia+")
    iv_decrypted = iv_cipher.decrypt(iv)
    data = s[bs:]
    data_decrypted = Blowfish.new(
        base64.b64decode(key),
        Blowfish.MODE_CBC,
        iv_decrypted
    ).decrypt(data)
    # rstrip() also copes with an empty result, where indexing [-1] fails.
    return data_decrypted.rstrip(b'\x00').decode('utf8')


def preprocess_geobase(z):
    """Return a typed deep copy of one geobase entry.

    ``id`` and ``type`` are converted to int, and ``path`` (a
    ``', '``-separated string) becomes a list of ints with empty pieces
    dropped. The input mapping is left untouched.
    """
    node = copy.deepcopy(z)
    node['id'] = int(node['id'])
    node['type'] = int(node['type'])
    node['path'] = [int(piece) for piece in node['path'].split(', ')
                    if piece]
    return node


def get_country(x, jgb):
    """Find the country code for region ``x`` in the geobase ``jgb``.

    Regions whose ``type`` is 3 or lower yield ``None``. Otherwise the
    region's ``path`` is scanned in order and the first entry of type 3
    supplies the answer: the first whitespace-separated token of its
    ``iso_name``, lower-cased. ``None`` is returned when no such entry
    exists.
    """
    region = jgb[x]
    if region['type'] <= 3:
        return None
    for ancestor_id in region['path']:
        ancestor = jgb[ancestor_id]
        if ancestor['type'] == 3:
            return ancestor['iso_name'].split()[0].lower()
    return None


class FastlogsReduce(object):
    """Reducer that extracts portal-ads TECH events from log records.

    Only records with ``type == 'TECH'``, a ``path`` starting with
    ``tech.portal-ads.`` and a non-empty ``reqid`` in ``vars`` are
    handled; all other records are dropped. ``show`` events are emitted
    and remembered, later events are joined to a remembered show with the
    same ``reqid`` and ``bannerid``, and events whose ``reqid`` has no
    remembered show are forwarded unchanged with ``tableIndex=1``.
    """

    def __init__(self, testids, geobase):
        """Store the test ids of interest (as a set) and the geobase."""
        self.testids = set(testids)
        self.geobase = geobase

    def __call__(self, key, recs):
        """Process all records of one key; yields output ``Record`` objects.

        Matching depends on iteration order: a non-show event can only be
        joined to shows that appeared earlier in ``recs``.
        """
        from collections import defaultdict
        key = deutf8ify(key)
        # reqid -> list of show descriptions seen so far for this key.
        shows = defaultdict(lambda: [])
        for rec1 in recs:
            rec = deutf8ify(rec1)
            params = parseparams(rec.value)
            pvars = parsevars(params['vars'])
            if (params['type'] == 'TECH'
                    and params['path'].startswith('tech.portal-ads.')
                    and pvars['reqid']):  # TODO: return -ATOMS- check
                if pvars['eventtype'] == 'show':
                    referer = normalize_host(params['referer'])
                    country = (self.get_country_from_referer(
                        params['referer']
                    ) or '')
                    showid = pvars['showid']
                    product = pvars['product']
                    testids = ''
                    # Best effort: decrypt ``adata``, take its 'test-ids'
                    # and keep only those listed in self.testids. Any
                    # failure leaves ``testids`` as the empty string.
                    try:
                        if pvars['adata']:
                            testids = ','.join(map(format, sorted(
                                set(int(x) for x in json.loads(
                                    blow_decrypt(pvars['adata'])
                                ).get('test-ids'))
                                & self.testids
                            )))
                    except:
                        pass
                    shows[pvars['reqid']].append({
                        'referer': referer,
                        'showid': pvars['showid'],
                        'country': country,
                        'product': product,
                        'score': pvars['score'],
                        'bannerid': pvars['bannerid'],
                        'test-ids': testids,
                    })
                    # params['path'][16:] drops the 16-character
                    # 'tech.portal-ads.' prefix; a score of '100000' tags
                    # the element with a '.training' suffix.
                    yield utf8ify(
                        Record(
                            reqid_to_key(pvars['reqid']),
                            rec.subkey,
                            tabulate(params['path'][16:]
                                     + ('.training'
                                        if pvars['score'] == '100000' else ''),
                                     pvars['eventtype'],
                                     country or params['dom-region'],
                                     pvars['bannerid'],
                                     pvars['showid'],
                                     referer,
                                     product,
                                     rec.key,
                                     pvars['reqid'],
                                     testids), tableIndex=0
                        ))
                elif pvars['reqid'] in shows:
                    # Non-show event: reuse the attributes of the first
                    # remembered show with the same bannerid. If no show
                    # matches the bannerid, nothing is emitted.
                    for show in shows[pvars['reqid']]:
                        if pvars['bannerid'] == show['bannerid']:
                            showid = show['showid']
                            referer = show['referer']
                            product = show['product']
                            score = show['score']
                            country = show['country']
                            bannerid = show['bannerid']
                            testids = show['test-ids']
                            yield utf8ify(
                                Record(
                                    reqid_to_key(pvars['reqid']),
                                    rec.subkey,
                                    tabulate(params['path'][16:]
                                             + ('.training'
                                                if score == '100000' else ''),
                                             pvars['eventtype'],
                                             country or params['dom-region'],
                                             bannerid,
                                             showid,
                                             referer,
                                             product,
                                             rec.key,
                                             pvars['reqid'],
                                             testids), tableIndex=0
                                ))
                            break
                else:
                    # No show seen yet for this reqid: pass the record
                    # through untouched with tableIndex=1.
                    yield utf8ify(
                        Record(
                            rec.key,
                            rec.subkey,
                            rec.value, tableIndex=1
                        ))

    def get_country_from_referer(self, url):
        """Return the country for the ``lr`` query parameter of ``url``.

        The first ``lr`` value is converted to int and looked up with
        ``get_country``. Any failure (missing parameter, bad number,
        unknown region) results in ``None``.
        """
        parsed = urlparse.urlparse(url)
        qs = urlparse.parse_qs(parsed.query)
        try:
            return get_country(int(qs['lr'][0]), self.geobase)
        except:
            return


class Moscow(datetime.tzinfo):
    """Fixed-offset time zone: three hours ahead of UTC, no DST."""

    def __repr__(self):
        return "Europe/Moscow (UTC+3)"

    def tzname(self, dt):
        """Return the zone name, independent of ``dt``."""
        return "Europe/Moscow"

    def dst(self, dt):
        """Return a zero DST adjustment, independent of ``dt``."""
        no_shift = datetime.timedelta(0)
        return no_shift

    def utcoffset(self, dt):
        """Return the constant +3 hour offset, independent of ``dt``."""
        three_hours = datetime.timedelta(hours=3)
        return three_hours


class SoftExportMap(object):
    """Mapper that keeps export-log requests whose banner id is dated
    ``date_``."""

    def __init__(self, date_):
        self.date_ = date_

    def __call__(self, rec1):
        """Yield at most one record for the given log record.

        The banner id is taken from the ``banerid`` query parameter of
        ``request`` when present, otherwise from ``bnrd``. Characters
        8..17 of it are read as a unix timestamp; the record is emitted
        only when that timestamp's local date equals ``self.date_``. Any
        error while extracting the fields silently drops the record. A
        malformed ``timestamp`` field still raises, because it is parsed
        before the guarded section.
        """
        rec = deutf8ify(rec1)
        params = parseparams(rec.value)
        query = urlparse.parse_qs(urlparse.urlparse(params['request']).query)
        timestamp = datetime.datetime.strptime(params['timestamp'],
                                               '%Y-%m-%dT%H:%M:%S')
        try:
            bnrd = 'banerid' if 'banerid' in query else 'bnrd'
            banner = query[bnrd][0]
            ts = banner[8:18]
            banner_date = datetime.datetime.fromtimestamp(int(ts)).date()
            if banner_date == self.date_:
                yield utf8ify(Record(
                    banner[8:][:22],
                    timestamp.strftime('%s'),
                    tabulate(query['yasoft'][0],
                             query['stat'][0],
                             banner.split(':')[0])
                ))
        except:
            pass


def preprocess_string(s):
    """Turn single quotes into double quotes and drop all backslashes."""
    return s.replace("'", '"').replace('\\', '')


def showid_to_key(s):
    """Return up to 22 characters of ``s`` starting at offset 8."""
    return s[8:30]


def get_showid(params):
    """Pick the show/request identifier out of URL parameters.

    Parameters are tried in a fixed priority order. Show-id style
    parameters are converted with ``showid_to_key``; request-id style
    parameters with ``reqid_to_key``.

    Returns:
        ``(key, raw_value)`` for the first parameter found, or
        ``('', '')`` when none is present.
    """
    lookup_plan = (
        (('showid', 'banerid', 'banner_id'), showid_to_key),
        (('atom_reqid', 'amp;atom_reqid'), reqid_to_key),
    )
    for names, to_key in lookup_plan:
        for name in names:
            if name in params:
                return to_key(params[name]), params[name]
    return '', ''


def mobilemetrika_map(rec):
    """Mapper: emit an ``install`` record for a mobile metrika log row.

    ``UrlParameters_Keys`` and ``UrlParameters_Values`` are parsed as
    JSON lists (after quote/backslash clean-up) and zipped into a
    dictionary; if that fails the row is treated as having no URL
    parameters. A record is yielded only when a show id can be found.
    """
    rec = deutf8ify(rec)
    params = parseparams(rec.value)
    try:
        names = json.loads(preprocess_string(params['UrlParameters_Keys']))
        values = json.loads(
            preprocess_string(params['UrlParameters_Values']))
        urlparams = dict(zip(names, values))
    except:
        urlparams = {}
    key, showid = get_showid(urlparams)
    if not (key and showid):
        return
    yield utf8ify(Record(
        key,
        params['InstallationTimestamp'],
        tabulate(params['AppID'], 'install', showid)
    ))


class SetInstallsReduce(object):
    """Reducer that turns joined show/click/install events into counters.

    For every event row it emits (``tableIndex=0``) one counter record per
    combination of the row's dimensions and their ``_total_`` roll-ups.
    After all rows are read it emits, per show id, extra "true install"
    counters when an install/dayuse row referenced that show id, plus one
    ``tableIndex=1`` record carrying the install flag.
    """

    def __init__(self, bd, od, descriptions, event_dict, fielddate):
        """Store lookup tables and the date label used in output keys.

        ``bd`` / ``od`` are passed to ``get_browser`` / ``get_os``;
        ``event_dict`` maps raw event names to counter names.
        ``descriptions`` is stored but not read in this class.
        """
        self.bd = bd
        self.od = od
        self.descriptions = descriptions
        self.event_dict = event_dict
        self.fielddate = fielddate

    def __call__(self, key1, recs):
        """Process all records of one key; yields output ``Record`` objects."""
        key = deutf8ify(key1)
        original_key = deutf8ify(key1)
        valid = True
        bannerid = ''
        reqid = ''
        showid = key
        # showid -> tuple of dimensions of the last event row seen for it.
        showids = {}
        # Show ids referenced by 3-column install/dayuse rows.
        showids_installs = set()
        moscow = Moscow()
        fielddate = self.fielddate
        for rec1 in recs:
            valid = True
            rec = deutf8ify(rec1)
            tabs = rec.value.split('\t')
            if len(tabs) >= 7:
                # Event row. An event name missing from event_dict marks
                # the row invalid, so no counters are emitted for it.
                try:
                    element = tabs[0]
                    eventtype = self.event_dict[tabs[1]]
                    lang = tabs[2]
                except:
                    element = 'invalid'
                    eventtype = 'invalid'
                    lang = 'invalid'
                    valid = False
                # NOTE(review): with len(tabs) >= 7 the indexes 3..6 always
                # exist, so the next four IndexError handlers look
                # unreachable; only tabs[8] and tabs[9] can be missing.
                try:
                    bannerid = tabs[3]
                except IndexError:
                    bannerid = 'empty'
                try:
                    showid = tabs[4]
                except IndexError:
                    showid = 'empty'
                try:
                    host = tabs[5]
                except IndexError:
                    host = 'unknown'
                try:
                    product = tabs[6]
                except IndexError:
                    product = 'unknown'
                try:
                    reqid = tabs[8]
                except IndexError:
                    pass
                try:
                    testids = tabs[9].split(',')
                    if testids == ['']:
                        testids = ['empty']
                except IndexError:
                    testids = ['empty']
                testids.append('_total_')
                # Counter columns are shows, installs, cancels, clicks.
                value = {
                    'shows': tabulate(1, 0, 0, 0),
                    'installs': tabulate(0, 1, 0, 0),
                    'cancels': tabulate(0, 0, 1, 0),
                    'clicks': tabulate(0, 0, 0, 1),
                    'invalid': '0'
                }[eventtype]
                product = (product if product else 'empty')
                service = (host if host else 'empty')
                lang = (lang if lang else 'empty')
                browser = 'empty'
                _os = 'empty'
                # Browser/OS are decoded from the show id unless it is a
                # promolib id; otherwise the OS comes from column 10 when
                # that column exists.
                if showid and showid != 'empty' and 'promolib' not in showid:
                    browser = get_browser(showid, bd=self.bd)
                    _os = get_os(showid, od=self.od)
                elif len(tabs) >= 11:
                    _os = tabs[10]
                element = (element
                           if element else 'empty')
                candidate = bannerid
                showids[showid] = (product, service, lang, browser, _os,
                                   ','.join(testids), element,
                                   candidate, reqid)
                if valid and candidate:
                    # One record per combination of concrete value and
                    # '_total_' roll-up in each dimension; five specific
                    # products are also rolled up as '_allextensions_'.
                    for comb in itertools.product(
                        ((product, '_allextensions_', '_total_')
                            if product in {'set', 'home', 'sethome', 'vb',
                                           'vbch'}
                            else (product, '_total_')),
                        (service, '_total_'),
                        (lang, '_total_'),
                        (browser, '_total_'),
                        (_os, '_total_'),
                        testids,
                        (element, '_total_'),
                        (candidate, '_total_'),
                    ):
                        key = tabulate(fielddate, *comb)
                        # The trailing 0 is the fifth column, which the
                        # post-loop records below set to 1.
                        yield utf8ify(Record(
                            key,
                            '',
                            tabulate(value, 0),
                            tableIndex=0))
            elif len(tabs) == 3 and tabs[1] in {'install', 'dayuse'}:
                showids_installs.add(tabs[2])
        for showid in showids:
            product, service, lang, browser, _os, \
                testids, element, candidate, reqid = showids[showid]
            testids = testids.split(',')
            if showid in showids_installs:
                for comb in itertools.product(
                    ((product, '_allextensions_', '_total_')
                        if product in {'set', 'home', 'sethome', 'vb',
                                       'vbch'}
                        else (product, '_total_')),
                    (service, '_total_'),
                    (lang, '_total_'),
                    (browser, '_total_'),
                    (_os, '_total_'),
                    testids,
                    (element, '_total_'),
                    (candidate, '_total_'),
                ):
                    key = tabulate(fielddate, *comb)
                    # Only the fifth column is set for a confirmed install.
                    yield utf8ify(Record(
                        key,
                        '',
                        tabulate(0, 0, 0, 0, 1),
                        tableIndex=0))
            # NOTE(review): ``bannerid`` here is whatever the last event
            # row in the loop above assigned, not this show id's own
            # ``candidate`` -- confirm this is intended.
            yield utf8ify(Record(
                reqid,
                showid,
                tabulate(('1' if showid
                          in showids_installs else '0'), bannerid),
                tableIndex=1))


def make_sh_rec(**kwargs):
    """Build an encoded event ``Record`` from keyword fields.

    The key is derived from ``reqid`` via ``reqid_to_key``, the subkey is
    ``timestamp`` and the value is the tab-joined list of the columns
    below. Any field that is not supplied defaults to ``''``.
    """
    fields = defaultdict(lambda: '', kwargs)
    column_order = (
        'distr_obj',
        'eventtype',
        'dom_region',
        'bannerid',
        'showid',
        'referer',
        'product',
        'yandexuid',
        'reqid',
        'testids',
        'os',
    )
    record_key = reqid_to_key(fields['reqid'])
    record_subkey = fields['timestamp']
    record_value = tabulate(*[fields[column] for column in column_order])
    return utf8ify(Record(record_key, record_subkey, record_value))


class PromolibaMap(object):
    """Mapper that converts promolib ``-ATOMS-`` events into show records."""

    def __init__(self, bannerids, app_id_dict, debug=False):
        self.bannerids = bannerids
        self.app_id_dict = app_id_dict
        self.debug = debug

    def __call__(self, rec):
        """Yield a ``show`` record and, if present, a reaction record.

        Only rows whose ``EventValue`` contains ``-ATOMS-`` and parses to
        JSON with a ``campaign_id`` are considered. The campaign id is
        split on ``;``: the first piece is the request id, the third the
        banner id. When the banner id piece is missing, nothing is
        emitted, except that in debug mode the input record is echoed
        back. A ``click`` or ``close`` reaction produces a second record
        with that event type.
        """
        # Unknown application ids resolve to an empty service name.
        self.app_id_dict = defaultdict(lambda: '', self.app_id_dict)
        rec = deutf8ify(rec)
        params = parseparams(rec.value)
        if not ('EventValue' in params
                and '-ATOMS-' in params['EventValue']):
            return
        try:
            obj = json.loads(params['EventValue'])
        except:
            obj = {}
        if 'campaign_id' not in obj:
            return
        pieces = obj['campaign_id'].split(';')
        try:
            reqid = pieces[0]
            bannerid = pieces[2]
        except IndexError:
            reqid = ''
            bannerid = ''
            if self.debug:
                yield utf8ify(rec)
        platform = params['AppPlatform']
        if platform == 'android':
            platform = 'Android'
        service = self.app_id_dict[params['AppID']]
        if not (reqid and bannerid):
            return
        shared = dict(
            reqid=reqid,
            showid=reqid,
            distr_obj='promolib',
            os=platform,
            referer=service,
            bannerid=bannerid,
        )
        yield make_sh_rec(
            eventtype='show',
            product=self.bannerids.get(bannerid, 'unknown'),
            **shared)
        reaction = obj.get('reaction')
        if reaction in {'click', 'close'}:
            yield make_sh_rec(
                eventtype=reaction,
                product=self.bannerids.get(bannerid, 'unknown'),
                **shared)


def normalize_key(key):
    """Drop every character outside the allowed key alphabet.

    Allowed: Latin and Cyrillic letters, digits, underscore, hyphen, dot,
    space, tab and forward slash.
    """
    forbidden = re.compile(r'[^a-zA-ZА-Яа-яЁё_\-\. \t0-9/]')
    return forbidden.sub('', key)


class StatReduce(object):
    """Reducer that sums the tab-separated counter columns of one key."""

    def __init__(self, trueinstalls):
        self.trueinstalls = trueinstalls

    def __call__(self, key, recs):
        """Yield one record with the column-wise totals.

        Columns are shows, installs, cancels and clicks, plus a fifth
        "true installs" column when ``self.trueinstalls`` is set. A row
        that fails to parse stops contributing at the failing column;
        columns already added from that row stay counted.
        """
        key = normalize_key(deutf8ify(key))
        totals = [0, 0, 0, 0, 0]
        for rec in recs:
            try:
                cells = rec.value.split('\t')
                for position in range(4):
                    totals[position] += int(cells[position])
                if self.trueinstalls:
                    totals[4] += int(cells[4])
            except:
                pass
        columns = totals if self.trueinstalls else totals[:4]
        yield utf8ify(Record(key, tabulate(*columns)))


def get_browser(showid, bd=None):
    """Look up the browser encoded in characters 4-5 of ``showid``.

    ``bd`` is the code-to-name table; when it is empty or omitted the
    module-level ``browser_dict`` is used. Unknown codes and empty names
    give ``'unknown'``.
    """
    table = bd if bd else browser_dict
    return table.get(showid[4:6]) or 'unknown'


def get_os(showid, od=None):
    """Look up the OS encoded in characters 6-7 of ``showid``.

    ``od`` is the code-to-name table; when it is empty or omitted the
    module-level ``os_dict`` is used. Unknown codes and empty names give
    ``'unknown'``.
    """
    table = od if od else os_dict
    return table.get(showid[6:8]) or 'unknown'


def get_nextdate():
    """Return the day after the date stored in ``daily_stat_last_date``."""
    with open('daily_stat_last_date') as marker:
        stored = marker.read()
    one_day = datetime.timedelta(days=1)
    return date_from_string(stored) + one_day


def set_lastdate(date):
    """Overwrite ``daily_stat_last_date`` with the formatted ``date``."""
    text = format(date)
    with open('daily_stat_last_date', 'w') as marker:
        marker.write(text)


def date_from_string(string):
    """Parse ``YYYYMMDD`` or ``YYYY-MM-DD`` text into a ``datetime.date``.

    Hyphens are ignored and anything after the eighth remaining character
    is disregarded.
    """
    compact = string.replace('-', '')
    year, month, day = (int(compact[start:stop])
                        for start, stop in ((0, 4), (4, 6), (6, 8)))
    return datetime.date(year, month, day)


def date_from_table(table):
    """Parse the date from the last ``/``-separated part of a table name."""
    last_component = table.rsplit('/', 1)[-1]
    return date_from_string(last_component)


def get_srctables(lb=None, ub=None, alltables=None):
    """Return the sorted table names dated within ``[lb, ub]``.

    Args:
        lb: lower date bound (inclusive); defaults to ``get_nextdate()``.
        ub: upper date bound (inclusive); defaults to 2066-01-01.
        alltables: candidate table names; fetched with ``get_alltables()``
            when empty or omitted.
    """
    if not alltables:
        alltables = get_alltables()
    if not lb:
        lb = get_nextdate()
    if not ub:
        ub = datetime.date(2066, 1, 1)
    return sorted(table for table in alltables
                  if lb <= date_from_table(table) <= ub)


def get_alltables():
    """List the daily ``user_sessions`` tables, sorted by name.

    Only names with exactly two ``/``-separated parts whose second part
    starts with ``201`` are kept.
    NOTE(review): the ``201`` prefix excludes any table dated 2020 or
    later -- confirm this is intended.
    """
    names = []
    for info in MapReduce.getTablesInfo('user_sessions/*'):
        parts = info.name.split('/')
        if len(parts) == 2 and parts[1].startswith('201'):
            names.append(info.name)
    names.sort()
    return names


def main():
    """Command-line entry point: compute daily statistics for new dates.

    Without ``--datefrom``/``--dateto`` the script processes, one by one,
    every source table dated after the stored last date, advancing the
    stored date after each successful run, then exits. With both options
    it processes the tables in that date range (in reverse when
    ``datefrom`` is later than ``dateto``), recomputing unless
    ``--noredo`` is given. In both modes ``daily_lock.txt`` is set to
    ``free`` at the end.

    Relies on ``process_date`` and ``check_if_locked``, which are expected
    to be defined elsewhere in this module.
    """
    global __file__                         # to fix stupid
    __file__ = os.path.abspath(__file__)    # __file__ handling
    _file_ = os.path.basename(__file__)     # in python 2
    import arrow
    global browser_dict
    global os_dict
    # get_candidates() reads the module-level ATOM_HEADERS, so the token
    # must be stored there; a plain local assignment would never be seen.
    global ATOM_HEADERS

    ATOM_HEADERS = {'Authorization': 'Token {}'.format(get_token())}

    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--savecands', action='store_true')
    parser.add_argument('--config', default=None)
    parser.add_argument('--datefrom', default=None)
    parser.add_argument('--dateto', default=None)
    parser.add_argument('--noredo', action='store_true')
    parser.add_argument('--nolock', action='store_true')
    args = parser.parse_args()
    start = int((datetime.datetime.now()
                 - datetime.datetime(1970, 1, 1)).total_seconds())

    # Console output is limited to CRITICAL unless --debug is given; the
    # per-run log file always receives everything.
    logger = logging.getLogger(_file_[:-3])
    formatter = logging.Formatter('%(asctime)s | %(message)s')
    ch = logging.StreamHandler()
    logger.setLevel(logging.DEBUG)
    if args.debug:
        ch.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.CRITICAL)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler('{}/logs/{}-{}.log'.format(
        os.path.dirname(__file__), _file_[:-3], start),
        encoding='utf8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # All relative paths below are resolved against the script directory.
    os.chdir(os.path.dirname(__file__))

    if not args.nolock:
        check_if_locked(os.path.abspath('daily_lock.txt'), logger)

    req = requests.get('https://a.yandex-team.ru/api/tree/blob/trunk/arcadia/'
                       'junk/pecheny/monitorings/testids.txt')
    # Keep only the digits of every line; lines without digits collapse
    # to '' and are discarded.
    testids = set(''.join(char for char in line if char.isdigit())
                  for line in req.content.decode('utf8').split('\n'))
    testids.discard('')
    testids = {int(x) for x in testids}
    logger.info('Using testids: {}'.format(', '.join(str(x)
                                                     for x in sorted(testids))))

    # candidates, descriptions = get_unique_candidates(
    #     logger, debug=args.savecands)
    candidates = get_candidates()
    descriptions = {}

    if args.savecands:
        with open('candidate_to_product_{}.json'.format(
                arrow.now().timestamp), 'w') as f:
            f.write(json.dumps(candidates, indent=4, ensure_ascii=False)
                    .encode('utf8'))
        with open('candidate_descriptions_{}.json'.format(
                arrow.now().timestamp), 'w') as f:
            f.write(json.dumps(descriptions, indent=4, ensure_ascii=False)
                    .encode('utf8'))

    # showid.txt rows are "<kind> <code> <name>": kind 2 fills the browser
    # table, kind 3 the OS table.
    with codecs.open('showid.txt', 'r', 'utf8') as f:
        for line in f:
            tabs = line.rstrip().split()
            if len(tabs) == 3 and tabs[0] == '2':
                browser_dict[tabs[1]] = tabs[2]
            if len(tabs) == 3 and tabs[0] == '3':
                os_dict[tabs[1]] = tabs[2]

    # APP_ID_DICT.csv is ';'-separated: column 1 is mapped to column 3.
    app_id_dict = {}
    with codecs.open('APP_ID_DICT.csv', 'r', 'utf8') as f:
        for line in f:
            commas = line.rstrip().split(';')
            if len(commas) < 4:
                continue
            app_id_dict[commas[1]] = commas[3]

    # Close the geobase file as soon as it has been parsed.
    with open('geobase.json') as geobase_file:
        gb = json.load(geobase_file)

    geobase = {int(x['id']): preprocess_geobase(x) for x in gb}

    defaults()

    alltables = get_alltables()
    if not(args.datefrom and args.dateto):
        nextdate = get_nextdate()
        lastdate = nextdate - datetime.timedelta(days=1)
        srctables = get_srctables(lb=nextdate)
        while len(srctables) > 0:
            t = process_date(srctables[0], logger, candidates,
                             descriptions, app_id_dict, testids=testids,
                             geobase=geobase)
            processed_date = date_from_table(srctables[0])
            if t and processed_date >= get_nextdate():
                set_lastdate(processed_date)
                lastdate = processed_date
                nextdate = get_nextdate()
            srctables = get_srctables()
        logger.info("No new data. Latest counted date is {}"
                    .format(lastdate))
        with codecs.open('daily_lock.txt', 'w', 'utf8') as f:
            f.write('free')
        sys.exit(0)
    else:
        lb = date_from_string(args.datefrom)
        ub = date_from_string(args.dateto)
        if lb <= ub:
            srctables = get_srctables(lb=lb, ub=ub)
        else:
            srctables = get_srctables(lb=ub, ub=lb)[::-1]
        for srctable in srctables:
            t = process_date(srctable, logger, candidates,
                             descriptions, app_id_dict,
                             redo=not args.noredo,
                             testids=testids, geobase=geobase)
        with codecs.open('daily_lock.txt', 'w', 'utf8') as f:
            f.write('free')

# Raw event name -> counter name. The values are exactly the keys that
# SetInstallsReduce uses to pick a counter column.
event_dict = {
    'close_stripe': 'cancels',
    'closestripe': 'cancels',
    'close-stripe': 'cancels',
    'close': 'cancels',
    'cancel': 'cancels',
    'show': 'shows',
    'download': 'installs',
    'install': 'installs',
    'click': 'clicks',
}


def table_exists(name):
    """Return True when MapReduce reports a non-zero size for table `name`."""
    table_info = MapReduce.getTableInfo(name)
    return table_info.size > 0

# Credentials sent as HTTP headers with every Stat API request, plus the
# upload and dictionary endpoints.
# The robot login/password can be supplied through the STAT_ROBOT_USER and
# STAT_ROBOT_PASSWORD environment variables; the in-source values remain
# only as a backward-compatible default.
# NOTE(review): a password committed to source should be rotated and the
# default below removed once the environment variables are deployed.
HEADERS = {
    'StatRobotUser': os.environ.get('STAT_ROBOT_USER', 'robot_pecheny'),
    'StatRobotPassword': os.environ.get('STAT_ROBOT_PASSWORD', 'OoGh1Adahy'),
}
URL = 'https://upload.stat.yandex-team.ru/_api/report/data'
DICTAPI = 'https://stat.yandex-team.ru/_api/dictionary'


def push_to_stat(chunk, logger, redo=False, ch=-1,
                 name="Distribution/Others/AtomBanners/v3_daily",
                 candidates_to_title_snippets=None,
                 max_retries=10, retry_delay=60):
    """Upload one TSV chunk to a Stat report, refreshing dictionaries first.

    Before the data is sent, the current contents of ``statdict`` are
    posted back to Stat as ``vcfs::atombanners_*`` dictionaries; which
    dictionaries are posted depends on whether ``name`` contains ``v3``
    or ``v4``.  For v4 the candidate names are replaced by their full
    title from ``candidates_to_title_snippets`` when one is known.

    Args:
        chunk: file-like buffer whose ``getvalue()`` is the TSV payload.
        logger: logger used for progress and error messages.
        redo: when true and ``ch == 1`` the request carries
            ``replace_mask=fielddate`` (presumably so Stat replaces the
            already-uploaded rows for these dates -- confirm against the
            Stat API docs).
        ch: 1-based number of the chunk being pushed.
        name: Stat report path.
        candidates_to_title_snippets: candidate name -> display title.
        max_retries: maximum number of upload attempts.
        retry_delay: seconds to wait between failed attempts.

    Returns:
        True if Stat answered 200 to one of the attempts, False if every
        attempt failed.
    """
    if not candidates_to_title_snippets:
        candidates_to_title_snippets = {}
    data_counters = {
        "name": name,
        "scale": "d",
        "_append_mode": 1,
        "tsv_data": chunk.getvalue(),
    }
    if redo and ch == 1:
        data_counters['replace_mask'] = 'fielddate'
    logger.info('Posting updated dictionaries...')
    if 'v3' in name:
        for x in ['product', 'service', 'lang', 'browser',
                  'description', 'element', 'candidate']:
            post_dictionary_to_stat(
                reverse_dict(statdict[x]),
                'vcfs::atombanners_{}'.format(x))
    elif 'v4' in name:
        for x in ['product', 'service', 'lang', 'browser',
                  'os', 'testid', 'element']:
            post_dictionary_to_stat(
                reverse_dict(statdict[x]),
                'vcfs::atombanners_{}'.format(x))
        revcand = reverse_dict(statdict['candidate'])
        for x in revcand:
            if revcand[x] in candidates_to_title_snippets:
                revcand[x] = candidates_to_title_snippets[revcand[x]]
        post_dictionary_to_stat(revcand,
                                'vcfs::atombanners_candidate')
    for attempt in range(1, max_retries + 1):
        try:
            req = requests.post(URL, headers=HEADERS, data=data_counters)
        except requests.RequestException:
            # A transport error is retried like a non-200 answer instead
            # of aborting the whole run.
            logger.error(traceback.format_exc())
        else:
            logger.info(req.text)
            if req.status_code == 200:
                return True
        # No point in sleeping once the last attempt has failed.
        if attempt < max_retries:
            time.sleep(retry_delay)
    logger.error('Giving up on pushing to {} after {} attempts'
                 .format(name, max_retries))
    return False


def yaml_dump(dictionary):
    """Serialize `dictionary` to block-style YAML with a leading `---`."""
    rendered = yaml.safe_dump(
        dictionary,
        explicit_start=True,
        default_flow_style=False,
    )
    return rendered


def reverse_dict(dictionary):
    """Return a new dict mapping each value of `dictionary` to its key.

    When several keys share a value, the key visited last wins.
    """
    inverted = {}
    for key, value in dictionary.items():
        inverted[value] = key
    return inverted


def yaml_dump_all():
    """Write every `statdict` dictionary, inverted, to `<name>.yaml`."""
    for dict_name, mapping in statdict.items():
        target = '{}.yaml'.format(dict_name)
        with codecs.open(target, 'w', 'utf8') as out:
            inverted = reverse_dict(mapping)
            out.write(yaml_dump(inverted))


def get_set_dictionary(dictionary, value):
    """Return the id stored for `value`, allocating a new one if needed.

    A value not yet present gets the current largest id plus one (0 for
    an empty dictionary) and is stored in `dictionary` in place.
    """
    if value not in dictionary:
        next_id = zmax(set(dictionary.values())) + 1
        dictionary[value] = next_id
    return dictionary[value]


def get_set_dictionary_testid(dictionary, value):
    """Return the id for test id `value`, registering it on first sight.

    On a miss the sentinel entry '1000000' -> 1000000 is (re)written
    first.  Then '_total_' maps to 0, a numeric string maps to its own
    integer value, and anything else gets the current largest id plus
    one.
    """
    if value not in dictionary:
        dictionary['1000000'] = 1000000
        if value == '_total_':
            assigned = 0
        else:
            try:
                assigned = int(value)
            except ValueError:
                assigned = zmax(set(dictionary.values())) + 1
        dictionary[value] = assigned
    return dictionary[value]


def zmax(seq):
    """Return the largest item of sized collection `seq`, or -1 if empty."""
    return -1 if len(seq) == 0 else max(seq)


def process_ident(ident):
    """Dictionary-encode a v3 report key.

    `ident` must be a tab-separated string of exactly eight fields:
    fielddate, product, service, lang, browser, description, element,
    candidate.  Every field except the date is replaced by its id from
    the matching `statdict` dictionary (new values get an id allocated
    by `get_set_dictionary`), and the result is passed to `tabulate`.

    Raises:
        ValueError: if `ident` does not split into exactly eight fields.
    """
    fd, product, service, \
        lang, browser, description, \
        element, candidate = ident.split('\t')
    # The former `except: pdb.set_trace()` debugging trap is gone: in a
    # batch run it swallowed the error and blocked on an interactive
    # debugger instead of failing visibly.
    encoded = [
        get_set_dictionary(statdict[field], raw)
        for field, raw in (
            ('product', product),
            ('service', service),
            ('lang', lang),
            ('browser', browser),
            ('description', description),
            ('element', element),
            ('candidate', candidate),
        )
    ]
    return tabulate(fd, *encoded)


def process_ident_4(ident, candidates, candidates_to_title_snippets):
    """Dictionary-encode a v4 report key.

    `ident` must be a tab-separated string of exactly nine fields:
    fielddate, product, service, lang, browser, os, testid, element,
    candidate.  Every field except the date is replaced by its id from
    the matching `statdict` dictionary; test ids go through
    `get_set_dictionary_testid`, everything else through
    `get_set_dictionary`.  The result is passed to `tabulate`.

    `candidates` and `candidates_to_title_snippets` are not used here;
    they are kept so existing call sites keep working.

    Raises:
        ValueError: if `ident` does not split into exactly nine fields.
    """
    fd, product, service, \
        lang, browser, _os, testid, \
        element, candidate = ident.split('\t')
    # The former `except: pdb.set_trace()` debugging trap is gone: in a
    # batch run it swallowed the error and blocked on an interactive
    # debugger instead of failing visibly.
    product = get_set_dictionary(statdict['product'], product)
    service = get_set_dictionary(statdict['service'], service)
    lang = get_set_dictionary(statdict['lang'], lang)
    browser = get_set_dictionary(statdict['browser'], browser)
    _os = get_set_dictionary(statdict['os'], _os)
    testid = get_set_dictionary_testid(statdict['testid'], testid)
    element = get_set_dictionary(statdict['element'], element)
    candidate = get_set_dictionary(statdict['candidate'], candidate)
    return tabulate(fd, product, service,
                    lang, browser, _os, testid,
                    element, candidate)


def get_dictionary_from_stat(name):
    """Fetch the Stat dictionary `name` as an ``{int id: value}`` dict.

    If Stat answers 200 with an empty dictionary, the local file
    ``<name>.yaml`` is loaded instead.

    Returns:
        The dictionary on a 200 answer.  On any other status the raw
        `requests` response object is returned unchanged.
        NOTE(review): callers that feed the result straight into
        `reverse_dict` will fail on that response object; they should
        check the type or this should raise -- left as is to keep the
        existing contract.
    """
    req = requests.get(
        DICTAPI + '?name={}'.format(name),
        headers=HEADERS
    )
    if req.status_code != 200:
        return req
    result = {int(k): v for k, v in json.loads(req.content).items()}
    if result:
        return result
    # Close the fallback file deterministically instead of leaking the
    # handle until garbage collection.
    with open('{}.yaml'.format(name)) as fallback:
        return yaml.safe_load(fallback)


def post_dictionary_to_stat(dictionary, name):
    """POST `dictionary` (JSON-encoded) to Stat under `name`.

    Returns the `requests` response without inspecting it.
    """
    payload = {
        'name': name,
        'language': '',
        'dictionary': json.dumps(dictionary),
        'editors': ['pecheny', 'riddle']
    }
    return requests.post(DICTAPI, headers=HEADERS, data=payload)


def check_if_locked(lock, logger):
    """Exit if another run holds the lock, otherwise take the lock.

    The lock file `lock` counts as held when it contains 'locked' AND
    more than one running process has this script's file name in its
    command line (processes whose command line mentions 'nolock' or
    'mapreduce' are ignored).  The threshold is "more than one"
    presumably because the current process matches itself -- confirm.
    In that case the process exits with status 0; otherwise the lock
    file is overwritten with a 'locked at <timestamp>' marker.
    """
    with codecs.open(lock, 'r', 'utf8') as f:
        contents = f.read().rstrip()
    script_name = os.path.basename(__file__)
    search_for_process = []
    for p in psutil.process_iter():
        try:
            cmdline = ' '.join(p.cmdline())
        except psutil.Error:
            # The process vanished or cannot be inspected; it cannot be
            # counted as a competing run, and must not crash the check.
            continue
        if (script_name in cmdline
                and 'nolock' not in cmdline
                and 'mapreduce' not in cmdline):
            search_for_process.append(p)
    if 'locked' in contents and len(search_for_process) > 1:
        logger.info('Process is locked, exiting...')
        sys.exit(0)
    else:
        with codecs.open(lock, 'w', 'utf8') as f:
            f.write('locked at {}'.format(datetime.datetime.now()))


def _load_stat_dictionaries():
    """Reload every `statdict` dictionary from Stat.

    Candidate titles stored in Stat may look like 'name | extra'; only
    the part before ' | ' is used as the candidate key in `statdict`.

    Returns:
        dict mapping that short candidate name to the full stored title.
    """
    for x in ['product', 'service', 'lang', 'browser',
              'description', 'os', 'testid', 'element']:
        statdict[x] = reverse_dict(get_dictionary_from_stat(
            'vcfs::atombanners_{}'.format(x)))
    candidates_from_stat = get_dictionary_from_stat(
        'vcfs::atombanners_candidate')
    candidates_to_title_snippets = {}
    for x in candidates_from_stat:
        # unicode() guards against non-string titles; it is applied on
        # every reload, not only on the first one.
        full_title = unicode(candidates_from_stat[x])
        short_name = full_title.split(' | ')[0]
        candidates_to_title_snippets[short_name] = full_title
        candidates_from_stat[x] = short_name
    statdict['candidate'] = reverse_dict(candidates_from_stat)
    return candidates_to_title_snippets


def _new_stat_chunk(*columns):
    """Return a fresh in-memory TSV buffer already holding the header row."""
    chunk = StringIO()
    print(tabulate(*columns), file=chunk)
    return chunk


def process_date(srctable, logger,
                 candidates, descriptions, app_id_dict,
                 redo=False, testids=None, geobase=None):
    """Run the whole daily pipeline for one source table.

    Stages, in order:
      1. Build and sort the per-day table from fastlogs, the export
         access log and both mobile metrika logs (skipped when the table
         exists, unless `redo`).  The stage is retried until it succeeds.
      2. Reduce it into the v4 intermediate and final stat tables.
      3. Drop the temporary tables and sort the true-installs table.
      4. Upload the v4 stat table to Stat in chunks.
      5. If a v3 stat table exists, upload it too and push per-element
         counters and rates to razladki.

    Args:
        srctable: name of the source fastlogs table; its date is taken
            via `date_from_table`.
        logger: logger for progress and errors.
        candidates: passed to `PromolibaMap` and `process_ident_4`.
        descriptions: passed to `SetInstallsReduce`.
        app_id_dict: passed to `PromolibaMap`.
        redo: rebuild every table and re-upload even if they exist.
        testids: passed to `FastlogsReduce`; defaults to an empty set.
        geobase: passed to `FastlogsReduce`; defaults to an empty dict.

    Returns:
        Always True.
    """
    # Fresh containers per call instead of shared mutable defaults.
    if testids is None:
        testids = set()
    if geobase is None:
        geobase = {}
    logger.info('Source table is {}'.format(srctable))
    dftable = date_from_table(srctable)
    ts = int((dftable - datetime.date(1970, 1, 1)).total_seconds())
    tdate = dftable.strftime('%Y%m%d')
    dsttable = 'tmp/pers/set_installs_daily_sh_{}'.format(tdate)
    tmptable = dsttable + '_tmp'
    tmptable4 = dsttable + '_tmp4'
    endtable = dsttable + '_stat'
    endtable4 = dsttable + '_stat4'
    extdata = 'export_access_log/{}'.format(tdate)
    mobilemetrika = 'metrika_mobile_install_log/{}'.format(tdate)
    mobilemetrika_pr = 'metrika_mobile_log/{}'.format(tdate)
    trueinstalls = 'tmp/pers/trueinstalls_for_pool_{}'.format(tdate)

    # --- Stage 1: assemble the per-day table -------------------------
    if not table_exists(dsttable) or redo:
        success = False
        # NOTE(review): this retries forever and without delay on a
        # persistent failure; only the error log shows it is stuck.
        while not success:
            try:
                logger.info('Mapping from {} to {}'.format(
                    srctable, dsttable))
                fastlogs_reduce = FastlogsReduce(testids, geobase)
                MapReduce.runReduce(fastlogs_reduce, srcTable=srctable,
                                    dstTables=[dsttable, dsttable + '.errors'],
                                    username='personalization')
                soft_export_map = SoftExportMap(dftable)
                logger.info('Mapping from {} to {}'.format(
                    extdata, dsttable))
                MapReduce.runMap(soft_export_map,
                                 srcTable=extdata,
                                 dstTable=dsttable,
                                 appendMode=True,
                                 username='personalization')
                promoliba_map = PromolibaMap(candidates, app_id_dict)
                logger.info('Mapping from {} to {}'.format(
                    mobilemetrika_pr, dsttable))
                MapReduce.runMap(promoliba_map,
                                 srcTable=mobilemetrika_pr,
                                 dstTable=dsttable,
                                 appendMode=True,
                                 username='personalization')
                logger.info('Mapping from {} to {}'.format(
                    mobilemetrika, dsttable))
                MapReduce.runMap(mobilemetrika_map,
                                 srcTable=mobilemetrika,
                                 dstTable=dsttable,
                                 appendMode=True,
                                 username='personalization')
                logger.info('Sorting {}'.format(dsttable))
                MapReduce.sortTable(dsttable, username='personalization')
                success = True
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C and
                # sys.exit() can still break out of the retry loop.
                logger.error(traceback.format_exc())

    # --- Stage 2: reduce into the v4 stat tables ---------------------
    # Only the v4 tables are built here; the v3 `endtable` is uploaded
    # further down only if it already exists.
    if (table_exists(dsttable) and not table_exists(endtable4)) or redo:
        set_installs_reduce = SetInstallsReduce(browser_dict,
                                                os_dict,
                                                descriptions,
                                                event_dict,
                                                dftable.strftime('%Y-%m-%d'))
        if not table_exists(tmptable4) or redo:
            # The log names the table that is actually written.
            logger.info('Mapping from {} to {}'.format(
                dsttable, tmptable4))
            MapReduce.runReduce(set_installs_reduce, srcTable=dsttable,
                                dstTables=[tmptable4, trueinstalls],
                                username='personalization')
        if not table_exists(endtable4) or redo:
            logger.info('Reducing from {} to {}'.format(tmptable4,
                                                        endtable4))
            stat_reduce = StatReduce(True)
            MapReduce.runReduce(stat_reduce, srcTable=tmptable4,
                                dstTable=endtable4, subkeyMode=False,
                                username='personalization')
            logger.info('Sorting {}'.format(endtable4))
            MapReduce.sortTable(endtable4, username='personalization')

    # --- Stage 3: cleanup --------------------------------------------
    MapReduce.dropTable(tmptable4, username='personalization')
    MapReduce.dropTable(tmptable, username='personalization')
    MapReduce.dropTable(dsttable + '.errors', username='personalization')
    logger.info('Sorting {}'.format(trueinstalls))
    MapReduce.sortTable(trueinstalls, username='personalization')

    # --- Stage 4: upload the v4 report -------------------------------
    v4_report = "Distribution/Others/AtomBanners/v4_daily"
    v4_columns = ('fielddate', 'product', 'service',
                  'lang', 'browser', 'os', 'testid', 'element', 'candidate',
                  'shows', 'installs', 'cancels', 'clicks', 'trueinstalls')
    if table_exists(endtable4):
        counter = Counter()
        chunk = _new_stat_chunk(*v4_columns)
        i = 0
        ch = 1
        prevkey = ''
        candidates_to_title_snippets = _load_stat_dictionaries()
        logger.info('Getting records from {}...'.format(endtable4))
        for rec in MapReduce.getSample(endtable4, count=None):
            rec1 = deutf8ify(rec)
            # Cut a chunk only between two different keys, so rows that
            # share a key never straddle two uploads.
            if i >= 1500000 and rec1.key != prevkey and prevkey:
                logger.info('Pushing chunk {} to stat...'.format(ch))
                push_to_stat(chunk, logger, redo=redo, ch=ch,
                             name=v4_report,
                             candidates_to_title_snippets=candidates_to_title_snippets)
                # Increment AFTER the push: the first chunk must go out
                # as ch == 1, the only value for which push_to_stat adds
                # replace_mask on redo.
                ch += 1
                # The push posted updated dictionaries; read them back.
                candidates_to_title_snippets = _load_stat_dictionaries()
                chunk = _new_stat_chunk(*v4_columns)
                i = 0
            print(tabulate(process_ident_4(rec1.key,
                                           candidates,
                                           candidates_to_title_snippets),
                           rec1.value), file=chunk)
            i += 1
            # accumulate data for razladki
            # v4 key layout: fd, product, service, lang, browser, os,
            # testid, element, candidate -> element is field 7.
            # NOTE(review): these v4 tallies are never pushed; the
            # razladki push below runs only for the v3 table, which
            # builds its own counter.
            key_fields = rec1.key.split('\t')
            element = key_fields[7]
            lang = key_fields[3]
            shows, installs, cancels, clicks, trueinstalls = [
                int(x) for x in rec1.value.split('\t')]
            if element != '_total_' and lang != '_total_':
                counter[(element, 'show', lang)] += shows
                counter[(element, 'click', lang)] += clicks
                counter[(element, 'cancel', lang)] += cancels
                counter[(element, 'install', lang)] += installs
            prevkey = rec1.key
        logger.info('Pushing chunk {} to stat...'.format(ch))
        push_to_stat(chunk, logger, redo=redo, ch=ch,
                     name=v4_report,
                     candidates_to_title_snippets=candidates_to_title_snippets)

    # --- Stage 5: upload the v3 report and feed razladki -------------
    v3_columns = ('fielddate', 'product', 'service',
                  'lang', 'browser', 'description', 'element', 'candidate',
                  'shows', 'installs', 'cancels', 'clicks')
    if table_exists(endtable):
        counter = Counter()
        chunk = _new_stat_chunk(*v3_columns)
        i = 0
        ch = 1
        prevkey = ''
        logger.info('Getting records from {}...'.format(endtable))
        for rec in MapReduce.getSample(endtable, count=None):
            rec1 = deutf8ify(rec)
            if i >= 30000000 and rec1.key != prevkey and prevkey:
                logger.info('Pushing chunk {} to stat...'.format(ch))
                push_to_stat(chunk, logger, redo=redo, ch=ch)
                # See the v4 loop: increment after the push so the first
                # chunk is sent as ch == 1.
                ch += 1
                chunk = _new_stat_chunk(*v3_columns)
                i = 0
            print(tabulate(process_ident(rec1.key), rec1.value), file=chunk)
            i += 1
            # accumulate data for razladki
            # v3 key layout: fd, product, service, lang, browser,
            # description, element, candidate -> element is field 6.
            key_fields = rec1.key.split('\t')
            element = key_fields[6]
            lang = key_fields[3]
            shows, installs, cancels, clicks = [
                int(x) for x in rec1.value.split('\t')]
            if element != '_total_' and lang != '_total_':
                counter[(element, 'show', lang)] += shows
                counter[(element, 'click', lang)] += clicks
                counter[(element, 'cancel', lang)] += cancels
                counter[(element, 'install', lang)] += installs
            prevkey = rec1.key

        logger.info('Pushing chunk {} to stat...'.format(ch))
        push_to_stat(chunk, logger, redo=redo, ch=ch)

        config = {'razladki':
                  'http://launcher.razladki.yandex-team.ru/'
                  'save_new_data/SearchPortalDistribution'}
        # Counter keys are (element, event, lang).
        for param in counter:
            desc = '{lang}_{elem}_set_{event}s_fastlogs'.format(
                lang=param[2],
                elem=param[0],
                event=param[1])
            value = counter[param]
            logger.info('Pushing to razladki: {}, {}'
                        .format(desc, value))
            push_to_razladki(config, desc, value, ts=ts)
            if param[1] in {'click', 'install', 'cancel', 'close'}:
                try:
                    rdesc = '{lang}_{elem}_set_{event}srate_daily'.format(
                        lang=param[2],
                        elem=param[0],
                        event=param[1])
                    showvalue = counter[(param[0], 'show', param[2])]
                    # Zero shows raises ZeroDivisionError, which is
                    # logged below and the rate is skipped.
                    rvalue = value / float(showvalue)
                    logger.info('Pushing to razladki: {}, {}'
                                .format(rdesc, rvalue))
                    push_to_razladki(config, rdesc, rvalue, ts=ts)
                except Exception:
                    logger.info(traceback.format_exc())
            # NOTE(review): no 'close' event is ever added to `counter`
            # above, so this branch looks unreachable; it also formats
            # `elem` from the event name rather than the element.
            if (param[1] == 'close'
                    and (param[0], 'click', param[2]) in counter):
                try:
                    rdesc = '{lang}_{elem}_set_closeclickrate_daily'.format(
                        lang=param[2],
                        elem=param[1])
                    showvalue = counter[(param[0], 'click', param[2])]
                    rvalue = value / float(showvalue)
                    logger.info('Pushing to razladki: {}, {}'
                                .format(rdesc, rvalue))
                    push_to_razladki(config, rdesc, rvalue, ts=ts)
                except Exception:
                    logger.info(traceback.format_exc())
        return True
    return True


# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
