#!/usr/bin/env python
#! -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function
import sys
import os
import re
# from tqdm import tqdm
import codecs
import shutil
import contextlib
import itertools
import logging
import toml
import pdb
import argparse
import tempfile
import traceback
import subprocess
import mapreducelib
import threading
import urllib
import time
import psutil
import requests
from StringIO import StringIO
import json
from time import sleep
try:
    import thread
except ImportError:
    import _thread as thread
from mapreducelib import MapReduce, Record
import urlparse
from collections import defaultdict, Counter, namedtuple
import datetime
from pecheny.mrdef import defaults
from pecheny.moncommons import push_to_razladki
import yaml

# Lookup from the code found at showid[4:6] to a browser name; filled by
# main() from showid.txt and used as the fallback table in get_browser().
browser_dict = {}
# Lookup from the code found at showid[6:8] to an OS name; filled by
# main() from showid.txt and used as the fallback table in get_os().
os_dict = {}
# Per-dimension dictionaries: dimension name -> {string value: integer id}.
# Entries are created on demand by get_set_dictionary*().
statdict = defaultdict(lambda: {})


def deutf8ify(rec):
    """Return `rec` with UTF-8 byte strings decoded to unicode.

    A ``mapreducelib.SubkeyedRecord`` is turned into a new ``Record`` whose
    key, subkey and value are decoded (undecodable bytes are replaced).
    A plain byte string is decoded directly. Anything else is returned
    unchanged.
    """
    if isinstance(rec, mapreducelib.SubkeyedRecord):
        decoded = [
            field if isinstance(field, unicode)
            else field.decode('utf8', errors='replace')
            for field in (rec.key, rec.subkey, rec.value)
        ]
        return Record(decoded[0], decoded[1], decoded[2])
    if isinstance(rec, str):
        return rec.decode('utf8', errors='replace')
    return rec


def utf8ify(rec):
    """Return `rec` with unicode strings encoded to UTF-8 bytes.

    A ``mapreducelib.SubkeyedRecord`` is modified in place (key, subkey and
    value are re-assigned when they are unicode) and returned. A bare
    unicode string is encoded. Anything else is returned unchanged.
    """
    if isinstance(rec, mapreducelib.SubkeyedRecord):
        for attr in ('key', 'subkey', 'value'):
            current = getattr(rec, attr)
            if isinstance(current, unicode):
                setattr(rec, attr, current.encode('utf8'))
        return rec
    if isinstance(rec, unicode):
        return rec.encode('utf8')
    return rec


def tryint(string):
    """Convert `string` to int, returning -1 when it cannot be converted.

    Only conversion failures are swallowed; unrelated exceptions such as
    KeyboardInterrupt are no longer silenced.
    """
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        return -1


def parsevars(_vars):
    """Parse a comma-separated ``key=value`` list into a dict.

    * Items without ``=`` are stored with the value ``'SINGLE'``.
    * One leading ``-`` is stripped from keys.
    * Values of keys starting with ``clid`` are collected in the
      ``'clids'`` list instead of being stored under their own key.

    The result is a defaultdict that yields ``''`` for missing keys.
    """
    parsed = {'clids': []}
    for item in _vars.split(','):
        name, separator, rest = item.partition('=')
        if not separator:
            parsed[item] = 'SINGLE'
            continue
        if name.startswith('-'):
            name = name[1:]
        if name.startswith('clid'):
            parsed['clids'].append(rest)
        else:
            parsed[name] = rest
    return defaultdict(lambda: '', parsed)


def parseparams(value):
    """Parse a tab-separated ``key=value`` list into a dict.

    Items without ``=`` are stored with the value ``'SINGLE'``; only the
    first ``=`` separates key from value. The result is a defaultdict that
    yields ``''`` for missing keys.
    """
    parsed = {}
    for item in value.split('\t'):
        name, separator, rest = item.partition('=')
        if separator:
            parsed[name] = rest
        else:
            parsed[item] = 'SINGLE'
    return defaultdict(lambda: '', parsed)


def tabulate(*args):
    """Format every argument with format() and join them with tabs."""
    return '\t'.join(format(arg) for arg in args)


def ntabulate(*args):
    """Like tabulate(), with a trailing newline appended."""
    row = tabulate(*args)
    return row + '\n'


def gettld(url):
    """Return the last dot-separated label of the URL's network location.

    A ``http://`` scheme is prepended when the input contains no ``//`` so
    that urlparse recognises the host. Anything after a ``:`` in the last
    label (for example a port) is dropped.
    """
    if '//' not in url:
        url = 'http://' + url
    netloc = urlparse.urlparse(url).netloc
    last_label = netloc.split('.')[-1]
    return last_label.split(':')[0]


def table_exists(name):
    """Return True when the MapReduce table `name` reports a size above 0."""
    info = MapReduce.getTableInfo(name)
    return info.size > 0

# Authentication headers sent with every request to the stat service.
# NOTE(review): the robot password is hard-coded in source; it should be
# loaded from configuration or a secret store and the committed value rotated.
HEADERS = {'StatRobotUser': 'robot_pecheny',
           'StatRobotPassword': 'OoGh1Adahy'}
# Endpoint that push_to_stat() posts report data to.
URL = 'https://upload.stat.yandex-team.ru/_api/report/data'
# Endpoint used to read and write stat dictionaries.
DICTAPI = 'https://stat.yandex-team.ru/_api/dictionary'


def push_to_stat(chunk, logger, redo=False, ch=-1,
                 name="Distribution/Others/AtomBanners/v3",
                 candidates_to_title_snippets=None, debug=False):
    """Upload dictionaries and one chunk of TSV data to the stat report.

    Args:
        chunk: object with ``getvalue()`` returning the TSV payload.
        logger: logger used for progress and response messages.
        redo: when true and ``ch == 1`` the upload carries
            ``replace_mask='fielddate'``.
        ch: chunk number (only compared against 1, see `redo`).
        name: report name; the ``v3``/``v4`` suffix selects which
            dictionaries are posted before the data.
        candidates_to_title_snippets: optional mapping used (v4 only) to
            replace candidate names in the posted candidate dictionary.
        debug: when true, drop into pdb after a failed upload attempt.
            Previously the debugger was entered unconditionally, which
            cannot work in unattended runs.

    Returns:
        ``('Error in data', <response body>)`` when the service reports a
        data error, otherwise ``True``.
        NOTE(review): ``True`` is also returned when all retries ended with
        a non-200 status (kept for backward compatibility); that case is
        now logged as an error.
    """
    if not candidates_to_title_snippets:
        candidates_to_title_snippets = {}
    data_counters = {
        "name": name,
        "scale": "i",
        "_append_mode": 1,
        "tsv_data": chunk.getvalue(),
    }
    if redo and ch == 1:
        data_counters['replace_mask'] = 'fielddate'

    logger.info('Posting updated dictionaries...')
    if name.endswith('v3'):
        for x in ['product', 'service', 'lang', 'browser',
                  'description', 'element', 'candidate']:
            post_dictionary_to_stat(
                reverse_dict(statdict[x]),
                'vcfs::atombanners_{}'.format(x))
    elif name.endswith('v4'):
        for x in ['product', 'service', 'lang', 'browser',
                  'os', 'testid', 'element']:
            post_dictionary_to_stat(
                reverse_dict(statdict[x]),
                'vcfs::atombanners_{}'.format(x))
        # Candidate ids are shown with their "id | title | snippet" text
        # whenever one is known.
        revcand = reverse_dict(statdict['candidate'])
        for x in revcand:
            if revcand[x] in candidates_to_title_snippets:
                revcand[x] = candidates_to_title_snippets[revcand[x]]
        post_dictionary_to_stat(revcand, 'vcfs::atombanners_candidate')
    yaml_dump_all()

    max_retries = 5
    retries = 0
    while True:
        req = requests.post(URL, headers=HEADERS, data=data_counters)
        logger.info(req.text)
        # Success and "Error in data" are both terminal: retrying the same
        # payload cannot fix a data error, so do not sleep before leaving.
        if req.status_code == 200 or b'Error in data' in req.content:
            break
        retries += 1
        if retries >= max_retries:
            break
        if debug:
            pdb.set_trace()
        time.sleep(60)

    if b'Error in data' in req.content:
        return ('Error in data', req.content)
    if req.status_code != 200:
        logger.error('Upload failed after {} attempts, last status {}'
                     .format(retries, req.status_code))
    return True


def reqlist(*listnames):
    """Build the URL-quoted JSON selector for the given list names.

    The JSON has the shape ``{"atom-candidates": {<listname>: null, ...}}``.
    """
    selector = {
        'atom-candidates': dict((listname, None) for listname in listnames)
    }
    encoded = json.dumps(selector).encode('utf8')
    return urllib.quote(encoded).decode('utf8')

# URL template for the candidates service; the single `{}` placeholder is
# filled with the quoted JSON selector produced by reqlist().
skeleton = ('http://querysearch-atom.search.yandex.net/yandsearch'
            '?ms=querysearch:json:3'
            '&rearr=qd_struct_keys={}'
            '&waitall=da'
            '&timeout=1000000')
# Candidate lists that get_unique_candidates() skips entirely.
badlists = {'news', 'service_block_ru'}


def _summarize_candidates(data):
    """Turn raw candidate-list entries into (summaries, descriptions) dicts."""
    result = {}
    descriptions = {}

    def append_part(candidate_id, part):
        # Extend an existing summary, or start one when the candidate had
        # neither title nor snippet (this used to raise KeyError).
        if candidate_id in result:
            result[candidate_id] += ' | {}'.format(part)
        else:
            result[candidate_id] = '{}'.format(part)

    for delem in data:
        if 'Key' not in delem:
            continue
        listname = delem['Key'][0]
        if listname in badlists:
            continue
        for elem in delem['Value']:
            try:
                candidate_id = elem['internal-url'].split('/')[-1]
            except (IndexError, KeyError):
                continue
            if 'title' in elem and elem['title']:
                result[candidate_id] = elem['title']
            if 'snippet' in elem and elem['snippet']:
                if candidate_id not in result:
                    result[candidate_id] = elem['snippet']
                else:
                    result[candidate_id] += ' | {}'.format(elem['snippet'])
            if 'aux-data' in elem and 'button_yes' in elem['aux-data']:
                append_part(candidate_id, elem['aux-data']['button_yes'])
            if 'aux-data' in elem and 'button_no' in elem['aux-data']:
                append_part(candidate_id, elem['aux-data']['button_no'])
            if '__textauthor' in elem:
                append_part(candidate_id, elem['__textauthor'])
            if candidate_id not in descriptions:
                descriptions[candidate_id] = 'empty'
            try:
                descriptions[candidate_id] = elem['aux-data']['_description']
            except KeyError:
                pass
    return result, descriptions


def get_unique_candidates(logger=None, debug=False):
    """Fetch all candidate lists and summarise every candidate.

    First requests the ``all_keys`` list, then requests each key and keeps
    the first ``Data`` entry of every response.

    Args:
        logger: accepted for interface compatibility; not used.
        debug: when true, the raw fetched data is dumped to
            ``candidates_<timestamp>.json`` in the current directory.

    Returns:
        ``(result, descriptions)`` where ``result`` maps candidate id (last
        path segment of ``internal-url``) to a ``' | '``-joined text of
        title, snippet, button labels and text author, and ``descriptions``
        maps candidate id to ``aux-data._description`` (``'empty'`` when it
        is missing). Lists named in ``badlists`` are skipped.
    """
    import arrow
    req = requests.get(skeleton.format(reqlist('all_keys')))
    allkeys = json.loads(req.content)['Data'][0]['Value']
    data = []
    for key in allkeys:
        req1 = requests.get(skeleton.format(reqlist(key)))
        data.extend(json.loads(req1.content)['Data'][:1])
    if debug:
        with open('candidates_{}.json'.format(
                arrow.now().timestamp), 'w') as f:
            f.write(json.dumps(data, indent=4, ensure_ascii=False)
                    .encode('utf8', errors='replace'))
    return _summarize_candidates(data)


def remove_slashes(s1):
    """Return `s1` without its leading and trailing ``/`` characters."""
    start = 0
    end = len(s1)
    # Move both bounds inward past the slashes, then slice once.
    while start < end and s1[start] == '/':
        start += 1
    while end > start and s1[end - 1] == '/':
        end -= 1
    return s1[start:end]

# Prefixes that normalize_host() strips repeatedly from the start of a URL.
prefices = ['http://', 'https://', 'www.', 'm.']
# NOTE(review): not referenced anywhere in the visible part of this file.
banned_hosts = ['kokoc.com', 'nblu.ru', 'cmle.ru',
                'omg5.ru', 'xorod.ru', 'sofro.ru', 'dresk.ru', 'dd34.ru']


def normalize_host(s1):
    """Reduce a referer URL to a short service identifier.

    Steps: strip the known prefixes (scheme, ``www.``, ``m.``), drop the
    query string and surrounding slashes, keep ``host/first-segment`` for
    hosts whose first label is ``yandex`` and just the host otherwise,
    rename ``yandsearch`` to ``search``, remove colons, map everything that
    does not mention ``yandex`` to ``'other'`` and cut to 25 characters.

    >>> print(normalize_host('https://yandex.ru/search/?text=Skoda'))
    yandex.ru/search
    >>> print(normalize_host('https://yandex.ru/yandsearch/?text=Skoda'))
    yandex.ru/search
    >>> print(normalize_host('https://yandex.ru/?utm_source=blabla'))
    yandex.ru
    >>> print(normalize_host('http://maps.yandex.ru/213/moscow/?text=1234&sll=37.620393%2C55.753960&sspn=1.128845%2C0.535158&ll=37.718247%2C55.732813&z=10'))
    maps.yandex.ru
    """
    host = s1
    # Keep making passes over the prefixes until a pass strips nothing.
    stripped = True
    while stripped:
        stripped = False
        for prefix in prefices:
            if host.startswith(prefix):
                host = host[len(prefix):]
                stripped = True
    query_start = host.find('?')
    if query_start != -1:
        host = host[:query_start]
    host = remove_slashes(host)
    segments = host.split('/')
    if len(segments) > 1:
        if segments[0].split('.')[0] == 'yandex':
            host = segments[0] + '/' + segments[1]
        else:
            host = segments[0]
    host = host.replace('yandsearch', 'search').replace(':', '')
    if 'yandex' not in host:
        host = 'other'
    return host[:25]

# def fastlogs_reduce(key, recs):
#     from collections import defaultdict
#     key = deutf8ify(key)
#     shows = {}
#     for rec1 in recs:
#         rec = deutf8ify(rec1)
#         params = parseparams(rec.value)
#         pvars = parsevars(params['vars'])
#         if (params['type'] == 'TECH'
#               and params['path'].startswith('tech.portal-ads.')
#               and pvars['reqid']): # TODO: return -ATOMS- check
#             if pvars['eventtype'] == 'show':
#                 referer = normalize_host(params['referer'])
#                 showid = pvars['showid'][:8]
#                 product = pvars['product']
#                 shows[pvars['reqid']] = {
#                 'referer': referer,
#                 'showid': showid,
#                 'product': product,
#                 'score': pvars['score']
#                 }
#                 yield utf8ify(
#                     Record(
#                         rec.key,
#                         rec.subkey,
#                         tabulate(params['path'][16:]
#                             + ('.training'
#                                 if pvars['score'] == '100000' else ''),
#                             pvars['eventtype'],
#                             params['dom-region'],
#                             pvars['bannerid'],
#                             showid,
#                             referer,
#                             product), tableIndex=0
#                         ))
#             elif pvars['reqid'] in shows:
#                 showid = shows[pvars['reqid']]['showid']
#                 referer = shows[pvars['reqid']]['referer']
#                 product = shows[pvars['reqid']]['product']
#                 score = shows[pvars['reqid']]['score']
#                 yield utf8ify(
#                     Record(
#                         rec.key,
#                         rec.subkey,
#                         tabulate(params['path'][16:]
#                             + ('.training'
#                                 if score == '100000' else ''),
#                             pvars['eventtype'],
#                             params['dom-region'],
#                             pvars['bannerid'],
#                             showid,
#                             referer,
#                             product), tableIndex=0
#                         ))
#             else:
#                 yield utf8ify(
#                     Record(
#                         rec.key,
#                         rec.subkey,
#                         rec.value, tableIndex=1
#                         ))


def fastlogs_reduce(key, recs):
    """Reduce step: normalise banner events and attach show data to them.

    Only records with ``type == 'TECH'``, a path starting with
    ``tech.portal-ads.`` and a non-empty ``reqid`` are considered; all
    other records are dropped.

    * ``show`` events are remembered per reqid and emitted to table 0.
    * Other events whose reqid has shows are emitted to table 0 using the
      showid/referer/product/score of the first remembered show with the
      same bannerid. Nothing is emitted when no bannerid matches.
    * Events whose reqid has no show so far are written unchanged to
      table 1.

    Table 0 values are tab-separated: element, eventtype, dom-region,
    bannerid, showid, referer, product.

    Args:
        key: reduce key (unused).
        recs: iterable of records for that key.
    """
    from collections import defaultdict
    shows = defaultdict(list)

    def make_row(rec, params, eventtype, score, bannerid,
                 showid, referer, product):
        # [16:] cuts the 'tech.portal-ads.' prefix off the path.
        element = params['path'][16:]
        if score == '100000':
            element += '.training'
        return utf8ify(Record(
            rec.key,
            rec.subkey,
            tabulate(element, eventtype, params['dom-region'],
                     bannerid, showid, referer, product),
            tableIndex=0))

    for rec1 in recs:
        rec = deutf8ify(rec1)
        params = parseparams(rec.value)
        pvars = parsevars(params['vars'])
        if not (params['type'] == 'TECH'
                and params['path'].startswith('tech.portal-ads.')
                and pvars['reqid']):  # TODO: return -ATOMS- check
            continue
        if pvars['eventtype'] == 'show':
            referer = normalize_host(params['referer'])
            shows[pvars['reqid']].append({
                'referer': referer,
                'showid': pvars['showid'],
                'product': pvars['product'],
                'score': pvars['score'],
                'bannerid': pvars['bannerid'],
            })
            yield make_row(rec, params, pvars['eventtype'],
                           pvars['score'], pvars['bannerid'],
                           pvars['showid'], referer, pvars['product'])
        elif pvars['reqid'] in shows:
            for show in shows[pvars['reqid']]:
                if pvars['bannerid'] != show['bannerid']:
                    continue
                # The former show['test-ids'] lookup was removed: that key
                # is never stored, so it raised KeyError for every match.
                yield make_row(rec, params, pvars['eventtype'],
                               show['score'], show['bannerid'],
                               show['showid'], show['referer'],
                               show['product'])
                break
        else:
            yield utf8ify(
                Record(
                    rec.key,
                    rec.subkey,
                    rec.value, tableIndex=1
                ))


class Moscow(datetime.tzinfo):
    """Fixed-offset tzinfo: UTC+3 with no daylight-saving shift."""

    _UTC_OFFSET = datetime.timedelta(hours=3)
    _NO_DST = datetime.timedelta(0)

    def utcoffset(self, dt):
        """Return the constant +3 hour offset from UTC."""
        return self._UTC_OFFSET

    def tzname(self, dt):
        """Return the zone name."""
        return "Europe/Moscow"

    def dst(self, dt):
        """Return a zero DST adjustment."""
        return self._NO_DST

    def __repr__(self):
        return "Europe/Moscow (UTC+3)"


class SetInstallsMap(object):
    """Map step that expands fastlogs_reduce rows into counter rows.

    Every valid input row is emitted once per combination of real value
    and ``'_total_'`` across all dimensions, so that the following reduce
    can sum the counters for every slice.
    """

    def __init__(self, bd, od, descriptions, event_dict, fielddate):
        """Store lookup tables and format the field date.

        Args:
            bd: browser lookup passed to get_browser().
            od: OS lookup passed to get_os().
            descriptions: candidate id -> description mapping (stored; not
                used by the rows currently emitted).
            event_dict: raw event type -> counter name mapping.
            fielddate: unix timestamp; rendered in UTC+3 as
                ``YYYY-MM-DD HH:MM:SS``.
        """
        self.bd = bd
        self.od = od
        self.descriptions = descriptions
        self.event_dict = event_dict
        self.fielddate = datetime.datetime.fromtimestamp(
            fielddate, Moscow()).strftime('%Y-%m-%d %H:%M:%S')

    def __call__(self, rec):
        """Yield the counter records (tableIndex=1) for one input record.

        The input value is tab-separated: element, eventtype, lang,
        bannerid, showid, host, product. Rows with fewer than three
        columns, an event type missing from ``event_dict`` or an empty
        bannerid yield nothing.
        """
        tabs = deutf8ify(rec).value.split('\t')

        def column(index, default):
            # Optional trailing columns fall back to a default value.
            return tabs[index] if len(tabs) > index else default

        try:
            element = tabs[0]
            eventtype = self.event_dict[tabs[1]]
            lang = tabs[2]
        except (IndexError, KeyError):
            # Invalid row: nothing is emitted for it.
            return
        bannerid = column(3, 'empty')
        showid = column(4, 'empty')
        host = column(5, 'unknown')
        product = column(6, 'unknown')

        # Counter columns are: shows, installs, cancels, clicks.
        value = {
            'shows':     tabulate(1, 0, 0, 0),
            'installs':  tabulate(0, 1, 0, 0),
            'cancels':   tabulate(0, 0, 1, 0),
            'clicks':    tabulate(0, 0, 0, 1),
            'invalid':  '0'
        }[eventtype]
        product = (product if product else 'empty')
        service = (host if host else 'empty')
        lang = (lang if lang else 'empty')
        browser = get_browser(showid, bd=self.bd)
        # Named os_name so the local does not shadow the `os` module.
        os_name = get_os(showid, od=self.od)
        testid = 'empty'
        element = (element if element else 'empty')
        candidate = bannerid
        if not candidate:
            return

        # Only the v4 layout (tableIndex=1) is emitted; the older v3
        # layout that targeted tableIndex=0 is disabled.
        product_variants = (
            (product, '_allextensions_', '_total_')
            if product in {'set', 'home', 'sethome', 'vb', 'vbch'}
            else (product, '_total_'))
        for comb in itertools.product(
            product_variants,
            (service, '_total_'),
            (lang, '_total_'),
            (browser, '_total_'),
            (os_name, '_total_'),
            (testid, '_total_'),
            (element, '_total_'),
            (candidate, '_total_'),
        ):
            yield utf8ify(Record(
                tabulate(self.fielddate, *comb),
                '',
                value,
                tableIndex=1))


def normalize_key(key):
    """Drop every character outside the allowed set from `key`.

    Kept: Latin and Cyrillic letters, digits, space, tab and ``_ - . / :``.
    """
    forbidden = re.compile(r'[^a-zA-ZА-Яа-яЁё_\-\. \t0-9/:]')
    return forbidden.sub('', key)


def stat_reduce(key, recs):
    """Reduce step: sum the four counters of all records for one key.

    Each record value is expected to hold tab-separated integers (shows,
    installs, cancels, clicks). A malformed value is skipped from its
    first unparsable column on; the columns before it are still counted,
    as before. Only parse errors are tolerated now; other exceptions
    propagate.

    Yields one record: the normalised key with the tab-separated totals.
    """
    key = normalize_key(deutf8ify(key))
    totals = [0, 0, 0, 0]
    for rec in recs:
        try:
            tabs = rec.value.split('\t')
            for index in range(4):
                totals[index] += int(tabs[index])
        except (AttributeError, IndexError, TypeError, ValueError):
            continue
    yield utf8ify(Record(
        key,
        tabulate(*totals)
    ))


def get_browser(showid, bd=None):
    """Look up the browser name for the code at ``showid[4:6]``.

    Falls back to the module-level ``browser_dict`` when `bd` is empty or
    None, and to ``'unknown'`` when the code is missing or maps to an empty
    value.
    """
    lookup = bd if bd else browser_dict
    code = showid[4:6]
    return lookup.get(code) or 'unknown'


def get_os(showid, od=None):
    """Look up the OS name for the code at ``showid[6:8]``.

    Falls back to the module-level ``os_dict`` when `od` is empty or None,
    and to ``'unknown'`` when the code is missing or maps to an empty value.
    """
    lookup = od if od else os_dict
    code = showid[6:8]
    return lookup.get(code) or 'unknown'


def get_lastts():
    """Read the last processed timestamp from ``fastlogs_last_timestamp``.

    The file is looked up relative to the current working directory.
    """
    with open('fastlogs_last_timestamp') as ts_file:
        return int(ts_file.read())


def set_lastts(ts):
    """Overwrite ``fastlogs_last_timestamp`` with the formatted `ts`."""
    text = format(ts)
    with open('fastlogs_last_timestamp', 'w') as ts_file:
        ts_file.write(text)


def tstable(table):
    """Return the last path segment of `table` as int (-1 if not numeric)."""
    last_segment = table.rsplit('/', 1)[-1]
    return tryint(last_segment)


def get_srctables(lb=None, ub=None, alltables=None):
    """Return the sorted tables whose timestamp lies in ``(lb, ub]``.

    Args:
        lb: exclusive lower bound; a falsy value means "use get_lastts()".
        ub: inclusive upper bound; a falsy value means 9999999999.
        alltables: table names to filter; a falsy value means
            "use get_alltables()".
    """
    if not alltables:
        alltables = get_alltables()
    if not lb:
        lb = get_lastts()
    if not ub:
        ub = 9999999999
    selected = []
    for table in alltables:
        ts = tstable(table)
        if lb < ts <= ub:
            selected.append(table)
    selected.sort()
    return selected


def get_alltables():
    """List ``fast_logs/user_sessions/*`` table names ending in ``0``, sorted."""
    names = []
    for info in MapReduce.getTablesInfo('fast_logs/user_sessions/*'):
        if info.name.endswith('0'):
            names.append(info.name)
    names.sort()
    return names


def main():
    """Command-line entry point.

    Without ``--datetimefrom``/``--datetimeto`` the script processes every
    source table newer than the stored last timestamp, one at a time,
    advancing the stored timestamp after each successful table, and then
    exits. With both options it reprocesses (``redo=True``) every table in
    the given range; the bounds are unix timestamps when ``--timestamp``
    is given, otherwise ``YYYYMMDDHHmm`` in UTC+3.

    Unless ``--nolock`` is passed, the lock file ``fastlogs_lock.txt`` is
    checked on start and set to ``free`` when processing completes.
    """
    global __file__                         # to fix stupid
    __file__ = os.path.abspath(__file__)    # __file__ handling
    _file_ = os.path.basename(__file__)     # in python 2
    import arrow
    global browser_dict
    global os_dict

    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--savecands', action='store_true')
    parser.add_argument('--config', default=None)
    parser.add_argument('--datetimefrom', default=None)
    parser.add_argument('--datetimeto', default=None)
    parser.add_argument('--timestamp', action='store_true')
    parser.add_argument('--nolock', action='store_true')
    args = parser.parse_args()
    start = int((datetime.datetime.now() -
                 datetime.datetime(1970, 1, 1)).total_seconds())

    # Console shows everything only with --debug; the log file always does.
    logger = logging.getLogger(_file_[:-3])
    formatter = logging.Formatter('%(asctime)s | %(message)s')
    ch = logging.StreamHandler()
    logger.setLevel(logging.DEBUG)
    if args.debug:
        ch.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.CRITICAL)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler('{}/logs/{}-{}.log'.format(
        os.path.dirname(__file__), _file_[:-3], start),
        encoding='utf8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # load config: basic.toml is read from the starting directory, the
    # remaining files from the script's own directory.
    with open('basic.toml', 'r') as f:
        config = toml.loads(f.read())
    os.chdir(os.path.dirname(__file__))
    with open('distribution.toml', 'r') as f:
        config.update(toml.loads(f.read()))
    config_path = (_file_[:-3] + '.toml'
                   if args.config is None else args.config)
    with open(config_path) as f:
        config.update(toml.loads(f.read()))

    def release_lock():
        if not args.nolock:
            with codecs.open('fastlogs_lock.txt', 'w', 'utf8') as f:
                f.write('free')

    if not args.nolock:
        check_if_locked(os.path.abspath('fastlogs_lock.txt'), logger)

    candidates, descriptions = get_unique_candidates(
        logger, debug=args.savecands)
    if args.savecands:
        with open('candidate_to_product_{}.json'.format(
                arrow.now().timestamp), 'w') as f:
            f.write(json.dumps(candidates, indent=4, ensure_ascii=False)
                    .encode('utf8'))
        with open('candidate_descriptions_{}.json'.format(
                arrow.now().timestamp), 'w') as f:
            f.write(json.dumps(descriptions, indent=4, ensure_ascii=False)
                    .encode('utf8'))

    # showid.txt rows: "<kind> <code> <name>"; kind 2 = browser, 3 = OS.
    with codecs.open('showid.txt', 'r', 'utf8') as f:
        for line in f:
            tabs = line.rstrip().split()
            if len(tabs) == 3 and tabs[0] == '2':
                browser_dict[tabs[1]] = tabs[2]
            if len(tabs) == 3 and tabs[0] == '3':
                os_dict[tabs[1]] = tabs[2]

    defaults()
    MapReduce.useDefaults(server=config['mr_server'])
    if not(args.datetimefrom and args.datetimeto):
        lastts = get_lastts()
        srctables = get_srctables(lb=lastts)
        while len(srctables) > 0:
            # process_date() takes no config argument; passing it shifted
            # candidates/descriptions/redo by one position.
            t = process_date(srctables[0], logger, candidates,
                             descriptions, debug=args.debug)
            processed_ts = tstable(srctables[0])
            if t and processed_ts > get_lastts():
                set_lastts(processed_ts)
            srctables = get_srctables()
        # Re-read the timestamp so the message reflects this run's work.
        logger.info("No new data. Latest counted ts is {}"
                    .format(get_lastts()))
        release_lock()
        sys.exit(0)
    else:
        df = args.datetimefrom
        lb = (int(df)
              if args.timestamp
              else arrow.get(df + '+03:00',
                             'YYYYMMDDHHmmZZ').timestamp)
        ub = (int(args.datetimeto)
              if args.timestamp
              else arrow.get(args.datetimeto + '+03:00',
                             'YYYYMMDDHHmmZZ').timestamp)
        for srctable in get_srctables(lb=lb, ub=ub):
            process_date(srctable, logger, candidates,
                         descriptions, redo=True, debug=args.debug)
        release_lock()

# Maps the raw event type found in the logs to the counter it increments
# (one of 'shows', 'installs', 'cancels', 'clicks'). Event types missing
# here make SetInstallsMap treat the row as invalid.
event_dict = {
    'close_stripe': 'cancels',
    'closestripe': 'cancels',
    'close-stripe': 'cancels',
    'close': 'cancels',
    'cancel': 'cancels',
    'show': 'shows',
    'download': 'installs',
    'install': 'installs',
    'click': 'clicks',
}


def yaml_dump(dictionary):
    """Serialise `dictionary` to block-style YAML with a ``---`` start."""
    options = dict(default_flow_style=False, explicit_start=True)
    return yaml.safe_dump(dictionary, **options)


def reverse_dict(dictionary):
    """Return a new dict mapping each value of `dictionary` to its key.

    When several keys share a value, the one iterated last wins.
    """
    return dict((value, key) for key, value in dictionary.items())


def remove_titles_and_snippets(dictionary):
    """Return a copy of `dictionary` with each value cut at the first ' | '.

    Values are expected to be strings such as ``"id | title | snippet"``;
    only the part before the first separator is kept. The input mapping is
    not modified.
    """
    # `copy` is not imported at module level, so import it here; without
    # this the function raised NameError on every call.
    import copy
    result = copy.deepcopy(dictionary)
    for x in result:
        result[x] = result[x].split(' | ')[0]
    return result


def yaml_dump_all():
    """Write every dictionary in ``statdict`` to ``<name>.yaml``.

    Each file holds the reversed mapping (id -> string value) and is
    written to the current working directory.
    """
    for name in statdict:
        dumped = yaml_dump(reverse_dict(statdict[name]))
        with codecs.open('{}.yaml'.format(name), 'w', 'utf8') as out:
            out.write(dumped)


def get_set_dictionary(dictionary, value):
    """Return the id stored for `value`, assigning a new one if needed.

    A new id is one more than the largest id already in `dictionary`
    (0 for an empty dictionary); the dictionary is updated in place.
    """
    if value not in dictionary:
        dictionary[value] = zmax(set(dictionary.values())) + 1
    return dictionary[value]


def get_set_dictionary_testid(dictionary, value):
    """Return the id stored for test id `value`, assigning one if needed.

    On a miss the entry ``'1000000' -> 1000000`` is (re)written first.
    Then ``'_total_'`` gets id 0, a numeric string gets its own integer
    value, and anything else gets one more than the largest existing id.
    The dictionary is updated in place.
    """
    if value in dictionary:
        return dictionary[value]
    dictionary['1000000'] = 1000000
    if value == '_total_':
        assigned = 0
    else:
        try:
            assigned = int(value)
        except ValueError:
            assigned = zmax(set(dictionary.values())) + 1
    dictionary[value] = assigned
    return dictionary[value]


def zmax(seq):
    """Return ``max(seq)``, or -1 when `seq` is empty."""
    return max(seq) if len(seq) > 0 else -1


def process_ident(ident):
    """Replace the string dimensions of a v3 key with dictionary ids.

    `ident` must hold exactly eight tab-separated fields: fielddate,
    product, service, lang, browser, description, element, candidate.
    Every field except fielddate is mapped through the matching
    ``statdict`` dictionary (new ids are assigned on demand).

    Returns the tab-joined fielddate and ids.

    Raises:
        ValueError: when `ident` does not have exactly eight fields.
    """
    fd, product, service, \
        lang, browser, description, \
        element, candidate = ident.split('\t')
    # The former bare except with pdb.set_trace() around the product
    # lookup was a debugging leftover; errors now propagate.
    product = get_set_dictionary(statdict['product'], product)
    service = get_set_dictionary(statdict['service'], service)
    lang = get_set_dictionary(statdict['lang'], lang)
    browser = get_set_dictionary(statdict['browser'], browser)
    description = get_set_dictionary(statdict['description'], description)
    element = get_set_dictionary(statdict['element'], element)
    candidate = get_set_dictionary(statdict['candidate'], candidate)
    return tabulate(fd, product, service,
                    lang, browser, description,
                    element, candidate)


def process_ident_4(ident, candidates, candidates_to_title_snippets,
                    debug=False):
    """Replace the string dimensions of a v4 key with dictionary ids.

    `ident` must hold exactly nine tab-separated fields: fielddate,
    product, service, lang, browser, os, testid, element, candidate.

    Args:
        ident: the tab-separated key.
        candidates: candidate id -> descriptive text.
        candidates_to_title_snippets: updated in place with
            ``"<candidate> | <text>"`` for candidates found in
            `candidates`.
        debug: accepted for interface compatibility; not used.

    Returns the tab-joined fielddate and ids.

    Raises:
        ValueError: when `ident` does not have exactly nine fields.
    """
    fd, product, service, \
        lang, browser, _os, testid, \
        element, candidate = ident.split('\t')
    # The former bare except with pdb.set_trace() around the product
    # lookup was a debugging leftover; errors now propagate.
    product = get_set_dictionary(statdict['product'], product)
    service = get_set_dictionary(statdict['service'], service)
    lang = get_set_dictionary(statdict['lang'], lang)
    browser = get_set_dictionary(statdict['browser'], browser)
    _os = get_set_dictionary(statdict['os'], _os)
    testid = get_set_dictionary_testid(statdict['testid'], testid)
    element = get_set_dictionary(statdict['element'], element)
    if candidate in candidates:
        candidates_to_title_snippets[candidate] = '{} | {}'.format(
            candidate, candidates[candidate]
        )
    candidate = get_set_dictionary(statdict['candidate'], candidate)
    return tabulate(fd, product, service,
                    lang, browser, _os, testid,
                    element, candidate)


def get_dictionary_from_stat(name):
    """Fetch the stat dictionary `name`.

    Returns:
        * on HTTP 200 with a non-empty dictionary: that dictionary with its
          keys converted to int;
        * on HTTP 200 with an empty dictionary: the content of the local
          ``<name>.yaml`` file;
        * otherwise: the response object itself, so callers must check the
          type of the result.
    """
    req = requests.get(
        DICTAPI + '?name={}'.format(name),
        headers=HEADERS
    )
    if req.status_code != 200:
        return req
    result = {int(k): v for k, v in json.loads(req.content).items()}
    if result:
        return result
    # Close the fallback file deterministically instead of leaking it.
    with open('{}.yaml'.format(name)) as f:
        return yaml.safe_load(f)


def post_dictionary_to_stat(dictionary, name):
    """POST `dictionary` (JSON-encoded) to the stat service under `name`.

    Returns the response object of the request.
    """
    payload = {
        'name': name,
        'language': '',
        'dictionary': json.dumps(dictionary),
        'editors': ['pecheny', 'riddle']
    }
    return requests.post(DICTAPI, headers=HEADERS, data=payload)


def check_if_locked(lock, logger):
    """Exit if another instance holds the lock, otherwise take the lock.

    The run is treated as locked when the lock file contains ``locked``
    and more than one running process has this script's file name in its
    command line (processes whose command line mentions ``nolock`` or
    ``mapreduce`` are not counted). In that case the process exits with
    status 0; otherwise the lock file is rewritten with the current time.
    """
    with codecs.open(lock, 'r', 'utf8') as f:
        contents = f.read().rstrip()
    script_name = os.path.basename(__file__)
    search_for_process = []
    for proc in psutil.process_iter():
        cmdline = ' '.join(proc.cmdline())
        if script_name not in cmdline:
            continue
        if 'nolock' in cmdline or 'mapreduce' in cmdline:
            continue
        search_for_process.append(proc)
    if 'locked' in contents and len(search_for_process) > 1:
        logger.info('Process is locked, exiting...')
        sys.exit(0)
    with codecs.open(lock, 'w', 'utf8') as f:
        f.write('locked at {}'.format(datetime.datetime.now()))


def process_date(srctable, logger,
                 candidates, descriptions, redo=False, debug=False):
    """Run the fastlogs pipeline for one source table and push it to stat.

    Steps (each one is skipped when its output table already exists, unless
    ``redo`` is true):

    1. reduce ``srctable`` with ``fastlogs_reduce`` into a temporary table,
       retrying until the operation succeeds;
    2. map it with ``SetInstallsMap`` and reduce the v4 output with
       ``stat_reduce``; drop the intermediate tables and sort the result;
    3. download the stat dictionaries into the module-level ``statdict``,
       read the resulting table and push it in chunks with
       ``push_to_stat`` to the ``Distribution/Others/AtomBanners/v4``
       report.

    Args:
        srctable: source table path; its last ``/``-separated component
            must be an integer (used as ``ts`` in derived table names).
        logger: logger used for progress and error messages.
        candidates: passed through to ``process_ident_4``.
        descriptions: passed through to ``SetInstallsMap``.
        redo: force every step to run again; also forwarded to
            ``push_to_stat``.
        debug: drop into ``pdb`` before each intermediate push; also
            forwarded to ``push_to_stat``.

    Returns:
        ``True`` if the final v4 table exists and was pushed, otherwise
        ``None``.
    """
    logger.info('Source table is {}'.format(srctable))
    ts = int(srctable.split('/')[-1])
    pstart = datetime.datetime.now()
    dsttable = 'tmp/pers/set_installs_fastlogs_{}'.format(ts)
    tmptable = dsttable + '_tmp'
    tmptable4 = dsttable + '_tmp4'
    endtable = dsttable + '_stat'
    endtable4 = dsttable + '_stat4'
    if not table_exists(dsttable) or redo:
        success = False
        while not success:
            try:
                logger.info('Reducing from {} to {}'.format(
                    srctable, dsttable))
                MapReduce.runReduce(fastlogs_reduce, srcTable=srctable,
                                    dstTables=[dsttable, dsttable + '.errors'],
                                    username='personalization')
                success = True
            except Exception:
                # Retry on failure, but let KeyboardInterrupt / SystemExit
                # propagate so the loop can actually be stopped.
                logger.error(traceback.format_exc())
    # NOTE: the v1 pipeline (sorting the tmp tables and reducing ``tmptable``
    # into ``endtable``) is disabled; nothing in this function creates
    # ``endtable``, it is only checked in the condition below.
    if (table_exists(dsttable) and not table_exists(endtable)) or redo:
        set_installs_map = SetInstallsMap(browser_dict, os_dict,
                                          descriptions, event_dict, ts)
        if not table_exists(tmptable) or redo:
            logger.info('Mapping from {} to {}'.format(
                dsttable, tmptable))
            MapReduce.runMap(set_installs_map, srcTable=dsttable,
                             dstTables=[tmptable, tmptable4], username='personalization')
        logger.info('Reducing from {} to {}'.format(tmptable4,
                                                    endtable4))
        MapReduce.runReduce(stat_reduce, srcTable=tmptable4,
                            dstTable=endtable4, subkeyMode=False, username='personalization')
        MapReduce.dropTable(dsttable, username='personalization')
        MapReduce.dropTable(dsttable + '.errors', username='personalization')
        MapReduce.dropTable(tmptable, username='personalization')
        MapReduce.dropTable(tmptable4, username='personalization')
        logger.info('Sorting {}'.format(endtable4))
        MapReduce.sortTable(endtable4, username='personalization')
    if table_exists(endtable4):
        # Column names written as the first row of every chunk.
        header = ('fielddate', 'product', 'service',
                  'lang', 'browser', 'os', 'testid', 'element', 'candidate',
                  'shows', 'installs', 'cancels', 'clicks')
        stat_name = "Distribution/Others/AtomBanners/v4"
        counter = Counter()
        chunk = StringIO()
        print(tabulate(*header), file=chunk)
        i = 0
        ch = 1
        prevkey = ''
        logger.info('Getting dicts from statface...')
        for x in ['product', 'service', 'lang', 'browser',
                  'description', 'os', 'testid', 'element']:
            statdict[x] = reverse_dict(get_dictionary_from_stat(
                'vcfs::atombanners_{}'.format(x)))
        candidates_from_stat = get_dictionary_from_stat(
            'vcfs::atombanners_candidate')
        # Stat stores candidates as "<candidate> | <title>"; keep the full
        # string for display and use only the part before " | " as the key.
        candidates_to_title_snippets = {}
        for x in candidates_from_stat:
            candidates_to_title_snippets[
                candidates_from_stat[x].split(' | ')[0]
            ] = candidates_from_stat[x]
            candidates_from_stat[x] = candidates_from_stat[x].split(' | ')[0]
        statdict['candidate'] = reverse_dict(candidates_from_stat)
        logger.info('Getting records from {}...'.format(endtable4))
        dbgcands = set()  # candidates seen so far; only useful under pdb
        rec1 = None  # stays None when the table yields no records
        for rec in MapReduce.getSample(endtable4, count=None):
            rec1 = deutf8ify(rec)
            dbgcands.add(rec1.key.split('\t')[-1])
            # Flush once the chunk is large enough, but only on a key
            # boundary so that equal keys are never split across chunks.
            if i >= 30000000 and rec1.key != prevkey and prevkey:
                logger.info('Pushing chunk {} to stat (v4)...'.format(ch))
                # NOTE(review): ``ch`` is incremented before the push, so the
                # number passed to push_to_stat is one higher than the one
                # just logged, and the final push below reuses the last
                # value -- confirm this is what push_to_stat expects.
                ch += 1
                if debug:
                    pdb.set_trace()
                push_to_stat(chunk, logger, redo=redo, ch=ch,
                             name=stat_name,
                             candidates_to_title_snippets=candidates_to_title_snippets,
                             debug=debug)
                chunk = StringIO()
                print(tabulate(*header), file=chunk)
                i = 0
            print(tabulate(
                process_ident_4(rec1.key,
                                candidates,
                                candidates_to_title_snippets),
                rec1.value
            ), file=chunk)
            i += 1
            prevkey = rec1.key

        logger.info('Pushing chunk {} to stat (v4)...'.format(ch))
        push_to_stat(chunk, logger, redo=redo, ch=ch,
                     name=stat_name,
                     candidates_to_title_snippets=candidates_to_title_snippets,
                     debug=debug)
        logger.info('Total time is {}'.format(str(datetime.datetime.now() -
                                                  pstart)))
        # accumulate data for razladki
        # NOTE(review): this runs once, after the loop, so only the last
        # record is counted, and ``counter`` is not read afterwards in this
        # function. Index 6 of the key may also be ``testid`` rather than
        # ``element`` given the v4 column order above -- verify before
        # re-enabling the razladki push.
        if rec1 is not None:
            element = rec1.key.split('\t')[6]
            lang = rec1.key.split('\t')[3]
            shows, installs, cancels, clicks = [int(x) for x in
                                                rec1.value.split('\t')]
            if element != '_total_' and lang != '_total_':
                counter[(element, 'show', lang)] += shows
                counter[(element, 'click', lang)] += clicks
                counter[(element, 'cancel', lang)] += cancels
                counter[(element, 'install', lang)] += installs
        #
        return True
    # if table_exists(endtable):
    #     counter = Counter()
    #     chunk = StringIO()
    #     print(tabulate('fielddate', 'product', 'service',
    #     'lang', 'browser', 'description', 'element', 'candidate',
    #     'shows', 'installs', 'cancels', 'clicks'), file=chunk)
    #     i = 0
    #     ch = 1
    #     prevkey = ''
    #     logger.info('Getting records from {}...'.format(endtable))
    #     for rec in MapReduce.getSample(endtable, count=None):
    #         rec1 = deutf8ify(rec)
    #         if i >= 30000000 and rec1.key != prevkey and prevkey:
    #             logger.info('Pushing chunk {} to stat...'.format(ch))
    #             ch += 1
    #             push_to_stat(chunk, logger, redo=redo, ch=ch)
    #             chunk = StringIO()
    #             print(tabulate('fielddate', 'product', 'service',
    #             'lang', 'browser', 'description', 'element', 'candidate',
    #             'shows', 'installs', 'cancels', 'clicks'), file=chunk)
    #             i = 0
    #         print(tabulate(process_ident(rec1.key), rec1.value), file=chunk)
    #         i += 1
    #         # accumulate data for razladki
    #         element = rec1.key.split('\t')[6]
    #         lang = rec1.key.split('\t')[3]
    #         shows, installs, cancels, clicks = [int(x) for x in
    #             rec1.value.split('\t')]
    #         if element != '_total_' and lang != '_total_':
    #             counter[(element, 'show', lang)] += shows
    #             counter[(element, 'click', lang)] += clicks
    #             counter[(element, 'cancel', lang)] += cancels
    #             counter[(element, 'install', lang)] += installs
    #         #
    #         prevkey = rec1.key

    #     logger.info('Pushing chunk {} to stat...'.format(ch))
    #     push_to_stat(chunk, logger, redo=redo, ch=ch)

    #     config = {'razladki':
    #     'http://launcher.razladki.yandex-team.ru/'
    #     'save_new_data/SearchPortalDistribution'}
    #     for param in counter:
    #         desc = '{lang}_{elem}_set_{event}s_fastlogs'.format(
    #             lang=param[2],
    #             elem=param[0],
    #             event=param[1])
    #         value = counter[param]
    #         # logger.info('Pushing to razladki: {}, {}'
    #         #     .format(desc, value))
    #         push_to_razladki(config, desc, value, ts=ts)
    #         if param[1] in {'click', 'install', 'cancel', 'close'}:
    #             try:
    #                 rdesc = '{lang}_{elem}_set_{event}srate_fastlogs'.format(
    #                     lang=param[2],
    #                     elem=param[0],
    #                     event=param[1])
    #                 showvalue = counter[(param[0], 'show', param[2])]
    #                 rvalue = value / float(showvalue)
    #                 # logger.info('Pushing to razladki: {}, {}'
    #                 #     .format(rdesc, rvalue))
    #                 push_to_razladki(config, rdesc, rvalue, ts=ts)
    #             except:
    #                 logger.info(traceback.format_exc())
    #         if (param[1] == 'close'
    #             and (param[0], 'click', param[2]) in counter):
    #             try:
    #                 rdesc = '{lang}_{elem}_set_closeclickrate_fastlogs'.format(
    #                     lang=param[2],
    #                     elem=param[1])
    #                 showvalue = counter[(param[0], 'click', param[2])]
    #                 rvalue = value / float(showvalue)
    #                 # logger.info('Pushing to razladki: {}, {}'
    #                 #     .format(rdesc, rvalue))
    #                 push_to_razladki(config, rdesc, rvalue, ts=ts)
    #             except:
    #                 logger.info(traceback.format_exc())
    #     return True

# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
