#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
from mrdef import defaults
from mapreducelib import MapReduce, Record
from collections import defaultdict, Counter
from putils import utf8ify, deutf8ify, make_logger, counter_quantile
import base64
import psutil
import scarab.main
import datetime
import requests
# Python 3 has no ``basestring`` builtin; alias it to ``str`` so the
# isinstance() check in get_timestamp() works on both major versions.
try:
    basestring
except NameError:
    basestring = str


def get_alltables():
    """Return the sorted names of the atomfront answer-log tables.

    Only names consisting of exactly four '/'-separated components are kept.
    """
    table_infos = MapReduce.getTablesInfo(
        'statbox/atomfront/atomfront-answer-log/*'
    )
    # Four path components <=> exactly three separators.
    names = [info.name for info in table_infos
             if info.name.count('/') == 3]
    names.sort()
    return names


class AtomfrontMap(object):
    """Map callable that emits the first document's score of every answer.

    For each input record the last field of ``rec.value`` is base64-decoded
    and deserialized with ``scarab.main.deserialize_event_from_str``.  For
    every answer that has at least one document, a ``Record`` is yielded
    built from ``'<client>/<answer name>'``, the timestamp given to the
    constructor and the first document's score rounded to 5 digits.
    """

    def __init__(self, timestamp):
        # Timestamp attached to every record emitted by this mapper.
        self.timestamp = timestamp

    def __call__(self, rec):
        rec = deutf8ify(rec)
        # NOTE(review): the separator is the two-character sequence
        # backslash + 't', not a TAB character -- confirm this is intended.
        payload = rec.value.split('\\t')[-1]
        event = scarab.main.deserialize_event_from_str(
            base64.b64decode(payload)
        )
        client = event.client
        try:
            for answer in event.answer:
                subclient = answer.name
                if len(answer.docs) == 0:
                    # Nothing to score for an answer without documents.
                    continue
                top_score = round(answer.docs[0].score, 5)
                out_key = '{}/{}'.format(client, subclient)
                yield utf8ify(
                    Record(out_key, str(self.timestamp), str(top_score))
                )
        except AttributeError:
            # Events lacking the expected attributes are skipped silently.
            pass


def atomfront_reduce(key, recs):
    """Reduce step: yield the 0.1 .. 0.9 quantiles of the scores for ``key``.

    Record values are parsed as floats and counted; the count for the value
    1 is discarded before ``counter_quantile`` is evaluated.  One ``Record``
    is yielded per quantile level, built from the key, the formatted level
    and the formatted quantile.
    """
    key = deutf8ify(key)
    score_counts = Counter(float(deutf8ify(item).value) for item in recs)
    # Drop the bucket for a score of exactly 1 (also matches 1.0).
    score_counts.pop(1, None)
    levels = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    for level in levels:
        quantile_value = counter_quantile(score_counts, level)
        yield utf8ify(Record(key, format(level), format(quantile_value)))


def get_timestamp(t):
    """Extract the timestamp part of a table name.

    ``t`` is either a string or an object exposing the name as ``t.name``.
    The result is the text between the last '/' and the first following '-'
    (or the whole last path component when it has no '-').
    """
    path = t if isinstance(t, basestring) else t.name
    last_component = path.rsplit('/', 1)[-1]
    return last_component.partition('-')[0]


def get_srctables(lastts, alltables=None):
    """Select the tables that still have to be processed.

    A table qualifies when its timestamp is strictly greater than ``lastts``
    and the local-time minute of that timestamp ends in the digit 0.

    ``alltables`` defaults to the result of ``get_alltables()``; an empty
    value is treated the same as ``None``.
    """
    if not alltables:
        alltables = get_alltables()
    selected = []
    for table in alltables:
        ts = int(get_timestamp(table))
        if not ts > int(lastts):
            continue
        minute = datetime.datetime.fromtimestamp(ts).minute
        # Minutes are 0..59, so "ends with '0'" means a multiple of ten.
        if minute % 10 == 0:
            selected.append(table)
    return selected


def push_to_razladki(data, overwrite=False, timeout=60):
    """POST ``data`` as JSON to the razladki ``save_new_data_json`` endpoint.

    :param data: payload sent as the ``data`` field of the JSON body.
    :param overwrite: sent as the ``override`` field of the JSON body.
    :param timeout: seconds to wait for the server.  Without a timeout
        ``requests`` may block forever, which would keep the job (and its
        lock) alive indefinitely.
    :returns: the ``requests.Response`` of the successful request.
    :raises requests.RequestException: on connection problems, timeouts or
        a 4xx/5xx response, so that a failed push is never mistaken for a
        successful one.
    """
    url = ('http://launcher.razladki.yandex-team.ru/'
           'save_new_data_json/SearchPortalDistribution')
    response = requests.post(
        url,
        json={'data': data, 'override': overwrite},
        timeout=timeout,
    )
    # Surface HTTP-level failures instead of silently dropping the data.
    response.raise_for_status()
    return response


def get_lastts():
    """Return the raw contents of the ``atomfront_5m_lastts`` marker file.

    The file is looked up in the directory that contains this script and is
    read as UTF-8; the text is returned unmodified.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    marker_path = '{}/atomfront_5m_lastts'.format(script_dir)
    with codecs.open(marker_path, 'r', 'utf8') as marker:
        contents = marker.read()
    return contents


def set_lastts(ts):
    """Overwrite the ``atomfront_5m_lastts`` marker file with ``ts``.

    The file lives in the directory that contains this script and is
    written as UTF-8.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    marker_path = '{}/atomfront_5m_lastts'.format(script_dir)
    with codecs.open(marker_path, 'w', 'utf8') as marker:
        marker.write(ts)


def _datetime_arg_to_epoch(value):
    """Convert a ``YYYYmmddHHMMSS`` string (local time) to Unix seconds."""
    # Imported here so the helper is self-contained.
    import time

    parsed = datetime.datetime.strptime(value, '%Y%m%d%H%M%S')
    # time.mktime() is the portable equivalent of the platform-specific
    # strftime('%s') directive.
    return int(time.mktime(parsed.timetuple()))


def main():
    """Command-line entry point.

    Two modes are supported:

    * backfill -- when both ``--datetimefrom`` and ``--datetimeto`` are
      given, every table whose timestamp lies in that inclusive range is
      processed with ``overwrite=True``; the last-timestamp marker is not
      touched;
    * incremental -- otherwise tables newer than the stored last timestamp
      are processed one by one and the marker is advanced after each one.

    Unless ``--nolock`` is given, ``check_if_locked`` is consulted first.
    The lock file is reset to ``free`` at the end of either mode.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--nolock', action='store_true')
    parser.add_argument('--datetimefrom')
    parser.add_argument('--datetimeto')
    args = parser.parse_args()
    logger = make_logger(os.path.abspath(__file__), debug=args.debug)
    defaults()
    MapReduce.useDefaults(username='personalization')
    lock_path = os.path.abspath('atomlog_lock.txt')
    if not args.nolock:
        check_if_locked(lock_path, logger)

    # Each bound is parsed whenever it is supplied, so a malformed value is
    # reported even if the other bound is missing.
    datetimefrom = None
    datetimeto = None
    if args.datetimefrom:
        datetimefrom = _datetime_arg_to_epoch(args.datetimefrom)
    if args.datetimeto:
        datetimeto = _datetime_arg_to_epoch(args.datetimeto)

    # NOTE(review): when only one bound is given the range is ignored and
    # the incremental mode runs -- confirm this is the intended behaviour.
    if args.datetimefrom and args.datetimeto:
        srctables = [
            x for x in get_alltables()
            if datetimefrom <= int(get_timestamp(x)) <= datetimeto
        ]
        for table in srctables:
            process_table(table, overwrite=True, logger=logger)
    else:
        srctables = get_srctables(get_lastts())
        while srctables:
            table = srctables[0]
            if not process_table(table, logger=logger):
                # Without this the same table would be selected again on
                # every iteration and the loop would never terminate.
                logger.info('Processing of {} failed, stopping'
                            .format(table))
                break
            set_lastts(get_timestamp(table))
            # Re-list the tables so that ones created meanwhile are seen.
            srctables = get_srctables(get_lastts())
        else:
            logger.info("No new data. Latest counted date is {}"
                        .format(get_lastts()))

    # NOTE(review): the lock file is reset even when --nolock was given.
    with codecs.open(lock_path, 'w', 'utf8') as f:
        f.write('free')


def check_if_locked(lock, logger):
    """Exit if another instance holds the lock, otherwise take the lock.

    The lock is considered held when the lock file contains the word
    ``locked`` AND more than one running process (this one included) has
    this script's file name in its command line.  Processes whose command
    line contains ``nolock`` or ``mapreduce`` are not counted.  A lock file
    that says ``locked`` with no other such process is treated as stale and
    is overwritten.

    :param lock: path of the lock file; a missing file means "not locked".
    :param logger: logger used to report that the run is being skipped.
    :raises SystemExit: with status 0 when the lock is held.
    """
    # Imported here so this function does not depend on file-level imports
    # beyond the ones it already used.
    import errno

    try:
        with codecs.open(lock, 'r', 'utf8') as f:
            contents = f.read().rstrip()
    except IOError as err:
        if err.errno != errno.ENOENT:
            raise
        # First run: there is no lock file yet, so nothing holds the lock.
        contents = ''

    script_name = os.path.basename(__file__)
    running = 0
    for proc in psutil.process_iter():
        try:
            cmdline = ' '.join(proc.cmdline())
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited during the scan or cannot be inspected.
            continue
        if (script_name in cmdline and
                'nolock' not in cmdline and
                'mapreduce' not in cmdline):
            running += 1

    if 'locked' in contents and running > 1:
        logger.info('Process is locked, exiting...')
        sys.exit(0)
    with codecs.open(lock, 'w', 'utf8') as f:
        f.write('locked at {}'.format(datetime.datetime.now()))


def process_table(table, overwrite=False, logger=None):
    """Run map + reduce over ``table`` and push the result to razladki.

    The map output goes to ``tmp/pers/atomfront<ts>map`` and the reduce
    output to ``tmp/pers/atomfront<ts>reduce``, where ``<ts>`` is the
    table's timestamp; both temporary tables are dropped after the push.

    :param table: source table (a name, or an object accepted by
        ``get_timestamp``).
    :param overwrite: forwarded to ``push_to_razladki``.
    :param logger: logger for progress messages; when omitted a standard
        library logger named after this module is used.
    :returns: ``True`` once the table has been processed.
    """
    if logger is None:
        # The default used to be dereferenced as-is, which raised
        # AttributeError on the first log call.
        import logging
        logger = logging.getLogger(__name__)

    timestamp = get_timestamp(table)
    atomfront_map = AtomfrontMap(timestamp)
    tmp1 = 'tmp/pers/atomfront{}map'.format(timestamp)
    tmp2 = 'tmp/pers/atomfront{}reduce'.format(timestamp)
    started = datetime.datetime.now()

    logger.info('mapping from {} to {}'.format(table, tmp1))
    MapReduce.runMap(atomfront_map,
                     srcTable=table,
                     dstTable=tmp1)
    logger.info('reducing from {} to {}'.format(tmp1, tmp2))
    MapReduce.runReduce(atomfront_reduce,
                        srcTable=tmp1,
                        dstTable=tmp2)
    logger.info('getting records from {}'.format(tmp2))
    # NOTE(review): getSample is used to read the reduce output -- confirm
    # it returns every record rather than a subset.
    records = list(MapReduce.getSample(tmp2))

    logger.info('pushing to razladki')
    data = []
    for rec in records:
        rec = deutf8ify(rec)
        data.append({
            'param': 'atomlog_{}_{}'.format(rec.key, rec.subkey),
            'ts': timestamp,
            'value': rec.value,
        })
    push_to_razladki(data, overwrite=overwrite)

    MapReduce.dropTable(tmp1)
    MapReduce.dropTable(tmp2)
    elapsed = datetime.datetime.now() - started
    # Elapsed wall-clock time, reported in minutes.
    logger.info('total time: {}'.format(elapsed.total_seconds() / 60))
    return True

# Run the job only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
