#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import unicode_literals
import sys
import re
import os
import shlex
import subprocess
import logging
import hashlib
import argparse
import pdb
import traceback
import codecs
import toml
import datetime
import time
from mapreducelib import MapReduce, Record
import mapreducelib
from pecheny.mrdef import defaults

# Pre-compiled patterns used to pull individual ``name=value`` fields out of
# log lines (fields appear to be tab-separated -- see the lookaheads below).
# Plain ``r''`` literals are already unicode under the file's
# ``unicode_literals`` future import, so the Python-2-only ``ur''`` prefix is
# not needed.
re_login = re.compile(r'(?<=yandex_login=)[0-9a-zA-Z\-\.@]+')
# A field value runs up to the next tab OR the end of the string.  Inside a
# character class ``$`` is a literal dollar sign, so the lookahead has to be
# written as an alternation for the end-of-string case to work.
re_query = re.compile(r'(?<=query=).*?(?=\t|$)')
re_msp = re.compile(r'(?<=msp=).*?(?=\t|$)')
# Matches a single Latin or Cyrillic letter.
isletter = re.compile(r'[A-Za-zА-Яа-я]')
# Command line of the external sampling tool; the two placeholders are the
# sample size and the source table.
MR_SAMPLE = ('/Berkanavt/bin/mr_sample-dev '
             '-s sakura00.search.yandex.net:8013 -n {} -sub -f {}')
# Base name of this script; filled in by main().
_file_ = ''


class Testcase(object):
    """Plain data holder for one query that is going to be checked.

    Stores the three constructor arguments unchanged as the attributes
    ``id``, ``query`` and ``debugtext``.
    """

    def __init__(self, id, query, debugtext):
        # ``id`` shadows the builtin, but callers pass it by keyword, so the
        # parameter name has to stay.
        self.id, self.query, self.debugtext = id, query, debugtext


def deutf8ify(rec):
    """Return *rec* with its byte-string content decoded as UTF-8.

    * A ``mapreducelib.SubkeyedRecord`` is rebuilt as a ``Record`` whose key,
      subkey and value are each decoded unless they are already ``unicode``.
    * A bare ``str`` is decoded directly.
    * Anything else is handed back untouched.

    Decoding uses ``errors='replace'``, so malformed bytes are substituted
    instead of raising.
    """
    def _to_text(field):
        # Fields that are already unicode pass through as they are.
        if isinstance(field, unicode):
            return field
        return field.decode('utf8', errors='replace')

    if isinstance(rec, mapreducelib.SubkeyedRecord):
        fields = (rec.key, rec.subkey, rec.value)
        return Record(*[_to_text(field) for field in fields])
    if isinstance(rec, str):
        return rec.decode('utf8', errors='replace')
    return rec


def yandex_login(s):
    """Return the text matched by ``re_login`` in *s*, or ``''`` if none."""
    hit = re_login.search(s)
    return hit.group(0) if hit else ''


def md5(string_):
    """Return the hexadecimal MD5 digest of *string_*.

    Byte strings are hashed as they are.  Text (unicode) input is encoded as
    UTF-8 first: hashlib only accepts bytes, and relying on an implicit
    ASCII conversion breaks on any non-ASCII character.  For pure-ASCII text
    the digest is the same as that of the equivalent byte string.
    """
    if not isinstance(string_, bytes):
        string_ = string_.encode('utf8')
    return hashlib.md5(string_).hexdigest()


def get_query(s):
    """Extract the query text from *s*.

    If ``re_msp`` matches, the part of the match after its last ``:`` is
    returned (the corrected version of the query takes precedence).
    Otherwise the ``re_query`` match is returned, or ``''`` when neither
    pattern matches.
    """
    corrected = re_msp.search(s)
    if corrected:
        return corrected.group(0).split(':')[-1]
    plain = re_query.search(s)
    return plain.group(0) if plain else ''


def current_timestamp():
    """Return the current Unix time as a whole number of seconds.

    Uses ``time.time()`` rather than subtracting the 1970 epoch from the
    local ``datetime.now()``: the latter is shifted by the machine's UTC
    offset and makes differences between two readings jump when the local
    clock changes for daylight saving time.
    """
    return int(time.time())


def loginCheck(rec):
    """Yield a copy of *rec* if its value passes every filter, else nothing.

    The value (decoded as UTF-8 with replacement of bad bytes) must:
      * contain ``service=maps.yandex`` and ``type=REQUEST``;
      * carry a non-empty ``yandex_login``;
      * have a query that contains at least one letter and is longer than
        two characters;
      * not contain ``output=json``.

    The yielded ``Record`` is built from the original, undecoded fields.
    """
    text = rec.value.decode('utf8', errors='replace')
    if 'service=maps.yandex' not in text:
        return
    if 'type=REQUEST' not in text:
        return
    if yandex_login(text) == '':
        return
    query = get_query(text)
    if not isletter.search(query) or len(query) <= 2:
        return
    if 'output=json' in text:
        return
    yield Record(rec.key, rec.subkey, rec.value)


def up_date(args, date):
    """Rebind the module-level table names for the given *date*.

    Sets the globals ``montable``, ``tmptable`` and ``sessions``.  When
    ``args.ystaff`` is truthy the ``yandex_staff`` variants of all three
    names are used, otherwise the regular ones.
    """
    global montable, tmptable, sessions
    if args.ystaff:
        names = ('pecheny/geo_mr_mon_ystaff' + date,
                 'pecheny/geo_mr_mon_tmp_ystaff' + date,
                 'user_sessions/' + date + '/yandex_staff')
    else:
        names = ('pecheny/geo_mr_mon' + date,
                 'pecheny/geo_mr_mon_tmp' + date,
                 'user_sessions/' + date)
    montable, tmptable, sessions = names

# def push_to_razladki(params, desc, value):
#     import requests
#     data = {desc: value}
#     logger = logging.getLogger(_file_[:-3])
#     req = None
#     while req is None or req.status_code != 200:
#         req = requests.post(params['razladki'], data=data)


def main():
    """Run one monitoring pass and report the results.

    Steps, in order:

    1. Parse the command line, set up console and file logging, and load
       the TOML configuration: ``basic.toml``, then ``geo.toml`` from the
       script's directory, then the optional ``--config`` file (later files
       override earlier keys).
    2. Pick the ``user_sessions`` table: the one for ``--date``, or --
       starting from yesterday and walking back one day at a time -- the
       first one from which a record can be sampled.
    3. Unless the monitoring table already has data (and ``--forcesample``
       is not given): filter the sessions with ``loginCheck`` into a
       temporary table, sample logins from it with the external
       ``MR_SAMPLE`` command, write those users' records to the monitoring
       table and drop the temporary one.
    4. For every record in the monitoring table, request ``check_req`` for
       the MD5 of the login and check whether the query text occurs in the
       response.  A failed request is retried; the loop gives up once the
       number of failures reaches ``100 * len(users)``.
    5. Log the totals and, unless ``--debug`` is set, push the found rate
       and the "null users" rate via ``push_to_razladki``.

    Exits through ``sys.exit``: with no status when the sessions table or
    the filtered data is missing, with 0 when the run takes longer than
    7200 seconds, and with 1 when the monitoring table cannot be read.
    """
    global _file_
    global __file__                         # to fix stupid
    __file__ = os.path.abspath(__file__)    # __file__ handling
    _file_ = os.path.basename(__file__)     # in python 2
    import requests  # else mapreduce gets angry
    from pecheny.moncommons import push_to_razladki
    parser = argparse.ArgumentParser(
        description='A simple MR-based monitoring for geo_requests table')
    parser.add_argument('--logins', '-l', default=500, type=int,
                        help='Number of login-query pairs to sample. Default is 500.')
    parser.add_argument('--date', '-d', default='',
                        help='Date from which to sample,'
                        ' formatted as yyyymmdd. Default is the most'
                        ' recent date with a user_sessions table.')
    parser.add_argument('--debug', action='store_true',
                        help='Debug mode (stderr is turned on).')
    parser.add_argument('--forcesample', '-f',
                        action='store_true', default=False,
                        help='Force resampling even if a sample table already exists.')
    parser.add_argument('--ystaff', '-y', action='store_true', default=False,
                        help='Use yandex_staff instead of external users.')
    parser.add_argument('--config', '-c', default=None,
                        help='Load a config file on top of defaults')

    args = parser.parse_args()
    users = []

    start = current_timestamp()

    # set up logging: everything goes to the log file, the console only
    # shows messages when --debug is given
    logger = logging.getLogger(_file_[:-3])
    formatter = logging.Formatter('%(asctime)s | %(message)s')
    ch = logging.StreamHandler()
    logger.setLevel(logging.DEBUG)
    if args.debug:
        ch.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.CRITICAL)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler('{}/logs/{}-{}.log'.format(
        os.path.dirname(__file__), _file_[:-3], start),
        encoding='utf8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # NOTE(review): basic.toml is opened before the chdir below, i.e. it is
    # resolved against the directory the script was started from, while
    # geo.toml is read from the script's own directory -- confirm intended.
    with open('basic.toml', 'r') as f:
        params = toml.loads(f.read())
    os.chdir(os.path.dirname(__file__))
    with open('geo.toml', 'r') as f:
        params.update(toml.loads(f.read()))
    if args.config is not None:
        with open(args.config, 'r') as f:
            params.update(toml.loads(f.read()))

    rtmr_check = params['rtmr_check']
    check_table = params['check_table']

    logger.info('Monitoring started')

    defaults()
    MapReduce.useDefaults(server=params['mr_server'],
                          verbose=False, username='tmp', usingSubkey=True)

    # determine the YYYYMMDD in `user_sessions/YYYYMMDD`
    date = args.date
    if date == '':  # if user didn't specify the date, take last available

        logger.info('Will determine the most recent user_sessions table.')
        date = (datetime.date.today()  # yesterday in YYYYMMDD
                - datetime.timedelta(days=1)).strftime('%Y%m%d')
        up_date(args, date)
        # a table counts as available when one record can be sampled from it
        while 1 != len(list(MapReduce.getSample(sessions, count=1))):
            date = (datetime.datetime.strptime(date, '%Y%m%d') -
                    datetime.timedelta(days=1)).strftime('%Y%m%d')
            # the day before `date` in YYYYMMDD
            up_date(args, date)

        logger.info('The most recent user_sessions table is {}'.format(
            sessions))
    else:
        up_date(args, date)

    if (1 != len(list(MapReduce.getSample(montable, count=1)))
            or args.forcesample):

        # build the temporary (filtered) table unless it is already there
        if 1 != len(list(MapReduce.getSample(tmptable, count=1))):
            if 1 == len(list(MapReduce.getSample(sessions, count=1))):

                logger.info('Will make temporary table from table {}, '
                            'placing it into table {}'.format(
                                sessions, tmptable))

                MapReduce.runMap(loginCheck,
                                 srcTable=sessions, dstTable=tmptable)

            else:
                logger.info('{} doesn\'t exist.'.format(
                    sessions))
                sys.exit()

        if 1 == len(list(MapReduce.getSample(tmptable, count=1))):

            newenv = os.environ.copy()
            newenv["MR_USER"] = "tmp"
            newenv["MR_NET_TABLE"] = "ipv6"

            # collect the distinct logins found in the sampler's output
            for line in subprocess.check_output(shlex.split(
                    MR_SAMPLE.format(
                        args.logins, tmptable)), env=newenv).decode(
                    'utf8', errors='replace').split('\n'):
                if not yandex_login(line) in users:
                    users.append(yandex_login(line))

            # keep every record that belongs to one of the sampled logins
            userqueries = []
            for rec in MapReduce.getSample(tmptable, count=None):
                if yandex_login(rec.value) in users:
                    userqueries.append(Record(rec.key, rec.subkey, rec.value))

            mid = current_timestamp()

            logger.info(
                '{} queries of {} users retrieved from {}'
                ' in {:.2f} seconds. '
                'Will write the queries to table {}'.format(
                    len(userqueries), len(users), tmptable, mid - start,
                    montable))

            MapReduce.updateTable(userqueries, dstTable=montable)

            logger.info('Will drop {}.\n'.format(
                tmptable))

            MapReduce.dropTables([tmptable])

        else:
            logger.info('No geo queries for date {} or unknown error.'
                        .format(date))
            sys.exit()

    if 1 == len(list(MapReduce.getSample(montable, count=1))):
        users = []
        testcases = []
        nullusers = []

        c_start = current_timestamp()

        logger.info(
            'Will retrieve queries from table {}.'.format(montable))

        for rec in MapReduce.getSample(montable, count=None):
            rec = deutf8ify(rec)
            testcases.append(Testcase(
                id=yandex_login(rec.value),
                query=get_query(rec.value),
                debugtext='\t'.join([rec.key, rec.subkey, rec.value])))
            if not yandex_login(rec.value) in users:
                users.append(yandex_login(rec.value))

        c_end = current_timestamp()

        logger.info(
            'Will monitor {} queries of {} users, retrieved from table {} '
            'in {:.2f} seconds.'.format(
                len(testcases), len(users), montable, c_end - c_start))

        found = 0
        notfound = 0
        nonzeroes = 0
        fails = 0

        # The first test case is retried until its request succeeds; the
        # failure counters bound the total number of retries.
        while len(testcases) > 0 and (nonzeroes + fails) < 100 * len(users):
            # Checked outside the try blocks so that the SystemExit raised
            # here can never be swallowed by an exception handler.
            if current_timestamp() - start > 7200:
                logger.info('time exceeds threshold')
                sys.exit(0)

            try:
                check_req = params['check_req'].format(
                    rtmr_check, check_table, md5(testcases[0].id))
                req = requests.get(check_req, timeout=1)
                text, exit_status = (
                    req.content.decode('utf8', errors='replace'),
                    req.status_code)
            except Exception:
                # request errors are counted and the same case is retried
                nonzeroes += 1
                logger.info(traceback.format_exc())
                continue

            if exit_status != 200:
                logger.info('non-zero curl exit code')
                nonzeroes += 1
                continue

            try:
                if testcases[0].query in text:
                    found += 1

                else:
                    logger.info('not found: {}, [{}], debug {}, logline {}'
                                .format(
                                    testcases[0].id,
                                    testcases[0].query,
                                    check_req,
                                    testcases[0].debugtext))
                    # an empty response means there is no data for this user
                    if text == '' and testcases[0].id not in nullusers:
                        nullusers.append(testcases[0].id)
                    notfound += 1

                testcases.pop(0)

            except Exception:
                logger.info(traceback.format_exc())
                fails += 1
                continue

        # both rates fall back to 0 when there is nothing to divide by
        checked = found + notfound
        foundrate = found / checked if checked else 0
        nullusersrate = len(nullusers) / len(users) if users else 0

        logger.info('{} queries of {} users found in geo_requests ({:.1%}),'
                    ' {} queries not found,'
                    ' {} users ({:.1%}) have no data whatsoever,'
                    ' {} non-zero exit codes.'.format(
                        found, len(users), foundrate,
                        notfound, len(nullusers), nullusersrate,
                        nonzeroes))

        if not args.debug:
            logger.info('Pushing to razladki...')
            desc = 'geo_mr_foundrate'
            if args.ystaff:
                desc += '_ystaff'
            if 'suffix' in params:
                desc += params['suffix']
            push_to_razladki(params, desc, foundrate)
            desc = 'geo_mr_nullusers'
            if args.ystaff:
                desc += '_ystaff'
            if 'suffix' in params:
                desc += params['suffix']
            push_to_razladki(params, desc, nullusersrate)

        if len(nullusers) > 0:
            logger.info('Full list of \"null users\": {}'.format(
                ', '.join(nullusers)))

    else:
        logger.info('Unknown error happened,'
                    ' query table sampled for monitoring does not exist.')
        sys.exit(1)

# Run the monitoring only when the file is executed as a script.
if __name__ == "__main__":
    main()
