#!/usr/bin/env python
#! -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
import json
import logging
import sys
import os
import pdb
import dill
import pickle
import arrow
import datetime
import codecs
import argparse
import requests
import subprocess
import shlex
import re
from collections import defaultdict, Counter
import smtplib
import urllib

# SMTP relay used by send_email().
SERVER = "localhost"
# Logins that receive reports; send_email() turns them into full addresses.
RECIPIENTS = ['riddle', 'pecheny', 'qkrorlqr', 'manokk']
# Everyone in RECIPIENTS except 'riddle'. Derived from RECIPIENTS (the
# previous expression referenced an undefined lowercase `recipients` and
# raised NameError at import time) and kept in RECIPIENTS order.
NORIDDLE = [_login for _login in RECIPIENTS if _login != 'riddle']
# Default e-mail subject when the caller does not pass one.
SUBJECT = "Atom distribution snapshots rollout monitoring"
# Envelope and header sender for outgoing e-mail.
FROM = "atom-rollout@avatar.search.yandex.net"
# Sink for subprocess output that should be discarded.
DEVNULL = open(os.devnull, 'w')
# Module-wide logger; assigned in main().
logger = None


def delete_first(string):
    """Drop the first space-separated token and trim the remainder.

    Everything up to and including the first ' ' is discarded, then leading
    and trailing whitespace is removed from what is left. A string without
    any space yields an empty string.
    """
    _, _, remainder = string.partition(' ')
    without_leading = re.sub(r'^\s+', r'', remainder)
    return re.sub(r'\s+$', r'', without_leading)


def get_files_by_prefix(prefix, directory=None):
    """List the entries of `directory` whose names start with `prefix`.

    A falsy `directory` means the current working directory. Names are
    returned in the order os.listdir() yields them.
    """
    target = directory or os.getcwd()
    matches = []
    for entry in os.listdir(target):
        if entry.startswith(prefix):
            matches.append(entry)
    return matches


def count_errors_in_file(filename):
    """Count the lines of a UTF-8 text file that contain the substring 'error'."""
    with codecs.open(filename, 'r', 'utf8') as handle:
        return sum(1 for row in handle if 'error' in row)


def tots(x):
    """Convert a moment to an integer unix timestamp.

    A `datetime.datetime` is measured against 1970-01-01 (naive) and
    truncated to whole seconds; an arrow `Arrow` returns its `timestamp`
    attribute. Any other type gives None.
    """
    if isinstance(x, datetime.datetime):
        epoch = datetime.datetime(1970, 1, 1)
        elapsed = x - epoch
        return int(elapsed.total_seconds())
    if isinstance(x, arrow.arrow.Arrow):
        return x.timestamp
    return None


def tryint(x):
    """Return int(x), or -1 when the conversion raises ValueError."""
    try:
        value = int(x)
    except ValueError:
        return -1
    else:
        return value


def makereport_errors(errors, hosts):
    """Render a 'Details' section with one line per error message.

    `errors` maps an error message to the collection of hosts it was seen
    on; each line shows the message, how many of `hosts` are affected and
    the affected host names.
    """
    rows = []
    for error in errors:
        affected = errors[error]
        rows.append('    `{}` on {}/{} hosts: {}'.format(
            error, len(affected), len(hosts), ', '.join(affected)))
    return '\nDetails:\n\n' + '\n'.join(rows)


def makereport(errors, errorscounter,
               exceptions, exceptionscounter, hosts):
    """Build the report body: an errors section, a blank line, an exceptions section.

    A section is rendered with makereport_errors() only when its counter is
    positive; otherwise it is an empty string.
    """
    errors_part = ''
    if errorscounter > 0:
        errors_part = makereport_errors(errors, hosts)
    exceptions_part = ''
    if exceptionscounter > 0:
        exceptions_part = makereport_errors(exceptions, hosts)
    return '{}\n\n{}'.format(errors_part, exceptions_part)


def send_sms(recipients, message):
    """Send `message` as an SMS to `recipients` through the golem HTTP API.

    recipients: iterable of logins, joined with commas into the `resps`
        query parameter.
    message: text of the SMS (unicode or already-encoded bytes).

    If the response body contains 'slow down', a warning e-mail is sent via
    send_email().
    """
    # urllib.quote() must receive UTF-8 bytes: quoting a non-ASCII unicode
    # string fails on Python 2. The previous encode().decode() round trip
    # handed the unicode string straight back to quote().
    if not isinstance(message, bytes):
        message = message.encode('utf8')
    url = ('https://golem.yandex-team.ru/api/sms/send.sbml?resps={}&msg={}'
           .format(','.join(recipients), urllib.quote(message)))
    r = requests.get(url)
    # Compare text with text: r.content is raw bytes, and mixing it with a
    # unicode literal can raise UnicodeDecodeError on a non-ASCII response.
    if 'slow down' in r.text:
        send_email(['pecheny'], 'you are ddosing golem')


def generate_sms(exceptionscounter, exceptions, hosts):
    """Summarise exceptions in one short line suitable for an SMS.

    `exceptions` maps a message to the hosts it occurred on; the summary
    reports the given counter and how many distinct hosts are affected out
    of `hosts`.
    """
    affected = set()
    for hostset in exceptions.values():
        affected.update(hostset)
    return '{} exceptions on {}/{} hosts'.format(
        exceptionscounter, len(affected), len(hosts))


def send_email(recipients, message, subject=None):
    """Send a plain-text e-mail through the SMTP server named by SERVER.

    recipients: iterable of logins; '@yandex-team.ru' is appended to each.
    message: body text.
    subject: subject line; SUBJECT is used when it is empty or None.

    Exceptions from smtplib propagate to the caller.
    """
    if not subject:
        subject = SUBJECT
    recipients = ['{}@yandex-team.ru'.format(rec) for rec in recipients]
    body = """\
From: {}
To: {}
Subject: {}

{}
""".format(FROM, ", ".join(recipients), subject, message).encode('utf8')
    server = smtplib.SMTP(SERVER)
    try:
        server.sendmail(FROM, recipients, body)
        server.quit()
    finally:
        # Release the socket even when sendmail()/quit() raised; close() is
        # harmless after a successful quit() and does not mask the error.
        server.close()


def get_state():
    """Read the run state from the 'state' file in the working directory.

    Returns the file contents with trailing whitespace removed, or None
    when the file does not exist.
    """
    if not os.path.isfile('state'):
        return None
    with codecs.open('state', 'r', 'utf8') as handle:
        contents = handle.read()
    return contents.rstrip()


def set_state(state):
    """Overwrite the 'state' file in the working directory with `state` (UTF-8)."""
    handle = codecs.open('state', 'w', 'utf8')
    try:
        handle.write(state)
    finally:
        handle.close()


def get_last_confirmed_version():
    """Load the timestamp stored in the 'last_confirmed_version' file.

    Returns the value parsed by arrow, or None when the file is missing.
    """
    if not os.path.isfile('last_confirmed_version'):
        return None
    with open('last_confirmed_version', 'r') as source:
        stored = source.read()
    # NOTE(review): the parse format has no timezone token, and the matching
    # setter writes format(ts) - confirm the two formats round-trip with the
    # installed arrow version.
    return arrow.get(stored + b'+03:00', 'YYYY-MM-DD HH:mm:ss')


def set_last_confirmed_version(ts):
    """Overwrite the 'last_confirmed_version' file with format(ts)."""
    with open('last_confirmed_version', 'w') as sink:
        rendered = format(ts)
        sink.write(rendered)


def get_last_version():
    """Load the timestamp stored in the 'last_version' file.

    Returns the value parsed by arrow, or None when the file is missing.
    """
    if not os.path.isfile('last_version'):
        return None
    with open('last_version', 'r') as source:
        stored = source.read()
    # NOTE(review): the parse format has no timezone token, and the matching
    # setter writes format(ts) - confirm the two formats round-trip with the
    # installed arrow version.
    return arrow.get(stored + b'+03:00', 'YYYY-MM-DD HH:mm:ss')


def set_last_version(ts):
    """Overwrite the 'last_version' file with format(ts)."""
    with open('last_version', 'w') as sink:
        rendered = format(ts)
        sink.write(rendered)


def _is_recent(ts, window=600):
    """Tell whether unix timestamp `ts` is less than `window` seconds old.

    Returns False when the value is rejected with ValueError, which keeps
    the log scan best-effort for malformed timestamps.
    """
    try:
        moment = arrow.get(int(ts)).to('Europe/Moscow')
    except ValueError:
        return False
    age = arrow.now().to('Europe/Moscow') - moment
    return age.total_seconds() < window


def main():
    """Run one monitoring pass over the production atom search hosts.

    Steps, in order:
      * parse --debug / --pdb and configure console and per-run file logging;
      * exit at once if the 'state' file says 'busy', otherwise mark it busy;
      * rsync the 'current-rerankd_updater-*' logs from every host printed
        by 'sky list I@production_atom_search';
      * scan the not-yet-seen lines of each log whose name ends in '7300'
        for TSystemError lines, exception lines and version update lines;
      * persist the per-file scan position and the collected messages;
      * send e-mail / SMS about unavailable hosts, recent errors and the
        state of the version rollout;
      * mark the state 'idle' again (also when the pass fails).

    With --pdb a debugger session is opened at the very end so that the
    collected locals can be inspected.
    """
    global logger
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--pdb', action='store_true')
    args = parser.parse_args()
    cwd = os.path.dirname(os.path.abspath(__file__))
    _file_ = os.path.abspath(__file__)
    start = arrow.now().to('Europe/Moscow')
    # State files, pickles and per-host directories are all relative paths,
    # so work from the script's own directory.
    os.chdir(cwd)

    logger = logging.getLogger(_file_[:-3])
    formatter = logging.Formatter('%(asctime)s | %(message)s')
    ch = logging.StreamHandler()
    logger.setLevel(logging.DEBUG)
    # The console only shows CRITICAL unless --debug is given; the log file
    # below always receives everything.
    if args.debug:
        ch.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.CRITICAL)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    if not os.path.isdir('logs'):
        os.mkdir('logs')
    fh = logging.FileHandler('{}/logs/{}_{}.log'.format(
        os.path.dirname(_file_), os.path.basename(_file_)[:-3],
        start.format('YYYY-MM-DD_HH-mm-ss')),
        encoding='utf8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    state = get_state()
    if state == 'busy':
        sys.exit(0)
    set_state('busy')
    try:
        hosts = (subprocess.check_output(
            'sky list I@production_atom_search', shell=True)
            .decode('utf8', errors='replace')
            .split('\n')[:-1])
        errors = defaultdict(lambda: set())
        lasterrors = defaultdict(lambda: set())
        exceptions = defaultdict(lambda: set())
        lastexceptions = defaultdict(lambda: set())
        lastupdated = defaultdict(lambda: set())
        versions = defaultdict(lambda: set())
        lastchecked = defaultdict(lambda: defaultdict(lambda: 0))
        profiling = []
        profiling1 = []
        profiling2 = []
        profiling3 = []
        errorscounter = 0
        exceptionscounter = 0
        if os.path.isfile('lastchecked.pkl'):
            with open('lastchecked.pkl', 'rb') as f:
                stored = dill.load(f)
            # The pickle holds a plain dict; merge it into the defaultdict so
            # that a host absent from the previous run does not raise KeyError.
            for stored_host, per_file in stored.items():
                lastchecked[stored_host].update(per_file)

        unavailable = set()
        for host in hosts:
            logger.info('checking {}...'.format(host))
            if not os.path.isdir(host):
                os.mkdir(host)
            os.chdir(host)
            bgn = arrow.now().to('Europe/Moscow')
            status = subprocess.call(
                "rsync -az 'rsync://{}/logs/rerankd/current-rerankd_updater-*' ."
                .format(host),
                shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT)
            if status:
                logger.info('{} is unavailable'.format(host))
                unavailable.add(host)
                os.chdir(cwd)
                continue
            profiling.append((arrow.now().to('Europe/Moscow')
                              - bgn).total_seconds())
            filenames = get_files_by_prefix('current-rerankd_updater')
            if len(filenames) != 2:
                errors['number of logfiles is {} and not 2: {}'
                       .format(len(filenames), ', '.join(filenames))].add(host)
                errorscounter += 1

            for filename in filenames:
                if not filename.endswith('7300'):
                    continue
                logger.info('    processing {}...'.format(filename))
                bgn = arrow.now().to('Europe/Moscow')
                # Epoch 0 stands for "no version line seen in the new lines".
                uts = arrow.get(0).to('Europe/Moscow')
                # Older pickles may hold the position as a string; normalise
                # it so the numeric comparison below stays meaningful.
                last_seen = tryint(lastchecked[host][filename])
                newest_ts = last_seen
                with codecs.open(filename, 'r', 'utf8') as f:
                    for line in f:
                        ts = tryint(line.split('\t')[0])
                        # Unparsable timestamps come back as -1 and are
                        # skipped together with already-seen lines.
                        if ts <= last_seen:
                            continue
                        newest_ts = max(newest_ts, ts)
                        if 'TSystemError' in line:
                            errorscounter += 1
                            errors[delete_first(line)].add(host)
                            if _is_recent(ts):
                                lasterrors[delete_first(line)].add(host)
                        if ('exception' in line and 'rtmr' not in line
                                and 'temp_buf' not in line):
                            exceptionscounter += 1
                            exceptions[delete_first(line)].add(host)
                            logger.info('exception {} on line {}'
                                        .format(delete_first(line),
                                                line.rstrip()))
                            if _is_recent(ts):
                                lastexceptions[delete_first(line)].add(host)
                        if ('updated, version:' in line
                                and 'atom-candidates' in line
                                and re.search(r'[0-9]{10}',
                                              line.split()[-3])):
                            raw_uts = line.split('\t')[-1].split(' @ ')[0]
                            logger.info('uts {} from line {}'.format(
                                raw_uts, line.rstrip()))
                            try:
                                # Only replace uts on success so that the keys
                                # of `versions` are always arrow objects.
                                uts = arrow.get(int(raw_uts)).to(
                                    'Europe/Moscow')
                            except ValueError:
                                logger.error('error on timestamp {}'
                                             .format(raw_uts))
                # Store the newest valid timestamp actually seen; this is
                # always an int and is defined even for an empty file.
                lastchecked[host][filename] = newest_ts
                profiling1.append((arrow.now().to('Europe/Moscow')
                                   - bgn).total_seconds())
                versions[uts].add(host)
                logger.info('version {} on host {}'.format(uts, host))
            os.chdir(cwd)

        with open('lastchecked.pkl', 'wb') as f:
            dill.dump(dict(lastchecked), f)
        olderrors = {}
        oldexceptions = {}
        if os.path.isfile('errors.pkl'):
            with open('errors.pkl', 'rb') as f:
                olderrors = pickle.load(f)
        if os.path.isfile('exceptions.pkl'):
            with open('exceptions.pkl', 'rb') as f:
                oldexceptions = pickle.load(f)

        # Messages not present in the previous run. The reporting below does
        # not use them (the check that did is disabled); they remain
        # available as locals for the --pdb session.
        differrors = {}
        for error in errors:
            if error not in olderrors:
                differrors[error] = errors[error]
        diffexceptions = {}
        for error in exceptions:
            if error not in oldexceptions:
                diffexceptions[error] = exceptions[error]

        if unavailable:
            report = 'The following hosts were not available: {}'.format(
                ', '.join(unavailable))
            logger.info(report)
            send_email(NORIDDLE, report,
                       subject='Atom rollout: unavailable hosts')
        if lasterrors or lastexceptions:
            report = makereport(lasterrors, errorscounter,
                                lastexceptions, exceptionscounter, hosts)
            logger.info(report)
            send_email(RECIPIENTS, report)
            if lastexceptions and 'atom-candidates' in report:
                send_sms(RECIPIENTS,
                         generate_sms(len(lastexceptions), lastexceptions,
                                      hosts))

        # Without any version (e.g. every host was unavailable) max()/min()
        # would raise ValueError, so the rollout reports are skipped.
        if versions:
            newest = max(versions)
            oldest = min(versions)
            newest_age = (arrow.now().to('Europe/Moscow')
                          - newest).total_seconds()
            if newest_age <= 600:
                report = 'Updated in last 10 minutes. Distribution: \n\n{}'.format(
                    '\n'.join(['{} ({}) on {}/{} hosts'.format(
                        tots(x), x, len(versions[x]), len(hosts)
                    )
                        for x in sorted(versions, reverse=True)]))
                logger.info(report)
                send_email(RECIPIENTS, report,
                           subject='Atom rollout in progress')
            last_confirmed_version = get_last_confirmed_version()
            # A missing file means nothing has been confirmed yet.
            if ((last_confirmed_version is None
                 or newest > last_confirmed_version)
                    and len(versions[newest]) == len(hosts)):
                report = 'Version {} ({}) is now on all {} hosts.'.format(
                    tots(newest), newest, len(hosts))
                logger.info(report)
                send_email(RECIPIENTS, report, subject='Atom rollout success')
                set_last_confirmed_version(newest)
            if newest != oldest and newest_age > 3600:
                report = ('Rollout problems: difference between most and least '
                          'recent hosts is more than an hour. Distribution: \n\n{}'.format(
                              '\n'.join(['{} ({}) on {}/{} hosts: {}'.format(
                                  tots(x), x, len(versions[x]), len(hosts), ', '.join(
                                      versions[x])
                              )
                                  for x in sorted(versions, reverse=True)])))
                logger.info(report)
                send_email(RECIPIENTS, report, subject='Atom rollout problems')
                # Wording aligned with the 3600 second threshold and with the
                # e-mail above (it used to say "30 minutes").
                send_sms(NORIDDLE, 'Rollout problems: difference between most and least '
                         'recent hosts is more than an hour')
        with open('errors.pkl', 'wb') as f:
            pickle.dump(dict(errors), f)
        with open('exceptions.pkl', 'wb') as f:
            pickle.dump(dict(exceptions), f)
        logger.info('monitoring took {}'.format(arrow.now() - start))
    finally:
        # Always release the 'busy' marker, otherwise one failed pass makes
        # every later run exit immediately. Return to the script directory
        # first: a failure inside the host loop leaves us in a host folder.
        os.chdir(cwd)
        set_state('idle')
    if args.pdb:
        pdb.set_trace()
# Run a single monitoring pass when the file is executed as a script.
if __name__ == "__main__":
    main()
