#!/usr/bin/python

import httplib
from pprint import pprint
import sys,socket,re
import json
import argparse

class LogbrokerException(Exception):
    """Raised when a Logbroker request fails or returns a non-OK status."""

class LogbrokerOffsets(object):
    """Client for the Logbroker ``/pull/offsets`` HTTP handle.

    Requests go to ``<dc>.logbroker.yandex.net`` (or ``logbroker.yandex.net``
    when no dc is given).  The response is read as tab-separated lines with
    six columns: topic-partition, offset, logstart, logsize, lag, owner.
    """

    def __init__(self, timeout=10.0, port=80):
        """Store connection settings; no network activity happens here.

        :param timeout: socket timeout passed to every HTTP connection.
        :param port: TCP port of the Logbroker HTTP endpoint.
        """
        self.timeout = timeout
        self.port = port

    def __get(self, host, url, data=None):
        """Perform a GET request and return the parsed offsets table.

        :param host: host name to connect to.
        :param url: request path including the query string.
        :param data: optional request body handed to ``HTTPConnection.request``.
        :returns: dict as produced by ``__parse_response``.
        :raises LogbrokerException: on transport errors, on any other failure
            while talking to the server, and on a non-200 response status.
        """
        connection = None
        try:
            # Kept for backward compatibility: this changes the process-wide
            # default timeout, in addition to the per-connection one below.
            socket.setdefaulttimeout(self.timeout)
            connection = httplib.HTTPConnection(host=host, port=self.port, timeout=self.timeout)
            connection.request('GET', url, data)
            response = connection.getresponse()
            status = response.status
            raw_data = response.read()
        except httplib.HTTPException as e:
            # Must come before the generic handler, otherwise it is unreachable.
            raise LogbrokerException('transport error, "%s"' % (e,))
        except Exception as e:
            # '%s' formatting avoids unicode(e), which can itself fail on
            # error messages containing non-ASCII bytes.
            raise LogbrokerException('general error, "%s"' % (e,))
        finally:
            if connection is not None:
                connection.close()

        if status != httplib.OK:
            raise LogbrokerException('request error, status %s, response "%s"' % (
                status,
                raw_data
                )
            )

        return self.__parse_response(raw_data)

    def __parse_response(self, data):
        """Turn the tab-separated response body into a dict keyed by topic-partition.

        Lines that do not consist of exactly six tab-separated columns are
        silently skipped.  All values are kept as strings.

        :param data: response body.
        :returns: ``{topicpart: {'offset', 'logstart', 'logsize', 'lag', 'owner'}}``.
        """
        field_names = ('offset', 'logstart', 'logsize', 'lag', 'owner')
        info = {}
        for line in data.splitlines():
            columns = line.split('\t')
            if len(columns) != len(field_names) + 1:
                # Dont care about junk.
                continue
            info[columns[0]] = dict(zip(field_names, columns[1:]))
        return info

    def __get_state(self, ident=None, client=None, logtype=None, dc=''):
        """Fetch offsets for a client and drop topics that were not asked for.

        :param ident: ident to query; used when ``logtype`` is not given.
        :param client: client id to query.
        :param logtype: log type to query; takes precedence over ``ident``.
        :param dc: datacenter prefix of the Logbroker host and of topic names.
        :returns: filtered dict in the ``__parse_response`` format.
        :raises LogbrokerException: if the request fails.
        """
        if logtype:
            selector = 'log-type={}'.format(logtype)
        else:
            selector = 'ident={}'.format(ident)
        url = '/pull/offsets?{filter}&client={client}'.format(filter=selector, client=client)
        host = '{}logbroker.yandex.net'.format('%s.' % dc if dc else '')
        offsets = self.__get(url=url, host=host)

        # Logbroker puts irrelevant topics ("other") into the output
        # if it is unable to find the requested ident or logtype.
        # Filter it out.
        # NOTE(review): the parentheses spell out the precedence the original
        # expression already had, so with a logtype nothing is filtered at
        # all -- confirm whether the dc prefix check should apply there too.
        topic_prefix = 'rt3.{}'.format(str(dc))
        return dict(
            (tp, info) for (tp, info) in offsets.items()
            if logtype or (ident in tp and tp.startswith(topic_prefix))
        )

    def get_owners(self, ident=None, client=None, dc=''):
        """Return the session owners currently holding partitions.

        Partitions whose owner column is ``none`` are skipped.  An owner is
        listed once per partition it holds, so the result may contain repeats.

        :returns: list of owner strings.
        :raises LogbrokerException: if the request fails.
        """
        state = self.__get_state(ident=ident, client=client, dc=dc)
        owners = []
        for data in state.values():
            owner = data['owner'].strip()
            if owner != 'none':
                owners.append(owner)
        return owners

    def get_total_lag(self, ident=None, client=None, dc=''):
        """Return the sum of the ``lag`` column over all matching partitions.

        :raises LogbrokerException: if the request fails.
        :raises ValueError: if a lag value is not an integer.
        """
        state = self.__get_state(ident=ident, client=client, dc=dc)
        return sum(int(data['lag']) for data in state.values())

def write_state_file(owners, path='/tmp/logbroker_session_owners'):
    """Dump the session-owner state to a JSON file.

    The payload is serialized before the file is opened, so a serialization
    failure does not truncate the previously saved state.

    :param owners: JSON-serializable state to store.
    :param path: destination file.
    :returns: True on success, False if serialization or writing failed.
    """
    try:
        payload = json.dumps(owners)
        with open(path, 'w') as state_file:
            state_file.write(payload)
    except (IOError, OSError, TypeError, ValueError):
        return False

    return True

def read_state_file(path='/tmp/logbroker_session_owners'):
    """Load the session-owner history saved by a previous run.

    :param path: file to read.
    :returns: list of ``[owner, count]`` pairs; an empty list when the file
        is missing, unreadable, not valid JSON, or not a JSON list.  Entries
        that are not two-element lists with an integer count are dropped so
        that a damaged file cannot break the caller on every run.
    """
    try:
        with open(path, 'r') as state_file:
            loaded = json.load(state_file)
    except (IOError, OSError, ValueError):
        # Missing or corrupt history simply means "nothing seen before".
        return []

    if not isinstance(loaded, list):
        return []

    state = []
    for entry in loaded:
        if isinstance(entry, (list, tuple)) and len(entry) == 2 and isinstance(entry[1], int):
            state.append(list(entry))
    return state

def print_juggler(code, desc):
    """Print a ``<code>;<desc>`` status line to stdout and terminate the process.

    The process exit status is always 0; the check result is carried by the
    printed line only.

    :param code: status code to report.
    :param desc: human-readable description.
    :raises SystemExit: always.
    """
    # sys.stdout.write produces the same output as the print statement while
    # remaining valid syntax on both Python 2 and Python 3.
    sys.stdout.write('{code};{desc}'.format(code=code, desc=desc) + '\n')
    sys.exit(0)

def get_local_dc(cache_path='/tmp/dc_affinity', timeout=10.0):
    """Determine the datacenter of the local host.

    The cache file is consulted first.  If it is missing or does not hold a
    plausible value, golem (``ro.admin.yandex-team.ru``) is queried for the
    ``short_dc`` column of this host and a valid answer is saved to the cache.

    :param cache_path: file used to cache the detected dc.
    :param timeout: socket timeout for the golem request, so that an
        unresponsive server cannot hang the check.
    :returns: the dc string, or False when it cannot be determined.
    """
    def _is_valid(candidate):
        # A dc name is expected to start with three lowercase letters.
        return bool(candidate) and re.search('^[a-z]{3}', str(candidate)) is not None

    def _read_cache():
        try:
            with open(cache_path) as cache_file:
                return cache_file.read().strip()
        except (IOError, OSError):
            return False

    def _write_cache(value):
        # Best effort: failing to cache only costs an extra request next time.
        try:
            with open(cache_path, 'w') as cache_file:
                cache_file.write(value)
        except (IOError, OSError):
            pass

    def _call_golem():
        connection = None
        try:
            connection = httplib.HTTPConnection(host='ro.admin.yandex-team.ru', timeout=timeout)
            connection.request('GET', '/api/host_query.sbml?hostname={}&columns=short_dc'.format(socket.getfqdn()))
            response = connection.getresponse()
            body = response.read()
            if response.status != httplib.OK:
                # Never treat an error page as a dc name.
                return False
            return body.strip()
        except Exception:
            return False
        finally:
            if connection is not None:
                connection.close()

    # Try reading the cache file; if that yields nothing usable, ask golem.
    cached = _read_cache()
    if _is_valid(cached):
        return cached

    fetched = _call_golem()
    if not _is_valid(fetched):
        return False
    # Save result for the future use.
    _write_cache(fetched)
    return fetched

def _parse_id_pairs(raw_ids):
    """Split ``<ident>:<client>`` strings into ``(ident, client)`` tuples.

    :raises ValueError: if an entry has no colon or an empty half.
    """
    pairs = []
    for raw in raw_ids or []:
        ident, separator, client = raw.partition(':')
        if not separator or not ident or not client:
            raise ValueError('malformed id "{}", expected <ident>:<client>'.format(raw))
        pairs.append((ident, client))
    return pairs


def main(argv=None):
    """Check for hung Logbroker sessions and report the result via print_juggler.

    A session is counted once per run in which it is seen as a partition
    owner.  Once its count reaches the warning threshold it is reported with
    code 1, and with code 2 once it reaches the critical threshold.  Counters
    are persisted between runs through the state file.

    :param argv: argument list; defaults to ``sys.argv[1:]``.
    """
    arg = argparse.ArgumentParser(description="""
            Juggler plugin to check for hung Logbroker sessions.
            """
            )
    arg.add_argument('-c', '--critical', type=int, metavar='<int>', default=30,
            help='Critical threshold: how many times the session can be seen in /offsets before it is considered hung')
    arg.add_argument('-w', '--warning', type=int, metavar='<int>', default=20,
            help='Warning threshold')
    arg.add_argument('-i', '--ids', action='append', required=True, metavar='<ident>:<client>',
            help='Ident and clientid to check. Can be given multiple times.')
    # A plain "store" option: action='append' combined with a string default
    # fails inside argparse as soon as -d is actually passed.
    arg.add_argument('-d', '--dc', metavar='<dc>', default=None,
            help='Override dc affinity. Default is autodetect.')

    settings = vars(arg.parse_args(argv))

    # Autodetect only when no override is given, so that --help and -d do not
    # trigger file or network access.
    dc = settings.get('dc') or get_local_dc()

    # Cannot continue if local dc is not known
    if not dc:
        print_juggler(1, 'unable to determine dc affiliation')

    # Report malformed ids as a check result instead of a traceback.
    try:
        ids = _parse_id_pairs(settings.get('ids'))
    except ValueError as e:
        print_juggler(1, 'exception: {}'.format(e))
    # Cannot continue if no clientid or ident is provided either.
    if not ids:
        print_juggler(1, 'no client ids are given')

    critical = settings['critical']
    # Warning cannot be greater than the critical.
    warning = min(settings['warning'], critical)

    # Read history of session owners.
    # The history is a list of lists, each containing a session id and a counter
    # which reflects how many checks it was present in the output.
    # history = [
    #         ['pull--so-report-reader--kafka37g--7b08be64-92d7-4427-8f51-c87f49f6877a', 2],
    #         ['pull--so-report-reader--kafka04g--7705c237-d83d-4d21-9041-57ad342889ae', 1],
    #         ...
    #         ]
    seen_counts = {}
    try:
        for (seen_owner, history_count) in read_state_file():
            # Session ids are supposed to be unique; the first entry wins.
            seen_counts.setdefault(seen_owner, int(history_count))
    except (TypeError, ValueError):
        # A malformed history is treated as no history at all.
        seen_counts = {}

    # Present state structure mimics the history. It will later be dumped to file.
    present_state = []
    hung = []
    code = 0

    # Create logbroker object.
    lb = LogbrokerOffsets()

    for (ident, client) in ids:
        try:
            # owners = ['pull--so-report-reader--kafka37g--7b08be64-92d7-4427-8f51-c87f49f6877a', 'pull--...', ... ]
            owners = lb.get_owners(ident=ident, client=client, dc=dc)
        except Exception as e:
            print_juggler(1, 'exception: {}'.format(e))

        # get_owners yields an owner once per partition it holds; count and
        # report every session only once per ident/client pair.
        handled = set()
        for owner in owners:
            if owner in handled:
                continue
            handled.add(owner)

            count = 1
            # If this owner is present in the history, increment its count
            # and check if it has reached the limit.
            if owner in seen_counts:
                count += seen_counts[owner]
                if count >= warning:
                    hung.append('{ses}:{id},{cl}'.format(ses=owner[-6:], id=ident, cl=client))
                    # Never let a later warning downgrade an earlier critical.
                    code = max(code, 1 if count < critical else 2)

            present_state.append([owner, count])

    # Saving the state is best effort: a failure must not hide the result.
    try:
        write_state_file(present_state)
    except Exception:
        pass

    text = 'ok'
    if hung:
        text = '{count} -- {sessions}'.format(count=len(hung), sessions=' '.join(hung))
    print_juggler(code, text)


if __name__ == '__main__':
    main()
