#!/usr/bin/python2
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import os, os.path, sys, argparse, json, tarfile, urllib2
from hashlib import md5
from traceback import format_exception

# Number of attempts doRequest() makes for one URL before giving up.
RETRY_COUNT = 3
# URL template; filled with (resource_id, file name inside the resource).
SANDBOX_URL = "https://proxy.sandbox.yandex-team.ru/%s/%s"
# File names requested from a resource, indexed by file kind ("json" / "binary")
# and then by auxiliary model type ("vw" / "dssm").
FILES = {
    "json":   {"vw": "vw.json", "dssm": "model.json"},
    "binary": {"vw": "vw.bin", "dssm": "model.dssm"}
}

def get_traceback():
    """Format the exception reported by ``sys.exc_info()`` as indented text.

    Intended to be called from inside an ``except`` block.

    Returns:
        A string in which every chunk produced by
        ``traceback.format_exception`` is stripped, prefixed with a tab and
        terminated with a newline.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    tb = ''
    for step in format_exception(exc_type, exc_value, exc_traceback):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Best effort: a chunk that cannot be appended is skipped so that
            # error reporting itself never fails. Unlike a bare ``except:``,
            # KeyboardInterrupt and SystemExit are not swallowed.
            continue
    return tb

def calcMD5forFile(file_path):
    """Return the hexadecimal MD5 digest of a file's contents.

    Args:
        file_path: path of the file to hash.

    Returns:
        The hex digest string, or "" when the file does not exist or is not
        larger than one byte.
    """
    if not os.path.exists(file_path) or os.stat(file_path).st_size <= 1:
        return ""
    m = md5()
    # Binary mode: the hashed files are model binaries, so no newline
    # translation or decoding must be applied. Reading in chunks keeps memory
    # usage flat, and ``with`` closes the file even if reading fails.
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            m.update(chunk)
    return m.hexdigest()

def doRequest(url, prompt):
    """Fetch ``url`` and return the response body, retrying on failure.

    Up to RETRY_COUNT attempts are made. Every failed attempt is reported to
    stderr; no exception raised by the request itself is propagated.

    Args:
        url: URL to request.
        prompt: human-readable description of the request, used in messages.

    Returns:
        The response body of the first attempt answered with HTTP 200, or ""
        when all attempts failed.
    """
    for attempt in range(RETRY_COUNT):
        r = f = None
        try:
            r = urllib2.Request(url=url)
            f = urllib2.urlopen(r)
            if f:
                if f.getcode() == 200:
                    return f.read()
                print >>sys.stderr, 'Request {0}: response HTTP code={1}, info: {2}'.format(prompt, f.getcode(), f.info())
            else:
                # Attempt numbers are reported 1-based, as in the messages below.
                print >>sys.stderr, 'Request #%d for %s: response is empty!' % (attempt + 1, prompt)
        except Exception as e:
            message = str(e)
            if 'Name or service not known' in message or 'No address associated with hostname' in message:
                # Name resolution problem: a short message without traceback.
                # The attempt number goes first; the previous code evaluated
                # ``prompt + 1`` (str + int), which raised TypeError here.
                print >>sys.stderr, "Attempt #%s of %s failed: %s" % (attempt + 1, prompt, message)
            else:
                serr = ''
                if hasattr(e, 'code'):
                    serr = ' (code=%s)' % e.code
                info = 'Info: {0}. '.format(f.info()) if f else ''
                method = "{0} ".format(r.get_method()) if r else ''
                print >>sys.stderr, "%s HTTP %srequest (attempt #%s) failed%s: '%s'. URL: %s.%s\t%s" % (prompt, method, attempt + 1, serr, message, url, info, get_traceback())
        finally:
            # Release the connection of this attempt before retrying/returning.
            if f is not None:
                f.close()
    return ""

def downloadFileFromSandBox(resource_id, resource_file, output_file):
    """Download one file of a Sandbox resource and store it locally.

    Args:
        resource_id: Sandbox resource ID.
        resource_file: name of the file inside the resource.
        output_file: local path the downloaded content is written to.

    Returns:
        True when non-empty content was downloaded and written, False
        otherwise (missing arguments, empty response or any error; errors are
        reported to stderr).
    """
    if not resource_file or not resource_id:
        return False
    try:
        content = doRequest(SANDBOX_URL % (resource_id, resource_file), "Retrieving formula's description file '%s' from SandBox" % resource_file)
        if content:
            # Binary mode: the payload may be a model binary and must be stored
            # byte-for-byte. ``with`` closes the file even if the write fails.
            with open(output_file, 'wb') as f:
                f.write(content)
            return True
    except Exception as e:
        print >>sys.stderr, "Downloading file '%s' for resource #%s from SandBox failed: %s.%s" % (resource_file, resource_id, str(e), get_traceback())
    return False

def loadInfo(info_file, file_type='json'):
    """Read an info file and return its parsed content.

    Args:
        info_file: path of the file to read.
        file_type: 'json' parses the text between the first '{' and the last
            '}' as JSON; 'text' returns the stripped file content as is. Any
            other value leaves the result an empty dict.

    Returns:
        A ``(success, info)`` tuple. ``success`` is False when reading or
        parsing failed (the error is reported to stderr); ``info`` is then
        an empty dict.
    """
    info, success, content = {}, True, ""
    try:
        # ``with`` guarantees the file is closed even when parsing below fails.
        with open(info_file) as f:
            content = f.read().strip()
        if file_type == 'json':
            info = json.loads(content[content.find('{'):content.rfind('}')+1])
        elif file_type == 'text':
            info = content
    except Exception as e:
        success = False
        print >>sys.stderr, "Loading models info from file '%s' failed: %s.%s" % (info_file, str(e), get_traceback())
        # Short files are echoed to help diagnose what was actually received.
        if len(content) < 2048:
            print >>sys.stderr, "Content of file %s: %s" % (info_file, content)
    return success, info

def appendFile(files, file_path):
    """Register an existing file in ``files``; terminate if it is missing.

    Args:
        files: list of already registered paths, extended in place.
        file_path: path to register; added only if not already in the list.

    Exits the process with status 1 when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        print >>sys.stderr, "File '%s' has not been created!" % file_path
        sys.exit(1)
    if file_path in files:
        return
    files.append(file_path)

def redownloadModelFromSandBox(model_info):
    """Re-download a model binary until its MD5 matches the expected one.

    Does nothing unless ``model_info`` has non-empty 'resource_id',
    'json_file' and 'binary_file' entries. Otherwise makes up to 10 download
    attempts; after every successful download ``model_info['md5']`` is
    refreshed, and the function returns as soon as it equals
    ``model_info['binary_md5']``.
    """
    mandatory = ('resource_id', 'json_file', 'binary_file')
    if not all(model_info.get(name) for name in mandatory):
        return
    target = model_info['binary_file']
    attempts_left = 10
    while attempts_left > 0:
        attempts_left -= 1
        fetched = downloadFileFromSandBox(model_info['resource_id'], model_info.get('bin_name', ''), target)
        if not fetched:
            continue
        model_info['md5'] = calcMD5forFile(target)
        if model_info['md5'] == model_info.get('binary_md5', ''):
            return

def gatherAuxModelInfo(aux_resource_id, AUX_FILES):
    """Fetch the description and binary of an auxiliary model and detect its type.

    The model types 'vw' and 'dssm' are tried in that order. For each type the
    files ./<type>_<id>.json and ./<type>_<id>.bin are reused when already
    present locally, otherwise downloaded from Sandbox. The first type for
    which both files are available wins.

    Args:
        aux_resource_id: Sandbox resource ID of the model.
        AUX_FILES: list of bundle files; the model's files are appended to it.

    Returns:
        A dict with the content of the model's JSON description plus the keys
        'json_name', 'bin_name', 'binary_file', 'md5', 'model_type',
        'resource_id' and, when the JSON file exists, 'json_file'.
        An empty dict is returned when the loop finishes without a usable
        model although the type flag was set.

    Exits the process with status 1 when the type cannot be determined or the
    binary's MD5 still differs from 'binary_md5' after re-downloading.
    """
    # NOTE(review): this flag is not reset between loop iterations, so a JSON
    # file downloaded for 'vw' keeps it True even if no binary is found for
    # either type; the function then returns {} instead of exiting - confirm
    # that this is intended.
    model_type_determined = False
    for aux_model_type in ['vw', 'dssm']:
        aux_model_info = {'json_name': '', 'bin_name': ''}
        json_file_name = "%s_%s.json" % (aux_model_type, aux_resource_id)
        JSON_FILE = "./%s" % json_file_name
        # Download the description only if there is no usable local copy
        # (missing or smaller than 10 bytes).
        if (not os.path.exists(JSON_FILE) or os.path.exists(JSON_FILE) and os.stat(JSON_FILE).st_size < 10):
            if downloadFileFromSandBox(aux_resource_id, FILES["json"][aux_model_type], JSON_FILE):
                if loadInfo(JSON_FILE)[0]:
                    model_type_determined = True
                    aux_model_info['json_name'] = FILES["json"][aux_model_type]
                else:
                    # The downloaded file is not valid JSON: remove it.
                    print >>sys.stderr, "Downloaded %s file is not JSON => model #%s is not %s." % (FILES["json"][aux_model_type], aux_resource_id, aux_model_type.upper())
                    os.unlink(JSON_FILE)
        # A file of exactly 10 bytes is neither re-downloaded above nor
        # accepted here (checks are "< 10" and "> 10").
        if os.path.exists(JSON_FILE) and os.stat(JSON_FILE).st_size > 10:
            aux_model_info.update(loadInfo(JSON_FILE)[1])
        else:
            print >>sys.stderr, "Unable find JSON-file with model's description => model #%s is not %s." % (aux_resource_id, aux_model_type.upper())
            continue
        bin_file_name = "%s_%s.bin" % (aux_model_type, aux_resource_id)
        BIN_FILE = "./%s" % bin_file_name
        # Same reuse-or-download policy for the model binary.
        if (not os.path.exists(BIN_FILE) or os.path.exists(BIN_FILE) and os.stat(BIN_FILE).st_size < 10):
            downloadFileFromSandBox(aux_resource_id, FILES["binary"][aux_model_type], BIN_FILE)
        if os.path.exists(BIN_FILE) and os.stat(BIN_FILE).st_size > 10:
            model_type_determined = True
            aux_model_info['bin_name'] = FILES["binary"][aux_model_type]
        else:
            print >>sys.stderr, "Unable find binary model's file => model #%s is not %s." % (aux_resource_id, aux_model_type.upper())
            continue
        # Always true here: the flag was set just above when the binary was found.
        if model_type_determined:
            appendFile(AUX_FILES, BIN_FILE)
            aux_model_info['binary_file'] = bin_file_name
            aux_model_info['md5'] = calcMD5forFile(BIN_FILE)
            if os.path.exists(JSON_FILE) and os.stat(JSON_FILE).st_size > 10:
                appendFile(AUX_FILES, JSON_FILE)
                aux_model_info['json_file'] = json_file_name
            aux_model_info['model_type'] = aux_model_type
            aux_model_info['resource_id'] = aux_resource_id
            # 'binary_md5' can only come from the JSON description merged in
            # above; the check is skipped when it is absent or empty.
            if aux_model_info.get('binary_md5', '') and aux_model_info['md5'] != aux_model_info.get('binary_md5', ''):
                print >>sys.stderr, "MD5 sum for model '%s' is not matched! We attempt to download this model from SandBox again." % aux_resource_id
                redownloadModelFromSandBox(aux_model_info)
                if aux_model_info['md5'] != aux_model_info.get('binary_md5', ''):
                    print >>sys.stderr, "MD5 sum for model '%s' is not matched after attempts to download the model! Exiting." % aux_resource_id
                    sys.exit(1)
            return aux_model_info
    if not model_type_determined:
        print >>sys.stderr, "Undetermined type of model #%s! Exiting." % aux_resource_id
        sys.exit(1)
    return {}

def writeJSON2file(file_path, data):
    """Serialize ``data`` as JSON into ``file_path``.

    Failures are reported to stderr and not propagated.

    Args:
        file_path: path of the file to (over)write.
        data: JSON-serializable object.
    """
    try:
        # ``with`` closes the file even when serialization or writing fails.
        with open(file_path, 'w') as f:
            f.write(json.dumps(data))
    except Exception as e:
        print >>sys.stderr, "Writing file '%s' failed: %s.%s" % (file_path, str(e), get_traceback())

def createBundle(INPUT_FILES, OUTPUT_TARBALL):
    """Pack the given files into a gzip-compressed tarball.

    Args:
        INPUT_FILES: iterable of paths to add to the archive.
        OUTPUT_TARBALL: path of the archive to create.

    Exits the process with status 1 (after reporting to stderr) when the
    archive cannot be created.
    """
    try:
        tarball = tarfile.open(OUTPUT_TARBALL, 'w:gz')
        try:
            for file_path in INPUT_FILES:
                tarball.add(file_path)
        finally:
            # Close the archive even if adding one of the files failed.
            tarball.close()
    except Exception as e:
        print >>sys.stderr, "Creating archive-file '%s' with models failed: %s.%s" % (OUTPUT_TARBALL, str(e), get_traceback())
        sys.exit(1)

def prepareAuxModel(i, model_info, THRESHOLDS, FEATURES, SLOTS):
    """Enrich ``model_info`` with per-model options and build its key.

    Args:
        i: position of the model; selects the i-th entry of the option lists.
        model_info: model description dict with a 'resource_id' entry;
            updated in place ('slot', 'threshold', 'features').
        THRESHOLDS: list of thresholds (converted with ``float``).
        FEATURES: list of comma-separated feature name strings.
        SLOTS: list of slot names.

    Returns:
        The key "<resource_id>[_<slot>][_<threshold>]".
    """
    key_parts = [str(model_info['resource_id'])]

    slot = SLOTS[i].strip() if i < len(SLOTS) else ''
    if slot:
        model_info['slot'] = slot
        key_parts.append("%s" % slot)

    if i < len(THRESHOLDS):
        model_info['threshold'] = float(THRESHOLDS[i])
        key_parts.append("%s" % model_info['threshold'])

    # Features already present in the description are indexed by position.
    indexed = {}
    if 'features' in model_info:
        for position, name in enumerate(model_info['features']):
            indexed[position] = name

    if i < len(FEATURES) and FEATURES[i].strip():
        # Extra features continue the numbering; the index advances for
        # skipped (empty or duplicate) names as well.
        position = len(indexed)
        for raw_name in FEATURES[i].split(','):
            name = raw_name.strip()
            if name and name not in indexed.values():
                indexed[position] = name
            position += 1

    model_info['features'] = dict(indexed)
    return "_".join(key_parts)

# Script entry point: gathers the requested auxiliary models, writes their
# descriptions as JSON and packs all model files into one tarball.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--aux_model_type',       type = str, help = "Auxiliary model's type")
    parser.add_argument('-i', '--sandbox_resource_ids', nargs='*',  help = "Sandbox resources IDs for auxiliary models files")
    parser.add_argument('-s', '--slots',                nargs='*',  help = "Corresponding slots of the models")
    parser.add_argument('-t', '--thresholds',           nargs='*',  help = "Auxiliary models custom thresholds")
    parser.add_argument('-f', '--features',             nargs='*',  help = "Auxiliary models' features, separated by comma")
    parser.add_argument('-m', '--models_info',          nargs='*',  help = "Optional models info in JSON-files")
    parser.add_argument('-n', '--slots_info',           nargs='*',  help = "Optional ordered list of slots according to models info in JSON-files order")
    parser.add_argument('-r', '--route',                type = str, help = "The type of mail for which the auxiliary model is calculated")
    parser.add_argument('-o', '--output_bundle',        type = str, help = "Output tarball with auxiliary models bundle")
    parser.add_argument('-j', '--output_models_json',   type = str, help = "Output JSON-info for auxiliary models in bundle")
    parser.add_argument('-d', '--output_descr_json',    type = str, help = "Output JSON-description for uploading auxiliary models bundle in SandBox")
    # Unknown command-line arguments are ignored rather than rejected.
    args = parser.parse_known_args()[0]
    # Omitted options fall back to 'in' (route) or to empty lists.
    ROUTE = args.route if args.route else 'in'
    RESOURCES_IDS = args.sandbox_resource_ids if args.sandbox_resource_ids else []
    THRESHOLDS = args.thresholds if args.thresholds else []
    FEATURES = args.features if args.features else []
    MODELS_INFO = args.models_info if args.models_info else []
    SLOTS_INFO = args.slots_info if args.slots_info else []
    SLOTS = args.slots if args.slots else []
    AUX_FILES, aux_models, SLOTS2, models = [], {}, [], {}

    # Slot names for the models given via --models_info are read from files.
    for slot_info_file in SLOTS_INFO:
        success, slot_info = loadInfo(slot_info_file, 'text')
        if success:
            SLOTS2.append(slot_info)
    # Models described by JSON info files: values from the info file override
    # the ones gathered from Sandbox.
    for i, model_info_file in enumerate(MODELS_INFO):
        success, model_info = loadInfo(model_info_file)
        if success:
            model = gatherAuxModelInfo(model_info['resource_id'], AUX_FILES)
            key = prepareAuxModel(i, model_info, THRESHOLDS, FEATURES, SLOTS2)
            models[key] = model.copy()
            models[key].update(model_info)
    # Collect the auxiliary models given by resource ID.
    for i, resource_id in enumerate(map(int, RESOURCES_IDS)):
        # An ID of 0 is skipped.
        if not resource_id:
            continue
        model_info = gatherAuxModelInfo(resource_id, AUX_FILES)
        model_info['resource_id'] = resource_id
        key = prepareAuxModel(i, model_info, THRESHOLDS, FEATURES, SLOTS)
        if key in models:
            models[key].update(model_info)
        else:
            models[key] = model_info.copy()
    # Fill in defaults that can be derived from the gathered data.
    for (key, model) in models.items():
        print >>sys.stderr, "Model: '%s'" % str(model)
        if not isinstance(model, dict):
            continue
        if 'threshold' not in model and 'best_f1' in model and isinstance(model['best_f1'], dict):
            model['threshold'] = model['best_f1'].get('threshold', 0)
        # NOTE(review): args.aux_model_type is None when -a is not passed, so
        # .lower() here and .upper() further below would raise AttributeError.
        if 'model_type' not in model:
            model['model_type'] = args.aux_model_type.lower()
        if 'binary_file' not in model and os.path.exists("./%s_%s.bin" % (model['model_type'], model['resource_id'])):
            model['binary_file'] = "%s_%s.bin" % (model['model_type'], model['resource_id'])
        if 'json_file' not in model and os.path.exists("./%s_%s.json" % (model['model_type'], model['resource_id'])):
            model['json_file'] = "%s_%s.json" % (model['model_type'], model['resource_id'])
        aux_models[key] = model.copy()
        print >>sys.stderr, "Auxiliary %s model's info: resource_id=%s, slot: '%s', properties: %s" % (key, model['resource_id'], model['slot'] if 'slot' in model else '', str(model))

    # Final creation of the bundle and writing the results to disk.
    if len(aux_models) > 0:
        writeJSON2file(args.output_models_json, aux_models.values())
        # The same models info is also stored inside the bundle itself.
        FILE = './models_info.json'
        writeJSON2file(FILE, aux_models.values())
        appendFile(AUX_FILES, FILE)
        createBundle(AUX_FILES, args.output_bundle)
        aux_model_type, route = args.aux_model_type.upper(), ROUTE.upper()
        if args.output_descr_json:
            writeJSON2file(args.output_descr_json, {
                "resource_type":        "SO_SPAM_DETECTOR_%s_BUNDLE_%s" % (aux_model_type, route),
                "resource_description": "Auxiliary %s models for SO %s (${meta.get_workflow_instance_uid()})" % (aux_model_type, route),
                "release_description":  "Autorelease of SO %s auxiliary %s models (${meta.get_workflow_instance_uid()})" % (route, aux_model_type)
            })
