#!/usr/bin/python2
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import os, os.path, sys, re, argparse, json, tarfile, urllib2
from hashlib import md5
import yt.wrapper as ytw
from traceback import format_exception

# Number of attempts doRequest() makes before giving up.
RETRY_COUNT = 3
# Sandbox proxy URL template; formatted with (resource_id, file name inside the resource).
SANDBOX_URL = "https://proxy.sandbox.yandex-team.ru/%s/%s"
# Sandbox REST API URL template for a resource (not referenced in the visible part of this file).
SANDBOXAPI_URL = "https://sandbox.yandex-team.ru/api/v1.0/resource/%s"
# Default YT path of the rules dictionary table; formatted with the route name.
YT_RULES_DICT_PATH = "//home/so_fml/nirvana/rules_dict_%s"
# Base URL of the SO ML backend used by getMLsetting() and getModelInfo().
SO_ML_BACKEND_URL = "https://web.so.yandex-team.ru/ml"

def get_traceback():
    """Render the exception currently being handled as tab-indented text.

    Each chunk returned by ``traceback.format_exception`` is stripped,
    prefixed with a tab and terminated with a newline.  A chunk that cannot
    be appended to the accumulated text is skipped silently.
    """
    rendered = ''
    for chunk in format_exception(*sys.exc_info()):
        try:
            piece = "\t" + chunk.strip() + "\n"
            rendered += piece
        except:
            # Best effort: a chunk that cannot be concatenated is dropped.
            pass
    return rendered

def calcMD5forFile(file_path):
    """Return the hex MD5 digest of a file's contents.

    The file is read in binary mode and in fixed-size chunks, so binary
    model files are hashed byte-for-byte and are never loaded into memory
    as a whole.

    :param file_path: path of the file to hash.
    :returns: the hex digest string, or "" when the file does not exist or
        is at most one byte long.
    """
    if not os.path.exists(file_path) or os.stat(file_path).st_size <= 1:
        return ""
    digest = md5()
    # 'rb' avoids newline translation / decoding of binary payloads;
    # the context manager closes the handle even if reading fails.
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()

def doRequest(url, prompt):
    """Fetch ``url`` over HTTP, making up to RETRY_COUNT attempts.

    :param url: URL to request.
    :param prompt: human-readable description of the request, used only in
        diagnostic messages written to stderr.
    :returns: the response body when an attempt answers with HTTP 200;
        "" when every attempt fails.

    Failures are logged to stderr and never raised to the caller.
    """
    for attempt in range(RETRY_COUNT):
        r = f = None
        try:
            r = urllib2.Request(url=url)
            f = urllib2.urlopen(r)
            if f:
                if f.getcode() == 200:
                    return f.read()
                print >>sys.stderr, 'Request {0}: response HTTP code={1}, info: {2}'.format(prompt, f.getcode(), f.info())
            else:
                print >>sys.stderr, 'Request #%d for %s: response is empty!' % (attempt + 1, prompt)
        except Exception as e:
            message = str(e)
            if 'Name or service not known' in message or "No address associated with hostname" in message:
                # Name-resolution failures get a short message without a traceback.
                # The attempt number is 1-based, as in the other messages.
                print >>sys.stderr, "Attempt #%s of %s failed: %s" % (attempt + 1, prompt, message)
            else:
                serr = ''
                if hasattr(e, 'code'):
                    serr = ' (code=%s)' % e.code
                info = 'Info: {0}. '.format(f.info()) if f else ''
                method = "{0} ".format(r.get_method()) if r else ''
                print >>sys.stderr, "%s HTTP %srequest (attempt #%s) failed%s: '%s'. URL: %s.%s\t%s" % (prompt, method, attempt + 1, serr, message, url, info, get_traceback())
        finally:
            # Release the connection whatever happened to this attempt.
            if f is not None:
                try:
                    f.close()
                except Exception:
                    # Closing is pure cleanup; a failure here must not mask the result.
                    pass
    return ""

def downloadFileFromSandBox(resource_id, resource_file, output_file):
    """Download one file of a Sandbox resource and store it locally.

    :param resource_id: Sandbox resource identifier.
    :param resource_file: name of the file inside the resource.
    :param output_file: local path the content is written to.
    :returns: True when non-empty content was fetched and written,
        False otherwise (missing arguments, empty response, or any error,
        which is logged to stderr).
    """
    if not resource_file or not resource_id:
        return False
    try:
        content = doRequest(SANDBOX_URL % (resource_id, resource_file), "Retrieving formula's description file '%s' from SandBox" % resource_file)
        if content:
            # Binary mode: the payload may be a model binary and must be
            # stored byte-for-byte; 'with' closes the file if write() fails.
            with open(output_file, 'wb') as f:
                f.write(content)
            return True
    except Exception as e:
        print >>sys.stderr, 'Downloading file from SandBox failed: %s.%s' % (str(e), get_traceback())
    return False

def getMLsetting(setting):
    """Ask the SO ML backend for the value of ``setting``.

    Returns the raw response body from doRequest(), or '' when building or
    performing the request raises (the error is logged to stderr).
    """
    try:
        return doRequest("%s/get_setting?setting=%s" % (SO_ML_BACKEND_URL, setting), 'getMLsetting')
    except Exception as e:
        print >>sys.stderr, "getMLsetting failed: %s.%s" % (str(e), get_traceback())
    return ''

def getModelInfo(formula_id):
    """Fetch the model description for ``formula_id`` from the SO ML backend.

    Returns the decoded JSON response.  An empty dict is returned when
    ``formula_id`` is falsy or when the request / JSON decoding fails
    (the failure is logged to stderr).
    """
    if not formula_id:
        return {}
    try:
        request_url = "%s/ui/?action=get_model&formula_id=%s" % (SO_ML_BACKEND_URL, formula_id)
        return json.loads(doRequest(request_url, 'getModelInfo'))
    except Exception as e:
        print >>sys.stderr, "getModelInfo failed: %s.%s" % (str(e), get_traceback())
    return {}

def loadInfo(info_file, file_type='json'):
    """Load a description file from disk.

    :param info_file: path of the file to read.
    :param file_type: 'json' to decode the stripped content as JSON,
        'text' to return the stripped content as is.  Any other value
        leaves the result as an empty dict.
    :returns: ``(success, info)``; ``success`` is False when reading or
        decoding raised, in which case ``info`` is an empty dict and the
        error (plus the content, if shorter than 2048 characters) is logged
        to stderr.
    """
    info, success, content = {}, True, ""
    try:
        # The handle is closed before parsing, so a JSON error cannot leak it.
        with open(info_file) as f:
            content = f.read().strip()
        if file_type == 'json':
            info = json.loads(content)
        elif file_type == 'text':
            info = content
    except Exception as e:
        success = False
        print >>sys.stderr, "Loading models info from file '%s' failed: %s.%s" % (info_file, str(e), get_traceback())
        if len(content) < 2048:
            print >>sys.stderr, "Content of file %s: %s" % (info_file, content)
    return success, info

def getFormulaInfo(formula_id):
    """Fetch the FML REST description of a formula.

    Returns the decoded JSON response, or an empty dict when doRequest()
    returned nothing.
    """
    raw = doRequest("https://fml.yandex-team.ru/rest/api/formula/fml/%s" % formula_id, 'getFormulaInfo')
    if not raw:
        return {}
    return json.loads(raw)

def getFormulaFileFromFML(formula_id, file_name):
    """Download ``file_name`` of the computed formula ``formula_id`` from FML.

    Returns whatever doRequest() returns: the response body, or "" on failure.
    """
    download_url = "https://fml.yandex-team.ru/download/computed/formula?id=%s&file=%s" % (formula_id, file_name)
    return doRequest(download_url, 'getFormulaFileFromFML')

def writeJSON2file(file_path, data):
    """Serialise ``data`` as JSON into ``file_path``.

    :param file_path: destination path; the file is created or truncated.
    :param data: JSON-serialisable object.

    Errors are logged to stderr and not raised.
    """
    try:
        # 'with' guarantees the handle is closed even if dumps()/write() fails.
        with open(file_path, 'w') as f:
            f.write(json.dumps(data))
    except Exception as e:
        print >>sys.stderr, "Writing file '%s' failed: %s.%s" % (file_path, str(e), get_traceback())

def appendFile(files, file_path):
    """Register ``file_path`` in ``files``, insisting that it exists on disk.

    When the path does not exist, a message is written to stderr and the
    process exits with status 1.
    """
    if not os.path.exists(file_path):
        print >>sys.stderr, "File '%s' has not been created!" % file_path
        sys.exit(1)
    files.append(file_path)

def createBundle(INPUT_FILES, OUTPUT_TARBALL):
    """Pack the given files into a gzip-compressed tar archive.

    :param INPUT_FILES: iterable of paths to add to the archive.
    :param OUTPUT_TARBALL: path of the archive to create.

    On any error a message is written to stderr and the process exits with
    status 1.
    """
    try:
        # The context manager closes the archive even when add() fails.
        with tarfile.open(OUTPUT_TARBALL, 'w:gz') as tarball:
            for file_path in INPUT_FILES:
                tarball.add(file_path)
    except Exception as e:
        print >>sys.stderr, "Creating archive-file '%s' with models failed: %s.%s" % (OUTPUT_TARBALL, str(e), get_traceback())
        sys.exit(1)

def processSOmodel(models_files, so_models, aux_slots, model, slot, resource_id=''):
    """Materialise one model in the current directory and record its summary.

    Writes the model's JSON description, obtains its binary (from FML for
    matrixnet models and/or from Sandbox) and, when it is not yet present,
    its features dictionary, then appends a summary dict to ``so_models``.

    :param models_files: list of bundle file paths; extended in place.
    :param so_models: list of model summaries; one dict is appended.
    :param aux_slots: dict mapping a slot to a list of auxiliary model
        dicts, each providing a 'resource_id'.
    :param model: model description dict.  'model_type' and 'bin_name' are
        read unconditionally; 'formula_id' is required unless the type is
        'catboost'.  The key 'md5' is set on it as a side effect.
    :param slot: slot name (or index) the model belongs to.
    :param resource_id: Sandbox resource id of the model, '' if none.

    Exits the process with status 1 when a required file is missing or the
    binary's MD5 still differs from ``model['binary_md5']`` after re-downloads.
    """
    # creation of model's json-description
    model_json_name = "model_%s.json" % slot
    FILE = "./%s" % model_json_name
    writeJSON2file(FILE, model)
    appendFile(models_files, FILE)
    # downloading model's binary
    model_binary_name = "model_{}.bin".format(resource_id) if model["model_type"] == 'catboost' else "matrixnet_{}.info".format(model['formula_id'])
    FILE = "./%s" % model_binary_name
    if not os.path.exists(FILE) or os.stat(FILE).st_size < 10:
        if model["model_type"] == 'matrixnet':
            with open(FILE, 'wb') as f:
                f.write(getFormulaFileFromFML(model['formula_id'], 'matrixnet.info'))
        # Fall back to Sandbox when FML gave nothing usable.
        if (not os.path.exists(FILE) or os.stat(FILE).st_size < 10) and resource_id:
            downloadFileFromSandBox(resource_id, model['bin_name'], FILE)
    # Only touch the file when it exists: stat() on a missing path raises
    # OSError.  A missing file is reported by appendFile() below.
    if os.path.exists(FILE):
        model['md5'] = calcMD5forFile(FILE)
        expected_md5 = model.get('binary_md5', '')
        if resource_id and expected_md5 and model['md5'] != expected_md5:
            # Checksum mismatch: re-download a limited number of times.
            for _ in range(10):
                if downloadFileFromSandBox(resource_id, model['bin_name'], FILE):
                    # Update the model passed in, not a module-level variable.
                    model['md5'] = calcMD5forFile(FILE)
                    if model['md5'] == expected_md5:
                        break
            if model['md5'] != expected_md5:
                print >>sys.stderr, "MD5 sum for %s model with resource_id=%s is not matched after attempts to download the model! Exiting." % (model["model_type"], resource_id)
                sys.exit(1)
    appendFile(models_files, FILE)
    # downloading model's features dictionary
    features_name = ''  # stays empty for model types without a known dictionary
    if model["model_type"] == 'matrixnet':
        features_name = "atom_rules_c_%s.txt" % model['formula_id']
        FILE = "./%s" % features_name
        if (not os.path.exists(FILE) or os.stat(FILE).st_size < 10) and resource_id and downloadFileFromSandBox(resource_id, 'atom_rules_c.txt', FILE):
            appendFile(models_files, FILE)
    elif model["model_type"] == 'catboost':
        features_name = "features_%s.tsv" % resource_id
        FILE = "./%s" % features_name
        if (not os.path.exists(FILE) or os.stat(FILE).st_size < 10) and resource_id and downloadFileFromSandBox(resource_id, 'features.tsv', FILE):
            appendFile(models_files, FILE)
    if not os.path.exists(FILE) or os.stat(FILE).st_size < 10:
        features_name = ''
    # generating final model's info
    aux_resource_ids, rids = [], {}
    for key in ['vw_model_resource_id', 'dssm_model_resource_id', 'aux_model_resource_id']:
        if key in model:
            try:
                rid = int(model[key])
            except (TypeError, ValueError):
                # Non-numeric ids are ignored.
                continue
            if rid not in rids:
                aux_resource_ids.append(rid)
                rids[rid] = 1
    print >>sys.stderr, "Auxiliary origin models for slot '%s': %s" % (slot, str(rids.keys()))
    if slot in aux_slots and isinstance(aux_slots[slot], list):
        for aux_model in aux_slots[slot]:
            print >>sys.stderr, "Auxiliary model for slot '%s': %s" % (slot, str(aux_model))
            rid = int(aux_model['resource_id'])
            # NOTE(review): ids added here are not recorded in `rids`, so the
            # same id listed twice in aux_slots is appended twice - confirm intent.
            if rid not in rids:
                aux_resource_ids.append(rid)
    print >>sys.stderr, "Auxiliary models IDs for resource_id=%s on slot=%s: %s" % (resource_id, slot, str(aux_resource_ids))
    # Copy every field except workflow data, *_id fields and a few
    # explicitly excluded keys; the latter are re-added below in normalised form.
    so_model = {}
    for k in model:
        if k.find("workflow") < 0 and not k.endswith("_id") and k not in ["threshold", "rules_dict", "bin_name"]:
            so_model[k] = model[k]
    so_model.update({
        'resource_id':    int(resource_id) if resource_id else 0,
        'aux_models_ids': aux_resource_ids,
        'threshold':      float(model['threshold']) if 'threshold' in model else 0,
        'slot':           str(slot),
        'json_file':      model_json_name,
        'binary_file':    model_binary_name,
        'features_file':  features_name
    })
    if model["model_type"] == 'matrixnet':
        so_model["formula_id"] = int(model.get('formula_id', 0))
    so_models.append(so_model)

if __name__ == "__main__":
    # Script entry point: collects the models named on the command line,
    # downloads their files into the current directory, packs them into a
    # tarball and writes a JSON description of the resulting bundle.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--sandbox_resource_ids',  nargs='*', help="Sandbox resources IDs for obtaining models files")
    parser.add_argument('-s', '--slots',                 nargs='*', help="Slots of models")
    parser.add_argument('-f', '--formula_ids',           nargs='*', help="Formulas IDs")
    parser.add_argument('-t', '--thresholds',            nargs='*', help="Formulas thresholds")
    parser.add_argument('-m', '--formulas_info',         nargs='*', help="Models/Formulas JSON-info")
    parser.add_argument('-n', '--slots_info',            nargs='*', help="Ordered list of slots according to formulas JSON-info order")
    parser.add_argument('-a', '--aux_models_info',       nargs='*', help="Auxiliary models info JSON-files")
    parser.add_argument('-r', '--route',                 type=str,  help="The type of mail for which the model is calculated")
    parser.add_argument('-o', '--output_so_bundle',      type=str,  help="Output tarball with SO models bundle")
    parser.add_argument('-j', '--output_so_bundle_json', type=str,  help="Output JSON-description for output SO models bundle")
    # Unknown arguments are tolerated and ignored.
    args = parser.parse_known_args()[0]
    ROUTE = args.route if args.route else 'in'
    OUTPUT_SO_TARBALL = args.output_so_bundle if args.output_so_bundle else "./prod_models.tar.gz"
    OUTPUT_SO_JSON = args.output_so_bundle_json if args.output_so_bundle_json else "./prod_models.json"
    RESOURCES_IDS = args.sandbox_resource_ids if args.sandbox_resource_ids else []
    FORMULA_IDS = args.formula_ids if args.formula_ids else []
    THRESHOLDS = args.thresholds if args.thresholds else []
    FORMULAS_INFO = args.formulas_info if args.formulas_info else []
    SLOTS_INFO = args.slots_info if args.slots_info else []
    AUX_MODELS_INFO = args.aux_models_info if args.aux_models_info else []
    SLOTS = args.slots if args.slots else []
    SO_FILES, so_models, aux_slots, SLOTS2 = [], [], {}, []
    # Newest model date seen so far and where its rules dictionary comes from.
    max_date = max_date_rules_resource_id = max_date_rules_dict_path = ''

    # collect info about the auxiliary models, grouped by slot
    for (i, aux_models_file) in enumerate(AUX_MODELS_INFO):
        success, aux_models = loadInfo(aux_models_file)
        if success:
            for aux_model_info in aux_models:
                if 'slot' in aux_model_info and aux_model_info['slot']:
                    if aux_model_info['slot'] not in aux_slots:
                        aux_slots[aux_model_info['slot']] = []
                    aux_slots[aux_model_info['slot']].append(aux_model_info.copy())
        else:
            print >>sys.stderr, "Failed to load auxiliary model's info from file: %s" % aux_models_file
    print >>sys.stderr, "Auxiliary model's info: %s" % str(aux_slots)

    # collect the main models given by Sandbox resource ids
    for (i, resource_id) in enumerate(RESOURCES_IDS):
        model_info, model_type = {}, ''
        slot = SLOTS[i] if len(SLOTS) > i else i
        FILE = "./model_%s.json" % slot
        # The literal 'catboost' in place of a formula id marks a catboost model.
        if len(FORMULA_IDS) > i and FORMULA_IDS[i] == 'catboost':
            downloadFileFromSandBox(resource_id, 'model.json', FILE)
            model_type = 'catboost'
        else:
            downloadFileFromSandBox(resource_id, 'matrixnet.json', FILE)
            model_type = 'matrixnet'
        if os.path.exists(FILE) and os.stat(FILE).st_size > 10:
            success, model_info_dict = loadInfo(FILE)
            if success:
                model_info.update(model_info_dict)
            else:
                print >>sys.stderr, "Failed to load JSON-file with model's for resource #%s!" % resource_id
        # No usable description file: ask the ML backend by formula id instead.
        elif ('formula_id' in model_info or len(FORMULA_IDS) > i and FORMULA_IDS[i].isdigit()) and ('model_type' in model_info and model_info['model_type'] == 'matrixnet' or 'model_type' not in model_info):
            model_info.update(getModelInfo(model_info['formula_id'] if 'formula_id' in model_info else (FORMULA_IDS[i] if len(FORMULA_IDS) > i else 0)))
        if len(THRESHOLDS) > i and THRESHOLDS[i]:
            try:
                model_info['threshold'] = float(THRESHOLDS[i])
            except:
                # A non-numeric threshold argument is ignored.
                pass
        if 'formula_id' not in model_info and len(FORMULA_IDS) > i and FORMULA_IDS[i].isdigit():
            model_info['formula_id'] = int(FORMULA_IDS[i])
        if 'model_type' not in model_info or not model_info['model_type']:
            model_info['model_type'] = model_type
        # 'bin_name' is needed only while processSOmodel() runs.
        model_info["bin_name"] = 'model.bin' if model_info["model_type"] == 'catboost' else 'matrixnet.info'
        print >>sys.stderr, "Basic model for resource_id=%s: %s" % (resource_id, str(model_info))
        processSOmodel(SO_FILES, so_models, aux_slots, model_info, slot, resource_id)
        del model_info["bin_name"]
        # Determine the model's date: YT table modification time, then the
        # description's creation_time, then the FML formula date.
        dt = ''
        if 'rules_dict' in model_info:
            try:
                dt = ytw.get(model_info['rules_dict'], attributes = ['modification_time']).attributes['modification_time']
            except Exception as e:
                print >>sys.stderr, "YT get table attributes failed: %s.%s" % (str(e), get_traceback())
        if not dt and 'creation_time' in model_info:
            dt = model_info['creation_time']
        if not dt and model_type == 'matrixnet':
            formula_info = getFormulaInfo(model_info['formula_id'])
            if 'dateTime' in formula_info:
                dt = ' '.join(formula_info['dateTime'][:19].split('T'))
        if dt and (not max_date or max_date < dt):
            max_date = dt
            max_date_rules_resource_id = resource_id
            if 'rules_dict' in model_info:
                max_date_rules_dict_path = model_info['rules_dict']
    # verify that every file of the models collected so far is on disk
    for model in so_models:
        model_binary_name = "model_{}.bin".format(model["resource_id"]) if model["model_type"] == 'catboost' else "matrixnet_{}.info".format(model['formula_id'])
        FILE = "./%s" % model_binary_name
        if not os.path.exists(FILE) or os.stat(FILE).st_size < 10:
            print >>sys.stderr, "Failed to download %s model with resource_id=%s from SandBox! Exiting." % (model["model_type"], model["resource_id"])
            sys.exit(1)
        model_features = "features_{}.tsv".format(model["resource_id"]) if model["model_type"] == 'catboost' else "atom_rules_c_{}.txt".format(model['formula_id'])
        FILE = "./%s" % model_features
        if not os.path.exists(FILE) or os.stat(FILE).st_size < 10:
            print >>sys.stderr, "Failed to download features dictionary's file for %s model with resource_id=%s from SandBox! Exiting." % (model["model_type"], model["resource_id"])
            sys.exit(1)
        model_json_name = "model_%s.json" % model["slot"]
        FILE = "./%s" % model_json_name
        if not os.path.exists(FILE) or os.stat(FILE).st_size < 10:
            print >>sys.stderr, "Failed to download model's description file for %s model with resource_id=%s from SandBox! Exiting." % (model["model_type"], model["resource_id"])
            sys.exit(1)
    # slot names for the models described by JSON-info files, in the same order
    for slot_info_file in SLOTS_INFO:
        success, slot_info = loadInfo(slot_info_file, 'text')
        if success:
            SLOTS2.append(slot_info)
        else:
            print >>sys.stderr, "Failed to load slot info from file: %s" % slot_info_file
            sys.exit(1)
    # collect the models described by JSON-info files
    for (i, formula_info_file) in enumerate(FORMULAS_INFO):
        success, model_info = loadInfo(formula_info_file)
        if not success:
            print >>sys.stderr, "Failed to load formula from file: %s" % formula_info_file
            sys.exit(1)
        model_info["bin_name"] = 'model.bin' if model_info["model_type"] == 'catboost' else 'matrixnet.info'
        processSOmodel(SO_FILES, so_models, aux_slots, model_info, SLOTS2[i] if len(SLOTS2) > i else '')
        del model_info["bin_name"]
        dt = ''
        if 'rules_dict' in model_info:
            try:
                dt = ytw.get(model_info['rules_dict'], attributes = ['modification_time']).attributes['modification_time']
            except Exception as e:
                print >>sys.stderr, "YT get table attributes failed: %s.%s" % (str(e), get_traceback())
        if not dt and 'creation_time' in model_info:
            dt = model_info['creation_time']
        # Use this model's own type; a variable left over from the previous
        # loop would be stale, or undefined when no resource ids were given.
        if not dt and model_info['model_type'] == 'matrixnet':
            formula_info = getFormulaInfo(model_info['formula_id'])
            if 'dateTime' in formula_info:
                dt = ' '.join(formula_info['dateTime'][:19].split('T'))
        if dt and (not max_date or max_date < dt):
            max_date = dt
            if 'rules_dict' in model_info:
                max_date_rules_dict_path = model_info['rules_dict']
                max_date_rules_resource_id = ''
    print >>sys.stderr, "SO model's info: %s" % str(so_models)
    # common rules dictionary: taken from the newest model's Sandbox resource,
    # otherwise dumped from the YT table
    FILE = './atom_rules_c.txt'
    if max_date_rules_resource_id:
        downloadFileFromSandBox(max_date_rules_resource_id, 'atom_rules_c.txt', FILE)
    else:
        yt_path = max_date_rules_dict_path if max_date_rules_dict_path else (YT_RULES_DICT_PATH % ROUTE)
        if ytw.exists(yt_path):
            try:
                with open(FILE, 'w') as f:
                    for r in ytw.read_table(yt_path, format = ytw.JsonFormat(), raw = False):
                        print >>f, "%d\t%s\t%d" % (r['num'], r['rule'], r['act'])
            except Exception as e:
                print >>sys.stderr, "Writing file '%s' failed: %s.%s" % (FILE, str(e), get_traceback())
    appendFile(SO_FILES, FILE)
    # built-in slot brief names, replaced by the backend's setting when it loads
    slot_names = {'mn_prod': 'p', 'mn_test': '1'}
    for i in range(2, 20):
        slot_names["mn_test%s" % i] = "%s" % i
    try:
        slot_names = json.loads(getMLsetting('slots_names'))
    except Exception as e:
        print >>sys.stderr, "Loading slot names failed: %s.%s" % (str(e), get_traceback())

    if slot_names and len(slot_names) >= len(SLOTS):
        for model in so_models:
            if model['slot'] in slot_names:
                model['slot_brief_name'] = slot_names[model['slot']]
    else:
        print >>sys.stderr, "There are no slots' brief names!"
        sys.exit(1)

    # final creation of the bundles and writing them to disk
    FILE = './models_info.json'
    writeJSON2file(FILE, so_models)
    appendFile(SO_FILES, FILE)
    createBundle(SO_FILES, OUTPUT_SO_TARBALL)
    route = ROUTE.upper()
    writeJSON2file(OUTPUT_SO_JSON, {
        "resource_description":   "SO %s models set (${meta.get_workflow_instance_uid()})" % route,
        "release_description":    "Autorelease of SO %s models (${meta.get_workflow_instance_uid()})" % route,
        "web_hook_url":           "https://web.so.yandex-team.ru/ml/save_model_status/?workflow_id=${meta.get_workflow_uid()}&workflow_instance_id=${meta.get_workflow_instance_uid()}&status=waiting&route=%s" % ROUTE
    })
