#!/usr/bin/python2
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import os, os.path, sys, json, re, time, pymongo
import yt.wrapper as yt
from traceback import format_exception
from datetime import date, timedelta, datetime

# Base directory for local lookups (credentials, CA bundle): $HOME when it is
# set, otherwise the directory that contains this script.
WORKING_DIR = os.environ['HOME'] if 'HOME' in os.environ else os.path.dirname(os.path.abspath(__file__))
# YT directory whose child nodes are inspected by the cleanup below.
TEMP_FILES_FOLDER = "//home/so_fml/nirvana/tmp"
# Default upper bound of the cleanup window; may be overridden by argv[1].
MAX_DATE = TODAY = date.today()
# Default lower bound of the cleanup window; may be overridden by argv[2].
# Stored as a date (not a string) because the script calls .isoformat() on it.
MIN_DATE = date(2017, 1, 1)
# Number of attempts made when asking for the cluster host list.
RETRY_COUNT = 3
# MongoDB connection settings. An empty 'cluster' makes cluster_hosts() fall
# back to the static 'hosts' list.
MONGO = {
    'cluster': '',
    'port':    27018,
    'db':      'so_ml',
    'hosts':   ['man-4usoo00weljdg5wo.db.yandex.net', 'sas-82j15sr5u4ezy36h.db.yandex.net', 'vla-ya6rl0p17o2nj3wl.db.yandex.net'],
    'ssl':     True
}
def get_traceback():
    """Return the exception currently being handled as tab-indented text.

    Every chunk produced by ``traceback.format_exception`` is stripped,
    prefixed with a tab and terminated with a newline. Meant to be called
    from inside an ``except`` block.

    Returns:
        The formatted traceback as a single string.
    """
    tb = ''
    for step in format_exception(*sys.exc_info()):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Best effort: a chunk that cannot be appended is skipped rather
            # than aborting the log call. Unlike a bare ``except`` this does
            # not swallow KeyboardInterrupt / SystemExit.
            continue
    return tb

def writelog(msg, isTB = False):
    """Append a timestamped message to the log file.

    Args:
        msg: Text to log; a falsy value makes the call a no-op.
        isTB: When true, the traceback of the exception currently being
            handled is appended instead of a plain newline.

    Never raises: any failure while logging is reported on stderr.

    NOTE(review): LOGFILE is not defined in the visible part of this file;
    if it is missing at runtime the resulting NameError is caught below and
    reported on stderr instead of being logged.
    """
    if not msg:
        return
    try:
        tb = get_traceback() if isTB else "\n"
        # The context manager closes the file even when write() fails.
        with open(LOGFILE, 'a') as f:
            f.write(time.strftime("[%Y-%m-%d %H:%M:%S]: ") + msg + tb)
    except Exception as e:
        sys.stderr.write("Writelog error: %s\n" % str(e))

def cluster_hosts(conductor_group, default_hosts = None):
    """Resolve the host names of a conductor group.

    Args:
        conductor_group: Group name to look up; a falsy value skips the
            lookup entirely.
        default_hosts: Hosts returned when the lookup is skipped, fails on
            every attempt, or yields no host names. Defaults to a fresh
            empty list on each call.

    Returns:
        A list of host names, one per non-blank line of the response, or
        ``default_hosts``.
    """
    if default_hosts is None:
        # A new list per call: a shared mutable default would leak caller
        # modifications into later calls.
        default_hosts = []
    if not conductor_group:
        return default_hosts
    for _ in range(RETRY_COUNT):
        # presumably doRequest returns the response body as text, or a falsy
        # value on failure - it is defined outside this block; TODO confirm
        r = doRequest("https://c.yandex-team.ru/api-cached/groups2hosts/%s" % conductor_group, "Get DB cluster hosts for group %s" % conductor_group)
        if not r:
            continue
        # Method call instead of map(str.strip, ...) so unicode responses
        # work too; blank lines are not host names and are dropped.
        hosts = [line.strip() for line in r.splitlines() if line.strip()]
        return hosts if hosts else default_hosts
    return default_hosts

def mongo_conn_str(cfg):
    """Build the ``mongodb://`` connection URI described by ``cfg``.

    Args:
        cfg: Settings dict with 'cluster', 'hosts' and 'db' keys and an
            optional 'user' / 'passwd' pair.

    Returns:
        ``mongodb://[user:passwd@]host1,host2,.../db``; the credentials part
        is present only when ``cfg['user']`` exists and is non-empty.
    """
    credentials = ''
    if cfg.get('user'):
        credentials = "%s:%s@" % (cfg['user'], cfg['passwd'])
    host_list = ','.join(cluster_hosts(cfg['cluster'], cfg['hosts']))
    return "mongodb://%s%s/%s" % (credentials, host_list, cfg['db'])

def loadDBCredentials(cfg):
    """Load the MongoDB user name and password into ``cfg``.

    Reads ``<dir>/.mongodb/<cfg['db']>`` where ``<dir>`` is WORKING_DIR, or
    the directory of this script when the file is not found under
    WORKING_DIR. The first line containing a ':' is taken as
    ``user:password`` and stored in ``cfg['user']`` / ``cfg['passwd']``.

    Args:
        cfg: Settings dict with a 'db' key; updated in place.

    Errors (missing file, unreadable file, ...) are logged through
    ``writelog`` and otherwise ignored, leaving ``cfg`` untouched.
    """
    CURDIR = WORKING_DIR
    try:
        if not os.path.exists('%s/.mongodb/%s' % (CURDIR, cfg['db'])):
            CURDIR = os.path.dirname(os.path.abspath(__file__))
        # The context manager closes the file even if reading fails.
        with open('%s/.mongodb/%s' % (CURDIR, cfg['db'])) as f:
            for line in f:
                # Drop the line terminator so it does not end up inside the
                # password, and split only on the first ':' so passwords may
                # themselves contain colons.
                sf = line.rstrip('\r\n').split(':', 1)
                if len(sf) == 2:
                    cfg['user'], cfg['passwd'] = sf
                    break
    except Exception as e:
        writelog("getCredentials exception: %s" % str(e), True)

def getDB(cfg):
    """Open a MongoDB client for ``cfg`` and return its database handle.

    Args:
        cfg: Settings dict with 'port' and 'db' keys, whatever
            ``mongo_conn_str`` needs, and an optional 'ssl' flag.

    Returns:
        ``pymongo.MongoClient(...)[cfg['db']]``.

    With SSL enabled the CA bundle ``allCAs.pem`` is taken from the first
    directory that contains it: WORKING_DIR, then WORKING_DIR/.mongodb;
    when neither has it, the directory of this script is used unchecked.
    """
    client_kwargs = {
        'host':             mongo_conn_str(cfg),
        'port':             int(cfg['port']),
        'connectTimeoutMS': 10000,
        'socketTimeoutMS':  10000,
        'read_preference':  pymongo.read_preferences.ReadPreference.NEAREST
    }
    if cfg.get('ssl', False):
        for ca_dir in (WORKING_DIR, WORKING_DIR + '/.mongodb'):
            if os.path.exists('%s/allCAs.pem' % ca_dir):
                break
        else:
            # Last resort: next to the script, without an existence check.
            ca_dir = os.path.dirname(os.path.abspath(__file__))
        client_kwargs['ssl'] = True
        client_kwargs['ssl_ca_certs'] = '%s/allCAs.pem' % ca_dir
    client = pymongo.MongoClient(**client_kwargs)
    return client[cfg['db']]

# Optional command-line overrides of the cleanup window (format YYYY-MM-DD):
#   argv[1] - upper bound (MAX_DATE), argv[2] - lower bound (MIN_DATE).
# An unparsable value is reported on stderr and replaced with TODAY.
if len(sys.argv) > 1:
    try:
        MAX_DATE = datetime.strptime(sys.argv[1], '%Y-%m-%d').date()
    except Exception as e:
        sys.stderr.write("Exception: %s\n" % str(e))
        MAX_DATE = TODAY
if len(sys.argv) > 2:
    try:
        MIN_DATE = datetime.strptime(sys.argv[2], '%Y-%m-%d').date()
    except Exception as e:
        sys.stderr.write("Exception: %s\n" % str(e))
        MIN_DATE = TODAY
# Either bound may be a date object or an already formatted 'YYYY-MM-DD'
# string; only date objects have (and need) isoformat().
max_date = MAX_DATE.isoformat() if isinstance(MAX_DATE, date) else str(MAX_DATE)
min_date = MIN_DATE.isoformat() if isinstance(MIN_DATE, date) else str(MIN_DATE)
# Children of the temp folder together with the attributes used for filtering.
yt_map = yt.get(TEMP_FILES_FOLDER, attributes = ['modification_time', 'type'])

loadDBCredentials(MONGO)
db = getDB(MONGO)
# Child folders that are never removed, whatever their modification time.
PROTECTED_FOLDERS = frozenset(['features', 'test', 'features_test', 'threshold', 'factor_names', 'fml', 'seo_abuse', 'rules_dict', 'mapped_dlv_in_log', 'tmp'])
# Remove every unprotected map node modified inside (min_date, max_date),
# together with the MongoDB records of the workflows and formulas whose
# intermediate tables live under it.
for f in yt_map.keys():
    attrs = yt_map[f].attributes
    # Timestamps are compared as strings against the ISO-formatted bounds.
    if attrs['modification_time'] < max_date and attrs['modification_time'] > min_date and attrs['type'] == 'map_node' and f not in PROTECTED_FOLDERS:
        folder_path = TEMP_FILES_FOLDER + '/' + f
        # Anchor the match at a path-component boundary and escape the folder
        # name: a bare '^<path>' prefix would also match sibling folders whose
        # names merely start with this one (e.g. 'abc' vs 'abcd').
        path_regex = '^%s(/|$)' % re.escape(folder_path)
        workflow_ids = set()
        formula_ids = set()
        for doc in db['intermidiate_tables'].find({'path': {'$regex': path_regex}}):
            if 'workflow_id' in doc:
                workflow_ids.add(doc['workflow_id'])
            if 'formula_id' in doc:
                formula_ids.add(doc['formula_id'])
        # delete_many is the non-deprecated equivalent of remove(), which also
        # deleted every matching document.
        for workflow_id in workflow_ids:
            db['models'].delete_many({'workflow_id': workflow_id})
            db['comparing_models_info'].delete_many({'workflow_id': workflow_id})
            db['intermidiate_tables'].delete_many({'workflow_id': workflow_id})
        for formula_id in formula_ids:
            db['models_prod_history'].delete_many({'formula_id': formula_id})
        print("Delete: '%s'. Workflows: %s" % (f, str(list(workflow_ids))))
        yt.remove(folder_path, recursive = True, force = True)

# Drop every model that has no 'formula_id', after removing the YT tables
# (and their 'intermidiate_tables' records) registered for the same workflow
# and, when the model has one, the same workflow instance.
for doc in db['models'].find({'formula_id': {'$exists': 0}}):
    d = {'workflow_id': doc['workflow_id']}
    if 'workflow_instance_id' in doc:
        d['workflow_instance_id'] = doc['workflow_instance_id']
    # Iterating the matches directly replaces the former aggregate() call,
    # which only checked that at least one match exists - an empty result
    # simply makes this loop a no-op.
    for row in db['intermidiate_tables'].find(d):
        # Records without a usable path are left in place.
        if row.get('path'):
            yt.remove(row['path'], force=True)
            db['intermidiate_tables'].delete_one({'_id': row['_id']})
    db['models'].delete_one({'_id': doc['_id']})

# Final sweep: an 'intermidiate_tables' record whose workflow no longer has
# any model is orphaned - remove its YT node (if it still exists) and then
# the record itself. Records without 'workflow_id' or 'path' are kept.
tables = db['intermidiate_tables']
for table_doc in tables.find():
    if 'workflow_id' not in table_doc:
        continue
    model_count = db['models'].count({'workflow_id': table_doc['workflow_id']})
    if model_count >= 1 or 'path' not in table_doc:
        continue
    table_path = table_doc['path']
    if yt.exists(table_path):
        yt.remove(table_path, force=True)
    tables.delete_one({'_id': table_doc['_id']})
