#!/usr/bin/python
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import os, os.path, sys, json, re, pymongo, binascii, time, base64
from collections import defaultdict
from urllib import urlopen, unquote
from bson.objectid import ObjectId
from traceback import format_exception
import yt.wrapper as ytw

# Base directory for the log file and the YT token: $HOME when it is set,
# otherwise the directory that contains this script.
WORKING_DIR = os.environ['HOME'] if 'HOME' in os.environ else os.path.dirname(os.path.abspath(__file__))
# File that writelog() appends to.
LOGFILE = '%s/logs/conv_imapdb2yt.log' % WORKING_DIR
# Proxy address installed into the yt.wrapper configuration below.
YT_PROXY = "hahn.yt.yandex.net"
# Path prefix under which the per-collection YT tables are created.
YT_ROOT_PATH = "//home/so_fml/db/imapchick/"
ROBOT_NAME = "robot-mailspam"
# Read the token through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("%s/.yt/token" % WORKING_DIR) as _token_file:
    ROBOT_YT_TOKEN = _token_file.read().strip()
# RETRY_COUNT: number of attempts made by cluster_hosts().
# BLOCK_SIZE: number of documents requested per MongoDB query in convImapDb2Yt().
RETRY_COUNT, BLOCK_SIZE = 3, 10
# MongoDB connection settings consumed by mongo_conn_str():
#   'cluster' is the conductor group passed to cluster_hosts(),
#   'hosts' is the fallback host list used when that lookup yields nothing,
#   'db' is the database name.
MONGO = {
    'cluster': 'mail_so_813',
    'port':    27017,
    'db':      'imap',
    'hosts':   ["compldb1%s.so.yandex.net" % dc for dc in "jmo"]
}
# yt.wrapper keeps its settings in a module-level config; set proxy and token once at import time.
ytw.config['proxy']['url'] = YT_PROXY
ytw.config['token'] = ROBOT_YT_TOKEN

def get_traceback():
    """Return the traceback of the exception currently being handled as text.

    Every chunk produced by ``traceback.format_exception`` is stripped of
    surrounding whitespace, prefixed with a tab and terminated with a newline,
    so the result can be appended directly to a log message.

    Returns:
        The formatted traceback string (built from ``sys.exc_info()``).
    """
    steps = []
    for step in format_exception(*sys.exc_info()):
        try:
            steps.append("\t" + step.strip() + "\n")
        except Exception:
            # Best effort: a chunk that cannot be formatted is dropped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            continue
    return ''.join(steps)

def writelog(msg, isTB = False):
    """Append a timestamped message to LOGFILE.

    Args:
        msg: Text to log; nothing is written when it is empty or None.
        isTB: When true, the text returned by get_traceback() is appended
            to the message instead of a plain newline.

    Any failure while formatting or writing is reported on stderr and
    otherwise ignored, so logging never interrupts the caller.
    """
    if not msg:
        return
    try:
        tail = get_traceback() if isTB else "\n"
        # The context manager closes the file even when write() raises.
        with open(LOGFILE, 'a') as f:
            f.write(time.strftime("[%Y-%m-%d %H:%M:%S]: ") + msg + tail)
    except Exception as e:
        sys.stderr.write("Writelog error: %s\n" % str(e))

def cluster_hosts(conductor_group, default_hosts = None):
    """Return the host names of a conductor group, falling back to defaults.

    The group is requested from the conductor ``groups2hosts`` endpoint, which
    is read as one host name per line. Up to RETRY_COUNT attempts are made
    while the response status is not 200.

    Args:
        conductor_group: Name of the conductor group to resolve.
        default_hosts: Hosts returned when the lookup fails or yields no
            hosts. Defaults to an empty list.

    Returns:
        A list of host names without surrounding whitespace, or
        ``default_hosts`` when nothing could be fetched.
    """
    if default_hosts is None:
        # Avoid a shared mutable default argument.
        default_hosts = []
    url = "https://c.yandex-team.ru/api-cached/groups2hosts/%s" % conductor_group
    try:
        for _ in range(RETRY_COUNT):
            fh = urlopen(url)
            if not fh:
                continue
            try:
                # getcode() returns an integer; the former comparison with the
                # string "200" could never succeed.
                if fh.getcode() != 200:
                    continue
                # Strip line terminators so the names can be joined into a
                # connection string; skip blank lines.
                stripped = [line.strip() for line in fh]
            finally:
                fh.close()
            hosts = [host for host in stripped if host]
            return hosts if hosts else default_hosts
    except Exception as e:
        sys.stderr.write("[%s] cluster_hosts exception: %s%s\n" % (time.strftime("%Y-%m-%d %H:%M:%S"), str(e), get_traceback()))
        sys.stderr.flush()
    return default_hosts

def mongo_conn_str(cfg):
    """Build a ``mongodb://`` connection string from a settings mapping.

    Args:
        cfg: Mapping with the keys 'cluster', 'hosts' and 'db'; when it has a
            non-empty 'user', the 'user' and 'passwd' values are embedded as
            credentials.

    Returns:
        ``mongodb://[user:passwd@]host1,host2,.../db`` where the hosts come
        from cluster_hosts(cfg['cluster'], cfg['hosts']).
    """
    if 'user' in cfg and cfg['user']:
        credentials = "%s:%s@" % (cfg['user'], cfg['passwd'])
    else:
        credentials = ''
    host_list = ','.join(cluster_hosts(cfg['cluster'], cfg['hosts']))
    return "mongodb://%s%s/%s" % (credentials, host_list, cfg['db'])

def get_uuid(sep = '-'):
    """Return 32 random lowercase hex digits grouped as 8-4-4-4-12.

    Args:
        sep: Separator placed between the groups; an empty or None value
            falls back to '-'.

    Returns:
        A string such as ``'3f2a9c1e-0b7d-4e55-a1c2-9d8e7f6a5b4c'``. The
        digits are drawn with ``random.randrange``, so the value is not
        suitable for security-sensitive purposes.
    """
    # Imported locally because the module-level imports do not provide
    # randrange; without this the function raised NameError on every call.
    from random import randrange

    if not sep:
        sep = '-'
    hex_digits = 'abcdef0123456789'
    s = [hex_digits[randrange(16)] for _ in range(32)]
    # Inserting left to right at these offsets yields the 8-4-4-4-12 layout.
    for position in (8, 13, 18, 23):
        s.insert(position, sep)
    return ''.join(s)

def convImapDb2Yt():
    try:
        #db = pymongo.MongoClient(host = mongo_conn_str(MONGO), connect = True, connectTimeoutMS = 60000, socketTimeoutMS = 60000)[MONGO['db']]
        db = pymongo.MongoClient(host = mongo_conn_str(MONGO))[MONGO['db']]
        for t in db.collection_names(include_system_collections = False):
            if t == 'folders': continue
            print "Creation table '%s'" % t
            ytw.config['proxy']['url'] = "hahn"
            if ytw.exists(YT_ROOT_PATH + t + '_replicated'):
                replicated_id = ytw.get(YT_ROOT_PATH + t + '_replicated/@id')
            else:
                replicated_id = ytw.create("replicated_table", YT_ROOT_PATH + t + '_replicated', attributes = {"schema": [{"name": "id", "type": "string", "sort_order": "ascending"}, {"name": "msgdate", "type": "string"}, {"name": "data", "type": "string"}], "dynamic": True})
                replica1_id = ytw.create("table_replica", attributes = {"table_path": YT_ROOT_PATH + t + '_replicated', "cluster_name": "hahn", "replica_path": YT_ROOT_PATH + t})
                replica2_id = ytw.create("table_replica", attributes = {"table_path": YT_ROOT_PATH + t + '_replicated', "cluster_name": "banach", "replica_path": YT_ROOT_PATH + t})
            if ytw.exists(YT_ROOT_PATH + t):
                replica1_id = ytw.get(YT_ROOT_PATH + t + '/@id')
                table1_id = ytw.get(YT_ROOT_PATH + t + '/@upstream_replica_id')
            else:
                table1_id = ytw.create("table", YT_ROOT_PATH + t, attributes = {"schema": [{"name": "id", "type": "string", "sort_order": "ascending"}, {"name": "msgdate", "type": "string"}, {"name": "data", "type": "string"}], "dynamic": True, "upstream_replica_id": replica1_id})
            ytw.config['proxy']['url'] = "banach"
            if ytw.exists(YT_ROOT_PATH + t):
                replica2_id = ytw.get(YT_ROOT_PATH + t + '/@id')
                table2_id = ytw.get(YT_ROOT_PATH + t + '/@upstream_replica_id')
            else:
                table2_id = ytw.create("table", YT_ROOT_PATH + t, attributes = {"schema": [{"name": "id", "type": "string", "sort_order": "ascending"}, {"name": "msgdate", "type": "string"}, {"name": "data", "type": "string"}], "dynamic": True, "upstream_replica_id": replica2_id})
            print "Replicated ID: %s, Replica1 ID: %s, Replica2 ID: %s" % (replicated_id, replica1_id, replica2_id)
            #print "Replicated ID: %s, Replica1 ID: %s" % (replicated_id, replica1_id); sys.stdout.flush()
            ytw.config['proxy']['url'] = "hahn"
            if ytw.get(YT_ROOT_PATH + t + '_replicated/@tablet_state') != 'mounted':
                ytw.mount_table(YT_ROOT_PATH + t + '_replicated')
            if ytw.get(YT_ROOT_PATH + t + '/@tablet_state') != 'mounted':
                ytw.mount_table(YT_ROOT_PATH + t)
            ytw.config['proxy']['url'] = "banach"
            if ytw.get(YT_ROOT_PATH + t + '/@tablet_state') != 'mounted':
                ytw.mount_table(YT_ROOT_PATH + t)
            ytw.alter_table_replica(replica2_id, enabled = True)
            ytw.config['proxy']['url'] = "hahn"
            ytw.alter_table_replica(replica1_id, enabled = True)
            skip = n = 1
            while n > 0:
                n, t0 = 0, time.time()
                for row in db[str(t)].find({}, skip = skip, limit = BLOCK_SIZE):
                    n += 1; print "Rows selected (time = %s)" % (time.time() - t0); sys.stdout.flush(); t0 = time.time()
                    try:
                        row['id'] = str(row['_id'])
                        del row['_id']
                        print "Data type: %s" % type(row['data'])
                        s = str(row['data'])
                        u = unicode(row['data'], 'utf-8', 'ignore')
                        #row['data'] = binascii.a2b_base64(u)
                        row['data'] = base64.b64encode(s)
                        #print "Row: %s" % str(row)
                        ytw.insert_rows(YT_ROOT_PATH + t + '_replicated', json.dumps(row), format = ytw.JsonFormat(attributes = {"encode_utf8": False}), raw = True, require_sync_replica = False)
                        #break
                        print "Processed row: %d (time = %s)" % (n, time.time() - t0); sys.stdout.flush()
                    except Exception, e:
                        print >>sys.stderr, "convImapDb2Yt data error: %s.%s" % (str(e), get_traceback()); sys.stdout.flush()
                skip += n
                print "Total processed rows for table %s: %d" % (t, skip); sys.stdout.flush()
                if not n:
                    break
            #break
    except Exception, e:
        print >>sys.stderr, "convImapDb2Yt DB error: %s.%s" % (str(e), get_traceback()); sys.stdout.flush()

# Entry point: there is no __main__ guard, so the conversion starts whenever
# this file is executed or imported.
convImapDb2Yt()
