#!/usr/bin/env python
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
from __future__ import print_function
import sys, re, redis, pymongo
from datetime import date, timedelta
from bson.objectid import ObjectId
from collections import defaultdict

# Connection settings shared by the Redis and MongoDB clients created below.
cfg = {
    # Comma-separated Redis hosts; redis_reconnect() tries them in this order.
    'redis_hosts': 'compldb1j.so.yandex.net,compldb1o.so.yandex.net,compldb1m.so.yandex.net',
    # Port, database index and socket timeout handed to redis.StrictRedis.
    'redis_port': 6379,
    'redis_db': 1,
    'redis_timeout': 2.0,
    # Host URI, port and replica-set name handed to pymongo.MongoClient.
    'mongo_uri': 'mongodb://compldb1j.so.yandex.net,compldb1m.so.yandex.net,compldb1o.so.yandex.net/imap',
    'mongo_port': 27017,
    # Name of the database selected on the MongoClient.
    'mongo_db': 'imap',
    'mongo_replset': 'compldb',
}
# Lua script run server-side through EVAL with exactly one key (KEYS[1]).
# It GETs the string stored under that key, walks it line by line, captures
# the hex value of the first `data_id": "<hex>"` fragment on each line and
# returns the captured values joined with commas (an empty string when
# nothing was captured).
#
# GET yields a Lua `false` for a missing key (for instance one that expired
# after it was listed); the guard returns an empty string in that case instead
# of letting string.gmatch fail on a non-string argument.
GETIDS_SCRIPT = """
    local data = redis.call("get", KEYS[1])
    if not data then
        return ''
    end
    local data_ids = {}
    for elem in string.gmatch(data, "[^\\n]+") do
        data_ids[#data_ids + 1] = string.match(elem, "data_id\\": \\"([a-f0-9]+)\\"")
    end
    return table.concat(data_ids, ',')
    """
def redis_reconnect():
    """Return a Redis client, preferring the host that reports itself as master.

    The hosts listed in cfg['redis_hosts'] are tried in order.  The first one
    whose info() reply has role == "master" is returned immediately.  Errors
    raised for a host are printed and the next host is tried.

    When no master is found the function falls back to the last client whose
    info() call succeeded; if no host answered at all, the last client that
    was constructed is returned (None when none could be constructed), so the
    failure surfaces on the caller's first command.
    """
    last_client = None   # most recently constructed client, reachable or not
    reachable = None     # most recent client that actually answered info()
    for host in cfg['redis_hosts'].split(','):
        try:
            last_client = redis.StrictRedis(host=host,
                                            port=cfg['redis_port'],
                                            db=cfg['redis_db'],
                                            socket_timeout=cfg['redis_timeout'])
            if last_client.info()["role"] == "master":
                return last_client
            reachable = last_client
        except Exception as e:
            # Best effort: report the problem and move on to the next host.
            print("Exception in redis_reconnect: %s" % str(e))
    if reachable is not None:
        return reachable
    return last_client

# Cleanup of MongoDB documents that are no longer referenced from Redis.
#
# For every collection whose name starts with "so_":
#   1. collect the string form of every document _id in the collection;
#   2. read each Redis key matching "<collection>_*" and strike out every
#      data_id it mentions;
#   3. remove the documents whose ids were never mentioned.
#
# Redis errors in step 2 are deliberately not caught: an exception there stops
# the script before step 3 removes anything from the current collection.
# NOTE(review): if Redis returns no keys for a collection prefix, every
# document of that collection ends up in the removal set.
r = redis_reconnect()
db = pymongo.MongoClient(host=cfg['mongo_uri'], port=cfg['mongo_port'], replicaSet=cfg['mongo_replset'])[cfg['mongo_db']]
for collection in db.collection_names():
    if not collection.startswith('so_'):
        continue
    print("Collection: " + collection)
    sys.stdout.flush()

    # Step 1: ids of all documents; only _id is needed, so project just that.
    unreferenced = set()
    total = 0
    for doc in db[collection].find({}, {'_id': 1}):
        doc_id = str(doc['_id'])
        unreferenced.add(doc_id)
        total += 1
        if total % 100 == 0:
            print("Doc: %d (%s)" % (total, doc_id), end="\r")
            sys.stdout.flush()

    # Step 2: drop from the set every id that some Redis key mentions.
    existing = 0   # ids from Redis that matched a not-yet-seen document id
    orphaned = 0   # ids from Redis with no such match (unknown or repeated)
    for key in sorted(r.keys(collection + '_*')):
        reply = r.eval(GETIDS_SCRIPT, 1, key)
        # An empty reply splits into [''], which is not an id: filter it out
        # so it is neither reported nor counted as orphaned.
        data_ids = [data_id for data_id in reply.split(',') if data_id]
        print("Key '%s' have %d ids             " % (key, len(data_ids)))
        sys.stdout.flush()
        for data_id in data_ids:
            if data_id in unreferenced:
                existing += 1
                unreferenced.remove(data_id)
            else:
                orphaned += 1

    # Step 3: remove whatever is still unreferenced; a failure on one
    # document is reported and does not stop the remaining removals.
    deleted = 0
    for data_id in unreferenced:
        try:
            db[collection].remove({'_id': ObjectId(data_id)})
        except Exception as e:
            print("Cleanup DB error: %s" % str(e))
            sys.stdout.flush()
        else:
            deleted += 1
            print("Deleted records: %d" % deleted, end="\r")
            sys.stdout.flush()
    print("Size of rest from %d: %d. Orphaned: %d. Exists: %d" % (total, len(unreferenced), orphaned, existing))
    sys.stdout.flush()
