#!/usr/bin/env python

import json
import time
import copy

import re
import os
import argparse
import bisect

import system
import db
import cleanup
import netserver
import taskloop


# ==============================================================================================================


# Bind address and port handed to netserver.HttpServer() in main().
IP              = "::"
SOLOMON_PORT    = 7002

# On-disk layout of the backup service; everything lives under HOME.
HOME            = "/Berkanavt/solomon-backup"
# JSON configuration read by UpdateConf().
CONFIGFILE      = os.path.join(HOME, "config/backup.conf")
# Local tar archives of table dumps (written by BackupYdbTables, pruned by CleanupLocalAndS3).
BACKUPDIR       = os.path.join(HOME, "backup")
# Directory of backup.log when not running with --verbose.
LOGSDIR         = os.path.join(HOME, "logs")
# Scratch space; BackupYdbTablesToS3 recreates TMPDIR/backup on every run.
TMPDIR          = os.path.join(HOME, "tmp")
# NOTE(review): not referenced anywhere in the visible source.
STATEFILE       = os.path.join(TMPDIR, "state")

# Files read by UpdateConf(): the first yields this host's cluster name,
# the second a whitespace-separated list of servers.
GROUPFILE       = "/etc/yandex/group"
GROUPHOSTSFILE  = "/etc/yandex/group_hosts"


# ==============================================================================================================


class THistorgam(object):
    """Counting histogram over fixed upper bounds plus one overflow bin.

    BinsList holds the bin upper bounds; it is searched with bisect, so it
    has to be sorted in ascending order.  A value lands in the first bin whose
    bound is >= the value; anything larger than every bound, and anything
    int() rejects with TypeError (e.g. None), is counted in the last ("inf")
    bin.
    """

    def __init__(self, BinsList):
        self.BinsList = BinsList
        # One counter per bound and a trailing one for the overflow bin.
        self.BinsCount = [0 for _Unused in range(len(BinsList) + 1)]

    def Add(self, Value):
        """Count one observation of Value."""
        try:
            Slot = bisect.bisect_left(self.BinsList, int(Value))
        except TypeError:
            # Not convertible (typically None): account it as overflow.
            Slot = -1
        self.BinsCount[Slot] += 1

    def Get(self):
        """Return {bound-as-string or "inf": count} for every bin."""
        Labels = [str(Bound) for Bound in self.BinsList]
        Labels.append("inf")
        return dict(zip(Labels, self.BinsCount))


def HttpGetRequestHandler(HTTPRequestHandler):
    """Serve one HTTP GET request.

    Paths starting with "/tables" or "/shards" return, as plain text, the
    tables / shard ids currently assigned to this host.  Any other path
    returns the Solomon JSON document with per-table and per-shard gauges and
    two "time since last success" histograms.  Sensors whose value is None
    are left out of the document.

    HTTPRequestHandler is the request object supplied by the HTTP server; the
    code relies on its path, send_response, send_header, end_headers and
    wfile members.
    """
    global CfgLock
    global MetricsLock
    global MetricsDict

    def SecondsSince(Stamp):
        # A missing timestamp (None) cannot be subtracted: report "no value".
        try:
            return int(time.time() - Stamp)
        except TypeError:
            return None

    def MillisSince(Stamp):
        # Whole seconds scaled to milliseconds.
        Seconds = SecondsSince(Stamp)
        return None if Seconds is None else 1000 * Seconds

    def OneIfTrue(Flag):
        # Falsy flags yield None so that the sensor gets filtered out below.
        if Flag:
            return 1
        return None

    def Sensor(Labels, Value, Kind="DGAUGE"):
        return {"labels": Labels, "kind": Kind, "value": Value}

    RequestPath = HTTPRequestHandler.path
    if RequestPath.startswith("/tables"):
        with CfgLock:
            Msg = "\n".join(MyTablesList)
        ContentType = "text/plain"
    elif RequestPath.startswith("/shards"):
        with CfgLock:
            Msg = "\n".join(Tablet["Shard"] for Tablet in MyTabletsList)
        ContentType = "text/plain"
    else:
        # Histogram bounds in seconds.
        BinsList = [600, 1800, 3600, 7200, 14400, 21600, 32400, 43200, 64800, 86400, 129600, 172800, 237600, 324000, 432000, 604800]
        Sensors = []
        with MetricsLock:
            Sensors.append(Sensor({"host": HostnameShort, "sensor": "uptimeMillis"}, MillisSince(ProcStartTime)))
            Sensors.append(Sensor({"host": HostnameShort, "sensor": "version", "revision": PackageVersion}, 1))

            TableHist = THistorgam(BinsList)
            TableStats = MetricsDict.get("Table", {})
            for Table in TableStats:
                Entry = TableStats.get(Table, {})
                Age = SecondsSince(Entry.get("LastSuccessTime"))
                TableHist.Add(Age)
                for SensorName, Value in (
                    ("size",                 Entry.get("Size")),
                    ("lastBackupDuration",   Entry.get("LastBackupDuration")),
                    ("backupIsOnline",       OneIfTrue(Entry.get("BackupIsOnline"))),
                    ("timeSinceLastSuccess", Age),
                ):
                    Sensors.append(Sensor({"table": Table, "sensor": SensorName}, Value))

            KvHist = THistorgam(BinsList)
            KvStats = MetricsDict.get("KV", {})
            for Id in KvStats:
                Entry = KvStats.get(Id, {})
                Age = SecondsSince(Entry.get("LastSuccessTime"))
                KvHist.Add(Age)
                for SensorName, Value in (
                    ("size",                   Entry.get("Size")),
                    ("lastBackupDuration",     Entry.get("LastBackupDuration")),
                    ("backupIsOnline",         OneIfTrue(Entry.get("BackupIsOnline"))),
                    ("timeSinceLastSuccess",   Age),
                    ("lastUploadSize",         Entry.get("LastUploadSize")),
                    ("lastRemoveSize",         Entry.get("LastRemoveSize")),
                    ("currentBytesSentToS3",   Entry.get("CurrentBytesSentToS3")),
                    ("currentBytesLeftToSend", Entry.get("CurrentBytesLeftToSend")),
                    ("currentReadRateFromYDB", Entry.get("CurrentReadRateFromYDB")),
                    ("currentWriteRateToS3",   Entry.get("CurrentWriteRateToS3")),
                    ("errors",                 Entry.get("Errors")),
                ):
                    Sensors.append(Sensor({"shardId": Id, "sensor": SensorName}, Value))

        # The histograms are private to this request, no lock needed any more.
        for SensorName, Hist in (
            ("tableTimeSinceLastSuccessHist", TableHist),
            ("kvTimeSinceLastSuccessHist",    KvHist),
        ):
            for Bin, Count in Hist.Get().items():
                Sensors.append(Sensor({"host": HostnameShort, "sensor": SensorName, "bin": Bin}, Count, Kind="IGAUGE"))

        Msg = json.dumps({
            "commonLabels": {
                "process": "backup",
                "host": ""
            },
            "sensors": [S for S in Sensors if S["value"] is not None]
        })
        ContentType = "application/json"
        system.Log("Reply with solomon json: %s" % Msg)

    HTTPRequestHandler.send_response(200)
    HTTPRequestHandler.send_header("Content-type", ContentType)
    HTTPRequestHandler.send_header("Connection", "Close")
    HTTPRequestHandler.end_headers()
    HTTPRequestHandler.wfile.write(Msg)


# ==============================================================================================================


def GetStringField(String, FieldName):
    """Return the whitespace-delimited token that follows FieldName in String.

    Returns None when FieldName does not occur in String or nothing but
    whitespace follows it.
    """
    _Before, _Marker, Rest = String.partition(FieldName)
    Tokens = Rest.split(None, 1)
    if not Tokens:
        return None
    return Tokens[0]


def PathStripPrefix(Path, Prefix):
    """Return Path without the leading components it shares with Prefix.

    Both arguments are '/'-separated; empty components (leading, trailing or
    doubled slashes) are ignored.  Components are compared pairwise from the
    start and stripping stops at the first mismatch, so only a genuine common
    prefix is removed.  The result is joined with '/' and has no leading or
    trailing slash.
    """
    PathList   = [D for D in Path.split('/')   if D]
    PrefixList = [D for D in Prefix.split('/') if D]
    Idx        = 0
    for PathPart, PrefixPart in zip(PathList, PrefixList):
        if PathPart != PrefixPart:
            # Stop here: a later coincidental match is not part of the prefix.
            break
        Idx += 1
    return '/'.join(PathList[Idx:])


def WriteMetricsDataToS3(ItemName, Dict):
    """Upload the metrics of one table/shard to S3 as JSON.

    The object is stored at <S3BackupPath>/metrics_data/<ItemName relative to
    MyYDB>.json and carries the fields of Dict plus "Host" (this host's name).
    Dict itself is not modified.  Up to RETRYCOUNT attempts are made; every
    failure is logged and nothing is raised to the caller.
    """
    global CfgLock

    # Snapshot the configuration so the lock is not held during network I/O.
    with CfgLock:
        _MyYDB         = MyYDB
        _S3BackupPath  = S3BackupPath

    RETRYCOUNT = 10
    if _MyYDB is None or _S3BackupPath is None:
        system.Log("Failed to upload metrics data to S3 for %s: MyYDB=%s S3BackupPath=%s" % (ItemName, _MyYDB, _S3BackupPath))
        return
    for _1 in range(RETRYCOUNT):
        Data = None
        try:
            S3File = os.path.join(_S3BackupPath, "metrics_data", PathStripPrefix(ItemName, _MyYDB) + ".json")
            # Work on a copy: the caller's dict must not grow a "Host" key.
            Payload = dict(Dict)
            Payload["Host"] = Hostname
            Data = json.dumps(Payload)
            if db.S3Write(Data, S3File):
                system.Log("Successfully uploaded metrics data to S3 for %s@%s: %s" % (ItemName, _S3BackupPath, Data))
                return
            taskloop.Sleep(5)
            system.Log("Retrying metrics data upload to S3 for %s@%s ..." % (ItemName, _S3BackupPath))
        except Exception as e:
            system.Log("Failed to upload metrics data to S3 for %s@%s: %s - %s" % (ItemName, _S3BackupPath, e, repr(Data)))
    # Make the final outcome visible instead of returning silently.
    system.Log("Giving up metrics data upload to S3 for %s@%s after %d attempts" % (ItemName, _S3BackupPath, RETRYCOUNT))


def ReadMetricsDataFromS3(ItemName):
    """Load the metrics JSON previously stored in S3 for one table/shard.

    Returns the decoded dict, or {} when the configuration is incomplete, the
    read or the decoding fails, or the stored document is not a JSON object.
    Never raises; failures are logged.
    """
    global CfgLock

    with CfgLock:
        Ydb, BackupPath = MyYDB, S3BackupPath

    if Ydb is None or BackupPath is None:
        system.Log("Failed to load metrics data from S3 for %s: MyYDB=%s S3BackupPath=%s" % (ItemName, Ydb, BackupPath))
        return {}

    Raw = None
    try:
        ObjectKey = os.path.join(BackupPath, "metrics_data", PathStripPrefix(ItemName, Ydb) + ".json")
        Raw = db.S3Read(ObjectKey)
        Parsed = json.loads(Raw)
        system.Log("Successfully loaded metrics data from S3 for %s@%s: %s" % (ItemName, BackupPath, Raw))
    except Exception as e:
        system.Log("Failed to load metrics data from S3 for %s@%s: %s - %s" % (ItemName, BackupPath, e, repr(Raw)))
        return {}

    # Anything but a JSON object is useless to the callers.
    if isinstance(Parsed, dict):
        return Parsed
    return {}


'''
    MetricsDict = {
        "Table": {
            "TABLE_NAME": {
                "Size": SIZE,                   # from S3
                "LastBackupDuration": TIME,     # from S3
                "BackupIsOnline": TRUE|FALSE,
                "LastSuccessTime": TIMESTAMP,   # from S3
                "LastBackupIsMine": TRUE|FALSE
            },
            ...
        },
        "KV": {
            "SHARD_ID": {
                "Size": SIZE,                   # from S3
                "LastBackupDuration": TIME,     # from S3
                "BackupIsOnline": TRUE|FALSE,
                "LastSuccessTime": TIMESTAMP,   # from S3
                "LastBackupIsMine": TRUE|FALSE,
                "LastUploadSize": SIZE,         # from S3
                "LastRemoveSize": SIZE,         # from S3
                "CurrentBytesSentToS3": SIZE,
                "CurrentBytesLeftToSend": SIZE,
                "CurrentReadRateFromYDB": BPS,
                "CurrentWriteRateToS3": BPS,
                "Errors": COUNT
            },
            ...
        }
    }
'''

def MetricsUpdateTable(Table, Dict, ForceS3Read=False):
    """Merge new values into the in-memory metrics of one table.

    Table        -- table name, key into MetricsDict["Table"].
    Dict         -- explicit updates; a key present here always wins, even
                    when its value is None.
    ForceS3Read  -- re-read the metrics stored in S3 and let them override
                    the in-memory values.

    S3 is also consulted when the cached "Size" is None or when some field is
    known neither to Dict nor to the cache.  Per-field priority:
      forced:  Dict > S3 > cache > default
      normal:  Dict > cache (unless None) > S3 > default
    A cached None counts as "unknown", so a value just fetched from S3 can
    fill it in instead of being discarded.

    NOTE: the S3 read is performed while MetricsLock is held.
    """
    global MetricsDict
    global MetricsLock

    DefaultDict = {
        "Size":               None,
        "LastBackupDuration": None,
        "BackupIsOnline":     False,
        "LastSuccessTime":    None,
        "LastBackupIsMine":   False
    }
    S3Dict = {}
    NewDict = {}
    with MetricsLock:
        TableDict = MetricsDict["Table"].get(Table, {})
        DoS3Read  = (
            ForceS3Read
            or TableDict.get("Size") is None
            or any(K not in Dict and K not in TableDict for K in DefaultDict)
        )
        if DoS3Read:
            S3Dict = ReadMetricsDataFromS3(Table)
        for K in DefaultDict:
            if K in Dict:
                NewDict[K] = Dict[K]
            elif ForceS3Read:
                NewDict[K] = S3Dict.get(K, TableDict.get(K, DefaultDict[K]))
            else:
                Cached = TableDict.get(K)
                if Cached is None:
                    # Unknown in memory: fall back to S3, then to the default.
                    Cached = S3Dict.get(K, DefaultDict[K])
                NewDict[K] = Cached
        MetricsDict["Table"][Table] = NewDict


def MetricsUpdateKV(Shard, Dict, ForceS3Read=False):
    """Merge new values into the in-memory metrics of one KV shard.

    Shard        -- shard id, key into MetricsDict["KV"].
    Dict         -- explicit updates; a key present here always wins, even
                    when its value is None (used to reset progress gauges).
    ForceS3Read  -- re-read the metrics stored in S3 and let them override
                    the in-memory values.

    S3 is also consulted when the cached "Size" is None or when some field is
    known neither to Dict nor to the cache.  Per-field priority:
      forced:  Dict > S3 > cache > default
      normal:  Dict > cache (unless None) > S3 > default
    A cached None counts as "unknown", so a value just fetched from S3 can
    fill it in instead of being discarded.

    NOTE: the S3 read is performed while MetricsLock is held.
    """
    global MetricsDict
    global MetricsLock

    DefaultDict = {
        "Size":                   None,
        "LastBackupDuration":     None,
        "BackupIsOnline":         False,
        "LastSuccessTime":        None,
        "LastBackupIsMine":       False,
        "LastUploadSize":         None,
        "LastRemoveSize":         None,
        "CurrentBytesSentToS3":   None,
        "CurrentBytesLeftToSend": None,
        "CurrentReadRateFromYDB": None,
        "CurrentWriteRateToS3":   None,
        "Errors":                 None
    }
    S3Dict = {}
    NewDict = {}
    with MetricsLock:
        ShardDict = MetricsDict["KV"].get(Shard, {})
        # ForceS3Read has to trigger the read itself (as MetricsUpdateTable
        # does); otherwise a "forced" update of a shard with a known Size
        # would merge an empty S3Dict and never refresh anything.
        DoS3Read  = (
            ForceS3Read
            or ShardDict.get("Size") is None
            or any(K not in Dict and K not in ShardDict for K in DefaultDict)
        )
        if DoS3Read:
            S3Dict = ReadMetricsDataFromS3(Shard)
        for K in DefaultDict:
            if K in Dict:
                NewDict[K] = Dict[K]
            elif ForceS3Read:
                NewDict[K] = S3Dict.get(K, ShardDict.get(K, DefaultDict[K]))
            else:
                Cached = ShardDict.get(K)
                if Cached is None:
                    # Unknown in memory: fall back to S3, then to the default.
                    Cached = S3Dict.get(K, DefaultDict[K])
                NewDict[K] = Cached
        MetricsDict["KV"][Shard] = NewDict


def MetricsUpdate():
    """Periodic task: align the metrics registry with the current assignment.

    Entries for tables/shards that are no longer assigned to this host are
    dropped unless their backup is still marked as running; every assigned
    table/shard is then refreshed with an empty update, which creates missing
    entries.
    """
    global MetricsDict
    global CfgLock
    global MetricsLock

    with CfgLock:
        AssignedTables = list(MyTablesList)
        AssignedShards = [Tablet["Shard"] for Tablet in MyTabletsList]

    def DropUnassigned(Section, Assigned, LogFormat):
        Registry = MetricsDict[Section]
        for Name in set(Registry) - set(Assigned):
            # Keep entries whose backup is still in flight.
            if Registry[Name].get("BackupIsOnline") is not True:
                system.Log(LogFormat % Name)
                del Registry[Name]

    with MetricsLock:
        DropUnassigned("Table", AssignedTables, "Removing table %s from my scope.")
        DropUnassigned("KV",    AssignedShards, "Removing shard %s from my scope.")

    for Table in AssignedTables:
        MetricsUpdateTable(Table, {})

    for Shard in AssignedShards:
        MetricsUpdateKV(Shard, {})



# ==============================================================================================================


def ResetConf():
    """Drop the current configuration: nothing is assigned to this host.

    All configuration globals go back to their start-up values (None / empty
    list / default endpoint) under CfgLock.
    """
    global MyCluster
    global MyYDB
    global MyEndpoint
    global MyKeyFile
    global MyTablesList
    global KVPath
    global MyTabletsList
    global S3BackupPath
    global CfgLock

    with CfgLock:
        system.Log("Resetting configuration")
        MyCluster = MyYDB = MyKeyFile = KVPath = S3BackupPath = None
        MyEndpoint = db.YDB_ENDPOINT
        # Two distinct list objects, never a shared one.
        MyTablesList, MyTabletsList = [], []


def UpdateConf():
    """Periodic task: recompute what this host has to back up.

    Steps (each retried up to RETRYCOUNT times where it can fail transiently):
      1. read the JSON config CONFIGFILE;
      2. read the cluster name (GROUPFILE) and server list (GROUPHOSTSFILE);
      3. pick the cluster section and its ydb / endpoint / key_file /
         metapaths / skippaths / kvpath / s3path settings;
      4. with a kvpath: list KV tablets, keep the ones hosted here and use
         this host's position among the KV servers to choose its slice of
         tables; without a kvpath: tables are backed up by one server per
         hour, chosen as hour % number_of_servers;
      5. list the tables and take this host's contiguous slice.
    On success the configuration globals are replaced atomically under
    CfgLock.  On any failure ResetConf() is called, leaving this host with
    nothing to back up.
    """
    global MyCluster
    global MyYDB
    global MyEndpoint
    global MyKeyFile
    global MyTablesList
    global KVPath
    global MyTabletsList
    global S3BackupPath
    global CfgLock

    RETRYCOUNT  = 10
    ServersList = []
    Cfg         = {}
    for _1 in range(RETRYCOUNT):
        try:
            # "with" closes the file even when json.load() raises.
            with open(CONFIGFILE) as CfgFile:
                Cfg = json.load(CfgFile)
            break
        except (ValueError, IOError) as e:
            system.Log("Failed to read config %s: %s" % (CONFIGFILE, e))
        taskloop.Sleep(1)
        system.Log("Retrying configuration file read ...")
    else:
        system.Log("Failed to read configuration file.")
        ResetConf()
        return

    for _1 in range(RETRYCOUNT):
        _MyCluster  = system.GetFileData(GROUPFILE).strip()
        ServersList = system.GetFileData(GROUPHOSTSFILE).strip().split()
        if len(_MyCluster) > 0 and len(ServersList) > 0:
            break
        system.Log("Failed to read group files %s or %s" % (GROUPFILE, GROUPHOSTSFILE))
        taskloop.Sleep(1)
        system.Log("Retrying configuration files read ...")
    else:
        system.Log("Failed to read configuration files.")
        ResetConf()
        return
    system.Log("Successfully read configuration files")

    if _MyCluster not in Cfg:
        system.Log("Cannot find my cluster in config.")
        ResetConf()
        return

    # Position of this host among the servers sharing the tables, and their
    # number; the defaults mean "this host takes every table".
    Index         = 0
    Size          = 1

    _MyYDB        = Cfg[_MyCluster].get("ydb", None)
    _MyEndpoint   = Cfg[_MyCluster].get("endpoint", db.YDB_ENDPOINT)
    _MyKeyFile    = Cfg[_MyCluster].get("key_file", None)
    YdbPathList   = Cfg[_MyCluster].get("metapaths", [])
    YdbSkipList   = Cfg[_MyCluster].get("skippaths", []) + ["/~backup_", "/."]
    _KVPath       = Cfg[_MyCluster].get("kvpath", None)
    _S3BackupPath = Cfg[_MyCluster].get("s3path", None)
    if not _MyYDB:
        system.Log("Failed to get my YDB name")
        ResetConf()
        return

    if _KVPath is not None:
        _KVPath = os.path.join(_MyYDB, _KVPath)
        for _1 in range(RETRYCOUNT):
            KVList = db.KVLs(_KVPath)
            if len(KVList) > 0:
                break
            taskloop.Sleep(1)
            system.Log("Retrying getting kv list ...")
        else:
            system.Log("Failed to get kv list.")
            ResetConf()
            return
        _MyTabletsList = sorted([KV for KV in KVList if KV["Host"] == Hostname], key=lambda T: int(T["Shard"]))
        KVServersList  = sorted(set([KV["Host"] for KV in KVList]))
        try:
            Index      = KVServersList.index(Hostname)
            Size       = len(KVServersList)
        except ValueError:
            system.Log("Cannot find my server %s in the list (len=%d) of kv servers." % (Hostname, len(KVServersList)))
            ResetConf()
            return
        system.Log("Updated my tablets list from ydb:%s: %s" % (_KVPath, [T["Shard"] for T in _MyTabletsList]))
    else:
        # A host missing from the group hosts file must not crash the task;
        # handle it the same way as the KV branch above.
        try:
            MyTurn = ServersList.index(Hostname)
        except ValueError:
            system.Log("Cannot find my server %s in the list (len=%d) of group hosts." % (Hostname, len(ServersList)))
            ResetConf()
            return
        if int(time.strftime("%H")) % len(ServersList) != MyTurn:
            system.Log("Hourly backup of ydb tables now should be elsewhere.")
            ResetConf()
            return
        else:
            _MyTabletsList = []
            system.Log("No tablets here")

    for _1 in range(RETRYCOUNT):
        TablesList = sorted([str(P) for SP in YdbPathList for P in db.YdbTree(_MyYDB, os.path.join(_MyYDB, SP), YdbSkipList, _MyEndpoint, _MyKeyFile)])
        if len(TablesList) > 0:
            break
        taskloop.Sleep(1)
        system.Log("Retrying getting ydb tables ...")
    else:
        system.Log("Failed to get list of ydb tables.")
        ResetConf()
        return
    # Ceiling division: every server gets a contiguous bunch of equal size,
    # the last bunches may be shorter or empty.
    BunchSize     = len(TablesList) // Size + (0 if len(TablesList) % Size == 0 else 1)
    BunchBegin    = Index*BunchSize
    BunchEnd      = (Index + 1)*BunchSize
    _MyTablesList = TablesList[BunchBegin:BunchEnd]
    if len(_MyTablesList) > 0:
        system.Log("Updated my tables list from ydb:%s - tables[%d:%d) out of %d: %s" % (_MyYDB, BunchBegin, BunchEnd, len(TablesList), _MyTablesList))
    else:
        system.Log("Updated my tables list from ydb:%s - no need to backup any table out of %d: %s" % (_MyYDB, len(TablesList), _MyTablesList))

    with CfgLock:
        MyCluster     = _MyCluster
        MyYDB         = _MyYDB
        MyEndpoint    = _MyEndpoint
        MyKeyFile     = _MyKeyFile
        MyTablesList  = _MyTablesList
        KVPath        = _KVPath
        MyTabletsList = _MyTabletsList
        S3BackupPath  = _S3BackupPath
        system.Log("Success updating config.")


# ==============================================================================================================


def BackupYdbTablesToS3(YDB, Endpoint, SaKeyFile, YdbTableList, S3Prefix, S3SubdirFMT, BackupFileName):
    """Dump every table of YdbTableList and upload the dump files to S3.

    YDB, Endpoint, SaKeyFile -- database and credentials passed through to db.*.
    YdbTableList             -- full table paths to back up.
    S3Prefix, S3SubdirFMT    -- files go to <S3Prefix>/<strftime(S3SubdirFMT)>/
                                <table dir relative to YDB>/.
    BackupFileName           -- local tar archive created from the dump
                                directory when at least one table was dumped.

    A table that already has files in S3 under its prefix is skipped; its
    metrics are refreshed from S3 unless this host made the last backup.
    Returns True only when every listed table was dumped during this call
    (skipped tables are not counted).
    """
    RETRYCOUNT = 10

    def RefreshIfNotMine(Name):
        # Somebody else may have produced the existing backup: take its stats.
        if not MetricsDict["Table"].get(Name, {}).get("LastBackupIsMine"):
            MetricsUpdateTable(Name, {}, ForceS3Read=True)

    def SetOnline(Name, Online):
        MetricsUpdateTable(Name, {"BackupIsOnline": Online})

    def RecordSuccess(Name, DumpSize, BeganAt):
        BeganAt  = int(BeganAt)
        Duration = int(time.time() - BeganAt)
        MetricsUpdateTable(Name, {
            "Size":               DumpSize,
            "LastBackupDuration": Duration,
            "LastSuccessTime":    BeganAt,
            "LastBackupIsMine":   True
        })
        WriteMetricsDataToS3(Name, {
            "Size":               DumpSize,
            "LastBackupDuration": Duration,
            "LastSuccessTime":    BeganAt
        })

    def PutWithRetries(LocalFile, RemoteFile):
        # True as soon as one attempt succeeds, False after RETRYCOUNT failures.
        for _Attempt in range(RETRYCOUNT):
            system.Log("copy file to S3: %s -> %s" % (LocalFile, RemoteFile))
            if db.S3Put(LocalFile, RemoteFile):
                return True
            taskloop.Sleep(2)
            system.Log("Retrying copy file to S3 ...")
        return False

    S3Subdir = time.strftime(S3SubdirFMT)
    TmpDir   = os.path.join(TMPDIR, "backup")
    # Start from an empty scratch directory.
    system.EE(["/bin/rm", "-fr", TmpDir])
    system.EE(["/bin/mkdir", TmpDir])
    db.YdbRmOldBackup(YDB, YDB, 14400, Endpoint, SaKeyFile)
    DumpedTables = 0
    system.Log("Start backup of ydb tables: %s" % YdbTableList)
    for YdbTable in YdbTableList:
        YdbDirName, YdbTableName = os.path.split(YdbTable)
        S3Dir        = os.path.join(S3Prefix, S3Subdir, PathStripPrefix(YdbDirName, YDB))
        S3FilePrefix = os.path.join(S3Dir, YdbTableName)
        FsDir        = os.path.join(TmpDir, YdbDirName.strip('/'))
        if len(db.S3Ls(S3FilePrefix)) > 0:
            system.Log("Files in S3 with prefix %s already present, skipping backup" % S3FilePrefix)
            RefreshIfNotMine(YdbTable)
            continue
        SetOnline(YdbTable, True)
        StartTime = time.time()
        try:
            system.Exec(["/bin/mkdir", "-p", FsDir])
            for _Attempt in range(RETRYCOUNT):
                system.Log("ydb to file: %s -> %s" % (YdbTable, os.path.join(FsDir, YdbTableName)))
                FsFileList, DumpSize = db.YdbTableToTgzSplitFile(YDB, YdbTable, FsDir, 2147483648, Endpoint, SaKeyFile)
                Uploaded = 0
                for FsFile in FsFileList:
                    if PutWithRetries(FsFile, os.path.join(S3Dir, os.path.basename(FsFile))):
                        Uploaded += 1
                if len(FsFileList) == 0:
                    # The dump produced nothing: try the dump again.
                    taskloop.Sleep(5)
                    system.Log("Retrying ydb to file ...")
                    continue
                if Uploaded == len(FsFileList):
                    RecordSuccess(YdbTable, DumpSize, StartTime)
                # Counted as dumped even when some uploads failed.
                DumpedTables += 1
                break
        finally:
            SetOnline(YdbTable, False)
    if DumpedTables > 0:
        system.EE(["/bin/tar", "cvf", BackupFileName, "-C", TmpDir, "."])
    system.EE(["/bin/rm", "-r", TmpDir])
    return (DumpedTables == len(YdbTableList))


def CleanupLocalAndS3(S3Prefix, S3SubdirFMT, NMax, TMaxHours, CmpTime):
    """Prune local tar archives and outdated backup directories in S3.

    S3Prefix     -- S3 prefix whose immediate sub-directories are backups.
    S3SubdirFMT  -- strftime format of the sub-directory names; names that do
                    not parse are logged and left alone.
    NMax, TMaxHours -- retention limits handed to cleanup.BucketRemove (the
                    latter converted to seconds).
    CmpTime      -- reference timestamp; each backup's age is CmpTime minus
                    the time encoded in its directory name.
    """
    RETRYCOUNT = 10
    # Raw string: "\." is a regex escape, not a Python string escape.
    cleanup.DirCleanup(BACKUPDIR, r"backup\..+\.tar", SizeGB=20)
    system.Log("S3 cleanup at prefix %s" % S3Prefix)
    S3PrefixList = []
    for D in db.S3DirLs(S3Prefix):
        try:
            S3PrefixList.append({"Prefix":D, "Value":CmpTime - time.mktime(time.strptime(os.path.basename(D.strip('/')), S3SubdirFMT))})
        except Exception as e:
            system.Log("Exception while cleaning up S3 prefix %s on dir=%s fmt=%s (time conversion error?): %s" % (S3Prefix, D, S3SubdirFMT, e))
    # Last argument: slightly under one hour for hourly formats, slightly
    # under one day otherwise (exact meaning is defined by BucketRemove).
    S3PrefixDeleteList = cleanup.BucketRemove(S3PrefixList, NMax, TMaxHours*3600, 3500 if "%H" in S3SubdirFMT else 86300)
    for S3PrefixDict in S3PrefixDeleteList:
        for _1 in range(RETRYCOUNT):
            system.Log("Removing backup at S3 prefix %s" % S3PrefixDict["Prefix"])
            if db.S3RmByPrefix(S3PrefixDict["Prefix"]):
                break
            taskloop.Sleep(5)
            system.Log("Retrying removing S3 backup ...")
        else:
            # Do not give up silently once the retries are exhausted.
            system.Log("Failed to remove backup at S3 prefix %s" % S3PrefixDict["Prefix"])


# ==============================================================================================================


def BackupYdbTables():
    """Periodic task: back up the tables assigned to this host.

    Without a KV path backups are hourly ("%Y-%m-%d_%H" sub-directories right
    under S3BackupPath, at most 20 kept); with a KV path they are daily
    ("%Y-%m-%d" sub-directories under <S3BackupPath>/meta, at most 10 kept).
    Old local and S3 backups are cleaned up before the new one is made.
    """
    global CfgLock

    with CfgLock:
        Ydb        = MyYDB
        Endpoint   = MyEndpoint
        KeyFile    = MyKeyFile
        Tables     = copy.deepcopy(MyTablesList)
        KvRoot     = KVPath
        BackupPath = S3BackupPath

    if Ydb is None or BackupPath is None:
        system.Log("Cannot backup tables: MyYDB=%s S3BackupPath=%s" % (Ydb, BackupPath))
        return

    if len(Tables) == 0:
        system.Log("No tables to backup now.")
        return

    system.Log("Start tables backup %s" % Tables)
    if KvRoot is None:
        ArchiveFMT, S3SubdirFMT, MaxBackupCount = "backup.%Y-%m-%d_%H.tar", "%Y-%m-%d_%H", 20
        S3Prefix = BackupPath
    else:
        ArchiveFMT, S3SubdirFMT, MaxBackupCount = "backup.%Y-%m-%d.tar", "%Y-%m-%d", 10
        S3Prefix = os.path.join(BackupPath, "meta")
    BackupFileName = os.path.join(BACKUPDIR, time.strftime(ArchiveFMT))
    CmpTime        = time.time()
    MaxBackupHours = 24*60
    CleanupLocalAndS3(S3Prefix, S3SubdirFMT, MaxBackupCount, MaxBackupHours, CmpTime)
    BackupYdbTablesToS3(Ydb, Endpoint, KeyFile, Tables, S3Prefix, S3SubdirFMT, BackupFileName)
    system.Log("Done tables backup.")


def BackupKV():
    """Periodic task: back up the KV shards hosted on this server to S3.

    The whole pass is attempted up to RETRYCOUNT times.  In each pass:
      * the shard metrics are refreshed (from S3 when the last backup was not
        made by this host);
      * shards are ordered by their last successful backup, oldest first;
      * if even the oldest backup is younger than GOODAGE seconds, the task
        sleeps until it reaches that age;
      * every shard is dumped with db.KVdump(), whose output lines drive the
        metrics: "STATS ON TABLET" (sizes), "STATS ON DUMP" (progress),
        "STATUS ... OK|BAD|RETRY" (outcome); anything else is logged.
    A pass in which some shard ended with STATUS BAD is repeated.
    """
    global CfgLock

    def CheckKV(Shard):
        # Take the stats from S3 when somebody else made the last backup; make
        # sure an entry with a timestamp exists otherwise.
        if not MetricsDict["KV"].get(Shard, {}).get("LastBackupIsMine"):
            MetricsUpdateKV(Shard, {}, ForceS3Read=True)
        elif not MetricsDict["KV"].get(Shard, {}).get("LastSuccessTime"):
            MetricsUpdateKV(Shard, {})
    def StartBackup(Shard):
        MetricsUpdateKV(Shard, {
            "BackupIsOnline":         True
        })
    def BackupProcessUpdate(Shard, BytesSentToS3, BytesLeftToSend, ReadRateFromYDB, WriteRateToS3, Errors):
        MetricsUpdateKV(Shard, {
            "CurrentBytesSentToS3":   BytesSentToS3,
            "CurrentBytesLeftToSend": BytesLeftToSend,
            "CurrentReadRateFromYDB": ReadRateFromYDB,
            "CurrentWriteRateToS3":   WriteRateToS3,
            "Errors":                 Errors
        })
    def BackupIsSuccess(Shard, TotalSize, BytesToUpload, BytesToRemove, StartTime, Errors):
        StartTime = int(StartTime)
        DeltaTime = int(time.time() - StartTime)
        MetricsUpdateKV(Shard, {
            "Size":                   TotalSize,
            "LastBackupDuration":     DeltaTime,
            "LastSuccessTime":        StartTime,
            "LastBackupIsMine":       True,
            "LastUploadSize":         BytesToUpload,
            "LastRemoveSize":         BytesToRemove,
            "Errors":                 Errors
        })
        WriteMetricsDataToS3(Shard, {
            "Size":                   TotalSize,
            "LastBackupDuration":     DeltaTime,
            "LastSuccessTime":        StartTime,
            "LastUploadSize":         BytesToUpload,
            "LastRemoveSize":         BytesToRemove
        })
    def StopBackup(Shard):
        # Explicit None values reset the progress gauges.
        MetricsUpdateKV(Shard, {
            "BackupIsOnline":         False,
            "CurrentBytesSentToS3":   None,
            "CurrentBytesLeftToSend": None,
            "CurrentReadRateFromYDB": None,
            "CurrentWriteRateToS3":   None
        })

    GOODAGE    = 14400
    RETRYCOUNT = 5
    AmpFactor  = 10
    for _1 in range(RETRYCOUNT):
        with CfgLock:
            _MyYDB         = MyYDB
            _KVPath        = KVPath
            _MyTabletsList = copy.deepcopy(MyTabletsList)
            _S3BackupPath  = S3BackupPath

        if _MyYDB is None or _KVPath is None or _S3BackupPath is None:
            system.Log("Cannot backup KV: MyYDB=%s KVPath=%s S3BackupPath=%s" % (_MyYDB, _KVPath, _S3BackupPath))
            taskloop.Sleep(10)
            continue
        if len(_MyTabletsList) == 0:
            # Nothing hosted here; indexing the first shard below would fail.
            system.Log("No shards to backup now.")
            return
        Success  = True
        S3Prefix = os.path.join(_S3BackupPath, "kv")
        system.Log("Going to backup %d shards: %s" % (len(_MyTabletsList), [T["Shard"] for T in _MyTabletsList]))
        for T in _MyTabletsList:
            CheckKV(T["Shard"])
        # "or 0": a stored None (never backed up) sorts as the oldest and is
        # never compared with a number.
        _MyTabletsList.sort(key=lambda T: MetricsDict["KV"].get(T["Shard"], {}).get("LastSuccessTime") or 0)
        FirstShard           = _MyTabletsList[0]["Shard"]
        FirstShardLastBackup = MetricsDict["KV"].get(FirstShard, {}).get("LastSuccessTime", 0)
        FirstShardBackupAge  = time.time() - (FirstShardLastBackup if FirstShardLastBackup else 0)
        system.Log("First shard to backup is %s with backup age %d seconds" % (FirstShard, FirstShardBackupAge))
        AgingTime = GOODAGE - FirstShardBackupAge
        if AgingTime > 0:
            system.Log("Waiting for the oldest backup for shard %s to age. Sleeping %d seconds ..." % (FirstShard, AgingTime))
            taskloop.Sleep(AgingTime)
        for i, T in enumerate(_MyTabletsList):
            system.Log("Starting to backup shard %s (%d of %d) (tablet %s)" % (T["Shard"], i + 1, len(_MyTabletsList), T["Tablet"]))
            StartBackup(T["Shard"])
            StartTime = time.time()
            try:
                MetricsUpdateTimer = time.time()
                RetryCount = RETRYCOUNT
                RESETCOUNTER = 100
                MINUPDATEINTERVAL = 3
                Errors = 0
                # Per-shard state: without this reset the values of the
                # previous shard would leak into this one, or the names would
                # be unbound if the dump never printed the matching line.
                TotalSize = BytesToUpload = BytesToRemove = None
                StatsResetCounter = RESETCOUNTER
                while RetryCount > 0:
                    GotStatus = False
                    for Stat in db.KVdump(_KVPath, T["Shard"], S3Prefix, AmpFactor=AmpFactor):
                        if Stat.startswith("STATUS"):
                            GotStatus = True
                            if Stat.endswith("BAD"):
                                Success = False
                                Errors += 1
                                system.Log("Failed to backup shard %s (tablet %s, errors %d) ..." % (T["Shard"], T["Tablet"], Errors))
                                taskloop.Sleep(5)
                            elif Stat.endswith("OK"):
                                system.Log("Successful backup of shard %s (tablet %s, errors %d) ..." % (T["Shard"], T["Tablet"], Errors))
                                BackupIsSuccess(T["Shard"], TotalSize, BytesToUpload, BytesToRemove, StartTime, Errors)
                            elif Stat.endswith("RETRY"):
                                Errors += 1
                                system.Log("Failed to backup shard %s (tablet %s, errors %d). Retrying ..." % (T["Shard"], T["Tablet"], Errors))
                                taskloop.Sleep(5)
                                RetryCount -= 1
                                break
                            # Final status: finish reading the output, then
                            # leave the retry loop.
                            RetryCount = 0
                        else:
                            if Stat.startswith("STATS ON TABLET"):
                                # STATS ON TABLET <tabletId> BYTES_TOTAL: <bytesTotal> BYTES_UPLOAD_UNCOMPRESSED: <bytesUpload> BYTES_REMOVE: <bytesRemove>
                                TotalSize       = system.Int(GetStringField(Stat, "BYTES_TOTAL:"))
                                BytesToUpload   = system.Int(GetStringField(Stat, "BYTES_UPLOAD_UNCOMPRESSED:"))
                                BytesToRemove   = system.Int(GetStringField(Stat, "BYTES_REMOVE:"))
                            elif Stat.startswith("STATS ON DUMP"):
                                # STATS ON DUMP TOTAL_BYTES_SENT: <bytesUploaded> TOTAL_BYTES_LEFT: <bytesUpload> CHUNK_SIZE: <size> TIME_GET_US: <timeGetUs> TIME_PUT_US: <timePutUs> ERROR: <count>
                                # NOTE(review): the format above says "ERROR:" while the
                                # code looks for "ERRORS:" - confirm which one KVdump prints.
                                BytesSentToS3   = system.Int(GetStringField(Stat, "TOTAL_BYTES_SENT:"))
                                BytesLeftToSend = system.Int(GetStringField(Stat, "TOTAL_BYTES_LEFT:"))
                                ChunkSize       = system.Float(GetStringField(Stat, "CHUNK_SIZE:"))
                                Errors          += system.Int(GetStringField(Stat, "ERRORS:"))
                                try:
                                    # Bytes per second: chunk size over microseconds.
                                    ReadRateFromYDB = int(ChunkSize/system.Int(GetStringField(Stat, "TIME_GET_US:"))*1000000)
                                    WriteRateToS3   = int(ChunkSize/system.Int(GetStringField(Stat, "TIME_PUT_US:"))*1000000)
                                except ZeroDivisionError:
                                    ReadRateFromYDB = 0
                                    WriteRateToS3   = 0
                                # Throttle the metrics updates.
                                if time.time() - MetricsUpdateTimer > MINUPDATEINTERVAL:
                                    BackupProcessUpdate(T["Shard"], BytesSentToS3, BytesLeftToSend, ReadRateFromYDB, WriteRateToS3, Errors)
                                    MetricsUpdateTimer = time.time()
                                StatsResetCounter   = RESETCOUNTER
                            else:
                                # After RESETCOUNTER consecutive non-stats lines the
                                # progress gauges are reset (rates to 0).
                                if StatsResetCounter <= 0:
                                    BackupProcessUpdate(T["Shard"], None, None, 0, 0, Errors)
                                    StatsResetCounter = RESETCOUNTER
                                else:
                                    StatsResetCounter -= 1
                                system.Log("%s" % Stat)
                    if not GotStatus:
                        # The dump ended without reporting any STATUS line.
                        # Count it as a failed attempt, otherwise RetryCount
                        # never changes and this loop would spin forever.
                        Errors += 1
                        system.Log("Dump of shard %s (tablet %s, errors %d) ended without status. Retrying ..." % (T["Shard"], T["Tablet"], Errors))
                        taskloop.Sleep(5)
                        RetryCount -= 1
            finally:
                StopBackup(T["Shard"])
        if Success:
            break
        system.Log("Retrying shards backup ...")
    else:
        system.Log("Failed to backup shards.")


# ==============================================================================================================
# Globals

# Configuration state; replaced by UpdateConf()/ResetConf() and read under
# CfgLock.  Both locks are created in main().
CfgLock         = None
MyCluster       = None
MyYDB           = None
MyEndpoint      = db.YDB_ENDPOINT
MyKeyFile       = None
MyTablesList    = []
KVPath          = None
MyTabletsList   = []
S3BackupPath    = None

# Metrics registry (layout described in the schema string above
# MetricsUpdateTable); accessed under MetricsLock.
MetricsLock     = None
MetricsDict     = {"Table": {}, "KV": {}}

Hostname        = system.GetHostname()
HostnameShort   = Hostname.split('.')[0]
ProcStartTime   = time.time()

# The version module is optional; report "unknown" when it is missing or
# broken.  "except Exception" keeps this best-effort without also swallowing
# SystemExit/KeyboardInterrupt the way a bare "except:" does.
try:
    import version
    PackageVersion = version.VERSION
except Exception:
    PackageVersion = "unknown"


# ==============================================================================================================


def main():
    """Entry point: set up logging, locks, the HTTP server and the tasks.

    With -v/--verbose the log goes to stdout, otherwise to LOGSDIR/backup.log.
    The configuration is loaded once synchronously before the metrics HTTP
    server and the periodic tasks are started; the call then blocks in
    Tasks.WaitAll().
    """
    global CfgLock, MetricsLock

    Parser = argparse.ArgumentParser(description="cluster manage script")
    Parser.add_argument("-v", "--verbose", action="store_true", help="log to stdout")
    Args = Parser.parse_args()

    if Args.verbose:
        Logger = system.OutLog
    else:
        Logger = system.FileLog(os.path.join(LOGSDIR, "backup.log"), 100*1024*1024, 20, True)
    system.SetLogger(Logger, Threaded=True, Tag=True)
    system.Log("==============================================================================================================")

    Tasks = taskloop.TTasksLoop(Threaded=True, ExcTimeOuter=True)

    CfgLock     = taskloop.Lock()
    MetricsLock = taskloop.Lock()

    # First configuration load happens before anything can ask for it.
    UpdateConf()

    netserver.HttpServer(GetHandler=HttpGetRequestHandler, IP=IP, Port=SOLOMON_PORT, Threaded=True, Name="Http", HttpLogger=None)

    for Func, Options in (
        (UpdateConf,      dict(Period=60,   Timeout=180,     Name="Config")),
        (MetricsUpdate,   dict(Period=30,   Timeout=60,      Jitter=10, Name="Metrics")),
        (BackupKV,        dict(Period=1800, Timeout=23*3600, MinDistance=600, Name="KV")),
        (BackupYdbTables, dict(Period=1800, Timeout=12*3600, MinDistance=600, Name="Ydb")),
    ):
        Tasks.StartTask(Func, **Options)

    Tasks.WaitAll()


# ==============================================================================================================


# Run the service only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
