#!/usr/bin/env python

import sys
import os
import requests
import urllib
import uuid
import logging
import json
import re
import shutil

from kazoo.client import KazooClient
from kazoo.exceptions import NodeExistsError
from kazoo.exceptions import NoNodeError

import yt.wrapper as yt

# Root ZooKeeper node of the import queue; pickTaskAndLock() lists the tasks
# found under "<this node>/<hostname>".
TASKS_QUEUE_NODE = "/webmaster3/yt-import-queue"

# Names of the child nodes looked up under every task node.
SUBNODE_LOCK = "lock"
SUBNODE_STATUS = "status"

# Values stored in a task's status node.
# NOTE(review): STATUS_FAILED is not written by any code visible here.
STATUS_IN_PROGRESS = "IN_PROGRESS"
STATUS_FAILED = "FAILED"
STATUS_DONE = "DONE"
# URL prefix of the ClickHouse HTTP interface; the URL-quoted query text is appended to it.
CLICKHOUSE_QUERY_URL = "http://localhost:8123/?query="
# Start and end of the ClickHouse error text that clickhouseRequest() recognises
# as "replica already exists"; the text in between is taken as the replica path.
REPLICA_ALREADY_EXISTS_PREFIX = "Code: 253, e.displayText() = DB::Exception: Replica "
REPLICA_ALREADY_EXISTS_SUFFIX = " already exists., e.what() = DB::Exception\n"
# Minimal fraction (0..1) of available blocks on "/" required by checkAvailableSpace().
MINIMAL_FREE_SPACE = 0.02

# Identifier of this run; main() replaces it with a fresh UUID, which is then
# stored as the value of the task lock node.
RUN_ID = ""
# Importer configuration; filled in by loadConfig().
CONFIG = {}


def initLog():
    """Send all log records (DEBUG and above) of the root logger to the importer log file.

    A ``logging.FileHandler`` is used instead of wrapping a manually opened
    file in a ``StreamHandler`` so that the handler owns the file object and
    closes it when logging shuts down.
    """
    logFileName = "/var/log/webmaster3/webmaster3-clickhouse-importer.log"

    # Append mode: records from previous runs are kept.
    handler = logging.FileHandler(logFileName, mode="a")
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.addHandler(handler)


def load_secrets(secrets_dir):
    """Read per-cluster secrets from the files of ``secrets_dir``.

    Every file name must consist of exactly three dot-separated parts; the
    middle part is used as the cluster name and the file content as the
    secret value.

    Args:
        secrets_dir: directory whose entries are all secret files.

    Returns:
        dict mapping cluster name to secret value, with surrounding
        whitespace removed.

    Raises:
        ValueError: if a file name does not have exactly three
            dot-separated parts.
    """
    secrets = {}
    for secret_key in os.listdir(secrets_dir):
        with open(os.path.join(secrets_dir, secret_key)) as f:
            # Secret files commonly end with a newline; it must not become
            # part of the token.
            secret_value = f.read().strip()
        name_parts = secret_key.split('.')
        if len(name_parts) != 3:
            raise ValueError(
                "Unexpected secret file name " + repr(secret_key)
                + ", expected <prefix>.<cluster>.<suffix>"
            )
        secrets[name_parts[1]] = secret_value
    return secrets


def loadConfig():
    """Populate the global CONFIG from the importer JSON config and the secrets directory.

    The parsed JSON document becomes CONFIG; the secrets returned by
    load_secrets() are then stored under its "yt-tokens" key.
    """
    global CONFIG

    with open("/etc/yandex/webmaster3/clickhouse-importer/importer-config.json") as configStream:
        rawConfig = configStream.read()
    CONFIG = json.loads(rawConfig)

    secretsDir = "/etc/yandex/webmaster3/clickhouse-importer/secrets"
    CONFIG["yt-tokens"] = load_secrets(secretsDir)


class ImportTask:
    """A queued import task: its ZooKeeper node path plus the decoded task spec."""

    def __init__(self, pathNode, taskSpec):
        # pathNode: path of the task node; taskSpec: the task specification object.
        self.pathNode, self.taskSpec = pathNode, taskSpec


def clickhouseRequest(query, data=None, headers=None):
    """Send ``query`` to ClickHouse via HTTP POST.

    Args:
        query: query text; it is URL-quoted and appended to CLICKHOUSE_QUERY_URL.
        data: optional request body.
        headers: optional dict of extra HTTP headers.

    Returns:
        None when ClickHouse answers with HTTP 200.

    Raises:
        Exception: on any non-200 answer. If the error text is the
            "replica already exists" message, the stale replica registration
            is cleaned up in ZooKeeper first (only when it is the single
            replica), and the exception is raised afterwards anyway.
    """
    if headers is None:
        headers = {}
    response = requests.post(CLICKHOUSE_QUERY_URL + urllib.quote(query), data=data, headers=headers)
    if response.status_code == 200:
        return

    logging.error("Response: " + response.text)
    if response.text.startswith(REPLICA_ALREADY_EXISTS_PREFIX) and response.text.endswith(REPLICA_ALREADY_EXISTS_SUFFIX):
        # The text between the known prefix and suffix is the ZooKeeper path of the replica.
        replicaName = response.text[len(REPLICA_ALREADY_EXISTS_PREFIX):-len(REPLICA_ALREADY_EXISTS_SUFFIX)]
        replicasNode = replicaName[0:replicaName.rindex("/")]
        logging.info("Replica name: " + replicaName)
        # check for replica is single
        replicas = ZK.get_children(replicasNode)
        if len(replicas) > 1:
            # `replicas` is a list: it has to be converted explicitly, otherwise
            # the concatenation raises TypeError and hides the ClickHouse error.
            logging.warning("Too many replicas. Exit " + str(replicas))
        else:
            # remove replica: delete the parent of the replicas node recursively
            logging.info("Removing replicas " + replicasNode)
            ZK.delete(replicasNode[0:replicasNode.rindex("/")], recursive=True)
    raise Exception("Clickhouse responded with code " + str(response.status_code))

def clickhouseGet(query):
    """Run ``query`` against ClickHouse via HTTP GET and return the response text.

    Raises:
        Exception: if ClickHouse answers with a status code other than 200.
    """
    url = CLICKHOUSE_QUERY_URL + urllib.quote(query)
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    raise Exception("Clickhouse responded with code " + str(response.status_code))


def importTable(task):
    """Import the YT table described by ``task`` into a local ClickHouse table.

    Steps:
      1. Configure the YT client (proxy URL and token) for the cluster named
         in ``ytTablePath`` ("<cluster>:<path>").
      2. Drop and re-create the target table and a temporary "imprt_" table;
         the temporary one uses the create spec rewritten by tmpTableSpec().
      3. Insert the "data" field of every YT row into the temporary table,
         re-checking the task lock every 20 rows.
      4. OPTIMIZE and DETACH the temporary table, move its part directories
         into the target table's "detached" directory and attach the
         affected partitions to the target table.
      5. Optionally verify that the target table is not empty, mark the task
         DONE and clean up the temporary table and leftover "inactive_"
         directories.

    Args:
        task: ImportTask whose ``taskSpec`` provides "dataCompressionMode",
            "ytTablePath", "clickhouseTableName", "clickhouseCreateSpec",
            "clickhouseInsertSpec" and optionally "checkForTableEmptiness".

    Raises:
        Exception: if no YT token is configured for the cluster, if a
            ClickHouse request fails, or if the emptiness check is requested
            and the row count cannot be read or is zero.
        KeyError: if "dataCompressionMode" is neither "NONE" nor "GZIP".
    """
    taskSpec = task.taskSpec
    headers = {}

    compression = taskSpec["dataCompressionMode"]

    ytCluster, ytPath = taskSpec["ytTablePath"].split(":", 1)

    if compression != "NONE":
        # NOTE(review): presumably the rows hold pre-compressed payloads that are
        # forwarded to ClickHouse as-is with a matching Content-Encoding - confirm.
        yt.config["proxy"]["content_encoding"] = "identity"
        headers["Content-Encoding"] = {
            "GZIP": "gzip"
        }[compression]

    yt.config["proxy"]["url"] = ytCluster + ".yt.yandex.net"
    ytTokens = CONFIG["yt-tokens"]
    if ytCluster not in ytTokens:
        raise Exception("Token for yt cluster " + ytCluster + " not found in config")
    yt.config["token"] = ytTokens[ytCluster]
    [dbName, tableName] = splitTableName(taskSpec["clickhouseTableName"])
    tmpTableName = "imprt_" + tableName
    logging.info("Creating table " + tableName)
    deleteTable(dbName, tableName)
    deleteTable(dbName, tmpTableName)
    tmpTableCreateSpec = tmpTableSpec(taskSpec["clickhouseCreateSpec"])
    clickhouseRequest("CREATE TABLE " + dbName + "." + tableName + " " + taskSpec["clickhouseCreateSpec"])
    clickhouseRequest("CREATE TABLE " + dbName + "." + tmpTableName + " " + tmpTableCreateSpec)
    insertCommand = "INSERT INTO " + dbName + "." + tmpTableName + " " + taskSpec["clickhouseInsertSpec"]
    count = 0
    for line in yt.read_table(ytPath, raw=False, format=yt.DsvFormat()):
        # Make sure we still own the task before every 20th insert (including the first).
        if (count % 20) == 0:
            assertTaskLocked(task.pathNode)
        data = line["data"]
        count += 1
        clickhouseRequest(insertCommand, data=data, headers=headers)
        logging.info("Processed line " + str(count))

    clickhouseRequest("OPTIMIZE TABLE " + dbName + "." + tmpTableName)
    clickhouseRequest("DETACH TABLE " + dbName + "." + tmpTableName)
    tmpTableDataDir = tableDataDir(dbName, tmpTableName)
    tableDetachedDir = os.path.join(tableDataDir(dbName, tableName), "detached")
    # Every directory of the temporary table except "detached" is treated as a data part.
    # TODO: validate by name pattern
    partsToAttach = [
        part for part in os.listdir(tmpTableDataDir)
        if part != "detached" and os.path.isdir(os.path.join(tmpTableDataDir, part))
    ]

    partitions = set()
    for part in partsToAttach:
        # The partition id is the leading six digits of the part directory name.
        partitions.add(re.sub(r"^(\d{6}).*$", r"\g<1>", part))
        shutil.move(os.path.join(tmpTableDataDir, part), tableDetachedDir)

    for partition in partitions:
        clickhouseRequest("ALTER TABLE " + dbName + "." + tableName + " ATTACH PARTITION '" + partition + "'")

    if taskSpec.get("checkForTableEmptiness"):
        logging.info("Checking that table " + dbName + "." + tableName + " is not empty")
        resp = clickhouseGet("SELECT count(*) FROM " + dbName + "." + tableName)
        try:
            rowCount = int(resp)
        except ValueError:
            raise Exception("Could not get count of rows in imported table " + dbName + "." + tableName)
        # The requested check has to actually fail for an empty table;
        # a zero count must not be reported as a successful import.
        if rowCount == 0:
            raise Exception("Imported table " + dbName + "." + tableName + " is empty")
        logging.info("Sucessfully imported " + str(rowCount) + " rows into table")

    changeStatus(task.pathNode, STATUS_DONE)
    # Trying to cleanup, because detached table will be resurrected after restart
    clickhouseRequest("ATTACH TABLE " + dbName + "." + tmpTableName + " " + tmpTableCreateSpec)
    clickhouseRequest("DROP TABLE " + dbName + "." + tmpTableName)
    for detachedFile in os.listdir(tableDetachedDir):
        if detachedFile.startswith("inactive_"):
            shutil.rmtree(os.path.join(tableDetachedDir, detachedFile))


def deleteTable(dbName, tableName):
    """Drop ``dbName.tableName`` in ClickHouse (if present) and remove its data directory."""
    dropQuery = "DROP TABLE IF EXISTS " + dbName + "." + tableName
    clickhouseRequest(dropQuery)

    leftoverDir = tableDataDir(dbName, tableName)
    if not os.path.isdir(leftoverDir):
        return
    # The directory may still exist after the DROP; remove it with all its content.
    shutil.rmtree(leftoverDir)


def splitTableName(fullName):
    """Split a dotted table name such as "db.table" into the list of its parts.

    A plain ``str.split`` replaces the former regex split: the result is the
    same, and the non-raw pattern with an invalid escape sequence is gone.
    """
    return fullName.split(".")


def tableDataDir(dbName, tableName):
    """Return the on-disk ClickHouse data directory path built for ``dbName.tableName``."""
    relativePath = "/".join((dbName, tableName))
    return "/var/lib/clickhouse/data/" + relativePath


def tmpTableSpec(tableSpec):
    """Rewrite a ReplicatedMergeTree create spec into a plain MergeTree one.

    The engine name together with its first two single-quoted arguments is
    replaced by "MergeTree("; a spec without such an engine clause is
    returned unchanged.
    """
    # Raw string: the former non-raw literal relied on the invalid escape "\(".
    return re.sub(r"ReplicatedMergeTree\( *'[^']+', *'[^']+',", "MergeTree(", tableSpec)


def changeStatus(taskPath, status):
    """Write ``status`` into the status node of the task, after verifying the task lock."""
    assertTaskLocked(taskPath)
    statusNode = "/".join((taskPath, SUBNODE_STATUS))
    ZK.set(statusNode, status)


def assertTaskLocked(taskPath):
    """Terminate the process unless the lock node of the task still holds this run's id.

    Args:
        taskPath: ZooKeeper path of the task node.

    Raises:
        SystemExit: with exit code 1 when the lock node content differs from RUN_ID.
    """
    lockNode = taskPath + "/" + SUBNODE_LOCK
    data, stat = ZK.get(lockNode)
    if data != RUN_ID:
        logging.error("Missed lock for task " + taskPath + ", will exit")
        # sys.exit() rather than the builtin exit(): the latter is installed by
        # the `site` module and is meant for interactive sessions only.
        sys.exit(1)


def pickTaskAndLock():
    """Lock and return the first task in this host's queue that is not DONE.

    Tasks are visited in sorted name order. For each one an ephemeral lock
    node carrying RUN_ID is created; tasks that are already locked, already
    DONE, or have no status node are skipped.

    Returns:
        ImportTask for the claimed task (its status is set to IN_PROGRESS),
        or None when the queue node is missing or no task could be claimed.
    """
    queuePath = TASKS_QUEUE_NODE + "/" + HOST
    try:
        taskNames = ZK.get_children(queuePath)
        taskNames.sort()
    except NoNodeError:
        # No queue for this host.
        return None

    for taskName in taskNames:
        taskPath = queuePath + "/" + taskName
        lockPath = taskPath + "/" + SUBNODE_LOCK
        statusPath = taskPath + "/" + SUBNODE_STATUS

        try:
            ZK.create(lockPath, ephemeral=True, value=RUN_ID)
        except NodeExistsError:
            # Somebody else holds the lock for this task.
            continue

        try:
            currentStatus, _ = ZK.get(statusPath)
            if currentStatus == STATUS_DONE:
                # Already imported, look at the next task.
                continue
            # This one is ours.
            ZK.set(statusPath, value=STATUS_IN_PROGRESS)
        except NoNodeError:
            # The task is being deleted right now; leave it alone.
            continue

        rawSpec, _ = ZK.get(taskPath)
        return ImportTask(taskPath, json.loads(rawSpec))

    return None

def checkAvailableSpace():
    """Terminate the process when the root filesystem is almost full.

    The share of blocks available on "/" (``f_bavail / f_blocks``) is
    compared with MINIMAL_FREE_SPACE.

    Raises:
        SystemExit: with exit code 1 when the available share is below the threshold.
    """
    stat = os.statvfs('/')
    avail = stat.f_bavail / float(stat.f_blocks)
    if avail < MINIMAL_FREE_SPACE:
        logging.error("Not enough free space: " + str(avail * 100) + "%, will exit")
        # sys.exit() rather than the builtin exit(): the latter is installed by
        # the `site` module and is meant for interactive sessions only.
        sys.exit(1)

def main():
    """Entry point: pick at most one queued task for this host and import it.

    Sets the HOST, RUN_ID and ZK globals, initialises logging, checks free
    disk space and loads the configuration before connecting to ZooKeeper.
    A failure of the import itself is written to the log with its traceback
    and then re-raised, so the process still terminates with an error.
    """
    global RUN_ID
    global ZK
    global HOST
    HOST = os.uname()[1]
    RUN_ID = str(uuid.uuid1())

    initLog()
    checkAvailableSpace()
    loadConfig()

    ZK = KazooClient(hosts=CONFIG["zookeeper-hosts"], timeout=60.0)
    ZK.start()
    task = pickTaskAndLock()
    if not task:
        logging.info("No tasks found")
        return

    try:
        importTable(task)
    except Exception:
        # Without this the traceback only reaches stderr and never the log file.
        logging.exception("Import of task " + task.pathNode + " failed")
        raise


# Run the importer only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
