#!/usr/bin/python

import yt.wrapper as yt
import os
import os.path
import datetime
import argparse
import re


# Proxy URLs of the two YT clusters this script can upload to.
HAHN = "hahn.yt.yandex.net"
FREUD = "freud.yt.yandex.net"

# Root path in YT used by main() when no --root argument is given.
HAHN_ROOT_PATH = "//home/trencher/backup"
# Local data files to upload: names starting with "uniparse" and ending with "txt".
DATA_FILE_NAME = re.compile('^uniparse.*txt$')
# Local per-day subdirectories: names made of exactly eight digits.
DATA_DIR_NAME = re.compile('^[0-9]{8}$')
# strptime/strftime format used for directory names and for the --start/--end arguments.
DATE_FORMAT = '%Y%m%d'
# Value of the "compression_codec" attribute/argument passed to YT in write_table().
COMPRESSION_CODEC = 'brotli_8'


# Module-level side effect: the Hahn proxy is selected as soon as the module is
# loaded; main() switches it to Freud when requested.
yt.config["proxy"]["url"] = HAHN


def write_table(data_file_path, date_dir, root_path):
    """Upload one tab-separated data file into a YT table.

    The destination is ``<root_path>/<date_dir>/<file name without extension>``.
    The first line of the file supplies the column names and every following
    line becomes one row (a dict of column name -> string value).

    If the destination already exists nothing is written; the existing node
    is only passed to ``yt.transform`` with ``COMPRESSION_CODEC``.

    :param data_file_path: path of the local data file.
    :param date_dir: name of the date directory to create under ``root_path``.
    :param root_path: YT path under which the date directory is placed.
    :returns: the YT path of the destination table.
    :raises ValueError: if the data file is empty (has no header line).
    """
    separator = '\t'
    dst_dir = yt.ypath_join(root_path, date_dir)
    yt.mkdir(dst_dir, recursive=True)
    table_name = os.path.splitext(os.path.basename(data_file_path))[0]
    dst_path = yt.ypath_join(dst_dir, table_name)

    if yt.exists(dst_path):
        yt.transform(dst_path, compression_codec=COMPRESSION_CODEC)
        return dst_path

    # Open the file and read the header *before* creating the table, so an
    # unreadable or empty file does not leave an empty table behind that a
    # later run would mistake for an already uploaded one.
    with open(data_file_path) as data_file:
        header_line = next(data_file, None)
        if header_line is None:
            raise ValueError("Data file is empty, no header line found: '{0}'.".format(data_file_path))
        headers = header_line.strip().split(separator)
        yt.create('table', dst_path, attributes={"compression_codec": COMPRESSION_CODEC})
        # Strip only the line terminator: a plain strip() would also remove
        # leading/trailing tabs, shifting values into the wrong columns when
        # the first field is empty and dropping an empty last field.
        rows = (
            dict(zip(headers, line.rstrip('\r\n').split(separator)))
            for line in data_file
        )
        yt.write_table(dst_path, rows)
    return dst_path


def date_range(start_date, end_date):
    """Yield every date from ``start_date`` to ``end_date``, both inclusive.

    Nothing is yielded when ``end_date`` is earlier than ``start_date``.
    """
    # range() rather than the Python-2-only xrange(), so the generator works
    # on both interpreters.
    day_count = (end_date - start_date).days + 1
    for offset in range(day_count):
        yield start_date + datetime.timedelta(days=offset)


def get_data_dir_paths(data_dir, start=None, end=None):
    """Return paths of the date subdirectories of ``data_dir`` within a range.

    Subdirectories whose names match ``DATA_DIR_NAME`` are parsed as dates
    with ``DATE_FORMAT``.  ``start`` / ``end`` default to the earliest / latest
    date found.  The result iterates the existing directories in ascending
    date order; it is an empty tuple when no date directory exists.

    Raises ValueError when the requested range reaches outside the dates
    that are present.
    """
    available = set()
    for entry in os.listdir(data_dir):
        if not os.path.isdir(os.path.join(data_dir, entry)):
            continue
        if not DATA_DIR_NAME.match(entry):
            continue
        available.add(datetime.datetime.strptime(entry, DATE_FORMAT).date())

    if not available:
        return ()

    earliest = min(available)
    latest = max(available)
    first_day = start or earliest
    last_day = end or latest
    if first_day < earliest or last_day > latest:
        raise ValueError('Incorrect start or end dates!')

    # Walk the calendar range and keep only the days that have a directory.
    return (
        os.path.join(data_dir, day.strftime(DATE_FORMAT))
        for day in date_range(first_day, last_day)
        if day in available
    )


def get_data_file_paths(data_dir_path):
    """Return a generator of paths of the data files in ``data_dir_path``.

    Only regular files whose names match ``DATA_FILE_NAME`` are produced.
    """
    # The directory listing is taken right away; the per-entry checks run
    # lazily while the result is consumed.
    candidates = (
        (entry, os.path.join(data_dir_path, entry))
        for entry in os.listdir(data_dir_path)
    )
    return (
        full_path
        for entry, full_path in candidates
        if os.path.isfile(full_path) and DATA_FILE_NAME.match(entry)
    )


def main(args):
    """Upload every matching data file below ``args.data`` to YT.

    ``args`` is the parsed command line: ``data`` (local directory),
    ``cluster`` ('hahn' or 'freud'), ``root`` (YT root path, optional) and
    ``start`` / ``end`` (optional date bounds).  A progress line is printed
    before each file is handed to ``write_table``.
    """
    # The proxy is set to Hahn at module load; only Freud needs a switch.
    if args.cluster == 'freud':
        yt.config["proxy"]["url"] = FREUD
    root_path = args.root if args.root else HAHN_ROOT_PATH
    for data_dir_path in get_data_dir_paths(args.data, args.start, args.end):
        date_dir = os.path.basename(data_dir_path)
        for data_file_path in get_data_file_paths(data_dir_path):
            # Single-argument call form: prints the same text under Python 2
            # (parenthesised expression) and Python 3 (function call).
            print("{} Processing {} data file...".format(str(datetime.datetime.now()), data_file_path))
            write_table(data_file_path, date_dir, root_path)


def valid_path(s):
    """argparse ``type`` callback for the data directory argument.

    Accepts ``s`` only if it is an existing directory that contains at least
    one subdirectory whose name matches ``DATA_DIR_NAME``.

    Returns the absolute form of the path.
    Raises argparse.ArgumentTypeError when either condition fails.
    """
    # isdir() is already False for a path that does not exist.
    if not os.path.isdir(s):
        raise argparse.ArgumentTypeError("Data directory path not exists or not a directory: '{0}'.".format(s))
    has_date_dir = any(
        os.path.isdir(os.path.join(s, name)) and DATA_DIR_NAME.match(name)
        for name in os.listdir(s)
    )
    if not has_date_dir:
        # The check above looks for date subdirectories, so the message says
        # that rather than mentioning uniparse files.
        raise argparse.ArgumentTypeError(
            "Data directory should contain date subdirectories named yyyymmdd: '{0}'.".format(s))
    return os.path.abspath(s)


def valid_date(s):
    """argparse ``type`` callback: parse ``s`` with ``DATE_FORMAT`` into a date.

    Raises argparse.ArgumentTypeError when ``s`` does not parse.
    """
    try:
        parsed = datetime.datetime.strptime(s, DATE_FORMAT)
    except ValueError:
        raise argparse.ArgumentTypeError("Not a valid date: '{0}'.".format(s))
    return parsed.date()


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--data",
        type=valid_path,
        required=True,
        help="Directory with date subdirectories",
    )
    parser.add_argument(
        "-c", "--cluster",
        type=str,
        choices=['hahn', 'freud'],
        default='hahn',
        required=False,
        help="Yt cluster",
    )
    parser.add_argument(
        "-r", "--root",
        type=str,
        required=False,
        help="Yt root path for backup",
    )
    parser.add_argument(
        "-s", "--start",
        type=valid_date,
        required=False,
        help="Start date in yyyymmdd format",
    )
    parser.add_argument(
        "-e", "--end",
        type=valid_date,
        required=False,
        help="End date in yyyymmdd format",
    )
    main(parser.parse_args())
