#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Генератор файл-листов для загрузки через logbackup-to-yt
"""

import argparse
import logging
import os
import re
import sys

# Order of the tab-separated columns in every row of a generated file list.
FILELIST_COLUMNS = (
    'log_date',
    'log_name',
    'host',
    'cluster',
    'file_host',
    'file_path',
    'file_name',
    'is_gzip',
    'file_size',
)


def is_gzip(path):
    """Return 1 if *path* ends with '.gz', otherwise 0."""
    # An int (not a bool) is returned because logbackup-to-yt expects an int there.
    return int(path.endswith('.gz'))


def check_patterns(data, patterns):
    """Return the first regex in *patterns* that re.search finds in *data*.

    Patterns are tried in the given order; None is returned when nothing
    matches (including when *patterns* is empty).
    """
    matching = (pattern for pattern in patterns if re.search(pattern, data))
    return next(matching, None)


def parse_file_info(file_path, file_name):
    """Build the metadata record for one log file.

    *file_path* is the directory and *file_name* the base name of the file.
    The name must look like ``<log_name>.<date><suffix>`` where ``<date>`` is
    4, 6 or 8 digits and ``<suffix>`` contains no digits.

    Returns a dict with the keys 'file_path', 'file_name', 'file_size',
    'is_gzip', 'log_name' and 'log_date'. 'log_date' is formatted as
    YYYY-MM-DD; a date shorter than 8 digits is padded with '01' for each
    missing two-digit part.

    Raises Exception when *file_name* does not match the expected pattern.
    """
    fname_re = r'^(.*)\.(\d{4}|\d{6}|\d{8})\D*$'
    full_path = file_path + '/' + file_name

    # For tests: the file does not have to exist, its size is then reported as 0.
    size = os.stat(full_path).st_size if os.path.isfile(full_path) else 0

    info = {
        'file_path': file_path,
        'file_name': file_name,
        'file_size': size,
        'is_gzip': is_gzip(full_path),
    }

    match = re.match(fname_re, file_name)
    if match is None:
        raise Exception('file name does not match /%s/' % (fname_re,))
    parts = match.groups()
    assert len(parts) == 2

    log_name, digits = parts
    info['log_name'] = log_name

    # 4 digits -> 6 -> 8: every missing two-digit part becomes '01'.
    while len(digits) < 8:
        digits += '01'

    log_date = '-'.join((digits[0:4], digits[4:6], digits[6:8]))
    assert len(log_date) == 10
    info['log_date'] = log_date

    return info


def build_filelists(filelists, root_dir, extra_meta, include, exclude):
    """Walk *root_dir* and add a record for every accepted file to *filelists*.

    A file is accepted when its base name matches at least one regex from
    *include* and none from *exclude*. Files whose info cannot be parsed are
    logged and skipped. Each record is extended with *extra_meta* and appended
    to ``filelists[<first 7 characters of log_date>]``; *filelists* is
    modified in place and nothing is returned.
    """
    for dir_path, _subdirs, file_names in os.walk(root_dir):
        for file_name in file_names:
            if not check_patterns(file_name, include):
                logging.info('build_filelists: file %s/%s does not match include regexps %s' % (dir_path, file_name, include))
                continue

            excluded_by = check_patterns(file_name, exclude)
            if excluded_by:
                logging.info('build_filelists: file %s/%s excluded by regex /%s/' % (dir_path, file_name, excluded_by))
                continue

            try:
                record = parse_file_info(dir_path, file_name)
            except Exception as err:
                # Any failure for a single file is logged and the file is skipped.
                logging.error('build_filelists: cannot parse file info for %s/%s: %s %s, skipping' %
                              (dir_path, file_name, type(err), err))
                continue

            record.update(extra_meta)

            # Records are grouped by the 'YYYY-MM' prefix of log_date.
            group_key = record['log_date'][0:7]
            filelists.setdefault(group_key, []).append(record)


def filelist_to_stream(f, filelist):
    """Write each record of *filelist* to *f* as one tab-separated line.

    The columns are taken from the record in FILELIST_COLUMNS order and
    converted with str(); every line ends with a newline.
    """
    for record in filelist:
        cells = (str(record[column]) for column in FILELIST_COLUMNS)
        f.write('\t'.join(cells) + '\n')


def save_filelists(filelists, filelists_dir, dry_run=False):
    """Write every non-empty file list to ``<filelists_dir>/<date>.list``.

    *filelists* maps a date key to a list of file-info records. Keys are
    processed in sorted order so the output order is deterministic.

    A list that is empty is skipped. A list whose target file already exists
    is skipped as well, so an existing file list is never overwritten.

    With *dry_run* set nothing is written to disk: the output directory is
    not created and the rows go to stdout instead.
    """
    # A dry run must not touch the filesystem, so the directory is only
    # created when lists are really going to be written.
    if not dry_run and not os.path.exists(filelists_dir):
        os.makedirs(filelists_dir)

    # sorted() over the keys works on both Python 2 and 3 (dict.iteritems
    # exists only on Python 2).
    for date in sorted(filelists):
        filelist = filelists[date]
        if not filelist:
            logging.info('save_filelists: %s - no files to save, skipping' % (date,))
            continue

        path = '%s/%s.list' % (filelists_dir, date)
        if os.path.isfile(path):
            logging.info('save_filelists: %s - list %s already exists, skipping' % (date, path))
            # Actually skip, as the message says; previously execution fell
            # through and the existing list was overwritten.
            continue

        if dry_run:
            filelist_to_stream(sys.stdout, filelist)
            continue
        with open(path, 'w') as f:
            filelist_to_stream(f, filelist)


def main():
    """Command-line entry point: parse arguments, build the file lists and save them.

    Every --log-root directory is scanned into one shared dict of file lists,
    which is then written to --filelists-dir (or printed with --dry-run).
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
    parser.add_argument('-l', '--log-root', action='append', required=True, help='директория с логами')
    parser.add_argument('-d', '--filelists-dir', default='/var/lib/logbackup-filelists', help='директория с полученными файл-листами (default: /var/lib/logbackup-filelists)')
    # The default is a raw string so that '\.' is not an invalid escape sequence.
    # NOTE: with action='append' argparse adds user-supplied -i values to this
    # default list instead of replacing it.
    parser.add_argument('-i', '--include', action='append', default=[r'^.*\.gz$'], help='список regex для начала обработки файла (потом к нему применяется --exclude)')
    parser.add_argument('-e', '--exclude', action='append', default=[], help='список regex для исключения файла из обработки (re.search, case-sensitive, применяется к имени файла без пути)')
    parser.add_argument('-s', '--host', help='fqdn на котором этот лог был создан, если не задан - соответствует --file-host')
    parser.add_argument('-u', '--file-host', required=True, help='fqdn откуда загружаем лог')
    parser.add_argument('-c', '--cluster', required=True, help='тип машин-источников лога (direct_scripts_perl, etc)')
    parser.add_argument('-n', '--dry-run', action='store_true', help='не записывать файл-листы в --filelists-dir, вывести все в stdout')
    args = parser.parse_args()
    # When --host is not given it falls back to --file-host.
    if not args.host:
        args.host = args.file_host

    logfmt = '[%(asctime)s]\t%(levelname)s\t' + str(os.getpid()) + '\t%(threadName)s\t%(name)s\t%(message)s'
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG, format=logfmt)

    # One dict is shared by all log roots, so files from different roots
    # end up in the same per-date lists.
    fl = {}
    for log_root in args.log_root:
        build_filelists(fl, log_root,
                        extra_meta=dict(host=args.host, cluster=args.cluster, file_host=args.file_host),
                        exclude=args.exclude, include=args.include)

    save_filelists(fl, args.filelists_dir, args.dry_run)


if __name__ == "__main__":
    main()
