import os
import os.path
import re
import subprocess
import sys
import tempfile
import urllib2

import dateutil.parser
from yt.wrapper.client import Yt
from yt.wrapper.ypath import TablePath, ypath_dirname


# Directory on the remote hosts that holds the log files.
LOGS_PATH_PREFIX = "/var/log/webmaster3/"
# Shell glob (expanded on the remote host) matching the log file and
# any suffixed variants of it.
LOGS_PATTERN = LOGS_PATH_PREFIX + "webmaster3-api.log*"
# Local directory the remote logs are copied into.
TMP_DIR = tempfile.gettempdir()
# Matches an INFO line written by IndexingHistoryAction and tagged [IDX].
# Group 1 is the text inside the leading square brackets, group 2 is
# everything after the [IDX] tag. The pattern is also bound to the name RE.
LOG_ENTRY_RE = RE = r'\[(.*)\] INFO.*IndexingHistoryAction \[IDX\](.*)'

# Schema of the destination YT table; the table is sorted by entryDate.
# 'indicators' has type 'any' because parse_log_line stores a list there.
YT_TABLE_SCHEMA = [
    {'name': 'entryDate', 'type': 'string', 'sort_order': 'ascending'},
    {'name': 'hostId', 'type': 'string'},
    {'name': 'userId', 'type': 'string'},
    {'name': 'dateFrom', 'type': 'string'},
    {'name': 'dateTo', 'type': 'string'},
    {'name': 'indicators', 'type': 'any'},
]

def get_yt_token():
    """Return the YT token, or None when no token can be found.

    Lookup order:
      1. the ``YT_TOKEN`` environment variable, if set and non-empty;
      2. the first line of ``~/.yt/token``, stripped of surrounding
         whitespace.

    Returns:
        The token string, or None when the environment variable is unset
        and the token file is missing, unreadable or empty.
    """
    # A plain environment lookup is all that is needed here, so the standard
    # library is used instead of a third-party helper.
    yt_token = os.environ.get('YT_TOKEN')
    if yt_token:
        return yt_token

    token_path = os.path.join(os.path.expanduser("~"), '.yt', 'token')
    try:
        with open(token_path) as f:
            first_line = f.readline()
    except (IOError, OSError):
        # A missing or unreadable token file is an expected situation;
        # anything else (e.g. KeyboardInterrupt) must not be swallowed.
        return None

    if first_line:
        return first_line.strip()
    return None


# Shared YT client, created once at import time. The token passed here is
# None when get_yt_token() cannot find one.
YT_CLIENT = Yt(proxy='hahn.yt.yandex.net', token=get_yt_token())


def get_wmc_back_stable():
    """Return the host names listed for the ``wmc-back-stable`` group.

    The groups2hosts endpoint is read as plain text with one host name per
    line.

    Returns:
        A list of non-empty host name strings. An empty response yields an
        empty list rather than a list holding one empty string, so callers
        never try to connect to a blank host name.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise on network errors.
    """
    response = urllib2.urlopen("https://c.yandex-team.ru/api/groups2hosts/wmc-back-stable")
    try:
        body = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    stripped = (name.strip() for name in body.splitlines())
    return [name for name in stripped if name]


def get_list_of_log_files_for_host(host_name, logs_pattern, for_date):
    """List remote log files whose reported date equals ``for_date``.

    Runs ``stat -c "%x %n" <logs_pattern>`` on ``host_name`` over ssh. Every
    output line is split at its last space: the left part is parsed as a
    timestamp, the right part is taken as the file path.

    NOTE(review): ``%x`` is the time of last *access* in GNU stat. Confirm
    that the modification time (``%y``) is not what was intended.

    Args:
        host_name: host to ssh into.
        logs_pattern: shell glob expanded on the remote host.
        for_date: ``datetime.date`` the file timestamp must fall on.

    Returns:
        A list of remote file paths, in the order stat printed them.

    Exits the process with status 1 when the ssh command fails.
    """
    print("Getting lists of logs from " + host_name)
    p = subprocess.Popen(['ssh', host_name, 'stat -c "%x %n" ' + logs_pattern],
                         stdout=subprocess.PIPE)
    output, _ = p.communicate()

    # Check the exit status before touching the output: a failed command may
    # have printed only part of the listing.
    if p.returncode != 0:
        print("Something went wrong, exiting")
        sys.exit(1)

    ret = []
    for line in output.splitlines():
        date_str, sep, file_str = line.strip().rpartition(' ')
        if not sep:
            # Blank or malformed line: nothing to parse.
            continue

        file_date = dateutil.parser.parse(date_str.strip()).date()
        if file_date == for_date:
            ret.append(file_str)

    return ret


def copy_logs_from_host(host_name, logs_to_copy, dest_dir):
    """Copy log files from ``host_name`` into ``dest_dir`` with scp.

    Each local file name is the remote base name with ``-<short host>``
    inserted before the first dot (or appended when the name has no dot).
    The short host is the part of ``host_name`` before its first dot.
    Files ending in ``.gz`` are decompressed after copying.

    Args:
        host_name: remote host to copy from.
        logs_to_copy: iterable of remote file paths.
        dest_dir: local directory to copy into.

    Returns:
        A list of local paths of the copied (and decompressed) files.

    Exits the process with status 1 when scp or gunzip fails.
    """
    # partition() returns the whole name when there is no dot, whereas
    # slicing with find() == -1 would silently drop the last character.
    log_suffix = host_name.partition('.')[0]

    dst_logs = []
    for log_file in logs_to_copy:
        stem, dot, rest = os.path.basename(log_file).partition('.')
        dst_log_file = stem + '-' + log_suffix + dot + rest

        print("Copying logs from " + host_name + ". This will take a while.")
        src_path = host_name + ':' + log_file
        dst_path = os.path.join(dest_dir, dst_log_file)
        if subprocess.call(['scp', '-6', src_path, dst_path]) != 0:
            print("Something went wrong, exiting")
            sys.exit(1)

        if dst_path.endswith('.gz'):
            # Argument list instead of a shell string, so unusual characters
            # in the path cannot be interpreted by a shell. -f overwrites a
            # stale decompressed file left behind by a previous run.
            if subprocess.call(['gunzip', '-f', dst_path]) != 0:
                print("Something went wrong, exiting")
                sys.exit(1)
            dst_path = dst_path[:-3]

        dst_logs.append(dst_path)

    return dst_logs


def parse_log_line(line):
    """Parse one ``[IDX]`` log line into a dict of fields.

    The line must match ``LOG_ENTRY_RE``. The text captured after the
    ``[IDX]`` tag is treated as ``key=value`` pairs separated by ``"; "``.

    Args:
        line: a single line of the log file.

    Returns:
        None when the line does not match. Otherwise a dict with:
          * ``entryDate``: the text inside the leading square brackets;
          * one string entry per ``key=value`` pair whose value is not the
            literal ``null``;
          * ``indicators`` (only if present): a list obtained by removing
            the surrounding brackets and splitting on ``", "``; an empty
            bracket pair gives an empty list.
    """
    m = re.match(LOG_ENTRY_RE, line)
    if not m:
        return None

    entry_dict = {'entryDate': m.group(1)}

    for pair in m.group(2).strip().split("; "):
        # Split on the first '=' only, so values that contain '=' are kept
        # whole. Fragments without '=' (such as an empty payload) are skipped
        # instead of aborting the whole run.
        var, sep, val = pair.partition("=")
        if not sep or val == 'null':
            continue

        entry_dict[var] = val

    if 'indicators' in entry_dict:
        raw_indicators = entry_dict['indicators'].lstrip('[').rstrip(']')
        entry_dict['indicators'] = raw_indicators.split(', ') if raw_indicators else []

    return entry_dict


def process_log_file(file_path):
    """Lazily yield the parsed ``[IDX]`` entries of a log file.

    Lines that do not contain the ``[IDX]`` tag are skipped without being
    parsed. Lines for which ``parse_log_line`` returns a falsy value are
    skipped as well.

    Args:
        file_path: path of the local log file to read.

    Yields:
        The dict returned by ``parse_log_line`` for each accepted line, in
        file order.
    """
    with open(file_path) as log:
        for raw_line in log:
            if '[IDX]' not in raw_line:
                continue

            parsed = parse_log_line(raw_line)
            if parsed:
                yield parsed


def write_yt_table(rows):
    """Store ``rows`` in the indexing-history YT table.

    When the destination table does not exist yet, the rows are written
    straight into it. Otherwise they are written to a temporary table, and
    the temporary and destination tables are then merged into the
    destination with a sorted merge.

    Args:
        rows: iterable of row dicts passed to ``YT_CLIENT.write_table``.
    """
    staging = TablePath('//home/webmaster/users/leonidrom/api_indexing_history_logs_tmp', schema=YT_TABLE_SCHEMA, append=False)
    destination = TablePath('//home/webmaster/users/leonidrom/api_indexing_history_logs', schema=YT_TABLE_SCHEMA)

    if not YT_CLIENT.exists(destination):
        # First upload: create the destination table directly.
        YT_CLIENT.write_table(destination, rows, raw=False, force_create=True)
        return

    YT_CLIENT.write_table(staging, rows, raw=False, force_create=True)
    YT_CLIENT.run_merge([staging, destination], destination, mode='sorted')


def print_usage():
    """Print the command-line usage string and exit with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print("Usage: " + os.path.basename(sys.argv[0]) + " YYYY-MM-DD ")
    # sys.exit is used because the bare exit() helper is injected by the
    # site module for interactive use and is not guaranteed to exist.
    sys.exit(1)


def main():
    """Collect, parse and upload the logs for the date given on the command line.

    Expects exactly one argument, a date such as ``YYYY-MM-DD``. When the
    argument is missing or cannot be parsed as a date, the usage string is
    printed and the process exits.

    For every host returned by ``get_wmc_back_stable`` the matching log
    files are listed, copied locally, parsed, and written to YT one file at
    a time.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        print_usage()

    # The date comes from the command line, as the usage string promises,
    # rather than from a hard-coded debugging value.
    try:
        logs_date = dateutil.parser.parse(args[0]).date()
    except (ValueError, OverflowError):
        print_usage()

    host_names = get_wmc_back_stable()
    for host_name in host_names:
        logs = get_list_of_log_files_for_host(host_name, LOGS_PATTERN, logs_date)
        print('Got logs: ' + str(logs))
        if not logs:
            continue

        dst_logs = copy_logs_from_host(host_name, logs, TMP_DIR)
        print('Copied logs: ' + str(dst_logs))
        for dst_log in dst_logs:
            write_yt_table(process_log_file(dst_log))


if __name__ == "__main__":
    main()
