#!/usr/bin/env python
# -*- coding: utf-8 -*-

from pytils import date_range, make_logger
from datetime import datetime as dt, timedelta
import json
import requests
import re
import urllib
import itertools
import random
import argparse
import string
import os
import sys
import getopt
import time
import datetime
from collections import defaultdict, Counter
# import concurrent.futures
from nile.api.v1 import clusters, statface as ns
import logging

# Log line layout: timestamp, milliseconds since logging start-up, level,
# logger name and the message itself.
_LOG_FORMAT = (
    '%(asctime)s +%(relativeCreated)-8d '
    '%(levelname)8s %(name)s: %(message)s'
)

logging.basicConfig(format=_LOG_FORMAT)

# The root logger is always switched to DEBUG, independent of basicConfig.
logging.getLogger().setLevel(logging.DEBUG)


# The script logs through the root logger. A pytils-based alternative was:
# logger = make_logger(os.path.abspath(__file__), debug=True)
logger = logging.getLogger()


# Module-wide state; main() declares these names global and assigns them
# before any of the push helpers run.
job_root = None   # YT directory holding the per-date cube_report tables
cluster = None    # nile cluster object built in main()
g_args = None     # parsed command-line arguments


def get_stat_headers():
    """Return the Statface authentication headers.

    The credentials are read from the ``STAT_LOGIN`` and ``STAT_TOKEN``
    environment variables on every call.

    Raises:
        KeyError: if either environment variable is not set.
    """
    login = os.environ['STAT_LOGIN']
    token = os.environ['STAT_TOKEN']
    return {'StatRobotUser': login, 'StatRobotPassword': token}


class ChunkReader(object):
    """Iterator that reads a YT table in fixed-size row chunks.

    Each iteration step returns ``(chunk_number, rows)`` where
    ``chunk_number`` starts at 1 and ``rows`` is a list of the records read
    for that chunk.

    Args:
        stat_reduce2_table: path of the table to read.
        logger: logger used for per-chunk progress messages.
        chunk_size: maximum number of rows per chunk (default 100000).
        client: YT client to use; defaults to ``cluster.driver.client`` of
            the module-level ``cluster``.

    Attributes:
        ch: number of the most recently started chunk.
        read_counter: number of rows handed out so far.
        records_count: value of the table's ``row_count`` attribute, read
            once at construction time.
        empty: becomes True once the reader has signalled exhaustion.
        free: False only while a chunk is being read.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """

    def __init__(self, stat_reduce2_table, logger, chunk_size=100000,
                 client=None):
        if chunk_size <= 0:
            raise ValueError(
                'chunk_size must be positive, got {!r}'.format(chunk_size)
            )
        self._client = client if client is not None else cluster.driver.client
        self.chunk_size = chunk_size
        self.ch = 0
        self.read_counter = 0
        self.records_count = self._client.get_attribute(
            stat_reduce2_table, 'row_count'
        )
        self.table = stat_reduce2_table
        self.logger = logger
        self.empty = False
        self.free = True

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Read and return the next ``(chunk_number, rows)`` pair.

        Raises:
            StopIteration: when all rows were handed out, or when the table
                yields no rows for the requested range.
        """
        if self.read_counter >= self.records_count:
            self.empty = True
            raise StopIteration()

        yt = self._client
        start = self.read_counter
        end = min(start + self.chunk_size, self.records_count)

        self.ch += 1
        self.logger.info(
            '[CHUNK {}] Getting records from table {}'
            ' starting with id {}...'.format(
                self.ch, self.table, start
            )
        )

        self.free = False
        try:
            chunk = list(yt.read_table(
                yt.TablePath(self.table, start_index=start, end_index=end),
                raw=False
            ))
        finally:
            self.free = True

        if not chunk:
            # The table returned nothing although row_count promised more
            # rows. Stop instead of yielding empty chunks forever.
            self.empty = True
            raise StopIteration()

        # Advance only after the whole chunk was read, so a failed read
        # does not silently skip rows on the next call.
        self.read_counter += len(chunk)
        return (self.ch, chunk)


def push_to_stat(
    chunk, logger, redo=False, ch=-1,
    name="Video/Others/Strm/strm_cube"
):
    """POST one chunk of report rows to the Statface upload endpoint.

    The request is attempted up to 10 times, with a 60 second pause between
    attempts. An attempt succeeds only on HTTP status 200.

    Args:
        chunk: rows to upload; serialised as ``{'values': chunk}``.
        logger: logger used for progress and error messages.
        redo: when true and ``ch == 1``, ``replace_mask=fielddate`` is added
            to the request.
        ch: chunk number, used for log prefixes and the ``redo`` check.
        name: report path sent as the ``name`` field.

    Returns:
        True if an attempt got HTTP 200, False if every attempt failed.
    """
    UPLOAD_URL = 'https://upload.stat.yandex-team.ru/_api/report/data'
    MAX_ATTEMPTS = 10
    RETRY_DELAY = 60  # seconds between attempts

    data_counters = {
        "name": name,
        "scale": "d",
        "_append_mode": 1,
        "parallel_upload": 1,
        "data": json.dumps({'values': chunk}),
    }
    if redo and ch == 1:
        data_counters['replace_mask'] = 'fielddate'

    for attempt in range(1, MAX_ATTEMPTS + 1):
        # Reset per attempt so a stale response from an earlier try is
        # never logged or checked again.
        req = None
        try:
            logger.info(
                '[CHUNK {}] Posting chunk to stat...'.format(ch)
            )
            req = requests.post(
                UPLOAD_URL, headers=get_stat_headers(), data=data_counters,
                timeout=300
            )
        except Exception as e:
            # Best effort: any failure of this attempt is logged and retried.
            logger.error('[CHUNK {}] {}'.format(ch, e))

        if req is not None:
            try:
                logger.info(u'[CHUNK {}] {}'.format(ch, req.text))
            except Exception as e:
                logger.info('[CHUNK {}] {}'.format(ch, e))
            if req.status_code == 200:
                return True

        # No point in sleeping after the last attempt.
        if attempt < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY)

    logger.error(
        '[CHUNK {}] Giving up after {} attempts'.format(ch, MAX_ATTEMPTS)
    )
    return False


def push_to_stat_parallel(
    stat_reduce2_table, logger, max_workers=10, redo=False, report=None
):
    """Upload a table to Statface chunk by chunk using a thread pool.

    Chunks are read with ``ChunkReader`` and each one is submitted to
    ``push_to_stat`` on a ``ThreadPoolExecutor``. At most
    ``2 * max_workers`` uploads are outstanding at a time, which bounds how
    many chunks are held in memory.

    Args:
        stat_reduce2_table: path of the table to upload.
        logger: logger passed to the reader and to every upload.
        max_workers: size of the thread pool.
        redo: forwarded to ``push_to_stat``.
        report: report path forwarded as ``name``; when None the default
            of ``push_to_stat`` is used.
    """
    # Imported here because the module-level import is commented out.
    import concurrent.futures

    def _report_failures(finished):
        # Surface exceptions that would otherwise die with the future.
        for fut in finished:
            err = fut.exception()
            if err is not None:
                logger.error('Chunk upload raised: {!r}'.format(err))

    push_kwargs = {'redo': redo}
    if report is not None:
        push_kwargs['name'] = report

    max_in_flight = 2 * max_workers
    in_flight = set()
    chunk_reader = ChunkReader(stat_reduce2_table, logger=logger)

    # NOTE(review): with redo=True only chunk 1 carries replace_mask, and
    # chunks are uploaded concurrently. Confirm that the server applies
    # the mask safely if a later chunk arrives first.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as e:
        for id_, chunk in chunk_reader:
            if len(in_flight) >= max_in_flight:
                # Block until an upload finishes instead of spinning.
                done, in_flight = concurrent.futures.wait(
                    in_flight,
                    return_when=concurrent.futures.FIRST_COMPLETED
                )
                _report_failures(done)
            in_flight.add(e.submit(
                push_to_stat, chunk, logger, ch=id_, **push_kwargs
            ))
        logger.info('Finished reading chunks')
        done, _ = concurrent.futures.wait(in_flight)
        _report_failures(done)


def _cube_report_ready(cube_report_table, not_before):
    """Tell whether the table exists and finished on or after a date.

    The table counts as ready when it exists, has a non-empty
    ``finished_time`` attribute, and the date part of that attribute
    (before the ``T``) parses to a date that is not earlier than
    ``not_before``.
    """
    yt = cluster.driver.client
    if not yt.exists(cube_report_table):
        return False
    finished_time = yt.get_attribute(cube_report_table, 'finished_time', '')
    if not finished_time:
        return False
    finished_date = get_date(finished_time.split('T')[0])
    # An unparseable timestamp is treated as "not ready" rather than being
    # compared as None, which raises TypeError on Python 3.
    return finished_date is not None and finished_date >= not_before


def push_cube_report(
    date, report=None, async_mode=False,
    replace_mask=False
):
    """Publish the ``cube_report`` table of one date to Statface.

    The table path is ``<job_root>/<date>/cube_report``. When the ``wait``
    command-line option is set, the function first polls once an hour
    until the table is ready (see ``_cube_report_ready``).

    Args:
        date: date component of the table path.
        report: Statface report path.
        async_mode: forwarded to ``push_to_stat_new``.
        replace_mask: forwarded to ``push_to_stat_new``.
    """
    cube_report_table = '{}/{}/cube_report'.format(job_root, date)
    wait_date = getattr(g_args, 'wait', None)
    if wait_date:
        while not _cube_report_ready(cube_report_table, wait_date):
            print('waiting for {}...'.format(cube_report_table))
            time.sleep(3600)
    # push_to_stat_parallel(cube_report_table, logger, report=report)
    push_to_stat_new(
        cube_report_table, logger, report=report,
        async_mode=async_mode, replace_mask=replace_mask
    )


def push_to_stat_new(
    cube_report_table, logger, report=None, async_mode=False,
    replace_mask=False
):
    """Publish a YT table to a Statface report via nile's remote publish.

    A ``StatfaceClient`` is created with credentials from the
    ``STAT_LOGIN`` / ``STAT_TOKEN`` environment variables. A daily-scale
    ``StatfaceReport`` is then built for ``report`` and ``remote_publish``
    is called for the table on the ``hahn`` proxy.

    Args:
        cube_report_table: path of the table to publish.
        logger: logger for the progress message.
        report: Statface report path.
        async_mode: forwarded to ``remote_publish``.
        replace_mask: when truthy, ``replace_mask('fielddate')`` is applied
            to the report before publishing.

    Raises:
        KeyError: if ``STAT_LOGIN`` or ``STAT_TOKEN`` is not set.
    """
    statface = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=os.environ['STAT_LOGIN'],
        password=os.environ['STAT_TOKEN'],
    )

    logger.info('Performing remote push of {}...'.format(cube_report_table))

    publication = ns.StatfaceReport().path(report).scale('daily')
    if replace_mask:
        publication = publication.replace_mask('fielddate')

    publication.client(statface).remote_publish(
        proxy='hahn',
        table_path=cube_report_table,
        async_mode=async_mode,
        upload_config=False,
    )


_ISO_DATE_RE = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')


def get_date(s):
    """Extract the first ``YYYY-MM-DD`` substring of ``s`` as a date.

    Returns:
        A ``datetime.date``, or None when ``s`` is not a string, contains
        no such substring, or the substring is not a valid calendar date.
    """
    try:
        found = _ISO_DATE_RE.search(s)
    except TypeError:
        # Not a string (for example None).
        return None
    if found is None:
        return None
    try:
        parsed = datetime.datetime.strptime(found.group(0), '%Y-%m-%d')
    except ValueError:
        # Right shape, impossible date (for example month 13).
        return None
    return parsed.date()


def _parse_args():
    """Parse the command line and normalise ``--wait`` / ``--replace_mask``.

    ``--wait`` is converted to a date with ``get_date`` and
    ``--replace_mask`` to an int, each only when a non-empty value is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--pool')
    parser.add_argument('--report', default='Video/Others/Strm/strm_cube')
    parser.add_argument('--job_root', default='//home/videolog/strm_video')
    parser.add_argument('--from', default=None)
    parser.add_argument('--to', default=None)
    parser.add_argument('--parallel_operations_limit', type=int, default=10)
    parser.add_argument('--wait', default=None)
    parser.add_argument('--replace_mask')
    parser.add_argument('--async_mode', action='store_true')
    args = parser.parse_args()

    if args.wait:
        args.wait = get_date(args.wait)
    if args.replace_mask:
        args.replace_mask = int(args.replace_mask)
    return args


def _last_published_date(report):
    """Return the newest ``fielddate`` present in the report, or None.

    The report is requested as JSON with every listed dimension set to
    ``_total_``. None is returned when the report has no rows or the
    newest ``fielddate`` does not contain a parseable date.

    Raises:
        Exception: if the response is not JSON of the expected shape.
    """
    dimensions = [
        'browser', 'channel', 'error', 'os_family', 'program',
        'provider', 'ref_from', 'view_type'
    ]
    dim_totals = '&'.join(
        '{}=_total_'.format(x) for x in dimensions
    )
    # NOTE(review): verify=False disables TLS certificate checks on a
    # request that carries credentials. Kept as in the original; confirm
    # whether it is still required.
    req = requests.get(
        'https://upload.stat.yandex-team.ru/{}?{}&_type=json'.format(
            report, dim_totals
        ),
        headers=get_stat_headers(), verify=False,
        timeout=300  # do not hang forever on a stuck connection
    )
    print('parsing response')

    try:
        fielddates = [row['fielddate'] for row in req.json()['values']]
    except (ValueError, KeyError, TypeError):
        # Report the raw body: decoding it again could itself fail.
        raise Exception(
            'Unexpected response from stat (HTTP {}): {!r}'.format(
                req.status_code, req.text
            )
        )
    if not fielddates:
        return None
    return get_date(max(fielddates).split(' ')[0])


def _available_dates():
    """Return the sorted, de-duplicated dates that have a cube_report table.

    Tables are searched under ``job_root``; the date is taken from each
    table path. Only dates after 2018-03-14 are kept.
    """
    # Cut-off used by the original code; the reason is not visible here.
    min_date = datetime.date(2018, 3, 14)
    paths = cluster.driver.client.search(
        root=job_root, node_type="table", path_filter=(
            lambda x: x.endswith('/cube_report')
        )
    )
    found = set()
    for path in paths:
        table_date = get_date(path)
        if table_date and table_date > min_date:
            found.add(table_date)
    return sorted(found)


def main():
    """Publish cube_report tables to Statface for a range of dates.

    With both ``--from`` and ``--to`` the dates come from ``date_range``
    and ``replace_mask`` defaults to 1. Otherwise every available
    cube_report date newer than the last date already present in the
    report is processed (all available dates if that last date is
    unknown).
    """
    global cluster
    global g_args
    global job_root

    args = _parse_args()
    job_root = args.job_root
    g_args = args

    from_ = getattr(args, 'from')
    to_ = getattr(args, 'to')

    cluster = clusters.yt.Hahn(
        token=os.environ['YT_TOKEN'], pool=args.pool
    ).env(
        templates=dict(
            job_root=job_root
        ),
        parallel_operations_limit=args.parallel_operations_limit
    )

    if from_ and to_:
        if args.replace_mask is None:
            args.replace_mask = 1
        dates_to_process = date_range(from_, to_)
    else:
        if from_ or to_:
            logger.warning(
                'Only one of --from/--to was given; ignoring it and '
                'detecting dates from the report instead'
            )
        print('getting dates from report')
        last_date = _last_published_date(args.report)
        print('last date: {}'.format(last_date))

        available_dates = _available_dates()
        if available_dates:
            print('last available date: {}'.format(available_dates[-1]))
        else:
            print('no cube_report tables found under {}'.format(job_root))

        if last_date:
            dates_to_process = [
                x for x in available_dates if x > last_date
            ]
        else:
            dates_to_process = available_dates

    print('dates to process: {}'.format(dates_to_process))

    for date in dates_to_process:
        print('running for {}'.format(date))
        push_cube_report(
            date=format(date), report=args.report, async_mode=args.async_mode,
            replace_mask=args.replace_mask
        )


# Run the uploader only when the file is executed as a script, so that
# importing the module does not trigger any uploads.
if __name__ == '__main__':
    main()
