#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
from argparse import ArgumentParser

import yt.wrapper as yt
from nile.api.v1 import clusters, aggregators
from qb2.api.v1 import extractors as se
from qb2.api.v1 import filters as sf

from mpfs.engine.process import setup_admin_script
setup_admin_script()
from mpfs.config import settings
from mpfs.core.address import PublicAddress
from mpfs.core.social.publicator import Publicator
from mpfs.core.metastorage.control import support_moderation_queue
from mpfs.core.mrstat.stat_utils import set_yt_proxy, quit_if_mrstat_disabled

# YT credentials/endpoint taken from the mrstat section of the MPFS settings.
# MRSTAT_YT_TOKEN is passed to the Nile cluster in process_logs().
MRSTAT_YT_TOKEN = settings.mrstat['yt_token']
# NOTE(review): not referenced anywhere in the visible part of this file;
# the proxy appears to be configured through set_yt_proxy() instead - confirm.
MRSTAT_YT_PROXY = settings.mrstat['yt_proxy']

# Path template of the daily source log table; formatted with the date string.
SRC_LOG = '//logs/ydisk-log-reader-log/1d/%s'
# Path template of the per-day result table with public hashes and their hits.
HASHES_PATH = '//home/mpfs-stat/results/moderation_queue/%s'
# How many top rows (by hits) process_logs() stores in the result table.
TOP_SELECT = 200
# How many successfully queued hashes process_hashes() stops after.
TOP_PUT = 70
# Media types a *file* resource must have to be sent to moderation.
MEDIA_TYPE_TO_CHECK = ['video', 'image', 'compressed', 'unknown']


def process_logs(date):
    """Build the table of the most viewed public resources (hashes) for a day.

    Reads the log table ``SRC_LOG % date`` on the Hahn cluster, keeps only the
    records whose ``message`` starts with 'Increment counter ', takes the last
    space-separated token of the message as ``public_hash``, counts records
    per hash as ``hits`` and writes the ``TOP_SELECT`` rows with the most hits
    to ``HASHES_PATH % date``.

    :param date: date string substituted into the table path templates.
    """
    job = clusters.Hahn(MRSTAT_YT_TOKEN).job()

    # Parse the raw log and derive the public hash from the counter messages.
    counter_records = job.table(SRC_LOG % date).qb2(
        log='generic-tskv-log',
        fields=[
            se.log_field('message'),
            se.custom('public_hash', lambda m: m.rsplit(' ', 1)[-1], 'message')
        ],
        filters=[
            sf.startswith('message', 'Increment counter '),
        ]
    )

    # One row per hash with the number of matching log records.
    hits_per_hash = counter_records \
        .groupby('public_hash') \
        .aggregate(hits=aggregators.count())

    hits_per_hash.top(TOP_SELECT, by='hits').put(HASHES_PATH % date)

    # Nothing is executed until the job is run explicitly.
    job.run()


def _process_one_public_hash(public_hash):
    """Put the public resource behind ``public_hash`` into the moderation queue.

    Resources whose type is 'file' are rejected when their extension is 'apk'
    or their media type is not listed in ``MEDIA_TYPE_TO_CHECK``; resources of
    any other type are queued without these checks.

    :param public_hash: public hash used to build the ``PublicAddress``.
    :raises ValueError: for a file resource that must not be moderated.
    """
    address = PublicAddress(public_hash)
    publicator = Publicator()
    resource = publicator.get_fully_public_resource(address)

    # The media type is only inspected for files that are not apk.
    if resource.type == 'file' and (
            resource.address.ext == 'apk' or
            resource.media_type not in MEDIA_TYPE_TO_CHECK):
        details = (resource.address.id, resource.media_type)
        raise ValueError('Recource addr: %s media_type: %s' % details)

    support_moderation_queue.put_public_link(resource.meta['short_url'], resource.type)


def process_hashes(date):
    """Read the hashes table from YT and queue suitable hashes for moderation.

    Rows of ``HASHES_PATH % date`` are handled in descending order of ``hits``.
    Every hash is passed to ``_process_one_public_hash``; a failure is printed
    and does not stop the run. Processing ends once ``TOP_PUT`` hashes have
    been queued successfully or the rows run out.

    :param date: date string substituted into ``HASHES_PATH``.
    """
    set_yt_proxy()
    table_path = HASHES_PATH % date
    if not yt.exists(table_path):
        print("Empty HASHES_PATH: %s" % table_path)
        return

    # Each item read from the table is decoded separately with json.loads.
    rows = (json.loads(raw_row)
            for raw_row in yt.read_table(table_path, format="json", raw=True))
    ranked = sorted(
        ((row['public_hash'], row['hits']) for row in rows),
        key=lambda pair: pair[1],
        reverse=True
    )

    queued = 0
    for public_hash, _hits in ranked:
        try:
            _process_one_public_hash(public_hash)
        except Exception as err:
            # Best effort: report the failure and move on to the next hash.
            print('FAIL "%s" %r' % (public_hash, err))
        else:
            print('OK "%s"' % public_hash)
            queued += 1

        if queued >= TOP_PUT:
            break


if __name__ == '__main__':
    # Script entry point: build the hashes table for the given date, then
    # feed the hashes from it into the moderation queue.
    quit_if_mrstat_disabled()

    arg_parser = ArgumentParser()
    arg_parser.add_argument('date', nargs=1, type=str, help='Run script for date (Ex: "2017-11-17")')
    cli_args = arg_parser.parse_args()
    # nargs=1 makes argparse store the value as a one-element list.
    [date] = cli_args.date

    process_logs(date)
    process_hashes(date)
