# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os

# YT cluster handle shared by the jobs in this script: Arnold cluster, personal
# pool, `$job_root` template for output paths, and default operation spec.
cluster = clusters.yt.Arnold(pool='vika-pavlova').env(
    templates={'job_root': 'home/videolog/vika-pavlova'},
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'use_default_tentative_pool_trees': True,
    },
    parallel_operations_limit=10,
)

def basket_gathering(start_date, end_date):
    """Collect a random basket of ids from player events in the redir log.

    Everything runs inside a single nile job on the module-level ``cluster``:

    1. The daily ``redir-log`` tables for ``start_date..end_date`` are parsed
       with qb2. Only records with a defined yandexuid / content_id / source,
       a path containing ``player-events.`` and one of the accepted sources
       are kept. The result is sorted by yandexuid and stored at
       ``$job_root/parsed_redir_log``.
    2. The parsed records are joined with the ``yandexuid_direct_puid`` table
       (``yandexuid`` on the left, ``id`` on the right) and stored as
       ``parsed_redir_log_with_puids``.
    3. 12000 random joined records are grouped by ``target_id`` and the
       per-group record count is stored as ``puids_basket``.

    :param start_date: first day of the range, in the form used by the daily
        log table names (presumably ``YYYY-MM-DD`` -- confirm with the caller).
    :param end_date: last day of the range, same format as ``start_date``.
    """
    job = cluster.job()

    # Curly-brace range syntax selects every daily table between the two dates.
    redir = job.table('logs/redir-log/1d/{%s..%s}' % (start_date, end_date))

    # Sources matched by substring; 'special' is matched exactly further below.
    substring_sources = ('morda', 'serp', 'videohub', 'streamhandler', 'efir')
    source_filter = sf.or_(
        *([sf.contains('source', name) for name in substring_sources]
          + [sf.equals('source', 'special')])
    )

    raw = redir.qb2(
        log='redir-log',
        fields=[
            'path', 'yandexuid', 'date',
            se.log_field('content_id'),
            se.log_field('source'),
            se.log_field('from_block'),
            se.log_field('stream_block'),
            se.log_field('licence'),
            se.log_field('svod'),
            se.log_field('reqid'),
            se.log_field('channel_id'),
            # 0/1 flags derived from the event path.
            se.custom('hb', lambda x: 1 if 'heartbeat' in x else 0, 'path'),
            se.custom('adStart', lambda x: 1 if 'adStart' in str(x) else 0, 'path'),
            se.custom('error', lambda x: 1 if 'error' in x else 0, 'path'),
        ],
        filters=[
            sf.defined('yandexuid', 'content_id', 'source'),
            sf.contains('path', 'player-events.'),
            source_filter,
        ],
        mode='yamr_lines',
    ).sort(
        'yandexuid'
    ).put('$job_root/parsed_redir_log')

    puids = job.table('//home/videolog/msvvitaly/yandexuid_direct_puid')

    # Join the *parsed* stream: the `yandexuid` join key is produced by the
    # qb2 step above. The unparsed log table (read in yamr_lines mode) is not
    # expected to expose such a column -- NOTE(review): confirm.
    redir_with_puids = raw.join(
        puids, by_left="yandexuid", by_right="id"
    ).put('//home/videolog/vika-pavlova/3262-renewability_metrics/parsed_redir_log_with_puids')

    # `target_id` presumably comes from the puids table -- verify its schema.
    redir_with_puids.random(
        12000
    ).groupby(
        "target_id"
    ).aggregate(
        reqs=na.count()
    ).put('//home/videolog/vika-pavlova/3262-renewability_metrics/puids_basket')

    job.run()


def main():
    """Read the required --start_date / --end_date options and run the job."""
    arg_parser = argparse.ArgumentParser()
    # Both options are mandatory strings that bound the log date range.
    for option in ('--start_date', '--end_date'):
        arg_parser.add_argument(option, type=str, required=True)
    options = arg_parser.parse_args()

    basket_gathering(options.start_date, options.end_date)


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
