#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import nile
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import getpass
import datetime
import urlparse
import pytils
from pytils import get_host, parseparams, yt_get_date_from_table, date_range


# Shared cluster handle for every job built in this script.  The pool name is
# suffixed with the current OS user, and the `job_root` template defined here
# is what the `$job_root/...` output paths below refer to.
hahn = clusters.yt.Hahn(
    pool='search-research_{}'.format(getpass.getuser()),
).env(
    templates={
        'job_root': 'home/videolog/MMA-1374_videotop_referers',
    },
)


# Video ids of interest: `referers_map` keeps a www.youtube.com `/watch` hit
# only when its `v` query parameter is one of these values.
vs = {
    '3QpqlyrFF2s',
    'AS_uBieKtmg',
    'CB0DEcaPwhw',
    'JGwWNGJdvx8',
    'JfWA0kJqDl4',
    'S9oXj3a4gZ4',
    'i9AHJkHqkpw',
    'k85mRPqvMbE',
    'kJQP7kiw5Fk',
    'nidQCt_HEsY',
    'p1uh40IvF4c',
    'q--5Ht49vNY',
    'uvjDgRTEpKw',
    'vrLu-gdkG6I',
    'wOBnq0Ewz5k',
}


def referers_map(records):
    """Mapper: emit one Record per TRAFFIC hit on a tracked YouTube URL.

    A record is kept only if all of the following hold:

    * its raw ``value`` contains the substring ``type=TRAFFIC`` (cheap
      pre-filter that avoids decoding and parsing everything else);
    * after UTF-8 decoding (undecodable bytes replaced) and ``parseparams``,
      the ``type`` parameter is exactly ``TRAFFIC``;
    * the ``url`` parameter parses and its host is exactly
      ``www.youtube.com``;
    * the path is ``/watch`` with a ``v`` id listed in ``vs``, or the path is
      the tracked channel page.

    Each emitted Record has ``url`` (normalised to ``/watch?v=<id>`` for
    videos, the bare path for the channel), ``referer`` (empty string when
    absent) and ``referer_host`` (``get_host(referer)``).
    """
    channel_path = '/channel/UCdKuE7a2QZeHPhDntXVZ91w'

    for rec in records:
        raw = rec.value
        if 'type=TRAFFIC' not in raw:
            continue

        params = parseparams(raw.decode('utf8', errors='replace'))
        if params.get('type') != 'TRAFFIC':
            continue

        # A missing/empty url becomes '' and is rejected by the same check.
        url = params.get('url') or ''
        if 'youtube.com' not in url:
            continue

        try:
            parts = urlparse.urlparse(url)
        except ValueError:
            # Unparseable URL: skip the record.
            continue
        if parts.netloc != 'www.youtube.com':
            continue

        # Work out the normalised url to report; anything else is dropped.
        if parts.path == '/watch':
            video_id = urlparse.parse_qs(parts.query).get('v', [''])[0]
            if video_id not in vs:
                continue
            target = '/watch?v={}'.format(video_id)
        elif parts.path == channel_path:
            target = parts.path
        else:
            continue

        referer = params.get('referer') or ''
        yield Record(
            url=target,
            referer=referer,
            referer_host=get_host(referer)
        )


def process_date(date):
    """Build and run the referer-count job for a single day.

    Reads ``//user_sessions/pub/spy_log/daily/<date>/clean``, maps it through
    ``referers_map``, counts rows per (url, referer, referer_host) and writes
    the result to ``$job_root/<date>/data``.  Progress is printed before and
    after the run.

    :param date: value substituted into the input and output table paths.
    """
    print('processing {}'.format(date))
    job = hahn.job()

    source_path = '//user_sessions/pub/spy_log/daily/{}/clean'.format(date)
    target_path = '$job_root/{}/data'.format(date)

    # The mapper imports helpers from pytils, so the module file is attached
    # to the operation.
    # NOTE(review): on Python 2 `pytils.__file__` can point at the compiled
    # `.pyc` -- confirm that is what should be shipped.
    matched = job.table(source_path).map(
        referers_map,
        files=[nile.files.LocalFile(pytils.__file__)],
    )
    counted = matched.groupby(
        'url', 'referer', 'referer_host'
    ).aggregate(
        count=na.count(),
    )
    counted.put(target_path)

    # NOTE: further roll-ups (sum of `count` per url/referer_host into
    # `test_url_by_host`, and per referer_host into `test_hosts`) were
    # sketched here previously and are currently disabled.

    job.run()
    print('finished {}'.format(date))


def main():
    """Command-line entry point: run ``process_date`` for each day in a range.

    Requires ``--from`` and ``--to``; both are passed through
    ``yt_get_date_from_table`` and the results are handed to ``date_range``.
    Whether the upper bound is included is decided by ``date_range`` itself.
    """
    parser = argparse.ArgumentParser()
    for flag in ('--from', '--to'):
        parser.add_argument(flag, required=True)
    # `from` is a keyword, so the parsed values are read through a dict
    # rather than by attribute access.
    options = vars(parser.parse_args())

    first_day = yt_get_date_from_table(options['from'])
    last_day = yt_get_date_from_table(options['to'])
    for day in date_range(first_day, last_day):
        process_date(format(day))

# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
