#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import nile
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import getpass
import datetime
from collections import defaultdict
from make_pool_for_sbr import process_ybro_plays, extract_reduce
import json


# Hosts whose rows main() filters out of the aggregated URL table
# (matched against the ``host`` column by exact string comparison).
BANNED_HOSTS = {
    '24video.adult',
    'cdnapponline.com',
    'luhtb.top',
    'rnnuw.com',
    'yandex.com.tr',
    'yandex.kz',
    'yandex.ru',
    'yandex.ua',
    'yandexadexchange.net',
    'yastatic.net',
}


def main():
    """Build the unified per-host URL pool on Hahn and record its location.

    Command-line arguments:
        --input      (required) path of the source table fed to
                     ``process_ybro_plays``.
        --output     local JSON file to write the resulting cluster/table
                     names to (default: ``output.json``).
        --blacklist  table whose ``page_url`` values are excluded from the
                     pool (default: ``home/videolog/sbr/urls_blacklist``).

    Pipeline (each step is a separate job run):
        1. Map/reduce the input, drop rows whose ``host`` is in
           ``BANNED_HOSTS``, remove blacklisted URLs and store the result
           as ``$job_root/urls_aggregated``.
        2. Count URLs per lower-cased host, take the 100 most frequent
           hosts, exclude those already listed in the ``porn`` and ``sf``
           top-host tables and store the rest as ``other_top_hosts``.
        3. For every category take its 20 top hosts and, per host, the 30
           URLs with the highest ``total``; append them to
           ``$job_root/<category>_urls_top``.
        4. Concatenate the per-category tables into
           ``$job_root/unified_pool`` and write its location to ``--output``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', default='output.json')
    parser.add_argument(
        '--blacklist', default='home/videolog/sbr/urls_blacklist'
    )
    args = parser.parse_args()

    # Shared path pieces; kept in one place so the template value and the
    # path reported in the output JSON cannot drift apart.
    job_root = 'home/videolog/sbr_by_host'
    top_hosts_path = 'home/videolog/thematic/{}_top_hosts'
    categories = ['porn', 'sf', 'other']

    hahn = clusters.yt.Hahn(
        pool='search-research_{}'.format(getpass.getuser())
    ).env(
        templates=dict(
            job_root=job_root,
        )
    )

    # --- Step 1: aggregate URLs -------------------------------------------
    job = hahn.job().env(
        packages=['tldextract', 'idna', 'requests_file'],
        package_paths=[os.getcwd()]
    )

    blacklist = job.table(
        args.blacklist
    )

    job.table(
        args.input
    ).map(
        process_ybro_plays, files=[
            nile.files.LocalFile('make_pool_for_sbr.py')
        ]
    ).groupby(
        'autoplay', 'canon_url'
    ).reduce(
        extract_reduce, files=[
            nile.files.LocalFile('make_pool_for_sbr.py')
        ]
    ).filter(
        nf.custom(lambda x: x not in BANNED_HOSTS, 'host')
    ).join(
        # type="left_only": presumably keeps only URLs that have no match
        # in the blacklist (anti-join) -- confirm against the Nile docs.
        blacklist,
        by_left="canon_url", by_right="page_url",
        type="left_only"
    ).sort('total').put(
        '$job_root/urls_aggregated'
    )

    job.run()

    # --- Step 2: hosts that belong to neither "porn" nor "sf" -------------
    job = hahn.job()

    porn = job.table(
        top_hosts_path.format('porn')
    )

    # Fix: this table used to be read from porn_top_hosts as well, which
    # made the second join below a repeat of the first and never excluded
    # the "sf" hosts from the "other" category.
    sf = job.table(
        top_hosts_path.format('sf')
    )

    job.table(
        '$job_root/urls_aggregated'
    ).project(
        ne.all(),
        host=ne.custom(
            # host may be missing/None; normalise to a lower-cased string
            lambda x: (x or '').lower(), 'host'
        )
    ).groupby('host').aggregate(
        freq=na.count()
    ).top(100, by='freq').join(
        porn, by='host', type='left_only'
    ).join(
        sf, by='host', type='left_only'
    ).project(ne.all(), type=ne.const('other')).sort('freq').put(
        top_hosts_path.format('other')
    )

    job.run()

    # --- Step 3: per-category top hosts and their top URLs ----------------
    for cat in categories:
        job = hahn.job()

        job.table(
            top_hosts_path.format(cat)
        ).top(
            20, by='freq'
        ).project(
            ne.all(), type=ne.const(cat)
        ).put(
            '$job_root/{}_top20_hosts'.format(cat)
        )

        job.run()

        # The host list must be read back locally because it determines
        # how many branches the next job contains.
        top_hosts = hahn.read(
            '$job_root/{}_top20_hosts'.format(cat)
        )

        job = hahn.job()

        urls = job.table(
            '$job_root/urls_aggregated'
        )

        result = '$job_root/{}_urls_top'.format(cat)
        # Start from an empty table so every per-host branch can append.
        hahn.write(path=result, records=[])
        for rec in top_hosts:
            urls.filter(
                nf.equals('host', rec.host.lower())
            ).put(
                '$job_root/{}_{}'.format(cat, rec.host)
            ).top(
                30, by='total'
            ).project(
                ne.all(), type=ne.const(cat)
            ).put(
                '$job_root/{}_{}_selection'.format(cat, rec.host)
            ).put(
                result, append=True
            )

        job.run()

    # --- Step 4: merge the categories into one pool -----------------------
    job = hahn.job()

    tables = [
        job.table('$job_root/{}_urls_top'.format(x))
        for x in categories
    ]

    job.concat(*tables).put(
        '$job_root/unified_pool'
    )

    job.run()

    # Tell the caller where the resulting pool lives.
    with codecs.open(args.output, 'w', 'utf8') as f:
        f.write(
            json.dumps({
                "cluster": "hahn",
                "table": '{}/unified_pool'.format(job_root)
            })
        )


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
