#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import datetime
import os
import posixpath

from yt.wrapper import YtClient

from crypta.lib.python.yql.client import create_yql_client
from crypta.profile.lib import date_helpers
from crypta.profile.user_vectors import lib as user_vectors

# Proxy address shared by both clients built in main(): it is passed as
# ``proxy=`` to YtClient and as the first argument to create_yql_client.
YT_PROXY = 'hahn.yt.yandex.net'


def _require_env(name):
    """Return the value of environment variable *name*, failing if it is unset.

    Raises:
        RuntimeError: if the variable is not present in the environment.
    """
    value = os.getenv(name)
    if value is None:
        # An explicit raise (unlike ``assert``) is not stripped under ``python -O``,
        # so a missing token can never silently turn into ``token=None``.
        raise RuntimeError('You should specify {} environment variable'.format(name))
    return value


def main():
    """Command-line entry point: run the four ``user_vectors`` steps in sequence.

    Parses the command-line options, builds a YT client and a YQL client from
    the ``YT_TOKEN`` / ``YQL_TOKEN`` environment variables (``YT_POOL`` is
    optional), and then calls ``merge_hits_by_yandexuid``, ``flatten_hits``,
    ``calculate_host_idf`` and ``get_yandexuid_vectors`` with tables located
    under ``--output-yt-directory``.

    Raises:
        RuntimeError: if ``YT_TOKEN`` or ``YQL_TOKEN`` is not set.
    """
    # Both --date and --last-active-date default to yesterday's date string.
    yesterday = str(date_helpers.get_yesterday(datetime.date.today()))

    parser = argparse.ArgumentParser()
    parser.add_argument('--number-of-days-to-merge', type=int, default=35)
    parser.add_argument('--halflife-days', type=int, default=30)
    parser.add_argument('--minimum-sites', type=int, default=3)
    parser.add_argument('--maximum-sites', type=int, default=5000)
    parser.add_argument('--min-hits-per-day', type=int, default=3)
    parser.add_argument('--last-active-date', default=yesterday)
    parser.add_argument('--site-vectors-table', default='//home/crypta/production/profiles/export/vectors/site2vec')
    parser.add_argument('--daily-hits-directory',
                        default='//home/crypta/production/profiles/input-data/metrics/hits')

    parser.add_argument('--output-yt-directory', default='//tmp/custom_yandexuid2vec')
    parser.add_argument('--date', default=yesterday)

    # Arguments are parsed before the environment is checked so that
    # ``--help`` keeps working without any tokens configured.
    args = parser.parse_args()

    # YT client: the token is mandatory; the pool is optional and is passed
    # through unchanged (None when YT_POOL is unset).
    yt_token = _require_env('YT_TOKEN')
    yt_pool = os.getenv('YT_POOL')

    yt_client = YtClient(
        proxy=YT_PROXY,
        token=yt_token,
        config={
            'spec_defaults': {
                'pool': yt_pool,
            },
        },
    )

    # YQL client: same proxy and pool as the YT client, separate token.
    yql_token = _require_env('YQL_TOKEN')
    yql_client = create_yql_client(YT_PROXY, yql_token, pool=yt_pool, syntax_version=1)

    # Output tables. YT paths are always '/'-separated, so posixpath is used
    # instead of os.path, which would follow the host OS separator.
    output_directory = args.output_yt_directory
    merged_hits_by_yandexuid_table = posixpath.join(output_directory, 'merged_hits_by_yandexuid')
    flattened_hits_table = posixpath.join(output_directory, 'flattened_hits')
    idf_table = posixpath.join(output_directory, 'IDF')
    yandexuid_vectors_table = posixpath.join(output_directory, 'yandexuid_vectors')

    # argparse has already converted the numeric options to int (type=int),
    # so only the half-life needs an explicit conversion to float.
    user_vectors.merge_hits_by_yandexuid(
        yt_client=yt_client,
        yql_client=yql_client,
        date=args.date,
        daily_hits_directory=args.daily_hits_directory,
        merged_hits_by_yandexuid_table=merged_hits_by_yandexuid_table,
        number_of_days_to_merge=args.number_of_days_to_merge,
        halflife=float(args.halflife_days),
        min_hosts=args.minimum_sites,
        max_hosts=args.maximum_sites,
        min_hits_per_day=args.min_hits_per_day,
        last_active_date=args.last_active_date,
    )

    user_vectors.flatten_hits(
        yt_client=yt_client,
        yql_client=yql_client,
        date=args.date,
        merged_hits_by_yandexuid_table=merged_hits_by_yandexuid_table,
        flattened_hits_table=flattened_hits_table,
    )

    user_vectors.calculate_host_idf(
        yt_client=yt_client,
        yql_client=yql_client,
        date=args.date,
        merged_hits_by_yandexuid_table=merged_hits_by_yandexuid_table,
        flattened_hits_table=flattened_hits_table,
        idf_table=idf_table,
    )

    # This last step is called without a YQL client.
    user_vectors.get_yandexuid_vectors(
        yt_client=yt_client,
        date=args.date,
        site_vectors_table=args.site_vectors_table,
        flattened_hits_table=flattened_hits_table,
        idf_table=idf_table,
        yandexuid_vectors_table=yandexuid_vectors_table,
    )


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
