#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
from os.path import join

import luigi

from crypta.profile.utils.config import config
from crypta.profile.utils.luigi_utils import YtTarget, BaseYtTask, OldNodesByNameCleaner
from crypta.profile.utils.loggers import TimeTracker
from crypta.profile.tasks.features.process_user_events import ProcessUserEvents


# Sites whose visits are reported by get_visited_sites_for_dit.
# A tuple, so the shared whitelist cannot be mutated by accident.
_DIT_WHITELISTED_SITES = ('sobyanin.ru',)


def get_visited_sites_for_dit(key, records):
    """Reducer: report which whitelisted sites occur in one yandexuid's records.

    Args:
        key: mapping holding the reduce key; only ``key['yandexuid']`` is read.
        records: iterable of rows for that key. Each row is read as a mapping
            with an optional ``'site_weights'`` field that is tested with
            ``in`` (presumably a site -> weight mapping; confirm against the
            producer of the input tables).

    Yields:
        At most one row ``{'yandexuid': ..., 'visited': [...]}``, where
        ``visited`` is the sorted list of whitelisted sites that were found.
        Nothing is yielded when no whitelisted site was seen.
    """
    visited_whitelisted_sites = set()
    for record in records:
        # A missing or null 'site_weights' means "no sites in this row";
        # it must not fail the whole reduce job.
        site_weights = record.get('site_weights') or ()
        visited_whitelisted_sites.update(
            site for site in _DIT_WHITELISTED_SITES if site in site_weights
        )
    if visited_whitelisted_sites:
        # sorted() keeps the output stable if the whitelist grows beyond one site.
        yield {'yandexuid': key['yandexuid'], 'visited': sorted(visited_whitelisted_sites)}


class GetVisitedSitesForDit(BaseYtTask):
    """Luigi task producing the per-date table of whitelisted sites per yandexuid.

    Reduces the 'hits' tables of the 'metrics' and 'bar' ProcessUserEvents
    tasks by yandexuid with get_visited_sites_for_dit, then sorts the result
    table by yandexuid.
    """

    date = luigi.Parameter()
    task_group = 'export_profiles'

    def requires(self):
        """Return the upstream tasks keyed by 'metrics', 'bar' and 'cleaner'."""
        dependencies = {}
        for source in ('metrics', 'bar'):
            dependencies[source] = ProcessUserEvents(date=self.date, data_source=source)
        # Presumably prunes old tables from the output folder, keeping
        # `lifetime` of them -- confirm against OldNodesByNameCleaner.
        dependencies['cleaner'] = OldNodesByNameCleaner(
            self.date,
            folder=config.VISITED_SITES_FOR_DIT_DIRECTORY,
            lifetime=config.NUMBER_OF_INTERMEDIATE_PROFILES_TABLES_TO_KEEP,
        )
        return dependencies

    def run(self):
        """Create the output table, fill it with the reduce and sort it.

        All YT calls are issued inside a single ``self.yt.Transaction()``
        block, and the whole run is wrapped in a TimeTracker named after the
        class.
        """
        table_schema = {
            'yandexuid': 'uint64',
            'visited': 'any',
        }
        with TimeTracker(self.__class__.__name__), self.yt.Transaction():
            result_table = self.output().table
            self.yt.create_empty_table(result_table, schema=table_schema)

            upstream = self.input()
            hits_tables = [upstream[source]['hits'].table for source in ('metrics', 'bar')]
            reduce_operation = self.yt.run_reduce(
                get_visited_sites_for_dit,
                source_table=hits_tables,
                destination_table=result_table,
                reduce_by='yandexuid',
            )
            # Log the job statistics of the reduce operation as JSON.
            self.logger.info(json.dumps(reduce_operation.get_job_statistics()))
            self.yt.run_sort(result_table, sort_by='yandexuid')

    def output(self):
        """Return the YtTarget for this date's table in the DIT directory."""
        table_path = join(config.VISITED_SITES_FOR_DIT_DIRECTORY, str(self.date))
        # allow_empty=True: presumably an empty table still counts as a
        # complete target -- confirm against YtTarget.
        return YtTarget(table_path, allow_empty=True)
