#!/usr/bin/env python
# -*- coding: utf-8 -*-
import itertools

from crypta.lib.python import time_utils
from crypta.profile.lib import date_helpers
from crypta.profile.runners.interests.lib.data.metrics import (
    metrics_counter_url_to_interests_mapping,
    metrics_title_lemmas_to_interests_mapping,
    metrics_hosts_to_interests_mapping,
)
from crypta.profile.runners.interests.lib.data.phones import get_phones_segments_lemmas
from crypta.profile.runners.interests.lib.log_processors.__base__ import InterestsLogProcessor
from crypta.profile.utils.interests_helpers import LabSegmentsInfo
from crypta.profile.utils.luigi_utils import ExternalInput


class MetricsProcessor(InterestsLogProcessor):
    task_group = 'metrics_for_interests'

    def requires(self):
        """Declare the inputs of this task.

        Returns:
            dict: a single entry, ``'log'``, holding an ``ExternalInput``
            built for the table at ``self.log_path``.
        """
        log_input = ExternalInput(table=self.log_path)
        return {'log': log_input}

    def fill_processed_table(self, transaction):
        """Run the map / sort / reduce pipeline over the source log.

        Steps, in order:
          1. map ``self.log_path`` into ``self.processed_table`` with the
             mapper from ``make_mapper()``;
          2. ``self.sort_processed_table()``;
          3. reduce ``self.processed_table`` by ``yandexuid`` into
             ``self.to_bigb_raw_table`` with the reducer from
             ``make_to_bigb_reducer()``;
          4. ``self.move_to_bigb_table()``.

        Args:
            transaction: not used by this method.
        """
        # Map stage: raw log rows -> per-interest rows.
        mapper = self.make_mapper()
        self.yt.run_map(
            mapper,
            source_table=self.log_path,
            destination_table=self.processed_table,
        )

        self.sort_processed_table()

        # Reduce stage: per-interest rows -> one row per yandexuid.
        reducer = self.make_to_bigb_reducer()
        self.yt.run_reduce(
            reducer,
            self.processed_table,
            self.to_bigb_raw_table,
            reduce_by=['yandexuid'],
        )

        self.move_to_bigb_table()

    def make_mapper(self):
        process_lemmas = self.make_lemmas_processor()
        process_counter_urls = self.make_counter_url_processor()
        process_hosts = self.make_hosts_processor()

        def mapper(row):
            for out_row in itertools.chain(
                    process_lemmas(row),
                    process_counter_urls(row),
                    process_hosts(row),
            ):
                yield out_row

        return mapper

    def make_lemmas_processor(self):
        all_lemmas, lemmas_by_interest_lab_id, sg_filters = self.get_lemmas_mapping()
        log_name = self.log_name

        def process_lemmas(row):
            title_lemmas = set(row['lemmas']) if row['lemmas'] else set()
            if title_lemmas and title_lemmas.intersection(all_lemmas):
                for interest_lab_id, all_interest_lemmas in lemmas_by_interest_lab_id.iteritems():
                    for interest_lemmas in all_interest_lemmas:
                        if interest_lemmas <= title_lemmas:
                            yield {
                                'yandexuid': row['yandexuid'],
                                'timestamp': row['timestamp'],
                                'date': date_helpers.to_date_string(
                                    date_helpers.from_timestamp_to_datetime(row['timestamp'])
                                ),
                                'datetime_bin': date_helpers.to_date_string(
                                    date_helpers.from_timestamp_to_datetime(row['timestamp'] - row['timestamp'] % (30 * 60)),
                                    date_format=date_helpers.DATETIME_FORMAT,
                                ),
                                'log_name': log_name,
                                'interest_lab_id': interest_lab_id,
                                'raw_data': row['lemmas'],
                                'data': list(interest_lemmas),
                                'data_source': 'metrics',
                                'data_type': 'lemmas',
                            }

            for sg_filter in sg_filters:
                if sg_filter['data'](title_lemmas):
                    yield {
                        'yandexuid': row['yandexuid'],
                        'timestamp': row['timestamp'],
                        'date': date_helpers.to_date_string(date_helpers.from_timestamp_to_datetime(row['timestamp'])),
                        'datetime_bin': date_helpers.to_date_string(
                            date_helpers.from_timestamp_to_datetime(row['timestamp'] - row['timestamp'] % (30 * 60)),
                            date_format=date_helpers.DATETIME_FORMAT,
                        ),
                        'log_name': log_name,
                        'interest_lab_id': sg_filter['segment_id'],
                        'raw_data': row['lemmas'],
                        'data': str(sg_filter['data']),
                        'data_source': 'metrics',
                        'data_type': 'lemmas',
                    }

        return process_lemmas

    def get_lemmas_mapping(self):
        """Collect the lemma mappings used by the lemmas processor.

        Returns:
            tuple: ``(all_lemmas, lemmas_by_segment_id, filters)`` where

            * ``all_lemmas`` is the union of every lemma found in any lemma
              collection of the loaded mappings;
            * ``lemmas_by_segment_id`` maps each mapping's ``segment_id`` to
              its ``data`` (a collection of lemma collections);
            * ``filters`` is ``metrics_title_lemmas_to_interests_mapping['mapping']``.
        """
        # Every entry is an iterable of mappings; currently only phone segments.
        lemma_sources = (
            get_phones_segments_lemmas(self.yt),
        )

        all_lemmas = set()
        lemmas_by_segment_id = {}
        for mapping in itertools.chain.from_iterable(lemma_sources):
            segment_lemmas = mapping['data']
            all_lemmas.update(*segment_lemmas)
            lemmas_by_segment_id[mapping['segment_id']] = segment_lemmas

        filters = metrics_title_lemmas_to_interests_mapping['mapping']
        return all_lemmas, lemmas_by_segment_id, filters

    def make_counter_url_processor(self):
        all_counters, counter_url_mappings = self.get_counter_urls_mapping()
        log_name = self.log_name

        def process_counter_urls(row):
            if row['counter_id'] in all_counters:
                for mapping in counter_url_mappings:
                    urls = mapping['data'].get(row['counter_id'], [])
                    for url in urls:
                        if url in row['url']:
                            yield {
                                'yandexuid': row['yandexuid'],
                                'timestamp': row['timestamp'],
                                'date': date_helpers.to_date_string(date_helpers.from_timestamp_to_datetime(row['timestamp'])),
                                'datetime_bin': date_helpers.to_date_string(
                                    date_helpers.from_timestamp_to_datetime(row['timestamp'] - row['timestamp'] % (30 * 60)),
                                    date_format=date_helpers.DATETIME_FORMAT,
                                ),
                                'log_name': log_name,
                                'interest_lab_id': mapping['segment_id'],
                                'raw_data': [row['counter_id'], row['url']],
                                'data': url,
                                'data_source': 'metrics',
                                'data_type': 'counter_url',
                            }

        return process_counter_urls

    def make_hosts_processor(self):
        all_hosts, hosts_mappings = self.get_hosts_mapping()
        log_name = self.log_name

        def process_hosts(row):
            for mapping in hosts_mappings:
                for host in mapping['data']:
                    if row['host'] is not None and host in row['host']:
                        yield {
                            'yandexuid': row['yandexuid'],
                            'timestamp': row['timestamp'],
                            'date': date_helpers.to_date_string(date_helpers.from_timestamp_to_datetime(row['timestamp'])),
                            'datetime_bin': date_helpers.to_date_string(
                                date_helpers.from_timestamp_to_datetime(row['timestamp'] - row['timestamp'] % (30 * 60)),
                                date_format=date_helpers.DATETIME_FORMAT,
                            ),
                            'log_name': log_name,
                            'interest_lab_id': mapping['segment_id'],
                            'raw_data': row['host'],
                            'data': host,
                            'data_source': 'metrics',
                            'data_type': 'hosts',
                        }

        return process_hosts

    def get_hosts_mapping(self):
        """Return the host mappings together with the set of all their hosts.

        Returns:
            tuple: ``(all_hosts, mappings)`` where ``mappings`` is
            ``metrics_hosts_to_interests_mapping['mapping']`` and ``all_hosts``
            is the union of the ``data`` entries of every mapping.
        """
        mappings = metrics_hosts_to_interests_mapping['mapping']
        all_hosts = {host for mapping in mappings for host in mapping['data']}
        return all_hosts, mappings

    def get_counter_urls_mapping(self):
        """Return the counter/url mappings together with all their counter ids.

        Returns:
            tuple: ``(all_counters, mappings)`` where ``mappings`` is
            ``metrics_counter_url_to_interests_mapping['mapping']`` and
            ``all_counters`` is the union of the keys of every mapping's
            ``data``.
        """
        mappings = metrics_counter_url_to_interests_mapping['mapping']
        all_counters = set(
            itertools.chain.from_iterable(mapping['data'].keys() for mapping in mappings)
        )
        return all_counters, mappings

    def make_to_bigb_reducer(self):
        """Build the reducer that folds a user's rows into short-term interests.

        The current time and the lab-segment-id -> short-term-interest-id
        mapping are read once, when the reducer is built, and captured in the
        closure.

        Returns:
            callable: ``reducer(key, rows)``, a generator. For one
            ``yandexuid`` it keeps rows whose age, measured in whole days of
            the datetime difference to the captured current time, is at most 3,
            and yields at most one dict::

                {'yandexuid': ..., 'shortterm_interests': {str(id): latest_timestamp}}

            Nothing is yielded when no row is recent enough. A row whose
            ``interest_lab_id`` is missing from the mapping makes the lookup
            fail; that error is not handled here.
        """
        shortterm_ids = LabSegmentsInfo().lab_segment_id_to_shortterm_interest_id
        current_timestamp = time_utils.get_current_time()
        max_age_days = 3

        def reducer(key, rows):
            # Converted once per call rather than once per row. Kept inside the
            # reducer so both operands of the subtraction below are produced by
            # the same process.
            # NOTE(review): assumes from_timestamp_to_datetime has no side effects - confirm.
            current_datetime = date_helpers.from_timestamp_to_datetime(current_timestamp)
            shortterm_interests = dict()

            for row in rows:
                row_age = current_datetime - date_helpers.from_timestamp_to_datetime(row['timestamp'])
                if row_age.days > max_age_days:
                    continue

                shortterm_id = str(shortterm_ids[row['interest_lab_id']])
                # Keep the most recent timestamp seen for each interest.
                shortterm_interests[shortterm_id] = max(
                    row['timestamp'],
                    shortterm_interests.get(shortterm_id, 0),
                )

            if shortterm_interests:
                yield {
                    'yandexuid': key['yandexuid'],
                    'shortterm_interests': shortterm_interests,
                }

        return reducer
