#!/usr/bin/env python
# -*- coding: utf-8 -*-

from collections import Counter, defaultdict
from functools import partial

from crypta.profile.utils.config import config
from crypta.profile.utils.segment_utils.builders import RegularSegmentBuilder
from crypta.profile.utils.luigi_utils import ExternalInput


def canonize_phone_model_name(phone_model_name):
    """Normalize a phone model name so that differently formatted names compare equal.

    The name is lowercased, the separators ``-``, ``/``, ``(`` and ``)`` are
    replaced with spaces, surrounding whitespace is stripped and one leading
    'телефон ' ("phone") or 'смартфон ' ("smartphone") word is dropped.

    Names containing 'восстановленный' ("refurbished") are rejected. The marker
    is looked for after lowercasing, so capitalised spellings are rejected too
    (for byte strings ``lower()`` may only affect ASCII letters, in which case
    the check behaves as a case-sensitive one).

    :param phone_model_name: raw model name.
    :return: canonized name, or None for refurbished devices.
    """
    phone_model_name = phone_model_name.lower()
    if 'восстановленный' in phone_model_name:
        return None

    for separator in ('-', '/', '(', ')'):
        phone_model_name = phone_model_name.replace(separator, ' ')
    phone_model_name = phone_model_name.strip()

    # Only the first matching prefix is removed, 'телефон ' being checked first.
    for prefix in ('телефон ', 'смартфон '):
        if phone_model_name.startswith(prefix):
            return phone_model_name[len(prefix):]
    return phone_model_name


# YQL template that fills `{output_table}` with one row per (manufacturer, model)
# pair found in `{appmetrica_month_table}` together with COUNT(device_id) for the
# pair, stored as `device_id_count`.
# Rows with a NULL manufacturer or model are skipped, as is the pair
# manufacturer == 'Telegram' / model == 'Telegram'.
# The result is ordered by `device_id_count`.
# Both placeholders are substituted with str.format() in PhoneOwners.build_segment.
get_all_phone_models_from_appmetrica_month = """
PRAGMA yt.DefaultMemoryLimit = '1G';

INSERT INTO `{output_table}`
SELECT manufacturer, model, COUNT(device_id) AS device_id_count
FROM `{appmetrica_month_table}`
WHERE manufacturer IS NOT NULL AND model IS NOT NULL AND NOT (manufacturer == 'Telegram' AND model == 'Telegram')
GROUP BY manufacturer, model
ORDER BY device_id_count;
"""


def get_model_price_segment(model_name, models_by_segments, models_tokens_by_segments, tokens_by_segments):
    """Guess the price segment of a canonized phone model name.

    Matching strategies are tried from the strictest to the loosest and the
    first hit wins:

    1. the name is exactly one of the labeled model names of a segment;
    2. the name is a substring of a labeled model name;
    3. every token of the name occurs among the tokens of one labeled model;
    4. token voting: each token of the name adds its per-segment counts and
       the segment with the highest total is returned.

    :param model_name: canonized model name; may be None or empty.
    :param models_by_segments: mapping segment -> collection of labeled model names.
    :param models_tokens_by_segments: mapping segment -> iterable of token sets,
        one per labeled model.
    :param tokens_by_segments: mapping token -> mapping segment -> count.
    :return: the matched segment, or None when nothing matches or the name is
        None/empty.
    """
    # An empty name is a substring of every labeled model and its empty token
    # set is a subset of every token set, so it would be assigned an arbitrary
    # segment; None would raise TypeError in the substring test below.
    if not model_name:
        return None

    # .items() is used instead of .iteritems() so the function runs on both
    # Python 2 and Python 3.

    # check exact match
    for segment, labeled_models in models_by_segments.items():
        if model_name in labeled_models:
            return segment

    # check substring
    for segment, labeled_models in models_by_segments.items():
        if any(model_name in labeled_model for labeled_model in labeled_models):
            return segment

    # check all tokens in labeled
    model_tokens = set(model_name.split())
    for segment, labeled_models_tokens in models_tokens_by_segments.items():
        if any(model_tokens.issubset(labeled_tokens) for labeled_tokens in labeled_models_tokens):
            return segment

    # check NB-like probability with tokens
    # (a token repeated in the name votes once per occurrence)
    segment_affinity = Counter()
    for token in model_name.split():
        if token in tokens_by_segments:
            for price_segment, count in tokens_by_segments[token].items():
                segment_affinity[price_segment] += count

    best_match = segment_affinity.most_common(1)
    if best_match:
        return best_match[0][0]
    return None


def add_guessed_price_segment(record, models_by_segments, models_tokens_by_segments, tokens_by_segments):
    """Mapper: store a guessed price segment in ``record['price_segment']``.

    The model name is built as '<manufacturer> <model>' from the record,
    canonized and matched against the labeled dictionaries. The record is
    modified in place and yielded exactly once.

    :param record: row with at least 'manufacturer' and 'model' keys.
    :param models_by_segments: mapping segment -> collection of labeled model names.
    :param models_tokens_by_segments: mapping segment -> iterable of token sets.
    :param tokens_by_segments: mapping token -> mapping segment -> count.
    """
    model_name = canonize_phone_model_name('{manufacturer} {model}'.format(**record))

    # The canonizer may return None or an empty string. Neither identifies a
    # model, and passing them on would either fail in the substring test or
    # match an arbitrary segment, so such records get no price segment.
    if model_name:
        price_segment = get_model_price_segment(
            model_name,
            models_by_segments,
            models_tokens_by_segments,
            tokens_by_segments,
        )
    else:
        price_segment = None

    record['price_segment'] = price_segment
    yield record


def set_price_segment_from_dict(record, appmetrica_phone_models_by_price_segments):
    """Mapper: emit the price segment row for a record whose model is known.

    :param record: row with 'manufacturer' and 'model' keys; 'id' and 'id_type'
        are read only when the (manufacturer, model) pair is found.
    :param appmetrica_phone_models_by_price_segments: mapping
        (manufacturer, model) -> price segment.

    Yields one dict with 'id', 'id_type' and 'segment_name' for a known pair
    and nothing otherwise.
    """
    lookup_key = (record['manufacturer'], record['model'])
    if lookup_key not in appmetrica_phone_models_by_price_segments:
        return

    segment_row = {
        'id': record['id'],
        'id_type': record['id_type'],
        'segment_name': appmetrica_phone_models_by_price_segments[lookup_key],
    }
    yield segment_row


class PhoneOwners(RegularSegmentBuilder):
    """Builds price segments of phone owners from the AppMetrica month table.

    Every (manufacturer, model) pair seen in AppMetrica gets a guessed price
    segment based on a labeled dictionary of phone models, and each input row
    whose pair received a segment is emitted as (id, id_type, segment_name).
    """

    # Price segment label -> numeric segment id.
    # NOTE(review): the labels look like price ranges; units are not visible
    # here. Presumably the base class uses this mapping to translate the
    # `segment_name` values emitted by build_segment -- confirm in
    # RegularSegmentBuilder.
    name_segment_dict = {
        '0_15': 1062,
        '15_25': 1063,
        '25_35': 1064,
        '35_50': 1065,
        '50_100': 1066,
        '100+': 2044,
    }

    # NOTE(review): meaning of this value is defined outside this file
    # (presumably by RegularSegmentBuilder) -- confirm.
    keyword = 547

    def requires(self):
        """Declare the two external inputs: the AppMetrica month table and the
        dictionary of phone models labeled with price segments."""
        return {
            'AppMetricaInformation': ExternalInput(config.APP_METRICA_MONTH),
            'PhonePriceSegmentDict': ExternalInput(config.PHONE_MODELS_BY_PRICE_SEGMENT),
        }

    def build_segment(self, inputs, output_path):
        """Compute the segment table and write it to `output_path`.

        Steps, in order:
          1. (re)create config.ALL_PHONE_MODELS_FROM_APPMETRICA and fill it
             with per-(manufacturer, model) device counts via YQL;
          2. read the labeled dictionary and build in-memory lookup structures;
          3. run a map over the models table, in place, that adds a guessed
             `price_segment` to every row;
          4. sort the models table and stamp it with `generate_date`;
          5. load the (manufacturer, model) -> price_segment mapping into memory;
          6. run a map over the AppMetrica table that emits
             (id, id_type, segment_name) rows into `output_path`.

        `inputs` is not used in this method; inputs are taken from self.input().
        self.yt, self.yql, self.transaction, self.date and self.logger are not
        defined in this class -- presumably provided by RegularSegmentBuilder.
        """
        # Step 1: empty table with the final schema; `price_segment` is filled in step 3.
        self.yt.create_empty_table(
            path=config.ALL_PHONE_MODELS_FROM_APPMETRICA,
            schema={
                'manufacturer': 'string',
                'model': 'string',
                'device_id_count': 'uint64',
                'price_segment': 'string',
            },
            force=True,
        )

        self.yql.query(
            query_string=get_all_phone_models_from_appmetrica_month.format(
                appmetrica_month_table=self.input()['AppMetricaInformation'].table,
                output_table=config.ALL_PHONE_MODELS_FROM_APPMETRICA,
            ),
            transaction=self.transaction,
        )

        # Step 2: lookup structures built from the labeled dictionary.
        #   models_by_segments:        price segment -> set of canonized model names
        #   tokens_by_segments:        token -> Counter(price segment -> number of models with the token)
        #   models_tokens_by_segments: price segment -> list of token sets, one per model
        models_by_segments = defaultdict(set)
        tokens_by_segments = defaultdict(Counter)
        models_tokens_by_segments = defaultdict(list)

        for record in self.yt.read_table(self.input()['PhonePriceSegmentDict'].table):
            for model_name in record['model_names']:
                canonized_model_name = canonize_phone_model_name(model_name)
                # the canonizer returns None for refurbished models; empty names are skipped as well
                if canonized_model_name:
                    models_by_segments[record['price_segment']].add(canonized_model_name)
                    model_tokens = set(canonized_model_name.split())
                    models_tokens_by_segments[record['price_segment']].append(model_tokens)
                    for token in model_tokens:
                        tokens_by_segments[token][record['price_segment']] += 1

        # Convert to plain dicts so that the structures cannot grow by accident:
        # indexing a defaultdict with a missing key silently inserts that key.
        tokens_by_segments = dict(tokens_by_segments)
        models_by_segments = dict(models_by_segments)
        models_tokens_by_segments = dict(models_tokens_by_segments)
        self.logger.info('Dicts len: {} {} {}'.format(
            len(tokens_by_segments),
            len(models_by_segments),
            len(models_tokens_by_segments),
        ))

        # Step 3: add price segment from PhonePriceSegmentDict to appmetrica manufacturer:model table.
        # Source and destination are the same table; the lookup structures are
        # bound to the mapper through functools.partial.
        self.yt.run_map(
            partial(
                add_guessed_price_segment,
                tokens_by_segments=tokens_by_segments,
                models_by_segments=models_by_segments,
                models_tokens_by_segments=models_tokens_by_segments,
            ),
            config.ALL_PHONE_MODELS_FROM_APPMETRICA,
            config.ALL_PHONE_MODELS_FROM_APPMETRICA,
        )
        # The lookup structures are not used below; drop the local references
        # (presumably to release memory before the next mapping is loaded).
        del tokens_by_segments
        del models_by_segments
        del models_tokens_by_segments
        # Step 4: sort by device count and record the generation date on the table.
        self.yt.run_sort(config.ALL_PHONE_MODELS_FROM_APPMETRICA, sort_by='device_id_count')
        self.yt.set_attribute(
            config.ALL_PHONE_MODELS_FROM_APPMETRICA,
            'generate_date',
            self.date
        )

        # Step 5: (manufacturer, model) -> price_segment for the rows that got a segment.
        appmetrica_phone_models_by_price_segments = {}
        for record in self.yt.read_table(config.ALL_PHONE_MODELS_FROM_APPMETRICA):
            if record['price_segment']:
                appmetrica_phone_models_by_price_segments[(record['manufacturer'], record['model'])] = record['price_segment']

        # Step 6: only the four columns the mapper needs are read from the AppMetrica table.
        # NOTE(review): the YQL template counts `device_id` in this same table
        # while this step reads `id` / `id_type` -- confirm that the table has
        # all of these columns.
        self.yt.run_map(
            partial(
                set_price_segment_from_dict,
                appmetrica_phone_models_by_price_segments=appmetrica_phone_models_by_price_segments,
            ),
            self.yt.TablePath(
                self.input()['AppMetricaInformation'].table,
                columns=['id', 'id_type', 'manufacturer', 'model'],
            ),
            output_path,
        )
