#!/usr/bin/env python
#  -*- coding: utf-8 -*-
'''module defining processing functions for the
collections cube calculations
'''

from functools import partial

from qb2.api.v1 import (
    filters as qf,
    resources as qr,
    extractors as qe,
)
from nile.api.v1 import (
    Record,
    files as nfiles,
    extractors as ne,
    aggregators as na,
)

# region id used by extract_details when a collection has no
# 'geo_region_id' property (presumably the Geobase id of "Earth" -- confirm)
EARTH_ID = 10000

# placeholder values -> human readable (Russian) labels; applied by
# datalens_rename, values that are not listed here are left untouched
RENAME_DICT = {
    '_undefined_': u'Не определено',
    '_total_': u'Всего',
    '_empty_value_': u'Нет'
}

# key fields shared by every datalens mode; process_cube_into_datelens
# prepends them to the mode specific fields from MODAL_FIELDS
COMMON_FIELDS = [
    'fielddate', 'collection_id', 'service', 'collection_region',
    'rubric_text', 'partner_name'
]

# extra key fields per datalens mode, appended to COMMON_FIELDS
MODAL_FIELDS = {
    'collections': [],
    'users': ['user_id'],
    'orgs': ['permalink'],
}


def datalens_rename(field_value):
    '''replace a placeholder value with its label from RENAME_DICT

    Values that have no entry in RENAME_DICT are returned unchanged.
    '''
    try:
        return RENAME_DICT[field_value]
    except KeyError:
        # not a known placeholder, keep the value as it is
        return field_value


def array_to_dict(array):
    '''turn [{"key": ..., "value": ...}, ...] into {key: value}

    A dict argument is handed back as is; for repeated keys the last
    element wins.
    '''
    if not isinstance(array, dict):
        # pairs are built key first, value second, one element at a time
        return dict((item['key'], item['value']) for item in array)

    return array


def extract_details(records, make_dict=array_to_dict):
    '''map detailed info from the properties into records

    For every input record yields a new Record built from the original
    one plus the fields ``rubric_text``, ``partner_name``, ``partner_id``
    and ``collection_region``.

    records   -- iterable of records exposing a ``properties`` attribute
    make_dict -- callable turning ``record.properties`` into a dict
                 (array_to_dict by default)
    '''
    geob = qr.get('Geobase')

    for record in records:
        record_properties = make_dict(record.properties)

        # a missing key is reported as '_undefined_',
        # a present but falsy value as '_empty_value_'
        rec_update = {
            'rubric_text': (
                record_properties.get('rubric', '_undefined_')
                or '_empty_value_'
            ),
            'partner_name': (
                record_properties.get('partner_name', '_undefined_')
                or '_empty_value_'
            ),
            'partner_id': (
                record_properties.get('partner_id', '_undefined_')
                or '_empty_value_'
            ),
        }

        # the properties above tolerate empty values, so the region id has
        # to as well: int(None) / int('') would otherwise kill the mapper.
        # An empty id is treated exactly like a missing one.
        raw_geo_id = record_properties.get('geo_region_id')
        if raw_geo_id is None or raw_geo_id == '':
            raw_geo_id = EARTH_ID

        region = geob.region_by_id(int(raw_geo_id))
        rec_update['collection_region'] = region.name

        yield Record(record, **rec_update)


def datalens_map_project(stream, mode, mode_fields):
    '''apply the mode specific datalens projection / aggregation

    stream      -- stream to process
    mode        -- 'collections', 'users' or 'orgs'
    mode_fields -- field names used as the grouping / uniqueness key

    Returns the resulting stream:
      collections -- grouped by mode_fields with summed shows, distinct
                     user count and any value of unique_permalinks
      users       -- 'shows' plus mode_fields, unique by mode_fields
      orgs        -- 'permalinks' unfolded into 'permalink', then grouped
                     by mode_fields with summed shows and distinct users

    Raises AssertionError when mode is not a string or is unknown.
    '''
    # the bare name `unicode` exists on Python 2 only and made every call
    # fail with NameError on Python 3; type(u'') is unicode on 2 and str
    # on 3, so the check accepts the same types as before on Python 2
    string_types = (str, type(u''))
    assert isinstance(mode, string_types), 'mode must be a string'
    assert mode in {'collections', 'users', 'orgs'}, 'unknown mode %s' % mode

    if mode == 'collections':
        res = stream.groupby(*mode_fields).aggregate(
            shows=na.sum('shows', missing=0),
            unique_users=na.count_distinct('user_id'),
            unique_permalinks=na.any('unique_permalinks'),
        )
    elif mode == 'users':
        res = stream.project('shows', *mode_fields).unique(*mode_fields)
    else:
        # mode == 'orgs': one record per permalink before grouping
        res = stream.project(
            ne.all(),
            qe.unfold('permalink', 'permalinks'),
        ).groupby(*mode_fields).aggregate(
            shows=na.sum('shows', missing=0),
            unique_users=na.count_distinct('user_id'),
        )

    return res


def process_cube_into_datelens(job, cube_stream):
    '''turn the cube stream into datalens-ready streams

    Returns a tuple of three streams, one per mode, in the order
    ('collections', 'users', 'orgs').
    '''
    detailed = cube_stream.map(
        partial(extract_details, make_dict=array_to_dict),
        # the geobase dictionary is requested with an explicit version (v6);
        # the original author could not find a version-less "latest" one
        files=[nfiles.StatboxDict('Geobasev6.bin', use_latest=True)]
    )

    projected = detailed.project(
        'fielddate',
        'shows',
        'user_id',
        'service',
        'collection_region',
        'permalinks',
        collection_id=ne.custom(datalens_rename, 'id'),
        rubric_text=ne.custom(datalens_rename, 'rubric_text'),
        partner_name=ne.custom(datalens_rename, 'partner_name'),
        unique_permalinks=ne.custom(lambda x: len(set(x)), 'permalinks')
    )

    # records without a service are the ones that were never shown
    noshow_stream, shown_stream = projected.split(qf.defined('service'))
    mobile_stream, desktop_stream = shown_stream.split(
        qf.equals('service', 'desktop')
    )

    per_service = (desktop_stream, mobile_stream, noshow_stream)
    results = []

    for mode in ('collections', 'users', 'orgs'):
        key_fields = COMMON_FIELDS + MODAL_FIELDS[mode]
        parts = [
            datalens_map_project(part, mode, key_fields)
            for part in per_service
        ]

        # a few collections contain duplicated permalinks,
        # hence the deduplication by the key fields
        results.append(job.concat(*parts).unique(*key_fields))

    return tuple(results)


def prepare_cube_stream(job, collections_stream, *streams):
    '''left join collections with every events stream and concatenate

    Collections that got no user_id in any of the joins are kept as well,
    re-joined with their collection data, so they stay in the cube.
    The result is unique by ('id', 'fielddate', 'user_id').
    '''
    join_keys = ('fielddate', 'id')
    seen_parts = []
    unseen_parts = []

    for events in streams:
        unseen, seen = collections_stream.join(
            events,
            type='left',
            by=join_keys,
            assume_unique_left=True
        ).split(qf.defined('user_id'))
        seen_parts.append(seen)
        unseen_parts.append(unseen)

    # a collection counts as never seen only when it is missing
    # from every single events stream
    unseen_counts = job.concat(*unseen_parts).groupby(*join_keys).aggregate(
        size=na.count()
    )
    never_seen_keys = unseen_counts.filter(
        qf.compare('size', '==', len(seen_parts))
    ).project(*join_keys)

    never_seen = never_seen_keys.join(
        collections_stream,
        type='left',
        by=join_keys,
        assume_unique_left=True,
    )

    combined = job.concat(never_seen, *seen_parts)

    return combined.unique('id', 'fielddate', 'user_id')


def process_events_stream(stream, service='desktop'):
    '''project, filter and count shows in an events log stream

    For 'desktop' the fields date / vars['discovery'] / yandexuid are used,
    for any other service event_date / event_value['card_id'] / device_id.
    Records without an id are dropped; the rest is counted per
    (fielddate, id, user_id) into 'shows' and tagged with the service.
    '''
    if service == 'desktop':
        date_field, key, field, uid = 'date', 'discovery', 'vars', 'yandexuid'
    else:
        date_field, key, field, uid = (
            'event_date', 'card_id', 'event_value', 'device_id'
        )

    events = stream.project(
        ne.all(),
        fielddate=date_field,
        id=qe.dictitem(key, field),
        user_id=uid
    )
    with_id = events.filter(qf.defined('id'))
    counted = with_id.groupby('fielddate', 'id', 'user_id').aggregate(
        shows=na.count(),
    )

    return counted.project(ne.all(), service=ne.const(service))


def process_collections(job, stream):
    '''process and prepare the raw collections logs

    Records whose properties lack 'partner_id' get the properties of the
    earliest record of the same collection that has it; the result is
    unique by ('fielddate', 'id').
    '''
    without_partner, with_partner = stream.split(
        qf.contains('properties', 'partner_id')
    )

    # first occurrence (minimal fielddate) of the updated properties
    # for each collection
    earliest = with_partner.groupby('id').top(1, by='fielddate', mode='min')
    earliest_properties = earliest.project('id', 'properties')

    stripped = without_partner.project(ne.all(exclude=['properties']))
    backfilled = stripped.join(
        earliest_properties,
        by='id',
        assume_unique_right=True,
        assume_small_right=True
    )

    merged = job.concat(backfilled, with_partner)

    return merged.unique('fielddate', 'id')


def process(job, streams, datalens=False):
    '''process all streams and return them to main

    streams  -- sequence: collections, desktop events, mobile events
    datalens -- when true, the datalens streams are built as well

    Returns (cube_stream, datalens_streams); datalens_streams is None
    unless datalens is true.
    '''
    collections = process_collections(job, streams[0])
    events = [
        process_events_stream(streams[1], 'desktop'),
        process_events_stream(streams[2], 'mobile'),
    ]

    cube = prepare_cube_stream(job, collections, *events)

    if not datalens:
        return (cube, None)

    return (cube, process_cube_into_datelens(job, cube))
