#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

import argparse
import logging
import re

import yt.wrapper as yt


def get_service(r):
    """Classify a request by the property classes it reports via ``IsA``.

    Args:
        r: request object exposing ``IsA(name)``.

    Returns:
        Tuple ``(service, ui)``. Each element is the label of the first
        matching property name in the corresponding table below, or ``''``
        when none of them matches.
    """
    # Order matters: the first property for which IsA() is true wins.
    service_table = (
        ('TWebRequestProperties', 'web'),
        ('TImagesRequestProperties', 'images'),
        ('TVideoRequestProperties', 'video'),
        ('TCbirRequestProperties', 'cbir'),
        ('TMapsRequestProperties', 'maps'),
        ('TPortalRequestProperties', 'portal'),
        ('TNewsRequestProperties', 'news'),
        ('TYandexCollectionsRequestProperties', 'collections'),
    )
    ui_table = (
        ('TDesktopUIProperties', 'desktop'),
        ('TTouchUIProperties', 'touch'),
        ('TMobileUIProperties', 'mobile'),
        ('TMobileAppUIProperties', 'mobileapp'),
        ('TPadUIProperties', 'pad'),
        ('TSiteSearchUIProperties', 'sitesearch'),
    )

    def first_match(table):
        # Stop probing as soon as one property matches.
        for property_name, label in table:
            if r.IsA(property_name):
                return label
        return ''

    service = first_match(service_table)
    ui = first_match(ui_table)
    return service, ui


def get_baobab_path(block):
    """Return the '/'-joined names of the blocks leading to *block*.

    The chain comes from ``baobab.common.get_blocks_from_root_list(block)``
    (presumably ordered from the tree root down to *block* -- confirm against
    the baobab API).

    Returns:
        The joined path string, or ``None`` when *block* is falsy.
    """
    # Imported lazily, like the other job-side dependencies in this file.
    import baobab

    if not block:
        return None

    chain = baobab.common.get_blocks_from_root_list(block)
    names = [node.name for node in chain]
    return '/'.join(names)


# Block names that build_rule() appends to a rule prefix; every name becomes
# one '|'-separated alternative of the resulting rule string.
BLOCK_NAMES = (
    'all',
    'arrow-left',
    'arrow-right',
    'author',
    'comment-count',
    'dislike',
    'hide',
    'item',
    'like',
    'link',
    'more',
    'phrase',
    'reviews',
    'scroll_left',
    'scroll_right',
    'title',
    'ugc',
)


def build_rule(prefix):
    """Build a rule string: *prefix* + each name in BLOCK_NAMES, joined by '|'.

    Args:
        prefix: string prepended to every block name.

    Returns:
        A single string with one alternative per entry of BLOCK_NAMES, in the
        same order as BLOCK_NAMES.
    """
    alternatives = [prefix + block_name for block_name in BLOCK_NAMES]
    return '|'.join(alternatives)


# Named rules handed to tamus.check_rules_multiple_joiners_merged() in
# get_clicks(); the keys are the marker names that get_tag() queries.
# Each value is built by build_rule() from a path prefix and BLOCK_NAMES.
# NOTE(review): get_tag() also queries an 'overlay' marker that is not defined
# here -- confirm that tamus provides it or that a rule is missing.
RULES = {
    'wizard_company_reviews': build_rule('#wizard_companies//reviews//'),
    'wizard_company_overlay_reviews': build_rule('#wizard_companies//#overlay//reviews//'),
    'wizard_entity_search_reviews': build_rule('#entity_search_wizard//reviews//'),
    'wizard_entity_search_overlay_reviews': build_rule('#entity_search_wizard//#overlay//reviews//'),
}


def get_tag(event, marks):
    """Map a click event to a tag using the markers collected in *marks*.

    Args:
        event: the event to classify.
        marks: object exposing ``has_block_with_ancestors_and_event_markers``
            and ``has_block_and_event_marker``.

    Returns:
        ``'geo'`` when the company-reviews marker matches, ``'oo'`` when the
        entity-search-reviews marker matches (checked in that order), and
        ``None`` otherwise. The overlay variants of the markers are used when
        the 'overlay' ancestor check succeeds, the plain ones otherwise.
    """
    # NOTE(review): 'overlay' is not one of the rule names declared in RULES --
    # confirm that this marker can actually be set.
    in_overlay = marks.has_block_with_ancestors_and_event_markers(event, 'overlay')

    if in_overlay:
        geo_marker = 'wizard_company_overlay_reviews'
        oo_marker = 'wizard_entity_search_overlay_reviews'
    else:
        geo_marker = 'wizard_company_reviews'
        oo_marker = 'wizard_entity_search_reviews'

    # 'geo' is probed first, so it wins when both markers are present.
    for tag, marker in (('geo', geo_marker), ('oo', oo_marker)):
        if marks.has_block_and_event_marker(event, marker):
            return tag

    return None


def reducer(key, recs):
    """Reduce callback: parse one key's records into a session and emit clicks.

    Args:
        key: reduce key; its ``'key'`` field is forwarded to get_clicks().
        recs: records of that key, passed to ``libra.ParseSession`` together
            with the local file name 'blockstat.dict'.

    Yields:
        The rows produced by get_clicks(). If parsing the session raises, a
        warning is logged and nothing is yielded for this key.
    """
    # Imported lazily so the module is only required where the job runs.
    import libra

    try:
        session = libra.ParseSession(recs, 'blockstat.dict')
    except Exception as err:
        # Best-effort: one unparsable session must not fail the whole job.
        logging.warning('fail %r: %s', key, err)
    else:
        for click_row in get_clicks(key['key'], session):
            yield click_row


def get_clicks(key, requests):
    """Yield one row per tagged click found in the Baobab trees of *requests*.

    Only requests for which ``IsA('TBaobabProperties')`` is true and whose
    ``BaobabAllTrees()`` result is non-empty are inspected. Clicks are skipped
    when get_tag() returns ``None`` or when the click has no client timestamp.

    Args:
        key: session key string; a leading 'y' or 'uu/' is stripped to get yuid.
        requests: iterable of parsed request objects.

    Yields:
        dicts with keys yuid, ts, puid, ui, tag, service and path.
    """
    import baobab  # noqa
    import tamus  # noqa

    for request in requests:
        if not request.IsA('TBaobabProperties'):
            continue

        puid = request.PassportUID
        yuid = re.sub('^(y|uu/)', '', key)
        service, ui = get_service(request)

        joiners = request.BaobabAllTrees()
        if not joiners:
            continue

        # Markers are computed once per request over all of its joiners.
        marks = tamus.check_rules_multiple_joiners_merged(RULES, joiners)

        for joiner in joiners:
            for event in joiner.get_all_events():
                if not isinstance(event, baobab.common.Click):
                    continue

                block = joiner.get_show().tree.get_block_by_id(event.block_id)
                ts = event.client_timestamp
                path = get_baobab_path(block)
                tag = get_tag(event, marks)

                if tag is None or ts is None:
                    continue

                yield {
                    'yuid': yuid,
                    'ts': ts,
                    'puid': puid,
                    'ui': ui,
                    'tag': tag,
                    'service': service,
                    'path': path,
                }


def parser(blockstat_dict):
    """Build a session-parsing callback bound to a blockstat dictionary.

    Args:
        blockstat_dict: value passed as the second argument of
            ``libra.ParseSession`` (reducer() passes a local file name there).

    Returns:
        A generator function ``parse_session(key, recs)`` that yields the rows
        of get_clicks() for the parsed session. If parsing raises, a warning is
        logged and nothing is yielded.
    """
    def parse_session(key, recs):
        # NOTE(review): key is forwarded to get_clicks() as-is, while reducer()
        # passes key['key'] -- confirm the caller supplies the raw key string.
        import libra  # noqa

        try:
            requests = libra.ParseSession(recs, blockstat_dict)
        except Exception as exc:
            # Best-effort skip, but leave a trace the same way reducer() does
            # instead of dropping the session silently.
            logging.warning('fail %r: %s', key, exc)
            return

        for row in get_clicks(key, requests):
            yield row

    return parse_session


# Schema attached to the destination table in main(); the column names match
# the keys of the rows yielded by get_clicks().
# NOTE(review): 'ts' is int32 and is filled from event.client_timestamp --
# confirm the value always fits (e.g. seconds rather than milliseconds).
OUTPUT_SCHEMA = [
    {'name': 'yuid', 'type': 'string'},
    {'name': 'ts', 'type': 'int32'},
    {'name': 'puid', 'type': 'string'},
    {'name': 'ui', 'type': 'string'},
    {'name': 'tag', 'type': 'string'},
    {'name': 'service', 'type': 'string'},
    {'name': 'path', 'type': 'string'},
]


def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the previous behaviour.

    Returns:
        ``argparse.Namespace`` with the attributes ``table``, ``output``,
        ``lower``, ``upper`` and ``yt_pool``. ``lower``, ``upper`` and
        ``yt_pool`` are ``None`` when not given.
    """
    # Named arg_parser so it does not shadow the module-level parser() function.
    arg_parser = argparse.ArgumentParser(description='Squeeze us points')
    arg_parser.add_argument(
        '--table',
        type=str,
        default='//user_sessions/pub/search/daily/2020-04-01/clean',
        help='table path',
    )
    arg_parser.add_argument(
        '--output',
        type=str,
        default='//tmp/ivankun/result',
        help='output path',
    )
    arg_parser.add_argument(
        '--lower',
        type=str,
        help='lower bound',
    )
    arg_parser.add_argument(
        '--upper',
        type=str,
        help='upper bound',
    )
    arg_parser.add_argument(
        '--yt-pool',
        type=str,
        help='yt pool',
    )
    return arg_parser.parse_args(argv)


def anaconda_module_filter(module):
    """Decide whether *module* passes the pickling module filter.

    Installed as ``yt.config['pickling']['module_filter']`` in set_config().

    Returns:
        ``True`` for modules whose name contains '.yson' or 'yson.', and for
        any other non-blacklisted module that has a file not ending in '.so'.
        ``False`` for blacklisted names, modules without a file and '.so'
        modules.
    """
    name = getattr(module, '__name__', '')

    # yson modules are always kept; this check runs before every other one.
    looks_like_yson = '.yson' in name or 'yson.' in name
    if looks_like_yson:
        logging.info('Leaving module %s', name)
        return True

    blacklist = ('hashlib', '_hashlib', 'hmac', 'statbox_bindings2', 'qb2', 'pytz', 'time')
    if name in blacklist:
        logging.info('Filtering module %s [blacklist]', name)
        return False

    path = getattr(module, '__file__', '')
    if not path:
        # No file to inspect (missing or empty __file__): filtered out.
        return False

    if path.endswith('.so'):
        logging.debug('Filtering module %s [.so]', name)
        return False

    return True


def set_config(yt_pool):
    """Fill the global ``yt.config`` with the settings this script relies on.

    Sets the proxy URL, a 16 GB memory limit, the pickling options (module
    compatibility filter enabled, anaconda_module_filter as module filter) and
    the pool.

    Args:
        yt_pool: value stored under ``yt.config['pool']``.
    """
    config = yt.config

    config['proxy']['url'] = 'hahn.yt.yandex.net'
    config['memory_limit'] = 16 * yt.common.GB

    pickling = config['pickling']
    pickling['enable_modules_compatibility_filter'] = True
    pickling['module_filter'] = anaconda_module_filter

    config['pool'] = yt_pool


def main():
    """Run the click-extraction pipeline end to end.

    Parses the command line, configures the YT client, runs reducer() over the
    input table (optionally limited to the [--lower, --upper] key range),
    writes the result to the output table with OUTPUT_SCHEMA, and finally
    sorts the output by yuid, puid and ts.

    Raises:
        ValueError: if only one of --lower / --upper is given.
    """
    cli_args = parse_args()
    set_config(cli_args.yt_pool)

    # Explicit check rather than assert: assert statements are removed when
    # Python runs with -O, which would silently disable this validation.
    if bool(cli_args.lower) != bool(cli_args.upper):
        raise ValueError('--lower and --upper must be given together or not at all')

    path = yt.TablePath(cli_args.table, lower_key=cli_args.lower, upper_key=cli_args.upper)

    # NOTE(review): get_clicks() also imports tamus, but no tamus file is
    # listed in yt_files below -- presumably it reaches the job another way;
    # confirm.
    yt.run_reduce(
        reducer,
        source_table=path,
        destination_table=yt.TablePath(cli_args.output, attributes={'schema': OUTPUT_SCHEMA}),
        reduce_by=['key'],
        yt_files=[
            # reducer() opens this dictionary by its base name 'blockstat.dict'.
            '//statbox/statbox-dict-last/blockstat.dict',
            '//statbox/resources/baobab.so',
            '//statbox/resources/libra.so',
        ],
        spec={
            'pool_trees': ['physical'],
            'tentative_pool_trees': ['cloud'],
            'data_size_per_job': 4 * yt.common.GB,
        },
    )

    yt.run_sort(cli_args.output, sort_by=['yuid', 'puid', 'ts'])


# Script entry point: run main() only on direct execution, not on import.
if __name__ == '__main__':
    main()
