#!/usr/bin/python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import json


def get_by_path(d, path):
    """Return the value nested inside ``d`` at the '/'-separated ``path``.

    Every path segment is used as a subscript on the current container, so
    ``get_by_path(d, 'a/b')`` evaluates ``d['a']['b']``. Whatever exception
    a subscript raises (for example ``KeyError`` for a missing key)
    propagates to the caller.
    """
    node = d
    # Descend one level per path segment.
    for segment in path.split('/'):
        node = node[segment]
    return node


class Mapper():
    """Map job: pull values out of a JSON column and emit one row per value.

    ``path_dict`` maps a '/'-separated path inside the parsed JSON to the
    label that is written under ``'__result_field'`` in every emitted row.
    """

    def __init__(self, path_dict, info_field, unique_field, cache_id_field):
        # path inside the parsed JSON -> label stored in '__result_field'
        self.path_dict = path_dict
        # name of the input column holding the JSON string
        self.info_field = info_field
        # name of the column copied unchanged from the input row
        self.unique_field = unique_field
        # name of the output column that receives each extracted value
        self.cache_id_field = cache_id_field

    def __call__(self, row):
        """Yield one output row per value found at each configured path.

        A value that is a list contributes one row per element; any other
        value contributes a single row. Paths that cannot be resolved in
        this row's JSON are skipped.

        Each yielded dict has three keys: ``unique_field`` (copied from the
        input row), ``'__result_field'`` (the label configured for the path)
        and ``cache_id_field`` (the extracted value).
        """
        info = json.loads(row[self.info_field])
        for path, result_field in self.path_dict.items():
            try:
                field = get_by_path(info, path)
            except (LookupError, TypeError):
                # Missing key, or an intermediate value that cannot be
                # subscripted with the segment: the row has nothing at this
                # path. Anything else is a real error and must surface.
                continue
            urls = field if isinstance(field, list) else [field]
            for url in urls:
                yield {
                    self.unique_field: row[self.unique_field],
                    '__result_field': result_field,
                    self.cache_id_field: url
                }


class InnerJoinReducer():
    """Reduce job: merge the first row of a key group into the rest of it.

    Rows are expected to carry an ``'@table_index'`` field. A group produces
    output only when its first row comes from table 0.
    """

    def __call__(self, key, rows):
        """Yield every non-first row of the group, updated with the first row.

        Yields nothing when the group is empty or when its first row does
        not come from table 0. Every remaining row is merged and emitted,
        whichever table it came from.
        """
        rows = iter(rows)
        # The default keeps an empty group from leaking StopIteration out of
        # this generator, which Python 3.7+ turns into RuntimeError.
        first = next(rows, None)
        if first is None or first['@table_index'] != 0:
            # Instead of returning here, one could make a point request to
            # the avatars service, possibly even cached in a dynamic table.
            # The class would then need cache_id_field and cache_value_field
            # passed in, to know where to take the URL from and where to put
            # the response.
            return
        for row in rows:
            # Fields of the first row win on conflict, so every emitted row
            # ends up with '@table_index' == 0.
            row.update(first)
            yield row


class FinalJoinReducer():
    """Reduce job: collect joined values into list columns of the first row.

    For every label in ``path_dict.values()`` the output row gets a list
    column holding the ``cache_value_field`` values of the group's remaining
    rows that carry that label in ``'__result_field'``.
    """

    def __init__(self, path_dict, cache_value_field):
        # only the values (labels) are used here; they name the list columns
        self.path_dict = path_dict
        # name of the column whose values are collected into the lists
        self.cache_value_field = cache_value_field

    def __call__(self, key, rows):
        """Yield the group's first row extended with the collected lists.

        Yields nothing when the group is empty or when its first row does
        not come from table 0. Every remaining row must carry a
        ``'__result_field'`` naming one of the list columns and a
        ``cache_value_field`` value; otherwise ``KeyError`` is raised.
        """
        rows = iter(rows)
        # The default keeps an empty group from leaking StopIteration out of
        # this generator, which Python 3.7+ turns into RuntimeError.
        result = next(rows, None)
        if result is None or result['@table_index'] != 0:
            return
        # Start every configured label with an empty list so that labels
        # without any joined row still appear in the output.
        for result_field in self.path_dict.values():
            result[result_field] = []
        for row in rows:
            result[row['__result_field']].append(row[self.cache_value_field])
        yield result


def ololo(input_table, output_table, cache_table, cache_id_field, cache_value_field, info_field, unique_field, info_config, yt):
    """Join values extracted from ``input_table`` against ``cache_table``.

    Steps, all run through the supplied ``yt`` client on temporary tables:

    1. Map ``input_table`` (columns ``info_field`` and ``unique_field``)
       with ``Mapper`` into one row per extracted value.
    2. Sort those rows and a copy of ``cache_table`` by ``cache_id_field``
       and reduce them with ``InnerJoinReducer``.
    3. Sort the joined rows and a copy of ``input_table`` by
       ``unique_field`` and reduce them with ``FinalJoinReducer`` into
       ``output_table``.

    In both reduces the order of the input tables matters: the reducers
    only emit output for groups whose first row has ``'@table_index'`` 0.
    """
    def with_columns(table, first_column, second_column):
        # Builds '<table>{<first>,<second>}' (YPath column selector).
        return table + '{' + first_column + ',' + second_column + '}'

    with yt.TempTable() as urls_tmp, \
         yt.TempTable() as cache_tmp, \
         yt.TempTable() as banners_tmp, \
         yt.TempTable() as joined_tmp:
        # Step 1: one row per (unique_field, label, extracted value).
        url_mapper = Mapper(info_config, info_field, unique_field, cache_id_field)
        mapper_input = with_columns(input_table, info_field, unique_field)
        yt.run_map(url_mapper, mapper_input, urls_tmp)

        # Step 2: join extracted values with the cache on cache_id_field.
        yt.run_sort(urls_tmp, sort_by=[cache_id_field])
        # If the cache were already sorted by cache_id_field (unclear why it
        # currently is not), this sort would be unnecessary.
        yt.run_sort(cache_table, cache_tmp, sort_by=[cache_id_field])
        cache_side = with_columns(cache_tmp, cache_id_field, cache_value_field)
        yt.run_reduce(
            InnerJoinReducer(),
            [cache_side, urls_tmp],
            joined_tmp,
            reduce_by=[cache_id_field],
            format=yt.YsonFormat(control_attributes_mode="row_fields"),
        )

        # Step 3: gather the joined values back onto the original rows.
        yt.run_sort(joined_tmp, sort_by=[unique_field])
        # If the banners were already sorted by unique_field, this sort
        # would be unnecessary.
        yt.run_sort(input_table, banners_tmp, sort_by=[unique_field])
        final_reducer = FinalJoinReducer(info_config, cache_value_field)
        yt.run_reduce(
            final_reducer,
            [banners_tmp, joined_tmp],
            output_table,
            reduce_by=[unique_field],
            format=yt.YsonFormat(control_attributes_mode="row_fields"),
        )


def main():
    """Configure the module-level ``yt`` client and run ``ololo`` once.

    Sets the client config entries below, then runs the pipeline on the
    hard-coded test tables under ``//home/bannerland/test``.
    """
    # Client configuration.
    yt.config['mount_sandbox_in_tmpfs'] = True
    yt.config['token_path'] = '/opt/broadmatching/secrets/tokens/yt_plato'
    yt.config['spec_defaults'] = {'pool': 'catalogia'}
    yt.config["proxy"]["url"] = "hahn.yt.yandex.net"

    ololo(
        input_table='//home/bannerland/test/perf_avatars_input_banners',
        output_table='//home/bannerland/test/perf_avatars_result',
        cache_table='//home/bannerland/test/perf_avatars_cache',
        cache_id_field='url',
        cache_value_field='avatars',
        info_field='Info',
        unique_field='bannerphrase_md5',
        # path inside the parsed 'Info' JSON -> name of the output list column
        info_config={'images': 'other', 'image': 'main'},
        yt=yt,
    )

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
