#!/usr/bin/python
# -*- coding: utf-8 -*-

#поиск категорий, тематически входящих одна в другую

import math
import yt.wrapper as yt


def ctgs_topic_num(key, recs):
    """Count the rows of one ``ctg`` group and emit that count twice.

    Reducer over rows grouped by ``ctg`` (the original note describes the
    result as the number of topics in each individual category).

    Args:
        key: mapping holding the group's ``ctg`` value.
        recs: iterable of the rows in the group; only their number matters.

    Yields:
        ``{"ctg1": <ctg>, "topic_num": <count>}`` and then
        ``{"ctg2": <ctg>, "topic_num": <count>, "@table_index": 1}``.
    """
    # Row contents are never inspected, only counted.
    topic_count = sum(1 for _ in recs)
    category = key['ctg']

    as_first_category = {"ctg1": category, "topic_num": topic_count}
    # The explicit "@table_index" presumably routes this row to the second
    # output table when run with row_fields control attributes -- confirm.
    as_second_category = {
        "ctg2": category,
        "topic_num": topic_count,
        "@table_index": 1,
    }

    yield as_first_category
    yield as_second_category


def ctg1_ctg2_topic_num(key, recs):
    """Count the rows of one ``(ctg1, ctg2)`` group.

    Reducer over rows grouped by the category pair (the original note
    describes the result as the number of topics the two categories share).

    Args:
        key: mapping holding the group's ``ctg1`` and ``ctg2`` values.
        recs: iterable of the rows in the group; only their number matters.

    Yields:
        A single row with ``ctg1``, ``ctg2`` and ``ctg1_ctg2_topic_num``,
        where the latter is the NEGATED row count.
    """
    shared_count = sum(1 for _ in recs)
    # Stored negated -- presumably so that an ascending sort on this column
    # lists the pairs with the most rows first; confirm with the consumer.
    pair_row = {
        "ctg1": key['ctg1'],
        "ctg2": key['ctg2'],
        "ctg1_ctg2_topic_num": -shared_count,
    }
    yield pair_row


def add_ctg1_topic_num(key, recs):
    """Add the ``ctg1_topic_num`` field to the pair rows of one ``ctg1`` group.

    Reducer over two inputs grouped by ``ctg1``. Rows tagged with
    ``@table_index`` 0 supply ``topic_num``; every other row that follows
    such a row is emitted with that value stored under ``ctg1_topic_num``.

    Args:
        key: mapping with the group's key columns. It is not read; the
            parameter is kept because the reducer signature requires it.
        recs: iterable of row dicts, each carrying an ``@table_index`` entry.
            The entry is removed from the row before any further processing.

    Yields:
        The rows with a non-zero table index, extended with
        ``ctg1_topic_num``. Such rows seen before any table-0 row of the
        group are dropped, since there is no count to attach.

    Raises:
        KeyError: if a row has no ``@table_index`` entry, or a table-0 row
            has no ``topic_num`` entry.
    """
    # An explicit flag instead of an empty-string sentinel: the sentinel made
    # a group whose ctg1 is the empty string look like "no count row seen",
    # so all of its pair rows were silently discarded.
    have_count = False
    ctg1_num = None
    for rec in recs:
        table_index = rec.pop('@table_index')
        if table_index == 0:
            ctg1_num = rec['topic_num']
            have_count = True
        elif have_count:
            # Assumes the table-0 row precedes the pair rows within a group
            # -- TODO confirm against the YT reduce ordering guarantees.
            rec['ctg1_topic_num'] = ctg1_num
            yield rec


def add_ctg2_topic_num(key, recs):
    """Add the ``ctg2_topic_num`` field to the pair rows of one ``ctg2`` group.

    Reducer over two inputs grouped by ``ctg2``. Rows tagged with
    ``@table_index`` 0 supply ``topic_num``; every other row that follows
    such a row is emitted with that value stored under ``ctg2_topic_num``.

    Args:
        key: mapping with the group's key columns. It is not read; the
            parameter is kept because the reducer signature requires it.
        recs: iterable of row dicts, each carrying an ``@table_index`` entry.
            The entry is removed from the row before any further processing.

    Yields:
        The rows with a non-zero table index, extended with
        ``ctg2_topic_num``. Such rows seen before any table-0 row of the
        group are dropped, since there is no count to attach.

    Raises:
        KeyError: if a row has no ``@table_index`` entry, or a table-0 row
            has no ``topic_num`` entry.
    """
    # An explicit flag instead of an empty-string sentinel: the sentinel made
    # a group whose ctg2 is the empty string look like "no count row seen",
    # so all of its pair rows were silently discarded.
    have_count = False
    ctg2_num = None
    for rec in recs:
        table_index = rec.pop('@table_index')
        if table_index == 0:
            ctg2_num = rec['topic_num']
            have_count = True
        elif have_count:
            # Assumes the table-0 row precedes the pair rows within a group
            # -- TODO confirm against the YT reduce ordering guarantees.
            rec['ctg2_topic_num'] = ctg2_num
            yield rec


def main():
    """Run the YT reduce/sort chain that builds the category-pair table.

    Steps, in order:
      1. Count rows per ``(ctg1, ctg2)`` pair of the cross table.
      2. Join the per-``ctg1`` counts onto the pair rows.
      3. Join the per-``ctg2`` counts onto the pair rows.
      4. Sort the result by ``ctg1``, ``ctg1_ctg2_topic_num``, ``ctg2``.

    The per-category count tables are only read here; the step that would
    produce them is disabled below, so they must already exist.
    """

    def row_fields_format():
        # Each operation gets its own format object, as in the inline calls.
        return yt.YsonFormat(control_attributes_mode="row_fields")

    topic_weight = '//tmp/yuryz/topic_weight'
    ctg1_counts = '//tmp/yuryz/ctg1_topic_num'
    ctg2_counts = '//tmp/yuryz/ctg2_topic_num'

    # Disabled: builds the two per-category count tables from topic_weight.
    #yt.run_reduce(ctgs_topic_num, topic_weight, [ctg1_counts, ctg2_counts], reduce_by = ['ctg'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(ctg1_counts, sort_by=['ctg1'])
    #yt.run_sort(ctg2_counts, sort_by=['ctg2'])

    cross_table = '//home/catalogia/users/yuryz/compare/ctg_cross'
    pair_counts = '//tmp/yuryz/ctg1_ctg2_topic_num'

    # Step 1: one row per category pair with its (negated) row count.
    yt.run_reduce(
        ctg1_ctg2_topic_num,
        cross_table,
        [pair_counts],
        reduce_by=['ctg1', 'ctg2'],
        format=row_fields_format(),
    )
    yt.run_sort(pair_counts, sort_by=['ctg1', 'ctg2'])

    # NOTE(review): this is the same path as pair_counts, so step 2 writes
    # its output over one of its own inputs -- confirm this is intended.
    pairs_with_ctg1 = '//tmp/yuryz/ctg1_ctg2_topic_num'

    # Step 2: attach ctg1_topic_num; the count table is listed first so its
    # rows carry table index 0 inside the reducer.
    yt.run_reduce(
        add_ctg1_topic_num,
        [ctg1_counts, pair_counts],
        [pairs_with_ctg1],
        reduce_by=['ctg1'],
        format=row_fields_format(),
    )
    yt.run_sort(pairs_with_ctg1, sort_by=['ctg2', 'ctg1'])

    result_table = '//home/catalogia/users/yuryz/compare/ctg1_ctg2_topic_num'

    # Step 3: attach ctg2_topic_num in the same way, grouped by ctg2.
    yt.run_reduce(
        add_ctg2_topic_num,
        [ctg2_counts, pairs_with_ctg1],
        [result_table],
        reduce_by=['ctg2'],
        format=row_fields_format(),
    )
    # Step 4: final ordering of the result table.
    yt.run_sort(result_table, sort_by=['ctg1', 'ctg1_ctg2_topic_num', 'ctg2'])


if __name__ == '__main__':
    main()
