#!/usr/bin/python
# -*- coding: utf-8 -*-

# Build the semantic profile of categories.

import yt.wrapper as yt


def sem_prof(key, recs):
    """Reducer: build the semantic profile of one category.

    Consecutive records whose ``clast_phrase`` starts with the phrase of the
    currently open cluster are treated as nested clusters and folded into it:
    their ``clast_size`` values are summed under the opening phrase.

    Args:
        key: mapping holding the reduce key; only ``key['mctgs']`` is read.
        recs: iterable of mappings with ``clast_phrase`` and ``clast_size``.
            Assumes records arrive ordered so that nested phrases directly
            follow their head phrase (e.g. sorted by ``clast_phrase``) --
            TODO confirm against the input table's sort order.

    Yields:
        One dict per merged cluster with keys ``mctgs``, ``clast_phrase``
        (the head phrase) and ``clast_size`` (the summed size). Nothing is
        yielded when ``recs`` is empty.
    """
    # None (not '') marks "no cluster opened yet": an empty phrase is a valid
    # value and must not be mistaken for the initial state, otherwise its
    # size would be silently overwritten by the next record.
    head_phrase = None
    total_size = 0
    for rec in recs:
        phrase = rec['clast_phrase']
        if head_phrase is not None and phrase.startswith(head_phrase):
            # Nested cluster: fold its size into the open head cluster.
            # NOTE(review): this is a plain string-prefix test, so 'car' also
            # absorbs 'carpet' -- confirm that is the intended nesting rule.
            total_size += rec['clast_size']
            continue
        if head_phrase is not None:
            # A new, unrelated phrase closes the previous cluster.
            yield {"mctgs": key['mctgs'], "clast_phrase": head_phrase, "clast_size": total_size}
        head_phrase = phrase
        total_size = rec['clast_size']
    # Flush the last open cluster; skip when no records were seen at all.
    if head_phrase is not None:
        yield {"mctgs": key['mctgs'], "clast_phrase": head_phrase, "clast_size": total_size}


def main():
    """Reduce the cluster table into the profile table, sort it, print row counts."""
    src_table = '//home/catalogia/users/yuryz/tmp/clast_by_ctg'
    dst_table = '//home/catalogia/users/yuryz/tmp/sem_prof_of_ctg'

    # Run sem_prof once per 'mctgs' group of the source table.
    # NOTE(review): sem_prof looks order-dependent on clast_phrase within a
    # group; no sort_by is passed here -- confirm the source table's sort order.
    yt.run_reduce(sem_prof, src_table, dst_table, reduce_by=['mctgs'])

    # Alternative ordering kept for reference:
    # ['mctgs', 'clast_phrase', 'clast_size'].
    yt.run_sort(dst_table, sort_by=['mctgs', 'clast_size', 'clast_phrase'])

    # Print the row count of the input table, then of the output table.
    for table in (src_table, dst_table):
        print(yt.row_count(table))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
