#!/usr/bin/python
# -*- coding: utf-8 -*-

# Select categories for vector clustering.

import sys
import re
import random
import yt.wrapper as yt


def ctg_list(key, recs):
    """Reducer: emit one category-name row per group.

    Args:
        key: Reduce key of the group. It is not read here.
        recs: Iterable over the rows of the group. Only the first row is
            consumed; each row is expected to carry a 'ctg' field.

    Yields:
        A single ``{"CategoryName": <ctg of the first row>}`` dict, or
        nothing when the group is empty or the name contains ' _ '
        (such names are "virtual" categories according to the original
        author's comment and are ignored).
    """
    # The builtin next() works on Python 2.6+ and Python 3, whereas the
    # iterator method .next() exists on Python 2 only. The default keeps an
    # empty group silent instead of leaking StopIteration out of a generator.
    rec = next(iter(recs), None)
    if rec is None:
        return
    if re.search(r' _ ', rec['ctg']):
        return  # virtual categories are not taken into account
    yield {"CategoryName": rec['ctg']}


def ctg_select(key, recs):
    """Reducer: pair a selected category name with its CategoryID values.

    Args:
        key: Reduce key; its 'CategoryName' field is read.
        recs: Iterable of row dicts. Every row must carry '@table_index';
            it is removed from the row while processing.

    Yields:
        ``{"CategoryName": ..., "CategoryID": ...}`` for every row whose
        '@table_index' is not 0, but only after a row with '@table_index'
        equal to 0 and a non-empty 'CategoryName' has been seen in the
        group. Nothing is yielded for names containing ' _ ' ("virtual"
        categories according to the original author's comment).

    NOTE(review): this relies on table-0 rows arriving before the other
    rows of the same group -- confirm against the reduce ordering rules.
    """
    is_virtual = re.search(r' _ ', key['CategoryName']) is not None
    if is_virtual:
        return

    wanted_name = ''  # stays empty until a table-0 row marks the group
    for row in recs:
        source_table = row.pop('@table_index')
        if source_table == 0:
            wanted_name = row['CategoryName']
            continue
        if wanted_name == '':
            continue
        yield {"CategoryName": key['CategoryName'], "CategoryID": row['CategoryID']}


def vect_select(key, recs):
    """Reducer: pass through the rows of a group that has been selected.

    Args:
        key: Reduce key of the group. It is not read here.
        recs: Iterable of row dicts. Every row must carry '@table_index';
            it is removed from the row while processing.

    Yields:
        Each row whose '@table_index' is not 0 (without its '@table_index'
        field), but only after a row with '@table_index' equal to 0 has
        been seen in the same group.

    NOTE(review): this relies on table-0 rows arriving before the other
    rows of the same group -- confirm against the reduce ordering rules.
    """
    # A boolean flag instead of comparing the stored CategoryID with 0:
    # with the old sentinel a selected category whose ID is 0 was treated
    # as "not selected" and all of its rows were dropped.
    selected = False
    for rec in recs:
        table_index = rec.pop('@table_index')
        if table_index == 0:
            selected = True
        elif selected:
            yield rec


def main():
    """Run the YT operations that build the per-category embedding table.

    Only the last stage (sort the embeddings by CategoryID, reduce them
    against the selected categories, sort the result) is executed. The
    earlier stages are kept below as commented-out calls.
    NOTE(review): presumably their output tables already exist -- confirm
    before running against a clean cluster.
    """
    id_column = 'CategoryID'

    # Table paths, grouped by the stage that uses them.
    sem_cores = '//home/catalogia/users/yuryz/hier_clust/sem_cores'
    ctg_names = '//home/catalogia/users/yuryz/vect_clust/ctgs'

    tree_src = '//home/catalogia/contest/CategoriesTree'
    tree_by_name = '//tmp/yuryz/CategoriesTree'

    selected_ctgs = '//home/catalogia/users/yuryz/vect_clust/ctgs_vect'

    emb_src = '//home/catalogia/contest/CategoriesTreeEmb'
    emb_by_id = '//tmp/yuryz/CategoriesTreeEmb'

    selected_emb = '//home/catalogia/users/yuryz/vect_clust/CategoriesTreeEmb'

    # Disabled stage: build the list of category names.
    # yt.run_sort(sem_cores, '//tmp/yuryz/t1', sort_by=['ctg'])
    # yt.run_reduce(ctg_list, '//tmp/yuryz/t1', ctg_names, reduce_by=['ctg'])
    # yt.run_sort(ctg_names, sort_by=['CategoryName'])
    # print >>sys.stderr, yt.row_count(ctg_names)

    # Disabled stage: sort the category tree by name.
    # yt.run_sort(tree_src, tree_by_name, sort_by=['CategoryName'])

    # Disabled stage: attach CategoryID to the selected category names.
    # yt.run_reduce(ctg_select, [ctg_names, tree_by_name], selected_ctgs, reduce_by=['CategoryName'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    # yt.run_sort(selected_ctgs, sort_by=['CategoryID'])

    # Active stage: keep only the embeddings of the selected categories.
    yt.run_sort(emb_src, emb_by_id, sort_by=[id_column])

    row_fields_format = yt.YsonFormat(control_attributes_mode="row_fields")
    yt.run_reduce(
        vect_select,
        [selected_ctgs, emb_by_id],
        selected_emb,
        reduce_by=[id_column],
        format=row_fields_format,
    )
    yt.run_sort(selected_emb, sort_by=[id_column])


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
