#!/usr/bin/python
# -*- coding: utf-8 -*-

# Build a rank-based word search index for categories.

import yt.wrapper as yt


def sel_rank(key, recs):
    """Emit one row per word of every phrase, ranked by its position.

    Each record's ``clast_phrase`` value is split on whitespace; every
    resulting token is yielded together with its 1-based position in the
    phrase (``rank``) and the ``mctgs`` value taken from the reduce key.

    :param key: reduce key; ``key['mctgs']`` is copied into every output row.
    :param recs: iterable of records, each carrying a ``clast_phrase`` string.
    :return: generator of ``{"word", "rank", "mctgs"}`` dicts.
    """
    for rec in recs:
        tokens = rec['clast_phrase'].split()
        for position, token in enumerate(tokens, 1):
            yield {"word": token, "rank": position, "mctgs": key['mctgs']}


def del_dup(key, recs):
    """Collapse a reduce group to its first record, dropping duplicates.

    All records in a group share the same reduce key, so keeping only the
    first one removes the duplicates.

    :param key: reduce key (unused).
    :param recs: iterable of records belonging to one key group.
    :return: generator yielding at most one record: the first of ``recs``.
    """
    # A loop is used instead of ``recs.next()`` so the reducer works with
    # any iterable on both Python 2 and Python 3, and an empty group simply
    # produces no output instead of leaking StopIteration out of the
    # generator.
    for rec in recs:
        yield rec
        return


def make_index(key, recs):
    """Build the index entry for one word.

    Consecutive records with the same ``mctgs`` value are merged into one
    line of the form ``mctgs<TAB>rank<TAB>rank...``; the lines are joined
    with newlines in the order the categories are first seen.

    :param key: reduce key; ``key['word']`` becomes the output ``word``.
    :param recs: iterable of records with ``mctgs`` (string) and ``rank``
        fields. Only adjacent equal ``mctgs`` values are merged, so the
        input is expected to be grouped by ``mctgs``.
    :return: generator yielding a single ``{"word", "mctgs_list"}`` dict,
        or nothing when ``recs`` is empty.
    """
    lines = []      # one [mctgs, rank, rank, ...] list per category run
    current = None  # the run being filled; None until the first record

    for rec in recs:
        mctgs = rec['mctgs']
        # ``None`` (not '') marks "no run yet", so an empty-string category
        # is treated like any other value instead of resetting the state.
        if current is None or current[0] != mctgs:
            current = [mctgs]
            lines.append(current)
        current.append(str(rec['rank']))

    if not lines:
        # Nothing to index for an empty group.
        return

    mctgs_list = '\n'.join('\t'.join(parts) for parts in lines)
    yield {"word": key['word'], "mctgs_list": mctgs_list}


def main():
    """Run the index-building stage on YT and print table row counts.

    Reduces the de-duplicated word/rank table by ``word`` with
    ``make_index``, sorts the resulting index by ``word``, then prints the
    row count of each of the four pipeline tables.
    """
    tab1 = '//home/catalogia/users/yuryz/tmp/clast_by_ctg'
    tab2 = '//tmp/yuryz/wrd2rank'

    # Stage 1 (currently disabled): split phrases into ranked words.
    #yt.run_reduce(sel_rank, tab1, tab2, reduce_by = ['mctgs'])
    #yt.run_sort(tab2, sort_by=['word', 'rank', 'mctgs'])

    tab3 = '//tmp/yuryz/wrd2rank_no_dup'

    # Stage 2 (currently disabled): drop duplicate (word, rank, mctgs) rows.
    #yt.run_reduce(del_dup, tab2, tab3, reduce_by = ['word', 'rank', 'mctgs'])
    #yt.run_sort(tab3, sort_by=['word', 'mctgs', 'rank'])

    tab4 = '//home/catalogia/users/yuryz/tmp/word_index'

    # Stage 3: build the per-word index from tab3.
    # NOTE(review): with stages 1-2 disabled, tab2 and tab3 presumably
    # already exist from an earlier run - confirm before running.
    yt.run_reduce(make_index, tab3, tab4, reduce_by=['word'])
    yt.run_sort(tab4, sort_by=['word'])

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    for table in (tab1, tab2, tab3, tab4):
        print(yt.row_count(table))


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
