#!/usr/bin/python
# -*- coding: utf-8 -*-

# Study of the cluster size distribution

import sys
import re
import math

import yt.wrapper as yt


def clast_size(key, recs):
    """Reducer: emit the length of every run of equal ``mctgs`` in one group.

    Args:
        key: mapping with the reduce key; only ``key['clast_phrase']`` is read.
        recs: iterable of the group's records; each must have an ``mctgs``
            field. Equal ``mctgs`` values are expected to be adjacent (the
            caller sorts the input by ``clast_phrase, mctgs`` beforehand).

    Yields:
        One dict per run of equal ``mctgs`` values with the keys
        ``clast_phrase``, ``mctgs`` and ``clast_size``. ``clast_size`` is the
        NEGATED run length - presumably so that an ascending sort by this
        column lists the biggest clusters first (TODO confirm).
        Nothing is yielded when ``recs`` is empty.
    """
    phrase = key['clast_phrase']
    run_mctgs = None
    # run_len == 0 means "no run started yet". A counter is used instead of a
    # magic value such as '' so that records whose mctgs really is '' (or
    # None) are counted like any other category.
    run_len = 0
    for rec in recs:
        mctgs = rec['mctgs']
        if run_len and mctgs == run_mctgs:
            run_len += 1
            continue
        if run_len:
            # COMMENT OUT THIS STATEMENT when outputting UNAMBIGUOUS
            # categories (then only the last run of the group is emitted).
            yield {"clast_phrase": phrase, "mctgs": run_mctgs, "clast_size": -run_len}
        run_mctgs = mctgs
        run_len = 1
    if run_len:
        yield {"clast_phrase": phrase, "mctgs": run_mctgs, "clast_size": -run_len}


def main():
    """Build the cluster-size tables from the table named on the command line.

    Reads the source table path from ``sys.argv[1]``, sorts it, reduces it
    with ``clast_size`` and stores the result sorted in three different
    orders. Finally prints the row count of the sorted source copy and of the
    three result tables, one number per line.
    """
    # Expected input: //home/catalogia/users/yuryz/tmp/bnrs_norm_sense_disamb
    # (see disamb.py).
    source_table = sys.argv[1]

    sorted_source = '//tmp/yuryz/bnrs_norm_sense'
    by_size = '//home/catalogia/users/yuryz/tmp/clast_by_size'
    by_ctg = '//home/catalogia/users/yuryz/tmp/clast_by_ctg'
    by_phrase = '//home/catalogia/users/yuryz/tmp/clast_by_phrase'

    # The reducer relies on records being ordered by mctgs inside each
    # clast_phrase group, hence the sort before the reduce.
    yt.run_sort(
        source_table,
        sorted_source,
        sort_by=['clast_phrase', 'mctgs', 'bid'],
    )
    yt.run_reduce(clast_size, sorted_source, by_size, reduce_by='clast_phrase')

    # No destination is passed here, so the reduce output itself is re-sorted.
    yt.run_sort(by_size, sort_by=['clast_size', 'clast_phrase', 'mctgs'])

    # Two more copies of the same data in alternative orders.
    yt.run_sort(by_size, by_ctg, sort_by=['mctgs', 'clast_phrase', 'clast_size'])
    yt.run_sort(by_size, by_phrase, sort_by=['clast_phrase', 'mctgs', 'clast_size'])

    for table in (sorted_source, by_size, by_ctg, by_phrase):
        print(yt.row_count(table))


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
