#!/usr/bin/python
# -*- coding: utf-8 -*-

# Select combinations of words from a banner to form clusters.

import itertools
import re
import yt.wrapper as yt


def mapper(rec):
    """Yield sorted word combinations for one banner record.

    The text in ``rec['bnorm']`` is split on whitespace and de-duplicated.
    Tokens consisting only of ASCII digits and tokens starting with an
    underscore are dropped; the remaining words are sorted.

    * Three or more words left: one row per 3-word combination
      (n choose 3), words joined by a single space.
    * One or two words left: a single row holding all of them.
    * No words left: nothing is yielded.

    Each yielded row is a dict with the key ``comb`` (the joined words) plus
    ``mctgs`` and ``bid`` copied unchanged from ``rec``.
    """
    words = sorted(
        wrd for wrd in set(rec['bnorm'].split())
        if not (re.match(r'[0-9]+$', wrd) or wrd.startswith('_'))
    )

    if len(words) >= 3:
        # Combinations of n words taken 3 at a time. Consumed lazily so the
        # full set of tuples is never held in memory at once.
        groups = itertools.combinations(words, 3)
    elif words:
        # Too few words for a triple: emit them all as one combination.
        groups = [words]
    else:
        return

    for group in groups:
        yield {"comb": " ".join(group), "mctgs": rec['mctgs'], "bid": rec['bid']}


def main():
    """Run the combination mapper over the source table and report row counts.

    Maps ``mapper`` from the normalized-banner table into the combinations
    table, sorts the combinations table by ``comb``, ``mctgs`` and ``bid``,
    then prints the row count of the source table followed by the row count
    of the combinations table.
    """
    src_table = '//home/catalogia/users/yuryz/virt/virt_pref_norm'
    dst_table = '//home/catalogia/users/yuryz/virt/virt_pref_comb'

    yt.run_map(mapper, src_table, dst_table)
    yt.run_sort(dst_table, sort_by=['comb', 'mctgs', 'bid'])

    # Single-argument parenthesized print behaves the same on Python 2 and 3.
    print(yt.row_count(src_table))
    print(yt.row_count(dst_table))


# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
