#!/usr/bin/python
# -*- coding: utf-8 -*-

# Fix categories using their nearest neighbours

import sys
import re
import math
import copy

import yt.wrapper as yt

def block_sens_range(key, recs):
    """Yield the first record of every run of consecutive ``row_index`` values.

    Used as the reducer passed to ``yt.run_reduce`` in ``main``, where the
    input is reduced by ``mctgs`` after being sorted by
    ``['mctgs', 'row_index']``.  Records whose ``row_index`` values follow
    each other with a step of exactly 1 form one run; only the record that
    opens the run is emitted.

    Args:
        key: reduce key supplied by the caller; not used here.
        recs: iterable of mapping-like records, each with a ``row_index``
            entry.  The run detection only makes sense when they arrive in
            ascending ``row_index`` order.

    Yields:
        The opening record of each run, in input order.  Nothing is yielded
        for an empty ``recs``.
    """
    run_head = None    # record that opened the current run; None before the first record
    last_index = None  # row_index of the most recently consumed record

    for rec in recs:
        index = rec['row_index']
        if run_head is None:
            # Very first record: it opens the first run.  Testing the record
            # itself (rather than a magic index value) keeps any row_index,
            # including -1, usable.
            run_head = rec
        elif index != last_index + 1:
            # Gap (or repeated index): close the current run, start a new one.
            yield run_head
            run_head = rec
        last_index = index

    # Flush the final run; skip it when there was no input at all so that
    # None is never emitted as a record.
    if run_head is not None:
        yield run_head


def _neighbours_outweigh(left, middle, right, size_min, size_max):
    """Check whether ``middle`` is a small record between two large, matching neighbours.

    Returns True only when all of the following hold (evaluated in this
    order, stopping at the first failure):

    * ``left`` and ``right`` have equal ``mctgs``;
    * ``left['size'] + right['size']`` is greater than ``middle['size']``;
    * ``left['size']`` and ``right['size']`` are both at least ``size_min``;
    * ``middle['size']`` is at most ``size_max``;
    * ``middle['disamb'] == 1``;
    * ``left['mctgs']``, used as a regular expression, matches somewhere in
      ``middle['mctgs']``.
    """
    # NOTE(review): ``left['mctgs']`` is handed to ``re`` as a pattern, so any
    # regex metacharacters in it are interpreted.  Kept as is because it is
    # unclear whether that is intentional -- confirm, and switch to
    # ``re.escape`` / ``in`` if a plain substring test was meant.
    return bool(
        left['mctgs'] == right['mctgs']
        and left['size'] + right['size'] > middle['size']
        and left['size'] >= size_min
        and right['size'] >= size_min
        and middle['size'] <= size_max
        and middle['disamb'] == 1
        and re.findall(left['mctgs'], middle['mctgs'])
    )


def _print_record(rec):
    """Print one tab-separated line: bid, mctgs, size, mctgs, domain, domain, domain."""
    fields = [
        str(rec['bid']),
        rec['mctgs'],
        str(rec['size']),
        rec['mctgs'],
        rec['domain'],
        rec['domain'],
        rec['domain'],
    ]
    # Full copy of the record's own values, no substitutions.  A single
    # parenthesised expression prints identically under the Python 2 print
    # statement and the Python 3 print function.
    print('\t'.join(fields))


def main():
    """Build the run table in YT and report small records flanked by large matching ones.

    Steps:
      1. Sort ``tab1`` into ``tab2`` by ``mctgs`` and ``row_index``.
      2. Reduce ``tab2`` by ``mctgs`` with ``block_sens_range`` into ``tab3``.
      3. Sort ``tab3`` in place by ``row_index``, ``size``, ``mctgs``, ``sense``.
      4. Slide a three-record window over ``tab3``; whenever the window
         passes ``_neighbours_outweigh``, print the window's last record to
         stdout.

    Tables with fewer than three rows produce no output.
    """
    # Alternative input used previously:
    #   //home/catalogia/users/yuryz/etalon/block_sens_num
    tab1 = '//tmp/yuryz/block_sens_size_max'  # see ctgs_disamb.py
    tab2 = '//tmp/yuryz/block_sens_num'

    yt.run_sort(tab1, tab2, sort_by=['mctgs', 'row_index'])

    tab3 = '//tmp/yuryz/block_sens_range'

    yt.run_reduce(block_sens_range, tab2, tab3, reduce_by=['mctgs'])
    yt.run_sort(
        tab3,
        sort_by=['row_index', 'size', 'mctgs', 'sense'],
        job_io={"table_writer": {"max_key_weight": 131072}},
    )

    SIZE_MIN = 20  # minimum size threshold for the neighbours
    SIZE_MAX = 2   # maximum size threshold for the banner (the middle record)

    # ``left`` and ``middle`` hold the two records read before ``right``.
    # ``left`` stays None until three records have been seen, so short tables
    # never reach the check (previously they crashed on subscripting None).
    left = None
    middle = None
    for right in yt.read_table(tab3, raw=False):
        if left is not None and _neighbours_outweigh(left, middle, right, SIZE_MIN, SIZE_MAX):
            _print_record(right)
        left, middle = middle, right


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
