#!/usr/bin/python
# -*- coding: utf-8 -*-

# Apply the category rules to Yura's shard #2 (index == 1)

import random
import sys
import re
import yt.wrapper as yt


def sel_shard(rec):
    """Mapper: keep only the rows that belong to shard #2.

    Yields ``rec`` unchanged when ``rec['shard'] == 1`` (shard #2 has
    index 1) and yields nothing otherwise.
    """
    if rec['shard'] != 1:
        return
    yield rec


def convert(rec):
    """Mapper: rename the source columns and assign each row to a piece.

    The piece number is the row index divided by 10000, rounded down, so
    every 10000 consecutive rows share one piece.  The ``@row_index``
    control attribute is removed from ``rec`` as a side effect.

    Yields a single dict with the keys ``piece``, ``bid``, ``title``,
    ``body``, ``url``, ``auto_category_ids``, ``shard`` and ``hash``.
    """
    # Floor division keeps the piece number an integer under both
    # Python 2 and Python 3 (plain "/" is true division in Python 3).
    piece = rec.pop("@row_index") // 10000
    yield {
        "piece": piece,
        "bid": rec['BannerID'],
        "title": rec['Title'],
        "body": rec['Body'],
        "url": rec['Url'],
        "auto_category_ids": rec['AutoCategoryIDs'],
        "shard": rec['shard'],
        "hash": rec['hash'],
    }


def change_mctgs(key, recs):
    """Reducer: replace shard #2 categories using banners_extended.

    Rows from input table 0 (banners_extended) are remembered; the latest
    one seen is the reference.  Every row from the other table is yielded
    only when a reference with a non-zero ``bid`` has been seen, its
    ``title`` and ``body`` equal the reference's, and the reference
    ``mctgs`` is not empty.  Yielded rows get ``mctgs`` and
    ``auto_category_ids`` overwritten from the reference (``mctgs`` and
    ``CategoryIDs`` respectively).  ``@table_index`` is removed from every
    row.
    """
    ext_bid = 0  # 0 means "no banners_extended row seen yet"
    for row in recs:
        if row.pop('@table_index') == 0:  # banners_extended
            ext_bid = row['bid']
            ext_title, ext_body = row['title'], row['body']
            ext_mctgs = row['mctgs']
            ext_category_ids = row['CategoryIDs']
            continue
        if ext_bid == 0:
            continue
        same_text = ext_title == row['title'] and ext_body == row['body']
        if not same_text or ext_mctgs == "":
            continue
        row['mctgs'] = ext_mctgs  # insert the category name
        row['auto_category_ids'] = ext_category_ids
        yield row


def rules4ctgs(key, recs):
    """Reducer: apply the category rules.

    Rows from input table 0 are rules: their ``mctgs`` and
    ``CategoryNames`` are remembered and the rule row itself is not
    emitted.  Every other row is always yielded; when a rule with a
    non-empty ``mctgs`` has been seen, the row's ``mctgs`` is first
    replaced by the rule's ``CategoryNames``.  ``@table_index`` is removed
    from every row.
    """
    rule_mctgs = ''  # empty string means "no applicable rule"
    for row in recs:
        if row.pop('@table_index') == 0:  # category rules
            rule_mctgs = row['mctgs']
            replacement = row['CategoryNames']
            continue
        if rule_mctgs != '':
            row['mctgs'] = replacement
        yield row


def sel_block_info(key, recs):
    """Reducer: attach block information to each shard row.

    Rows from input table 0 (block_sens_num) supply ``mctgs``,
    ``block_index`` and ``row_index``; they are remembered and not
    emitted.  Every other row (TrainTextShard_2_upd) is yielded with three
    extra fields, ``block_mctgs``, ``block_index`` and ``row_index``:
    copied from the remembered block row when one with a non-zero ``bid``
    has been seen, otherwise set to the row's own ``mctgs``, -1 and -1.
    ``@table_index`` is removed from every row.
    """
    block_bid = 0  # 0 means "no block row seen yet"
    for row in recs:
        if row.pop('@table_index') == 0:  # block_sens_num
            block_bid = row['bid']
            block_info = (row['mctgs'], row['block_index'], row['row_index'])
            continue
        if block_bid != 0:
            mctgs, block_index, row_index = block_info
        else:
            # No block known for this banner: fall back to its own category.
            mctgs, block_index, row_index = row['mctgs'], -1, -1
        row['block_mctgs'] = mctgs
        row['block_index'] = block_index
        row['row_index'] = row_index
        yield row


def change_field(rec):
    """Mapper for block_sens_num: expose ``mctgs`` as ``block_mctgs``.

    The value is copied, so the yielded row carries both ``mctgs`` and
    ``block_mctgs``.
    """
    category = rec['mctgs']
    rec['block_mctgs'] = category
    yield rec


def sel_neighbor(key, recs):
    """Reducer: pick the semantic neighbours of the shard #2 banners.

    Rows from input table 0 (TrainTextShard_2_blk) are collected as
    neighbourhood centres.  For every row from the other table
    (block_sens_num_add_field) and every centre collected so far, a row is
    yielded when their ``row_index`` values differ by at most 5.  The
    yielded row carries the neighbour's ``bid``, the centre's ``mctgs``,
    ``hash`` and ``piece``, and the signed distance ``dist``.
    ``@table_index`` is removed from every input row.
    """
    centres = []
    for row in recs:
        if row.pop('@table_index') == 0:  # TrainTextShard_2_blk
            centres.append(row)
            continue
        # block_sens_num_add_field
        for centre in centres:
            offset = row['row_index'] - centre['row_index']
            if abs(offset) > 5:  # DISTANCE threshold
                continue
            # centre['mctgs'] is the category of the neighbourhood centre
            yield {
                "bid": row['bid'],
                "mctgs": centre['mctgs'],
                "dist": offset,
                "hash": centre['hash'],
                "piece": centre['piece'],
            }


def neighbor_format(key, recs):
    """Reducer: add banner texts to the neighbourhood rows.

    Rows from input table 0 (banners_extended) supply ``title``, ``body``,
    ``title_extension`` and ``href``; they are remembered and not emitted.
    Every other row (TrainTextShard_2_neighbor) is yielded with ``title``,
    ``body``, ``title_extension`` and ``url`` (taken from ``href``) set
    from the remembered banner, but only once a banner row with a
    non-zero ``bid`` has been seen; earlier rows are dropped.
    ``@table_index`` is removed from every row.
    """
    banner_bid = 0  # 0 means "no banners_extended row seen yet"
    for row in recs:
        if row.pop('@table_index') == 0:  # banners_extended
            banner_bid = row['bid']
            banner_fields = {
                'title': row['title'],
                'body': row['body'],
                'title_extension': row['title_extension'],
                'url': row['href'],
            }
            continue
        if banner_bid == 0:
            continue
        # TrainTextShard_2_neighbor
        row.update(banner_fields)
        yield row


def sample_separation(rec):
    """Mapper: route each row to the output table numbered by its piece.

    Yields one dict whose ``@table_index`` equals ``rec['piece']`` and
    which copies ``piece``, ``bid``, ``title``, ``body``, ``mctgs``,
    ``title_extension`` and ``url`` from ``rec``; other fields are dropped.
    """
    routed = {"@table_index": rec['piece']}
    for field in ("piece", "bid", "title", "body", "mctgs", "title_extension", "url"):
        routed[field] = rec[field]
    yield routed


class add_trainexact(object):
    """Mapper object: add the TrainExact data to one piece.

    Calling an instance with a row yields one dict that copies ``bid``,
    ``title``, ``body``, ``mctgs``, ``title_extension`` and ``url`` from
    the row and sets ``piece`` to the number given at construction time.
    """

    def __init__(self, piece):
        # Piece number stamped on every emitted row.
        self.piece = piece

    def __call__(self, rec):
        stamped = {"piece": self.piece}
        for field in ("bid", "title", "body", "mctgs", "title_extension", "url"):
            stamped[field] = rec[field]
        yield stamped


def main():
    """Run the shard #2 pipeline on YT.

    Every stage except the final per-piece sort is currently disabled
    (commented out).  The table paths of the disabled stages are kept so
    that a stage can be re-enabled by uncommenting its calls.

    The only active work is: for each of the 100 piece tables, print
    ``piece=<i>`` and sort the table by ``piece`` and ``bid``.
    """
    tab0 = '//home/broadmatching/users/firefish/ctg/TrainTextShard'
    tab1 = '//tmp/yuryz/TrainTextShard'

    #yt.run_map(sel_shard, tab0, tab1, format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab1, sort_by=['shard', 'hash', 'BannerID'])

    tab2 = '//home/catalogia/users/yuryz/firefish/TrainTextShard_2'

    #yt.run_map(convert, tab1, tab2, job_io={"control_attributes": {"enable_row_index": True}}, format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab2, sort_by=['bid'])

    tab3 = '//home/catalogia/banners_extended'
    tab4 = '//tmp/yuryz/TrainTextShard_2_ext'

    #yt.run_reduce(change_mctgs, [tab3, tab2], tab4, reduce_by = ['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab4, sort_by=['mctgs', 'bid', 'title', 'body'], job_io={"table_writer":{"max_key_weight":131072}})

    #--- 1. Apply the category rules ---
    tab5 = '//home/catalogia/users/yuryz/etalon/rules4ctgs'  # the finalized category rules
    tab6 = '//tmp/yuryz/TrainTextShard_2_upd'

    #yt.run_reduce(rules4ctgs, [tab5, tab4], tab6, reduce_by = ['mctgs'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab6, sort_by=['bid', 'title', 'body', 'mctgs'], job_io={"table_writer":{"max_key_weight":131072}})

    #--- 2. Select the categories and sizes of the semantic blocks for shard #2 ---
    tab7 = '//home/catalogia/users/yuryz/etalon/.quarantine/block_sens_num'
    tab8 = '//tmp/yuryz/TrainTextShard_2_blk'

    #yt.run_reduce(sel_block_info, [tab7, tab6], tab8, reduce_by = ['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab8, sort_by=['block_mctgs', 'block_index', 'bid'], job_io={"table_writer":{"max_key_weight":131072}})

    #--- 3. Prepare block_sens_num ---
    tab9 = '//home/catalogia/users/yuryz/etalon/.quarantine/block_sens_num_add_field'

    #yt.run_map(change_field, tab7, tab9, format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab9, sort_by=['block_mctgs', 'block_index', 'bid'], job_io={"table_writer":{"max_key_weight":131072}})

    #--- 4. Select the semantic neighbours of the shard #2 banners ---
    tab10 = '//tmp/yuryz/TrainTextShard_2_neighbor'

    #yt.run_reduce(sel_neighbor, [tab8, tab9], tab10, reduce_by = ['block_mctgs', 'block_index'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab10, sort_by=['bid'], job_io={"table_writer":{"max_key_weight":131072}})

    #--- 5. Format the banners from the neighbourhoods ---
    tab11 = '//tmp/yuryz/TrainTextShard_2_neighbor_frm'

    #yt.run_reduce(neighbor_format, [tab3, tab10], tab11, reduce_by = ['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab11, sort_by=['piece', 'bid', 'title', 'body', 'mctgs'], job_io={"table_writer":{"max_key_weight":131072}})

    #--- 6. Split the sample into 100 tables ---
    pieces_count = 100
    tab12 = '//home/catalogia/users/yuryz/firefish/'
    tab_names = [tab12 + 'piece' + str(i) for i in range(pieces_count)]

    # NOTE(review): the disabled call below passes [tab_names], i.e. a list
    # nested inside another list - confirm whether tab_names was intended
    # before re-enabling it.
    #yt.run_map(sample_separation, tab11, [tab_names], job_io={"control_attributes": {"enable_row_index": True}}, format=yt.YsonFormat(control_attributes_mode="row_fields"))

    tab13 = '//home/catalogia/users/yuryz/etalon/etalon_neighbor_add'
    # Disabled: append the tab13 rows to every piece table.
    #for i in range(pieces_count):
    #    print("piece=" + str(i))
    #    yt.run_map(add_trainexact(i), tab13, yt.TablePath(tab_names[i], append=True), format=yt.YsonFormat(control_attributes_mode="row_fields"))

    for i, tab_name in enumerate(tab_names):
        # Parenthesised single-argument print produces the same output as a
        # Python 2 print statement and is also valid Python 3.
        print("piece=" + str(i))
        yt.run_sort(tab_name, sort_by=['piece', 'bid'], job_io={"table_writer":{"max_key_weight":131072}})


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
