#!/usr/bin/python
# -*- coding: utf-8 -*-

#категоризация баннеров без категорий с помощью контекста кампаний

import random
import copy
import sys
import re
import yt.wrapper as yt


def bnrs_fresh_select(rec): # mapper: keep only "fresh" banners
    """Emit a reduced row for banners whose timestamp year is 2016 or later.

    The year is the first four characters of rec['timestamp']; the cutoff
    is the hard-coded string '2016' and is compared lexicographically.
    The emitted 'timestamp' is the negated year as an integer.
    NOTE(review): presumably negated so an ascending sort puts the newest
    year first -- confirm against the sort applied to the output table.
    """
    year = rec['timestamp'][:4]
    if year < '2016':
        return
    yield {
        "bid": rec['bid'],
        "timestamp": -int(year),
    }


def del_dup(key, recs): # reducer: drop duplicates
    """Yield only the first record of the reduce group and ignore the rest.

    key  -- reduce key of the group (unused).
    recs -- iterable of the group's records.

    An empty group yields nothing.
    NOTE(review): which duplicate survives depends on the order of the
    input rows -- presumably the table is sorted by (bid, timestamp);
    confirm against the caller.
    """
    # A for-loop instead of recs.next(): the .next() method exists only on
    # Python 2 iterators, while this form works on any iterable under both
    # Python 2 and Python 3.
    for rec in recs:
        yield rec
        return


def get_bnrs_info(key, recs): # reducer: fetch banner info from banners_extended
    """Join the rows of one 'bid' group coming from two input tables.

    Every row carries an '@table_index' field, which is removed from the
    row here. Rows with index 0 only mark the bid as selected; rows with
    any other index are emitted (with 'href' renamed to 'url') once a
    non-zero bid has been seen from table 0 and their 'mctgs' is not None.

    key  -- reduce key; key['bid'] is written to the output row.
    recs -- iterable of the group's rows.
    """
    selected_bid = 0
    for row in recs:
        if row.pop('@table_index') == 0:
            # Row from the table of selected banners.
            selected_bid = row['bid']
            continue
        # Row from the extended banners table.
        if selected_bid == 0 or row['mctgs'] is None:
            continue
        yield {
            "bid": key['bid'],
            "title": row['title'],
            "body": row['body'],
            "url": row['href'],
            "mctgs": row['mctgs'],
            "cid": row['cid'],
            "domain": row['domain'],
        }


def bnrs_in_camp(key, recs): # reducer: campaigns that contain 'empty' banners
    """Emit every banner of the group if at least one banner is 'empty'.

    A banner is 'empty' when its 'title', 'body' and 'mctgs' are all ''.
    Each emitted row gets a 'size' field holding the number of rows in
    the group. Groups without an empty banner produce no output.

    key  -- reduce key of the group (unused).
    recs -- iterable of the group's rows.
    """
    camp = list(recs)
    has_empty = any(
        row['title'] == '' and row['body'] == '' and row['mctgs'] == ''
        for row in camp
    )
    if not has_empty:
        return
    total = len(camp)
    for row in camp:
        row['size'] = total
        yield row


def proxim(a, b): # length of the common leading part of two urls
    """Return the number of leading positions at which a and b are equal.

    Comparison stops at the first mismatch or at the end of the shorter
    argument.
    """
    matched = 0
    for ch_a, ch_b in zip(a, b):
        if ch_a != ch_b:
            return matched
        matched += 1
    return matched


def ctgs_by_neighbors(key, recs): # reducer: categorization by neighbours
    """Give 'empty' banners of a group the categories of a neighbouring banner.

    A banner is 'empty' when its 'title', 'body' and 'mctgs' are all ''.
    For each empty banner the closest preceding ("low") and the closest
    following ("hi") row with a non-empty 'mctgs' are looked up, in the
    order the rows arrive. Their categories and urls are stored in
    LOW_MCTGS / LOW_URL / HI_MCTGS / HI_URL ('NOT_MCTGS' / 'NOT_URL' when
    there is no such neighbour). The banner takes the categories of the
    neighbour whose url shares the longer common prefix with its own url
    (see proxim); ties go to the low neighbour.

    An empty banner that has already been processed has a filled-in
    'mctgs' and therefore acts as the low neighbour of later banners.

    'title' and 'body' are removed from emitted rows. '@table_index' is
    set to 0 when categories were assigned and to 1 when the result is
    'NOT_MCTGS'. Non-empty banners are not emitted.

    key  -- reduce key of the group (unused).
    recs -- iterable of the group's rows.
    NOTE(review): presumably the rows arrive sorted so that neighbours are
    meaningful -- confirm against the sort of the input table.
    """
    bnrs = list(recs)
    size = len(bnrs)

    # hi_of[i] is the index of the closest row after i with non-empty
    # 'mctgs', or `size` if there is none. Rows after i have not been
    # modified when i is processed, so one backward pass up front replaces
    # a forward rescan for every empty banner.
    hi_of = [size] * size
    nearest = size
    for i in range(size - 1, -1, -1):
        hi_of[i] = nearest
        if bnrs[i]['mctgs'] != '':
            nearest = i

    low = -1 # index of the closest preceding row with non-empty 'mctgs'
    for i, bnr in enumerate(bnrs):
        if bnr['title'] == '' and bnr['body'] == '' and bnr['mctgs'] == '':
            hi = hi_of[i]

            # A missing neighbour gets proximity -1 so that it can never
            # win against an existing one; comparing the url with the
            # 'NOT_URL' placeholder would tie at 0 and wrongly prefer a
            # missing low neighbour over a real hi neighbour.
            bnr['LOW_MCTGS'] = 'NOT_MCTGS'
            bnr['LOW_URL'] = 'NOT_URL'
            proxim_low = -1
            if low != -1:
                bnr['LOW_MCTGS'] = bnrs[low]['mctgs']
                bnr['LOW_URL'] = bnrs[low]['url']
                proxim_low = proxim(bnr['url'], bnr['LOW_URL'])

            bnr['HI_MCTGS'] = 'NOT_MCTGS'
            bnr['HI_URL'] = 'NOT_URL'
            proxim_hi = -1
            if hi != size:
                bnr['HI_MCTGS'] = bnrs[hi]['mctgs']
                bnr['HI_URL'] = bnrs[hi]['url']
                proxim_hi = proxim(bnr['url'], bnr['HI_URL'])

            if proxim_low >= proxim_hi:
                bnr['mctgs'] = bnr['LOW_MCTGS']
            else:
                bnr['mctgs'] = bnr['HI_MCTGS']

            del bnr['title']
            del bnr['body']

            # Output table 0 receives categorized banners, table 1 the rest.
            if bnr['mctgs'] != 'NOT_MCTGS':
                bnr['@table_index'] = 0
            else:
                bnr['@table_index'] = 1
            yield bnr

        # Checked after processing on purpose: a banner that has just been
        # filled in becomes the low neighbour of the rows that follow.
        if bnr['mctgs'] != '':
            low = i


def main():
    """Run the banner categorization pipeline on YT.

    Only stage 4 (categorization by neighbours) is executed; the calls of
    the earlier stages and the final sort are kept as comments.
    """
    row_fields = yt.YsonFormat(control_attributes_mode="row_fields")

    # --- 1. Select "fresh" banners ---
    history_tab = '//home/catalogia/banner-history-recategorized'
    fresh_tab = '//tmp/yuryz/bnrs_fresh'

    #yt.run_map(bnrs_fresh_select, history_tab, fresh_tab, format=row_fields)
    #yt.run_sort(fresh_tab, sort_by=['bid', 'timestamp'])

    fresh_uniq_tab = '//tmp/yuryz/bnrs_fresh_' # without duplicate bids

    #yt.run_reduce(del_dup, fresh_tab, fresh_uniq_tab, reduce_by=['bid'], format=row_fields)
    #yt.run_sort(fresh_uniq_tab, sort_by='bid')

    # --- 2. Fetch banner info from banners_extended ---
    extended_tab = '//home/catalogia/banners_extended'
    fresh_text_tab = '//tmp/yuryz/bnrs_fresh_text'

    #yt.run_reduce(get_bnrs_info, [fresh_uniq_tab, extended_tab], fresh_text_tab, reduce_by=['bid'], format=row_fields)
    #yt.run_sort(fresh_text_tab, sort_by=['cid', 'bid', 'mctgs', 'domain', 'title', 'body', 'url'])

    # --- 3. Build the campaign context ---
    camp_tab = '//home/catalogia/users/yuryz/multik/bnrs_in_camp'

    #yt.run_reduce(bnrs_in_camp, fresh_text_tab, camp_tab, reduce_by=['cid'], format=row_fields)
    #yt.run_sort(camp_tab, sort_by=['cid', 'size', 'bid', 'mctgs', 'domain', 'title', 'body', 'url'])

    # --- 4. Categorization by neighbours ---
    categorized_tab = '//home/catalogia/users/yuryz/multik/ctgs_by_neighbors_yes'
    uncategorized_tab = '//home/catalogia/users/yuryz/multik/ctgs_by_neighbors_no'

    yt.run_reduce(
        ctgs_by_neighbors,
        camp_tab,
        [categorized_tab, uncategorized_tab],
        reduce_by=['cid'],
        format=row_fields
    )
    #yt.run_sort(categorized_tab, sort_by=['bid', 'mctgs', 'LOW_MCTGS', 'HI_MCTGS'])


if __name__ == '__main__':
    main()
