#!/usr/bin/python
# -*- coding: utf-8 -*-

# Categorize banners that have no categories, using their domain names and URLs

import random
import copy
import sys
import re
import yt.wrapper as yt


def bnrs_fresh_select(rec):
    """Mapper: keep only "fresh" banner records.

    A record is considered fresh when the first four characters of its
    ``timestamp`` field compare (as a string) greater than or equal to
    ``'2016'``.

    Yields a dict with the record's ``bid`` and the *negated* year as
    ``timestamp``.  # NOTE(review): the negation presumably makes an
    ascending sort put the most recent year first - confirm against the
    sort/reduce that consumes this output.
    """
    year = rec['timestamp'][:4]
    if year < '2016':
        # Too old: emit nothing for this record.
        return
    yield {"bid": rec['bid'], "timestamp": -int(year)}


def del_dup(key, recs):
    """Reducer: remove duplicates by emitting only the first record of a group.

    Args:
        key: the reduce key of the group (unused).
        recs: iterator over the records of the group.

    Yields:
        The first record of ``recs``; nothing if ``recs`` is empty.
    """
    # Iterate instead of calling the Python 2-only ``recs.next()`` method so
    # the reducer works with any iterator on both Python 2 and Python 3, and
    # ends cleanly on an empty group.
    for rec in recs:
        yield rec
        return


def get_bnrs_info(key, recs):
    """Reducer: fetch banner info from banners_extended for fresh banners.

    Records carry an ``@table_index`` field (removed from the record here).
    Index 0 marks a row of the fresh-banner table ('//tmp/yuryz/bnrs_fresh_');
    any other index is treated as a row of
    '//home/catalogia/banners_extended'.

    An info row is emitted only after a table-0 row with a non-zero ``bid``
    has been seen in the group, and only when both its ``mctgs`` and
    ``domain`` are neither ``None`` nor the empty string.
    # NOTE(review): assumes table-0 rows arrive before the other rows within
    # a group - confirm against the reduce operation's ordering guarantees.

    Yields:
        Dicts with keys ``domain``, ``url`` (taken from ``href``) and
        ``mctgs``.
    """
    fresh_bid = 0  # 0 means "no fresh banner seen yet in this group"
    for row in recs:
        source = row.pop('@table_index')
        if source == 0:  # '//tmp/yuryz/bnrs_fresh_'
            fresh_bid = row['bid']
            continue

        # '//home/catalogia/banners_extended'
        if fresh_bid == 0:
            continue
        if row['mctgs'] is None or row['mctgs'] == '':
            continue
        if row['domain'] is None or row['domain'] == '':
            continue
        yield {"domain": row['domain'], "url": row['href'], "mctgs": row['mctgs']}


def domen_freq_dict(key, recs):
    """Reducer: pick the most frequent category for one (domain, url) group.

    Counts how many records of the group carry each ``mctgs`` value and
    emits a single record for the most frequent one.

    Ties are resolved deterministically in favour of the category that
    appears first in ``recs``; the result therefore no longer depends on
    dict iteration order (which is arbitrary on Python 2).

    Args:
        key: reduce key; must provide ``domain`` and ``url``.
        recs: iterator over the records of the group; each has ``mctgs``.

    Yields:
        One dict with ``domain``, ``url``, the chosen ``mctgs`` and its
        ``freq`` (number of records with that category). Nothing is yielded
        for an empty group.
    """
    counts = {}
    first_seen = []  # categories in order of first appearance (tie-break)
    for rec in recs:
        ctg = rec['mctgs']
        if ctg not in counts:
            counts[ctg] = 0
            first_seen.append(ctg)
        counts[ctg] += 1

    if not counts:
        return

    # Strict ">" keeps the earliest category among equally frequent ones.
    mctgs = None
    max_freq = 0
    for ctg in first_seen:
        if counts[ctg] > max_freq:
            mctgs = ctg
            max_freq = counts[ctg]

    yield {"domain": key['domain'], "url": key['url'], "mctgs": mctgs, "freq": max_freq}


def dict_compress(key, recs):
    """Reducer: compress the dictionary down to the boundary records of runs.

    Consecutive records with the same ``mctgs`` form a run. For each run the
    first record is emitted with its ``freq`` increased (in place) by the
    ``freq`` of every later record of the run; if the run has more than one
    record, its last record is emitted as well, unchanged. Records in the
    middle of a run are dropped.

    Args:
        key: reduce key of the group (unused).
        recs: iterator over records having ``mctgs`` and ``freq``.

    Yields:
        The first and (when distinct) last record of each run, in input
        order. Nothing is yielded for an empty group (previously this
        yielded ``None`` and then failed on an unbound local).
    """
    run_first = None  # first record of the current run (accumulates freq)
    run_last = None   # last record of the current run, if it has > 1 record
    for rec in recs:
        if run_first is not None and run_first['mctgs'] == rec['mctgs']:
            # Same category: extend the current run.
            run_first['freq'] += rec['freq']
            run_last = rec
            continue

        # Category changed (or very first record): flush the finished run.
        if run_first is not None:
            yield run_first
            if run_last is not None:
                yield run_last
        run_first = rec
        run_last = None

    if run_first is None:
        return  # empty input

    yield run_first
    if run_last is not None:
        yield run_last


def proxim(a, b):
    """Return the length of the common leading part of ``a`` and ``b``.

    Used to measure how close two URLs are: the result is the number of
    leading positions at which both sequences hold equal items.
    """
    limit = min(len(a), len(b))
    pos = 0
    while pos < limit and a[pos] == b[pos]:
        pos += 1
    return pos


def ctgs_by_neighbors(key, recs):
    """Reducer: assign a category to each banner from its nearest neighbours.

    Records carry an ``@table_index`` field (removed from the record here):

    * index 0 - banners to categorize
      ('//home/catalogia/users/yuryz/multik/ctgs_by_neighbors_no',
      produced by bnrs_sel_not_ctgs2.py); they are buffered in memory,
      skipping a banner whose ``url`` equals that of the previously buffered
      one (duplicates are dropped because of memory shortage);
    * any other index - dictionary rows
      ('//home/catalogia/users/yuryz/multik/domain4ctgs') with ``url`` and
      ``mctgs``.

    For every buffered banner the closest dictionary row at or below its url
    and the closest one above it are considered; the banner receives the
    ``mctgs`` of the one whose url shares the longer common prefix with the
    banner url (the lower neighbour wins ties). When only one neighbour
    exists, its category is used.
    # NOTE(review): assumes all table-0 rows precede the dictionary rows in
    # the group and that both inputs are ordered by url - confirm.

    Yields:
        The buffered banner records with ``mctgs`` set. Nothing is yielded
        when the group has no banners or no dictionary rows.
    """
    def choose_mctgs(url, below, above):
        # Category of the neighbour closest to `url`; at least one of
        # `below` / `above` is not None when this is called.
        if below is None:
            return above['mctgs']
        if above is None:
            return below['mctgs']
        if proxim(url, below['url']) >= proxim(url, above['url']):
            return below['mctgs']
        return above['mctgs']

    pending = []   # buffered banners awaiting a category
    pos = 0        # index of the current banner in `pending`
    below = None   # latest dictionary row not greater than the current banner
    above = None   # latest dictionary row greater than some banner

    for row in recs:
        source = row.pop('@table_index')
        if source == 0:
            # Banner row: buffer it unless it repeats the previous url.
            if not pending or row['url'] != pending[-1]['url']:
                pending.append(row)
            continue

        # Dictionary row.
        if pos >= len(pending):
            return  # no banners at all, or all of them already emitted
        if pending[pos]['url'] >= row['url']:
            below = row
            continue

        # This row lies above the current banner: resolve every banner
        # that sorts before it.
        above = row
        while pos < len(pending) and pending[pos]['url'] < above['url']:
            banner = pending[pos]
            banner['mctgs'] = choose_mctgs(banner['url'], below, above)
            yield banner
            pos += 1
        if pos == len(pending):
            return
        below = above

    if pos >= len(pending):
        return
    if below is None and above is None:
        return  # no dictionary rows were seen

    # Banners that sort at or after the last dictionary row.
    while pos < len(pending):
        banner = pending[pos]
        banner['mctgs'] = choose_mctgs(banner['url'], below, above)
        yield banner
        pos += 1


def main():
    """Drive the banner categorization pipeline on YT.

    The pipeline consists of four stages, each reading and writing the
    tables named below. Every operation except the final sort of the result
    table is currently commented out, so running this function only sorts
    the result table by ``bid``, ``url`` and ``mctgs``.
    """
    # --- 1. Select "fresh" banners ---
    history_table = '//home/catalogia/banner-history-recategorized'
    fresh_table = '//tmp/yuryz/bnrs_fresh'

    #yt.run_map(bnrs_fresh_select, history_table, fresh_table, format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(fresh_table, sort_by=['bid', 'timestamp'])

    fresh_unique_table = '//tmp/yuryz/bnrs_fresh_'  # without duplicate bids

    #yt.run_reduce(del_dup, fresh_table, fresh_unique_table, reduce_by = ['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(fresh_unique_table, sort_by='bid')

    # --- 2. Fetch banner info from banners_extended ---
    banners_table = '//home/catalogia/banners_extended'
    fresh_info_table = '//tmp/yuryz/bnrs_fresh_text'

    #yt.run_reduce(get_bnrs_info, [fresh_unique_table, banners_table], fresh_info_table, reduce_by = ['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(fresh_info_table, sort_by=['domain', 'url', 'mctgs'])

    # --- 3. Build the frequency dictionary of domains ---
    domain_freq_table = '//tmp/yuryz/domain4ctgs'

    #yt.run_reduce(domen_freq_dict, fresh_info_table, domain_freq_table, reduce_by = ['domain', 'url'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(domain_freq_table, sort_by=['domain', 'url', 'mctgs'])

    domain_dict_table = '//home/catalogia/users/yuryz/multik/domain4ctgs'

    #yt.run_reduce(dict_compress, domain_freq_table, domain_dict_table, reduce_by = ['domain'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(domain_dict_table, sort_by=['domain', 'url', 'mctgs'])

    # --- 4. Categorization by neighbours ---
    uncategorized_table = '//home/catalogia/users/yuryz/multik/ctgs_by_neighbors_no'  # python bnrs_sel_not_ctgs2.py
    result_table = '//home/catalogia/users/yuryz/multik/ctgs_by_domains'

    #yt.run_reduce(ctgs_by_neighbors, [uncategorized_table, domain_dict_table], result_table, reduce_by = ['domain'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    yt.run_sort(result_table, sort_by=['bid', 'url', 'mctgs'])


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
