#!/usr/bin/python
# -*- coding: utf-8 -*-

# Synchronize domain names between raw_dataset.dat and banners_extended.

import sys
import re
import yt.wrapper as yt

def data_sync2(key, recs):
    """Reducer: pass through non-dataset rows that follow a titled dataset row.

    Originally described as "synchronization with Ksyusha's data".

    Rows whose ``@table_index`` is 0 are remembered (only the most recent one
    is kept) together with their ``title``.  Every other row is yielded as-is,
    with ``@table_index`` removed, provided a non-empty title has been seen
    before it.  If a non-empty title was seen but no other row was yielded,
    the remembered row is yielded with ``@table_index`` set to 1 (the
    "banner not found" case).

    NOTE(review): the empty string doubles as the "nothing seen yet" marker,
    so a group whose title is '' produces no output at all -- confirm this is
    intended.

    :param key: reduce key of the group (unused).
    :param recs: iterable of row dicts, each carrying ``@table_index``.
    """
    seen_title = ''
    matched = False
    for row in recs:
        if row.pop('@table_index') == 0:
            seen_title = row['title']
            dataset_row = row
            continue
        if seen_title == '':
            # No titled dataset row precedes this one: drop it.
            continue
        matched = True
        yield row

    # Banners NOT found: report the dataset row via output index 1.
    if seen_title != '' and not matched:
        dataset_row['@table_index'] = 1
        yield dataset_row


def data_sync(key, recs):
    """Reducer: attach the ``domain`` of matching rows to a dataset row.

    Originally described as "synchronization with Ksyusha's data".

    Rows whose ``@table_index`` is 0 are remembered (only the most recent one
    is kept) together with their ``bid``.  For every other row that arrives
    after a dataset row with a non-zero ``bid``, a copy of the dataset row is
    yielded with its ``domain`` replaced by that row's ``domain``; the copy
    carries no ``@table_index``.  If a non-zero ``bid`` was seen but nothing
    matched, the dataset row is yielded with ``@table_index`` set to 1 (the
    "banner not found" case).

    NOTE(review): 0 doubles as the "nothing seen yet" marker, so a group
    whose bid is 0 produces no output at all -- confirm this is intended.

    :param key: reduce key of the group (unused).
    :param recs: iterable of row dicts, each carrying ``@table_index``.
    """
    bid = 0
    matched = False
    for rec in recs:
        table_index = rec.pop('@table_index')
        if table_index == 0:
            bid = rec['bid']
            dataset_rec = rec
        elif bid != 0:
            matched = True
            # Yield a fresh shallow copy per match.  Re-yielding one mutated
            # dict would make every buffered output row alias the same
            # object and show only the last domain.
            synced = dict(dataset_rec)
            synced['domain'] = rec['domain']
            yield synced

    # Banners NOT found: report the dataset row via output index 1.
    if bid != 0 and not matched:
        dataset_rec['@table_index'] = 1
        yield dataset_rec


def main():
    """Run the ``data_sync`` reduce over the two input tables and report counts.

    Reduces the dataset table together with the banners table by ``bid`` into
    two output tables, then prints the row counts of the dataset table and of
    the first output table to stderr.
    """
    dataset_table = '//tmp/yuryz/dataset'
    banners_table = '//home/catalogia/banners_extended'
    # Alternative banners input: '//tmp/yuryz/banners_extended'

    synced_table = '//tmp/yuryz/dataset_sync'
    unmatched_table = '//tmp/yuryz/no_sync'

    # "row_fields" mode exposes control attributes such as '@table_index' as
    # ordinary row fields, which is what data_sync reads and writes.
    row_format = yt.YsonFormat(control_attributes_mode="row_fields")
    # Alternative reduce key: ['title', 'body', 'mctgs'] instead of ['bid'].
    yt.run_reduce(
        data_sync,
        [dataset_table, banners_table],
        [synced_table, unmatched_table],
        reduce_by=['bid'],
        format=row_format
    )

    # Input size vs. synced output size.
    for table in (dataset_table, synced_table):
        print >> sys.stderr, yt.row_count(table)

# Run the synchronization only when this file is executed as a script.
if __name__ == '__main__':
    main()
