#!/usr/bin/python
import re

import sys
import os
import shutil
import datetime
from tempfile import mkstemp

import yt.wrapper as yt

class Mapper:
    """Map stage: keep only the Categories, bid and pid columns of each row."""

    def __call__(self, row):
        """Yield a single record with the row's Categories, bid and pid values.

        Raises KeyError when the incoming row lacks any of those keys.
        """
        kept_columns = ('Categories', 'bid', 'pid')
        yield dict((column, row[column]) for column in kept_columns)

class Reducer():
    """Reduce stage: give category-less bids the group's most frequent categories."""

    def __call__(self, key, rows):
        """Yield one ``{'Categories', 'bid'}`` record per distinct bid in the group.

        Each row's ``Categories`` value is a '/'-separated string.  Every
        category occurrence across the group is counted; the categories that
        reach the highest count are sorted and joined with '/' to form the
        fallback value.  A bid whose own ``Categories`` is empty (or None)
        receives the fallback; any other bid keeps its own value.

        Args:
            key: the reduce key of the group (not used by this method).
            rows: iterable of dict-like rows with 'Categories' and 'bid' keys.

        Yields:
            dict with 'Categories' and 'bid' keys, one per distinct bid.  When
            a bid occurs in several rows, the value from the last row wins.
        """
        category_counts = {}
        categories_by_bid = {}
        for row in rows:
            # A None value is treated as "no categories" instead of crashing
            # the whole group on len(None).
            categories = row['Categories'] or ''
            categories_by_bid[row['bid']] = categories
            if categories:
                for category in categories.split('/'):
                    category_counts[category] = category_counts.get(category, 0) + 1

        # With no categories in the group at all, the fallback is ''.
        top_count = max(category_counts.values()) if category_counts else 0
        fallback = '/'.join(sorted(
            category
            for category, count in category_counts.items()
            if count == top_count
        ))

        for bid, categories in categories_by_bid.items():
            yield {
                'Categories': categories or fallback,
                'bid': bid,
            }

def main():
    """Configure the global YT client and run the Mapper/Reducer job.

    Reads the source table, groups the mapped rows by 'pid' and writes the
    reducer output to the destination table.
    """
    source_table = '//home/catalogia/tmp/go_ads_default'
    destination_table = '//home/catalogia/tmp/go_ads_nographic'

    # Client-wide settings: sandbox placement, credentials, pool and cluster.
    yt.config['mount_sandbox_in_tmpfs'] = True
    yt.config['token_path'] = '/opt/broadmatching/bm-tokens/yt_plato'
    yt.config['spec_defaults'] = {'pool': 'catalogia'}
    yt.config["proxy"]["url"] = "hahn.yt.yandex.net"

    yt.run_map_reduce(
        Mapper(),
        Reducer(),
        source_table,
        destination_table,
        reduce_by=['pid'],
    )

# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
