#!/usr/bin/python
# -*- coding: utf-8 -*-

# Computes the cosine measure between the vectors stored in //home/catalogia/contest/CategoriesTreeEmb

import sys
import re
import yt.wrapper as yt

from urlparse import urlparse

from math import sqrt
from itertools import imap
from operator import mul


def op_cosine(a, b):
    """Return the cosine distance (1 - cosine similarity) of two vectors.

    Args:
        a: sequence of numbers.
        b: sequence of numbers. If the lengths differ, the dot product covers
            only the common prefix, while each norm uses the whole vector.

    Returns:
        A float close to 0 for vectors pointing the same way, 1 for
        orthogonal vectors and 2 for opposite ones. When either vector has
        zero norm (empty or all zeros) the cosine is undefined; 1.0 is
        returned (similarity treated as 0) instead of raising
        ZeroDivisionError.
    """
    dot_prod = sum(x * y for x, y in zip(a, b))
    a_veclen = sqrt(sum(i ** 2 for i in a))
    b_veclen = sqrt(sum(i ** 2 for i in b))
    denom = a_veclen * b_veclen
    if denom == 0:
        # Degenerate input: no direction to compare, report "no similarity".
        return 1.0
    return 1 - dot_prod / denom  # distance
    # the similarity would be: dot_prod / denom


class mapper(object):
    """Map callable: pair one category vector with all other categories.

    For every input row it yields one output row per entry of ``bnrs`` whose
    category id differs from the row's own id, carrying the cosine distance
    between the two vectors.
    """

    def __init__(self, bnrs):
        """Store the reference vectors.

        Args:
            bnrs: iterable of strings of the form
                "<category id>\\t<space-separated vector components>".
        """
        self.bnrs = bnrs  # bid + '\t' + vect_ctgs

    def __call__(self, rec):
        """Yield distance rows for a single input row.

        Args:
            rec: mapping with 'CategoryID' and 'Vector' (space-separated
                numbers) fields.

        Yields:
            Dicts {"ctg1": <row's CategoryID as given>, "ctg2": <int id of
            the other category>, "dist": <cosine distance>}.
        """
        own_id = rec['CategoryID']
        # bid is always text (it comes out of str.split), while the row's id
        # may be an int; comparing the raw values would then never match and
        # the self-pair would not be skipped. Compare both sides as ints.
        own_id_num = int(own_id)
        a = [float(k) for k in rec['Vector'].split(' ')]  # vector components
        for bnr in self.bnrs:
            bid, vect_ctgs = bnr.split('\t')
            other_id = int(bid)
            if other_id == own_id_num:
                continue  # do not pair a category with itself
            b = [float(k) for k in vect_ctgs.split(' ')]
            dist = op_cosine(a, b)  # distance

            yield {"ctg1": own_id, "ctg2": other_id, "dist": dist}


class reducer(object):
    """Reduce callable: replace category ids with names in distance rows.

    Only groups belonging to one selected category are emitted (by default
    the id 200027694 that used to be hard-coded); every row of such a group
    is passed through with ids replaced by names where a name is known.
    No nearest-neighbour selection is performed here.
    """

    def __init__(self, ctgs, only_ctg=200027694):
        """Store the lookup table and the category filter.

        Args:
            ctgs: dict mapping int category id -> category name.
            only_ctg: int id of the single 'ctg1' category to emit, or None
                to emit rows for every category.
        """
        self.ctgs = ctgs  # dictionary of Catalogia category names
        self.only_ctg = only_ctg

    def __call__(self, key, recs):
        """Yield the rows of one 'ctg1' group with names substituted.

        Args:
            key: reduce key of the group (not used).
            recs: iterable of rows with 'ctg1', 'ctg2' and 'dist' fields.

        Yields:
            Dicts {"ctg1": ..., "ctg2": ..., "dist": ...} where each category
            is its name from ``ctgs``, or its int id when the name is
            missing or empty.
        """
        for rec in recs:
            ctg1 = int(rec['ctg1'])
            if self.only_ctg is not None and ctg1 != self.only_ctg:
                # Stop at the first row of a category that is filtered out.
                return
            ctg2 = int(rec['ctg2'])

            # .get() keeps the fall-back to the numeric id working for ids
            # that are absent from the dict (plain indexing raised KeyError).
            yield {
                "ctg1": self.ctgs.get(ctg1) or ctg1,
                "ctg2": self.ctgs.get(ctg2) or ctg2,
                "dist": rec['dist'],
            }

# NOTE: the bare string literal below is a no-op statement used to keep a
# disabled piece of code around. It appears to be an alternative body for
# reducer.__call__: it skips rows where ctg1 == ctg2 or dist == 0, stops at
# the first remaining row and emits only that one row with names substituted.
"""
        for rec in recs:
            if rec['ctg1'] == rec['ctg2'] or rec['dist'] == 0: continue
            break

        ctg1_name = int(rec['ctg1'])
        if self.ctgs[int(rec['ctg1'])]:
            ctg1_name = self.ctgs[int(rec['ctg1'])]

        ctg2_name = int(rec['ctg2'])
        if self.ctgs[int(rec['ctg2'])]:
            ctg2_name = self.ctgs[int(rec['ctg2'])]

        yield { "ctg1": ctg1_name, "ctg2": ctg2_name, "dist": rec['dist'] }
"""


def main():
    """Run the reduce stage of the category proximity pipeline on YT.

    Loads the category id -> name table into a dict, runs ``reducer`` over
    the distance table grouped by 'ctg1', then sorts the result table in
    place by ('dist', 'ctg1'). The map stage and the first sort stage are
    commented out; presumably their output tables already exist.
    """
    # Table paths, in pipeline order.
    embeddings_table = '//home/catalogia/contest/CategoriesTreeEmb'
    pairwise_table = '//tmp/yuryz/CategoriesTreeCos'
    sorted_pairs_table = '//home/catalogia/users/yuryz/proxim/CategoriesTreeCos'
    names_table = '//home/catalogia/contest/CategoriesTree'
    # Earlier result path: '//home/catalogia/users/yuryz/proxim/CategoriesTreeProxim'
    result_table = '//home/catalogia/users/yuryz/proxim/CategoriesTreeProxim2'

    # Disabled stage 1: collect "<id>\t<vector>" lines and run the mapper.
    banner_rows = []
    #for row in yt.read_table(embeddings_table, raw=False):
    #    banner_rows.append(str(row['CategoryID']) + '\t' + row['Vector'])
    #yt.run_map(mapper(banner_rows), embeddings_table, pairwise_table, spec={'data_size_per_job': 8 * 1024 * 1024})

    # Disabled stage 2: sort the pairwise distances.
    #yt.run_sort(pairwise_table, sorted_pairs_table, sort_by=['ctg1', 'dist', 'ctg2'])

    # Category id -> category name lookup used by the reducer.
    category_names = dict(
        (int(row['CategoryID']), row['CategoryName'])
        for row in yt.read_table(names_table, raw=False)
    )

    name_resolver = reducer(category_names)
    yson_format = yt.YsonFormat(control_attributes_mode="row_fields")
    yt.run_reduce(
        name_resolver,
        sorted_pairs_table,
        result_table,
        reduce_by=['ctg1'],
        format=yson_format,
    )
    yt.run_sort(result_table, sort_by=['dist', 'ctg1'])


if __name__ == '__main__':
    main()
