#!/usr/bin/python
import re

import sys
import os
import shutil
import datetime
from tempfile import mkstemp

from bm.yt_tools import NormalizeReducer
import yt.wrapper as yt

# Row-weight limit plugged into the spec below
# (presumably expressed in bytes -- TODO confirm against the YT documentation).
YT_MAX_ROW_WEIGHT = 128000000
# Operation-spec fragment that sets the job table writer's max_row_weight to
# the limit above.
# NOTE(review): not referenced anywhere in the visible part of this file.
YT_HEAVY_ROW_SPEC = {"job_io": {"table_writer": {"max_row_weight": YT_MAX_ROW_WEIGHT}}}


class Mapper():
    """Map step: total the click counts stored in a row's ``norm`` field.

    ``norm`` is read as a tab-separated list of entries. Each entry is split
    on ':' and its second field is parsed as an integer click count.
    """

    def __call__(self, row):
        """Yield a single ``{'domain', 'clicks'}`` record for ``row``.

        ``row`` must provide the ``'norm'`` and ``'domain'`` keys. Empty
        entries in ``norm`` are ignored, so an empty ``norm`` yields zero
        clicks. A non-empty entry without a ':' raises ``IndexError`` and a
        non-integer second field raises ``ValueError``.
        """
        clicks = 0
        for phrase in row['norm'].split('\t'):
            if not phrase:
                # ''.split('\t') returns [''], and a leading, trailing or
                # doubled tab also produces empty entries; they carry no
                # click count, so skip them instead of failing on arr[1].
                continue
            clicks += int(phrase.split(':')[1])
        yield {
            'domain': row['domain'],
            'clicks': clicks,
        }

class Reducer():
    """Reduce step: fold the per-row click totals of one key into one record."""

    def __call__(self, key, rows):
        """Yield one summary record for the group identified by ``key``.

        The record carries ``key['domain']``, the sum of the rows' ``clicks``
        values, the number of rows whose ``clicks`` is greater than zero
        (``clicked_urls``) and the total number of rows (``urls``).
        """
        total_clicks = 0
        rows_with_clicks = 0
        row_count = 0
        # Single pass: ``rows`` is consumed exactly once.
        for record in rows:
            row_clicks = record['clicks']
            total_clicks += row_clicks
            rows_with_clicks += 1 if row_clicks > 0 else 0
            row_count += 1
        summary = {
            'domain': key['domain'],
            'clicks': total_clicks,
            'clicked_urls': rows_with_clicks,
            'urls': row_count
        }
        yield summary

def main():
    """Configure the YT client, run the map-reduce and sort its output.

    Reads ``//home/catalogia/dse_base``, writes the per-domain records
    produced by ``Mapper``/``Reducer`` to
    ``//home/catalogia/tmp/dse_domain_stats`` and then sorts that table by
    ``urls`` and ``clicked_urls``.
    """
    source_table = '//home/catalogia/dse_base'
    stats_table = '//home/catalogia/tmp/dse_domain_stats'

    # Client settings are applied before any operation is started.
    yt.config['mount_sandbox_in_tmpfs'] = True
    yt.config['token_path'] = '/opt/broadmatching/bm-tokens/yt_plato'
    yt.config['spec_defaults'] = {'pool': 'catalogia'}
    yt.config["proxy"]["url"] = "hahn.yt.yandex.net"

    # Aggregate per domain, then sort the resulting table in place.
    yt.run_map_reduce(
        Mapper(),
        Reducer(),
        source_table,
        stats_table,
        reduce_by=['domain']
    )
    yt.run_sort(stats_table, sort_by=['urls', 'clicked_urls'])

# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
