#!/usr/bin/python
# -*- coding: utf-8 -*-

# Domain statistics calculation - stage 2a (preliminary summation)

import yt.wrapper as yt


def reducer(key, recs):
    """Collapse all records of one reduce group into a single summary row.

    key  -- mapping for the current group; only key['domain'] is read.
    recs -- iterable of rows, each providing 'index_min' and 'domain_size'.

    Yields exactly one dict with the group's domain, the captured
    'index_min', the sum of every 'domain_size', and 'url_count' fixed at 1.

    Note: 'index_min' is not computed as a minimum. It is taken from the
    incoming rows for as long as the stored value is still negative, i.e.
    normally from the first row of the group.
    """
    url_start = -1  # index of the URL's start within the domain; -1 = not captured yet
    size_total = 0

    for row in recs:
        # Keep overwriting only while the stored value is still negative.
        url_start = row['index_min'] if url_start < 0 else url_start
        size_total += row['domain_size']

    summary = {
        'domain': key['domain'],
        'index_min': url_start,
        'domain_size': size_total,
        'url_count': 1,
    }
    yield summary


def main():
    """Run the stage-2a reduce and sort, then print both tables' row counts.

    Applies ``reducer`` to the source table with rows grouped by
    (domain, url), writes the result to the destination table, sorts that
    table by (domain, index_min), and finally prints the row count of the
    source table followed by the row count of the destination table.
    """
    # Named src/dst rather than input/output so the builtin input() is not shadowed.
    src_table = '//tmp/yuryz/domain_stat_2'
    dst_table = '//tmp/yuryz/domain_stat_3'

    yt.run_reduce(reducer, src_table, dst_table, reduce_by=['domain', 'url'])
    # Only one table path is passed, so the sort is presumably applied to
    # dst_table in place -- confirm against the YT wrapper documentation.
    yt.run_sort(dst_table, sort_by=['domain', 'index_min'])

    # Single-argument print(...) gives the same output on Python 2 and is
    # also valid syntax on Python 3.
    print(yt.row_count(src_table))
    print(yt.row_count(dst_table))


# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
