#!/usr/bin/python
# -*- coding: utf-8 -*-

# Domain statistics calculation - stage 1

import yt.wrapper as yt
import re


# URL pattern, compiled once at import time instead of once per input record.
# Groups: 1 - optional scheme ('http://' or 'https://'); 2 - host part before
# the last dot; 3 - last host label (2-6 lowercase letters, optional trailing
# dot); 4 - last matched path segment.
_URL_RE = re.compile(
    r'(https?:\/\/)?([\w\.%-]+)\.([a-z]{2,6}\.?)(\/[\w \.%-]*)*\/?'
)


def mapper(rec):
    """Emit one row carrying the domain of the record's URL.

    Records whose 'url' does not match the URL pattern, or matches it
    without an explicit 'http://' / 'https://' prefix, produce no output.

    Args:
        rec: input row; 'url' is always read, '@row_index' is read only for
            rows that are emitted. 'url' is assumed to be a string.

    Yields:
        A dict with 'domain' (host part and last label joined by a dot),
        the original '@row_index' and 'url', and 'freq' set to 1.
    """
    url = rec['url']
    match = _URL_RE.match(url)
    # Only URLs with an explicit scheme are counted; bail out before doing
    # any string building for rows that will be dropped anyway.
    if match is None or match.group(1) is None:
        return

    yield {
        'domain': match.group(2) + '.' + match.group(3),
        '@row_index': rec['@row_index'],
        'url': url,
        'freq': 1,
    }


def reducer(key, recs):
    """Collapse one reduce group into a single summary row.

    Args:
        key: reduce key of the group; only 'domain' is read from it.
        recs: iterable of the group's rows; '@row_index' is read from them.

    Yields:
        Exactly one dict with:
          'domain'      - copied from the key;
          'index_min'   - '@row_index' taken from the group's records while
                          the stored value is still negative (normally the
                          first record; presumably the smallest index when
                          the group arrives sorted by '@row_index'), or -1
                          for an empty group;
          'domain_size' - number of records in the group;
          'url_count'   - always 1.
    """
    first_index = -1  # placeholder until a record supplies a row index
    total = 0
    for total, row in enumerate(recs, 1):
        if first_index < 0:
            first_index = row['@row_index']

    summary = {'domain': key['domain']}
    summary['index_min'] = first_index
    summary['domain_size'] = total
    summary['url_count'] = 1
    yield summary


def main():
    """Run the stage-1 map-reduce, sort its output and print row counts.

    Reads the source table, runs `mapper` and `reducer` over it in a single
    map-reduce operation, re-sorts the result table by
    ('domain', 'index_min') and prints the row count of the source table
    followed by the row count of the result table.
    """
    # Local names deliberately avoid `input`, which would shadow the builtin.
    src_table = '//home/catalogia/yuryz/queries_join_index'
    dst_table = '//tmp/yuryz/domain_stat_2a'

    # Rows are grouped by (domain, url); '@row_index' is part of the sort key
    # so that records inside each group are ordered by it.
    yt.run_map_reduce(
        mapper,
        reducer,
        src_table,
        dst_table,
        sort_by=['domain', 'url', '@row_index'],
        reduce_by=['domain', 'url'],
        spec={'job_count': 15000},
    )

    yt.run_sort(dst_table, sort_by=['domain', 'index_min'])

    # Parenthesised form prints the same value under Python 2 and is also
    # valid Python 3 syntax.
    print(yt.row_count(src_table))
    print(yt.row_count(dst_table))


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
