#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from collections import Counter
from mymrutils import *

# Host prefixes that getUrls() matches URLs against, read one per line.
# The file is opened in a `with` block so the handle is closed right away
# instead of being left for the garbage collector.
with open('hosts_3229') as hosts_file:
    # Blank lines are dropped: an empty string is a prefix of every URL, so
    # it would be credited for every record that reaches it in the list.
    hosts = [entry for entry in (line.strip() for line in hosts_file) if entry]

def main():
    """Configure the MapReduce client and run the two job stages.

    First runs ``getUrls`` as a combine step over the source table into a
    temporary table obtained from ``mktmp()``, then runs ``Summarizer()`` as
    a reduce step from that temporary table into the result table.
    ``mktmp`` and ``Summarizer`` are not defined in this file; presumably
    they come from the ``mymrutils`` star import -- TODO confirm.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        files=['hosts_3229'],
        verbose=True,
    )

    source_table = 'urldats.1437644205'
    result_table = 'likhomanov/urls_3229'

    with mktmp() as scratch_table:
        MR.runCombine(getUrls, srcTable=source_table, dstTable=scratch_table)
        MR.runReduce(Summarizer(), srcTable=scratch_table, dstTable=result_table)

def getUrls(recs):
    """Count how many record URLs start with each entry of ``hosts``.

    For every record, ``rec.key`` is lower-cased, a leading ``http://`` or
    ``https://`` is removed, and then a leading ``www.`` is removed.  The
    result is compared against the module-level ``hosts`` list with a plain
    prefix test; only the first matching entry (in list order) is credited.

    Args:
        recs: iterable of records exposing a string ``key`` attribute.

    Yields:
        ``Record(host, '', str(count))`` for each entry of ``hosts`` that
        matched at least one record.
    """
    counts = Counter()
    for rec in recs:
        url = rec.key.lower()
        # Strip the scheme; the slice lengths are len('http://') and
        # len('https://') respectively.
        if url.startswith('http://'):
            url = url[7:]
        elif url.startswith('https://'):
            url = url[8:]
        if url.startswith('www.'):
            url = url[4:]
        for h in hosts:
            if url.startswith(h):
                counts[h] += 1
                # First match wins, so a URL is never counted twice.
                break
    # items() instead of the Python 2-only iteritems(): it yields the same
    # pairs and keeps this function working on Python 3 as well.
    for host, count in counts.items():
        yield Record(host, '', str(count))

# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

