#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from mymrutils import *
import random
import re
from collections import defaultdict
from hashlib import sha1 as md5


def main():
    """Build the 'likhomanov/top_spylog' table from daily spy_log tables.

    For every source table produced from strdaterange((2015, 2, 12),
    (2015, 7, 1)) the loop:
      1. runs getData as a combine step into a temporary table,
      2. sorts the temporary table,
      3. reduces the temporary table together with the current destination
         table through summarize, writing back into the destination,
      4. appends the processed source name to the local 'log_sl' file.

    After all sources are processed, the destination is re-keyed with
    presort and reduced with Limiter(2000000).
    NOTE(review): Limiter, mktmp, mrsort and strdaterange are not defined in
    this file (presumably they come from `mymrutils`); whether the date range
    includes its end date cannot be told from here.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    dst_table = 'likhomanov/top_spylog'
    # Third argument of open() is the buffering mode: 1 means line-buffered,
    # so each processed source shows up in the log right away.
    with open('log_sl', 'a', 1) as log, mktmp() as tmp:
        days = strdaterange((2015, 2, 12), (2015, 7, 1))
        sources = ['user_sessions/{}/spy_log'.format(day) for day in days]
        for src in sources:
            MR.runCombine(getData, srcTable=src, dstTable=tmp)
            mrsort(tmp)
            # The destination is both an input and the output, so counts
            # from earlier sources are merged with the new ones.
            MR.runReduce(summarize, srcTables=[tmp, dst_table], dstTable=dst_table,
                         withoutSort=True, sortedOutput=True)
            print >>log, src
            # Defragment only for sources whose date part ends in '2' or '7'.
            needs_defrag = '2/s' in src or '7/s' in src
            if needs_defrag:
                MR.defragTable(dst_table)
        MR.runMap(presort, srcTable=dst_table, dstTable=dst_table)
        MR.runReduce(Limiter(2000000), srcTable=dst_table, dstTable=dst_table)

def getData(recs):
    """Turn raw log records into one (platform, url-hash) count record each.

    Each record's value is split on tabs. Records lacking a field that starts
    with 'yasoft=' or a field that starts with 'url=' are skipped; when a
    prefix occurs more than once, the first occurrence wins.

    Yields Record(platform + '\\t' + hexdigest(url), '', '1\\t' + url), where
    platform is 'touch' for the two mobile browser ids and 'desk' otherwise.
    Note that the name `md5` is bound to hashlib.sha1 by this file's imports,
    so the digest in the key is a SHA-1 digest.
    """
    touch_clients = ('android.yabrowser', 'apad.yabrowser')
    for rec in recs:
        fields = rec.value.split('\t')

        # None marks "field absent"; an empty payload after the prefix is
        # still a valid (empty) value and must not be skipped.
        client = next((f[len('yasoft='):] for f in fields if f.startswith('yasoft=')), None)
        if client is None:
            continue

        address = next((f[len('url='):] for f in fields if f.startswith('url=')), None)
        if address is None:
            continue

        device = 'touch' if client in touch_clients else 'desk'
        yield Record(device + '\t' + md5(address).hexdigest(), '', '1\t' + address)

def summarize(key, recs):
    """Sum the counts of all records sharing `key`.

    Every record value has the form '<count>\\t<url>'. The counts are added
    up, and a single Record(key, '', '<total>\\t<url>') is yielded only when
    the total is at least 3; the url is taken from the last record seen.
    Groups whose total is below 3 produce no output.
    """
    total = 0
    for rec in recs:
        count, url = rec.value.split('\t', 1)
        total += int(count)

    # Below the threshold nothing is emitted (this also covers an empty
    # group, where `url` was never assigned).
    if total < 3:
        return
    yield Record(key, '', str(total) + '\t' + url)

def presort(rec):
    """Re-key a summarized record by platform with an inverted count.

    The incoming key must be exactly '<platform>\\t<hash>' and the value
    '<count>\\t<url>'; anything else raises ValueError from the unpacking.
    Yields Record(platform, str(10000000000000 - count), original value).
    """
    device, _digest = rec.key.split('\t')
    hits, _url = rec.value.split('\t', 1)
    # Larger counts map to smaller numbers.
    # NOTE(review): presumably so an ascending sort on this field puts the
    # most frequent urls first — confirm against how Limiter consumes it.
    inverted = 10000000000000 - int(hits)
    yield Record(device, str(inverted), rec.value)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

