#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re


# Hosts whose presence on a results page is counted by getData.
hosts = [
    'otvet.mail.ru',
    'kakprosto.ru',
    'znanija.com',
    'bolshoyvopros.ru',
    'elhow.ru',
    'pravoved.ru',
    'irc.lv',
]


def main():
    """Run the whole counting pipeline and write the totals to the result table.

    For every daily ``user_sessions/<date>`` table in the configured date
    range, ``getData`` output is appended to a temporary table, which is then
    reduced in place with ``Summarizer()`` so it is re-aggregated after each
    day.  Once all days are processed, ``cleankey`` rewrites the keys of the
    temporary table in place and a final ``Summarizer()`` pass writes the
    result table.

    NOTE(review): ``mktmp``, ``strdaterange`` and ``Summarizer`` are not
    defined in this file (presumably they come from ``mymrutils``); whether
    the date range includes its end date cannot be told from here.
    """
    MR.useDefaults(
        username='snippets',
        server='sakura00.search.yandex.net',
        files=['blockstat.dict'],
        verbose=True
    )
    result_table = 'likhomanov/questions_3240'
    with mktmp() as scratch:
        session_tables = [
            'user_sessions/{}'.format(day)
            for day in strdaterange((2015, 7, 20), (2015, 7, 27))
        ]
        for session_table in session_tables:
            # Append this day's raw counters, then collapse the scratch table.
            MR.runReduce(getData, srcTable=session_table, dstTable=scratch, appendMode=True)
            MR.runReduce(Summarizer(), srcTable=scratch, dstTable=scratch)
        # Rewrite the keys with cleankey, then aggregate again into the result.
        MR.runCombine(cleankey, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Summarizer(), srcTable=scratch, dstTable=result_table)

def getData(key, recs):
    """Reducer: emit per-request counter records for the tracked hosts.

    The session in ``recs`` is parsed with ``libra.ParseSession``.  Only
    requests that are ``TYandexWebRequest`` (platform ``desk``) or
    ``TTouchYandexWebRequest`` (platform ``touch``) with
    ``ServiceDomRegion == 'ru'`` are considered.  For each such request that
    has at least one ``TWebResult`` among its main results, the following
    records are yielded, all with an empty subkey and value ``'1'``:

    * ``<n>\\t<platform>\\tSERP`` -- once per request;
    * ``<n>\\t<platform>\\t<host>`` -- once per distinct tracked host shown;
    * ``<n>\\t<platform>\\tMANY`` -- if two or more distinct tracked hosts
      were shown;
    * ``<n>\\t<platform>\\t<h1>\\t<h2>`` -- for every pair of distinct tracked
      hosts with ``h1 < h2``.

    ``<n>`` is a random integer in ``[0, 256)`` drawn once per request
    (presumably to spread the few distinct keys over more reducers; ``cleankey``
    drops the first tab-separated key field later).  ``kakprosto.ru`` results
    count only when the lower-cased URL contains ``kakprosto.ru/otvet/``.

    ``key`` is not used.  ``NameError``, ``AttributeError``, ``TypeError`` and
    ``ValueError`` propagate; any other ``Exception`` silently ends processing
    of this session (records already yielded are kept).
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            if request.IsA('TYandexWebRequest'):
                platform = 'desk'
            elif request.IsA('TTouchYandexWebRequest'):
                platform = 'touch'
            else:
                continue
            if request.ServiceDomRegion != 'ru':
                continue

            prefix = '%s\t%s\t' % (str(random.randrange(256)), platform)
            has_web_result = False
            matched_hosts = set()
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if not result.IsA('TWebResult'):
                    continue
                has_web_result = True
                host = getHost(result.Url)
                # Keep tracked hosts; kakprosto.ru only for its /otvet/ section.
                if host in hosts and (
                        host != 'kakprosto.ru'
                        or 'kakprosto.ru/otvet/' in result.Url.lower()):
                    matched_hosts.add(host)

            if not has_web_result:
                continue
            yield Record(prefix + 'SERP', '', '1')
            for host in matched_hosts:
                yield Record(prefix + host, '', '1')
            if len(matched_hosts) <= 1:
                continue
            yield Record(prefix + 'MANY', '', '1')
            # Each unordered pair is emitted once, smaller host name first.
            for first in matched_hosts:
                for second in matched_hosts:
                    if first < second:
                        yield Record(prefix + first + '\t' + second, '', '1')
    except (NameError, AttributeError, TypeError, ValueError):
        # These are re-raised rather than swallowed by the handler below.
        raise
    except Exception:
        # Best effort: any other failure just ends this session's output.
        pass

def cleankey(recs):
    """Yield each record with the first tab-separated field of its key removed.

    The subkey of every emitted record is the empty string and the value is
    carried over unchanged.  A key that contains no tab raises ``ValueError``
    from the tuple unpacking.
    """
    for record in recs:
        _dropped, remainder = record.key.split('\t', 1)
        yield Record(remainder, '', record.value)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

