#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
import libra
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random


def main():
    """Run the three-stage MapReduce pipeline that builds the sample table.

    Stages, all executed through ``MR.runReduce``:

    1. ``getData`` over the seven daily tables ``user_sessions/20140901`` ..
       ``user_sessions/20140907``, appended into one scratch table.
    2. ``combine`` over the scratch table, written back into the same table.
    3. ``Limiter(5000)`` from the scratch table into ``likhomanov/sl_sample``.

    The scratch table comes from the ``mktmp()`` context manager; both
    ``mktmp`` and ``Limiter`` are presumably provided by ``mymrutils``
    (star import) -- TODO confirm their exact semantics there.
    """
    MR.useDefaults(username='sitelinks',
                   server='cedar00.search.yandex.net:8013',
                   verbose=True,
                   files=['blockstat.dict'])
    result_table = 'likhomanov/sl_sample'
    with mktmp() as scratch:
        # Days 1..7 inclusive; every daily job appends to the same scratch table.
        for day in range(1, 8):
            daily_table = 'user_sessions/201409{:02}'.format(day)
            MR.runReduce(getData, srcTable=daily_table, dstTable=scratch, appendMode=True)
        # Source and destination are the same table on purpose (as in the
        # original pipeline): the combined rows replace the raw ones.
        MR.runReduce(combine, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Limiter(5000), srcTable=scratch, dstTable=result_table)


def getData(key, recs):
    """Reducer: emit one ``(query, result url)`` record per qualifying result.

    The records of one key are parsed with ``libra.ParseSessionWithFat`` using
    the ``blockstat.dict`` file.  A result is emitted only when all of the
    following hold:

    * the request is a ``TYandexWebRequest`` with ``ServiceDomRegion == 'ru'``;
    * the block's main result is a ``TWebResult``;
    * the result has at least one sitelink;
    * no sitelink URL contains ``wikipedia.org`` or ``copy.yandex``.

    Each emitted ``Record`` has the hex MD5 of the query as key, an empty
    subkey, and ``'<query><TAB><result url>'`` as value.

    ``NameError``, ``AttributeError`` and ``TypeError`` are re-raised so that
    programming mistakes stay visible; any other exception silently ends
    processing of the current key (records already yielded for it are kept).
    """
    excluded_markers = ('wikipedia.org', 'copy.yandex')
    try:
        for request in libra.ParseSessionWithFat(recs, 'blockstat.dict'):
            # The type check must come first: the region is only read on
            # requests that passed it.
            if not request.IsA('TYandexWebRequest') or request.ServiceDomRegion != 'ru':
                continue
            for main_block in request.GetMainBlocks():
                result = main_block.GetMainResult()
                if not result.IsA('TWebResult'):
                    continue
                link_urls = [link.Url for link in result.GetSiteLinks()]
                if not link_urls:
                    continue
                # Marker is the outer loop, so all links are checked against
                # 'wikipedia.org' before any is checked against 'copy.yandex'.
                if any(marker in url for marker in excluded_markers for url in link_urls):
                    continue
                query_digest = md5(request.Query).hexdigest()
                payload = '{}\t{}'.format(request.Query, result.Url)
                yield Record(query_digest, '', payload)
    except (NameError, AttributeError, TypeError):
        raise
    except Exception:
        # Deliberate best-effort: give up on this key without failing the job.
        pass

def combine(key, recs):
    """Reducer: collapse all records of one key into a single output record.

    Every input record's ``value`` is expected to be
    ``'<query><TAB><url>'``.  The query text is taken from the first record;
    the URLs of all records are de-duplicated through a set.

    Yields at most one ``Record`` with:

    * key ``'0'`` (the same constant for every query),
    * subkey ``str(random.random())`` -- presumably so that rows end up in
      random order for later sampling; TODO confirm against the consumer,
    * value ``'<query><TAB><url1><TAB><url2>...'`` (URL order is the set's
      iteration order, i.e. unspecified).

    Nothing is yielded when ``recs`` is empty.

    Raises:
        ValueError: if a record value contains no tab at all.
    """
    query = None
    urls = set()
    for rec in recs:
        # The URL is always the last field, so split from the right: a tab
        # inside the query text then stays part of the query instead of
        # making the two-target unpacking fail with ValueError.
        rec_query, url = rec.value.rsplit('\t', 1)
        if query is None:
            query = rec_query
        urls.add(url)
    if query is None:
        # Empty group: finish quietly rather than letting StopIteration from
        # next() escape the generator (a RuntimeError under PEP 479).
        return
    yield Record('0', str(random.random()), query + '\t' + '\t'.join(urls))


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

