#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from collections import defaultdict
from hashlib import md5

# Maximum number of distinct queries getNeeded emits per reduce key.
NEEDED = 10000

# Block-path prefixes tested with str.startswith() in getData to detect
# "addrwiz" requests.  Kept as a tuple because startswith() accepts a tuple
# of prefixes but not a list.  Each prefix is listed exactly once.
# NOTE(review): '/snippet/adresa/phone_number_company' has no trailing slash,
# unlike its '/snippet/companies/...' sibling -- confirm this is intentional.
wizpref = (
    '/snippet/adresa/company/',
    '/snippet/adresa/map/',
    '/snippet/companies/phone_number_list/',
    '/snippet/companies/list/',
    '/snippet/companies/phone_number_company/',
    '/snippet/adresa/phone_number_company',
    '/snippet/companies/company/',
    '/snippet/companies/map/',
    '/snippet/maps/',
    '/wiz/maps/',
    '/snippet/traffic/',
)

def main():
    """Run the two-stage MapReduce job.

    Stage one runs getData over every 'user_sessions/<date>' table produced
    by strdaterange and appends the output to a single temporary table.
    Stage two runs getNeeded over that temporary table and writes the result
    to the destination table.
    """
    MR.useDefaults(
        username='snippets',
        server='sakura00.search.yandex.net',
        files=['blockstat.dict'],
        verbose=True
    )
    DST = 'likhomanov/addrs_3198'
    with mktmp() as tmp:
        # NOTE(review): whether the end date is inclusive depends on
        # strdaterange (imported from mymrutils) -- confirm.
        days = strdaterange((2015, 7, 30), (2015, 8, 1))
        session_tables = ['user_sessions/{}'.format(day) for day in days]
        for table in session_tables:
            # appendMode accumulates every day's output in the same tmp table.
            MR.runReduce(getData, srcTable=table, dstTable=tmp, appendMode=True)
        MR.runReduce(getNeeded, srcTable=tmp, dstTable=DST)

def getData(key, recs):
    """Reducer: emit queries whose result blocks match address prefixes.

    Parses ``recs`` with libra.ParseSession and, for each desktop or touch
    Yandex web request from a supported service region, yields one Record
    per matching category:

    * key     -- '<category>\\t<platform>\\t<lang>', where category is
                 'addrsnip' or 'addrwiz', platform is 'desk' or 'touch'
                 and lang is 'tr' or 'kubr';
    * subkey  -- str(random.random());
    * value   -- '<UserRegion>\\t<Query>'.

    ``key`` (the incoming reduce key) is not used.

    NameError, AttributeError and TypeError propagate to the caller; any
    other Exception silently ends the generator, keeping whatever was
    already yielded.
    """
    try:
        for req in libra.ParseSession(recs, 'blockstat.dict'):
            # Only desktop and touch web requests are of interest.
            is_desktop = req.IsA('TYandexWebRequest')
            if not is_desktop and not req.IsA('TTouchYandexWebRequest'):
                continue
            platform = 'desk' if is_desktop else 'touch'

            dom_region = req.ServiceDomRegion
            if dom_region == 'tr':
                language = 'tr'
            elif dom_region in ('ru', 'ua', 'by', 'kz'):
                language = 'kubr'
            else:
                continue

            block_paths = [block.Path for block in req.GetBSBlocks()]
            # str.startswith accepts either a single prefix or a tuple of them.
            categories = (('addrsnip', '/snippet/address'), ('addrwiz', wizpref))
            for category, prefix in categories:
                if not any(path.startswith(prefix) for path in block_paths):
                    continue
                out_key = '\t'.join((category, platform, language))
                shuffle_key = str(random.random())
                payload = str(req.UserRegion) + '\t' + req.Query
                yield Record(out_key, shuffle_key, payload)
    except (NameError, AttributeError, TypeError):
        # Programming errors must surface instead of being swallowed below.
        raise
    except Exception:
        # Best effort: any other failure just stops processing this input.
        pass

def getNeeded(key, recs):
    """Reducer: emit up to NEEDED distinct queries for ``key``.

    Each record value is expected to be '<region>\\t<query>'.  It is split on
    the first tab only, so the query part may itself contain tabs.  The
    first occurrence of every distinct query is yielded as
    Record(key, '', '<query>\\t<region>'); later repeats are skipped.
    Iteration stops once NEEDED distinct queries have been emitted.

    Raises ValueError if a record value contains no tab.
    """
    seen_queries = set()
    for rec in recs:
        region, query = rec.value.split('\t', 1)
        if query in seen_queries:
            continue
        seen_queries.add(query)
        yield Record(key, '', query + '\t' + region)
        if len(seen_queries) == NEEDED:
            break

# Run the job only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()

