#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libkernelgeo as geo
from logparse import parseReqans
from mymrutils import *
from collections import defaultdict
import random

def main():
    """Run the address-snippet collection job and store the result table.

    Every daily ``reqans_log/<date>`` table produced by ``strdaterange`` for
    the hard-coded date pair is passed through one shared ``AddrGetter`` in a
    combine step, appending to a temporary table. That table is then reduced
    with ``Limiter(10000)`` into ``likhomanov/addr_2796``.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        # Files named here are the ones AddrGetter opens by relative path.
        files=['geodata3.bin', 'addr_urls'],
    )
    result_table = 'likhomanov/addr_2796'
    combiner = AddrGetter()
    with mktmp() as scratch:
        day_tables = [
            'reqans_log/{}'.format(day)
            for day in strdaterange((2015, 2, 3), (2015, 2, 7))
        ]
        for day_table in day_tables:
            # appendMode=True so each day adds to the scratch table.
            MR.runCombine(combiner, srcTable=day_table, dstTable=scratch, appendMode=True)
        # NOTE(review): Limiter presumably caps the output at 10000 records -- confirm in mymrutils.
        MR.runReduce(Limiter(10000), srcTable=scratch, dstTable=result_table)


class AddrGetter(object):
    """Callable for ``MR.runCombine`` that picks address-URL hits out of reqans logs.

    For every input record the log line is parsed into a request and its
    results. One output record is produced for each result whose URL is listed
    in ``addr_urls`` with a region that equals, or contains, the request region.

    Both resources (``geodata3.bin`` and ``addr_urls``) are opened lazily on
    the first call rather than in ``__init__``.
    NOTE(review): presumably because the instance is created on the client
    while the files only exist where the job runs -- confirm.
    """

    def __init__(self):
        # Both are filled in on the first __call__.
        self.geobase = None  # geo.TRegionsDB instance
        self.addr = None     # url -> set of int region ids

    @staticmethod
    def _load_addr(path):
        """Read ``url<TAB>region_id`` lines from *path* into a url -> set(int) map.

        URLs that do not start with ``http://`` or ``https://`` get ``http://``
        prepended. Blank lines are skipped; any other malformed line still
        raises ``ValueError`` so that broken input is not silently ignored.
        """
        addr = defaultdict(set)
        # ``with`` guarantees the file is closed even if a line fails to parse.
        with open(path) as src:
            for line in src:
                line = line.strip()
                if not line:
                    continue
                url, lr = line.split('\t')
                if not url.startswith(('https://', 'http://')):
                    url = 'http://' + url
                addr[url].add(int(lr))
        return addr

    def __call__(self, recs):
        """Yield one ``Record`` per result in *recs* that hits a known address URL.

        A request is kept only if ``iswww(req)`` is true, ``serpLang(req)`` is
        one of ru/ua/by/kz and it has a non-empty ``reg`` field. A result is
        kept only if it has a ``snippets_type`` key and its ``url`` is present
        in the address map with a region that equals the request region or for
        which ``IsSubRegionOf(request_region, region)`` is true.

        Each emitted record has key ``'0'``, a random float string as subkey
        and the value ``"<req>\\t<region>\\t<url>"``.
        """
        if self.geobase is None:
            self.geobase = geo.TRegionsDB('geodata3.bin')
        if self.addr is None:
            # Assign only after a complete load, so a failed load is retried
            # instead of leaving a partially filled map behind.
            self.addr = self._load_addr('addr_urls')
        for rec in recs:
            req, ress = parseReqans(rec.value)
            if not iswww(req):
                continue
            if serpLang(req) not in ('ru', 'ua', 'by', 'kz'):
                continue
            lr = req.get('reg')
            if not lr:
                continue
            lr = int(lr)
            for res in ress:
                if 'snippets_type' not in res:
                    continue
                url = res['url']
                if url not in self.addr:
                    continue
                # At most one record per result, no matter how many of the
                # URL's regions match.
                if any(lr == reg or self.geobase.IsSubRegionOf(lr, reg)
                       for reg in self.addr[url]):
                    # NOTE(review): constant key + random subkey presumably lets
                    # the later reduce take a random sample -- confirm.
                    yield Record('0', str(random.random()), '{}\t{}\t{}'.format(req['req'], lr, url))

# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

