#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
from collections import defaultdict
import re

# Whitelist of video URLs: one URL per line, surrounding whitespace removed,
# blank lines skipped.  main() also lists this file in
# MR.useDefaults(files=...) -- presumably so that it is available wherever the
# job code is imported (confirm against mapreducelib).
# The context manager closes the file deterministically instead of relying on
# garbage collection of the anonymous file object.
with open('videourls') as urls_file:
    videourls = set(line.strip() for line in urls_file if line.strip())

def main():
    """Run the video-snippet statistics pipeline and store it in the result table.

    Stages, all chained through a single temporary table:
      1. combine ``getvideo`` over every daily ``reqans_log/<date>`` table
         produced by ``strdaterange((2014, 9, 1), (2014, 9, 8))``, appending
         the output to the temporary table;
      2. reduce with ``Summarizer()`` (not defined in this file -- presumably
         aggregates values per key; confirm in mymrutils);
      3. map with ``transform``;
      4. reduce with ``getstats``;
      5. reduce with ``Summarizer()`` again, writing to the result table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['videourls'],
    )
    result_table = 'likhomanov/video_obj'
    with mktmp() as scratch:
        days = strdaterange((2014, 9, 1), (2014, 9, 8))
        daily_logs = ['reqans_log/{}'.format(day) for day in days]
        for log_table in daily_logs:
            MR.runCombine(getvideo, srcTable=log_table, dstTable=scratch, appendMode=True)

        # Every later stage reads the scratch table and overwrites it in place,
        # except the last one, which writes the final result.
        MR.runReduce(Summarizer(), srcTable=scratch, dstTable=scratch)
        MR.runMap(transform, srcTable=scratch, dstTable=scratch)
        MR.runReduce(getstats, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Summarizer(), srcTable=scratch, dstTable=result_table)


def getvideo(recs):
    """Combine step: emit per-URL hit records and per-bucket totals.

    Each input record's ``value`` is parsed with ``parseReqans`` into a
    request and its results.  A request is kept only when its ``stype`` is
    ``'www'``, its ``is_yandex`` flag is not ``'1'`` and ``serpLang`` returns
    one of ``ru``, ``ua``, ``by``, ``kz`` or ``com.tr``.

    For every result that has a non-empty ``snippets_type`` and whose ``url``
    is in ``videourls``, a ``Record('<bucket> <url>', '', '1')`` is yielded
    for each bucket the request belongs to:

      * the region buckets (``tr`` for ``com.tr``; ``ru`` and ``kubr`` for
        ``ru``; ``kubr`` for ``ua``/``by``/``kz``);
      * additionally ``<region bucket><snippets_type>`` when the type is
        ``video``, ``video2`` or ``video_desc``.

    After the input is exhausted, one ``Record('<bucket> ALL', '', count)``
    is yielded per bucket with the number of hits counted for it.
    """
    region_buckets = {
        'com.tr': ('tr',),
        'ru': ('ru', 'kubr'),
        'ua': ('kubr',),
        'by': ('kubr',),
        'kz': ('kubr',),
    }
    video_types = ('video', 'video2', 'video_desc')
    totals = defaultdict(int)

    for rec in recs:
        req, results = parseReqans(rec.value)
        if req.get('stype') != 'www' or req.get('is_yandex', '0') == '1':
            continue
        regions = region_buckets.get(serpLang(req))
        if regions is None:
            continue

        for res in results:
            snippet_type = res.get('snippets_type')
            if not snippet_type or res.get('url') not in videourls:
                continue
            url = res['url']

            # Plain region buckets first, then the type-specific ones, so the
            # emission order matches "all regions, then all region+type".
            buckets = list(regions)
            if snippet_type in video_types:
                buckets.extend(region + snippet_type for region in regions)

            for bucket in buckets:
                totals[bucket + ' ALL'] += 1
                yield Record(bucket + ' ' + url, '', '1')

    for bucket_key, count in totals.iteritems():
        yield Record(bucket_key, '', str(count))


def transform(rec):
    """Map step: re-key per-URL rows by URL; pass counter rows through.

    The incoming key has the form ``'<bucket> <url>'`` or ``'<bucket> ALL'``.
    Counter rows (second part equal to ``'ALL'``) are yielded unchanged.
    For every other row a ``Record(url, bucket, rec.value)`` is yielded, i.e.
    the bucket is moved out of the key into the second ``Record`` argument.

    Raises:
        ValueError: if the key contains no whitespace separator at all.
    """
    # Split on the first whitespace run only: the bucket never contains
    # whitespace, but a URL may, and an unbounded split() would then produce
    # more than two parts and fail on unpacking.
    lang, url = rec.key.split(None, 1)
    if url == 'ALL':
        yield rec
    else:
        yield Record(url, lang, rec.value)

def getstats(key, recs):
    """Reduce step: emit one marker record per bucket a URL was seen in.

    Args:
        key: either a counter key of the form ``'<bucket> ALL'`` or a URL.
        recs: iterator over the records grouped under ``key``.

    Yields:
        For a counter key: the first record of the group, unchanged.
        For a URL key: ``Record(bucket, '', '1')`` for every distinct
        ``subkey`` in the group, plus ``'<region>some'`` for each of ``ru``,
        ``tr`` and ``kubr`` that has at least one video-typed bucket.
    """
    # Counter rows are keyed '<bucket> ALL'.  Match the trailing token only:
    # a plain substring test would also fire on URL keys that merely contain
    # the text 'ALL' and silently drop their statistics.
    if key.endswith(' ALL'):
        # next() with a default: a bare StopIteration escaping a generator is
        # an error on interpreters that implement PEP 479.
        first = next(recs, None)
        if first is not None:
            yield first
        return

    langs = set(r.subkey for r in recs)
    for l in ('ru', 'tr', 'kubr'):
        # The typed buckets built in getvideo are '<region>' + snippet type,
        # and every accepted type starts with 'v' ('video', 'video2',
        # 'video_desc'), so '<region>v' identifies them.
        if any(l + 'v' in ll for ll in langs):
            langs.add(l + 'some')
    for l in langs:
        yield Record(l, '', '1')

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

