#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *

# Absolute URLs, in both http:// and https:// form, for every entry of the
# local 'cata_urls' file (one entry per line).  Built at import time; getData
# tests result URLs for membership in this set.
full_urls = set()
with open('cata_urls') as url_file:  # 'with' closes the handle once the set is built
    for u in url_file:
        u = u.strip()
        if not u:
            # A blank line would otherwise become the bogus URL 'http:///'.
            continue
        if '/' not in u:
            # Entry contains no '/' at all: terminate it with one.
            u += '/'
        full_urls.update(['http://' + u, 'https://' + u])


def main():
    """Run the extraction and aggregation reduces over each source table.

    For every name produced by strdaterange((2015, 4, 27), (2015, 4, 29)),
    the table 'user_sessions/<name>' is reduced with getData and the output
    is appended to the destination table.  The destination table is then
    reduced with summarize and written back onto itself, so it is
    re-aggregated after each source table has been appended.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        # NOTE(review): presumably these local files are shipped to the jobs
        # (getData reads 'blockstat.dict', module import reads 'cata_urls') -
        # confirm against mapreducelib.
        files=['blockstat.dict', 'cata_urls'],
        verbose=True,
    )
    dst_table = 'likhomanov/yaca_https'
    first_day, last_day = (2015, 4, 27), (2015, 4, 29)
    src_tables = ['user_sessions/{}'.format(day) for day in strdaterange(first_day, last_day)]
    for src_table in src_tables:
        # Step 1: append this table's per-result flag rows to the destination.
        MR.runReduce(getData, srcTable=src_table, dstTable=dst_table, appendMode=True)
        # Step 2: collapse the destination table in place with summarize.
        MR.runReduce(summarize, srcTable=dst_table, dstTable=dst_table)

def getData(key, recs):
    """Reducer: emit one flag row per main web result whose URL is in full_urls.

    The records are parsed with libra.ParseSession using 'blockstat.dict'.
    Only requests that are 'TYandexWebRequest' with ServiceDomRegion in
    ru/ua/by/kz are considered, and within them only main results that are
    'TWebResult' and whose Url is a member of full_urls.

    Each yielded Record has the URL without its scheme prefix as key, an
    empty subkey, and a value of four tab-separated 0/1 flags with exactly
    one set, in this order:
        http + yaca snippet, https + yaca snippet,
        http + other snippet, https + other snippet.

    `key` is not used.  NameError, AttributeError and TypeError propagate;
    any other Exception silently ends processing of these records (rows
    already yielded are kept).
    """
    accepted_regions = ('ru', 'ua', 'by', 'kz')
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            if not (request.IsA('TYandexWebRequest')
                    and request.ServiceDomRegion in accepted_regions):
                continue
            for main_block in request.GetMainBlocks():
                result = main_block.GetMainResult()
                if not result.IsA('TWebResult'):
                    continue
                full_url = result.Url
                if full_url not in full_urls:
                    continue
                is_https = full_url.startswith('https://')
                # Drop the scheme: len('https://') == 8, len('http://') == 7.
                bare_url = full_url[8:] if is_https else full_url[7:]
                is_yaca = (result.SnippetType == 'yaca')
                # One-hot encode the (snippet kind, scheme) combination:
                # yaca rows use slots 0-1, others 2-3; https takes the odd slot.
                flags = ['0', '0', '0', '0']
                flags[(0 if is_yaca else 2) + (1 if is_https else 0)] = '1'
                yield Record(bare_url, '', '\t'.join(flags))
    except (NameError, AttributeError, TypeError):
        # These indicate a programming error, so let them surface.
        raise
    except Exception:
        # Anything else is deliberately ignored (best effort).
        pass

def summarize(key, recs):
    """Reducer: sum the four tab-separated integer columns of all records.

    Every record's value must hold exactly four tab-separated integers
    (the layout getData emits); otherwise ValueError is raised.  Yields a
    single Record with the given key, an empty subkey, and the four
    column totals joined by tabs.
    """
    totals = [0, 0, 0, 0]
    for rec in recs:
        # Unpacking into four names keeps the strict field-count check.
        first, second, third, fourth = [int(field) for field in rec.value.split('\t')]
        totals[0] += first
        totals[1] += second
        totals[2] += third
        totals[3] += fourth
    yield Record(key, '', '{}\t{}\t{}\t{}'.format(*totals))

# Start the jobs only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()

