#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from hashlib import md5

def _load_hashes(path):
    """Return the set of whitespace-stripped lines read from the file at `path`.

    The file is opened in a `with` block so the handle is closed
    deterministically instead of whenever the garbage collector gets to it.
    """
    with open(path) as hashes_file:
        return set(line.strip() for line in hashes_file)


# Set of hash strings that getData() checks md5 hex digests against.
# Loaded at import time from 'forumhashes' in the current working directory;
# main() lists the same file name in MR.useDefaults(files=[...]).
hashes = _load_hashes('forumhashes')

def main():
    """Configure the MapReduce client and run the job steps in order.

    Steps, all performed through the project MR helpers:
      1. copy table 'likhomanov/ftopic27' into a temporary table;
      2. run `getData` as a combine over 'reqans_log/20150302', appending
         its output to the same temporary table;
      3. run `process` as a reduce from the temporary table into
         'likhomanov/ftopic3';
      4. call `mrsort` on the result table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        # Same file that is read into the module-level `hashes` set.
        files=['forumhashes'],
    )
    result_table = 'likhomanov/ftopic3'
    with mktmp() as scratch_table:
        MR.copyTable(srcTable='likhomanov/ftopic27', dstTable=scratch_table)
        MR.runCombine(
            getData,
            srcTable='reqans_log/20150302',
            dstTable=scratch_table,
            appendMode=True,
        )
        MR.runReduce(process, srcTable=scratch_table, dstTable=result_table)
        mrsort(result_table)

def getData(recs):
    """Yield one Record per logged result whose hash is in `hashes`.

    For every input record, `parseReqans(rec.value)` is unpacked into a
    request and an iterable of results. Requests rejected by `iswww` are
    skipped, as are requests whose `serpLang` is neither 'ru' nor 'com.tr'
    ('com.tr' is rewritten to 'tr').

    For each result with a truthy 'snippets_type', the string
    "<lang>\\t<req['req']>\\t<res['url']>" is built and md5-hashed. If the hex
    digest is in the module-level `hashes` set, a Record is yielded with:
      key    -- the hex digest
      subkey -- '1' if the snippet type equals 'forum_topic' (lang 'ru') or
                'forum_forums' (lang 'tr'), otherwise '2'
      value  -- the tab-separated string
    """
    for record in recs:
        request, results = parseReqans(record.value)
        if not iswww(request):
            continue

        language = serpLang(request)
        if language == 'com.tr':
            language = 'tr'
        elif language != 'ru':
            continue

        # Snippet type that earns subkey '1' for this language.
        expected_type = 'forum_topic' if language == 'ru' else 'forum_forums'

        for result in results:
            snippet_type = result.get('snippets_type')
            if not snippet_type:
                continue

            line = '{}\t{}\t{}'.format(language, request['req'], result['url'])
            digest = md5(line).hexdigest()
            if digest not in hashes:
                continue

            subkey = '1' if snippet_type == expected_type else '2'
            yield Record(digest, subkey, line)

def process(h, recs):
    """Reduce all records of key `h` into a single summary Record.

    The first record of the group is consumed only for its value and is NOT
    counted. Every remaining record increments one of two counters: records
    with subkey '1', and all others.

    NOTE(review): presumably the first record is the row copied from the base
    table in main() and sorts ahead of the '1'/'2' subkeys -- confirm the
    ordering guarantee of the reduce input.

    Yields a single Record(h, '', "<count of '1'>\\t<count of others>\\t<first value>").
    """
    first_value = next(recs).value
    matched = 0
    other = 0
    for record in recs:
        if record.subkey != '1':
            other += 1
            continue
        matched += 1
    summary = '{}\t{}\t{}'.format(matched, other, first_value)
    yield Record(h, '', summary)

# Run the job only when executed as a script, not when the module is imported.
if __name__ == '__main__':
    main()

