#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from reqansparse import parseRARecord
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

# Hosts of interest: every whitespace-stripped line of the local 'fdiff' file,
# minus a few hosts that are explicitly excluded below.
# The file is read inside a `with` block so the handle is closed as soon as
# the set is built instead of being left open until garbage collection.
with open('fdiff') as _fdiff:
    fhosts = set(line.strip() for line in _fdiff)
fhosts -= set(['youtube.com', 'facebook.com', 'tr-tr.facebook.com'])

def main():
    """Run the whole pipeline: map five daily logs, reduce, limit and sort.

    Steps, in order:
      1. `extract` is mapped over reqans_log/20140720 .. 20140724, appending
         its output to one temporary table.
      2. `cleanup` reduces that table, writing back into the temporary table
         (output 0) and into the per-host counts table (output 1).
      3. `Limiter(10)` reduces the temporary table into the final table
         (presumably capping records per key at 10 -- confirm in mymrutils).
      4. The counts table is sorted with `mrsort`.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True, files=['fdiff'])
    samples_table = 'likhomanov/turkforums'
    counts_table = 'likhomanov/turkforumscount'
    with mktmp() as scratch:
        # One map pass per day; appendMode accumulates all days in one table.
        for day in range(20, 25):
            daily_log = 'reqans_log/201407{:02}'.format(day)
            MR.runMap(extract, srcTable=daily_log, dstTable=scratch.name, appendMode=True)
        MR.runReduce(cleanup, srcTable=scratch.name, dstTables=[scratch.name, counts_table])
        MR.runReduce(Limiter(10), srcTable=scratch.name, dstTable=samples_table)
        mrsort(counts_table)

def extract(rec):
    """Map step: classify each search result whose host is in `fhosts`.

    `rec.value` is parsed with `parseRARecord`. For every entry of
    `ra['results']` whose host (via `getHost`) is in `fhosts`, at most one
    record keyed by that host is yielded:

      * subkey 'GOOD', empty value -- the snippet type contains 'forum';
      * subkey 'NOTFORUM', value "url<TAB>request<TAB>snippet type" -- the
        SERP language is 'com.tr' and the snippet type is missing/empty or
        one of 'generic', 'empty', 'trash_annotation';
      * subkey 'SHOW', empty value -- the SERP language is 'com.tr' and the
        snippet type is anything else.

    Results on other hosts, and non-forum results on non-'com.tr' SERPs,
    produce nothing.
    """
    ra = parseRARecord(rec.value)
    checkwww(ra)  # NOTE(review): helper from mymrutils; purpose not visible here
    lang = serpLang(ra)
    plain_types = ('generic', 'empty', 'trash_annotation')
    for result in ra['results']:
        link = result.get('url')
        hostname = getHost(link)
        if hostname in fhosts:
            snippet_type = result.get('snippets_type')
            if snippet_type and 'forum' in snippet_type:
                yield Record(hostname, 'GOOD', '')
            elif lang == 'com.tr':
                if snippet_type and snippet_type not in plain_types:
                    yield Record(hostname, 'SHOW', '')
                else:
                    details = '{}\t{}\t{}'.format(link, ra['request']['req'], snippet_type)
                    yield Record(hostname, 'NOTFORUM', details)

def cleanup(host, recs):
    """Reduce step: drop hosts that have any 'GOOD' record, summarize the rest.

    Args:
        host: the reduce key (a host name emitted by `extract`).
        recs: iterable of that host's records, read via `.subkey` and `.value`.

    Yields:
        Nothing at all if a record with subkey 'GOOD' is encountered.
        Otherwise, first one record for output table 1 with key '0', subkey
        `str(10000000 - n)` and value "n<TAB>host", where n is the number of
        records seen; then, for output table 0, one record per 'NOTFORUM'
        value, keyed by host with a random subkey.
    """
    total = 0
    notforum_values = []
    for record in recs:
        kind = record.subkey
        if kind == 'GOOD':
            # The host already has a forum snippet somewhere: emit nothing.
            return
        total += 1
        if kind == 'NOTFORUM':
            notforum_values.append(record.value)

    # Subkey shrinks as the count grows (presumably so that sorting puts the
    # most frequent hosts first -- confirm against mrsort's ordering).
    summary = '{}\t{}'.format(total, host)
    yield Record('0', str(10000000 - total), summary, tableIndex=1)

    # Random subkeys give the values an arbitrary order within the host.
    for payload in notforum_values:
        yield Record(host, str(random.random()), payload, tableIndex=0)

# Start the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

