#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from reqansparse import parseRARecord
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

# URL fragments treated as forum/guestbook markers. They are joined into one
# case-insensitive alternation; getForums searches result URLs with it.
_FORUM_URL_MARKERS = (
    r'forum',
    r'board',
    r'phpbb',
    r'yabb',
    r'phorum',
    r'guestbook',
    r'gostevaja',
    r'gostevaya',
    r'ubb',
    r'fastbb',
    r'borda\.',
    r'borda/',
    r'profile\.php',
    r'viewprofile',
    r'member\.php\?',
    r'memberlist\.php',
    r'printview\.php',
    r'viewtopic',
    r'showpost',
    r'showtopic',
    r'showthread',
    r'showcomments',
    r'viewthread',
    r'printthread',
    r'thread\.php',
    r'showuser=',
    r'postlist\.',
    r'showflat\.php',
    r'showthreaded\.',
    r'gbook\.asp',
)
forumRE = re.compile('|'.join(_FORUM_URL_MARKERS), re.I)

def main():
    """Run the whole job: map daily logs, aggregate per host, write the result.

    Steps, all executed through ``MR``:

    1. ``getForums`` is mapped over the tables ``reqans_log/20140714`` ..
       ``reqans_log/20140716``; every run appends to one temporary table.
    2. ``getStats`` reduces that temporary table in place.
    3. ``Limiter(1000)`` reduces the temporary table into
       ``likhomanov/turkforums``.

    NOTE(review): ``mktmp`` and ``Limiter`` are not defined in this file
    (presumably they come from ``mymrutils``); ``Limiter(1000)`` presumably
    caps the output at 1000 records -- confirm.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    result_table = 'likhomanov/turkforums'
    with mktmp() as tmp:
        # range(14, 17) covers days 14, 15 and 16 of July 2014.
        for day in range(14, 17):
            daily_log = 'reqans_log/201407{:02}'.format(day)
            MR.runMap(getForums, srcTable=daily_log, dstTable=tmp.name, appendMode=True)
        MR.runReduce(getStats, srcTable=tmp.name, dstTable=tmp.name)
        MR.runReduce(Limiter(1000), srcTable=tmp.name, dstTable=result_table)


def getForums(rec):
    """Map step: emit per-host markers for results of one logged request.

    ``rec.value`` is parsed with ``parseRARecord``. Requests for which
    ``serpLang(serp_url)`` is not ``'com.tr'`` produce no output. For every
    result whose URL yields a non-empty host via ``getHost``:

    * ``Record(host, 'GOOD', '')`` is yielded when the result's
      ``snippets_type`` is truthy and contains ``'forum'``;
    * ``Record(host, 'SHOW', '')`` is yielded when the URL matches
      ``forumRE``.

    Both records may be yielded for the same result.
    """
    parsed = parseRARecord(rec.value)
    # NOTE(review): checkwww is defined outside this file; its effect on
    # `parsed` is not visible here.
    checkwww(parsed)
    serp_url = parsed['request'].get('serp_url')
    if serpLang(serp_url) != 'com.tr':
        return
    for result in parsed['results']:
        result_url = result.get('url')
        result_host = getHost(result_url)
        if not result_host:
            # Results without a usable host are ignored entirely.
            continue
        snippet_type = result.get('snippets_type')
        if snippet_type and 'forum' in snippet_type:
            yield Record(result_host, 'GOOD', '')
        if forumRE.search(result_url):
            yield Record(result_host, 'SHOW', '')

def getStats(host, recs):
    """Reduce step: count records for a host unless it has a 'GOOD' marker.

    Iterates ``recs`` in order. As soon as a record with subkey ``'GOOD'``
    is seen, the generator finishes without yielding anything and without
    reading the remaining records. Otherwise a single record is yielded with
    key ``'0'``, subkey ``str(10000000 - n)`` and value ``'n<TAB>host'``,
    where ``n`` is the number of records seen.

    NOTE(review): the inverted, unpadded count presumably serves as a
    descending sort key; if subkeys are compared as strings this would only
    order correctly while ``10000000 - n`` keeps seven digits -- confirm.
    """
    seen = 0
    # enumerate(..., 1) leaves `seen` equal to the number of records consumed.
    for seen, rec in enumerate(recs, 1):
        if rec.subkey == 'GOOD':
            return
    inverted_count = str(10000000 - seen)
    yield Record('0', inverted_count, '{}\t{}'.format(seen, host))

# Start the job only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

