#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import re
from collections import defaultdict
from urlparse import parse_qs


def main():
    """Build the sorted per-URL count table from the daily ``spy_log`` tables.

    For every table ``spy_log/<d>``, with ``<d>`` produced by
    ``strdaterange((2014, 11, 25), (2014, 12, 25))``:

    1. ``getData`` is run as a combine step into a temporary table;
    2. the temporary table is reduced with ``Summarizer()`` and appended
       to the destination table.

    Once all sources are merged, the destination table is reduced onto
    itself with ``Summarizer()``, re-keyed in place by ``presort`` and
    finally passed to ``mrsort``.  Progress lines are written to the local
    file ``log3`` and flushed after each step.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    dst_table = 'likhomanov/sahispy'

    with open('log3', 'w') as log, mktmp() as tmp:

        def report(table, stage):
            # Flush right away so progress is visible while jobs are running.
            print >>log, table, stage
            log.flush()

        days = strdaterange((2014, 11, 25), (2014, 12, 25))
        sources = ['spy_log/{}'.format(day) for day in days]
        for src in sources:
            MR.runCombine(getData, srcTable=src, dstTable=tmp)
            report(src, 'consumed')
            MR.runReduce(Summarizer(), srcTable=tmp, dstTable=dst_table, appendMode=True)
            report(src, 'merged')

        # Appending leaves one batch of rows per source table in the
        # destination, so it is reduced once more over itself before sorting.
        MR.runReduce(Summarizer(), srcTable=dst_table, dstTable=dst_table)
        MR.runMap(presort, srcTable=dst_table, dstTable=dst_table)
        mrsort(dst_table)

def getData(recs):
    """Count reported page URLs on ``sahibinden.com`` in a stream of records.

    The fourth whitespace-separated token of ``rec.value`` is parsed as a
    query string (a leading ``/post?`` is stripped first) and the first
    value of its ``url`` parameter is taken as the reported page.  Pages
    for which ``getHost`` returns ``'sahibinden.com'`` are counted with
    their own query string removed; results longer than 2000 are ignored.

    Args:
        recs: iterable of records exposing a ``value`` string.

    Yields:
        ``Record(url, '', str(count))`` partial counts.  The counter is
        flushed whenever it holds more than 10000 distinct URLs, so one URL
        may be yielded several times; totals have to be summed downstream.

    Raises:
        Exception: with ``rec.value`` as its message when a record cannot
            be parsed for a reason other than ``KeyError``.
        NameError, AttributeError: re-raised unchanged, since they indicate
            a programming error rather than a malformed record.
    """
    counts = defaultdict(int)
    for rec in recs:
        # Only the parsing step is guarded.  The yields below are kept out
        # of the ``try`` so that closing the generator (GeneratorExit) or an
        # interrupt is never misreported as a broken record.
        try:
            report = rec.value.split()[3]
            if report.startswith('/post?'):
                report = report[6:]
            url = parse_qs(report)['url'][0]
            host = getHost(url)
        except KeyError:
            # Typically a report without a ``url`` parameter: skip it.
            continue
        except (NameError, AttributeError):
            raise
        except Exception:
            # Surface the offending input so the bad record can be found.
            raise Exception(rec.value)

        if host != 'sahibinden.com':
            continue

        # Bound memory use: emit partial counts once the table grows large.
        if len(counts) > 10000:
            for key, count in counts.items():
                yield Record(key, '', str(count))
            counts = defaultdict(int)

        query_start = url.find('?')
        if query_start >= 0:
            url = url[:query_start]
        if len(url) <= 2000:
            counts[url] += 1

    for key, count in counts.items():
        yield Record(key, '', str(count))

def presort(rec):
    """Re-key one count record so that a later sort can order rows by count.

    ``rec.value`` is read as an integer count.  A single record is yielded
    with the constant first field ``'0'``, the second field
    ``str(100000000 - count)`` (larger counts give numerically smaller
    values) and the third field ``'<rec.key>\\t<count>'``.

    NOTE(review): the second field is not zero-padded; if the sort compares
    it as a string, ordering is only numeric while all values have the same
    number of digits -- confirm against ``mrsort``.
    """
    count = int(rec.value)
    inverted = str(100000000 - count)
    payload = '{}\t{}'.format(rec.key, count)
    yield Record('0', inverted, payload)

# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

