from mapreducelib import Record, TemporaryTable, MapReduce as MR
from itertools import islice
import re
from collections import defaultdict
from datetime import date, timedelta

class Summarizer(object):
    """Callable that totals the records of one key, optionally per subkey.

    Calling an instance with ``(key, recs)`` produces ``Record`` objects
    whose value field is the total rendered with ``str``.

    Attributes:
        limit: minimum total required for a result to be emitted; ``None``
            disables the threshold.
        useSubkey: when true, totals are kept separately for every distinct
            ``rec.subkey``; otherwise a single total is kept for the key.
        readValues: when true, each record contributes ``int(rec.value)``;
            otherwise each record contributes 1 (a plain record count).
    """

    def __init__(self, limit=None, useSubkey=False, readValues=True):
        self.limit = limit
        self.useSubkey = useSubkey
        self.readValues = readValues

    def __call__(self, key, recs):
        """Yield the totals computed from ``recs`` for ``key``.

        Args:
            key: key passed through unchanged to every emitted ``Record``.
            recs: iterable of records exposing ``value`` (and ``subkey``
                when ``useSubkey`` is set).

        Yields:
            ``Record(key, subkey, str(total))`` per subkey when ``useSubkey``
            is set, else a single ``Record(key, '', str(total))``. Totals
            below ``limit`` are dropped.

        Raises:
            ValueError: if ``readValues`` is set and a value is not an
                integer literal accepted by ``int``.
        """
        if self.useSubkey:
            totals = defaultdict(int)
            for rec in recs:
                totals[rec.subkey] += int(rec.value) if self.readValues else 1
            # items() instead of iteritems(): the latter exists only on
            # Python 2 dicts, while items() works on both 2 and 3.
            for sk, n in totals.items():
                if self.limit is None or n >= self.limit:
                    yield Record(key, sk, str(n))
        else:
            n = 0
            for rec in recs:
                n += int(rec.value) if self.readValues else 1
            if self.limit is None or n >= self.limit:
                yield Record(key, '', str(n))

class Limiter(object):
    """Callable that passes through at most ``limit`` records.

    With ``useSubkey`` set the cap applies separately to each distinct
    ``rec.subkey``; otherwise it applies to the record stream as a whole.
    """

    def __init__(self, limit, useSubkey=False):
        self.useSubkey = useSubkey
        self.limit = limit

    def __call__(self, key, recs):
        """Yield records from ``recs`` in order, dropping those over the cap.

        ``key`` is accepted but not used.
        """
        if not self.useSubkey:
            # Global cap: just take the leading slice of the stream.
            for item in islice(recs, self.limit):
                yield item
            return

        # Per-subkey cap: remember how many records each subkey has emitted.
        emitted = defaultdict(int)
        for item in recs:
            so_far = emitted[item.subkey]
            if so_far < self.limit:
                emitted[item.subkey] = so_far + 1
                yield item

# Matches the start of an http(s) URL on a "yandex.<suffix>/" host, with an
# optional "www." prefix. Group 2 captures the slash- and comma-free text
# between "yandex." and the following "/".
serpRE = re.compile(r'https?://(www\.)?yandex\.([^/,]*)/')
# Splits an http(s) URL. Group 2 is everything after the scheme (and an
# optional "www.") up to the first "/"; group 3 is the remainder starting at
# that "/", or None when there is none.
hostRE = re.compile(r'https?://(www\.)?([^/]*)(/.*)?$')

def serpLang(url):
    """Return the part of a Yandex URL captured after ``yandex.``.

    ``url`` may be a string or a dict carrying the string under the
    ``'serp_url'`` key. Returns ``None`` when the URL is missing, empty, or
    does not match ``serpRE``.
    """
    target = url.get('serp_url') if isinstance(url, dict) else url
    if not target:
        return None
    found = serpRE.match(target)
    return found.group(2) if found else None

def getHost(url):
    """Return the lower-cased host part of ``url`` as captured by ``hostRE``.

    Returns ``None`` for a missing/empty URL or one ``hostRE`` does not match.
    """
    parsed = hostRE.match(url) if url else None
    if parsed is None:
        return None
    host = parsed.group(2)
    return host.lower()

def getInnerPath(url):
    """Return the path part of ``url`` (group 3 of ``hostRE``).

    Returns ``None`` for a missing/empty URL, for one ``hostRE`` does not
    match, or when the URL has nothing after the host.
    """
    if url:
        parsed = hostRE.match(url)
        if parsed:
            return parsed.group(3)
    return None

def mrsort(table):
    """Call ``MR.sortTable`` with ``table`` as both source and destination.

    Returns ``None``; the result of ``MR.sortTable`` is discarded.
    """
    MR.sortTable(srcTable=table, dstTable=table)

def mktmp(project='likhomanov'):
    """Return a new ``TemporaryTable`` for ``project``.

    ``project`` defaults to ``'likhomanov'``.
    """
    return TemporaryTable(project=project)

def checkwww(req):
    """Raise ``StopIteration`` unless ``req`` is a non-Yandex 'www' request.

    A request passes when ``req.get('stype')`` is ``'www'`` and its
    ``'is_yandex'`` entry (default ``'0'``) is not ``'1'``; in that case the
    function returns ``None``.

    NOTE(review): presumably meant to be called from inside a generator to
    end it early. Under PEP 479 (Python 3.7+) a StopIteration escaping into
    a generator body is turned into RuntimeError -- confirm the target
    interpreter.
    """
    wrong_type = req.get('stype') != 'www'
    if wrong_type or req.get('is_yandex', '0') == '1':
        raise StopIteration

def iswww(req):
    """Return True when ``req`` is a 'www' request not flagged as Yandex.

    That is, ``req.get('stype')`` is ``'www'`` and the ``'is_yandex'`` entry
    (default ``'0'``) is not ``'1'``.
    """
    rejected = req.get('stype') != 'www' or req.get('is_yandex', '0') == '1'
    return not rejected

def cat(rec):
    """Yield ``rec`` unchanged (a one-item generator)."""
    # Presumably used as a pass-through step where a generator function is
    # expected -- verify against callers.
    yield rec

def strdaterange(d1, d2):
    """Yield ``'YYYYMMDD'`` strings for each day from ``d1`` up to ``d2``.

    ``d1`` is included and ``d2`` is excluded. Either bound may be a date
    object or a tuple of arguments for ``date(...)``. Nothing is yielded
    when ``d2`` is not after ``d1``.
    """
    start = date(*d1) if isinstance(d1, tuple) else d1
    stop = date(*d2) if isinstance(d2, tuple) else d2
    day_count = (stop - start).days
    for offset in range(day_count):
        current = start + timedelta(days=offset)
        yield current.strftime('%Y%m%d')

