# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
import sys
from datetime import datetime
import libra
import urllib
import random, re


# Lazily-built cache filled by build_translation(): maps a Unicode code point
# to u' ' for every character that should be blanked out of a query.
TRANSLATION = None


def build_translation():
    """Return the translation table that replaces punctuation with spaces.

    The table maps code points (ints) to ``u' '`` and is meant to be passed
    to ``unicode.translate``.  It contains every code point whose Unicode
    general category starts with ``'P'`` (punctuation), plus the control
    whitespace and symbol characters listed explicitly below.

    The table is built on the first call and cached in the module-level
    ``TRANSLATION``; later calls return the same dict object.
    """
    global TRANSLATION
    if TRANSLATION is None:
        import sys
        import unicodedata

        # Python 2 names these builtins unichr/xrange; fall back to the
        # Python 3 spellings so the table can be built on either interpreter.
        try:
            to_char, code_points = unichr, xrange
        except NameError:
            to_char, code_points = chr, range

        table = {}
        # "+ 1" so that sys.maxunicode itself is inspected as well.
        for index in code_points(sys.maxunicode + 1):
            if unicodedata.category(to_char(index)).startswith('P'):
                table[index] = u' '

        # Characters that are not in a 'P*' category but must be blanked too.
        for char in u'\t\n\x0b\x0c\r$+<=>^`|~':
            table[ord(char)] = u' '

        # Publish only the fully built table.
        TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Return a normalized, UTF-8 encoded form of ``query``.

    Normalization steps, in order: replace every character listed in the
    ``build_translation()`` table with a space, lowercase, strip leading and
    trailing whitespace, collapse runs of two or more whitespace characters
    into a single space.

    :param query: the raw query, either a UTF-8 byte string or already
        decoded text.
    :returns: the normalized query encoded as UTF-8, or ``None`` when a byte
        string is not valid UTF-8.
    """
    # Decode byte strings only.  Calling .decode() on text that is already
    # unicode makes Python 2 encode it as ASCII first, which raises an
    # (uncaught) UnicodeEncodeError for any non-ASCII query.
    if isinstance(query, bytes):
        try:
            query = query.decode('utf8')
        except UnicodeDecodeError:
            return None

    query = query.translate(build_translation())
    query = query.lower()
    query = query.strip()
    query = re.sub(r'\s\s+', ' ', query)

    return query.encode('utf8')


# (marker, bare_suffix) rules: a URL is a book page when it contains the
# marker and does NOT end with the bare (listing/landing page) suffix.
_BOOK_URL_EXCLUDED_SUFFIXES = (
    ('http://www.labirint.ru/books/', '/books/'),
    ('http://read.ru/id/', 'id/'),
    ('http://www.litres.ru/', 'litres.ru/'),
    ('http://www.bookvoed.ru/book?id=', '?id='),
    ('http://mdk-arbat.ru/bookcard?book_id=', '?book_id='),
    ('http://www.books.ru/books/', '/books/'),
    ('http://www.chitai-gorod.ru/catalog/book/', '/book/'),
    ('http://www.sprinter.ru/shop/goods/', '/goods/'),
    # The original rule excluded only '/product', which can never match the
    # bare listing URL because the marker itself ends with '/product/'.
    # Both spellings are excluded now, matching the other shops' rules.
    ('http://www.biblion.ru/product/', ('/product', '/product/')),
    ('http://www.moscowbooks.ru/book.asp?id=', '?id='),
    ('http://www.booka.ru/books/', '/books/'),
    ('http://www.bookmg.ru/product/', '/product/'),
)

# (marker, required) rules: a URL is a book page when it contains both the
# marker and the required substring.
_BOOK_URL_REQUIRED_PARTS = (
    ('http://my-shop.ru/shop/books/', '.html'),
    ('http://www.libex.ru/detail/', '.html'),
)

# Blockstat path fragment looked for among the request's BS blocks.
_SNIPPET_CONTROL_PATH = 'web/item/preview/snippet_control'

# A click counts as "long" when its dwell time exceeds this value.
_LONG_CLICK_DWELL = 15


def _is_book_page(url):
    """Return True when ``url`` matches one of the book-shop URL rules."""
    for marker, bare_suffix in _BOOK_URL_EXCLUDED_SUFFIXES:
        # str.endswith accepts either a single suffix or a tuple of them.
        if marker in url and not url.endswith(bare_suffix):
            return True
    for marker, required in _BOOK_URL_REQUIRED_PARTS:
        if marker in url and required in url:
            return True
    return False


def _first_book_url(request):
    """Return the first TWebResult URL in the main blocks that is a book page, else None."""
    for block in request.GetMainBlocks():
        result = block.GetMainResult()
        if not result.IsA('TWebResult'):
            continue
        url = str(result.Url)
        if _is_book_page(url):
            return url
    return None


def Reduce(key, recs):
    """Emit click statistics for requests whose results contain a book page.

    For every ``TTouchYandexWebRequest`` with ``ServiceDomRegion == 'ru'``
    whose main blocks contain a web result pointing at a book-shop product
    page, yields ``Record(uid, '', value)`` where ``value`` is tab-separated:
    date (YYYY-MM-DD), the matched URL, 1/0 for whether a BS block path
    contains ``web/item/preview/snippet_control``, the number of clicks, and
    the number of clicks with dwell time above 15.

    :param key: the reduce key, used as the user id; keys that do not start
        with ``'y'`` (including empty keys) produce no output.
    :param recs: the records of the key, passed to ``libra.ParseSession``.
    """
    uid = key
    # Slice instead of index so that an empty key is skipped, not an error.
    if uid[:1] != 'y':
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is skipped.  Only
        # Exception is caught so KeyboardInterrupt/SystemExit still propagate.
        return

    for request in session:
        if not request.IsA("TTouchYandexWebRequest"):
            continue

        if request.ServiceDomRegion != 'ru':
            continue

        # Date part of the ISO timestamp (local time of the worker).
        day = datetime.fromtimestamp(request.Timestamp).date().isoformat()

        book_url = _first_book_url(request)
        if book_url is None:
            continue

        has_snippet_control = 0
        for block in request.GetBSBlocks():
            if _SNIPPET_CONTROL_PATH in block.Path:
                has_snippet_control = 1
                break

        clicks = 0
        long_clicks = 0
        for click in request.GetClicks():
            clicks += 1
            if int(click.DwellTime) > _LONG_CLICK_DWELL:
                long_clicks += 1

        value = '\t'.join([
            day,
            book_url,
            str(has_snippet_control),
            str(clicks),
            str(long_clicks),
        ])
        yield Record(uid, '', value)



def _find_snippet_control_pos(request):
    """Return str() of the first matching position var of a snippet-control block.

    Scans the request's BS blocks whose path contains
    ``web/item/preview/snippet_control`` and returns the first var that
    contains ``'pos'`` and one of ``'p0'``, ``'p1'``, ``'p2'``; returns None
    when there is no such var.
    """
    for block in request.GetBSBlocks():
        if 'web/item/preview/snippet_control' not in block.Path:
            continue
        for var in block.GetVars():
            # NOTE(review): the shape of a var (string vs. key/value pair) is
            # not visible here; the membership tests are kept exactly as-is.
            if 'pos' in var and ('p0' in var or 'p1' in var or 'p2' in var):
                return str(var)
    return None


def GrepSBS(key, recs):
    """Emit normalized queries of requests with a top-position snippet control.

    For every ``TTouchYandexWebRequest`` with ``ServiceDomRegion == 'ru'``
    that has a snippet-control BS block carrying a ``pos`` var with ``p0``,
    ``p1`` or ``p2``, yields ``Record(query, '', value)`` where ``query`` is
    the normalized query limited to 200 bytes and ``value`` is the
    tab-separated user id, the matched var and the date (YYYY-MM-DD).

    :param key: the reduce key, used as the user id; keys that do not start
        with ``'y'`` (including empty keys) produce no output.
    :param recs: the records of the key, passed to ``libra.ParseSession``.
    :raises: whatever ``libra.ParseSession`` raises; parse errors are not
        swallowed here.
    """
    uid = key
    # Slice instead of index so that an empty key is skipped, not an error.
    if uid[:1] != 'y':
        return

    # No try/except: the previous "except Exception as e: raise e" only
    # re-raised the error and discarded the original traceback on Python 2.
    session = libra.ParseSession(recs, './blockstat.dict')

    for request in session:
        if not request.IsA('TTouchYandexWebRequest'):
            continue

        if request.ServiceDomRegion != 'ru':
            continue

        query = normalize_query(request.Query)
        if query is None:
            # The query was not valid UTF-8, so there is no key to emit.
            continue

        day = datetime.fromtimestamp(request.Timestamp).date().isoformat()

        pos = _find_snippet_control_pos(request)
        if pos is None:
            continue

        # The query is UTF-8 bytes: cutting at 200 bytes may split a
        # multi-byte character, so drop a trailing partial sequence.
        record_key = query[:200].decode('utf8', 'ignore').encode('utf8')
        yield Record(record_key, '', uid + '\t' + pos + '\t' + day)


def main():
    """Run ``Reduce`` over the daily ``user_sessions`` tables of two periods.

    Configures the MapReduce defaults, then for each listed day of 2014 and
    afterwards each listed day of 2015 runs ``Reduce`` on
    ``user_sessions/<day>``, appending the output to the per-year destination
    table.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode=True,
    )

    days_2014 = [
        '20140825', '20140826', '20140827', '20140828', '20140829',
        '20140830', '20140831', '20140901', '20140902', '20140903',
        '20140904', '20140905', '20140906', '20140907',
    ]
    days_2015 = [
        '20150824', '20150825', '20150826', '20150827', '20150828',
        '20150829', '20150830', '20150831', '20150901', '20150902',
        '20150903', '20150904', '20150905', '20150906',
    ]

    # (destination table, source days); processed in this order.
    jobs = (
        ('ensuetina/RO/SBS/YoY_long_clicks/2014', days_2014),
        ('ensuetina/RO/SBS/YoY_long_clicks/2015', days_2015),
    )

    for destination, days in jobs:
        for day in days:
            MapReduce.runReduce(
                Reduce,
                srcTable='user_sessions/' + day,
                dstTable=destination,
                files=['/home/ensuetina/data/blockstat.dict'],
                appendMode=True,
                sortMode=True,
            )


if __name__ == '__main__':
    main()
