# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime, timedelta, date
import libra
import urllib, re,random, urlparse


def Reduce(key, recs):
    """Emit one record for a user whose session contains a 'soo/...popup' block.

    Reduce callback: ``recs`` are handed to ``libra.ParseSession`` together
    with ``./blockstat.dict``, which therefore has to be available in the
    working directory of the job.

    Args:
        key: user id; only ids starting with 'y' are processed.
        recs: raw records of this key, passed as-is to ``libra.ParseSession``.

    Yields:
        At most one ``Record(uid, '', day)`` where ``day`` is the
        ``YYYY-MM-DD`` date (local time of the worker) of the first
        'TYandexWebRequest' with ``ServiceDomRegion == 'ru'`` that has a
        block whose path contains both 'soo/' and 'popup'.
    """
    uid = key
    # startswith() also rejects an empty key, where uid[0] would raise.
    if not uid.startswith('y'):
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: sessions that cannot be parsed are skipped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return

    for request in session:
        if not request.IsA('TYandexWebRequest'):
            continue
        if request.ServiceDomRegion != 'ru':
            continue

        for block in request.GetBSBlocks():
            path = block.Path
            if 'soo/' in path and 'popup' in path:
                # The date is only needed for the single emitted record,
                # so it is computed here rather than for every request.
                day = datetime.fromtimestamp(request.Timestamp).date().isoformat()
                yield Record(uid, '', day)
                # One record per user is enough; stop at the first match.
                return

def get_hits(rec):
    """Emit hits on Yandex / Google hosts for one log record.

    Map callback. ``rec.value`` is read as tab-separated ``name=value``
    fields and ``rec.subkey`` as a numeric Unix timestamp.

    Args:
        rec: input record; ``rec.key`` is the user id (only ids starting
            with 'y' are processed).

    Yields:
        At most one ``Record(uid, '', 'day\\thost\\turl')`` when the host
        contains 'yandex.ru', 'google.com' or 'google.ru'. ``day`` is the
        ``YYYY-MM-DD`` local date of the timestamp. If the URL cannot be
        parsed, ``host`` is '- ' followed by the error text and the match
        is done against the whole URL instead. A missing ``url`` field
        becomes the string 'None'.

    Raises:
        ValueError: if ``rec.subkey`` is not a number.
    """
    uid = rec.key
    # startswith() also rejects an empty key, where uid[0] would raise.
    if not uid.startswith('y'):
        return

    day = datetime.fromtimestamp(float(rec.subkey)).date().isoformat()

    fields = dict(
        item.split('=', 1) for item in rec.value.split('\t') if '=' in item
    )
    url = str(fields.get('url'))

    # Only the parse itself is guarded; the match and the yield are shared
    # by both outcomes.
    try:
        host = urlparse.urlparse(url).netloc
        haystack = host
    except Exception as e:
        host = '- ' + str(e)
        haystack = url

    if 'yandex.ru' in haystack or 'google.com' in haystack or 'google.ru' in haystack:
        yield Record(uid, '', day + '\t' + host + '\t' + url)
def aggr(key, recs):
    """Collapse all records of a key into one marker record.

    The incoming records are not read; exactly one record with an empty
    subkey and an empty value is produced per key.
    """
    empty = ''
    yield Record(key, empty, empty)

def aggr2(key, recs):
    """Count the records of a key and emit the total as a string value."""
    total = sum(1 for _ in recs)
    yield Record(key, '', str(total))

def join(key, recs):
    """Pass through the records of a key only if the key has a marker record.

    A marker is any record whose value is empty. Records with a non-empty
    value are buffered; they are re-emitted (key, subkey and value copied
    into fresh ``Record`` objects) only when at least one marker was seen
    for the same key. Markers themselves are never emitted.
    """
    buffered = []
    marker_seen = False
    for rec in recs:
        if len(rec.value) > 0:
            buffered.append(rec)
        else:
            marker_seen = True

    if not marker_seen:
        return

    for hit in buffered:
        yield Record(hit.key, hit.subkey, hit.value)

def map_filter(rec):
    """Keep search-related hits and re-key them by their first two fields.

    A record passes when its value contains 'yandex.ru/yandsearch',
    'yandex.ru/search/?' or 'google.'. The output key is the first two
    tab-separated fields of the value joined by a tab; the value is the
    unchanged input value.
    """
    line = rec.value
    markers = ('yandex.ru/yandsearch', 'yandex.ru/search/?', 'google.')
    if any(marker in line for marker in markers):
        fields = line.split('\t')
        yield Record(fields[0] + '\t' + fields[1], '', line)



def main(collect_hits=False):
    """Run the popup-value pipeline on the MapReduce cluster.

    Stages executed, in order:
      1. ``join``:       users_aggr + all_hits -> all_hits_right
      2. ``map_filter``: all_hits_right        -> all_hits_right_2
      3. ``aggr2``:      all_hits_right_2      -> all_hits_right_2_aggr

    The ``users_aggr`` table is never built here and has to exist already.
    The stages that would build it (``Reduce`` into ``users``, then
    ``aggr`` into ``users_aggr``) existed only as commented-out code.

    Args:
        collect_hits: when true, first run ``get_hits`` over the spy_log
            tables of the configured days and append the result to
            ``all_hits``. Defaults to False, which matches the previous
            behaviour: that stage was skipped by an unconditional
            ``continue`` and ``all_hits`` was expected to exist.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode=True,
    )

    users_aggr_table = 'ensuetina/POPUP_VALUE/users_aggr'
    all_hits_table = 'ensuetina/POPUP_VALUE/all_hits'
    joined_table = 'ensuetina/POPUP_VALUE/all_hits_right'
    filtered_table = 'ensuetina/POPUP_VALUE/all_hits_right_2'
    counts_table = 'ensuetina/POPUP_VALUE/all_hits_right_2_aggr'

    if collect_hits:
        # Suffixes of the 2015 spy_log tables to scan (presumably MMDD).
        # Only the last of several overwritten lists was ever effective.
        days = ['1211', '1212', '1213', '1214', '1215', '1216', '1217']
        for day in days:
            MapReduce.runMap(get_hits,
                             srcTable='user_sessions/2015' + day + '/spy_log',
                             dstTable=all_hits_table,
                             appendMode=True,
                             sortMode=True)

    MapReduce.runReduce(join,
                        srcTables=[users_aggr_table, all_hits_table],
                        dstTable=joined_table,
                        sortMode=True)

    MapReduce.runMap(map_filter,
                     srcTable=joined_table,
                     dstTable=filtered_table,
                     sortMode=True)

    MapReduce.runReduce(aggr2,
                        srcTable=filtered_table,
                        dstTable=counts_table,
                        sortMode=True)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
