#!/usr/bin/python

import sys
import datetime
import gc
import gzip
import os
import re
import urllib
import getpass
import resource
import itertools
import array
import cPickle
from copy import copy
from collections import defaultdict
import random

sys.path.append("/home/trencher/lib")
from mapreducelib import MapReduce, Record, TemporaryTable, SharedTransaction

# Set the parse types for the first stage by assigning `allowed_types` on the
# imported `profile` module.
# NOTE(review): `profile` is also the name of a standard-library module; this
# presumably resolves to a project-local profile.py -- confirm the import order.

import profile
profile.allowed_types = set(["www","www-touch","www-smart","www-tablet","image","video"]) #"xml","api","gateway"

# user agents

from browser_versions import intermediate_browser_versions_map, intermediate_browser_versions_reduce, get_versions


# setting up data structures for the second stage and later

def full_filter(user_agent, tld, ajax, visibility, ua_version, **unused):
    """Decide whether a key combination is kept for the 'full' output format.

    A combination is accepted only when all of the following hold:
      * visibility is 'visible';
      * tld is not 'com';
      * per-browser combinations (user_agent other than 'all_ua') are limited
        to the 'ru' and 'com.tr' tlds;
      * a per-browser user_agent does not contain 'Unknown'.

    ajax, ua_version and any extra keyword arguments are accepted but ignored.
    Returns True to keep the combination, False to drop it.
    """
    if visibility != 'visible' or tld == 'com':
        return False

    is_aggregate = user_agent == 'all_ua'

    # per-browser breakdowns exist only for ru and com.tr
    if not is_aggregate and tld not in ('ru', 'com.tr'):
        return False

    if is_aggregate or user_agent == 'Safari/Macintosh':
        return True

    return 'Unknown' not in user_agent

def version_filter(user_agent, tld, ajax, visibility, ua_version, **unused):
    """Decide whether a key combination is kept for the 'vers' output format.

    A combination is accepted only when all of the following hold:
      * visibility is 'visible';
      * tld is 'ru' (browser versions are reported for ru only);
      * a per-browser user_agent (anything but 'all_ua') does not contain
        'Unknown';
      * when ua_version is a concrete version (not 'version_all'), the
        user_agent is a concrete browser and the browser name in ua_version
        (the text before the first '/') equals the browser name in
        user_agent (the text before the first '/').

    ajax and any extra keyword arguments are accepted but ignored.
    Returns True to keep the combination, False to drop it.
    """
    if visibility != 'visible':
        return False

    # browser versions are reported for ru only
    if tld != 'ru':
        return False

    if user_agent not in ('all_ua', 'Safari/Macintosh') and 'Unknown' in user_agent:
        return False

    if ua_version == 'version_all':
        return True

    # a concrete version makes no sense for the aggregated user agent
    if user_agent == 'all_ua':
        return False

    # Mismatching browser/version pairs should never be generated, but drop
    # them anyway to keep the key space clean.  partition() is used instead of
    # tuple-unpacking split('/') so values with no slash or several slashes
    # are compared by their leading component rather than raising ValueError.
    version_browser = ua_version.partition('/')[0]
    agent_browser = user_agent.partition('/')[0]
    return version_browser == agent_browser


def short_filter(tld, ajax, https, visibility, **unused):
    """Decide whether a key combination is kept for the 'out' output format.

    Keeps only 'visible' combinations whose tld is not 'com'.  ajax, https and
    any extra keyword arguments are accepted but ignored (an earlier per-tld
    restriction on https/ajax is disabled).
    Returns True to keep the combination, False to drop it.
    """
    if visibility == 'visible' and tld != 'com':
        return True
    return False

def https_filter(region, https, visibility, user_agent, **unused):
    """Decide whether a key combination is kept for the 'https' output format.

    Keeps only 'visible' combinations; per-browser combinations (user_agent
    other than 'all_ua') are dropped when the user agent contains 'Unknown'.
    region, https and any extra keyword arguments are accepted but ignored.
    Returns True to keep the combination, False to drop it.
    """
    if visibility != 'visible':
        return False

    if user_agent in ('all_ua', 'Safari/Macintosh'):
        return True

    return 'Unknown' not in user_agent

def visibility_filter(visibility, **unused):
    """Keep a key combination only when its visibility is 'visible'.

    Any extra keyword arguments are accepted but ignored.
    """
    return visibility == 'visible'

def empty_filter(**_ignored):
    """Accept every key combination; all keyword arguments are ignored."""
    return True

# Stage names allowed by the formats whose 'stage_filter' points at this set.
common_stages = set([
    '<latency>', '<report_time>', '<report_time_>', '<total>',
    'bytes_sent', 'content_len', 'dns',
    'html', 'html_total',
    'tcp_handshake',
    'ttfb', 'ttfb_total',
    'ttfp', 'ttfp_total',
    'ttlb', 'ttlb_total', 'ttlb_ex_after_print', 'ttlb_ex_before_print',
    'after_ya_search',
])

# Service names used as 'service_filter' for the 'unireports-redir' run type.
common_services_r = set([
    'u.www', 'u.images', 'u.video',
    'u.morda.all', 'u.morda.plain', 'u.morda.widget', 'u.morda.theme', 'u.morda.touch',
    'u.images.morda', 'u.video.morda',
])

# Service names used as 'service_filter' for the 'unireports' run type:
# everything in common_services_r plus the services listed below.
common_services_rp = common_services_r | set([
    'www', 'images', 'video',
    'u.www.touch', 'www.touch',
    'u.www.tablet', 'www.tablet',
])

# Output format descriptions, keyed by format id.  Fields of each entry:
#   name           - format name
#   perc           - percentiles reported; main() emits one "pNN" column per entry
#   keys           - list of key groups (each a list of key names)
#   gran           - set of time granularities
#   fltr           - one of the *_filter functions; called with key values as
#                    keyword arguments, returns True to keep a combination
#   service_filter - set of allowed services, or None for no restriction
#                    (filled in by main() depending on the run type)
#   stage_filter   - set of allowed stages, or None for no restriction
outformats = {
    # short report: few percentiles, fine granularity, every stage
    'out': {
        'name': 'out',
        'perc': [25, 50, 75, 90, 95, 98, 99],
        'keys': [['tld', 'ajax', 'visibility']],
        'gran': set(['10m', '1hr']),
        'fltr': short_filter,
        'service_filter': None,
        'stage_filter': None,
    },
    # every percentile 0..99, split by user agent, common stages only
    'full': {
        'name': 'full',
        'perc': range(100),
        'keys': [['tld', 'user_agent', 'ajax', 'visibility']],
        'gran': set(['1hr', '1d']),
        'fltr': full_filter,
        'service_filter': None,
        'stage_filter': common_stages,
    },
    # like 'full' but additionally split by browser version
    'vers': {
        'name': 'vers',
        'perc': range(100),
        'keys': [['tld', 'user_agent', 'ajax', 'ua_version', 'visibility']],
        'gran': set(['1hr', '1d']),
        'fltr': version_filter,
        'service_filter': None,
        'stage_filter': common_stages,
    },
    # daily totals without percentiles
    'accounting': {
        'name': 'accounting',
        'perc': [],
        'keys': [['ajax', 'visibility']],
        'gran': set(['1d']),
        'fltr': empty_filter,
        'service_filter': None,
        'stage_filter': common_stages,
    },
    # breakdown by dc, per tld and per region
    'tech': {
        'name': 'tech',
        'perc': [25, 50, 75, 90, 95, 98, 99],
        'keys': [['tld', 'dc', 'visibility'], ['region', 'dc', 'visibility']],
        'gran': set(['10m', '1hr', '1d']),
        'fltr': visibility_filter,
        'service_filter': None,
        'stage_filter': common_stages,
    },
    # same settings as 'full' but without a stage restriction
    # NOTE(review): 'name' is 'full' here, not 'allstgs' -- confirm whether
    # this is intentional or a copy-paste leftover.
    'allstgs': {
        'name': 'full',
        'perc': range(100),
        'keys': [['tld', 'user_agent', 'ajax', 'visibility']],
        'gran': set(['1hr', '1d']),
        'fltr': full_filter,
        'service_filter': None,
        'stage_filter': None,
    },
    # http/https comparison, per region and per user agent
    'https': {
        'name': 'https',
        'perc': range(100),
        'keys': [['region', 'https', 'visibility'], ['user_agent', 'https', 'visibility']],
        'gran': set(['1hr', '1d']),
        'fltr': https_filter,
        'service_filter': None,
        'stage_filter': common_stages,
    },
}

# Output formats used instead of `outformats` for the 'unireports-spylog' run
# type: a single format with every percentile 0..99, keyed by tld and https,
# with no key, service or stage filtering.
spyformats = {
    'spy': {
        'name': 'spy',
        'perc': range(100),
        'keys': [['tld', 'https']],
        'gran': set(['1hr', '1d']),
        'fltr': lambda **unused: True,   # accept every key combination
        'service_filter': None,
        'stage_filter': None,
    },
}


from keys import gen_keys, gen_keys_decart


def tables_exists(tables):
    """Return True when every given table is present, False otherwise.

    `tables` is either a single table name (str) or an iterable of names.
    The placeholder name "/dev/zero" is always treated as present.  Any other
    table counts as present when requesting a one-record sample from it via
    MapReduce.getSample yields exactly one record.
    """
    names = [tables] if isinstance(tables, str) else tables

    for name in names:
        if name == "/dev/zero":
            continue
        sample = list(MapReduce.getSample(srcTable=name, count=1))
        if len(sample) != 1:
            return False

    return True

from join_stage import intermediate_reduce_combine
class IntermediateReduceCombine:
    """Callable that binds `tpl` to join_stage.intermediate_reduce_combine.

    An instance is handed to MapReduce.runCombinedReduce; calling it with a
    record iterable yields whatever intermediate_reduce_combine(tpl, recs)
    yields.
    """

    def __init__(self, tpl):
        # tpl: the value produced by gen_keys(), forwarded unchanged
        self.tpl = tpl

    def __call__(self, recs):
        joined = intermediate_reduce_combine(self.tpl, recs)
        for record in joined:
            yield record

from median_stage import reduce_operation
class ReduceOperation:
    """Callable that binds `tpl` to median_stage.reduce_operation.

    An instance is handed to MapReduce.runReduce; calling it with a key and
    its records yields whatever reduce_operation(tpl, key, recs) yields.
    """

    def __init__(self, tpl):
        # tpl: the value produced by gen_keys(), forwarded unchanged
        self.tpl = tpl

    def __call__(self, key, recs):
        reduced = reduce_operation(self.tpl, key, recs)
        for record in reduced:
            yield record


from parse_stage import combine_operation
class CombineOperation:
    """Callable that binds a date and a source index to parse_stage.combine_operation.

    An instance is handed to MapReduce.runCombine; calling it with a record
    iterable yields whatever combine_operation(recs, date, index) yields.
    """

    def __init__(self, date, index):
        # date: date string forwarded to combine_operation
        # index: position of the source table in main()'s `tables` list
        self.date = date
        self.index = index

    def __call__(self, recs):
        parsed = combine_operation(recs, self.date, self.index)
        for record in parsed:
            yield record

def main(date, user, type):
    """Run the whole report pipeline for one day and write the result files.

    Steps, in order:
      1. Pick the source tables, the version tag and the per-format service
         filters according to `type`.
      2. Return early if the output file of one of the formats already exists
         in the current directory.
      3. Run the MapReduce stages (parse, join, reduce) whose intermediate
         tables are missing.
      4. Dump every result table into "uniparse.<version>.<fmt>.<date>.txt"
         (written to a ".tmp" file first, then renamed).
      5. Drop all temporary tables.

    Args:
        date: day to process as a 'YYYYMMDD' string.
        user: suffix for the temporary table directory; when None the current
            login name (getpass.getuser()) is used.  It is prefixed with the
            version tag before use.
        type: run type - 'unireports', 'unireports-redir' or
            'unireports-spylog'.  Any other value fails an assertion.

    Side effects: mutates the module-level `outformats` (sets the
    'service_filter' entries, deletes 'tech' for 'unireports-redir', rebinds
    the name to `spyformats` for 'unireports-spylog'), reconfigures MapReduce
    defaults, creates and drops tables, writes files, prints progress.
    """

    global outformats

    if user is None:
        user = getpass.getuser()

    # Select the inputs per run type.  `tables` always has four slots;
    # "/dev/zero" marks an unused slot (it is skipped when parsing and is
    # always reported as existing by tables_exists()).
    if type=='unireports':
        version = "v7rp"
        yt_date = datetime.datetime.strptime(date, '%Y%m%d').strftime('%Y-%m-%d')
        #tables = [ "//home/logsng/personal_homes/inikifor/trencher/redir/"+yt_date, "//home/logsng/personal_homes/inikifor/trencher/profile/"+yt_date, "/dev/zero", "/dev/zero" ]
        tables = [ "//logs/redir-log/1d/"+yt_date, "//logs/profile-log/1d/"+yt_date, "/dev/zero", "/dev/zero" ]
        outformats['out']['service_filter'] = common_services_rp
        outformats['full']['service_filter'] = common_services_rp
        outformats['vers']['service_filter'] = common_services_rp
        outformats['allstgs']['service_filter'] = common_services_rp
        outformats['https']['service_filter'] = common_services_rp
        if 'tech' in outformats: outformats['tech']['service_filter'] = common_services_rp
    elif type=='unireports-redir':
        version = "v7r"
        yt_date = datetime.datetime.strptime(date, '%Y%m%d').strftime('%Y-%m-%d')
        tables = [ "//logs/redir-log/1d/"+yt_date, "/dev/zero", "/dev/zero", "/dev/zero" ]#, "redir_log/"+date, "antirobot_daemon/"+date ]
        outformats['out']['service_filter'] = common_services_r
        outformats['full']['service_filter'] = common_services_r
        outformats['vers']['service_filter'] = common_services_r
        outformats['allstgs']['service_filter'] = common_services_r
        outformats['https']['service_filter'] = common_services_r
        # the 'tech' format is not produced for the redir-only run
        del outformats['tech']
    elif type=='unireports-spylog':
        version = "v6s"
        outformats = spyformats
        tables = [ "/dev/zero", "/dev/zero", "spy_log/"+date, "/dev/zero" ]
    else:
        assert False, "wtf type is %s" % type

    user = version + "-"+ user

    # Skip the whole run when an output file for this date already exists.
    # NOTE(review): only the format returned first by outformats.keys() is
    # probed, so which file is checked depends on dict ordering -- confirm
    # this is intended.
    print >> sys.stderr, "checking: uniparse."+version+"."+outformats.keys()[0]+"."+date+".txt"
    if os.path.exists("uniparse."+version+"."+outformats.keys()[0]+"."+date+".txt"):
        print >> sys.stderr, "uniparse."+version+"."+outformats.keys()[0]+"."+date+".txt exists - exiting"
        return

    MapReduce.useDefaults(mrExec="mapreduce-yt",server='hahn.yt.yandex.net', usingSubkey=True, verbose=True, appendMode=False,  cpuIntensive=False, enableTableSwitching=True, optAttrs = {'user':'trencher','jobcount.multiplier':10}) #, 'threadcount':16 , 'net_table':'ipv6' sortMode=True

    #############################################################

    #cPickleTable=tmp_t.name
    #KeyTimesTable=tmp_t.name
    #ResultTables = [ result_t.name, result_t_full.name ]

    # Build a suffix that encodes every format and its key groups, e.g.
    # "full~tld-user_agent-ajax-visibility__out~tld-ajax-visibility"; it is
    # embedded in the result table names below.
    table_keys_suffix=[]
    for fmt in sorted(outformats.keys()):
        keys = outformats[fmt]['keys']
        fmtstr='.'.join('-'.join(k) for k in keys)
        table_keys_suffix += [fmt+"~"+fmtstr]
    table_keys_suffix = "__".join(table_keys_suffix)

    # Temporary table names.  ResultTables is ordered by sorted format id;
    # the dump loop at the end relies on that order.
    cPickleTables=["tmp/trencher/"+user+"/cpickle" + date,"tmp/trencher/"+user+"/cpickle" + date + "errors"]
    current_date = date[:4]+"-"+date[4:6]+"-"+date[6:]
    UserAgentsTable = "tmp/trencher/" + user + "/browsers"+ date
    KeyTimesTable=None #"tmp/trencher/"+user+"/date_keys" + date + "." + table_keys_suffix
    ResultTables=["tmp/trencher/"+user+"/result_"+ fmt + date + "." + table_keys_suffix for fmt in sorted(outformats.keys())]
    AccountingTables=["tmp/trencher/"+user+"/accounting"+date,"tmp/trencher/"+user+"/join_accounting"+date]

    # The easy way to understand what is going on is the following (the real process is highly optimized):
    #1 parse all logs -> produces records [reqid; other attributes]
    #    i.e. [reqid; type=user_session .ttfp=100 ]
    #         [reqid; type=profile .latency=10 ]

    #2 join records with the same reqid, produce records with joined attributes, emit them with [date-time-service; stage times].
    #    i.e. [reqid; type=user_session_profile .ttfp=100 .latency=10 ]

    #3 move the stage and other attributes to the key, leave only one time in the value, and reduce the times.
    #    i.e. [reqid-user_session_profile-date-time-.ttfp; 100]
    #         [reqid-user_session_profile-date-time-.latency; 10]

    #  because we need 1m, 10m and 1hr aggregation - emit each time 3 times with keys 1m, 10m, 1hr
    #    i.e. [reqid-user_session_profile-date-time1m-.ttfp; 100]
    #         [reqid-user_session_profile-date-time1m-.latency; 10]
    #         [reqid-user_session_profile-date-time10m-.ttfp; 100]
    #         [reqid-user_session_profile-date-time10m-.latency; 10]
    #         [reqid-user_session_profile-date-time1hr-.ttfp; 100]
    #         [reqid-user_session_profile-date-time1hr-.latency; 10]


    #  this gives [date-time-service-stage-otherattrs-aggr; stage times].
    #    i.e. [reqid-user_session_profile-date-time1m-.ttfp; 100 100 21 ]
    #         [reqid-user_session_profile-date-time1m-.latency; 10 32 43 ]
    #         [reqid-user_session_profile-date-time10m-.ttfp; 100 100 21 x10 times]
    #         [reqid-user_session_profile-date-time10m-.latency; 100 100 21 x10 times ]
    #         [reqid-user_session_profile-date-time1hr-.ttfp; 100 100 21 x60 times]
    #         [reqid-user_session_profile-date-time1hr-.latency; 100 100 21 x60 times0]

    #4 calculate medians and output

    # tpl is the value returned by gen_keys(); tpl[4] and tpl[5] are used
    # below.  It stays None until one of the branches computes it.
    tpl = None

    if not tables_exists(ResultTables):


        # If the browser-versions table already exists (or is not needed for
        # spylog), the key template and the KeyTimesTable name can be computed
        # right away, which lets the check below reuse an existing table.
        if type=='unireports-spylog' or tables_exists(UserAgentsTable):

            if type!='unireports-spylog':
                ua_versions = get_versions(UserAgentsTable)
            else:
                ua_versions = ['version_all']

            tpl=gen_keys(outformats, ua_versions)
            KeyTimesTable = "tmp/trencher/"+user+"/date_keys" + date + "." + tpl[5]


        if KeyTimesTable is None or not tables_exists(KeyTimesTable):
            if not tables_exists(cPickleTables):

                assert tables_exists(tables), "Tables not ready!!!!"

                # Parse stage: every real source table is parsed separately
                # (its slot index is passed to the parser) and appended to the
                # same output tables, which are sorted afterwards.
                outTables = cPickleTables + AccountingTables[0:1]
                opts = {'user':'trencher','jobcount.multiplier':10, 'threadcount':4}
                for index, table in enumerate(tables):
                    if table != "/dev/zero":
                        MapReduce.runCombine(CombineOperation(current_date, index), srcTables=[table], dstTables=outTables, sortMode=False, appendMode=True, optAttrs=opts, memoryLimit=6000)
                #MapReduce.optimizeTables(outTables, optAttrs=opts)
                MapReduce.sortTables(outTables, optAttrs=opts)
                # Build the browser-versions table from the parsed records.
                MapReduce.runCombine(intermediate_browser_versions_map, srcTable=cPickleTables[0], dstTable=UserAgentsTable+"_intm", sortMode=True)
                MapReduce.runReduce(intermediate_browser_versions_reduce, srcTable=UserAgentsTable+"_intm", dstTable=UserAgentsTable)

                if type!='unireports-spylog':
                    ua_versions = get_versions(UserAgentsTable)
                else:
                    ua_versions = ['version_all']
                tpl=gen_keys(outformats, ua_versions)
                KeyTimesTable = "tmp/trencher/"+user+"/date_keys" + date + "." + tpl[5]

                # 1st stage key is [date \t time] generated from reqid. This reduces the number of records.
                # The body is a pickled array of records. All internal joins are done in python.

            # Join stage: writes KeyTimesTable plus the join accounting table.
            # NOTE(review): when the cPickle tables already exist but the
            # browser-versions table does not, tpl and KeyTimesTable are still
            # None at this point -- confirm that state cannot occur.
            outTables = [KeyTimesTable] + AccountingTables[1:2]
            opts = {'user':'trencher', 'threadcount':4, 'cpu.intensive.mode':1, 'jobcount':5000}
            MapReduce.runCombinedReduce(IntermediateReduceCombine(tpl), srcTable=cPickleTables[0], dstTables=outTables, optAttrs=opts, sortMode=False, memoryLimit=8000)
            #MapReduce.optimizeTables(outTables, optAttrs=opts)
            MapReduce.sortTables(outTables, optAttrs=opts)
            #MapReduce.runCombinedReduce(IntermediateReduceCombine(tpl), srcTable=cPickleTables[0], dstTables=[KeyTimesTable]+AccountingTables[1:2], optAttrs = {'user':'trencher','jobcount.multiplier':20, 'threadcount':8, 'cpu.intensive.mode':1, 'jobcount':1000}, sortMode=True)


            # 2nd stage is just reducing all records for one minute.
            # All records for 1 minute fit in memory.
            # Because I want larger aggregated statistics this operation looks like
            # reduce-combine, but nothing else is done here except stat calculation
            # for debug output - so treat it just like an ordinary reduce for the same time.

            # The reduce produces output like in stage 3, but without the 1m, 10m, 1hr keys.
            # The key contains no time; the time is in the subkey.
            # All keys are minute sized. Larger aggregation will be done in python later.
            #    i.e. [reqid-user_session_profile-.ttfp-otherkeys; date-time; 100 100 21 ]
            #         [reqid-user_session_profile.latency-otherkeys; date-time; 10 32 43 ]

        #TODO: think about performance.

        # Question one: does our optimization make any improvement at all? Will mapreduce
        # suffer from more records?

        # Question two: is there more room for optimization here - we are producing a combinatorial
        # amount of detailed keys. If we need only 1hr integration, could it be more efficient?

        if tpl is None:
            if type!='unireports-spylog':
                ua_versions = get_versions(UserAgentsTable)
            else:
                ua_versions = ['version_all']

            tpl=gen_keys(outformats, ua_versions)
            KeyTimesTable = "tmp/trencher/"+user+"/date_keys" + date + "." + tpl[5]


        # Final reduce: one destination table per format (ResultTables order).
        # Note that `opts` is passed to sortTables only, not to runReduce.
        opts = {'user':'trencher', 'threadcount':4, 'cpu.intensive.mode':1, 'jobcount':5000}
        MapReduce.runReduce(ReduceOperation(tpl), srcTable=KeyTimesTable, dstTables=ResultTables, sortMode=False, memoryLimit=9000)
        #MapReduce.optimizeTables(ResultTables, optAttrs=opts)
        MapReduce.sortTables(ResultTables, optAttrs=opts)

        # 3rd stage is calculating percentiles, one single pass calculating 1m, 10m,
        # it calculates 2 types of results - out and full. full contains all percentiles from 0 to 99.
        # Also, very detailed otherkeys like "page+datacenter+useragent" are slow and
        # useless in 1min aggregation - so we output them only to the full log.

    # Result tables already existed: the template is still needed for the
    # output header.
    # NOTE(review): unlike the branches above this calls get_versions() even
    # for 'unireports-spylog' -- confirm this is intended.
    if tpl is None:
        ua_versions = get_versions(UserAgentsTable)
        tpl=gen_keys(outformats, ua_versions)

    keys_in_game = tpl[4]
    canonical_headers = ['datasource','service','stage'] + keys_in_game + ['date','time','gran','len','auc']

    # Dump each result table into its own text file.  `index` walks
    # ResultTables in step with the sorted format ids.
    index=0
    for fmt in sorted(outformats.keys()):
        current_date = None
        current_file = None
        # running sums of values[0], per datasource and per service, printed
        # after each file is finished
        types = defaultdict(int)
        services = defaultdict(int)
        table=ResultTables[index]
        print(table)
        index+=1
        headers = copy(canonical_headers)

        # one "pNN" column per configured percentile
        for i in outformats[fmt]['perc']:
            headers += ["p%02d" % i]

        for rec in MapReduce.getSample(srcTable=table):
            keys = rec.subkey.split("\t")
            rec_date = rec.key.replace("-","")
            values = rec.value.split("\t")
            if rec_date != date:
                continue # drop records whose date is not the requested one

            # The first kept record (or a date change) opens a new ".tmp"
            # file; the previous one, if any, is closed and renamed to its
            # final name.
            if current_date != rec_date:
                if current_file:
                    current_file.close()
                    os.rename("uniparse."+version+"."+fmt+"."+current_date+".txt.tmp", "uniparse."+version+"."+fmt+"."+current_date+".txt")
                    print ("uniparse.%s.%s.%s.txt" % (version,fmt,current_date))
                    print (types)
                    print (services)
                current_date = rec_date
                current_file=open("uniparse."+version+"."+fmt+"."+current_date+".txt.tmp","w")
                print >> current_file, "\t".join(headers)
                types = defaultdict(int)
                services = defaultdict(int)

            datasource = keys[0]
            service = keys[1]
            stage = keys[2]

            #if datasource == type_filter:
            # a filter of None means "no restriction"
            if outformats[fmt]['service_filter'] is None or service in outformats[fmt]['service_filter']:
                if outformats[fmt]['stage_filter'] is None or stage in outformats[fmt]['stage_filter']:
                        types[keys[0]] += int(values[0])
                        services[keys[1]] += int(values[0])
                        print >> current_file, '\t'.join(keys+values)

        # finish the last file of this format
        if current_file:
            current_file.close()
            os.rename("uniparse."+version+"."+fmt+"."+current_date+".txt.tmp", "uniparse."+version+"."+fmt+"."+current_date+".txt")
            print ("uniparse."+version+"."+fmt+"."+current_date+".txt")
            print (types)
            print (services)

        # no record with the requested date was seen for this format
        assert current_date is not None, "Results table is empty!"

    # Clean up all intermediate and result tables; this is reached only when
    # every format produced a file.
    tablesToDrop = cPickleTables + [UserAgentsTable,UserAgentsTable+"_intm"] + AccountingTables + ResultTables
    if KeyTimesTable is not None:
        tablesToDrop += [KeyTimesTable]
    print "Dropping these tables: ", tablesToDrop
    MapReduce.dropTables(dstTables=tablesToDrop)

    print ("Done!")


def test_main(date, user, type):
    """Run the pipeline stages against a local MapReduce with fixed tables.

    The arguments are accepted for signature compatibility with main() but
    are not used: source and temporary table names are hard-coded below.
    Each stage is run only when its destination table does not exist yet.
    Unlike main(), no result files are written and no tables are dropped.
    """

    #MapReduce.useDefaults(mrExec="mapreduce-dev",server='sakura.search.yandex.net', usingSubkey=True, verbose=True, enableTableSwitching=True) #, 'threadcount':16 , 'net_table':'ipv6' sortMode=True
    MapReduce.useDefaults(mrExec="mapreduce-dev", server='local', usingSubkey=True, verbose=True, enableTableSwitching=True)

    # bump to get a fresh set of temporary tables
    suffix = '9'

    source_tables = ["redir_log/20150313", "redir_log/20150313", "spy_log/20150313"]
    parsed_table = "tmp/cPickle20150218v" + suffix
    parsed_tables = [parsed_table, "tmp/cPickle20150218Errorsv" + suffix]
    key_times_table = "tmp/KeyTime20150218v" + suffix
    user_agents_table = "tmp/uagents" + suffix
    user_agents_intm = user_agents_table + "_intm"

    # parse stage
    if not tables_exists(parsed_table):
        MapReduce.runCombine(combine_operation, srcTables=source_tables, dstTables=parsed_tables, sortMode=True, optAttrs={'user': 'trencher'})

    # browser versions
    if not tables_exists(user_agents_table):
        MapReduce.runCombine(intermediate_browser_versions_map, srcTable=parsed_table, dstTable=user_agents_intm, sortMode=True)
        MapReduce.runReduce(intermediate_browser_versions_reduce, srcTable=user_agents_intm, dstTable=user_agents_table)

    key_template = gen_keys(outformats, get_versions(user_agents_table))

    # debug output of the generated keys
    #print >> sys.stderr, '\n'.join(gen_keys_decart(pageno=0, ar_verdict=None, user_agent='Chrome/Windows', server=None, tld='ru', https=True, turbo=True, ajax=False, visibility='visible', version='Chrome/40'))
    print >> sys.stderr, '\n'.join(gen_keys_decart(pageno=None, ar_verdict=None, user_agent=None, server=None, tld=None, https=None, turbo=None, ajax=None, visibility=None, version=None))

    # join stage
    if not tables_exists(key_times_table):
        MapReduce.runCombinedReduce(IntermediateReduceCombine(key_template), srcTable=parsed_table, dstTable=key_times_table, sortMode=True)

    # final reduce: one result table per format, in sorted format order
    result_tables = ["tmp/Results." + fmt_name + ".20150218" for fmt_name in sorted(outformats.keys())]

    if not tables_exists(result_tables):
        MapReduce.runReduce(ReduceOperation(key_template), srcTable=key_times_table, dstTables=result_tables, sortMode=True)



if __name__ == '__main__':

    # Command line: <date> [<user> [<type>]]
    #   date - required, passed through unchanged
    #   user - optional; the special value 'test' selects test_main()
    #   type - optional run type, passed through unchanged
    assert len(sys.argv)>1, "usage: parse.py YYYYMMDD [version-id]"

    date = sys.argv[1]
    user = sys.argv[2] if len(sys.argv) > 2 else None
    type = sys.argv[3] if len(sys.argv) > 3 else None

    if user != 'test':
        main(date, user, type)
    else:
        test_main(date, user, type)
