
import sys
import time
from collections import defaultdict
import array
import resource
import cPickle
import gc
from keys import gen_keys_decart, init_globals

from mapreducelib import MapReduce, Record

from regexps_and_stuff import *


def join_records(reqid, recs, key_to_index, index_to_key, accounting):
    """Merge all raw records that were grouped under one request key.

    Parameters:
        reqid: value stored under 'reqid' in the result.
        recs: list of records; every record is a list addressed by column
            index (see key_to_index / index_to_key).
        key_to_index: mapping field name -> column index; must contain
            'type', 'ts' and 'service'.
        index_to_key: sequence mapping column index -> field name.
        accounting: mutable mapping of counters; this function only
            decrements entries (keys are tab separated strings).

    Returns:
        A dict with the merged fields ('service' is a set in the spylog
        shortcut and whatever update_service() returns otherwise, 'type' is a
        string), or None when the whole group is dropped.

    Raises:
        AssertionError on an unknown record type, on conflicting values for
        the same field, or when the bookkeeping in recs_left does not match
        the number of records that were kept.
    """

    # Number of records of this group that are still "alive"; decremented for
    # every record that is thrown away as a duplicate.
    recs_left=len(recs)

    k_type=key_to_index["type"]
    k_ts=key_to_index["ts"]
    k_service=key_to_index["service"]

    joined_record = { 'reqid': reqid, 'service': set([]), 'type': set([]) }

    # Shortcut: a group that consists of exactly one spylog record is copied
    # field by field, no joining is needed.
    if len(recs)==1 and recs[0][k_type]=='spylog':
        #just one record
        spy_log_record = recs[0]

        for k in range(len(spy_log_record)):
            v=spy_log_record[k]
            # k is rebound from column index to field name
            k=index_to_key[k]
            if v is None:
                pass
            else:
                if k == 'service':
                    joined_record['service'] |= set([ v ])
                elif k == 'type':
                    joined_record['type'] = 'spylog'
                else:
                    joined_record[k] = v

        # NOTE(review): the counter prefix is "spy_log" while the record type
        # is "spylog" - confirm that the different spelling is intended.
        accounting["spy_log\t<output>\t"]-=1
        recs_left-=1
        return joined_record


    session_record = None
    profile_record = None
    antirobot_record = None
    redir_record = None

    # Pick at most one record per type (session, profile, antirobot, redir);
    # everything else in the group is counted as a duplicate.

    for r in recs:
        if r[k_type]=='user_session':
            if session_record is None:
                session_record=r
            elif session_record!=r:
                #TODO: print diagnostics here
                # Different content: keep the record with the smaller ts.
                if session_record[k_ts] > r[k_ts]:
                    session_record=r
                    accounting["user_session\t<dup>\t"+r[k_service]]-=1
                    recs_left-=1
                elif session_record[k_ts] == r[k_ts]:
                    # Two different sessions with the same ts: the whole group
                    # is dropped and all remaining records are written off.
                    #print >> sys.stderr, "dup session same ts %s\n\n%s" % (session_record,"\n".join([str(r) for r in recs]))
                    accounting["user_session\t<dup_same_ts>\t"+r[k_service]]-=recs_left
                    recs_left=0
                    return
                elif session_record[k_ts] < r[k_ts]:
                    accounting["user_session\t<dup>\t"+r[k_service]]-=1
                    recs_left-=1
                else:
                    assert False
            elif session_record==r:
                # Exact duplicate of the record we already hold.
                accounting["user_session\t<dup>\t"+r[k_service]]-=1
                recs_left-=1


        elif r[k_type]=='antirobot':
            # The first antirobot record wins; any later one (equal or not) is
            # counted as a duplicate.
            if antirobot_record is None:
                antirobot_record=r
            elif antirobot_record!=r:
                accounting["antirobot\t<dup>\t"]-=1
                recs_left-=1
                #print >> sys.stderr, "Dup antirobot %s\n\n%s" % (antirobot_record,"\n".join([str(r) for r in recs]))
            elif antirobot_record==r:
                accounting["antirobot\t<dup>\t"]-=1
                recs_left-=1


        elif r[k_type]=='redir':
            # Same policy as for user_session records above.
            if redir_record is None:
                redir_record=r
            elif redir_record!=r:
                #TODO: print diagnostics here
                if redir_record[k_ts] > r[k_ts]:
                    redir_record=r
                    accounting["redir\t<dup>\t"+r[k_service]]-=1
                    recs_left-=1
                elif redir_record[k_ts] == r[k_ts]:
                    #print >> sys.stderr, "dup redir same ts %s\n\n%s" % (redir_record,"\n".join([str(r) for r in recs]))
                    accounting["redir\t<dup_same_ts>\t"+r[k_service]]-=recs_left
                    recs_left=0
                    return
                elif redir_record[k_ts] < r[k_ts]:
                    accounting["redir\t<dup>\t"+r[k_service]]-=1
                    recs_left-=1
                else:
                    assert False
            elif redir_record==r:
                accounting["redir\t<dup>\t"+r[k_service]]-=1
                recs_left-=1


        elif r[k_type]=='profile':
            #assert profile_record is None or profile_record == r, "Duplicate profile record %s\n %s\n" % (profile_record,r)
            if profile_record is not None and profile_record != r:
                # Two different profile records: drop the whole group.
                #print >> sys.stderr, "Duplicate profile records:\n%s\n%s\n" % (profile_record,r)
                accounting["profile\t<dup_squashed>\t"+r[k_service]]-=recs_left
                recs_left=0
                return #
            elif profile_record == r:
                accounting["profile\t<dup_squashed>\t"+r[k_service]]-=1
                recs_left-=1
            else:
                profile_record = r


        else:
            assert False, "Strange record type:" + r[k_type]


    assert not (session_record is None and profile_record is None and antirobot_record is None and redir_record is None), "all records was removed: %s" % recs


    # When both a redir and a session record survived but their timestamps
    # differ, the one with the larger ts is discarded.
    # NOTE(review): recs_left is not decremented here, so records_count below
    # is one smaller than recs_left and the assertion after it fails for such
    # groups - confirm whether failing here is the intended behaviour.
    if redir_record is not None and session_record is not None:
        if redir_record[k_ts] < session_record[k_ts]:
            print >> sys.stderr, "Bad session record was choosen! %s\n\n%s" % (session_record,"\n".join([str(r) for r in recs]))
            session_record=None
        elif redir_record[k_ts] > session_record[k_ts]:
            print >> sys.stderr, "Bad redir record was choosen! %s\n\n%s" % (redir_record,"\n".join([str(r) for r in recs]))
            redir_record=None

    # An antirobot record with nothing to attach it to is dropped.
    if redir_record is None and session_record is None and profile_record is None and antirobot_record is not None:
        #print >> sys.stderr, "Unbound antirobot record:\n%s" % antirobot_record
        accounting["antirobot\t<squashed>\tunbound"]-=1
        return

    # Sanity check: the number of kept records must match the bookkeeping.
    records_count = sum( [int(f is not None) for f in [redir_record,session_record,profile_record,antirobot_record]]  )
    assert records_count==recs_left, "%d %d\n%s" % (records_count,recs_left,recs)

    # Merge the kept records field by field, in the fixed order
    # redir, session, profile, antirobot.
    for d in (redir_record,session_record,profile_record,antirobot_record):
        if d:
            for k in range(len(d)):
                v=d[k]
                # k is rebound from column index to field name
                k=index_to_key[k]
                if v is None:
                    pass
                elif k == 'ts':
                    # NOTE(review): 'ts' is never written into joined_record,
                    # so joined_record.get('ts') is always None and this
                    # assertion cannot fail as written.
                    assert joined_record.get('ts') is None or joined_record.get('ts')==d[k_ts]
                elif k == 'service':
                    joined_record['service'] |= set([ v ])
                elif k == 'type':
                    joined_record['type'] |= set([ v ])
                elif k == 'reqid':
                    pass
                elif k == 'pageno':
                    if joined_record.get('pageno') is None:
                        joined_record['pageno']=v
                    elif v==None:
                        # unreachable: v is None is handled by the first branch
                        pass
                    elif joined_record['pageno']!=v:
                        # NOTE(review): unlike the 'tld' conflict below this
                        # branch does not return, so the group is counted both
                        # here and as "<joined>" at the end - confirm intent.
                        #TODO: DIAG HERE
                        #print >> sys.stderr, "page number should be same from all records %s\n%s\n%s\n\n%s" % (k,joined_record,d,"\n".join([str(r) for r in recs]))
                        accounting["<joined>\t<squashed>\tbadpageno"]-=records_count
                elif k == 'tld':
                    if joined_record.get('tld') is None:
                        joined_record['tld']=v
                    elif v == None:
                        # unreachable: v is None is handled by the first branch
                        pass
                    elif joined_record['tld']!=v:
                        # Conflicting tld values: drop the whole group.
                        #print >> sys.stderr, "double field different values %s\n%s\n%s\n\n%s" % (k,joined_record,d,"\n".join([str(r) for r in recs]))
                        accounting["<joined>\t<squashed>\tbadtld"]-=records_count
                        return
                elif v!=None:
                    # Any other field may appear in several records only with
                    # the same value.
                    assert k not in joined_record or joined_record[k]==v, "double field different values %s\n%s\n%s\n\n%s" % (k,joined_record,d,"\n".join([str(r) for r in recs]))
                    joined_record[k]=v


    redir_record=None
    session_record=None
    profile_record=None
    antirobot_record=None
    #separate records unusable after this point

    # update_service is not defined in this function; it is expected to come
    # from the star import at the top of the file.
    joined_record['service'] = update_service(joined_record['service'])

    # here and later only categorised services are retained

    #TODO: emit several records, i.e. redir + profile + redir_profile
    # The set of source types is collapsed into one sorted, '_' joined string.
    real_type = joined_record['type']
    real_type = "_".join(sorted(real_type))
    joined_record['type'] = real_type

    accounting[real_type+"\t<joined>\t"+'_'.join(sorted(joined_record['service']))]-=records_count

    return joined_record

def additional_stages(joined_record):
    """Add derived timing stages to ``joined_record`` in place.

    Derived fields (each written only when its inputs are present):
        .ttfb_total           = .html_total - .html
        .ttlb                 = .ttfb + .html
        .ttlb_ex_after_print  = .ttlb - .after_print_to_user
        .ttlb_ex_before_print = .ttlb - .before_print_to_user

    The two "ex" stages are skipped when the subtracted value is falsy,
    i.e. missing, None or 0.  Returns None.
    """
    html = joined_record.get('.html')
    html_total = joined_record.get('.html_total')
    if not (html is None or html_total is None):
        joined_record['.ttfb_total'] = html_total - html

    ttfb = joined_record.get('.ttfb')
    if html is None or ttfb is None:
        # Without both parts there is no time-to-last-byte to derive from.
        return

    ttlb = ttfb + html
    joined_record['.ttlb'] = ttlb

    derived_from = (
        ('.after_print_to_user', '.ttlb_ex_after_print'),
        ('.before_print_to_user', '.ttlb_ex_before_print'),
    )
    for source_stage, derived_stage in derived_from:
        printed = joined_record.get(source_stage)
        if printed:
            joined_record[derived_stage] = ttlb - printed


def intermediate_reduce_combine(tpl, recs):
    """Run intermediate_reduce_operation() for every key group and time it.

    Parameters:
        tpl: settings object; ``tpl[0]`` is read and the whole object is
            handed to init_globals().
        recs: iterable of ``(key, subrecs)`` pairs.  Every key must be 19
            characters long with a tab at position 10 and a colon at
            position 16; the last three characters are cut off before the
            key is passed on.

    Yields:
        Every record produced by intermediate_reduce_operation(), followed by
        one ``Record(name, '', str(value), tableIndex=1)`` per accounting
        counter.

    Any exception raised while processing is printed to stderr together with
    its traceback and then re-raised.
    """
    wall_begin = time.time()
    cpu_begin = time.clock()

    # Time spent inside the per-key processing only.
    wall_in_keys = 0
    cpu_in_keys = 0

    outformats = tpl[0]
    init_globals(tpl)

    accounting = defaultdict(int)

    try:
        assert recs is not None

        for key, subrecs in recs:
            # key layout: date, tab, HH:MM:SS
            assert len(key) == 19 and key[10] == '\t' and key[16] == ':'
            assert subrecs is not None

            key_wall_begin = time.time()
            key_cpu_begin = time.clock()

            # Seconds are stripped from the key before it goes downstream.
            for produced in intermediate_reduce_operation(key[:-3], subrecs, accounting):
                yield produced

            gc.collect()
            positive_total = sum(count for count in accounting.itervalues() if count > 0)

            key_wall_end = time.time()
            key_cpu_end = time.clock()

            wall_in_keys += key_wall_end - key_wall_begin
            cpu_in_keys += key_cpu_end - key_cpu_begin

            print("%d: Key '%s'(%d) elapsed %.2f cpu time, %.2f real time" % (int(key_wall_end), key, positive_total, key_cpu_end - key_cpu_begin, key_wall_end - key_wall_begin))

    except BaseException as e:
        import traceback
        formatted = traceback.format_exc()
        print >> sys.stderr, str(e) + "\n" + formatted + "\n\n"
        raise

    for counter_name, counter_value in accounting.iteritems():
        yield Record(counter_name, '', str(counter_value), tableIndex=1)

    print("%d: Combine: elapsed %.2f (%.2f) cpu time, %.2f (%.2f) real time" % (int(time.time()), time.clock() - cpu_begin, cpu_in_keys, time.time() - wall_begin, wall_in_keys))


class ArrayDict(object):
    """Memory-saving store for a large number of dict records.

    Every appended dict is converted into a plain list ("row").  The column
    of a field is assigned the first time the field name is seen and is kept
    in ``key_to_index`` / ``index_to_key``.  A row is only as long as the
    highest column it actually uses; columns it does not have are None.

    ``sort()`` and ``iterate_and_truncate()`` require that the records carry a
    'key' field.
    """

    # How many already consumed rows iterate_and_truncate() drops from the
    # tail of ``data`` in one go.
    TRUNCATE_STEP = 1000000

    def __init__(self):
        self.data = []            # rows, one list per appended record
        self.key_to_index = {}    # field name -> column index
        self.index_to_key = []    # column index -> field name

    def append(self, record):
        """Store ``record`` (a dict) as a new row."""
        row = []

        for field in record:
            value = record[field]

            column = self.key_to_index.get(field)
            if column is None:
                # First occurrence of this field: give it the next column.
                column = len(self.index_to_key)
                self.key_to_index[field] = column
                self.index_to_key.append(field)

            if len(row) <= column:
                row.extend([None] * (column + 1 - len(row)))

            # Exact type check on purpose: only plain str objects are
            # interned, so equal strings share one object.
            if type(value) is str:
                value = intern(value)
            row[column] = value

        self.data.append(row)

    def sort(self):
        """Sort the rows by their 'key' column (stable, in place).

        Does nothing when no record has been appended yet.
        """
        if not self.data:
            return

        column = self.key_to_index['key']
        # In-place sort: avoids holding a second copy of the row list.
        self.data.sort(key=lambda row: row[column])

    def iterate_and_truncate(self):
        """Yield ``(key, rows)`` groups, walking the rows from last to first.

        Consecutive rows with an equal 'key' value form one group, so call
        sort() first.  While iterating, rows that were already handed out are
        removed from the tail of ``data`` in chunks of TRUNCATE_STEP so their
        memory can be reclaimed; the container is therefore consumed by the
        iteration.  Yields nothing when the container is empty.
        """
        if not self.data:
            return

        column = self.key_to_index['key']
        current_key = None
        current_records = []

        # Plain counter instead of range(): no list of indexes is built.
        index = len(self.data)
        while index > 0:
            row = self.data[index - 1]

            if row[column] != current_key:
                if current_key is not None:
                    yield (current_key, current_records)

                current_key = row[column]
                current_records = []

            current_records.append(row)

            if index < len(self.data) - self.TRUNCATE_STEP:
                # Everything in the last TRUNCATE_STEP positions has been
                # visited already; drop it without copying the rest.
                del self.data[-self.TRUNCATE_STEP:]
                print("records left: %d" % index)
                gc.collect()
                usage = resource.getrusage(resource.RUSAGE_SELF)
                # NOTE: getrusage(2) on Linux documents ru_idrss as unused,
                # so the first number is likely always 0 there.
                sys.stderr.write("memused: %d, max_memused %d\n" % (usage.ru_idrss, usage.ru_maxrss))

            index -= 1

        yield (current_key, current_records)

# Counter used by gen_key() to build keys for records without a reqid.
sequence = 0

def gen_key(reqid):
    """Derive the grouping key for a request id.

    * None                    -> "unique<N>", N being a module-wide counter
                                 that is incremented on every such call
    * "a-b"                   -> returned unchanged
    * "a-b-c..." (3+ parts)   -> "a-" + b without its last five characters
    * no '-' but contains '.' -> returned unchanged
    * anything else           -> None, and a message is printed to stderr
    """
    global sequence

    if reqid is None:
        unique_key = "unique" + str(sequence)
        sequence += 1
        return unique_key

    if '-' in reqid:
        parts = reqid.split("-")
        if len(parts) == 2:
            return parts[0] + "-" + parts[1]
        if len(parts) > 2:
            return parts[0] + "-" + parts[1][:-5]
    elif '.' in reqid:
        return reqid

    print >> sys.stderr, "Don't know how to handle reqid=%s" % reqid
    return None


def intermediate_reduce_operation(date_key, recs, accounting):
    """Join the raw records of one date bucket and emit per-stage samples.

    This is a generator.  It works in three phases:

    1. Every ``rec.value`` is unpickled into a list of dict records.  Records
       with an httpstatus other than 200 or a negative pageno are skipped;
       the rest get a 'key' (see gen_key) and are stored in an ArrayDict.
    2. The stored records are sorted and grouped by key; each group is merged
       by join_records() and extended by additional_stages().
    3. For every service of a merged record and every field whose name starts
       with '.', the integer value is appended to one array per output key
       (record type, service, stage name plus each suffix produced by
       gen_keys_decart).

    Parameters:
        date_key: second field of every emitted Record; must not be None.
        recs: iterable of objects with a ``value`` attribute holding a pickle;
            ``value`` is set to None after it has been read.
        accounting: mutable mapping of counters that is updated in place.

    Yields:
        ``Record(key, date_key, values)`` where values is a space separated
        string of integers.
    """

    assert date_key is not None
    assert recs is not None

    data = ArrayDict()

    # not used anywhere below
    prev_memused = 0

    for rec in recs:

        # NOTE(review): unpickling executes whatever the pickle describes, so
        # rec.value has to come from a trusted producer - confirm.
        raw_records = cPickle.loads(rec.value)

        for r in raw_records:
            accounting[r.get('type')+"\t<input>\t"]+=1

            if r.get('httpstatus')!=None and r.get('httpstatus')!=200: #temporary - remove after reparse
                accounting[r.get('type')+"\t<skipped>\t!200"]-=1
                continue
            if r.get('pageno')!=None and r.get('pageno') < 0: #temporary - remove after reparse
                accounting[r.get('type')+"\t<skipped>\tpageno<0"]-=1
                continue

            # Records for which gen_key() returns None (or another falsy
            # value) are silently left out.
            key=gen_key(r['reqid'])
            if key:
                r['key']=key
                data.append(r)

        # Drop the references so the unpickled data can be freed.
        raw_records=None
        rec.value = None

    data.sort()
    gc.collect()

    # output key -> array of C ints ('i') with the collected samples
    result = defaultdict(lambda:array.array('i'))

    # total_len counts all samples ever collected, effective_len those that
    # are still buffered in result (it is reduced on intermediate flushes).
    total_len = 0
    effective_len = 0

    for k,l in data.iterate_and_truncate():
        record = join_records(k, l, data.key_to_index, data.index_to_key,accounting)
        if record is not None:
            additional_stages(record)
            for service in record['service']:
                # k and v are reused here for field name and field value
                for k,v in record.iteritems():
                    # Fields whose name starts with '.' are the timing stages.
                    if k[0]=='.' and v is not None:
                        stage = k[1:]

                        accounting[record['type']+"\t<stage>\t"+service]+=1
                        accounting[record['type']+"\t<stage>\t"+service+"."+stage]-=1

                        pageno = record.get('pageno')
                        user_agent = record.get('user_agent')
                        region = record.get('region')
                        ar_verdict = record.get('ar_verdict')
                        server = record.get('server')
                        tld = record.get('tld')
                        https = record.get('https')
                        turbo = record.get('turbo')
                        ajax = record.get('ajax')
                        visibility = record.get('visibility')
                        user_agent_version = record.get('user_agent_version')

                        #WARNING: modify dimensions below
                        key = "%s\t%s\t%s\t" % (record['type'], service, stage)
                        all_keys = gen_keys_decart(pageno=pageno, user_agent=user_agent, ar_verdict=ar_verdict, server=server, tld=tld, https=https, turbo=turbo, ajax=ajax, visibility=visibility, version=user_agent_version, region=region)

                        for key_append in all_keys:
                            rarr = result[key+str(key_append)]
                            try:
                                rarr.append(int(v))
                            except OverflowError:
                                # value does not fit into the 'i' array
                                print >> sys.stderr, "bad value:", v
                                #raise
                            # NOTE(review): both counters are incremented even
                            # when the append above failed - confirm.
                            effective_len += 1
                            total_len += 1

                        # Too many buffered samples: flush every array that
                        # holds more than 1024 values and start it afresh.
                        if effective_len >= 268435456:
                            prev_effective_len = effective_len
                            max_memused=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

                            # Existing keys are only re-assigned here, no key
                            # is added or removed while iterating.
                            for key, values in result.iteritems():
                                if len(values)>1024:
                                    effective_len -= len(values)
                                    yield Record(key, date_key, " ".join([str(s) for s in values]))
                                    result[key]=array.array('i')

                            print >> sys.stderr, "intermediate yield: datekey: %s, Result keys: %d, total_size: %d, effective size %d -> %d, maxmemused=%d" % (date_key, len(result),total_len, prev_effective_len, effective_len, max_memused)
                    # A stage field that is present but None is only counted.
                    if k[0]=='.' and v is None:
                        stage = k[1:]
                        accounting[record['type']+"\t<stage>\t"+service]+=1
                        accounting[record['type']+"\t<nonestage>\t"+service+"."+stage]-=1


    # Final flush of everything that is still buffered.
    for key, values in result.iteritems():
        if len(values)>0:
            yield Record(key, date_key, " ".join([str(s) for s in values]))

    # NOTE(review): this threshold (26843545) has one digit less than the
    # flush threshold above (268435456) - confirm that this is intended.
    if total_len >= 26843545:
        max_memused=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print >> sys.stderr, "final yield: datekey: %s, Result keys: %d, total_size: %d, maxmemused=%d" % (date_key, len(result), total_len, max_memused)

    #if keys_len>9000:
    #    print '\n'.join(result.keys())
