
import sys
import datetime
import cPickle
import gc
import time

from collections import defaultdict

from mapreducelib import MapReduce, Record, TemporaryTable, SharedTransaction


from profile import profile_record_handler
from redir import redir_record_handler
from redir import user_sessions_record_handler
from spylog import spy_log_record_handler

def flush(results,accounting):
    """Serialize every buffered group in `results` and yield it as a Record.

    Generator. For each (date_key, record_list) pair the whole list is
    pickled into one body and emitted as
    Record(date_key, <type of the first record>, body, tableIndex=0).

    Parameters:
        results    -- mapping date_key -> non-empty list of record dicts;
                      every record must have 'type' and 'service' keys.
        accounting -- mutable counter mapping; after a group has been yielded,
                      the "<type>\\t<output>\\t<service>" entry is decremented
                      once for each record in that group.

    Side effects: writes a timing/size summary to stderr once the generator is
    exhausted, and a warning for every body larger than 128 MiB.

    `results` itself is not modified; the caller is responsible for clearing it.
    """

    cpu_start = time.clock()
    wall_start = time.time()

    flushed = 0        # number of input records written out
    records = 0        # number of output Records yielded
    total_bytes = 0    # total size of all pickled bodies
    typecounter = defaultdict(int)

    for (date_key,record_list) in results.iteritems():

        # The whole group is labelled with the type of its first record.
        t = record_list[0]['type']
        body = cPickle.dumps(record_list)
        records += 1
        typecounter[t] += 1
        flushed += len(record_list)
        total_bytes += len(body)

        if len(body) > 128*1024*1024:
            # The original format string had two placeholders for three
            # arguments, so this warning raised TypeError instead of logging.
            print >> sys.stderr, "Body len is too long %s (%s) %s" % ( len(body), date_key, typecounter )

        yield Record(date_key, t, body, tableIndex=0)

        # Only account for the records once the group has actually been
        # handed to the consumer.
        for r in record_list:
            accounting[r['type']+"\t<output>\t"+r['service']] -= 1

    cpu_finish = time.clock()
    wall_finish = time.time()

    print >> sys.stderr, "flushed %d times in %d records, %f/%f seconds, bytes %d \n %s" % (flushed, records, cpu_finish - cpu_start, wall_finish - wall_start, total_bytes, typecounter)


def _reqid_timestamp(record):
    """Return the unix timestamp used to bucket `record` by date.

    - reqid is None: fall back to record['ts'].
    - reqid contains at least one '-': the first 10 characters are parsed as
      an integer (presumably the search request timestamp -- confirm against
      the reqid producer).
    - reqid has exactly four '.'-separated fields: the third field is parsed
      as an integer.

    Raises AssertionError for any other reqid shape, and ValueError if the
    selected substring is not an integer.
    """
    reqid = record['reqid']
    if reqid is None:
        # no reqid, will never join
        return record['ts']
    if len(reqid.split('-')) >= 2:
        # ok this is the search ts
        return int(reqid[:10])
    parts = reqid.split('.')
    if len(parts) == 4:
        return int(parts[2])
    raise AssertionError("wtf reqid %s" % reqid)


def combine_operation(recs,current_date,index):
    """Parse raw log records, group them by second and yield pickled groups.

    Generator.

    Parameters:
        recs         -- iterable of input records (objects with key, subkey
                        and value attributes, as used by the error report).
        current_date -- 'YYYY-MM-DD' string; records whose bucket date differs
                        are dropped and counted as "<ignored>\\tolddate".
                        None disables the filter.
        index        -- input table selector: 0 redir-log, 1 profile_log,
                        2 spy_log, 3 mobreport_profile_log.

    Yields:
        tableIndex=0 -- grouped, pickled records (produced by flush()).
        tableIndex=1 -- one diagnostic Record for each input record whose
                        parsing raised ValueError, KeyError, AssertionError,
                        TypeError, OverflowError or IndexError.
        tableIndex=2 -- the accounting counters, emitted once at the very end.

    Any other exception is logged to stderr together with the current input
    value and re-raised.
    """
    import traceback

    rec = None

    try:

        start = time.clock()
        start_t = time.time()

        results = defaultdict(list)      # date_key -> list of parsed records
        skipped = defaultdict(int)       # per-table skip reasons, stderr only
        accounting = defaultdict(int)    # "table\t<stage>\tdetail" -> counter
        counter = 0                      # records buffered since last flush

        for rec in recs:

            tttable = None

            try:
                if index==0:
                    tttable = "redir-log"
                    accounting[tttable+"\t<input>\t"]+=1
                    record = redir_record_handler(rec)
                elif index==1:
                    tttable = "profile_log"
                    accounting[tttable+"\t<input>\t"]+=1
                    record = profile_record_handler(rec, 0)
                elif index==2:
                    tttable = "spy_log"
                    accounting[tttable+"\t<input>\t"]+=1
                    record = spy_log_record_handler(rec)
                elif index==3:
                    tttable = "mobreport_profile_log"
                    accounting[tttable+"\t<input>\t"]+=1
                    record = profile_record_handler(rec, 1)
                else:
                    # Explicit raise instead of `assert` so the check
                    # survives `python -O`.
                    raise AssertionError("bad table index!")

                if record is None:
                    raise AssertionError("should return string instead!")

                if isinstance(record,str):
                    # A handler returns a string to say why it skipped the
                    # record; only the first 150 characters are kept as key.
                    record = record[:150]
                    skipped[tttable+"."+record]+=1
                    accounting[tttable+"\t<skipped>\t"+record]-=1
                    continue

                skipped[tttable+".ok"]+=1

                reqid_ts = _reqid_timestamp(record)
                date = datetime.datetime.fromtimestamp(reqid_ts)
                # we only need minute aggregation, but we split in seconds to
                # increase number of chunks
                date_key = date.strftime('%Y-%m-%d\t%H:%M:%S')

                if current_date is None or date_key[:10]==current_date:
                    results[date_key].append(record)
                    counter += 1
                else:
                    accounting[tttable+"\t<ignored>\tolddate"]-=1

                if counter > 1000000:
                    for r in flush(results,accounting):
                        yield r
                    # Reset only after the flush generator is fully drained,
                    # and keep the buffer a defaultdict: the previous plain
                    # `{}` made every later `results[date_key].append(...)`
                    # raise KeyError, diverting all records that followed the
                    # first flush to the error table.
                    results = defaultdict(list)
                    counter = 0
                    gc.collect()

            except (ValueError,KeyError,AssertionError,TypeError,OverflowError,IndexError) as e:
                tb = traceback.format_exc()
                yield Record(tttable,str(e)[:100],"%s\n%s\n\n%s\n%s\n%s" % (str(e),tb,rec.key,rec.subkey,rec.value), tableIndex=1)
                # NOTE: if `index` was invalid, tttable is still None here and
                # the concatenation below raises TypeError, which the outer
                # handler logs and re-raises, aborting the run.
                accounting[tttable+"\t<error>\t"]-=1

        # Emit whatever is still buffered once the input is exhausted.
        for r in flush(results,accounting):
            yield r
        results = defaultdict(list)
        counter = 0
        gc.collect()

        finish = time.clock()
        finish_t = time.time()

        print >> sys.stderr, "parsing elapsed %f/%f seconds" % ( finish-start, finish_t-start_t )

    except BaseException as e:
        # Log the failure together with the input being processed, then let
        # it propagate.
        tb = traceback.format_exc()
        v = ''
        if rec: v=rec.value
        print >> sys.stderr, str(e) + "\n" + v + "\n" + tb +  "\n\n"
        raise

    for key,value in accounting.iteritems():
        yield Record(key,'',str(value),tableIndex=2)

    print >> sys.stderr, "skipped: %s" % skipped
