#!/usr/bin/env python

import os
import re
import sys
import csv
import gzip
import json
import argparse

from dateutil.parser import parse
import pytz
from dateutil import tz

#

# Raise the csv module's per-field size cap to sys.maxsize (presumably because
# logged statements can exceed the default limit -- confirm).
csv.field_size_limit(sys.maxsize)

parser = argparse.ArgumentParser(description="Preprocess log files for pyreplay.")

# Input log files. With nargs='*', argparse stores the default object as-is
# when no positional is given, so the default must be a list: a bare string
# here would later be iterated character by character.
parser.add_argument('files', nargs='*', default=["<stdin>"])
# Destination of the JSON control structure; argparse opens it for writing
# while parsing the arguments.
parser.add_argument('--control-file', type=argparse.FileType('w'), default='pyreplay_control.json')
# Path of the gzip-compressed CSV that receives the processed events.
parser.add_argument('--processed-logs', default='processed_logs.csv.gz')
# Event offsets are divided by this factor (values > 1 compress the timeline).
parser.add_argument('--acceleration', type=float, default=1.0)
# Optional explicit start of the timeline and the timezone it is expressed in;
# without --starttime the first accepted log line defines time zero.
parser.add_argument('--starttime', type=str, default=None)
parser.add_argument('--starttimezone', type=str, default=None)

args = parser.parse_args()

def swap_params(query, params):
    """Inline logged bind-parameter values into a query's ``$N`` placeholders.

    Args:
        query: SQL text containing positional placeholders such as ``$1``.
        params: Detail text of the form ``parameters: $1 = 'a', $2 = 'b'``.
            Anything that does not start with ``parameters: `` means "no
            parameters".

    Returns:
        ``query`` unchanged when ``params`` carries no parameter list,
        otherwise ``query`` with every ``$N`` replaced by the logged text of
        parameter ``N`` (inserted verbatim, quotes included).

    Raises:
        KeyError: if the query uses a placeholder that ``params`` lacks.
        IndexError: if an entry of the parameter list has no `` = `` part.
    """
    param_re = re.match(r'parameters: (.+)', params, re.MULTILINE | re.DOTALL | re.UNICODE)
    if not param_re:
        # no params, return as-is
        return query

    pdict = {}
    # Split only in front of the next "$N = " so that a value which itself
    # contains ", $" is not cut in half.
    for param in re.split(r', (?=\$\d+ = )', param_re.group(1)):
        # maxsplit=1 keeps values that contain " = " intact.
        psplit = param.split(' = ', 1)
        pdict[psplit[0].lstrip(' $')] = psplit[1]

    # Substitute through a callback rather than %-formatting: literal '%'
    # characters in the SQL (e.g. LIKE '%x%') must pass through untouched,
    # and every placeholder is replaced no matter how many there are.
    return re.sub(r'\$(\d+)', lambda found: pdict[found.group(1)], query)

files = args.files

# Metadata written to the control file once all logs are processed.
control_structure = {'processed_logs': os.path.abspath(args.processed_logs)}

# Number of input lines read so far, across all files (progress reporting).
total_lines = 0

# Time zero of the replay timeline. When --starttime is not given it stays
# None here and is taken from the first accepted log line instead.
if args.starttime:
    first_timestamp = parse(args.starttime)
    if args.starttimezone:
        # pytz zones must be attached with localize(); replace(tzinfo=...)
        # would pick the zone's first historical (LMT) offset. Any zone
        # parsed from --starttime itself is dropped first, so the explicit
        # --starttimezone still overrides it.
        start_zone = pytz.timezone(args.starttimezone)
        first_timestamp = start_zone.localize(first_timestamp.replace(tzinfo=None))
    else:
        first_timestamp = first_timestamp.replace(tzinfo=tz.tzlocal())
else:
    first_timestamp = None

zero_point = 0

# Slot table: index = replay slot, value = session id currently occupying it
# (None marks a slot freed by a disconnection).
active_sessions = []
slot_assignments = {}

# csv.writer needs a text stream on Python 3 but a byte stream on Python 2.
if sys.version_info[0] >= 3:
    pz = gzip.open(args.processed_logs, 'wt', newline='')
else:
    pz = gzip.open(args.processed_logs, 'wb')
writer = csv.writer(pz)

# Patterns are compiled once instead of on every log line. DOTALL lets the
# trailing (.+) capture SQL text that spans several lines.
_MESSAGE_FLAGS = re.MULTILINE | re.DOTALL | re.UNICODE
_TIMESTAMP_RE = re.compile(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d')
# "[duration: N ms  ]statement: <sql>" -> groups (duration or None, sql)
_STATEMENT_RE = re.compile(
    r'(?:duration\: (\d+\.\d+) ms  ){0,1}statement:(?:[ ]|\n)(.+)',
    _MESSAGE_FLAGS)
# "[duration: N ms  ]execute <name>: <sql>" -> groups (duration or None, name, sql)
_EXECUTE_RE = re.compile(
    r'(?:duration\: (\d+\.\d+) ms  ){0,1}execute ([<\w_\d>]+):(?:[ ]|\n)(.+)',
    _MESSAGE_FLAGS)


def _open_log(path):
    """Open one input log for csv.reader; "<stdin>" selects standard input."""
    if path == "<stdin>":
        return sys.stdin
    if path[-3:].lower() == '.gz':
        if sys.version_info[0] >= 3:
            # gzip.open defaults to bytes, which csv.reader rejects on Python 3.
            return gzip.open(path, 'rt', newline='')
        return gzip.open(path)
    return open(path)


def _duration_ms(raw):
    """Return the optional duration capture rounded to whole ms (0 if absent)."""
    if raw is None:
        return 0
    return int(round(float(raw), 0))


def _slot_for(session_id):
    """Return the session's replay slot, reusing a freed slot or adding one."""
    try:
        return active_sessions.index(session_id)
    except ValueError:
        pass
    try:
        slot = active_sessions.index(None)
    except ValueError:
        active_sessions.append(session_id)
        return len(active_sessions) - 1
    active_sessions[slot] = session_id
    return slot


# Convert every input file into rows of
# [offset_ms, slot_index, query, session_id, session_line].
# Input rows are expected to have 22 or 23 columns (presumably PostgreSQL
# csvlog -- confirm); columns 0, 5, 6, 13 and 14 are read as timestamp,
# session id, session line number, message and parameter detail.
for file in files:
    f = _open_log(file)
    reader = csv.reader(f)

    sys.stderr.write("Starting file %s\n" % file)

    line_count = 0
    rejects = 0
    file_errors = 0

    try:
        line = ''
        for line in reader:
            try:
                line_count += 1

                total_lines += 1
                if (total_lines % 100000) == 0:
                    sys.stderr.write(str(total_lines) + " processed\n")

                # Rows with the wrong shape or no leading timestamp are
                # counted as rejects and skipped.
                if len(line) not in (22, 23):
                    rejects += 1
                    continue

                dts = line[0]
                if not _TIMESTAMP_RE.match(dts):
                    rejects += 1
                    continue

                dts = parse(dts)

                session_id = line[5]
                session_line = line[6]
                message = line[13]
                params = line[14]

                if message[:13] == 'disconnection':
                    # Free the session's slot; disconnections of sessions
                    # that never produced a query are ignored.
                    try:
                        slot_index = active_sessions.index(session_id)
                    except ValueError:
                        continue
                    active_sessions[slot_index] = None

                    duration = 0
                    query = '<d'

                else:
                    statement = _STATEMENT_RE.match(message)
                    if statement:
                        # groups() always has one entry per capture group, so
                        # the optional duration is detected by None, not by
                        # counting groups.
                        duration = _duration_ms(statement.group(1))
                        query = statement.group(2)
                    else:
                        # check for all-in-one execute statements from
                        # protocol-level prepares
                        execute = _EXECUTE_RE.match(message)
                        if not execute:
                            # Not a replayable message.
                            continue
                        duration = _duration_ms(execute.group(1))
                        query = swap_params(execute.group(3), params)

                    slot_index = _slot_for(session_id)

                if first_timestamp is None:
                    first_timestamp = dts

                offset = dts - first_timestamp

                milliseconds = int(offset.microseconds/1000) + (offset.seconds + offset.days*3600*24)*1000

                # Shift back by the logged duration to get the offset at
                # which the statement started.
                milliseconds -= duration

                # skip events which actually happened before the start time
                # NOTE(review): "> 0" also drops events at exactly offset 0
                # (including the very first line when no --starttime is
                # given) -- confirm whether ">= 0" was intended.
                if milliseconds > 0:
                    milliseconds = int(milliseconds/args.acceleration)

                    # NOTE(review): milliseconds is never negative here, so
                    # a zero_point that starts at 0 never changes.
                    zero_point = min(zero_point, milliseconds)

                    writer.writerow([milliseconds, slot_index, query, session_id, session_line])

            except Exception as e:
                # Per-line failure: report it and keep going, but give up on
                # the file after 20 such failures.
                file_errors += 1
                sys.stderr.write("Unrecoverable error in file "+file+" at line "+str(line_count)+", exception:")
                sys.stderr.write(str(e)+"\n")
                sys.stderr.write("Error line: " + str(line) + "\n")

                if file_errors >= 20:
                    sys.stderr.write("Too many errors, skipping to next file\n")
                    break

    except Exception as e:
        # Failure raised by the reader itself (e.g. malformed CSV): abandon
        # the rest of this file.
        sys.stderr.write("Unrecoverable error in file %s at line %d, exception:" % (file, line_count,))
        sys.stderr.write(str(e)+"\n")
        sys.stderr.write("Previous line: " + str(line) + "\n")
        sys.stderr.write("Skipping to next file\n")
    else:
        sys.stderr.write("Finished file %s, %d lines, %d rejects and %d errors\n" % (file, line_count, rejects, file_errors,))
    finally:
        sys.stderr.write("Finished file %s\n" % file)
        # Leave standard input open; only close files opened here.
        if f is not sys.stdin:
            f.close()

# Close the compressed output explicitly so buffered rows and the gzip
# trailer reach disk before the control file is written.
pz.close()

control_structure['zero_point'] = zero_point
control_structure['slots'] = len(active_sessions)

args.control_file.write(json.dumps(control_structure))
args.control_file.close()
