#!/usr/bin/env python2
import os
import re
import sys
import time


# Matches the "Scheduling step NNN" telemetry line; group(1) is the step id
# (an epoch-milliseconds value, see SchedulingStepRecord).
TICKS_LINE_RE = re.compile(r'SchedulingTelemetry - --------------------- Scheduling step (\d+) ')

# "Total pending groups: N" / "Total cancelled groups: N" section markers.
# Raw strings throughout for consistency and to avoid the invalid-escape
# pitfall with '\d' in a plain string literal.
PENDING_GROUPS_COUNT_RE = re.compile(r'^Total pending groups: (\d+)')
CANCELLED_GROUP_COUNT_RE = re.compile(r'^Total cancelled groups: (\d+)')

# Example:
# GroupId 304166ef-167a-11e5-9a67-00259095835a, owner graf-vk,
# Policy MasterSlavePlanningPolicy{Master=304166ef-167a-11e5-9a67-00259095835a}
GROUP_HEADER_RE = re.compile(r'^GroupId (\S+), owner ([^\s,]+), Policy (\S+)')

# Example (note the leading tab+space):
#          jobId 4c2853cd-b143-4f0f-b88f-ed8dd307ab38, host s1-1149.qloud.yandex.net,
# currentState DONE (ACTIVE), age 1 month, 1 week, 1 day, 9 hours, 3 minutes, 2 seconds and 398 milliseconds
JOB_HEADER_RE = re.compile(r'^\t jobId ([^\s,]+), host ([^\s,]+), currentState ([A-Z_]+) \(([A-Z_]+)\), age (.+?)\s*$')


def comment(msg):
    print "#", msg


def split_lines_by_header_regexp(header_re, lines):
    """
    Split *lines* into sections; each line matching *header_re* starts a new
    section (the matched line stays at the start of its section).

    Any preamble before the first header becomes the first section; a
    trailing section is kept only when it starts with a header line.

    NOTE: consumes *lines* -- the caller's list is left empty, preserving
    the destructive pop(0) contract of the original implementation (but
    without its quadratic cost).

    :param header_re: compiled regex, matched with .match() on each line
    :param lines: list of strings; emptied as a side effect
    :return: list of sections, each a list of lines
    """
    result = []
    current = []
    for line in lines:
        # a header line flushes the accumulated section (if any)
        if current and header_re.match(line):
            result.append(current)
            current = []
        current.append(line)
    # the last section is kept only when it is header-led
    if current and header_re.match(current[0]):
        result.append(current)

    del lines[:]  # keep the "input list is consumed" side effect
    return result


class JobGroup(object):
    """A group header parsed into (group_id, owner, policy) plus its jobs."""

    def __init__(self, header):
        """
        :param header: a line matching GROUP_HEADER_RE
        :raises ValueError: when the header does not match (previously this
                            surfaced as an opaque AttributeError on None)
        """
        self.header = header
        match = GROUP_HEADER_RE.match(header)
        if not match:
            raise ValueError("wrong JobGroup header: %r" % header)
        self.group_id, self.owner, self.policy = match.group(1, 2, 3)
        self.jobs = []

    def __eq__(self, other):
        return (self.group_id == other.group_id and
                self.owner == other.owner and
                self.policy == other.policy)

    def __ne__(self, other):
        # Python 2 does not derive __ne__ from __eq__; without this,
        # `!=` silently falls back to identity comparison.
        return not self.__eq__(other)


class Job(object):
    """One job snapshot: the parsed header line plus its remaining lines."""

    def __init__(self, epoch_sec, group, lines):
        """
        :param epoch_sec: timestamp (seconds) of the scheduling step
        :param group: owning JobGroup
        :param lines: job section; lines[0] must match JOB_HEADER_RE
        :raises ValueError: when lines[0] is not a parsable job header
        """
        self.epoch_sec = epoch_sec
        self.group = group
        self.header = lines[0]
        match = JOB_HEADER_RE.match(self.header)
        if not match:
            # the exception message already carries the offending header;
            # the old debug prints duplicated it on stdout
            raise ValueError("wrong Job header: %r" % self.header)
        self.job_id, self.host, self.current_state, self.iss_state, self.age = match.group(1, 2, 3, 4, 5)
        self.lines = list(lines[1:])

    def get_key(self):
        """Return (group_id, job_id) -- the unique identity of this job."""
        return self.group.group_id, self.job_id


def are_job_states_equal(j1, j2):
    """True when two job snapshots agree on group, identity, host and states.

    Note: groups are compared with `==` on purpose (JobGroup defines __eq__).
    """
    same_group = j1.group == j2.group
    same_identity = (j1.job_id == j2.job_id and j1.host == j2.host)
    same_states = (j1.current_state == j2.current_state and
                   j1.iss_state == j2.iss_state)
    return same_group and same_identity and same_states


class SchedulingStepRecord(object):
    """
    One "Scheduling step" log section parsed into JobGroup and Job objects.

    fast_check_func is a cheap substring predicate applied to each group and
    job section before the expensive regex parsing, so uninteresting data
    is skipped early.
    """

    def __init__(self, lines, fast_check_func):
        """
        :param lines: all lines of one step section; lines[0] must contain
                      the TICKS_LINE_RE marker
        :param fast_check_func: predicate(list_of_lines) -> bool
        :raises ValueError: on an unparsable or truncated section
        """
        self.epoch_msec = None  # step id from the log line (epoch millis)
        self.epoch_sec = None
        self.canceled_groups = None  # reserved; parse_remainder is a stub
        # FIX: `groups` was never set although the default callback in
        # SchedulingStepsGenerator reads `step.groups` (AttributeError).
        self.groups = []
        self.jobs = []
        self.fast_check_func = fast_check_func

        self._parse(lines)

    def _parse(self, lines):
        # Consume the "Scheduling step NNN" header and remember its timestamp.
        if not lines:
            raise ValueError("empty lines")
        first = lines.pop(0)
        match = TICKS_LINE_RE.search(first)
        if not match:
            raise ValueError("cant parse first line: %r" % (first, ))

        self.epoch_msec = int(match.group(1))
        self.epoch_sec = self.epoch_msec // 1000

        if not lines:
            raise ValueError("not enough lines for group count")

        self.parse_lines(lines)

    def parse_lines(self, lines):
        # cut [Total pending groups] before
        if not lines:
            comment('ERROR: cant parse_lines')
            return

        lines.pop(0)  # drop the "Total pending groups: N" line

        if not lines:
            # might be no groups at all -- not an error
            return

        # cut after [Total cancelled groups:]
        line_blocks = split_lines_by_header_regexp(CANCELLED_GROUP_COUNT_RE, lines)
        if len(line_blocks) != 2:
            comment("ERROR: unable to find [Total cancelled groups:] in step %d" % (self.epoch_msec, ))
            return

        group_lines, remainder_lines = line_blocks

        # split by group header and parse
        self.parse_groups(group_lines)
        self.parse_remainder(remainder_lines)

    def parse_remainder(self, lines):
        # parse deleted/cancelled groups tail -- not implemented yet
        pass

    def get_jobs(self):
        """Return a copy of the parsed Job objects."""
        return list(self.jobs)

    def parse_groups(self, group_lines):
        self.groups = []
        self.jobs = []
        group_series = split_lines_by_header_regexp(GROUP_HEADER_RE, group_lines)
        for lines in group_series:
            if not self.fast_check_func(lines):
                continue
            self.parse_one_group(lines)

    def parse_one_group(self, group_lines):
        if not group_lines:
            comment("ERROR: empty lines")
            return
        first = group_lines.pop(0)
        job_group = JobGroup(first)
        # record the group so consumers can enumerate groups of this step
        self.groups.append(job_group)
        job_series = split_lines_by_header_regexp(JOB_HEADER_RE, group_lines)
        for lines in job_series:
            if not self.fast_check_func(lines):
                continue
            self.parse_one_job(job_group, lines)

    def parse_one_job(self, job_group, lines):
        job = Job(self.epoch_sec, job_group, lines)
        self.jobs.append(job)


class LogLineProcessorBase(object):
    """Base class: stream a log file line by line into process_line()."""

    def __init__(self, filename):
        self.filename = filename
        self.lines_count = None   # set by run()
        self.pending_stop = False  # set via stop() to abort the run loop
        self.debug = False

    def set_debug(self, debug):
        """Enable/disable progress output from _debug()."""
        self.debug = debug

    def _debug(self, msg):
        # unbuffered progress output; no newline is appended
        if self.debug:
            sys.stdout.write(msg)
            sys.stdout.flush()

    def stop(self):
        """Ask run() to stop after the current line."""
        self.pending_stop = True

    def process_line(self, line):
        """Subclass hook: called once per line (newline already stripped)."""
        pass

    def run(self):
        """Read self.filename and feed every line to process_line()."""
        self.lines_count = 0

        # FIX: use a context manager so the file handle is closed
        # deterministically (it used to leak until garbage collection).
        with open(self.filename) as stream:
            for raw_line in stream:
                self.lines_count += 1

                # strip the trailing newline (handles \n and \r\n)
                line = raw_line.splitlines()[0]
                self.process_line(line)

                if self.pending_stop:
                    comment("stop processing (pending_stop)")
                    break

        comment('')
        comment("%d lines processed" % (self.lines_count, ))

"""
00:00:52.193 [Timer-0] DEBUG r.y.s.job.core.SchedulingTelemetry - --------------------- Scheduling step 1438030724504 Timer-0-----------------------------
"""


class LineGroupGenerator(LogLineProcessorBase):
    """Chops the line stream into groups: every line containing the
    splitting substring opens a new group."""

    def __init__(self, filename, splitting_substring):
        super(LineGroupGenerator, self).__init__(filename)
        # TODO: support splitting regexp
        self.splitting_substring = splitting_substring
        self.current_lines = []
        self.processed_group_count = 0

    def _is_first_line(self, line):
        """Does this line open a new group?"""
        return line.find(self.splitting_substring) != -1

    def process_lines(self, lines):
        """Subclass hook: called once per accumulated group of lines."""
        pass

    def _flush_group(self):
        """Deliver the current group to process_lines and reset the buffer."""
        self.process_lines(self.current_lines)
        self.current_lines = []
        self.processed_group_count += 1

    def process_line(self, line):
        # a header line closes the previous group before starting its own
        if self._is_first_line(line):
            self._flush_group()
        self.current_lines.append(line)

    def run(self):
        super(LineGroupGenerator, self).run()
        self._flush_group()  # the final group has no terminating header
        comment("%d line groups found" % (self.processed_group_count, ))


class SchedulingStepsGenerator(LineGroupGenerator):
    """Splits the log into "Scheduling step" sections and parses each one
    into a SchedulingStepRecord, invoking a configurable callback."""

    SCHEDULING_STEP_SUBSTRING = "DEBUG r.y.s.job.core.SchedulingTelemetry - --------------------- Scheduling step"

    def __init__(self, filename):
        # FIX: reference the class attribute explicitly; the bare name used
        # to resolve to the same-valued module-level constant by accident
        # (methods do not see class scope in Python).
        super(SchedulingStepsGenerator, self).__init__(
            filename, self.SCHEDULING_STEP_SUBSTRING)
        self.failed_group_rec_count = 0
        self.parsed_groups_count = 0
        self.group_id_list = set()  # all group ids seen (a set, despite the name)
        self.fast_check_func = lambda x: True
        self.process_scheduling_step_func = self.process_parsed_scheduling_step

    def set_fast_check_func(self, fast_check_func):
        """Install a cheap predicate(list_of_lines) to skip steps early."""
        assert callable(fast_check_func)
        self.fast_check_func = fast_check_func

    def set_process_scheduling_step_func(self, process_scheduling_step_func):
        """Install the callback receiving each parsed SchedulingStepRecord."""
        assert callable(process_scheduling_step_func)
        self.process_scheduling_step_func = process_scheduling_step_func

    def process_lines(self, lines):
        if not lines:
            return

        # the very first flush may deliver a preamble that is not a step
        if not self._is_first_line(lines[0]):
            return

        if not self.fast_check_func(lines):
            return

        try:
            record = SchedulingStepRecord(lines, self.fast_check_func)
            self.process_scheduling_step_func(record)
            self.parsed_groups_count += 1
        except Exception as e:
            # FIX: a stray bare `raise` used to abort the whole run here and
            # made the failure counter (reported by run()) unreachable.
            comment("error parsing group: %s" % (str(e), ))
            self.failed_group_rec_count += 1

    def process_parsed_scheduling_step(self, step):
        # Default callback: collect the set of group ids seen so far.
        # FIX: derive group ids from the parsed jobs -- the record was
        # accessed via a non-existent `groups` attribute before.
        self.group_id_list |= set(job.group.group_id for job in step.get_jobs())
        if self.processed_group_count % 1000 == 0:
            self._debug("%d " % self.processed_group_count)

    def run(self):
        start_time = time.time()
        super(SchedulingStepsGenerator, self).run()
        comment("parsed groups: %d" % self.parsed_groups_count)
        comment("steps parsing fails: %d" % self.failed_group_rec_count)
        comment("seen %d groups" % len(self.group_id_list))
        comment("processing time: %d sec" % int(time.time() - start_time))


class HistoryRecord(object):

    def __init__(self, job):
        self.job = job
        self.start_sec = self.end_sec = job.epoch_sec
        self.copy_count = 1

    def add_duplicate(self, job):
        other_end_sec = job.epoch_sec
        if not (other_end_sec >= self.end_sec):
            print "\n".join(self.job.lines)
            print "job sec:", self.end_sec
            print "other sec:", other_end_sec
            sys.exit(1)
        self.end_sec = other_end_sec
        self.copy_count += 1


def timestr(epoch_sec):
    """Format *epoch_sec* as a local-time 'YYYY-MM-DD HH:MM:SS' string."""
    local = time.localtime(epoch_sec)
    return time.strftime('%Y-%m-%d %H:%M:%S', local)


def print_diff(value1, value2, label):
    if value1 != value2:
        print "[%s] %s --> %s" % (label, value1, value2)
        return 1
    else:
        return 0


class Historian(object):
    """
    collect states and squeeze it if there are no changes inside

    Keeps, per (group_id, job_id) key, a list of HistoryRecord objects.
    A new record is started only when job_equal_tester_func says the
    incoming snapshot differs from the previous one; equal snapshots
    just extend the last record's time interval.
    """
    def __init__(self, job_part, job_equal_tester_func):
        # Only jobs whose job_id contains this substring are tracked.
        self.job_part = job_part

        assert callable(job_equal_tester_func)
        # Predicate (prev_job, job) -> bool; True means "same state, squeeze".
        self.job_equal_tester_func = job_equal_tester_func

        self.history = {}  # (group:job) -> [list of records]
        self.register_count = 0

    def register(self, step):
        # Feed one parsed scheduling step into the history.
        assert isinstance(step, SchedulingStepRecord)
        self.register_count += 1

        for job in step.get_jobs():
            self.register_one_job(job)

    def register_one_job(self, job):
        # Skip jobs we were not asked about.
        if self.job_part not in job.job_id:
            return

        key = job.get_key()
        records = self.history.get(key, list())
        if records:
            prev_record = records[-1]
            # Same observable state as last time: extend the last record.
            if self.job_equal_tester_func(prev_record.job, job):
                prev_record.add_duplicate(job)
                return
        records.append(HistoryRecord(job))
        self.history[key] = records

    def dump(self):
        # Print the collected history for every tracked job.
        # NOTE(review): iteration order is dict order, i.e. arbitrary here.
        print
        for key in self.history.iterkeys():
            self.dump_one_job(key)

    def dump_one_job(self, key):
        print "=== groupId %s, jobId %s ===" % key
        records = self.history[key]

        # Trailing None sentinel lets the loop peek at the successor record
        # to print a state diff between neighbours; history values are
        # always non-empty, so the sentinel is never dumped itself.
        new_list = list(records) + [None, ]
        while new_list:
            job_state = new_list.pop(0)
            self.dump_job_state(job_state)
            if new_list[0]:
                self.dump_state_diff(job_state, new_list[0])
            else:
                break

        print

    def dump_job_state(self, state):
        # A single snapshot prints one timestamp; a squeezed run prints
        # the whole interval and how many scheduling steps it spans.
        if state.copy_count < 2:
            print "%s (%s)" % (state.job.current_state, state.job.iss_state, )
            print "  at %s (%d)" % (timestr(state.start_sec), state.start_sec)
        else:
            print "%s (%s) for %d sec (%d steps)" % (state.job.current_state, state.job.iss_state,
                                                             state.end_sec - state.start_sec, state.copy_count)
            print "  from %s (%d)" % (timestr(state.start_sec), state.start_sec)
            print "    to %s (%d)" % (timestr(state.end_sec), state.end_sec)
        print

    def dump_state_diff(self, prev, next):
        # Currently only a host change is reported between adjacent states.
        diff_count = 0
        diff_count += print_diff(prev.job.host, next.job.host, "host")
        if diff_count:
            print


def make_fast_check_function(job_id_part):
    """Build a cheap predicate: do the given lines contain *job_id_part*?

    TODO: implement https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
    """

    def contains_part(lines):
        return job_id_part in '\n'.join(lines)

    return contains_part


# NOTE(review): duplicates SchedulingStepsGenerator.SCHEDULING_STEP_SUBSTRING --
# keep the two values in sync (or drop one of them).
SCHEDULING_STEP_SUBSTRING = "DEBUG r.y.s.job.core.SchedulingTelemetry - --------------------- Scheduling step"


def todo_list():
    """
    Scratchpad of pending ideas; intentionally does nothing.

    step % 1000 --> 1 hour passed, 2 hours passed, 3 hours ...

    """
    return None


def main():
    """CLI entry point: filter a scheduler log by a job-id substring and
    dump the squeezed state history of the matching jobs."""
    # TODO: pass job_id to parser and fast check before parsing jobs too
    # TODO:   and make stream of just wanted jobs
    argv = sys.argv
    if len(argv) < 2:
        sys.exit("use: %s job_id_part [log_file]" % os.path.basename(argv[0]))
    job_id_part = argv[1]  # '11e5-9a67-00259095835a'

    if len(argv) > 2:
        filename = argv[2]
    else:
        filename = 'job-service.log'  # default
        comment('use default filename: %s' % filename)

    fast_check_func = make_fast_check_function(job_id_part)

    generator = SchedulingStepsGenerator(filename)
    generator.set_fast_check_func(fast_check_func)
    generator.set_debug(True)

    historian = Historian(job_id_part, are_job_states_equal)
    generator.set_process_scheduling_step_func(historian.register)

    generator.run()

    historian.dump()


# Wrap main() with the (presumably project-local) profilefunc helper when
# available; events go to joba.profile.log.  TODO confirm profilefunc origin.
PROFILE_THIS_SCRIPT = True

# install profiler if requested; missing profilefunc just means we run
# unprofiled (the ImportError is handled, not fatal)
if PROFILE_THIS_SCRIPT:
    try:
        import profilefunc
        profilefunc.set_profile_logger_func(profilefunc.get_file_profile_event_func("joba.profile.log"))
        main = profilefunc.profileit(main)
    except ImportError:
        comment('Sorry, profiler is not available')


if __name__ == '__main__':
    main()
