#!/usr/bin/env python2.7
# coding=utf-8
import os
import re
import sys

"""
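Sample of the log line format parsed by LogStatsReporter.LOG_LINE_RE (the logger name before " - " is the "channel"):
2016-01-12 12:38:31.754 [pool-1-thread-60] INFO  r.y.s.j.c.scheduler.fsm.FSMPersist - getOptionOrDefault(jobFailRetryCount): 0 (default)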
"""


class LogStatsReporter(object):
    """Aggregate per-channel byte and line counts over the log files of a directory.

    A "channel" is the logger name captured by ``LOG_LINE_RE`` (the token right
    before `` - `` in a line such as
    ``2016-01-12 12:38:31.754 [thread] INFO  a.b.Logger - message``).

    Typical use: build the reporter, call ``run()`` once, then ``get_report()``.
    The code is written so that it behaves the same on Python 2.7 and also
    parses on Python 3 (single-argument ``print(...)``, ``dict.items()``).
    """

    # Group 1 is the channel (logger name).
    # NOTE(review): the channel class is [a-zA-Z.] only, so logger names that
    # contain digits, '_' or '$' do not match and their lines are treated as
    # continuation lines -- confirm this is intended.
    LOG_LINE_RE = re.compile(r'^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d\.\d+\s+\[[^\]]+\]\s+[A-Z]+\s+([a-zA-Z\.]+) - ')

    def __init__(self, root_dir, filename_filter):
        """Create a reporter.

        :param root_dir: directory whose direct children are scanned by ``run()``.
        :param filename_filter: callable taking a bare file name; a truthy
            result selects the file for processing.
        """
        assert callable(filename_filter)
        self.root_dir = root_dir
        self.filename_filter = filename_filter
        # Distinct modification times of the processed files (a set, despite the name).
        self.mtime_list = set()
        # channel -> accumulated len() of the lines attributed to it
        self.channel_stats_size = {}
        # channel -> number of lines attributed to it
        self.channel_stats_count = {}
        # Sum of the on-disk sizes of the processed files.
        self.total_size = 0

    def get_channel(self, line):
        """Return the channel named in ``line``, or None if the line does not match LOG_LINE_RE."""
        match = self.LOG_LINE_RE.match(line)
        return match.group(1) if match else None

    @staticmethod
    def _get_directory_files(root_dn):
        """Return the names of the files located directly in ``root_dn``.

        Only the first tuple produced by ``os.walk`` (the one for ``root_dn``
        itself) is used, so sub-directories are not descended into.  An empty
        list is returned when ``os.walk`` yields nothing, which is what happens
        for a missing or unreadable directory.
        """
        for _root, _dirs, files in os.walk(root_dn):
            return files
        return []

    def register_channel_stat(self, channel, log_line):
        """Account ``log_line`` to ``channel``; a None channel is reported and skipped."""
        if channel is None:
            print("skip %d bytes" % len(log_line))
            return

        self.channel_stats_size[channel] = self.channel_stats_size.get(channel, 0) + len(log_line)
        self.channel_stats_count[channel] = self.channel_stats_count.get(channel, 0) + 1

    def process_file(self, filename):
        """Read the file at path ``filename`` and add its lines to the statistics.

        A line that does not match LOG_LINE_RE is attributed to the channel of
        the closest preceding matching line of the same file.  Lines that come
        before the first matching line have no channel: they are printed as
        ``ERROR CHANNEL:`` and left out of the per-channel statistics.
        """
        print("# file %s" % (filename,))
        file_stat = os.stat(filename)  # one stat call provides both mtime and size
        self.mtime_list.add(file_stat.st_mtime)
        self.total_size += file_stat.st_size

        last_channel = None
        total_lines_size = 0

        # ``with`` guarantees the handle is closed even if reading fails.
        with open(filename) as fh:
            for log_line in fh:
                total_lines_size += len(log_line)
                channel = self.get_channel(log_line)
                if channel is None:
                    channel = last_channel

                if channel is None:
                    print("ERROR CHANNEL: %s" % (log_line,))

                self.register_channel_stat(channel, log_line)
                last_channel = channel

        print("%s -> %d MB" % (filename, total_lines_size >> 20))

    def run(self):
        """Process every file directly inside ``root_dir`` accepted by the filter.

        Files are handled in sorted name order so the progress output is
        deterministic.  Each name is joined with ``root_dir`` before it is
        opened; the filter itself still receives the bare file name.
        """
        names = sorted(
            name for name in self._get_directory_files(self.root_dir)
            if self.filename_filter(name)
        )

        for name in names:
            sys.stderr.write("processing file %s\n" % (name, ))
            self.process_file(os.path.join(self.root_dir, name))

    def get_max_mtime_delta(self):
        """Return the largest gap between consecutive distinct file mtimes.

        Returns 0 when fewer than two distinct mtimes were recorded (including
        the case where no file was processed at all).
        """
        ordered = sorted(self.mtime_list)
        gaps = [later - earlier for earlier, later in zip(ordered, ordered[1:])]
        return max(gaps) if gaps else 0

    @staticmethod
    def _ranked(stats):
        """Return the (channel, value) pairs of ``stats``, largest value first, ties by channel name."""
        return sorted(stats.items(), key=lambda item: (-item[1], item[0]))

    def get_channel_stats(self):
        """Return report lines ranking the channels by accumulated size and by line count.

        Percentages are floor-divided integers relative to the sum over all channels.
        """
        lines = ["=== size occupied ==="]
        total = sum(self.channel_stats_size.values())
        for channel, size_occupied in self._ranked(self.channel_stats_size):
            lines.append("%s: %d%%, %d MB (%d)" % (
                channel, 100 * size_occupied // total, size_occupied >> 20, size_occupied))

        lines.append("")

        lines.append("=== lines occupied ===")
        total = sum(self.channel_stats_count.values())
        for channel, lines_occupied in self._ranked(self.channel_stats_count):
            lines.append("%s: %d%%, %d" % (channel, 100 * lines_occupied // total, lines_occupied))

        return lines

    def get_report(self):
        """Return the complete report (totals followed by channel statistics) as one string."""
        lines = [
            "processed: %d MB" % (self.total_size >> 20, ),
            "max log mtime delta: %d sec" % (self.get_max_mtime_delta()),
            "",
        ]
        lines.extend(self.get_channel_stats())

        return "\n".join(lines)


# File names of interest: they start with "job-service" and end with ".log".
FILENAME_RE = re.compile(r'^job-service.*\.log$')


def is_job_service_log(filename):
    """Tell whether ``filename`` looks like a job-service log.

    Returns the regex match object (truthy) when the name matches
    FILENAME_RE, otherwise None.
    """
    found = FILENAME_RE.match(filename)
    return found


if __name__ == "__main__":
    # Script entry point: scan the current working directory for job-service
    # logs and print the aggregated report to stdout.
    log_reporter = LogStatsReporter(".", is_job_service_log)
    log_reporter.run()
    summary = log_reporter.get_report()
    print(summary)
