###  Author: Maciek Laszcz <maciek@twitch.tv>
###  A logster parser file that can be used to group and count
###  the number of response codes found in the nginx log file
###
###  For example:
###  sudo ./logster --dry-run --output=ganglia NginxLogster /var/log/httpd/access_log
###
###  Based on SampleLogster which is Copyright 2011, Etsy, Inc.

import time
import re
from collections import defaultdict

from logster.logster_helper import MetricObject, LogsterParser
from logster.logster_helper import LogsterParsingException

class NginxLogster(LogsterParser):
    """Logster parser that counts nginx/varnish responses per request type.

    Every matched access-log line is classified along three axes:

    * request type  -- one of ``REQUEST``      (see ``_parse_request``)
    * status bucket -- one of ``STATUS``       (see ``_parse_status``)
    * cache source  -- one of ``CACHE_STATUS`` (see ``_parse_cache_status``)

    ``parse_line`` accumulates the counts in ``self.metric`` and ``get_state``
    turns them into ``MetricObject`` instances.
    """

    # One regex to rule them all, most of the cases covered by unit tests.
    # Raw strings keep the backslash escapes intact without relying on
    # Python passing unknown escape sequences through unchanged.
    REGEX = (r'^(?P<remote_addr>\S+)\s+-\s+\S+\s+\[(?P<time_local>[^]]+)\]\s+'
        r'"(?P<request>[^"]*)"\s+'
        r'(?P<status>\S+)\s+'
        r'(?P<body_bytes_sent>\S+)\s+'
        r'(?P<referrer>"[^"]*")\s+'
        r'(?P<user_agent>"[^"]*")\s+'
        r'(?P<request_time>\S+)\s+'       # standard stuff

        r'(?P<upstream_time>\S+( : \S+)*)\s+'                   # for restarted requests there will
                                                                # be multiple timings separated by
                                                                # a colon in the nginx log

        r'((?P<cache_status>\S+)|(?P<error_message>"[^"]*"))')  # this is a little hacky: lines that
                                                                # have cache status don't have quotes
                                                                # around this field. Note that \S+ is
                                                                # tried first and also accepts a leading
                                                                # quote, so for quoted error messages
                                                                # cache_status holds the first token.

    # Status buckets reported by get_state (see _parse_status).
    STATUS = ['403', '404', '2xx', '3xx', '502', '503', 'other']

    # Cache-source buckets reported by get_state (see _parse_cache_status).
    CACHE_STATUS = ['cache', 'upstream', 'local']

    # Request-type buckets reported by get_state (see _parse_request).
    REQUEST = ['chunk', 'playlist', 'other']

    def __init__(self, option_string=None):
        """Set up the per-request-type counters and compile the line regex.

        ``option_string`` is accepted for interface compatibility and is not
        used by this parser.
        """
        # One counter table per request type; missing counters start at 0.0 so
        # the divisions in get_state are float divisions on Python 2 as well.
        self.metric = dict((kind, defaultdict(float)) for kind in self.REQUEST)

        # Regular expression for matching lines we are interested in, and capturing
        # fields from the line, relies on the nginx format:
        # log_format  hls  '$remote_addr - $remote_user [$time_local]  '
        #                  '"$request" $status $body_bytes_sent '
        #                  '"$http_referer" "$http_user_agent" $request_time $upstream_response_time '
        #                  '$upstream_cache_status';
        # or varnish format: '%h %l %u %t  "%r" %s %b "%{Referer}i" "%{User-agent}i" - %{Varnish:time_firstbyte}x %{Varnish:hitmiss}x'
        self.reg = re.compile(self.REGEX)

    def _parse_status(self, status):
        """Map an HTTP status field to one of the ``STATUS`` buckets.

        403, 404, 502 and 503 are reported individually; any other three-digit
        code starting with 2 or 3 is folded into '2xx' / '3xx'.  Everything
        else, including values that are not a three-digit number, is 'other'.
        """
        if status in ('403', '404', '502', '503'):
            return status
        # Require exactly three digits: a plain string comparison against
        # '200'/'300' would also accept values such as '25' or '2500'.
        if status and len(status) == 3 and status.isdigit():
            if status[0] == '2':
                return '2xx'
            if status[0] == '3':
                return '3xx'
        return 'other'

    def _parse_cache_status(self, status):
        """Map a cache status field to one of the ``CACHE_STATUS`` buckets.

        The comparison is case-insensitive.  'HIT' is 'cache', 'MISS' and
        'EXPIRED' are 'upstream', and anything else (including a missing
        value) is 'local'.
        """
        # Defensive: a missing value is treated like an unrecognised one.
        normalized = (status or '').upper()
        if normalized == 'HIT':
            return 'cache'
        if normalized in ('MISS', 'EXPIRED'):
            return 'upstream'
        return 'local'

    def _parse_request(self, request):
        """Detect the type (ts/m3u8/unknown) of the request.

        ``request`` is the quoted request line, e.g. 'GET /a/b.ts?x=1 HTTP/1.1'.
        Returns 'chunk' for GET requests whose path (query string stripped)
        ends with '.ts', 'playlist' for '.m3u8', and 'other' for everything
        else, including empty or truncated request lines.
        """
        parts = request.split() if request else []
        # Need at least "<method> <target>" to classify anything.
        if len(parts) >= 2 and parts[0] == 'GET':
            path = parts[1].split('?')[0]
            if path.endswith('.ts'):
                return 'chunk'
            if path.endswith('.m3u8'):
                return 'playlist'
        return 'other'

    def _parse_time(self, timestamp):
        """Convert a log timestamp string to epoch seconds (as a string).

        Requires the third-party ``dateutil`` package, imported lazily so the
        parser itself works without it.  This helper is not called by
        ``parse_line``.
        """
        # NOTE(review): not verified here that dateutil's fuzzy parser copes
        # with nginx's 'dd/Mon/yyyy:HH:MM:SS +zzzz' layout -- confirm before
        # wiring this into parse_line.
        import calendar
        import dateutil.parser

        parsed = dateutil.parser.parse(timestamp, fuzzy=True)
        if parsed.tzinfo is not None:
            # Honour the UTC offset carried by the log entry.
            epoch = calendar.timegm(parsed.utctimetuple())
        else:
            # No offset available: interpret the value as local time.
            epoch = time.mktime(parsed.timetuple())
        return str(int(epoch))

    def parse_line(self, line):
        """Digest one log line and update the counters in ``self.metric``.

        Takes a single argument, the line to be parsed.

        Raises:
            LogsterParsingException: if the line does not match ``REGEX`` or
                if processing the captured fields fails.
        """
        try:
            # Apply regular expression to each line and extract interesting bits.
            match = self.reg.match(line)
            if not match:
                raise LogsterParsingException("regmatch failed to match: %s" % line)

            fields = match.groupdict()
            # The time_local field is captured but not used here; converting
            # it (see _parse_time) depends on dateutil.
            request_type = self._parse_request(fields['request'])
            status = self._parse_status(fields['status'])
            cache = self._parse_cache_status(fields['cache_status'])

            counters = self.metric[request_type]
            counters["status_" + status] += 1
            counters["cache_" + cache] += 1

        except LogsterParsingException:
            # Already the right type; do not wrap it a second time.
            raise
        except Exception as e:
            raise LogsterParsingException("regmatch or contents failed with %r" % e)

    def get_state(self, duration):
        """Turn the collected counters into a list of metric objects.

        ``duration`` is the length of the measured interval; it is stored on
        ``self.duration`` and used as the divisor for the rates.

        For each request type this emits:

        * one ``nginx_<type>_request_<status>`` rate ("count/s") per entry of
          ``STATUS`` -- only when ``duration`` is positive;
        * one ``nginx_<type>_from_<source>`` percentage ("pct") per entry of
          ``CACHE_STATUS`` -- only for types other than 'other' that saw at
          least one request.
        """
        self.duration = duration

        metrics = []
        for request in self.REQUEST:
            counters = self.metric[request]

            # status code stats (skipped when there is no interval to divide by)
            if self.duration > 0:
                metrics.extend([
                    MetricObject("nginx_{0}_request_{1}".format(request, status),
                        (counters["status_" + status] / self.duration), "count/s", group="nginx")
                    for status in self.STATUS
                ])

            # caching statistics, as a percentage of all requests of this type
            total = float(sum(counters["cache_" + cache] for cache in self.CACHE_STATUS))
            if request != 'other' and total > 0:
                metrics.extend([
                    MetricObject("nginx_{0}_from_{1}".format(request, cache),
                        (100 * counters["cache_" + cache] / total), "pct", group="nginx")
                    for cache in self.CACHE_STATUS
                ])

        return metrics
