#!/usr/bin/env python
# This script was created to reduce the noise of the nginx 5xx error alerts on tier1 hosts. It replaces them with more informative stats

from datetime import datetime, timedelta
import requests
import re
import statsd
import socket
from multiprocessing import Process

# File scanned by check_log (passed to it when the child process is created).
LOG_LOCATION = "/var/log/nginx/video_tier1_pr_error.log"
# Base URL of the Consul HTTP API and the catalog path queried for the
# hls-origin service; the %s placeholder is filled with a datacenter name.
CONSUL_URL = 'http://localhost:8500'
CONSUL_ENDPOINT = "/v1/catalog/service/hls-origin?dc=%s"
# Host handed to statsd.StatsClient for all counters this script emits.
STATSD_ADDR = 'statsd.internal.justin.tv'

# Naive local time at start-up; everything below is relative to this instant.
# NOTE(review): assumes the log's timestamps use the same local time zone -- confirm.
now = datetime.now()
print "running at", now.strftime('%Y/%m/%d %H:%M:%S')
# Only log lines newer than (now - lookback) are processed.
lookback = timedelta(minutes = 1)
# Cut-off as a 19-character 'YYYY/MM/DD HH:MM:SS' string. check_log compares it
# lexicographically with the first 19 characters of each log line, which is only
# meaningful if lines start with a timestamp in this same format.
timestart = (now - lookback).strftime('%Y/%m/%d %H:%M:%S')

def process_service(service):
	"""Collect the 'Node' value of every entry in `service`.

	`service` is any iterable of mappings that each have a 'Node' key; in this
	script it is the decoded Consul catalog response. Returns a new list of
	the 'Node' values in iteration order (empty if `service` is empty).
	Raises KeyError if an entry has no 'Node' key.
	"""
	node_names = []
	for entry in service:
		node_names.append(entry['Node'])
	return node_names

# Ask the Consul agent which nodes are registered for the hls-origin service.
# NOTE(review): the datacenter is hard-coded to 'sjc02' -- confirm this is
# intended for hosts that live in other datacenters.
consul_response = requests.get("%s%s" % (CONSUL_URL, CONSUL_ENDPOINT % 'sjc02'))
# `Response.json` is a property holding the decoded body in requests < 1.0 but
# a method in requests >= 1.0. Passing it along uncalled hands process_service
# a bound method on newer versions (TypeError when iterated), so call it when
# it is callable and use it as-is otherwise.
consul_payload = consul_response.json
if callable(consul_payload):
	consul_payload = consul_payload()
# Node names known to Consul; process_line uses this to tell a failing but
# registered upstream from one that is not registered at all.
nodes = process_service(consul_payload)


# Client used for every counter; 8125 is the port passed to StatsClient.
statsd_client = statsd.StatsClient(STATSD_ADDR, 8125)

# Dots are replaced so the hostname can be embedded in dotted metric names
# (presumably to keep it a single path component -- verify against the
# statsd/graphite naming scheme in use).
hostname = socket.getfqdn()
hostname = hostname.replace(".", "_") # e.g. video-tier1-345fbc_cmh01_justin_tv

# Captures the first path segment of a GET request; process_line treats that
# segment as the upstream node name.
server = re.compile('GET /([^/]*)/')

def process_line(line):
	match = server.search(line)
	print line
	# Check for bad upstream
	if 'no resolver defined to resolve' in line:
		if match.group(1) in nodes:
			print match.group(1), "exists"
			statsd_client.incr("tier1.upstream.%s.request_monitoring.upstream_error" % match.group(1))
			statsd_client.incr("tier1.%s.request_monitoring.upstream_error" % hostname)
		else:
			print match.group(1), "does not exist"
			statsd_client.incr("tier1.%s.request_monitoring.bad_upstream" % hostname)

	# Check for timeouts
	if 'upstream timed out' in line:
		print match.group(1), "timed out"
		statsd_client.incr("tier1.upstream.%s.request_monitoring.upstream_timeout" % match.group(1))
		statsd_client.incr("tier1.%s.request_monitoring.upstream_timeout" % hostname)


def check_log(log):
	"""Pass every recent-enough line of the file at path `log` to process_line.

	A line counts as recent when its first 19 characters compare strictly
	greater (as strings) than the module-level `timestart` cut-off; all other
	lines are ignored. The file is read line by line and closed on exit.
	Returns None.
	"""
	with open(log) as log_file:
		for entry in log_file:
			stamp = entry[:19]
			if stamp <= timestart:
				# At or before the cut-off: nothing to report for this line.
				continue
			process_line(entry)

# Scan the log in a child process so that a scan which runs too long can be
# abandoned instead of blocking this script.
p = Process(target=check_log, args=(LOG_LOCATION,))
p.start()
# Wait at most 30 seconds for the scan to finish.
p.join(30)
if p.is_alive():
	# Still running after the timeout: stop the scan and record, via a
	# dedicated counter and a message, that this run was cut short.
	p.terminate()
	statsd_client.incr("tier1.%s.request_monitoring.too_many_errors" % hostname)
	print "The log took more than 30 seconds to process so the attempt was aborted."



