import re
import json
import sys
from pyspark import SparkContext, SparkConf

# Application name handed to SparkConf.setAppName() by the entry point.
APP_NAME = 'sshd_login_failures'

# Capture-group fragments for the sshd failure message. They are raw strings
# so that the backslash escapes reach the `re` module untouched instead of
# being (mis)interpreted as string-literal escapes.
LOGIN_REGEXP = r'(\w+)'
IP_REGEXP = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
PORT_REGEXP = r'(\d+)'

# Matches e.g. "Failed password for root from 10.0.0.1 port 4242 ssh2" and
# yields the groups (login, ip, port). sshd inserts "invalid user " before the
# login when the account does not exist; the optional non-capturing prefix
# lets those lines match too without changing the number of groups.
SYSLOG_MSG_REGEXP = re.compile(
    'Failed password for (?:invalid user )?' + LOGIN_REGEXP
    + ' from ' + IP_REGEXP
    + ' port ' + PORT_REGEXP
    + ' ssh2'
)

# Input location template; `{key_id}` and `{key_secret}` are substituted with
# str.format() before the URI is passed to SparkContext.textFile().
# NOTE(review): the trailing glob appears to select files dated March 2015 -
# confirm against the bucket layout.
DATA_URI = 's3n://{key_id}:{key_secret}@lumberjack-forest/*/*2015-03-*'

def safejson(line):
    """Parse *line* as JSON, returning None instead of raising on bad input.

    Args:
        line: the text of one input record.

    Returns:
        The decoded JSON value, or None when *line* cannot be decoded.
    """
    try:
        return json.loads(line)
    except (ValueError, TypeError, RuntimeError):
        # ValueError: malformed JSON (JSONDecodeError and UnicodeDecodeError
        # are subclasses). TypeError: input that is not a string.
        # RuntimeError: recursion limit hit on pathologically nested input.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return None


def is_sshd_log(l):
    """Return True when the decoded record *l* was emitted by sshd.

    A record qualifies when it is a dict whose 'syslog_program' value equals
    'sshd'. Valid JSON that is not an object (a list, number, string, ...)
    has no such field and is reported as False rather than raising.
    """
    return isinstance(l, dict) and l.get('syslog_program', '') == 'sshd'


def is_pw_failure(sshd_log):
    """Return True when the record's message reports a failed password login.

    Args:
        sshd_log: dict-like record; its 'syslog_message' value is inspected.

    Returns:
        True if the message starts with 'Failed password for ', otherwise
        False - including when the field is missing, null, or not a string.
    """
    message = sshd_log.get('syslog_message')
    try:
        return message.startswith('Failed password for ')
    except AttributeError:
        # Missing key (None) or a non-string value: not a password failure.
        return False


def extract_tuple(pwfailure):
    """Turn a password-failure record into a (login, ip, port, host) tuple.

    The record's 'syslog_message' is matched against SYSLOG_MSG_REGEXP. When
    it matches, login, ip and port are the captured strings; when it does
    not, all three are None. host is the record's 'host' value, or None if
    the key is absent.
    """
    parsed = SYSLOG_MSG_REGEXP.match(pwfailure['syslog_message'])
    if parsed is not None:
        fields = parsed.groups()
    else:
        fields = (None, None, None)
    login, ip, port = fields
    return login, ip, port, pwfailure.get('host', None)


def main(sc, key_id, key_secret):
    """Count failed sshd password logins per connection and print the totals.

    Reads the text files selected by DATA_URI, keeps the non-blank lines that
    decode as JSON sshd records reporting a password failure, and counts how
    often each '<ip> -> <login>@<host>' combination occurs. Each resulting
    (connection, count) pair is printed on the driver.

    Messages that pass the filter but do not match SYSLOG_MSG_REGEXP are
    counted under a key whose ip and login are rendered as 'None'.

    Args:
        sc: the SparkContext used to read the input.
        key_id: AWS access key id substituted into DATA_URI.
        key_secret: AWS secret access key substituted into DATA_URI.
    """
    lines = sc.textFile(DATA_URI.format(key_id=key_id, key_secret=key_secret))
    nonblank = lines.filter(lambda x: x != "")
    jsonlines = nonblank.map(safejson)
    pwfails = jsonlines.filter(lambda x: x is not None and is_sshd_log(x) and is_pw_failure(x))
    # Not persisted: this RDD feeds exactly one action below, so caching it
    # would only cost executor memory.
    tuples = pwfails.map(extract_tuple)

    connections = tuples.map(lambda x: ('{0} -> {1}@{2}'.format(x[1], x[0], x[3]), 1))
    output = connections.reduceByKey(lambda x, y: x + y).collect()
    for x in output:
        # Parenthesised form prints the same text under Python 2 and is also
        # valid Python 3 syntax.
        print(x)

        
if __name__ == "__main__":
    # Read credentials first, so a usage error is reported before a
    # SparkContext is started.
    # NOTE(review): command-line arguments are typically visible to other
    # users in the process list - consider another way to supply the secret.
    if len(sys.argv) != 3:
        raise ValueError("credentials required - invoke as sshd_login_failures.py <aws_access_key_id> <aws_secret_access_key>")

    key_id = sys.argv[1]
    key_secret = sys.argv[2]

    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    sc = SparkContext(conf=conf)

    # Execute Main functionality; always stop the context, even if the job
    # fails.
    try:
        main(sc, key_id, key_secret)
    finally:
        sc.stop()
