# -*- coding: utf-8 -*-
# !/usr/bin/python
from __future__ import unicode_literals

import json
import sys
import logging
import re
import math
import os
from collections import Counter


# Record fields that extract_additional_info() copies into its output.
KEYS_TO_EXTRACT = [
    "_context",
    "_container_id",
    "_image_name",
    "_service",
    "_container_name",
]


def get_id(dict_for_id):
    """Return an identifier derived from a record's secret and table name.

    The identifier is the hex MD5 digest of the UTF-8 encoded ``secret``
    value followed by the UTF-8 encoded ``table_name`` value.

    Problems are reported through the return value instead of an exception:
    when either key is missing the string ``'error in get_id function'`` is
    returned, and when hashing fails (for example a value that has no
    ``encode`` method) that string is returned followed by a space and the
    exception text.
    """
    import hashlib

    # Both keys are required.  Testing with "or" stops a record that lacks
    # only one of them from reaching the lookups below.
    if 'secret' not in dict_for_id or 'table_name' not in dict_for_id:
        return 'error in get_id function'
    try:
        payload = dict_for_id['secret'].encode("utf8") + dict_for_id['table_name'].encode("utf8")
        return hashlib.md5(payload).hexdigest()
    except Exception as e:
        # Deliberately broad: callers store the result as an id and expect a
        # string back in every case.
        return 'error in get_id function ' + str(e)


def entropy(s):
    """Return the Shannon entropy of *s*, in bits per element.

    Every distinct element with relative frequency ``p`` contributes
    ``-p * log2(p)`` to the result.  An empty input yields ``0``.
    """
    frequencies = Counter(s)
    total = float(len(s))
    return -sum(
        occurrences / total * math.log(occurrences / total, 2)
        for occurrences in frequencies.values()
    )


# Detection rules as (profile name, compiled pattern) pairs, tried in order.
# Group 1 of each pattern is the secret that gets reported.
# NOTE(review): the patterns are applied with ``match``, so a rule without a
# leading ``.*`` only fires when its keyword starts the string - confirm that
# this is intended before switching to ``search``.
_SECRET_RULES = (
    ('oauth', re.compile(r'(?:oAuth|oauthToken|oauth)[\"]*[=:][\"]*([\w-]+)')),
    ('phpsessid', re.compile(r'phpsessid[\"]*[=:][\"]*([a-z-A-Z-0-9-,]+)')),
    ('schema', re.compile(r'(?:mongodb|pgsql|redis|postgresql|postgres|mysql)://[\w]+:([\w]+)@')),
    ('cookie', re.compile(r'cookie[:"\s=]*([^&\s\'"]+)')),
    ('token-AQAD', re.compile(r'.*(AQAD-[^ ]+)')),
    ('access_token', re.compile(r'.*access_token[\"]*[=:][\"]*([^&\s\'"]+)')),
    ('session', re.compile(
        r'(?:session_id|sessionid|sessionid2|secure_session_id)[\"]*[=:][\"]*([\w.:%|-]+)')),
    ('autoru_sid', re.compile(r'.*(\d{8}\|\d{6,}\.(?!.*\*{3}))')),
    ('vin_number', re.compile(r'.*([A-HJ-NPR-Z0-9]{17})')),
    ('promocode', re.compile(r'.*(S\-[A-Z0-9]{8,13}\-[A-Z0-9]{5})')),
)

# A "cookie" candidate is reported only when it reaches both thresholds.
_COOKIE_MIN_LEN = 20
_COOKIE_MIN_ENTROPY = 5

# Number of characters after an "autoru_sid" match searched for a "***" mask.
_AUTORU_SID_MASK_WINDOW = 64


def check_secret(istring):
    """Return the first secret-like value found in *istring*.

    The rules in ``_SECRET_RULES`` are tried in order and the first accepted
    match wins.  The result is a dict with two keys:

    * ``name``  - the profile name of the rule that fired, ``''`` when no
      rule fired, or ``'REG_ERROR <message>'`` when evaluating a match raised.
    * ``value`` - the captured secret, or ``None`` when nothing is reported.

    Extra acceptance checks:

    * ``cookie`` values must be at least ``_COOKIE_MIN_LEN`` characters long
      and have an ``entropy()`` of at least ``_COOKIE_MIN_ENTROPY``.
    * ``autoru_sid`` values are rejected when ``***`` occurs within the
      ``_AUTORU_SID_MASK_WINDOW`` characters that follow the match.
    """
    # Strings carrying the conversion-failure marker would otherwise be
    # picked up by the "token-AQAD" rule.
    if 'AQAD-CONVERT_ERROR' in istring:
        return {'name': '', 'value': None}

    for name, pattern in _SECRET_RULES:
        found = pattern.match(istring)
        if found is None or not found.group(1):
            continue
        try:
            candidate = found.group(1)
            if name == 'cookie':
                accepted = (len(candidate) >= _COOKIE_MIN_LEN
                            and entropy(candidate) >= _COOKIE_MIN_ENTROPY)
            elif name == 'autoru_sid':
                # A masked sid must not fall through to the generic accept
                # branch, otherwise this check has no effect.
                tail_end = found.end() + _AUTORU_SID_MASK_WINDOW
                accepted = '***' not in istring[found.end():tail_end]
            else:
                accepted = True
            if accepted:
                return {'name': name, 'value': candidate}
        except Exception as e:
            logging.error('Error in check_secret ' + str(e))
            return {'name': 'REG_ERROR {}'.format(str(e)), 'value': None}
    return {'name': '', 'value': None}


def convert(s):
    """Return the ``str()`` form of *s*.

    Lists, dicts, numbers, booleans and everything else are all rendered the
    same way, with ``str(s)``.  If that conversion raises, the failure is
    logged and a string starting with ``'AQAD-CONVERT_ERROR'`` that names the
    type and the error is returned instead of raising.
    """
    # Lazy %-style argument: the value is formatted only if INFO logging is
    # enabled, so a value that cannot be stringified reaches the try block
    # below instead of failing while the log message is being built.
    logging.info("Trying to parse %s", s)
    try:
        return str(s)
    except Exception as e:
        message = 'AQAD-CONVERT_ERROR. Type: ' + str(type(s)) + '. Error: ' + str(e)
        logging.error(message)
        return message


def extract_additional_info(record):
    """Return the ``KEYS_TO_EXTRACT`` fields of *record* that are not ``None``.

    The result is a new dict.  Any error raised while inspecting a single
    key is ignored and that key is left out.
    """
    extracted = {}
    for field in record:
        try:
            if record[field] is None or field not in KEYS_TO_EXTRACT:
                continue
            extracted[field] = record[field]
        except Exception:
            # Best effort: a key that cannot be read is skipped.
            continue
    return extracted


def get_service_mask(record):
    """Return ``"<_service>:<_context>"`` for *record*, or ``None``.

    ``None`` is returned when either field is missing from the extracted
    info or when the two values cannot be joined.
    """
    try:
        details = extract_additional_info(record)
        parts = [details['_service'], details['_context']]
        return ':'.join(parts)
    except Exception:
        return None


def mapper():
    """Scan JSON rows from stdin and print one JSON finding per secret.

    Every stdin line is parsed with ``json.loads``.  Rows whose ``"$value"``
    entry is ``None`` are skipped.  For each remaining cell that is not
    ``None`` the value is stringified with ``convert()`` and inspected with
    ``check_secret()``.  When a secret is reported, a JSON object with the
    keys ``table_name``, ``column``, ``profile``, ``secret``, ``entropy`` and
    ``id`` is printed to stdout.

    The table name comes from the ``TABLE_NAME`` environment variable and
    falls back to ``"todo"`` when it is not set.

    An error raised while processing a single cell is logged with its
    traceback and the scan moves on to the next cell.  A line that is not
    valid JSON still raises from ``json.loads``.
    """
    table_name = os.environ.get("TABLE_NAME", "todo")

    for line in sys.stdin:
        row = json.loads(line)
        if "$value" in row and row["$value"] is None:
            continue
        for key in row:
            try:
                cell = row[key]
                if cell is None:
                    continue
                secret_type = check_secret(convert(cell))
                if not secret_type or not secret_type['value']:
                    continue
                # Key order is kept stable so the emitted JSON looks the same
                # from run to run.
                finding = {
                    "table_name": table_name,
                    "column": str(key),
                    "profile": secret_type['name'],
                    "secret": secret_type['value'],
                    "entropy": entropy(secret_type['value']),
                }
                finding['id'] = get_id(finding)
                print(json.dumps(finding))
            except Exception as e:
                # Best effort per cell: log and keep scanning.  No error
                # record is emitted.
                logging.error(str(e), exc_info=True)


# Script entry point: run the mapper when the file is executed directly.
if __name__ == "__main__":
    mapper()
