# -*- coding: utf-8 -*-
# !/usr/bin/python
from __future__ import unicode_literals

import json
import sys
import logging
import re
import math
from collections import Counter


# Record fields that extract_additional_info() copies into its output dict;
# get_service_mask() additionally reads '_service' and '_context' from them.
KEYS_TO_EXTRACT = [
    "_context",
    "_container_id",
    "_image_name",
    "_service",
    "_container_name",
]


def get_id(dict_for_id):
    """Return an MD5 hex digest identifying a finding.

    The digest is computed over the UTF-8 bytes of ``dict_for_id['secret']``
    followed by the UTF-8 bytes of ``dict_for_id['table_name']``.

    Any failure (for example a missing key) is logged and the error message
    itself is returned in place of a digest; no exception escapes.
    """
    import hashlib
    try:
        secret_bytes = dict_for_id['secret'].encode("utf8")
        table_bytes = dict_for_id['table_name'].encode("utf8")
        digest = hashlib.md5(secret_bytes + table_bytes)
        return digest.hexdigest()
    except Exception as e:
        message = 'error in get_id function ' + str(e)
        logging.error(message)
        return message


def entropy(s):
    """Return the Shannon entropy of *s* in bits per symbol.

    *s* is any sized iterable of hashable items (typically a string). An
    empty input yields 0 because there are no symbols to sum over.
    """
    length = float(len(s))
    # Relative frequency of every distinct symbol, in first-seen order.
    probabilities = [occurrences / length for occurrences in Counter(s).values()]
    return -sum(p * math.log(p, 2) for p in probabilities)


# Detection profiles, tried in order by check_secret(). Each pattern is
# compiled once (case-insensitive) and must expose the candidate secret as
# group 1. Raw strings keep the regex escapes (\w, \s, \d, ...) out of Python's
# own string-escape processing.
# Disabled profile kept for reference:
#   {'name': 'phone_number', 'regex': r'.*(7\d{10})'}
_SECRET_PATTERNS = (
    {'name': 'oauth',
     'regex': re.compile(r'(?:oAuth|oauthToken|oauth)[\"]*[=:][\"]*([\w-]+)', re.IGNORECASE)},
    {'name': 'phpsessid',
     'regex': re.compile(r'phpsessid[\"]*[=:][\"]*([a-z-A-Z-0-9-,]+)', re.IGNORECASE)},
    {'name': 'schema',
     'regex': re.compile(r'(?:mongodb|pgsql|redis|postgresql|postgres|mysql)://[\w]+:([\w]+)@',
                         re.IGNORECASE)},
    {'name': 'cookie',
     'regex': re.compile(r'cookie[:\"\s=]*([^&\s\'\"]+)', re.IGNORECASE),
     'min_len': 20, 'min_entropy': 5},
    {'name': 'token-AQAD',
     'regex': re.compile(r'.*(AQAD-[^ ]+)', re.IGNORECASE)},
    {'name': 'access_token',
     'regex': re.compile(r'.*access_token[\"]*[=:][\"]*([^&\s\'\""]+)', re.IGNORECASE)},
    {'name': 'session',
     'regex': re.compile(
         r'(?:session_id|sessionid|sessionid2|secure_session_id)[\"]*[=:][\"]*([\w.:%|-]+)',
         re.IGNORECASE)},
    {'name': 'autoru_sid',
     'regex': re.compile(r'.*(\d{8}\|\d{6,}\.(?!.*\*{3}))', re.IGNORECASE)},
)


def check_secret(istring):
    """Classify *istring* against the known secret profiles.

    Profiles are tried in the order of ``_SECRET_PATTERNS`` and the first one
    that accepts the string wins. Patterns are applied with ``match``, i.e.
    anchored at the start of *istring*, so a profile whose pattern does not
    begin with ``.*`` only fires when the string starts with its keyword.

    Extra acceptance rules on top of the regex:
      * ``cookie``: the captured value must be at least ``min_len`` characters
        long and have at least ``min_entropy`` bits of entropy.
      * ``autoru_sid``: no ``***`` mask may appear anywhere after the match,
        including on later lines (which the regex lookahead alone does not see).

    Returns a dict ``{'name': <profile name>, 'value': <whole istring>}`` on a
    hit, ``{'name': '', 'value': None}`` when nothing matches or when the
    string carries the ``AQAD-CONVERT_ERROR`` marker, and
    ``{'name': 'REG_ERROR' + msg, 'value': msg}`` if evaluating a match raises.
    """
    if 'AQAD-CONVERT_ERROR' in istring:
        return {'name': '', 'value': None}

    for profile in _SECRET_PATTERNS:
        match = profile['regex'].match(istring)
        try:
            if match is None or not match.group(1):
                continue
            name = profile['name']
            candidate = match.group(1)
            if name == 'cookie':
                found_secret = (len(candidate) >= profile['min_len']
                                and entropy(candidate) >= profile['min_entropy'])
            elif name == 'autoru_sid':
                # This must be its own exclusive branch: falling through to
                # the generic "accept" case would make the mask check a no-op.
                found_secret = '***' not in istring[match.end():]
            else:
                found_secret = True
            if found_secret:
                return {'name': name, 'value': istring}
        except Exception as e:
            logging.error('Error in check_secret ' + str(e))
            return {'name': 'REG_ERROR' + str(e), 'value': str(e)}

    return {'name': '', 'value': None}


def convert(s):
    """Return the ``str()`` rendering of *s*, whatever its type.

    Lists, dicts, numbers, booleans and everything else are all rendered
    through ``str``. If the conversion raises, a marker string starting with
    ``AQAD-CONVERT_ERROR`` (including the type and the error) is returned
    instead of propagating the exception.
    """
    try:
        text = str(s)
        if type(s) is list:
            # Joining the characters of the rendering reproduces the same text.
            return ''.join(text)
        return text
    except Exception as e:
        return 'AQAD-CONVERT_ERROR. Type: ' + str(type(s)) + '. Error: ' + str(e)


def extract_additional_info(record):
    """Return the subset of *record* whose keys are listed in KEYS_TO_EXTRACT.

    Entries whose value is ``None`` are left out. Keys keep the order in which
    *record* yields them. A failure while inspecting a single key is ignored
    and that key is skipped (best effort).
    """
    extracted = {}
    for field in record:
        try:
            value = record[field]
            if value is None or field not in KEYS_TO_EXTRACT:
                continue
            extracted[field] = value
        except Exception:
            # Best effort: a key that cannot be inspected is simply omitted.
            pass
    return extracted


def get_service_mask(record):
    """Return ``"<_service>:<_context>"`` for *record*, or ``None``.

    ``None`` is returned whenever the mask cannot be built, e.g. when either
    field is missing (or ``None``) in *record* or the two values cannot be
    joined as strings.
    """
    try:
        details = extract_additional_info(record)
        parts = (details['_service'], details['_context'])
        return ':'.join(parts)
    except Exception:
        return None


def mapper():
    """Scan JSON rows from stdin and print findings as JSON lines on stdout.

    The table name is taken from ``sys.argv[1]``. Every line of stdin is parsed
    as one JSON row; rows whose ``"$value"`` entry is ``None`` are skipped.
    For each non-``None`` cell two independent checks run:

      1. the cell value (stringified by ``convert``) is classified by
         ``check_secret``;
      2. the column name is tested against a fixed set of sensitive names and
         a sensitive-name regex.

    A finding from either check is printed only if ``get_service_mask`` yields
    a mask for the row and that mask has not been reported before, so at most
    one regular finding is emitted per service mask. If processing a cell
    raises, an error record is printed unconditionally.
    """
    interesting_keys = frozenset((
        'passport', 'phone', 'telephone', 'address',
        'login', 'yandex_login', 'yandexlogin', 'yandexuid',
        'pass', 'password', 'secret',
    ))
    # Compiled once instead of being looked up for every key of every row.
    column_name_re = re.compile(
        r'(?!(options|.*stbx|sum|.*balancer|.*otrs|channel|failed)).*'
        r'(device.?id|yandex.?uid'
        r'|(\.|^|client.*|remote.*|safe.*|_|real.*|session.*|frontend.*|user.*)ip6?s?(.*numeric)?$'
        r'|email(?!.*notifications)'
        r'|(?!mobile.*)phone(?!(.*ali|.*id|.*model|.*vendor|.*ua|s))'
        r'|passport(?!uid)(?!.uid)|login|fio|ios.?uid|cookie(?!s))'
        r'((?!(hash|daily|flag|status)).)*$',
        re.IGNORECASE)
    known_services = set()
    table_name = sys.argv[1]

    def report_once(finding, record):
        """Print *finding* unless the row has no service mask or it was already reported."""
        service = get_service_mask(record)
        if service is not None and service not in known_services:
            known_services.add(service)
            print(json.dumps(finding))

    for line in sys.stdin:
        row = json.loads(line)
        if "$value" in row and row["$value"] is None:
            continue
        for key in row:
            d = dict()
            # Reset per key so the error handler below never sees an unbound
            # name or the classification of a previous cell.
            secret_type = {}
            try:
                cell = row[key]
                if cell is None:
                    continue

                # Check 1: search for a secret in the cell value.
                content = convert(cell)
                secret_type = check_secret(content)
                if secret_type['value']:
                    d["table_name"] = table_name
                    d['table_owner'] = "todo"
                    d['column'] = str(key)
                    d['profile'] = secret_type['name']
                    d['secret'] = secret_type['value']
                    d['entropy'] = entropy(secret_type['value'])
                    d['id'] = get_id(d)
                    d['misc'] = json.dumps(extract_additional_info(row))
                    report_once(d, row)

                # Check 2: the column name itself looks sensitive.
                if key in interesting_keys or column_name_re.match(key):
                    d["table_name"] = table_name
                    d['table_owner'] = "TODO"
                    # Keep the name as text: bytes are not JSON serializable.
                    d['column'] = u''.join(key).strip()
                    d['profile'] = 'column name'
                    d['misc'] = json.dumps(extract_additional_info(row))
                    # NOTE(review): get_id() reads d['secret'], which is only
                    # present here if check 1 also fired for this cell;
                    # otherwise the id is get_id's error message. Confirm the
                    # intended id for column-name findings.
                    d['id'] = get_id(d)
                    report_once(d, row)
            except Exception as e:
                d["table_name"] = table_name
                d['table_owner'] = "todo"

                if secret_type:
                    d['profile'] = secret_type['name']
                    d['secret'] = str(secret_type['value']) + " " + str(e)
                else:
                    d['profile'] = 'ERROR'
                    d['secret'] = str(e)

                # Text, not bytes, so the error record itself can be dumped.
                d['content'] = u''.join(key).strip()
                d['column'] = 'error at: '
                d['id'] = get_id(d)
                d['misc'] = json.dumps(extract_additional_info(row))
                print(json.dumps(d))


# Run the mapper only when executed as a script (not on import); mapper()
# reads the table name from sys.argv[1] and the rows from stdin.
if __name__ == "__main__":
    mapper()
