# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import sys
#reload(sys)
#sys.setdefaultencoding('utf8')

import datetime
import time
import yt.wrapper as yt
import logging
import math
from collections import Counter

import re

from security.ant_secret.snooper import Snooper
#from hec_sender import SplunkHECSender


yt.config["proxy"]["url"] = "hahn.yt.yandex.net"

# The mapper calls yt.get from inside the job, so jobs must be allowed to
# issue HTTP requests to YT.
yt.config["allow_http_requests_to_yt_from_job"] = True


# https://wiki.yandex-team.ru/users/asaitgalin/pythonwrapperdiagnostics/
# https://bitbucket.browser.yandex-team.ru/projects/STARDUST/repos/browser-server-api-resources/pull-requests/9345/diff#ComponentUpdate/https_ew/collector/log_parsers/barnavig_parser.py
def _pickling_module_filter(module):
    """Tell the YT pickler whether ``module`` should be included.

    Rules (the variant marked "For Mac" in this file):
      * modules without a file (built-ins, namespace packages whose
        ``__file__`` is None) are excluded;
      * ``yson_lib.so`` is always included;
      * every other ``.so`` extension is excluded;
      * modules whose name contains ``hashlib``, ``weakref`` or ``hmac``
        are excluded.

    :param module: a module object.
    :return: True when the module should be included.
    """
    module_file = getattr(module, '__file__', None)
    if module_file is None:
        # hasattr() alone is not enough: __file__ may exist and be None.
        return False
    if module_file.endswith('yson_lib.so'):
        return True

    module_name = getattr(module, '__name__', '')
    return (
        not module_file.endswith('.so')
        and 'hashlib' not in module_name
        and 'weakref' not in module_name
        and 'hmac' not in module_name
    )


yt.config['pickling']['module_filter'] = _pickling_module_filter

yt.config['pickling']['force_using_py_instead_of_pyc'] = True

# Module-level scan window, formatted at day granularity.
# NOTE: main() assigns its own local TABLE_TIME_EARLIEST / TABLE_TIME_LATEST,
# so the search performed there does not read these two values.
_DAY_START_FORMAT = "%Y-%m-%dT00:00:00.0Z"
delta = datetime.datetime.now() - datetime.timedelta(minutes=5)
TABLE_TIME_EARLIEST = delta.strftime(_DAY_START_FORMAT)
TABLE_TIME_LATEST = datetime.datetime.now().strftime(_DAY_START_FORMAT)

# Clusters processed by main(), one after another.
CLUSTERS = ['hahn']

# Root of the Cypress search.
ROOT = "/"
# test for columns name
# ROOT = "//home/geosearch-prod"
# test for columns content
# ROOT = "//home/infrasec/yt/2018-02-18/analysis"
# ROOT = "//logs/jstracer-log"

# Table that receives the findings of every map operation.
DEST_TABLE = "//home/infrasec/tmp/tmp3"


def _column(name, yt_type):
    """Build one schema entry for DEST_TABLE."""
    return {'name': name, 'type': yt_type}


schema = [
    _column('id', 'string'),
    _column('table_name', 'string'),
    _column('table_owner', 'string'),
    _column('profile', 'string'),
    _column('secret', 'string'),
    _column('entropy', 'double'),
    _column('column', 'string'),
    _column('content', 'string'),
    _column('reported', 'boolean'),
]


# Search tuning
# Number of tables collected before one operation is started (max 1999)
# default 990
limit = 99

# Root logger configuration. No filename is passed, so filemode has no effect
# unless the commented-out filename below is restored.
_LOG_FORMAT = u'[%(asctime)s] %(filename)s %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=_LOG_FORMAT,
    # filename='/tmp/leases_new.log',
    filemode='a'
)

logger = logging.getLogger(__name__)

# Secret searcher used by the mapper for every cell value.
snooper = Snooper()
searcher = snooper.searcher()

#hec_sender = SplunkHECSender(
#    token = os.environ["HEC_TOKEN"],
#    )


def main():
    """Scan recently created YT tables for secrets and report them to Splunk HEC.

    For every cluster in ``CLUSTERS`` the tables found under ``ROOT`` are
    collected into batches, each batch is scanned by a map operation that
    writes its findings to ``DEST_TABLE``, and every resulting row is posted
    to the HTTP Event Collector.

    Helpers are defined as nested functions; the mapper uses ``convert``,
    ``entropy``, ``get_id`` and ``obj_list`` through the enclosing scope.
    """
    # Current batch of input tables. Shared through the closure: the mapper
    # indexes into it and fill_tables_cache appends to it.
    obj_list = []

    # Mapper function
    @yt.aggregator
    @yt.with_context
    def mapper(records, context):
        """Scan every non-empty cell of the input rows for secrets.

        Declared as a YT aggregator, so it receives the whole row iterator.
        One output row is yielded per secret reported by ``searcher.search``;
        the row carries every ``schema`` column except ``reported``.

        :param records: iterable of input rows (dicts). A row that contains a
            secret must carry the ``@table_index`` control field, which is
            used to look the source table up in ``obj_list``.
        :param context: job context supplied by ``yt.with_context`` (unused).
        """
        # table_index -> owner, so the owner of a table is fetched only once
        # per job instead of once per finding.
        owners = {}

        for record in records:
            for key in record:
                value = record[key]
                # do not analyse empty fields
                if value is None:
                    continue

                content = convert(value)

                # NOTE(review): latin1 presumably turns the JSON-decoded text
                # back into the original bytes - confirm against the format
                # settings used by run_map.
                for secret in searcher.search(content.encode('latin1'), valid_only=False):
                    table_index = record["@table_index"]
                    table = obj_list[table_index]
                    if table_index not in owners:
                        owners[table_index] = yt.get(table + '/@owner')

                    # A fresh dict per finding: rows already yielded are never
                    # mutated by later findings from the same cell.
                    found = {
                        'table_name': str(table),
                        'table_owner': owners[table_index],
                        'content': content,
                        'column': str(key),
                        'profile': str(secret.type),
                        # Undecodable bytes are replaced instead of failing
                        # the whole job on a single malformed secret.
                        'secret': secret.secret.decode('utf-8', 'replace'),
                        'entropy': entropy(secret.secret),
                    }
                    found['id'] = get_id(found)
                    yield found

    def get_id(dict_for_id):
        """Return the hex MD5 of the row's ``secret`` followed by its ``table_name``.

        Both values are UTF-8 encoded before hashing. On any failure the
        error is logged and the error text is returned in place of a digest.

        :param dict_for_id: row dict with ``secret`` and ``table_name`` strings.
        """
        # Imported locally, as in the rest of this file's job-side helpers.
        import hashlib

        try:
            digest = hashlib.md5()
            digest.update(dict_for_id['secret'].encode("utf8"))
            digest.update(dict_for_id['table_name'].encode("utf8"))
        except Exception as e:
            message = 'error in get_id function ' + str(e)
            logging.error(message)
            return message

        return digest.hexdigest()

    def check_secret(istring):
        """Match ``istring`` against the built-in secret patterns.

        Patterns are tried in order with ``re.match`` (anchored at the start
        of the string, case-insensitive) and the first accepted hit wins. The
        reported value is the first capture group, or the whole match for a
        pattern that defines no group (``coordinates``). A ``cookie`` hit is
        accepted only if it is at least ``min_len`` characters long and its
        entropy is at least ``min_entropy``.

        :param istring: text to inspect.
        :return: ``{'name': <pattern name>, 'value': <matched text>}``;
            ``{'name': '', 'value': None}`` when nothing matches;
            ``{'name': 'REG_ERROR', 'value': <error text>}`` if evaluating a
            match raises.
        """
        # Raw strings keep the regex escapes (\w, \s, \d) out of Python's own
        # string-escape processing.
        # NOTE(review): [\"] matches a double quote only; if an escaping
        # backslash before the quote was meant to match too, the class
        # should be [\\"] - confirm.
        reg_dict = [
            {'name': 'oauth', 'value': r'(?:oAuth|oauthToken|oauth)[\"]*[=:][\"]*([\w-]+)'},
            {'name': 'phpsessid', 'value': r'phpsessid[\"]*[=:][\"]*([a-z-A-Z-0-9-,]+)'},
            {'name': 'schema', 'value': r'(?:mongodb|pgsql|redis|postgresql|postgres|mysql)://[\w]+:([\w]+)@'},
            {'name': 'cookie', 'value': r'cookie[:"\s=]*([^&\s\'"]+)', 'min_len': 20, 'min_entropy': 5},
            {'name': 'token-AQAD', 'value': r'.*(AQAD-[^ ]+)'},
            {'name': 'access_token', 'value': r'.*access_token[\"]*[=:][\"]*([^&\s\'"]+)'},
            {'name': 'session', 'value': r'(?:session_id|sessionid|sessionid2|secure_session_id)[\"]*[=:][\"]*([\w.:%|-]+)'},
            {'name': 'coordinates', 'value': r'^\s*55\.[\d]{6,29}[,\s]+37\.[\d]{6,}\s*$|^\s*55\.[\d]{6,29}\s*$|^\s*37\.[\d]{6,29}\s*$'},
            #           { 'name': 'test', 'value': '.*log(\w+)' },
        ]
        # stop_dict = [
        #        { 'name': 'yandexuid', 'value': ''}
        # ]

        for key in reg_dict:
            r = re.match(key['value'], istring, re.IGNORECASE)
            if r is None:
                continue
            try:
                # A pattern without a capture group has no group(1); report
                # the whole match instead of raising "no such group".
                value = r.group(1) if r.re.groups else r.group(0)
                if not value:
                    continue
                if key['name'] == 'cookie' and (
                    len(value) < key['min_len'] or entropy(value) < key['min_entropy']
                ):
                    # Too short or too regular to be a real cookie value.
                    continue
                return {'name': key['name'], 'value': value}
            except Exception as e:
                logging.error('Error in check_secret ' + str(e))
                return {'name': 'REG_ERROR', 'value': str(e)}

        return {'name': '', 'value': None}

    def entropy(s):
        """Return the Shannon entropy of ``s`` in bits per symbol.

        :param s: any sized iterable of hashable symbols (text or bytes).
        :return: the entropy; 0 for an empty sequence.
        """
        frequencies = Counter(s)
        lns = float(len(s))

        total = 0
        for count in frequencies.values():
            share = count / lns
            total += share * math.log(share, 2)
        return -total

    # Convert a column value to a string for later analysis.
    def convert(s):
        """Return ``str(s)`` for any cell value (list, dict or scalar).

        If the conversion raises, the error is logged and a string starting
        with ``AQAD-CONVERT_ERROR`` that names the value's type and the error
        is returned instead.
        """
        try:
            return str(s)
        except Exception as e:
            message = 'AQAD-CONVERT_ERROR. Type: ' + str(type(s)) + '. Error: ' + str(e)
            logging.error(message)
            return message

    def reducer(key, records):
        """Yield one ``{"table_name": ...}`` row for the reduce key.

        :param key: mapping that contains ``table_name``.
        :param records: rows of the group; not consumed.
        """
        yield {"table_name": key["table_name"]}

    def check_tables(obj_list):
        """Scan one batch of tables and forward every finding to Splunk HEC.

        ``DEST_TABLE`` is recreated, a map operation running ``mapper`` over
        ``obj_list`` appends its findings to it, and every resulting row is
        sent as one event of the form ``<epoch seconds>, name="value", ...``.
        The ``reported`` column and columns whose value is None are left out;
        double quotes inside values are replaced with single quotes.

        Errors raised by the operation, by reading the table or by reporting
        are logged and swallowed, so the caller can go on to the next batch.

        :param obj_list: input tables for the map operation.
        """
        clean_result_table()

        reported_columns = [column['name'] for column in schema if column['name'] != 'reported']

        try:
            yt.run_map(mapper, obj_list, yt.TablePath(DEST_TABLE, append=True),
                       format=yt.JsonFormat(control_attributes_mode="row_fields"))

            rows = yt.read_table(DEST_TABLE, format=yt.JsonFormat(attributes={"encode_utf8": False}))
            for row in rows:
                # Epoch seconds. time.time() is used instead of
                # strftime('%s'), which is a platform-specific extension.
                parts = [str(int(time.time()))]
                for name in reported_columns:
                    value = row[name]
                    if value is not None:
                        parts.append(name + '="' + str(value).replace('"', "'") + '"')

                send_to_HEC(", ".join(parts))

        except Exception as e:
            logging.error("Error in check_tables: " + str(e))

    # Add the table to the current batch unless a near-identical path is cached.
    def fill_tables_cache(tables_cache, table_name):
        """Queue ``table_name`` for scanning unless a similar table is cached.

        The time-of-day part of a timestamped path is dropped first, e.g.
        "//home/logfeller/logs/avia-action-log/30min/2018-02-17T03:30:00"
        becomes "//home/logfeller/logs/avia-action-log/30min/2018-02-17".
        The table is skipped when the cache already holds a path of the same
        length that differs from the shortened path in fewer than two
        characters. Otherwise the table (limited to ``end_index=10``) is
        appended to ``obj_list`` and the shortened path to ``tables_cache``.

        :param tables_cache: list of shortened paths already queued; mutated.
        :param table_name: full table path.
        """
        # Cut at the 'T' of the timestamp itself, so a path that contains
        # another 'T' elsewhere is shortened too.
        stamped = re.match(r'(.*-\d\d)T\d\d:', table_name)
        temp_table_name = stamped.group(1) if stamped else table_name

        for line in tables_cache:
            if len(line) != len(temp_table_name):
                continue
            mismatches = sum(1 for cached, new in zip(line, temp_table_name) if cached != new)
            if mismatches < 2:
                logging.info("Skiped " + str(table_name) + ' because of ' + line)
                return

        logging.info("Appended " + str(table_name))
        obj_list.append(yt.TablePath(name=str(table_name), end_index=10))
        tables_cache.append(temp_table_name)

    def clean_result_table():
        """Drop ``DEST_TABLE`` and create it again with ``schema``.

        Removal is best effort: a failure (for example when the table does
        not exist) is logged and ignored. Errors from ``yt.create`` propagate.
        """
        try:
            yt.remove(path=DEST_TABLE)
        except Exception as e:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # still stop the script.
            logging.info("Could not remove " + DEST_TABLE + ": " + str(e))

        # NOTE(review): fixed pause between removal and re-creation; the
        # reason is not evident from this file - confirm it is still needed.
        time.sleep(10)
        yt.create("table", DEST_TABLE, attributes={"schema": schema})

    # OUT of YT ====================================================
    def prepare_secrets():
        """Return ``(YT_TOKEN, HEC_TOKEN)`` read from the process environment.

        :raises KeyError: if either variable is not set.
        """
        from os import environ

        yt_token = environ["YT_TOKEN"]
        hec_secret = environ["HEC_TOKEN"]
        return yt_token, hec_secret

    def send_to_HEC(event):
        """POST one event to the Splunk HTTP Event Collector.

        The event is wrapped as ``{"event": ..., "index": "temp",
        "sourcetype": "yt_secrets"}`` and sent with a ``Splunk <token>``
        Authorization header, using the ``hec_token`` of the enclosing scope.
        Failures are logged and never raised to the caller.

        :param event: event text to send.
        """
        import json

        import requests
        from requests.packages.urllib3.exceptions import InsecureRequestWarning

        # The request below is sent with verify=False; silence the warning
        # urllib3 would otherwise emit for it.
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

        HEC_host = "hatch.yandex.net"
        # (unused) HEC_HOST = "trapdoor-myt-1.sec.yandex.net"
        # import from environment
        HEC_token = hec_token
        requests_url = "https://%s/services/collector" % HEC_host
        # Upper bound in seconds, so a stalled collector cannot hang the scan.
        request_timeout = 60

        post_data = {
            "event": event,
            'index': "temp",
            'sourcetype': 'yt_secrets'
        }

        try:
            data = json.dumps(post_data).encode('utf8')
        except Exception as e:
            logging.error("Json.dumps error at send_to_HEC: " + str(e))
            data = "Json.dumps error at send_to_HEC: " + str(e)

        headers = {'Authorization': "Splunk %s" % HEC_token}

        try:
            # SECURITY NOTE(review): verify=False disables TLS certificate
            # validation while the HEC token is being sent - confirm this is
            # intended, or pass the CA bundle instead.
            req = requests.post(requests_url, data=data, headers=headers,
                                verify=False, timeout=request_timeout)
            response_json = json.loads(str(req.text))

            if "text" in response_json and response_json["text"] != "Success":
                logging.error("Error sending request")

        except Exception as e:
            logging.error("Unexpected error:" + str(e))

    # check_tables('asdf')
    # sys.exit()

    

    # Driver: for every cluster, list the tables created inside the time
    # window, collect them into batches and scan each batch with check_tables.
    for cluster in CLUSTERS:
        logging.info("Starting work for cluster " + cluster)
        # hec_token is read by send_to_HEC through the enclosing scope.
        yt.config["token"], hec_token = prepare_secrets()

        yt.config.config['proxy']['url'] = cluster

        # Rebinding obj_list here (and after every batch below) is visible to
        # mapper and fill_tables_cache, which use it as a closure variable.
        obj_list = []
        tables_cache = []

        # Window bounds as formatted strings: start of yesterday up to start
        # of today. These are locals of main() and shadow the module-level
        # constants of the same names.
        TABLE_TIME_EARLIEST = (datetime.datetime.now() - datetime.timedelta(1)).strftime("%Y-%m-%dT00:00:00.0Z")
        TABLE_TIME_LATEST = datetime.datetime.now().strftime("%Y-%m-%dT00:00:00.0Z")

        tables_list = yt.search(
            ROOT,
            node_type=["table"],
            # creation_time is compared against the bounds as a plain string.
            object_filter=lambda obj: TABLE_TIME_EARLIEST < obj.attributes.get("creation_time") < TABLE_TIME_LATEST,
            attributes=["account", "owner", "modification_time", "type", "creation_time"])

        for obj in tables_list:
            try:
                attributes = yt.get(str(obj), attributes=['dynamic', 'expiration_time'], format=yt.JsonFormat())
                # NOTE(review): membership test on whatever yt.get returns for
                # this format; presumably meant to keep only tables whose
                # 'dynamic' attribute is false - confirm the returned type.
                if 'false' in attributes:
                    fill_tables_cache(tables_cache, str(obj))
            except:
                # NOTE(review): the bare except reports every failure in the
                # block above as a missing read permission.
                #logging.info("Have no rights to read " + unicode(obj))
                logging.info("Have no rights to read %s" % (obj.encode('utf8')) ) 

            # A full batch: scan it and start collecting the next one.
            if len(tables_cache) == limit:
                check_tables(obj_list)
                obj_list = []
                tables_cache = []

        # final: scan the last, partially filled batch of this cluster
        check_tables(obj_list)



if __name__ == "__main__":
    main()
