#!/usr/bin/env python
# -*- coding: utf-8 -*-

import base64
import json
import hashlib
import ipaddress
import multipart as mp
import sys
import yt.wrapper as yt

from io import BytesIO

try:
    from utils import yt_utils
    from utils import sim_utils

except ImportError:
    from sandbox.projects.Molly.MollyReqMiner.utils import yt_utils
    from sandbox.projects.Molly.MollyReqMiner.utils import sim_utils


class BalancerAggregator(yt_utils.YtLogParser):
    """Parse requests logs from qloud balancer and extract parameters from them"""

    def __init__(self, yt_server='hahn.yt.yandex.net', yt_token=None, ip_filter_data=None, filter_param=None,
                 req_log_table=None,
                 aggr_table=None):
        """Initialise the parent YT log parser and the request filtering settings.

        :param yt_server: forwarded as the first argument of the parent constructor
        :param yt_token: forwarded as the second argument of the parent constructor
        :param ip_filter_data: indexable with three items ``(prefix_len, prefixes, nets)`` used by
            ``_check_filtered_ip``: the first ``prefix_len`` characters of an IP are looked up in
            ``prefixes`` and the parsed address is then tested with ``in`` against ``nets[prefix]``.
            ``None`` (the default) disables IP based filtering
        :param filter_param: name of a request parameter; requests that carry it are skipped by the mapper
        :param req_log_table: YT table for the mapper output; a ``//tmp`` name is generated when empty
        :param aggr_table: YT table for the aggregated output; a ``//tmp`` name is generated when empty
        """
        super(BalancerAggregator, self).__init__(yt_server, yt_token)
        self.service_id = None
        self.yt_utils = yt_utils
        self.sim_utils = sim_utils
        # Only these headers survive _parse_headers
        self.val_headers = {'Accept', 'Content-Type', 'Cookie', 'Referer', 'User-Agent', 'X-Host-Y', 'X-Real-Ip',
                            'X-Forwarded-For-Yw', 'X-Forwarded-For-Y'}
        # Cookies mentioning any of these names are removed before the cookies are stored
        self.filtered_cookies = {'Session_id', 'sessionid2', 'Secure_session_id'}

        if ip_filter_data is None:
            # The signature advertises None as the default, so it must not crash:
            # a zero-length prefix with no known prefixes means no IP is ever filtered.
            ip_filter_data = (0, frozenset(), {})

        self.filter_len = ip_filter_data[0]
        self.filter_prefs = ip_filter_data[1]
        self.filter_nets = ip_filter_data[2]
        self.filtered_paths = {'/favicon.ico'}
        self.molly_filter_param = filter_param
        self.req_log_table = req_log_table
        self.aggr_table = aggr_table
        self.path_parts_table = None
        self.path_valuable_table = None

    @staticmethod
    def _is_html_type(ctype):
        for item in ctype:
            if item.find('text/html') >= 0:
                return True

        return False

    def _filter_cookie(self, c_item):
        for filtered in self.filtered_cookies:
            if c_item.find(filtered) >= 0:
                return True

        return False

    def _clean_cookie_list(self, c_list):
        cleaned_clist = []
        for item in c_list:
            if item.startswith(' '):
                item = item[1:]

            if self._filter_cookie(item):
                continue

            cleaned_clist.append(item)

        return cleaned_clist

    @staticmethod
    def _make_cookie_dict(c_list):
        cookie_dict = {}
        for item in c_list:
            sp_item = item.split('=')
            if len(sp_item) == 2:
                c_key = sp_item[0]
                c_val = sp_item[1]
                cookie_dict[c_key] = c_val

        return cookie_dict

    def _check_filtered_ip(self, ip):
        pref = ip[:self.filter_len]
        if pref not in self.filter_prefs:
            return False

        if ipaddress.ip_address(ip) in self.filter_nets[pref]:
            return True

    def _parse_headers(self, headers):
        parserd_headers = {}
        for item in headers:
            name = item.get('Name')
            if name not in self.val_headers:
                continue

            parserd_headers[name] = item.get('Value')

        return parserd_headers

    def _filter_static(self, query, ctype):
        if not ctype:
            return False

        if not query and not self._is_html_type(ctype):
            return True

        return False

    @staticmethod
    def _extract_sep_params(query):
        params = {}
        param_list = query.split('&')
        for item in param_list:
            pdata = item.split('=')
            if len(pdata) != 2:
                continue

            params[pdata[0]] = pdata[1]

        return params

    @staticmethod
    def _extract_multipart_params(query):
        """Parse a multipart body into a dict of its parts.

        The boundary is taken from the body itself: the text before the first ``\\r`` with its
        first two characters removed (presumably the leading ``--`` of the delimiter line).

        :param query: multipart request body
        :return: dict mapping part name to a ``(filename, value)`` tuple; parts read before a
            ``MultipartError`` are kept, so the dict may be partial or empty
        """
        from multipart import tob

        parsed = {}
        first_line = query.split('\r')[0]
        boundary = first_line[2:]
        try:
            parser = mp.MultipartParser(BytesIO(tob(query)), boundary)
            for part in parser.parts():
                parsed[part.name] = (part.filename, part.value)

        except mp.MultipartError:
            # Broken body: fall through and return whatever has been collected
            pass

        return parsed

    @staticmethod
    def _extract_json_params(query):
        try:
            params = json.loads(query)

        except ValueError:
            return None

        return params

    def _decode_typed_query(self, ctype, raw_query):
        params = None
        query = base64.b64decode(raw_query)
        if not ctype:
            return query, params

        if ctype.startswith('application/x-www-form-urlencoded'):
            params = self._extract_sep_params(query)

        elif ctype.startswith('multipart/form-data'):
            params = self._extract_multipart_params(query)

        elif ctype == 'application/json':
            params = self._extract_json_params(query)

        return query, params

    @yt.aggregator
    def _parse_req_logs(self, recs):
        """
        Map operation for parsing log table's records, yields valuable request data to the map
        results table.

        A record is skipped when it has no ``_rest`` payload, belongs to another service, targets
        a filtered path, has no headers or no ``X-Host-Y`` value, comes from a filtered IP, is a
        GET for static content, or carries the ``self.molly_filter_param`` parameter.

        Every yielded row contains: ``service``, ``method``, ``host``, ``path``, ``query``,
        ``protocol``, ``ctype`` (``'-'`` when unknown), ``cookies`` and ``params`` (JSON strings),
        ``cookie_names`` and ``param_names`` (``str()`` of a sorted list or of None), ``ip``,
        ``user_agent``, ``referer`` and ``path_method_ctype`` (sha256 hex digest used as reduce key).

        :param recs: iterable of log records (dict-like, with the request stored under ``_rest``)
        """
        for rec in recs:
            cookie_names = None
            ctype = None
            params = None
            param_names = None
            query = None
            ip = None
            qdata = rec.get('_rest')
            if not qdata:
                continue

            service = qdata.get('ServiceId')
            # Skip other services
            if service != self.service_id:
                continue

            path = qdata.get('Path')

            # Skip filtered paths
            if path in self.filtered_paths:
                continue

            prot = qdata.get('Proto')
            raw_headers = qdata.get('Headers')
            method = qdata.get('Method')

            # Skip request without headers
            if not raw_headers:
                continue

            headers = self._parse_headers(raw_headers)
            atype = headers.get('Accept')
            host_headers = headers.get('X-Host-Y')
            # Skip requests without a host; an empty value list counts as missing too
            if not host_headers:
                continue
            host = host_headers[0]

            # The name must match the spelling in self.val_headers exactly, otherwise
            # _parse_headers has already dropped the header and the IP filter never fires
            ip_data = headers.get('X-Forwarded-For-Yw')
            if ip_data:
                ip = ip_data[0]

            # Skip Molly and crasher instances by their IP
            if ip and self._check_filtered_ip(ip):
                continue

            cookies = headers.get('Cookie')
            if cookies:
                c_list = cookies[0].split(';')
                cleaned_list = self._clean_cookie_list(c_list)
                cookies = self._make_cookie_dict(cleaned_list)
                cookie_names = sorted(cookies)

            # Skip requests for static content
            if method == 'GET':
                query = qdata.get('RawQuery')
                # A missing query yields no parameters instead of crashing the parser
                params = self._extract_sep_params(query) if query else {}
                if self._filter_static(query, atype):
                    continue

            elif method in {'PATCH', 'POST', 'PUT'}:
                ct_data = headers.get('Content-Type')
                if ct_data:
                    ctype = ct_data[0]

                raw_query = qdata.get('Body')
                # Body-less requests keep query/params as None
                if raw_query is not None:
                    query, params = self._decode_typed_query(ctype, raw_query)

            if params and isinstance(params, dict):
                param_names = sorted(params)
                if self.molly_filter_param in param_names:
                    continue

            if not ctype:
                ctype = '-'

            # Everything after 'multipart/form-data' (the per-request boundary) is cut off
            # so that all multipart requests to one path/method share the same key
            key_src = '_'.join([path, method, ctype.split('multipart/form-data')[0]])
            if not isinstance(key_src, bytes):
                # hashlib needs bytes; byte strings are hashed as they are
                key_src = key_src.encode('utf-8')

            yield {'service': service, 'method': method, 'host': host, 'path': path, 'query': query, 'protocol': prot,
                   'ctype': ctype, 'cookies': json.dumps(cookies), 'cookie_names': str(cookie_names), 'ip': ip,
                   'user_agent': headers.get('User-Agent'), 'referer': headers.get('Referer'),
                   'params': json.dumps(params), 'param_names': str(param_names),
                   'path_method_ctype': hashlib.sha256(key_src).hexdigest()}

    @staticmethod
    def _aggregate_cookies_params(key, recs):
        """
        Reduce operation for aggregate all request feature from log
        :param key:
        :param recs:
        """
        service = None
        path = None
        method = None
        ctype = None
        prot = None
        path_len = 0
        all_params = {}
        all_cookies = {}
        all_hosts = set()
        for rec in recs:
            service = rec.get('service')
            method = rec.get('method')
            ctype = rec.get('ctype')
            path = rec.get('path')
            path_len = str(len(path.split('/')))
            host = rec.get('host')
            prot = rec.get('protocol')
            params = json.loads(rec.get('params'))
            cookies = json.loads(rec.get('cookies'))
            if not params or not isinstance(params, dict):
                continue

            if not all_cookies:
                all_cookies = cookies

            if all_cookies and cookies:
                all_cookies.update(cookies)

            if not all_params:
                all_params = params

            all_hosts.add(host)
            all_params.update(params)

        if all_hosts and all_params:
            param_names = all_params.keys()
            param_names.sort()
            yield {'path_method_ctype': key['path_method_ctype'], 'service': service, 'path': path, 'method': method,
                   'ctype': ctype, 'hosts': list(all_hosts), 'all_params': json.dumps(all_params),
                   'all_cookies': json.dumps(all_cookies), 'protocol': prot,
                   'path_len_params': hashlib.sha256('_'.join([path_len, json.dumps(param_names)])).hexdigest()}

    def _filter_similar_paths(self, key, recs):
        """
        Reduce operation for filtering path with unique uri parametrs
        :param key:
        :param recs:
        """
        all_paths = set()
        all_recs = []
        for rec in recs:
            path = rec.get('path')
            all_paths.add(path)
            all_recs.append(rec)

        valuable_paths = self.sim_utils.filter_similar_paths(all_paths)
        for rec in all_recs:
            if rec.get('path') in valuable_paths:
                yield rec

    def _make_params_logs(self, start_datetime, end_datetime):
        """
        Finds the daily log tables under ``//logs/molly-http-log/1d/`` for the given datetime
        range and runs the ``_parse_req_logs`` map over them into ``self.req_log_table``.

        When ``self.req_log_table`` is not set, a ``//tmp`` table name is generated from
        ``self.service_id`` and the end date. The process exits with a non-zero status when no
        input tables are found.

        :param start_datetime: start of the range, passed to ``yt_utils.search_daily_tables``
        :param end_datetime: end of the range; must provide ``strftime``
        """
        input_tables = self.yt_utils.search_daily_tables(self.yt, start_datetime, end_datetime,
                                                         '//logs/molly-http-log/1d/')

        if not input_tables:
            print("[-] Error, no input tables were found in datetime criteria")
            # A bare sys.exit() reports success (status 0); this is a failure
            sys.exit(1)

        if not self.req_log_table:
            self.req_log_table = '//tmp/{0}_molly_balancer_log.{1}'.format(self.service_id,
                                                                           end_datetime.strftime('%Y-%m-%d'))

        self.yt.run_map(self._parse_req_logs, input_tables, self.req_log_table, format=yt.YsonFormat())

    def aggregate_path_params(self, service_id, start_datetime, end_datetime):
        """
        Builds the aggregated table of unique request parameters for one service.

        Steps: map the raw logs into ``self.req_log_table``, sort and reduce it by
        ``path_method_ctype`` into ``self.aggr_table``, then sort and reduce ``self.aggr_table``
        in place by ``path_len_params`` to drop similar paths.

        :param service_id: service whose requests are aggregated; stored in ``self.service_id``
        :param start_datetime: start of the log range
        :param end_datetime: end of the log range; also used for the generated table name
        """
        self.service_id = service_id
        if not self.aggr_table:
            day = end_datetime.strftime('%Y-%m-%d')
            self.aggr_table = '//tmp/{0}_molly_req_aggr.{1}'.format(service_id, day)

        # Stage 1: extract the valuable request data from the raw logs
        self._make_params_logs(start_datetime, end_datetime)

        # Stage 2: merge params/cookies/hosts of identical path + method + content type
        request_key = 'path_method_ctype'
        self.yt.run_sort(self.req_log_table, sort_by=request_key)
        self.yt.run_reduce(self._aggregate_cookies_params, self.req_log_table, self.aggr_table,
                           reduce_by=request_key, format=yt.YsonFormat())

        # Stage 3: among paths of equal length and parameter set keep only the valuable ones
        similarity_key = 'path_len_params'
        self.yt.run_sort(self.aggr_table, sort_by=similarity_key)
        self.yt.run_reduce(self._filter_similar_paths, self.aggr_table, self.aggr_table,
                           reduce_by=similarity_key, format=yt.YsonFormat())
