import logging
from requests import get
from time import mktime
from datetime import datetime
from collections import defaultdict
import re

from sandbox.projects.SecDis.data_types import Item, ItemType
from sandbox.projects.SecDis.Collectors import BaseCollector

from sandbox.projects.SecDis.Collectors.sim_utils import filter_similar_paths

from sandbox.sandboxsdk.environments import PipEnvironment


class HandleYTCollector(BaseCollector):
    """Service Security Discovery: new handles by yt access logs collector task.

    Picks the most recent access-log table from YT, reads it through YQL and
    reports one RESOURCE item per virtual host plus one HANDLE item per path
    kept by ``filter_similar_paths``.
    """
    collector_name = 'handle_yt'
    input_types = ()
    output_types = (ItemType.HANDLE, ItemType.RESOURCE,)

    class Requirements(BaseCollector.Requirements):
        # The yql client is imported lazily inside process_access_log.
        environments = [PipEnvironment('yql', '1.2.66')]

    def get_last_table(self, service='market'):
        """Return the newest access-log table for ``service``.

        Lists the log directory through the YT HTTP API and picks the node
        whose name parses to the latest ``%Y-%m-%dT%H:%M:%S`` timestamp.

        :param service: service name, case-insensitive; only 'market' is known.
        :return: string ``'<cluster>.[<table path>]'`` or None when the
            service is unknown or no node name could be parsed as a timestamp.
        :raises requests.HTTPError: when YT answers with an error status.
        """
        if service.lower() != 'market':
            return None
        path = '//home/logfeller/logs/market-new-access-log/30min'
        cluster = 'hahn'

        yt_access_token = self.get_vault('OAuthYTSecDis')
        data = {
            'path': path,
            'attributes': ['type', 'path'],
            'max_size': 20000,
        }
        headers = {
            'Authorization': 'OAuth {}'.format(yt_access_token),
            'Accept': 'application/json',
        }
        url = 'https://{}.yt.yandex-team.ru/api/v3/list'.format(cluster)
        # A timeout keeps the task from hanging forever on a stuck connection.
        res = get(url, headers=headers, params=data, timeout=60)
        # Without this check an error payload (a JSON object) would be
        # iterated as a list of keys and silently produce "no tables found".
        res.raise_for_status()
        json_data = res.json()

        last_ts = 0
        table_path = None
        for node in json_data:
            # Only dict entries are used; they carry '$value' and '$attributes'.
            if not isinstance(node, dict):
                continue
            name = node['$value']
            try:
                parsed = datetime.strptime(name, "%Y-%m-%dT%H:%M:%S")
            except ValueError:
                # One unexpected node name must not abort the whole listing.
                logging.warning('[get_last_table] skip node with unexpected name %s', name)
                continue
            table_ts = int(mktime(parsed.timetuple()))
            if table_ts > last_ts:
                last_ts = table_ts
                table_path = '{}.[{}]'.format(cluster, node['$attributes']['path'])

        logging.info('[get_last_table] %s', table_path)
        return table_path

    def process_access_log(self, table_path):
        """Query ``table_path`` via YQL and register resources and handles.

        Rows are kept only if their status equals the string '200' and the
        vhost looks like a domain name. The query string is stripped from the
        request path. For every vhost a RESOURCE item is added, then a HANDLE
        item for each path returned by ``filter_similar_paths``.

        :param table_path: table reference as returned by ``get_last_table``.
        """
        from yql.api.v1.client import YqlClient

        yql_access_token = self.get_vault('OAuthYQLSecDis')
        client = YqlClient(token=yql_access_token)

        project_id = self.Parameters.project_id
        # vhost -> {path: method of the first accepted request to that path}
        vhost_dict = defaultdict(dict)
        pat = re.compile(r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}$')
        # Method of the last row read; used only as a fallback below.
        last_method = None

        req = client.query('select method, canonized_vhost, request, status from {} WHERE request!="/ping" limit 100000;'.format(table_path))
        req.run()

        for table in req.get_results():
            table.fetch_full_data()
            for row in table.rows:
                method = row[0]
                vhost = row[1]
                path = row[2]
                status = row[3]
                last_method = method

                # NOTE(review): status is compared as a string - confirm the
                # column type in the log schema.
                if status != '200' or not vhost or not pat.match(vhost):
                    continue
                path = path.split('?', 1)[0]
                # Remember the method per path instead of relying on whatever
                # value the loop variable holds after the last row.
                vhost_dict[vhost].setdefault(path, method)

        logging.info('[process_access_log] vhost count %s', len(vhost_dict))
        for vhost, methods_by_path in vhost_dict.items():
            paths = set(methods_by_path)
            logging.info('[process_access_log] vhost %s paths %s', vhost, len(paths))
            valuable_paths = filter_similar_paths(paths)
            logging.info('[process_access_log] vhost %s valuable_paths %s', vhost, len(valuable_paths))

            new_resource = Item(ItemType.RESOURCE, project_id, vhost, list(), resource_type='domain')
            self.add_result(new_resource, list())

            for path in valuable_paths:
                # filter_similar_paths is defined elsewhere; if it returns a
                # path that was not in its input, fall back to the method of
                # the last row read, which is what the code used before.
                handle_method = methods_by_path.get(path, last_method)
                new_handle = Item(ItemType.HANDLE, project_id, vhost+path, list(), method=handle_method, vhost=vhost, path=path)
                self.add_result(new_handle, list())

    def on_execute(self):
        """Process the newest access-log table unless it was already handled.

        The path of the last processed table is kept in the auxiliary data
        and compared with the current newest table to skip repeated runs.
        """
        auxiliary = self.load_auxiliary()
        logging.info("auxiliary %s", auxiliary)
        if auxiliary is None:
            auxiliary = dict()
        prev_table_path = auxiliary.get('table_path', '')
        table_path = self.get_last_table('market')
        if table_path is None:
            # Nothing to query; do not build a query against "None" and do
            # not overwrite the stored path of the last processed table.
            logging.warning('No access log table found, nothing to process')
            return
        if table_path == prev_table_path:
            logging.info('Table %s already processed', table_path)
            return
        self.process_access_log(table_path)

        auxiliary = {
            'table_path': table_path,
        }
        self.save_auxiliary(auxiliary)

        self.save_result()
