import argparse
import concurrent.futures as futures
import datetime
import gzip
import zstandard as zstd
import json
import logging.config
import os
import random
import re
import time
import traceback
from typing import Optional, Tuple, Union  # noqa

import udatetime
import yaml
from kikimr.public.sdk.python.persqueue.auth import TVMCredentialsProvider
from kikimr.public.sdk.python.persqueue.grpc_pq_streaming_api import ConsumerConfigurator
from kikimr.public.sdk.python.persqueue.grpc_pq_streaming_api import PQStreamingAPI, ConsumerMessageType, WriterCodec
from tenacity import RetryError as tenacity_RetryError
from ticket_parser2.api.v1 import BlackboxClientId
from tvm2 import TVM2

from hec_sender import SplunkHECSender


def read_yaml_config(filename):
    # type: (str) -> dict
    """
    Read a config file in yaml format using yaml.safe_load.

    :param filename: Filepath to config file
    :type filename: str
    :return: Parsed content of the config file
    :rtype: dict
    :raises IOError: If file not found in file system
    :raises ValueError: If config file is empty
    """

    if not os.path.isfile(filename):
        raise IOError("Config file not found!")

    with open(filename, "r") as config:
        content = yaml.safe_load(config)

    # The opened file object is never None, so emptiness has to be detected on the parsed
    # result: yaml.safe_load gives None for a document without any content.
    if content is None:
        raise ValueError("Config file is empty!")

    return content


# Logging is configured at import time from "logging.yml"; the relative path is resolved
# against the current working directory of the process.
logging_config = read_yaml_config("logging.yml")
logging.config.dictConfig(logging_config)

# Logger that LogbrokerConsumer.logger wraps into a LoggerAdapter with extra fields.
base_logger = logging.getLogger("lb_consumer")
# NOTE(review): presumably used by the script entry point - no usage is visible in this part of the file.
root_logger = logging.getLogger("main")


# TODO: Create base class, which contains only persqueue API and consumer initialization methods.
class LogbrokerConsumer(object):
    """
    SOC Logbroker Consumer implementation.
    Uses logbroker Python API via persqueue lib:
    https://wiki.yandex-team.ru/logbroker/docs/libs/python/?from=%252Flogbroker%252Fdocs%252Flib%252Fpython%252F

    Purpose: read topics from LB and deliver them to the Security team log storage (Splunk cluster for now).
    Data will be sent to Splunk via HTTP Event Collector, using our hec_sender lib.

    Check repo README for more information.
    """

    def __init__(self, hec_token,  # type: str
                 topics,  # type: list
                 logbroker_client,  # type: str
                 logbroker_host,  # type: str
                 logbroker_host_port,  # type: int
                 connection_timeout,  # type: int
                 dry_run,  # type: bool
                 tvm_client_id,  # type: int
                 tvm_secret_key,  # type: str
                 tvm_logbroker_client_id,  # type: int
                 lock,  # type: bool
                 read_infly_count,  # type: int
                 max_messages_count,  # type: int
                 read_only_local,  # type: bool
                 filters_file="./filters.yml",  # type: str
                 hec_config_file="./hec_sender_config.yml",  # type: str
                 max_time_lag=None  # type: Optional[int]
                 ):
        # type: (...) -> None
        """
        Store the settings, load the filters and HEC config files, start the Logbroker API interface
        and build the Splunk HEC sender. The consumer itself is not created here (self.consumer is None
        until prepare_consumer is called).

        :param hec_token: Token for Splunk HEC
        :type hec_token: str
        :param topics: Topics list, to read from LB
        :type topics: list
        :param logbroker_client: LB client name
        :type logbroker_client: str
        :param logbroker_host: LB host from which to read (man.logbroker.yandex.net is the main balancer)
        :type logbroker_host: str
        :param logbroker_host_port: LB host port (use 2135)
        :type logbroker_host_port: int
        :param connection_timeout: Connection timeout: used for consumer instance creation, next event reading.
            Wait provided time in seconds before throwing exception. If next event / partition not found and timeout
            exceeded - reading will stop with RuntimeError
        :type connection_timeout: int
        :param dry_run: Parameter for testing - when provided as True, no events would be sent to HEC
        :type dry_run: bool
        :param tvm_client_id: TVM application client id (reader application, you should create it first)
        :type tvm_client_id: int
        :param tvm_secret_key: TVM application secret key (reader application, you should create it first)
        :type tvm_secret_key: str
        :param tvm_logbroker_client_id: LB TVM application client id
        :type tvm_logbroker_client_id: int
        :param lock: Read partitions with locks - required while reading from multiple instances of consumer.
        :type lock: bool
        :param read_infly_count: max unprocessed reads infly. read_infly_count should be < max_messages_count.
            For example read_infly_count=100, max_messages_count=200
        :type read_infly_count: int
        :param max_messages_count: maximum count of messages to read at one request - use with read_infly_count
        :type max_messages_count: int
        :param read_only_local: True=read only topics from current DC. False=read also mirrored topics.
        :type read_only_local: bool
        :param filters_file: Main config file. Filepath for config file with topics settings and filters. In Yaml format
        :type filters_file: str
        :param hec_config_file: Filepath for Yaml config file with HEC sender settings. Keys read from it:
            retries_count, retries_min_delay, retries_max_delay, retries_timeout, post_timeout, batch_size,
            threads, source, hosts, port
        :type hec_config_file: str
        :param max_time_lag: Skip messages if time lag greater than specified. Lag in seconds.
        :type max_time_lag: int
        :raises RuntimeError: If the Logbroker API interface did not start within connection_timeout
            (raised by prepare_interface)
        """

        # Extra fields handed to the LoggerAdapter built by the `logger` property.
        # Must be set before the first self.logger access.
        self.logger_extra = {"traceback_trace": json.dumps({}),
                             "topic_name": "",
                             "topic_location": ""}

        self.__hec_token = hec_token
        self.topics = topics
        self.logbroker_client = logbroker_client
        self.logbroker_host = logbroker_host
        self.logbroker_host_port = logbroker_host_port
        self.timeout = connection_timeout
        self.dry_run = dry_run
        self.__tvm_client_id = tvm_client_id
        self.__tvm_secret_key = tvm_secret_key
        self._tvm_logbroker_client_id = tvm_logbroker_client_id
        self.lock_while_reading = lock
        self.read_infly_count = read_infly_count
        self.max_messages_count = max_messages_count
        self.read_only_local = read_only_local
        # prepare_filters reads self.filters_config_file, so the path has to be assigned first.
        self.filters_config_file = filters_file
        self.filters = self.prepare_filters()
        self.max_time_lag = max_time_lag

        # The lag is configured in seconds, the consumer configuration takes milliseconds.
        # None is passed through unchanged.
        if self.max_time_lag is not None:
            self.max_time_lag_ms = self.max_time_lag * 1000
        else:
            self.max_time_lag_ms = self.max_time_lag

        if dry_run:
            self.logger.warning("[ DRY_RUN ] Dry run set to True, events will not be send to Splunk")

        self.credentials_provider = self.prepare_tvm_credentials_provider(self.__tvm_client_id,
                                                                          self.__tvm_secret_key,
                                                                          self._tvm_logbroker_client_id)

        self.consumer_configuration = self.prepare_consumer_configuration(topics=self.topics,
                                                                          client_id=self.logbroker_client,
                                                                          max_count=self.max_messages_count,
                                                                          use_client_locks=self.lock_while_reading,
                                                                          read_infly_count=self.read_infly_count,
                                                                          read_only_local=self.read_only_local,
                                                                          max_time_lag_ms=self.max_time_lag_ms)

        # Starts the API and waits up to self.timeout seconds for it; raises RuntimeError on timeout.
        self.api, self.api_ready = self.prepare_interface(self.logbroker_host, self.logbroker_host_port,
                                                          self.timeout)

        # Created later by prepare_consumer.
        self.consumer = None

        # Setup retry options
        hec_config = read_yaml_config(hec_config_file)
        self.hec_retries_count = hec_config["retries_count"]
        self.hec_retries_min_delay = hec_config["retries_min_delay"]
        self.hec_retries_max_delay = hec_config["retries_max_delay"]
        self.hec_retries_timeout = hec_config["retries_timeout"]
        self.hec_post_timeout = hec_config["post_timeout"]
        self.hec_batch_size = hec_config["batch_size"]
        self.hec_threads = hec_config["threads"]
        self.hec_source = hec_config["source"]

        # Application load balancing. Should be None to use l3
        self.hec_hosts = hec_config["hosts"]
        self.hec_port = hec_config["port"]

        # Retry settings are assigned on the SplunkHECSender class itself (not on the instance),
        # before the sender instance is created.
        SplunkHECSender.retries_count = self.hec_retries_count
        SplunkHECSender.retries_min_delay = self.hec_retries_min_delay
        SplunkHECSender.retries_max_delay = self.hec_retries_max_delay
        SplunkHECSender.retries_timeout = self.hec_retries_timeout
        # NOTE(review): hec_verify_ssl=False presumably disables TLS certificate verification
        # for HEC requests - confirm this is intended.
        self.hec_sender = SplunkHECSender(token=self.__hec_token,
                                          source=self.hec_source,
                                          post_timeout=self.hec_post_timeout,
                                          batch_size=self.hec_batch_size,
                                          threads=self.hec_threads,
                                          hec_hosts=self.hec_hosts,
                                          hec_verify_ssl=False,
                                          hec_host_port=self.hec_port
                                          )

    @property
    def logger(self):
        """
        Logger of this consumer: module-level base_logger wrapped into a LoggerAdapter.

        A new adapter is built on every access and receives self.logger_extra as its extra fields
        (the fields required by the log formatter).

        :return: Adapter around base_logger
        :rtype: logging.LoggerAdapter
        """
        return logging.LoggerAdapter(base_logger, self.logger_extra)

    def prepare_filters_original(self):
        # type: () -> dict
        """
        Read filters file and prepare dict from Yaml (earlier variant of prepare_filters).
        Run checks, to ensure the valid format of config.
        Required fields:
            - index

        Regex strings found in the config are replaced by compiled patterns:
            - "pattern" of source_from_field, sourcetype_from_field and timestamp_field
            - "timezone_pattern" of timestamp_field
            - every item of inclusions_by_pattern and exclusions_by_pattern (config order is kept)

        Topics which define "alias" are stored in the result under the alias instead of the topic name.

        :return: Filters settings
        :rtype: dict
        :raises ValueError: If index is not provided for a topic
        :raises TypeError: If index is not str, a *_from_field / timestamp_field section is not a mapping
            or a patterns section is not a list
        """
        filters = read_yaml_config(self.filters_config_file)

        single_pattern_keys = ("source_from_field", "sourcetype_from_field", "timestamp_field")
        pattern_list_keys = ("inclusions_by_pattern", "exclusions_by_pattern")

        # The result is collected into a new dict: re-keying `filters` by alias while iterating
        # over it would change the dict during iteration.
        prepared = dict()

        for key, value in filters.items():
            # Check required fields are provided
            if "index" not in value:
                raise ValueError("You must provide index parameter in filters.yml for topic!")
            if not isinstance(value["index"], str):
                raise TypeError("Index parameter in filters.yml should be str type!")

            for field_name in single_pattern_keys:
                field_settings = value.get(field_name)
                # Key is absent or left empty in Yaml (None / {}): nothing to compile.
                if not field_settings:
                    continue
                if not isinstance(field_settings, dict):
                    raise TypeError("{} should be 'dict' type!".format(field_name))

                # Only timestamp_field carries a second regex for the timezone.
                if field_name == "timestamp_field":
                    regex_keys = ("pattern", "timezone_pattern")
                else:
                    regex_keys = ("pattern",)

                for regex_key in regex_keys:
                    if field_settings.get(regex_key) is not None:
                        field_settings[regex_key] = re.compile(field_settings[regex_key])

            for field_name in pattern_list_keys:
                if field_name not in value:
                    continue

                if not isinstance(value[field_name], list):
                    raise TypeError("{} should be 'list' type!".format(field_name))

                # Build a new list instead of popping / inserting into the list being enumerated.
                value[field_name] = [re.compile(pattern) for pattern in value[field_name]]

            if "alias" in value:
                prepared[value["alias"]] = value
            else:
                prepared[key] = value

        return prepared

    def prepare_filters(self):
        # type: () -> dict
        """
        Read filters file and prepare dict from Yaml.
        Run checks, to ensure the valid format of config.
        Required fields:
            - index

        Regex strings found in the config are replaced by compiled patterns:
            - "pattern" of source_from_field, sourcetype_from_field and timestamp_field
            - "timezone_pattern" of timestamp_field
            - every item of inclusions_by_pattern and exclusions_by_pattern (config order is kept)

        Topics which define "alias" are stored in the result under the alias instead of the topic name.

        :return: Filters settings
        :rtype: dict
        :raises ValueError: If index is not provided for a topic
        :raises TypeError: If index is not str, a *_from_field / timestamp_field section is not a mapping
            or a patterns section is not a list
        """
        filters = read_yaml_config(self.filters_config_file)

        pattern_keys = ("source_from_field", "sourcetype_from_field", "timestamp_field")
        patterns_keys = ("inclusions_by_pattern", "exclusions_by_pattern")

        _filters = dict()

        for key in filters:
            topic_filters = filters[key]

            # Check required fields are provided
            if "index" not in topic_filters:
                raise ValueError("You must provide index parameter in filters.yml for topic!")
            if not isinstance(topic_filters["index"], str):
                raise TypeError("Index parameter in filters.yml should be str type!")

            for field_name in pattern_keys:
                field_settings = topic_filters.get(field_name)
                # Key is absent or left empty in Yaml (None / {}): nothing to compile.
                if not field_settings:
                    continue
                if not isinstance(field_settings, dict):
                    raise TypeError("{} should be 'dict' type!".format(field_name))

                # Only timestamp_field carries a second regex for the timezone.
                if field_name == "timestamp_field":
                    regex_keys = ("pattern", "timezone_pattern")
                else:
                    regex_keys = ("pattern",)

                for regex_key in regex_keys:
                    if field_settings.get(regex_key) is not None:
                        field_settings[regex_key] = re.compile(field_settings[regex_key])

            for field_name in patterns_keys:
                if field_name not in topic_filters:
                    continue

                if not isinstance(topic_filters[field_name], list):
                    raise TypeError("{} should be 'list' type!".format(field_name))

                # Build a new list instead of popping / inserting into the list being enumerated.
                topic_filters[field_name] = [re.compile(pattern) for pattern in topic_filters[field_name]]

            # Topics with alias are addressed by the alias in the result.
            if "alias" in topic_filters:
                _filters[topic_filters["alias"]] = topic_filters
            else:
                _filters[key] = topic_filters

        return _filters


    def prepare_consumer(self, timeout=None):
        # type: (int) -> None
        """
        Create consumer via self.api, keep it in self.consumer and wait until its start future is resolved.

        :param timeout: Seconds to wait for consumer start. When None, self.timeout * 3 is used
        :type timeout: int
        :return: None
        :rtype: None
        :raises: RuntimeError - if consumer start was not finished in time (futures.TimeoutError)
        """
        start_timeout = self.timeout * 3 if timeout is None else timeout

        try:
            self.logger.info("Initializing consumer...")
            self.consumer = self.api.create_consumer(consumer_configurator=self.consumer_configuration,
                                                     credentials_provider=self.credentials_provider)
            self.logger.info("Consumer successfully prepared!")

            start_future = self.consumer.start()
            start_future.result(timeout=start_timeout)

        except futures.TimeoutError:
            # Traceback is attached through logger_extra for this single record only, then reset.
            self.logger_extra["traceback_trace"] = json.dumps(traceback.format_exc())
            self.logger.error("Consumer initialization timeout! ")
            self.logger_extra["traceback_trace"] = json.dumps({})

            stop_message = self.consumer.stop_future.result()
            self.logger.error("Stop future result message: %s", stop_message)

            raise RuntimeError("Failed to initialize consumer - initialization timeout!")

    @staticmethod
    def prepare_consumer_configuration(topics, client_id, max_count, use_client_locks, read_infly_count,
                                       read_only_local, max_time_lag_ms):
        # type: (list, str, int, bool, int, bool, Optional[int]) -> ConsumerConfigurator
        """
        Build ConsumerConfigurator instance; every argument is passed to it under the same keyword name.

        :param topics: List of topics to read
        :type topics: list
        :param client_id: Reader key client_id (not the TVM client id)
        :type client_id: str
        :param max_count: Max events count to get per single request to LB
        :type max_count: int
        :param use_client_locks: Use locking reading or not.
        :type use_client_locks: bool
        :param read_infly_count: Max infly_count to support
        :type read_infly_count: int
        :param read_only_local: Read only from provided DC (logbroker_host value)
        :type read_only_local: bool
        :param max_time_lag_ms: Max time lag in milliseconds, or None when no lag is configured
        :type max_time_lag_ms: int
        :return: ConsumerConfigurator instance with required settings
        :rtype: ConsumerConfigurator
        """
        configurator_settings = dict(
            topics=topics,
            client_id=client_id,
            max_count=max_count,
            use_client_locks=use_client_locks,
            read_infly_count=read_infly_count,
            read_only_local=read_only_local,
            max_time_lag_ms=max_time_lag_ms,
        )
        return ConsumerConfigurator(**configurator_settings)

    def prepare_interface(self, host, port, timeout):
        # type: (str, int, int) -> Tuple[PQStreamingAPI, bool]
        """
        Create PQStreamingAPI for provided host and port, start it and wait for the start result.
        The returned API instance is used afterwards to create the consumer.

        :param host: Logbroker host
        :type host: str
        :param port: Logbroker host port
        :type port: int
        :param timeout: Seconds to wait for the start future
        :type timeout: int
        :return: API instance and the result of its start future
        :rtype: tuple
        :raises: RuntimeError - if the start future was not resolved in time (futures.TimeoutError)
        """
        streaming_api = PQStreamingAPI(host, port)
        start_future = streaming_api.start()

        try:
            start_result = start_future.result(timeout)
        except futures.TimeoutError:
            self.logger.error("Interface initialization timeout! ")
            # NOTE(review): stop_future is read from the object returned by api.start() -
            # confirm that object really exposes it.
            self.logger.debug("Stop future result message: %s", start_future.stop_future.result())

            raise RuntimeError("Failed to initialize PQStreamingAPI interface - initialization timeout!")
        else:
            return streaming_api, start_result

    def stop_interface(self):
        # type: () -> None
        """
        Stop the Logbroker API interface when required.

        self.api.stop() is called only if self.api_ready is truthy; otherwise an error is logged.
        Messages go through self.logger, like in the rest of the class, so that the records carry
        the extra fields from self.logger_extra.

        :return: None
        :rtype: None
        """
        if self.api_ready:
            self.logger.info("Stopping Logbroker API interface...")
            self.api.stop()
        else:
            self.logger.error("Logbroker API interface not initialized")

    @staticmethod
    def prepare_tvm_credentials_provider(tvm_client_id, tvm_secret_key, tvm_logbroker_client_id):
        # type: (int, str, int) -> TVMCredentialsProvider
        """
        Build TVMCredentialsProvider used for auth.

        TVM2 client is created with BlackboxClientId.Prod; the Logbroker TVM client id is its only
        allowed client and its only destination.

        :param tvm_client_id: TVM client id
        :type tvm_client_id: int
        :param tvm_secret_key: TVM secret key
        :type tvm_secret_key: str
        :param tvm_logbroker_client_id: Logbroker TVM client id
        :type tvm_logbroker_client_id: int
        :return: TVM Credential instance
        :rtype: TVMCredentialsProvider
        """
        logbroker_only = (tvm_logbroker_client_id,)

        tvm_client = TVM2(
            client_id=tvm_client_id,
            secret=tvm_secret_key,
            blackbox_client=BlackboxClientId.Prod,
            allowed_clients=logbroker_only,
            destinations=logbroker_only,
        )

        return TVMCredentialsProvider(tvm_client, destination_client_id=tvm_logbroker_client_id)

    def decompress_message(self, message):
        # type: (kikimr.public.api.protos.persqueue_pb2.Message) -> bytes
        """
        Unpack single message read from LB according to message.meta.codec.

        RAW data is returned as is, GZIP and ZSTD data is decompressed.
        LZOP coding is not supported.

        :param message: Message from LB
        :type message: kikimr.public.api.protos.persqueue_pb2.Message
        :return: Message data (decompressed if required)
        :rtype: bytes
        :raises: RuntimeError - if LZOP or any other unsupported codec found
        """
        codec = message.meta.codec

        if codec == WriterCodec.RAW:
            self.logger.debug("Decompresser: RAW message data found, no action required")
            result = message.data

        elif codec == WriterCodec.GZIP:
            self.logger.debug("Decompresser: GZIP message data found, decompressing...")
            result = gzip.decompress(message.data)
            self.logger.debug("Decompresser: GZIP message successfully decompressed!")

        # NOTE(review): 3 is presumably the ZSTD codec id - replace it with the WriterCodec constant
        # if the SDK version in use defines one.
        elif codec == 3:
            self.logger.debug("Decompresser: ZSTD message data found, decompressing...")
            decompressor = zstd.ZstdDecompressor()
            stream_reader = decompressor.stream_reader(message.data)
            result = stream_reader.read()
            self.logger.debug("Decompresser: ZSTD message successfully decompressed!")

        elif codec == WriterCodec.LZOP:
            self.logger.error("Decompresser: LZOP message data found, raising exception")
            raise RuntimeError('Codec LZOP not supported')

        else:
            # Exceptions do not interpolate %-style arguments on their own (unlike logging calls),
            # so the message has to be formatted explicitly.
            raise RuntimeError('Codec %s not supported' % codec)

        return result

    def prepare_event_meta_field(self, event, topic_name):  # noqa: C901
        # type: (Union[str, dict], str) -> dict
        """
        Prepare __meta field for HEC sender, which contains timestamp (only in epoch!), source, sourcetype for HEC.

        About timestamp extraction:
        Timestamp is building from time and timezone info.
        Timezone field name could be provided in filters config as tz_field_name. If it is provided - then result
           timestamp will be calculated as join of timestamp and tz_field_name field value. If timestamp contains
           the plus sign, it will appear in result timestamp string (before format_string applying).

        Timestamp extraction is done by regex pattern matching and format_string applying if it is provided in filters
           config.

        .. note::
            If format_string value is set to "ISO" then RFC3339 time extraction will be called using udatetime lib.

        :param event: Single event for processing. Could be dict or string, based of topic parsing settings.
        :type event: str, dict
        :param topic_name: Processing event origin topic name
        :type topic_name: str
        :return: prepared __meta value as key-values
        :rtype: dict
        """

        # Prepare __meta field (for custom source, sourcetype)
        meta = dict()
        if topic_name in self.filters:
            index = self.filters[topic_name]["index"]
            timestamp_field = self.filters[topic_name].get("timestamp_field")

            meta["index"] = index

            field_name = timestamp_field.get("field_name")
            field_pattern = timestamp_field.get("pattern")
            format_string = timestamp_field.get("format_string")
            timezone_pattern = timestamp_field.get("timezone_pattern")
            tz_field_name = timestamp_field.get("timezone_field_name")

            if format_string is not None and format_string == "ISO":
                parse_isoformat_timestamp = True
            else:
                parse_isoformat_timestamp = False

            timestamp = None

            if field_name is not None and field_name in event and field_pattern is None and isinstance(event, dict):
                timestamp = event[field_name]

                if tz_field_name is not None and format_string is not None and tz_field_name in event:
                    timestamp = "".join([timestamp, event[tz_field_name]])

            elif isinstance(field_pattern, re.Pattern) and isinstance(event, str):
                match = re.search(field_pattern, event)

                if match is not None:
                    timestamp = match.group(1)

                if isinstance(timezone_pattern, re.Pattern) and format_string is not None:
                    tz_match = re.search(timezone_pattern, event)
                    if tz_match is not None:
                        timezone = tz_match.group(1)

                        if "+" in timezone:
                            if parse_isoformat_timestamp and len(timezone) == 5:
                                timezone = ":".join([timezone[:-2], timezone[-2:]])

                            timestamp = "".join([timestamp, timezone])
                        else:
                            timestamp = "+".join([timestamp, timezone])

            if timestamp is not None and format_string is not None:
                if parse_isoformat_timestamp:
                    try:
                        timestamp = str(udatetime.from_string(timestamp).timestamp())
                    except ValueError:
                        self.logger.debug("Failed to parse timestamp! Time string not in isoformat!")
                else:
                    try:
                        timestamp = str(datetime.datetime.strptime(timestamp, format_string).timestamp())
                    except ValueError:
                        self.logger.debug("Failed to parse timestamp! Format string provided wrongly!")

            meta["timestamp"] = timestamp

            meta["sourcetype"] = self._prepare_meta_field_by_name(field="sourcetype",
                                                                  topic_name=topic_name,
                                                                  event=event)

            meta["source"] = self._prepare_meta_field_by_name(field="source",
                                                              topic_name=topic_name,
                                                              event=event)

        return meta

    def _prepare_meta_field_by_name(self, field, topic_name, event):
        # type: (str, str, Union[str, dict]) -> Optional[str]
        """
        If required field defined as <field>_from_field in filters config: then try to extract it as defined from event
            and return it's value.
        Otherwise, if required field value defined as is in filters config - return defined value.

        :param field: Processing field name
        :type field: str
        :param topic_name: Processing event name
        :type topic_name: str
        :param event: Processing event
        :type event: str or dict
        :return: Extracted value by provided rules
        :rtype: str or None
        """

        result = None

        prior_result = self.filters[topic_name].get(field)
        from_field_key = "_".join([field, "from_field"])
        from_field_value = self.filters[topic_name].get(from_field_key)

        if isinstance(from_field_value, dict):
            field_name = from_field_value.get("field_name")
            field_pattern = from_field_value.get("pattern")
            if field_name is not None and field_name in event and field_pattern is None and isinstance(event, dict):
                result = event[field_name]
            elif isinstance(field_pattern, re.Pattern) and isinstance(event, str):
                pattern_match = re.search(field_pattern, event)
                if pattern_match is not None:
                    result = pattern_match.group(1)
            else:
                self.logger.debug("Field for sourcetype from filters not found in event! "
                                  "Topic: %s, Field name: %s", topic_name, from_field_value)
        elif prior_result is not None:
            result = prior_result

        return result

    @staticmethod
    def inclusion_or_exclusion_filter_by_pattern(text, patterns, exclusion):
        # type: (str, list, bool) -> bool
        """

        If exclusion is True - then return True, if the text doesn't matched the regex.
        Otherwise, inclusion check is made: True will be returned, if regex matched.

        :param text: text for check
        :type text: str
        :param patterns: list of patterns against which to check the text
        :type patterns: list
        :param exclusion: Set filtering mode: True if exclusion, False if inclusion
        :type exclusion: bool
        :returns: True, if message passed checks, otherwise False
        :rtype: bool
        """

        at_least_one_match = False

        for pattern in patterns:
            if re.search(pattern, text):
                at_least_one_match = True

            if at_least_one_match:
                break

        # if (exclusion and at_least_one_match) or (not exclusion and not at_least_one_match)
        return exclusion ^ at_least_one_match

    def apply_pattern_filters(self, message, topic_name):
        # type: (Union[str, dict], str) -> Optional[Union[str, dict]]
        """
        If re.search matched pattern - then trigger action: return False for exclusion and return True otherwise.

        :param message: Single event for processing. Could be dict or string, based of topic parsing settings.
        :type message: str, dict
        :param topic_name: Processing event origin topic name
        :type topic_name: str
        :return: Message if checks passed, or None if message was filtered
        :rtype: str, dict, None
        """

        pattern_check_keys = [("exclusions_by_pattern", True), ("inclusions_by_pattern", False)]

        for check in pattern_check_keys:
            key_name, exclusion = check

            if not (topic_name in self.filters and key_name in self.filters[topic_name] and self.filters[topic_name][key_name]):
                continue

            message_allowed = self.inclusion_or_exclusion_filter_by_pattern(message, self.filters[topic_name][key_name], exclusion)

            if not message_allowed:
                return None

        return message

    def apply_key_value_filters(self, message, topic_name):
        # type: (Union[str, dict], str) -> Optional[Union[str, dict]]
        """
        Apply exclusion or inclusion filters by key-value matching.
        Used only for JSON format logs.

        exclusions_by_key_value is checked first, then inclusions_by_key_value. Each of them is a list
        of mappings; the list counts as matched if any key-value pair of any mapping is present in the
        message with an equal value. A check is skipped if the topic is absent in filters or the list
        is absent or empty.

        :param message: Single event for processing
        :type message: dict
        :param topic_name: Processing event origin topic name
        :type topic_name: str
        :return: Message if checks passed, or None if message was filtered
        :rtype: dict, None
        """

        if topic_name not in self.filters:
            return message

        topic_filters = self.filters[topic_name]

        for key_name, exclusion in (("exclusions_by_key_value", True), ("inclusions_by_key_value", False)):
            conditions = topic_filters[key_name] if key_name in topic_filters else None
            if not conditions:
                continue

            matched = False
            for condition in conditions:
                pairs = condition.items()
                matched = matched or any(key in message and value == message[key] for key, value in pairs)

            # exclusion + match -> rejected, inclusion + no match -> rejected
            if not exclusion ^ matched:
                return None

        return message

    def parse_single_message_as_string(self, message, topic_name, skip_filter_checks=False):
        # type: (str, str, bool) -> Optional[dict]
        """
        Method to process single message without transforms: read string, push string to HEC.
        Only adds filtration based on filters and prepare message for hec_sender as dict without splitting by key-value.

        The result of this function is dict:

        >>> { "__message": message, "__meta": meta}

        :param message: Current processing message
        :type message: str
        :param topic_name: Processing event origin topic name
        :type topic_name:
        :param skip_filter_checks: Will skip inclusions and exclusions check. Required, then using this method
            as fallback for other, for example for process_json_message
        :type skip_filter_checks: bool
        :return: Prepared message as dict with meta and the origin message as string
        :rtype: dict, None
        """

        if not skip_filter_checks:
            message = self.apply_pattern_filters(message, topic_name)

        if message is None:
            return None

        meta = self.prepare_event_meta_field(message, topic_name)

        result = {"__message": message, "__meta": meta}

        return result

    def process_json_message(self, message, topic_name):  # noqa: C901
        # type: (str, str) -> Optional[dict]
        """
        Method to process single message with basic transform to dict: if events are valid JSON strings, then
        json.loads() them into dicts to further HEC processing.

        :param message: Current processing message
        :type message: str
        :param topic_name: Processing event origin topic name
        :type topic_name:
        :return: Transformed to dict event, if it's is a valid JSON
        :rtype: dict, None
        """

        message = self.apply_pattern_filters(message, topic_name)
        if message is None:
            return None

        result = dict()
        # Check, if message is already a valid string for JSON
        try:
            result = json.loads(message)

            if "drop_fields" in self.filters[topic_name] and isinstance(self.filters[topic_name]["drop_fields"], list):
                for key in self.filters[topic_name]["drop_fields"]:
                    result.pop(key, None)

            is_valid = True
        except json.JSONDecodeError:
            is_valid = False

        if is_valid:
            result = self.apply_key_value_filters(result, topic_name)

            if result is None:
                return None

            meta = self.prepare_event_meta_field(event=result, topic_name=topic_name)
            result["__meta"] = meta

        else:
            self.logger.info("Events must be in JSON, but they are not valid! "
                             "Topic=%s. Sending as string...", topic_name)

            result = self.parse_single_message_as_string(message=message,
                                                         topic_name=topic_name,
                                                         skip_filter_checks=True)

        return result

    def prepare_messages(self, messages, topic_name):
        # type: (list ,str) -> list
        """
        Top-level method for batch of messages processing.
        This method calls parse_single_message_as_string, parse_single_message_with_split, process_json_message
        on every message from batch based on topic settings in filters.

        :param messages: Batch of messages in strings
        :type messages: list
        :param topic_name: Current batch origin topic name
        :type topic_name: str
        :return: Processed batch of messages with required actions, which are ready to be send via HEC
        :rtype: list
        """

        prepared_messages = list()

        if "json_format" in self.filters[topic_name]:
            if self.filters[topic_name]["json_format"]:
                prepared_messages = [self.process_json_message(message=message,
                                                               topic_name=topic_name,
                                                               )
                                     for message in messages]

        else:
            prepared_messages = [self.parse_single_message_as_string(message, topic_name)
                                 for message in messages]

        if prepared_messages:
            prepared_messages = list(filter(None, prepared_messages))

        return prepared_messages

    def process_read_result(self, consumer_message):
        # type: (kikimr.public.api.protos.persqueue_pb2.Message) -> list
        """
        Top-level method: called by main method start_reading() on each read batch from LB.
        Decompresses, splits events from batch into list of strings, which after will be processed by prepare_messages()
        method.

        :param consumer_message: Batch of events read from LB
        :type consumer_message: kikimr.public.api.protos.persqueue_pb2.Message
        :return: Processed batch as a list of strings or dicts
        :rtype: list
        """
        processed_batch = list()
        for batch in consumer_message.data.message_batch:
            self.logger.info("Processing single batch...")
            for message in batch.message:
                topic_location, topic_name = batch.topic.split("--", 1)

                self.logger.debug("Processing message from batch, with type: {} "
                                  "from topic: {}".format(type(message), topic_name))
                self.logger.debug("Decompressing message (if GZIP-ed)...")
                unpacked_message = self.decompress_message(message).decode("utf-8", errors='replace')
                self.logger.debug("Successully decompressed message")

                # Prepare messages
                self.logger.debug("Splitting bunch of messages...")
                splitted_messages = unpacked_message.split("\n")
                self.logger.debug("Successfully splitted bunch of messages!")
                # Remove empty messages
                self.logger.debug("Filtering out empty messages after splitting...")
                splitted_messages = list(filter(bool, splitted_messages))
                self.logger.debug("Empty messages filtered successfully!")

                self.logger_extra["topic_name"] = topic_name
                self.logger_extra["topic_location"] = topic_location

                if self.dry_run:
                    print(splitted_messages)
                    continue

                prepared_messages = self.prepare_messages(splitted_messages, topic_name)

                # For skipped events
                if not prepared_messages:
                    continue

                # Check lag
                diff = time.time() - (message.meta.write_time_ms / 1000)
                if self.max_time_lag is not None:
                    if diff > self.max_time_lag + 300:
                        self.logger.debug("[ LAG ] Slow reading. Lag is bigger then defined value! Values: "
                                          "current_lag=%d, defined_max_lag=%d", diff, self.max_time_lag)
                    else:
                        self.logger.debug("[ LAG ] Lag is OK! Values:  "
                                          "current_lag=%d, defined_max_lag=%d", diff, self.max_time_lag)

                self.logger.debug("Appending processed messages...")

                processed_batch.extend(prepared_messages)

        return processed_batch

    def start_reading(self):
        """
        Main method of the Class. Reads events from LB and processes them to Splunk via HEC sender.

        Runs an endless loop; on every iteration the next event is requested from
        ``self.consumer`` and awaited for at most ``self.timeout``. Handling depends on the event type:

        * MSG_LOCK - the lock is confirmed with ``event.ready_to_read()``;
        * MSG_DATA - the batch is processed with ``process_read_result()`` and, unless dry-run is
          enabled, sent with ``self.hec_sender.send_data()``. After that the batch cookie is
          committed. The commit is also done in dry-run mode and for batches that became empty
          after filtering. If sending fails with tenacity RetryError, the consumer is stopped,
          re-created after a random 1-10 seconds delay and the batch is NOT committed;
        * MSG_ERROR - the error description is logged, reading continues.

        :return: None
        :rtype: None
        :raises RuntimeError: If the next event was not received within self.timeout
        """
        while True:
            # Read batch event from LB
            event = self.consumer.next_event()
            try:
                # next_event() returned a future: block until it is resolved or the timeout expires
                event = event.result(timeout=self.timeout)
            except futures.TimeoutError:
                # Traceback is passed to the log record through logger_extra and reset right after logging
                tb_trace = traceback.format_exc()
                self.logger_extra["traceback_trace"] = json.dumps(tb_trace)
                self.logger.error("Exception raised while trying to start reading! "
                                  "stop_future.done: %s", self.consumer.stop_future.done())
                self.logger_extra["traceback_trace"] = json.dumps({})

                if self.consumer.stop_future.done():
                    self.logger.error("Stop future result: %s", self.consumer.stop_future.result())

                # Reading cannot continue: leave the loop with an error
                raise RuntimeError

            if event.type == ConsumerMessageType.MSG_LOCK:
                self.logger.debug("Event type is MSG_LOCK. "
                                  "TOPIC=%s, PARTITION=%s", event.message.lock.topic, event.message.lock.partition)
                event.ready_to_read()
            elif event.type == ConsumerMessageType.MSG_DATA:
                self.logger.debug("Event type is MSG_DATA")
                prepared_batch_event = self.process_read_result(consumer_message=event.message)
                self.logger.debug("Prepared events batch length: %d", len(prepared_batch_event))

                if not self.dry_run:
                    if prepared_batch_event:
                        """
                        because hec_sender can throw exception in threads by timeout or retries count reached.
                        """
                        try:
                            self.hec_sender.send_data(prepared_batch_event)
                            self.logger.info("Batch was successfully sent to Splunk!")
                        except tenacity_RetryError:
                            # TODO: Fix this logging (sends multiple lines instead of nesting)
                            tb_trace = traceback.format_exc()
                            self.logger_extra["traceback_trace"] = json.dumps(tb_trace)
                            self.logger.error("[ HEC_SENDER ]Exception raised while trying to send batch "
                                              "via hec_sender - timeout or retries error!")
                            self.logger_extra["traceback_trace"] = json.dumps({})
                            # Shall stop reader and release read batch without commit, so it can be read again
                            self.consumer.stop()
                            self.logger.warning("Stopped current consumer instance, creating new one after delay...")
                            # Random delay of 1..10 seconds before the consumer is created again
                            time.sleep(random.choice(range(1, 11)))
                            self.prepare_consumer()
                            # Skip the commit below: the batch was not delivered
                            continue

                    else:
                        self.logger.debug("Prepared batch of messages is empty. Check filtration")

                else:
                    self.logger.debug("Dry run mode enabled, event will NOT be send to Splunk")

                # Commit only if data processed to splunk successfully
                # NOTE: this point is also reached in dry-run mode and for empty (fully filtered) batches
                last_read_cookie = event.message.data.cookie
                self.consumer.commit([last_read_cookie, ])

            elif event.type == ConsumerMessageType.MSG_ERROR:
                # Errors reported by Logbroker are only logged; the loop goes on to the next event
                self.logger.error("Event type is MSG_ERROR")
                self.logger.error('Logbroker error: %s', event.message.error.description)


def prepare_config(filename: str, read_topic: str) -> dict:
    """
    Prepare consumer configuration.

    The YAML config is expected to map every setting name to a dict with the keys
    ``env`` (name of an environment variable, may be null) and/or ``value`` (literal value).
    The special top-level key ``topics_file`` is a path to a file with one topic per line.

    Resolution rules for every setting:

    * if ``env`` is set and the variable exists, its value is used; values that look like
      integers (optionally negative) are converted to int;
    * if ``env`` is set but the variable does not exist, ``value`` is used when it is not null;
    * if ``env`` is not set, ``value`` is used.

    A negative ``max_time_lag`` is replaced with None. If the DEPLOY_NODE_DC environment variable
    is set and ``logbroker_host`` is configured, the host is replaced with
    ``<DEPLOY_NODE_DC>.logbroker.yandex.net`` and exported as LOGBROKER_HOST.

    :param filename: Path to config file in YAML format
    :type filename: str
    :param read_topic: Topic to read in dry-run mode. If set, ``topics_file`` is ignored and
        ``dry_run`` is enabled
    :type read_topic: str
    :return: Keyword arguments for main()
    :rtype: dict
    :raises ValueError: If no topic source is configured, or an environment variable required by
        the config is not set and there is no fallback value
    """
    prepared_config = {"dry_run": False}

    cfg = read_yaml_config(filename)

    topics_file = cfg.pop("topics_file", None)

    if read_topic:
        # NOTE(review): topics is a single string here but a list below - confirm the consumer accepts both
        prepared_config["topics"] = read_topic
        prepared_config["dry_run"] = True
    else:
        if not topics_file:
            raise ValueError("'topics_file' is not set in config and no topic was passed to read from")

        with open(topics_file) as tf:
            # Blank lines (e.g. a trailing empty line) are not topics
            prepared_config["topics"] = [line.strip() for line in tf.read().splitlines() if line.strip()]

    for key, value in cfg.items():
        if "env" in value and value["env"] is not None:
            var_value = os.getenv(value["env"])

            if var_value is None:
                # Variable is not set: use the literal value from config if there is one
                fallback = value.get("value")
                if fallback is None:
                    raise ValueError("Config key '%s': environment variable '%s' is not set "
                                     "and no 'value' is provided" % (key, value["env"]))
                prepared_config[key] = fallback
                continue

            # A leading minus is allowed so that negative numbers (e.g. max_time_lag=-1) become ints too
            if re.fullmatch(r"-?\d+", var_value):
                var_value = int(var_value)

            prepared_config[key] = var_value
            continue

        prepared_config[key] = value["value"]

    # Negative lag means "lag check is disabled"
    max_time_lag = prepared_config.get("max_time_lag")
    if max_time_lag is not None and max_time_lag < 0:
        prepared_config["max_time_lag"] = None

    # Plug for one stage in Deploy
    deploy_node_dc = os.getenv("DEPLOY_NODE_DC")
    if "logbroker_host" in prepared_config and deploy_node_dc:
        prepared_config["logbroker_host"] = deploy_node_dc + ".logbroker.yandex.net"
        os.environ["LOGBROKER_HOST"] = prepared_config["logbroker_host"]
        root_logger.info('LOGBROKER_HOST changed to %s', prepared_config["logbroker_host"])

    return prepared_config



def main(hec_token,
         topics,
         logbroker_client,
         logbroker_host,
         logbroker_host_port,
         connection_timeout,
         tvm_client_id,
         dry_run,
         tvm_secret_key,
         tvm_logbroker_client_id,
         lock_reading,
         read_infly_count,
         max_messages_count,
         read_only_local,
         max_time_lag
         ):
    """
    Entry point of the consumer: build a LogbrokerConsumer from the given settings,
    prepare it and start reading.

    All arguments are forwarded to the LogbrokerConsumer constructor under the same names,
    except ``lock_reading``, which is passed as ``lock``.
    """

    consumer_settings = {
        "hec_token": hec_token,
        "topics": topics,
        "logbroker_client": logbroker_client,
        "logbroker_host": logbroker_host,
        "logbroker_host_port": logbroker_host_port,
        "connection_timeout": connection_timeout,
        "dry_run": dry_run,
        "tvm_client_id": tvm_client_id,
        "tvm_secret_key": tvm_secret_key,
        "tvm_logbroker_client_id": tvm_logbroker_client_id,
        # The constructor names this option differently
        "lock": lock_reading,
        "read_infly_count": read_infly_count,
        "max_messages_count": max_messages_count,
        "read_only_local": read_only_local,
        "max_time_lag": max_time_lag,
    }
    lb_consumer = LogbrokerConsumer(**consumer_settings)

    root_logger.info("Preparing consumer...")
    root_logger.info("Topics: %s", lb_consumer.topics)
    lb_consumer.prepare_consumer()
    root_logger.info("Consumer successfully prepared!")

    root_logger.info("Running consumer reading...")
    lb_consumer.start_reading()


if __name__ == "__main__":
    # Parse arguments
    # Not used by the code below; presumably kept as a reference for looking up the Logbroker TVM client id
    link_to_logbroker_tvm_client_id = "https://abc.yandex-team.ru/services/Logbroker/resources/?tag=2&supplier=14&type=47&state=requested&state=approved&state=granted&view=consuming"  # noqa
    parser = argparse.ArgumentParser(description='SOC Logbroker consumer client based on persqueue lib')
    parser.add_argument("-c", "--config", required=True, action="store", type=str,
                        help="Path to config file in YAML",
                        dest="config")

    parser.add_argument("-t", "--topic", required=False, action="store", type=str,
                        help="Read messages from topic and print them out. "
                             "Use ctrl+c to stop reading",
                        dest="topic")

    args = parser.parse_args()

    # Dry-run mode is decided inside prepare_config(): it is enabled when --topic is passed
    config = prepare_config(args.config, args.topic)

    # Never write credentials (hec_token, tvm_secret_key, ...) to the log
    loggable_config = {
        key: "***" if any(marker in str(key).lower() for marker in ("token", "secret")) else value
        for key, value in config.items()
    }
    root_logger.info("Running consumer with config: %s", loggable_config)

    main(**config)
