"""
Clickhouse log processing class.

The `data` is a dict that contains TSKV line.
Each line must have an identifier which is matched against existing processors.
(see `ddl` for their definitions)
Each processor knows how to process each column, and which table and
database it belongs to.
It also has a queue (collections.deque), which is written to ClickHouse when
flush is called.
If the queues held only a small number of items when flush was called, a time
delay is applied in order to collect a larger bulk of data next time.
"""
import logging
import time

from logbroker_processors.processors import Processor
from logbroker_processors.utils import ClickhouseTSVPusher

from .stats import ClickhouseStats


class ClickhouseLogProcessor(Processor):
    """
    Route parsed log records to per-origin processors and push them to
    Clickhouse in bulk.

    This class is not supposed to be used directly, but rather subclassed
    with the concrete processors (see `ddl`) registered in the
    `processors` dict, keyed by the value of the record's marker field.
    """
    # Define this here to be able to override in
    # subclass later in unit tests.
    backend_class = ClickhouseTSVPusher
    stats_class = ClickhouseStats

    def __init__(self, **opts):
        """
        Set up the backend, the statistics collector and the delay settings.

        Recognised options (all optional; the full `opts` dict is also
        passed on to the parent class and to the backend):

        * `log_data_errors` -- log rejected records at debug level
          (default: False).
        * `delay_thresh` -- number of records per flush considered a
          reference bulk size (default: 1000). A value <= 0 disables
          the delay penalty.
        * `delay_max` -- upper bound of the delay penalty, in seconds
          (default: 100).
        * `delay_min` -- computed delays not exceeding this value are
          skipped entirely (default: 10).
        """
        super(ClickhouseLogProcessor, self).__init__(**opts)
        self.processors = {
            # e.g.: 'postgresql': ddl.Postgres(),
        }
        self._log = logging.getLogger('clickhouse')
        self._backend = self.backend_class(self._log, **opts)
        # Kept as an instance attribute to save a dict().get call
        # on each process() call.
        self.log_data_errors = opts.get('log_data_errors', False)
        self._delay = {
            # Consider this number of records a reference.
            'thresh': opts.get('delay_thresh', 1000),
            # Do not wait for more than this amount.
            'max': opts.get('delay_max', 100),
            # If delay factor is less than this amount, do not wait at all.
            'min': opts.get('delay_min', 10),
        }

        # Name of the record field whose value selects the processor.
        self.marker_field = 'origin'
        self.opts = opts
        self.stat = self.stats_class()

    def process(self, header, data):
        """
        Read a single record into the queue of the matching processor.

        `header` is a dict; its `topic` and `partition` entries are only
        used to build the statistics key. `data` is a dict with the
        record fields; the value of `self.marker_field` selects the
        processor.

        Returns True when the record was accepted by a processor, False
        when no processor matches or the processor rejected the record
        with a ValueError.
        """
        # Use topic later for statistics.
        topic = '%s:%s' % (
            header.get('topic', 'unknown'),
            header.get('partition', '-1')
        )
        # Determine how data is to be processed.
        origin = data.get(self.marker_field)
        processor = self.processors.get(origin)
        if origin is None or processor is None:
            if self.log_data_errors:
                self._log.debug(
                    'invalid marker field %s: ("%s", processor: %s)',
                    self.marker_field,
                    origin,
                    processor,
                )
            self.stat.incr(topic, 'unknown')
            return False
        # Read data into processor`s queue.
        # The processor is responsible for data type validation
        # and/or coercion.
        try:
            processor.read(data)
        except ValueError as err:
            if self.log_data_errors:
                self._log.debug('invalid data: %s', err)
            self.stat.incr(topic, 'dropped')
            return False
        else:
            self.stat.incr(topic, 'read')
            return True

    def flush(self, force=False):
        """
        Commit each processor`s queue into Clickhouse backend.

        After pushing, a delay penalty may be applied (this call blocks
        in `time.sleep`) when too few records were read since the last
        flush; then the statistics are written out and reset.

        `force` is accepted for interface compatibility; this
        implementation does not consult it.
        """
        # Only the processors themselves are needed, not their keys.
        # `values()` works on both Python 2 and Python 3 dicts.
        for processor in self.processors.values():
            self._backend.push(
                processor.table,
                processor.columns,
                processor.data,
            )
            processor.truncate()  # Flush buffer.

        # Clickhouse works better on bulk writes.
        # Check if amount of records processed was large enough.
        # If not, apply a delay penalty, so there will be a bigger
        # buffer next time the records are read.
        total_read = self.stat.get('total', 'read')
        # Calculate delay factor.
        # The greater the difference between actually read data (total_read)
        # and the threshold (delay_thresh), the more worker waits.
        #
        # E.g.: thresh = 1000, total_read = 900, delay is about 10 secs.
        #       with delay_min = 10, that means no delay.
        #
        #       thresh = 1000, total_read = 10, delay is about 99 secs.
        #       with delay_min = 10, that means full 99 secs of napping.
        delay = 0
        thresh = self._delay['thresh']
        if thresh > 0:
            # Force true division: with two ints Python 2 would floor the
            # ratio to 0 and the penalty would always be the full maximum.
            ratio = float(total_read) / thresh
            delay = self._delay['max'] * (1 - ratio)
        if delay > self._delay['min']:
            self._log.info(
                'delay penalty triggered, sleep for %d sec. (%d written < %d thresh)',
                delay,
                total_read,
                thresh,
            )
            time.sleep(delay)
        # Display and reset statistics.
        # Status-file
        self.stat.write_stats_log()
        # Regular log
        self._log.info('%s', self.stat.json())
        self.stat.reset()
