from dataclasses import dataclass
from typing import AsyncIterable, List, Optional

import chardet

from mail.ipa.ipa.conf import settings
from mail.ipa.ipa.core.csvops.sniffer import sniff
from mail.ipa.ipa.core.entities.csv import CSVParams
from mail.ipa.ipa.core.exceptions import CSVEmptyError, CSVEncodingDetectError, CSVIsTooBig, CSVLineIsTooBig


@dataclass
class Line:
    """A single raw line of input together with its position in the stream.

    As produced by ``CSVLineReader`` in this module, ``content`` holds the
    undecoded bytes of the line (including its line terminator when one was
    present) and ``lineno`` is the 1-based number of the line in the input.
    """

    # Raw, undecoded bytes of the line.
    content: bytes
    # Number of the line in the original input.
    lineno: int


class CSVLineReader:
    """Split an asynchronous stream of byte chunks into non-blank lines.

    The reader also detects the encoding and the CSV dialect of the input
    from a sample taken from the beginning of the stream. Lines consumed for
    the sample are remembered and replayed by :meth:`readlines`, so no data
    is lost by running the detection first.
    """

    # Lines are collected for the detection sample until their total size
    # reaches this many bytes (or the stream ends).
    SAMPLE_SIZE = 64 * 1024
    # Upper bound (in bytes) for the internal buffer, i.e. the data carried
    # over from previous chunks plus the most recent chunk.
    LINE_LENGTH_LIMIT = 128 * 1024

    def __init__(self, chunks: AsyncIterable[bytes]):
        """Create a reader over ``chunks``, an async iterable of raw bytes."""
        self._chunks: AsyncIterable[bytes] = chunks
        # A single shared generator: detection consumes its head, and
        # ``readlines`` later continues from where detection stopped.
        self._lines: AsyncIterable[Line] = self._readlines()
        self._sample: List[Line] = []
        self._sample_raw: bytes = b''
        self._csv_params: Optional[CSVParams] = None

    async def _readlines(self) -> AsyncIterable[Line]:
        """Yield every non-blank line of the stream as a :class:`Line`.

        Lines keep their terminators. Whitespace-only lines are skipped but
        still advance the line counter, so ``lineno`` refers to the position
        in the original input.

        Raises:
            CSVLineIsTooBig: the internal buffer reached ``LINE_LENGTH_LIMIT``.
            CSVIsTooBig: more than ``settings.CSV_MAX_SIZE`` bytes were read.
            CSVEmptyError: the stream ended without a single non-blank line.
        """
        buf: bytes = b''
        size: int = 0
        lineno: int = 0
        meaningful_lines: int = 0

        async for chunk in self._chunks:
            size += len(chunk)
            buf += chunk
            if not buf:
                continue

            # NOTE(review): this check is applied to the whole buffer before
            # it is split into lines, so a single chunk of LINE_LENGTH_LIMIT
            # bytes or more triggers it regardless of the actual line lengths.
            # Presumably the producer is expected to yield chunks of at most
            # LINE_LENGTH_LIMIT // 2 bytes (hence the reported limit) -
            # confirm against the callers.
            if len(buf) >= self.LINE_LENGTH_LIMIT:
                raise CSVLineIsTooBig(self.LINE_LENGTH_LIMIT // 2, lineno + 1)
            if size > settings.CSV_MAX_SIZE:
                raise CSVIsTooBig(settings.CSV_MAX_SIZE)

            lines = buf.splitlines(keepends=True)
            # The last piece is always held back: it may be continued by the
            # next chunk, so it is emitted only after more data arrives or
            # the stream ends.
            buf = lines.pop()
            for line in lines:
                lineno += 1
                if not line.strip():
                    continue
                meaningful_lines += 1
                yield Line(line, lineno)

        # Flush whatever was held back once the stream is exhausted.
        if buf.strip():
            lineno += 1
            meaningful_lines += 1
            yield Line(buf, lineno)

        if meaningful_lines == 0:
            raise CSVEmptyError

    async def detect_csv_params(self) -> CSVParams:
        """Detect and cache the encoding and CSV dialect of the input.

        Reads lines from the stream until roughly ``SAMPLE_SIZE`` bytes are
        collected (or the stream ends), detects the encoding with chardet,
        decodes the sample and passes it to ``sniff`` to obtain the dialect.
        The result is cached, so subsequent calls return it immediately.

        Raises:
            CSVEncodingDetectError: chardet reported no encoding, reported a
                confidence below ``settings.CSV_ENCODING_CONFIDENCE_TRESHOLD``,
                or the sample could not be decoded with the detected encoding.
        """
        if self._csv_params is not None:
            return self._csv_params

        sample_size: int = 0
        async for line in self._lines:
            self._sample.append(line)
            sample_size += len(line.content)
            if sample_size >= self.SAMPLE_SIZE:
                break

        sample = self._sample_raw = b''.join(line.content for line in self._sample)
        # chardet can also be used in a more advanced mode that detects the
        # encoding incrementally:
        # https://chardet.readthedocs.io/en/latest/usage.html#example-detecting-encoding-incrementally
        detect = chardet.detect(sample)
        encoding = detect['encoding']
        confidence = detect['confidence']
        if not encoding or not confidence or confidence < settings.CSV_ENCODING_CONFIDENCE_TRESHOLD:
            raise CSVEncodingDetectError(encoding or 'unknown', confidence=confidence or 0)

        try:
            decoded_sample = sample.decode(encoding)
        except (UnicodeDecodeError, LookupError) as exc:
            # LookupError: chardet may name an encoding for which Python has
            # no codec; report it as a detection failure as well instead of
            # letting an unrelated exception type escape.
            raise CSVEncodingDetectError(encoding) from exc

        dialect = sniff(decoded_sample)
        self._csv_params = CSVParams(
            encoding=encoding,
            dialect=dialect,
        )
        return self._csv_params

    async def readlines(self) -> AsyncIterable[Line]:
        """Yield all non-blank lines of the input, running detection first.

        The lines consumed for the detection sample are replayed before the
        rest of the stream, so the caller sees every line in order.
        """
        await self.detect_csv_params()

        for line in self._sample:
            yield line

        async for line in self._lines:
            yield line
