import gzip
import logging
import os.path
import re
import tarfile

from sandbox.sandboxsdk import errors
from sandbox.sandboxsdk import parameters
from sandbox.sandboxsdk import task

from sandbox.projects import resource_types
from sandbox.projects.common.pumpkin import utils as pumpkin_utils


# Matches an <img> tag whose class list contains "serp-item__thumb" and whose
# src is an inline base64 data URI of type jpeg or webp. Any number of other
# name="value" attributes may sit between the class and src attributes.
_THUMB_RE = re.compile(
    r'<img\s+class="(\w+\s+)*serp-item__thumb(\s+\w+)*"'
    r'(?:\s+\w+="[^"]*")*'
    r'\s+src="data:image/(jpeg|webp);base64,[^"]+"'
)


class UrlsResourceParameter(parameters.LastReleasedResource):
    """Task input: resource of type SERP_COLLECTOR_URLS with the urls to retrieve."""

    # Resource type accepted by this parameter.
    resource_type = resource_types.SERP_COLLECTOR_URLS
    # Key under which the selected value is read from the task context.
    name = 'urls_resource_id'
    description = 'Set of urls to retrieve'


class SerpsResourceParameter(parameters.LastReleasedResource):
    """Task input: resource of type SERP_COLLECTOR_LOOT holding the serp archive."""

    # Resource type accepted by this parameter.
    resource_type = resource_types.SERP_COLLECTOR_LOOT
    # Key under which the selected value is read from the task context.
    name = 'serp_resource_id'
    description = 'Serp archive'


class TopQueriesCountParameter(parameters.SandboxIntegerParameter):
    """Task input: how many top queries should have their pages inspected."""

    # Key under which the value is read from the task context.
    name = 'top_queries_count'
    required = True
    default_value = 20
    description = 'Number of top queries'


class MinUploadDeltaParameter(parameters.SandboxIntegerParameter):
    """Task input: percentage threshold for requested-vs-uploaded url delta."""

    # Key under which the value is read from the task context.
    name = 'min_upload_delta'
    default_value = 3
    description = 'Minimum delta (in percent) between requested and uploaded urls'


class MinThumbCountParameter(parameters.SandboxIntegerParameter):
    """Task input: lowest acceptable number of thumbnails on a checked page."""

    # Key under which the value is read from the task context.
    name = 'min_thumb_count'
    default_value = 10
    description = 'Minimum possible thumb count'


class ImagesTestPumpkinSerp(task.SandboxTask):
    """Validate a collected serp archive against the list of requested urls.

    The task fails when a page expected for one of the top queries is absent
    from the archive, when such a page carries fewer inline thumbnails than
    ``min_thumb_count``, or when the share of requested urls missing from the
    archive index exceeds ``min_upload_delta`` percent.
    """
    type = "IMAGES_TEST_PUMPKIN_SERP"

    input_parameters = (
        UrlsResourceParameter,
        SerpsResourceParameter,
        TopQueriesCountParameter,
        MinThumbCountParameter,
        MinUploadDeltaParameter
    )

    def on_execute(self):
        """Sync both input resources and run every check.

        Raises:
            errors.SandboxTaskFailureError: if pages for top queries are
                missing, the urls resource is empty, or too many urls were
                not uploaded. Thumbnail failures are raised from
                ``_check_serps``.
        """
        urls_path = self.sync_resource(self.ctx[UrlsResourceParameter.name])
        top_files, requested_url_count = self._get_top_files(urls_path)

        serps_path = self.sync_resource(self.ctx[SerpsResourceParameter.name])
        proceeded_files, uploaded_url_count = self._check_serps(serps_path, top_files)

        # Iterating a dict yields its keys, so no iterkeys() is needed.
        top_delta = set(top_files) - proceeded_files
        if top_delta:
            raise errors.SandboxTaskFailureError("Some of files for top queries are missing: {}".format(
                top_delta))

        # Without this guard an empty url list surfaces as a bare
        # ZeroDivisionError in the percentage computation below.
        if not requested_url_count:
            raise errors.SandboxTaskFailureError("Urls resource contains no urls")

        missing_url_count = requested_url_count - uploaded_url_count
        upload_delta = float(missing_url_count) / requested_url_count * 100
        logging.info("requested_url_count={}, uploaded_url_count={}, delta={}".format(
            requested_url_count, uploaded_url_count, upload_delta))
        # The percentage must be compared with the upload threshold; the
        # thumbnail threshold is an unrelated absolute count.
        if upload_delta > self.ctx[MinUploadDeltaParameter.name]:
            raise errors.SandboxTaskFailureError("Some of uploads were failed: {}".format(
                missing_url_count))

    def _get_top_files(self, urls_path):
        """Collect expected archive paths for top queries and count all urls.

        Each member of the tar at ``urls_path`` that can be opened as a file
        is read line by line, one url per line. The first
        ``top_queries_count // volume_count`` lines of every member are
        translated with ``_get_index_path``.

        Args:
            urls_path: local path of the urls tar archive.

        Returns:
            ``(top_files, total_count)`` where ``top_files`` maps the path
            expected inside the serp archive to the index url, and
            ``total_count`` is the number of lines over all members.
        """
        top_queries_count = self.ctx[TopQueriesCountParameter.name]
        with tarfile.open(urls_path) as tar:
            volume_count = sum(1 for name in tar.getnames() if name != ".")

        # The top quota is split evenly between volumes. Floor division keeps
        # the integer semantics; an archive without volumes gets quota 0
        # instead of dividing by zero.
        per_volume_top = top_queries_count // volume_count if volume_count else 0

        total_count = 0
        top_files = {}
        with tarfile.open(urls_path) as tar:
            for item in tar:
                item_content = tar.extractfile(item)
                if item_content is None:
                    continue
                # Reset per member: an empty member must add 0 rather than
                # reuse (or lack) the counter left by a previous member.
                line_count = 0
                for line_count, url in enumerate(item_content, 1):
                    if line_count <= per_volume_top:
                        path_to_check, index_url = _get_index_path(url)
                        top_files[path_to_check] = index_url
                total_count += line_count

        return top_files, total_count

    # TODO: verify main and notfound pages
    def _check_serps(self, serps_path, top_files):
        """Check thumbnails on top-query pages and count index entries.

        Args:
            serps_path: local path of the serp tar archive.
            top_files: mapping of archive member name to index url, as
                returned by ``_get_top_files``.

        Returns:
            ``(proceeded_files, uploaded_url_count)``: the set of member
            names from ``top_files`` found in the archive, and the number of
            lines in the member named ``index`` (0 if there is none).

        Raises:
            errors.SandboxTaskFailureError: if a checked page has fewer than
                ``min_thumb_count`` thumbnail matches.
        """
        min_thumb_count = self.ctx[MinThumbCountParameter.name]
        proceeded_files = set()
        uploaded_url_count = 0
        with tarfile.open(serps_path) as tar:
            for item in tar:
                if item.name in top_files:
                    page_file = tar.extractfile(item)
                    # Pages are stored gzip-compressed inside the tar.
                    with gzip.GzipFile(filename=item.name, mode="rb", fileobj=page_file) as page_gz:
                        thumb_count = sum(1 for _ in _THUMB_RE.finditer(page_gz.read()))
                        logging.info("{}: {} thumbs".format(item.name, thumb_count))
                        if thumb_count < min_thumb_count:
                            raise errors.SandboxTaskFailureError("Too few of thumbs ({} < {}) on page {} ({})".format(
                                thumb_count, min_thumb_count, top_files[item.name], item.name))
                    proceeded_files.add(item.name)
                elif item.name == "index":
                    index_file = tar.extractfile(item)
                    # One line of the index per uploaded url.
                    uploaded_url_count = sum(1 for _ in index_file)

        return proceeded_files, uploaded_url_count


def _get_index_path(url):
    """Return ``(archive_path, index_url)`` for ``url``.

    ``archive_path`` is ``./<h0>/<h1>/<h2>/<hashname>.html.gz`` where
    ``hashname`` comes from ``pumpkin_utils.make_urls`` and ``h0..h2`` are its
    first three characters.
    """
    index_url, _fetch_url, hashname = pumpkin_utils.make_urls(url)
    shard_dirs = (hashname[0], hashname[1], hashname[2])
    page_name = hashname + '.html.gz'
    archive_path = os.path.join(".", *(shard_dirs + (page_name,)))
    return archive_path, index_url


# Module-level alias of the task class defined in this file.
# NOTE(review): presumably the name the Sandbox task loader looks up - confirm.
__Task__ = ImagesTestPumpkinSerp
