# coding: utf-8

from sandbox import sdk2
from sandbox.projects.common.mediasearch import scraper_task as sct
from sandbox.projects import resource_types
import itertools as itt
import sandbox.common.errors as ce
import sandbox.common.types.misc as ctm
import subprocess
import json
import logging
import requests
import imghdr
import signal
import hashlib

# Maximum accepted size of a downloaded image, in bytes (1024 * 1024);
# download_external_image() rejects anything larger
MAXIMUM_IMAGE_SIZE = 1024*1024
# Base fetch timeout in seconds; fetch() multiplies it by its timeout_multiplier argument
FETCH_TIMEOUT = 1


class Timeout(object):
    """
    Timeout context manager

    Aborts arbitrary activity in "with" block when time is out by raising
    Timeout.TimeoutException from a SIGALRM handler.
    Main purpose is to limit total work time of requests' methods.
    Based on https://stackoverflow.com/a/22156618/791143

    The SIGALRM handler is installed on entering the block and the previously
    installed handler is put back on leaving it, so the manager does not leak
    its handler into the rest of the process.
    Signal handlers can only be set from the main thread, so the manager has to
    be used there.
    """

    class TimeoutException(Exception):
        """
        Simple Exception to be raised on timeouts
        """

        def __init__(self):
            super(Timeout.TimeoutException, self).__init__("TIMEOUT")

    def _timeout(self, signum, frame):
        """
        Signal handler

        :param signum: Number of the delivered signal (unused)
        :param frame: Interrupted stack frame (unused)
        :raises: Timeout.TimeoutException always
        """
        raise Timeout.TimeoutException()

    def __init__(self, timeout=1):
        """
        :param timeout: Time limit in seconds, int or float. Zero arms nothing.
        """
        self.timeout = timeout
        self._previous_handler = None

    def __enter__(self):
        """
        Install the handler and arm the timer

        :return: The context manager itself
        """
        self._previous_handler = signal.signal(signal.SIGALRM, self._timeout)
        try:
            # ITIMER_REAL delivers SIGALRM like alarm() does but accepts fractional seconds as well
            signal.setitimer(signal.ITIMER_REAL, self.timeout)
        except Exception:
            # Do not leave our handler behind if the timer could not be armed
            self._restore_handler()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Disarm the timer and restore the previous handler

        :return: False, so an exception raised inside the block (TimeoutException included) is propagated
        """
        signal.setitimer(signal.ITIMER_REAL, 0)
        self._restore_handler()
        return False

    def _restore_handler(self):
        """
        Put back the SIGALRM handler that was active before __enter__
        """
        previous = self._previous_handler
        if previous is None:
            # signal.signal() reports None for a handler that was not installed from Python;
            # None can not be passed back, so fall back to the default action
            previous = signal.SIG_DFL
        signal.signal(signal.SIGALRM, previous)


class ImagesInspectCbirIntegrity(sct.ScraperTask):
    """
    Evaluate sort of CBIR quality

    Image URLs from the input plan are sent to Scraper as queries. For every
    returned serp counters are collected over the "similar" and "duplicates" (cbir)
    sections and, when a comparer binary is supplied, the first thumb of each
    section is scored against the query image.
    """

    class Requirements(sct.ScraperTask.Requirements):
        dns = ctm.DnsType.DNS64
        # We are restricted by platforms that can run cbir_comparer
        cores = 1

        # NOTE(review): empty Caches declaration; presumably opts the task out of shared caches -- confirm
        class Caches(sdk2.Requirements.Caches):
            pass

    class Parameters(sct.ScraperTask.Parameters):
        kill_timeout = 10800
        input_plan = sct.SimplyLastResource("Image URLs", resource_type=resource_types.IMAGES_CBIR_PLAN, required=True)
        plan_limit = sdk2.parameters.Integer("Number of top queries", required=True)
        target_host = sdk2.parameters.String("Target host URL", required=True, default="https://hamster.yandex.ru")
        scraper_profile = sdk2.parameters.String("Scraper profile", required=True, default="weak_consistency/image/desktop/hamster")
        cbir_comparer = sdk2.parameters.LastReleasedResource("CBIR Comparer", resource_type=resource_types.CBIR_COMPARER_EXECUTABLE, required=False)

    def fetch(self, url, timeout_multiplier=1):
        """
        Get specified url with timeout

        :param url: URL
        :param timeout_multiplier: Standard timeout value is multiplied by this value
        :raises: Timeout.TimeoutException and requests exceptions including bad status code exceptions
        :return: Content
        """
        timeout = FETCH_TIMEOUT * timeout_multiplier
        # The same limit is given to requests itself and to the signal based guard around the whole call
        with Timeout(timeout):
            response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content

    def download_external_image(self, url, counter_prefix, timeout_multiplier=1):
        """
        Fetch external image and check it's an appropriate one

        Count errors as well: "<prefix>_fetch_problem", "<prefix>_image_too_big"
        and "<prefix>_bad_format" counters are incremented on the respective failure.

        :param url: URL
        :param counter_prefix: Prefix for error counters
        :param timeout_multiplier: Timeout tuning argument
        :return: Image content or None if the image is unavailable or inappropriate
        """
        try:
            image = self.fetch(url, timeout_multiplier)
        except Exception as e:
            # Any fetch failure (network, HTTP status, timeout) only disqualifies this URL
            logging.debug("Error fetching %s -- %s", url.encode("utf-8"), str(type(e)))
            logging.debug(e)
            self.counter_inc(counter_prefix + "_fetch_problem")
            return None

        if len(image) > MAXIMUM_IMAGE_SIZE:
            self.counter_inc(counter_prefix + "_image_too_big")
            return None

        if imghdr.what(None, image) not in ("jpeg", "gif", "png"):
            self.counter_inc(counter_prefix + "_bad_format")
            return None

        return image

    def queries_iterator(self, feedback):
        """
        Iterator of queries for Scraper batch

        (Scraper task preparation)

        Plan lines are read one URL per line; duplicates and URLs that fail
        download_external_image() are skipped. Iteration stops once the
        "urls_good" counter reaches the plan limit.

        :param feedback: Handle to send progress notifications
        :return: Image urls iterator (to go to query-text)
        """
        queries_count = self.Parameters.plan_limit
        feedback.set_total(queries_count)
        plan_data = sdk2.ResourceData(self.Parameters.input_plan)
        unique_urls = set()

        with plan_data.path.open(encoding="utf-8") as f:
            for line in f:
                self.counter_inc("urls_checked")
                url = line.strip()

                # Scraper requires unique urls so we drop duplicates to get to required queries_count
                if url in unique_urls:
                    logging.debug("URL %s is a duplicate. Skipping.", url.encode("utf-8"))
                    continue
                unique_urls.add(url)

                if not self.download_external_image(url, "prepare"):
                    continue

                feedback.progress()
                self.counter_inc("urls_good")
                yield url

                if self.counters["urls_good"] >= queries_count:
                    break

    @property
    def batch_properties(self):
        """
        Scraper batch properties

        (Scraper task preparation)

        :raises: ce.TaskError if the target host does not start with an http(s) scheme
        :return: Dictionary of batch properties
        """
        # Require a real scheme prefix; merely containing "http" somewhere is not a URL
        if not self.Parameters.target_host.startswith(("http://", "https://")):
            raise ce.TaskError("Target host URL should be URL like https://hamster.yandex.ru")
        return {
            "description": {
                "comments": "https://sandbox.yandex-team.ru/task/{}/view".format(self.id),
                "creator": "Sandbox task",
                "name": "CBIR acceptance serp import",
                "quota-project": "imgbase-acceptance"
            },
            "host": self.Parameters.target_host,
            "parse-serps": False,
            "per-set-parameters": {
                "additional-cgi": {
                    "json_dump": ["reqdata.reqid"]
                }
            },
            "profile": self.Parameters.scraper_profile,
            "search-engine": "yandex-search-by-image-json",
            "store-results-period": 2592000000
        }

    @staticmethod
    def _is_blank(value):
        """
        Tell whether a serp field value has an empty textual form

        Text values are measured as is: converting unicode with str() would raise
        UnicodeEncodeError on non-ASCII data under Python 2. Other values are
        measured by their str() form.

        :param value: Field value taken from serp JSON
        :return: True if the textual form is empty
        """
        if isinstance(value, (type(u""), bytes)):
            return len(value) == 0
        return len(str(value)) == 0

    def process_section(self, data, section, name):
        """
        Process section (cbir/similar) of serp

        Various counters are calculated and section's first thumb image is downloaded

        :param data: Parsed serp JSON
        :param section: Key under "searchdata" that holds the section; falsy to read "images" directly from "searchdata"
        :param name: Section name used as a component of counter names
        :raises: KeyError, TypeError or ValueError if an image or duplicate entry lacks an expected field
                 or carries a non-numeric size
        :return: First thumb image data, None if the section is missing, has no thumb or the thumb can't be fetched
        """
        self.counter_inc("total_" + name)

        try:
            searchdata = data["searchdata"]
            images = searchdata[section]["images"] if section else searchdata["images"]
        except (KeyError, IndexError, TypeError):
            # Section is absent from the serp (or the serp has unexpected layout)
            self.counter_inc("total_" + name + "_empty_section")
            return None

        number_of_images = len(images)
        self.counter_inc(name + "_images", number_of_images)
        self.counter_inc(name + "_without_images", int(number_of_images == 0))
        self.counter_inc(name + "_with_images", int(number_of_images > 0))
        self.counter_inc(
            name + "_images_without_cbir_dups",
            sum(1 for image in images if len(image["cbir_dups"]) == 0)
        )
        self.counter_inc(name + "_total_cbir_dups", sum(len(image["cbir_dups"]) for image in images))

        # All duplicates of all images of the section; the counters below are per duplicate
        dups = [dup for image in images for dup in image["cbir_dups"]]

        for field in ("title", "text"):
            self.counter_inc(
                name + "_total_empty_" + field,
                sum(1 for dup in dups if len(dup[field]) == 0)
            )
        for field in ("img_w", "img_h", "thmb_w", "thmb_h", "img_href", "thmb_href", "html_href"):
            self.counter_inc(
                name + "_total_empty_" + field,
                sum(1 for dup in dups if self._is_blank(dup[field]))
            )
        for field in ("img_w", "img_h", "thmb_w", "thmb_h"):
            self.counter_inc(
                name + "_total_zero_" + field,
                sum(1 for dup in dups if int(dup[field]) == 0)
            )

        if not images or not images[0]["cbir_dups"]:
            return None
        thumb_href = images[0]["cbir_dups"][0]["thmb_href"]
        if not thumb_href:
            return None

        # Thumb hrefs are expected without a scheme; do not break an href that already has one
        url = thumb_href if "://" in thumb_href else "https:" + thumb_href
        try:
            image = self.fetch(url, 5)
        except Exception:
            # Unavailable thumb is not an error of the task, it just does not get counted as ok
            return None
        self.counter_inc(name + "_image_ok")
        return image

    def start_comparer(self):
        """
        Run external comparer binary

        Sets self.comparer to the started process, or to None when no comparer
        resource is given in task parameters.
        """
        if self.Parameters.cbir_comparer:
            cbir_comparer = sdk2.ResourceData(self.Parameters.cbir_comparer)
            self.comparer = subprocess.Popen(
                [str(cbir_comparer.path.resolve())],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE
            )
        else:
            self.comparer = None

    def _stop_comparer(self):
        """
        Shut the comparer process down and forget it

        Pipes are closed, the process is killed if it is still running and then
        reaped. self.comparer is None afterwards, which disables further evaluation.
        Problems are logged, never raised.
        """
        comparer, self.comparer = self.comparer, None
        if not comparer:
            return

        for pipe in (comparer.stdin, comparer.stdout):
            try:
                pipe.close()
            except Exception as e:
                logging.info("Comparer error (stop): %s", str(e))

        try:
            if comparer.poll() is None:
                comparer.kill()
            comparer.wait()
        except Exception as e:
            logging.info("Comparer error (stop): %s", str(e))

    def dump_image(self, image):
        """
        Comparer protocol: image data representation

        :param image: Image data
        :return: Decimal length of the data, a newline, then the data itself
        """
        return str(len(image)) + "\n" + image

    def push_key_image(self, image):
        """
        Comparer protocol: set key image

        :param image: Image data
        :return: True if the image was written to the comparer, False otherwise
                 (no comparer is running or the write failed; the failure is logged)
        """
        if not self.comparer:
            return False

        try:
            logging.info("Key image %d bytes", len(image))
            logging.info("Key image MD5: %s", hashlib.md5(image).hexdigest())
            self.comparer.stdin.write(self.dump_image(image))
            # Make sure the data leaves our side of the pipe whatever its buffering is
            self.comparer.stdin.flush()
            self.counter_inc("key_images_pushed")
            logging.info("Key image pushed")
        except Exception as e:
            logging.info("Comparer error (key): %s", str(e))
            return False
        return True

    def process_image(self, image, name):
        """
        Comparer protocol: compare image and key image

        The image is written to the comparer and one answer line is read back:
        either "Frame error" or a number that is added to "<name>_measure" counter.
        Any failure is logged and counted in "<name>_comparer_errors".

        :param image: Image data. Nothing is done for an empty image.
        :param name: Image name. Used for logging and counters.
        """
        if not (image and self.comparer):
            return

        try:
            logging.info("Image %d bytes (%s)", len(image), name)
            logging.info("Image MD5: %s", hashlib.md5(image).hexdigest())
            self.counter_inc(name + "_comparer_inputs")
            self.comparer.stdin.write(self.dump_image(image))
            # The answer is awaited right below, so the request must not stay in a buffer
            self.comparer.stdin.flush()
            logging.info("Image pushed (%s)", name)
            answer = self.comparer.stdout.readline().strip()
            if answer == "Frame error":
                self.counter_inc(name + "_frame_errors")
                raise Exception("Frame error")
            degree = float(answer)
            self.counter_inc(name + "_measure", degree)
            logging.info("Image evaluated %f (%s)", degree, name)
            self.counter_inc(name + "_evaluated")
        except Exception as e:
            logging.info("Comparer error (process): %s", str(e))
            self.counter_inc(name + "_comparer_errors")

    def push_frame_end(self):
        """
        Comparer protocol: finish processing image array

        Errors are logged and not propagated.
        """
        if not self.comparer:
            return

        try:
            self.comparer.stdin.write("0\n")
            self.comparer.stdin.flush()
        except Exception as e:
            logging.info("Comparer error (end): %s", str(e))

    def evaluate_images(self, query_image_url, checked_images):
        """
        Compare query image with response images by external comparer binary

        Logs are logged, counters are counted.
        If the key image can't be pushed to the comparer, the comparer is shut down
        and further processing by comparer is skipped.

        :param query_image_url: URL of the query (key) image
        :param checked_images: Iterable of (image data, name) pairs to compare with the key image
        """
        if not self.comparer:
            return

        query_image = self.download_external_image(query_image_url, "process", 5)
        if not query_image:
            return

        if not self.push_key_image(query_image):
            # Without a key image every following answer would be about a stale key
            logging.debug("Key image push error. Aborting image evaluation in cbir_comparer.")
            self._stop_comparer()
            return

        try:
            for image, tag in checked_images:
                self.process_image(image, tag)
        finally:
            # The frame is closed even when iteration is interrupted
            self.push_frame_end()

    def process_results(self, results_iterator, feedback):
        """
        Process scraper batch results

        :param results_iterator: Iterator over fetched serp records
        :param feedback: Handle to send progress notifications
        :raises: Whatever json.loads raises on a serp that can't be parsed
        """
        self.start_comparer()

        try:
            for fetch_data in results_iterator:
                raw_content = fetch_data["serp-page"]["raw-content"]
                # https://st.yandex-team.ru/SCRAPER-1069
                if not raw_content:
                    continue

                self.counter_inc("total_serps")
                feedback.progress()

                try:
                    serp_data = json.loads(raw_content)
                except Exception:
                    logging.debug("Problems parsing\n%s", raw_content)
                    raise

                if "reqdata.reqid" in serp_data:
                    logging.info("reqid: %s", serp_data["reqdata.reqid"])

                query_image_url = fetch_data["serp-request-explained"]["per-query-parameters"]["query-text"]
                similar_image = self.process_section(serp_data, None, "similar")
                duplicate_image = self.process_section(serp_data, "cbir", "duplicates")

                self.evaluate_images(query_image_url, ((similar_image, "similar"), (duplicate_image, "duplicates")))
        finally:
            # Do not leave the comparer process and its pipes behind
            self._stop_comparer()

    @property
    def stats(self):
        """
        Table data for footer

        The method is called on server so use self.counters_ro to access counters data

        :return: List of (title, value) pairs
        """
        results = []
        if self.counters_ro.get("urls_good"):
            results.append(("Good URLs ratio", self.counter_percentage("urls_good", "urls_checked")))
        results += [
            ("Total serps", self.counters_ro.get("total_serps", "&mdash;")),
            ("Similar empty replies", self.counter_percentage("similar_without_images", "total_similar")),
            ("Similar empty sections", self.counter_percentage("total_similar_empty_section", "total_similar")),
            ("Similar good thumbs", self.counter_percentage("similar_image_ok", "similar_with_images")),
            ("Similar images evaluated", self.counter_percentage("similar_evaluated", "similar_image_ok")),
            ("Similar average similarity", self.counter_ratio("similar_measure", "similar_evaluated")),
            ("Similar average results", self.counter_ratio("similar_images", "similar_with_images")),
            ("Similar average dups", self.counter_ratio("similar_total_cbir_dups", "similar_images")),
            ("Similar empty texts", self.counter_ratio("similar_total_empty_text", "similar_total_cbir_dups")),
            ("Similar empty titles", self.counter_ratio("similar_total_empty_title", "similar_total_cbir_dups")),
            ("Similar comparer errors", self.counter_percentage("similar_comparer_errors", "similar_comparer_inputs")),
            ("CBIR empty replies", self.counter_percentage("duplicates_without_images", "total_duplicates")),
            ("CBIR empty sections", self.counter_percentage("total_duplicates_empty_section", "total_duplicates")),
            ("CBIR good thumbs", self.counter_percentage("duplicates_image_ok", "duplicates_with_images")),
            ("CBIR images evaluated", self.counter_percentage("duplicates_evaluated", "duplicates_image_ok")),
            ("CBIR average similarity", self.counter_ratio("duplicates_measure", "duplicates_evaluated")),
            ("CBIR average results", self.counter_ratio("duplicates_images", "duplicates_with_images")),
            ("CBIR average dups", self.counter_ratio("duplicates_total_cbir_dups", "duplicates_images")),
            ("CBIR empty texts", self.counter_ratio("duplicates_total_empty_text", "duplicates_total_cbir_dups")),
            ("CBIR empty titles", self.counter_ratio("duplicates_total_empty_title", "duplicates_total_cbir_dups")),
            ("CBIR comparer errors", self.counter_percentage("duplicates_comparer_errors", "duplicates_comparer_inputs")),
        ]
        return results
