# coding: utf-8
"""
CBIR fitness checker.
Examines output of CBIR answers for input image URLs.
Uses
    extsearch/images/tools/cbircomparer/cbir_comparer
to measure similarity of images.
"""

from sandbox.projects.common.mediasearch.online import control, command
import itertools as itt
import subprocess
import datetime
import imghdr
import re
import os
import hashlib

# Matches a leading "http://" or "https://" scheme, case-insensitively.
SCHEME_PREFIX = re.compile(r'^https?://', re.I)
# Timeout handed to fetch_json when the SERP is requested
# (presumably seconds -- TODO confirm against fetch_json).
DATA_FETCH_TIMEOUT = 60.0


class ImagesExamineCbirIntegrity(control.BaseControl):
    email_subject_template = "[cbir-integrity] {descr} results"
    default_query_url_template = "https://priemka-img.hamster.yandex.ru/images/search?" \
        "rpt=imageview&url=%s&json_dump=searchdata&json_dump=reqdata.reqid&json_dump=eventlog&nocache=da&no-tests=1"

    @staticmethod
    def iterate_queries(input_path, queries_count):
        with open(input_path) as f:
            for line in itt.islice(f, queries_count):
                yield line.strip()

    @classmethod
    def prepare(cls, extra_resource_path, context):
        if extra_resource_path:
            cls.comparer = subprocess.Popen(
                    [extra_resource_path],
                    stdout=subprocess.PIPE,
                    stdin=subprocess.PIPE
                )
        else:
            cls.comparer = None

    def process_section(self, data, section, name):
        self.counter.inc("total_" + name)

        try:
            if section:
                data = data["searchdata"][section]["images"]
            else:
                data = data["searchdata"]["images"]
        except:
            self.counter.inc("total_" + name + "_empty_section")
            return

        number_of_images = len(data)
        self.counter[name + "_images"] = number_of_images
        self.counter[name + "_without_images"] = int(number_of_images == 0)
        self.counter[name + "_with_images"] = int(number_of_images > 0)
        self.counter[name + "_images_without_cbir_dups"] = sum(itt.imap(lambda x: len(x["cbir_dups"]) == 0, data))
        self.counter[name + "_total_cbir_dups"] = sum(itt.imap(lambda x: len(x["cbir_dups"]), data))
        self.counter[name + "_total_empty_title"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(y["title"]) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_empty_text"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(y["text"]) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_empty_img_w"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(str(y["img_w"])) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_empty_img_h"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(str(y["img_h"])) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_empty_thmb_w"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(str(y["thmb_w"])) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_empty_thmb_h"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(str(y["thmb_h"])) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_empty_img_href"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(str(y["img_href"])) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_empty_thmb_href"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(str(y["thmb_href"])) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_empty_html_href"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: len(str(y["html_href"])) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_zero_img_w"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: int(y["img_w"]) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_zero_img_h"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: int(y["img_h"]) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_zero_thmb_w"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: int(y["thmb_w"]) == 0, x["cbir_dups"])), data))
        self.counter[name + "_total_zero_thmb_h"] = \
            sum(itt.imap(lambda x: sum(itt.imap(lambda y: int(y["thmb_h"]) == 0, x["cbir_dups"])), data))

        if len(data) > 0 and len(data[0]["cbir_dups"]) > 0 and len(data[0]["cbir_dups"][0]["thmb_href"]) > 0:
            try:
                url = "https:" + data[0]["cbir_dups"][0]["thmb_href"]
                image = self.fetch(url, name + "_image_404", name + "_image_error")
                self.counter.inc(name + "_image_ok")
                return image
            except:
                return

    def dump_image(self, image):
        return str(len(image)) + "\n" + image

    def push_key_image(self, image):
        if self.comparer:
            try:
                self.logger.info("Key image %d bytes", len(image))
                self.logger.info("Key image MD5: %s", hashlib.md5(image).hexdigest())
                self.comparer.stdin.write(self.dump_image(image))
                self.logger.info("Key image pushed")
            except Exception as e:
                self.logger.info("Comparer error (key): %s", str(e))

    def evaluate_image(self, image, name):
        if image and self.comparer:
            try:
                self.logger.info("Image %d bytes (%s)", len(image), name)
                self.logger.info("Image MD5: %s", hashlib.md5(image).hexdigest())
                self.comparer.stdin.write(self.dump_image(image))
                self.logger.info("Image pushed (%s)", name)
                answer = self.comparer.stdout.readline().strip()
                if answer == "Frame error":
                    self.counter.inc(name + "_frame_errors")
                    raise Exception("Frame error")
                degree = float(answer)
                self.counter[name + "_similarity"] = degree
                self.logger.info("Image evaluated %f (%s)", degree, name)
                self.counter.inc(name + "_similarity_calculated")
            except Exception as e:
                self.logger.info("Comparer error (evaluate): %s", str(e))

    def push_frame_end(self):
        if self.comparer:
            try:
                self.comparer.stdin.write("0\n")
            except Exception as e:
                self.logger.info("Comparer error (end): %s", str(e))

    def process_query(self, image_url):
        try:
            self.logger.info("PID %d", os.getpid())
            self.logger.info("Timestamp %s", datetime.datetime.now().strftime("%s%f"))
            image = self.fetch(image_url, on_404="query_image_404", on_error="query_image_error")
            if imghdr.what(None, image) not in ("jpeg", "gif", "png"):
                self.counter.inc("query_image_bad_format")
                raise Exception("Bad image format")
            self.push_key_image(image)
            self.counter.inc("query_image_ok")

            self.counter.inc("total_serps")
            query_url = self.create_query_url(image_url)
            data = self.fetch_json(query_url, on_404="serp_error", on_error="serp_error", timeout=DATA_FETCH_TIMEOUT)
            self.counter.inc("serp_ok")

            if "reqdata.reqid" in data:
                self.logger.info("reqid: %s", data["reqdata.reqid"])

#            if "eventlog" in data:
#                for line in data["eventlog"].split("\n"):
#                    fields = line.split("\t")
#                    if fields[2] in ("RequestStat", "SubSourceInit", "SubSourceRequest"):
#                        self.logger.info(" ".join(fields).replace("%", "%%"))

            self.evaluate_image(self.process_section(data, None, "similar"), "similar")
            self.evaluate_image(self.process_section(data, "cbir", "duplicates"), "duplicates")
        finally:
            self.push_frame_end()

    @staticmethod
    def template_data(res):
        data = []
        total_queries = res.get("total_queries", 0)
        data.append({"name": "Image URLs", "value": str(total_queries), "fraction": None})

        if total_queries > 0:
            query_image_ok = res.get("query_image_ok", 0)
            data.append({"name": "Alive image URLs",
                         "value": str(query_image_ok),
                         "fraction": float(query_image_ok) / total_queries})

            serp_errors = res.get("serp_error", 0)
            if serp_errors > 0:
                data.append({"name": "Serp errors",
                             "value": str(serp_errors),
                             "fraction": float(serp_errors) / total_queries})

            for name, title in (("similar", "Similar"), ("duplicates", "CBIR")):
                total_name = res.get("total_" + name, 0)
                if total_name > 0:
                    name_without_images = res.get(name + "_without_images", 0)
                    data.append({"name": title + " empty replies",
                                 "value": str(name_without_images),
                                 "fraction": float(name_without_images) / total_name})

                    name_empty_section = res.get("total_" + name + "_empty_section", 0)
                    data.append({"name": title + " empty section",
                                 "value": str(name_empty_section),
                                 "fraction": float(name_empty_section) / total_name})

                    name_image_ok = res.get(name + "_image_ok", 0)
                    data.append({"name": title + " good thumbs",
                                 "value": str(name_image_ok),
                                 "fraction": float(name_image_ok) / total_name})

                    if name_image_ok > 0:
                        name_similarity_calculated = res.get(name + "_similarity_calculated", 0)
                        if name_similarity_calculated != name_image_ok:
                            data.append({"name": title + " similarities calculated NB",
                                         "value": str(name_similarity_calculated),
                                         "fraction": float(name_similarity_calculated) / name_image_ok})

                        name_similarity = res.get(name + "_similarity", 0)
                        data.append({"name": title + " similarity average",
                                     "value": float(name_similarity) / name_similarity_calculated,
                                     "fraction": None})

                name_images = res.get(name + "_images", 0)
                name_with_images = res.get(name + "_with_images", 0)
                if name_with_images > 0:
                    data.append({"name": title + " average results",
                                 "value": float(name_images) / name_with_images,
                                 "fraction": None})
                name_total_cbir_dups = res.get(name + "_total_cbir_dups", 0)
                if name_images > 0:
                    data.append({"name": title + " average dups",
                                 "value": float(name_total_cbir_dups) / name_images,
                                 "fraction": None})
                if name_total_cbir_dups > 0:
                    name_total_empty_text = res.get(name + "_total_empty_text", 0)
                    data.append({"name": title + " empty texts",
                                 "value": str(name_total_empty_text),
                                 "fraction": float(name_total_empty_text) / name_total_cbir_dups})
                    name_total_empty_title = res.get(name + "_total_empty_title", 0)
                    data.append({"name": title + " empty titles",
                                 "value": str(name_total_empty_title),
                                 "fraction": float(name_total_empty_title) / name_total_cbir_dups})

        return data


# Script entry point: hand the control class over to command.command.
if __name__ == '__main__':
    command.command(ImagesExamineCbirIntegrity)
