#!/usr/bin/env python
# coding: utf-8

import re
import sys
import json
import time
import random
import urllib
import urllib2
import logging
import cStringIO
import multiprocessing

# SERP URL template; %s is replaced with the URL-quoted query text (see url_for_query)
QUERY_URL_PATTERN = "https://newdb-img.hamster.yandex.ru/images/search?text=%s&json_dump=searchdata.images&exp_flags=images_numdoc=40&srcskip=IMAGESQUICK&thumb-host=imtub-test.search.yandex.net"
# When True, fix_thumb_url() rewrites thumb URLs between production and testing hosts
FIX_THUMB_URL = False

# Chunk size passed to Pool.imap_unordered in multiprocess()
QUERY_CHUNK_SIZE = 1  # Large chunks seem to be useless -- imgrover1 experiment
# Together with RPS determines how many worker processes multiprocess() starts
RPS_PER_PROCESS = 10  # imgrover1 experiment
# Overall requests-per-second target; process_query() divides it by process_number
RPS = 100

# Size of the worker pool; assigned in multiprocess() and read in process_query()
process_number = None

# Thumb URL hacking routines

# LEGACY
# Matches any of the eight numbered thumb host prefixes
FIX_THUMB_URL_03_47_RE = re.compile(r'im[0-7]-tub-')
# Each prefix maps to its counterpart in the other half: imN <-> im(N+4)
FIX_THUMB_URL_03_47_DICT = {
    'im0-tub-': 'im4-tub-',
    'im1-tub-': 'im5-tub-',
    'im2-tub-': 'im6-tub-',
    'im3-tub-': 'im7-tub-',
    'im4-tub-': 'im0-tub-',
    'im5-tub-': 'im1-tub-',
    'im6-tub-': 'im2-tub-',
    'im7-tub-': 'im3-tub-',
}


def fix_thumb_url_03_47(url):
    """
    Swap thumb host prefixes im{0-3} <-> im{4-7} everywhere in the URL (LEGACY)

    :param url: thumb URL
    :return: URL with every matched prefix replaced by its counterpart
    """

    def swap_prefix(match):
        return FIX_THUMB_URL_03_47_DICT[match.group()]

    return FIX_THUMB_URL_03_47_RE.sub(swap_prefix, url)


# Production thumb host prefix, e.g. "im2-tub-ru"
FIX_THUMB_URL_03_IMTUB_TEST_PROD_RE = re.compile(r'im[0-3]-tub-\w+')
# Testing thumb host prefix; the dot is escaped so that only a literal "." matches
FIX_THUMB_URL_03_IMTUB_TEST_TEST_RE = re.compile(r'imtub-test\.search')


def fix_thumb_url_03_imtub_test(url):
    """
    Flip a thumb URL between production and testing hosts

    A production prefix (im{0-3}-tub-<word>) is replaced with "imtub-test.search".
    If the URL has no production prefix, a testing prefix is replaced with
    "im<N>-tub-ru", N being picked at random from 0..3.

    :param url: thumb URL
    :return: rewritten URL (unchanged when neither prefix is present)
    """

    # subn reports whether anything matched, so a separate search is not needed
    to_testing, replaced = FIX_THUMB_URL_03_IMTUB_TEST_PROD_RE.subn("imtub-test.search", url)
    if replaced:
        return to_testing

    return FIX_THUMB_URL_03_IMTUB_TEST_TEST_RE.sub("im%d-tub-ru" % random.randint(0, 3), url)


class T404Error(Exception):
    """Raised by fetch_url when the requested URL answers with HTTP 404"""


def fix_thumb_url(url):
    """
    Rewrite a thumb URL between production and testing hosts when enabled

    :param url: thumb URL
    :return: the URL untouched unless FIX_THUMB_URL is set
    """

    if not FIX_THUMB_URL:
        return url

    return fix_thumb_url_03_imtub_test(url)


def extract_thumb_urls(serp):
    """
    Iterate over the thumbs listed in a JSON SERP

    :param serp: SERP body, a JSON document with a "searchdata.images" list
    :return: generator of (thumb URL with "http:" prepended, basehost) pairs
    """

    documents = json.loads(serp)["searchdata.images"]
    for entry in documents:
        thumb_url = "http:" + fix_thumb_url(entry["thmb_href"])
        yield (thumb_url, entry["basehost"])


def url_for_query(query):
    """
    Build the SERP URL for a query

    :param query: search query text
    :return: QUERY_URL_PATTERN with the URL-quoted query substituted
    """

    quoted_query = urllib.quote(query)
    return QUERY_URL_PATTERN % quoted_query


class TRPSLimiter:
    """Keeps consecutive gap() calls at least 1/limit seconds apart"""

    def __init__(self, limit):
        """
        :param limit: allowed requests per second
        """

        self.last = None            # time of the previous gap() call, None before the first one
        self.min_gap = 1.0/limit    # minimal pause between calls, seconds

    def gap(self):
        """Sleep for the rest of the minimal pause, if any, and remember the call time"""

        previous = self.last
        if previous is not None:
            elapsed = time.time() - previous
            remaining = self.min_gap - elapsed
            if remaining > 0:
                time.sleep(remaining)
        self.last = time.time()


class TStringLogger:
    """
    Collects log lines of the 'query_processor' logger in memory

    Every instance re-points the shared logger to its own string buffer, so only
    the most recently created instance receives messages.
    """

    def __init__(self):
        self.stream = cStringIO.StringIO()
        self.handler = logging.StreamHandler(self.stream)
        formatter = logging.Formatter('%(asctime)s %(message)s')
        self.handler.setFormatter(formatter)
        self.log = logging.getLogger('query_processor')
        self.log.setLevel(logging.INFO)
        # Iterate over a copy: removeHandler() mutates log.handlers in place,
        # and looping over the live list would skip every other handler
        for handler in list(self.log.handlers):
            self.log.removeHandler(handler)
        self.log.addHandler(self.handler)

    def info(self, msg, *args):
        """
        Log a message at INFO level

        :param msg: %-style format string
        :param args: values for the format string
        """
        self.log.info(msg, *args)

    def getvalue(self):
        """
        :return: everything logged through this instance so far, as one string
        """
        self.handler.flush()
        return self.stream.getvalue()


def fetch_url(limiter, url, logger, basehost=""):
    """
    Fetch a URL under the rate limit, log the outcome and return the body

    :param limiter: object whose gap() is called right before the request
    :param url: URL to fetch
    :param logger: object with info(msg, *args); gets one line per request
    :param basehost: optional label appended to the log line
    :return: response body
    :raises T404Error: the server answered 404
    :raises urllib2.HTTPError: any other HTTP status except 200
    Any other exception raised by urlopen is logged and re-raised as is.
    """

    if basehost:
        basehost = " " + basehost

    limiter.gap()
    try:
        f = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        logger.info("%d %s (exception)%s", e.code, url, basehost)
        if e.code == 404:
            raise T404Error
        raise
    except Exception as e:
        logger.info("XXX %s %s %s%s", url, str(type(e)), str(e), basehost)
        raise

    # From here on the response is open; make sure it is closed on every path
    try:
        code = f.getcode()
        logger.info("%d %s%s", code, url, basehost)

        if code != 200:
            if code == 404:
                raise T404Error
            # HTTPError cannot be raised without arguments -- its constructor
            # requires all five.  fp is None because the response gets closed.
            raise urllib2.HTTPError(url, code, "Unexpected HTTP status", f.info(), None)

        return f.read()
    finally:
        f.close()


def process_query(query):
    """
    Fetch SERP, fetch thumbs, calculate stats

    :param query: search query text
    :return: tuple of (total_queries, bad_serps, total_thumbs, thumbs_404, bad_thumbs, log)
    total_queries -- queries fetched i.e. 1
    bad_serps     -- queries fetched with an error i.e. 1 or 0
    total_thumbs  -- thumbs processed -- depends on serp
    thumbs_404    -- thumbs with 404 status code
    bad_thumbs    -- thumbs with other errors
    log           -- log text collected while processing the query

    When the SERP cannot be fetched or parsed the thumb counters are all zero.
    """

    logger = TStringLogger()
    # Every worker process gets an equal share of the overall RPS
    limiter = TRPSLimiter(float(RPS)/process_number)

    try:
        serp = fetch_url(limiter, url_for_query(query), logger)

        thumbs_total = 0
        thumbs_404 = 0
        thumbs_err = 0

        for thumb, basehost in extract_thumb_urls(serp):
            thumbs_total += 1
            try:
                fetch_url(limiter, thumb, logger, basehost)
            except T404Error:
                thumbs_404 += 1
            except Exception:
                # Not a bare except: KeyboardInterrupt and SystemExit must not
                # be counted as a failed thumb
                thumbs_err += 1

        return (1, 0, thumbs_total, thumbs_404, thumbs_err, logger.getvalue())

    except Exception as e:
        # Keep the lines logged so far and end the message with a newline so
        # that it does not run into the next query's log
        return (1, 1, 0, 0, 0, "%sSerp error: %s\n" % (logger.getvalue(), e))


def multiprocess(it, logfile=sys.stderr):
    """
    Process queries from an iterator in multiple processes

    Generator: after every processed query yields the running totals
    [total_queries, bad_serps, total_thumbs, thumbs_404, bad_thumbs].

    :param it: Queries iterator
    :param logfile: Output stream for logs
    """

    global process_number
    # Explicit floor division: the pool size must be an integer between 1 and 100
    process_number = min(max(1, RPS // RPS_PER_PROCESS), 100)
    pool = multiprocessing.Pool(processes=process_number)

    results = [0, 0, 0, 0, 0]
    try:
        for r in pool.imap_unordered(process_query, it, QUERY_CHUNK_SIZE):
            # The last item of r is the log text, the rest are counters
            results = [total + part for total, part in zip(results, r[:-1])]
            logfile.write(r[-1])
            yield results
    finally:
        # Stop the workers on completion, on error and when the consumer
        # abandons the generator
        pool.terminate()
        pool.join()


def iterate_topqueries_txt(f):
    """
    Helper topqueries.txt iterator

    Yields the second tab-separated column of every line. Lines without a
    second column (e.g. blank lines) are skipped.

    Intended for use in checker.py as an app
    :param f: stream with topqueries.txt
    """

    for line in f:
        # Drop the line terminator first, otherwise it sticks to the query
        # whenever the query is the last column
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) > 1:
            yield fields[1]


if __name__ == '__main__':
    # Usage: python <script> topqueries.txt
    # Prints the running RPS after every query and a summary at the end

    if len(sys.argv) != 2:
        sys.stderr.write("Usage: python %s topqueries.txt\n" % sys.argv[0])
        sys.exit(1)

    # Start from zero so that an input without queries still gets a summary
    total_queries = bad_serps = total_thumbs = thumbs_404 = bad_thumbs = 0

    with open(sys.argv[1]) as f:
        start = time.time()
        for total_queries, bad_serps, total_thumbs, thumbs_404, bad_thumbs in multiprocess(iterate_topqueries_txt(f)):
            delta = time.time() - start
            if delta > 0:
                print("RPS: %f" % ((total_queries+total_thumbs) / delta))

    print("Total queries: %d" % total_queries)
    if total_queries > 0:
        print("Failed queries: %d (%f %%)" % (bad_serps, 100.0 * bad_serps / total_queries))

    if total_thumbs > 0:
        print("Total thumbs: %d" % total_thumbs)
        print("404 thumbs: %d (%f %%)" % (thumbs_404, 100.0 * thumbs_404 / total_thumbs))
        print("Failed thumbs: %d (%f %%)" % (bad_thumbs, 100.0 * bad_thumbs / total_thumbs))
