# -*- encoding: utf-8 -*-
"""
python scraper proxy
usage:
import scraper
sample = [('who is mr Putin', 213), ('to be or not to be', 2)]
def RequestModifier(request):
    request["per-set-parameters"]["additional-cgi"] = {"some-param": ["some-value"]} # adds a parameter to the query string of every request sent to the source
result = (scraper.Configure()
    .ForYandex()
    .SetMeta('my_oauth_token', 'creator_login_on_staff', 'invoker_login_on_staff', 'my_task')
    .SetResultsOnPage(20)
    .SetVerbose()
    .SetRequestModifier(RequestModifier)  # the modifier may arbitrarily change the request, as described in the scraper API docs
    .BuildScraper()
    .DownloadBatch(sample))
for x in result:
    print x['query'] + '|' + str(x['region'])
    print x['urls']
"""
import logging
from urllib2 import urlopen, Request, HTTPError, URLError
import json
import time

# Module-level logger used by the Log helpers and the retry loop below.
log = logging.getLogger(__name__)


class Configure:
    """Fluent builder that collects scraper settings and creates a ``_Scraper``.

    Every ``For*``/``Set*``/``Add*`` method stores a value on ``self.params``
    and returns ``self`` so that calls can be chained.  ``BuildScraper``
    validates the collected settings and passes them to ``_Scraper``.
    """

    def __init__(self):
        """Create a configuration pre-filled with default settings."""
        self.params = _Dict()
        self.params.region_id = 213
        self.params.results_count = 10
        self.params.verbose = False
        self.params.download_original = False
        self.params.cgi_params = {}
        # The batch request always reads ``params.quota_id``; give it a
        # default so a configuration without SetQuota() does not fail with
        # AttributeError.
        self.params.quota_id = None

        def EmptyModifier(x):
            # Default request modifier: leaves the request untouched.
            pass
        self.params.modifier = EmptyModifier

    def ForYandex(self):
        """Select the ``_Scraper.yandex_hamster`` search engine preset."""
        self.params.search_engine = _Scraper.yandex_hamster
        return self

    def ForYandexXml(self):
        """Select the ``_Scraper.yandex_xml`` search engine preset."""
        self.params.search_engine = _Scraper.yandex_xml
        return self

    def ForGoogle(self):
        """Select the ``_Scraper.google`` search engine preset."""
        self.params.search_engine = _Scraper.google
        return self

    def SetMeta(self, token, creator, invoker, name):
        """Store the OAuth token and the batch description fields.

        ``token`` is used for the ``Authorization`` header; ``creator``,
        ``invoker`` and ``name`` go into the request's ``description``.
        """
        self.params.token = token
        self.params.creator = creator
        self.params.invoker = invoker
        self.params.name = name
        return self

    def SetResultsOnPage(self, count):
        """Set the value sent as ``results-per-page``."""
        self.params.results_count = count
        return self

    def SetRegion(self, region_id):
        """Set the default region id applied to the whole batch."""
        self.params.region_id = region_id
        return self

    def SetVerbose(self):
        """Enable progress logging."""
        self.params.verbose = True
        return self

    def SetDownloadOriginal(self):
        """Request the ``main-page`` resource loading strategy."""
        self.params.download_original = True
        return self

    def SetRequestModifier(self, modifier):
        """Register a callable that may mutate the request before sending.

        The callable receives the ``request`` sub-dictionary of the batch
        (the one holding ``per-set-parameters``); its return value is ignored.
        """
        self.params.modifier = modifier
        return self

    def AddCgiParameter(self, key, value):
        """Add one ``additional-cgi`` parameter; the value is wrapped in a list."""
        self.params.cgi_params[key] = [value]
        return self

    def SetQuota(self, quota_id):
        """Set the value sent as ``quota-project`` in the description."""
        self.params.quota_id = quota_id
        return self

    def BuildScraper(self):
        """Validate the configuration and return a ``_Scraper``.

        Raises:
            ValueError: if no search engine was selected or SetMeta() was
                not called.
        """
        if not hasattr(self.params, "search_engine"):
            raise ValueError("specify engine!")
        if not hasattr(self.params, "creator"):
            raise ValueError("specify meta!")
        if not hasattr(self.params, "token"):
            raise ValueError("specify token!")
        return _Scraper(self.params)


class _Dict:
    """Bare attribute container; settings are attached to instances ad hoc."""


class _Scraper:
    """Builds batch requests from configured params and downloads the results.

    ``params`` is the attribute container filled by ``Configure``; network
    calls are delegated to a ``_ScraperProxy``.
    """

    # Search engine presets: target host and the "search-engine" type sent
    # in the batch request.
    yandex = {"host": "http://yandex.ru", "type": "yandex-web-islands"}
    yandex_hamster = {"host": "http://hamster.yandex.ru", "type": "yandex-web-islands"}
    yandex_xml = {"host": "http://xmlsearch.yandex.ru/xmlsearch", "type": "yandex-web-xml"}
    google = {"host": "http://google.ru", "type": "google-web"}

    def __init__(self, params):
        """Remember ``params`` and create the proxy used for HTTP calls."""
        self.query_id = 0
        self.params = params
        self.proxy = _ScraperProxy(self.params.token, self.params.verbose)

    def DownloadBatch(self, queries):
        """Submit ``queries`` as one batch, wait for it and return the result.

        ``queries`` is an iterable whose items are either a query string or
        a ``(query_text, region)`` pair.  Polls the batch status once per
        second until the proxy reports completion.
        """
        self.queries = queries
        request = self.CreateBatchRequest()
        ticket = self.proxy.SendBatch(request)
        while not self.proxy.IsCompleted(ticket):
            self.Log("waiting for batch download...")
            time.sleep(1)
        return self.proxy.GetResult(ticket)

    def Log(self, message):
        """Log ``message`` at warning level when verbose mode is on."""
        if self.params.verbose:
            log.warning(message)

    def ExtractUrls(self, components):
        """Return the ``page-url`` of every ``SEARCH_RESULT`` component, as a list."""
        return [
            component["page-url"]
            for component in components
            if component["type"] == "SEARCH_RESULT"
        ]

    def CreateBatchRequest(self):
        """Build the batch request dictionary for ``self.queries``.

        The registered request modifier is applied to the ``request``
        sub-dictionary before the ``download_original`` option is written.
        """
        # A real list (not a lazy iterator) so the request can always be
        # serialised to JSON.
        query_list = [self.QueryToRequest(query) for query in self.queries]

        per_set_params = {
            "results-per-page": self.params.results_count,
            "extract-debug-info": True,
            "ignored-not-used-params": True
        }
        if self.params.region_id:
            per_set_params["region-id"] = self.params.region_id
        if self.params.cgi_params:
            per_set_params["additional-cgi"] = self.params.cgi_params
        if self.params.search_engine == self.google:
            per_set_params["ignored-incorrect-region"] = True
        request = {
            "queries": query_list,
            "request": {
                "host": self.params.search_engine["host"],
                "per-set-parameters": per_set_params,
                "store-results-period": 86400000,
                "search-engine": self.params.search_engine["type"],
                "parse-serps": True,
                "serp-downloading-policy": {
                    "max-attempts-per-query": 3,
                    "retry-on-connection-problem": True,
                    "retry-on-server-error": True,
                },
                "description": {
                    "name": self.params.name,
                    "comments": None,
                    "creator": self.params.creator,
                    "invoker": self.params.invoker,
                    # The quota is optional: fall back to None instead of
                    # raising AttributeError when it was never configured.
                    "quota-project": getattr(self.params, "quota_id", None),
                }
            }
        }
        self.params.modifier(request["request"])
        if self.params.download_original:
            request["request"]["serp-downloading-policy"]["resource-loading-strategy"] = "main-page"
        return request

    def QueryToRequest(self, query):
        """Convert one query into a per-query request entry.

        Each call consumes the next ``serp-request-id``.  A string query
        yields only ``query-text``; a ``(text, region)`` pair also yields
        ``region-id``, with a string region converted to ``int``.
        """
        self.query_id += 1
        result = {"serp-request-id": self.query_id}
        if isinstance(query, basestring):
            result["per-query-parameters"] = {"query-text": query}
        else:
            (request, region) = query
            if isinstance(region, basestring):
                region = int(region)
            result["per-query-parameters"] = {"query-text": request, "region-id": region}
        return result


class _ScraperProxy:
    """HTTP client for the scraper batch API with retries on request errors."""

    completed_status = "COMPLETE"
    fail_status = "FAIL"
    # Seconds to sleep before each successive retry in _SafeSend.
    retry_delays = (1, 10, 100, 600, 600, 600)

    def __init__(self, token, verbose=False):
        """Store the API endpoint, the ``Authorization`` header value and verbosity."""
        self.base_url = "https://scraper.qe.yandex-team.ru/api/scraper/batch"
        self.token = "OAuth {}".format(token)
        self.verbose = verbose

    def SendBatch(self, request):
        """Send the batch ``request`` as JSON and return the ticket from the reply."""
        headers = {"Content-type": "application/json", "Accept": "application/json", "Authorization": self.token}
        response = self._SafeSend(self.base_url, request, headers)
        try:
            result = json.load(response)
        finally:
            # Release the connection even if the reply is not valid JSON.
            response.close()
        ticket = result["ticket"]
        self.Log("ticket: " + ticket)
        return ticket

    def IsCompleted(self, ticket):
        """Return True when the batch status is ``completed_status``.

        Raises:
            Exception: when the reported status is ``fail_status``.
        """
        response = self._SafeSend(self.base_url + "/" + ticket + "/status", headers={"Authorization": self.token})
        try:
            body = response.read()
        finally:
            response.close()
        self.Log(body)
        status = json.loads(body)["status"]
        if status == self.fail_status:
            raise Exception("FAIL status on ticket. Response: %s" % body)
        return status == self.completed_status

    def GetResult(self, ticket):
        """Download and decode the JSON results stored for ``ticket``."""
        response = self._SafeSend(self.base_url + "/" + ticket + "/serps", headers={"Authorization": self.token})
        try:
            return json.load(response)
        finally:
            response.close()

    def Log(self, message):
        """Log ``message`` at warning level when verbose mode is on."""
        if self.verbose:
            log.warning(message)

    def _SafeSend(self, url, request=None, headers=None):
        """Open ``url`` and return the response object, retrying on errors.

        ``request`` (if given) is JSON-encoded and sent as the body.  The
        caller is responsible for closing the returned response.

        Raises:
            Exception: on an HTTP 5xx reply, or once all ``retry_delays``
                have been used up.
        """
        payload = json.dumps(request) if request is not None else None
        http_request = Request(url, payload, headers if headers is not None else {})
        attempt = 0
        while True:
            try:
                return urlopen(http_request)
            except URLError as error:
                log.info(str(error))
                # NOTE(review): 5xx replies abort immediately while other
                # HTTP errors (e.g. 4xx) are retried; this looks inverted
                # relative to common retry practice - confirm the intent.
                if isinstance(error, HTTPError) and 500 <= error.code < 600:
                    raise Exception('Scraper returned %s. Stop retries.' % error.code)
                if attempt >= len(self.retry_delays):
                    raise Exception('Max retries count reached. Scraper is unavailable.')
                delay = self.retry_delays[attempt]
                log.info('Retry in %s seconds', delay)
                time.sleep(delay)
                attempt += 1
