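# Python client (proxy) for the scraper batch HTTP API.
# Usage:
#   import scraper
#   s = scraper.Configure().ForYandex().SetMeta(owner, name).SetOauthToken(token).BuildScraper()
#   serps = s.WaitBatch(s.StartBatch(queries))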

from urllib2 import urlopen, Request, HTTPError
import json
import time
import logging

class Configure:
    """Fluent builder that collects scraper settings and creates a _Scraper.

    Every setter stores a value on ``self.params`` and returns ``self`` so
    calls can be chained.  ``search_engine``, ``owner`` and ``name`` have no
    defaults on purpose: BuildScraper() checks for their presence to detect
    a missing For*() / SetMeta() call.
    """

    SCRAPER_URL_PROD = "https://scraper.yandex-team.ru/api/scraper/batch"
    SCRAPER_URL_TEST = "https://scraper-dev.qloud.yandex-team.ru/api/scraper/batch"

    def __init__(self):
        """Fill ``self.params`` with defaults (test scraper URL, empty token)."""

        def NoOpModifier(request):
            # Default request modifier: leaves the request untouched.
            return None

        defaults = _Dict()
        for attr, value in (
            ("region_id", None),
            ("results_count", 10),
            ("verbose", False),
            ("download_original", False),
            ("cgi_params", {}),
            ("scraper_url", Configure.SCRAPER_URL_TEST),
            ("oauth_token", ""),
            ("viewport_width", 683),
            ("viewport_height", 1020),
            ("modifier", NoOpModifier),
        ):
            setattr(defaults, attr, value)
        self.params = defaults

    def _SelectEngine(self, engine, viewport=None):
        # Store the engine preset; override the viewport only when one is given.
        self.params.search_engine = engine
        if viewport is not None:
            self.params.viewport_width, self.params.viewport_height = viewport
        return self

    def ForGoogleWeb(self):
        """Target Google web search with a 683x1020 viewport."""
        return self._SelectEngine(_Scraper.google_web, (683, 1020))

    def ForGoogleTouch(self):
        """Target Google touch search with a 360x567 viewport."""
        # Google Nexus 5
        return self._SelectEngine(_Scraper.google_touch, (360, 567))

    def ForYandex(self):
        """Target Yandex; the currently configured viewport is kept as is."""
        return self._SelectEngine(_Scraper.yandex)

    def SetMeta(self, owner, name):
        """Set the batch owner and name (both required by BuildScraper)."""
        self.params.name = name
        self.params.owner = owner
        return self

    def SetResultsOnPage(self, count):
        """Set how many results are requested per page."""
        self.params.results_count = count
        return self

    def SetRegion(self, region_id):
        """Set the region id applied to the whole batch."""
        self.params.region_id = region_id
        return self

    def SetVerbose(self):
        """Turn on progress logging."""
        self.params.verbose = True
        return self

    def SetDownloadOriginal(self):
        """Turn on the ``download_original`` flag."""
        self.params.download_original = True
        return self

    def SetRequestModifier(self, modifier):
        """Set a callable that receives the request dict before it is sent."""
        self.params.modifier = modifier
        return self

    def AddCgiParameter(self, key, value):
        """Add a CGI parameter; the value is stored as a one-element list."""
        self.params.cgi_params[key] = [value]
        return self

    def SetUseProdScraper(self):
        """Switch from the test scraper URL to the production one."""
        self.params.scraper_url = Configure.SCRAPER_URL_PROD
        return self

    def SetOauthToken(self, oauth_token):
        """Set the OAuth token sent with every request."""
        self.params.oauth_token = oauth_token
        return self

    def BuildScraper(self):
        """Create a _Scraper from the collected parameters.

        The scraper receives this builder's ``params`` object itself, not a
        copy.  Raises ValueError when no engine was chosen or SetMeta() was
        never called.
        """
        required = (
            ("search_engine", "specify engine!"),
            ("owner", "specify meta!"),
        )
        for attr, complaint in required:
            if not hasattr(self.params, attr):
                raise ValueError(complaint)
        return _Scraper(self.params)


class _Dict:
    """Empty attribute bag; Configure stores its settings as attributes on it."""


# Host shared by both Google engine presets below.
google_path = 'http://google.com'


class _Scraper:
    """Builds scraper batch requests and fetches their results via _ScraperProxy.

    ``params`` is the settings object assembled by Configure; it is kept by
    reference, not copied.
    """

    # Engine presets: target host plus the engine type sent as "search-engine".
    yandex = {"host": "http://yandex.ru", "type": "yandex-web-islands"}
    google_web = {"host": google_path, "type": "google-web"}
    google_touch = {"host": google_path, "type": "google-touch-android"}

    def __init__(self, params):
        """Store ``params`` and create the HTTP proxy from its URL and token."""
        self.query_id = 0
        self.params = params
        self.proxy = _ScraperProxy(params.scraper_url, params.oauth_token, self.params.verbose)

    def StartBatch(self, queries):
        """Submit ``queries`` as one batch and return the ticket from the proxy.

        Each query is either a string or a ``(text, region)`` pair, see
        QueryToRequest().
        """
        self.queries = queries
        return self.proxy.SendBatch(self.CreateBatchRequest())

    def WaitBatch(self, ticket, poll_interval=30):
        """Block until the batch is complete, then return all of its serps.

        The status is polled every ``poll_interval`` seconds.  Unlike
        GetByTicket(), serps are NOT filtered by status here.  The returned
        list is sorted by numeric serp-request-id.
        """
        while not self.proxy.IsCompleted(ticket):
            self.Log("waiting for batch download...")
            time.sleep(poll_interval)
        return self._SortByRequestId(self.proxy.GetResult(ticket))

    def GetByTicket(self, ticket):
        """Return the serps of ``ticket`` whose status is "done".

        Does not wait for completion.  The returned list is sorted by
        numeric serp-request-id.
        """
        # A list comprehension guarantees a real list for the in-place sort.
        finished = [
            serp for serp in self.proxy.GetResult(ticket)
            if serp["status"]["status"] == "done"
        ]
        return self._SortByRequestId(finished)

    def _SortByRequestId(self, serps):
        # Sort in place by request id; ids are compared as integers.
        serps.sort(key=lambda serp: int(serp["serp-request-explained"]["serp-request-id"]))
        return serps

    def Log(self, message):
        """Log ``message`` at warning level when verbose mode is on."""
        if self.params.verbose:
            logging.warning(message)

    def ExtractUrls(self, components):
        """Return the "page-url" of every component of type "SEARCH_RESULT"."""
        return [
            component["page-url"]
            for component in components
            if component["type"] == "SEARCH_RESULT"
        ]

    def CreateBatchRequest(self):
        """Build the batch request dict for ``self.queries``.

        The configured modifier is called with the inner "request" dict
        before the download-original policy entry is applied.
        """
        query_list = [self.QueryToRequest(query) for query in self.queries]

        per_set_params = {"results-per-page": self.params.results_count}
        # NOTE(review): CGI parameters are merged into the per-set parameters
        # here AND sent again under "additional-cgi" below; kept as found.
        per_set_params.update(self.params.cgi_params)
        # A falsy region id (None or 0) is not sent.
        if self.params.region_id:
            per_set_params["region-id"] = self.params.region_id
        if self.params.cgi_params:
            per_set_params["additional-cgi"] = self.params.cgi_params

        request = {
            "queries": query_list,
            "request": {
                "host": self.params.search_engine["host"],
                "per-set-parameters": per_set_params,
                "store-results-period": 86400000,
                "search-engine": self.params.search_engine["type"],
                "parse-serps": False,
                "serp-downloading-policy": {
                    "max-attempts-per-query": 5,
                    "retry-on-connection-problem": True,
                    "retry-on-server-error": True,
                    "view-port-width": self.params.viewport_width,
                    "view-port-height": self.params.viewport_height
                },
                "description": {
                    "name": self.params.name,
                    "comments": None,
                    "creator": self.params.owner
                }
            }
        }
        self.params.modifier(request["request"])
        if self.params.download_original:
            request["request"]["serp-downloading-policy"]["resource-loading-strategy"] = "main-page"
        return request

    def QueryToRequest(self, query):
        """Convert one query into a request entry with a fresh serp-request-id.

        ``query`` is either the query text or a ``(text, region)`` pair; a
        string region is converted to int.  Ids come from ``self.query_id``,
        which is never reset, so they keep growing across batches.
        """
        self.query_id += 1
        result = {"serp-request-id": self.query_id}
        if isinstance(query, basestring):
            result["per-query-parameters"] = {"query-text": query}
        else:
            (request, region) = query
            if isinstance(region, basestring):
                region = int(region)
            result["per-query-parameters"] = {"query-text": request, "region-id": region}
        return result


class _ScraperProxy:
    """Minimal HTTP client for the scraper batch endpoint.

    Batches are POSTed to ``base_url``; status and results are read from
    ``<base_url>/<ticket>/status`` and ``<base_url>/<ticket>/serps``.
    Every request carries an ``Authorization: OAuth <token>`` header.
    """

    def __init__(self, base_url, oauth_token, verbose=False):
        """Remember the endpoint URL, the OAuth token and the verbosity flag."""
        self.base_url = base_url
        self.oauth_token = oauth_token
        self.verbose = verbose

    def SendBatch(self, request, max_attempts=None, retry_delay=30):
        """Submit a batch request and return the ticket id from the response.

        On HTTPError the submission is retried after ``retry_delay`` seconds.
        With ``max_attempts=None`` (the default) it retries forever; otherwise
        the HTTPError of the last allowed attempt is re-raised.
        """
        headers = {"Content-type": "application/json",
                   "Accept": "application/json",
                   "Authorization": "OAuth " + self.oauth_token}
        req = Request(self.base_url, json.dumps(request), headers)
        attempt = 0
        while True:
            attempt += 1
            try:
                result = json.loads(self._ReadBody(req))
            except HTTPError:
                if max_attempts is not None and attempt >= max_attempts:
                    raise
                self.Log("batch submission failed, retrying in %s seconds" % retry_delay)
                time.sleep(retry_delay)
                continue
            self.Log("ticket: " + result["ticket"])
            return result["ticket"]

    def IsCompleted(self, ticket):
        """Return True when the batch status reported for ``ticket`` is "COMPLETE"."""
        headers = {"Authorization": "OAuth " + self.oauth_token}
        req = Request(self.base_url + "/" + ticket + "/status", None, headers)

        body = self._ReadBody(req)
        self.Log(body)
        return json.loads(body)["status"] == "COMPLETE"

    def GetResult(self, ticket):
        """Download and JSON-decode the serps stored for ``ticket``."""
        headers = {"Content-type": "application/json",
                   "Accept": "application/json",
                   "Authorization": "OAuth " + self.oauth_token}
        req = Request(self.base_url + "/" + ticket + "/serps", None, headers)
        return json.loads(self._ReadBody(req))

    def Log(self, message):
        """Log ``message`` at warning level when verbose mode is on."""
        if self.verbose:
            logging.warning(message)

    def _ReadBody(self, request):
        # Send the request and return the raw body; the response is always
        # closed so the underlying connection is not leaked.
        response = self._SafeSend(request)
        try:
            return response.read()
        finally:
            response.close()

    def _SafeSend(self, request):
        """Open ``request``; log the body of an HTTPError and re-raise it.

        The error body is consumed by the logging call, so callers catching
        the re-raised error cannot read it again.
        """
        try:
            return urlopen(request)
        except HTTPError as error:
            logging.error("HTTPError: %s", error.read())
            # Bare raise keeps the original traceback.
            raise
