"""
Simple scraper batches client pipeline
"""
from sandbox.projects.common.decorators import retries
import requests
import logging
import json
import hashlib
from contextlib import contextmanager
from sandbox import sdk2

# Attempt budget: passed to the `retries` decorator and used as the attempt cap
# of the hand-written retry loops in Scraper
RETRIES = 10
# Module-wide logger
_LOG = logging.getLogger(__name__)
# Default `vault_token` of Scraper.find_results_table; the value is split on ":"
# and the parts are passed as positional arguments to sdk2.Vault.data
_DEFAULT_YT_TOKEN = "robot-images-st:yt_token"


class LargeJSONSplitter(object):
    """
    Quick and dirty incremental splitter of a huge JSON array (internal)

    Benefits on HUGE (15G for example) jsons that do not fit into memory.
    The input is expected to be a single top-level JSON array of objects
    written without whitespace between items. The stream is fed in
    arbitrary chunks via `process`, and every completed top-level object is
    parsed with `json.loads` and yielded as soon as its closing brace is seen.

    Only objects (`{...}`) located directly inside the top-level array are
    emitted; the character following each emitted object (normally "," or
    "]") and the very first character of the stream are not buffered.
    """
    # Maps an opening bracket to the closing bracket that must match it
    CLOSING = {"[": "]", "{": "}"}

    def __init__(self):
        # May be as large as JSON of a single item of a topmost array
        self.current_json_buffer = ""
        # May be as large as depth of brackets (several tens probably)
        self.stack = []
        # Number of characters consumed so far (used in error messages)
        self.pos = 0
        # True right after an unescaped backslash inside a string literal
        self.backslash_mode = False
        # True while the cursor is inside a string literal
        self.string_mode = False
        # True when the next character must not be appended to the buffer
        self.skip_mode = True

    def process(self, blob):
        """
        Process chunk of input buffer and yield parsed items

        NOTE Parser state is written back to the object only after the whole
        chunk has been consumed, so the returned generator has to be exhausted
        before the next chunk is fed.

        :param blob: str, part of large JSON stream
        :return: iterator over parsed top-level items completed in this chunk
        :raises Exception: on a closing bracket that has no opening
            counterpart or does not match the innermost open bracket
        """

        # Weird optimization that significantly reduces object variables access
        # (local variable access is "by index" and object variables -- "by name")
        # https://stackoverflow.com/a/8493947/791143
        # and thus speeds parsing up something like several tens times
        pos = self.pos
        skip_mode = self.skip_mode
        current_json_buffer = self.current_json_buffer
        string_mode = self.string_mode
        stack = self.stack
        CLOSING = self.CLOSING
        backslash_mode = self.backslash_mode

        for c in blob:
            pos += 1

            if skip_mode:
                skip_mode = False
            else:
                current_json_buffer += c

            if not string_mode:
                if c in ("[", "{"):
                    stack.append(c)
                elif c in ("]", "}"):
                    if not stack:
                        # Fail with a positioned message instead of the bare
                        # IndexError that popping an empty stack would give
                        error_message = "Unexpected closing bracket '{}' (nothing is open) at pos {}"
                        raise Exception(error_message.format(c, pos))
                    r = stack.pop()
                    if CLOSING[r] != c:
                        error_message = "Invalid closing bracket '{}' ('{}' awaited) at pos {}"
                        raise Exception(error_message.format(c, CLOSING[r], pos))
                    # Only the topmost array is left open: an item is complete
                    if c == "}" and len(stack) == 1:
                        yield json.loads(current_json_buffer)
                        current_json_buffer = ""
                        # Drop the separator that follows the item
                        skip_mode = True
                elif c == "\"":
                    string_mode = True
            else:
                if not backslash_mode:
                    if c == "\"":
                        string_mode = False
                    backslash_mode = (c == "\\")
                else:
                    # The escaped character is consumed, escape is over
                    backslash_mode = False

        # Persist the state for the next chunk (the stack list was mutated in
        # place, the class-level CLOSING table needs no write-back)
        self.pos = pos
        self.skip_mode = skip_mode
        self.current_json_buffer = current_json_buffer
        self.string_mode = string_mode
        self.stack = stack
        self.backslash_mode = backslash_mode


def test_large_json_splitter():
    """
    Run with pytest --noconftest scraper.py

    Feeds a JSON array to the splitter in uneven chunks and checks which
    items every chunk completes, then checks that a mismatched closing
    bracket is reported as an error.
    """
    import pytest

    # (chunk, items expected to be completed by that chunk)
    chunked_stream = (
        ('[{"a":"bismuth\\\\"}', [{"a": "bismuth\\"}]),
        (',{"b"', []),
        (':2},{"c":3},{', [{"b": 2}, {"c": 3}]),
        ('"a":1}]', [{"a": 1}]),
    )
    parser = LargeJSONSplitter()
    for chunk, expected_items in chunked_stream:
        assert list(parser.process(chunk)) == expected_items

    # A fresh parser: valid prefix followed by a broken bracket pair
    parser = LargeJSONSplitter()
    assert list(parser.process('[{"a":1}')) == [{"a": 1}]
    assert list(parser.process(',')) == []
    with pytest.raises(Exception):
        list(parser.process("{]"))


class Scraper(object):
    """
    Scraper batches client pipeline

    Interface methods
    * run_batch(queries_iterator, batch_properties) -> batch_id
    * get_batch_status(batch_id) -> dict
    * results_iterator(batch_id) -> iterator

    Results iterator can either process batch results incrementally while
    they are being fetched, or download them to a file at the specified path
    first and process that file afterwards
    """
    # Endpoint a new batch is POSTed to
    POST_BATCH_URL = "https://scraper.yandex-team.ru/api/scraper/batch"
    # Endpoint templates, formatted with `batch_id`
    GET_BATCH_STATUS_URL = "https://scraper.yandex-team.ru/api/scraper/batch/{batch_id}/status"
    # GET_BATCH_RESULTS = "https://scraper.yandex-team.ru/api/scraper/batch/{batch_id}/serps?lined-output=true"
    GET_BATCH_RESULTS = "https://scraper.yandex-team.ru/api/scraper/batch/{batch_id}/serps"
    # 1 MiB: size of the pieces batch results are streamed, re-sliced and read in
    CHUNK_SIZE = 1048576
    # Root YT directory and the clusters probed by find_results_table
    SCRAPER_RESULTS_YT_PATH = "//home/qe/scraper/production/serps-tables"
    SCRAPER_YT_PROXIES = ["hahn", "banach"]
    def __init__(self, token):
        """
        Construct pipeline kit

        :param token: OAuth token for scraper
        """

        self.headers = {
            "Authorization": "OAuth {}".format(token),
            "Content-Type": "application/json",
        }

    def _wrap_query(self, query):
        """
        Convert various flavors of query to standardized format

        :param query: One of
            * str -- text of the query
            * (str, str) -- tuple of (query, debug info)
            * dict -- scraper query data as is (serp-request-id may be omitted)

        :return: Scraper query data dict
        """

        if isinstance(query, (str, unicode)):
            return {
                "per-query-parameters": {"query-text": query},
                "serp-request-id": hashlib.md5(repr(query)).hexdigest(),
            }
        if isinstance(query, (tuple, list)):
            query_len = len(query)
            if query_len != 2:
                raise Exception("Query has invalid number of fields: {} (2 expected)".format(query_len))
            return {
                "per-query-parameters": {
                    "query-text": query[0],
                    "additional-headers": {
                        "X-Scraper-Debug-Transport": [query[1]]
                    }
                },
                "serp-request-id": hashlib.md5(repr(query)).hexdigest(),
            }
        if not isinstance(query, dict):
            raise Exception("Query is of invalid type: {}".format(str(type(query))))
        if "per-query-parameters" not in query:
            raise Exception("Query doesn't contain per-query-parameters")
        if not isinstance(query["per-query-parameters"], dict):
            raise Exception("Query[per-query-parameters] is not a dict")
        if "query-text" not in query["per-query-parameters"]:
            raise Exception("Query doesn't contain query-text")
        if "serp-request-id" not in query:
            query["serp-request-id"] = hashlib.md5(repr(query["per-query-parameters"]["query-text"])).hexdigest()
        return query

    def _wrap_request(self, request):
        """
        Validate batch properties and fill with defaults

        :param request: dict with Scraper properties
        :return: Scraper request properties dict
        """

        if not isinstance(request, dict):
            raise Exception("Request properties are of invalid type: {} (dict expected)".format(str(type(request))))
        if "store-results-period" not in request:
            request["store-results-period"] = 2592000000

        # NOTE More checks and defaults can go here

        return request

    def _prepare_unique_queries(self, queries_iterator):
        """
        Standardize queries in a list and filter out duplicates.

        :param queries_iterator: iterator yielding queries (look at _wrap_query for details)
        :return: list of queries with a structure used by scraper

        Only first of items with identical serp-request-id is taken
        """
        known_query_ids = set()
        for item in queries_iterator:
            query = self._wrap_query(item)
            serp_request_id = query["serp-request-id"]
            if serp_request_id in known_query_ids:
                _LOG.debug("Query %s ignored (is a duplicate)", query["per-query-parameters"]["query-text"])
                continue
            known_query_ids.add(serp_request_id)
            yield query

    def _generate_query(self, queries_iterator, batch_properties):
        """
        Build the JSON body of a batch launch request

        :param queries_iterator: iterator yielding queries (look at _wrap_query for details)
        :param batch_properties: dict with scraper batch properties
        :return: Post data (str)

        Note: duplicate requests are filtered out (look at _prepare_unique_queries)
        """

        # The whole query set is materialized: it is serialized in one piece
        unique_queries = [query for query in self._prepare_unique_queries(queries_iterator)]
        _LOG.debug("Queries list has %d elements", len(unique_queries))

        payload = {
            "queries": unique_queries,
            "request": self._wrap_request(batch_properties),
        }
        return json.dumps(payload)

    # data = {
    #     "queries": [{
    #             "per-query-parameters": {
    #                 "query-text": q,
    #                 "additional-headers": {
    #                     "X-Scraper-Debug-Transport": ["id12331 " + hashlib.md5(q).hexdigest()]
    #                 }
    #             },
    #             "serp-request-id": hashlib.md5(q).hexdigest(),
    #         } for q in queries
    #     ],
    #     "request": {
    #         "host": "https://hamster.yandex.ru",
    #         "description": {
    #             "name": "batch name 123123",
    #             "comments": "comments 42342",
    #             "creator": "creator 32312"
    #         },
    #         "per-set-parameters": {
    #             "additional-cgi": {
    #                 "json_dump": ["reqdata.reqid"],
    #                 "srcask": ["IMAGES"],
    #                 "exp_flags": ["images_numdoc=40"]
    #             }
    #         },
    #         "store-results-period": 2592000000,
    #         "profile": "weak_consistency/image/desktop/hamster",
    #         "search-engine": "yandex-images-json",
    #         "parse-serps": False
    #     }
    # }

    @retries(RETRIES)
    def _post_batch(self, batch_query):
        """
        POST a prepared batch body to Scraper

        :param batch_query: request body (result of _generate_query)
        :return: response body decoded from JSON
        """

        # NOTE(review): TLS certificate verification is switched off here
        response = requests.post(self.POST_BATCH_URL, data=batch_query, headers=self.headers, verify=False)
        # Log the raw body first so that it is visible even for error statuses
        _LOG.debug("Batch digest: %s", response.content)
        response.raise_for_status()
        return response.json()

    def run_batch(self, queries_iterator, batch_properties):
        """
        Send batch request to scraper

        :param queries_iterator: iterator yielding queries (look at _wrap_query for details)
        :param batch_properties: dict with scraper batch properties
        :return: Scraper batch id

        Queries iterator items may be:
        Strings: ["hello", "kitten", "moscow"]
        Tuples/lists of two strings: [("hello", "debug H"), ("kitten", "debug K"), ("moscow", "debug M")]
            Second value in a tuple is a debug message that can be later obtained through
            result["per-query-parameters"]["additional-headers"]["X-Scraper-Debug-Transport"][0]
        Dicts: [{
            "per-query-parameters": {
                "query-text": "hello",
                "additional-headers": {
                    "X-Scraper-Debug-Transport": ["hello debug message"]
                }
            },
            "serp-request-id": "hello-request-id",
        }]
            "serp-request-id" may be ommited (will be generated automatically in this case)

        Note: duplicate requests are filtered out (look at _prepare_unique_queries)

        Batch properties is a dictionary similar to the following
            {
                "host": "https://hamster.yandex.ru",
                "description": {
                    "name": "batch name 123123",
                    "comments": "comments 42342",
                    "creator": "creator 32312"
                },
                "per-set-parameters": {
                    "additional-cgi": {
                        "json_dump": ["reqdata.reqid"],
                        "srcask": ["IMAGES"],
                        "exp_flags": ["images_numdoc=40"]
                    }
                },
                "store-results-period": 2592000000,
                "profile": "weak_consistency/image/desktop/hamster",
                "search-engine": "yandex-images-json",
                "parse-serps": False
            }
        """

        batch_query = self._generate_query(queries_iterator, batch_properties)
        digest = self._post_batch(batch_query)

        if "ticket" not in digest:
            raise Exception("No 'ticket' in batch digest\n{}".format(digest))

        return digest["ticket"]

    # {
    #     "completed-serps": 10000,
    #     "configuration-version": "not available",
    #     "created-date": "2017-06-16T19:52:24.534+03:00",
    #     "description": {
    #         "comments": "comments 42342",
    #         "creator": "creator 32312",
    #         "invoker": "robot-images-st",
    #         "name": "batch name 123123",
    #         "quota-project": null
    #     },
    #     "errors-counters": {
    #         "OK": 10000,
    #         "UNANSWER_PROBLEM": 5
    #     },
    #     "failed-inits": 0,
    #     "failed-serps": 0,
    #     "host": "https://hamster.yandex.ru",
    #     "priority": null,
    #     "profile": {
    #         "device": "desktop",
    #         "display": "weak_consistency/image/desktop/hamster",
    #         "environment": "hamster",
    #         "id": "weak_consistency__image__desktop__hamster",
    #         "tier": null,
    #         "type": "weak_consistency",
    #         "vertical": "image"
    #     },
    #     "requested-serps": 10000,
    #     "status": "COMPLETE",
    #     "status-details": "batch 1497631940563 completed at 2017-06-16T19:59:51.116+03:00",
    #     "store-results-period": 2592000000,
    #     "updated-date": "2017-06-16T19:59:51.116+03:00"
    # }

    @retries(RETRIES)
    def get_batch_status(self, batch_id):
        """
        Request the current status of a batch from Scraper

        :param batch_id: str with batch_id (run_batch result)
        :result: dict decoded from the JSON status response
        """

        status_url = self.GET_BATCH_STATUS_URL.format(batch_id=batch_id)
        # NOTE(review): TLS certificate verification is switched off here
        response = requests.get(status_url, headers=self.headers, verify=False)
        # Log the raw body first so that it is visible even for error statuses
        _LOG.debug("Batch status: %s", response.content)
        response.raise_for_status()
        return response.json()

    # {
    #     u'class': u'ru.yandex.qe.scraper.api.serp.RawSerp',
    #     u'serp-page': {
    #         u'class': u'ru.yandex.qe.scraper.api.serp.page.RawSerpPage',
    #         u'raw-content': u'...', # Serp data goes here!
    #         u'serp-page-attempts': [...],
    #         u'serp-resources': {...},
    #         u'serp-request-explained': {...},
    #         u'status': {u'status': u'done', u'status-details': u'done at 2017-06-16T23:29:34.622+03:00'}
    #     }
    # }

    @retries(RETRIES)
    def _get_batch_results_solid(self, batch_id):
        """
        Download batch results entirely into memory

        :param batch_id: str with batch_id (run_batch result)
        :return: Results (string), the raw response body
        """

        results_url = self.GET_BATCH_RESULTS.format(batch_id=batch_id)
        # NOTE(review): TLS certificate verification is switched off here
        response = requests.get(results_url, headers=self.headers, verify=False, stream=True)
        response.raise_for_status()

        # Accessing .content reads the body of the streamed response
        return response.content

    def _iterate_results_chunks(self, batch_id):
        """
        Requests long file get wrapper

        Streams batch results and generates arbitrary sized non-empty chunks
        (CHUNK_SIZE is passed to requests as the chunk size). The response is
        closed when the generator finishes, fails or is closed by the consumer.

        :param batch_id: str with batch_id (run_batch result)
        :return: generator of raw chunks of the response body
        """
        # The request is issued outside of try/finally: if it fails there is
        # no response to close, and the original error must reach the caller
        # instead of being masked by a failure in the cleanup code
        r = requests.get(
            self.GET_BATCH_RESULTS.format(batch_id=batch_id),
            headers=self.headers,
            verify=False,
            stream=True
        )

        try:
            r.raise_for_status()

            for chunk in r.iter_content(chunk_size=self.CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    yield chunk

        finally:
            r.close()

    def _chunks_regulator(self, iterator):
        """
        Iterate chunks of arbitrary size and yield fixed size chunks

        Last yielded chunk may (and most probably will) be smaller
        """
        chunk_buffer = ""

        for chunk in iterator:
            chunk_buffer += chunk

            while len(chunk_buffer) >= self.CHUNK_SIZE:
                piece = chunk_buffer[:self.CHUNK_SIZE]
                chunk_buffer = chunk_buffer[self.CHUNK_SIZE:]
                yield piece

        yield chunk_buffer

    def _get_batch_results_sliced(self, batch_id):
        """
        Generate sequentially fixed size chunks of batch results

        Starts the whole process from the beginning skipping already yielded chunks on error
        """
        pieces_yielded = 0
        bytes_yielded = 0
        attempt = 0

        while True:
            attempt += 1

            try:
                # Retry "dry start" counter
                current_valid_piece = 0

                for piece in self._chunks_regulator(self._iterate_results_chunks(batch_id)):
                    current_valid_piece += 1

                    if current_valid_piece > pieces_yielded:
                        pieces_yielded += 1
                        bytes_yielded += len(piece)
                        yield piece

                return

            except GeneratorExit:
                _LOG.debug("Outer exception. %d chunks of %d total bytes yielded.", pieces_yielded, bytes_yielded)
                raise

            except:
                _LOG.debug("%d chunks of %d total bytes yielded.", pieces_yielded, bytes_yielded)
                _LOG.exception("Batch results fetch retriable failure")
                if attempt >= RETRIES:
                    raise

    @contextmanager
    def _download_and_open(self, batch_id, path):
        attempt = 0
        while True:
            attempt += 1
            try:
                with path.open("wb") as f:
                    for chunk in self._iterate_results_chunks(batch_id):
                        f.write(chunk)
                break
            except:
                _LOG.exception("Batch results integral fetch retriable failure")
                if attempt >= RETRIES:
                    raise

        _LOG.debug("Batch results were successfully downloaded and saved to %s", str(path))

        with path.open(encoding="utf-8") as f:
            _LOG.debug("Yielding file")
            yield f

    def _download_and_iterate(self, batch_id, path):
        with self._download_and_open(batch_id, path) as f:
            while True:
                buf = f.read(self.CHUNK_SIZE)
                if not buf:
                    return
                yield buf.encode("utf-8")

    def _download_and_read(self, batch_id, path):
        with self._download_and_open(batch_id, path) as f:
            return f.read()

    def results_iterator(self, batch_id, large_batch=True, download_to=None):
        """
        Fetch batch results from Scraper and iterate over serps

        NOTE Results may be as large as 15G for 10K serps

        :param batch_id: str with batch_id (run_batch result)
        :param large_batch: bool signaling whether extra JSON splitter is required
        :param download_to: None or path (pathlib2).
        If not None batch results are downloaded to specified file and parsed later.
        :result: iterator over serps (jsons dumped to str)
        """

        if large_batch:
            json_iterator = LargeJSONSplitter()
            if download_to:
                piece_iterator = self._download_and_iterate(batch_id, download_to)
            else:
                piece_iterator = self._get_batch_results_sliced(batch_id)
            for piece in piece_iterator:
                for item in json_iterator.process(piece):
                    yield item
        else:
            if download_to:
                blob = self._download_and_read(batch_id, download_to)
            else:
                blob = self._get_batch_results_solid(batch_id)
            for item in json.loads(blob):
                yield item

    def find_results_table(self, batch_id, vault_token=_DEFAULT_YT_TOKEN, yql_version=0):
        """
        Guess where does Scraper store serps for specified batch_id in YT

        The table is looked up at
        SCRAPER_RESULTS_YT_PATH/<last 4 characters of batch_id>/<batch_id>
        on every cluster of SCRAPER_YT_PROXIES in turn; the first cluster
        where it exists wins.

        IMPORTANT Requires yandex-yt (pip) environment

        :param batch_id: batch id (run_batch result)
        :param vault_token: ":"-separated arguments for sdk2.Vault.data that give the YT token
        :param yql_version: 1 to quote the path with backticks, any other value to use brackets
        :return: "cluster.[//path]" (yql_version = 0) or "cluster.`//path`" (yql_version = 1) of resulting table
        :raises Exception: if the table is not found on any cluster
        """
        import yt.wrapper as yt

        batch_key = str(batch_id)
        table_path = "/".join((self.SCRAPER_RESULTS_YT_PATH, batch_key[-4:], batch_key))
        reference_template = "{}.`{}`" if yql_version == 1 else "{}.[{}]"

        yt_token = sdk2.Vault.data(*vault_token.split(":"))
        for cluster in self.SCRAPER_YT_PROXIES:
            client = yt.YtClient(proxy=cluster, token=yt_token)
            if client.exists(table_path):
                return reference_template.format(cluster, table_path)

        raise Exception("Cannot find scraper serps table on YT")
